def get_pca(self, components: int = 3):
    """Fit a PCA on ``self.df`` and collect the fitted model and its outputs.

    NOTE(review): the constructor keywords used here (``rescale_with_mean``,
    ``rescale_with_std``, ``check_input``, ``n_iter``) look like the
    ``prince`` PCA API -- confirm against the file's imports.

    Args:
        components: Number of principal components to keep.

    Returns:
        A dict with three entries:

        * ``'fit'``: whatever ``PCA.fit`` returns for ``self.df``
          (presumably the fitted estimator itself).
        * ``'rotated'``: ``self.df`` transformed by the fitted model.
        * ``'feature_correlations'``: the result of calling
          ``column_correlations`` on the fitted model with ``self.df``.
    """
    data = self.df
    pca = PCA(n_components=components,
              n_iter=100,
              rescale_with_mean=True,
              rescale_with_std=True,
              copy=True,
              check_input=True)

    fitted = pca.fit(data)

    return {
        'fit': fitted,
        # The model is already fitted on `data`, so only the projection is
        # needed here; calling fit_transform would refit it a second time.
        'rotated': fitted.transform(data),
        # The original referenced an undefined name `fit` here, which raised
        # NameError; the correlations come from the fitted model.
        'feature_correlations': fitted.column_correlations(data),
    }
# Example no. 2
# 0
def _load_choices(files):
    """Read the choices CSV files into a single dataframe.

    Every file is parsed with a two-row column header. When more than one
    file is given, their columns are concatenated with an inner join and
    duplicated columns are dropped. The 'drafter' and 'entropy' top-level
    columns are removed from the result.
    """
    frames = [pd.read_csv(file, header=[0, 1]) for file in files]

    if len(frames) > 1:
        choices = pd.concat(frames, axis=1, join='inner')
        choices = choices.loc[:, ~choices.columns.duplicated()]
    else:
        choices = frames[0]

    return choices.drop(columns=['drafter', 'entropy'], level=0)


def _stack_player_choices(choices, drafters):
    """Stack each drafter's '1st' and '2nd' columns into one column.

    The result has one column per drafter and twice as many rows as
    `choices`, with a fresh 0..N-1 index.
    """
    stacked = {
        drafter: pd.concat(
            [choices[(drafter, '1st')], choices[(drafter, '2nd')]],
            ignore_index=True)
        for drafter in drafters
    }

    return pd.DataFrame(stacked, columns=drafters)


def _compute_similarities(choices, drafters):
    """Return the drafter-by-drafter fraction of rows with equal choices."""
    similarities = pd.DataFrame(index=drafters, columns=drafters)
    total_rows = len(choices.index)

    for position, first in enumerate(drafters):
        # the similarity of a drafter and itself is of 100%
        # (.loc is used instead of chained `df[a][b] = v`, which does not
        # write through to the dataframe under pandas Copy-on-Write)
        similarities.loc[first, first] = 1.0

        # the measure is symmetric, so each pair is computed only once
        for second in drafters[position + 1:]:
            equal_rows = (choices[first] == choices[second]).sum()
            similarity = equal_rows / total_rows

            similarities.loc[first, second] = similarity
            similarities.loc[second, first] = similarity

    return similarities


def _cluster_coords(coords, n_drafters):
    """Pick the k-means clustering of `coords` with the best silhouette.

    Tries every k in [2, n_drafters) and returns `(best_k, labels)`. When
    that range is empty (fewer than three drafters) no clustering can be
    scored, so everything is placed in a single cluster instead of failing
    on the argmax of an empty list.
    """
    silhouettes, clusterings = [], []

    for k in range(2, n_drafters):
        print(f"Trying k={k}", end="")

        kmeans = KMeans(n_clusters=k, random_state=824).fit(coords)
        silhouette = silhouette_score(coords, kmeans.labels_, random_state=824)

        silhouettes.append(silhouette)
        clusterings.append(kmeans.labels_)
        print(f", silhouette={silhouette}, labels={kmeans.labels_}")

    if not silhouettes:
        return 1, np.zeros(n_drafters, dtype=int)

    best = int(np.argmax(silhouettes))

    return best + 2, clusterings[best]


def _plot_coords(coords, drafters, dimensions):
    """Scatter-plot the coordinates in 3D (dimensions == 3) or in 2D."""
    if dimensions == 3:
        # plot the PCA coordinates in 3D
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        objs = []

        for name, x, y, z, color in coords.itertuples():
            objs.append(ax.scatter(x, y, z, marker='o', c=color, label=name))

        plt.legend(objs, drafters, ncol=3, fontsize=8, loc='upper left')

        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')
    else:
        # plot the PCA coordinates in 2D
        plt.subplots_adjust(bottom=0.1)

        for name, x, y, color in coords.itertuples():
            plt.scatter(x, y, marker='o', label=name, c=color)

        for label, x, y in zip(drafters, coords['x'], coords['y']):
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(-20, 20),
                         textcoords='offset points',
                         ha='right',
                         va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5',
                                   fc='yellow',
                                   alpha=0.5),
                         arrowprops=dict(arrowstyle='->',
                                         connectionstyle='arc3,rad=0'))


def run():
    """Compare drafters by their choices and plot them as clustered points.

    Reads the command line arguments (`path`, `files` and `dimensions` are
    the attributes used here), then:

    1. loads the choices files and stacks the 1st/2nd player columns;
    2. writes the pairwise similarity matrix to `similarities.pkl` and
       `similarities.csv` inside `path`;
    3. maps the choices to points on a circle and reduces them with PCA to
       `dimensions` components (2 or 3);
    4. clusters the resulting coordinates with k-means, choosing k by the
       best silhouette score;
    5. saves the scatter plot to `path` and shows it.
    """
    # read command line arguments
    args = get_arg_parser().parse_args()

    # create output folder if it doesn't exist
    os.makedirs(args.path, exist_ok=True)

    print("Reading data...")

    choices = _load_choices(args.files)

    # get list of drafters from remaining columns
    drafters = list(choices.columns.get_level_values(0).unique())

    print("Processing data...")

    # discard original dataframe in favor of the stacked one
    choices = _stack_player_choices(choices, drafters)

    print("Calculating similarities...")

    similarities = _compute_similarities(choices, drafters)

    # save similarities dataframe to files
    similarities.to_pickle(os.path.join(args.path, 'similarities.pkl'))
    similarities.to_csv(os.path.join(args.path, 'similarities.csv'))

    print("Applying PCA...")

    # create mapping between choices and equidistant points in a
    # circumference: choice c gets one coordinate under key c (sine) and the
    # other under key c + 3 (cosine). math.sin/math.cos take radians, so the
    # angles are converted from degrees, and they are spaced 120 degrees
    # apart so that the three points really are equidistant.
    choices_to_points = {}
    for choice in range(3):
        angle = math.radians(30 + 120 * choice)
        choices_to_points[choice] = math.sin(angle)
        choices_to_points[choice + 3] = math.cos(angle)

    # double the amount of rows to store the points' x and y
    choices = pd.concat([choices, choices + 3])

    # map choices to points (per-column Series.map is the element-wise
    # equivalent of the deprecated DataFrame.applymap)
    choices = choices.apply(
        lambda column: column.map(choices_to_points.__getitem__))

    # apply PCA down to 2 or 3 dimensions
    pca = PCA(n_components=args.dimensions, random_state=824)
    axis_names = ['x', 'y', 'z'] if args.dimensions == 3 else ['x', 'y']

    # fit_transform may hand back a plain array rather than a dataframe, so
    # build the dataframe explicitly, with one row per drafter
    coords = pd.DataFrame(np.asarray(pca.fit_transform(choices.T)),
                          index=choices.columns,
                          columns=axis_names)

    print("Applying k-means...")

    # apply K-Means to the PCA coordinates, finding the optimal value of k
    # with the average silhouette method
    best_k, labels = _cluster_coords(coords, len(drafters))
    print(f"Best k: {best_k}. Labels: {labels}")

    # color the drafters according to their cluster
    all_colors = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]
    # wrap around the palette so that more than ten clusters cannot raise
    coords['color'] = [
        all_colors[cluster_id % len(all_colors)] for cluster_id in labels
    ]

    print("All done.")

    # rename agents
    battler_names = {'max-attack': 'MA', 'greedy': 'GR'}
    for i, full_name in enumerate(drafters):
        tokens = full_name.split('/')

        # tokens[-3] and tokens[-2] are read below, so at least three
        # path components are required
        if len(tokens) >= 3:
            # unknown battlers keep their original token
            battler = battler_names.get(tokens[-3], tokens[-3])
            drafter = tokens[-2]

            drafters[i] = f"{drafter}/{battler}"

    print("columns")
    print(coords.columns)

    # normalize the axes to [0, 1]
    for axis in axis_names:
        coords[axis] -= coords[axis].min()

        # after the shift the maximum equals the span of the axis; skip the
        # division when every point shares the same value (span of zero)
        span = coords[axis].max()
        if span > 0:
            coords[axis] /= span

    print(coords)

    _plot_coords(coords, drafters, args.dimensions)

    plt.savefig(
        os.path.join(
            args.path,
            f'similarities{"3D" if args.dimensions == 3 else ""}.png'))

    plt.show()

    print("✅")