Example #1
def main():
    p = optparse.OptionParser()
    p.add_option('--max_count_features',
                 '-m',
                 type=int,
                 default=1000,
                 help='max number of count features for baseline1')
    p.add_option('--sim',
                 type=str,
                 default='NPMI1s',
                 help='similarity operation (PMIs, NPMI1s, prob)')
    p.add_option('--delta',
                 '-d',
                 type=float,
                 default=0.0,
                 help='smoothing parameter')
    p.add_option('-k', type=int, default=200, help='number of eigenvalues')
    p.add_option('--sphere',
                 '-s',
                 action='store_true',
                 default=False,
                 help='normalize in sphere')
    p.add_option('--steps',
                 type=int,
                 default=1,
                 help='number of random walk steps')
    p.add_option('--rwcontext',
                 action='store_true',
                 default=False,
                 help='use context only in random walk')
    p.add_option('--save',
                 '-v',
                 action='store_true',
                 default=False,
                 help='save plot')

    opts, args = p.parse_args()

    max_count_features, sim, delta, k, sphere, steps, rwcontext, save = opts.max_count_features, opts.sim, opts.delta, opts.k, opts.sphere, opts.steps, opts.rwcontext, opts.save

    selected_attrs = pd.read_csv('selected_attrs.csv')

    fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2,
                                                 2,
                                                 sharex=False,
                                                 sharey=False,
                                                 figsize=(12, 8),
                                                 facecolor='white')
    axes = [ax0, ax1, ax2, ax3]

    for (ctr, attr_type) in enumerate(attr_types):
        print(attr_type)
        attrs_for_type = selected_attrs[selected_attrs['attributeType'] ==
                                        attr_type]

        results_df = pd.DataFrame()
        results_df['baseline'] = [('baseline%s' % b)
                                  for b in baselines] * len(ns)
        results_df['n'] = list(
            itertools.chain(*[[n for b in baselines] for n in ns]))
        for rank in ranks:
            results_df[rank] = np.zeros(len(baselines) * len(ns), dtype=int)

        for (i, n) in enumerate(ns):
            for (attr, freq) in zip(attrs_for_type['attribute'],
                                    attrs_for_type['freq']):
                if (2 * n <= freq):
                    max_mean_prec_df = pd.DataFrame(
                        columns=[('baseline%s' % b) for b in baselines])
                    for b in baselines:
                        if (b == '1'):
                            df = pd.read_csv(
                                'gplus0_lcc/baseline1/%s_%s_n%d_m%d_precision.csv'
                                % (attr_type, attr, n, max_count_features))
                            max_mean_prec_df['baseline1'] = df[
                                'mean_logreg_prec']  # logistic regression only
                        elif (b[:2] == '12'):
                            df = pd.read_csv(
                                'gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_m%d_precision.csv'
                                % (b, attr_type, attr, n, embedding, k,
                                   '_normalize' if sphere else '',
                                   max_count_features))
                            max_mean_prec_df['baseline%s' % b] = df[
                                'mean_logreg_prec']  # logistic regression only
                        elif (b == '6'):
                            df = pd.read_csv(
                                'gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv'
                                % (attr_type, attr, n, sim, delta))
                            max_mean_prec_df['baseline6'] = df[str(
                                ('mean', steps,
                                 'context' if rwcontext else 'both'))]
                        else:
                            df = pd.read_csv(
                                'gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv'
                                % (b, attr_type, attr, n, embedding, k,
                                   '_normalize' if sphere else ''))
                            max_mean_prec_df['baseline%s' % b] = df[
                                'mean_logreg_prec']  # logistic regression only
                        #max_mean_prec_df['baseline%s' % b] = df['max_mean_prec']

                    for rank in ranks:
                        (best_index, best_prec) = max(enumerate(
                            max_mean_prec_df.loc[rank - 1]),
                                                      key=lambda pair: pair[1])
                        results_df.loc[len(baselines) * i + best_index,
                                       rank] += 1  # .loc replaces the removed DataFrame.ix indexer

        results_agg_df = results_df.drop(['n'],
                                         axis=1).groupby('baseline').sum()

        print(results_agg_df)

        ind = np.arange(len(ranks))
        width = 0.5
        plots = []
        plot = axes[ctr].bar(range(len(ranks)),
                             results_agg_df.loc['baseline1'],
                             width=width,
                             color=colors[0])
        plots.append(plot)
        cumsums = deepcopy(np.asarray(results_agg_df.loc['baseline1']))
        for (b, c) in zip(baselines[1:], colors[1:]):
            plot = axes[ctr].bar(range(len(ranks)),
                                 results_agg_df.loc['baseline%s' % b],
                                 width=width,
                                 bottom=cumsums,
                                 color=c)
            plots.append(plot)
            cumsums += np.asarray(results_agg_df.loc['baseline%s' % b])
        axes[ctr].set_xlim((-0.5, len(ranks)))
        axes[ctr].set_title(attr_type.replace('_', ' '))

    plt.setp(axes,
             xticks=ind + width / 2,
             xticklabels=results_agg_df.columns,
             yticks=np.arange(0, 35, 5))
    fig.text(0.5, 0.04, 'rank', ha='center', fontsize=14)
    fig.text(0.07, 0.5, 'wins', va='center', rotation='vertical', fontsize=14)
    plt.figlegend(plots, keys, 'center')
    plt.suptitle("Relative performance of baselines",
                 fontsize=16,
                 fontweight='bold')
    plt.subplots_adjust(wspace=0.64, hspace=0.58)
    for ax in axes:
        rstyle(ax)

    if save:
        filename = 'gplus0_lcc/compare/wins/m%d_%s_k%d%s_steps%d%s_baseline%s_wins.png' % (
            max_count_features, embedding, k, '_normalize' if sphere else '',
            steps, '_rwcontext' if rwcontext else '', '_'.join(baselines))
        plt.savefig(filename)
    else:
        plt.show()
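
The win-counting step above picks, for each rank, the baseline with the highest mean precision and increments its tally. A minimal sketch of that step on hypothetical toy data (the column names and values below are made up for illustration):

import pandas as pd

# Hypothetical mean-precision table: one row per rank, one column per baseline.
toy_prec = pd.DataFrame({
    'baseline1': [0.50, 0.48, 0.45],
    'baseline2': [0.55, 0.47, 0.46],
    'baseline6': [0.52, 0.49, 0.40],
})

wins = pd.Series(0, index=toy_prec.columns)
for rank in [1, 2, 3]:
    # Row rank-1 holds each baseline's mean precision at this rank;
    # idxmax picks the winning baseline, which gets one more win.
    wins[toy_prec.loc[rank - 1].idxmax()] += 1

print(wins)  # baseline1: 0, baseline2: 2, baseline6: 1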
Example #2
def main():
    p = optparse.OptionParser()
    p.add_option('--attr', '-a', type=str, help='attribute')
    p.add_option('--attr_type', '-t', type=str, help='attribute type')
    p.add_option(
        '--num_train_each',
        '-n',
        type=int,
        help=
        'number of training samples of True and False for the attribute (for total of 2n training samples)'
    )
    p.add_option('--max_count_features',
                 '-m',
                 type=int,
                 default=1000,
                 help='max number of count features for baseline1')
    p.add_option('--embedding',
                 '-e',
                 type=str,
                 default='adj',
                 help='embedding (adj, adj+diag, normlap, regnormlap)')
    p.add_option('--sim',
                 type=str,
                 default='NPMI1s',
                 help='similarity operation (PMIs, NPMI1s, prob)')
    p.add_option('--delta',
                 '-d',
                 type=float,
                 default=0.0,
                 help='smoothing parameter')
    p.add_option('-k', type=int, default=200, help='number of eigenvalues')
    p.add_option('--sphere',
                 '-s',
                 action='store_true',
                 default=False,
                 help='normalize in sphere')
    p.add_option('--steps',
                 type=int,
                 default=1,
                 help='number of random walk steps')
    p.add_option('--rwcontext',
                 action='store_true',
                 default=False,
                 help='use context only in random walk')
    p.add_option('-v', action='store_true', default=False, help='save results')
    p.add_option('-N',
                 type=int,
                 default=500,
                 help='top N precisions to display')

    opts, args = p.parse_args()

    attr, attr_type, num_train_each, max_count_features, embedding, sim, delta, k, sphere, steps, rwcontext, save, N = opts.attr, opts.attr_type, opts.num_train_each, opts.max_count_features, opts.embedding, opts.sim, opts.delta, opts.k, opts.sphere, opts.steps, opts.rwcontext, opts.v, opts.N

    max_mean_prec_df = pd.DataFrame(columns=['rank'] + [('baseline%s' % b)
                                                        for b in baselines])

    for b in baselines:
        if (b == '1'):
            df = pd.read_csv(
                'gplus0_lcc/baseline1/%s_%s_n%d_m%d_precision.csv' %
                (attr_type, attr, num_train_each, max_count_features))
            max_mean_prec_df['baseline1'] = df[
                'mean_logreg_prec'][:N]  # logistic regression only
        elif (b[:2] == '12'):
            df = pd.read_csv(
                'gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_m%d_precision.csv' %
                (b, attr_type, attr, num_train_each, embedding, k,
                 '_normalize' if sphere else '', max_count_features))
            max_mean_prec_df[
                'baseline%s' %
                b] = df['mean_logreg_prec'][:N]  # logistic regression only
        elif (b == '6'):
            df = pd.read_csv(
                'gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv' %
                (attr_type, attr, num_train_each, sim, delta))
            filename = 'gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv' % (
                attr_type, attr, num_train_each, sim, delta)
            if ('max_mean_prec' not in df.columns):
                cols = [col for col in df.columns if 'mean' in col]
                df['max_mean_prec'] = df[cols].max(axis=1)
                df.to_csv(filename, index=False)
            max_mean_prec_df['baseline6'] = df[str(
                ('mean', steps, 'context' if rwcontext else 'both'))][:N]
        else:
            df = pd.read_csv(
                'gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv' %
                (b, attr_type, attr, num_train_each, embedding, k,
                 '_normalize' if sphere else ''))
            filename = 'gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv' % (
                b, attr_type, attr, num_train_each, embedding, k,
                '_normalize' if sphere else '')
            if ('max_mean_prec' not in df.columns):
                df['max_mean_prec'] = df[[
                    'mean_rfc_prec', 'mean_boost_prec', 'mean_logreg_prec',
                    'mean_gnb_prec'
                ]].max(axis=1)
                df.to_csv(filename, index=False)
            max_mean_prec_df[
                'baseline%s' %
                b] = df['mean_logreg_prec'][:N]  # logistic regression only
        #max_mean_prec_df['baseline%s' % b] = df['max_mean_prec'][:N]  # max of all systems

    selected_attrs = pd.read_csv('selected_attrs.csv')
    row = selected_attrs[selected_attrs['attribute'] == attr].iloc[0]
    num_true_in_test = row['freq'] - num_train_each
    num_test = row['totalKnown'] - 2 * num_train_each
    guess_rate = num_true_in_test / num_test
    max_mean_prec_df['guess'] = guess_rate

    fig = plt.figure(figsize=(12, 8), facecolor='white')
    ax = fig.add_axes([0.1, 0.1, 0.7, 0.8])
    plots = []
    axes = []
    plots.append(
        ax.plot(max_mean_prec_df.index,
                max_mean_prec_df['guess'],
                color='black',
                linewidth=4,
                linestyle='dashed')[0])
    plots[-1].set_dash_capstyle('projecting')
    axes.append(plt.gca())
    for (b, c, sty) in zip(baselines, colors, linestyles):
        plots.append(
            ax.plot(max_mean_prec_df.index,
                    max_mean_prec_df['baseline%s' % b],
                    color=c,
                    linestyle=sty,
                    linewidth=4)[0])
        if (sty == 'dashed'):
            plots[-1].set_dash_capstyle('projecting')
        axes.append(plt.gca())
    plt.xlabel('rank', fontsize=14, labelpad=8)
    plt.ylabel('precision', fontsize=14, labelpad=12)
    plt.title("Best nomination precision\n%s: %s" %
              (attr_type.replace('_', ' '), attr),
              fontsize=16,
              fontweight='bold',
              y=1.02)
    plt.setp(axes, xticks=np.arange(0, N + 1,
                                    100))  #, yticks = np.arange(0, 1.1, 0.25))
    #plt.ylim((0.0, 1.0))
    plt.legend(plots, ['guess'] + keys, loc=(1.01, 0.5))
    for a in axes:
        rstyle(a)

    filename = 'gplus0_lcc/compare/prec/%s_%s_n%d_m%d_%s_%s_delta%s_k%d%s_steps%d%s_mean_prec.png' % (
        attr, attr_type, num_train_each, max_count_features, embedding, sim,
        str(delta), k, '_normalize' if sphere else '', steps,
        '_rwcontext' if rwcontext else '')

    if save:
        plt.savefig(filename)
    else:
        plt.show()
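
The 'guess' curve is the precision of a random nominator: after holding out n positive and n negative training nodes, the test pool contains freq - n positives out of totalKnown - 2n labelled nodes. A quick check of that ratio with hypothetical counts (in the script, freq and totalKnown come from selected_attrs.csv):

# Hypothetical attribute counts for illustration only.
freq, total_known, n = 400, 1000, 100   # positives, labelled nodes, training size per class

num_true_in_test = freq - n             # positives left in the test pool
num_test = total_known - 2 * n          # test pool after removing the 2n training nodes
guess_rate = num_true_in_test / num_test
print(guess_rate)                       # 300 / 800 = 0.375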
Example #3
def main():
    classifier_vals = ['logreg', 'randfor', 'boost', 'kde']
    #classifier_vals = ['kde']
    embedding_info_vals = ['context', 'NPMIs', 'both']
    #sphere_content_vals = [True, False]
    sphere_content_vals = [True]
    params = {
        'classifier': classifier_vals,
        'embedding_info': embedding_info_vals,
        'sphere_content': sphere_content_vals
    }

    # free to permute these (but not remove them)
    vars_by_distinguisher = {
        'color': 'classifier',
        'xfacet': 'embedding_info',
        'yfacet': 'sphere_content'
    }
    #vars_by_distinguisher = {'color' : 'embedding_info', 'xfacet' : 'classifier', 'yfacet' : 'sphere_content'}

    numvals_by_distinguisher = {
        dist: len(params[var])
        for (dist, var) in vars_by_distinguisher.items()
    }
    cmap = plt.cm.gist_ncar
    colors = {
        j: cmap(
            int((j + 1) * cmap.N / (numvals_by_distinguisher['color'] + 1.0)))
        for j in range(numvals_by_distinguisher['color'])
    } if ('color' in vars_by_distinguisher) else {
        0: 'blue'
    }
    vars_to_suppress_in_legend = ['embedding_info', 'classifier'
                                  ]  # show values but not variable names

    gplus_attr_types = ['employer', 'major', 'places_lived', 'school']

    pd.options.display.max_rows = None
    pd.options.display.width = 1000

    topN_save = 1000  # number of precisions to save
    topN_plot = 500  # number of precisions to plot

    p = optparse.OptionParser()
    p.add_option('--attr', '-a', type=str, help='attribute')
    p.add_option('--attr_type', '-t', type=str, help='attribute type')
    p.add_option('--pos_seeds',
                 '-p',
                 type=int,
                 default=50,
                 help='number of positive seeds')
    p.add_option('--neg_seeds',
                 '-n',
                 type=int,
                 default=50,
                 help='number of negative seeds')
    p.add_option('--num_samples',
                 '-S',
                 type=int,
                 default=50,
                 help='number of Monte Carlo samples')
    p.add_option('--save_plot',
                 '-v',
                 action='store_true',
                 default=False,
                 help='save plot')
    p.add_option('--path', type=str, default='gplus0_sub', help='path to data')
    opts, args = p.parse_args()

    attr, attr_type, pos_seeds, neg_seeds, num_samples, save_plot, path = opts.attr, opts.attr_type, opts.pos_seeds, opts.neg_seeds, opts.num_samples, opts.save_plot, opts.path
    sqrt_samples = np.sqrt(num_samples)

    pm = imp.load_source('params', path + '/params.py')
    attr_filename = path + '/' + pm.attr_filename

    csv_path = 'test_gplus/%s_%s_+%d_-%d.csv' % (attr_type, attr, pos_seeds,
                                                 neg_seeds)

    try:
        print(csv_path)
        prec_df = pd.read_csv(csv_path)
    except:
        return

    n = 4690159
    print("\nCreating AttributeAnalyzer...")
    a = timeit(AttributeAnalyzer, True)(attr_filename, n, gplus_attr_types)
    ind = a.get_attribute_indicator(attr, attr_type)
    true_seeds, false_seeds = ind[ind == 1].index, ind[ind == 0].index
    num_true_seeds, num_false_seeds = len(true_seeds), len(false_seeds)
    all_seeds = set(true_seeds).union(set(false_seeds))
    assert ((num_true_seeds > 1)
            and (num_false_seeds > 1))  # can't handle this otherwise, yet
    print("\n%d known instances of %s (%d positive, %d negative)" %
          (num_true_seeds + num_false_seeds, attr_type, num_true_seeds,
           num_false_seeds))
    if (pos_seeds >= num_true_seeds):
        print("\tWarning: changing pos_seeds from %d to %d." %
              (pos_seeds, num_true_seeds - 1))
        pos_seeds = num_true_seeds - 1
    if (neg_seeds >= num_false_seeds):
        print("\tWarning: changing neg_seeds from %d to %d." %
              (neg_seeds, num_false_seeds - 1))
        neg_seeds = num_false_seeds - 1
    print("Sampling %d positive seeds, %d negative seeds" %
          (pos_seeds, neg_seeds))
    num_pos_in_test = num_true_seeds - pos_seeds
    num_test = num_true_seeds + num_false_seeds - pos_seeds - neg_seeds
    guess_rate = num_pos_in_test / num_test
    topN_save = min(topN_save, num_test)
    topN_plot = min(topN_plot, topN_save)

    # The block below is deliberately disabled: `try: pass` never raises, so the
    # except body (which would rebuild prec_df from the feature matrices) is never
    # executed and the precision CSV loaded above is used instead.
    try:
        pass
    except:
        # load all feature matrices, AttributeAnalyzer, identify seeds
        return
        sys.argv = ['embed', path]
        (context_features, attr_features_by_type) = embed.main(
        )  # use sim, delta, embedding, etc. from params.py file
        assert ((context_features is not None)
                and (len(attr_features_by_type) == 4))
        other_attr_types = [at for at in gplus_attr_types if (at != attr_type)]
        n = context_features.shape[0]

        # construct classifiers
        clf_dict = {
            'logreg': LogisticRegression(),
            'naive_bayes': GaussianNB(),
            'randfor': RandomForestClassifier(n_estimators=pm.num_trees),
            'boost': AdaBoostClassifier(n_estimators=pm.num_trees),
            'kde': TwoClassKDE()
        }
        prec_df = pd.DataFrame(
        )  # for storing mean & stdev topN_save precisions for each parameter combo

        # run nomination
        for embedding_info in embedding_info_vals:
            for sphere_content in sphere_content_vals:
                print("\nembedding_info = %s, sphere_content = %s" %
                      (embedding_info, str(sphere_content)))
                # stack all desired feature matrices, with or without projecting to sphere
                embedding_mats = []
                if (embedding_info != 'NPMIs'):
                    context_mat = deepcopy(context_features)
                    if pm.sphere_context:
                        normalize_mat_rows(context_mat)
                    embedding_mats.append(context_mat)
                if (embedding_info != 'context'):
                    for at in other_attr_types:
                        attr_mat = deepcopy(attr_features_by_type[at])
                        if sphere_content:
                            normalize_mat_rows(attr_mat)
                        embedding_mats.append(attr_mat)
                mat = np.hstack(embedding_mats)
                mat = StandardScaler().fit_transform(mat)
                if pm.use_pca:  # perform PCA on features, if desired
                    ncomps = mat.shape[1] if (pm.max_eig_pca is None) else min(
                        pm.max_eig_pca, mat.shape[1])
                    pca = PCA(n_components=ncomps, whiten=pm.whiten)
                    if pm.verbose:
                        print("\nPerforming PCA on feature matrix...")
                    mat = timeit(pca.fit_transform)(mat)
                    sq_sing_vals = pca.explained_variance_
                    if (pm.which_elbow > 0):
                        elbows = get_elbows(sq_sing_vals,
                                            n=pm.which_elbow,
                                            thresh=0.0)
                        k = elbows[min(len(elbows), pm.which_elbow) - 1]
                    else:
                        k = len(sq_sing_vals)
                    mat = mat[:, :k]
                precs_by_classifier = {
                    classifier: np.zeros((num_samples, topN_save))
                    for classifier in classifier_vals
                }  # top N cumulative precisions
                for s in range(num_samples):
                    print("\nSEED = %d" % s)
                    np.random.seed(s)
                    ts = true_seeds[np.random.choice(range(num_true_seeds),
                                                     pos_seeds,
                                                     replace=False)]
                    fs = false_seeds[np.random.choice(range(num_false_seeds),
                                                      neg_seeds,
                                                      replace=False)]
                    training = list(ts) + list(fs)
                    test = list(all_seeds.difference(set(training)))
                    train_in, train_out = mat[training], ind[training]
                    test_in, test_out = mat[test], ind[test]
                    for classifier in classifier_vals:
                        print("classifier = %s" % classifier)
                        clf = clf_dict[classifier]
                        if (classifier == 'kde'):  # compare the name string; clf is the estimator object, so clf == 'kde' was always False
                            clf.fit_with_optimal_bandwidth(
                                train_in,
                                train_out,
                                gridsize=pm.kde_cv_gridsize,  # keyword assumed; 'ridsize' looked like a typo
                                dynamic_range=pm.kde_cv_dynamic_range,
                                cv=pm.kde_cv_folds)
                        else:
                            clf.fit(train_in, train_out)
                        df = pd.DataFrame(index=test)
                        df['ind'] = test_out
                        df['prob'] = clf.predict_proba(test_in)[:, 1]
                        df = df.sort_values(by='prob', ascending=False)
                        prec = np.cumsum(np.asarray(
                            df['ind'])[:topN_save]) / np.arange(
                                1.0, topN_save + 1.0)
                        precs_by_classifier[classifier][s] = prec
                for classifier in classifier_vals:
                    prec_df[str(
                        (embedding_info, sphere_content, classifier,
                         'mean_prec'))] = precs_by_classifier[classifier].mean(
                             axis=0)
                    prec_df[str((
                        embedding_info, sphere_content, classifier,
                        'stderr_prec'))] = precs_by_classifier[classifier].std(
                            axis=0) / sqrt_samples
        prec_df.to_csv(csv_path, index=False)

    mean_cols = [col for col in prec_df.columns
                 if ('mean_prec' in col)]  #(col[-1] == 'mean_prec')]
    y_max = min(1.0, 1.1 * prec_df[mean_cols].max().max())

    fig, axis_grid = plt.subplots(numvals_by_distinguisher['yfacet'],
                                  numvals_by_distinguisher['xfacet'],
                                  sharex='col',
                                  sharey='row',
                                  figsize=(18, 6),
                                  facecolor='white')
    axis_grid = np.array(axis_grid).reshape(
        (numvals_by_distinguisher['yfacet'],
         numvals_by_distinguisher['xfacet']))
    plots_for_legend = []
    keys_for_legend = []
    param_dict = dict()
    for x in range(numvals_by_distinguisher['xfacet']):
        param_dict[vars_by_distinguisher['xfacet']] = params[
            vars_by_distinguisher['xfacet']][x]
        for y in range(numvals_by_distinguisher['yfacet']):
            param_dict[vars_by_distinguisher['yfacet']] = params[
                vars_by_distinguisher['yfacet']][y]
            ax = axis_grid[y, x]
            for i in range(numvals_by_distinguisher['color']):
                param_dict[vars_by_distinguisher['color']] = params[
                    vars_by_distinguisher['color']][i]
                mean_prec = prec_df[str(
                    (param_dict['embedding_info'],
                     param_dict['sphere_content'], param_dict['classifier'],
                     'mean_prec'))][:topN_plot]
                stderr_prec = prec_df[str(
                    (param_dict['embedding_info'],
                     param_dict['sphere_content'], param_dict['classifier'],
                     'stderr_prec'))][:topN_plot]
                plot, = ax.plot(np.arange(topN_plot),
                                mean_prec,
                                color=colors[i],
                                linewidth=2)
                if ('color' in vars_by_distinguisher):
                    if ((x == 0) and (y == 0)):
                        plots_for_legend.append(plot)
                        key = legend_str(
                            vars_by_distinguisher['color'],
                            params[vars_by_distinguisher['color']][i],
                            vars_by_distinguisher['color']
                            in vars_to_suppress_in_legend)
                        keys_for_legend.append(key)
                ax.fill_between(np.arange(topN_plot),
                                mean_prec - 2 * stderr_prec,
                                mean_prec + 2 * stderr_prec,
                                color=colors[i],
                                alpha=0.1)
            plot, = ax.plot(np.arange(topN_plot),
                            guess_rate * np.ones(topN_plot, dtype=float),
                            color='black',
                            linestyle='dashed',
                            linewidth=2)
            plot.set_dash_capstyle('projecting')
            if ((x == 0) and (y == 0)):
                plots_for_legend.append(plot)
                keys_for_legend.append('guess')
            ax.axvline(x=num_pos_in_test,
                       color='black',
                       linestyle='dashed',
                       linewidth=2)
            if ((numvals_by_distinguisher['yfacet'] > 1) and (x == 0)):
                ax.annotate(legend_str(
                    vars_by_distinguisher['yfacet'],
                    str(params[vars_by_distinguisher['yfacet']][y]),
                    vars_by_distinguisher['yfacet']
                    in vars_to_suppress_in_legend),
                            xy=(0, 0.5),
                            xytext=(-ax.yaxis.labelpad, 0),
                            xycoords=ax.yaxis.label,
                            textcoords='offset points',
                            ha='right',
                            va='center',
                            rotation='vertical')
            if ((numvals_by_distinguisher['xfacet'] > 1) and (y == 0)):
                ax.annotate(legend_str(
                    vars_by_distinguisher['xfacet'],
                    str(params[vars_by_distinguisher['xfacet']][x]),
                    vars_by_distinguisher['xfacet']
                    in vars_to_suppress_in_legend),
                            xy=(0.5, 1.01),
                            xytext=(0, 0),
                            xycoords='axes fraction',
                            textcoords='offset points',
                            ha='center',
                            va='baseline')
            ax.set_xlim((0, topN_plot - 1))
            ax.set_ylim((0.0, y_max))
            rstyle(ax)
            ax.patch.set_facecolor('0.89')

    this_plot_title = 'Cumulative precision plots\n%s = %s, %d +seeds, %d -seeds' % (
        attr_type, attr, pos_seeds, neg_seeds)
    fig.text(0.5, 0.02, 'rank', ha='center', fontsize=14)
    fig.text(0.02,
             0.5,
             'precision',
             va='center',
             rotation='vertical',
             fontsize=14)
    plt.figlegend(plots_for_legend, keys_for_legend, 'right', fontsize=10)
    plt.suptitle(this_plot_title, fontsize=16, fontweight='bold', y=0.99)
    plt.subplots_adjust(left=0.06, right=0.92, top=0.87)

    if save_plot:
        plot_path = 'test_gplus/%s_%s_+%d_-%d.png' % (attr_type, attr,
                                                      pos_seeds, neg_seeds)
        plt.savefig(plot_path)
    plt.show(block=False)
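
Each per-sample curve in the nomination loop is a cumulative precision: test nodes are sorted by predicted probability and, at each rank r, the fraction of the top r that are true positives is recorded. A minimal sketch with hypothetical labels and scores:

import numpy as np
import pandas as pd

# Hypothetical ground-truth indicators and classifier scores for five test nodes.
df = pd.DataFrame({'ind':  [1, 0, 1, 1, 0],
                   'prob': [0.2, 0.9, 0.8, 0.6, 0.1]})
df = df.sort_values(by='prob', ascending=False)   # rank nodes by predicted probability

top_n = 5
prec = np.cumsum(np.asarray(df['ind'])[:top_n]) / np.arange(1.0, top_n + 1.0)
print(prec)  # [0.   0.5   0.667 0.75  0.6 ] -> precision at ranks 1..5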
Example #4
def main():
    classifier_vals = ['logreg', 'randfor', 'boost', 'kde']
    #classifier_vals = ['kde']
    embedding_info_vals = ['context', 'NPMIs', 'both']
    #sphere_content_vals = [True, False]
    sphere_content_vals = [True]
    params = {'classifier' : classifier_vals, 'embedding_info' : embedding_info_vals, 'sphere_content' : sphere_content_vals}

    # free to permute these (but not remove them)
    vars_by_distinguisher = {'color' : 'classifier', 'xfacet' : 'embedding_info', 'yfacet' : 'sphere_content'}
    #vars_by_distinguisher = {'color' : 'embedding_info', 'xfacet' : 'classifier', 'yfacet' : 'sphere_content'}

    numvals_by_distinguisher = {dist : len(params[var]) for (dist, var) in vars_by_distinguisher.items()} 
    cmap = plt.cm.gist_ncar
    colors = {j : cmap(int((j + 1) * cmap.N / (numvals_by_distinguisher['color'] + 1.0))) for j in range(numvals_by_distinguisher['color'])} if ('color' in vars_by_distinguisher) else {0 : 'blue'}
    vars_to_suppress_in_legend = ['embedding_info', 'classifier']  # show values but not variable names

    gplus_attr_types = ['employer', 'major', 'places_lived', 'school']

    pd.options.display.max_rows = None
    pd.options.display.width = 1000

    topN_save = 1000    # number of precisions to save
    topN_plot = 500     # number of precisions to plot

    p = optparse.OptionParser()
    p.add_option('--attr', '-a', type = str, help = 'attribute')
    p.add_option('--attr_type', '-t', type = str, help = 'attribute type')
    p.add_option('--pos_seeds', '-p', type = int, default = 50, help = 'number of positive seeds')
    p.add_option('--neg_seeds', '-n', type = int, default = 50, help = 'number of negative seeds')
    p.add_option('--num_samples', '-S', type = int, default = 50, help = 'number of Monte Carlo samples')
    p.add_option('--save_plot', '-v', action = 'store_true', default = False, help = 'save plot')
    p.add_option('--path', type = str, default = 'gplus0_sub', help = 'path to data')
    opts, args = p.parse_args()

    attr, attr_type, pos_seeds, neg_seeds, num_samples, save_plot, path = opts.attr, opts.attr_type, opts.pos_seeds, opts.neg_seeds, opts.num_samples, opts.save_plot, opts.path
    sqrt_samples = np.sqrt(num_samples)

    pm = imp.load_source('params', path + '/params.py')
    attr_filename = path + '/' + pm.attr_filename

    csv_path = 'test_gplus/%s_%s_+%d_-%d.csv' % (attr_type, attr, pos_seeds, neg_seeds)

    try:
        print(csv_path)
        prec_df = pd.read_csv(csv_path)
    except:
        return

    n = 4690159
    print("\nCreating AttributeAnalyzer...")
    a = timeit(AttributeAnalyzer, True)(attr_filename, n, gplus_attr_types)
    ind = a.get_attribute_indicator(attr, attr_type)
    true_seeds, false_seeds = ind[ind == 1].index, ind[ind == 0].index
    num_true_seeds, num_false_seeds = len(true_seeds), len(false_seeds)
    all_seeds = set(true_seeds).union(set(false_seeds))
    assert ((num_true_seeds > 1) and (num_false_seeds > 1))  # can't handle this otherwise, yet
    print("\n%d known instances of %s (%d positive, %d negative)" % (num_true_seeds + num_false_seeds, attr_type, num_true_seeds, num_false_seeds))
    if (pos_seeds >= num_true_seeds):
        print("\tWarning: changing pos_seeds from %d to %d." % (pos_seeds, num_true_seeds - 1))
        pos_seeds = num_true_seeds - 1
    if (neg_seeds >= num_false_seeds):
        print("\tWarning: changing neg_seeds from %d to %d." % (neg_seeds, num_false_seeds - 1))
        neg_seeds = num_false_seeds - 1
    print("Sampling %d positive seeds, %d negative seeds" % (pos_seeds, neg_seeds))
    num_pos_in_test = num_true_seeds - pos_seeds
    num_test = num_true_seeds + num_false_seeds - pos_seeds - neg_seeds
    guess_rate = num_pos_in_test / num_test
    topN_save = min(topN_save, num_test)
    topN_plot = min(topN_plot, topN_save)

    # The block below is deliberately disabled: `try: pass` never raises, so the
    # except body (which would rebuild prec_df from the feature matrices) is never
    # executed and the precision CSV loaded above is used instead.
    try:
        pass
    except:
        # load all feature matrices, AttributeAnalyzer, identify seeds
        return
        sys.argv = ['embed', path]
        (context_features, attr_features_by_type) = embed.main()  # use sim, delta, embedding, etc. from params.py file
        assert ((context_features is not None) and (len(attr_features_by_type) == 4))
        other_attr_types = [at for at in gplus_attr_types if (at != attr_type)]
        n = context_features.shape[0]

        # construct classifiers
        clf_dict = {'logreg' : LogisticRegression(), 'naive_bayes' : GaussianNB(), 'randfor' : RandomForestClassifier(n_estimators = pm.num_trees), 'boost' : AdaBoostClassifier(n_estimators = pm.num_trees), 'kde' : TwoClassKDE()}
        prec_df = pd.DataFrame()  # for storing mean & stdev topN_save precisions for each parameter combo

        # run nomination
        for embedding_info in embedding_info_vals:
            for sphere_content in sphere_content_vals:
                print("\nembedding_info = %s, sphere_content = %s" % (embedding_info, str(sphere_content)))
                # stack all desired feature matrices, with or without projecting to sphere
                embedding_mats = []
                if (embedding_info != 'NPMIs'):
                    context_mat = deepcopy(context_features)
                    if pm.sphere_context:
                        normalize_mat_rows(context_mat)
                    embedding_mats.append(context_mat)
                if (embedding_info != 'context'):
                    for at in other_attr_types:
                        attr_mat = deepcopy(attr_features_by_type[at])
                        if sphere_content:
                            normalize_mat_rows(attr_mat)
                        embedding_mats.append(attr_mat)
                mat = np.hstack(embedding_mats)
                mat = StandardScaler().fit_transform(mat)
                if pm.use_pca:  # perform PCA on features, if desired
                    ncomps = mat.shape[1] if (pm.max_eig_pca is None) else min(pm.max_eig_pca, mat.shape[1])
                    pca = PCA(n_components = ncomps, whiten = pm.whiten)
                    if pm.verbose:
                        print("\nPerforming PCA on feature matrix...")
                    mat = timeit(pca.fit_transform)(mat)
                    sq_sing_vals = pca.explained_variance_
                    if (pm.which_elbow > 0):
                        elbows = get_elbows(sq_sing_vals, n = pm.which_elbow, thresh = 0.0)
                        k = elbows[min(len(elbows), pm.which_elbow) - 1]
                    else:
                        k = len(sq_sing_vals)
                    mat = mat[:, :k]
                precs_by_classifier = {classifier : np.zeros((num_samples, topN_save)) for classifier in classifier_vals}  # top N cumulative precisions
                for s in range(num_samples):
                    print("\nSEED = %d" % s)
                    np.random.seed(s)
                    ts = true_seeds[np.random.choice(range(num_true_seeds), pos_seeds, replace = False)]
                    fs = false_seeds[np.random.choice(range(num_false_seeds), neg_seeds, replace = False)]
                    training = list(ts) + list(fs)
                    test = list(all_seeds.difference(set(training)))
                    train_in, train_out = mat[training], ind[training]
                    test_in, test_out = mat[test], ind[test]
                    for classifier in classifier_vals:
                        print("classifier = %s" % classifier)
                        clf = clf_dict[classifier]
                        if (classifier == 'kde'):  # compare the name string; clf is the estimator object, so clf == 'kde' was always False
                            clf.fit_with_optimal_bandwidth(train_in, train_out, gridsize = pm.kde_cv_gridsize, dynamic_range = pm.kde_cv_dynamic_range, cv = pm.kde_cv_folds)  # keyword assumed to be gridsize; 'ridsize' looked like a typo
                        else:
                            clf.fit(train_in, train_out)
                        df = pd.DataFrame(index = test)
                        df['ind'] = test_out
                        df['prob'] = clf.predict_proba(test_in)[:, 1]
                        df = df.sort_values(by = 'prob', ascending = False)
                        prec = np.cumsum(np.asarray(df['ind'])[:topN_save]) / np.arange(1.0, topN_save + 1.0)
                        precs_by_classifier[classifier][s] = prec
                for classifier in classifier_vals:
                    prec_df[str((embedding_info, sphere_content, classifier, 'mean_prec'))] = precs_by_classifier[classifier].mean(axis = 0)
                    prec_df[str((embedding_info, sphere_content, classifier, 'stderr_prec'))] = precs_by_classifier[classifier].std(axis = 0) / sqrt_samples
        prec_df.to_csv(csv_path, index = False)


    mean_cols = [col for col in prec_df.columns if ('mean_prec' in col)] #(col[-1] == 'mean_prec')]
    y_max = min(1.0, 1.1 * prec_df[mean_cols].max().max())


    fig, axis_grid = plt.subplots(numvals_by_distinguisher['yfacet'], numvals_by_distinguisher['xfacet'], sharex = 'col', sharey = 'row', figsize = (18, 6), facecolor = 'white')
    axis_grid = np.array(axis_grid).reshape((numvals_by_distinguisher['yfacet'], numvals_by_distinguisher['xfacet']))
    plots_for_legend = []
    keys_for_legend = []
    param_dict = dict()
    for x in range(numvals_by_distinguisher['xfacet']):
        param_dict[vars_by_distinguisher['xfacet']] = params[vars_by_distinguisher['xfacet']][x]
        for y in range(numvals_by_distinguisher['yfacet']):
            param_dict[vars_by_distinguisher['yfacet']] = params[vars_by_distinguisher['yfacet']][y]
            ax = axis_grid[y, x]
            for i in range(numvals_by_distinguisher['color']):
                param_dict[vars_by_distinguisher['color']] = params[vars_by_distinguisher['color']][i]
                mean_prec = prec_df[str((param_dict['embedding_info'], param_dict['sphere_content'], param_dict['classifier'], 'mean_prec'))][:topN_plot]
                stderr_prec = prec_df[str((param_dict['embedding_info'], param_dict['sphere_content'], param_dict['classifier'], 'stderr_prec'))][:topN_plot]
                plot, = ax.plot(np.arange(topN_plot), mean_prec, color = colors[i], linewidth = 2)
                if ('color' in vars_by_distinguisher):
                    if ((x == 0) and (y == 0)):
                        plots_for_legend.append(plot)
                        key = legend_str(vars_by_distinguisher['color'], params[vars_by_distinguisher['color']][i], vars_by_distinguisher['color'] in vars_to_suppress_in_legend)
                        keys_for_legend.append(key)
                ax.fill_between(np.arange(topN_plot), mean_prec - 2 * stderr_prec, mean_prec + 2 * stderr_prec, color = colors[i], alpha = 0.1)
            plot, = ax.plot(np.arange(topN_plot), guess_rate * np.ones(topN_plot, dtype = float), color = 'black', linestyle = 'dashed', linewidth = 2)
            plot.set_dash_capstyle('projecting')
            if ((x == 0) and (y == 0)):
                plots_for_legend.append(plot)
                keys_for_legend.append('guess')
            ax.axvline(x = num_pos_in_test, color = 'black', linestyle = 'dashed', linewidth = 2)
            if ((numvals_by_distinguisher['yfacet'] > 1) and (x == 0)):
                ax.annotate(legend_str(vars_by_distinguisher['yfacet'], str(params[vars_by_distinguisher['yfacet']][y]), vars_by_distinguisher['yfacet'] in vars_to_suppress_in_legend), xy = (0, 0.5), xytext = (-ax.yaxis.labelpad, 0), xycoords = ax.yaxis.label, textcoords = 'offset points', ha = 'right', va = 'center', rotation = 'vertical')
            if ((numvals_by_distinguisher['xfacet'] > 1) and (y == 0)):
                ax.annotate(legend_str(vars_by_distinguisher['xfacet'], str(params[vars_by_distinguisher['xfacet']][x]), vars_by_distinguisher['xfacet'] in vars_to_suppress_in_legend), xy = (0.5, 1.01), xytext = (0, 0), xycoords = 'axes fraction', textcoords = 'offset points', ha = 'center', va = 'baseline')
            ax.set_xlim((0, topN_plot - 1))
            ax.set_ylim((0.0, y_max))
            rstyle(ax)
            ax.patch.set_facecolor('0.89')

    this_plot_title = 'Cumulative precision plots\n%s = %s, %d +seeds, %d -seeds' % (attr_type, attr, pos_seeds, neg_seeds)
    fig.text(0.5, 0.02, 'rank', ha = 'center', fontsize = 14)
    fig.text(0.02, 0.5, 'precision', va = 'center', rotation = 'vertical', fontsize = 14)
    plt.figlegend(plots_for_legend, keys_for_legend, 'right', fontsize = 10)
    plt.suptitle(this_plot_title, fontsize = 16, fontweight = 'bold', y = 0.99)
    plt.subplots_adjust(left = 0.06, right = 0.92, top = 0.87)

    if save_plot:
        plot_path = 'test_gplus/%s_%s_+%d_-%d.png' % (attr_type, attr, pos_seeds, neg_seeds)
        plt.savefig(plot_path)
    plt.show(block = False)
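
The colors dict in this example spreads the per-classifier line colours evenly across a matplotlib colormap by indexing into its lookup table. A small sketch of that indexing, assuming four colour slots:

import matplotlib.pyplot as plt

cmap = plt.cm.gist_ncar
num_colors = 4  # e.g. one colour per classifier
# Pick num_colors evenly spaced entries from the colormap's cmap.N lookup values,
# skipping both endpoints (j + 1 runs from 1 to num_colors).
colors = {j: cmap(int((j + 1) * cmap.N / (num_colors + 1.0)))
          for j in range(num_colors)}
print(colors[0])  # an RGBA tuple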
Example #5
def main():
    p = optparse.OptionParser()
    p.add_option('--max_count_features', '-m', type = int, default = 1000, help = 'max number of count features for baseline1')
    p.add_option('--sim', type = str, default = 'NPMI1s', help = 'similarity operation (PMIs, NPMI1s, prob)')
    p.add_option('--delta', '-d', type = float, default = 0.0, help = 'smoothing parameter')
    p.add_option('-k', type = int, default = 200, help = 'number of eigenvalues')
    p.add_option('--sphere', '-s', action = 'store_true', default = False, help = 'normalize in sphere')
    p.add_option('--steps', type = int, default = 1, help = 'number of random walk steps')
    p.add_option('--rwcontext', action = 'store_true', default = False, help = 'use context only in random walk')
    p.add_option('--save', '-v', action = 'store_true', default = False, help = 'save plot')

    opts, args = p.parse_args()

    max_count_features, sim, delta, k, sphere, steps, rwcontext, save = opts.max_count_features, opts.sim, opts.delta, opts.k, opts.sphere, opts.steps, opts.rwcontext, opts.save

    selected_attrs = pd.read_csv('selected_attrs.csv')

    fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2, 2, sharex = False, sharey = False, figsize = (12, 8), facecolor = 'white')
    axes = [ax0, ax1, ax2, ax3]

    for (ctr, attr_type) in enumerate(attr_types):
        print(attr_type)
        attrs_for_type = selected_attrs[selected_attrs['attributeType'] == attr_type]

        results_df = pd.DataFrame()
        results_df['baseline'] = [('baseline%s' % b) for b in baselines] * len(ns)
        results_df['n'] = list(itertools.chain(*[[n for b in baselines] for n in ns]))
        for rank in ranks:
            results_df[rank] = np.zeros(len(baselines) * len(ns), dtype = int)

        for (i, n) in enumerate(ns):
            for (attr, freq) in zip(attrs_for_type['attribute'], attrs_for_type['freq']):
                if (2 * n <= freq):
                    max_mean_prec_df = pd.DataFrame(columns = [('baseline%s' % b) for b in baselines])
                    for b in baselines:
                        if (b == '1'):
                            df = pd.read_csv('gplus0_lcc/baseline1/%s_%s_n%d_m%d_precision.csv' % (attr_type, attr, n, max_count_features))
                            max_mean_prec_df['baseline1'] = df['mean_logreg_prec']  # logistic regression only
                        elif (b[:2] == '12'):
                            df = pd.read_csv('gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_m%d_precision.csv' % (b, attr_type, attr, n, embedding, k, '_normalize' if sphere else '', max_count_features))
                            max_mean_prec_df['baseline%s' % b] = df['mean_logreg_prec']  # logistic regression only
                        elif (b == '6'):
                            df = pd.read_csv('gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv' % (attr_type, attr, n, sim, delta))
                            max_mean_prec_df['baseline6'] = df[str(('mean', steps, 'context' if rwcontext else 'both'))]
                        else:
                            df = pd.read_csv('gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv' % (b, attr_type, attr, n, embedding, k, '_normalize' if sphere else ''))
                            max_mean_prec_df['baseline%s' % b] = df['mean_logreg_prec']  # logistic regression only
                        #max_mean_prec_df['baseline%s' % b] = df['max_mean_prec']

                    for rank in ranks:
                        (best_index, best_prec) = max(enumerate(max_mean_prec_df.loc[rank - 1]), key = lambda pair : pair[1])
                        results_df.loc[len(baselines) * i + best_index, rank] += 1  # .loc replaces the removed DataFrame.ix indexer

        results_agg_df = results_df.drop(['n'], axis = 1).groupby('baseline').sum()

        print(results_agg_df)

        ind = np.arange(len(ranks))
        width = 0.5
        plots = []
        plot = axes[ctr].bar(range(len(ranks)), results_agg_df.loc['baseline1'], width = width, color = colors[0])
        plots.append(plot)
        cumsums = deepcopy(np.asarray(results_agg_df.loc['baseline1']))
        for (b, c) in zip(baselines[1:], colors[1:]):
            plot = axes[ctr].bar(range(len(ranks)), results_agg_df.loc['baseline%s' % b], width = width, bottom = cumsums, color = c)
            plots.append(plot)
            cumsums += np.asarray(results_agg_df.loc['baseline%s' % b])
        axes[ctr].set_xlim((-0.5, len(ranks)))
        axes[ctr].set_title(attr_type.replace('_', ' '))

    plt.setp(axes, xticks = ind + width / 2, xticklabels = results_agg_df.columns, yticks = np.arange(0, 35, 5))
    fig.text(0.5, 0.04, 'rank', ha = 'center', fontsize = 14)
    fig.text(0.07, 0.5, 'wins', va = 'center', rotation = 'vertical', fontsize = 14)
    plt.figlegend(plots, keys, 'center')
    plt.suptitle("Relative performance of baselines", fontsize = 16, fontweight = 'bold')
    plt.subplots_adjust(wspace = 0.64, hspace = 0.58)
    for ax in axes:
        rstyle(ax)

    if save:
        filename = 'gplus0_lcc/compare/wins/m%d_%s_k%d%s_steps%d%s_baseline%s_wins.png' % (max_count_features, embedding, k,  '_normalize' if sphere else '', steps, '_rwcontext' if rwcontext else '', '_'.join(baselines))
        plt.savefig(filename)
    else:
        plt.show()
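
The 'wins' chart stacks one bar series per baseline by passing the running column totals as bottom= to each subsequent bar call. A minimal sketch with hypothetical win counts:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical win counts per rank for three baselines.
ranks = [1, 5, 10]
wins = {'baseline1': np.array([2, 1, 0]),
        'baseline2': np.array([1, 3, 2]),
        'baseline6': np.array([0, 1, 3])}

fig, ax = plt.subplots()
bottom = np.zeros(len(ranks))
for name, counts in wins.items():
    ax.bar(range(len(ranks)), counts, width=0.5, bottom=bottom, label=name)
    bottom += counts                    # the next series stacks on top of this one
ax.set_xticks(range(len(ranks)))
ax.set_xticklabels(ranks)
ax.set_xlabel('rank')
ax.set_ylabel('wins')
ax.legend()
plt.show()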
Example #6
def main():
    p = optparse.OptionParser()
    p.add_option("--attr", "-a", type=str, help="attribute")
    p.add_option("--attr_type", "-t", type=str, help="attribute type")
    p.add_option(
        "--num_train_each",
        "-n",
        type=int,
        help="number of training samples of True and False for the attribute (for total of 2n training samples)",
    )
    p.add_option(
        "--max_count_features", "-m", type=int, default=1000, help="max number of count features for baseline1"
    )
    p.add_option("--embedding", "-e", type=str, default="adj", help="embedding (adj, adj+diag, normlap, regnormlap)")
    p.add_option("--sim", type=str, default="NPMI1s", help="similarity operation (PMIs, NPMI1s, prob)")
    p.add_option("--delta", "-d", type=float, default=0.0, help="smoothing parameter")
    p.add_option("-k", type=int, default=200, help="number of eigenvalues")
    p.add_option("--sphere", "-s", action="store_true", default=False, help="normalize in sphere")
    p.add_option("--steps", type=int, default=1, help="number of random walk steps")
    p.add_option("--rwcontext", action="store_true", default=False, help="use context only in random walk")
    p.add_option("-v", action="store_true", default=False, help="save results")
    p.add_option("-N", type=int, default=500, help="top N precisions to display")

    opts, args = p.parse_args()

    attr, attr_type, num_train_each, max_count_features, embedding, sim, delta, k, sphere, steps, rwcontext, save, N = (
        opts.attr,
        opts.attr_type,
        opts.num_train_each,
        opts.max_count_features,
        opts.embedding,
        opts.sim,
        opts.delta,
        opts.k,
        opts.sphere,
        opts.steps,
        opts.rwcontext,
        opts.v,
        opts.N,
    )

    max_mean_prec_df = pd.DataFrame(columns=["rank"] + [("baseline%s" % b) for b in baselines])

    for b in baselines:
        if b == "1":
            df = pd.read_csv(
                "gplus0_lcc/baseline1/%s_%s_n%d_m%d_precision.csv"
                % (attr_type, attr, num_train_each, max_count_features)
            )
            max_mean_prec_df["baseline1"] = df["mean_logreg_prec"][:N]  # logistic regression only
        elif b[:2] == "12":
            df = pd.read_csv(
                "gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_m%d_precision.csv"
                % (b, attr_type, attr, num_train_each, embedding, k, "_normalize" if sphere else "", max_count_features)
            )
            max_mean_prec_df["baseline%s" % b] = df["mean_logreg_prec"][:N]  # logistic regression only
        elif b == "6":
            df = pd.read_csv(
                "gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv"
                % (attr_type, attr, num_train_each, sim, delta)
            )
            filename = "gplus0_lcc/baseline6/%s_%s_n%d_%s_delta%s_precision.csv" % (
                attr_type,
                attr,
                num_train_each,
                sim,
                delta,
            )
            if "max_mean_prec" not in df.columns:
                cols = [col for col in df.columns if "mean" in col]
                df["max_mean_prec"] = df[cols].max(axis=1)
                df.to_csv(filename, index=False)
            max_mean_prec_df["baseline6"] = df[str(("mean", steps, "context" if rwcontext else "both"))][:N]
        else:
            df = pd.read_csv(
                "gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv"
                % (b, attr_type, attr, num_train_each, embedding, k, "_normalize" if sphere else "")
            )
            filename = "gplus0_lcc/baseline%s/%s_%s_n%d_%s_k%d%s_precision.csv" % (
                b,
                attr_type,
                attr,
                num_train_each,
                embedding,
                k,
                "_normalize" if sphere else "",
            )
            if "max_mean_prec" not in df.columns:
                df["max_mean_prec"] = df[["mean_rfc_prec", "mean_boost_prec", "mean_logreg_prec", "mean_gnb_prec"]].max(
                    axis=1
                )
                df.to_csv(filename, index=False)
            max_mean_prec_df["baseline%s" % b] = df["mean_logreg_prec"][:N]  # logistic regression only
        # max_mean_prec_df['baseline%s' % b] = df['max_mean_prec'][:N]  # max of all systems

    selected_attrs = pd.read_csv("selected_attrs.csv")
    row = selected_attrs[selected_attrs["attribute"] == attr].iloc[0]
    num_true_in_test = row["freq"] - num_train_each
    num_test = row["totalKnown"] - 2 * num_train_each
    guess_rate = num_true_in_test / num_test
    max_mean_prec_df["guess"] = guess_rate

    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_axes([0.1, 0.1, 0.7, 0.8])
    plots = []
    axes = []
    plots.append(
        ax.plot(max_mean_prec_df.index, max_mean_prec_df["guess"], color="black", linewidth=4, linestyle="dashed")[0]
    )
    plots[-1].set_dash_capstyle("projecting")
    axes.append(plt.gca())
    for (b, c, sty) in zip(baselines, colors, linestyles):
        plots.append(
            ax.plot(max_mean_prec_df.index, max_mean_prec_df["baseline%s" % b], color=c, linestyle=sty, linewidth=4)[0]
        )
        if sty == "dashed":
            plots[-1].set_dash_capstyle("projecting")
        axes.append(plt.gca())
    plt.xlabel("rank", fontsize=14, labelpad=8)
    plt.ylabel("precision", fontsize=14, labelpad=12)
    plt.title(
        "Best nomination precision\n%s: %s" % (attr_type.replace("_", " "), attr),
        fontsize=16,
        fontweight="bold",
        y=1.02,
    )
    plt.setp(axes, xticks=np.arange(0, N + 1, 100))  # , yticks = np.arange(0, 1.1, 0.25))
    # plt.ylim((0.0, 1.0))
    plt.legend(plots, ["guess"] + keys, loc=(1.01, 0.5))
    for a in axes:
        rstyle(a)

    filename = "gplus0_lcc/compare/prec/%s_%s_n%d_m%d_%s_%s_delta%s_k%d%s_steps%d%s_mean_prec.png" % (
        attr,
        attr_type,
        num_train_each,
        max_count_features,
        embedding,
        sim,
        str(delta),
        k,
        "_normalize" if sphere else "",
        steps,
        "_rwcontext" if rwcontext else "",
    )

    if save:
        plt.savefig(filename)
    else:
        plt.show()