feature_type='chars', ngram_range=4, sample_authors=False, sample_features=False, n_features=10000, n_dev_pairs=1000000000000, vector_space_model='std', balanced_pairs=False) verifier.vectorize(verif_dataset) dev_results = verifier.fit() dev_f, dev_p, dev_r, dev_t = evaluate(dev_results) max_f = np.nanmax(dev_f) print('\t\t + F1 (pairwise):', max_f) print('getting distance table') df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, 'dev') df.to_csv('outputs/dm_no_sampl.csv') print('saved dist table!') df = pd.read_csv('outputs/dm_no_sampl.csv') df = df.set_index('id') print('loaded dist table!') top_scores = 0 if top_scores: scores = [] for author1 in df.index: for author2 in df.columns: if author1.split('_', 1)[0] != author2.split('_', 1)[0]: sc = df[author1][author2] if sc > 0:
# Train/test verification run followed by a correlation heatmap of the
# resulting test distance table.
verifier = Verification(
    random_state=1000,
    metric=dm,
    n_features=10000,
    n_dev_pairs=0,
    # NOTE(review): very large value, presumably "use every test pair" --
    # confirm against Verification.
    n_test_pairs=99999999,
    vector_space_model=vsm,
    balanced_pairs=False,
    control_pairs=False,
)
logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)

logging.info("Computing results")
test_df = verifier.get_distance_table(
    verifier.test_dists, verifier.test_pairs, "test"
)
# The table is written to disk and read straight back, so everything below
# operates on the CSV representation, re-indexed by its "id" column.
test_df.to_csv("../outputs/caesar_test.csv")
test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Scale each cell by 1000 and truncate to int, then correlate the columns.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map; kept here so older pandas versions keep working.
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()

# heatmap plotting:
# `seaborn.plt` is not available in current seaborn releases, so work with
# the Axes that heatmap() returns (the current axes) and its owning figure
# instead of going through `sb.plt`.
ax = sb.heatmap(test_df)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontname('Arial')
    label.set_fontsize(3)
heatmap_fig = ax.figure
heatmap_fig.savefig("../outputs/caesar_imposter_heatmap.pdf")
# Clear the figure so a later plot does not draw on top of this heatmap.
heatmap_fig.clf()
# Fit/predict on the dev and test sets, dump the test distance table to CSV,
# and render a heatmap of the column correlations of that table.
verifier = Verification(
    random_state=1000,
    metric=dm,
    n_features=10000,
    n_dev_pairs=0,
    # NOTE(review): very large value, presumably "use every test pair" --
    # confirm against Verification.
    n_test_pairs=99999999,
    vector_space_model=vsm,
    balanced_pairs=False,
    control_pairs=False,
)
logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)

logging.info("Computing results")
test_df = verifier.get_distance_table(
    verifier.test_dists, verifier.test_pairs, "test"
)
# Round-trip through the CSV file: the frame used below is the one read back
# from disk, indexed by its "id" column.
test_df.to_csv("../outputs/caesar_test.csv")
test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Cells are multiplied by 1000 and truncated to int before computing the
# pairwise column correlation matrix.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map; kept here so older pandas versions keep working.
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()

# heatmap plotting:
# Use the Axes returned by heatmap() and its figure rather than `sb.plt`:
# the `seaborn.plt` attribute is not available in current seaborn releases.
ax = sb.heatmap(test_df)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontname('Arial')
    label.set_fontsize(3)
heatmap_fig = ax.figure
heatmap_fig.savefig("../outputs/caesar_imposter_heatmap.pdf")
# Clear the figure so a later plot does not draw on top of this heatmap.
heatmap_fig.clf()
metric=dm, sample_authors=False, n_features=5000, n_dev_pairs=10000, em_iterations=100, vector_space_model=vsm, weight=0.2, eps=0.01, norm="l2", balanced_pairs=True) logging.info("Starting verification [train / test]") verifier.vectorize(X_dev) dev_results = verifier.fit() logging.info("Computing results") test_df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, "dev") ax = plt.Subplot(fig, outer_grid[cnt]) linkage_matrix = linkage(test_df, 'complete') f = dendrogram(linkage_matrix, truncate_mode='lastp', show_leaf_counts=True, ax=ax, orientation='right', labels=test_df.columns, leaf_font_size=0.5, link_color_func=None, color_threshold=np.inf) tickL = ax.yaxis.get_ticklabels() for t in tickL: t.set_fontsize(1) t.set_color('grey')