# Experiment: precision/recall on the gr_articles corpus (min-max metric, std vector space).
# Imports: Verification / prepare_corpus / split_corpus follow the package layout used in
# these scripts; the module path for evaluate() is assumed.
import logging

import seaborn as sb

from verification.verification import Verification
from verification.preprocessing import prepare_corpus, split_corpus
from verification.evaluation import evaluate  # assumed module path

random_state = 1000
data_path = "../data/"
corpus = "gr_articles"
n_pairs = 100000
n_features = 10000

logging.info("preparing corpus")
X_dev, X_test = split_corpus(prepare_corpus(data_path + corpus),
                             controlled="authors",
                             random_state=random_state)

verifier = Verification(random_state=random_state,
                        metric="minmax",
                        feature_type="words",
                        sample_authors=False,
                        sample_features=False,
                        n_features=n_features,
                        n_test_pairs=n_pairs,
                        n_dev_pairs=n_pairs,
                        em_iterations=100,
                        vector_space_model="std",
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False)

logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()

logging.info("Computing results")
# first, the precision-recall curve of the test results:
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)

fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
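# Sketch (assumption): the rest of this figure is reconstructed here; it plots the
# precision-recall pairs returned by evaluate() and saves the figure to a hypothetical path.
sb.plt.ylabel("precision", fontsize=10)
sb.plt.plot(test_Rs, test_Ps, color="darkgrey")
sb.plt.xlim(0, 1.005)
sb.plt.ylim(0, 1.005)
sb.plt.savefig("../outputs/prec_rec_test.pdf")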
# first baseline: grid over vector space models and distance metrics
df = pd.DataFrame(columns=["vector space model"] + list(dms))

for vsm_cnt, vsm in enumerate(vsms):
    print("\t+ " + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print("* " + dm)
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        max_f = np.nanmax(dev_f)
        print("\t\t + F1: " + str(max_f))
        fscore_row.append(format(max_f * 100, '.1f'))
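# Sketch (assumption, not in the original script): collect each completed fscore_row
# into the summary DataFrame and write it out; the output path is hypothetical.
    df.loc[vsm_cnt] = fscore_row  # one row of F1 scores per vector space model
df.to_csv("../outputs/baseline_fscores.csv", index=False)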
# grid over the number of sampling iterations and the feature sampling proportion
df_dev = pd.DataFrame(columns=["nb_iterations"] + [str(n) for n in prop_ranges])

for i, iteration in enumerate(iteration_ranges):
    dev_row = [str(iteration)]
    print("* nr of sampling iterations: " + str(iteration))
    for prop in prop_ranges:
        print("\t+ sampling proportion: " + str(prop))
        verifier = Verification(n_features=mfw,
                                feature_type="words",
                                random_prop=prop,
                                sample_features=True,
                                sample_authors=True,
                                metric=dm,
                                text_cutoff=None,
                                sample_iterations=iteration,
                                n_potential_imposters=60,
                                n_actual_imposters=10,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                random_state=random_state,
                                top_rank=10,
                                vector_space_model=vsm,
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()
        logging.info("Computing results")
        # get dev results:
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        max_f = np.nanmax(dev_f)
        dev_row.append(max_f)
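# Sketch (assumption): store the completed row and visualise the resulting
# iterations-by-proportion grid of F1 scores as a heatmap; the path is hypothetical.
    df_dev.loc[i] = dev_row
sb.heatmap(df_dev.set_index("nb_iterations").astype(float), annot=True)
sb.plt.savefig("../outputs/sampling_grid.pdf")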
# set fig params
fig = sb.plt.figure(figsize=(len(vsms), len(dms)))
cnt = 0
outer_grid = gridspec.GridSpec(len(vsms), len(dms), wspace=0.1, hspace=0.1)
c1, c2 = sb.color_palette("Set1")[:2]

for vsm_cnt, vsm in enumerate(vsms):
    print("\t+ " + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print("\t\t* " + dm)
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        max_f = np.nanmax(dev_f)
        print("\t\t\t+ F1: " + str(max_f))
        fscore_row.append(format(max_f * 100, '.1f'))
        # distribution of scores:
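# Sketch (assumption): one way to draw the score-distribution panel in this grid cell;
# it treats dev_results as an iterable of (same_author, score) pairs, which is an assumption.
        ax = sb.plt.Subplot(fig, outer_grid[cnt])
        same_author = [score for label, score in dev_results if label]
        diff_author = [score for label, score in dev_results if not label]
        ax.hist(same_author, color=c1, alpha=0.5)
        ax.hist(diff_author, color=c2, alpha=0.5)
        fig.add_subplot(ax)
        cnt += 1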
corpus = 'soldier_letters'

logging.info('preparing corpus')
verif_dataset = prepare_corpus(data_path + corpus)

fit = False  # set to True to re-fit the vectorizer and re-compute the distance table
if fit:
    """
    We fit a vectorizer with the best parametrization we obtained
    during the development phase.
    """
    verifier = Verification(random_state=1066,
                            metric='minmax',
                            feature_type='chars',
                            ngram_range=4,
                            sample_authors=False,
                            sample_features=False,
                            n_features=10000,
                            n_dev_pairs=1000000000000,  # effectively: all available pairs
                            vector_space_model='std',
                            balanced_pairs=False)
    verifier.vectorize(verif_dataset)
    dev_results = verifier.fit()
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    max_f = np.nanmax(dev_f)
    print('\t\t + F1 (pairwise):', max_f)
    print('getting distance table')
    df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, 'dev')
    df.to_csv('outputs/dm_no_sampl.csv')
    print('saved dist table!')
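# Sketch (assumption): when fit is False, re-load the previously saved distance table
# instead of re-computing it; the 'id' index column mirrors the caesar script further down.
else:
    df = pd.read_csv('outputs/dm_no_sampl.csv')
    df = df.set_index('id')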
c1, c2 = sb.color_palette('Set1')[:2]

# first baseline:
df = pd.DataFrame(columns=['vector space model'] + list(dms))

for vsm_cnt, vsm in enumerate(vsms):
    print('\t+ ' + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* ' + dm)
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type='chars',
                                ngram_range=4,
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=None,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        max_f = np.nanmax(dev_f)
        print('\t\t + F1: ', max_f)
        fscore_row.append(format(max_f * 100, '.1f'))
        # distribution of scores:
import pandas as pd

from verification.verification import Verification
from verification.preprocessing import prepare_corpus, split_corpus

data_path = "../data/"
corpora = ["du_essays", "gr_articles", "sp_articles"]
random_state = 1000

# corpus statistics, incl. same-author (SADPs) and different-author (DADPs) document pairs
df = pd.DataFrame(columns=["name", "total words", "unique words",
                           "authors", "docs", "SADPs", "DADPs"])

for corpus in corpora:
    print("=== " + corpus + " ===")
    # prepare data:
    X_dev, X_test = split_corpus(prepare_corpus(data_path + corpus),
                                 controlled="authors",
                                 random_state=random_state)
    # first *all* pairs
    verifier = Verification(random_state=random_state,
                            n_test_pairs=None,
                            n_dev_pairs=None,
                            balanced_pairs=False)
    verifier.vectorize(X_dev, X_test)
    # first the dev data:
    nr_docs = len(X_dev.texts)
    total_nr_words = sum(len(doc) for doc in X_dev.texts)
    unique_words = len(set(sum(X_dev.texts, [])))
    distinct_authors = len(set(X_dev.authors))
    dev_pairs = verifier._setup_pairs(phase="dev")
    SADPs, DADPs = 0, 0
    for (i, j) in dev_pairs:
        if X_dev.authors[i] == X_dev.authors[j]:
            SADPs += 1
        else:
            DADPs += 1
    row = [corpus + " (dev)", total_nr_words, unique_words,
           distinct_authors, nr_docs, SADPs, DADPs]
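# Sketch (assumption): append the per-corpus row to the statistics table and, once all
# corpora are processed, print it; the LaTeX output path is hypothetical.
    df.loc[len(df)] = row
print(df.to_string(index=False))
df.to_latex("../outputs/corpus_stats.tex", index=False)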
vsms = ('std', 'plm', 'tf')

fig = plt.figure()
cnt = 0
outer_grid = gridspec.GridSpec(len(dms), len(vsms))

for dm_cnt, dm in enumerate(dms):
    print(dm)
    for vsm_cnt, vsm in enumerate(vsms):
        print(vsm)
        verifier = Verification(random_state=1000,
                                sample_features=False,
                                metric=dm,
                                sample_authors=False,
                                n_features=5000,
                                n_dev_pairs=10000,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [train / test]")
        verifier.vectorize(X_dev)
        dev_results = verifier.fit()
        logging.info("Computing results")
        test_df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, "dev")
        # cluster the pairwise distance table and draw a dendrogram in this grid cell:
        ax = plt.Subplot(fig, outer_grid[cnt])
        linkage_matrix = linkage(test_df, 'complete')
        f = dendrogram(linkage_matrix,
                       truncate_mode='lastp',
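# Sketch (assumption): a minimal way to finish the dendrogram call and place it in the
# grid; the ax argument and the bookkeeping below are assumptions, not the original code.
                       ax=ax)
        fig.add_subplot(ax)
        cnt += 1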
# we prepare the corpus
logging.info("preparing corpus")
X_dev = prepare_corpus(test)
X_test = prepare_corpus(test)

dm = 'minmax'
vsm = 'tf'
print(dm)
print(vsm)

verifier = Verification(random_state=1000,
                        metric=dm,
                        n_features=10000,
                        n_dev_pairs=0,
                        n_test_pairs=99999999,  # effectively: all available pairs
                        vector_space_model=vsm,
                        balanced_pairs=False,
                        control_pairs=False)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)

logging.info("Computing results")
test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs, "test")
test_df.to_csv("../outputs/caesar_test.csv")

test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()
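# Sketch (assumption): visualise the resulting correlation matrix with a clustered
# heatmap; sb.clustermap is a standard seaborn call, the output filename is hypothetical.
cm = sb.clustermap(test_df)
cm.savefig("../outputs/caesar_corr.pdf")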
for ftype in ftypes:
    # first baseline:
    df = pd.DataFrame(columns=["vector space model"] + list(dms))
    for vsm_cnt, vsm in enumerate(vsms):
        print("\t+ " + vsm)
        fscore_row = [vsm]
        for dm_cnt, dm in enumerate(dms):
            print("\t\t* " + dm)
            verifier = Verification(random_state=random_state,
                                    metric=dm,
                                    feature_type=ftype,
                                    sample_authors=False,
                                    sample_features=False,
                                    n_features=n_features,
                                    n_test_pairs=0,
                                    ngram_range=4,
                                    n_dev_pairs=n_dev_pairs,
                                    vector_space_model=vsm,
                                    balanced_pairs=True)
            logging.info("Starting verification [dev / test]")
            verifier.vectorize(X_dev, X_test)
            dev_results = verifier.fit()
            logging.info("Computing results")
            dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
            max_f = np.nanmax(dev_f)
            print("\t\t\t + F1: " + str(max_f))
            fscore_row.append(format(max_f * 100, '.1f'))