metric="minmax", feature_type="words", sample_authors=False, sample_features=False, n_features=n_features, n_test_pairs=n_pairs, n_dev_pairs=n_pairs, em_iterations=100, vector_space_model="std", weight=0.2, eps=0.01, norm="l2", balanced_pairs=False) logging.info("Starting verification [dev / test]") verifier.vectorize(X_dev, X_test) dev_results, test_results = verifier.predict() logging.info("Computing results") # first prec rec curve of test results: test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results) fig = sb.plt.figure() sb.plt.xlabel("recall", fontsize=10) sb.plt.ylabel("precision", fontsize=10) sb.plt.xlim(0.4, 1) sb.plt.ylim(0.4, 1.05) sb.plt.plot(test_Rs, test_Ps, label="baseline") # get max for dev: dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(test_results) best_t = dev_Ts[np.nanargmax(dev_Fs)] baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results, t=best_t)
print dm print vsm verifier = Verification(random_state=1000, metric=dm, n_features=10000, n_dev_pairs=0, n_test_pairs=99999999, vector_space_model=vsm, balanced_pairs=False, control_pairs=False) logging.info("Starting verification [train / test]") verifier.vectorize(X_dev, X_test) train_results, test_results = verifier.predict(filter_imposters=False) logging.info("Computing results") test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs, "test") test_df.to_csv("../outputs/caesar_test.csv") test_df = pd.read_csv("../outputs/caesar_test.csv") test_df = test_df.set_index("id") test_df = test_df.applymap(lambda x: int(x * 1000)).corr() # heatmap plotting: sb.heatmap(test_df) ax = sb.plt.gca() for label in (ax.get_xticklabels() + ax.get_yticklabels()): label.set_fontname('Arial')
# NOTE(review): this excerpt begins mid-call (the opening
# `Verification(` precedes it) and is also truncated mid-call at its end.
metric="minmax", feature_type="words", sample_authors=False,
sample_features=False, n_features=n_features, n_test_pairs=n_pairs,
n_dev_pairs=n_pairs, em_iterations=100, vector_space_model="std",
weight=0.2, eps=0.01, norm="l2", balanced_pairs=False)
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")
# first prec rec curve of test results:
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
sb.plt.ylabel("precision", fontsize=10)
sb.plt.xlim(0.4, 1)
sb.plt.ylim(0.4, 1.05)
sb.plt.plot(test_Rs, test_Ps, label="baseline")
# get max for dev:
# NOTE(review): `evaluate(test_results)` below looks like a copy-paste
# bug — the dev threshold is presumably meant to come from
# `evaluate(dev_results)`; confirm against the original script.
dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(test_results)
best_t = dev_Ts[np.nanargmax(dev_Fs)]
# Truncated here in the source: the call continues past this excerpt
# (the parallel chunk reads `evaluate_with_threshold(test_results, t=best_t)`).
baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results,
print dm print vsm verifier = Verification(random_state=1000, metric=dm, n_features=10000, n_dev_pairs=0, n_test_pairs=99999999, vector_space_model=vsm, balanced_pairs=False, control_pairs=False) logging.info("Starting verification [train / test]") verifier.vectorize(X_dev, X_test) train_results, test_results = verifier.predict(filter_imposters=False) logging.info("Computing results") test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs, "test") test_df.to_csv("../outputs/caesar_test.csv") test_df = pd.read_csv("../outputs/caesar_test.csv") test_df = test_df.set_index("id") test_df = test_df.applymap(lambda x:int(x*1000)).corr() # heatmap plotting: sb.heatmap(test_df) ax = sb.plt.gca() for label in (ax.get_xticklabels() + ax.get_yticklabels()): label.set_fontname('Arial') label.set_fontsize(3)