Beispiel #1
0
                        n_features=n_features,
                        n_test_pairs=n_pairs,
                        n_dev_pairs=n_pairs,
                        em_iterations=100,
                        vector_space_model="std",
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False)
# Run the baseline verifier on both splits; predict() hands back one result
# collection for the dev pairs and one for the test pairs.
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")

# Precision-recall curve of the test results, drawn as the "baseline" line.
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
sb.plt.ylabel("precision", fontsize=10)
sb.plt.xlim(0.4, 1)
sb.plt.ylim(0.4, 1.05)
sb.plt.plot(test_Rs, test_Ps, label="baseline")

# Pick the decision threshold on the dev results only.  It was previously
# derived from test_results, which tuned the threshold on the very data it is
# scored on below and therefore overstated the baseline F1.
dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(dev_results)
# nanargmax skips thresholds whose F-score is NaN.
best_t = dev_Ts[np.nanargmax(dev_Fs)]
# Score the held-out test results at the dev-selected threshold.
baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results, t=best_t)
# Single-argument parenthesised print: same output on Python 2 and Python 3.
print("Baseline F1: " + str(baseline_test_f))

# NOTE(review): this excerpt appears to be the body of a loop whose header is
# not visible here (cnt, vsm, fscore_row, outer_grid, c1 and c2 come from
# outside) -- confirm against the full script.
# Build a verifier that uses the "minmax" metric with feature sampling
# switched off, balanced pairs, and no test pairs (dev-only run).
verifier = Verification(random_state=random_state,
                        metric="minmax",
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        # Only the dev results are collected in this run.
        dev_results = verifier.fit()

        logging.info("Computing results")
        # evaluate() yields four sequences; the first is treated as F-scores.
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F-score over all entries, ignoring NaN values.
        max_f = np.nanmax(dev_f)
        print "\t\t + F1: "+str(max_f)
        # Store the score as a percentage string with one decimal place.
        fscore_row.append(format(max_f*100, '.1f'))

        # distribution of scores:
        # dev_results is iterated as (label, score) pairs; split the scores
        # by their "same_author" / "diff_author" label.
        same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"])

        # Compare the two score samples; presumably scipy.stats.ks_2samp
        # (two-sample Kolmogorov-Smirnov test) -- confirm against the imports.
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print "\t\t- KS: D = "+str(D)+" (p = "+str(p)+")"
        # Plot both score distributions as density curves in grid cell `cnt`.
        # NOTE(review): sb is presumably seaborn; the Subplot is not attached
        # to `fig` in the visible lines -- presumably done after this excerpt.
        sb.set_style("dark")
        ax = sb.plt.Subplot(fig, outer_grid[cnt])
        ax.set_xlim([0, 1])
        sb.kdeplot(diff_author_densities, shade=True, legend=False, c=c1, ax=ax, lw=0.5)
        sb.kdeplot(same_author_densities, shade=True, legend=False, c=c2, ax=ax, lw=0.5)
Beispiel #3
0
                        n_features=n_features,
                        n_test_pairs=n_pairs,
                        n_dev_pairs=n_pairs,
                        em_iterations=100,
                        vector_space_model="std",
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False)
# Run the baseline verifier on both splits; predict() hands back one result
# collection for the dev pairs and one for the test pairs.
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")

# Precision-recall curve of the test results, drawn as the "baseline" line.
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
sb.plt.ylabel("precision", fontsize=10)
sb.plt.xlim(0.4, 1)
sb.plt.ylim(0.4, 1.05)
sb.plt.plot(test_Rs, test_Ps, label="baseline")

# Pick the decision threshold on the dev results only.  It was previously
# derived from test_results, which tuned the threshold on the very data it is
# scored on below and therefore overstated the baseline F1.
dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(dev_results)
# nanargmax skips thresholds whose F-score is NaN.
best_t = dev_Ts[np.nanargmax(dev_Fs)]
# Score the held-out test results at the dev-selected threshold.
baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results,
                                                          t=best_t)
# Single-argument parenthesised print: same output on Python 2 and Python 3.
print("Baseline F1: " + str(baseline_test_f))

verifier = Verification(random_state=random_state,