def make_plot_from_text(sim_function, texts=['../data/enwiki_pairs.txt'], labels=['pairs'], colors=['red'], N=5100000, w2v_model=None, docfreqs=None, output='plot.png'):
    """Plot step histograms of the values computed for each text file.

    Every path in ``texts`` is handed to ``process_pairs`` together with
    ``sim_function``, ``N``, ``w2v_model`` and ``docfreqs``. All results are
    drawn into a single figure (300 bins, ``histtype='step'``) that is saved
    to ``output``. When at least two files were processed, the optimal error
    rate and split point between the first two results are printed.

    Args:
        sim_function: Forwarded unchanged to ``process_pairs`` (presumably
            the similarity function applied to each pair -- confirm there).
        texts: Paths of the input files, one data set per path.
        labels: Histogram labels, passed to ``plt.hist`` as ``label``.
        colors: Histogram colors, passed to ``plt.hist`` as ``color``.
        N: Forwarded unchanged to ``process_pairs`` (looks like a limit on
            the number of pairs -- TODO confirm).
        w2v_model: Forwarded unchanged to ``process_pairs``.
        docfreqs: Forwarded unchanged to ``process_pairs``.
        output: Path the figure is written to with ``plt.savefig``.

    Returns:
        None. The figure is written to ``output`` and the error statistics
        go to standard output.
    """
    # The list defaults above are only read, never mutated, so sharing them
    # between calls is safe.
    data = [process_pairs(sim_function, path, N, w2v_model, docfreqs)
            for path in texts]

    plt.clf()
    # NOTE(review): newer matplotlib releases replaced ``normed`` with
    # ``density``; kept as-is to match the matplotlib version this file
    # targets -- confirm before upgrading.
    # No legend is drawn; the labels are still attached to the histograms.
    plt.hist(data, 300, normed=1, histtype='step', label=labels, color=colors)
    plt.savefig(output)

    # The error rate compares the first two data sets. With a single input
    # (which is what the default ``texts`` provides) there is nothing to
    # compare, so stop after saving the plot instead of failing on data[1].
    if len(data) < 2:
        return

    print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    print('Optimal error: %.5f' % error)
    print('Optimal split point: %.5f' % split)
def calculate_split_from_table(tables=['../data/tfidf-pairs.txt'], verbose=True, normalize=(0.0, 1.0)):
    """Compute the optimal split point between the first two stored tables.

    Each path in ``tables`` is loaded with ``np.load``. Every loaded table is
    rescaled as ``(table - normalize[0]) / (normalize[1] - normalize[0])``
    and the first two rescaled tables are passed to
    ``metrics.optimal_error_rate``.

    Args:
        tables: Paths of files readable by ``np.load``. At least two are
            required; note that the default contains only one.
        verbose: When true, print progress, the optimal error and the
            optimal split point.
        normalize: ``(low, high)`` pair used for the rescaling above. The
            default ``(0.0, 1.0)`` leaves the values unchanged.

    Returns:
        The split point returned by ``metrics.optimal_error_rate``.

    Raises:
        IndexError: If fewer than two tables were supplied.
    """
    # The list default above is only read, never mutated.
    data = []
    for path in tables:
        # Close each file as soon as its contents are loaded instead of
        # leaving the handle open until garbage collection.
        with open(path, 'rb') as f:
            data.append(np.load(f))

    data = [(table - normalize[0]) / (normalize[1] - normalize[0])
            for table in data]

    if verbose:
        print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    if verbose:
        print('Optimal error: %.5f' % error)
        print('Optimal split point: %.5f' % split)
    return split