# min, max = sp.calculate_min_max_from_table(tables)
# split = sp.calculate_split_from_table(tables, verbose=False, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google-val.png', log=False, normalize=(min, max))
# tables = ['../data/google/pairs/sets/wmean_no_pairs_r-test.npy', '../data/google/pairs/sets/wmean_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tables, split, normalize=(min, max))
# sp.calculate_hellinger_distance_from_table(tables, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google.png', log=False, normalize=(min, max))
# print ""
#
# 'nntopmedian' tables on the Google pairs data: derive the normalization
# bounds and the decision split from the validation tables, then report the
# error rate and JS divergence on the held-out test tables.
tables = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-validation.npy', '../data/google/pairs/sets/nntopmedian_pairs_r-validation.npy']
min_val, max_val = sp.calculate_min_max_from_table(tables)
split = sp.calculate_split_from_table(tables, verbose=True, normalize=(min_val, max_val))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google-val.png', log=False, normalize=(min_val, max_val))
tables = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-test.npy', '../data/google/pairs/sets/nntopmedian_pairs_r-test.npy']
sp.calculate_error_rate_from_table(tables, split, verbose=True, normalize=(min_val, max_val))
sp.calculate_JS_from_table(tables, verbose=True, normalize=(min_val, max_val))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google.png', log=False, normalize=(min_val, max_val))
print("")

# PROCESSING
# texts1 = ['../data/wiki/pairs/sets/enwiki_no_pairs_r-validation.txt', '../data/wiki/pairs/sets/enwiki_pairs_r-validation.txt']
# output1 = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-validation.npy', '../data/google/pairs/sets/nntopmedian_pairs_r-validation.npy']
# p1 = Process(target=sp.process_to_file_with_filter, args=(metrics.NNVarMean(metrics.euclidean, 'VM'), texts1, output1, 2000000, w, docfreqs))
# p1.start()
#
# texts2 = ['../data/wiki/pairs/sets/enwiki_no_pairs_r-test.txt', '../data/wiki/pairs/sets/enwiki_pairs_r-test.txt']
# output2 = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-test.npy', '../data/google/pairs/sets/nntopmedian_pairs_r-test.npy']
# p2 = Process(target=sp.process_to_file_with_filter, args=(metrics.NNVarMean(metrics.euclidean, 'VM'), texts2, output2, 1600000, w, docfreqs))
# p2.start()
#
# texts3 = ['../data/wiki/pairs/sets/enwiki_no_pairs_r-validation.txt', '../data/wiki/pairs/sets/enwiki_pairs_r-validation.txt']
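# The commented PROCESSING block above fans the scoring work out over
# multiprocessing.Process, one worker per dataset split. A minimal sketch of
# that pattern; the worker body is a placeholder, not the project's
# process_to_file_with_filter:
from multiprocessing import Process

def sketch_worker(texts, outputs):
    # Placeholder: score each input text file and save its table to the
    # corresponding .npy path in `outputs`.
    pass

def sketch_run_parallel(jobs):
    procs = [Process(target=sketch_worker, args=job) for job in jobs]
    for p in procs:
        p.start()
    for p in procs:
        p.join()  # the original only calls start(); joining avoids orphaned workers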
# print ""
#
# tablesA = ['../data/google/pairs/sets/nntop_no_pairs_r-validation.npy', '../data/google/pairs/sets/nntop_pairs_r-validation.npy']
# minA, maxA = sp.calculate_min_max_from_table(tablesA)
# splitA = sp.calculate_split_from_table(tablesA, verbose=True, normalize=(minA, maxA))
# tablesA = ['../data/google/pairs/sets/nntop_no_pairs_r-test.npy', '../data/google/pairs/sets/nntop_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))
# sp.calculate_JS_from_table(tablesA, normalize=(minA, maxA), verbose=True)
# print ""

# Same protocol for the 'nntopcontr' tables on the tweets data.
tablesA = ['../data/tweets/pairs/sets/nntopcontr_no_pairs-validation.npy', '../data/tweets/pairs/sets/nntopcontr_pairs-validation.npy']
minA, maxA = sp.calculate_min_max_from_table(tablesA)
splitA = sp.calculate_split_from_table(tablesA, verbose=True, normalize=(minA, maxA))
tablesA = ['../data/tweets/pairs/sets/nntopcontr_no_pairs-test.npy', '../data/tweets/pairs/sets/nntopcontr_pairs-test.npy']
sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))
sp.calculate_JS_from_table(tablesA, normalize=(minA, maxA), verbose=True)
print("")



# Bootstrap significance test between two metrics (tablesB/splitB/minB/maxB
# are presumably defined by an earlier block of the full script):
# sp.calculate_bootstrap_test_from_table(tablesB, tablesA, splitB, splitA, True, 5000, (minB, maxB), (minA, maxA))
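# A minimal sketch of such a paired bootstrap test: resample the test examples
# with replacement and see how often the error-rate difference between the two
# metrics crosses zero. All names and the p-value convention here are
# placeholders, not the sp implementation:
import numpy as np

def sketch_bootstrap_test(mistakes_a, mistakes_b, iterations=5000, seed=0):
    # mistakes_a / mistakes_b: per-example 0/1 error arrays over the same examples.
    rng = np.random.RandomState(seed)
    n = len(mistakes_a)
    diffs = np.empty(iterations)
    for i in range(iterations):
        idx = rng.randint(0, n, n)  # resample example indices with replacement
        diffs[i] = mistakes_a[idx].mean() - mistakes_b[idx].mean()
    # Two-sided p-value: how often the resampled difference lands on either
    # side of zero.
    return 2 * min((diffs <= 0).mean(), (diffs >= 0).mean())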



# LDA-code
# dictionary = gensim.corpora.Dictionary.load('../data/model/wiki_wordids_filtered_2.dict')
# dictionary.num_docs = metrics.N_DOCUMENTS
#
# #pca_model = np.load('../data/model/pca.model.npz')['arr_0']
# import lda
# lda_model = lda.lda()
# lda_model.load('../data/model/lda.model')
#
# texts = ['../data/pairs/enwiki_no_pairs_10.txt', '../data/pairs/enwiki_pairs_10.txt']
# labels = ['Pairs', 'No pairs']
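# If the saved model were a standard gensim LDA model, the equivalent of the
# block above (assumed, untested against this repo's files) would be:
# import gensim
# dictionary = gensim.corpora.Dictionary.load('../data/model/wiki_wordids_filtered_2.dict')
# lda_model = gensim.models.LdaModel.load('../data/model/lda.model')
# bow = dictionary.doc2bow(['example', 'tokens'])
# topics = lda_model.get_document_topics(bow)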