# Shared imports assumed by the scripts below.
import argparse
import codecs
import logging
import os
import sys
from itertools import chain, combinations

import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import spearmanr


def topic_lmi(phi):
    # lmi(w, t) = p(w, t) * pmi(w, t)
    #           = p(w, t) * log[ p(w | t) / p(w) ]
    #           = p(w, t) * (log p(w | t) - log p(w))
    pwt = phi / phi.sum()        # joint p(w, t); rows are topics
    pwgivent = row_norm(phi)     # conditional p(w | t)
    pw = phi.sum(axis=0)         # word marginal p(w), per the derivation above
    pw = pw / pw.sum()
    pmi = np.log(pwgivent) - np.log(pw)  # p(w) broadcasts across topic rows
    lmi = pwt * pmi
    lmi[lmi < 0] = 0             # keep only positive LMI
    return lmi
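# A minimal sketch of the row_norm helper that topic_lmi relies on, plus a
# toy call; this is an assumption from how the code uses it, and the repo's
# real row_norm may differ.
def row_norm(m):
    # normalize each row of a matrix to sum to 1
    return m / m.sum(axis=1, keepdims=True)

# toy check: 2 topics x 3 words; large LMI entries mark strong word-topic pairs
#   topic_lmi(np.array([[8., 1., 1.], [1., 5., 4.]]))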
def main():
    comps = pd.read_table(COMP_FILE)
    comps = comps[comps.compound != comps.const]
    calcsims = list(chain(*zip(comps['compound'], comps['const'])))
    label_vocab = load_labels("target-labels.txt")
    phi_nn = {w[:w.rindex('/')]: i
              for i, w in label_vocab.iteritems() if '/NN' in w}
    model = np.load("model_250.npy.npz")
    phi = row_norm(model["phi"].T)

    ranked_sims = {}
    done = set()
    for z, word in enumerate(calcsims):
        if word in done or word not in phi_nn:
            continue
        done.add(word)
        i = phi_nn[word]
        w_dist = phi[i]
        sims = calc_similarities(w_dist, phi)
        ranked_sims[word] = percentile_ranked(sims)
        logging.info("Done with %d/%d [%s]" % (z + 1, len(calcsims), word))

    ratings_compound = []
    ratings_const = []
    gold = []
    for compound, const, mean in zip(comps.compound, comps.const, comps['mean']):
        if compound not in ranked_sims or const not in ranked_sims:
            continue
        ratings_compound.append(ranked_sims[compound][phi_nn[const]])
        ratings_const.append(ranked_sims[const][phi_nn[compound]])
        gold.append(mean)

    print ratings_compound
    print ratings_const
    print gold
    print spearmanr(ratings_compound, gold)
    print spearmanr(ratings_const, gold)
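# Sketches of the similarity helpers the script above assumes but does not
# define in this file (calc_similarities, percentile_ranked, and the
# jsdiv/kldiv they build on). These are hedged guesses from usage; the
# repo's actual implementations may differ.
from scipy.stats import rankdata

def kldiv(p, q):
    # KL(p || q); assumes q > 0 wherever p > 0
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))

def jsdiv(p, q):
    # Jensen-Shannon divergence: symmetric, and finite for any two distributions
    m = 0.5 * (p + q)
    return 0.5 * kldiv(p, m) + 0.5 * kldiv(q, m)

def calc_similarities(w_dist, phi):
    # divergence of one word's topic distribution against every row of phi
    return np.array([jsdiv(w_dist, row) for row in phi])

def percentile_ranked(sims):
    # map each score to its percentile rank in (0, 1]
    return rankdata(sims) / float(len(sims))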
def main():
    parser = argparse.ArgumentParser(description='Outputs a human readable model.')
    parser.add_argument('models', metavar='FILE', nargs='+',
                        help='The saved models.')
    parser.add_argument('--eval', '-e', metavar='EVALDATA', default='comp',
                        choices=['disco', 'discotrain', 'discovalid', 'discotest',
                                 'discotrainvalid', 'comp', 'compmod', 'comphead',
                                 'schm280'],
                        help='The data set to evaluate against.')
    parser.add_argument('--vocab', '-v', metavar='FILE', help='The vocab labels.')
    parser.add_argument('--acc-thresh', type=float, default=0,
                        help="Don't include pairwise comparisons whose judgements "
                             "are closer than this threshold.")
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    vocab_labels = {w: i for i, w in vocab_labels.iteritems()}
    eval_tab = load_eval_table(vocab_labels, args.eval)

    model_evaluations = []
    for model in args.models:
        logging.info("Processing model '%s'..." % model)
        m = np.load(model)
        k = m['k']
        if 'loglikelihoods' in m:
            ll = np.mean(m['loglikelihoods'][-5:])
        else:
            ll = np.mean(m['perwordbounds'][-5:])
        n_iter = m['max_iteration']
        time = np.sum(m['timediffs'])
        phi = np.ascontiguousarray(m['phi'])
        topic_normed = row_norm(phi)   # p(w | t)
        word_normed = col_norm(phi)    # p(t | w)
        lmid = topic_lmi(phi)
        model_eval = dict(k=k, ll=ll, iter=n_iter, time=time, alpha=m['alpha'],
                          eta=m['eta'], mu=m['mu'], eval=args.eval,
                          input=m['input_filename'])

        similarities = {}
        for i, pair in eval_tab.iterrows():
            try:
                left_id = vocab_labels[pair['left']]
                right_id = vocab_labels[pair['right']]
            except KeyError:
                continue
            pair_k = (pair['left'], pair['right'])
            right_given_left = np.dot(topic_normed[:, right_id], word_normed[:, left_id])
            left_given_right = np.dot(topic_normed[:, left_id], word_normed[:, right_id])
            jsdiv_sim = jsdiv(word_normed[:, right_id], word_normed[:, left_id])
            symkldiv_sim = symkldiv(word_normed[:, right_id], word_normed[:, left_id])
            kldiv1 = kldiv(word_normed[:, right_id], word_normed[:, left_id])
            kldiv2 = kldiv(word_normed[:, left_id], word_normed[:, right_id])
            cos_lmi = cos(lmid[:, right_id], lmid[:, left_id])
            similarities[pair_k] = {'right': pair['right'],
                                    'left': pair['left'],
                                    'right|left': right_given_left,
                                    'left|right': left_given_right,
                                    'jsdiv': jsdiv_sim,
                                    'symkldiv': symkldiv_sim,
                                    'kldiv1': kldiv1,
                                    'kldiv2': kldiv2,
                                    'coslmi': cos_lmi,
                                    'human': pair['similarity'],
                                    }

        # compute spearman's rho for each of the measures
        tmp = pd.DataFrame(similarities.values())
        for measure in ['right|left', 'left|right', 'jsdiv', 'symkldiv',
                        'kldiv1', 'kldiv2', 'coslmi']:
            rho, p = scipy.stats.spearmanr(tmp[measure], tmp['human'])
            model_eval['rho_' + measure] = rho
            model_eval['p_' + measure] = p
        model_eval['n'] = len(tmp)

        # accuracy-style measures: how often does each score order a pair of
        # items the same way the human judgements do?
        baseline_correct = 0
        jsdiv_correct = 0
        symkldiv_correct = 0
        rightleft_correct = 0
        leftright_correct = 0
        lmicos_correct = 0
        pairs_compared = 0.0
        for (i, pair1), (j, pair2) in combinations(eval_tab.iterrows(), 2):
            # skip pairs whose gold judgements are too close to call
            if abs(pair1['similarity'] - pair2['similarity']) <= args.acc_thresh:
                continue
            try:
                similarities1 = similarities[(pair1['left'], pair1['right'])]
                similarities2 = similarities[(pair2['left'], pair2['right'])]
            except KeyError:
                continue
            gold = pair1['similarity'] < pair2['similarity']
            pairs_compared += 1
            baseline_correct += gold
            # divergences are distances, so lower means more similar
            jsdiv_correct += (gold == (similarities1['jsdiv'] > similarities2['jsdiv']))
            symkldiv_correct += (gold == (similarities1['symkldiv'] > similarities2['symkldiv']))
            rightleft_correct += (gold == (similarities1['right|left'] < similarities2['right|left']))
            leftright_correct += (gold == (similarities1['left|right'] < similarities2['left|right']))
            lmicos_correct += (gold == (similarities1['coslmi'] < similarities2['coslmi']))

        model_eval['filename'] = model
        model_eval['model_type'] = os.path.dirname(model)
        model_eval['acc_baseline'] = baseline_correct / pairs_compared
        model_eval['acc_jsdiv'] = jsdiv_correct / pairs_compared
        model_eval['acc_symkldiv'] = symkldiv_correct / pairs_compared
        model_eval['acc_right|left'] = rightleft_correct / pairs_compared
        model_eval['acc_left|right'] = leftright_correct / pairs_compared
        model_eval['acc_coslmi'] = lmicos_correct / pairs_compared
        model_evaluations.append(model_eval)

    pd.DataFrame(model_evaluations).to_csv(sys.stdout, index=False)
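# The remaining measures used by the evaluation above, sketched under the
# same caveat (col_norm, symkldiv, and cos are guesses from usage; kldiv is
# the sketch given earlier). A note on why the dot products are conditional
# probabilities: with topic_normed[k, w] = p(w | k) and
# word_normed[k, w] = p(k | w),
#   np.dot(topic_normed[:, r], word_normed[:, l]) = sum_k p(r | k) p(k | l),
# which is the model's estimate of p(right | left).
def col_norm(m):
    # normalize each column of a matrix to sum to 1
    return m / m.sum(axis=0)

def symkldiv(p, q):
    # symmetrised KL divergence
    return kldiv(p, q) + kldiv(q, p)

def cos(u, v):
    # cosine similarity between two vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))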
def main():
    parser = argparse.ArgumentParser(description='Outputs a human readable model.')
    parser.add_argument('--model', '-m', metavar='FILE', help='The saved model.')
    parser.add_argument('--topics', '-t', action='store_true',
                        help='Output topics in human readable form.')
    parser.add_argument('--words', '-w', metavar='FILE',
                        help='Output the distributions of these words.')
    parser.add_argument('--all-words', '-W', action='store_true',
                        help='Ignore -w. Output all words.')
    parser.add_argument('--vocab', '-v', metavar='FILE', help='The vocab labels.')
    parser.add_argument('--features', '-f', metavar='FILE', help='The feature labels.')
    parser.add_argument('--features2', '-g', metavar='FILE', help='The feature2 labels.')
    parser.add_argument('--docs', '-D', metavar='FILE',
                        help='Output the document distributions for these documents.')
    parser.add_argument('--docids', '-d', metavar='FILE', help='The document labels.')
    parser.add_argument('--detailedtopics', '-p', action='store_true',
                        help='Output nice, human readable information about documents.')
    parser.add_argument('--csv', '-c', action='store_true',
                        help='Output in a CSV format we can use for ggplot2.')
    parser.add_argument('--dont-norm', '-N', action='store_true',
                        help="Don't norm the probabilities to be conditional distributions.")
    args = parser.parse_args()

    model = np.load(args.model)
    #from onlineldavb import dirichlet_expectation
    phi = row_norm(np.ascontiguousarray(model['phi']))
    #phi = np.ascontiguousarray(model['expElogbeta'])
    #phi = np.exp(dirichlet_expectation(phi))
    psi = row_norm(np.ascontiguousarray(model['psi']))
    psi2 = row_norm(np.ascontiguousarray(model['psi2']))
    label_vocab = load_labels(args.vocab)
    label_features = load_labels(args.features)
    label_features2 = load_labels(args.features2)
    #print "Loglikelihood: %.5f" % model["loglikelihoods"][-1]

    # phi is vocab, psi is features, pi is documents
    topic_strings = {}
    if args.topics or args.detailedtopics:
        for k in xrange(model['k']):
            bestphi = ranked_list(phi[k], TOPIC_WORDS_SHOW)
            bestpsi = ranked_list(psi[k], TOPIC_FEATS_SHOW)
            bestpsi2 = ranked_list(psi2[k], TOPIC_FEATS_SHOW)
            topic_str = []
            topic_str.append("Topic %d:" % k)
            topic_str.append("  Phi (vocab):")
            for i, p in bestphi:
                topic_str.append("    %.5f %s" % (p, label_vocab.get(i, "word_%d" % i)))
            topic_str.append("  Psi (features):")
            for i, p in bestpsi:
                topic_str.append("    %.5f %s" % (p, label_features.get(i, "feat_%d" % i)))
            topic_str.append("  Psi2 (features):")
            for i, p in bestpsi2:
                topic_str.append("    %.5f %s" % (p, label_features2.get(i, "feat2_%d" % i)))
            if args.topics:
                print '\n'.join(topic_str)
                print
            if args.detailedtopics:
                topic_strings[k] = pad_same(topic_str)

    if args.docs:
        docids = codecs.getreader('utf-8')(open(args.docids)).readlines()
        docids = (d[:d.rindex('/')] for d in docids)
        docids = {dname: dnum for dnum, dname in enumerate(docids)}
        whitedocs = list(codecs.getreader('utf-8')(open(args.docs)).read().split())
        for docname in whitedocs:
            try:
                docid = docids[docname]
            except KeyError:
                continue  # skip unknown documents rather than reusing a stale id
            docdist = model['pi'][docid]
            if not args.detailedtopics:
                for i, p in enumerate(docdist):
                    print "%s,%d,%f" % (docname, i, p)
            else:
                # output nice stuff
                sorted_dist = sorted(enumerate(docdist), key=lambda x: x[1], reverse=True)
                sorted_dist = [(i, p) for i, p in sorted_dist if p > 1e-6]
                print "Document: %s" % docname
                for i, p in sorted_dist:
                    print "  Topic %d: %f" % (i, p)
                print column_join([topic_strings[k] for k, p in sorted_dist])
                print

    if args.words and not args.all_words:
        whitewords = codecs.getreader('utf-8')(open(args.words)).read().split()
        mappings = {}
        for k, v in label_vocab.iteritems():
            nicev = v[:v.rindex("/")]
            #if nicev in whitewords and '/NN' in v:
            if nicev in whitewords:
                mappings[nicev] = k
        for ww in whitewords:
            if ww not in mappings:
                continue
            m = mappings[ww]
            word = label_vocab[m]
            probs = phi[:, m]
            if not args.dont_norm:
                probs = probs / np.sum(probs)
            niceword = word[:word.rindex("/")]
            if args.detailedtopics:
                niceprobs = [(i, p) for i, p in enumerate(probs) if p > .05]
                print "Word: %s" % niceword
                for i, p in niceprobs:
                    print "  Topic %d: %f" % (i, p)
                print column_join([topic_strings[k] for k, p in niceprobs])
                print
            elif args.csv:
                for i, p in enumerate(probs):
                    print "%s,%d,%f" % (niceword, i, p)
            else:
                print niceword + "\t" + " ".join(repr(p) for p in probs)

    if args.all_words:
        for wid, w in label_vocab.iteritems():
            probs = phi[:, wid]
            if '/NN' in w:  # hack for later
                w = w[:w.rindex('/')]
            if not args.dont_norm:
                probs = probs / np.sum(probs)
            print w + "\t" + " ".join(repr(p) for p in probs)
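# Sketches of the display helpers the viewer above relies on (ranked_list,
# pad_same, column_join); guesses from how they are called, not the repo's
# actual code.
def ranked_list(dist, n):
    # the n largest probabilities, as (index, probability) pairs
    top = np.argsort(dist)[::-1][:n]
    return [(i, dist[i]) for i in top]

def pad_same(lines):
    # pad every line to the width of the longest so columns align
    width = max(len(l) for l in lines)
    return [l.ljust(width) for l in lines]

def column_join(blocks, sep='  '):
    # join several text blocks (lists of equal-width lines) side by side
    height = max(len(b) for b in blocks)
    padded = [b + [' ' * len(b[0])] * (height - len(b)) for b in blocks]
    return '\n'.join(sep.join(row) for row in zip(*padded))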
def main():
    parser = argparse.ArgumentParser(
        description='Checks for prediction of association norms.')
    parser.add_argument('--model', '-m', metavar='FILE', help='The saved model.')
    parser.add_argument('--vocab', '-v', metavar='FILE', help='The vocab labels.')
    parser.add_argument('--features', '-f', metavar='FILE', help='The feature labels.')
    #parser.add_argument('--docs', '-D', metavar='FILE',
    #                    help='Output the document distributions for these documents.')
    #parser.add_argument('--docids', '-d', metavar='FILE',
    #                    help='The document labels.')
    args = parser.parse_args()

    model = np.load(args.model)
    phi = row_norm(np.ascontiguousarray(model["phi"].T))
    #pi = safe_pi_read(args.model)
    label_vocab = load_labels(args.vocab)
    #docids = codecs.getreader('utf-8')(open(args.docids)).readlines()

    phi_nn = {w[:w.rindex('/')]: i
              for i, w in label_vocab.iteritems() if '/NN' in w}
    nopos_labels = mdict()
    for i, v in label_vocab.iteritems():
        nopos = v[:v.rindex('/')]
        nopos_labels[nopos] = i

    assocs = load_associations()
    to_compute_similarities = list(set(t for t, a, c in assocs))
    ranked_sims = {}
    logging.info("compute similarities...")
    for z, w_i in enumerate(to_compute_similarities):
        if w_i not in phi_nn:
            continue
        i = phi_nn[w_i]
        w_i_dist = norm1(phi[i])
        similarities = np.array([cached_jsdiv(i, j, w_i_dist, w_j_dist)
                                 for j, w_j_dist in enumerate(phi)])
        ranked_sims[w_i] = percentile_ranked(similarities)
        logging.debug("%d / %d done." % (z + 1, len(to_compute_similarities)))
    logging.info("finished computing similarities.")

    measures = []
    oov_count = 0
    noov_count = 0
    for t, a, c in assocs:
        if t not in ranked_sims or a not in nopos_labels:
            oov_count += 1
            continue
        noov_count += 1
        ranked = ranked_sims[t]
        m = max(ranked[i] for i in nopos_labels[a])
        measures += [m] * c
    measures = np.array(measures)

    print "mean: %f" % measures.mean()
    print "std: %f" % measures.std()
    print "oov: %d" % oov_count
    print "len(measures) = %d" % len(measures)
    print "# hit: %d" % noov_count
    print "Percentiles [.05, .10, .25, .5, .75, .90, .95] ="
    print "  [%.8f, %.8f, %.8f, %.8f, %.8f, %.8f, %.8f]" % tuple(
        [scipy.stats.scoreatpercentile(measures, p)
         for p in [5, 10, 25, 50, 75, 90, 95]])
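# Sketches of the helpers assumed above: norm1, mdict (a dict whose
# assignments accumulate into per-key lists, matching the later iteration
# over nopos_labels[a]), and cached_jsdiv (a memoised jsdiv keyed on the
# index pair). Guesses from usage; the repo's real versions may differ.
def norm1(v):
    # rescale a nonnegative vector to sum to 1
    return v / v.sum()

class mdict(dict):
    # assignment appends to a per-key list instead of overwriting
    def __setitem__(self, key, value):
        if key not in self:
            dict.__setitem__(self, key, [])
        self[key].append(value)

_jsdiv_cache = {}

def cached_jsdiv(i, j, p, q):
    # memoise on the unordered index pair, since jsdiv is symmetric
    key = (i, j) if i < j else (j, i)
    if key not in _jsdiv_cache:
        _jsdiv_cache[key] = jsdiv(p, q)
    return _jsdiv_cache[key]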