Example 1
def topic_lmi(phi):
    # lmi(x, y) = p(x, y) * pmi(x, y)
    #           = p(x, y) * log[ p(x | y) / p(x) ]
    #           = p(x, y) * ( log[ p(x | y) ] - log[ p(x) ] )
    # with x = w (word) and y = t (topic):
    #           = p(w, t) * ( log[ p(w | t) ] - log[ p(w) ] )

    pwt = phi / phi.sum()       # joint p(w, t)
    pwgivent = row_norm(phi)    # p(w | t); rows of phi index topics
    pw = phi.sum(axis=0)        # marginalize out topics to get p(w)
    pw = pw / pw.sum()

    pmi = np.log(pwgivent) - np.log(pw)  # p(w) broadcasts across topic rows
    lmi = pwt * pmi
    lmi[lmi < 0] = 0            # clip negative values, as in PPMI
    return lmi
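
Example 1 assumes a row_norm helper shared by the examples below (Example 3
also uses col_norm). A minimal sketch, assuming each helper simply normalizes
along one axis, followed by a toy check of topic_lmi:

import numpy as np

def row_norm(m):
    # Normalize each row into a probability distribution.
    # Assumes every row has a positive sum.
    return m / m.sum(axis=1, keepdims=True)

def col_norm(m):
    # Normalize each column into a probability distribution.
    return m / m.sum(axis=0, keepdims=True)

# Toy 2-topics-by-3-words count matrix; topic_lmi returns a matrix of the
# same shape holding nonnegative local mutual information scores.
phi = np.array([[4.0, 1.0, 1.0],
                [1.0, 1.0, 4.0]])
print topic_lmi(phi)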
Example 2
def main():
    comps = pd.read_table(COMP_FILE)
    comps = comps[comps.compound != comps.const]
    calcsims = list(chain(*zip(comps['compound'], comps['const'])))
    label_vocab = load_labels("target-labels.txt")
    phi_nn = { w[:w.rindex('/')] : i for i, w in label_vocab.iteritems() if '/NN' in w }
    model = np.load("model_250.npy.npz")
    phi = row_norm(model["phi"].T)

    ranked_sims = {}
    done = set()
    for z, word in enumerate(calcsims):
        if word in done or word not in phi_nn:
            continue
        done.add(word)
        i = phi_nn[word]
        w_dist = phi[i]
        sims = calc_similarities(w_dist, phi)
        ranked_sims[word] = percentile_ranked(sims)
        logging.info("Done with %d/%d [%s]" % (z + 1, len(calcsims), word))

    ratings_compound = []
    ratings_const = []
    gold = []
    for compound, const, mean in zip(comps.compound, comps.const, comps['mean']):
        if compound not in ranked_sims or const not in ranked_sims:
            continue
        ranked_sims_compound = ranked_sims[compound]
        ranked_sims_const = ranked_sims[const]

        ratings_compound.append(ranked_sims_compound[phi_nn[const]])
        ratings_const.append(ranked_sims_const[phi_nn[compound]])
        gold.append(mean)

    print ratings_compound
    print ratings_const
    print gold
    print spearmanr(ratings_compound, gold)
    print spearmanr(ratings_const, gold)
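
Example 2 relies on two assumed helpers; the sketches below take the names
from the calls above, but the bodies are guesses: percentile_ranked converts
raw scores to percentile ranks, and calc_similarities scores one word
distribution against every row of phi using the jsdiv sketched after
Example 3.

import numpy as np
from scipy.stats import rankdata

def percentile_ranked(sims):
    # Percentile rank in (0, 1] for each score; ties share their mean rank.
    return rankdata(sims) / float(len(sims))

def calc_similarities(w_dist, phi):
    # Negated Jensen-Shannon divergence to each row, so larger = more similar.
    return np.array([-jsdiv(w_dist, row) for row in phi])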
Example 3
def main():
    parser = argparse.ArgumentParser(description='Outputs a human readable model.')
    parser.add_argument('models', metavar='FILE', help='The saved models.', nargs='+')
    parser.add_argument('--eval', '-e', metavar='EVALDATA', default='comp',
                        choices=['disco', 'discotrain', 'discovalid', 'discotest', 'discotrainvalid',
                                 'comp', 'compmod', 'comphead',
                                 'schm280'],
                        help="The data set to evaluate against.")
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--acc-thresh', type=float, default=0,
                        help="Don't include pairwise comparisons whose judgements are closer than this threshold.")
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    vocab_labels = {w : i for i, w in vocab_labels.iteritems()}
    eval_tab = load_eval_table(vocab_labels, args.eval)

    model_evaluations = []
    for model in args.models:
        logging.info("Processing model '%s'..." % model)
        m = np.load(model)
        k = m['k']
        if 'loglikelihoods' in m:
            ll = np.mean(m['loglikelihoods'][-5:])
        else:
            ll = np.mean(m['perwordbounds'][-5:])
        n_iter = m['max_iteration']
        time = np.sum(m['timediffs'])
        phi = np.ascontiguousarray(m['phi'])
        topic_normed = row_norm(phi)
        word_normed = col_norm(phi)

        lmid = topic_lmi(phi)

        model_eval = dict(k=k, ll=ll, iter=n_iter, time=time,
                          alpha=m['alpha'], eta=m['eta'], mu=m['mu'],
                          eval=args.eval, input=m['input_filename'])

        similarities = {}
        for i, pair in eval_tab.iterrows():
            try:
                left_id = vocab_labels[pair['left']]
                right_id = vocab_labels[pair['right']]
            except KeyError:
                continue

            pair_k = (pair['left'], pair['right'])
            right_given_left = np.dot(topic_normed[:,right_id], word_normed[:,left_id])
            left_given_right = np.dot(topic_normed[:,left_id], word_normed[:,right_id])
            jsdiv_sim = jsdiv(word_normed[:,right_id], word_normed[:,left_id])
            symkldiv_sim = symkldiv(word_normed[:,right_id], word_normed[:,left_id])
            kldiv1 = kldiv(word_normed[:,right_id], word_normed[:,left_id])
            kldiv2 = kldiv(word_normed[:,left_id], word_normed[:,right_id])
            cos_lmi = cos(lmid[:,right_id], lmid[:,left_id])

            similarities[pair_k] = {'right':  pair['right'],
                                    'left':     pair['left'],
                                    'right|left': right_given_left,
                                    'left|right': left_given_right,
                                    'jsdiv':     jsdiv_sim,
                                    'symkldiv':  symkldiv_sim,
                                    'kldiv1':    kldiv1,
                                    'kldiv2':    kldiv2,
                                    'coslmi':    cos_lmi,
                                    'human':     pair['similarity'],
                                    }

        # let's compute spearman's rho for each of the measures:
        tmp = pd.DataFrame(similarities.values())
        for measure in ['right|left', 'left|right', 'jsdiv', 'symkldiv',
                        'kldiv1', 'kldiv2', 'coslmi']:
            rho, p = scipy.stats.spearmanr(tmp[measure], tmp['human'])
            model_eval['rho_' + measure] = rho
            model_eval['p_' + measure] = p
        model_eval['n'] = len(tmp)

        # okay now let's do accuracy style measures
        baseline_correct = 0
        jsdiv_correct = 0
        symkldiv_correct = 0
        kldiv1_correct = 0
        kldiv2_correct = 0
        rightleft_correct = 0
        leftright_correct = 0
        lmicos_correct = 0
        pairs_compared = 0.0
        for (i, pair1), (j, pair2) in combinations(eval_tab.iterrows(), 2):
            diff = abs(pair1['similarity'] - pair2['similarity'])
            if pair1['similarity'] == pair2['similarity'] or diff < args.acc_thresh:
                continue

            try:
                pair1_k = (pair1['left'], pair1['right'])
                similarities1 = similarities[pair1_k]
                pair2_k = (pair2['left'], pair2['right'])
                similarities2 = similarities[pair2_k]
            except KeyError:
                continue

            gold = pair1['similarity'] < pair2['similarity']

            pairs_compared += 1
            baseline_correct += gold  # the baseline always predicts pair1 < pair2

            jsdiv_correct += (gold == (similarities1['jsdiv'] > similarities2['jsdiv']))
            symkldiv_correct += (gold == (similarities1['symkldiv'] > similarities2['symkldiv']))
            kldiv1_correct += (gold == (similarities1['kldiv1'] > similarities2['kldiv1']))
            kldiv2_correct += (gold == (similarities1['kldiv2'] > similarities2['kldiv2']))
            rightleft_correct += (gold == (similarities1['right|left'] < similarities2['right|left']))
            leftright_correct += (gold == (similarities1['left|right'] < similarities2['left|right']))
            lmicos_correct += (gold == (similarities1['coslmi'] < similarities2['coslmi']))

        model_eval['filename'] = model
        model_eval['model_type'] = os.path.dirname(model)
        model_eval['acc_baseline'] = baseline_correct / pairs_compared
        model_eval['acc_jsdiv'] = jsdiv_correct / pairs_compared
        model_eval['acc_symkldiv'] = symkldiv_correct / pairs_compared
        model_eval['acc_kldiv1'] = kldiv1_correct / pairs_compared
        model_eval['acc_kldiv2'] = kldiv2_correct / pairs_compared
        model_eval['acc_right|left'] = rightleft_correct / pairs_compared
        model_eval['acc_left|right'] = leftright_correct / pairs_compared
        model_eval['acc_coslmi'] = lmicos_correct / pairs_compared

        model_evaluations.append(model_eval)

    pd.DataFrame(model_evaluations).to_csv(sys.stdout, index=False)
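
Example 3 assumes the usual distribution-comparison helpers. Minimal sketches
under the standard definitions (the names come from the calls above; the
bodies are assumptions):

import numpy as np

def kldiv(p, q):
    # KL(p || q); assumes q > 0 wherever p > 0.
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))

def symkldiv(p, q):
    # Symmetrized KL divergence.
    return kldiv(p, q) + kldiv(q, p)

def jsdiv(p, q):
    # Jensen-Shannon divergence: mean KL to the midpoint distribution.
    mid = 0.5 * (p + q)
    return 0.5 * kldiv(p, mid) + 0.5 * kldiv(q, mid)

def cos(u, v):
    # Cosine similarity of two vectors.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))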
Example 4
def main():
    parser = argparse.ArgumentParser(
        description='Outputs a human readable model.')
    parser.add_argument('--model',
                        '-m',
                        metavar='FILE',
                        help='The saved model.')
    parser.add_argument('--topics',
                        '-t',
                        action='store_true',
                        help='Output topics in human readable form.')
    parser.add_argument('--words',
                        '-w',
                        metavar='FILE',
                        help='Output the distributions of these words.')
    parser.add_argument('--all-words',
                        '-W',
                        action='store_true',
                        help='Ignore -w. Output all words.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--features',
                        '-f',
                        metavar='FILE',
                        help='The feature labels.')
    parser.add_argument('--features2',
                        '-g',
                        metavar='FILE',
                        help='The feature2 labels.')
    parser.add_argument(
        '--docs',
        '-D',
        metavar='FILE',
        help='Output the document distributions for these documents.')
    parser.add_argument('--docids',
                        '-d',
                        metavar='FILE',
                        help='The document labels.')
    parser.add_argument(
        '--detailedtopics',
        '-p',
        action='store_true',
        help='Output nice, human readable information about documents.')
    parser.add_argument('--csv',
                        '-c',
                        action='store_true',
                        help='Output in a CSV format we can use for ggplot2.')
    parser.add_argument(
        '--dont-norm',
        '-N',
        action='store_true',
        help="Don't norm the probabilities to be conditional distributions.")
    args = parser.parse_args()

    model = np.load(args.model)
    #from onlineldavb import dirichlet_expectation
    phi = row_norm(np.ascontiguousarray(model['phi']))
    #phi = np.ascontiguousarray(model['expElogbeta'])
    #phi = np.exp(dirichlet_expectation(phi))
    psi = row_norm(np.ascontiguousarray(model['psi']))
    psi2 = row_norm(np.ascontiguousarray(model['psi2']))

    label_vocab = load_labels(args.vocab)
    label_features = load_labels(args.features)
    label_features2 = load_labels(args.features2)

    #print "Loglikelihood: %.5f" % model["loglikelihoods"][-1]

    # phi is vocab
    # psi is features
    # pi is documents

    topic_strings = {}

    if args.topics or args.detailedtopics:
        for k in xrange(model['k']):
            bestphi = ranked_list(phi[k], TOPIC_WORDS_SHOW)
            bestpsi = ranked_list(psi[k], TOPIC_FEATS_SHOW)
            bestpsi2 = ranked_list(psi2[k], TOPIC_FEATS_SHOW)

            topic_str = []
            topic_str.append("Topic %d:" % k)
            topic_str.append("  Phi (vocab):")
            for i, p in bestphi:
                topic_str.append("    %.5f  %s" %
                                 (p, label_vocab.get(i, "word_%d" % i)))
            topic_str.append("  Psi (features):")
            for i, p in bestpsi:
                topic_str.append("    %.5f  %s" %
                                 (p, label_features.get(i, "feat_%d" % i)))
            topic_str.append("  Psi2 (features):")
            for i, p in bestpsi2:
                topic_str.append("    %.5f  %s" %
                                 (p, label_features2.get(i, "feat2_%d" % i)))

            if args.topics:
                print '\n'.join(topic_str)
                print
            if args.detailedtopics:
                topic_strings[k] = pad_same(topic_str)

    if args.docs:
        docids = codecs.getreader('utf-8')(open(args.docids)).readlines()
        docids = (d[:d.rindex('/')] for d in docids)
        docids = {dname: dnum for dnum, dname in enumerate(docids)}
        whitedocs = list(
            codecs.getreader('utf-8')(open(args.docs)).read().split())
        for docname in whitedocs:
            try:
                docid = docids[docname]
            except KeyError:
                continue
            docdist = model['pi'][docid]
            if not args.detailedtopics:
                for i, p in enumerate(docdist):
                    print "%s,%d,%f" % (docname, i, p)
            else:
                # output nice stuff.
                sorted_dist = sorted(list(enumerate(docdist)),
                                     key=lambda x: x[1],
                                     reverse=True)
                sorted_dist = [(i, p) for i, p in sorted_dist if p > 1e-6]
                print "Document: %s" % docname
                for i, p in sorted_dist:
                    print "  Topic %d: %f" % (i, p)
                print column_join([topic_strings[k] for k, p in sorted_dist])
                print

    if args.words and not args.all_words:
        whitewords = codecs.getreader('utf-8')(open(args.words)).read().split()
        mappings = {}
        for k, v in label_vocab.iteritems():
            nicev = v[:v.rindex("/")]
            #if nicev in whitewords and '/NN' in v:
            if nicev in whitewords:
                mappings[nicev] = k

        for ww in whitewords:
            if ww not in mappings:
                continue
            m = mappings[ww]
            word = label_vocab[m]
            probs = phi[:, m]
            if not args.dont_norm:
                probs = probs / np.sum(probs)
            niceword = word[:word.rindex("/")]
            if args.detailedtopics:
                niceprobs = [(i, p) for i, p in enumerate(probs) if p > .05]
                print "Word: %s" % niceword
                for i, p in niceprobs:
                    print "  Topic %d: %f" % (i, p)
                print column_join([topic_strings[k] for k, p in niceprobs])
                print
            elif args.csv:
                for i, p in enumerate(probs):
                    print "%s,%d,%f" % (niceword, i, p)
            else:
                print word[:word.rindex("/")] + "\t" + " ".join(
                    repr(p) for p in probs)

    if args.all_words:
        for wid, w in label_vocab.iteritems():
            probs = phi[:, wid]
            if '/NN' in w:
                # hack for later
                w = w[:w.rindex('/')]
            if not args.dont_norm:
                probs = probs / np.sum(probs)
            print w + "\t" + " ".join(repr(p) for p in probs)
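
Example 4 additionally assumes ranked_list (pad_same and column_join are
pure layout helpers and are left out). A plausible sketch; the name matches
the calls, the body is an assumption:

import numpy as np

def ranked_list(dist, n):
    # Top-n (index, probability) pairs of a distribution, highest first.
    best = np.argsort(dist)[::-1][:n]
    return [(i, dist[i]) for i in best]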
Example 5
def main():
    parser = argparse.ArgumentParser(description='Checks for prediction of association norms.')
    parser.add_argument('--model', '-m', metavar='FILE',
                        help='The saved model.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The feature labels.')
    #parser.add_argument('--docs', '-D', metavar='FILE',
    #                    help='Output the document distributions for these documents.')
    #parser.add_argument('--docids', '-d', metavar='FILE',
    #                    help='The document labels.')
    args = parser.parse_args()

    model = np.load(args.model)
    phi = row_norm(np.ascontiguousarray(model["phi"].T))
    #pi = safe_pi_read(args.model)

    label_vocab = load_labels(args.vocab)
    #docids = codecs.getreader('utf-8')(open(args.docids)).readlines()

    phi_nn = { w[:w.rindex('/')] : i for i, w in label_vocab.iteritems() if '/NN' in w }

    nopos_labels = mdict()
    for i, v in label_vocab.iteritems():
        nopos = v[:v.rindex('/')]
        nopos_labels[nopos] = i

    assocs = load_associations()
    to_compute_similarities = list(set(t for t, a, c in assocs))

    ranked_sims = {}

    logging.info("compute similarities...")

    for z, w_i in enumerate(to_compute_similarities):
        if w_i not in phi_nn:
            continue
        i = phi_nn[w_i]
        w_i_dist = norm1(phi[i])
        similarities = np.array([cached_jsdiv(i, j, w_i_dist, w_j_dist) for j, w_j_dist in enumerate(phi)])
        ranked_sims[w_i] = percentile_ranked(similarities)
        logging.debug("%d / %d done." % (z + 1, len(to_compute_similarities)))

    logging.info("finished computing similarities.")

    measures = []
    oov_count = 0
    noov_count = 0
    for t, a, c in assocs:
        if t not in ranked_sims or a not in nopos_labels:
            oov_count += 1
            continue
        noov_count += 1
        ranked = ranked_sims[t]
        m = max(ranked[i] for i in nopos_labels[a])
        measures += [m] * c

    measures = np.array(measures)
    print "mean: %f" % measures.mean()
    print "std: %f" % measures.std()
    print "oov: %d" % oov_count
    print "len(measures) = %d" % len(measures)
    print "# hit: %d" % noov_count
    print "Percentiles [.05, .10, .25, .5, .75, .90, .95] ="
    print "     [%.8f, %.8f, %.8f, %.8f, %.8f, %.8f, %.8f]" % tuple([scipy.stats.scoreatpercentile(measures, p) for p in [5, 10, 25, 50, 75, 90, 95]])