Example #1
import gzip
import cPickle
from os.path import join

import pandas

# Project-specific helpers assumed importable from the surrounding codebase:
# models_in_folder, get_vocab_container, run_model.

def make_series(model_root_folder,
                plot_interval=100,
                limit=None,
                no_new=False,
                dont_average_embeddings=False,
                **run_model_args):
    average_embeddings = not dont_average_embeddings
    if average_embeddings:
        suffix = 'eval-averaged.pkl'
    else:
        suffix = 'eval-None-False.pkl'  # holdover from when we had include_synsets and normalize_components
    store_fname = join(model_root_folder, suffix)
    print store_fname
    try:
        stats = pandas.read_pickle(store_fname)
        print "read pickle from %s" % store_fname
    except Exception:
        # start a fresh frame if the pickle is missing or unreadable
        stats = pandas.DataFrame()
        print 'created new frame'
    if no_new:
        return stats

    models = models_in_folder(model_root_folder)
    model_nums = sorted(models.keys())

    # skip the most recent checkpoint, which may still be being written
    latest_num = model_nums[-1] if model_nums else -1

    print 'plotting every %i' % plot_interval

    to_plot = [n for n in model_nums if n % plot_interval == 0 and n != latest_num]
    if 1 in model_nums:
        to_plot = [1] + to_plot
    if limit is not None:
        to_plot = [n for n in to_plot if n <= limit]
    vocab_container = None
    print model_root_folder
    for n in to_plot:
        if n in stats.index:
            print 'already has %i' % n
            continue
        try:
            print 'loading %i' % n
            with gzip.open(models[n]) as f:
                model = cPickle.load(f)
        except Exception as e:
            print e
            continue
        # load the vocabulary if not already cached
        if not vocab_container:
            vocab_container = get_vocab_container(model)
        embeddings = model.embeddings if not average_embeddings else model.averaged_embeddings()
        this_stats = run_model(embeddings, vocab_container, **run_model_args)
        stats = pandas.concat([stats, pandas.DataFrame([this_stats], index=[n])]).sort_index()
        stats.to_pickle(store_fname)
    return stats
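
A minimal usage sketch (the folder path and argument values are hypothetical, assuming models_in_folder finds numbered checkpoint pickles under the given directory):

# Hypothetical example: evaluate every 500th checkpoint of a training run.
stats = make_series('models/my-run',  # hypothetical checkpoint directory
                    plot_interval=500,
                    limit=10000)
# Rows are indexed by checkpoint number; columns are whatever run_model
# returns, so the series can be plotted directly, e.g. stats.plot().
print stats.tail()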
Example #2
import os
import gzip
import cPickle

# Project-specific helpers assumed importable: get_vocab_container,
# SynsetToWord, make_testing_data, score_model, ranks.

def test(model):
    # the relationships pickle is stored under the model's base directory
    relationship_path = os.path.join(model.other_params['base_dir'], 'relationships.pkl.gz')
    with gzip.open(relationship_path) as f:
        relationships = cPickle.load(f)

    vocabulary = get_vocab_container(model).word_array

    # helper that maps WordNet synsets to words in the model's vocabulary
    s2w = SynsetToWord(vocabulary)

    testing_data = make_testing_data(model, relationships, vocabulary, s2w)

    scores, correct_synsets, correct_indices = score_model(model, vocabulary, s2w, relationships, testing_data)

    # rank of each correct answer among the scored candidates
    return ranks(scores, correct_indices)
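
Assuming ranks returns one integer rank per correct answer (1 = best), the usual summary statistics can be computed from the output of test; a sketch, not part of the original module:

# model: a model object loaded as in the other examples (gzip + cPickle)
import numpy

r = numpy.array(test(model))
print 'mean rank: %f' % r.mean()
print 'MRR: %f' % (1.0 / r).mean()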
Example #3
import csv
import gzip
import cPickle

from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# Project-specific helper assumed importable: get_vocab_container.

def run(embeddings, vocab_container, wordsim_root):
    # signature inferred from the call site in __main__ below; the original
    # opening of this function (including the get_embedding helper used
    # underneath) was truncated in the source
    with open(wordsim_root) as csvfile:
        lines = list(csv.reader(csvfile))[1:]  # skip the csv header row

    # each row: the two words of the pair, then a human similarity judgement
    words = [line[:2] for line in lines]
    human_scores = [float(line[-1]) for line in lines]

    # model similarity for each pair: cosine similarity of the embeddings
    model_scores = [1 - cosine(get_embedding(word1.lower()), get_embedding(word2.lower()))
                    for word1, word2 in words]

    rho, p = spearmanr(model_scores, human_scores)
    return rho, p


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help="model file to be used for semeval.py script")
    parser.add_argument('--all_synsets', action='store_true')
    parser.add_argument('--top_synset', action='store_true')
    parser.add_argument('--normalize_components', action='store_true')
    parser.add_argument('--wordsim_root', help="path to the wordsim353 combined csv file", default="/home/dfried/data/wordsim/combined.csv")
    args = parser.parse_args()

    with gzip.open(args.model) as f:
        model = cPickle.load(f)

    vocab_container = get_vocab_container(model)

    rho, p = run(model.embeddings, vocab_container, args.wordsim_root)

    print 'rho: %f\tp: %f' % (rho, p)
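
For intuition about the metric, here is a tiny self-contained check of the same computation on toy vectors and invented judgements (illustrative data only, not from the original evaluation):

# Toy demonstration: Spearman correlation between cosine similarities of
# made-up 2-d "embeddings" and fabricated human similarity scores.
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

toy_vecs = {'cat': [1.0, 0.1], 'dog': [0.9, 0.2], 'car': [0.0, 1.0]}
pairs = [('cat', 'dog'), ('cat', 'car'), ('dog', 'car')]
human = [9.0, 1.5, 2.0]  # invented judgements on a 0-10 scale

model = [1 - cosine(toy_vecs[a], toy_vecs[b]) for a, b in pairs]
rho, p = spearmanr(model, human)
print 'rho: %f\tp: %f' % (rho, p)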