import gzip
import os
from os.path import join

import cPickle
import pandas

# models_in_folder, get_vocab_container, and run_model are helpers defined
# elsewhere in this project.

def make_series(model_root_folder, plot_interval=100, limit=None, no_new=False,
                dont_average_embeddings=False, **run_model_args):
    average_embeddings = not dont_average_embeddings
    if average_embeddings:
        suffix = 'eval-averaged.pkl'
    else:
        # holdover from when we had include_synsets and normalize_components
        suffix = 'eval-None-False.pkl'
    store_fname = join(model_root_folder, suffix)
    print store_fname
    try:
        stats = pandas.read_pickle(store_fname)
        print "read pickle from %s" % store_fname
    except Exception:
        # no cached stats yet; start from an empty frame
        stats = pandas.DataFrame()
        print 'created new frame'

    if no_new:
        return stats

    models = models_in_folder(model_root_folder)
    model_nums = sorted(models.keys())
    # skip the most recent model: it may still be in the middle of being written
    latest_num = model_nums[-1] if model_nums else -1

    print 'plotting every %i' % plot_interval
    to_plot = [n for n in model_nums
               if n % plot_interval == 0 and n != latest_num]
    if 1 in model_nums:
        to_plot = [1] + to_plot
    if limit is not None:
        to_plot = [n for n in to_plot if n <= limit]

    vocab_container = None
    print model_root_folder
    for n in to_plot:
        if n in stats.index:
            print 'already has %i' % n
            continue
        try:
            print 'loading %i' % n
            with gzip.open(models[n]) as f:
                model = cPickle.load(f)
        except Exception as e:
            print e
            continue
        # load the vocabulary if not already cached
        if not vocab_container:
            vocab_container = get_vocab_container(model)
        embeddings = (model.embeddings if not average_embeddings
                      else model.averaged_embeddings())
        this_stats = run_model(embeddings, vocab_container, **run_model_args)
        stats = pandas.concat([stats,
                               pandas.DataFrame([this_stats],
                                                index=[n])]).sort_index()
        stats.to_pickle(store_fname)
    return stats
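# A minimal usage sketch for make_series (the run directory and interval
# below are illustrative, not from the original code):
#
#     stats = make_series('/path/to/model_run', plot_interval=500)
#     print stats.tail()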
def test(model):
    # evaluate link prediction on the relationships stored alongside the model
    relationship_path = os.path.join(model.other_params['base_dir'],
                                     'relationships.pkl.gz')
    with gzip.open(relationship_path) as f:
        relationships = cPickle.load(f)
    vocabulary = get_vocab_container(model).word_array
    s2w = SynsetToWord(vocabulary)
    testing_data = make_testing_data(model, relationships, vocabulary, s2w)
    scores, correct_synsets, correct_indices = score_model(
        model, vocabulary, s2w, relationships, testing_data)
    return ranks(scores, correct_indices)
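# The ranks returned by test() are often summarized as mean rank and mean
# reciprocal rank. A minimal sketch, assuming ranks() yields a flat sequence
# of 1-based integer ranks (an assumption; the original code does not show
# its return type):
def summarize_ranks(rank_list):
    import numpy as np
    arr = np.asarray(list(rank_list), dtype=float)
    # a lower mean rank and a higher MRR both indicate a better model
    return {'mean_rank': arr.mean(), 'mrr': (1.0 / arr).mean()}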
import csv

from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# The original fragment begins mid-function; the signature and file-open
# below are reconstructed from the call in __main__. get_embedding (a
# word -> vector lookup built from embeddings and vocab_container) is
# assumed to be defined elsewhere in this module.
def run(embeddings, vocab_container, wordsim_path):
    with open(wordsim_path) as csvfile:
        lines = list(csv.reader(csvfile))[1:]  # skip the header row
    words = [line[:2] for line in lines]
    human_scores = [float(line[-1]) for line in lines]
    # score each pair by cosine similarity between the two word embeddings
    model_scores = [1 - cosine(get_embedding(word1.lower()),
                               get_embedding(word2.lower()))
                    for word1, word2 in words]
    rho, p = spearmanr(model_scores, human_scores)
    return rho, p

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model',
                        help="model file to be used for the semeval.py script")
    parser.add_argument('--all_synsets', action='store_true')
    parser.add_argument('--top_synset', action='store_true')
    parser.add_argument('--normalize_components', action='store_true')
    parser.add_argument('--wordsim_root',
                        help="path to the wordsim353 combined csv file",
                        default="/home/dfried/data/wordsim/combined.csv")
    args = parser.parse_args()

    with gzip.open(args.model) as f:
        model = cPickle.load(f)
    vocab_container = get_vocab_container(model)
    rho, p = run(model.embeddings, vocab_container, args.wordsim_root)
    print 'rho: %f\tp: %f' % (rho, p)
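# Example invocation (the script filename and paths are illustrative):
#
#     python wordsim_eval.py /path/to/model.pkl.gz \
#         --wordsim_root /path/to/wordsim353/combined.csv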