def test_inplace_transform_word_OrderedVocabulary():
    """In-place ``transform_words`` on an OrderedVocabulary merges words that collide after stripping."""
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = OrderedVocabulary(words=['dog', 'cat', ' cat'])
    matrix = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    e = Embedding(vocabulary=vocab, vectors=matrix)

    pe = e.transform_words(lambda token: token.strip(), inplace=True)

    # In-place mode must hand back the very same object.
    assert pe is e and pe == e

    # 'cat' and ' cat' collapse into one entry.
    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    rows = pe.vectors.tolist()
    assert [0, 0, 11] in rows  # 'dog'
    assert [0, 11, 12] in rows  # 'cat'

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words

    # Original ordering is kept: 'dog' first, then 'cat'.
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 11])
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 11, 12])

    # The vocabulary class must not change.
    assert type(pe.vocabulary) is OrderedVocabulary
def get_vector_pairs(w, X, y, dataset='simlex', save=True):
    """
    Collect aligned source/target vectors for every distinct word occurring in X

    Parameters
    ----------
    w : dict
      Mapping holding two embeddings. If it has the keys 'source' and 'target' those
      are used; otherwise the first entry is the source and the second the target.
      Each value may be an Embedding or a plain word -> vector dict.

    X : array, shape: (n_samples, 2)
      Word pairs; words from both columns are looked up.

    y : vector
      Not used; kept so the signature matches the other evaluation helpers.

    dataset : str, default: 'simlex'
      Suffix of the name passed to ``save_vectors``.

    save : bool, default: True
      If True, ``save_vectors`` is called with '<source key>2<target key>_<dataset>'.

    Returns
    -------
    (x1, x2) : tuple of arrays
      Row i of x1 / x2 is the source / target vector of the i-th word, words being
      the sorted distinct words of X. Words missing from an embedding are replaced
      with that embedding's mean vector.
    """
    # The previous body rebound `w` to a list and then kept indexing it by
    # 'source' / 'target', so it could not run; resolve both embeddings once.
    if 'source' in w and 'target' in w:
        source_name, target_name = 'source', 'target'
    else:
        source_name, target_name = list(w.keys())[:2]

    w_source, w_target = w[source_name], w[target_name]
    if isinstance(w_source, dict):
        w_source = Embedding.from_dict(w_source)
    if isinstance(w_target, dict):
        w_target = Embedding.from_dict(w_target)

    source_words = w_source.vocabulary.word_id
    missing_words = sum(1 for query in X for query_word in query
                        if query_word not in source_words)
    if missing_words > 0:
        logger.warning("Missing {} source words. Will replace them with mean vector".format(missing_words))

    mean_vector_source = np.mean(w_source.vectors, axis=0, keepdims=True)
    mean_vector_target = np.mean(w_target.vectors, axis=0, keepdims=True)

    # Sorted so that the row order is reproducible between runs.
    x = sorted(set(list(X[:, 0]) + list(X[:, 1])))

    # np.vstack needs a sequence; generators are rejected by current NumPy.
    x1 = np.vstack([w_source.get(word, mean_vector_source) for word in x])
    x2 = np.vstack([w_target.get(word, mean_vector_target) for word in x])

    if save:
        # Build the name from the mapping keys (strings), not the embedding objects.
        filename = str(source_name) + '2' + str(target_name) + '_' + dataset
        save_vectors(filename)

    return (x1, x2)
def test_inplace_transform_word_CountedVocabulary():
    """In-place ``transform_words`` on a CountedVocabulary merges colliding words and keeps counts."""
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    counted = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])
    matrix = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    e = Embedding(vocabulary=counted, vectors=matrix)

    pe = e.transform_words(lambda token: token.strip(), inplace=True)

    # In-place mode must hand back the very same object.
    assert pe is e and pe == e

    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    rows = pe.vectors.tolist()
    assert [0, 0, 11] in rows  # 'dog'
    assert [0, 11, 12] in rows  # 'cat'

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words

    # getstate() is indexed as (words, counts); turn it into a word -> count lookup.
    state = pe.vocabulary.getstate()
    counts = {word: state[1][i] for i, word in enumerate(state[0])}

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 11])
    assert counts['dog'] == 60

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 11, 12])
    assert counts['cat'] == 50

    assert type(pe.vocabulary) is CountedVocabulary
def test_noinplace_transform_word_prefer_shortestword2_Vocabulary():
    """Copying ``transform_words`` on a plain Vocabulary keeps the already-stripped variant of each word."""
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = Vocabulary(words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat '])
    matrix = np.asanyarray([[0, 0, 1],
                            [0, 1, 11],
                            [0, 11, 12],
                            [0, 12, 13],
                            [0, 13, 14]])
    e = Embedding(vocabulary=vocab, vectors=matrix)

    pe = e.transform_words(lambda token: token.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    rows = pe.vectors.tolist()
    assert [0, 0, 1] in rows  # 'dog'
    assert [0, 1, 11] in rows  # 'cat'
    assert [0, 12, 13] in rows  # 'pikatchu'

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words
    assert 'pikatchu' in pe.vocabulary.words

    # pikatchu
    assert pe.vocabulary.words[2] == 'pikatchu'
    assert np.array_equal(pe.vectors[2], [0, 12, 13])

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 1])

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 1, 11])

    assert type(pe.vocabulary) is Vocabulary
def test_save_2():
    """Round-trip a tiny embedding through the binary word2vec format."""
    import shutil  # local import: only needed to clean up the scratch directory

    dirpath = tempfile.mkdtemp()
    try:
        w = ["a", "b", "c"]
        vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
        e = Embedding(Vocabulary(w), vectors)

        bin_path = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, bin_path, binary=True)
        e2 = Embedding.from_word2vec(bin_path, binary=True)

        assert np.array_equal(e2.vectors, vectors)
    finally:
        # mkdtemp() never cleans up after itself; remove the directory even on failure.
        shutil.rmtree(dirpath, ignore_errors=True)
def test_save_2():
    """Round-trip a tiny embedding through the binary word2vec format."""
    import shutil  # local import: only needed to clean up the scratch directory

    dirpath = tempfile.mkdtemp()
    try:
        w = ["a", "b", "c"]
        vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
        e = Embedding(Vocabulary(w), vectors)

        bin_path = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, bin_path, binary=True)
        e2 = Embedding.from_word2vec(bin_path, binary=True)

        assert np.array_equal(e2.vectors, vectors)
    finally:
        # mkdtemp() never cleans up after itself; remove the directory even on failure.
        shutil.rmtree(dirpath, ignore_errors=True)
def test_save():
    """Save a downloaded embedding as binary and text word2vec files and reload both."""
    import shutil  # local import: only needed to clean up the scratch directory

    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    dirpath = tempfile.mkdtemp()
    try:
        bin_path = path.join(dirpath, "tmp.bin")
        txt_path = path.join(dirpath, "tmp.txt")

        w.to_word2vec(w, bin_path, binary=True)
        w.to_word2vec(w, txt_path, binary=False)

        w2 = Embedding.from_word2vec(bin_path, binary=True)
        w3 = Embedding.from_word2vec(txt_path, binary=False)

        # The binary copy is compared exactly.
        assert np.array_equal(w.vectors, w2.vectors)
        assert w.vocabulary.words == w2.vocabulary.words

        # The text copy is compared with a small absolute tolerance.
        assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
        assert w.vocabulary.words == w3.vocabulary.words
    finally:
        # mkdtemp() never cleans up after itself; remove the directory even on failure.
        shutil.rmtree(dirpath, ignore_errors=True)
def test_analogy_solver():
    """Check analogy accuracy thresholds for the "add" and "mul" methods on a 1000-question sample."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    data = fetch_google_analogy()
    rng = np.random.RandomState(777)  # fixed seed so the sample is reproducible
    ids = rng.choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    # Default method ("add").
    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    for key, bound in (('all', 0.65), ('semantic', 0.7), ('syntactic', 0.63)):
        assert results['accuracy'][key] >= bound

    # "mul" method.
    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    for key, bound in (('all', 0.7), ('semantic', 0.75), ('syntactic', 0.64)):
        assert results['accuracy'][key] >= bound

    # With k=400, "mul" must be at least as accurate as "add".
    results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)
    results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)
    for key in ('all', 'syntactic', 'semantic'):
        assert results_mul['accuracy'][key] >= results_add['accuracy'][key]
def evaluate_similarity(w, X, y, restrict_to_words=None):
    """
    Calculate Spearman correlation between the model's pair scores and human rated
    similarity of word pairs

    The score of a pair is the dot product of its two vectors, which equals the
    cosine similarity only when the embedding is normalized to unit length.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    restrict_to_words: default: None
      Accepted but not used by this implementation.

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        # Imported lazily: the class is only needed to wrap a plain dict.
        from web.embedding import Embedding
        w = Embedding.from_dict(w)

    # Words missing from the embedding fall back to the mean vector.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # np.vstack needs a sequence; generators are rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def test_standardize_preserve_identity():
    """Lower-casing must keep the vector of the word that was already lower-case."""
    case_variants = {
        "Spider": [3, 4, 5],
        "spider": [1, 2, 3],
        "spideR": [3, 2, 4],
    }
    w3 = Embedding.from_dict(case_variants)

    # Copying variant.
    w4 = w3.standardize_words(inplace=False, lower=True)
    assert w4['spider'][0] == 1

    # In-place variant.
    w3.standardize_words(inplace=True, lower=True)
    assert w3['spider'][0] == 1
def evaluate_ana(wv, w2i, vocab):
    """
    Run the MSR, Google and SemEval-2012 analogy evaluations on L2-normalized vectors

    Parameters
    ----------
    wv : array, shape: (n_words, dim)
      Word vectors, one row per word.

    w2i : dict
      Maps a word to its row index in wv.

    vocab : iterable
      Words to include in the Embedding passed to the SemEval-2012 evaluation.
    """
    # Scale every row to unit L2 norm. (A zero matrix used to be allocated here
    # and immediately overwritten.)
    row_norms = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    # The SemEval evaluation takes an Embedding rather than a raw matrix.
    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}
    evaluate_analogy_semeval2012(Embedding.from_dict(wv_dict))

    # analogy_tasks = {
    #     "Google": fetch_google_analogy(),
    #     "MSR": fetch_msr_analogy()
    # }
    # analogy_results = {}
    # for name, data in iteritems(analogy_tasks):
    #     analogy_results[name] = evaluate_analogy(w, data.X, data.y)
    #     print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
def checkpoint(self, epoch, sess):
    """
    Computes intrinsic scores for the embeddings and, when the average score
    improves on the best one seen so far, dumps the embeddings and saves the model

    Parameters
    ----------
    epoch:     Current epoch number (not used in the body)
    sess:      Tensorflow session object

    Returns
    -------
    """
    embed_matrix, context_matrix = sess.run([self.embed_matrix, self.context_matrix])

    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)

    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    self.logger.info('Current Score: {}'.format(curr_int))

    if curr_int > self.best_int_avg:
        self.logger.info("Saving embedding matrix")
        # The context manager closes (and flushes) the file; it was left open before.
        with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
            for wid, wrd in self.id2voc.items():
                values = ' '.join([str(round(v, 6)) for v in embed_matrix[wid].tolist()])
                f.write('{} {}\n'.format(wrd, values))

        self.saver.save(sess=sess, save_path=self.save_path)
        self.best_int_avg = curr_int
def test_categorization():
    """Purity on ESSLI_2c with method "all" and seed 777 must reach at least 0.2."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    data = fetch_ESSLI_2c()
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    purity = evaluate_categorization(w, data.X, data.y, seed=777, method="all")
    assert purity >= 0.2
def evaluate_synonyms(e, problems):
    """
    Score an embedding on multiple-choice synonym questions

    For each question the option with the smallest cosine distance to the question
    word is selected and compared with the expected option index.

    Parameters
    ----------
    e : Embedding
      Embedding to evaluate. If falsy, a random 10-dimensional embedding over all
      words in `problems` is generated (debugging aid).

    problems : iterable of (question, options, answer)
      question is a word, options a list of words, answer the index of the
      correct option.

    Returns
    -------
    score : float
      Fraction of correctly answered questions; 0.0 when there are no questions.
    """
    correct = 0
    total = 0

    # debugging...
    if not e:
        all_words = np.concatenate([[q] + o for q, o, _ in problems])
        e = Embedding.from_dict({w: np.random.random(10) for w in all_words})

    # Options missing from the embedding are represented by the mean vector.
    meanvec = np.mean(e.vectors, axis=0)

    for question, options, answer in problems:
        if question in e:
            print('question: ' + question)
            print(options)
            q_v = e[question].reshape(1, -1)
            q_ops = np.vstack([e[op] if op in e else meanvec for op in options])
            distances = cdist(q_v, q_ops, metric='cosine')[0]
            selected = np.argsort(distances)[0]
            if selected == answer:
                correct += 1
        # NOTE(review): questions missing from the embedding are counted in the
        # denominator here — confirm this matches the intended scoring.
        total += 1

    # Guard against an empty problem list (previously a ZeroDivisionError).
    if total == 0:
        return 0.0
    return correct * 1. / total
def evaluate_similarity(w, X, y): """ Calculate Spearman correlation between cosine similarity of the model and human rated similarity of word pairs Parameters ---------- w : Embedding or dict Embedding or dict instance. X: array, shape: (n_samples, 2) Word pairs y: vector, shape: (n_samples,) Human ratings Returns ------- cor: float Spearman correlation """ if isinstance(w, dict): w = Embedding.from_dict(w) total_words = 0 missing_words = 0 words = w.vocabulary.word_id for query in X: for query_word in query: total_words += 1 if query_word not in words: missing_words += 1 if missing_words > 0: logger.info( "Missing {} words out of {} total words in test ({}% of words are missing)." .format(missing_words, total_words, missing_words / total_words * 100.0)) #logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words)) ''' mean_vector = np.mean(w.vectors, axis=0, keepdims=True) A = np.vstack(w.get(word, mean_vector) for word in X[:, 0]) B = np.vstack(w.get(word, mean_vector) for word in X[:, 1]) scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)]) return scipy.stats.spearmanr(scores, y).correlation ''' words = zip(X[:, 0], X[:, 1]) A = [] B = [] new_y = [] for (w1, w2), score in zip(words, y): if w1 in w and w2 in w: A.append(w[w1]) B.append(w[w2]) new_y.append(score) A = np.vstack(A) B = np.vstack(B) y = np.vstack(new_y) assert len(A) == len(B) == len(y) scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)]) return scipy.stats.spearmanr(scores, y).correlation
def evaluate_on_all(w):
    """
    Evaluate Embedding on the similarity benchmark configured here (WS353)

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      Single-row DataFrame with one column per benchmark.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {"WS353": fetch_WS353()}

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        score = evaluate_similarity(w, data.X, data.y)
        similarity_results[name] = score
        logger.info("Spearman correlation of scores on {} {}".format(name, score))

    return pd.DataFrame([similarity_results])
def evaluate_simi(wv, w2i, vocab):
    """
    Print the Spearman correlation of the embedding on several similarity benchmarks

    Parameters
    ----------
    wv : array, shape: (n_words, dim)
      Word vectors, one row per word.

    w2i : dict
      Maps a word to its row index in wv.

    vocab : iterable
      Words to include in the evaluated Embedding.
    """
    # Build the Embedding directly; the former isinstance() guard was always true
    # and the loop variable doubled as the result name.
    w = Embedding.from_dict({word: wv[w2i[word], :] for word in vocab})

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    method : {"add", "mul"}
      Method to use when finding analogy answer, see "Improving Distributional Similarity
      with Lessons Learned from Word Embeddings"

    X : array-like, shape (n_samples, 3)
      Analogy questions.

    y : array-like, shape (n_samples, )
      Analogy answers.

    k : int, default: None
      If not None will select k top most frequent words from embedding

    batch_size : int, default: 100
      Increase to increase memory consumption and decrease running time

    category : list, default: None
      Category of each example, if passed function returns accuracy per category
      in addition to the overall performance.
      Analogy datasets have "category" field that can be supplied here.

    Returns
    -------
    result: float or pandas.DataFrame
      Overall accuracy when `category` is None. Otherwise a DataFrame with columns
      "accuracy", "correct" and "count", indexed by category plus the special
      row "all" summarizing every example.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert category is None or len(category) == y.shape[0], "Passed incorrect category list"

    solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k)
    y_pred = solver.predict(X)
    hits = y_pred == y

    if category is None:
        return np.mean(hits)

    # A plain list compares to a scalar as a single bool, which breaks the masks
    # below; an array gives the intended element-wise comparison.
    category = np.asarray(category)

    results = OrderedDict({"all": np.mean(hits)})
    count = OrderedDict({"all": len(y_pred)})
    correct = OrderedDict({"all": np.sum(hits)})
    for cat in set(category):
        mask = category == cat
        results[cat] = np.mean(hits[mask])
        count[cat] = np.sum(mask)
        correct[cat] = np.sum(hits[mask])

    return pd.concat([pd.Series(results, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")],
                     axis=1)
def test_standardize():
    """``standardize_words`` with clean_words=True keeps 95 words and preserves their vectors."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)

    # Copying variant.
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        cleaned = standardize_string(word, lower=False, clean_words=True)
        if cleaned:
            assert np.array_equal(w[word], w2[cleaned])

    # In-place variant on a freshly loaded embedding.
    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        # NOTE(review): this guard omits clean_words=True, unlike the lookup below — confirm.
        if standardize_string(word, lower=False):
            cleaned = standardize_string(word, lower=False, clean_words=True)
            assert np.array_equal(w[word], w3[cleaned])
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs=None):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vector to unit length

    lower: bool, default: False
      Passed to `standardize_words` as `lower`.

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs: dict, default: None
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
      The loaded embedding, normalized / standardized as requested.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format"

    # None instead of a shared mutable {} default.
    if load_kwargs is None:
        load_kwargs = {}

    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        # The context manager closes the handle, which used to be leaked.
        with open(fname, "rb") as f:
            d = pickle.load(f, encoding='latin1')
        w = Embedding.from_dict(d)

    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between the model's pair scores and human rated
    similarity of word pairs, using only pairs whose two words are in the vocabulary

    The score of a pair is the dot product of its two vectors, which equals the
    cosine similarity only when the embedding is normalized to unit length.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation; NaN when no pair is fully covered by the vocabulary.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Indices of the pairs for which both words are known.
    keep = [i for i in range(len(X)) if X[i, 0] in words and X[i, 1] in words]
    print('exist {} in {}'.format(len(keep), len(X)))

    if not keep:
        # An empty selection cannot be column-indexed; the correlation is undefined.
        return float('nan')

    X = np.array([X[i] for i in keep])
    y = np.array([y[i] for i in keep])

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    # np.vstack needs a sequence; generators are rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_on_semeval_2012_2(w):
    """
    Score embedding on the SemEval-2012 Task 2 data returned by fetch_semeval_2012_2

    For every category the difference of the mean left and mean right prototype
    vectors is compared (dot product) with the left-right difference of each
    question pair, and the resulting scores are rank-correlated with data.y.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.Series
      Results with spearman correlation per broad category with special key "all" for summary
      spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    # Words missing from the embedding fall back to the mean vector.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)

    for c in categories:
        # Get mean of left and right vector
        # (np.vstack needs a sequence; generators are rejected by current NumPy)
        prototypes = data.X_prot[c]
        prot_left = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]), axis=0)
        prot_right = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]), axis=0)

        questions = data.X[c]
        question_left = np.vstack([w.get(word, mean_vector) for word in questions[:, 0]])
        question_right = np.vstack([w.get(word, mean_vector) for word in questions[:, 1]])

        scores = np.dot(prot_left - prot_right, (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results["all"] = sum(sum(v) for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary():
    """Copying ``transform_words`` on a CountedVocabulary keeps the already-stripped variant and its count."""
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    counted = CountedVocabulary(word_count=[
        ('dog', 60),
        ('cat', 50),
        (' pikatchu ', 10),
        ('pikatchu', 10),
        (' cat ', 5),
    ])
    matrix = np.asanyarray([[0, 0, 1],
                            [0, 1, 11],
                            [0, 11, 12],
                            [0, 12, 13],
                            [0, 13, 14]])
    e = Embedding(vocabulary=counted, vectors=matrix)

    pe = e.transform_words(lambda token: token.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    rows = pe.vectors.tolist()
    assert [0, 0, 1] in rows  # 'dog'
    assert [0, 1, 11] in rows  # 'cat'
    assert [0, 12, 13] in rows  # 'pikatchu'

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words
    assert 'pikatchu' in pe.vocabulary.words

    # getstate() is indexed as (words, counts); turn it into a word -> count lookup.
    state = pe.vocabulary.getstate()
    counts = {word: state[1][i] for i, word in enumerate(state[0])}

    # pikatchu
    assert pe.vocabulary.words[2] == 'pikatchu'
    assert np.array_equal(pe.vectors[2], [0, 12, 13])
    assert counts['pikatchu'] == 10

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 1])
    assert counts['dog'] == 60

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 1, 11])
    assert counts['cat'] == 50

    assert type(pe.vocabulary) is CountedVocabulary
def test_similarity():
    """``evaluate_similarity`` must give the same score for an Embedding and the equivalent dict."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    as_dict = dict(zip(w.vocabulary.words, w.vectors))

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(as_dict, data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for dict and Embedding instance"
def test_similarity_norm():
    """``evaluate_similarity`` must give the same score before and after ``normalize_words``."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(w_norm, data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, "evaluate_similarity should return same result for normalized and unnormalized words"
def evaluate_similarity(w, X, y, missing_words='mean'):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    missing_words: {'mean', 'filter_out'}, default: 'mean'
      What to do with words that are not in the embedding: 'mean' replaces them
      with the mean vector, 'filter_out' drops every pair containing one.
      Only consulted when at least one word is missing.

    Returns
    -------
    cor: float
      Spearman correlation

    Raises
    ------
    ValueError
      If words are missing and `missing_words` is not a recognized strategy.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    n_missing_words = count_missing_words(w, X)
    if n_missing_words > 0:
        logger.warning("Missing {} words.".format(n_missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A, B = [], []
    if missing_words == 'mean' or n_missing_words == 0:
        if n_missing_words:
            logger.info("Will replace them with mean vector")
        A = [w.get(word, mean_vector) for word in X[:, 0]]
        B = [w.get(word, mean_vector) for word in X[:, 1]]
    elif missing_words == 'filter_out':
        logger.info("Will ignore them")
        y_filtered = []
        for (a, b), gt in zip(X, y):
            if a not in w or b not in w:
                continue
            A.append(w.get(a, mean_vector))
            B.append(w.get(b, mean_vector))
            y_filtered.append(gt)
        y = np.asarray(y_filtered)
    else:
        # Previously fell through with empty A/B and failed later with an
        # unrelated length-mismatch error.
        raise ValueError(
            "Unrecognized missing_words strategy: {!r}".format(missing_words))

    scores = np.array([cosine_similarity(v1, v2) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id
    missing_words = sum(1 for query in X for query_word in query
                        if query_word not in words)
    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    # Mean vector stands in for every missing word.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # One row per pair member. np.vstack needs a sequence; generators are
    # rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    # Cosine similarity of each pair: dot product divided by the product of norms.
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_word_analogy(wv, w2i, vocab):
    """
    Run the MSR, Google and SemEval-2012 analogy evaluations on L2-normalized vectors

    Parameters
    ----------
    wv : array, shape: (n_words, dim)
      Word vectors, one row per word.

    w2i : dict
      Maps a word to its row index in wv.

    vocab : iterable
      Words to include in the Embedding passed to the SemEval-2012 evaluation.
    """
    # Scale every row to unit L2 norm. (A zero matrix used to be allocated here
    # and immediately overwritten.)
    row_norms = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    # The SemEval evaluation takes an Embedding rather than a raw matrix.
    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}
    evaluate_analogy_semeval2012(Embedding.from_dict(wv_dict))
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes the mean intrinsic score of the embeddings

    Parameters
    ----------
    embed_matrix:  Vectors, indexed by the ids stored in voc2id
    voc2id:        Mapping from word to its id in embed_matrix

    Returns
    -------
    Mean of the benchmark scores returned by evaluate_on_all, each rounded to
    4 decimal places
    """
    # NOTE: the return annotation used to be np.float, an alias that newer NumPy
    # releases no longer provide, which broke the function at definition time.
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)

    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    return np.mean(list(results.values()))
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs, using only pairs whose two
    words are in the vocabulary

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation; NaN when no pair is fully covered by the vocabulary.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Indices of pairs for which every word is known, in input order.
    idx = [i for i, query in enumerate(X)
           if all(query_word in words for query_word in query)]
    if not idx:
        # np.vstack([]) would raise; with nothing to rank the correlation is undefined.
        return float('nan')

    y = y[idx]
    X = X[idx]

    # np.vstack needs a sequence; generators are rejected by current NumPy.
    A = np.vstack([w.get(word) for word in X[:, 0]])
    B = np.vstack([w.get(word) for word in X[:, 1]])

    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Scores are computed for every pair (missing words use the mean vector), but
    only pairs whose two words are in the vocabulary enter the correlation.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Indices of pairs fully covered by the vocabulary.
    idx = [i for i, query in enumerate(X)
           if query[0] in words and query[1] in words]
    missing_words = len(X) - len(idx)
    if missing_words > 0:
        logger.warning("Missing {} pairs. ".format(missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    # np.vstack needs a sequence; generators are rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    print("norms", np.linalg.norm(A[0]), len(idx))
    return scipy.stats.spearmanr(scores[idx], y[idx]).correlation
def evaluate_on_semeval_2012_2(w):
    """
    Score embedding on the SemEval-2012 Task 2 data returned by fetch_semeval_2012_2

    For every category the difference of the mean left and mean right prototype
    vectors is compared (dot product) with the left-right difference of each
    question pair, and the resulting scores are rank-correlated with data.y.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.Series
      Results with spearman correlation per broad category with special key "all" for summary
      spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    # Words missing from the embedding fall back to the mean vector.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)

    for c in categories:
        # Get mean of left and right vector
        # (np.vstack needs a sequence; generators are rejected by current NumPy)
        prototypes = data.X_prot[c]
        prot_left = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]), axis=0)
        prot_right = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]), axis=0)

        questions = data.X[c]
        question_left = np.vstack([w.get(word, mean_vector) for word in questions[:, 0]])
        question_right = np.vstack([w.get(word, mean_vector) for word in questions[:, 1]])

        scores = np.dot(prot_left - prot_right, (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results['all'] = sum(sum(v) for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id
    missing_words = sum(1 for query in X for query_word in query
                        if query_word not in words)
    if missing_words > 0:
        logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words))

    # Mean vector stands in for every missing word.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # One row per pair member. np.vstack needs a sequence; generators are
    # rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    # Cosine similarity of each pair: dot product divided by the product of norms.
    scores = np.array([v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
                       for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y):
    """
    Spearman correlation between the model's cosine similarities and the
    human ratings of the given word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    known = w.vocabulary.word_id
    missing_words = 0
    for pair in X:
        missing_words += sum(1 for token in pair if token not in known)

    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    # Fallback vector for words the embedding does not contain.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    left_rows = [w.get(token, mean_vector) for token in X[:, 0]]
    right_rows = [w.get(token, mean_vector) for token in X[:, 1]]
    A = np.vstack(left_rows)
    B = np.vstack(right_rows)

    cosines = []
    for v1, v2 in zip(A, B):
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        cosines.append(v1.dot(v2.T) / norm_product)
    scores = np.array(cosines)

    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test. A dict is converted with ``Embedding.from_dict``.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering
      (with very crude hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans. It also seeds the shuffling of the samples.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans
    (for categorization tasks available in the package).

    As a side effect ``w.oov`` is reset to 0 before the lookups and its value
    is printed afterwards (presumably ``w.get`` increments it for each
    out-of-vocabulary word -- confirm against the Embedding class).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method"

    # Fallback vector handed to w.get for words without an embedding.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    w.oov = 0
    flat_words = X.flatten()
    # np.vstack requires a real sequence: passing a generator was deprecated
    # in NumPy 1.16 and is rejected by newer releases, so build a list first.
    words = np.vstack([w.get(word, mean_vector) for word in flat_words])
    print('{} oov words out of {}'.format(w.oov, len(flat_words)))

    # Seeded permutation of the sample indices.
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)
    n_clusters = len(set(y))

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        # NOTE(review): recent scikit-learn releases renamed `affinity` to
        # `metric` -- confirm against the pinned scikit-learn version.
        best_purity = calculate_purity(
            y[ids],
            AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity="euclidean",
                                    linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(
                    y[ids],
                    AgglomerativeClustering(n_clusters=n_clusters,
                                            affinity=affinity,
                                            linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(
            y[ids],
            KMeans(random_state=seed, n_init=10,
                   n_clusters=n_clusters).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
def evaluate_on_all(w):
    """
    Run every fast benchmark (similarity, analogy, categorization) on an
    embedding.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate. A dict is converted with ``Embedding.from_dict``.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    def run_benchmarks(tasks, evaluate, message):
        # Evaluate each task in turn, logging its score right after computing it.
        scores = {}
        for task_name, task in tasks.items():
            scores[task_name] = evaluate(w, task.X, task.y)
            logger.info(message.format(task_name, scores[task_name]))
        return scores

    # Similarity benchmarks
    logger.info("Calculating similarity benchmarks")
    similarity_results = run_benchmarks(
        {
            "MEN": fetch_MEN(),
            "WS353": fetch_WS353(),
            "WS353R": fetch_WS353(which="relatedness"),
            "WS353S": fetch_WS353(which="similarity"),
            "SimLex999": fetch_SimLex999(),
            "RW": fetch_RW(),
            "RG65": fetch_RG65(),
            "MTurk": fetch_MTurk(),
        },
        evaluate_similarity,
        "Spearman correlation of scores on {} {}",
    )

    # Analogy benchmarks
    logger.info("Calculating analogy benchmarks")
    analogy_results = run_benchmarks(
        {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
        },
        evaluate_analogy,
        "Analogy prediction accuracy on {} {}",
    )

    # SemEval 2012 task 2 has its own evaluator; only the 'all' entry is kept.
    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", analogy_results["SemEval2012_2"]))

    # Categorization benchmarks
    logger.info("Calculating categorization benchmarks")
    categorization_results = run_benchmarks(
        {
            "AP": fetch_AP(),
            "BLESS": fetch_BLESS(),
            "Battig": fetch_battig(),
            "ESSLI_2c": fetch_ESSLI_2c(),
            "ESSLI_2b": fetch_ESSLI_2b(),
            "ESSLI_1a": fetch_ESSLI_1a(),
        },
        evaluate_categorization,
        "Cluster purity on {} {}",
    )

    # One single-row frame per benchmark family, joined side by side.
    cat_frame = pd.DataFrame([categorization_results])
    sim_frame = pd.DataFrame([similarity_results])
    analogy_frame = pd.DataFrame([analogy_results])
    return cat_frame.join(sim_frame).join(analogy_frame)
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs=None):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance. A dict is converted with
      ``Embedding.from_dict``.

    max_pairs: int, default: 1000
      Each category will be constrained to maximum of max_pairs pairs
      (which results in max_pairs * (max_pairs - 1) examples)

    solver_kwargs: dict, default: None
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words
      in the dictionary. ``None`` is treated as an empty dict.

    Returns
    -------
    results: pandas.DataFrame
      Columns "accuracy", "correct" and "count", indexed by category plus the
      summary rows "wikipedia", "wordnet" and "all".

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Avoid a shared mutable default argument.
    if solver_kwargs is None:
        solver_kwargs = {}

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy = {}
    correct = {}
    count = {}
    for cat in categories:
        X_cat = data.X[data.category == cat]
        X_cat = X_cat[0:max_pairs]
        n_pairs = X_cat.shape[0]

        # Every ordered combination of two different pairs yields one question.
        size = n_pairs * (n_pairs - 1)

        logger.info("Processing {} with {} pairs, {} questions".format(cat, n_pairs, size))

        # For each category construct question-answer pairs
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size,), dtype="object")
        row = 0
        for left, right in product(X_cat, X_cat):
            if not np.array_equal(left, right):
                # Question: both words of `left` plus the first word of
                # `right`; the expected answer is the second word of `right`.
                X[row, 0:2] = left
                X[row, 2] = right[0]
                y[row] = right[1]
                row += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)
        n_correct = float(np.sum(y_pred == y))
        correct[cat] = n_correct
        count[cat] = size
        accuracy[cat] = n_correct / size

    # Add summary results
    correct['wikipedia'] = sum(correct[c] for c in categories if c in data.wikipedia_categories)
    correct['all'] = sum(correct[c] for c in categories)
    correct['wordnet'] = sum(correct[c] for c in categories if c in data.wordnet_categories)

    count['wikipedia'] = sum(count[c] for c in categories if c in data.wikipedia_categories)
    count['all'] = sum(count[c] for c in categories)
    count['wordnet'] = sum(count[c] for c in categories if c in data.wordnet_categories)

    accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia']
    accuracy['all'] = correct['all'] / count['all']
    accuracy['wordnet'] = correct['wordnet'] / count['wordnet']

    return pd.concat([pd.Series(accuracy, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")],
                     axis=1)
def test_semeval_solver():
    """Smoke test: SemEval 2012 task 2 evaluation yields a non-negative 'all' score."""
    # Obtain the word2vec binary through _fetch_file.
    embedding_path = _fetch_file(
        "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1",
        "test",
    )
    embedding = Embedding.from_word2vec(embedding_path, binary=True)
    scores = evaluate_on_semeval_2012_2(embedding)
    assert scores['all'] >= 0, "Should have some results on SemEval2012"
def test_wordrep_solver():
    """Smoke test: WordRep evaluation with two pairs per category reports a non-negative overall accuracy."""
    # Obtain the word2vec binary through _fetch_file.
    embedding_path = _fetch_file(
        "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1",
        "test",
    )
    embedding = Embedding.from_word2vec(embedding_path, binary=True)
    report = evaluate_on_WordRep(embedding, max_pairs=2)
    overall_accuracy = report['accuracy']['all']
    assert overall_accuracy >= 0