import os
from argparse import ArgumentParser

from gensim.models import KeyedVectors

# Project-local helpers assumed importable in this module:
# GlobalAnchors, Jaccard, KendallTau, ProcrustesAligner, load_model.


def rank_by_semantic_shift_degree(model1, model2, top_n=1000):
    """Rank nouns shared by the two models by degree of semantic shift,
    strongest shift (lowest Jaccard similarity) first."""
    vocab1 = list(model1.vocab.keys())[:top_n]
    vocab2 = list(model2.vocab.keys())[:top_n]
    print("Computing the intersection of model1 and model2 vocabularies")
    intersection = set(vocab1) & set(vocab2)
    print("Computing the semantic shift scores")
    intersection_scores = []
    for word in intersection:
        if "_NOUN" in word:
            score = Jaccard(model1, model2, top_n_neighbors=50).get_score(word)
            if score > 0.0:
                intersection_scores.append((word, score))
    # Lowest similarity (strongest shift) first
    intersection_scores.sort(key=lambda pair: pair[1])
    print("All done")
    return intersection_scores
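# Usage sketch (hypothetical model paths; assumes the models were loaded with
# the project's load_model helper and share a "word_POS" vocabulary format):
#
#     m1 = load_model("corpus_1990.bin")
#     m2 = load_model("corpus_2000.bin")
#     for word, score in rank_by_semantic_shift_degree(m1, m2)[:10]:
#         print(word, score)  # the ten most strongly shifted nouns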
def get_algo_by_kind_and_two_models(kind: str, model1: KeyedVectors,
                                    model2: KeyedVectors):
    """Return the shift-detection algorithm instance matching its name."""
    kind = kind.lower()
    if kind == "global_anchors":
        return GlobalAnchors(model1, model2)
    if kind == "jaccard":
        return Jaccard(model1, model2, top_n_neighbors=50)
    if kind == "kendall_tau":
        return KendallTau(model1, model2, top_n_neighbors=50)
    raise ValueError("Unknown algorithm kind: {}".format(kind))
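# Usage sketch: pick an algorithm by name instead of hard-coding a class
# (kind strings as defined above; models assumed already loaded):
#
#     algo = get_algo_by_kind_and_two_models("jaccard", model1, model2)
#     score = algo.get_score("bank_NOUN")  # hypothetical example word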
def main():
    parser = ArgumentParser()
    parser.add_argument('--word', '-w', required=True, help='word to be scored')
    parser.add_argument('--model1', '-m1', required=True,
                        help='path to the first word embedding model')
    parser.add_argument('--model2', '-m2', required=True,
                        help='path to the second word embedding model')
    parser.add_argument('--top-n-neighbors', '-n', dest='topn', type=int, default=50,
                        help='number of word neighbors to analyze '
                             '(optional, used by the Kendall tau and Jaccard '
                             'algorithms, default=50)')
    args = parser.parse_args()

    if not os.path.isfile(args.model1):
        raise FileNotFoundError("File {} not found".format(args.model1))
    if not os.path.isfile(args.model2):
        raise FileNotFoundError("File {} not found".format(args.model2))

    model1 = load_model(args.model1)
    model2 = load_model(args.model2)

    if args.word not in model1.vocab:
        raise ValueError("Word {} not in {}".format(args.word, args.model1))
    if args.word not in model2.vocab:
        raise ValueError("Word {} not in {}".format(args.word, args.model2))

    print("KendallTau score: {} (from -1 to 1)".format(
        KendallTau(model1, model2, top_n_neighbors=args.topn).get_score(args.word)))
    print("Jaccard score: {} (from 0 to 1)".format(
        Jaccard(model1, model2, args.topn).get_score(args.word)))
    print("Global Anchors score: {} (from -1 to 1)".format(
        GlobalAnchors(model1, model2).get_score(args.word)))
    print("Procrustes aligner score: {} (from -1 to 1)".format(
        ProcrustesAligner(model1, model2).get_score(args.word)))

    ranks = rank_by_semantic_shift_degree(model1, model2)
    list_of_pairs = ['{}:{}'.format(word, score) for word, score in ranks[:100]]
    print("\n".join(list_of_pairs))
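# Example invocation (hypothetical script and model file names):
#
#     python3 compare_models.py -w bank_NOUN -m1 model_1990.bin -m2 model_2000.bin -n 50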
def get_mean_dist_jaccard(wordlist, modellist, top_n_neighbors):
    """For each word, compute the mean Jaccard distance (1 - similarity)
    between each pair of chronologically adjacent models."""
    mean_scores = {}
    for word in wordlist:
        distances = 0
        for i in range(len(modellist) - 1):
            score = Jaccard(w2v1=modellist[i], w2v2=modellist[i + 1],
                            top_n_neighbors=top_n_neighbors).get_score(word)
            distances += (1 - score)
        mean_scores[word] = distances / (len(modellist) - 1)
    return mean_scores
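# Sketch (hypothetical chronological model list, e.g. one model per decade):
#
#     models = [load_model(p) for p in ("m1990.bin", "m2000.bin", "m2010.bin")]
#     dists = get_mean_dist_jaccard(["bank_NOUN"], models, top_n_neighbors=50)
#     # values close to 1 mean the word's neighborhood changed a lot per step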
def get_move_from_initial_jaccard(wordlist, modellist, top_n_neighbors):
    """For each word, measure how steadily it drifts away from the first
    model: +1 for every step where similarity to model 0 decreases, -1 where
    it increases, normalized by the number of steps (range: -1 to 1)."""
    move_from_init = {}
    for word in wordlist:
        deltas = 0
        previous = Jaccard(modellist[0], modellist[1],
                           top_n_neighbors=top_n_neighbors).get_score(word)
        for i in range(2, len(modellist)):
            similarity = Jaccard(modellist[0], modellist[i],
                                 top_n_neighbors=top_n_neighbors).get_score(word)
            delta = similarity - previous
            if delta > 0:
                deltas -= 1  # moved back toward the initial neighborhood
            elif delta < 0:
                deltas += 1  # moved further away from the initial model
            previous = similarity
        move_from_init[word] = deltas / (len(modellist) - 2)
    return move_from_init
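# Sketch (same hypothetical model list as above); a value near 1 means the
# word drifted steadily away from its neighborhood in the earliest model,
# a value near -1 means it kept moving back toward it:
#
#     drift = get_move_from_initial_jaccard(["bank_NOUN"], models, top_n_neighbors=50)


# Run the CLI when executed as a script
if __name__ == '__main__':
    main()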