def intersec_models(modellist, intersec_vocab):
    """Align every model in *modellist* to the first one over a shared vocabulary.

    :param modellist: list of gensim models; the first element is the anchor
    :param intersec_vocab: vocabulary to intersect all models over
    :return: the same list; models are modified in place by the alignment
    """
    anchor = modellist[0]
    for other in modellist[1:]:
        # intersection_align_gensim works in place, so its return value
        # (the aligned pair) is deliberately discarded here.
        intersection_align_gensim(m1=anchor, m2=other, words=intersec_vocab)
    return modellist
def __init__(self, w2v1, w2v2, assume_vocabs_are_identical=False):
    """Keep a pair of word2vec models, vocab-aligning them first unless told not to.

    :param w2v1: first gensim model
    :param w2v2: second gensim model
    :param assume_vocabs_are_identical: when True, skip the alignment step
        and store the models exactly as given
    """
    if assume_vocabs_are_identical:
        self.w2v1 = w2v1
        self.w2v2 = w2v2
    else:
        # Align shallow copies so the attributes are distinct objects from
        # the caller's arguments. NOTE(review): copy.copy is shallow — the
        # underlying vector arrays may still be shared; confirm if that matters.
        aligned1, aligned2 = intersection_align_gensim(
            copy.copy(w2v1), copy.copy(w2v2))
        self.w2v1 = aligned1
        self.w2v2 = aligned2
def sims_aligned(year, word, *args):
    """Pairwise-align the given models, then gather neighbours of *word*.

    :param year: label passed through unchanged to the caller
    :param word: the word whose neighbours and vectors are collected
    :param args: gensim models, aligned pairwise in place
    :return: (neighbour words from all models, vectors of *word* from every
        model except the first, the first model, word, year)
    """
    neighbours = []
    vectors = []
    # Both helpers mutate the models in place; their return values are unused.
    for first, second in combinations(args, 2):
        intersection_align_gensim(first, second)
        smart_procrustes_align_gensim(first, second)
    for idx, model in enumerate(args):
        neighbours.extend(sim[0] for sim in model.most_similar(word, topn=7))
        if idx:  # the first model's vector is returned as the model itself
            vectors.append(model[word])
    return neighbours, vectors, args[0], word, year
def main():
    """Build a labelled word list of semantically shifted (positive) and
    frequency-matched random (negative) samples for consecutive model pairs,
    and print it to stdout as CSV.

    Side effects: writes 'neg_counts.txt' and 'pos_counts.txt', prints
    per-sample debug info to stderr, and optionally saves a histogram figure.
    """
    # Fix: the original re-ran `import sys` on every iteration of the inner
    # sampling loop; a single function-top import suffices.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--models', nargs='+',
                        help='paths to models to compare pairwise')
    parser.add_argument(
        '--pos-tag', default=None, type=str,
        help='specify this to remove words with other pos tags',
        dest='pos_tag')
    parser.add_argument(
        '--top-n-most-frequent-words', default=None, type=int,
        help='you can specify n so that both positive and negative samples are from '
             'top n most frequent words',
        dest='top_n_most_frequent_words')
    parser.add_argument('--positive-samples', type=int, default=10,
                        help='words that have changed most',
                        dest='positive_samples')
    parser.add_argument('--negative-samples', type=int, default=10,
                        help='randomly samples words',
                        dest='negative_samples')
    parser.add_argument('--shuffle', action='store_true',
                        help='use this argument to shuffle the output')
    # Fix: the original help text concatenated to "...negative samplesand
    # store..." (missing space between the literals).
    parser.add_argument('--savefig', type=str, default=None,
                        help='if specified, this program will plot '
                             'the distributions of positive and negative samples '
                             'and store them in the path specified')
    args = parser.parse_args()

    samples = list()
    labels = list()
    base_year = list()
    ratings = list()
    pos_counts = list()
    neg_counts = list()

    for num, (model1_path, model2_path) in enumerate(
            zip(args.models, args.models[1:])):
        log("{num} / {total} {model1} {model2}".format(
            num=num, total=len(args.models) - 1,
            model1=model1_path, model2=model2_path), end='\r')
        # NOTE(review): 'Done' is logged on every iteration, immediately
        # overwriting the \r progress line above — possibly meant to go after
        # the loop; preserved as found.
        log('Done')
        model1 = load_model(model1_path)
        model2 = load_model(model2_path)
        model1, model2 = intersection_align_gensim(
            model1, model2,
            pos_tag=args.pos_tag,
            top_n_most_frequent_words=args.top_n_most_frequent_words)

        # Positive samples: words reported as most changed between the models.
        global_anchors_result = GlobalAnchors.get_changes(
            model1, model2, args.positive_samples)
        positive_samples = [word for (word, score) in global_anchors_result]

        # Negative samples: for each positive word, draw a random word from
        # the same frequency bin, excluding the positive words themselves.
        possible_negative_samples = set(
            model1.wv.vocab.keys()) - set(positive_samples)
        negative_samples = list()
        for positive_sample in positive_samples:
            number_of_bin = get_bin(positive_sample, model1.vocab)
            eligible_negative_samples = [
                word for word in possible_negative_samples
                if get_bin(word, model1.vocab) == number_of_bin
            ]
            negative_sample = random.choice(eligible_negative_samples)
            negative_samples.append(negative_sample)
            print("Positive sample:", positive_sample, file=sys.stderr)
            print("Negative sample:", negative_sample, file=sys.stderr)
            print("Positive bin:",
                  str(get_bin(positive_sample, model1.vocab)), file=sys.stderr)
            print("Negative bin:",
                  str(get_bin(negative_sample, model1.vocab)), file=sys.stderr)
            print("Positive percentile:",
                  str(get_percentile(positive_sample, model1.vocab)),
                  file=sys.stderr)
            print("Negative percentile:",
                  str(get_percentile(negative_sample, model1.vocab)),
                  file=sys.stderr)
            print("Positive count:",
                  str(get_count(positive_sample, model1.vocab)),
                  file=sys.stderr)
            print("Negative count:",
                  str(get_count(negative_sample, model1.vocab)),
                  file=sys.stderr)
            print("==============================================",
                  file=sys.stderr)

        samples.extend(positive_samples)
        samples.extend(negative_samples)
        # NOTE(review): assumes get_changes returned exactly
        # args.positive_samples words — a shorter result would desynchronise
        # samples and labels; confirm upstream guarantee.
        labels.extend([1] * args.positive_samples)
        labels.extend([0] * args.negative_samples)
        pos_counts.extend(
            [get_count(word, model1.vocab) for word in positive_samples])
        neg_counts.extend(
            [get_count(word, model1.vocab) for word in negative_samples])
        # Positive samples are ranked 1..n by change score; negatives get -1.
        ratings.extend(range(1, args.positive_samples + 1))
        ratings.extend([-1] * args.negative_samples)

        # The base year is parsed from the model path, e.g. 'wordvectors/2010.model'.
        if model1_path.startswith('wordvectors/') and model1_path.endswith(
                '.model'):
            year = int(model1_path[len('wordvectors/'):-len('.model')])
        else:
            raise ValueError(
                "Pattern of {path} is not recognized. Path to model must start with 'wordvectors/'"
                " and end with '.model'. Feel free to change the pattern in the source code."
                .format(path=model1_path))
        base_year.extend([year] *
                         (args.positive_samples + args.negative_samples))

    output = pd.DataFrame({
        "WORD": samples,
        "LABEL": labels,
        'ASSESSOR_LABEL': -1,
        'BASE_YEAR': base_year,
        "RATING": ratings
    })
    if args.shuffle:
        output = output.sample(frac=1).reset_index(
            drop=True)  # this shuffles the dataframe but not its index
    output.index.names = ['ID']
    print(output.to_csv())

    with open('neg_counts.txt', 'w') as f:
        print(neg_counts, file=f)
    with open('pos_counts.txt', 'w') as f:
        print(pos_counts, file=f)

    if args.savefig is not None:
        # Clip the histogram range at the 99th percentile of positive counts
        # so a handful of extremely frequent words do not flatten the plot.
        upper = np.quantile(pos_counts, 0.99)
        plt.hist(pos_counts, alpha=0.5, bins=10, label='positive',
                 range=(0, upper))
        plt.hist(neg_counts, alpha=0.5, bins=10, label='negative',
                 range=(0, upper))
        plt.legend(loc='upper right')
        plt.savefig(args.savefig)
def __init__(self, w2v1: gensim.models.KeyedVectors,
             w2v2: gensim.models.KeyedVectors):
    """Vocab-align the two models, then Procrustes-align the second to the first.

    :param w2v1: first gensim model
    :param w2v2: second gensim model
    """
    aligned_first, aligned_second = intersection_align_gensim(w2v1, w2v2)
    self.w2v1 = aligned_first
    self.w2v2 = aligned_second
    # Procrustes alignment is run on the (already intersected) argument
    # objects; its result is kept separately from the stored pair.
    self.w2v2_changed = smart_procrustes_align_gensim(w2v1, w2v2)
def comparison(w2v1_path: str, w2v2_path: str, top_n_neighbors: int,
               top_n_changed_words: (int, None),
               top_n_most_frequent_words: (int, None), pos_tag: (str, None),
               informative: bool):
    """Load two models from the given paths and compare word meanings across
    their shared vocabulary with four change-detection algorithms.

    :param w2v1_path: the path to the first model
    :param w2v2_path: the path to the second model
    :param top_n_neighbors: we will compare top n neighbors of words
    :param top_n_changed_words: we will output top n most interesting words,
        may be int or None
    :param top_n_most_frequent_words: we will use top n most frequent words
        from each model, may be int or None
    :param pos_tag: specify this to consider only words with a specific pos_tag
    :param informative: if True we use informative_output for printing output
        (more verbose and interpretable)
    :return: None
    """
    w2v1 = load_model(w2v1_path)
    w2v2 = load_model(w2v2_path)
    log("The first model contains {words1} words, e. g. {word1}\n"
        "The second model contains {words2} words, e. g. {word2}".format(
            words1=len(w2v1.wv.vocab),
            words2=len(w2v2.wv.vocab),
            word1=random.choice(list(w2v1.wv.vocab.keys())),
            word2=random.choice(list(w2v2.wv.vocab.keys()))))
    w2v1, w2v2 = intersection_align_gensim(
        w2v1, w2v2,
        pos_tag=pos_tag,
        top_n_most_frequent_words=top_n_most_frequent_words)
    log("After preprocessing, the first model contains {words1} words, e. g. {word1}\n"
        "The second model contains {words2} words, e. g. {word2}".format(
            words1=len(w2v1.wv.vocab),
            words2=len(w2v2.wv.vocab),
            word1=random.choice(list(w2v1.wv.vocab.keys())),
            word2=random.choice(list(w2v2.wv.vocab.keys()))))
    # Run all four algorithms first (in the same order as before), then print.
    jaccard_scores = Jaccard(
        w2v1=w2v1, w2v2=w2v2, top_n_neighbors=top_n_neighbors).get_changes(
            top_n_changed_words=top_n_changed_words)
    kendalltau_scores = KendallTau(
        w2v1=w2v1, w2v2=w2v2, top_n_neighbors=top_n_neighbors).get_changes(
            top_n_changed_words=top_n_changed_words)
    procrustes_scores = ProcrustesAligner(w2v1=w2v1, w2v2=w2v2).get_changes(
        top_n_changed_words=top_n_changed_words)
    anchors_scores = GlobalAnchors(w2v1=w2v1, w2v2=w2v2).get_changes(
        top_n_changed_words=top_n_changed_words)
    labelled_results = zip(
        (jaccard_scores, kendalltau_scores, procrustes_scores, anchors_scores),
        ('JACCARD', 'KENDALL TAU', 'PROCRUSTES', 'GLOBAL ANCHORS'))
    for scores, algorithm_name in labelled_results:
        if informative:
            informative_output(scores, w2v1, w2v2, top_n_neighbors,
                               algorithm_name)
        else:
            simple_output(scores, algorithm_name)
def intersec_models(self, modeldict, intersec_vocab):
    """Align every model in *modeldict* to the 2015 model over a common vocabulary.

    :param modeldict: mapping of year -> gensim model; 2015 is the anchor
    :param intersec_vocab: vocabulary to intersect the models over
    :return: the same mapping; models are modified in place by the alignment
    """
    reference = modeldict.get(2015)
    for year, model in modeldict.items():
        if year == 2015:
            continue  # the anchor is never aligned against itself
        # Alignment happens in place; the returned pair is discarded.
        intersection_align_gensim(m1=reference, m2=model, words=intersec_vocab)
    return modeldict