from collections import defaultdict

import helpers

# module-level caches for previously computed searches and per-term neighbors
search_cache = {}
term_cache = {}


def do_search(word1):
    if word1 not in search_cache:
        embeddings = helpers.load_embeddings()
        words = word1.split(":")

        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)

        for word2 in words:
            if word2 not in term_cache:
                term_cache[word2] = helpers.get_time_sims(embeddings, word2)
            else:
                print "USING CACHED NEIGHBORS FOR", word2

            time_sims, lookups, nearests, sims = term_cache[word2]

            for word in lookups:
                all_terms[word].append(word2)

            for word in lookups:
                all_sims[word].append(sims[word])

            all_lookups.update(lookups)

        words = all_lookups.keys()
        values = [all_lookups[word] for word in words]
        fitted = helpers.fit_tsne(values)

        # stitch the parallel arrays together into one result object per key
        objs = []
        for i in xrange(len(words)):
            word = words[i]
            ww, decade = word.split("|")
            obj = {
                "word": ww,
                "query": all_terms[word],
                "year": int(decade),
                "similarity": all_sims[word],
                "avg_similarity": sum(all_sims[word]) / len(all_sims[word]),
                "sum_similarity": sum(all_sims[word]),
                "position": {
                    "x": round(fitted[i][0], 3),
                    "y": round(fitted[i][1], 3)
                }
            }
            objs.append(obj)

        search_cache[word1] = objs

    return {
        "term": word1,
        "results": search_cache[word1]
    }
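# Hedged note (not from the original source): the scripts in this section all
# unpack helpers.get_time_sims() into the same four values. The shapes below
# are inferred purely from how the values are used above; the example contents
# are hypothetical.
#
#   time_sims  -- keyed by year/decade; its length drives the colormap size
#   lookups    -- maps "word|decade" keys to embedding vectors fed to t-SNE
#   nearests   -- nearest-neighbor data (not used in the excerpts shown here)
#   sims       -- maps the same "word|decade" keys to similarity scores
#
# The keys are split the same way do_search() splits them:
#   ww, decade = "krieg|1900".split("|")   # -> ("krieg", "1900")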
from collections import defaultdict

import helpers

# module-level caches for previously computed searches and per-term neighbors
search_cache = {}
term_cache = {}


def do_search(word1):
    if word1 not in search_cache:
        embeddings = helpers.load_embeddings()
        words = word1.split(":")

        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)

        for word2 in words:
            if word2 not in term_cache:
                term_cache[word2] = helpers.get_time_sims(embeddings, word2)
            else:
                print("USING CACHED NEIGHBORS FOR", word2)

            time_sims, lookups, nearests, sims = term_cache[word2]

            for word in lookups:
                all_terms[word].append(word2)

            for word in lookups:
                all_sims[word].append(sims[word])

            all_lookups.update(lookups)

        words = list(all_lookups.keys())
        values = [all_lookups[word] for word in words]
        fitted = helpers.fit_tsne(values)

        # stitch the parallel arrays together into one result object per key
        objs = []
        for i in range(len(words)):
            word = words[i]
            ww, decade = word.split("|")
            obj = {
                "word": ww,
                "query": all_terms[word],
                "year": int(decade),
                "similarity": all_sims[word],
                "avg_similarity": sum(all_sims[word]) / len(all_sims[word]),
                "sum_similarity": sum(all_sims[word]),
                "position": {
                    "x": round(fitted[i][0], 3),
                    "y": round(fitted[i][1], 3)
                }
            }
            objs.append(obj)

        search_cache[word1] = objs

    return {
        "term": word1,
        "results": search_cache[word1]
    }
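# Hedged usage sketch (not part of the original module): assumes the helpers
# module and the caches above are importable and that embeddings are available
# on disk. The query terms are hypothetical; multiple terms are joined with ":".
if __name__ == "__main__":
    response = do_search("bank:geld")
    print(response["term"])
    for entry in response["results"][:5]:
        # each entry describes one neighbor word in one decade
        print(entry["word"], entry["year"], round(entry["avg_similarity"], 3))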
""" Let's examine the closest neighbors for a word over time """ import collections from sklearn.manifold import TSNE import numpy as np import matplotlib.pyplot as plt WORDS = helpers.get_words() if __name__ == "__main__": embeddings = helpers.load_embeddings() for word1 in WORDS: time_sims, lookups, nearests, sims = helpers.get_time_sims( embeddings, word1) helpers.clear_figure() # we remove word1 from our words because we just want to plot the different # related words words = filter(lambda word: word.split("|")[0] != word1, lookups.keys()) values = [lookups[word] for word in words] fitted = helpers.fit_tsne(values) if not len(fitted): print "Couldn't model word", word1 continue cmap = helpers.get_cmap(len(time_sims))
import argparse

import helpers
# Assumed import: SequentialEmbedding comes from the HistWords-style embedding
# code; adjust the module path to wherever it lives in this project.
from sequentialembedding import SequentialEmbedding


def main():
    parser = argparse.ArgumentParser(description="Plot semantic shift of words")
    parser.add_argument('-w', '--words', nargs='+',
                        help='List of words to plot', required=True)
    parser.add_argument("-n", "--neighbors", type=int, default=15,
                        help="Number of neighbors to plot")
    parser.add_argument("--protocol_type", type=str, required=True,
                        help="Run for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)")
    parser.add_argument("--model_folder", type=str, required=False,
                        help="Folder where the word2vec models are located")
    args = parser.parse_args()

    words_to_plot = args.words
    n = args.neighbors

    # both protocol types currently load their models from the supplied folder
    if args.protocol_type == 'RT':
        embeddings = SequentialEmbedding.load(args.model_folder)
    if args.protocol_type == 'BRD':
        embeddings = SequentialEmbedding.load(args.model_folder)

    for word1 in words_to_plot:
        helpers.clear_figure()
        try:
            time_sims, lookups, nearests, sims = helpers.get_time_sims(
                embeddings, word1, topn=n)

            words = list(lookups.keys())
            values = [lookups[word] for word in words]
            fitted = helpers.fit_tsne(values)
            if not len(fitted):
                print(f"Couldn't model word {word1}")
                continue

            # draw the words onto the graph
            cmap = helpers.get_cmap(len(time_sims))
            annotations = helpers.plot_words(word1, words, fitted, cmap, sims,
                                             len(embeddings.embeds) + 1,
                                             args.protocol_type)
            print(f'Annotations: {annotations}')
            if annotations:
                helpers.plot_annotations(annotations)

            helpers.savefig(word1, args.protocol_type, n)

            for year, sim in time_sims.items():
                print(year, sim)
        except KeyError:
            print(f'{word1} is not in the embedding space.')
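# Hedged entry-point and usage sketch (not shown in the excerpt above): the
# script name, example word, and model path are hypothetical. An equivalent
# shell call would look roughly like:
#   python plot_semantic_shift.py -w geld -n 15 --protocol_type RT --model_folder models/rt
if __name__ == "__main__":
    main()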