Exemple #1
0
def do_search(word1):
    """Return the positioned temporal neighbours for a ":"-separated query.

    ``word1`` is split on ":" into individual terms.  For each term the
    4-tuple returned by ``helpers.get_time_sims`` is taken from (or stored
    into) the module-level ``term_cache``.  The per-term ``lookups`` are
    merged, handed to ``helpers.fit_tsne`` and combined into one record per
    ``"word|decade"`` key.  The finished list is memoised in ``search_cache``
    under the full query string.

    Written to run unchanged on Python 2 and Python 3.

    Returns:
        dict: ``{"term": word1, "results": [record, ...]}`` where each record
        has the keys ``word``, ``query``, ``year``, ``similarity``,
        ``avg_similarity``, ``sum_similarity`` and ``position`` (``x``/``y``
        rounded to 3 decimals).
    """
    if word1 not in search_cache:
        # Loaded on first need only: when every term is already in
        # term_cache the embeddings are never touched.
        embeddings = None
        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)

        for term in word1.split(":"):
            if term in term_cache:
                # Single pre-formatted argument so the output is the same
                # under the Python 2 print statement and the Python 3 function.
                print("USING CACHED NEIGHBORS FOR %s" % term)
            else:
                if embeddings is None:
                    embeddings = helpers.load_embeddings()
                term_cache[term] = helpers.get_time_sims(embeddings, term)

            _time_sims, lookups, _nearests, sims = term_cache[term]

            # Record, for every neighbour key, which query term produced it
            # and with what similarity.
            for key in lookups:
                all_terms[key].append(term)
                all_sims[key].append(sims[key])

            all_lookups.update(lookups)

        # Freeze the key order so fitted[i] lines up with keys[i].
        keys = list(all_lookups)
        fitted = helpers.fit_tsne([all_lookups[key] for key in keys])

        # Stitch the parallel arrays together into one record per key.
        objs = []
        for i, key in enumerate(keys):
            # Split on the LAST "|" so a token that itself contains "|"
            # does not break the two-way unpack.
            ww, decade = key.rsplit("|", 1)
            similarities = all_sims[key]
            total = sum(similarities)
            objs.append({
                "word" : ww,
                "query" : all_terms[key],
                "year" : int(decade),
                "similarity" : similarities,
                "avg_similarity" : total / len(similarities),
                "sum_similarity" : total,
                "position" : {
                    "x" : round(fitted[i][0], 3),
                    "y" : round(fitted[i][1], 3)
                }
            })

        search_cache[word1] = objs

    return {
        "term" : word1,
        "results" : search_cache[word1]
    }
Exemple #2
0
def do_search(word1):
    """Return the positioned temporal neighbours for a ":"-separated query.

    ``word1`` is split on ":" into individual terms.  For each term the
    4-tuple returned by ``helpers.get_time_sims`` is taken from (or stored
    into) the module-level ``term_cache``.  The per-term ``lookups`` are
    merged, handed to ``helpers.fit_tsne`` and combined into one record per
    ``"word|decade"`` key.  The finished list is memoised in ``search_cache``
    under the full query string.

    Returns:
        dict: ``{"term": word1, "results": [record, ...]}`` where each record
        has the keys ``word``, ``query``, ``year``, ``similarity``,
        ``avg_similarity``, ``sum_similarity`` and ``position`` (``x``/``y``
        rounded to 3 decimals).
    """
    if word1 not in search_cache:
        # Loaded on first need only: when every term is already in
        # term_cache the embeddings are never touched.
        embeddings = None
        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)

        for term in word1.split(":"):
            if term in term_cache:
                print("USING CACHED NEIGHBORS FOR", term)
            else:
                if embeddings is None:
                    embeddings = helpers.load_embeddings()
                term_cache[term] = helpers.get_time_sims(embeddings, term)

            _time_sims, lookups, _nearests, sims = term_cache[term]

            # Record, for every neighbour key, which query term produced it
            # and with what similarity.
            for key in lookups:
                all_terms[key].append(term)
                all_sims[key].append(sims[key])

            all_lookups.update(lookups)

        # Freeze the key order so fitted[i] lines up with keys[i].
        keys = list(all_lookups)
        fitted = helpers.fit_tsne([all_lookups[key] for key in keys])

        # Stitch the parallel arrays together into one record per key.
        objs = []
        for i, key in enumerate(keys):
            # Split on the LAST "|" so a token that itself contains "|"
            # does not break the two-way unpack.
            ww, decade = key.rsplit("|", 1)
            similarities = all_sims[key]
            total = sum(similarities)
            objs.append({
                "word" : ww,
                "query" : all_terms[key],
                "year" : int(decade),
                "similarity" : similarities,
                "avg_similarity" : total / len(similarities),
                "sum_similarity" : total,
                "position" : {
                    "x" : round(fitted[i][0], 3),
                    "y" : round(fitted[i][1], 3)
                }
            })

        search_cache[word1] = objs

    return {
        "term" : word1,
        "results" : search_cache[word1]
    }
Exemple #3
0
"""
Let's examine the closest neighbors for a word over time
"""

import collections
from sklearn.manifold import TSNE

import numpy as np
import matplotlib.pyplot as plt

# Words to process, as supplied by helpers.get_words().
WORDS = helpers.get_words()
if __name__ == "__main__":
    embeddings = helpers.load_embeddings()

    for word1 in WORDS:
        time_sims, lookups, nearests, sims = helpers.get_time_sims(
            embeddings, word1)

        helpers.clear_figure()

        # we remove word1 from our words because we just want to plot the different
        # related words
        # Built as a real list (not a lazy filter object) so it can be
        # iterated more than once on Python 3 as well as Python 2.
        words = [word for word in lookups if word.split("|")[0] != word1]

        values = [lookups[word] for word in words]
        fitted = helpers.fit_tsne(values)
        if not len(fitted):
            # Single pre-formatted argument: identical output under the
            # Python 2 print statement and the Python 3 print function.
            print("Couldn't model word %s" % word1)
            continue

        cmap = helpers.get_cmap(len(time_sims))
def main():
    """Plot the semantic shift of the words given on the command line.

    Command-line arguments:
        -w/--words        one or more words to plot (required).
        -n/--neighbors    number of neighbours per word (default 15).
        --protocol_type   "RT" or "BRD" (required); anything else is
                          rejected by the argument parser.
        --model_folder    folder passed to ``SequentialEmbedding.load``.

    For every word the figure is cleared, the time similarities are fetched
    with ``helpers.get_time_sims``, the lookups are projected with
    ``helpers.fit_tsne`` and the result is drawn and saved through the
    ``helpers`` plotting functions.  A ``KeyError`` raised while handling a
    word is reported as the word not being in the embedding space and the
    loop moves on to the next word.
    """
    parser = argparse.ArgumentParser(
        description="Plot semantic shift of words")
    parser.add_argument('-w',
                        '--words',
                        nargs='+',
                        help='List of words to plot',
                        required=True)
    # Not required: a required option can never fall back to its default.
    parser.add_argument("-n",
                        "--neighbors",
                        type=int,
                        default=15,
                        help="Number of neighbors to plot",
                        required=False)
    # choices= makes an unsupported protocol type fail at parse time instead
    # of leaving `embeddings` unbound further down.
    parser.add_argument(
        "--protocol_type",
        type=str,
        choices=("RT", "BRD"),
        help=
        "Whether to run test for Reichstagsprotokolle (RT) or Bundestagsprotokolle (BRD)",
        required=True)
    # NOTE(review): optional, yet passed straight to SequentialEmbedding.load
    # below -- confirm that load() copes with None.
    parser.add_argument("--model_folder",
                        type=str,
                        help="Folder where word2vec models are located",
                        required=False)

    args = parser.parse_args()
    words_to_plot = args.words
    n = args.neighbors

    # Both protocol types load their models the same way.
    embeddings = SequentialEmbedding.load(args.model_folder)

    for word1 in words_to_plot:
        helpers.clear_figure()
        try:
            time_sims, lookups, nearests, sims = helpers.get_time_sims(
                embeddings, word1, topn=n)

            words = list(lookups.keys())
            values = [lookups[word] for word in words]
            fitted = helpers.fit_tsne(values)
            if not len(fitted):
                print(f"Couldn't model word {word1}")
                continue

            # draw the words onto the graph
            cmap = helpers.get_cmap(len(time_sims))
            annotations = helpers.plot_words(word1, words, fitted, cmap, sims,
                                             len(embeddings.embeds) + 1,
                                             args.protocol_type)
            print(f'Annotations:{annotations}')

            if annotations:
                helpers.plot_annotations(annotations)

            helpers.savefig(word1, args.protocol_type, n)
            for year, sim in time_sims.items():
                print(year, sim)
        except KeyError:
            print(f'{word1} is not in the embedding space.')
"""
Let's examine the closest neighbors for a word over time
"""

import collections
from sklearn.manifold import TSNE


import numpy as np
import matplotlib.pyplot as plt

WORDS = helpers.get_words()
if __name__ == "__main__":
    embeddings = helpers.load_embeddings()

    for word1 in WORDS:
        time_sims, lookups, nearests, sims = helpers.get_time_sims(embeddings, word1)

        helpers.clear_figure()

        # we remove word1 from our words because we just want to plot the different
        # related words
        words = filter(lambda word: word.split("|")[0] != word1, lookups.keys())

        values = [ lookups[word] for word in words ]
        fitted = helpers.fit_tsne(values)
        if not len(fitted):
            print "Couldn't model word", word1
            continue

        cmap = helpers.get_cmap(len(time_sims))
        annotations = helpers.plot_words(word1, words, fitted, cmap, sims)