Example No. 1
def do_search(word1):

    if not word1 in search_cache:
        embeddings = helpers.load_embeddings()
        words = word1.split(":")
        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)
        for word2 in words:
            if not word2 in term_cache:
                term_cache[word2] = helpers.get_time_sims(embeddings, word2)
            else:
                print "USING CACHED NEIGHBORS FOR", word2

            time_sims, lookups, nearests, sims = term_cache[word2]

            for word in lookups:
                all_terms[word].append(word2)

            for word in lookups:
                all_sims[word].append(sims[word])

            all_lookups.update(lookups)

        words = all_lookups.keys()
        values = [ all_lookups[word] for word in words ]
        fitted = helpers.fit_tsne(values)


        # we should stitch the arrays together into objects, i guess
        objs = []
        for i in xrange(len(words)):
            word = words[i]
            ww, decade = word.split("|")
            obj = {
                "word" : ww,
                "query" : all_terms[word],
                "year" : int(decade),
                "similarity" : all_sims[word],
                "avg_similarity" : sum(all_sims[word]) / len(all_sims[word]),
                "sum_similarity" : sum(all_sims[word]),
                "position" : {
                    "x" : round(fitted[i][0], 3),
                    "y" : round(fitted[i][1], 3)
                }
            }

            objs.append(obj)

        search_cache[word1] = objs

    return {
        "term" : word1,
        "results" : search_cache[word1]
    }
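`do_search` takes a colon-separated query string, computes time-sliced neighbors for each term via `helpers.get_time_sims`, projects all neighbor vectors with t-SNE, and memoizes both the per-term neighbors (`term_cache`) and the finished result list (`search_cache`). A hedged usage sketch follows; the query string is invented, and it assumes the function plus its module-level caches and `helpers` are importable:

# Hypothetical query; "wireless:broadcast" asks for neighbors of both terms.
response = do_search("wireless:broadcast")
print(response["term"])                  # echoes the raw query string
for obj in response["results"][:5]:
    # one neighbor word in one decade, plus its 2-D t-SNE position
    print(obj["word"], obj["year"], obj["avg_similarity"], obj["position"])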
Example No. 2
def do_search(word1):

    if not word1 in search_cache:
        embeddings = helpers.load_embeddings()
        words = word1.split(":")
        all_lookups = {}
        all_sims = defaultdict(list)
        all_terms = defaultdict(list)
        for word2 in words:
            if not word2 in term_cache:
                term_cache[word2] = helpers.get_time_sims(embeddings, word2)
            else:
                print("USING CACHED NEIGHBORS FOR", word2)

            time_sims, lookups, nearests, sims = term_cache[word2]

            for word in lookups:
                all_terms[word].append(word2)

            for word in lookups:
                all_sims[word].append(sims[word])

            all_lookups.update(lookups)

        words = list(all_lookups.keys())
        values = [ all_lookups[word] for word in words ]
        fitted = helpers.fit_tsne(values)


        # we should stitch the arrays together into objects, i guess
        objs = []
        for i in range(len(words)):
            word = words[i]
            ww, decade = word.split("|")
            obj = {
                "word" : ww,
                "query" : all_terms[word],
                "year" : int(decade),
                "similarity" : all_sims[word],
                "avg_similarity" : sum(all_sims[word]) / len(all_sims[word]),
                "sum_similarity" : sum(all_sims[word]),
                "position" : {
                    "x" : round(fitted[i][0], 3),
                    "y" : round(fitted[i][1], 3)
                }
            }

            objs.append(obj)

        search_cache[word1] = objs

    return {
        "term" : word1,
        "results" : search_cache[word1]
    }
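Because every result object carries the same fields (`word`, `query`, `year`, `similarity`, `avg_similarity`, `sum_similarity`, `position`), downstream code can regroup the list without knowing anything about the caches. A self-contained sketch using made-up sample values shaped like the objects built above:

from collections import defaultdict

# Invented sample data, shaped like the objects produced by do_search.
results = [
    {"word": "wireless", "query": ["broadcast"], "year": 1920,
     "avg_similarity": 0.61, "position": {"x": 1.2, "y": -0.4}},
    {"word": "radio", "query": ["broadcast"], "year": 1950,
     "avg_similarity": 0.74, "position": {"x": 0.3, "y": 0.9}},
]

# Group neighbor words by decade, strongest neighbors first.
by_year = defaultdict(list)
for obj in results:
    by_year[obj["year"]].append(obj)

for year in sorted(by_year):
    ranked = sorted(by_year[year], key=lambda o: o["avg_similarity"], reverse=True)
    print(year, [o["word"] for o in ranked])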
Example No. 3
def __init__(self):
    self.batch_idx = 0
    self.questions = []
    self.responses = []
    self.labels = []
    # Load the pre-trained word embeddings once, then unpack the
    # (question, response, label) triples into parallel lists.
    self.embeddings = helpers.load_embeddings()
    data = helpers.load_data()
    for item in data:
        self.questions.append(item[0])
        self.responses.append(item[1])
        self.labels.append(item[2])
    del data
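Only the constructor appears in this snippet, but the `batch_idx` counter suggests the class hands out mini-batches during training. A hypothetical `next_batch` method consistent with the fields above (not taken from the original project) could look like:

def next_batch(self, batch_size):
    # Slice the parallel lists at the current cursor and advance it,
    # wrapping back to the start once the data is exhausted.
    start = self.batch_idx
    end = start + batch_size
    batch = (self.questions[start:end],
             self.responses[start:end],
             self.labels[start:end])
    self.batch_idx = end if end < len(self.labels) else 0
    return batch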
Example No. 4
import helpers
import sys
from representations.sequentialembedding import SequentialEmbedding
"""
Let's examine the closest neighbors for a word over time
"""

import collections
from sklearn.manifold import TSNE

import numpy as np
import matplotlib.pyplot as plt

WORDS = helpers.get_words()
if __name__ == "__main__":
    embeddings = helpers.load_embeddings()

    for word1 in WORDS:
        time_sims, lookups, nearests, sims = helpers.get_time_sims(
            embeddings, word1)

        helpers.clear_figure()

        # we remove word1 from our words because we just want to plot the different
        # related words
        words = filter(lambda word: word.split("|")[0] != word1,
                       lookups.keys())

        values = [lookups[word] for word in words]
        fitted = helpers.fit_tsne(values)
        if not len(fitted):
            print "Couldn't model word", word1
            continue
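The script imports `TSNE` but delegates the projection to `helpers.fit_tsne`, so that helper is presumably a thin wrapper around scikit-learn's implementation. A plausible sketch, assuming a 2-D projection of the stacked embedding vectors (the real helper may differ):

import numpy as np
from sklearn.manifold import TSNE

def fit_tsne(values):
    # Guess at helpers.fit_tsne: project the embedding vectors to 2-D for plotting.
    if not len(values):
        return []
    return TSNE(n_components=2, random_state=0).fit_transform(np.array(values))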
Example No. 5
#x2 = x[len(q1):]

# The models are not perfectly symmetric in the combination layer, so we can flip the order of the
# questions to synthesize additional training examples
# x1 = np.concatenate((x1_sliced, x2_sliced), axis=0)
# x2 = np.concatenate((x2_sliced, x1_sliced), axis=0)
# y = np.concatenate((y, y), axis=0)
# x1_lengths = np.concatenate((q1_lengths, q2_lengths), axis=0)
# x2_lengths = np.concatenate((q2_lengths, q1_lengths), axis=0)

# Create word embeddings
print "Loading word embeddings..."
vocab_dict = vocab_processor.vocabulary_._mapping
#print vocab_dict
pretrained_embeddings = helpers.load_embeddings(FLAGS.embeddings_file,
                                                vocab_dict,
                                                FLAGS.embedding_dim,
                                                FLAGS.use_cached_embeddings)

# Randomly shuffle data
print "Shuffling data..."
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))

x1_shuffled = x1[shuffle_indices]
x2_shuffled = x2[shuffle_indices]
y_shuffled = y[shuffle_indices]
q1_lengths_shuffled = x1_lengths[shuffle_indices]
q2_lengths_shuffled = x2_lengths[shuffle_indices]

# Split train/test set
print "Splitting training/dev..."