Example #1
def intersection_align(embed1, embed2):
    """ 
        Get the intersection of two embeddings.
        Returns embeddings with common vocabulary and indices.
    """
    # Words from embed2 that also occur in embed1, preserving embed2's order.
    vocab1 = set(embed1.iw)
    common_vocab = [w for w in embed2.iw if w in vocab1]
    newvecs1 = np.empty((len(common_vocab), embed1.m.shape[1]))
    newvecs2 = np.empty((len(common_vocab), embed2.m.shape[1]))
    for i in range(len(common_vocab)):
        newvecs1[i] = embed1.m[embed1.wi[common_vocab[i]]]
        newvecs2[i] = embed2.m[embed2.wi[common_vocab[i]]]
    return Embedding(newvecs1, common_vocab), Embedding(newvecs2, common_vocab)
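Below is a minimal usage sketch. The Embedding class here is a hypothetical stand-in that mirrors only the attributes the snippet relies on (m for the matrix, iw for index-to-word, wi for word-to-index); the real class comes from the surrounding project.

import numpy as np

class Embedding:
    # Hypothetical stand-in; the real project class has more functionality.
    def __init__(self, m, iw, normalize=False):
        self.m = np.asarray(m, dtype=float)              # vocab_size x dim
        self.iw = list(iw)                               # index -> word
        self.wi = {w: i for i, w in enumerate(self.iw)}  # word -> index

e1 = Embedding(np.random.rand(3, 4), ["cat", "dog", "fish"])
e2 = Embedding(np.random.rand(3, 4), ["dog", "bird", "cat"])
a1, a2 = intersection_align(e1, e2)
print(a1.iw)  # ['dog', 'cat']: rows of a1.m and a2.m now match word-for-word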
Example #2
def get_dim_reduced_fixed(self, dim=2, normalize=False, basis=2000):
    # Fit PCA on the basis year only, then project every year into that space.
    pca = PCA(n_components=dim)
    pca.fit(self.embeds[basis].m)
    year_reduced_embeds = collections.OrderedDict()
    for year, embed in self.embeds.items():
        proj_embed_mat = pca.transform(embed.m)
        year_reduced_embeds[year] = Embedding(proj_embed_mat, embed.iw, normalize=normalize)
    return SequentialEmbedding.from_ordered_dict(year_reduced_embeds)
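A short sketch of the design choice, with random matrices standing in for the per-year embedding matrices: fitting PCA once on the basis year and reusing it keeps every year in one shared coordinate frame, whereas a fresh fit_transform per year would yield axes that are not comparable across years.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
years = {1990: rng.normal(size=(100, 50)), 2000: rng.normal(size=(100, 50))}

pca = PCA(n_components=2)
pca.fit(years[2000])  # fit on the basis year only
shared = {y: pca.transform(m) for y, m in years.items()}  # one common 2-D frame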
Example #3
def get_word_path(self, word, n=3, dim=2, num_rand=None, word_list=None, basis_year=2000):
    subembeds = self.get_word_subembeds(word, n=n, num_rand=num_rand, word_list=word_list)
    pca = PCA(n_components=dim)
    basis = pca.fit_transform(subembeds.embeds[basis_year].m)
    basis = Embedding(basis, subembeds.embeds[basis_year].iw, normalize=False)
    word_path = collections.OrderedDict()
    for year, embed in self.embeds.items():
        # represent(word) yields a single vector; PCA.transform expects a 2-D array.
        word_path[year] = pca.transform(embed.represent(word).reshape(1, -1)).flatten()
    return word_path, basis
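A hypothetical plotting helper for the returned word_path, assuming matplotlib is available (plot_word_path is not part of the original code): word_path maps each year to a 2-D point, so the semantic trajectory can be drawn directly.

import matplotlib.pyplot as plt

def plot_word_path(word_path, word):
    # word_path: OrderedDict mapping year -> 2-D numpy point, as returned above.
    xs = [pt[0] for pt in word_path.values()]
    ys = [pt[1] for pt in word_path.values()]
    plt.plot(xs, ys, marker="o")
    for year, x, y in zip(word_path.keys(), xs, ys):
        plt.annotate(str(year), (x, y))
    plt.title("Trajectory of '%s'" % word)
    plt.show()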
Example #4
def linear_align(base_embed, other_embed):
    """
        Align other embedding to base embedding using the best linear transform.
        NOTE: Assumes indices are aligned
    """
    basevecs = base_embed.m
    othervecs = other_embed.m
    # Least-squares fit: project basevecs onto the column space of othervecs.
    fixedvecs = othervecs.dot(np.linalg.pinv(othervecs)).dot(basevecs)
    return Embedding(fixedvecs, other_embed.iw)
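A self-contained check of what this computes, assuming plain numpy arrays: othervecs.dot(pinv(othervecs)).dot(basevecs) is the least-squares approximation of basevecs by a linear map applied to othervecs, so it matches the np.linalg.lstsq solution.

import numpy as np

rng = np.random.default_rng(1)
othervecs = rng.normal(size=(200, 50))
basevecs = rng.normal(size=(200, 50))

# W minimizes ||othervecs @ W - basevecs||_F (ordinary least squares).
W, *_ = np.linalg.lstsq(othervecs, basevecs, rcond=None)
print(np.allclose(othervecs @ W,
                  othervecs @ np.linalg.pinv(othervecs) @ basevecs))  # True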
Example #5
def smart_procrustes_align(base_embed, other_embed, post_normalize=False):
    # Restrict both embeddings to their shared vocabulary before solving.
    in_base_embed, in_other_embed = intersection_align(base_embed, other_embed)
    base_vecs = in_base_embed.m
    other_vecs = in_other_embed.m
    # Orthogonal Procrustes: the optimal rotation is u.dot(v) for the SVD
    # u, s, v of the cross-covariance matrix.
    m = other_vecs.T.dot(base_vecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    return Embedding((other_embed.m).dot(ortho),
                     other_embed.iw,
                     normalize=post_normalize)
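A quick numerical check of the SVD step in isolation, with random matrices in place of the embeddings: if the other matrix is the base matrix rotated by a known orthogonal matrix, u.dot(vh) from the SVD of other_vecs.T.dot(base_vecs) recovers the inverse rotation.

import numpy as np

rng = np.random.default_rng(2)
base = rng.normal(size=(500, 20))
q, _ = np.linalg.qr(rng.normal(size=(20, 20)))  # random orthogonal matrix
other = base @ q                                # base rotated by q

u, _, vh = np.linalg.svd(other.T @ base)
ortho = u @ vh                                  # equals q.T here
print(np.allclose(other @ ortho, base))         # True: rotation undone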
Example #6
def procrustes_align(base_embed, other_embed):
    """ 
        Align other embedding to base embedding via Procrustes.
        Returns best distance-preserving aligned version of other_embed.
        NOTE: Assumes indices are aligned
    """
    # Mean-center both matrices so the alignment ignores translation.
    basevecs = base_embed.m - base_embed.m.mean(0)
    othervecs = other_embed.m - other_embed.m.mean(0)
    # Optimal rotation from the SVD of the cross-covariance matrix.
    m = othervecs.T.dot(basevecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    fixedvecs = othervecs.dot(ortho)
    return Embedding(fixedvecs, other_embed.iw)
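A small check of why both matrices are mean-centered first, again with random stand-ins: centering makes the alignment insensitive to a constant offset, so a rotated-and-shifted copy still aligns exactly with the centered base.

import numpy as np

rng = np.random.default_rng(3)
base = rng.normal(size=(300, 10))
q, _ = np.linalg.qr(rng.normal(size=(10, 10)))
other = base @ q + 5.0                 # rotated and uniformly shifted copy

bc = base - base.mean(0)               # centering removes the shift
oc = other - other.mean(0)
u, _, vh = np.linalg.svd(oc.T @ bc)
print(np.allclose(oc @ (u @ vh), bc))  # True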
Example #7
def get_dim_reduced_avg(self, dim=2, normalize=False):
    # Vocabulary shared by every year.
    align_words = None
    for year, embed in self.embeds.items():
        if align_words is None:
            align_words = set(embed.iw)
        else:
            align_words &= set(embed.iw)
    # Average the per-year matrices over the shared vocabulary.
    avg_mat = None
    for year, embed in self.embeds.items():
        sub_mat = embed.get_subembed(list(align_words)).m
        avg_mat = sub_mat if avg_mat is None else avg_mat + sub_mat
    avg_mat /= float(len(self.embeds))
    # Fit PCA on the averaged matrix, then project every year into that space.
    pca = PCA(n_components=dim)
    pca.fit(avg_mat)
    year_reduced_embeds = collections.OrderedDict()
    for year, embed in self.embeds.items():
        proj_embed_mat = pca.transform(embed.m)
        year_reduced_embeds[year] = Embedding(proj_embed_mat, embed.iw, normalize=normalize)
    return SequentialEmbedding.from_ordered_dict(year_reduced_embeds)
Example #8
def write_vecs(finname, foutname):
    og_embed = Embedding.load(finname, normalize=False)
    red_embed = og_embed.get_subembed(TOP_WORDS)
    np.save(foutname + ".npy", red_embed.m)
    with open(foutname + ".vocab", "w") as outf:
        print(" ".join(red_embed.iw), file=outf)
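A matching read-back sketch (read_vecs is illustrative, not part of the original code), assuming the two files written above, with the vocabulary on one whitespace-separated line as write_vecs produces:

import numpy as np

def read_vecs(foutname):
    m = np.load(foutname + ".npy")
    with open(foutname + ".vocab") as inf:
        iw = inf.read().split()
    assert len(iw) == m.shape[0]  # one vocabulary entry per matrix row
    return m, iw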
Example #9
def __init__(self, years, top_freq=None):
    self.embeds = collections.OrderedDict()
    for year in years:
        self.embeds[year] = Embedding.load(INPUT_PATH.format(year=year))
Example #10
def reduce_dim(embedding, dim=2, post_normalize=False):
    pca = PCA(n_components=dim)
    reduced_vecs = pca.fit_transform(embedding.m)
    return Embedding(reduced_vecs, embedding.iw, normalize=post_normalize)
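A minimal usage sketch with scikit-learn, using a random matrix as a stand-in for embedding.m:

import numpy as np
from sklearn.decomposition import PCA

vecs = np.random.rand(1000, 300)            # stand-in for embedding.m
pca = PCA(n_components=2)
reduced = pca.fit_transform(vecs)
print(reduced.shape)                        # (1000, 2)
print(pca.explained_variance_ratio_.sum())  # variance retained by 2 components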