import collections

import numpy as np
from sklearn.decomposition import PCA

# Embedding and SequentialEmbedding are the wrapper classes defined elsewhere
# in this repo: `m` is the (vocab_size x dim) matrix, `iw` is the index->word
# list, and `wi` maps word -> row index.


def intersection_align(embed1, embed2):
    """
    Get the intersection of two embeddings.
    Returns embeddings with common vocabulary and indices.
    """
    # Keep embed2's word order, restricted to words that also occur in embed1.
    vocab1 = set(embed1.iw)
    common_vocab = [word for word in embed2.iw if word in vocab1]
    newvecs1 = np.empty((len(common_vocab), embed1.m.shape[1]))
    newvecs2 = np.empty((len(common_vocab), embed2.m.shape[1]))
    for i, word in enumerate(common_vocab):
        newvecs1[i] = embed1.m[embed1.wi[word]]
        newvecs2[i] = embed2.m[embed2.wi[word]]
    return Embedding(newvecs1, common_vocab), Embedding(newvecs2, common_vocab)
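# A minimal usage sketch. It assumes only what the function above already
# relies on: an Embedding(matrix, vocab_list) constructor with `iw` holding
# the vocabulary. The toy words and random vectors are made up for
# illustration.
def _demo_intersection_align():
    e1 = Embedding(np.random.randn(3, 4), ["cat", "dog", "fish"])
    e2 = Embedding(np.random.randn(3, 4), ["dog", "bird", "cat"])
    a1, a2 = intersection_align(e1, e2)
    # Both results now share the vocabulary ["dog", "cat"], row-for-row,
    # so their matrices can be compared directly.
    assert a1.iw == a2.iw == ["dog", "cat"]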
# The methods below (those taking `self`) belong to the SequentialEmbedding
# class, which holds one Embedding per year in `self.embeds`.
def get_dim_reduced_fixed(self, dim=2, normalize=False, basis=2000):
    # Fit PCA on a single "basis" year, then project every year into that
    # fixed space.
    pca = PCA(n_components=dim)
    pca.fit(self.embeds[basis].m)
    year_reduced_embeds = collections.OrderedDict()
    for year, embed in self.embeds.items():
        proj_embed_mat = pca.transform(embed.m)
        year_reduced_embeds[year] = Embedding(proj_embed_mat, embed.iw, normalize=normalize)
    return SequentialEmbedding.from_ordered_dict(year_reduced_embeds)
def get_word_path(self, word, n=3, dim=2, num_rand=None, word_list=None, basis_year=2000):
    subembeds = self.get_word_subembeds(word, n=n, num_rand=num_rand, word_list=word_list)
    pca = PCA(n_components=dim)
    basis = pca.fit_transform(subembeds.embeds[basis_year].m)
    basis = Embedding(basis, subembeds.embeds[basis_year].iw, normalize=False)
    word_path = collections.OrderedDict()
    for year, embed in self.embeds.items():
        # represent() returns a single 1-D vector; PCA.transform expects a
        # 2-D array, so reshape to one row before projecting.
        word_path[year] = pca.transform(embed.represent(word).reshape(1, -1)).flatten()
    return word_path, basis
def linear_align(base_embed, other_embed):
    """
    Align other embedding to base embedding using best linear transform.
    NOTE: Assumes indices are aligned
    """
    basevecs = base_embed.m
    othervecs = other_embed.m
    # Least-squares solution: the linear map W minimizing
    # ||othervecs @ W - basevecs||_F is pinv(othervecs) @ basevecs.
    fixedvecs = othervecs.dot(np.linalg.pinv(othervecs)).dot(basevecs)
    return Embedding(fixedvecs, other_embed.iw)
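# A self-contained numpy check of the least-squares identity behind
# linear_align: othervecs @ pinv(othervecs) @ basevecs equals othervecs @ W*,
# where W* = argmin_W ||othervecs @ W - basevecs||_F. The matrix sizes are
# arbitrary.
def _demo_linear_align_is_least_squares():
    rng = np.random.default_rng(1)
    othervecs = rng.standard_normal((40, 8))
    basevecs = rng.standard_normal((40, 8))
    w_star, *_ = np.linalg.lstsq(othervecs, basevecs, rcond=None)
    fixedvecs = othervecs.dot(np.linalg.pinv(othervecs)).dot(basevecs)
    assert np.allclose(fixedvecs, othervecs.dot(w_star))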
def smart_procrustes_align(base_embed, other_embed, post_normalize=False):
    # First restrict both embeddings to their shared vocabulary, then solve
    # the orthogonal Procrustes problem on the common rows.
    in_base_embed, in_other_embed = intersection_align(base_embed, other_embed)
    base_vecs = in_base_embed.m
    other_vecs = in_other_embed.m
    m = other_vecs.T.dot(base_vecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    # Apply the rotation to the *full* other embedding, not just the
    # intersected rows, so no vocabulary is lost.
    return Embedding((other_embed.m).dot(ortho), other_embed.iw, normalize=post_normalize)
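# A hedged usage sketch: aligning each year's embedding to its successor so
# that vectors are comparable across time. The Embedding.load / INPUT_PATH
# pattern mirrors the loader defined further below; the alignment direction
# (earlier years rotated into later ones) is one reasonable choice, not the
# only one.
def _demo_sequential_alignment(years):
    embeds = [Embedding.load(INPUT_PATH.format(year=year)) for year in years]
    # Walk backwards, rotating each earlier year into the next year's space.
    for i in range(len(embeds) - 2, -1, -1):
        embeds[i] = smart_procrustes_align(embeds[i + 1], embeds[i])
    return embeds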
def procrustes_align(base_embed, other_embed):
    """
    Align other embedding to base embeddings via Procrustes.
    Returns best distance-preserving aligned version of other_embed
    NOTE: Assumes indices are aligned
    """
    # Mean-center both matrices, then find the orthogonal matrix that best
    # maps the centered other vectors onto the centered base vectors.
    basevecs = base_embed.m - base_embed.m.mean(0)
    othervecs = other_embed.m - other_embed.m.mean(0)
    m = othervecs.T.dot(basevecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    fixedvecs = othervecs.dot(ortho)
    return Embedding(fixedvecs, other_embed.iw)
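# A self-contained numpy check of the Procrustes math used in both alignment
# functions above (no Embedding objects involved): rotate a random matrix by
# a known orthogonal matrix, then confirm the SVD-based solution recovers it.
def _demo_procrustes_recovers_rotation():
    rng = np.random.default_rng(0)
    base = rng.standard_normal((50, 10))
    q, _ = np.linalg.qr(rng.standard_normal((10, 10)))  # random orthogonal matrix
    other = base.dot(q.T)  # "other" is base rotated away by q.T
    u, _, v = np.linalg.svd(other.T.dot(base))
    ortho = u.dot(v)
    # Rotating back with the recovered matrix reproduces base (up to
    # floating-point error), and ortho is indeed orthogonal.
    assert np.allclose(other.dot(ortho), base)
    assert np.allclose(ortho.T.dot(ortho), np.eye(10))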
def get_dim_reduced_avg(self, dim=2, normalize=False):
    # Restrict to the vocabulary shared by every year.
    first_iter = True
    for year, embed in self.embeds.items():
        if first_iter:
            align_words = set(embed.iw)
            first_iter = False
        else:
            align_words = align_words.intersection(set(embed.iw))
    # Average the per-year matrices over the shared vocabulary.
    first_iter = True
    for year, embed in self.embeds.items():
        if first_iter:
            avg_mat = embed.get_subembed(list(align_words)).m
            first_iter = False
        else:
            avg_mat += embed.get_subembed(list(align_words)).m
    avg_mat /= float(len(self.embeds))
    # Fit PCA on the time-averaged matrix, then project each year into it.
    pca = PCA(n_components=dim)
    pca.fit(avg_mat)
    year_reduced_embeds = collections.OrderedDict()
    for year, embed in self.embeds.items():
        proj_embed_mat = pca.transform(embed.m)
        year_reduced_embeds[year] = Embedding(proj_embed_mat, embed.iw, normalize=normalize)
    return SequentialEmbedding.from_ordered_dict(year_reduced_embeds)
def write_vecs(finname, foutname):
    # TOP_WORDS is a module-level word list defined elsewhere in this repo.
    og_embed = Embedding.load(finname, normalize=False)
    red_embed = og_embed.get_subembed(TOP_WORDS)
    np.save(foutname + ".npy", red_embed.m)
    with open(foutname + ".vocab", "w") as outf:
        print(" ".join(red_embed.iw), file=outf)
def __init__(self, years, top_freq=None):
    # Load one Embedding per year; INPUT_PATH is a module-level path template.
    # (top_freq is accepted for API compatibility but unused here.)
    self.embeds = collections.OrderedDict()
    for year in years:
        self.embeds[year] = Embedding.load(INPUT_PATH.format(year=year))
def reduce_dim(embedding, dim=2, post_normalize=False):
    pca = PCA(n_components=dim)
    reduced_vecs = pca.fit_transform(embedding.m)
    return Embedding(reduced_vecs, embedding.iw, normalize=post_normalize)
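# A quick sanity sketch for reduce_dim, assuming only the Embedding(matrix,
# vocab) constructor used throughout this file: project 300-dimensional
# vectors for five made-up words down to 2 dimensions for plotting.
def _demo_reduce_dim():
    words = ["cat", "dog", "fish", "bird", "horse"]
    embed = Embedding(np.random.randn(len(words), 300), words)
    reduced = reduce_dim(embed, dim=2)
    assert reduced.m.shape == (len(words), 2)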