def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args): first_iter = True base_embed = None for year in years: print "Loading year:", year # for each year year_embed = create_representation(rep_type, in_dir + str(year), **rep_args) # load in embedding pkl year_words = words_above_count( count_dir, year, min_count) # load count pkl, returns only words greater min_count year_embed.get_subembed( year_words ) # keep the embeddings for only the words in year_words, if not out of vocabulary print "Aligning year:", year if first_iter: # for first iteration, our aligned embed is our base embed so basically skip it aligned_embed = year_embed first_iter = False else: aligned_embed = alignment.smart_procrustes_align( base_embed, year_embed) base_embed = aligned_embed print "Writing year:", year foutname = out_dir + str(year) np.save(foutname + "-w.npy", aligned_embed.m) write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    """Run a randomized SVD on the explicit embedding of every queued year.

    Years are pulled from ``queue`` until it is empty. For each year the
    embedding at ``(in_dir + INPUT_FORMAT).format(year=year)`` is loaded,
    restricted to the words that are both above ``min_count`` and among the
    first ``num_words`` entries of ``words[year]``, and factorized with
    ``randomized_svd``. The three factors and the vocabulary are saved under
    the prefix ``(out_dir + OUT_FORMAT).format(year=year, dim=dim)`` with the
    suffixes ``-u.npy``, ``-v.npy``, ``-s.npy`` and ``-vocab.pkl``.

    Args:
        proc_num: Worker identifier; not used by this function.
        queue: Queue of years supporting ``get(block=False)``.
        out_dir: Prefix prepended to ``OUT_FORMAT`` for output paths.
        in_dir: Prefix prepended to ``INPUT_FORMAT`` for the input path.
        count_dir: Forwarded to ``words_above_count``.
        words: Year-indexed container of sliceable word sequences.
        dim: Number of SVD components (``n_components``).
        num_words: How many leading entries of ``words[year]`` to consider.
        min_count: Count threshold forwarded to ``words_above_count``.
    """
    # Local import keeps the block self-contained on both Python versions.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while True:
        # A non-blocking get avoids the empty()/get() race: with several
        # workers, another one can take the last item between the two calls
        # and leave this one blocked forever.
        try:
            year = queue.get(block=False)
        except Empty:
            break

        print("Loading embeddings for year", year)
        # Random pause of up to two minutes before loading
        # (presumably staggers concurrent workers -- confirm).
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        # Use a separate name: rebinding the ``words`` parameter here would
        # break ``words[year]`` for the next year this worker handles.
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print(len(year_words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)

        print("SVD for year", year)
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)

        print("Saving year", year)
        out_prefix = (out_dir + OUT_FORMAT).format(year=year, dim=dim)
        np.save(out_prefix + "-u.npy", u)
        np.save(out_prefix + "-v.npy", v)
        np.save(out_prefix + "-s.npy", s)
        write_pickle(base_embed.iw, out_prefix + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5): while True: try: year = queue.get(block=False) except Empty: break print proc_num, "Getting counts and matrix year", year embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False) year_words = valid_words[year][:num_words] count_words = set(ioutils.words_above_count(count_dir, year, min_count)) freq = CachedFreqDist( ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")) use_words = list(count_words.intersection(year_words)) embed = embed.get_subembed(use_words, restrict_context=True) sample_corr = min(SAMPLE_MAX / freq.N(), 1.0) print "Sample correction..", sample_corr embed.m = embed.m * sample_corr mat = embed.m.tocoo() print proc_num, "Outputing pairs for year", year with open(out_dir + str(year) + ".tmp.txt", "w") as fp: for i in xrange(len(mat.data)): if i % 10000 == 0: print "Done ", i, "of", len(mat.data) word = embed.iw[mat.row[i]] context = embed.ic[mat.col[i]] if sample != 0: prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) else: prop_keep = 1.0 word = word.encode("utf-8") context = context.encode("utf-8") line = word + " " + context + "\n" for j in xrange(int(mat.data[i] * prop_keep)): fp.write(line) mat = mat.tocsr() print proc_num, "Outputing vocab for year", year with open(out_dir + str(year) + ".vocab", "w") as fp: for word in year_words: if not word in count_words: print >> fp, word.encode("utf-8"), 1 else: print >> fp, word.encode("utf-8"), int( mat[embed.wi[word], :].sum()) print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str( year) + ".txt" os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt") os.remove(out_dir + str(year) + ".tmp.txt")
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5): while True: try: year = queue.get(block=False) except Empty: break print proc_num, "Getting counts and matrix year", year embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False) year_words = valid_words[year][:num_words] count_words = set(ioutils.words_above_count(count_dir, year, min_count)) freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")) use_words = list(count_words.intersection(year_words)) embed = embed.get_subembed(use_words, restrict_context=True) sample_corr = min(SAMPLE_MAX / freq.N(), 1.0) print "Sample correction..", sample_corr embed.m = embed.m * sample_corr mat = embed.m.tocoo() print proc_num, "Outputing pairs for year", year with open(out_dir + str(year) + ".tmp.txt", "w") as fp: for i in xrange(len(mat.data)): if i % 10000 == 0: print "Done ", i, "of", len(mat.data) word = embed.iw[mat.row[i]] context = embed.ic[mat.col[i]] if sample != 0: prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) else: prop_keep = 1.0 word = word.encode("utf-8") context = context.encode("utf-8") line = word + " " + context + "\n" for j in xrange(int(mat.data[i] * prop_keep)): fp.write(line) mat = mat.tocsr() print proc_num, "Outputing vocab for year", year with open(out_dir + str(year) + ".vocab", "w") as fp: for word in year_words: if not word in count_words: print >>fp, word.encode("utf-8"), 1 else: print >>fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum()) print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(year) + ".txt" os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt") os.remove(out_dir + str(year) + ".tmp.txt")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    """Align each year's embedding to the previous aligned year and save it.

    Years are processed in iteration order. The first year is written out
    as-is and becomes the initial base; every later year is aligned to the
    previous year's *aligned* result with
    ``alignment.smart_procrustes_align`` and then becomes the new base.

    For each year two files are written:
    ``<out_dir><year>-w.npy`` (the embedding's ``m`` attribute) and
    ``<out_dir><year>-vocab.pkl`` (its ``iw`` attribute).

    Paths are built by plain string concatenation, so ``in_dir`` and
    ``out_dir`` must already contain any trailing separator or file prefix.

    Args:
        years: Iterable of years; each is converted with ``str()`` for paths.
        rep_type: Forwarded to ``create_representation``.
        in_dir: Prefix of the input representation, ``in_dir + str(year)``.
        out_dir: Prefix of the output files.
        count_dir: Forwarded to ``words_above_count``.
        min_count: Forwarded to ``words_above_count`` as the count threshold.
        **rep_args: Extra keyword arguments for ``create_representation``.
    """
    base_embed = None
    for year in years:
        print("Loading year:", year)
        year_embed = create_representation(rep_type, in_dir + str(year), **rep_args)
        year_words = words_above_count(count_dir, year, min_count)
        # Rebind to the filtered embedding: discarding the return value (as
        # the code used to) leaves year_embed unfiltered whenever
        # get_subembed builds a new object. If an implementation filters in
        # place and returns None, keep the object we already have.
        sub_embed = year_embed.get_subembed(year_words)
        if sub_embed is not None:
            year_embed = sub_embed

        print("Aligning year:", year)
        if base_embed is None:
            # First year: nothing to align against, so it is its own result.
            aligned_embed = year_embed
        else:
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed)
        base_embed = aligned_embed

        print("Writing year:", year)
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args): first_iter = True base_embed = None for year in years: print "Loading year:", year year_embed = create_representation(rep_type, in_dir + str(year), **rep_args) year_words = words_above_count(count_dir, year, min_count) year_embed.get_subembed(year_words) print "Aligning year:", year if first_iter: aligned_embed = year_embed first_iter = False else: aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed) base_embed = aligned_embed print "Writing year:", year foutname = out_dir + str(year) np.save(foutname + "-w.npy",aligned_embed.m) write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100): while True: if queue.empty(): break year = queue.get() print "Loading embeddings for year", year time.sleep(random.random() * 120) valid_words = set(words_above_count(count_dir, year, min_count)) print len(valid_words) words = list(valid_words.intersection(words[year][:num_words])) print len(words) base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False) base_embed = base_embed.get_subembed(words, restrict_context=True) print "SVD for year", year u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5) print "Saving year", year np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-u.npy", u) np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v) np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s) write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")