def worker(proc_num, queue, out_dir, in_dir):
    """Merge ten yearly co-occurrence matrices into one per-decade matrix.

    Pulls decade start-years off `queue` until it is empty. The word->id
    mapping of the decade's first year (`embed.wi`) seeds `merged_index`;
    `get_index` maps each year's local row/col ids into that merged index
    (presumably extending it for words unseen in year 0 -- TODO confirm
    against get_index's definition). The summed counts, the merged index,
    and its word list are written to `out_dir`.

    NOTE(review): `proc_num` is accepted for signature symmetry with other
    workers but is not used in this function body.
    """
    while True:
        try:
            # Non-blocking get: Empty signals the queue is drained and
            # this worker process should exit.
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        # (merged_row, merged_col) -> co-occurrence count summed over the decade.
        counts = collections.defaultdict(int)
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin",
                                  normalize=False)
            if year == 0:
                # First year's word->id mapping anchors the merged index.
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            # COO format exposes parallel row/col/data arrays for iteration.
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue  # skip explicitly-stored zeros
                # Translate this year's local indices into merged-index space.
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        # Persist both the word->id mapping and the iteration-order word list.
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5): while True: try: year = queue.get(block=False) except Empty: break print proc_num, "Getting counts and matrix year", year embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False) year_words = valid_words[year][:num_words] count_words = set(ioutils.words_above_count(count_dir, year, min_count)) freq = CachedFreqDist( ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")) use_words = list(count_words.intersection(year_words)) embed = embed.get_subembed(use_words, restrict_context=True) sample_corr = min(SAMPLE_MAX / freq.N(), 1.0) print "Sample correction..", sample_corr embed.m = embed.m * sample_corr mat = embed.m.tocoo() print proc_num, "Outputing pairs for year", year with open(out_dir + str(year) + ".tmp.txt", "w") as fp: for i in xrange(len(mat.data)): if i % 10000 == 0: print "Done ", i, "of", len(mat.data) word = embed.iw[mat.row[i]] context = embed.ic[mat.col[i]] if sample != 0: prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) else: prop_keep = 1.0 word = word.encode("utf-8") context = context.encode("utf-8") line = word + " " + context + "\n" for j in xrange(int(mat.data[i] * prop_keep)): fp.write(line) mat = mat.tocsr() print proc_num, "Outputing vocab for year", year with open(out_dir + str(year) + ".vocab", "w") as fp: for word in year_words: if not word in count_words: print >> fp, word.encode("utf-8"), 1 else: print >> fp, word.encode("utf-8"), int( mat[embed.wi[word], :].sum()) print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str( year) + ".txt" os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt") os.remove(out_dir + str(year) + ".tmp.txt")
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5): while True: try: year = queue.get(block=False) except Empty: break print proc_num, "Getting counts and matrix year", year embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False) year_words = valid_words[year][:num_words] count_words = set(ioutils.words_above_count(count_dir, year, min_count)) freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")) use_words = list(count_words.intersection(year_words)) embed = embed.get_subembed(use_words, restrict_context=True) sample_corr = min(SAMPLE_MAX / freq.N(), 1.0) print "Sample correction..", sample_corr embed.m = embed.m * sample_corr mat = embed.m.tocoo() print proc_num, "Outputing pairs for year", year with open(out_dir + str(year) + ".tmp.txt", "w") as fp: for i in xrange(len(mat.data)): if i % 10000 == 0: print "Done ", i, "of", len(mat.data) word = embed.iw[mat.row[i]] context = embed.ic[mat.col[i]] if sample != 0: prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) else: prop_keep = 1.0 word = word.encode("utf-8") context = context.encode("utf-8") line = word + " " + context + "\n" for j in xrange(int(mat.data[i] * prop_keep)): fp.write(line) mat = mat.tocsr() print proc_num, "Outputing vocab for year", year with open(out_dir + str(year) + ".vocab", "w") as fp: for word in year_words: if not word in count_words: print >>fp, word.encode("utf-8"), 1 else: print >>fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum()) print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(year) + ".txt" os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt") os.remove(out_dir + str(year) + ".tmp.txt")
def __init__(self, input_dir, years, top_freq=None, normalize=True):
    """Load one Explicit embedding per year, keyed by year in input order.

    NOTE(review): `top_freq` is accepted but unused in this body; it is
    kept for interface compatibility with callers.
    """
    yearly = collections.OrderedDict()
    for yr in years:
        path = input_dir + "/" + str(yr) + ".bin"
        yearly[yr] = Explicit.load(path, normalize=normalize)
    self.embeds = yearly
import sys import numpy as np from sklearn.decomposition import TruncatedSVD from googlengram import util from vecanalysis.representations.explicit import Explicit INPUT_DIR = "/dfs/scratch0/google_ngrams/5grams_ppmi_lsmooth_fixed/" OUTPUT_DIR = "/dfs/scratch0/google_ngrams/vecs-svd/" INPUT_PATH = INPUT_DIR + "{year}.bin" OUTPUT_PATH = OUTPUT_DIR + "{year}-300vecs" if __name__ == "__main__": year = sys.argv[1] print "Loading embeddings for year", year words = util.load_pickle("/dfs/scratch0/google_ngrams/info/interestingwords.pkl") base_embed = Explicit.load(INPUT_PATH.format(year=year), restricted_context=words) print "SVD for year", year pca = TruncatedSVD(n_components=300) new_mat = pca.fit_transform(base_embed.m) print "Saving year", year np.save(OUTPUT_PATH.format(year=year) + ".npy", new_mat) vocab_outfp = open(OUTPUT_PATH.format(year=year) + ".vocab", "w") words = [word.encode("utf-8") for word in base_embed.iw] vocab_outfp.write(" ".join(words))
import sys import numpy as np from sklearn.decomposition import TruncatedSVD from googlengram import util from vecanalysis.representations.explicit import Explicit INPUT_DIR = "/dfs/scratch0/google_ngrams/5grams_ppmi_lsmooth_fixed/" OUTPUT_DIR = "/dfs/scratch0/google_ngrams/vecs-svd/" INPUT_PATH = INPUT_DIR + '{year}.bin' OUTPUT_PATH = OUTPUT_DIR + '{year}-300vecs' if __name__ == '__main__': year = sys.argv[1] print "Loading embeddings for year", year words = util.load_pickle( "/dfs/scratch0/google_ngrams/info/interestingwords.pkl") base_embed = Explicit.load(INPUT_PATH.format(year=year), restricted_context=words) print "SVD for year", year pca = TruncatedSVD(n_components=300) new_mat = pca.fit_transform(base_embed.m) print "Saving year", year np.save(OUTPUT_PATH.format(year=year) + ".npy", new_mat) vocab_outfp = open(OUTPUT_PATH.format(year=year) + ".vocab", "w") words = [word.encode('utf-8') for word in base_embed.iw] vocab_outfp.write(" ".join(words))