def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-disps.pkl")
def merge_year_counts(out_dir, name_list, years):
    for year in years:
        year_counts = {}
        year_doc_counts = {}
        year_pos = {}
        for name in name_list:
            tmp_year_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            tmp_year_doc_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-doc_counts.pkl")
            tmp_year_pos = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-pos.pkl")
            for word, count in tmp_year_counts.iteritems():
                if not word in year_counts:
                    year_counts[word] = 0
                    year_doc_counts[word] = 0
                    year_pos[word] = collections.Counter()
                year_counts[word] += tmp_year_counts[word]
                year_doc_counts[word] += tmp_year_doc_counts[word]
                counter_keys = tmp_year_pos[word].keys()
                for pos in counter_keys:
                    year_pos[word][pos] += tmp_year_pos[word][pos]
        print "Writing merged counts for " + str(year) + " ..."
        ioutils.write_pickle(year_counts, out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts, out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos, out_dir + str(year) + "-pos.pkl")
    print "Deleting temp dirs ..."
    remove_tmp_dirs(out_dir, name_list)
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = sparse_io.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index),
                        indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count
        print proc_num, "Writing counts for year", year
        sparse_io.export_mats_from_dicts({str(year): fixed_counts}, out_dir)
def make_word_list(type):
    process_word = lambda word: word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    proper_nouns = load_pickle(PROPER_NOUNS)
    word_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                  if word != "" and word.isalnum()]
    nstop_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                   if not process_word(word) in STOPWORDS
                   if word != "" and word.isalnum()]
    nproper_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                     if not process_word(word) in proper_nouns
                     if word != "" and word.isalnum()]
    nstop_nproper_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                           if not process_word(word) in proper_nouns and not process_word(word) in STOPWORDS
                           if word != "" and word.isalnum()]
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = matstore.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index),
                        indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count
        print proc_num, "Writing counts for year", year
        matstore.export_mats_from_dicts({str(year): fixed_counts}, out_dir)
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}  # This is the new co-occurrence matrix
        old_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin").todok()
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():  # Iterates through the unmerged co-occurrence matrix ...
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            try:
                new_pair = (indexing.word_to_static_id(i_word, merged_index),
                            indexing.word_to_static_id(c_word, merged_index))
            except KeyError:  # Filters out words that should be dropped
                continue
            fixed_counts[new_pair] = count  # ... and adds the counts to the new one
        print proc_num, "Writing counts for year", year
        # Saves the new co-occurrence matrix
        sparse_io.export_mat_from_dict(fixed_counts, out_dir + str(year) + ".bin")
def merge(out_pref, tmp_dir, years):
    net_stats = collections.defaultdict(dict)
    rewire_net_stats = collections.defaultdict(dict)
    for year in years:
        year_stats = ioutils.load_pickle(tmp_dir + str(year) + "-tmp.pkl")
        rewire_year_stats = ioutils.load_pickle(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
        for stat, val in year_stats.iteritems():
            net_stats[stat][year] = val
        for stat, val in rewire_year_stats.iteritems():
            rewire_net_stats[stat][year] = val
        os.remove(tmp_dir + str(year) + "-tmp.pkl")
        os.remove(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
    for stat, year_vals in net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-" + stat + ".pkl")
    for stat, year_vals in rewire_net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-rw-" + stat + ".pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:  # Iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        for year in range(10):  # Iterates through the years of the individual decade
            year_counts = load_pickle(in_dir + str(decade + year) + "-pos.pkl")
            if year == 0:
                # This dict counts the occurrences of each word, distinguished by POS tag
                merged_pos_counts = year_counts
            else:
                # Iterates through the words and adds their occurrences to the merged counter
                for word, pos_counts in year_counts.iteritems():
                    for pos, count in pos_counts.iteritems():
                        if not word in merged_pos_counts:
                            merged_pos_counts[word] = collections.Counter()
                        merged_pos_counts[word][pos] += count
        maj_tags = {}  # Classifies each word by its majority POS tag
        for word, pos_counts in merged_pos_counts.iteritems():
            if len(pos_counts) < 1:
                continue
            max_label = sorted(pos_counts, key=lambda w: pos_counts[w], reverse=True)[0]
            if pos_counts[max_label] > 0.5 * np.sum(pos_counts.values()):
                maj_tags[word] = max_label
            else:
                maj_tags[word] = "AMB"
        write_pickle(merged_pos_counts, out_dir + str(decade) + "-pos_counts.pkl")  # Saves the counts
        write_pickle(maj_tags, out_dir + str(decade) + "-pos.pkl")  # Saves the majority tags
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print("Loading data..", year)
        # time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0].decode("utf-8"))
                if freqs[iw[-1]] >= 500:
                    w_mat[i, :] = np.array(list(map(float, line[1:])))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i, :] = np.array(list(map(float, line[1:])))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        counts = collections.defaultdict(int)
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print("Processing decade", decade)
        for year in range(10):
            year_counts = load_pickle(in_dir + str(decade + year) + "-pos.pkl")
            if year == 0:
                # The first year's counts seed the merged counter, so they are not added twice
                merged_pos_counts = year_counts
            else:
                for word, pos_counts in year_counts.items():
                    for pos, count in pos_counts.items():
                        if not word in merged_pos_counts:
                            merged_pos_counts[word] = collections.Counter()
                        merged_pos_counts[word][pos] += count
        maj_tags = {}
        for word, pos_counts in merged_pos_counts.items():
            if len(pos_counts) < 1:
                continue
            max_label = sorted(pos_counts, key=lambda w: pos_counts[w], reverse=True)[0]
            if pos_counts[max_label] > 0.5 * np.sum(list(pos_counts.values())):
                maj_tags[word] = max_label
            else:
                maj_tags[word] = "AMB"
        write_pickle(merged_pos_counts, out_dir + str(decade) + "-pos_counts.pkl")
        write_pickle(maj_tags, out_dir + str(decade) + "-pos.pkl")
def merge(out_pref, tmp_out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jvols.pkl")
        disp_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jdisps.pkl")
        for word in word_list:
            vol_yearstats[word][year] = vol_yearstat[word]
            disp_yearstats[word][year] = disp_yearstat[word]
        os.remove(tmp_out_pref + str(year) + "-jvols.pkl")
        os.remove(tmp_out_pref + str(year) + "-jdisps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-jvols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-jdisps.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, vocab_dir, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = ioutils.load_word_list(vocab_dir + str(year) + ".vocab")
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputting pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(np.ceil(mat.data[i] * prop_keep))):
                    fp.write(line)
        print "shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt"
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
        # time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0].decode("utf-8"))
                if freqs[iw[-1]] >= 500:
                    w_mat[i, :] = np.array(map(float, line[1:]))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i, :] = np.array(map(float, line[1:]))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir, use_words, count_dir, num_sam, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        # Loads the embedding and its count data
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        # Restricts the vocabulary to the given words
        embed = embed.get_subembed(use_words, restrict_context=True)
        counts = ioutils.load_pickle(count_dir + "/" + str(year) + "-counts.pkl")
        all_count = sum(counts.values())
        mat = embed.m.tocoo()
        print proc_num, "Outputting pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):  # Iterates through the observed word pairs
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / counts[word] * all_count), 1.0)
                    prop_keep *= min(np.sqrt(sample / counts[context] * all_count), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                # Writes the word pair as many times as needed
                for j in xrange(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputting vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in use_words:
                print >> fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt"
        # Samples randomly from the word pairs as many times as requested
        os.system("shuf " + out_dir + str(year) + ".tmp.txt -r -n " + str(num_sam) + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
def make_word_list(type):
    process_word = lambda word: word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    proper_nouns = load_pickle(PROPER_NOUNS)
    word_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                  if word != "" and word.isalnum()]
    nstop_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                   if not process_word(word) in STOPWORDS
                   if word != "" and word.isalnum()]
    nproper_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                     if not process_word(word) in proper_nouns
                     if word != "" and word.isalnum()]
    nstop_nproper_lists = [word for word in sorted(freqs, key=lambda val: -1 * freqs[val])
                           if not process_word(word) in proper_nouns and not process_word(word) in STOPWORDS
                           if word != "" and word.isalnum()]
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
        # time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        text2numpy(dir, freqs, year)
def get_extra_vectors(year, train_dir, print_dir, queries_list):
    extra_vectors = []
    vocab = ioutils.load_pickle(train_dir + VOCAB_FILE.format(year=year))
    print_vectors = load_file_lines(print_dir + TRAINED_VEC_FILE.format(year=year))
    for i, w in enumerate(queries_list):
        if w.decode("utf-8") not in vocab:
            extra_vectors.append(print_vectors[i + 1])
    return extra_vectors
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")
    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
def merge(word_list, years, in_dir, out_file):
    yearstats = {}
    for word in word_list:
        yearstats[word] = {}
    for year in years:
        yearstat = ioutils.load_pickle(in_dir + str(year) + "-freqstmp.pkl")
        for word in yearstat.keys():
            yearstats[word][year] = yearstat[word]
        os.remove(in_dir + str(year) + "-freqstmp.pkl")
    ioutils.write_pickle(yearstats, out_file)
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)
    word_lists = {}
    word_set = set([])
    for year in years:
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")
    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")
    ioutils.write_pickle(word_lists, out_pref + ".pkl")
def worker(proc_num, queue, out_dir, input_dir, out_suffix):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print proc_num, "Cleaning vocab of year", year
        vocab_list = ioutils.load_pickle(input_dir + VOCAB_FILE.format(year=year))
        cleaned_vocab_list = remove_non_alph(vocab_list)
        ioutils.write_list(out_dir + str(year) + out_suffix, cleaned_vocab_list)
def numpy2text(vec_path, year, extension):
    vocab_list = load_pickle(vec_path + str(year) + "-vocab.pkl")
    w_mat = np.load(vec_path + str(year) + "-w.npy")
    vocab_size = len(vocab_list)
    dim = len(w_mat[0])
    ioutils.write_list(vec_path + str(year) + ".vocab", vocab_list)
    with open(vec_path + str(year) + "-w" + extension, "w") as fp:
        print >> fp, str(vocab_size), str(dim)
        for i, w in enumerate(vocab_list):
            print >> fp, w.encode("utf-8"), " ".join(map(str, w_mat[i, :]))
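# A minimal sketch (not part of the pipeline) of reading back the word2vec-style text
# file that numpy2text writes: a "vocab_size dim" header followed by "word v1 v2 ..."
# rows. The helper name is hypothetical; it assumes numpy is imported as np, as in the
# surrounding scripts.
def read_text_vectors(path):
    with open(path) as fp:
        vocab_size, dim = map(int, fp.readline().split())
        mat = np.zeros((vocab_size, dim))
        words = []
        for i, line in enumerate(fp):
            parts = line.rstrip("\n").split()
            words.append(parts[0])
            mat[i, :] = np.array(map(float, parts[1:]))
    return words, mat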
def main(proc_num, lock, out_dir, in_dir, years):
    print proc_num, "Start loop"
    years.reverse()
    while True:
        lock.acquire()
        work_left = False
        # Claims the next unprocessed year
        for year in years:
            dirs = set(os.listdir(out_dir))
            # Checks whether this year already exists in the target directory;
            # if it does not, claims it by writing an empty placeholder file
            if str(year) + ".bin" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Merging counts for year", year  # Merging starts here
        full_counts = collections.defaultdict(float)
        merged_index = collections.OrderedDict()
        # Iterates through the alphabetically separated co-occurrence data
        for chunk_num in os.listdir(in_dir):
            chunk_name = in_dir + str(chunk_num) + "/" + str(year) + ".bin"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = sparse_io.retrieve_mat_as_coo(chunk_name)
            chunk_index = ioutils.load_pickle(in_dir + str(chunk_num) + "/index.pkl")
            chunk_index = list(chunk_index)
            # Iterates through the chunk's co-occurrence matrix and adds the count of
            # each word pair to the merged co-occurrence matrix
            for pair, count in chunk_counts.todok().iteritems():
                i_word = chunk_index[pair[0]]
                c_word = chunk_index[pair[1]]
                new_pair = (indexing.word_to_cached_id(i_word, merged_index),
                            indexing.word_to_cached_id(c_word, merged_index))
                full_counts[new_pair] += count
        print proc_num, "Writing counts for year", year
        # Saves the yearly merged co-occurrence matrix ...
        sparse_io.export_mat_from_dict(full_counts, out_dir + str(year) + ".bin")
        # ... and the merged index
        ioutils.write_pickle(merged_index, out_dir + str(year) + "-index.pkl")
        ioutils.write_pickle(list(merged_index), out_dir + str(year) + "-list.pkl")
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
def make_word_list(type):
    process_word = lambda word: word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    for year, year_freqs in freqs.iteritems():
        proper_nouns = load_pickle(PROPER_NOUNS.format(year=year))
        word_lists[year] = [word for word in sorted(year_freqs, key=lambda val: -1 * year_freqs[val])
                            if word != "" and word.isalnum()]
        nstop_lists[year] = [word for word in sorted(year_freqs, key=lambda val: -1 * year_freqs[val])
                             if not process_word(word) in STOPWORDS and not word == "" and word.isalnum()]
        nproper_lists[year] = [word for word in sorted(year_freqs, key=lambda val: -1 * year_freqs[val])
                               if not process_word(word) in proper_nouns and not word == "" and word.isalnum()]
        nstop_nproper_lists[year] = [word for word in sorted(year_freqs, key=lambda val: -1 * year_freqs[val])
                                     if not process_word(word) in proper_nouns and not process_word(word) in STOPWORDS
                                     and not word == "" and word.isalnum()]
        print "Finished year: ", year
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
def main(years, out_dir, in_dir, count_dir, min_count, num_words):
    print "Making common vocab"
    words = ioutils.load_pickle(in_dir + str(years[0]) + "-list.pkl")
    for year in years:
        counts_year = ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")
        # Keeps the num_words most frequent words for this year (sorted by descending count) ...
        use_words = sorted(counts_year.keys(), key=lambda word: counts_year[word], reverse=True)[:num_words]
        # ... that also pass the minimum-count threshold
        use_words = [word for word in use_words if counts_year[word] > min_count]
        i = 0
        while i < len(words):
            if words[i] not in use_words:
                words.pop(i)
                i -= 1
            i += 1
        print year, "vocab, done"
    ioutils.write_pickle(list(words), out_dir + "common_vocab.pkl")
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)  # maps word -> {year: relative frequency}
    word_lists = {}  # maps year -> list of words used that year
    word_set = set([])  # set of all words ever used
    for year in years:  # Collects the word lists
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:  # Collects the relative frequencies
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")
    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")  # Saves the relative frequencies
    ioutils.write_pickle(word_lists, out_pref + ".pkl")  # Saves the word lists
def load_vocabulary(mat, path):
    """Loads the index from path + "-index.pkl" (falling back to the directory-wide
    merged index), sorts the words by their ids, and returns the first mat.shape[0]
    and the first mat.shape[1] words as two separate lists."""
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/merged_index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
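# A minimal usage sketch for load_vocabulary above, assuming the sparse_io helper used
# elsewhere in these scripts; the directory and year in the path are hypothetical.
# It loads one yearly co-occurrence matrix and recovers the row (target) and column
# (context) vocabularies.
#
#     mat = sparse_io.retrieve_mat_as_coo("/path/to/counts/1950.bin").tocsr()
#     iw, ic = load_vocabulary(mat, "/path/to/counts/1950.bin")
#     print iw[:10], ic[:10]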
def run(out_dir, in_dir):
    index = collections.OrderedDict()
    for year in YEARS:
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for i in xrange(len(year_list)):
            word = year_list[i]
            indexing.word_to_cached_id(word, index)
    ioutils.write_pickle(index, out_dir + "merged_index.pkl")
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl")
def worker(proc_num, queue, dir, count_dir, min_count, checkpoints):
    while True:
        if queue.empty():
            break
        year = queue.get()
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        for n in checkpoints:
            out_dir = dir + '{:03d}'.format(n) + "/"
            mkdir(out_dir)
            subprocess.call(['mv', dir + str(year) + '-w.' + '{:03d}'.format(n),
                             out_dir + str(year) + '-w'])
            print "Loading data..", year, "iterations", n
            text2numpy(out_dir, freqs, year)
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print "Proc:", proc_num, "Decade:", decade
        proper_nouns = set([])
        pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl")
        for word, tag in pos_tags.iteritems():
            if tag == "np":
                proper_nouns.add(word)
        write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:  # Iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        counts = collections.defaultdict(int)  # this dict represents the decade's co-occurrence matrix
        for year in range(10):  # Iterates through the years in the decade
            # Loads the embedding for the individual year (its own index is needed here)
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            if os.path.isfile(in_dir + str(decade + year) + "-list.pkl"):
                year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            else:
                year_list = load_pickle(in_dir + "merged_list.pkl")
            mat = embed.m.tocoo()
            # Iterates through the word-context pairs and accumulates their co-occurrence counts
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = word_to_cached_id(year_list[mat.row[i]], merged_index)
                new_col = word_to_cached_id(year_list[mat.col[i]], merged_index)
                counts[(new_row, new_col)] += mat.data[i]  # Adds the co-occurrence to the decade data
            print "Done year ", decade + year
        export_mat_from_dict(counts, out_dir + str(decade) + ".bin")  # Saves the decade's co-occurrence matrix
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")  # Saves the decade's index
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
def worker(proc_num, queue):
    while True:
        try:
            decade = str(queue.get(block=False))
        except Empty:
            break
        print("Proc:", proc_num, "Decade:", decade)
        proper_nouns = set([])
        pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl")
        for word, tag in pos_tags.items():
            if tag == "np":
                proper_nouns.add(word)
        write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
def load_shared_vocabulary(mat, mat_file):
    vocab_file = ""
    i = 0
    path = mat_file.split("/")
    while True:
        if "nppmi" in path[i]:
            break
        vocab_file += "/" + path[i]
        i += 1
    vocab_file += "/5grams/merged_list.pkl"
    shared_vocab = ioutils.load_pickle(vocab_file)
    iw = shared_vocab[:mat.shape[0]]
    ic = shared_vocab[:mat.shape[1]]
    return iw, ic
def main(proc_num, lock, out_dir, in_dir):
    years = YEARS
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "-a.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "-a.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Merging grams for year", year
        year_grams = {}
        for letter in string.ascii_lowercase:
            year_grams[letter] = collections.defaultdict(list)
        for chunk_name in os.listdir(in_dir):
            print "Processing chunk", chunk_name
            chunk_name = in_dir + str(chunk_name) + "/" + str(year) + ".pkl"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = ioutils.load_pickle(chunk_name)
            for word, info_list in chunk_counts.iteritems():
                if word[0] not in year_grams:
                    continue
                for info in info_list:
                    gram = info[0].split("\t")[0]
                    count = info[1]
                    year_grams[word[0]][word].append((gram, count))
        print proc_num, "Writing counts for year", year
        for letter, letter_grams in year_grams.iteritems():
            for word in letter_grams:
                letter_grams[word] = sorted(letter_grams[word], key=lambda info: info[1], reverse=True)
            ioutils.write_pickle(letter_grams, out_dir + str(year) + "-" + letter + ".pkl")
def merge(out_pref, years, full_word_list):
    merged_word_stats = {}
    for stat in STATS:
        merged_word_stats[stat] = {}
        for word in full_word_list:
            merged_word_stats[stat][word] = {}
    for year in years:
        year_stats = ioutils.load_pickle(out_pref + str(year) + "-tmp.pkl")
        for stat, stat_vals in year_stats.iteritems():
            for word in full_word_list:
                if not word in stat_vals:
                    merged_word_stats[stat][word][year] = NAN
                else:
                    merged_word_stats[stat][word][year] = stat_vals[word]
        os.remove(out_pref + str(year) + "-tmp.pkl")
    ioutils.write_pickle(merged_word_stats, out_pref + ".pkl")
def merge_bootstrap(out_pref):
    dir = "/".join(out_pref.split("/")[0:-1])
    bootfiles = os.listdir(dir)
    word_stat_lists = {}
    first_file = True
    file_num = 0
    for file in bootfiles:
        bootstats = ioutils.load_pickle(dir + "/" + file)
        print "Processing file", file
        for stat, stat_vals in bootstats.iteritems():
            if first_file:
                word_stat_lists[stat] = {}
            for word, year_vals in stat_vals.iteritems():
                if first_file:
                    word_stat_lists[stat][word] = {}
                for year, val in year_vals.iteritems():
                    if type(val) == float and np.isnan(val):
                        word_stat_lists[stat][word][year] = float('nan')
                    else:
                        if first_file:
                            word_stat_lists[stat][word][year] = np.empty((val.shape[0] * len(bootfiles)))
                        word_stat_lists[stat][word][year][file_num * val.shape[0]:(file_num + 1) * val.shape[0]] = val[:]
        first_file = False
        file_num += 1
    print "Making means and stds"
    word_stat_means = {}
    word_stat_stds = {}
    for stat, stat_vals in word_stat_lists.iteritems():
        word_stat_means[stat] = {}
        word_stat_stds[stat] = {}
        for word, year_vals in stat_vals.iteritems():
            word_stat_means[stat][word] = {}
            word_stat_stds[stat][word] = {}
            for year, val in year_vals.iteritems():
                if type(val) == float and np.isnan(val):
                    word_stat_means[stat][word][year] = float('nan')
                    word_stat_stds[stat][word][year] = float('nan')
                else:
                    word_stat_means[stat][word][year] = val.mean()
                    word_stat_stds[stat][word][year] = val.std()
    print "Writing data"
    for stat, mean_vals in word_stat_means.iteritems():
        ioutils.write_pickle(mean_vals, out_pref + "-" + stat + "-mean.pkl")
    for stat, std_vals in word_stat_stds.iteritems():
        ioutils.write_pickle(std_vals, out_pref + "-" + stat + "-std.pkl")
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        year_words = valid_words[year][:num_words]
        count_words = set(ioutils.words_above_count(count_dir, year, min_count))
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = list(count_words.intersection(year_words))
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputting pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputting vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in year_words:
                if not word in count_words:
                    print >> fp, word.encode("utf-8"), 1
                else:
                    print >> fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt"
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
def __init__(self, path, normalize=True, eig=0.0, **kwargs):
    ut = np.load(path + '-u.npy')
    s = np.load(path + '-s.npy')
    vocabfile = path + '-vocab.pkl'
    self.iw = load_pickle(vocabfile)
    self.wi = {w: i for i, w in enumerate(self.iw)}
    if eig == 0.0:
        self.m = ut
    elif eig == 1.0:
        self.m = s * ut
    else:
        self.m = np.power(s, eig) * ut
    self.dim = self.m.shape[1]
    if normalize:
        self.normalize()
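# A minimal usage sketch for the constructor above, assuming it belongs to an SVD-style
# embedding class; the class name, path, and query word below are hypothetical. The eig
# parameter weights the left singular vectors by the singular values raised to that power.
#
#     embed = SVDEmbedding("/path/to/vecs/1950", normalize=True, eig=0.5)
#     vec = embed.m[embed.wi["gay"], :]   # row vector for one word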
def get_sorted_words(years, out_dir, in_dir):
    word_freqs = collections.defaultdict(float)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha():
                continue
            word_freqs[word] += float(counts[0]) / sum
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
    out_pref = out_dir + "sortedwords-" + str(years[0]) + "-" + str(years[-1])
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
def main(proc_num, lock, in_dir, years, word_list):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqstmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqstmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        year_freqs = ioutils.load_pickle(in_dir + "/" + str(year) + "-freqs.pkl")
        word_stats = {}
        print proc_num, "Getting stats for year", year
        sum = 0
        for word in word_list:
            if word in year_freqs:
                word_count = year_freqs[word][1]
                sum += word_count
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(sum)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")
def get_sorted_words(years, out_pref, in_dir, avg_thresh, min_thresh):
    stop_set = set(stopwords.words('english'))
    word_freqs = collections.defaultdict(float)
    word_mins = collections.defaultdict(lambda: 1.0)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = float(counts[0]) / sum
            word_freqs[word] += year_freq
            word_mins[word] = min(word_mins[word], year_freq)
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
    sorted_list = [word for word in sorted_list
                   if (word_freqs[word] / float(len(years)) > avg_thresh and word_mins[word] > min_thresh)]
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
parser.add_argument("--num-words", type=int,
                    help="Number of words (of decreasing average frequency) to include. Must also specify word file and index.",
                    default=-1)
parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
parser.add_argument("--thresh", type=float, help="optional threshold", default=None)
args = parser.parse_args()
years = range(args.start_year, args.end_year + 1)
if args.word_file != None:
    if args.index_dir == None:
        print >> sys.stderr, "Must specify index dir with word file!"
        sys.exit()
    word_pickle = ioutils.load_pickle(args.word_file)
    if not args.start_year in word_pickle:
        word_lists = {}
        for year in years:
            word_lists[year] = word_pickle
    else:
        word_lists = word_pickle
    word_infos = {}
    for year, word_list in word_lists.iteritems():
        year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
        if args.num_words != -1:
            word_list = word_list[:args.num_words]
        word_list, word_indices = get_word_indices(word_list, year_index)
        word_infos[year] = (word_list, word_indices)
    outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
    ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
    ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")

def run_parallel(num_procs, years, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=main, args=[i, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang])
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    merge(years, out_pref, out_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get yearly sorted by-frequency list of (non-stop) words and dicts with their frequencies")
    parser.add_argument("out_dir", help="output directory")
    parser.add_argument("in_dir", help="directory with 5 grams and index")
    parser.add_argument("num_procs", type=int, help="num procs")
    parser.add_argument("--start-year", type=int, default=1900, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, default=2000, help="end year (inclusive)")
    parser.add_argument("--freq-thresh", type=int, default=7, help="frequency threshold (neg. power of 10)")
    parser.add_argument("--lang", type=str, default="english", help="language")
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.in_dir + "/merged_list.pkl")
    out_pref = args.out_dir + "/freqnonstop_peryear-" + str(years[0]) + "-" + str(years[-1]) + "-" + str(args.freq_thresh)
    freq_thresh = 10.0 ** (-1.0 * float(args.freq_thresh))
    run_parallel(args.num_procs, years, out_pref, args.out_dir + "/", args.in_dir + "/", index, freq_thresh, args.lang)
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(sum)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")

def run_parallel(num_procs, in_dir, years, word_list, out_file):
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, in_dir, years, word_list]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print "Merging"
    merge(word_list, years, in_dir, out_file)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("out_file", help="path to the merged output file")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    word_list = ioutils.load_pickle(args.word_file)
    run_parallel(args.num_procs, args.in_dir + "/", years, word_list, args.out_file)
if __name__ == "__main__": queue = Queue() for decade in range(1810, 2010, 10): queue.put(decade) procs = [Process(target=worker, args=[i, queue]) for i in range(25)] for p in procs: p.start() for p in procs: p.join() print "Getting full set..." proper_nouns = set([]) pos_counts = {} print "Merging pos counts.." for decade in range(1810, 2010, 10): decade_pos_counts = load_pickle(DATA + str(decade) + "-pos-counts.pkl") for word, counts in decade_pos_counts.iteritems(): if word not in pos_counts: pos_counts[word] = collections.Counter() for pos, count in counts.iteritems(): pos_counts[word][pos] += count write_pickle(pos_counts, DATA + "all-pos-counts.pkl") pos_maj = {} proper_nouns = set([]) for word, p_counts in pos_counts.iteritems(): pos_maj[word] = sorted(p_counts, key = lambda t : -1*p_counts[t])[0] if pos_maj[word] == "np": proper_nouns.add(word) write_pickle(pos_maj, OUT + "all-pos-maj.pkl") write_pickle(proper_nouns, OUT + "proper_nouns.pkl")
from ioutils import load_pickle, write_pickle

DIR = "/dfs/scratch0/COHA/decade_freqs/"

word = {}
lemma = {}
lemma_pos = {}
for year in range(1810, 2010, 10):
    word[year] = load_pickle(DIR + str(year) + "-word.pkl")
    lemma[year] = load_pickle(DIR + str(year) + "-lemma.pkl")
    lemma_pos[year] = load_pickle(DIR + str(year) + "-lemma_pos.pkl")
write_pickle(word, DIR + "word.pkl")
write_pickle(lemma, DIR + "lemma.pkl")
write_pickle(lemma_pos, DIR + "lemma_pos.pkl")
def load_vocabulary(mat, path):
    shared_vocab = list(ioutils.load_pickle(path.split(".")[0] + "-index.pkl"))
    iw = shared_vocab[:mat.shape[0]]
    ic = shared_vocab[:mat.shape[1]]
    return iw, ic
                continue
            word_freqs[word] += 1
            lemma_freqs[lemma] += 1
            lemma_pos_freqs[lemma_pos] += 1
    write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl")
    write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl")
    write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")

if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print "Getting full freqs..."
    word_freqs = Counter()
    lemma_freqs = Counter()
    lemma_pos_freqs = Counter()
    for decade in range(1810, 2010, 10):
        decade = str(decade)
        print decade
        word_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-word.pkl")
        lemma_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma.pkl")
        lemma_pos_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")
    write_pickle(word_freqs, OUT + "full_freqs/word.pkl")
    write_pickle(lemma_freqs, OUT + "full_freqs/lemma.pkl")
    write_pickle(lemma_pos_freqs, OUT + "full_freqs/lemma_pos.pkl")