def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh):
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir + "/volstats/"))
            if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_out_pref + str(year) + "-jvols.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Loading matrices..."
        base = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year - 1) + ".bin", thresh, min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin", thresh, min_size=MIN_SIZE)
        delta = delta.tocsr()
        print proc_num, "Getting deltas..."
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")
        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
def worker(proc_num, queue, dir, count_dir, min_count):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading data..", year
        # time.sleep(120 * random.random())
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        iw = []
        with open(dir + str(year) + "-w.txt") as fp:
            info = fp.readline().split()
            vocab_size = int(info[0])
            dim = int(info[1])
            w_mat = np.zeros((vocab_size, dim))
            for i, line in enumerate(fp):
                line = line.strip().split()
                iw.append(line[0].decode("utf-8"))
                if freqs[iw[-1]] >= 500:
                    w_mat[i, :] = np.array(map(float, line[1:]))
        c_mat = np.zeros((vocab_size, dim))
        with open(dir + str(year) + "-c.txt") as fp:
            fp.readline()
            for i, line in enumerate(fp):
                line = line.strip().split()
                if freqs[line[0]] >= min_count:
                    c_mat[i, :] = np.array(map(float, line[1:]))
        np.save(dir + str(year) + "-w.npy", w_mat)
        np.save(dir + str(year) + "-c.npy", c_mat)
        write_pickle(iw, dir + str(year) + "-vocab.pkl")
def main(out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    year_freqs = {}
    for year in YEARS:
        year_freqs[year] = {}

    print "Start loop"
    for url in urls:
        name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
        print "Downloading", name
        success = False
        while not success:
            with open(out_dir + name + '.csv.zip', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print "Unzipping", name
        subprocess.call(['unzip', '-o', out_dir + name + '.csv.zip', '-d', out_dir])
        subprocess.call(['mv', out_dir + 'googlebooks-' + source + '-' + TYPE + '-' + VERSION + '-' + name + '.csv',
                         out_dir + name])

        print "Going through", name
        with open(out_dir + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    word = split[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.strip("'s")
                    year = int(split[1])
                    count = int(split[2])
                    doc_count = int(split[4])
                    if not year in YEARS:
                        continue
                    if not word in year_freqs[year]:
                        year_freqs[year][word] = (count, doc_count)
                    else:
                        old_counts = year_freqs[year][word]
                        # accumulate (total count, document count) pairs
                        year_freqs[year][word] = (old_counts[0] + count, old_counts[1] + doc_count)
                except UnicodeDecodeError:
                    pass

        print "Deleting", name
        try:
            os.remove(out_dir + name)
            os.remove(out_dir + name + '.csv.zip')
        except:
            pass

    print "Writing..."
    for year in YEARS:
        ioutils.write_pickle(year_freqs[year], out_dir + str(year) + "-freqs.pkl")
def main(proc_num, lock, in_dir, years, word_list, index):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqs.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqs.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        print proc_num, "Making inverse freq mat", year
        mat = mat.tocsr()
        mat = mat / mat.sum()
        word_stats = {}
        print proc_num, "Getting stats for year", year
        for word in word_list:
            word_stats[word] = compute_word_stats(mat, word, index)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        counts = collections.defaultdict(int)
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
def merge(word_list, years, in_dir, out_file):
    yearstats = {}
    for word in word_list:
        yearstats[word] = {}
    for year in years:
        yearstat = ioutils.load_pickle(in_dir + str(year) + "-freqstmp.pkl")
        for word in yearstat.keys():
            yearstats[word][year] = yearstat[word]
        os.remove(in_dir + str(year) + "-freqstmp.pkl")
    ioutils.write_pickle(yearstats, out_file)
def run(out_file, in_dir, years, year_indices):
    samplesizes = {}
    for year in years:
        print "Processing year", year
        indices = year_indices[year]
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        mat = mat[indices, :]
        mat = mat[:, indices]
        samplesizes[year] = mat.sum()
    ioutils.write_pickle(samplesizes, out_file)
def run(out_dir, in_dir):
    index = collections.OrderedDict()
    for year in YEARS:
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for i in xrange(len(year_list)):
            word = year_list[i]
            indexing.word_to_cached_id(word, index)
    ioutils.write_pickle(index, out_dir + "merged_index.pkl")
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl")
def worker(proc_num, queue): while True: try: decade = str(queue.get(block=False)) except Empty: break print "Proc:", proc_num, "Decade:", decade proper_nouns = set([]) pos_tags = load_pickle(DATA + str(decade) + "-pos-maj.pkl") for word, tag in pos_tags.iteritems(): if tag == "np": proper_nouns.add(word) write_pickle(proper_nouns, OUT + str(decade) + "-proper_nouns.pkl")
def main(proc_num, lock, out_dir, in_dir):
    years = YEARS
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "-a.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "-a.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Merging grams for year", year
        year_grams = {}
        for letter in string.ascii_lowercase:
            year_grams[letter] = collections.defaultdict(list)
        for chunk_name in os.listdir(in_dir):
            print "Processing chunk", chunk_name
            chunk_name = in_dir + str(chunk_name) + "/" + str(year) + ".pkl"
            if not os.path.isfile(chunk_name):
                continue
            chunk_counts = ioutils.load_pickle(chunk_name)
            for word, info_list in chunk_counts.iteritems():
                if word[0] not in year_grams:
                    continue
                for info in info_list:
                    gram = info[0].split("\t")[0]
                    count = info[1]
                    year_grams[word[0]][word].append((gram, count))

        print proc_num, "Writing counts for year", year
        for letter, letter_grams in year_grams.iteritems():
            for word in letter_grams:
                letter_grams[word] = sorted(letter_grams[word], key=lambda info: info[1], reverse=True)
            ioutils.write_pickle(letter_grams, out_dir + str(year) + "-" + letter + ".pkl")
def merge(out_pref, years, full_word_list):
    merged_word_stats = {}
    for stat in STATS:
        merged_word_stats[stat] = {}
        for word in full_word_list:
            merged_word_stats[stat][word] = {}
    for year in years:
        year_stats = ioutils.load_pickle(out_pref + str(year) + "-tmp.pkl")
        for stat, stat_vals in year_stats.iteritems():
            for word in full_word_list:
                if not word in stat_vals:
                    merged_word_stats[stat][word][year] = NAN
                else:
                    merged_word_stats[stat][word][year] = stat_vals[word]
        os.remove(out_pref + str(year) + "-tmp.pkl")
    ioutils.write_pickle(merged_word_stats, out_pref + ".pkl")
def merge(out_pref, tmp_dir, years):
    net_stats = collections.defaultdict(dict)
    rewire_net_stats = collections.defaultdict(dict)
    for year in years:
        year_stats = ioutils.load_pickle(tmp_dir + str(year) + "-tmp.pkl")
        rewire_year_stats = ioutils.load_pickle(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
        for stat, val in year_stats.iteritems():
            net_stats[stat][year] = val
        for stat, val in rewire_year_stats.iteritems():
            rewire_net_stats[stat][year] = val
        os.remove(tmp_dir + str(year) + "-tmp.pkl")
        os.remove(tmp_dir + "rewire" + str(year) + "-tmp.pkl")
    for stat, year_vals in net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-" + stat + ".pkl")
    for stat, year_vals in rewire_net_stats.iteritems():
        ioutils.write_pickle(year_vals, out_pref + "-rw-" + stat + ".pkl")
def merge(out_pref, tmp_out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jvols.pkl")
        disp_yearstat = ioutils.load_pickle(tmp_out_pref + str(year) + "-jdisps.pkl")
        for word in word_list:
            vol_yearstats[word][year] = vol_yearstat[word]
            disp_yearstats[word][year] = disp_yearstat[word]
        os.remove(tmp_out_pref + str(year) + "-jvols.pkl")
        os.remove(tmp_out_pref + str(year) + "-jdisps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-jvols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-jdisps.pkl")
def worker(proc_num, queue, out_pref, in_dir, target_lists, context_lists, displacement_base, thresh, year_inc, type):
    time.sleep(10 * random.random())
    while True:
        if queue.empty():
            print proc_num, "Finished"
            break
        year = queue.get()
        print proc_num, "Loading matrices..."
        base = create_representation(type, in_dir + str(year - year_inc),
                                     thresh=thresh, restricted_context=context_lists[year],
                                     normalize=True, add_context=False)
        delta = create_representation(type, in_dir + str(year),
                                      thresh=thresh, restricted_context=context_lists[year],
                                      normalize=True, add_context=False)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, target_lists[year], type)
        year_disp = get_cosine_deltas(displacement_base, delta, target_lists[year], type)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
def merge_bootstrap(out_pref):
    dir = "/".join(out_pref.split("/")[0:-1])
    bootfiles = os.listdir(dir)
    word_stat_lists = {}
    first_file = True
    file_num = 0
    for file in bootfiles:
        bootstats = ioutils.load_pickle(dir + "/" + file)
        print "Processing file", file
        for stat, stat_vals in bootstats.iteritems():
            if first_file:
                word_stat_lists[stat] = {}
            for word, val in stat_vals.iteritems():
                year_vals = stat_vals[word]
                if first_file:
                    word_stat_lists[stat][word] = {}
                for year, val in year_vals.iteritems():
                    if type(val) == float and np.isnan(val):
                        word_stat_lists[stat][word][year] = float('nan')
                    else:
                        if first_file:
                            word_stat_lists[stat][word][year] = np.empty((val.shape[0] * len(bootfiles)))
                        word_stat_lists[stat][word][year][file_num * val.shape[0]:(file_num + 1) * val.shape[0]] = val[:]
        first_file = False
        file_num += 1

    print "Making means and stds"
    word_stat_means = {}
    word_stat_stds = {}
    for stat, stat_vals in word_stat_lists.iteritems():
        word_stat_means[stat] = {}
        word_stat_stds[stat] = {}
        for word, year_vals in stat_vals.iteritems():
            word_stat_means[stat][word] = {}
            word_stat_stds[stat][word] = {}
            for year, val in year_vals.iteritems():
                if type(val) == float and np.isnan(val):
                    word_stat_means[stat][word][year] = float('nan')
                    word_stat_stds[stat][word][year] = float('nan')
                else:
                    word_stat_means[stat][word][year] = val.mean()
                    word_stat_stds[stat][word][year] = val.std()

    print "Writing data"
    for stat, mean_vals in word_stat_means.iteritems():
        ioutils.write_pickle(mean_vals, out_pref + "-" + stat + "-mean.pkl")
    for stat, std_vals in word_stat_stds.iteritems():
        ioutils.write_pickle(std_vals, out_pref + "-" + stat + "-std.pkl")
def worker(proc_num, queue, out_pref, in_dir, word_list, displacement_base, thresh):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Loading matrices..."
        base = simple_create_representation(REP_TYPE, in_dir + str(year - 1) + ".bin",
                                            restricted_context=word_list[year - 1], thresh=thresh)
        delta = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin",
                                             restricted_context=word_list[year], thresh=thresh)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, word_list[year])
        year_disp = get_cosine_deltas(displacement_base, delta, word_list[year])
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args): first_iter = True base_embed = None for year in years: print "Loading year:", year year_embed = create_representation(rep_type, in_dir + str(year), **rep_args) year_words = words_above_count(count_dir, year, min_count) year_embed.get_subembed(year_words) print "Aligning year:", year if first_iter: aligned_embed = year_embed first_iter = False else: aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed) base_embed = aligned_embed print "Writing year:", year foutname = out_dir + str(year) np.save(foutname + "-w.npy",aligned_embed.m) write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
def get_sorted_words(years, out_dir, in_dir):
    word_freqs = collections.defaultdict(float)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha():
                continue
            word_freqs[word] += float(counts[0]) / sum
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
    out_pref = out_dir + "sortedwords-" + str(years[0]) + "-" + str(years[-1])
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args): first_iter = True base_embed = None for year in years: print("Loading year:", year) year_embed = create_representation(rep_type, in_dir + str(year), **rep_args) year_words = words_above_count(count_dir, year, min_count) year_embed.get_subembed(year_words) print("Aligning year:", year) if first_iter: aligned_embed = year_embed first_iter = False else: aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed) base_embed = aligned_embed print("Writing year:", year) foutname = out_dir + str(year) np.save(foutname + "-w.npy",aligned_embed.m) write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def merge(years, out_pref, out_dir): word_freqs = collections.defaultdict(dict) word_lists = {} word_set = set([]) for year in years: word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl") word_set = word_set.union(set(word_lists[year])) os.remove(out_dir + str(year) + "tmp.pkl") for year in years: year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl") for word in word_set: if word not in year_freqs: word_freqs[word][year] = float('nan') else: word_freqs[word][year] = year_freqs[word] os.remove(out_dir + str(year) + "freqstmp.pkl") ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl") ioutils.write_pickle(word_lists, out_pref + ".pkl")
def merge(years, out_pref, out_dir): word_freqs = collections.defaultdict(dict) word_lists = {} word_set = set([]) for year in years: word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl") word_set = word_set.union(set(word_lists[year])) os.remove(out_dir + str(year) + "tmp.pkl") for year in years: year_freqs= ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl") for word in word_set: if word not in year_freqs: word_freqs[word][year] = float('nan') else: word_freqs[word][year] = year_freqs[word] os.remove(out_dir + str(year) + "freqstmp.pkl") ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl") ioutils.write_pickle(word_lists, out_pref + ".pkl")
def main(proc_num, queue, out_dir, download_dir, context_size, is_zipped):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        if is_zipped:
            if not name.endswith((".gz")):
                continue
            print "Unzipping " + name + " ..."
            subprocess.call(['gunzip', '-f', download_dir + name, '-d'])
            name = name.split(".gz")[0]
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)
        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")
        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io_ref.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        os.remove(download_dir + name)
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    while True:
        if queue.empty():
            break
        year = queue.get()
        print "Loading embeddings for year", year
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print len(valid_words)
        # use a per-year list so the `words` dict is not clobbered across iterations
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print len(year_words)
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print "SVD for year", year
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print "Saving year", year
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-u.npy", u)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
        write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")
def worker(proc_num, queue, out_dir, in_dir): while True: try: decade = queue.get(block=False) except Empty: break print "Processing decade", decade for year in range(10): year_counts = load_pickle(in_dir + str(decade + year) + "-counts.pkl") if year == 0: merged_year_counts = year_counts for word, count in year_counts.iteritems(): if not word in merged_year_counts: merged_year_counts[word] = 0 merged_year_counts[word] += year_counts[word] write_pickle(merged_year_counts, out_dir + str(decade) + "-counts.pkl")
def main(years, out_dir, in_dir, count_dir, min_count, num_words):
    print "Making common vocab"
    words = ioutils.load_pickle(in_dir + str(years[0]) + "-list.pkl")
    for year in years:
        counts_year = ioutils.load_pickle(count_dir + str(year) + "-counts.pkl")
        use_words = sorted(counts_year.keys(), key=lambda word: counts_year[word])[:num_words]
        use_words = [word for word in use_words if counts_year[word] > min_count]
        i = 0
        while i < len(words):
            if words[i] not in use_words:
                words.pop(i)
                i -= 1
            i += 1
        print year, "vocab, done"
    ioutils.write_pickle(list(words), out_dir + "common_vocab.pkl")
def main(proc_num, queue, out_dir, in_dir, context_size):
    ioutils.mkdir(out_dir)
    print proc_num, "Start loop"
    while True:  # iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "- Loading mat for year", year
        year_mat = load_matrix(in_dir + str(year) + ".bin")
        index = ioutils.load_pickle(in_dir + str(year) + "-index.pkl")
        print proc_num, "- Processing data for year", year
        counts = year_mat.sum(1) / (2 * context_size)  # sums up the occurrences per word
        counts = {word: int(counts[index[word]]) for word in index if index[word] < len(counts)}
        ioutils.write_pickle(counts, out_dir + "/" + str(year) + "-counts.pkl")  # writes the counts to a file
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    print proc_num, "Start loop"
    time.sleep(10 * random.random())
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = sparse_io.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=5000000)
        print proc_num, "Getting stats for year", year
        year_stats = get_year_stats(mat, year_index_infos[year]["index"], year_index_infos[year]["list"],
                                    index_set=set(year_index_infos[year]["indices"]))
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def main(proc_num, lock, years, out_pref, out_dir, in_dir, index, freq_thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "tmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "tmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        stop_set = set(stopwords.words('english'))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh):
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir + "/volstats/"))
            if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_out_pref + str(year) + "-jvols.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Loading matrices..."
        base = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year - 1) + ".bin",
                                                          thresh, min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin",
                                                           thresh, min_size=MIN_SIZE)
        delta = delta.tocsr()
        print proc_num, "Getting deltas..."
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, knn, thresh):
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Making second orders for year", year
        old_embed = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", thresh=thresh)
        old_embed = old_embed.get_subembed(year_index_infos[year]["list"])
        old_mat = old_embed.m.tocoo()
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat, thresh=thresh, min_cooccurs=0, shrink_mat=False)
        second_mat = coo_matrix((data_d, (row_d, col_d)))
        if knn != None:
            row_d, col_d, data_d = make_knn_mat(second_mat, knn)
            second_mat = coo_matrix((data_d, (row_d, col_d)))
        year_stats = get_year_stats(second_mat, old_embed.wi, old_embed.iw, stats=STATS)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def align_years(years, rep_type, in_dir, out_dir, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:  # iterates through the years
        print "Loading year:", year
        year_embed = create_representation(rep_type, in_dir + str(year), **rep_args)  # loads the individual embedding
        print "Aligning year:", year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            # rotates onto the previous year's embedding
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed, post_normalize=False)
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def merge(years, out_pref, out_dir):
    word_freqs = collections.defaultdict(dict)  # maps each word to its year -> relative-frequency pairs
    word_lists = {}  # maps each year to the list of words used that year
    word_set = set([])  # set of words ever used
    for year in years:  # collects the word lists
        word_lists[year] = ioutils.load_pickle(out_dir + str(year) + "tmp.pkl")
        word_set = word_set.union(set(word_lists[year]))
        os.remove(out_dir + str(year) + "tmp.pkl")
    for year in years:  # collects the relative frequencies
        year_freqs = ioutils.load_pickle(out_dir + str(year) + "freqstmp.pkl")
        for word in word_set:
            if word not in year_freqs:
                word_freqs[word][year] = float('nan')
            else:
                word_freqs[word][year] = year_freqs[word]
        os.remove(out_dir + str(year) + "freqstmp.pkl")
    ioutils.write_pickle(word_freqs, out_pref + "-freqs.pkl")  # saves the relative frequencies
    ioutils.write_pickle(word_lists, out_pref + ".pkl")  # saves the word lists
def main(proc_num, lock, out_pref, tmp_dir, in_dir, years, word_infos, thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            existing_files = set(os.listdir(tmp_dir))
            fname = str(year) + "-tmp.pkl"
            if fname in existing_files:
                continue
            work_left = True
            print proc_num, "year", year
            with open(tmp_dir + fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = matstore.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat.setdiag(0)
        if word_infos != None:
            word_indices = word_infos[year][1]
            indices = word_indices[word_indices < min(mat.shape[1], mat.shape[0])]
        else:
            indices = np.arange(mat.shape[0])
        year_graph = make_snap_graph(indices, mat)
        print proc_num, "Getting statistics for year", year
        year_stats = compute_graph_stats(year_graph)
        rewire_year_stats = compute_graph_stats(snap.GenRewire(year_graph, REWIRE_EDGE_SWITCHES))
        ioutils.write_pickle(year_stats, tmp_dir + fname)
        ioutils.write_pickle(rewire_year_stats, tmp_dir + "rewire" + fname)
def merge_year_counts(out_dir, name_list, years):
    for year in years:
        year_counts = {}
        year_doc_counts = {}
        year_pos = {}
        for name in name_list:
            tmp_year_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            tmp_year_doc_counts = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-doc_counts.pkl")
            tmp_year_pos = ioutils.load_pickle(out_dir + "/" + name + "/" + str(year) + "-pos.pkl")
            for word, count in tmp_year_counts.iteritems():
                if not word in year_counts:
                    year_counts[word] = 0
                    year_doc_counts[word] = 0
                    year_pos[word] = collections.Counter()
                year_counts[word] += tmp_year_counts[word]
                year_doc_counts[word] += tmp_year_doc_counts[word]
                counter_keys = tmp_year_pos[word].keys()
                for pos in counter_keys:
                    year_pos[word][pos] += tmp_year_pos[word][pos]
        print "Writing merged counts for " + str(year) + " ..."
        ioutils.write_pickle(year_counts, out_dir + str(year) + "-counts.pkl")
        ioutils.write_pickle(year_doc_counts, out_dir + str(year) + "-doc_counts.pkl")
        ioutils.write_pickle(year_pos, out_dir + str(year) + "-pos.pkl")
    print "Deleting temp dirs ..."
    remove_tmp_dirs(out_dir, name_list)
def worker(proc_num, queue): while True: try: decade = str(queue.get(block=False)) except Empty: break print("Proc:", proc_num, "Decade:", decade) word_freqs = Counter() lemma_freqs = Counter() lemma_pos_freqs = Counter() for file in os.listdir(DATA + decade): with open(DATA + decade + "/" + file) as fp: print(proc_num, file) fp.readline() for line in fp: word, lemma, lemma_pos, _ = process_lemma_line(line) if word == None: continue if lemma_pos == None: continue word_freqs[word] += 1 lemma_freqs[lemma] += 1 lemma_pos_freqs[lemma_pos] += 1 write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl") write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl") write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")
def worker(proc_num, queue): while True: try: decade = str(queue.get(block=False)) except Empty: break print "Proc:", proc_num, "Decade:", decade word_freqs = Counter() lemma_freqs = Counter() lemma_pos_freqs = Counter() for file in os.listdir(DATA + decade): with open(DATA + decade + "/" + file) as fp: print proc_num, file fp.readline() for line in fp: word, lemma, lemma_pos, _ = process_lemma_line(line) if word == None: continue if lemma_pos == None: continue word_freqs[word] += 1 lemma_freqs[lemma] += 1 lemma_pos_freqs[lemma_pos] += 1 write_pickle(word_freqs, OUT + "decade_freqs/" + decade + "-word.pkl") write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl") write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")
def worker(proc_num, queue, out_dir, in_dir):
    while True:  # iterates through the decades
        try:
            decade = queue.get(block=False)
        except Empty:
            break
        print "Processing decade", decade
        counts = collections.defaultdict(int)  # this dict represents the decade's co-occurrence matrix
        for year in range(10):  # iterates through the years in the decade
            # loads the embedding for the individual year (each year has its own index)
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            if os.path.isfile(in_dir + str(decade + year) + "-list.pkl"):
                year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            else:
                year_list = load_pickle(in_dir + "merged_list.pkl")
            mat = embed.m.tocoo()
            # iterates through the word-context pairs and accumulates their co-occurrence counts
            for i in xrange(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                new_row = word_to_cached_id(year_list[mat.row[i]], merged_index)
                new_col = word_to_cached_id(year_list[mat.col[i]], merged_index)
                counts[(new_row, new_col)] += mat.data[i]  # adds the co-occurrence count to the decade totals
            print "Done year ", decade + year
        export_mat_from_dict(counts, out_dir + str(decade) + ".bin")  # saves the decade-level co-occurrence matrix
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")  # saves the decade-level index
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
def main(proc_num, lock, in_dir, years, word_list):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqstmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqstmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        year_freqs = ioutils.load_pickle(in_dir + "/" + str(year) + "-freqs.pkl")
        word_stats = {}
        print proc_num, "Getting stats for year", year
        sum = 0
        for word in word_list:
            if word in year_freqs:
                word_count = year_freqs[word][1]
                sum += word_count
                word_stats[word] = word_count
        for word in word_stats:
            word_stats[word] /= float(sum)
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqstmp.pkl")
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    # random.shuffle(years)  # purpose unclear; left disabled
    print proc_num, "Start loop"
    while True:  # iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        # stop_set = set(stopwords.words(lang))
        word_freqs = {}  # maps word -> relative frequency
        print "Loading mat for year", year
        year_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()  # normalizes the co-occurrence matrix
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha():  # or word in stop_set or len(word) == 1:  # filters out malformed words
                continue
            # because of the normalization, this row sum is the word's relative frequency
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        # sorts by frequency and filters by the threshold
        sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")  # saves the list of words
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")  # saves the relative frequencies
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print "Loading mat for year", year
        try:
            year_mat = sparse_io_ref.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        except (TypeError, ValueError):
            continue
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def get_sorted_words(years, out_pref, in_dir, avg_thresh, min_thresh):
    stop_set = set(stopwords.words('english'))
    word_freqs = collections.defaultdict(float)
    word_mins = collections.defaultdict(lambda: 1.0)
    for year in years:
        print "Processing year", year
        year_freqs = ioutils.load_pickle(in_dir + str(year) + "-freqs.pkl")
        sum = 0.0
        for _, counts in year_freqs.iteritems():
            sum += counts[0]
        for word, counts in year_freqs.iteritems():
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = float(counts[0]) / sum
            word_freqs[word] += year_freq
            word_mins[word] = min(word_mins[word], year_freq)
    print "Writing data"
    sorted_list = sorted(word_freqs.keys(), key=lambda key: word_freqs[key], reverse=True)
    sorted_list = [word for word in sorted_list
                   if (word_freqs[word] / float(len(years)) > avg_thresh and word_mins[word] > min_thresh)]
    out_fp = open(out_pref + ".txt", "w")
    for word in sorted_list:
        out_fp.write(word.encode('utf-8') + " " + str(word_freqs[word] / float(len(years))) + "\n")
    ioutils.write_pickle(sorted_list, out_pref + ".pkl")
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "-vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "-disps.pkl")
def worker(proc_num, queue): while True: try: decade = str(queue.get(block=False)) except Empty: break print("Proc:", proc_num, "Decade:", decade) pos_tags = collections.defaultdict(collections.Counter) for file in os.listdir(DATA + decade): with open(DATA + decade + "/" + file) as fp: print(proc_num, file) fp.readline() for line in fp: word, lemma, lemma_pos, _ = process_lemma_line(line) if word == None: continue if lemma_pos == None: continue pos_tags[word][lemma_pos.split("_")[1]] += 1 write_pickle(pos_tags, OUT + str(decade) + "-pos-counts.pkl") pos_maj = {} for word, pos_counts in pos_tags.items(): pos_maj[word] = sorted(pos_counts, key = lambda t : -1*pos_counts[t])[0] write_pickle(pos_maj, OUT + str(decade) + "-pos-maj.pkl")
def merge(out_pref, years, word_list):
    vol_yearstats = {}
    disp_yearstats = {}
    for word in word_list:
        vol_yearstats[word] = {}
        disp_yearstats[word] = {}
    for year in years:
        vol_yearstat = ioutils.load_pickle(out_pref + str(year) + "-vols.pkl")
        disp_yearstat = ioutils.load_pickle(out_pref + str(year) + "-disps.pkl")
        for word in word_list:
            if word not in vol_yearstat:
                vol = float('nan')
            else:
                vol = vol_yearstat[word]
            if word not in disp_yearstat:
                disp = float('nan')
            else:
                disp = disp_yearstat[word]
            vol_yearstats[word][year] = vol
            disp_yearstats[word][year] = disp
        os.remove(out_pref + str(year) + "-vols.pkl")
        os.remove(out_pref + str(year) + "-disps.pkl")
    ioutils.write_pickle(vol_yearstats, out_pref + "vols.pkl")
    ioutils.write_pickle(disp_yearstats, out_pref + "disps.pkl")
def make_word_list(type):
    process_word = lambda word: word if type != "lemma_pos" else word.split("_")[0]
    freqs = load_pickle(FREQS.format(type=type))
    word_lists = {}
    nstop_lists = {}
    nproper_lists = {}
    nstop_nproper_lists = {}
    print "Processing type: ", type
    for year, year_freqs in freqs.iteritems():
        proper_nouns = load_pickle(PROPER_NOUNS.format(year=year))
        sorted_words = sorted(year_freqs, key=lambda val: -1 * year_freqs[val])
        word_lists[year] = [word for word in sorted_words
                            if word != "" and word.isalnum()]
        nstop_lists[year] = [word for word in sorted_words
                             if not process_word(word) in STOPWORDS
                             and not word == "" and word.isalnum()]
        nproper_lists[year] = [word for word in sorted_words
                               if not process_word(word) in proper_nouns
                               and not word == "" and word.isalnum()]
        nstop_nproper_lists[year] = [word for word in sorted_words
                                     if not process_word(word) in proper_nouns
                                     and not process_word(word) in STOPWORDS
                                     and not word == "" and word.isalnum()]
        print "Finished year: ", year
    write_pickle(word_lists, OUT.format(type=type, cond="all"))
    write_pickle(nstop_lists, OUT.format(type=type, cond="nstop"))
    write_pickle(nproper_lists, OUT.format(type=type, cond="nproper"))
    write_pickle(nstop_nproper_lists, OUT.format(type=type, cond="nstop_nproper"))
def main(proc_num, lock, download_dir, source): page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html") pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION)) urls = pattern.findall(page.text) del page print proc_num, "Start loop" while True: lock.acquire() work_left = False for url in urls: name = re.search('%s-(.*).gz' % VERSION, url).group(1) dirs = set(os.listdir(download_dir)) if name in dirs: continue work_left = True print proc_num, "Name", name loc_dir = download_dir + "/" + name + "/" ioutils.mkdir(loc_dir) break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Downloading", name success = False while not success: with open(loc_dir + name + '.gz', 'w') as f: try: f.write(urllib2.urlopen(url, timeout=60).read()) success = True except: continue print proc_num, "Unzipping", name subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d']) print proc_num, "Going through", name year_grams = collections.defaultdict(dict) n = 0 with open(loc_dir + name) as f: for l in f: l = l.decode('utf-8').lower() split = l.strip().split('\t') if EXCLUDE_PATTERN.match(split[0]): continue try: ngram = split[0].split() middle_index = len(ngram) // 2 item = ngram[middle_index] if (not item.isalpha()) or item in STOPWORDS: continue year = split[1] count = int(split[2]) if item not in year_grams[year]: year_grams[year][item] = [(l, count)] else: year_grams[year][item].append((l, count)) except: #print "!", l.strip().split() pass print proc_num, "Writing", name, n for year in year_grams: ioutils.write_pickle(year_grams[year], loc_dir + str(year) + ".pkl") print proc_num, "Deleting", name try: os.remove(loc_dir + name + '.gz') except: pass
from ioutils import load_pickle, write_pickle

DIR = "/dfs/scratch0/COHA/decade_freqs/"

word = {}
lemma = {}
lemma_pos = {}
for year in range(1810, 2010, 10):
    word[year] = load_pickle(DIR + str(year) + "-word.pkl")
    lemma[year] = load_pickle(DIR + str(year) + "-lemma.pkl")
    lemma_pos[year] = load_pickle(DIR + str(year) + "-lemma_pos.pkl")
write_pickle(word, DIR + "word.pkl")
write_pickle(lemma, DIR + "lemma.pkl")
write_pickle(lemma_pos, DIR + "lemma_pos.pkl")
        write_pickle(lemma_freqs, OUT + "decade_freqs/" + decade + "-lemma.pkl")
        write_pickle(lemma_pos_freqs, OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")

if __name__ == "__main__":
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue]) for i in range(25)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print("Getting full freqs...")
    word_freqs = Counter()
    lemma_freqs = Counter()
    lemma_pos_freqs = Counter()
    for decade in range(1810, 2010, 10):
        decade = str(decade)
        print(decade)
        word_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-word.pkl")
        lemma_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma.pkl")
        lemma_pos_freqs += load_pickle(OUT + "decade_freqs/" + decade + "-lemma_pos.pkl")
    write_pickle(word_freqs, OUT + "full_freqs/word.pkl")
    write_pickle(lemma_freqs, OUT + "full_freqs/lemma.pkl")
    write_pickle(lemma_pos_freqs, OUT + "full_freqs/lemma_pos.pkl")
def main(proc_num, lock, download_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue
            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.csv.zip', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call(['mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' + VERSION + '-' + name + '.csv',
                         loc_dir + name])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    pass

        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
def process_file(fp, word_dict, lemma_dict, lemma_pos_dict):
    for line in fp:
        word, lemma, lemma_pos, _ = process_lemma_line(line)
        if word == None:
            continue
        if lemma_pos == None:
            continue
        if word not in word_dict:
            id = len(word_dict)
            word_dict[word] = id
        if lemma not in lemma_dict:
            id = len(lemma_dict)
            lemma_dict[lemma] = id
        if lemma_pos not in lemma_pos_dict:
            id = len(lemma_pos_dict)
            lemma_pos_dict[lemma_pos] = id

if __name__ == "__main__":
    word_dict = {}
    lemma_dict = {}
    lemma_pos_dict = {}
    for decade in range(1810, 2010, 10):
        folder = str(decade)
        print("Processing decade...", folder)
        for file in os.listdir(DATA + folder):
            with open(DATA + folder + "/" + file) as fp:
                print("Processing file..", folder + "/" + file)
                process_file(fp, word_dict, lemma_dict, lemma_pos_dict)
    write_pickle(word_dict, OUT + "word-dict.pkl")
    write_pickle(lemma_dict, OUT + "lemma-dict.pkl")
    write_pickle(lemma_pos_dict, OUT + "lemma-pos-dict.pkl")
def main(out_dir, source, years): page = requests.get( "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html" ) # gets the urls of the 1gram datafiles pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION)) urls = pattern.findall(page.text) del page year_counts = { } # These are dicts that contain the occurence of a word in each year year_doc_counts = {} year_pos = {} for year in years: year_pos[year] = { } # Counts the occurrence of a word (distinguish words by pos) year_counts[year] = { } # Counts the occurrence of a word (does not distinguish words by pos) year_doc_counts[year] = {} # Counts the books where the word occurred print "Start loop" for url in urls: # iterates through the urls name = re.search('%s-(.*).gz' % VERSION, url).group(1) print "Downloading", name success = False while not success: # downloads the acutal datafile with open(out_dir + name + '.gz', 'w') as f: try: f.write(urllib2.urlopen(url, timeout=60).read()) success = True except: continue print "Unzipping", name # unzips the downloaded datafile subprocess.call(['gunzip', '-f', out_dir + name + '.gz', '-d']) print "Going through", name # iterates through the lines of the datafile and counts the uccurrence of the words with open(out_dir + name) as f: for l in f: try: split = l.strip().split('\t') if not POS.match(split[0]): continue count = int(split[2]) if count < 10: continue word_info = split[0].split("_") pos = word_info[-1] word = word_info[0].decode('utf-8').lower() word = word.strip("\"") word = word.split("\'s")[0] if not word.isalpha(): continue esb = snowball.EnglishStemmer() word = str(esb.stem(word)) year = int(split[1]) doc_count = int(split[3]) if not year in years: continue if not word in year_counts[year]: year_counts[year][word] = 0 year_doc_counts[year][word] = 0 year_pos[year][word] = collections.Counter() year_counts[year][word] += count year_doc_counts[year][word] += doc_count year_pos[year][word][pos] += count except UnicodeDecodeError: pass print "Deleting", name # deletes the downloaded files try: os.remove(out_dir + name) os.remove(out_dir + name + '.gz') except: pass print "Writing..." # writes the data into pkl files for year in years: ioutils.write_pickle(year_counts[year], out_dir + str(year) + "-counts.pkl") ioutils.write_pickle(year_doc_counts[year], out_dir + str(year) + "-doc_counts.pkl") ioutils.write_pickle(year_pos[year], out_dir + str(year) + "-pos.pkl")
if __name__ == "__main__": queue = Queue() for decade in range(1810, 2010, 10): queue.put(decade) procs = [Process(target=worker, args=[i, queue]) for i in range(25)] for p in procs: p.start() for p in procs: p.join() print("Getting full set...") proper_nouns = set([]) pos_counts = {} print("Merging pos counts..") for decade in range(1810, 2010, 10): decade_pos_counts = load_pickle(DATA + str(decade) + "-pos-counts.pkl") for word, counts in decade_pos_counts.items(): if word not in pos_counts: pos_counts[word] = collections.Counter() for pos, count in counts.items(): pos_counts[word][pos] += count write_pickle(pos_counts, DATA + "all-pos-counts.pkl") pos_maj = {} proper_nouns = set([]) for word, p_counts in pos_counts.items(): pos_maj[word] = sorted(p_counts, key = lambda t : -1*p_counts[t])[0] if pos_maj[word] == "np": proper_nouns.add(word) write_pickle(pos_maj, OUT + "all-pos-maj.pkl") write_pickle(proper_nouns, OUT + "proper_nouns.pkl")
def main(out_dir, source, years): page = requests.get( "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html") pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION)) urls = pattern.findall(page.text) del page year_counts = {} year_doc_counts = {} year_pos = {} for year in years: year_pos[year] = {} year_counts[year] = {} year_doc_counts[year] = {} print "Start loop" for url in urls: name = re.search('%s-(.*).gz' % VERSION, url).group(1) print "Downloading", name success = False while not success: with open(out_dir + name + '.gz', 'w') as f: try: f.write(urllib2.urlopen(url, timeout=60).read()) success = True except: continue print "Unzipping", name subprocess.call(['gunzip', '-f', out_dir + name + '.gz', '-d']) print "Going through", name with open(out_dir + name) as f: for l in f: try: split = l.strip().split('\t') if not POS.match(split[0]): continue count = int(split[2]) if count < 10: continue word_info = split[0].split("_") pos = word_info[-1] word = word_info[0].decode('utf-8').lower() word = word.strip("\"") word = word.split("\'s")[0] year = int(split[1]) doc_count = int(split[3]) if not year in years: continue if not word in year_counts[year]: year_counts[year][word] = 0 year_doc_counts[year][word] = 0 year_pos[year][word] = collections.Counter() year_counts[year][word] += count year_doc_counts[year][word] += doc_count year_pos[year][word][pos] += count except UnicodeDecodeError: pass print "Deleting", name try: os.remove(out_dir + name) os.remove(out_dir + name + '.gz') except: pass print "Writing..." for year in years: ioutils.write_pickle(year_counts[year], out_dir + str(year) + "-counts.pkl") ioutils.write_pickle(year_doc_counts[year], out_dir + str(year) + "-doc_counts.pkl") ioutils.write_pickle(year_pos[year], out_dir + str(year) + "-pos.pkl")