def main(proc_num, lock, out_dir, in_dir, years): random.shuffle(years) print proc_num, "Start loop" while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(out_dir)) if str(year) + ".bin" in dirs: continue work_left = True print proc_num, "year", year fname = out_dir + str(year) + ".bin" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Loading matrix", year coo_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=230000) csr_mat = coo_mat.tocsr() sum_mat = (csr_mat + csr_mat.T) sum_mat = sum_mat.tocoo() print proc_num, "Writing counts for year", year matstore.export_mat_eff(sum_mat.row, sum_mat.col, sum_mat.data, year, out_dir)
def main(proc_num, lock, in_dir, years, k): random.shuffle(years) print proc_num, "Start loop" tmp_pref = in_dir + "dknn-" + str(k) + "/" while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(tmp_pref)) if str(year) + ".bin" in dirs: continue work_left = True print proc_num, "year", year fname = tmp_pref + str(year) + ".bin" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Making knn net for year", year old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") row_d, col_d, data_d = make_knn_mat(old_mat, k) print proc_num, "Writing counts for year", year matstore.export_mat_eff(row_d, col_d, data_d, year, tmp_pref)
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    """Worker loop: compute per-word relative frequencies for queued years.

    Years are pulled from ``queue`` without blocking until it is empty.  For
    each year the matrix ``<in_dir><year>.bin`` is normalised by its total
    sum, and the row sum of every retained word is recorded.  Words are
    skipped when they are not purely alphabetic, are stop words for ``lang``,
    or are a single character.

    Two pickles are written per year:
        ``<out_dir><year>tmp.pkl``      -- words with frequency above
                                           ``freq_thresh``, most frequent first
        ``<out_dir><year>freqstmp.pkl`` -- dict of word -> frequency

    Args:
        proc_num: Identifier of this worker; only used in log output.
        queue: Queue of years; ``get(block=False)`` must raise ``Empty`` when
            exhausted.
        out_pref: Unused; kept for interface compatibility.
        out_dir: Output directory prefix (must end with a path separator).
        in_dir: Input directory prefix (must end with a path separator).
        index: Maps a matrix row number to its word (indexed by position).
        freq_thresh: Exclusive lower bound for inclusion in the sorted list.
        lang: Language name passed to ``stopwords.words``.
    """
    # NOTE: the original began with ``random.shuffle(years)``, but ``years``
    # is not defined in this function (work order comes from the queue), so
    # the call raised NameError.  It has been removed.
    print(proc_num, "Start loop")
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            break

        stop_set = set(stopwords.words(lang))
        word_freqs = {}

        print("Loading mat for year", year)
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        # Normalise so that all entries sum to 1.
        year_mat = year_mat / year_mat.sum()

        print("Processing data for year", year)
        for word_i in range(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            word_freqs[word] = year_mat[word_i, :].sum()

        print("Writing data")
        sorted_list = sorted(word_freqs, key=word_freqs.get, reverse=True)
        sorted_list = [
            word for word in sorted_list if word_freqs[word] > freq_thresh
        ]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def main(proc_num, lock, in_dir, years, word_list, index): years = range(years[0], years[-1] + 1) random.shuffle(years) print proc_num, "Start loop" while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(in_dir)) if str(year) + "-freqs.pkl" in dirs: continue work_left = True print proc_num, "year", year fname = in_dir + str(year) + "-freqs.pkl" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Retrieving mat for year", year mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") print proc_num, "Making inverse freq mat", year mat = mat.tocsr() mat = mat / mat.sum() word_stats = {} print proc_num, "Getting stats for year", year for word in word_list: word_stats[word] = compute_word_stats(mat, word, index) print proc_num, "Writing stats for year", year ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang): random.shuffle(years) print proc_num, "Start loop" while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break stop_set = set(stopwords.words(lang)) word_freqs = {} print "Loading mat for year", year year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") year_mat = year_mat.tocsr() year_mat = year_mat / year_mat.sum() print "Processing data for year", year for word_i in xrange(year_mat.shape[0]): word = index[word_i] if not word.isalpha() or word in stop_set or len(word) == 1: continue year_freq = year_mat[word_i, :].sum() word_freqs[word] = year_freq print "Writing data" sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True) sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh] ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl") ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def load_matrix(f, thresh=None):
    """Load a sparse matrix from ``f`` and return it in CSR format.

    Two on-disk formats are handled, selected by file extension:

    * ``.bin`` -- read through ``matstore`` (with ``min_size=250000``).  When
      ``thresh`` is given, ``retrieve_mat_as_coo_thresh`` is used instead of
      ``retrieve_mat_as_coo``.
    * ``.npz`` -- a NumPy archive with ``data``, ``indices``, ``indptr`` and
      ``shape`` arrays.  If ``f`` has neither extension, ``.npz`` is appended.

    Args:
        f: Path to the matrix file.
        thresh: Optional threshold; only used for ``.bin`` files.

    Returns:
        The matrix as a CSR sparse matrix.
    """
    if f.endswith('.bin'):
        if thresh is None:
            return matstore.retrieve_mat_as_coo(f, min_size=250000).tocsr()
        return matstore.retrieve_mat_as_coo_thresh(f, thresh, min_size=250000).tocsr()

    if not f.endswith('.npz'):
        f += '.npz'
    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it once the arrays have been read.
    with np.load(f) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=loader['shape'])
def run(out_file, in_dir, years, year_indices): samplesizes = {} for year in years: print "Processing year", year indices = year_indices[year] mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") mat = mat.tocsr() mat = mat[indices, :] mat = mat[:, indices] samplesizes[year] = mat.sum() ioutils.write_pickle(samplesizes, out_file)
def load_matrix(f, thresh=None):
    """Read a sparse matrix from disk and return it as CSR.

    The format is chosen from the file name:

    * ``*.bin`` files are read with ``matstore.retrieve_mat_as_coo`` (or
      ``matstore.retrieve_mat_as_coo_thresh`` when ``thresh`` is supplied),
      always with ``min_size=250000``, and converted to CSR.
    * Anything else is treated as a NumPy ``.npz`` archive containing the
      arrays ``data``, ``indices``, ``indptr`` and ``shape``; the ``.npz``
      suffix is added when missing.

    Args:
        f: Path of the file to load.
        thresh: Optional threshold, ignored for ``.npz`` input.

    Returns:
        A CSR sparse matrix.
    """
    if f.endswith('.bin'):
        if thresh is None:
            return matstore.retrieve_mat_as_coo(f, min_size=250000).tocsr()
        return matstore.retrieve_mat_as_coo_thresh(
            f, thresh, min_size=250000).tocsr()

    if not f.endswith('.npz'):
        f += '.npz'
    # Use the NpzFile as a context manager so its underlying file handle is
    # closed as soon as the component arrays have been extracted.
    with np.load(f) as loader:
        parts = (loader['data'], loader['indices'], loader['indptr'])
        return csr_matrix(parts, shape=loader['shape'])
def load_year_freqs(in_dir, years): year_freqs = {} year_sample_sizes = {} for year in years: mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") mat = mat.tocsr() year_sum = mat.sum() mat = mat / year_sum year_sample_sizes[year] = year_sum / 4.0 year_freqs[year] = {} for i in xrange(mat.shape[0]): year_freqs[year][i] = mat[i, :].sum() print "Loaded year", year return year_freqs, year_sample_sizes
def worker(proc_num, queue, in_dir): print proc_num, "Start loop" while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Making second orders for year", year old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat) old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl")) new_index = collections.OrderedDict() for i in xrange(len(keep_rows)): new_index[old_index[keep_rows[i]]] = i ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl") print proc_num, "Writing counts for year", year matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
def main(proc_num, queue, out_dir, in_dir): random.shuffle(years) print proc_num, "Start loop" while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Loading matrix", year coo_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=10**6) csr_mat = coo_mat.tocsr() sum_mat = (csr_mat + csr_mat.T) sum_mat = sum_mat.tocoo() for i in xrange(len(sum_mat.data)): sum_mat.data[i] = max(csr_mat[sum_mat.row[i], sum_mat.col[i]], csr_mat[sum_mat.col[i], sum_mat.row[i]]) print proc_num, "Writing counts for year", year matstore.export_mat_eff(sum_mat.row, sum_mat.col, sum_mat.data, year, out_dir)
def main(proc_num, lock, years, out_pref, out_dir, in_dir, index, freq_thresh): random.shuffle(years) print proc_num, "Start loop" while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(out_dir)) if str(year) + "tmp.pkl" in dirs: continue work_left = True print proc_num, "year", year fname = out_dir + str(year) + "tmp.pkl" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break stop_set = set(stopwords.words('english')) word_freqs = {} print "Loading mat for year", year year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") year_mat = year_mat.tocsr() year_mat = year_mat / year_mat.sum() print "Processing data for year", year for word_i in xrange(year_mat.shape[0]): word = index[word_i] if not word.isalpha() or word in stop_set or len(word) == 1: continue year_freq = year_mat[word_i, :].sum() word_freqs[word] = year_freq print "Writing data" sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True) sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh] ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl") ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def main(proc_num, lock, out_pref, tmp_dir, in_dir, years, word_infos, thresh): random.shuffle(years) print proc_num, "Start loop" while True: lock.acquire() work_left = False for year in years: existing_files = set(os.listdir(tmp_dir)) fname = str(year) + "-tmp.pkl" if fname in existing_files: continue work_left = True print proc_num, "year", year with open(tmp_dir + fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Retrieving mat for year", year if thresh != None: mat = matstore.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh) else: mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin") mat.setdiag(0) if word_infos != None: word_indices = word_infos[year][1] indices = word_indices[word_indices < min(mat.shape[1], mat.shape[0])] else: indices = np.arange(mat.shape[0]) year_graph = make_snap_graph(indices, mat) print proc_num, "Getting statistics for year", year year_stats = compute_graph_stats(year_graph) rewire_year_stats = compute_graph_stats(snap.GenRewire(year_graph, REWIRE_EDGE_SWITCHES)) ioutils.write_pickle(year_stats, tmp_dir + fname) ioutils.write_pickle(rewire_year_stats, tmp_dir + "rewire" + fname)