def load_matrix(f):
    """Load the co-occurrence matrix stored at *f* and return it as a CSR matrix.

    The ``.bin`` suffix is appended when missing; the path is byte-encoded
    before being handed to the Cython loader.
    """
    path = f if f.endswith('.bin') else f + ".bin"
    # Compile the Cython I/O helpers on demand, with NumPy headers available.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    coo = sparse_io.retrieve_mat_as_coo(path.encode())
    return coo.tocsr()
def main(proc_num, queue, out_dir, in_dir):
    """Worker loop: re-index each year's co-occurrence counts onto the merged index.

    Pulls years off *queue* until it is empty.  For each year it loads the
    unmerged matrix ``<in_dir><year>.bin`` and its word list
    ``<in_dir><year>-list.pkl``, remaps every (row, col) pair to the ids of
    the shared index loaded from ``<out_dir>merged_index.pkl``, and writes the
    remapped counts to ``<out_dir><year>.bin``.
    (Python 2 code: ``print`` statement and ``dict.iteritems``.)
    """
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:  # Iterates through the years queued by the parent process
        try:
            year = queue.get(block=False)
        except Empty:
            # Empty queue means no work is left for this worker.
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}  # The new, re-indexed co-occurrence matrix (pair -> count)
        old_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin").todok()
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():  # Iterates through the unmerged co-occurrence matrix ...
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                # Matrix refers to a row outside the year's word list: data is
                # inconsistent, so dump the offending pair and abort the run.
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            try:
                # Map both words to their ids in the merged (static) index.
                new_pair = (indexing.word_to_static_id(i_word, merged_index),
                            indexing.word_to_static_id(c_word, merged_index))
            except KeyError:
                # Word absent from the merged index: deliberately dropped.
                continue
            fixed_counts[new_pair] = count  # ... and add the counts to the new matrix
        print proc_num, "Writing counts for year", year
        # Saves the new co-occurrence matrix for this year.
        sparse_io.export_mat_from_dict(fixed_counts, out_dir + str(year) + ".bin")
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    """Worker loop: compute per-year statistics for queued years.

    Pulls years off *queue* until it is empty.  For each year it loads the
    matrix ``<in_dir><year>.bin`` (thresholded when *thresh* is given),
    computes stats via ``get_year_stats`` restricted to the year's index set,
    and writes them to ``<out_pref><year>-tmp.pkl``.

    :param proc_num: worker id, used only for log prefixes
    :param queue: queue of years; ``Empty`` signals completion
    :param year_index_infos: per-year dict with "index", "list", "indices"
    :param thresh: optional count threshold applied while loading the matrix
    """
    print(proc_num, "Start loop")
    # Stagger start-up so the workers do not hit the filesystem simultaneously.
    time.sleep(10 * random.random())
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            break
        print(proc_num, "Retrieving mat for year", year)
        # Idiom fix: identity comparison against None instead of `!= None`.
        if thresh is not None:
            mat = sparse_io.retrieve_mat_as_coo_thresh(
                in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin",
                                                min_size=5000000)
        print(proc_num, "Getting stats for year", year)
        year_stats = get_year_stats(
            mat,
            year_index_infos[year]["index"],
            year_index_infos[year]["list"],
            index_set=set(year_index_infos[year]["indices"]))
        print(proc_num, "Writing stats for year", year)
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def load_matrix(f):
    """Load the co-occurrence matrix stored at *f* and return it in CSR form.

    Appends the ``.bin`` suffix when the caller omits it.
    """
    if not f.endswith('.bin'):
        f += ".bin"
    # Build the Cython loader on first use, pointing it at NumPy's headers.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    return sparse_io.retrieve_mat_as_coo(f).tocsr()
def randomize(infile, mode, outfile): """Betolti a coocurence matrixot, es a mode-nak megfeleloen randomizalja azaelemeit""" dok_matrix = sparse_io.retrieve_mat_as_coo(infile).todok() print "matrix loaded and converted to dok." dok_matrix = mode(dok_matrix) print "randomization done. writing..." sparse_io.export_mat_from_dict(dok_matrix, outfile) print "success"
def main(proc_num, lock, out_dir, in_dir, years):
    """Worker loop: merge per-chunk co-occurrence matrices into yearly matrices.

    Workers coordinate through *lock* plus marker files: under the lock each
    worker scans *out_dir*, claims the first unclaimed year by creating an
    empty ``<year>.bin``, then (outside the lock) merges every chunk's
    ``<in_dir><chunk>/<year>.bin`` into one matrix with a shared index, and
    writes ``<year>.bin``, ``<year>-index.pkl`` and ``<year>-list.pkl``.
    (Python 2 code: ``print`` statement and ``dict.iteritems``.)
    """
    print proc_num, "Start loop"
    years.reverse()
    while True:
        lock.acquire()
        work_left = False
        # Claim the next unprocessed year (the marker-file protocol below).
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + ".bin" in dirs:
                # Year already claimed/produced by some worker — skip it.
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + ".bin"
            # Create an empty marker file so other workers skip this year.
            with open(fname, "w") as fp:
                fp.write("")
                fp.close()  # NOTE(review): redundant — the `with` already closes fp
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break
        print proc_num, "Merging counts for year", year
        # Merging starts here
        full_counts = collections.defaultdict(float)  # (id, id) -> summed count
        merged_index = collections.OrderedDict()      # word -> merged id, grown on demand
        # Iterates through the alphabetically separated co-occurrence chunks.
        for chunk_num in os.listdir(in_dir):
            chunk_name = in_dir + str(chunk_num) + "/" + str(year) + ".bin"
            if not os.path.isfile(chunk_name):
                # This chunk has no data for the year.
                continue
            chunk_counts = sparse_io.retrieve_mat_as_coo(chunk_name)
            chunk_index = ioutils.load_pickle(in_dir + str(chunk_num) + "/index.pkl")
            chunk_index = list(chunk_index)
            # Walk the chunk matrix and accumulate counts under merged ids.
            for pair, count in chunk_counts.todok().iteritems():
                i_word = chunk_index[pair[0]]
                c_word = chunk_index[pair[1]]
                # word_to_cached_id assigns a fresh id when the word is new,
                # so merged_index grows as chunks are consumed.
                new_pair = (indexing.word_to_cached_id(i_word, merged_index),
                            indexing.word_to_cached_id(c_word, merged_index))
                full_counts[new_pair] += count
        print proc_num, "Writing counts for year", year
        # Saves the yearly merged co-occurrence matrix (overwrites the marker file) ...
        sparse_io.export_mat_from_dict(full_counts, out_dir + str(year) + ".bin")
        # ... the merged word -> id mapping ...
        ioutils.write_pickle(merged_index, out_dir + str(year) + "-index.pkl")
        # ... and the word list in id order.
        ioutils.write_pickle(list(merged_index), out_dir + str(year) + "-list.pkl")
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, thresh):
    """Worker loop: compute per-year statistics for queued years.

    Pulls years off *queue* until it is empty.  For each year it loads the
    matrix ``<in_dir><year>.bin`` (thresholded when *thresh* is given),
    computes stats via ``get_year_stats`` restricted to the year's index set,
    and writes them to ``<out_pref><year>-tmp.pkl``.
    (Python 2 variant of this function: uses the ``print`` statement.)
    """
    print proc_num, "Start loop"
    # Stagger start-up so the workers do not hit the filesystem simultaneously.
    time.sleep(10 * random.random())
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            # Empty queue means no work is left for this worker.
            print proc_num, "Finished"
            break
        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            # Drop entries below the threshold while loading.
            mat = sparse_io.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=5000000)
        print proc_num, "Getting stats for year", year
        year_stats = get_year_stats(mat, year_index_infos[year]["index"],
                                    year_index_infos[year]["list"],
                                    index_set=set(year_index_infos[year]["indices"]))
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    """Worker loop: compute relative word frequencies for each queued year.

    For each year the co-occurrence matrix ``<in_dir><year>.bin`` is loaded,
    normalized by its total count, and each row's sum taken as the word's
    relative frequency.  Words passing *freq_thresh* are written (sorted by
    frequency) to ``<out_dir><year>tmp.pkl``; the full frequency dict goes to
    ``<out_dir><year>freqstmp.pkl``.
    (Python 2 code: ``print`` statement and ``xrange``.)

    NOTE(review): *out_pref* and *lang* are unused here — *lang* presumably fed
    the commented-out stop-word filter; confirm before removing.
    """
    #random.shuffle(years)  # I don't know what it is for
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            # Empty queue means no work is left for this worker.
            print proc_num, "Finished"
            break
        #stop_set = set(stopwords.words(lang))
        word_freqs = {}  # word -> relative frequency for this year
        print "Loading mat for year", year
        year_mat = sparse_io.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        # Normalize so all entries sum to 1; row sums become relative frequencies.
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha():  # or word in stop_set or len(word) == 1: filters out degenerate tokens
                continue
            # Thanks to the normalization this row sum is the word's relative frequency.
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        # Sort words by descending frequency, then keep only those above the threshold.
        sorted_list = sorted(word_freqs.keys(),
                             key=lambda key: word_freqs[key],
                             reverse=True)
        sorted_list = [word for word in sorted_list if word_freqs[word] > freq_thresh]
        # Saves the filtered word list ...
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        # ... and the full relative-frequency dict.
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
def load_matrix(f):
    """Return the matrix stored at *f* as a Compressed Sparse Row (CSR) matrix.

    The ``.bin`` suffix is appended when the caller omits it.

    NOTE: the original docstring said "Compressed Sparse Column", but
    ``.tocsr()`` produces a CSR matrix — the docstring was wrong, not the code.
    """
    if not f.endswith('.bin'):
        f += ".bin"
    return sparse_io.retrieve_mat_as_coo(f).tocsr()