Ejemplo n.º 1
0
def main(proc_num, lock, out_dir, in_dir, years):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + ".bin" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Loading  matrix", year
        coo_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin",
                                               min_size=230000)
        csr_mat = coo_mat.tocsr()
        sum_mat = (csr_mat + csr_mat.T)
        sum_mat = sum_mat.tocoo()
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(sum_mat.row, sum_mat.col, sum_mat.data, year,
                                out_dir)
Ejemplo n.º 2
0
def main(proc_num, lock, in_dir, years, k):
    random.shuffle(years)
    print proc_num, "Start loop"
    tmp_pref = in_dir + "dknn-" + str(k) + "/"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(tmp_pref))
            if str(year) + ".bin" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_pref + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Making knn net for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d = make_knn_mat(old_mat, k)
        
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, tmp_pref)
Ejemplo n.º 3
0
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    """Drain ``queue`` of years and write per-year word frequency pickles.

    For every year taken from the queue, the year's matrix is loaded from
    ``in_dir``, divided by its total sum, and the row sum of each kept word
    is stored as that word's frequency. Words are skipped when they are not
    purely alphabetic, are stop words for ``lang``, or have length 1.

    Two pickles are written per year into ``out_dir``:
    ``<year>tmp.pkl`` holds the words whose frequency exceeds
    ``freq_thresh``, sorted by decreasing frequency, and
    ``<year>freqstmp.pkl`` holds the full word-to-frequency dict.

    Args:
        proc_num: Worker identifier; only used in progress messages.
        queue: Queue of years; the worker stops when a non-blocking ``get``
            raises ``Empty``.
        out_pref: Unused; kept for interface compatibility.
        out_dir: Output directory prefix, concatenated directly with file
            names.
        in_dir: Input directory prefix, concatenated the same way.
        index: Maps a matrix row number to its word.
        freq_thresh: Frequency a word must exceed to be in the sorted list.
        lang: Language name passed to ``stopwords.words``.
    """
    # The former ``random.shuffle(years)`` call was removed: ``years`` is not
    # a parameter of this function (NameError unless a module global happens
    # to exist) and the processing order is dictated by the queue anyway.
    print(proc_num, "Start loop")
    stop_set = None
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            break
        if stop_set is None:
            # Depends only on ``lang``: build on first use, reuse afterwards.
            stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print("Loading mat for year", year)
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print("Processing data for year", year)
        for word_i in range(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            word_freqs[word] = year_mat[word_i, :].sum()
        print("Writing data")
        ranked = sorted(word_freqs, key=word_freqs.get, reverse=True)
        sorted_list = [
            word for word in ranked if word_freqs[word] > freq_thresh
        ]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
Ejemplo n.º 4
0
def main(proc_num, lock, in_dir, years, word_list, index):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqs.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqs.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        print proc_num, "Making inverse freq mat", year
        mat = mat.tocsr()
        mat = mat / mat.sum()
        word_stats = {}
        print proc_num, "Getting stats for year", year
        for word in word_list:
            word_stats[word] = compute_word_stats(mat, word, index)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")
Ejemplo n.º 5
0
def main(proc_num, lock, in_dir, years, word_list, index):
    years = range(years[0], years[-1] + 1)
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(in_dir))
            if str(year) + "-freqs.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = in_dir + str(year) + "-freqs.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        print proc_num, "Making inverse freq mat", year
        mat = mat.tocsr()
        mat = mat / mat.sum()
        word_stats = {}
        print proc_num, "Getting stats for year", year
        for word in word_list:
            word_stats[word] = compute_word_stats(mat, word, index)

        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")
Ejemplo n.º 6
0
def main(proc_num, lock, in_dir, years, k):
    random.shuffle(years)
    print proc_num, "Start loop"
    tmp_pref = in_dir + "dknn-" + str(k) + "/"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(tmp_pref))
            if str(year) + ".bin" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = tmp_pref + str(year) + ".bin"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Making knn net for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d = make_knn_mat(old_mat, k)

        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, tmp_pref)
Ejemplo n.º 7
0
def main(proc_num, queue, out_pref, out_dir, in_dir, index, freq_thresh, lang):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        stop_set = set(stopwords.words(lang))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list 
                    if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
Ejemplo n.º 8
0
def load_matrix(f, thresh=None):
    """Load a sparse matrix from a ``.bin`` or ``.npz`` file as CSR.

    Args:
        f: Path to the matrix. Paths ending in ``.bin`` are read through
            ``matstore``; any other path is treated as an ``.npz`` archive,
            with the ``.npz`` suffix appended when it is missing.
        thresh: Only used for ``.bin`` files. When not ``None`` the matrix
            is read with ``matstore.retrieve_mat_as_coo_thresh`` and this
            value is passed through.

    Returns:
        The matrix in CSR format. For ``.npz`` input it is rebuilt from the
        archive's ``data``, ``indices``, ``indptr`` and ``shape`` arrays.
    """
    if f.endswith('.bin'):
        # Identity check: ``== None`` misbehaves for array-like thresholds.
        if thresh is None:
            return matstore.retrieve_mat_as_coo(f, min_size=250000).tocsr()
        return matstore.retrieve_mat_as_coo_thresh(
            f, thresh, min_size=250000).tocsr()
    if not f.endswith('.npz'):
        f += '.npz'
    # The archive keeps a file handle open; close it once the arrays are read.
    with np.load(f) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])
Ejemplo n.º 9
0
def run(out_file, in_dir, years, year_indices):
    samplesizes = {}
    for year in years:
        print "Processing year", year
        indices = year_indices[year]
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        mat = mat[indices, :]
        mat = mat[:, indices]
        samplesizes[year] = mat.sum()
    ioutils.write_pickle(samplesizes, out_file)
Ejemplo n.º 10
0
def run(out_file, in_dir, years, year_indices):
    samplesizes = {}
    for year in years:
        print "Processing year", year
        indices = year_indices[year]
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        mat = mat[indices, :]
        mat = mat[:, indices]
        samplesizes[year] = mat.sum()
    ioutils.write_pickle(samplesizes, out_file)
Ejemplo n.º 11
0
def load_matrix(f, thresh=None):
    """Return the matrix stored at ``f`` as a CSR matrix.

    Args:
        f: File path. A ``.bin`` suffix selects the ``matstore`` readers;
            anything else is loaded as an ``.npz`` archive, with ``.npz``
            appended to the path when it is not already there.
        thresh: Only used for ``.bin`` files. ``None`` selects
            ``matstore.retrieve_mat_as_coo``; any other value is passed to
            ``matstore.retrieve_mat_as_coo_thresh``.

    Returns:
        A CSR matrix. For ``.npz`` input it is assembled from the archive's
        ``data``, ``indices``, ``indptr`` and ``shape`` entries.
    """
    if f.endswith('.bin'):
        # ``is None`` avoids element-wise comparison on array-like values.
        if thresh is None:
            return matstore.retrieve_mat_as_coo(f, min_size=250000).tocsr()
        return matstore.retrieve_mat_as_coo_thresh(
            f, thresh, min_size=250000).tocsr()
    if not f.endswith('.npz'):
        f += '.npz'
    # Use the archive as a context manager so its file handle is closed.
    with np.load(f) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])
Ejemplo n.º 12
0
def load_year_freqs(in_dir, years):
    year_freqs = {}
    year_sample_sizes = {}
    for year in years:
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        year_sum = mat.sum()
        mat = mat / year_sum
        year_sample_sizes[year] = year_sum / 4.0
        year_freqs[year] = {}
        for i in xrange(mat.shape[0]):
            year_freqs[year][i] = mat[i, :].sum()
        print "Loaded year", year
    return year_freqs, year_sample_sizes
Ejemplo n.º 13
0
def load_year_freqs(in_dir, years):
    year_freqs = {}
    year_sample_sizes = {}
    for year in years:
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        year_sum = mat.sum() 
        mat = mat / year_sum
        year_sample_sizes[year] = year_sum / 4.0
        year_freqs[year] = {}
        for i in xrange(mat.shape[0]):
            year_freqs[year][i] = mat[i, :].sum() 
        print "Loaded year", year
    return year_freqs, year_sample_sizes
Ejemplo n.º 14
0
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")
Ejemplo n.º 15
0
def main(proc_num, queue, out_dir, in_dir):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Loading  matrix", year
        coo_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin", min_size=10**6)
        csr_mat = coo_mat.tocsr()
        sum_mat = (csr_mat + csr_mat.T) 
        sum_mat = sum_mat.tocoo()
        for i in xrange(len(sum_mat.data)):
            sum_mat.data[i] = max(csr_mat[sum_mat.row[i], sum_mat.col[i]], csr_mat[sum_mat.col[i], sum_mat.row[i]])
        
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(sum_mat.row, sum_mat.col, sum_mat.data, year, out_dir)
Ejemplo n.º 16
0
def main(proc_num, lock, years, out_pref, out_dir, in_dir, index, freq_thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            dirs = set(os.listdir(out_dir))
            if str(year) + "tmp.pkl" in dirs:
                continue
            work_left = True
            print proc_num, "year", year
            fname = out_dir + str(year) + "tmp.pkl"
            with open(fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        stop_set = set(stopwords.words('english'))
        word_freqs = {}
        print "Loading mat for year", year
        year_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        year_mat = year_mat.tocsr()
        year_mat = year_mat / year_mat.sum()
        print "Processing data for year", year
        for word_i in xrange(year_mat.shape[0]):
            word = index[word_i]
            if not word.isalpha() or word in stop_set or len(word) == 1:
                continue
            year_freq = year_mat[word_i, :].sum()
            word_freqs[word] = year_freq
        print "Writing data"
        sorted_list = sorted(word_freqs.keys(), key = lambda key : word_freqs[key], reverse=True)
        sorted_list = [word for word in sorted_list 
                    if word_freqs[word] > freq_thresh]
        ioutils.write_pickle(sorted_list, out_dir + str(year) + "tmp.pkl")
        ioutils.write_pickle(word_freqs, out_dir + str(year) + "freqstmp.pkl")
Ejemplo n.º 17
0
def main(proc_num, lock, out_pref, tmp_dir, in_dir, years, word_infos, thresh):
    random.shuffle(years)
    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for year in years:
            existing_files = set(os.listdir(tmp_dir))
            fname = str(year) + "-tmp.pkl"
            if fname in existing_files:
                continue
            work_left = True
            print proc_num, "year", year
            with open(tmp_dir + fname, "w") as fp:
                fp.write("")
            fp.close()
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Retrieving mat for year", year
        if thresh != None:
            mat = matstore.retrieve_mat_as_coo_thresh(in_dir + str(year) + ".bin", thresh)
        else:
            mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")

        mat.setdiag(0)
        if word_infos != None:
            word_indices = word_infos[year][1]
            indices = word_indices[word_indices < min(mat.shape[1], mat.shape[0])]
        else:
            indices = np.arange(mat.shape[0])
        year_graph = make_snap_graph(indices, mat)
        print proc_num, "Getting statistics for year", year
        year_stats = compute_graph_stats(year_graph)
        rewire_year_stats = compute_graph_stats(snap.GenRewire(year_graph, REWIRE_EDGE_SWITCHES))
        ioutils.write_pickle(year_stats, tmp_dir + fname)
        ioutils.write_pickle(rewire_year_stats, tmp_dir + "rewire" + fname)
Ejemplo n.º 18
0
def worker(proc_num, queue, in_dir):
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break

        print proc_num, "Making second orders for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) +
                                             "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index,
                             in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year,
                                in_dir + "/second/")