Esempio n. 1
0
def load_year_index_infos_common(common_index, years, word_file, num_words=-1):
    """Build per-year word information where every year shares one index.

    Args:
        common_index: Index passed to ``get_word_indices`` for every year and
            stored unchanged under each year's ``"index"`` key.
        years: Years forwarded to ``load_year_words``.
        word_file: Word file forwarded to ``load_year_words``.
        num_words: When not -1, each year's word list is cut down to its
            first ``num_words`` entries before the index lookup.

    Returns:
        A ``collections.defaultdict(dict)`` mapping each year to a dict with
        the keys ``"index"``, ``"list"`` and ``"indices"``.
    """
    year_index_infos = collections.defaultdict(dict)
    # .items() works on both Python 2 and 3; .iteritems() exists only on 2.
    for year, words in load_year_words(word_file, years).items():
        if num_words != -1:
            words = words[:num_words]
        # get_word_indices returns the (possibly adjusted) word list together
        # with the matching indices; both are stored.
        words, word_indices = get_word_indices(words, common_index)
        year_index_infos[year] = {
            "index": common_index,
            "list": words,
            "indices": word_indices,
        }
    return year_index_infos
Esempio n. 2
0
def load_year_index_infos(index_dir, years, word_file, num_words=-1):
    """Build per-year word information using one pickled index per year.

    Args:
        index_dir: Directory holding the ``<year>-index.pkl`` files.
        years: Years forwarded to ``load_year_words``.
        word_file: Word file forwarded to ``load_year_words``.
        num_words: When not -1, each year's word list is cut down to its
            first ``num_words`` entries before the index lookup.

    Returns:
        A ``collections.defaultdict(dict)`` mapping each year to a dict with
        the keys ``"index"``, ``"list"`` and ``"indices"``.
    """
    year_index_infos = collections.defaultdict(dict)
    # .items() works on both Python 2 and 3; .iteritems() exists only on 2.
    for year, words in load_year_words(word_file, years).items():
        year_index = load_pickle(index_dir + "/" + str(year) + "-index.pkl")
        if num_words != -1:
            words = words[:num_words]
        # get_word_indices returns the (possibly adjusted) word list together
        # with the matching indices; both are stored.
        words, word_indices = get_word_indices(words, year_index)
        year_index_infos[year] = {
            "index": year_index,
            "list": words,
            "indices": word_indices,
        }
    return year_index_infos
Esempio n. 3
0
def load_year_index_infos(index_dir, years, word_file, num_words=-1):
    """Collect, for each year, its pickled index plus the word list and indices.

    Args:
        index_dir: Directory containing one ``<year>-index.pkl`` per year.
        years: Years handed on to ``load_year_words``.
        word_file: Word file handed on to ``load_year_words``.
        num_words: If different from -1, only the first ``num_words`` words
            of each year's list are looked up.

    Returns:
        A ``collections.defaultdict(dict)`` keyed by year; each value holds
        ``"index"``, ``"list"`` and ``"indices"``.
    """
    year_index_infos = collections.defaultdict(dict)
    per_year_words = load_year_words(word_file, years)
    # Iterate with .items(): .iteritems() is unavailable on Python 3.
    for year, candidates in per_year_words.items():
        index_path = index_dir + "/" + str(year) + "-index.pkl"
        year_index = load_pickle(index_path)
        if num_words != -1:
            candidates = candidates[:num_words]
        kept_words, kept_indices = get_word_indices(candidates, year_index)
        info = year_index_infos[year]
        info["index"] = year_index
        info["list"] = kept_words
        info["indices"] = kept_indices
    return year_index_infos
Esempio n. 4
0
def load_year_index_infos_common(common_index, years, word_file, num_words=-1):
    """Collect, for each year, the word list and indices against a shared index.

    Args:
        common_index: The single index used for every year; it is also stored
            as-is under each year's ``"index"`` key.
        years: Years handed on to ``load_year_words``.
        word_file: Word file handed on to ``load_year_words``.
        num_words: If different from -1, only the first ``num_words`` words
            of each year's list are looked up.

    Returns:
        A ``collections.defaultdict(dict)`` keyed by year; each value holds
        ``"index"``, ``"list"`` and ``"indices"``.
    """
    year_index_infos = collections.defaultdict(dict)
    per_year_words = load_year_words(word_file, years)
    # Iterate with .items(): .iteritems() is unavailable on Python 3.
    for year, candidates in per_year_words.items():
        if num_words != -1:
            candidates = candidates[:num_words]
        kept_words, kept_indices = get_word_indices(candidates, common_index)
        info = year_index_infos[year]
        info["index"] = common_index
        info["list"] = kept_words
        info["indices"] = kept_indices
    return year_index_infos
Esempio n. 5
0
    if args.word_file is not None:
        # A word file can only be resolved against the per-year index files.
        if args.index_dir is None:
            sys.stderr.write("Must specify index dir with word file!\n")
            # Exit non-zero so callers can tell this is a failure.
            sys.exit(1)
        word_pickle = ioutils.load_pickle(args.word_file)
        if args.start_year not in word_pickle:
            # Not keyed by year: reuse the same pickle contents for every year.
            word_lists = {year: word_pickle for year in years}
        else:
            word_lists = word_pickle
        # Must be named word_info: that is the name handed to run_parallel
        # below (and the name the else-branch sets to None).
        word_info = {}
        for year, word_list in word_lists.items():
            year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
            if args.num_words != -1:
                word_list = word_list[:args.num_words]
            word_list, word_indices = get_word_indices(word_list, year_index)
            word_info[year] = (word_list, word_indices)
        # Output prefix: last path component of the word file, up to its
        # first ".".
        outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
        if args.num_words != -1:
            outpref += "-top" + str(args.num_words)
    else:
        word_info = None
        outpref = "/netstats/net"
    if args.thresh is not None:
        outpref += "-" + str(args.thresh)
    ioutils.mkdir(args.dir + "/netstats")
    run_parallel(
        args.num_procs, args.dir + outpref, args.dir + "/netstats/", args.dir + "/", years, word_info, args.thresh
    )
Esempio n. 6
0
    merge(word_list, years, in_dir, out_file)


if __name__ == '__main__':
    # Command-line entry point: parse arguments, load the pickled index and
    # word list, keep the words returned by get_word_indices, then hand off
    # to run_parallel.
    parser = argparse.ArgumentParser(
        description="Merges years of raw 5gram data.")
    # out_file and index_file used to carry help text copy-pasted from the
    # neighbouring arguments; each now describes its own argument.
    parser.add_argument("out_file", help="path to the merged output file")
    parser.add_argument("in_dir",
                        help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("index_file", help="path to pickled word index file")
    parser.add_argument("num_procs",
                        type=int,
                        help="number of processes to spawn")
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=END_YEAR)
    args = parser.parse_args()

    # Both bounds are inclusive, hence the + 1.
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.index_file)
    loaded_words = ioutils.load_pickle(args.word_file)
    # Only the word list returned by get_word_indices is needed here; the
    # indices themselves are discarded.
    word_list, _ = get_word_indices(loaded_words, index)
    run_parallel(args.num_procs, args.in_dir + "/", years, word_list, index,
                 args.out_file)
Esempio n. 7
0
        print proc_num, "Writing stats for year", year
        ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl")


def run_parallel(num_procs, in_dir, years, word_list, index, out_file):
    """Run ``main`` in ``num_procs`` processes, then merge the results.

    Args:
        num_procs: Number of worker processes to start.
        in_dir: Passed to every worker and to ``merge``.
        years: Passed to every worker and to ``merge``.
        word_list: Passed to every worker and to ``merge``.
        index: Passed to every worker.
        out_file: Passed to ``merge``.

    Each worker receives its process number (0 .. num_procs - 1) and one
    ``Lock`` shared by all workers as its first two arguments. ``merge`` is
    called only after every worker has been joined.
    """
    shared_lock = Lock()
    workers = [
        Process(target=main,
                args=[proc_num, shared_lock, in_dir, years, word_list, index])
        for proc_num in range(num_procs)
    ]
    # Start every worker before joining any, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    # Call form prints the same text on Python 2 and is valid on Python 3.
    print("Merging")
    merge(word_list, years, in_dir, out_file)

if __name__ == '__main__':
    # Command-line entry point: parse arguments, load the pickled index and
    # word list, keep the words returned by get_word_indices, then hand off
    # to run_parallel.
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    # out_file and index_file used to carry help text copy-pasted from the
    # neighbouring arguments; each now describes its own argument.
    parser.add_argument("out_file", help="path to the merged output file")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("index_file", help="path to pickled word index file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()

    # Both bounds are inclusive, hence the + 1.
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.index_file)
    loaded_words = ioutils.load_pickle(args.word_file)
    # Only the word list returned by get_word_indices is needed here; the
    # indices themselves are discarded.
    word_list, _ = get_word_indices(loaded_words, index)
    run_parallel(args.num_procs, args.in_dir + "/", years, word_list, index, args.out_file)
Esempio n. 8
0
        indices = year_indices[year]
        mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        mat = mat.tocsr()
        mat = mat[indices, :]
        mat = mat[:, indices]
        samplesizes[year] = mat.sum()
    ioutils.write_pickle(samplesizes, out_file)

if __name__ == '__main__':
    # Command-line entry point: work out, for every year, the indices of the
    # selected words and pass them to run.
    parser = argparse.ArgumentParser(description="get sample sizes")
    parser.add_argument("out_file", help="output file")
    parser.add_argument("in_dir", help="input directory")
    parser.add_argument("--word-file", help="path to sorted word file(s). Must also specify index.", default=None)
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include. Must also specify word file and index.", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()

    # Both bounds are inclusive, hence the + 1.
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(INDEX_FILE)
    # NOTE(review): --word-file defaults to None but is loaded unconditionally
    # here; confirm whether it should be a required argument.
    word_pickle = ioutils.load_pickle(args.word_file)

    # -1 is the "no limit" sentinel. Slicing with [:-1] would silently drop
    # the last word, so translate the sentinel to None (an open-ended slice).
    limit = None if args.num_words == -1 else args.num_words

    if args.start_year not in word_pickle:
        # Not keyed by year: compute the indices once and share them.
        shared_indices = get_word_indices(word_pickle[:limit], index)[1]
        word_info = {year: shared_indices for year in years}
    else:
        word_info = {}
        for year in years:
            word_info[year] = get_word_indices(word_pickle[year][:limit], index)[1]
    run(args.out_file, args.in_dir + "/", years, word_info)
Esempio n. 9
0
        default=None)
    parser.add_argument(
        "--num-words",
        type=int,
        help=
        "Number of words (of decreasing average frequency) to include. Must also specify word file and index.",
        default=-1)
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=END_YEAR)
    args = parser.parse_args()

    # Both bounds are inclusive, hence the + 1.
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(INDEX_FILE)
    word_pickle = ioutils.load_pickle(args.word_file)

    # -1 is the "no limit" sentinel. Slicing with [:-1] would silently drop
    # the last word, so translate the sentinel to None (an open-ended slice).
    limit = None if args.num_words == -1 else args.num_words

    if args.start_year not in word_pickle:
        # Not keyed by year: compute the indices once and share them.
        shared_indices = get_word_indices(word_pickle[:limit], index)[1]
        word_info = {year: shared_indices for year in years}
    else:
        word_info = {}
        for year in years:
            word_info[year] = get_word_indices(
                word_pickle[year][:limit], index)[1]
    run(args.out_file, args.in_dir + "/", years, word_info)