Ejemplo n.º 1
0
        os.remove(out_dir + str(year) + ".tmp.txt")

def run_parallel(num_procs, out_dir, in_dir, count_dir, years, words, num_words, min_count, sample):
    """Distribute the given years over ``num_procs`` worker processes.

    Each year is put on a shared ``Queue``. Every process runs ``worker`` with
    its own index, that queue, and the remaining arguments passed through
    unchanged. The call returns once all processes have been joined.
    """
    year_queue = Queue()
    for yr in years:
        year_queue.put(yr)

    # Arguments common to every worker; the per-process index goes in front.
    shared_args = [year_queue, out_dir, in_dir, count_dir, words, num_words, min_count, sample]
    workers = []
    for proc_idx in range(num_procs):
        workers.append(Process(target=worker, args=[proc_idx] + shared_args))

    # Start every process before joining any of them.
    for proc in workers:
        proc.start()
    for proc in workers:
        proc.join()

if __name__ == '__main__':
    # Command-line entry point: parse the options, load the per-year word
    # lists, create the output directory, and hand the work to run_parallel.
    parser = argparse.ArgumentParser(description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument("word_file")
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--num-words", type=int, default=None)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000)
    # The help text used to be a copy of the --end-year description.
    parser.add_argument("--year-inc", type=int, help="step between consecutive years", default=1)
    parser.add_argument("--min-count", type=int, default=100)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()

    # +1 makes the end year inclusive, as promised by the option help.
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    words = ioutils.load_year_words(args.word_file, years)
    ioutils.mkdir(args.out_dir)
    # Directories are passed with a trailing "/" -- presumably because the
    # workers build file paths by plain string concatenation; verify there.
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", args.count_dir + "/",
                 years, words, args.num_words, args.min_count, args.sample)
Ejemplo n.º 2
0
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
        np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
        write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")

if __name__ == '__main__':
    # Command-line entry point: parse the options, queue the years, and run
    # one `worker` process per requested worker.
    parser = ArgumentParser("Run SVD on historical co-occurrence matrices")
    parser.add_argument("in_dir", help="Directory with PPMI data")
    # NOTE(review): this help text is identical to in_dir's -- looks like a
    # copy-paste; confirm what count_dir is expected to contain.
    parser.add_argument("count_dir", help="Directory with PPMI data")
    parser.add_argument("word_file", help="File containing sorted list of words to potentially include")
    parser.add_argument("--num-words", type=int, help="Number of words to include", default=1000000)
    parser.add_argument("--dim", type=int, default=300)
    parser.add_argument("--workers", type=int, default=50)
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=1990)
    parser.add_argument("--year-inc", type=int, default=10)
    parser.add_argument("--min-count", type=int, default=100)
    args = parser.parse_args()

    queue = Queue()
    # Materialize the range as a list: on Python 3 a range object has no
    # .reverse(), so reversing it in place raised AttributeError.
    years = list(range(args.start_year, args.end_year + 1, args.year_inc))
    # Queue the latest years first.
    years.reverse()
    for year in years:
        queue.put(year)

    # Results go under <in_dir>/svd/<dim>/<num_words>/<min_count>/.
    out_dir = args.in_dir + "/svd/" + str(args.dim) + "/" + str(args.num_words) + "/" + str(args.min_count) + "/"
    mkdir(out_dir)
    words = load_year_words(args.word_file, years)
    procs = [Process(target=worker, args=[i, queue, out_dir, args.in_dir, args.count_dir, words, args.dim, args.num_words, args.min_count]) for i in range(args.workers)]
    # Start every process before joining any of them.
    for p in procs:
        p.start()
    for p in procs:
        p.join()
        description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument(
        "word_file",
        help="file maps from year to word list (the output of freqperyear)")
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--num-words", type=int, default=None)
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=1800)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=2000)
    parser.add_argument("--year-inc",
                        type=int,
                        help="end year (inclusive)",
                        default=1)
    parser.add_argument("--min-count", type=int, default=100)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    words = ioutils.load_year_words(args.word_file, years)
    ioutils.mkdir(args.out_dir)
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/",
                 args.count_dir + "/", years, words, args.num_words,
                 args.min_count, args.sample)
Ejemplo n.º 4
0
                        type=int,
                        help="Number of words to include",
                        default=1000000)
    parser.add_argument("--dim", type=int, default=300)
    parser.add_argument("--workers", type=int, default=50)
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=1990)
    parser.add_argument("--year-inc", type=int, default=10)
    parser.add_argument("--min-count", type=int, default=100)
    args = parser.parse_args()
    queue = Queue()
    years = list(range(args.start_year, args.end_year + 1, args.year_inc))
    years.reverse()
    for year in years:
        queue.put(year)
    out_dir = args.in_dir + "/svd/" + str(args.dim) + "/" + str(
        args.num_words) + "/" + str(args.min_count) + "/"
    mkdir(out_dir)
    words = load_year_words(args.word_file, years)
    procs = [
        Process(target=worker,
                args=[
                    i, queue, out_dir, args.in_dir, args.count_dir, words,
                    args.dim, args.num_words, args.min_count
                ]) for i in range(args.workers)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()