def leveldb_writer(entries, db_path, batch_size, bench_freq):
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s" % db_path)
    if batch_size:
        log.info("Batch Size - %s" % batch_size)
    log.info("Benchmark Freq - %s" % bench_freq)

    db = leveldb.LevelDB(
        db_path,
        error_if_exists=True,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB
    if batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            db.Put(entry["id"].encode(),
                   msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
        entries.task_done()

    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    entries.task_done()
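
The writer above drains a queue of entry lists until it sees the 'STOP' sentinel, so the driving side only has to honor that protocol. A minimal sketch of that side, assuming leveldb_writer and its dependencies are importable; the entry fields, pid, and path are made-up placeholders:

import datetime
import multiprocessing

entries = multiprocessing.JoinableQueue()
worker = multiprocessing.Process(
    target=leveldb_writer,
    args=(entries, "/tmp/entries.ldb", 10000, 1000000))
worker.start()

# Producer side: push lists of entries, then the sentinel the writer loops on.
entries.put([{"id": "WOS:000001", "date": datetime.datetime(2001, 1, 1)}])
entries.put('STOP')

entries.join()   # returns once the writer has task_done()'d every item
worker.join()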
Example #2
def pjk_writer(entries, output_file, bench_freq):
    pjk = PajekFactory()
    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()
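
pjk_writer follows the same queue-and-sentinel protocol, but it accumulates the whole citation graph in the PajekFactory and writes it once at the end, so it can also be exercised in-process with a plain queue. A small sketch, assuming pjk_writer and its dependencies are importable; the ids and output file name are placeholders:

import queue

entries = queue.Queue()
entries.put([
    {"id": "A", "citations": ["B", "C"]},
    {"id": "B", "citations": ["C"]},
])
entries.put('STOP')
pjk_writer(entries, "citations.net", bench_freq=1000000)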
Example #3
    parser.add_argument(
        '-a',
        '--after',
        help="Only include nodes published on or after this year")
    parser.add_argument('-bf',
                        '--benchmark_freq',
                        help="How often to emit benchmark info",
                        type=int,
                        default=1000000)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                          arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(
                ".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                continue
Example #4
                        help="Convert scores to integers, larger is better",
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-l',
        '--limit',
        type=int,
        help="Max number of recommendations to generate per-paper",
        default=10)
    args = parser.parse_args()

    db = leveldb.LevelDB(
        args.db_path,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB
    b = Benchmark(args.benchmark_freq)
    tf = TreeFile(args.infile)

    if args.batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    for recs in make_expert_rec(tf, args.limit):
        recd = [r.pid for r in recs]
        key = recs[0].target_pid + "|expert"
        writer.Put(key.encode(), msgpack.packb(recd))
        b.increment()
        if args.batch_size and b.count % args.batch_size == 0:
            db.Write(writer)
            # same pattern as above: a written batch is not cleared, start fresh
            writer = leveldb.WriteBatch()

    if args.batch_size:
        db.Write(writer)  # flush whatever remains in the final partial batch