def leveldb_writer(entries, db_path, batch_size, bench_freq):
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s" % db_path)
    if batch_size:
        log.info("Batch Size - %s" % batch_size)
    log.info("Benchmark Freq - %s" % bench_freq)

    db = leveldb.LevelDB(
        db_path,
        error_if_exists=True,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB
    if batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    b = Benchmark(bench_freq)
    # Consume lists of parsed entries until the 'STOP' sentinel arrives.
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            db.Put(entry["id"].encode(),
                   msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
        entries.task_done()

    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    entries.task_done()
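The writer above runs as a queue consumer: it drains lists of entries until a 'STOP' sentinel arrives, flushing the batch every batch_size puts. For orientation, a minimal driver sketch, assuming the worker is fed from a multiprocessing.JoinableQueue; the queue wiring, database path, and sample entry here are illustrative assumptions, not part of the example:

import multiprocessing

if __name__ == '__main__':
    entries = multiprocessing.JoinableQueue()
    worker = multiprocessing.Process(
        target=leveldb_writer,  # the function defined above
        args=(entries, "/tmp/wos.ldb", 1000, 1000000))
    worker.start()

    entries.put([{"id": "WOS:000000001", "citations": []}])  # one list of entries
    entries.put('STOP')  # sentinel recognized by iter(entries.get, 'STOP')

    entries.join()   # returns once task_done() has matched every put()
    worker.join()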
Example #4
def pjk_writer(entries, output_file, bench_freq):
    pjk = PajekFactory()
    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()
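PajekFactory is a project helper that accumulates citation edges and serializes them as a Pajek .net file. As a rough illustration of the add_edge()/write() interface used above (a stand-in under assumptions, not the project's implementation):

class PajekFactorySketch:
    def __init__(self):
        self.vertices = {}  # node identifier -> 1-based vertex number
        self.edges = []     # (source_vertex, target_vertex) pairs

    def _vertex(self, name):
        # Assign vertex numbers in first-seen order.
        return self.vertices.setdefault(name, len(self.vertices) + 1)

    def add_edge(self, source, target):
        self.edges.append((self._vertex(source), self._vertex(target)))

    def write(self, f):
        f.write("*Vertices %d\n" % len(self.vertices))
        for name, num in sorted(self.vertices.items(), key=lambda kv: kv[1]):
            f.write('%d "%s"\n' % (num, name))
        f.write("*Arcs\n")
        for source, target in self.edges:
            f.write("%d %d\n" % (source, target))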
Example #5
        '-a',
        '--after',
        help="Only include nodes published on or after this year")
    parser.add_argument('-bf',
                        '--benchmark_freq',
                        help="How often to emit benchmark info",
                        type=int,
                        default=1000000)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark(arguments.benchmark_freq)

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                          arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(
                ".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                continue
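open_file is a project helper used both for reading inputs and for writing output; it presumably dispatches on file extension so compressed dumps stream transparently. A minimal sketch of such a helper, as an assumption:

import gzip

def open_file_sketch(path, mode="r"):
    # Hypothetical stand-in for the project's open_file helper: stream
    # gzip-compressed files as text, fall back to plain open().
    if path.endswith(".gz"):
        return gzip.open(path, mode + "t")
    return open(path, mode)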
Example #6
    parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from WOS XML")
    parser.add_argument("--outdir", help="Directory to write JSON files to")
    parser.add_argument("--wos-only", action="store_true", help="Only include nodes/edges in WOS")
    parser.add_argument("--sample-rate", help="Edge sample rate", type=float, default=None)
    parser.add_argument("--must-cite", action="store_true", help="Only include nodes that cite other nodes")
    parser.add_argument("-f", "--force", help="If outptut file already exists overwrite it.", action="store_true")
    parser.add_argument("-a", "--after", help="Only include nodes published on or after this year")
    parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000)
    parser.add_argument("infile", nargs="+")
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark(arguments.benchmark_freq)

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                continue

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
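                    # The listing truncates here. Presumed loop body (an
                    # assumption): serialize each parsed entry to the JSON
                    # output file; json is assumed to be imported.
                    json.dump(entry, g)
                    g.write("\n")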
                        help="Convert scores to integers, larger is better",
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-l',
        '--limit',
        type=int,
        help="Max number of recommendations to generate per-paper",
        default=10)
    args = parser.parse_args()

    db = leveldb.LevelDB(
        args.db_path,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB
    b = Benchmark(args.benchmark_freq)
    tf = TreeFile(args.infile)

    if args.batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    for recs in make_expert_rec(tf, args.limit):
        recd = [r.pid for r in recs]
        key = recs[0].target_pid + "|expert"
        writer.Put(key.encode(), msgpack.packb(recd))
        b.increment()
        if args.batch_size and b.count % args.batch_size == 0:
            db.Write(writer)
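The snippet ends inside the loop. If it finishes like leveldb_writer above, a final flush of the partial batch and a throughput report presumably follow; a sketch of that continuation, under that assumption:

    # Presumed continuation (mirrors leveldb_writer above, an assumption):
    # flush whatever remains in the batch, then report throughput.
    if args.batch_size:
        db.Write(writer)
    b.print_freq()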
Example #8
        client = boto3.resource('dynamodb', endpoint_url="http://localhost:8000")
    else:
        client = boto3.resource('dynamodb')

    t = Table(client, table_name)

    if args.flush:
        logging.info("Deleting table: " + t.table_name)
        if not args.dryrun:
            t.delete()

    if args.create:
        logging.info("Creating table: " + t.table_name)
        if not args.dryrun:
            t.create(write=2000)

    b = Benchmark()
    if args.rec_type:
        if args.rec_type == "classic":
            make_classic(args, b, t)
        else:
            make_expert(args, b, t)
    else:
        make_classic(args, b, t)
        make_expert(args, b, t)
    b.print_freq()

    if not args.dryrun:
        t.update_throughput()
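The attribute reads above (args.flush, args.create, args.dryrun, args.rec_type) imply a particular argparse setup. Hypothetical flag definitions consistent with the snippet, with help text and defaults as assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--flush", action="store_true",
                    help="Delete the table before loading")
parser.add_argument("--create", action="store_true",
                    help="Create the table before loading")
parser.add_argument("--dryrun", action="store_true",
                    help="Log destructive actions without performing them")
parser.add_argument("--rec_type", choices=["classic", "expert"],
                    help="Build only one recommendation type")
args = parser.parse_args()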

Example #9
        client = boto3.resource('dynamodb',
                                endpoint_url="http://localhost:8000")
    else:
        client = boto3.resource('dynamodb')

    t = Recommendation(client, table_name)

    if args.flush:
        logging.info("Deleting table: " + t.table_name)
        if not args.dryrun:
            t.delete()

    if args.create:
        logging.info("Creating table: " + t.table_name)
        if not args.dryrun:
            t.create(write=2000)

    b = Benchmark()
    if args.rec_type:
        if args.rec_type == "classic":
            make_classic(args, b, t)
        else:
            make_expert(args, b, t)
    else:
        make_classic(args, b, t)
        make_expert(args, b, t)
    b.print_freq()

    if not args.dryrun:
        t.update_throughput()