def leveldb_writer(entries, db_path, batch_size, bench_freq):
    """Consume batches of parsed entries from a queue (until a 'STOP'
    sentinel) and store each entry in LevelDB keyed by its id."""
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s" % db_path)
    if batch_size:
        log.info("Batch Size - %s" % batch_size)
    log.info("Benchmark Freq - %s" % bench_freq)

    db = leveldb.LevelDB(db_path,
                         error_if_exists=True,
                         write_buffer_size=100 << 20,  # 100MB
                         block_cache_size=400 << 20)   # 400MB

    # With a batch size, puts accumulate in a WriteBatch and are flushed
    # periodically; otherwise they go straight to the database.
    if batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            writer.Put(entry["id"].encode(),
                       msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
                writer = leveldb.WriteBatch()  # start a fresh batch after flushing
        entries.task_done()

    # Flush any remaining batched writes before reporting.
    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    entries.task_done()  # account for the 'STOP' sentinel
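# Sketch of the encode_datetime hook the msgpack.dumps call above relies on.
# The real hook is defined elsewhere in the project; the ISO-8601 encoding
# used here is an assumption, not necessarily the project's actual format.
import datetime

def encode_datetime(obj):
    # msgpack invokes `default` for objects it cannot serialize natively.
    if isinstance(obj, datetime.datetime):
        return {"__datetime__": obj.isoformat()}
    raise TypeError("Cannot serialize %r" % obj)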
def pjk_writer(entries, output_file, bench_freq):
    """Consume batches of entries from a queue (until a 'STOP' sentinel) and
    write the accumulated citation graph as a Pajek (.net) file."""
    pjk = PajekFactory()
    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()  # account for the 'STOP' sentinel
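# Both writers drain a queue until they see the 'STOP' sentinel, so they are
# presumably run as worker processes. An illustrative driver, assuming a
# multiprocessing.JoinableQueue; `parsed_batches` and "citations.net" are
# hypothetical names for this example, not taken from the project.
from multiprocessing import JoinableQueue, Process

entries = JoinableQueue()
worker = Process(target=pjk_writer, args=(entries, "citations.net", 1000000))
worker.start()

for entry_list in parsed_batches:   # hypothetical producer of entry batches
    entries.put(entry_list)

entries.put('STOP')  # sentinel recognised by iter(entries.get, 'STOP')
entries.join()       # blocks until task_done() has been called for every put
worker.join()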
parser.add_argument('-a', '--after',
                    help="Only include nodes published on or after this year")
parser.add_argument('-bf', '--benchmark_freq',
                    help="How often to emit benchmark info",
                    type=int, default=1000000)
parser.add_argument('infile', nargs='+')
arguments = parser.parse_args()

date_after = None
if arguments.after:
    date_after = datetime.datetime.strptime(arguments.after, "%Y")

b = Benchmark()
for file_name in arguments.infile:
    with open_file(file_name, "r") as f:
        p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                      arguments.must_cite, date_after)
        output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]
        if arguments.outdir:
            output_file = os.path.join(arguments.outdir, output_file)
        if not arguments.force and os.path.isfile(output_file):
            print("%s already exists, skipping..." % output_file)
            continue  # skip just this input file; use --force to overwrite
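# open_file is a project helper used here and in pjk_writer; its implementation
# is not shown in this section. A plausible sketch, assuming it simply picks a
# compression-aware opener based on the file extension:
import bz2
import gzip

def open_file(path, mode="r"):
    # Text-mode ("r"/"w") access to plain, gzip- or bz2-compressed files.
    if path.endswith(".gz"):
        return gzip.open(path, mode + "t")
    if path.endswith(".bz2"):
        return bz2.open(path, mode + "t")
    return open(path, mode)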
parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from WOS XML")
parser.add_argument("--outdir", help="Directory to write JSON files to")
parser.add_argument("--wos-only", action="store_true",
                    help="Only include nodes/edges in WOS")
parser.add_argument("--sample-rate", help="Edge sample rate",
                    type=float, default=None)
parser.add_argument("--must-cite", action="store_true",
                    help="Only include nodes that cite other nodes")
parser.add_argument("-f", "--force",
                    help="If output file already exists, overwrite it.",
                    action="store_true")
parser.add_argument("-a", "--after",
                    help="Only include nodes published on or after this year")
parser.add_argument("-bf", "--benchmark_freq",
                    help="How often to emit benchmark info",
                    type=int, default=1000000)
parser.add_argument("infile", nargs="+")
arguments = parser.parse_args()

date_after = None
if arguments.after:
    date_after = datetime.datetime.strptime(arguments.after, "%Y")

b = Benchmark()
for file_name in arguments.infile:
    with open_file(file_name, "r") as f:
        p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                      arguments.must_cite, date_after)
        output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]
        if arguments.outdir:
            output_file = os.path.join(arguments.outdir, output_file)
        if not arguments.force and os.path.isfile(output_file):
            print("%s already exists, skipping..." % output_file)
            continue  # skip just this input file; use --force to overwrite
        with open(output_file, "w", encoding="utf-8") as g:
            for entry in p.parse():
help="Convert scores to integers, larger is better", action='store_true', default=False) parser.add_argument( '-l', '--limit', type=int, help="Max number of recommendations to generate per-paper", default=10) args = parser.parse_args() db = leveldb.LevelDB( args.db_path, write_buffer_size=100 << 20, # 100MB block_cache_size=400 << 20) # 400MB b = Benchmark(args.benchmark_freq) tf = TreeFile(args.infile) if args.batch_size: writer = leveldb.WriteBatch() else: writer = db for recs in make_expert_rec(tf, args.limit): recd = [r.pid for r in recs] key = recs[0].target_pid + "|expert" writer.Put(key.encode(), msgpack.packb(recd)) b.increment() if args.batch_size and b.count % args.batch_size == 0: db.Write(writer)
    client = boto3.resource('dynamodb', endpoint_url="http://localhost:8000")
else:
    client = boto3.resource('dynamodb')

t = Table(client, table_name)

if args.flush:
    logging.info("Deleting table: " + t.table_name)
    if not args.dryrun:
        t.delete()

if args.create:
    logging.info("Creating table: " + t.table_name)
    if not args.dryrun:
        t.create(write=2000)

b = Benchmark()
if args.rec_type:
    if args.rec_type == "classic":
        make_classic(args, b, t)
    else:
        make_expert(args, b, t)
else:
    make_classic(args, b, t)
    make_expert(args, b, t)

b.print_freq()
if not args.dryrun:
    t.update_throughput()
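# Hypothetical sketch of the Table wrapper used above; the real class lives
# elsewhere in the project. Assumptions: a single "id" hash key and boto3's
# DynamoDB resource API.
class Table:
    def __init__(self, client, table_name):
        self.client = client              # boto3 DynamoDB service resource
        self.table_name = table_name
        self.table = client.Table(table_name)

    def delete(self):
        self.table.delete()

    def create(self, write=5, read=5):
        self.table = self.client.create_table(
            TableName=self.table_name,
            KeySchema=[{"AttributeName": "id", "KeyType": "HASH"}],
            AttributeDefinitions=[{"AttributeName": "id", "AttributeType": "S"}],
            ProvisionedThroughput={"ReadCapacityUnits": read,
                                   "WriteCapacityUnits": write})

    def update_throughput(self, write=1, read=5):
        # Assumed intent: drop write capacity back down after the bulk load.
        self.table.update(
            ProvisionedThroughput={"ReadCapacityUnits": read,
                                   "WriteCapacityUnits": write})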
    client = boto3.resource('dynamodb', endpoint_url="http://localhost:8000")
else:
    client = boto3.resource('dynamodb')

t = Recommendation(client, table_name)

if args.flush:
    logging.info("Deleting table: " + t.table_name)
    if not args.dryrun:
        t.delete()

if args.create:
    logging.info("Creating table: " + t.table_name)
    if not args.dryrun:
        t.create(write=2000)

b = Benchmark()
if args.rec_type:
    if args.rec_type == "classic":
        make_classic(args, b, t)
    else:
        make_expert(args, b, t)
else:
    make_classic(args, b, t)
    make_expert(args, b, t)

b.print_freq()
if not args.dryrun:
    t.update_throughput()
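# Minimal sketch of the Benchmark helper used throughout these scripts.
# Assumed behaviour, inferred from how it is called above: count processed
# items and periodically log the throughput.
import logging
import time

class Benchmark:
    def __init__(self, freq=1000000):
        self.freq = freq            # report every `freq` items
        self.count = 0
        self.start = time.time()

    def increment(self):
        self.count += 1
        if self.count % self.freq == 0:
            self.print_freq()

    def print_freq(self):
        elapsed = time.time() - self.start
        rate = self.count / elapsed if elapsed else 0.0
        logging.info("%d items in %.1fs (%.0f items/s)", self.count, elapsed, rate)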