def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after, filter_set):
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate,
                          must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
                    wrote += batch_size
        if batch:
            entries.put(batch)
            wrote += len(batch)
            batch = []
        files.task_done()
    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)
    files.task_done()  # acknowledge the 'STOP' sentinel
def pjk_writer(entries, output_file, bench_freq):
    pjk = PajekFactory()
    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()
    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()  # acknowledge the 'STOP' sentinel
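
# A minimal sketch of how these two workers could be wired together, assuming
# a main process that feeds filenames into one JoinableQueue and drains parsed
# batches from another. Worker counts, input names, and argument values here
# are hypothetical, not taken from the original scripts:
if __name__ == "__main__":
    from multiprocessing import JoinableQueue, Process

    files = JoinableQueue()
    entries = JoinableQueue()

    parsers = [Process(target=wos_parser,
                       args=(files, entries, True, None, False, 100, None, None))
               for _ in range(4)]
    writer = Process(target=pjk_writer, args=(entries, "out.net", 10000))
    for p in parsers:
        p.start()
    writer.start()

    for name in ["wos1.xml.gz", "wos2.xml.gz"]:  # hypothetical inputs
        files.put(name)
    for _ in parsers:
        files.put('STOP')   # one sentinel per parser process
    for p in parsers:
        p.join()
    entries.put('STOP')     # then stop the single writer
    writer.join()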
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after):
    batch = []
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate,
                          must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
        if batch:
            entries.put(batch)
            batch = []
        files.task_done()
    files.task_done()  # acknowledge the 'STOP' sentinel
from util.PajekFactory import PajekFactory
from util.misc import open_file

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from an edge/link file")
    parser.add_argument('outfile')
    parser.add_argument('--delimiter', '-d', help="Field delimiter", default='\t')
    parser.add_argument('--temp-dir', help="Directory to store temporary files in", default=None)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    pjk = PajekFactory(temp_dir=arguments.temp_dir)
    for filename in arguments.infile:
        with open_file(filename) as f:
            for line in f:
                v_from, v_to = line.split(arguments.delimiter)
                pjk.add_edge(v_from, v_to.strip())
    with open_file(arguments.outfile, 'w') as f:
        pjk.write(f)
parser.add_argument('--wos-only',
                    help="For WoS, exclude any citations or ids that contain a dot (.)",
                    action="store_true")
parser.add_argument('infile', nargs='+')
arguments = parser.parse_args()

b = Benchmark()
pjk = PajekFactory(temp_dir=arguments.temp_dir)

subjects = None
if arguments.subject:
    subjects = set(arguments.subject.split(","))

for filename in arguments.infile:
    with open_file(filename) as f:
        for line in f:
            entry = ujson.loads(line)
            b.increment()
            if arguments.wos_only and '.' in entry["id"]:
                continue
            if subjects:
                if "subject" not in entry:
                    continue
                if not subjects.intersection(entry["subject"]):
                    continue
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
parser.add_argument('-bf', '--benchmark_freq',
                    help="How often to emit benchmark info",
                    type=int, default=1000000)
parser.add_argument('--log', help="Logging level", default="WARNING",
                    choices=["WARNING", "INFO", "DEBUG", "ERROR", "CRITICAL"])
parser.add_argument('wosfiles', nargs='+')
arguments = parser.parse_args()

logging.basicConfig(level=arguments.log)
file_queue = JoinableQueue()
result_queue = JoinableQueue()
log = logging.getLogger(__name__)

filter_set = None
if arguments.filter:
    filter_set = set()
    log.info("Building filter from " + arguments.filter)
    with open_file(arguments.filter) as f:
        if arguments.filter.endswith(".pickle"):
            log.info("Filter is a pickle, unpickling")
            filter_set = pickle.load(f)
        else:
            # Build the id set from a tree file, then cache it as a pickle
            # so later runs can skip the parse.
            tf = TreeFile(f)
            filter_set = {e.pid for e in tf}
            pickle_path = arguments.filter + ".pickle"
            log.info("Pickling filter to %s" % pickle_path)
            with open_file(pickle_path, "w") as pf:
                pickle.dump(filter_set, pf, pickle.HIGHEST_PROTOCOL)
            tf = None
    log.info("Found %s ids to include" % len(filter_set))

date_after = None
if arguments.after:
    date_after = datetime.datetime.strptime(arguments.after, "%Y")
if not args.username or not args.password:
    logging.warning("No username or password provided")

pub_up = None
if args.publisher:
    pub_up = partial(add_publisher, args.publisher)

key_filter = None
if args.keys:
    keys = set(args.keys.split(","))
    if not keys <= Metadata.fields:
        raise ValueError("Trying to include an unknown field: %s"
                         % (keys - Metadata.fields))
    key_filter = partial(filter_dict, keys)

p = Metadata(args.host, args.username, args.password)
for filename in args.infile:
    with open_file(filename) as f:
        itr = map(ujson.loads, f)
        if pub_up:
            itr = map(pub_up, itr)
        if key_filter:
            itr = map(key_filter, itr)
        if args.dryrun:
            for e in itr:
                print(e)
        else:
            print(p.insert_bulk(itr))
#!/usr/bin/env python3
from parsers.wos import WOSStream
from util.PajekFactory import PajekFactory
from util.misc import open_file, Benchmark

if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from WOS XML")
    parser.add_argument('infile')
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)
    arguments = parser.parse_args()

    with open_file(arguments.infile) as f:
        p = WOSStream(f)
        for entry in p.parse():
            for citation in entry["citations"]:
                arguments.outfile.write("%s\t%s\n" % (entry["id"], citation))
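
# Note: despite its description, this script emits tab-separated edge lines
# rather than a finished .net file; that output feeds directly into the
# edge/link-to-Pajek script above, whose --delimiter defaults to a tab.
# A hypothetical sample of the output (the ids are made up):
#
#   WOS:000071587800004	WOS:A1995RS25600007
#   WOS:000071587800004	10.1002/example.123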
action="store_true") parser.add_argument('--benchmark-freq', default=10000, type=int) parser.add_argument( '-l', '--limit', type=int, help="Max number of recommendations to generate per-paper", default=10) args = parser.parse_args() db = leveldb.LevelDB( args.db_path, write_buffer_size=100 << 20, # 100MB block_cache_size=400 << 20) # 400MB with open_file(args.infile) as ifs: b = Benchmark(args.benchmark_freq) # Remove any non-wos entries if args.wos_only: tf = filter( lambda r: r.pid.startswith("WOS:") and '.' not in r.pid, TreeFile(ifs)) else: tf = TreeFile(ifs) writer = db for parent_cid, parent_papers, leaf_recs in parse_tree(tf, args.limit): for l in leaf_recs: for paper in l.get_papers(): # Register each paper's cluster_id
parser.add_argument("--sample-rate", help="Edge sample rate", type=float, default=None) parser.add_argument("--must-cite", action="store_true", help="Only include nodes that cite other nodes") parser.add_argument("-f", "--force", help="If outptut file already exists overwrite it.", action="store_true") parser.add_argument("-a", "--after", help="Only include nodes published on or after this year") parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000) parser.add_argument("infile", nargs="+") arguments = parser.parse_args() date_after = None if arguments.after: date_after = datetime.datetime.strptime(arguments.after, "%Y") b = Benchmark() for file_name in arguments.infile: with open_file(file_name, "r") as f: p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after) output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0] if arguments.outdir: output_file = os.path.join(arguments.outdir, output_file) if not arguments.force and os.path.isfile(output_file): print("%s already exists, skipping..." % output_file) break with open(output_file, "w", encoding="utf-8") as g: for entry in p.parse(): dump(entry, g, ensure_ascii=False) g.write("\n") b.increment()
if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Creates recommendations and stores in an LevelDB from a tree file (.tree)") parser.add_argument('infile') parser.add_argument('db_path') parser.add_argument("--wos-only", help="Only include papers in the WOS collection", action="store_true") parser.add_argument('--benchmark-freq', default=10000, type=int) parser.add_argument('-l', '--limit', type=int, help="Max number of recommendations to generate per-paper", default=10) args = parser.parse_args() db = leveldb.LevelDB(args.db_path, write_buffer_size=100 << 20, # 100MB block_cache_size=400 << 20) # 400MB with open_file(args.infile) as ifs: b = Benchmark(args.benchmark_freq) # Remove any non-wos entries if args.wos_only: tf = filter(lambda r: r.pid.startswith("WOS:") and '.' not in r.pid, TreeFile(ifs)) else: tf = TreeFile(ifs) writer = db for parent_cid, parent_papers, leaf_recs in parse_tree(tf, args.limit): for l in leaf_recs: for paper in l.get_papers(): # Register each paper's cluster_id writer.Put(("paper|" + paper).encode(), l.cluster_id.encode()) b.increment()
include_fields.update(map(str.strip, args.include.split(",")))

t = Metadata(client, table_name)
if args.flush:
    logging.info("Deleting table: " + t.table_name)
    if not args.dryrun:
        t.delete()
if args.create:
    logging.info("Creating table: " + t.table_name)
    if not args.dryrun:
        t.create(write=2000)

b = Benchmark(args.benchmark_freq)
with open_file(args.metadata) as ifs:
    with t.get_batch_put_context() as batch:
        reader = csv.DictReader(ifs, delimiter=args.delimiter)
        for row in reader:
            new_row = {k: v for k, v in row.items() if k in include_fields}
            if not REQUIRED_KEYS.issubset(new_row.keys()):
                print(row)
                raise KeyError("Not all required keys present")
            if not args.dryrun:
                batch.put_item(Item=new_row)
            b.increment()

if not args.dryrun:
    t.update_throughput()
b.print_freq()
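
# get_batch_put_context is presumably a thin wrapper over boto3's DynamoDB
# batch writer, which buffers puts and flushes BatchWriteItem calls
# automatically. A minimal sketch of the underlying pattern (the table name
# and item are hypothetical):
import boto3

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("wos_metadata")
with table.batch_writer() as batch:
    batch.put_item(Item={"id": "WOS:000071587800004", "title": "Example"})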
default="WARNING", choices=["WARNING", "INFO", "DEBUG", "ERROR", "CRITICAL"]) parser.add_argument('wosfiles', nargs='+') arguments = parser.parse_args() logging.basicConfig(level=arguments.log) file_queue = JoinableQueue() result_queue = JoinableQueue() log = logging.getLogger(__name__) filter_set = None if arguments.filter: filter_set = set() log.info("Building filter from " + arguments.filter) with open_file(arguments.filter) as f: if arguments.filter.endswith(".pickle"): log.info("Filter is a pickle, unpickling") filter_set = pickle.load(f) else: tf = TreeFile(f) filter_set = {e.pid for e in tf} pickle_path = arguments.filter + ".pickle" log.info("Pickling filter to %s" % pickle_path) with open_file(pickle_path, "w") as pf: pickle.dump(filter_set, pf, pickle.HIGHEST_PROTOCOL) tf = None log.info("Found %s ids to include" % len(filter_set)) date_after = None if arguments.after:
from util.misc import open_file

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Merge tree files")
    parser.add_argument('outfile')
    parser.add_argument('infiles', nargs='+')
    args = parser.parse_args()

    with open_file(args.outfile, 'w') as outf:
        for i, filename in enumerate(args.infiles, start=1):
            with open_file(filename) as inf:
                for l in inf:
                    # Skip comment headers; prefix each record's cluster path
                    # with this file's index so merged ids stay unique.
                    if l[0] == "#":
                        continue
                    outf.write("%d:%s" % (i, l))
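
# The merge assumes infomap-style .tree files: "#"-prefixed header lines,
# then one record per line whose first field is a colon-delimited cluster
# path. A hypothetical before/after for the second input file (i == 2):
#
#   in:   1:3 0.00024 "WOS:000071587800004"
#   out:  2:1:3 0.00024 "WOS:000071587800004"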
#!/usr/bin/env python3
if __name__ == "__main__":
    import argparse
    import pickle

    from util.AutoID import AutoID
    from util.misc import open_file

    parser = argparse.ArgumentParser(description="Remap strings to ids")
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('pickle', help="Store AutoID pickle")
    parser.add_argument('--start', type=int, default=1, help="Value to start with")
    parser.add_argument('--delimiter', default=' ', help="ID delimiter", type=str)
    arguments = parser.parse_args()

    aid = AutoID(arguments.start)
    with open_file(arguments.outfile, "w") as o:
        with open_file(arguments.infile) as f:
            for line in f:
                # Map each whitespace-stripped token to its integer id.
                ids = [str(aid[token.strip()])
                       for token in line.split(arguments.delimiter)]
                o.write(arguments.delimiter.join(ids))
                o.write('\n')
    with open(arguments.pickle, "wb") as p:
        pickle.dump(aid, p, pickle.HIGHEST_PROTOCOL)
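
# util.AutoID is not shown in this dump; judging from the usage above
# (constructed with a start value, indexed like a dict, picklable), a
# minimal stand-in could look like this sketch:
class AutoID(dict):
    """Dict that assigns the next integer id to any unseen key."""

    def __init__(self, start=1):
        super().__init__()
        self.next_id = start

    def __missing__(self, key):
        self[key] = self.next_id
        self.next_id += 1
        return self[key]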