Example #1
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after, filter_set):
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate, must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
                    wrote += batch_size
        if len(batch):
            entries.put(batch)
            wrote += len(batch)
            batch = []
        files.task_done()

    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)

    files.task_done()
Example #2
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after, filter_set):
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f,
                          wos_only=wos_only,
                          sample_rate=sample_rate,
                          must_cite=must_cite,
                          date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
                    wrote += batch_size
        if len(batch):
            entries.put(batch)
            wrote += len(batch)
            batch = []
        files.task_done()

    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)

    files.task_done()
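These workers are built around multiprocessing JoinableQueues and a 'STOP' sentinel, but none of the snippets show the wiring. A minimal driver sketch, not from the source repo (the worker count, file names, and parameter values are illustrative), pairing wos_parser with the pjk_writer consumer from the next example:

from multiprocessing import JoinableQueue, Process

if __name__ == "__main__":
    NUM_WORKERS = 4  # illustrative
    file_queue = JoinableQueue()
    result_queue = JoinableQueue()

    # Several parsers produce entry batches; one writer consumes them.
    for _ in range(NUM_WORKERS):
        Process(target=wos_parser,
                args=(file_queue, result_queue, True, None, True, 100,
                      None, None)).start()
    Process(target=pjk_writer, args=(result_queue, "out.net", 1000)).start()

    for path in ["wos1.xml.gz", "wos2.xml.gz"]:  # illustrative inputs
        file_queue.put(path)
    for _ in range(NUM_WORKERS):
        file_queue.put('STOP')  # one sentinel per parser

    file_queue.join()         # every file (and sentinel) marked task_done
    result_queue.put('STOP')  # then stop the single writer
    result_queue.join()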
Example #3
def pjk_writer(entries, output_file, bench_freq):
    pjk = PajekFactory()
    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()
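For context, PajekFactory presumably emits the standard Pajek .net layout: a numbered vertex list followed by the arcs between those numbers. A tiny illustrative file (ids invented):

*Vertices 3
1 "WOS:000000000000001"
2 "WOS:000000000000002"
3 "WOS:000000000000003"
*Arcs
1 2
1 3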
Example #4
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after):
    batch = []
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate, must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
        if len(batch):
            entries.put(batch)
            batch = []
        files.task_done()
    files.task_done()
Example #5
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after):
    batch = []
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f,
                          wos_only=wos_only,
                          sample_rate=sample_rate,
                          must_cite=must_cite,
                          date_after=date_after)
            for entry in p.parse():
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    batch = []
        if len(batch):
            entries.put(batch)
            batch = []
        files.task_done()
    files.task_done()
Example #6
from util.PajekFactory import PajekFactory
from util.misc import open_file

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from an edge/link file")
    parser.add_argument('outfile')
    parser.add_argument('--delimiter', '-d', help="Field delimiter", default='\t')
    parser.add_argument('--temp-dir', help="Directory to store temporary files in", default=None)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    pjk = PajekFactory(temp_dir=arguments.temp_dir)

    for filename in arguments.infile:
        with open_file(filename) as f:
            for line in f:
                v_from, v_to = line.split(arguments.delimiter)
                pjk.add_edge(v_from, v_to.strip())

    with open_file(arguments.outfile, 'w') as f:
        pjk.write(f)
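The infile here is one delimited edge per line. With the default tab delimiter, a couple of illustrative lines (ids invented) would be:

WOS:000000000000001	WOS:000000000000002
WOS:000000000000001	WOS:000000000000003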
Example #7
    parser.add_argument(
        '--wos-only',
        help="For WoS, exclude any citations or ids that contain a dot (.)",
        action="store_true")
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    b = Benchmark()
    pjk = PajekFactory(temp_dir=arguments.temp_dir)

    subjects = None
    if arguments.subject:
        subjects = set(arguments.subject.split(","))

    for filename in arguments.infile:
        with open_file(filename) as f:
            for line in f:
                entry = ujson.loads(line)
                b.increment()

                if arguments.wos_only and '.' in entry["id"]:
                    continue

                if subjects:
                    if "subject" not in entry:
                        continue

                    if not subjects.intersection(entry["subject"]):
                        continue

                for citation in entry["citations"]:
Example #8
    parser.add_argument('-bf', '--benchmark_freq', help="How often to emit benchmark info", type=int, default=1000000)
    parser.add_argument('--log', help="Logging level", default="WARNING", choices=["WARNING", "INFO", "DEBUG", "ERROR", "CRITICAL"])
    parser.add_argument('wosfiles', nargs='+')
    arguments = parser.parse_args()

    logging.basicConfig(level=arguments.log)

    file_queue = JoinableQueue()
    result_queue = JoinableQueue()
    log = logging.getLogger(__name__)

    filter_set = None
    if arguments.filter:
        filter_set = set()
        log.info("Building filter from " + arguments.filter)
        with open_file(arguments.filter) as f:
            if arguments.filter.endswith(".pickle"):
                log.info("Filter is a pickle, unpickling")
                filter_set = pickle.load(f)
            else:
                tf = TreeFile(f)
                filter_set = {e.pid for e in tf}
                pickle_path = arguments.filter+".pickle"
                log.info("Pickling filter to %s" % pickle_path)
                with open_file(pickle_path, "w") as pf:
                    pickle.dump(filter_set, pf, pickle.HIGHEST_PROTOCOL)
                tf = None
            log.info("Found %s ids to include" % len(filter_set))

    date_after = None
    if arguments.after:
Example #9
    if not args.username or not args.password:
        logging.warning("No username or password provided")

    pub_up = None
    if args.publisher:
        pub_up = partial(add_publisher, args.publisher)

    key_filter = None
    if args.keys:
        keys = set((args.keys.split(",")))
        if not keys <= Metadata.fields:
            raise ValueError("Trying to include an unknown field: % " % keys - Metadata.fields)
        key_filter = partial(filter_dict, keys)

    p = Metadata(args.host, args.username, args.password)
    for filename in args.infile:
        with open_file(filename) as f:
            itr = map(ujson.loads, f)

            if pub_up:
                itr = map(pub_up, itr)

            if key_filter:
                itr = map(key_filter, itr)

            if args.dryrun:
                for e in itr:
                    print(e)
            else:
                print(p.insert_bulk(itr))
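add_publisher and filter_dict are partially applied above but never shown in these snippets. Plausible minimal implementations, consistent with that usage (assumptions, not the repo's code):

def add_publisher(publisher, entry):
    # partial(add_publisher, args.publisher) tags every record in the stream.
    entry["publisher"] = publisher
    return entry

def filter_dict(keys, entry):
    # partial(filter_dict, keys) keeps only the whitelisted fields.
    return {k: v for k, v in entry.items() if k in keys}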
Example #10
        logging.warning("No username or password provided")

    pub_up = None
    if args.publisher:
        pub_up = partial(add_publisher, args.publisher)

    key_filter = None
    if args.keys:
        keys = set((args.keys.split(",")))
        if not keys <= Metadata.fields:
            raise ValueError("Trying to include an unknown field: % " % keys -
                             Metadata.fields)
        key_filter = partial(filter_dict, keys)

    p = Metadata(args.host, args.username, args.password)
    for filename in args.infile:
        with open_file(filename) as f:
            itr = map(ujson.loads, f)

            if pub_up:
                itr = map(pub_up, itr)

            if key_filter:
                itr = map(key_filter, itr)

            if args.dryrun:
                for e in itr:
                    print(e)
            else:
                print(p.insert_bulk(itr))
Example #11
#!/usr/bin/env python3
from parsers.wos import WOSStream
from util.PajekFactory import PajekFactory
from util.misc import open_file, Benchmark

if __name__ == "__main__":
    import argparse
    import sys
    parser = argparse.ArgumentParser(
        description="Creates Pajek (.net) files from WOS XML")
    parser.add_argument('infile')
    parser.add_argument('outfile',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout)
    arguments = parser.parse_args()

    with open_file(arguments.infile) as f:
        p = WOSStream(f)

        for entry in p.parse():
            for citation in entry["citations"]:
                arguments.outfile.write("%s\t%s\n" % (entry["id"], citation))
Example #12
                        action="store_true")
    parser.add_argument('--benchmark-freq', default=10000, type=int)
    parser.add_argument(
        '-l',
        '--limit',
        type=int,
        help="Max number of recommendations to generate per-paper",
        default=10)
    args = parser.parse_args()

    db = leveldb.LevelDB(
        args.db_path,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB

    with open_file(args.infile) as ifs:
        b = Benchmark(args.benchmark_freq)
        # Remove any non-wos entries
        if args.wos_only:
            tf = filter(
                lambda r: r.pid.startswith("WOS:") and '.' not in r.pid,
                TreeFile(ifs))
        else:
            tf = TreeFile(ifs)

        writer = db

        for parent_cid, parent_papers, leaf_recs in parse_tree(tf, args.limit):
            for l in leaf_recs:
                for paper in l.get_papers():
                    # Register each paper's cluster_id
Example #13
    parser.add_argument("--sample-rate", help="Edge sample rate", type=float, default=None)
    parser.add_argument("--must-cite", action="store_true", help="Only include nodes that cite other nodes")
    parser.add_argument("-f", "--force", help="If outptut file already exists overwrite it.", action="store_true")
    parser.add_argument("-a", "--after", help="Only include nodes published on or after this year")
    parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000)
    parser.add_argument("infile", nargs="+")
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                continue

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
                    g.write("\n")
                    b.increment()
Example #14
#!/usr/bin/env python3
from parsers.wos import WOSStream
from util.PajekFactory import PajekFactory
from util.misc import open_file, Benchmark

if __name__ == "__main__":
    import argparse
    import sys
    parser = argparse.ArgumentParser(description="Creates Pajek (.net) files from WOS XML")
    parser.add_argument('infile')
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    arguments = parser.parse_args()

    with open_file(arguments.infile) as f:
        p = WOSStream(f)
        
        for entry in p.parse():
            for citation in entry["citations"]:
                arguments.outfile.write("%s\t%s\n" % (entry["id"], citation))
Example #15
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Creates recommendations and stores in an LevelDB from a tree file (.tree)")
    parser.add_argument('infile')
    parser.add_argument('db_path')
    parser.add_argument("--wos-only", help="Only include papers in the WOS collection", action="store_true")
    parser.add_argument('--benchmark-freq', default=10000, type=int)
    parser.add_argument('-l', '--limit', type=int, help="Max number of recommendations to generate per-paper", default=10)
    args = parser.parse_args()

    db = leveldb.LevelDB(args.db_path,
                         write_buffer_size=100 << 20,  # 100MB
                         block_cache_size=400 << 20)  # 400MB

    with open_file(args.infile) as ifs:
        b = Benchmark(args.benchmark_freq)
        # Remove any non-wos entries
        if args.wos_only:
            tf = filter(lambda r: r.pid.startswith("WOS:") and '.' not in r.pid, TreeFile(ifs))
        else:
            tf = TreeFile(ifs)

        writer = db

        for parent_cid, parent_papers, leaf_recs in parse_tree(tf, args.limit):
            for l in leaf_recs:
                for paper in l.get_papers():
                    # Register each paper's cluster_id
                    writer.Put(("paper|" + paper).encode(), l.cluster_id.encode())
                    b.increment()
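Reading the mapping back uses the same "paper|" key scheme; a hedged lookup sketch (paper_id is illustrative; py-leveldb's Get returns a bytearray):

paper_id = "WOS:000000000000001"  # illustrative
cluster_id = db.Get(("paper|" + paper_id).encode()).decode()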
Example #16
        include_fields.update(map(str.strip, args.include.split(",")))

    t = Metadata(client, table_name)

    if args.flush:
        logging.info("Deleting table: " + t.table_name)
        if not args.dryrun:
            t.delete()

    if args.create:
        logging.info("Creating table: " + t.table_name)
        if not args.dryrun:
            t.create(write=2000)

    b = Benchmark(args.benchmark_freq)
    with open_file(args.metadata) as ifs:
        with t.get_batch_put_context() as batch:
            reader = csv.DictReader(ifs, delimiter=args.delimiter)
            for row in reader:
                new_row = {k: v for k, v in row.items() if k in include_fields}
                if not REQUIRED_KEYS.issubset(new_row.keys()):
                    print(row)
                    raise KeyError("Not all required keys present")
                if not args.dryrun:
                    batch.put_item(Item=new_row)
                b.increment()

    if not args.dryrun:
        t.update_throughput()

    b.print_freq()
Example #17
        default="WARNING",
        choices=["WARNING", "INFO", "DEBUG", "ERROR", "CRITICAL"])
    parser.add_argument('wosfiles', nargs='+')
    arguments = parser.parse_args()

    logging.basicConfig(level=arguments.log)

    file_queue = JoinableQueue()
    result_queue = JoinableQueue()
    log = logging.getLogger(__name__)

    filter_set = None
    if arguments.filter:
        filter_set = set()
        log.info("Building filter from " + arguments.filter)
        with open_file(arguments.filter) as f:
            if arguments.filter.endswith(".pickle"):
                log.info("Filter is a pickle, unpickling")
                filter_set = pickle.load(f)
            else:
                tf = TreeFile(f)
                filter_set = {e.pid for e in tf}
                pickle_path = arguments.filter + ".pickle"
                log.info("Pickling filter to %s" % pickle_path)
                with open_file(pickle_path, "w") as pf:
                    pickle.dump(filter_set, pf, pickle.HIGHEST_PROTOCOL)
                tf = None
            log.info("Found %s ids to include" % len(filter_set))

    date_after = None
    if arguments.after:
Example #18
from util.misc import open_file
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Merge tree files")
    parser.add_argument('outfile')
    parser.add_argument('infiles', nargs='+')

    args = parser.parse_args()

    with open_file(args.outfile, 'w') as outf:
        for i, filename in enumerate(args.infiles, start=1):
            with open_file(filename) as inf:
                for l in inf:
                    # Skip comment lines; prefix the rest with the file index.
                    if l.startswith("#"):
                        continue
                    outf.write("%d:%s\n" % (i, l.rstrip("\n")))
Example #19
    parser = argparse.ArgumentParser(description="Remap strings to ids")
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('pickle', help="Store AutoID pickle")
    parser.add_argument('--start',
                        type=int,
                        default=1,
                        help="Value to start with")
    parser.add_argument('--delimiter',
                        default=' ',
                        help="ID delimiter",
                        type=str)
    arguments = parser.parse_args()

    aid = AutoID(arguments.start)
    with open_file(arguments.outfile, "w") as o:
        with open_file(arguments.infile) as f:
            for line in f:
                o.write(
                    arguments.delimiter.join(
                        list(
                            map(
                                str,
                                map(
                                    aid.__getitem__,
                                    map(str.strip,
                                        line.split(arguments.delimiter)))))))
                o.write('\n')

    with open(arguments.pickle, "wb") as p:
        pickle.dump(aid, p, pickle.HIGHEST_PROTOCOL)
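AutoID comes from util.AutoID and is not shown in these examples. A minimal sketch of the behavior the scripts rely on (each new string gets the next integer id, starting at --start):

class AutoID(dict):
    # Sketch only, assuming the dict-style access used above (aid[token]).
    def __init__(self, start=1):
        super().__init__()
        self._next = start

    def __missing__(self, key):
        # First access to a key assigns the next id.
        self[key] = self._next
        self._next += 1
        return self[key]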
Example #20
    parser.add_argument('-bf',
                        '--benchmark_freq',
                        help="How often to emit benchmark info",
                        type=int,
                        default=1000000)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                          arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(
                ".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                continue

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
Example #21
#!/usr/bin/env python3

if __name__ == "__main__":
    import argparse
    from util.AutoID import AutoID
    from util.misc import open_file
    import pickle
    parser = argparse.ArgumentParser(description="Remap strings to ids")
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('pickle', help="Store AutoID pickle")
    parser.add_argument('--start', type=int, default=1, help="Value to start with")
    parser.add_argument('--delimiter', default=' ', help="ID delimiter", type=str)
    arguments = parser.parse_args()

    aid = AutoID(arguments.start)
    with open_file(arguments.outfile, "w") as o:
        with open_file(arguments.infile) as f:
            for line in f:
                # Map each delimited token to its (possibly new) integer id.
                tokens = (tok.strip() for tok in line.split(arguments.delimiter))
                o.write(arguments.delimiter.join(str(aid[tok]) for tok in tokens))
                o.write('\n')

    with open(arguments.pickle, "wb") as p:
        pickle.dump(aid, p, pickle.HIGHEST_PROTOCOL)