コード例 #1
0
def make_entities(db_path, outfile):
    """Aggregate the leaks SQLite database into entities and dump them.

    Reads the raw tables from the SQLite file at *db_path*, writes edges,
    addresses and node records into a temporary in-memory followthemoney
    store, then streams every aggregated entity to *outfile*.
    """
    source_db = dataset.connect("sqlite:///%s" % db_path)
    store = Dataset("temp", database_uri="sqlite://")
    bulk_writer = store.bulk()
    # Relationship and address fragments first, then the node tables.
    write_edges(bulk_writer, source_db)
    write_addresses(bulk_writer, source_db)
    write_nodes(bulk_writer, source_db["entity"], "Company")
    write_nodes(bulk_writer, source_db["intermediary"])
    write_nodes(bulk_writer, source_db["officer"])
    # Emit each fully-aggregated entity as one object on the output stream.
    for record in store.iterate():
        write_object(outfile, record)
コード例 #2
0
def _stream_collection(collection, N=50_000):
    """Yield up to *N* entities for *collection*, caching them as JSON lines.

    On a cache miss, entities are streamed from the Aleph-backed dataset,
    yielded to the caller, and simultaneously written to a temp file that
    is renamed into place once complete (so a partial run never leaves a
    truncated cache). On a cache hit, the cached lines are replayed.

    NOTE(review): the miss path yields entity objects while the hit path
    yields plain dicts from json.loads — callers appear to tolerate both;
    confirm before relying on the yielded type.
    """
    fid = collection["foreign_id"]
    collection_id = collection["id"]
    cachefile = CACHEDIR / f"{fid}.json"
    if not cachefile.exists():
        # BUG FIX: a stray bare `return` sat here, making the entire
        # cache-population branch below unreachable dead code; cache
        # misses silently yielded nothing.
        cachefile.parent.mkdir(parents=True, exist_ok=True)
        cachefile_back = CACHEDIR / "tmp.json"
        dataset = Dataset(f"collection_{collection_id}", origin="aleph")
        with open(cachefile_back, "w+") as fd:
            for entity in islice(dataset.iterate(skip_errors=True), N):
                yield entity
                fd.write(json.dumps(entity.to_dict()))
                fd.write("\n")
        # Publish the finished cache atomically.
        cachefile_back.rename(cachefile)
    else:
        with open(cachefile) as fd:
            for line in fd:
                yield json.loads(line)
コード例 #3
0
ファイル: parse.py プロジェクト: alephdata/panama
    except Exception:
        log.exception("Failed to parse: %r", member)


def parse_archive(writer, archive_path):
    """Parse every regular-file member of the tar archive at *archive_path*.

    Members are consumed lazily via ``tar.next()`` to keep memory bounded;
    each extracted file handle is handed to ``parse_file`` and the writer
    is flushed once the whole archive has been processed.
    """
    log.info("Archive: %s", archive_path)
    # BUG FIX: the original never closed the tarfile handle (and leaked
    # `fh` if parse_file raised); use a context manager + try/finally.
    with tarfile.open(archive_path, "r") as tar:
        while True:
            member = tar.next()
            if member is None:
                break
            fh = tar.extractfile(member)
            if fh is None:
                # Non-file members (directories, links, devices) have no data.
                continue
            try:
                parse_file(writer, fh, member)
            finally:
                fh.close()
    writer.flush()


if __name__ == "__main__":
    # Parse every archive found under data/ into the "pa_companies"
    # dataset, then dump all aggregated entities as JSON objects.
    data_dir = "data/"
    companies = Dataset("pa_companies", origin="parse")
    bulk = companies.bulk()
    # Sort for a deterministic processing order across runs.
    for name in sorted(os.listdir(data_dir)):
        parse_archive(bulk, os.path.join(data_dir, name))

    with open("panama.json", "w") as out:
        for record in companies.iterate():
            write_object(out, record)