Beispiel #1
0
def export(outfile="out.csv"):
    """Dump every indexed document to an xz-compressed CSV file.

    Streams all documents from the "od-database" Elasticsearch index, passes
    them through ``Database.join_website_on_scan`` and writes one CSV row per
    document to ``outfile + ".temp"``. The temporary file is then compressed
    with the external ``xz`` tool and the archive is moved to
    ``outfile + ".xz"``.

    Args:
        outfile: Base path of the export. The final archive ends up at
            ``outfile + ".xz"``.

    Raises:
        subprocess.CalledProcessError: If ``xz`` exits with a non-zero status.
        OSError: If a file cannot be written or renamed, or ``xz`` cannot be
            started.
    """
    # Imported locally so the function does not rely on a module-level import.
    import subprocess

    temp_path = outfile + ".temp"
    archive_path = outfile + ".xz"

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database("db.sqlite3")
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)

    print("Connected, writing to csv")

    # newline="" is required by the csv module so it controls line endings
    # itself; an explicit encoding keeps non-ASCII names independent of the
    # process locale.
    with open(temp_path, "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow([
            "website_id", "website_url", "path", "name", "ext", "size", "mtime"
        ])

        for doc in docs_with_website:
            source = doc["_source"]
            # Only non-empty paths get a trailing slash and only non-empty
            # extensions get a leading dot.
            path = (source["path"] + "/") if source["path"] != "" else ""
            ext = ("." + source["ext"]) if source["ext"] != "" else ""
            csv_writer.writerow([
                source["website_id"], source["website_url"],
                path, source["name"], ext,
                source["size"], source["mtime"],
            ])
    print("Wrote to csv, compressing with xz")

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim, and "--" prevents a
    # path starting with "-" from being parsed as an option. check=True makes
    # a failed compression stop here instead of being silently ignored.
    subprocess.run(["xz", "-0", "--", temp_path], check=True)
    # xz replaces the input with "<input>.xz"; move it to its final name.
    os.replace(temp_path + ".xz", archive_path)
    print("Compressed to " + str(os.path.getsize(archive_path)) + " bytes")
Beispiel #2
0

# File name of the dump to be produced, stamped with the current UTC time.
outfile = time.strftime(
    "%Y-%m-%d_%H:%M:%S_dump.csv.lz4",
    time.gmtime(),
)
dldir = "static/downloads/"

# Remove every dump file currently present in the download directory.
print("Deleting existing dumps")
for stale_dump in (
    entry for entry in os.listdir(dldir) if entry.endswith("_dump.csv.lz4")
):
    os.remove(os.path.join(dldir, stale_dump))

print("Export started, connecting to databases...")

# Connection settings come from the project's config module.
db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)

# All documents from the search engine, passed through the database join
# (presumably attaching each document's website URL -- confirm against
# Database.join_website_url).
docs_with_url = db.join_website_url(es.stream_all_docs())

print("Connected, writing to csv")

with lz4.frame.open(outfile + ".part",
                    mode='wb',
                    compression_level=9,
                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
    fp.write((",".join([
        "website_id", "website_url", "path", "name", "ext", "size", "mtime"
    ]) + "\n").encode())

    for doc in docs_with_url:
        try:
            fp.write((",".join([
                str(doc["_source"]["website_id"]),