Example #1
0
def export(outfile="out.csv"):
    """Dump every indexed document to ``outfile``.temp as CSV, then
    xz-compress it to ``outfile``.xz.

    One row per document: website_id, website_url, path, name, ext,
    size, mtime. Paths get a trailing "/" and extensions a leading "."
    so the columns concatenate into a full file path.
    """
    import subprocess

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database("db.sqlite3")
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)

    print("Connected, writing to csv")

    # newline="" lets the csv module do its own line-ending handling
    # (prevents blank rows on Windows).
    with open(outfile + ".temp", "w", newline="") as out:

        csv_writer = csv.writer(out)
        csv_writer.writerow([
            "website_id", "website_url", "path", "name", "ext", "size", "mtime"
        ])

        for doc in docs_with_website:
            src = doc["_source"]
            # Empty path/ext stay empty; otherwise add the separator.
            path = src["path"] + "/" if src["path"] != "" else ""
            ext = "." + src["ext"] if src["ext"] != "" else ""
            csv_writer.writerow([
                src["website_id"], src["website_url"], path,
                src["name"], ext, src["size"], src["mtime"],
            ])
    print("Wrote to csv, compressing with xz")

    # Argument-list form avoids shell injection if outfile contains
    # shell metacharacters (os.system passed it through a shell);
    # check=True surfaces a failed compression instead of ignoring it.
    subprocess.run(["xz", "-0", outfile + ".temp"], check=True)
    # Same-directory rename replaces the shelled-out "mv".
    os.rename(outfile + ".temp.xz", outfile + ".xz")
    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
Example #2
0
    def __init__(self):
        """Connect to the backing services and start the background
        indexer and recrawl threads.

        Reads connection settings from the module-level ``config``.
        """
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        # Reuse a previously persisted worker identity when available;
        # otherwise register a new worker and request access to the
        # crawl and index projects.
        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)

        self._indexer_threads = list()
        # Lazy %-args: the logging module formats only if the record is
        # actually emitted.
        logger.info("Starting %s indexer threads ", config.INDEXER_THREADS)
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            # Thread.setDaemon() is deprecated (Python 3.10+); assign
            # the daemon attribute directly.
            t.daemon = True
            self._indexer_threads.append(t)
            t.start()

        self._recrawl_thread = Thread(target=self._do_recrawl)
        self._recrawl_thread.daemon = True
        self._recrawl_thread.start()
Example #3
0
    def __init__(self):
        """Start the completed-task poller and set up search and crawl
        server handles."""
        # Poll for finished crawl tasks every ten seconds in the
        # background.
        job_scheduler = BackgroundScheduler()
        job_scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
        job_scheduler.start()

        self.search = ElasticSearchEngine("od-database")

        # TODO load from config
        self.crawl_servers = [
            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
        ]
Example #4
0
    def __init__(self):
        """Connect to search, database, task tracker and bucket
        services, and prepare the indexer thread list."""
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        # Reuse a persisted worker identity when one exists; otherwise
        # register a fresh worker and request project access.
        persisted = Worker.from_file(self.tracker)
        if persisted:
            self.worker = persisted
        else:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = []
Example #5
0
    def __init__(self):
        """Connect to search, database, task tracker and bucket
        services, and ensure a tracker worker identity exists."""
        self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

        # Reuse a persisted worker identity when one exists; otherwise
        # register a new worker and request project access.
        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("$oddb_master")
            if not self.worker:
                # BUG FIX: no exception is in flight here (make_worker
                # simply returned a falsy value), so the previous
                # traceback.format_exc() printed the useless
                # "NoneType: None". Report the failure directly.
                print("Could not create worker: make_worker returned nothing")
                return
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
Example #6
0
from tasks import TaskManager
import logging
from flask import session, abort

# Disable flask logging: werkzeug logs every request at INFO, which
# would drown out application messages.
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

# Application logger: DEBUG and above, to both oddb.log and stdout.
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("oddb.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))

# Module-level singletons shared by the request handlers below.
taskManager = TaskManager()
searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()
db = Database("db.sqlite3")

# temporary hotfix... in-memory session store. NOTE(review): contents
# are lost on restart and not shared across worker processes — confirm
# this app is deployed single-process.
sessionStore = dict()


def require_role(role: str):
    """Abort the request with HTTP 403 unless the session's user has
    exactly the given role."""
    username = session.get("username", None)
    if db.get_user_role(username) != role:
        abort(403)
Example #7
0
def index_file_list(path: str, website_id):
    """Read the JSON file list at *path* and index its contents under
    *website_id*."""
    search = ElasticSearchEngine("od-database")
    with open(path, "r") as file_list:
        contents = file_list.read()
    search.import_json(contents, website_id)
Example #8
0
 def __init__(self):
     """Open handles to the search engine and the SQLite database."""
     self.search = ElasticSearchEngine("od-database")
     self.db = database.Database("db.sqlite3")
Example #9
0
    else:
        return string


# Timestamped (UTC) dump name, e.g. 2024-01-01_12:00:00_dump.csv.lz4.
# NOTE(review): colons in the filename are fine on POSIX but invalid on
# Windows — confirm this only runs on a POSIX host.
outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
dldir = "static/downloads/"

# Remove previous dumps so only the fresh one is served.
print("Deleting existing dumps")
for file in os.listdir(dldir):
    if file.endswith("_dump.csv.lz4"):
        os.remove(os.path.join(dldir, file))

print("Export started, connecting to databases...")

db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)

# Stream documents out of Elasticsearch, joining each one onto its
# website URL from the relational database.
docs_with_url = db.join_website_url(es.stream_all_docs())

print("Connected, writing to csv")

with lz4.frame.open(outfile + ".part",
                    mode='wb',
                    compression_level=9,
                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
    fp.write((",".join([
        "website_id", "website_url", "path", "name", "ext", "size", "mtime"
    ]) + "\n").encode())

    for doc in docs_with_url:
        try:
Example #10
0
 def setUp(self):
     """Reset the test search index before each test case."""
     self.search = ElasticSearchEngine("od-database-test")
     self.search.reset()
     # Give Elasticsearch a moment to apply the reset before the test
     # starts querying.
     time.sleep(0.5)