def export(outfile="out.csv"): print("Export started, connecting to databases...") es = ElasticSearchEngine("od-database") db = Database("db.sqlite3") docs = es.stream_all_docs() docs_with_website = db.join_website_on_scan(docs) print("Connected, writing to csv") with open(outfile + ".temp", "w") as out: csv_writer = csv.writer(out) csv_writer.writerow([ "website_id", "website_url", "path", "name", "ext", "size", "mtime" ]) for doc in docs_with_website: csv_writer.writerow([ doc["_source"]["website_id"], doc["_source"]["website_url"], doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "", doc["_source"]["name"], "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "", doc["_source"]["size"], doc["_source"]["mtime"] ]) print("Wrote to csv, compressing with xz") os.system("xz -0 " + outfile + ".temp") os.system("mv " + outfile + ".temp.xz " + outfile + ".xz") print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
class TaskManager:
    """Coordinates the search index and the task database."""

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")

    def complete_task(self, file_list, task, task_result, crawler_name):
        """Re-index a crawled website's file list and record the crawl result."""
        # Drop any previously indexed documents for this website first.
        self.search.delete_docs(task_result.website_id)

        if file_list:
            def read_lines():
                # Stream the file list lazily so very large crawls are not
                # loaded into memory all at once.
                with open(file_list, "r") as f:
                    for line in f:
                        yield line

            self.search.import_json(read_lines(), task.website_id)

        self.db.update_website_date_if_exists(task.website_id)
        task_result.server_id = crawler_name
        self.db.log_result(task_result)

    def queue_task(self, task: Task):
        """Persist *task* so crawlers can pick it up."""
        self.db.put_task(task)
        print("Queued task and made it available to crawlers: " + str(task.website_id))

    def get_queued_tasks(self) -> list:
        """Return all tasks currently waiting in the database."""
        return self.db.get_tasks()
def __init__(self):
    # Search backend and relational store.
    self.search = ElasticSearchEngine("od-database")
    self.db = database.Database(config.DB_CONN_STR)

    # task_tracker worker identity; created and persisted to disk on first run.
    self.tracker = TaskTrackerApi(config.TT_API)
    self.worker = Worker.from_file(self.tracker)
    if not self.worker:
        self.worker = self.tracker.make_worker("oddb_master")
        self.worker.dump_to_file()
    # NOTE(review): the two boolean flags differ per project (crawl: False/True,
    # index: True/False) — their exact meaning (assign vs submit) is not visible
    # here; confirm against the task_tracker API.
    self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
    self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

    self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)

    # Daemon worker threads running _do_indexing in the background.
    self._indexer_threads = list()
    logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
    for _ in range(config.INDEXER_THREADS):
        t = Thread(target=self._do_indexing)
        t.setDaemon(True)
        self._indexer_threads.append(t)
        t.start()

    # Single daemon thread that periodically generates re-crawl tasks.
    self._recrawl_thread = Thread(target=self._do_recrawl)
    self._recrawl_thread.setDaemon(True)
    self._recrawl_thread.start()
def __init__(self):
    """Set up the search engine, the crawl server list, and the periodic
    completed-task poll."""
    self.search = ElasticSearchEngine("od-database")

    # TODO load from config
    self.crawl_servers = [
        CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
    ]

    # Poll crawl servers for finished tasks every 10 seconds.
    job_scheduler = BackgroundScheduler()
    job_scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
    job_scheduler.start()
def __init__(self):
    # Search backend and relational store.
    self.search = ElasticSearchEngine("od-database")
    self.db = database.Database(config.DB_CONN_STR)

    # task_tracker worker identity, cached on disk between runs.
    self.tracker = TaskTrackerApi(config.TT_API)
    self.worker = Worker.from_file(self.tracker)
    if not self.worker:
        self.worker = self.tracker.make_worker("oddb_master")
        self.worker.dump_to_file()
    # NOTE(review): flag semantics (assign vs submit) are not visible here —
    # confirm against the task_tracker API before changing these.
    self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
    self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

    self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)

    # Populated/started elsewhere (see start_indexer_threads in this class's
    # other revision).
    self._indexer_threads = list()
def __init__(self):
    """Connect to all backing services and register the tracker worker."""
    self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
    self.db = database.Database(config.DB_CONN_STR)
    self.tracker = TaskTrackerApi(config.TT_API)
    self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
    self._indexer_threads = list()

    # Worker identity is cached on disk; create (and persist) it on first run.
    self.worker = Worker.from_file(self.tracker)
    if not self.worker:
        # NOTE(review): other revisions register as "oddb_master" (no "$") —
        # confirm the "$" prefix is intentional.
        self.worker = self.tracker.make_worker("$oddb_master")
        if not self.worker:
            # BUGFIX: this used traceback.format_exc(), but make_worker fails
            # by returning a falsy value, not by raising — outside an except
            # block format_exc() prints the useless "NoneType: None". Report a
            # plain error instead.
            print("Could not create worker")
            return
        self.worker.dump_to_file()
    # NOTE(review): flag semantics (assign vs submit) not visible here —
    # confirm against the task_tracker API.
    self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
    self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
def __init__(self):
    """Open handles to the sqlite database and the search index."""
    self.db = database.Database("db.sqlite3")
    self.search = ElasticSearchEngine("od-database")
else: return string outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime()) dldir = "static/downloads/" print("Deleting existing dumps") for file in os.listdir(dldir): if file.endswith("_dump.csv.lz4"): os.remove(os.path.join(dldir, file)) print("Export started, connecting to databases...") db = Database(config.DB_CONN_STR) es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) docs_with_url = db.join_website_url(es.stream_all_docs()) print("Connected, writing to csv") with lz4.frame.open(outfile + ".part", mode='wb', compression_level=9, block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp: fp.write((",".join([ "website_id", "website_url", "path", "name", "ext", "size", "mtime" ]) + "\n").encode()) for doc in docs_with_url: try:
def setUp(self):
    """Point the tests at a dedicated index and start from a clean slate."""
    engine = ElasticSearchEngine("od-database-test")
    engine.reset()
    self.search = engine
    # Give Elasticsearch a moment to apply the reset before tests run.
    time.sleep(0.5)
class SearchTest(TestCase):
    """Integration tests for ElasticSearchEngine.

    These talk to a live Elasticsearch instance through the dedicated
    "od-database-test" index; the sleeps give the index time to refresh
    after writes.
    """

    def setUp(self):
        # Fresh, empty index for every test.
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(0.5)

    def test_ping(self):
        self.assertTrue(self.search.ping(), "Search engine not running")

    def test_import_and_search(self):
        # Four documents covering edge cases: huge size, empty path,
        # None mtime, negative size, and special characters in names/paths.
        files = [{
            "name": "PaNopTicon",
            "size": 1000000000000000000,
            "path": "c/d",
            "mtime": 1528765672
        }, {
            "name": "BLAckwAter.Park",
            "size": 123,
            "path": "",
            "mtime": None
        }, {
            "name": "10'000 days",
            "size": -1,
            "path": "c",
            "mtime": 12345
        }, {
            "name": "Dead Racer",
            "size": 1000,
            "path": "Speed Machine [FLAC]",
            "mtime": 12345
        }]

        # import_json consumes newline-delimited JSON.
        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)  # wait for the index to refresh before counting
        self.assertEqual(
            4,
            self.search.es.count(self.search.index_name, "file")["count"])

        # Search for 'pan' in PaNopTicon and expect 1 result, a scroll id, and an highlight
        page = self.search.search("pan")
        self.assertIsNotNone(page["_scroll_id"])
        self.assertEqual(1, page["hits"]["total"])
        self.assertIsNotNone(page["hits"]["hits"][0]["highlight"]["name"])

        # Search for 'park' and expect BLAckwAter.Park
        page = self.search.search("park")
        self.assertEqual(1, page["hits"]["total"])

        # Search for fla and expect Dead Racer
        page = self.search.search("fla")
        self.assertEqual(1, page["hits"]["total"])

        # Search for 10'000 and expect 10'000 days
        page = self.search.search("10'000")
        self.assertEqual(1, page["hits"]["total"])

    def test_scroll(self):
        # Same fixture as test_import_and_search.
        files = [{
            "name": "PaNopTicon",
            "size": 1000000000000000000,
            "path": "c/d",
            "mtime": 1528765672
        }, {
            "name": "BLAckwAter.Park",
            "size": 123,
            "path": "",
            "mtime": None
        }, {
            "name": "10'000 days",
            "size": -1,
            "path": "c",
            "mtime": 12345
        }, {
            "name": "Dead Racer",
            "size": 1000,
            "path": "Speed Machine [FLAC]",
            "mtime": 12345
        }]

        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)

        # A blank query matches everything; walk two pages of the scroll.
        page = self.search.search("")
        scroll_id = page["_scroll_id"]

        # next page
        next_page = self.search.scroll(scroll_id)
        next_scroll_id = next_page["_scroll_id"]
        self.assertIsNotNone(next_scroll_id)

        # again
        next_page2 = self.search.scroll(next_scroll_id)
        self.assertIsNotNone(next_page2["_scroll_id"])

    def test_invalid_scroll(self):
        # An unknown scroll id yields None rather than raising.
        invalid_scroll = "blahblah"
        self.assertIsNone(self.search.scroll(invalid_scroll))
# Reddit bot entry point: watches /r/opendirectories submissions and replies
# with crawl statistics for the posted website.

import praw
from crawl_server.reddit_bot import RedditBot
from search.search import ElasticSearchEngine
from database import Database, Website
import od_util
import os
import re

# Characters stripped from comment text before posting.
chars_to_remove_from_comment = re.compile("[\[\]\\\()]+")

reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
db = Database("db.sqlite3")
search = ElasticSearchEngine("od-database")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
# RedditBot tracks already-handled submissions in crawled.txt.
bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    # Reply to a submission whose website was already scanned.
    stats = search.get_stats(website_id)
    # NOTE(review): `website` is not defined anywhere in this scope — this
    # line raises NameError when executed; it presumably needs a database
    # lookup (e.g. by website_id) first. Confirm and fix upstream.
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " + website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):
class TaskDispatcher:
    """Fans crawl tasks out to the configured crawl servers and aggregates
    their results, logs and statistics."""

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")

        # TODO load from config
        self.crawl_servers = [
            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
        ]

        # Poll every server for finished tasks every 10 seconds.
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
        scheduler.start()

    def check_completed_tasks(self):
        """Pull finished tasks from every server and index their file lists."""
        for server in self.crawl_servers:
            for task in server.fetch_completed_tasks():
                print("Completed task")
                file_list = server.fetch_website_files(task.website_id)
                if file_list:
                    self.search.import_json(file_list, task.website_id)

    def dispatch_task(self, task: Task):
        """Hand *task* to an available crawl server."""
        self._get_available_crawl_server().queue_task(task)

    def _get_available_crawl_server(self) -> CrawlServer:
        # TODO: Load balancing & health check for crawl servers
        return self.crawl_servers[0]

    def get_queued_tasks(self) -> list:
        """All queued tasks across every crawl server."""
        return [task
                for server in self.crawl_servers
                for task in server.fetch_queued_tasks()]

    def get_current_tasks(self) -> list:
        """All in-progress tasks across every crawl server."""
        # TODO mem cache this
        return [task
                for server in self.crawl_servers
                for task in server.fetch_current_tasks()]

    def get_task_logs_by_server(self) -> dict:
        """Crawl logs keyed by server name."""
        return {server.name: server.fetch_crawl_logs()
                for server in self.crawl_servers}

    def get_stats_by_server(self) -> dict:
        """Per-server statistics; servers that return nothing are omitted."""
        stats = dict()
        for server in self.crawl_servers:
            server_stats = server.fetch_stats()
            if server_stats:
                stats[server.name] = server_stats
        return stats
from database import Database
from search.search import ElasticSearchEngine
from tasks import TaskManager

# Disable flask logging
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

# Application logger: DEBUG level, mirrored to oddb.log and stdout.
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("oddb.log")
file_handler.setFormatter(formatter)
# BUGFIX: removing handlers from logger.handlers while iterating it skips
# every other handler (the classic mutate-while-iterating pitfall) — iterate
# over a copy so all pre-existing handlers are really removed.
for h in list(logger.handlers):
    logger.removeHandler(h)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))

# Shared application services used by the request handlers below.
taskManager = TaskManager()
searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
searchEngine.start_stats_scheduler()
db = Database(config.DB_CONN_STR)
redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)


def require_role(role: str):
    """Abort the current request with HTTP 403 unless the session user has *role*."""
    if db.get_user_role(session.get("username", None)) != role:
        abort(403)
class TaskManager:
    """Submits crawl tasks to task_tracker, fetches finished indexing tasks,
    and feeds the resulting file lists into the search index."""

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)

        # task_tracker worker identity, cached on disk between runs.
        self.tracker = TaskTrackerApi(config.TT_API)
        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
        self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
        self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

    def start_indexer_threads(self):
        """Spawn the configured number of daemon threads running _do_indexing."""
        logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            t.setDaemon(True)
            self._indexer_threads.append(t)
            t.start()

    def _do_indexing(self):
        """Worker loop: fetch indexing tasks from task_tracker forever."""
        while True:
            task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)
            if not task:
                time.sleep(5)
                continue

            result = 0  # 0 = success, 1 = failure
            try:
                recipe = task.json_recipe()
                logger.debug("Got indexing task: " + str(recipe))
                filename = os.path.join(
                    config.WSB_PATH,
                    format_file_name(recipe["website_id"], recipe["upload_token"]))
                self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
            except Exception:
                # BUGFIX: the original swallowed the exception silently
                # (unused `e`) — log it so indexing failures are visible.
                logger.exception("Error while completing indexing task")
                result = 1
            finally:
                # BUGFIX: on failure the task used to be released twice —
                # once with result=1 in the except block, then again with
                # result=0 here, masking the failure. Release exactly once,
                # with the real outcome.
                try:
                    self.worker.release_task(task_id=task.id, result=result, verification=0)
                except Exception:
                    logger.exception("Could not release task %s" % (task.id, ))

    def _complete_task(self, file_list, task):
        """Replace the website's indexed documents with the new file list."""
        self.search.delete_docs(task.website_id)

        if file_list:
            def iter_lines():
                # Stream lazily; file lists can be very large.
                with open(file_list, "r") as f:
                    for line in f:
                        yield line

            self.search.import_json(iter_lines(), task.website_id)
            os.remove(file_list)

        self.db.update_website_date_if_exists(task.website_id)

    def do_recrawl(self):
        logger.debug("Creating re-crawl tasks")
        self._generate_crawling_tasks()

    def _generate_crawling_tasks(self):
        """Queue crawl tasks for the websites that were updated least recently."""
        # TODO: Insert more in-depth re-crawl logic here
        websites_to_crawl = self.db.get_oldest_updated_websites(
            config.RECRAWL_POOL_SIZE, prefix="http")

        def recrawl(website: Website):
            # Priority grows with staleness (hours since last update).
            crawl_task = Task(website.id, website.url,
                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
            self.queue_task(crawl_task)

        pool = ThreadPool(processes=30)
        pool.map(func=recrawl, iterable=websites_to_crawl)
        pool.close()
        pool.join()  # ensure worker threads are reaped before returning

    def queue_task(self, task: Task):
        """Submit *task* to the crawl project and allocate its upload bucket."""
        max_assign_time = 24 * 4 * 3600
        upload_token = str(uuid4())
        task.upload_token = upload_token

        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
                                                   recipe=str(task),
                                                   priority=task.priority,
                                                   max_assign_time=max_assign_time,
                                                   hash64=task.website_id,
                                                   verification_count=1,
                                                   max_retries=3
                                                   )
        # Use the module logger consistently (the original mixed a stray
        # print() and the root `logging` logger).
        logger.info("Queued task and made it available to crawlers: t=%s, r=%s" %
                    (task, tracker_response.text))
        if not tracker_response.json()["ok"]:
            return

        bucket_response = self.bucket.allocate(upload_token,
                                               21474837499,  # ~20 GiB
                                               format_file_name(task.website_id, upload_token),
                                               to_dispose_date=int(time.time() + max_assign_time),
                                               upload_hook="")
        logger.info("Allocated upload bucket: %d, t=%s, r=%s" %
                    (task.website_id, upload_token, bucket_response.text))
from search.search import ElasticSearchEngine
import ujson

# Rebuild the search index from a previously exported newline-delimited
# JSON dump ("dump.json"), batching documents to limit memory use.

es = ElasticSearchEngine("od-database")
es.reset()

with open("dump.json", "r") as f:
    buffer = list()
    index_every = 10000  # flush to Elasticsearch in batches of 10k docs

    for line in f:
        try:
            doc = ujson.loads(line)["_source"]
            buffer.append(doc)

            if len(buffer) >= index_every:
                es._index(buffer)
                buffer.clear()
        except Exception as e:
            # Best-effort: a single corrupt line must not abort the whole
            # re-index; report and continue.
            print("ERROR: " + str(e))

    # BUGFIX: flush the final partial batch only when there is something
    # left — the original called _index unconditionally, even with an
    # empty buffer.
    if buffer:
        es._index(buffer)
def index_file_list(path: str, website_id):
    """Import the JSON file list stored at *path* into the search index
    under *website_id*."""
    engine = ElasticSearchEngine("od-database")
    with open(path, "r") as f:
        contents = f.read()
    engine.import_json(contents, website_id)
from tasks import TaskManager
import logging
from flask import session, abort

# Disable flask logging
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

# Application logger: DEBUG level, mirrored to oddb.log and stdout.
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("oddb.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))

# Shared application services used by the request handlers.
taskManager = TaskManager()
searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()
db = Database("db.sqlite3")

# temporary hotfix...
# In-memory session storage; contents are lost on restart.
sessionStore = dict()


def require_role(role: str):
    # Abort the current request with HTTP 403 unless the session user has *role*.
    if db.get_user_role(session.get("username", None)) != role:
        abort(403)
else: return string outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime()) dldir = "static/downloads/" print("Deleting existing dumps") for file in os.listdir(dldir): if file.endswith("_dump.csv.lz4"): os.remove(os.path.join(dldir, file)) print("Export started, connecting to databases...") db = Database(config.DB_CONN_STR) es = ElasticSearchEngine("od-database") docs_with_url = db.join_website_url(es.stream_all_docs()) print("Connected, writing to csv") with lz4.frame.open(outfile + ".part", mode='wb', compression_level=9, block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp: fp.write((",".join( ["website_id", "website_url", "path", "name", "ext", "size", "mtime"] ) + "\n").encode()) for doc in docs_with_url: try: fp.write(