Example 1
class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")

    def complete_task(self, file_list, task, task_result, crawler_name):

        self.search.delete_docs(task_result.website_id)

        if file_list:
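            # Stream the crawler's file list line by line so large listings are never held in memory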
            def iter_lines():

                with open(file_list, "r") as f:
                    line = f.readline()
                    while line:
                        yield line
                        line = f.readline()

            self.search.import_json(iter_lines(), task.website_id)

        self.db.update_website_date_if_exists(task.website_id)

        task_result.server_id = crawler_name

        self.db.log_result(task_result)

    def queue_task(self, task: Task):
        self.db.put_task(task)
        print("Queued task and made it available to crawlers: " + str(task.website_id))

    def get_queued_tasks(self) -> list:
        return self.db.get_tasks()
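
The class above streams a newline-delimited file list into the "od-database" Elasticsearch index and records the crawl result in SQLite. A minimal usage sketch follows; it is not part of the original project, the Task constructor mirrors the positional call visible in Example 2, and the TaskResult construction is an assumption (only its website_id and server_id attributes are used above).

# Hypothetical driver for Example 1, for illustration only.
manager = TaskManager()

task = Task(42, "http://example.com/files/")     # assumed signature: Task(website_id, url)
manager.queue_task(task)                         # persists the task in db.sqlite3

# Later, once a crawler has produced a file list and a result object:
result = TaskResult()                            # assumed no-argument constructor
result.website_id = 42
manager.complete_task("website_42.json", task, result, crawler_name="crawler-1")
print(manager.get_queued_tasks())
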
Example 2
class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

    def start_indexer_threads(self):
        logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            t.daemon = True  # mark as daemon so indexer threads don't block interpreter shutdown
            self._indexer_threads.append(t)
            t.start()

    def _do_indexing(self):
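        # Worker loop: poll the tracker for indexing tasks, index each uploaded file list, sleep when idle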

        while True:
            task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)

            if task:
                try:
                    recipe = task.json_recipe()
                    logger.debug("Got indexing task: " + str(recipe))
                    filename = os.path.join(config.WSB_PATH,
                                            format_file_name(recipe["website_id"], recipe["upload_token"]))
                    self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
                except Exception as e:
                    logger.error("Indexing task failed: %s" % str(e))
                    # Report the failure to the tracker
                    self.worker.release_task(task_id=task.id, result=1, verification=0)
                finally:
                    try:
                        # Release as successful; errors are ignored if the task was already released above
                        self.worker.release_task(task_id=task.id, result=0, verification=0)
                    except Exception:
                        pass
            else:
                time.sleep(5)

    def _complete_task(self, file_list, task):
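        # Replace the website's documents in the index with the freshly crawled file list, then delete the upload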

        self.search.delete_docs(task.website_id)

        if file_list:
            def iter_lines():
                with open(file_list, "r") as f:
                    line = f.readline()
                    while line:
                        yield line
                        line = f.readline()

            self.search.import_json(iter_lines(), task.website_id)
            os.remove(file_list)

        self.db.update_website_date_if_exists(task.website_id)

    def do_recrawl(self):
        logger.debug("Creating re-crawl tasks")
        self._generate_crawling_tasks()

    def _generate_crawling_tasks(self):

        # TODO: Insert more in-depth re-crawl logic here
        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")

        def recrawl(website: Website):
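            # Priority grows with the age of the last update (in hours), so the stalest sites are crawled first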
            crawl_task = Task(website.id, website.url,
                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
            self.queue_task(crawl_task)

        pool = ThreadPool(processes=30)
        pool.map(func=recrawl, iterable=websites_to_crawl)
        pool.close()

    def queue_task(self, task: Task):
        max_assign_time = 24 * 4 * 3600  # four days, in seconds
        upload_token = str(uuid4())

        task.upload_token = upload_token
        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
                                                   recipe=str(task),
                                                   priority=task.priority,
                                                   max_assign_time=max_assign_time,
                                                   hash64=task.website_id,
                                                   verification_count=1,
                                                   max_retries=3
                                                   )
        logger.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
        if not tracker_response.json()["ok"]:
            return

        bucket_response = self.bucket.allocate(upload_token,
                                               21474837499,  # ~20 GiB
                                               format_file_name(task.website_id, upload_token),
                                               to_dispose_date=int(time.time() + max_assign_time),
                                               upload_hook="")
        logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))