Esempio n. 1
0
    def execute_queued_task(self):
        """Fetch one queued task from the server and submit it to the
        process pool, respecting the configured concurrency limit."""

        # Strict comparison: with the original `<=`, a new task could be
        # submitted while max_processes tasks were already running,
        # briefly allowing max_processes + 1 concurrent tasks.
        if len(self.current_tasks) < self.max_processes:

            task = self.fetch_task()

            if task:
                logger.info("Submitted " + task.url + " to process pool")
                self.current_tasks.append(task)

                # run_task executes in a worker process; results and errors
                # come back through the static TaskManager callbacks.
                self.pool.apply_async(TaskManager.run_task,
                                      args=(task, self.current_tasks),
                                      callback=TaskManager.task_complete,
                                      error_callback=TaskManager.task_error)
Esempio n. 2
0
    def fetch_task(self):
        """Request the next queued task from the server.

        Returns a Task built from the server's JSON response, or None when
        no task is available (any non-200 response).

        Note: the original wrapped this in `except Exception as e: raise e`,
        a no-op that only rewrote the traceback origin; exceptions now
        propagate unchanged.
        """
        payload = {"token": config.API_TOKEN}
        r = requests.post(config.SERVER_URL + "/task/get", data=payload)

        if r.status_code == 200:
            text = r.text
            logger.info("Fetched task from server : " + text)
            task_json = json.loads(text)
            return Task(task_json["website_id"], task_json["url"])

        # No task queued (or a server-side error): nothing to execute.
        return None
Esempio n. 3
0
    def run_task(task, current_tasks):
        """Crawl the directory described by *task* and return the outcome.

        Returns a ``(TaskResult, current_tasks)`` tuple: *current_tasks*
        is threaded through unchanged so the completion callback can
        remove the finished task from the shared in-progress list.
        """
        outcome = TaskResult()
        outcome.website_id = task.website_id
        outcome.start_time = datetime.utcnow().timestamp()

        logger.info("Starting task " + task.url)

        # Each task writes its file listing to a per-website JSON file.
        out_path = "./crawled/" + str(task.website_id) + ".json"
        crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
        crawl_result = crawler.crawl_directory(out_path)

        outcome.file_count = crawl_result.file_count
        outcome.status_code = crawl_result.status_code
        outcome.end_time = datetime.utcnow().timestamp()

        logger.info("End task " + task.url)

        return outcome, current_tasks
Esempio n. 4
0
    def _log_to_file(files_q: Queue, out_file: str, files_written: list):
        """Drain *files_q* into *out_file*, one JSON object per line.

        Runs in its own thread; stops on a ``None`` sentinel or when
        ``get`` times out. The number of records written is appended to
        *files_written* so the caller can read the result after join().
        """
        written = 0

        with open(out_file, "w") as out:
            while True:

                try:
                    item = files_q.get(timeout=2000)
                except Empty:
                    logger.error("File writer thread timed out")
                    break

                if item is None:
                    # Sentinel from the producer: all files were queued.
                    break

                out.write(item.to_json() + "\n")
                written += 1
                files_q.task_done()

        files_written.append(written)
        logger.info("File writer thread done")
Esempio n. 5
0
    def crawl_directory(self, out_file: str) -> CrawlResult:
        """Crawl the remote directory at self.url, writing discovered files
        to *out_file* (one JSON object per line via _log_to_file).

        Returns a CrawlResult carrying the number of files written and a
        status string. Never raises: any exception is converted into a
        CrawlResult instead.
        """
        try:
            try:
                directory = RemoteDirectoryFactory.get_directory(self.url)
                logger.info("Crawling directory " + self.url + " with " +
                            str(type(directory)))
                # List the root first; an empty root means nothing to crawl.
                path_id, root_listing = directory.list_dir(
                    urlparse(self.url).path)
                if root_listing:
                    self.crawled_paths.add(path_id)
                else:
                    logger.info("No files in root listing for " + self.url)
                    return CrawlResult(0, "empty")
                directory.close()
            except TimeoutError:
                return CrawlResult(0, "Timeout during initial request")

            # Seed the work queue with sub-directories and the file queue
            # with files found directly under the root.
            in_q = Queue(maxsize=0)
            files_q = Queue(maxsize=0)
            for f in root_listing:
                if f.is_dir:
                    in_q.put(os.path.join(f.path, f.name, ""))
                else:
                    files_q.put(f)

            # Worker threads consume directory paths from in_q and push the
            # files they discover onto files_q.
            threads = []
            for i in range(self.max_threads):
                worker = Thread(
                    target=RemoteDirectoryCrawler._process_listings,
                    args=(self, self.url, in_q, files_q))
                threads.append(worker)
                worker.start()

            files_written = []  # Pass array to worker to get result
            file_writer_thread = Thread(
                target=RemoteDirectoryCrawler._log_to_file,
                args=(files_q, out_file, files_written))
            file_writer_thread.start()

            # Block until every queued directory and file has been
            # processed (task_done called for each item).
            in_q.join()
            files_q.join()
            logger.info("Crawling for " + self.url +
                        " done, waiting for threads to terminate...")

            # Kill threads
            # One None sentinel per worker so each wakes up and exits, then
            # one for the writer thread; join in the same order.
            for _ in threads:
                in_q.put(None)
            for t in threads:
                t.join()
            files_q.put(None)
            file_writer_thread.join()

            # files_written[0] is the counter the writer thread appended.
            return CrawlResult(files_written[0], self.status_code)
        except Exception as e:
            # Catch-all boundary: report the failure as a CrawlResult so
            # callers never see an exception from the crawl itself.
            return CrawlResult(0, str(e) + " \nType:" + str(type(e)))
Esempio n. 6
0
    def task_complete(result):
        """Process-pool success callback: report a finished task to the
        server and drop it from the shared in-progress list.

        *result* is the ``(TaskResult, current_tasks)`` tuple returned by
        run_task.
        """
        task_result, current_tasks = result

        logger.info("Task completed, sending result to server")
        logger.info("Status code: " + task_result.status_code)
        logger.info("File count: " + str(task_result.file_count))

        TaskManager.push_result(task_result)

        # Rebuild the list in place (slice assignment keeps the shared
        # list object). The original `del current_tasks[i]` inside an
        # enumerate loop mutated the list mid-iteration, skipping the
        # element that slid into the deleted slot.
        current_tasks[:] = [task for task in current_tasks
                            if task.website_id != task_result.website_id]
Esempio n. 7
0
    def push_result(task_result: TaskResult):
        """Upload a finished task's results to the server.

        Streams the crawled file list to /task/upload in ~5 MB chunks
        (best-effort: a failed chunk is logged and skipped), then posts
        the TaskResult JSON to /task/complete and deletes the local file
        list. Errors are logged, never raised.
        """
        try:

            logger.info("Uploading file list in small chunks")
            filename = "./crawled/" + str(task_result.website_id) + ".json"
            CHUNK_SIZE = 500000 * 10  # 5,000,000 bytes (~5 MB) per request
            if os.path.exists(filename):
                with open(filename) as f:
                    chunk = f.read(CHUNK_SIZE)
                    while chunk:
                        try:
                            payload = {
                                "token": config.API_TOKEN,
                                "website_id": task_result.website_id
                            }

                            files = {"file_list": chunk}

                            r = requests.post(config.SERVER_URL +
                                              "/task/upload",
                                              data=payload,
                                              files=files)
                            logger.info("RESPONSE: " + r.text + "<" +
                                        str(r.status_code) + ">")
                        except Exception as e:
                            # Best-effort upload: log and move on to the
                            # next chunk rather than aborting the task.
                            logger.error(
                                "Exception while sending file_list chunk: " +
                                str(e))
                            pass
                        chunk = f.read(CHUNK_SIZE)

            payload = {
                "token": config.API_TOKEN,
                "result": json.dumps(task_result.to_json())
            }

            r = requests.post(config.SERVER_URL + "/task/complete",
                              data=payload)
            logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")

            # Clean up the local file list once the result is reported.
            if os.path.exists(filename):
                os.remove(filename)

        except Exception as e:
            logger.error("Error during push_result: " + str(e))