def execute_queued_task(self):
    """Fetch one queued task from the server and hand it to the process pool.

    Submits only while the pool has free capacity.  The original comparison
    used ``<=``, which let ``max_processes + 1`` tasks run concurrently
    (when len(current_tasks) == max_processes it still submitted one more).
    """
    if len(self.current_tasks) < self.max_processes:
        task = self.fetch_task()
        if task:
            logger.info("Submitted " + task.url + " to process pool")
            # Track the task so task_complete can remove it when done.
            self.current_tasks.append(task)
            self.pool.apply_async(
                TaskManager.run_task,
                args=(task, self.current_tasks),
                callback=TaskManager.task_complete,
                error_callback=TaskManager.task_error,
            )
def fetch_task(self):
    """Ask the server for a queued task.

    Returns a ``Task`` built from the JSON response, or ``None`` when the
    server answers with a non-200 status (e.g. no task available).
    Network/JSON errors propagate to the caller unchanged.
    """
    # The original wrapped this in `except Exception as e: raise e`, which
    # is a no-op that only rewrites the traceback origin — removed.
    payload = {"token": config.API_TOKEN}
    r = requests.post(config.SERVER_URL + "/task/get", data=payload)
    if r.status_code == 200:
        text = r.text
        logger.info("Fetched task from server : " + text)
        task_json = json.loads(text)
        return Task(task_json["website_id"], task_json["url"])
    return None
def run_task(task, current_tasks):
    """Crawl the directory for *task*; return ``(TaskResult, current_tasks)``.

    The ``current_tasks`` list is passed straight through so the pool
    completion callback can locate and remove the finished task.
    """
    result = TaskResult()
    result.website_id = task.website_id
    result.start_time = datetime.utcnow().timestamp()
    logger.info("Starting task " + task.url)

    crawler = RemoteDirectoryCrawler(task.url, config.CRAWL_SERVER_THREADS)
    out_path = "./crawled/" + str(task.website_id) + ".json"
    crawl_result = crawler.crawl_directory(out_path)

    result.file_count = crawl_result.file_count
    result.status_code = crawl_result.status_code
    result.end_time = datetime.utcnow().timestamp()
    logger.info("End task " + task.url)
    return result, current_tasks
def _log_to_file(files_q: Queue, out_file: str, files_written: list): counter = 0 with open(out_file, "w") as f: while True: try: file = files_q.get(timeout=2000) except Empty: logger.error("File writer thread timed out") break if file is None: break f.write(file.to_json() + "\n") counter += 1 files_q.task_done() files_written.append(counter) logger.info("File writer thread done")
def crawl_directory(self, out_file: str) -> CrawlResult:
    """Crawl the remote directory at ``self.url``, writing file records to *out_file*.

    Lists the root, fans out subdirectories to ``max_threads`` worker
    threads via ``in_q``, and streams discovered files through ``files_q``
    to a single writer thread.  Returns a ``CrawlResult`` with the file
    count and status; any unexpected exception is converted into an error
    result rather than raised.

    Fix: ``directory.close()`` now runs in a ``finally`` — the original
    leaked the connection on the empty-listing early return and when
    ``list_dir`` raised ``TimeoutError``.
    """
    try:
        try:
            directory = RemoteDirectoryFactory.get_directory(self.url)
            logger.info("Crawling directory " + self.url + " with " + str(type(directory)))
            try:
                path_id, root_listing = directory.list_dir(
                    urlparse(self.url).path)
            finally:
                # Always release the connection (original skipped close()
                # on the empty-listing return and on timeout).
                directory.close()
            if root_listing:
                self.crawled_paths.add(path_id)
            else:
                logger.info("No files in root listing for " + self.url)
                return CrawlResult(0, "empty")
        except TimeoutError:
            return CrawlResult(0, "Timeout during initial request")

        in_q = Queue(maxsize=0)
        files_q = Queue(maxsize=0)
        # Seed the work queue with root subdirectories; root files go
        # straight to the writer queue.
        for f in root_listing:
            if f.is_dir:
                in_q.put(os.path.join(f.path, f.name, ""))
            else:
                files_q.put(f)

        threads = []
        for i in range(self.max_threads):
            worker = Thread(
                target=RemoteDirectoryCrawler._process_listings,
                args=(self, self.url, in_q, files_q))
            threads.append(worker)
            worker.start()

        files_written = []  # Pass array to worker to get result
        file_writer_thread = Thread(
            target=RemoteDirectoryCrawler._log_to_file,
            args=(files_q, out_file, files_written))
        file_writer_thread.start()

        in_q.join()
        files_q.join()
        logger.info("Crawling for " + self.url +
                    " done, waiting for threads to terminate...")

        # Kill threads: one None sentinel per worker, then join them.
        for _ in threads:
            in_q.put(None)
        for t in threads:
            t.join()

        # Stop the writer last so every queued file gets flushed.
        files_q.put(None)
        file_writer_thread.join()

        return CrawlResult(files_written[0], self.status_code)
    except Exception as e:
        return CrawlResult(0, str(e) + " \nType:" + str(type(e)))
def task_complete(result):
    """Pool callback: push the crawl result to the server and drop the task.

    *result* is the ``(TaskResult, current_tasks)`` pair returned by
    ``run_task``; ``current_tasks`` is the shared in-flight task list.
    """
    task_result, current_tasks = result
    logger.info("Task completed, sending result to server")
    logger.info("Status code: " + task_result.status_code)
    logger.info("File count: " + str(task_result.file_count))
    TaskManager.push_result(task_result)
    # Rebuild in place — slice assignment keeps the same list object, which
    # is shared with the task manager.  The original deleted entries while
    # iterating with enumerate(), which skips the element that follows each
    # deletion.
    current_tasks[:] = [
        task for task in current_tasks
        if task.website_id != task_result.website_id
    ]
def push_result(task_result: TaskResult):
    """Upload the crawled file list in chunks, then report the task result.

    Best-effort: a failed chunk upload is logged and skipped so the final
    ``/task/complete`` call still happens.  The local file-list dump is
    removed once the result has been reported.  Any other error is logged
    rather than raised (this runs inside a pool callback).
    """
    try:
        logger.info("Uploading file list in small chunks")
        filename = "./crawled/" + str(task_result.website_id) + ".json"
        CHUNK_SIZE = 500000 * 10  # 5Mb
        if os.path.exists(filename):
            with open(filename) as f:
                chunk = f.read(CHUNK_SIZE)
                while chunk:
                    try:
                        payload = {
                            "token": config.API_TOKEN,
                            "website_id": task_result.website_id
                        }
                        files = {"file_list": chunk}
                        r = requests.post(config.SERVER_URL + "/task/upload",
                                          data=payload, files=files)
                        logger.info("RESPONSE: " + r.text + "<" +
                                    str(r.status_code) + ">")
                    except Exception as e:
                        # Best-effort: log and move on to the next chunk
                        # (original had a dead `pass` here).
                        logger.error(
                            "Exception while sending file_list chunk: " +
                            str(e))
                    chunk = f.read(CHUNK_SIZE)

        payload = {
            "token": config.API_TOKEN,
            "result": json.dumps(task_result.to_json())
        }
        r = requests.post(config.SERVER_URL + "/task/complete", data=payload)
        logger.info("RESPONSE: " + r.text + "<" + str(r.status_code) + ">")

        if os.path.exists(filename):
            os.remove(filename)
    except Exception as e:
        logger.error("Error during push_result: " + str(e))