import queue
import threading

# Assumed module layout: fileman provides the file helpers, and the
# Crawler class and get_domain_name helper live in sibling modules.
import fileman
from crawler import Crawler
from domain import get_domain_name


def hatch():
    # Seed the project directory and the queue/crawled files with the base URL.
    fileman.create_directory(Crawler.target_name)
    fileman.create_file(Crawler.queue_file)
    fileman.create_file(Crawler.crawled_file)
    fileman.write_file(Crawler.queue_file, Crawler.base_url)
    fileman.write_file(Crawler.crawled_file, Crawler.base_url)
    Crawler.queue = fileman.file_to_set(Crawler.queue_file)
    Crawler.crawled = fileman.file_to_set(Crawler.crawled_file)
def crawl_website(target, website, threads):
    def work():
        # Each worker pulls URLs off the queue until the process exits.
        while True:
            url = task_queue.get()
            Crawler.crawl_page(threading.current_thread().name, url)
            task_queue.task_done()

    domain = get_domain_name(website)
    worker = Crawler(target, website, domain)
    task_queue = queue.Queue()

    # Start the daemon workers before queuing any tasks: create_tasks()
    # blocks on task_queue.join(), so queuing first would deadlock.
    for _ in range(threads):
        thread = threading.Thread(target=work)
        thread.daemon = True
        thread.start()

    # Keep queuing batches until a pass discovers no new links.
    queued_links = fileman.file_to_set(worker.queue_file)
    while len(queued_links) > 0:
        print(str(len(queued_links)) + ' links remaining in the queue')
        task_queue = create_tasks(worker, task_queue)
        queued_links = fileman.file_to_set(worker.queue_file)
def create_tasks(worker, task_queue):
    # Queue every link currently on disk, then block until the workers
    # have processed them all.
    for link in fileman.file_to_set(worker.queue_file):
        task_queue.put(link)
    task_queue.join()
    return task_queue
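
# Usage sketch (an assumption, not part of the original module): the exact
# wiring depends on Crawler's constructor, which is presumed to set the
# target_name/base_url class attributes that hatch() reads, and on whether
# it invokes hatch() itself. The target name and URL below are hypothetical.
if __name__ == '__main__':
    crawl_website('example_project', 'https://example.com', threads=8)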