Example #1
def hatch():
    # Set up the on-disk state for a crawl: a project directory plus a
    # queue file and a crawled file, each seeded with the base URL.
    fileman.create_directory(Crawler.target_name)
    fileman.create_file(Crawler.queue_file)
    fileman.create_file(Crawler.crawled_file)
    fileman.write_file(Crawler.queue_file, Crawler.base_url)
    fileman.write_file(Crawler.crawled_file, Crawler.base_url)
    # Mirror both files into in-memory sets for fast membership checks.
    Crawler.queue = fileman.file_to_set(Crawler.queue_file)
    Crawler.crawled = fileman.file_to_set(Crawler.crawled_file)
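
The fileman helpers are not shown on this page. A minimal sketch of what they are assumed to do, inferred only from how hatch() calls them (the names match the calls above; the bodies are guesses, not the project's actual code):

import os

def create_directory(directory):
    # Create the project folder if it does not already exist.
    if not os.path.exists(directory):
        os.makedirs(directory)

def create_file(path):
    # Create an empty file, leaving an existing one untouched.
    if not os.path.isfile(path):
        open(path, 'w').close()

def write_file(path, data):
    # Overwrite the file with a single line of data.
    with open(path, 'w') as f:
        f.write(data + '\n')

def file_to_set(path):
    # Read the file into a set, one stripped line per entry.
    with open(path, 'r') as f:
        return set(line.strip() for line in f if line.strip())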
Example #2
import queue
import threading

def crawl_website(target, website, threads):
    # fileman, Crawler, and get_domain_name come from the surrounding
    # project and are not shown on this page.

    def work():
        # Each worker pulls URLs off the shared queue until the process exits.
        while True:
            url = task_queue.get()
            Crawler.crawl_page(threading.current_thread().name, url)
            task_queue.task_done()

    domain = get_domain_name(website)
    crawler = Crawler(target, website, domain)
    task_queue = queue.Queue()

    # Start the daemon workers before queuing any tasks: create_tasks()
    # calls task_queue.join(), which would block forever if no consumer
    # threads were running yet.
    for _ in range(threads):
        thread = threading.Thread(target=work)
        thread.daemon = True
        thread.start()

    queued_links = fileman.file_to_set(crawler.queue_file)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links remaining in the queue')
        task_queue = create_tasks(crawler, task_queue)
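
If these pieces fit together the way they appear to, a driver script might look like the sketch below (the project name, homepage, and thread count are placeholders, not values from the original code):

if __name__ == '__main__':
    # Assumed entry point: crawl_website() builds the Crawler, starts the
    # daemon worker threads, and drains whatever links are already queued.
    crawl_website('my_project', 'https://example.com/', threads=8)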
Example #3
def create_tasks(worker, task_queue):
    # Push every link currently in the queue file onto the thread-safe
    # queue, then block until the worker threads have processed them all.
    for link in fileman.file_to_set(worker.queue_file):
        task_queue.put(link)
    task_queue.join()
    return task_queue
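
Note that task_queue.join() returns only after every queued page has been crawled, and crawling usually discovers new links that land back in the queue file. A caller would therefore typically loop until the queue file stays empty; a sketch under the same assumptions as above:

def crawl_until_done(worker, task_queue):
    # Re-read the queue file after each pass; stop once a full pass
    # discovers no new links to crawl.
    while fileman.file_to_set(worker.queue_file):
        create_tasks(worker, task_queue)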