Example #1
import random
import sys
import time
import traceback

# Method of a crawler class; `Page` is defined elsewhere in the same project.

    def link_crawler(self, crawlable, max_delay):
        main_url = crawlable.get_home_page()
        product_url_regex = crawlable.get_product_pages()  # unused in this excerpt

        main_page = Page(crawlable, main_url)
        main_crawling_pages = main_page.get_main_crawling_pages()
        crawl_queue = main_crawling_pages[:]  # seed the queue with a copy of the main pages
        current_main_page = 0
        main_pages_length = len(main_crawling_pages)

        all_visited, products_visited = set(), set()  # sets give O(1) membership checks
        product_list = []

        while crawl_queue:
            url = crawl_queue.pop()  # LIFO pop, so the crawl runs depth-first

            try:
                if url not in all_visited:
                    if url in main_crawling_pages:
                        current_main_page += 1
                        print('\n%d out of %d main pages\n' %
                              (current_main_page, main_pages_length))

                    if max_delay and max_delay > 0:
                        # Random pause makes the crawler harder to detect.
                        time.sleep(random.randint(0, max_delay))

                    page = Page(crawlable, url)
                    all_visited.add(url)

                    if page.is_product and page.url not in products_visited:
                        product_list.append(page.get_product())
                        products_visited.add(page.url)

                    # Queue any links on this page that have not been visited yet.
                    for link in page.get_page_links():
                        if link not in all_visited:
                            crawl_queue.append(link)

            except Exception:
                # Log the failure and mark the URL visited so it is not retried.
                traceback.print_exc(file=sys.stdout)
                all_visited.add(url)

        print('%d products found.' % len(product_list))
        return product_list
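
A minimal usage sketch, assuming the method lives on a crawler class and that the `crawlable` argument exposes `get_home_page()` and `get_product_pages()` as above; `SiteCrawler` and `StoreCrawlable` are hypothetical placeholder names, not part of the original code.

    # Hypothetical driver: SiteCrawler is assumed to be the class holding
    # link_crawler(), and StoreCrawlable an object supplying the site URLs.
    crawler = SiteCrawler()
    products = crawler.link_crawler(StoreCrawlable(), max_delay=3)
    print('%d products scraped.' % len(products))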