import time
from threading import Lock

from lxml import objectify
from tornado.gen import coroutine
from tornado.httpclient import HTTPClient
from tornado.ioloop import IOLoop

# JoinableQueue and BoundedSemaphore (with a .counter attribute) match the API of the
# toro library, which provided these primitives for Tornado; adjust the import to
# whichever queue/semaphore implementation the project actually uses.
from toro import BoundedSemaphore, JoinableQueue

# extract_domain, extract_base_site, _get_client_page, decode_to_unicode, logger,
# print_pages_with_errors, print_pages_with_hardcoded_links, print_pages_to_file,
# DOMAINS_TO_BE_SKIPPED and MAX_CONCURRENT_REQUESTS_PER_SERVER are assumed to be
# provided by the project's own helper modules.


class TornadoSpider:
    def __init__(self, start_url, sitemap_url=None,
                 max_concurrent_connections=MAX_CONCURRENT_REQUESTS_PER_SERVER):
        self.visited_urls = set()
        self.intermediate_urls = set()
        self.base_domain = extract_domain(start_url)
        self.base_site = extract_base_site(start_url)
        self.base_page = _get_client_page(start_url, None, start_url, self.base_domain,
                                          DOMAINS_TO_BE_SKIPPED)
        self.non_visited_urls = {self.base_page}
        self.added_count = 1
        self.idle_ping = 0
        self.start_idle_counter = False
        self.sitemap_url = u'{}/sitemap.xml'.format(self.base_site) if not sitemap_url else sitemap_url
        self.max_concurrent_connections = max_concurrent_connections
        self.page_queue = JoinableQueue()
        self.semaphore = BoundedSemaphore(self.max_concurrent_connections)
        self.start = time.time()
        self.skip_count = 0

    @coroutine
    def initiate_crawl(self):
        self.non_visited_urls.add(self.base_page)
        self.add_sitemap_urls(self.base_page)
        self.page_queue.put(self.base_page)
        self._crawl_web_page()
        yield self.page_queue.join()

    @coroutine
    def _crawl_web_page(self):
        while True:
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>>>>>> %s ' % page.encoded_url)
            # print("Available Semaphore %s" % self.semaphore.counter)
            yield self.semaphore.acquire()
            # print("0.Issued Semaphore %s " % (self.semaphore.counter + 1))
            self._fetch_page(self.semaphore.counter + 1)
            if len(self.intermediate_urls) < 5 and self.start_idle_counter:
                print("Unprocessed urls : ")
                for page in self.intermediate_urls:
                    print(u'>> %s ' % page.encoded_url)
                self.wrap_up()

    @coroutine
    def _fetch_page(self, semaphore_count):
        try:
            page = yield self.page_queue.get()
            if page in self.visited_urls or page in self.intermediate_urls:
                return
            if page.skip_page():
                self.skip_count += 1
                logger.debug(u"Skipped %s" % page.url)
                return
            logger.debug(u"1. Semaphore in use > %s, in-process count %s for %s"
                         % (semaphore_count, len(self.intermediate_urls), page.encoded_url))
            self.intermediate_urls.add(page)
            page.process(self)
            response = yield page.make_head_request()
            get_response = yield page._process_head_response(response)
            if get_response:
                page.process_get_response(get_response)
            print(u"Total urls added : {} , Total urls visited : {} , "
                  u"Total urls in process : {} , Skipped : {} , semaphore used : {}"
                  .format(self.added_count, len(self.visited_urls),
                          len(self.intermediate_urls), self.skip_count, semaphore_count))
            logger.debug(u"Total urls added : {} , Total urls visited : {} , "
                         u"Total urls in process : {} , Skipped : {} , semaphore : {}"
                         .format(self.added_count, len(self.visited_urls),
                                 len(self.intermediate_urls), self.skip_count,
                                 self.semaphore.counter))
        except Exception as ex:
            logger.debug(ex)
        finally:
            self.page_queue.task_done()
            self.semaphore.release()
            logger.debug(u"2. Semaphore returned >> %s, available %s after %s"
                         % (semaphore_count, self.semaphore.counter, page.encoded_url))

    def _filter_visited_links(self, page):
        return (page not in self.visited_urls
                and page not in self.intermediate_urls
                and page not in self.non_visited_urls)

    def add_sitemap_urls(self, parent_page):
        logger.debug("Adding sitemap urls as well for processing")
        http_client = HTTPClient()
        try:
            response = http_client.fetch(self.sitemap_url)
            val = bytes(response.body)
            root = objectify.fromstring(val)
            for url_element in root.url:
                page = _get_client_page(decode_to_unicode(url_element.loc.text), parent_page,
                                        self.base_site, self.base_domain, DOMAINS_TO_BE_SKIPPED)
                if page not in self.visited_urls and page not in self.non_visited_urls \
                        and page not in self.intermediate_urls:
                    print(u"Added {}".format(url_element.loc))
                    self.non_visited_urls.add(page)
                    self.added_count += 1
                    self.page_queue.put(page)
        except Exception:
            logger.error(u"Error adding sitemap urls from %s " % self.sitemap_url)
        finally:
            http_client.close()

    def _get_unique_non_visited_links(self, page):
        lock = Lock()
        lock.acquire()
        filtered_links = set(filter(self._filter_visited_links, page.links))
        lock.release()
        return filtered_links

    def process_web_page(self, web_page):
        logger.debug(u"Called {} for {}".format('process_web_page', web_page.url))
        logger.debug(u"Removing %s " % web_page.url)
        self.visited_urls.add(web_page)
        self.non_visited_urls.discard(web_page)
        self.intermediate_urls.discard(web_page)
        unique_pages = self._get_unique_non_visited_links(web_page)
        for page in unique_pages:
            if page not in self.non_visited_urls:
                self.non_visited_urls.add(page)
                self.page_queue.put(page)
                self.added_count += 1
                logger.debug("Added link-url %s " % page.encoded_url)
        self.start_idle_counter = True

    def wrap_up(self):
        self.print_stats()
        IOLoop.instance().stop()
        print('Done crawling in %d seconds, fetched %s URLs.'
              % (time.time() - self.start, len(self.visited_urls)))

    def print_stats(self):
        print_pages_with_errors(True, self.visited_urls, "broken_external_links.txt")
        print_pages_with_errors(False, self.visited_urls, "broken_internal_links.txt")
        print_pages_with_hardcoded_links(self.visited_urls, "hardcoded_url_links.txt")
        print("\nTotal pages visited : {}\n".format(len(self.visited_urls)))
        print_pages_to_file("all_internal_pages.txt", False, self.visited_urls)
        print_pages_to_file("all_external_pages.txt", True, self.visited_urls)
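

# A minimal usage sketch, not part of the original module: it assumes the crawl is
# driven by scheduling initiate_crawl on the IOLoop and letting wrap_up() stop the
# loop once the queue drains. The start URL below is purely illustrative.
if __name__ == '__main__':
    spider = TornadoSpider('http://example.com')
    io_loop = IOLoop.instance()
    io_loop.add_callback(spider.initiate_crawl)
    io_loop.start()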