def process_get_response(self, response):
    logger.debug(u"Called {} for {} ".format('process_get_response', self.encoded_url))
    if response.error:
        logger.debug(u"Error processing get request: {} with error : {} ( {} )".format(
            self.encoded_url, response.error, response.reason))
        self.failure_message = response.reason
    else:
        html_source = decode_to_unicode(response.body)
        if self.is_page_internal():
            dom = html.fromstring(html_source)
            link_count = 0
            for href_value in dom.xpath('//a/@href'):
                href_value = decode_to_unicode(href_value)
                logger.debug(u"Entering for loop for {} with href {}".format(self.encoded_url, href_value))
                self._process_hardcoded_url(href_value)
                link = self._format_link(href_value)
                logger.debug(u"Obtained link object {} for {}".format(link, self.encoded_url))
                if link:
                    parsed_link = obtain_domain_with_subdomain_for_page(link)
                    if parsed_link not in self.domains_to_skip:
                        link_page = TornadoClientPage(link, self, self.base_site,
                                                      self.base_domain, self.domains_to_skip)
                        self.links.add(link_page)
                        link_page.parent = self
                        link_count += 1
    self.finalize_process(self.spider)
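# A minimal, self-contained sketch of the link-extraction step above, using an
# assumed example document (not from the source) to show what the
# ``//a/@href`` XPath yields:
#
#   from lxml import html
#
#   dom = html.fromstring(u'<p><a href="/a">A</a> <a href="#top">Top</a></p>')
#   print(dom.xpath('//a/@href'))  # ['/a', '#top']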
def _format_link(self, href_value):
    href_value = decode_to_unicode(href_value.strip())
    if href_value.startswith('#'):
        # A bare fragment points back to the current page.
        link = self.url
    else:
        href_value = href_value.replace("..", "") if href_value.startswith("..") else href_value
        link = urlparse.urljoin(self.url, href_value, allow_fragments=False)
    # Drop javascript pseudo-links and mailto addresses.
    link = link if 'javascript:void' not in href_value and not href_value.startswith('mailto') else None
    return decode_to_unicode(link)
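# Expected behaviour of _format_link, assuming self.url is
# u'http://example.com/docs/' (illustrative values, not from the source):
#
#   u'#section'            -> u'http://example.com/docs/'            (stays on-page)
#   u'page.html'           -> u'http://example.com/docs/page.html'   (resolved relative)
#   u'mailto:a@b.com'      -> None
#   u'javascript:void(0)'  -> None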
@coroutine  # assumed: the yield / raise Return(...) idiom below requires tornado.gen.coroutine
def _make_get_request(self):
    logger.debug(u"Called {} for {} ".format('_make_get_request', self.encoded_url))
    request = HTTPRequest(method='GET', url=self.url, request_timeout=PAGE_TIMEOUT,
                          follow_redirects=True,
                          headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 "
                                                 "(KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1"},
                          max_redirects=10)
    try:
        response = yield AsyncHTTPClient().fetch(request)
    except Exception as ex:
        logger.debug(u"Error processing get request for : %s with error : %s "
                     % (self.encoded_url, str(ex.message)))
        # Only tornado.httpclient.HTTPError carries a status code; fall back
        # to -1 for other exception types.
        self.response_code = getattr(ex, 'code', -1)
        self.failure_message = decode_to_unicode(ex.message)
        self.finalize_process(self.spider)
        raise Return(None)
    raise Return(response)
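# _make_get_request follows the classic pre-async/await Tornado coroutine
# idiom: callers must themselves be coroutines and yield it. A hedged usage
# sketch (the method name ``process`` is hypothetical):
#
#   @coroutine
#   def process(self):
#       response = yield self._make_get_request()
#       if response is not None:
#           self.process_get_response(response)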
def __init__(self, url, parent, base_site, base_domain, domains_to_skip):
    self.url = decode_to_unicode(url) if url is not None else decode_to_unicode('')
    self.encoded_url = decode_to_unicode(self.url)
    self.base_domain = base_domain
    self.response_code = -1
    self.errors = []
    self.links = set()
    self.visited = False
    self.parent = parent
    self.base_site = base_site
    self.content_type = decode_to_unicode("text/html")
    self.domains_to_skip = domains_to_skip
    self.redirect_location = decode_to_unicode('')
    self.hardcoded_urls = set()
    self.failure_message = decode_to_unicode('')
    AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
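# Note: AsyncHTTPClient.configure() is process-global in Tornado, so calling
# it from every page's __init__ is redundant (though harmless). The usual
# pattern would be to configure the curl implementation once, at module
# import time:
#
#   AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")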
def add_sitemap_urls(self, parent_page):
    logger.debug("Adding sitemap urls as well for processing")
    http_client = HTTPClient()
    try:
        response = http_client.fetch(self.sitemap_url)
        val = bytes(response.body)
        root = objectify.fromstring(val)
        for url_element in root.url:
            page = _get_client_page(decode_to_unicode(url_element.loc.text), parent_page,
                                    self.base_site, self.base_domain, DOMAINS_TO_BE_SKIPPED)
            if page not in self.visited_urls and page not in self.non_visited_urls \
                    and page not in self.intermediate_urls:
                print(u"Added {}".format(url_element.loc))
                self.non_visited_urls.add(page)
                self.added_count += 1
                self.page_queue.put(page)
    except Exception as e:
        logger.error(u"Error adding sitemap urls from %s : %s" % (self.sitemap_url, e))
    finally:
        http_client.close()
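# add_sitemap_urls assumes the standard sitemap protocol
# (https://www.sitemaps.org/protocol.html), roughly:
#
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>http://example.com/page</loc></url>
#   </urlset>
#
# lxml.objectify then exposes each <url> element as root.url and the page
# address as url_element.loc.text.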
if __name__ == "__main__":
    args = process_parameters()
    base_url = decode_to_unicode(args.url)
    sitemap_url = decode_to_unicode(args.sitemap_url)
    enable_js_tests = args.testjs
    process_existing_urls = args.process_file
    url_list_file = decode_to_unicode(args.url_file)

    if process_existing_urls:
        if not url_list_file:
            print("Missing file containing url list, please provide one with --url-file parameter")
            sys.exit(1)
        detect_js_and_resource_issues(url_list_file)
        sys.exit(0)

    scraper = TornadoSpider(base_url, sitemap_url)
    future = scraper.initiate_crawl()
    IOLoop.instance().start()
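# process_parameters is defined elsewhere in the module; judging by the
# attributes read above (url, sitemap_url, testjs, process_file, url_file),
# it is assumed to be an argparse wrapper along these lines (flag names are
# hypothetical except --url-file, which the error message above mentions):
#
#   import argparse
#
#   def process_parameters():
#       parser = argparse.ArgumentParser(description='Crawl a site for broken links')
#       parser.add_argument('--url')
#       parser.add_argument('--sitemap-url', dest='sitemap_url')
#       parser.add_argument('--testjs', action='store_true')
#       parser.add_argument('--process-file', dest='process_file', action='store_true')
#       parser.add_argument('--url-file', dest='url_file')
#       return parser.parse_args()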