def process_get_response(self, response):
        """Handle the response of the GET request issued for this page.

        When ``response.error`` is set, the failure is logged and
        ``response.reason`` is stored in ``self.failure_message``.

        Otherwise the body is decoded and, only if ``self.is_page_internal()``
        is true, every ``<a href>`` value found in it is passed to
        ``self._process_hardcoded_url``. Each href for which
        ``self._format_link`` returns a truthy link whose domain is not in
        ``self.domains_to_skip`` is wrapped in a ``TornadoClientPage``, added
        to ``self.links`` and given this page as its ``parent``.

        ``self.finalize_process(self.spider)`` is called in both cases.

        :param response: object exposing ``error``, ``reason`` and ``body``.
        """
        logger.debug(u"Called {} for {} ".format('process_get_response', self.encoded_url))

        if response.error:
            # The template uses "{}" placeholders, so it must go through
            # str.format. Applying "%" to it raised TypeError, which skipped
            # both the failure_message assignment and finalize_process.
            logger.debug(u"Error processing  get request: {} with error : {}  ( {} )"
                         .format(self.encoded_url, response.error, response.reason))
            self.failure_message = response.reason
        else:
            # The body is decoded even for non-internal pages, as before.
            html_source = decode_to_unicode(response.body)
            if self.is_page_internal():
                dom = html.fromstring(html_source)

                for href_value in dom.xpath('//a/@href'):
                    href_value = decode_to_unicode(href_value)
                    logger.debug(u"Entering for loop for for {} with href {}".format(self.encoded_url, href_value))
                    self._process_hardcoded_url(href_value)
                    link = self._format_link(href_value)
                    logger.debug(u"obtained link  object{} for {}".format(link, self.encoded_url))

                    if not link:
                        continue

                    parsed_link = obtain_domain_with_subdomain_for_page(link)
                    if parsed_link in self.domains_to_skip:
                        continue

                    link_page = TornadoClientPage(link, self, self.base_site, self.base_domain,
                                                  self.domains_to_skip)
                    self.links.add(link_page)
                    link_page.parent = self
        self.finalize_process(self.spider)
Beispiel #2
0
    def skip_page(self):
        """Return True when this page's URL should not be processed.

        A page is skipped if the domain derived from ``self.url`` equals any
        entry of ``self.domains_to_skip``, or if ``self.url`` contains any of
        the substrings listed in ``URL_SEGMENTS_TO_SKIP``. Otherwise False is
        returned.
        """
        page_domain = obtain_domain_with_subdomain_for_page(self.url)

        # Domain check first, then URL segments, stopping at the first match.
        if any(page_domain == domain for domain in self.domains_to_skip):
            return True

        return any(segment in self.url for segment in URL_SEGMENTS_TO_SKIP)