def parse_seed(self, response):
    """Parse a webpage from the "seed" website.

    Yields the item loaded for this page first, followed by one request
    per link returned by ``self._get_links(response)``:

    * links that ``is_external_url`` reports as external go through
      ``self._offsite_request``, counted per (page domain, link domain)
      pair against ``self.max_external_links_per_seed_per_domain``;
    * all other links go through ``self._onsite_request`` with
      ``self.parse`` as the callback, ``self.max_depth_seed`` as the
      depth cap, and are counted per link domain against
      ``self.max_internal_links_per_seed``.
    """
    loader = self._load_webpage_item(response, is_seed=True)
    if self.use_splash:
        self._process_splash_response(response, loader)
    yield loader.load_item()

    seed_domain = get_domain(response.url)
    for link in self._get_links(response):
        link_domain = get_domain(link.url)

        if not is_external_url(response.url, link.url):
            # The follow-up response is routed through ``parse`` rather
            # than straight back to ``parse_seed``.
            yield self._onsite_request(
                response,
                link,
                callback=self.parse,
                max_depth=self.max_depth_seed,
                count_key=link_domain,
                max_count=self.max_internal_links_per_seed,
            )
            continue

        # Off-site links are counted per (seed domain, target domain) pair.
        yield self._offsite_request(
            response,
            link,
            count_key=(seed_domain, link_domain),
            max_count=self.max_external_links_per_seed_per_domain,
        )
def parse_external(self, response):
    """Parse a webpage from an external website.

    Yields the item loaded for this page first, followed by one request
    per link returned by ``self._get_links(response)``. Links that
    ``is_external_url`` reports as external go through
    ``self._offsite_request``; the rest go through
    ``self._onsite_request`` with ``self.parse_external`` as the callback
    and ``self.max_depth_external`` as the depth cap. Both kinds are
    counted per link domain against ``self.max_external_links_per_domain``.
    """
    loader = self._load_webpage_item(response, is_seed=False)
    if self.use_splash:
        self._process_splash_response(response, loader)
    yield loader.load_item()

    for link in self._get_links(response):
        link_domain = get_domain(link.url)

        if not is_external_url(response.url, link.url):
            yield self._onsite_request(
                response,
                link,
                callback=self.parse_external,
                max_depth=self.max_depth_external,
                count_key=link_domain,
                max_count=self.max_external_links_per_domain,
            )
            continue

        # total number of hops is limited by settings.DEPTH_LIMIT
        yield self._offsite_request(
            response,
            link,
            count_key=link_domain,
            max_count=self.max_external_links_per_domain,
        )
def parse(self, response):
    """Dispatch a response to ``parse_external`` or ``parse_seed``.

    If ``response.meta`` carries a ``'referrer_url'`` and the response URL
    is external relative to it, ``response.meta['link_depth']`` is reset
    to 0 and the response is handled by ``parse_external``. In every other
    case it is handled by ``parse_seed``.

    Returns whatever the chosen parser returns.
    """
    meta = response.meta
    # When we follow a link and it redirects to another domain,
    # consider it external even if the link url was on-site.
    landed_offsite = (
        'referrer_url' in meta
        and is_external_url(response.url, meta['referrer_url'])
    )
    if not landed_offsite:
        return self.parse_seed(response)

    meta['link_depth'] = 0
    return self.parse_external(response)