def parse_seed(self, response):
    """
    Handle a page fetched from a "seed" website.

    Yields the item built by ``_load_webpage_item`` (with ``is_seed=True``)
    first, then one request per link returned by ``_get_links``:

    * links that ``is_external_url`` reports as external are passed to
      ``_offsite_request``, keyed by the ``(page domain, link domain)``
      pair, with ``max_external_links_per_seed_per_domain`` as ``max_count``;
    * all other links are passed to ``_onsite_request`` with
      ``self.parse`` as the callback, ``max_depth_seed`` as ``max_depth``,
      the link domain as the count key and
      ``max_internal_links_per_seed`` as ``max_count``.
    """
    loader = self._load_webpage_item(response, is_seed=True)
    if self.use_splash:
        # Attach screenshot / rendered HTML before the item is produced.
        self._process_splash_response(response, loader)
    yield loader.load_item()

    seed_domain = get_domain(response.url)
    for found in self._get_links(response):
        target_domain = get_domain(found.url)

        if not is_external_url(response.url, found.url):
            yield self._onsite_request(
                response,
                found,
                callback=self.parse,
                max_depth=self.max_depth_seed,
                count_key=target_domain,
                max_count=self.max_internal_links_per_seed,
            )
            continue

        yield self._offsite_request(
            response,
            found,
            count_key=(seed_domain, target_domain),
            max_count=self.max_external_links_per_seed_per_domain,
        )
def parse_external(self, response):
    """
    Handle a page fetched from an external (non-seed) website.

    Yields the item built by ``_load_webpage_item`` (with ``is_seed=False``)
    first, then one request per link returned by ``_get_links``:

    * links that ``is_external_url`` reports as external are passed to
      ``_offsite_request``;
    * all other links are passed to ``_onsite_request`` with
      ``self.parse_external`` as the callback and ``max_depth_external``
      as ``max_depth``.

    Both kinds of request use the link's domain as ``count_key`` and
    ``max_external_links_per_domain`` as ``max_count``.
    """
    loader = self._load_webpage_item(response, is_seed=False)
    if self.use_splash:
        # Attach screenshot / rendered HTML before the item is produced.
        self._process_splash_response(response, loader)
    yield loader.load_item()

    for found in self._get_links(response):
        target_domain = get_domain(found.url)

        if not is_external_url(response.url, found.url):
            yield self._onsite_request(
                response,
                found,
                callback=self.parse_external,
                max_depth=self.max_depth_external,
                count_key=target_domain,
                max_count=self.max_external_links_per_domain,
            )
            continue

        # total number of hops is limited by settings.DEPTH_LIMIT
        yield self._offsite_request(
            response,
            found,
            count_key=target_domain,
            max_count=self.max_external_links_per_domain,
        )
def _process_splash_response(self, response, splash_response, ld):
    """
    Record the Splash screenshot (and optionally rendered HTML) on ``ld``.

    ``splash_response.body`` is parsed as a JSON document. The parsed data
    is handed to ``self._save_screenshot`` together with the domain of
    ``response.url``; the value it returns is added to the loader as
    ``screenshot_path``. When ``self.save_html`` is truthy, the ``'html'``
    entry of the parsed data is added as ``html_rendered``.

    :param response: response whose URL supplies the domain passed to
        ``_save_screenshot``.
    :param splash_response: object whose ``body`` holds the JSON payload.
    :param ld: item loader that receives the values.
    :raises ValueError: if the body is not valid JSON.
    :raises KeyError: if ``save_html`` is set and the payload has no
        ``'html'`` key.

    NOTE(review): this file also defines
    ``_process_splash_response(self, response, ld)`` further down; if both
    definitions live in the same class, the later one replaces this one.
    Confirm which variant is intended to be kept.
    """
    raw_body = splash_response.body
    # json.loads() no longer accepts an ``encoding`` keyword (it raises
    # TypeError on Python 3.9+), so decode byte payloads explicitly.
    if isinstance(raw_body, bytes):
        raw_body = raw_body.decode('utf8')
    data = json.loads(raw_body)

    screenshot_path = self._save_screenshot(get_domain(response.url), data)
    ld.add_value('screenshot_path', screenshot_path)
    if self.save_html:
        ld.add_value('html_rendered', data['html'])
def _process_splash_response(self, response, ld):
    """
    Store Splash output found in ``response.meta['splash_response']``.

    The base64-encoded ``'png'`` entry is decoded and passed to
    ``save_screenshot`` along with ``self.screenshot_dir`` and the domain
    of ``response.url`` (as ``prefix``); the returned value is added to
    ``ld`` as ``screenshot_path``. When ``self.save_html`` is truthy, the
    ``'html'`` entry is added as ``html_rendered``.
    """
    target_dir = self.screenshot_dir
    page_domain = get_domain(response.url)
    splash_data = response.meta['splash_response']
    png_bytes = base64.b64decode(splash_data['png'])

    saved_path = save_screenshot(
        screenshot_dir=target_dir,
        prefix=page_domain,
        png=png_bytes,
    )
    ld.add_value('screenshot_path', saved_path)

    if not self.save_html:
        return
    ld.add_value('html_rendered', splash_data['html'])
def _load_webpage_item(self, response, is_seed):
    """
    Build a ``WebpageItemLoader`` pre-filled from ``response``.

    Always populated: ``url``, ``host`` (domain of the URL), ``title``
    (XPath ``//title/text()``), ``depth`` (``meta['link_depth']``, default
    0), ``total_depth`` (``meta.get('depth')``), ``crawled_at`` (naive
    ``datetime.datetime.utcnow()``) and ``is_seed``.

    ``html`` is added only when ``self.save_html`` is truthy. When
    ``response.meta`` contains ``'link'``, the link's text and URL plus
    ``referrer_url`` and ``referrer_depth`` from the meta are added too.

    Returns the loader itself; the caller is expected to call
    ``load_item()`` on it.
    """
    meta = response.meta
    link_depth = meta.get('link_depth', 0)

    loader = WebpageItemLoader(response=response)
    loader.add_value('url', response.url)
    loader.add_value('host', get_domain(response.url))
    loader.add_xpath('title', '//title/text()')
    loader.add_value('depth', link_depth)
    loader.add_value('total_depth', meta.get('depth'))
    loader.add_value('crawled_at', datetime.datetime.utcnow())
    loader.add_value('is_seed', is_seed)

    if self.save_html:
        # NOTE(review): body_as_unicode() appears to be deprecated in
        # newer Scrapy releases in favour of ``response.text`` -- confirm
        # against the Scrapy version this project targets.
        loader.add_value('html', response.body_as_unicode())

    if 'link' not in meta:
        return loader

    # The page was reached by following a link: record where it came from.
    source_link = meta['link']
    loader.add_value('link_text', source_link.text)
    loader.add_value('link_url', source_link.url)
    loader.add_value('referrer_url', meta['referrer_url'])
    loader.add_value('referrer_depth', meta['referrer_depth'])
    return loader