Ejemplo n.º 1
0
    def parse_seed(self, response):
        """
        Handle a response that was fetched from a "seed" website.

        The loaded webpage item is yielded first (flagged ``is_seed=True``).
        After that, one value is yielded per link returned by
        ``self._get_links``: the result of ``self._offsite_request`` when the
        link is external relative to the response URL, otherwise the result
        of ``self._onsite_request`` with ``self.parse`` as its callback.
        """
        loader = self._load_webpage_item(response, is_seed=True)
        if self.use_splash:
            self._process_splash_response(response, loader)
        yield loader.load_item()

        # Domain of the page being parsed; paired with the target domain to
        # form the count key for external links.
        seed_domain = get_domain(response.url)

        for link in self._get_links(response):
            link_domain = get_domain(link.url)

            if not is_external_url(response.url, link.url):
                # Internal link: counted per target domain.
                yield self._onsite_request(
                    response, link,
                    callback=self.parse,
                    max_depth=self.max_depth_seed,
                    count_key=link_domain,
                    max_count=self.max_internal_links_per_seed,
                )
                continue

            # External link: counted per (seed domain, target domain) pair.
            yield self._offsite_request(
                response, link,
                count_key=(seed_domain, link_domain),
                max_count=self.max_external_links_per_seed_per_domain
            )
Ejemplo n.º 2
0
    def parse_external(self, response):
        """
        Handle a response that was fetched from an external website.

        The loaded webpage item is yielded first (flagged ``is_seed=False``).
        After that, one value is yielded per link returned by
        ``self._get_links``: the result of ``self._offsite_request`` when the
        link is external relative to the response URL, otherwise the result
        of ``self._onsite_request`` with this method as its callback.
        """
        loader = self._load_webpage_item(response, is_seed=False)
        if self.use_splash:
            self._process_splash_response(response, loader)
        yield loader.load_item()

        for link in self._get_links(response):
            link_domain = get_domain(link.url)

            if not is_external_url(response.url, link.url):
                # Same-site link: keep crawling with this callback.
                yield self._onsite_request(
                    response, link,
                    callback=self.parse_external,
                    max_depth=self.max_depth_external,
                    count_key=link_domain,
                    max_count=self.max_external_links_per_domain
                )
                continue

            # total number of hops is limited by settings.DEPTH_LIMIT
            yield self._offsite_request(
                response, link,
                count_key=link_domain,
                max_count=self.max_external_links_per_domain
            )
Ejemplo n.º 3
0
    def _process_splash_response(self, response, splash_response, ld):
        """
        Copy data from a Splash JSON response into the item loader.

        The body of ``splash_response`` is parsed as JSON. A screenshot is
        saved through ``self._save_screenshot`` (called with the domain of
        ``response.url`` and the parsed data) and the returned path is stored
        under ``screenshot_path``. When ``self.save_html`` is set, the
        ``html`` entry of the parsed data is stored under ``html_rendered``.

        :param response: response whose URL supplies the domain passed to
            ``self._save_screenshot``.
        :param splash_response: response whose body holds the JSON payload.
        :param ld: item loader that receives the extracted values.
        :raises ValueError: if the body is not valid JSON.
        :raises KeyError: if ``self.save_html`` is set and the payload has
            no ``html`` entry.
        """
        # Do not pass ``encoding=`` to json.loads(): the keyword was ignored
        # on Python 3.1-3.8 and removed in 3.9, where it raises TypeError.
        # UTF-8 is already the default on Python 2, and Python 3.6+ detects
        # the encoding of a bytes payload by itself.
        data = json.loads(splash_response.body)

        screenshot_path = self._save_screenshot(get_domain(response.url), data)
        ld.add_value('screenshot_path', screenshot_path)

        if self.save_html:
            ld.add_value('html_rendered', data['html'])
Ejemplo n.º 4
0
    def _process_splash_response(self, response, ld):
        """
        Copy Splash output attached to ``response.meta`` into the item loader.

        The base64-encoded ``png`` entry of ``response.meta['splash_response']``
        is decoded and handed to ``save_screenshot`` together with
        ``self.screenshot_dir`` and the domain of ``response.url``; the
        returned path is stored under ``screenshot_path``. When
        ``self.save_html`` is set, the ``html`` entry is stored under
        ``html_rendered``.
        """
        target_dir = self.screenshot_dir
        name_prefix = get_domain(response.url)
        png_bytes = base64.b64decode(response.meta['splash_response']['png'])

        saved_path = save_screenshot(
            screenshot_dir=target_dir,
            prefix=name_prefix,
            png=png_bytes,
        )
        ld.add_value('screenshot_path', saved_path)

        if not self.save_html:
            return

        rendered = response.meta['splash_response']['html']
        ld.add_value('html_rendered', rendered)
Ejemplo n.º 5
0
    def _load_webpage_item(self, response, is_seed):
        """
        Build a ``WebpageItemLoader`` populated from ``response``.

        Always set: ``url``, ``host``, ``title`` (from ``//title/text()``),
        ``depth`` (``link_depth`` from the response meta, 0 when absent),
        ``total_depth`` (``depth`` from the response meta), ``crawled_at``
        and ``is_seed``. The page body is stored under ``html`` when
        ``self.save_html`` is set. When the meta carries a ``link`` entry,
        ``link_text``, ``link_url``, ``referrer_url`` and ``referrer_depth``
        are stored as well.

        Returns the loader itself, not the loaded item, so callers can add
        more values before calling ``load_item()``.
        """
        meta = response.meta
        link_depth = meta.get('link_depth', 0)

        loader = WebpageItemLoader(response=response)
        loader.add_value('url', response.url)
        loader.add_value('host', get_domain(response.url))
        loader.add_xpath('title', '//title/text()')
        loader.add_value('depth', link_depth)
        loader.add_value('total_depth', meta.get('depth'))
        loader.add_value('crawled_at', datetime.datetime.utcnow())
        loader.add_value('is_seed', is_seed)

        if self.save_html:
            loader.add_value('html', response.body_as_unicode())

        # Without a 'link' entry in the meta there is no link information
        # to record.
        if 'link' not in meta:
            return loader

        source_link = meta['link']
        loader.add_value('link_text', source_link.text)
        loader.add_value('link_url', source_link.url)
        for key in ('referrer_url', 'referrer_depth'):
            loader.add_value(key, meta[key])
        return loader