Example #1
0
    def _process_scrape_info(self, scraper: BaseScraper,
                             scrape_result: ScrapeResult,
                             item_session: ItemSession):
        '''Add the links from a scrape result to the item session.

        Every link context is parsed and rewritten, then checked with the
        fetch rule filters. Links that cannot be parsed or that the filters
        reject are dropped; the rest are added as child URLs.

        Args:
            scraper: Not used by this method.
            scrape_result: Provides ``link_contexts``. A falsy value means
                there is nothing to process.
            item_session: Session that receives the child URLs.

        Returns:
            tuple: ``(num_inline, num_linked)`` counts of the URLs added.
        '''
        inline_count = 0
        linked_count = 0

        # A falsy scrape result contributes no link contexts at all.
        contexts = scrape_result.link_contexts if scrape_result else ()

        for context in contexts:
            parsed = self.parse_url(context.link)

            if parsed:
                target = self.rewrite_url(parsed)
                is_inline = context.inline

                candidate = item_session.child_url_record(
                    target.url, inline=is_inline)
                verdict = self._fetch_rule.consult_filters(
                    item_session.request.url_info, candidate)

                # Only the first element of the filter result decides
                # whether the link is accepted.
                if verdict[0]:
                    if is_inline:
                        inline_count += 1
                    else:
                        linked_count += 1

                    item_session.add_child_url(
                        target.url, inline=is_inline,
                        link_type=context.link_type)

        return inline_count, linked_count
Example #2
0
    def _process_scrape_info(self, scraper: BaseScraper,
                             scrape_result: ScrapeResult,
                             item_session: ItemSession):
        '''Queue the scraped links that pass the fetch rule filters.

        Args:
            scraper: Not used by this method.
            scrape_result: Provides ``link_contexts``; when falsy, no
                links are processed.
            item_session: Session on which child URLs are added.

        Returns:
            tuple: Number of inline URLs added, number of linked URLs added.
        '''
        if not scrape_result:
            return 0, 0

        # Keyed by "is inline": True counts inline links, False counts
        # linked (non-inline) ones.
        tally = {True: 0, False: 0}

        for entry in scrape_result.link_contexts:
            candidate_info = self.parse_url(entry.link)

            # Links that do not parse are ignored.
            if not candidate_info:
                continue

            candidate_info = self.rewrite_url(candidate_info)
            record = item_session.child_url_record(
                candidate_info.url, inline=entry.inline)
            filter_result = self._fetch_rule.consult_filters(
                item_session.request.url_info, record)

            # The first element of the filter result is the verdict.
            if not filter_result[0]:
                continue

            tally[bool(entry.inline)] += 1

            item_session.add_child_url(
                candidate_info.url,
                inline=entry.inline,
                link_type=entry.link_type,
            )

        return tally[True], tally[False]
Example #3
0
    def my_get_urls(self, item_session: ItemSession):
        '''Add child URLs for responses from the JSON API endpoints.

        The request URL selects a ``UrlType``: URLs starting with
        ``VALKYRIE_URL_PREFIX`` are ``UrlType.VALKYRIE``; URLs starting with
        ``CHIHIRO_URL_PREFIX`` that do not end with
        ``CHIHIRO_IMAGE_URL_SUFFIX`` are ``UrlType.CHIHIRO``. Any other URL
        is logged and adds no new URLs.

        Args:
            item_session: Session whose request URL is inspected and which
                receives every URL returned by ``self.process_result``.
        '''
        the_url = item_session.request.url

        logger.info("get_urls() for url `%s`", the_url)

        # Pick the type in a single pass so that no "unknown prefix" branch
        # is needed: every URL either maps to a type or is ignored.
        if the_url.startswith(VALKYRIE_URL_PREFIX):
            the_type = UrlType.VALKYRIE
        elif (the_url.startswith(CHIHIRO_URL_PREFIX)
                and not the_url.endswith(CHIHIRO_IMAGE_URL_SUFFIX)):
            the_type = UrlType.CHIHIRO
        else:
            # not one of the main JSON api urls, don't add any new urls
            logger.info("url doesn't start with JSON api prefix, or had the /image suffix, not adding any new urls")
            return

        for iter_url in self.process_result(the_type, item_session):
            item_session.add_child_url(iter_url)
Example #4
0
def new_mock_item_session():
    '''Build an ItemSession carrying a placeholder request.

    The session is built from an ``AppSession`` whose args set
    ``directory_prefix`` to ``'/tmp/'`` and from the record returned by
    ``new_mock_url_record()``.

    Returns:
        ItemSession: Session whose ``request`` is a ``BaseRequest`` with
        ``url`` set to ``'http://example.com'``.
    '''
    namespace = argparse.Namespace(directory_prefix='/tmp/')
    session = ItemSession(
        AppSession(None, namespace, None),
        new_mock_url_record(),
    )

    # Attach the request first, then fill in its URL through the session.
    session.request = BaseRequest()
    session.request.url = 'http://example.com'

    return session
Example #5
0
    def handle_pre_response(self, item_session: ItemSession) -> Actions:
        '''Apply the pre-response hook's decision to the item session.

        ``RETRY`` marks the item as skipped and ``FINISH`` marks it as done.
        Any other action except ``STOP`` leaves the status untouched.

        Returns:
            The action returned by ``consult_pre_response_hook``.

        Raises:
            HookStop: If the hook's action is ``STOP``.
        '''
        action = self.consult_pre_response_hook(item_session)

        # Deal with the aborting case up front; the rest only set a status.
        if action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')

        if action == Actions.RETRY:
            item_session.set_status(Status.skipped)
        elif action == Actions.FINISH:
            item_session.set_status(Status.done)

        return action
Example #6
0
    def process(self, item_session: ItemSession):
        '''Delegate the item to the processor registered for its URL scheme.

        This is a generator that delegates with ``yield from``. When no
        processor is registered for the scheme, a warning is logged and the
        item is skipped.

        Returns:
            The value returned by the delegated processor's ``process``, or
            None when no processor is available.
        '''
        scheme = item_session.url_record.url_info.scheme
        handler = self._processors.get(scheme)

        if not handler:
            _logger.warning(
                _('No processor available to handle {scheme} scheme.'),
                scheme=repr(scheme))
            item_session.skip()
            return

        return (yield from handler.process(item_session))
Example #7
0
    def handle_pre_response(self, item_session: ItemSession) -> Actions:
        '''Consult the pre-response hook and update the item status.

        Returns:
            The action from ``consult_pre_response_hook``. The item is
            marked skipped for ``RETRY`` and done for ``FINISH``.

        Raises:
            HookStop: If the hook's action is ``STOP``.
        '''
        decision = self.consult_pre_response_hook(item_session)

        if decision == Actions.RETRY:
            item_session.set_status(Status.skipped)
            return decision

        if decision == Actions.FINISH:
            item_session.set_status(Status.done)
            return decision

        if decision == Actions.STOP:
            raise HookStop('Script requested immediate stop.')

        # Any other action passes through without changing the status.
        return decision
Example #8
0
    def process(self, item_session: ItemSession):
        '''Run the scheme-specific processor for the item, if there is one.

        The scheme comes from the item's URL record. This is a generator
        that delegates to the chosen processor with ``yield from``.

        Returns:
            The delegated processor's result. When the scheme has no
            processor, a warning is logged, the item is skipped and None
            is returned.
        '''
        url_scheme = item_session.url_record.url_info.scheme
        chosen = self._processors.get(url_scheme)

        if chosen:
            outcome = yield from chosen.process(item_session)
            return outcome

        message = _('No processor available to handle {scheme} scheme.')
        _logger.warning(message, scheme=repr(url_scheme))
        item_session.skip()
Example #9
0
    def add_extra_urls(self, item_session: ItemSession):
        '''Add the robots.txt and sitemap.xml URLs of the item's host.

        Nothing is added unless the item's URL record is at level 0 and
        ``self._sitemaps`` is truthy.
        '''
        record = item_session.url_record

        if record.level != 0 or not self._sitemaps:
            return

        scheme = record.url_info.scheme
        host = record.url_info.hostname_with_port
        templates = ('{0}://{1}/robots.txt', '{0}://{1}/sitemap.xml')

        # Parse every extra URL first, then add them to the session.
        parsed_extras = [
            self.parse_url(template.format(scheme, host))
            for template in templates
        ]

        for extra in parsed_extras:
            item_session.add_child_url(extra.url)
Example #10
0
    def handle_no_document(self, item_session: ItemSession) -> Actions:
        '''Handle a successful response that carries no useful document.

        The waiter is reset and the generic response handler is run. If
        that handler reports ``Actions.NORMAL``, the item is marked skipped.

        Returns:
            A value from :class:`.hook.Actions`.
        '''
        self._waiter.reset()

        action = self.handle_response(item_session)

        # Any non-normal action is passed straight back to the caller.
        if action != Actions.NORMAL:
            return action

        item_session.set_status(Status.skipped)
        return action
Example #11
0
    def handle_no_document(self, item_session: ItemSession) -> Actions:
        '''Callback for a successful response without a useful document.

        Returns:
            A value from :class:`.hook.Actions`, as produced by
            ``handle_response``.
        '''
        self._waiter.reset()

        outcome = self.handle_response(item_session)
        should_skip = outcome == Actions.NORMAL

        # Only a normal outcome causes the item to be marked skipped here.
        if should_skip:
            item_session.set_status(Status.skipped)

        return outcome
Example #12
0
    def get_urls(self, item_session: ItemSession):
        '''Check the response body file and add child URLs for the root path.

        Asserts that the response body has a name that is an existing file
        and that the request URL is non-empty. When the request path is
        ``'/'``, a POST child URL on localhost and a malformed URL are added.
        '''
        body_path = item_session.response.body.name
        request_info = item_session.request.url_info
        print('get_urls', body_path)
        assert body_path
        assert os.path.isfile(body_path)
        assert request_info.url

        if request_info.path != '/':
            return

        post_url = ''.join(
            ('http://localhost:', str(request_info.port), '/post/'))
        item_session.add_child_url(
            post_url, inline=True, post_data='text=hello', replace=True)
        item_session.add_child_url('..malformed')
Example #13
0
    def handle_document(self, item_session: ItemSession,
                        filename: str) -> Actions:
        '''Handle a successful response that produced a document.

        The waiter is reset and the generic response handler is run. If
        that handler reports ``Actions.NORMAL``, the response body size is
        added to the statistics and the item is marked done with
        ``filename``.

        Returns:
            A value from :class:`.hook.Actions`.
        '''
        self._waiter.reset()

        action = self.handle_response(item_session)

        if action != Actions.NORMAL:
            return action

        body_size = item_session.response.body.size()
        self._statistics.increment(body_size)
        item_session.set_status(Status.done, filename=filename)

        return action
Example #14
0
    def get_urls(self, item_session: ItemSession):
        '''Validate the downloaded body file and add test child URLs.

        The body name must be non-empty and point to an existing file, and
        the request URL must be non-empty; these are checked with asserts.
        Child URLs are added only when the request path is ``'/'``.
        '''
        downloaded = item_session.response.body.name
        info = item_session.request.url_info
        print('get_urls', downloaded)
        assert downloaded
        assert os.path.isfile(downloaded)
        assert info.url

        if info.path == '/':
            port_text = str(info.port)
            post_options = dict(
                inline=True, post_data='text=hello', replace=True)

            item_session.add_child_url(
                'http://localhost:' + port_text + '/post/', **post_options)
            item_session.add_child_url('..malformed')
Example #15
0
    def handle_document(self, item_session: ItemSession,
                        filename: str) -> Actions:
        '''Callback for a successful document response.

        Args:
            item_session: Session whose response body size is recorded and
                whose status is updated.
            filename: Passed to ``set_status`` when the item is marked done.

        Returns:
            A value from :class:`.hook.Actions`, as produced by
            ``handle_response``.
        '''
        self._waiter.reset()

        result = self.handle_response(item_session)

        # Statistics and status change only for a normal result.
        if result == Actions.NORMAL:
            response_body = item_session.response.body
            self._statistics.increment(response_body.size())
            item_session.set_status(Status.done, filename=filename)

        return result
Example #16
0
    def handle_response(self, item_session: ItemSession) -> Actions:
        '''Apply the response hook's decision to the item session.

        ``RETRY`` marks the item as an error and ``FINISH`` marks it as
        done. Any other action except ``STOP`` leaves the status untouched.

        Returns:
            A value from :class:`.hook.Actions`.

        Raises:
            HookStop: If the hook's action is ``STOP``.
        '''
        action = self.consult_response_hook(item_session)

        # Deal with the aborting case up front; the rest only set a status.
        if action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')

        if action == Actions.RETRY:
            item_session.set_status(Status.error)
        elif action == Actions.FINISH:
            item_session.set_status(Status.done)

        return action
Example #17
0
    def handle_document_error(self, item_session: ItemSession) -> Actions:
        '''Callback for when the document only describes a server error.

        The waiter is incremented and one ``ServerError`` is added to the
        error statistics before the generic response handler runs. If that
        handler reports ``Actions.NORMAL``, the item is marked as an error.

        Returns:
            A value from :class:`.hook.Actions`.
        '''
        self._waiter.increment()
        self._statistics.errors[ServerError] += 1

        action = self.handle_response(item_session)

        if action != Actions.NORMAL:
            return action

        item_session.set_status(Status.error)
        return action
Example #18
0
    def handle_document_error(self, item_session: ItemSession) -> Actions:
        '''Handle a response whose document only describes a server error.

        Returns:
            A value from :class:`.hook.Actions`, as produced by
            ``handle_response``.
        '''
        self._waiter.increment()

        # Count this response as a ServerError in the statistics.
        error_counts = self._statistics.errors
        error_counts[ServerError] += 1

        outcome = self.handle_response(item_session)
        is_normal = outcome == Actions.NORMAL

        if is_normal:
            item_session.set_status(Status.error)

        return outcome
Example #19
0
    def handle_response(self, item_session: ItemSession) -> Actions:
        '''Consult the response hook and update the item status.

        Returns:
            A value from :class:`.hook.Actions`. The item is marked as an
            error for ``RETRY`` and as done for ``FINISH``.

        Raises:
            HookStop: If the hook's action is ``STOP``.
        '''
        decision = self.consult_response_hook(item_session)

        if decision == Actions.RETRY:
            item_session.set_status(Status.error)
            return decision

        if decision == Actions.FINISH:
            item_session.set_status(Status.done)
            return decision

        if decision == Actions.STOP:
            raise HookStop('Script requested immediate stop.')

        # Any other action passes through without changing the status.
        return decision
Example #20
0
    def add_extra_urls(self, item_session: ItemSession):
        '''Add the host's robots.txt and sitemap.xml as child URLs.

        This only happens for items whose URL record is at level 0, and
        only when ``self._sitemaps`` is truthy.
        '''
        if item_session.url_record.level == 0 and self._sitemaps:
            origin = item_session.url_record.url_info

            # Both URLs are parsed before either one is added.
            robots_info = self.parse_url(
                '{0}://{1}/robots.txt'.format(
                    origin.scheme, origin.hostname_with_port))
            sitemap_info = self.parse_url(
                '{0}://{1}/sitemap.xml'.format(
                    origin.scheme, origin.hostname_with_port))

            item_session.add_child_url(robots_info.url)
            item_session.add_child_url(sitemap_info.url)
Example #21
0
    def handle_error(self, item_session: ItemSession,
                     error: BaseException) -> Actions:
        '''Process an error.

        The error is counted in the statistics, the waiter is incremented
        and the error hook is consulted. The hook's action decides the item
        status; when the hook does not return ``RETRY``, ``FINISH`` or
        ``STOP``, the status depends on the error type and on the retry
        settings.

        Args:
            item_session: Session whose status is updated.
            error: The error being handled.

        Returns:
            A value from :class:`.hook.Actions`.

        Raises:
            HookStop: If the hook's action is ``STOP``.
            SSLVerificationError: ``error`` itself, when SSL verification
                is enabled and the hook did not return ``RETRY``,
                ``FINISH`` or ``STOP``.
        '''
        is_ssl_error = isinstance(error, SSLVerificationError)

        if not self._ssl_verification and is_ssl_error:
            # Change it into a different error since the user doesn't care
            # about verifying certificates
            self._statistics.increment_error(ProtocolError())
        else:
            self._statistics.increment_error(error)

        self._waiter.increment()

        action = self.consult_error_hook(item_session, error)

        if action == Actions.RETRY:
            item_session.set_status(Status.error)
        elif action == Actions.FINISH:
            item_session.set_status(Status.done)
        elif action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')
        elif self._ssl_verification and is_ssl_error:
            # Raise the error object explicitly: a bare ``raise`` only
            # works while an exception is being handled and would fail
            # with RuntimeError if this method were called outside an
            # ``except`` block.
            raise error
        elif isinstance(error, ConnectionRefused) and \
                not self.retry_connrefused:
            item_session.set_status(Status.skipped)
        elif isinstance(error, DNSNotFound) and \
                not self.retry_dns_error:
            item_session.set_status(Status.skipped)
        else:
            item_session.set_status(Status.error)

        return action
Example #22
0
    def handle_error(self, item_session: ItemSession, error: BaseException) -> Actions:
        '''Process an error.

        Counts the error in the statistics, increments the waiter and
        consults the error hook, then sets the item status from the hook's
        action or, failing that, from the kind of error.

        Args:
            item_session: Session whose status is updated.
            error: The error being handled.

        Returns:
            A value from :class:`.hook.Actions`.

        Raises:
            HookStop: If the hook's action is ``STOP``.
            SSLVerificationError: ``error`` itself, when SSL verification
                is enabled and the hook did not return ``RETRY``,
                ``FINISH`` or ``STOP``.
        '''
        if not self._ssl_verification and \
                isinstance(error, SSLVerificationError):
            # Change it into a different error since the user doesn't care
            # about verifying certificates
            self._statistics.increment_error(ProtocolError())
        else:
            self._statistics.increment_error(error)

        self._waiter.increment()

        action = self.consult_error_hook(item_session, error)

        if action == Actions.RETRY:
            item_session.set_status(Status.error)
        elif action == Actions.FINISH:
            item_session.set_status(Status.done)
        elif action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')
        elif self._ssl_verification and isinstance(error, SSLVerificationError):
            # Raise the given error explicitly. A bare ``raise`` depends on
            # the caller being inside an ``except`` block and raises
            # RuntimeError when no exception is currently being handled.
            raise error
        elif isinstance(error, ConnectionRefused) and \
                not self.retry_connrefused:
            # Refused connections are not retried unless configured.
            item_session.set_status(Status.skipped)
        elif isinstance(error, DNSNotFound) and \
                not self.retry_dns_error:
            # DNS failures are not retried unless configured.
            item_session.set_status(Status.skipped)
        else:
            item_session.set_status(Status.error)

        return action
Example #23
0
    def process(self, session: ItemSession):
        '''Run the factory's ``'Processor'`` on the session, then finish it.

        This is a generator that delegates to the processor with
        ``yield from``. After processing, the session is asserted to be
        processed and ``finish()`` is called on it.
        '''
        processor = session.app_session.factory['Processor']
        yield from processor.process(session)

        assert session.is_processed

        session.finish()
Example #24
0
    def process(self, session: ItemSession):
        '''Process the session with the application's processor.

        The processor is looked up as ``'Processor'`` in the app session's
        factory and driven with ``yield from``, which makes this method a
        generator. The session must report ``is_processed`` afterwards and
        is then finished.
        '''
        factory = session.app_session.factory
        delegate = factory['Processor'].process(session)
        yield from delegate

        assert session.is_processed

        session.finish()