def _process_scrape_info(self, scraper: BaseScraper,
                         scrape_result: ScrapeResult,
                         item_session: ItemSession):
    '''Queue the links of a scrape result that pass the fetch filters.

    Each link context is parsed, rewritten and checked against the fetch
    rule's filters; accepted links are added as child URLs of the item.
    The ``scraper`` argument is not used by this method.

    Returns:
        tuple: ``(num_inline, num_linked)`` counts of the accepted links.
    '''
    if not scrape_result:
        return 0, 0

    # Accepted-link counters keyed by whether the link is inline.
    tally = {True: 0, False: 0}

    for context in scrape_result.link_contexts:
        parsed = self.parse_url(context.link)

        if not parsed:
            continue

        rewritten = self.rewrite_url(parsed)
        candidate = item_session.child_url_record(
            rewritten.url, inline=context.inline
        )
        verdict = self._fetch_rule.consult_filters(
            item_session.request.url_info, candidate
        )

        # Only the first element of the filter result decides acceptance.
        if not verdict[0]:
            continue

        tally[bool(context.inline)] += 1
        item_session.add_child_url(
            rewritten.url,
            inline=context.inline,
            link_type=context.link_type
        )

    return tally[True], tally[False]
def _process_scrape_info(self, scraper: BaseScraper,
                         scrape_result: ScrapeResult,
                         item_session: ItemSession):
    '''Add the filtered links from a scrape result as child URLs.

    Links that cannot be parsed, or that the fetch rule's filters reject,
    are skipped. The ``scraper`` argument is accepted but not used here.

    Returns:
        tuple: Number of inline links added, number of linked (non-inline)
        links added.
    '''
    if not scrape_result:
        return 0, 0

    inline_total = 0
    linked_total = 0

    for entry in scrape_result.link_contexts:
        target_info = self.parse_url(entry.link)

        if not target_info:
            continue

        target_info = self.rewrite_url(target_info)
        child_record = item_session.child_url_record(
            target_info.url, inline=entry.inline)
        filter_result = self._fetch_rule.consult_filters(
            item_session.request.url_info, child_record)
        accepted = filter_result[0]

        if not accepted:
            continue

        if entry.inline:
            inline_total += 1
        else:
            linked_total += 1

        item_session.add_child_url(
            target_info.url, inline=entry.inline,
            link_type=entry.link_type)

    return inline_total, linked_total
def my_get_urls(self, item_session: ItemSession):
    '''Queue child URLs produced from a JSON API response.

    The request URL is classified by prefix: a URL starting with
    ``VALKYRIE_URL_PREFIX`` is treated as ``UrlType.VALKYRIE``; a URL
    starting with ``CHIHIRO_URL_PREFIX`` that does not end with
    ``CHIHIRO_IMAGE_URL_SUFFIX`` is treated as ``UrlType.CHIHIRO``.
    Every URL returned by ``self.process_result`` for that type is added
    as a child URL. Any other request URL only produces a log message.
    '''
    the_url = item_session.request.url
    logger.info("get_urls() for url `%s`", the_url)

    # Classify once. The previous nested re-check had an unreachable
    # "unknown prefix" branch, so it is folded into this single chain.
    if the_url.startswith(VALKYRIE_URL_PREFIX):
        the_type = UrlType.VALKYRIE
    elif (the_url.startswith(CHIHIRO_URL_PREFIX)
            and not the_url.endswith(CHIHIRO_IMAGE_URL_SUFFIX)):
        the_type = UrlType.CHIHIRO
    else:
        # not one of the main JSON api urls, don't add any new urls
        logger.info("url doesn't start with JSON api prefix, or had the /image suffix, not adding any new urls")
        return

    for iter_url in self.process_result(the_type, item_session):
        item_session.add_child_url(iter_url)
def new_mock_item_session():
    '''Build an ItemSession whose request points at http://example.com.

    The session is backed by an AppSession created with a
    ``directory_prefix`` of ``/tmp/`` and by the record returned from
    ``new_mock_url_record()``.
    '''
    namespace = argparse.Namespace(directory_prefix='/tmp/')
    session = ItemSession(
        AppSession(None, namespace, None),
        new_mock_url_record(),
    )

    session.request = BaseRequest()
    session.request.url = 'http://example.com'

    return session
def handle_pre_response(self, item_session: ItemSession) -> Actions:
    '''Consult the pre-response hook and apply its verdict to the item.

    ``Actions.RETRY`` marks the item as skipped and ``Actions.FINISH``
    marks it as done. Any other action leaves the status untouched.

    Returns:
        The action produced by the pre-response hook.

    Raises:
        HookStop: If the hook answered ``Actions.STOP``.
    '''
    verdict = self.consult_pre_response_hook(item_session)

    if verdict == Actions.STOP:
        raise HookStop('Script requested immediate stop.')

    if verdict == Actions.RETRY:
        item_session.set_status(Status.skipped)
    elif verdict == Actions.FINISH:
        item_session.set_status(Status.done)

    return verdict
def process(self, item_session: ItemSession):
    '''Delegate the item to the processor registered for its URL scheme.

    This is a generator-based coroutine. When a processor exists for the
    scheme, its ``process`` result is returned. Otherwise a warning is
    logged and the item is skipped.
    '''
    url_scheme = item_session.url_record.url_info.scheme
    delegate = self._processors.get(url_scheme)

    if not delegate:
        _logger.warning(
            _('No processor available to handle {scheme} scheme.'),
            scheme=repr(url_scheme))
        item_session.skip()
        return

    return (yield from delegate.process(item_session))
def process(self, item_session: ItemSession):
    '''Run the scheme-specific processor for the item, if one exists.

    Generator-based coroutine: the value of the chosen processor's
    ``process`` call is passed through as the return value. If no
    processor is registered for the URL scheme, a warning is logged and
    the item is skipped.
    '''
    scheme_name = item_session.url_record.url_info.scheme
    handler = self._processors.get(scheme_name)

    if handler:
        outcome = yield from handler.process(item_session)
        return outcome

    _logger.warning(
        _('No processor available to handle {scheme} scheme.'),
        scheme=repr(scheme_name)
    )
    item_session.skip()
def add_extra_urls(self, item_session: ItemSession):
    '''Add the robots.txt and sitemap.xml URLs for a top-level item.

    Nothing is added unless the item's URL record is at level 0 and
    ``self._sitemaps`` is truthy. Both URLs are built from the scheme and
    ``hostname_with_port`` of the item's URL.
    '''
    url_record = item_session.url_record

    if url_record.level != 0 or not self._sitemaps:
        return

    base_info = url_record.url_info
    origin = '{0}://{1}'.format(
        base_info.scheme, base_info.hostname_with_port)

    # Parse both candidates before queueing anything.
    extra_url_infos = [
        self.parse_url(origin + path)
        for path in ('/robots.txt', '/sitemap.xml')
    ]

    for url_info in extra_url_infos:
        # A falsy parse result is assumed to mean "unparsable" and is
        # skipped instead of failing on ``.url`` (confirm against
        # parse_url, which is not visible from here).
        if not url_info:
            continue

        item_session.add_child_url(url_info.url)
def handle_no_document(self, item_session: ItemSession) -> Actions:
    '''Handle a successful response that carries no useful document.

    The waiter is reset and the generic response handler is run. When
    that handler answers ``Actions.NORMAL`` the item is marked skipped.

    Returns:
        A value from :class:`.hook.Actions`.
    '''
    self._waiter.reset()

    outcome = self.handle_response(item_session)

    if outcome != Actions.NORMAL:
        return outcome

    item_session.set_status(Status.skipped)
    return outcome
def get_urls(self, item_session: ItemSession):
    '''Check the saved response body and queue extra URLs for the root.

    Asserts that the response body has a name, that the name is an
    existing file, and that the request URL is non-empty. When the
    request path is ``/``, a POST URL on localhost (same port) and a
    deliberately malformed URL are added as children.
    '''
    body_path = item_session.response.body.name
    request_info = item_session.request.url_info
    print('get_urls', body_path)

    assert body_path
    assert os.path.isfile(body_path)
    assert request_info.url

    if request_info.path != '/':
        return

    post_url = 'http://localhost:{!s}/post/'.format(request_info.port)
    item_session.add_child_url(
        post_url, inline=True, post_data='text=hello', replace=True)
    item_session.add_child_url('..malformed')
def handle_document(self, item_session: ItemSession,
                    filename: str) -> Actions:
    '''Handle a successful response that produced a document.

    The waiter is reset and the generic response handler is run. When
    that handler answers ``Actions.NORMAL``, the response body size is
    added to the statistics and the item is marked done with
    ``filename``.

    Returns:
        A value from :class:`.hook.Actions`.
    '''
    self._waiter.reset()

    outcome = self.handle_response(item_session)

    if outcome != Actions.NORMAL:
        return outcome

    self._statistics.increment(item_session.response.body.size())
    item_session.set_status(Status.done, filename=filename)

    return outcome
def get_urls(self, item_session: ItemSession):
    '''Validate the downloaded body file and add child URLs for ``/``.

    The body file name is printed, then three assertions check that the
    name is set, that it refers to an existing file, and that the request
    URL is non-empty. For the root path, a localhost POST URL (using the
    request's port) and a malformed URL are queued.
    '''
    saved_name = item_session.response.body.name
    info = item_session.request.url_info
    print('get_urls', saved_name)

    assert saved_name
    assert os.path.isfile(saved_name)
    assert info.url

    at_root = info.path == '/'

    if at_root:
        post_target = ''.join(
            ('http://localhost:', str(info.port), '/post/'))
        item_session.add_child_url(
            post_target,
            inline=True,
            post_data='text=hello',
            replace=True
        )
        item_session.add_child_url('..malformed')
def handle_response(self, item_session: ItemSession) -> Actions:
    '''Consult the response hook and apply its verdict to the item.

    ``Actions.RETRY`` marks the item as an error and ``Actions.FINISH``
    marks it as done. Other actions leave the status unchanged.

    Returns:
        A value from :class:`.hook.Actions`.

    Raises:
        HookStop: If the hook answered ``Actions.STOP``.
    '''
    verdict = self.consult_response_hook(item_session)

    if verdict == Actions.STOP:
        raise HookStop('Script requested immediate stop.')

    if verdict == Actions.RETRY:
        item_session.set_status(Status.error)
    elif verdict == Actions.FINISH:
        item_session.set_status(Status.done)

    return verdict
def handle_document_error(self, item_session: ItemSession) -> Actions:
    '''Handle a document that only describes a server error.

    The waiter is incremented and the ``ServerError`` counter in the
    statistics is bumped before the generic response handler runs. When
    that handler answers ``Actions.NORMAL`` the item is marked as an
    error.

    Returns:
        A value from :class:`.hook.Actions`.
    '''
    self._waiter.increment()

    error_counts = self._statistics.errors
    error_counts[ServerError] += 1

    outcome = self.handle_response(item_session)

    if outcome != Actions.NORMAL:
        return outcome

    item_session.set_status(Status.error)
    return outcome
def add_extra_urls(self, item_session: ItemSession):
    '''Add the robots.txt and sitemap.xml URLs for a top-level item.

    The URLs are only added when the item's URL record is at level 0 and
    ``self._sitemaps`` is truthy. They are built from the scheme and
    ``hostname_with_port`` of the item's URL.
    '''
    if item_session.url_record.level != 0 or not self._sitemaps:
        return

    source_info = item_session.url_record.url_info
    templates = ('{0}://{1}/robots.txt', '{0}://{1}/sitemap.xml')

    # Parse every candidate first so that queueing starts only after all
    # of them have been parsed.
    extra_url_infos = tuple(
        self.parse_url(
            template.format(
                source_info.scheme, source_info.hostname_with_port)
        )
        for template in templates
    )

    for url_info in extra_url_infos:
        # Skip falsy parse results rather than dereferencing ``.url`` on
        # them; presumably parse_url signals an unparsable URL this way
        # (TODO confirm, parse_url is defined outside this block).
        if url_info:
            item_session.add_child_url(url_info.url)
def handle_error(self, item_session: ItemSession,
                 error: BaseException) -> Actions:
    '''Process an error.

    The error is recorded in the statistics, the waiter is incremented
    and the error hook is consulted. The hook's action is applied first;
    if it requests nothing specific, the item status is derived from the
    error type and the retry settings.

    Args:
        item_session: The item whose fetch produced the error.
        error: The exception that occurred.

    Returns:
        A value from :class:`.hook.Actions`.

    Raises:
        HookStop: If the hook answered ``Actions.STOP``.
        SSLVerificationError: ``error`` itself is raised when SSL
            verification is enabled, the error is an SSL verification
            failure, and the hook did not answer RETRY, FINISH or STOP.
    '''
    is_ssl_failure = isinstance(error, SSLVerificationError)

    if not self._ssl_verification and is_ssl_failure:
        # Change it into a different error since the user doesn't care
        # about verifying certificates
        self._statistics.increment_error(ProtocolError())
    else:
        self._statistics.increment_error(error)

    self._waiter.increment()

    action = self.consult_error_hook(item_session, error)

    if action == Actions.RETRY:
        item_session.set_status(Status.error)
    elif action == Actions.FINISH:
        item_session.set_status(Status.done)
    elif action == Actions.STOP:
        raise HookStop('Script requested immediate stop.')
    elif self._ssl_verification and is_ssl_failure:
        # Raise the given error explicitly: a bare ``raise`` only works
        # while the caller is inside an ``except`` block and otherwise
        # fails with RuntimeError.
        raise error
    elif isinstance(error, ConnectionRefused) and \
            not self.retry_connrefused:
        item_session.set_status(Status.skipped)
    elif isinstance(error, DNSNotFound) and \
            not self.retry_dns_error:
        item_session.set_status(Status.skipped)
    else:
        item_session.set_status(Status.error)

    return action
def process(self, session: ItemSession):
    '''Run the application's ``Processor`` on the session, then finish it.

    Generator-based coroutine. The processor is looked up in the app
    session's factory under the key ``'Processor'``. After it completes,
    the session is asserted to be processed and is then finished.
    '''
    processor = session.app_session.factory['Processor']

    yield from processor.process(session)

    assert session.is_processed

    session.finish()