def rewrite(self, url_info):
    if url_info.scheme not in ('http', 'https'):
        return url_info

    if self._session_id_enabled:
        url = '{scheme}://{authority}{path}?{query}#{fragment}'.format(
            scheme=url_info.scheme,
            authority=url_info.authority,
            path=strip_path_session_id(url_info.path),
            query=strip_query_session_id(url_info.query),
            fragment=url_info.fragment,
        )
        url_info = parse_url_or_log(url) or url_info

    if self._hash_fragment_enabled and url_info.fragment.startswith('!'):
        if url_info.query:
            url = '{}&_escaped_fragment_={}'.format(
                url_info.url, url_info.fragment[1:])
        else:
            url = '{}?_escaped_fragment_={}'.format(
                url_info.url, url_info.fragment[1:])

        url_info = parse_url_or_log(url) or url_info

    return url_info
def rewrite(self, url_info: URLInfo) -> URLInfo:
    '''Rewrite the given URL.'''
    if url_info.scheme not in ('http', 'https'):
        return url_info

    if self._session_id_enabled:
        url = '{scheme}://{authority}{path}?{query}#{fragment}'.format(
            scheme=url_info.scheme,
            authority=url_info.authority,
            path=strip_path_session_id(url_info.path),
            query=strip_query_session_id(url_info.query),
            fragment=url_info.fragment,
        )
        url_info = parse_url_or_log(url) or url_info

    if self._hash_fragment_enabled and url_info.fragment.startswith('!'):
        if url_info.query:
            url = '{}&_escaped_fragment_={}'.format(
                url_info.url, url_info.fragment[1:])
        else:
            url = '{}?_escaped_fragment_={}'.format(
                url_info.url, url_info.fragment[1:])

        url_info = parse_url_or_log(url) or url_info

    return url_info
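# Usage sketch (an assumption, not taken from the source): the class name,
# constructor arguments, and import paths below are inferred from the
# self._session_id_enabled / self._hash_fragment_enabled flags that
# rewrite() reads, not confirmed by the snippets above.
from wpull.url import URLInfo
from wpull.urlrewrite import URLRewriter

rewriter = URLRewriter(hash_fragment=True, session_id=False)
url_info = URLInfo.parse('http://example.com/page#!profile')
print(rewriter.rewrite(url_info).url)
# A '#!' fragment is rewritten into an _escaped_fragment_ query parameter:
# http://example.com/page?_escaped_fragment_=profile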
def check_in(self, url, new_status, *args, **kwargs):
    if new_status == Status.error and self.is_hook_connected('queued_url'):
        self._queue_counter += 1

        url_info = parse_url_or_log(url)
        if url_info:
            self.call_hook('queued_url', url_info)

    return self.url_table.check_in(url, new_status, *args, **kwargs)
def check_in(self, url, new_status, increment_try_count=True,
             url_result=None):
    if new_status == Status.error:
        self._queue_counter += 1

        url_info = parse_url_or_log(url)
        if url_info:
            self.event_dispatcher.notify(PluginFunctions.queued_url, url_info)

    return self.url_table.check_in(
        url, new_status,
        increment_try_count=increment_try_count,
        url_result=url_result)
def add_many(self, urls):
    added_urls = tuple(self.url_table.add_many(urls))

    for url in added_urls:
        url_info = parse_url_or_log(url)
        if url_info:
            self._queue_counter += 1
            self.event_dispatcher.notify(PluginFunctions.queued_url, url_info)

    return added_urls
def add_many(self, urls, **kwargs):
    added_urls = tuple(self.url_table.add_many(urls, **kwargs))

    # Only pay the cost of re-parsing each URL if a listener is attached.
    if self.is_hook_connected('queued_url'):
        for url in added_urls:
            url_info = parse_url_or_log(url)
            if url_info:
                self._queue_counter += 1
                self.call_hook('queued_url', url_info)

    return added_urls
def add_url(self, url: str, url_properties: Optional[URLProperties]=None,
            url_data: Optional[URLData]=None):
    url_info = parse_url_or_log(url)

    if not url_info:
        return

    url_properties = url_properties or URLProperties()
    url_data = url_data or URLData()
    add_url_info = AddURLInfo(url, url_properties, url_data)

    self._add_url_batch.append(add_url_info)

    # Flush once the batch grows large enough to be worth a bulk insert.
    if len(self._add_url_batch) >= 1000:
        self.app_session.factory['URLTable'].add_many(self._add_url_batch)
        self._add_url_batch.clear()
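# Companion sketch (an assumption, not from the source): add_url() above only
# flushes on every 1000th URL, so a final flush is needed somewhere, e.g. at
# session teardown, or a trailing partial batch would never reach the table.
def _flush_url_batch(self):
    if self._add_url_batch:
        self.app_session.factory['URLTable'].add_many(self._add_url_batch)
        self._add_url_batch.clear()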
def _add_listing_links(self, response):
    '''Add links from file listing response.'''
    base_url = response.request.url_info.url
    dir_urls_to_add = set()
    file_urls_to_add = set()

    if self._glob_pattern:
        level = self._url_item.url_record.level
    else:
        level = None

    for file_entry in response.files:
        if self._glob_pattern and \
                not fnmatch.fnmatchcase(file_entry.name, self._glob_pattern):
            continue

        if file_entry.type == 'dir':
            linked_url = urljoin_safe(base_url, file_entry.name + '/')
        elif file_entry.type in ('file', 'symlink', None):
            if not self._processor.fetch_params.retr_symlinks and \
                    file_entry.type == 'symlink':
                self._make_symlink(file_entry.name, file_entry.dest)
                linked_url = None
            else:
                linked_url = urljoin_safe(base_url, file_entry.name)
        else:
            linked_url = None

        if linked_url:
            linked_url_info = parse_url_or_log(linked_url)

            if linked_url_info:
                linked_url_record = self._url_item.child_url_record(
                    linked_url_info, level=level)

                verdict = self._fetch_rule.check_ftp_request(
                    linked_url_info, linked_url_record)[0]

                if verdict:
                    if linked_url_info.path.endswith('/'):
                        dir_urls_to_add.add(linked_url_info.url)
                    else:
                        file_urls_to_add.add(linked_url_info.url)

    self._url_item.add_child_urls(
        dir_urls_to_add, link_type=LinkType.directory)
    self._url_item.add_child_urls(
        file_urls_to_add, link_type=LinkType.file, level=level)
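# For reference, the glob test above uses the standard library's
# case-sensitive matcher, so '*.TXT' and '*.txt' are distinct patterns:
import fnmatch

assert fnmatch.fnmatchcase('readme.txt', '*.txt')
assert not fnmatch.fnmatchcase('README.TXT', '*.txt')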
def _add_hooked_url(self, url_item, new_url_dict):
    '''Process the ``dict`` from the script and add the URLs.'''
    url = new_url_dict['url']
    link_type = new_url_dict.get('link_type')
    inline = new_url_dict.get('inline')
    post_data = new_url_dict.get('post_data')
    replace = new_url_dict.get('replace')

    assert url

    url_info = parse_url_or_log(url)

    if not url_info:
        return

    kwargs = dict(link_type=link_type, post_data=post_data)

    if replace:
        url_item.url_table.remove_one(url)

    url_item.add_child_url(url_info.url, inline=inline, **kwargs)
def _process_url_item(self, url_record):
    '''Process an item.

    Args:
        url_record (:class:`.database.URLRecord`): The item to process.

    This function calls :meth:`.processor.BaseProcessor.process`.

    Coroutine.
    '''
    assert url_record

    url_info = parse_url_or_log(url_record.url)

    if not url_info:
        url_item = URLItem(self._url_table, None, url_record)
        url_item.skip()
        return

    url_item = URLItem(self._url_table, url_info, url_record)

    _logger.debug(__('Begin session for {0} {1}.',
                     url_record, url_item.url_info))

    yield From(self._processor.process(url_item))

    assert url_item.is_processed

    self._statistics.mark_done(url_info)

    if self._statistics.is_quota_exceeded:
        _logger.debug('Stopping due to quota.')
        self.stop()

    _logger.debug(__('End session for {0} {1}.',
                     url_item.url_record, url_item.url_info))
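# Note: 'yield From(...)' is the trollius/pre-3.5 coroutine idiom; on modern
# Python the same step would read 'await self._processor.process(url_item)'
# inside an 'async def' method.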
def test_parse_url_or_log(self):
    self.assertTrue(parse_url_or_log('http://example.com'))
    self.assertFalse(parse_url_or_log('http://'))
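# Inferred sketch (an assumption, reconstructed from the call sites and the
# test above) of what parse_url_or_log does: return a URLInfo on success,
# log and return None on failure, so callers can use a plain truth test.
import logging

from wpull.url import URLInfo  # assumed import path

_logger = logging.getLogger(__name__)


def parse_url_or_log(url):
    try:
        return URLInfo.parse(url)
    except ValueError as error:
        _logger.warning('Unable to parse URL %s: %s', url, error)
        return None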