Beispiel #1
0
    def rewrite(self, url_info):
        '''Return a rewritten version of the given URL info.

        URL infos whose scheme is neither ``http`` nor ``https`` are
        returned untouched. Otherwise up to two rewrites run in order:

        1. If ``_session_id_enabled`` is set, the URL is rebuilt with its
           path and query passed through ``strip_path_session_id`` and
           ``strip_query_session_id``.
        2. If ``_hash_fragment_enabled`` is set and the fragment begins
           with ``!``, the rest of the fragment is appended to the URL as
           an ``_escaped_fragment_`` query parameter.

        After each step the rewritten URL is given to ``parse_url_or_log``;
        when that yields a falsy result, the URL info from before the step
        is kept.
        '''
        is_web_scheme = url_info.scheme in ('http', 'https')

        if not is_web_scheme:
            return url_info

        if self._session_id_enabled:
            parts = {
                'scheme': url_info.scheme,
                'authority': url_info.authority,
                'path': strip_path_session_id(url_info.path),
                'query': strip_query_session_id(url_info.query),
                'fragment': url_info.fragment,
            }
            stripped_url = \
                '{scheme}://{authority}{path}?{query}#{fragment}'.format(
                    **parts)
            url_info = parse_url_or_log(stripped_url) or url_info

        if self._hash_fragment_enabled and url_info.fragment.startswith('!'):
            # Join with '&' when a query already exists, '?' otherwise.
            if url_info.query:
                template = '{}&_escaped_fragment_={}'
            else:
                template = '{}?_escaped_fragment_={}'

            escaped_url = template.format(url_info.url, url_info.fragment[1:])
            url_info = parse_url_or_log(escaped_url) or url_info

        return url_info
Beispiel #2
0
    def rewrite(self, url_info: URLInfo) -> URLInfo:
        '''Rewrite the given URL.

        Only ``http`` and ``https`` URLs are considered; anything else is
        returned as is. When ``_session_id_enabled`` is set, the URL is
        reassembled with its path and query run through
        ``strip_path_session_id`` and ``strip_query_session_id``. When
        ``_hash_fragment_enabled`` is set and the fragment starts with
        ``!``, the remainder of the fragment is added to the URL as an
        ``_escaped_fragment_`` query parameter.

        A rewritten URL replaces the current URL info only if
        ``parse_url_or_log`` returns a truthy result for it.
        '''
        if url_info.scheme not in ('http', 'https'):
            return url_info

        if self._session_id_enabled:
            scheme, authority = url_info.scheme, url_info.authority
            path = strip_path_session_id(url_info.path)
            query = strip_query_session_id(url_info.query)
            candidate = '{scheme}://{authority}{path}?{query}#{fragment}'.format(
                scheme=scheme,
                authority=authority,
                path=path,
                query=query,
                fragment=url_info.fragment,
            )
            url_info = parse_url_or_log(candidate) or url_info

        wants_escaped_fragment = (
            self._hash_fragment_enabled
            and url_info.fragment.startswith('!')
        )

        if wants_escaped_fragment:
            # An existing query is extended with '&'; otherwise one is
            # started with '?'.
            template = (
                '{}&_escaped_fragment_={}' if url_info.query
                else '{}?_escaped_fragment_={}'
            )
            candidate = template.format(url_info.url, url_info.fragment[1:])
            url_info = parse_url_or_log(candidate) or url_info

        return url_info
Beispiel #3
0
    def check_in(self, url, new_status, *args, **kwargs):
        '''Check a URL in, calling the ``queued_url`` hook on error status.

        When ``new_status`` equals ``Status.error`` and a ``queued_url``
        hook is connected, the queue counter is incremented and, if the
        URL parses, the hook is called with the parsed URL info.

        All arguments are forwarded unchanged to the URL table's
        ``check_in``, whose result is returned.
        '''
        notify_hook = (new_status == Status.error
                       and self.is_hook_connected('queued_url'))

        if notify_hook:
            # The counter is bumped before parsing, so it grows even when
            # the URL turns out to be unparsable.
            self._queue_counter += 1
            parsed_info = parse_url_or_log(url)

            if parsed_info:
                self.call_hook('queued_url', parsed_info)

        return self.url_table.check_in(url, new_status, *args, **kwargs)
Beispiel #4
0
    def check_in(self, url, new_status, increment_try_count=True,
                 url_result=None):
        '''Check a URL in, announcing it as queued on error status.

        When ``new_status`` equals ``Status.error`` the queue counter is
        incremented and, if the URL parses, ``PluginFunctions.queued_url``
        is sent through the event dispatcher with the parsed URL info.

        The call is then forwarded to the URL table's ``check_in`` with
        ``increment_try_count`` and ``url_result`` passed by keyword, and
        its result is returned.
        '''
        if new_status == Status.error:
            # The counter is bumped before parsing, so it grows even when
            # the URL turns out to be unparsable.
            self._queue_counter += 1
            parsed_info = parse_url_or_log(url)

            if parsed_info:
                self.event_dispatcher.notify(
                    PluginFunctions.queued_url, parsed_info)

        table_options = {
            'increment_try_count': increment_try_count,
            'url_result': url_result,
        }
        return self.url_table.check_in(url, new_status, **table_options)
Beispiel #5
0
    def add_many(self, urls):
        '''Add URLs to the URL table and announce each one that was added.

        The URLs are handed to the URL table's ``add_many``. For every
        URL it reports back that also parses, the queue counter is
        incremented and ``PluginFunctions.queued_url`` is sent through the
        event dispatcher with the parsed URL info.

        Returns:
            tuple: The values produced by the URL table's ``add_many``.
        '''
        added_urls = tuple(self.url_table.add_many(urls))

        # Lazy generator: each URL is parsed right before it is announced.
        parsed_infos = (parse_url_or_log(added) for added in added_urls)

        for parsed_info in parsed_infos:
            if not parsed_info:
                continue

            self._queue_counter += 1
            self.event_dispatcher.notify(
                PluginFunctions.queued_url, parsed_info)

        return added_urls
Beispiel #6
0
    def add_many(self, urls, **kwargs):
        '''Add URLs to the URL table, calling ``queued_url`` for added ones.

        The URLs and any keyword arguments are handed to the URL table's
        ``add_many``. If a ``queued_url`` hook is connected, then for each
        URL reported back that also parses, the queue counter is
        incremented and the hook is called with the parsed URL info.

        Returns:
            tuple: The values produced by the URL table's ``add_many``.
        '''
        added_urls = tuple(self.url_table.add_many(urls, **kwargs))

        if not self.is_hook_connected('queued_url'):
            # Nobody is listening: skip parsing and counting entirely.
            return added_urls

        for added_url in added_urls:
            parsed_info = parse_url_or_log(added_url)

            if not parsed_info:
                continue

            self._queue_counter += 1
            self.call_hook('queued_url', parsed_info)

        return added_urls
Beispiel #7
0
    def add_many(self, urls):
        '''Insert URLs into the URL table and report the added ones.

        Every URL returned by the URL table's ``add_many`` is parsed; for
        each one that parses, the queue counter goes up by one and
        ``PluginFunctions.queued_url`` is sent through the event dispatcher
        with the parsed URL info.

        Returns:
            tuple: The values produced by the URL table's ``add_many``.
        '''
        added_urls = tuple(self.url_table.add_many(urls))

        for added_url in added_urls:
            parsed_info = parse_url_or_log(added_url)

            if not parsed_info:
                # Unparsable URLs are neither counted nor announced.
                continue

            self._queue_counter += 1
            self.event_dispatcher.notify(
                PluginFunctions.queued_url, parsed_info)

        return added_urls
Beispiel #8
0
    def add_url(self, url: str, url_properites: Optional[URLProperties]=None,
                url_data: Optional[URLData]=None):
        '''Queue a URL for a batched insert into the URL table.

        Args:
            url: The URL to add. Nothing is queued when it does not parse.
            url_properites: Properties stored with the URL; a fresh
                ``URLProperties`` is used when omitted or falsy. The
                parameter name is misspelled but is kept because callers
                may pass it by keyword.
            url_data: Data stored with the URL; a fresh ``URLData`` is
                used when omitted or falsy.

        The entry is appended to ``_add_url_batch``. Once the batch holds
        1000 or more entries it is passed to the ``URLTable`` from the
        application session's factory and then cleared.
        '''
        if not parse_url_or_log(url):
            return

        if url_properites:
            properties = url_properites
        else:
            properties = URLProperties()

        if not url_data:
            url_data = URLData()

        self._add_url_batch.append(AddURLInfo(url, properties, url_data))

        if len(self._add_url_batch) < 1000:
            return

        url_table = self.app_session.factory['URLTable']
        url_table.add_many(self._add_url_batch)
        self._add_url_batch.clear()
Beispiel #9
0
    def check_in(self,
                 url,
                 new_status,
                 increment_try_count=True,
                 url_result=None):
        '''Check a URL in and announce it as queued when it errored.

        If ``new_status`` equals ``Status.error``, the queue counter is
        incremented; if the URL also parses, ``PluginFunctions.queued_url``
        is sent through the event dispatcher with the parsed URL info.

        Returns:
            The result of the URL table's ``check_in`` called with the
            same URL, status, ``increment_try_count`` and ``url_result``.
        '''
        if new_status == Status.error:
            # Counted before parsing: unparsable URLs still bump the
            # counter but are not announced.
            self._queue_counter += 1
            parsed_info = parse_url_or_log(url)

            if parsed_info:
                dispatcher = self.event_dispatcher
                dispatcher.notify(PluginFunctions.queued_url, parsed_info)

        table_check_in = self.url_table.check_in
        return table_check_in(
            url,
            new_status,
            increment_try_count=increment_try_count,
            url_result=url_result,
        )
Beispiel #10
0
    def _add_listing_links(self, response):
        '''Add links from file listing response.

        Each entry in ``response.files`` is turned into a URL relative to
        the request URL. Entries of type ``dir`` get a trailing slash.
        Entries of type ``file``, ``symlink`` or ``None`` are joined as
        is, except that a symlink is handed to ``_make_symlink`` instead
        of being linked when ``retr_symlinks`` is off. Other types are
        ignored.

        When ``_glob_pattern`` is set, entries whose name does not match
        it (case-sensitively) are skipped, and the current URL record's
        level is reused for child records and for the file URLs.

        URLs accepted by ``_fetch_rule.check_ftp_request`` are added as
        children of the current URL item: those whose path ends with ``/``
        as directories, the rest as files.
        '''
        base_url = response.request.url_info.url
        dir_urls = set()
        file_urls = set()
        level = self._url_item.url_record.level if self._glob_pattern else None

        for entry in response.files:
            if self._glob_pattern and \
                    not fnmatch.fnmatchcase(entry.name, self._glob_pattern):
                continue

            entry_type = entry.type

            if entry_type == 'dir':
                linked_url = urljoin_safe(base_url, entry.name + '/')
            elif entry_type not in ('file', 'symlink', None):
                continue
            elif not self._processor.fetch_params.retr_symlinks and \
                    entry_type == 'symlink':
                # Symlinks are recreated locally rather than fetched.
                self._make_symlink(entry.name, entry.dest)
                continue
            else:
                linked_url = urljoin_safe(base_url, entry.name)

            if not linked_url:
                continue

            linked_url_info = parse_url_or_log(linked_url)

            if not linked_url_info:
                continue

            child_record = self._url_item.child_url_record(
                linked_url_info, level=level)
            check_result = self._fetch_rule.check_ftp_request(
                linked_url_info, child_record)

            # Only the first element of the check result is used here.
            if not check_result[0]:
                continue

            if linked_url_info.path.endswith('/'):
                dir_urls.add(linked_url_info.url)
            else:
                file_urls.add(linked_url_info.url)

        self._url_item.add_child_urls(
            dir_urls, link_type=LinkType.directory)
        self._url_item.add_child_urls(
            file_urls, link_type=LinkType.file, level=level)
Beispiel #11
0
    def _add_hooked_url(self, url_item, new_url_dict):
        '''Process the ``dict`` from the script and add the URLs.

        The required ``url`` key and the optional ``link_type``,
        ``inline``, ``post_data`` and ``replace`` keys are read from
        ``new_url_dict``. Nothing is added when the URL does not parse.
        When ``replace`` is truthy, the URL as given is first removed from
        the URL table of ``url_item``. The parsed URL is then added as a
        child of ``url_item``.
        '''
        url = new_url_dict['url']
        link_type, inline, post_data, replace = (
            new_url_dict.get(key)
            for key in ('link_type', 'inline', 'post_data', 'replace')
        )

        assert url

        url_info = parse_url_or_log(url)

        if not url_info:
            return

        if replace:
            url_item.url_table.remove_one(url)

        url_item.add_child_url(
            url_info.url,
            inline=inline,
            link_type=link_type,
            post_data=post_data,
        )
Beispiel #12
0
    def _add_hooked_url(self, url_item, new_url_dict):
        '''Process the ``dict`` from the script and add the URLs.

        ``new_url_dict`` must contain a non-empty ``url``; ``link_type``,
        ``inline``, ``post_data`` and ``replace`` are optional. A URL that
        does not parse is dropped. Otherwise, if ``replace`` is truthy the
        URL as given is removed from the URL table of ``url_item``, and
        the parsed URL is added as a child of ``url_item`` along with the
        remaining options.
        '''
        requested_url = new_url_dict['url']
        link_type = new_url_dict.get('link_type')
        inline = new_url_dict.get('inline')
        post_data = new_url_dict.get('post_data')
        replace_existing = new_url_dict.get('replace')

        assert requested_url

        parsed_info = parse_url_or_log(requested_url)

        if parsed_info:
            if replace_existing:
                # Removal uses the URL as supplied, not the parsed form.
                url_item.url_table.remove_one(requested_url)

            url_item.add_child_url(
                parsed_info.url, inline=inline, link_type=link_type,
                post_data=post_data)
Beispiel #13
0
    def _process_url_item(self, url_record):
        '''Run the processor on a single URL record.

        Args:
            url_record (:class:`.database.URLRecord`): The record to process.

        If the record's URL does not parse, a ``URLItem`` without URL info
        is created and skipped, and nothing else happens. Otherwise the
        item is passed to :meth:`.processor.BaseProcessor.process`, the URL
        is marked done in the statistics, and ``self.stop()`` is called if
        the statistics report the quota as exceeded.

        Coroutine.
        '''
        assert url_record

        parsed_info = parse_url_or_log(url_record.url)
        url_item = URLItem(self._url_table, parsed_info or None, url_record)

        if not parsed_info:
            url_item.skip()
            return

        begin_message = __(
            'Begin session for {0} {1}.', url_record, url_item.url_info)
        _logger.debug(begin_message)

        yield From(self._processor.process(url_item))

        assert url_item.is_processed

        statistics = self._statistics
        statistics.mark_done(parsed_info)

        if statistics.is_quota_exceeded:
            _logger.debug('Stopping due to quota.')
            self.stop()

        end_message = __(
            'End session for {0} {1}.', url_item.url_record,
            url_item.url_info)
        _logger.debug(end_message)
Beispiel #14
0
    def _process_url_item(self, url_record):
        '''Process an item.

        Args:
            url_record (:class:`.database.URLRecord`): The item to process.

        A record whose URL does not parse is wrapped in a ``URLItem`` with
        no URL info and skipped. Any other record is wrapped together with
        its parsed URL info and handed to
        :meth:`.processor.BaseProcessor.process`. Afterwards the URL is
        marked done in the statistics, and ``self.stop()`` is called when
        the quota is exceeded.

        Coroutine.
        '''
        assert url_record

        url_info = parse_url_or_log(url_record.url)

        if url_info:
            url_item = URLItem(self._url_table, url_info, url_record)
        else:
            # Unparsable URL: record it as skipped and stop here.
            skipped_item = URLItem(self._url_table, None, url_record)
            skipped_item.skip()
            return

        _logger.debug(
            __('Begin session for {0} {1}.', url_record, url_item.url_info)
        )

        yield From(self._processor.process(url_item))

        assert url_item.is_processed

        self._statistics.mark_done(url_info)

        quota_exceeded = self._statistics.is_quota_exceeded

        if quota_exceeded:
            _logger.debug('Stopping due to quota.')
            self.stop()

        _logger.debug(
            __('End session for {0} {1}.',
               url_item.url_record, url_item.url_info)
        )
Beispiel #15
0
 def test_parse_url_or_log(self):
     '''Check the result for a complete URL and for a bare ``http://``.'''
     complete_result = parse_url_or_log('http://example.com')
     self.assertTrue(complete_result)

     bare_scheme_result = parse_url_or_log('http://')
     self.assertFalse(bare_scheme_result)
Beispiel #16
0
 def test_parse_url_or_log(self):
     '''A complete URL gives a truthy result; ``http://`` a falsy one.'''
     cases = (
         ('http://example.com', self.assertTrue),
         ('http://', self.assertFalse),
     )

     for url, check in cases:
         check(parse_url_or_log(url))