Example #1
    def _scrape_document(self, request, response, url_item):
        to_native = self.to_script_native_type
        url_info_dict = to_native(request.url_info.to_dict())
        document_info_dict = to_native(response.body.to_dict())
        filename = to_native(response.body.content_file.name)

        new_url_dicts = self.callbacks.get_urls(
            filename, url_info_dict, document_info_dict)

        _logger.debug(__('Hooked scrape returned {0}', new_url_dicts))

        if not new_url_dicts:
            return

        if to_native(1) in new_url_dicts:
            # Lua doesn't have sequences
            for i in itertools.count(1):
                new_url_dict = new_url_dicts[to_native(i)]

                _logger.debug(__('Got lua new url info {0}', new_url_dict))

                if new_url_dict is None:
                    break

                self._add_hooked_url(url_item, new_url_dict)
        else:
            for new_url_dict in new_url_dicts:
                self._add_hooked_url(url_item, new_url_dict)
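A standalone sketch of the 1-indexed iteration above: the scripting bridge hands Lua tables to Python as plain mappings keyed from 1, so the loop counts upward until a key is missing. The sample mapping is hypothetical.

import itertools

# Hypothetical mapping standing in for the Lua scraping result.
new_url_dicts = {1: {'url': 'http://example.com/a'},
                 2: {'url': 'http://example.com/b'}}

for i in itertools.count(1):
    new_url_dict = new_url_dicts.get(i)

    if new_url_dict is None:
        break  # end of the Lua-style sequence

    print(new_url_dict['url'])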
Example #2
    def _read_input_urls(cls, session: AppSession, default_scheme='http'):
        '''Read the URLs provided by the user.'''

        url_string_iter = session.args.urls or ()
        # FIXME: url rewriter isn't created yet
        url_rewriter = session.factory.get('URLRewriter')

        if session.args.input_file:
            if session.args.force_html:
                lines = cls._input_file_as_html_links(session)
            else:
                lines = cls._input_file_as_lines(session)

            url_string_iter = itertools.chain(url_string_iter, lines)

        base_url = session.args.base

        for url_string in url_string_iter:
            _logger.debug(__('Parsing URL {0}', url_string))

            if base_url:
                url_string = wpull.url.urljoin(base_url, url_string)

            url_info = wpull.url.URLInfo.parse(
                url_string, default_scheme=default_scheme)

            _logger.debug(__('Parsed URL {0}', url_info))

            if url_rewriter:
                # TODO: this logic should be a hook
                url_info = url_rewriter.rewrite(url_info)
                _logger.debug(__('Rewritten URL {0}', url_info))

            yield url_info
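When a base URL is given, each input URL is joined against it before parsing. A short sketch of that step with the standard library, assuming wpull.url.urljoin follows the usual RFC 3986 join semantics:

from urllib.parse import urljoin

print(urljoin('http://example.com/dir/', 'page.html'))
# http://example.com/dir/page.html
print(urljoin('http://example.com/dir/', '/other.html'))
# http://example.com/other.html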
Example #3
    def resolve_all(self, host, port=0):
        '''Resolve hostname and return a list of results.

        Args:
            host (str): The hostname.
            port (int): The port number.

        Returns:
            list: A list of tuples where each tuple contains the family and
            the socket address. See :meth:`resolve` for the socket address
            format.
        '''
        _logger.debug(__('Lookup address {0} {1}.', host, port))

        host = self._lookup_hook(host, port)
        results = None

        if self._cache:
            results = self._get_cache(host, port, self._family)

        if results is None:
            results = yield From(self._resolve_from_network(host, port))

        if self._cache:
            self._put_cache(host, port, results)

        if not results:
            raise DNSNotFound(
                "DNS resolution for {0} did not return any results."
                .format(repr(host))
            )

        _logger.debug(__('Resolved addresses: {0}.', results))

        raise Return(results)
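The yield From(...) and raise Return(...) calls follow the trollius backport's coroutine convention. A minimal modern-asyncio sketch of the same cache-then-network flow; the cache and exception here are stand-ins:

import asyncio
import socket

class DNSNotFound(Exception):
    pass

_cache = {}
loop = asyncio.get_event_loop()

async def resolve_all(host, port=0):
    results = _cache.get((host, port))

    if results is None:
        infos = await loop.getaddrinfo(host, port, type=socket.SOCK_STREAM)
        # Keep only (family, sockaddr) to match the documented format.
        results = [(info[0], info[4]) for info in infos]
        _cache[(host, port)] = results

    if not results:
        raise DNSNotFound(
            'DNS resolution for {0!r} did not return any results.'
            .format(host))

    return results

print(loop.run_until_complete(resolve_all('localhost', 80)))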
Example #4
    def process(self, session: AppSession):
        self._debug_log_registered_hooks(session)
        internal_plugin_path = get_package_filename(os.path.join('application', 'plugins'))
        plugin_locations = [internal_plugin_path]

        plugin_filenames = []

        if session.args.plugin_script:
            plugin_filenames.append(session.args.plugin_script)

        locator = PluginLocator(plugin_locations, plugin_filenames)

        session.plugin_manager = PluginManager(plugin_locator=locator)
        session.plugin_manager.collectPlugins()

        for plugin_info in session.plugin_manager.getAllPlugins():
            if plugin_info.path.startswith(internal_plugin_path):
                _logger.debug(__(
                    _('Found plugin {name} from {filename}.'),
                    filename=plugin_info.path,
                    name=plugin_info.name
                ))
            else:
                _logger.info(__(
                    _('Found plugin {name} from {filename}.'),
                    filename=plugin_info.path,
                    name=plugin_info.name
                ))

            plugin_info.plugin_object.app_session = session

            if plugin_info.plugin_object.should_activate():
                session.plugin_manager.activatePluginByName(plugin_info.name)
                self._connect_plugin_hooks(session, plugin_info.plugin_object)
Example #5
    def process(self, item_session: ItemSession, request, response, file_writer_session):
        '''Process PhantomJS.

        Coroutine.
        '''
        if response.status_code != 200:
            return

        if not HTMLReader.is_supported(request=request, response=response):
            return

        _logger.debug('Starting PhantomJS processing.')

        self._file_writer_session = file_writer_session

        # FIXME: this is a quick hack for crashes. See #137.
        attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

        for dummy in range(attempts):
            try:
                yield from self._run_driver(item_session, request, response)
            except asyncio.TimeoutError:
                _logger.warning(_('Waiting for page load timed out.'))
                break
            except PhantomJSCrashed as error:
                _logger.exception(__('PhantomJS crashed: {}', error))
            else:
                break
        else:
            _logger.warning(__(
                _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
                url=request.url_info.url
            ))
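The retry policy above relies on Python's for/else: the else suite runs only when the loop finishes without break, meaning every attempt failed. A self-contained sketch of the idiom with a hypothetical fetch callable:

class PhantomJSCrashed(Exception):
    pass

def fetch_with_retries(fetch, attempts=5):
    for dummy in range(attempts):
        try:
            fetch()
        except PhantomJSCrashed as error:
            print('Crashed, retrying:', error)
        else:
            break  # success; skip the else suite
    else:
        print('Failed after', attempts, 'attempts.')

fetch_with_retries(lambda: None)  # succeeds on the first attempt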
Example #6
    def _check_resource_monitor(self):
        if not self._resource_monitor:
            return

        for counter in itertools.count():
            resource_info = self._resource_monitor.check()

            if not resource_info:
                if counter:
                    _logger.info(_('Situation cleared.'))
                break

            if counter % 15 == 0:
                if resource_info.path:
                    _logger.warning(
                        __(_('Low disk space on {path} ({size} free).'),
                           path=resource_info.path,
                           size=wpull.string.format_size(resource_info.free)))
                else:
                    _logger.warning(
                        __(_('Low memory ({size} free).'),
                           size=wpull.string.format_size(resource_info.free)))

                _logger.warning(_('Waiting for operator to clear situation.'))

            yield From(trollius.sleep(60))
Example #7
    def _check_resource_monitor(self):
        if not self._resource_monitor:
            return

        for counter in itertools.count():
            resource_info = self._resource_monitor.check()

            if not resource_info:
                if counter:
                    _logger.info(_('Situation cleared.'))
                break

            if counter % 15 == 0:
                if resource_info.path:
                    _logger.warning(__(
                        _('Low disk space on {path} ({size} free).'),
                        path=resource_info.path,
                        size=wpull.string.format_size(resource_info.free)
                    ))
                else:
                    _logger.warning(__(
                        _('Low memory ({size} free).'),
                        size=wpull.string.format_size(resource_info.free)
                    ))

                _logger.warning(_('Waiting for operator to clear situation.'))

            yield From(trollius.sleep(60))
Example #8
    def _make_socket(self):
        '''Make and wrap the socket with an IOStream.'''
        host, port = self._original_address

        family, self._resolved_address = yield self._resolver.resolve(
            host, port)

        self._socket = socket.socket(family, socket.SOCK_STREAM)

        _logger.debug(__('Socket to {0}/{1}.', family, self._resolved_address))

        if self._params.bind_address:
            _logger.debug(__(
                'Binding socket to {0}', self._params.bind_address
            ))
            self._socket.bind(self._params.bind_address)

        if self._ssl:
            self._io_stream = SSLIOStream(
                self._socket,
                max_buffer_size=self._params.buffer_size,
                rw_timeout=self._params.read_timeout,
                ssl_options=self._params.ssl_options or {},
                server_hostname=host,
            )
        else:
            self._io_stream = IOStream(
                self._socket,
                rw_timeout=self._params.read_timeout,
                max_buffer_size=self._params.buffer_size,
            )

        self._io_stream.set_close_callback(self._stream_closed_callback)
Example #9
    def resolve_all(self, host, port=0):
        '''Resolve hostname and return a list of results.

        Args:
            host (str): The hostname.
            port (int): The port number.

        Returns:
            list: A list of tuples where each tuple contains the family and
            the socket address. See :meth:`resolve` for the socket address
            format.
        '''
        _logger.debug(__('Lookup address {0} {1}.', host, port))

        host = self._lookup_hook(host, port)
        results = None

        if self._cache:
            results = self._get_cache(host, port, self._family)

        if results is None:
            results = yield From(self._resolve_from_network(host, port))

        if self._cache:
            self._put_cache(host, port, results)

        if not results:
            raise DNSNotFound(
                "DNS resolution for {0} did not return any results.".format(
                    repr(host)))

        _logger.debug(__('Resolved addresses: {0}.', results))

        raise Return(results)
Example #10
    def _read_input_urls(cls, session: AppSession, default_scheme='http'):
        '''Read the URLs provided by the user.'''

        url_string_iter = session.args.urls or ()
        # FIXME: url rewriter isn't created yet
        url_rewriter = session.factory.get('URLRewriter')

        if session.args.input_file:
            if session.args.force_html:
                lines = cls._input_file_as_html_links(session)
            else:
                lines = cls._input_file_as_lines(session)

            url_string_iter = itertools.chain(url_string_iter, lines)

        base_url = session.args.base

        for url_string in url_string_iter:
            _logger.debug(__('Parsing URL {0}', url_string))

            if base_url:
                url_string = wpull.url.urljoin(base_url, url_string)

            url_info = wpull.url.URLInfo.parse(url_string,
                                               default_scheme=default_scheme)

            _logger.debug(__('Parsed URL {0}', url_info))

            if url_rewriter:
                # TODO: this logic should be a hook
                url_info = url_rewriter.rewrite(url_info)
                _logger.debug(__('Rewritten URL {0}', url_info))

            yield url_info
Example #11
    def process(self, session: AppSession):
        self._debug_log_registered_hooks(session)
        internal_plugin_path = get_package_filename(
            os.path.join('application', 'plugins'))
        plugin_locations = [internal_plugin_path]

        plugin_filenames = []

        if session.args.plugin_script:
            plugin_filenames.append(session.args.plugin_script)

        locator = PluginLocator(plugin_locations, plugin_filenames)

        session.plugin_manager = PluginManager(plugin_locator=locator)
        session.plugin_manager.collectPlugins()

        for plugin_info in session.plugin_manager.getAllPlugins():
            if plugin_info.path.startswith(internal_plugin_path):
                _logger.debug(
                    __(_('Found plugin {name} from {filename}.'),
                       filename=plugin_info.path,
                       name=plugin_info.name))
            else:
                _logger.info(
                    __(_('Found plugin {name} from {filename}.'),
                       filename=plugin_info.path,
                       name=plugin_info.name))

            plugin_info.plugin_object.app_session = session

            if plugin_info.plugin_object.should_activate():
                session.plugin_manager.activatePluginByName(plugin_info.name)
                self._connect_plugin_hooks(session, plugin_info.plugin_object)
Example #12
    def process(self, url_item, request, response, file_writer_session):
        '''Process PhantomJS.

        Coroutine.
        '''
        if response.status_code != 200:
            return

        if not HTMLReader.is_supported(request=request, response=response):
            return

        _logger.debug('Starting PhantomJS processing.')

        self._file_writer_session = file_writer_session

        # FIXME: this is a quick hack for crashes. See #137.
        attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

        for dummy in range(attempts):
            try:
                yield From(self._run_driver(url_item, request, response))
            except trollius.TimeoutError:
                _logger.warning(_('Waiting for page load timed out.'))
                break
            except PhantomJSCrashed as error:
                _logger.exception(__('PhantomJS crashed: {}', error))
            else:
                break
        else:
            _logger.warning(
                __(_('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
                   url=request.url_info.url))
Example #13
    def snapshot(self, remote, html_path=None, render_path=None):
        '''Take HTML and PDF snapshot.'''
        content = yield remote.eval('page.content')
        url = yield remote.eval('page.url')

        if html_path:
            _logger.debug(__('Saving snapshot to {0}.', html_path))
            dir_path = os.path.abspath(os.path.dirname(html_path))

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

            with open(html_path, 'wb') as out_file:
                out_file.write(content.encode('utf-8'))

            if self._warc_recorder:
                self._add_warc_snapshot(html_path, 'text/html', url)

        if render_path:
            _logger.debug(__('Saving snapshot to {0}.', render_path))
            yield remote.call('page.render', render_path)

            if self._warc_recorder:
                self._add_warc_snapshot(render_path, 'application/pdf', url)

        raise tornado.gen.Return(content)
Example #14
    def _print_stats(cls, stats: Statistics, human_format_speed: bool = True):
        '''Log the final statistics to the user.'''
        time_length = datetime.timedelta(seconds=int(stats.stop_time -
                                                     stats.start_time))
        file_size = wpull.string.format_size(stats.size)

        if stats.bandwidth_meter.num_samples:
            speed = stats.bandwidth_meter.speed()

            if human_format_speed:
                speed_size_str = wpull.string.format_size(speed)
            else:
                speed_size_str = '{:.1f} b'.format(speed * 8)
        else:
            speed_size_str = _('-- B')

        _logger.info(_('FINISHED.'))
        _logger.info(
            __(
                _('Duration: {preformatted_timedelta}. '
                  'Speed: {preformatted_speed_size}/s.'),
                preformatted_timedelta=time_length,
                preformatted_speed_size=speed_size_str,
            ))
        _logger.info(
            __(gettext.ngettext(
                'Downloaded: {num_files} file, {preformatted_file_size}.',
                'Downloaded: {num_files} files, {preformatted_file_size}.',
                stats.files),
               num_files=stats.files,
               preformatted_file_size=file_size))

        if stats.is_quota_exceeded:
            _logger.info(_('Download quota exceeded.'))
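The speed is reported either as a human-readable size per second or as raw bits per second (bytes times 8). A sketch of that branch with a simplified stand-in for wpull.string.format_size:

def format_speed(speed, human_format=True):
    # speed is in bytes per second.
    if human_format:
        return '{:.1f} KiB'.format(speed / 1024)
    return '{:.1f} b'.format(speed * 8)

print(format_speed(2048) + '/s')         # 2.0 KiB/s
print(format_speed(2048, False) + '/s')  # 16384.0 b/s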
Example #15
    def _polling_sleep(cls, resource_monitor, log=False):
        for counter in itertools.count():
            resource_info = resource_monitor.check()

            if not resource_info:
                if log and counter:
                    _logger.info(_('Situation cleared.'))

                break

            if log and counter % 15 == 0:
                if resource_info.path:
                    _logger.warning(__(
                        _('Low disk space on {path} ({size} free).'),
                        path=resource_info.path,
                        size=wpull.string.format_size(resource_info.free)
                    ))
                else:
                    _logger.warning(__(
                        _('Low memory ({size} free).'),
                        size=wpull.string.format_size(resource_info.free)
                    ))

                _logger.warning(_('Waiting for operator to clear situation.'))

            yield from asyncio.sleep(60)
Example #16
    def control(self, remote):
        '''Scroll the page.'''
        num_scrolls = self._num_scrolls

        if self._smart_scroll:
            is_page_dynamic = yield remote.call('isPageDynamic')

            if not is_page_dynamic:
                num_scrolls = 0

        url = yield remote.eval('page.url')
        total_scroll_count = 0

        for scroll_count in range(num_scrolls):
            _logger.debug(__('Scrolling page. Count={0}.', scroll_count))

            pre_scroll_counter_values = remote.resource_counter.values()

            scroll_position = yield remote.eval('page.scrollPosition')
            scroll_position['top'] += self._viewport_size[1]

            yield self.scroll_to(remote, 0, scroll_position['top'])

            total_scroll_count += 1

            self._log_action('wait', self._wait_time)
            yield wpull.async.sleep(self._wait_time)

            post_scroll_counter_values = remote.resource_counter.values()

            _logger.debug(__(
                'Counter values pre={0} post={1}',
                pre_scroll_counter_values,
                post_scroll_counter_values
            ))

            if post_scroll_counter_values == pre_scroll_counter_values \
               and self._smart_scroll:
                break

        for dummy in range(remote.resource_counter.pending):
            if remote.resource_counter.pending:
                self._log_action('wait', self._wait_time)
                yield wpull.async.sleep(self._wait_time)
            else:
                break

        yield self.scroll_to(remote, 0, 0)

        _logger.info(__(
            gettext.ngettext(
                'Scrolled page {num} time.',
                'Scrolled page {num} times.',
                total_scroll_count,
            ), num=total_scroll_count
        ))

        if self._warc_recorder:
            self._add_warc_action_log(url)
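Scrolling stops early when the resource counters do not change between scrolls, which signals that the page has stopped loading new resources. A compact sketch of that stop condition; both callables are hypothetical stand-ins for the remote PhantomJS calls:

def scroll_until_idle(get_counters, scroll, max_scrolls=10):
    total_scroll_count = 0

    for dummy in range(max_scrolls):
        pre_scroll = get_counters()
        scroll()
        total_scroll_count += 1

        if get_counters() == pre_scroll:
            break  # nothing new loaded; the page is idle

    return total_scroll_count

print(scroll_until_idle(lambda: 0, lambda: None))  # 1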
Example #17
    def run(self):
        scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html')
        action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
        event_log_path = self._get_temp_path('phantom-event', suffix='.txt')
        snapshot_paths = [scrape_snapshot_path]
        snapshot_paths.extend(self._get_snapshot_paths())
        url = self._item_session.url_record.url

        driver_params = PhantomJSDriverParams(
            url=url,
            snapshot_paths=snapshot_paths,
            wait_time=self._params.wait_time,
            num_scrolls=self._params.num_scrolls,
            smart_scroll=self._params.smart_scroll,
            snapshot=self._params.snapshot,
            viewport_size=self._params.viewport_size,
            paper_size=self._params.paper_size,
            event_log_filename=event_log_path,
            action_log_filename=action_log_path,
            custom_headers=self._params.custom_headers,
            page_settings=self._params.page_settings,
        )

        driver = self._phantomjs_driver_factory(params=driver_params)

        _logger.info(__(
            _('PhantomJS fetching ‘{url}’.'),
            url=url
        ))

        with contextlib.closing(driver):
            yield from driver.start()

            # FIXME: we don't account that things might be scrolling and
            # downloading so it might not be a good idea to timeout like
            # this
            if self._params.load_time:
                yield from asyncio.wait_for(
                    driver.process.wait(), self._params.load_time
                )
            else:
                yield from driver.process.wait()

            if driver.process.returncode != 0:
                raise PhantomJSCrashed(
                    'PhantomJS exited with code {}'
                    .format(driver.process.returncode)
                )

        if self._warc_recorder:
            self._add_warc_action_log(action_log_path, url)
            for path in snapshot_paths:
                self._add_warc_snapshot(path, url)

        _logger.info(__(
            _('PhantomJS fetched ‘{url}’.'),
            url=url
        ))
Example #18
    def write_record(self, record):
        '''Append the record to the WARC file.'''
        # FIXME: probably not a good idea to modify arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug(__('Writing WARC record {0}.',
                         record.fields['WARC-Type']))

        if self._params.compress:
            open_func = gzip.GzipFile
        else:
            open_func = open

        # Use getsize to get actual file size. Avoid tell() because it may
        # not be the raw file position.
        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        journal_filename = self._warc_filename + '-wpullinc'

        with open(journal_filename, 'w') as file:
            file.write('wpull-journal-version:1\n')
            file.write('offset:{}\n'.format(before_offset))

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError) as error:
            _logger.info(__(
                _('Rolling back file {filename} to length {length}.'),
                filename=self._warc_filename, length=before_offset
            ))
            # Reopen without truncating and cut the file back to its
            # pre-append length.
            with open(self._warc_filename, mode='r+b') as out_file:
                out_file.truncate(before_offset)

            raise error
        finally:
            os.remove(journal_filename)

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(
                record, raw_file_record_size, raw_file_offset
            )
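Before appending, the writer records the known-good file length in a journal so that a crashed or failed append can be rolled back. A minimal sketch of the journal-then-rollback pattern with hypothetical filenames:

import os

def append_with_journal(filename, data):
    if os.path.exists(filename):
        before_offset = os.path.getsize(filename)
    else:
        before_offset = 0

    journal_filename = filename + '-journal'

    with open(journal_filename, 'w') as journal:
        journal.write('offset:{}\n'.format(before_offset))

    try:
        with open(filename, 'ab') as out_file:
            out_file.write(data)
    except OSError:
        # Cut the file back to its pre-append length.
        with open(filename, 'r+b') as out_file:
            out_file.truncate(before_offset)
        raise
    finally:
        os.remove(journal_filename)

append_with_journal('demo.dat', b'record\n')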
Example #19
    def write_record(self, record):
        '''Append the record to the WARC file.'''
        # FIXME: probably not a good idea to modify arguments passed to us
        # TODO: add extra gzip headers that wget uses
        record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
            WARCRecord.WARC_RECORD_ID]

        _logger.debug(
            __('Writing WARC record {0}.', record.fields['WARC-Type']))

        if self._params.compress:
            open_func = gzip.GzipFile
        else:
            open_func = open

        # Use getsize to get actual file size. Avoid tell() because it may
        # not be the raw file position.
        if os.path.exists(self._warc_filename):
            before_offset = os.path.getsize(self._warc_filename)
        else:
            before_offset = 0

        journal_filename = self._warc_filename + '-wpullinc'

        with open(journal_filename, 'w') as file:
            file.write('wpull-journal-version:1\n')
            file.write('offset:{}\n'.format(before_offset))

        try:
            with open_func(self._warc_filename, mode='ab') as out_file:
                for data in record:
                    out_file.write(data)
        except (OSError, IOError) as error:
            _logger.info(
                __(_('Rolling back file {filename} to length {length}.'),
                   filename=self._warc_filename,
                   length=before_offset))
            # Reopen without truncating and cut the file back to its
            # pre-append length.
            with open(self._warc_filename, mode='r+b') as out_file:
                out_file.truncate(before_offset)

            raise error
        finally:
            os.remove(journal_filename)

        after_offset = os.path.getsize(self._warc_filename)

        if self._cdx_filename:
            raw_file_offset = before_offset
            raw_file_record_size = after_offset - before_offset

            self._write_cdx_field(record, raw_file_record_size,
                                  raw_file_offset)
Example #20
    def _load_ca_certs(cls, session: AppSession, clean: bool=True):
        '''Load the Certificate Authority certificates.
        '''
        args = session.args

        if session.ca_certs_filename:
            return session.ca_certs_filename

        certs = set()

        if args.use_internal_ca_certs:
            pem_filename = os.path.join(
                os.path.dirname(__file__), '..', '..', 'cert', 'ca-bundle.pem'
            )
            certs.update(cls._read_pem_file(pem_filename, from_package=True))

        if args.ca_directory:
            if os.path.isdir(args.ca_directory):
                for filename in os.listdir(args.ca_directory):
                    # listdir returns bare names; join with the directory
                    # before checking the path.
                    filename = os.path.join(args.ca_directory, filename)

                    if os.path.isfile(filename):
                        certs.update(cls._read_pem_file(filename))
            else:
                _logger.warning(__(
                    _('Certificate directory {path} does not exist.'),
                    path=args.ca_directory
                ))

        if args.ca_certificate:
            if os.path.isfile(args.ca_certificate):
                certs.update(cls._read_pem_file(args.ca_certificate))
            else:
                _logger.warning(__(
                    _('Certificate file {path} does not exist.'),
                    path=args.ca_certificate
                ))

        file_descriptor, certs_filename = tempfile.mkstemp(
            suffix='.pem', prefix='tmp-wpull-')
        os.close(file_descriptor)
        session.ca_certs_filename = certs_filename

        def clean_certs_file():
            os.remove(certs_filename)

        if clean:
            atexit.register(clean_certs_file)

        with open(certs_filename, 'w+b') as certs_file:
            for cert in certs:
                certs_file.write(cert)

        _logger.debug('CA certs loaded.')
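The certificates are concatenated into a temporary PEM bundle that is removed at exit. A short sketch of that handling; tempfile.mkstemp also returns an open file descriptor, which is closed explicitly here:

import atexit
import os
import tempfile

file_descriptor, certs_filename = tempfile.mkstemp(
    suffix='.pem', prefix='tmp-wpull-')
os.close(file_descriptor)

atexit.register(os.remove, certs_filename)

with open(certs_filename, 'w+b') as certs_file:
    certs_file.write(b'-----BEGIN CERTIFICATE-----\n')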
Example #21
    def run(self):
        scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html')
        action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
        event_log_path = self._get_temp_path('phantom-event', suffix='.txt')
        snapshot_paths = [scrape_snapshot_path]
        snapshot_paths.extend(self._get_snapshot_paths())
        url = self._url_item.url_record.url

        driver_params = PhantomJSDriverParams(
            url=url,
            snapshot_paths=snapshot_paths,
            wait_time=self._params.wait_time,
            num_scrolls=self._params.num_scrolls,
            smart_scroll=self._params.smart_scroll,
            snapshot=self._params.snapshot,
            viewport_size=self._params.viewport_size,
            paper_size=self._params.paper_size,
            event_log_filename=event_log_path,
            action_log_filename=action_log_path,
            custom_headers=self._params.custom_headers,
            page_settings=self._params.page_settings,
        )

        driver = self._phantomjs_driver_factory(params=driver_params)

        _logger.info(__(_('PhantomJS fetching ‘{url}’.'), url=url))

        with contextlib.closing(driver):
            yield From(driver.start())

            # FIXME: we don't account that things might be scrolling and
            # downloading so it might not be a good idea to timeout like
            # this
            if self._params.load_time:
                yield From(
                    trollius.wait_for(driver.process.wait(),
                                      self._params.load_time))
            else:
                yield From(driver.process.wait())

            if driver.process.returncode != 0:
                raise PhantomJSCrashed('PhantomJS exited with code {}'.format(
                    driver.process.returncode))

        if self._warc_recorder:
            self._add_warc_action_log(action_log_path, url)
            for path in snapshot_paths:
                self._add_warc_snapshot(path, url)

        _logger.info(__(_('PhantomJS fetched ‘{url}’.'), url=url))
Example #22
    def _read_content(self, response, original_url_info):
        '''Read response and parse the contents into the pool.'''
        data = response.body.read(4096)
        url_info = original_url_info

        try:
            self._robots_txt_pool.load_robots_txt(url_info, data)
        except ValueError:
            _logger.warning(
                __(
                    _('Failed to parse {url} for robots exclusion rules. '
                      'Ignoring.'), url_info.url))
            self._accept_as_blank(url_info)
        else:
            _logger.debug(__('Got a good robots.txt for {0}.', url_info.url))
Example #23
    def _process_url_item(self, url_item):
        '''Process an item.

        Args:
            url_item (:class:`.item.URLItem`): The item to process.

        This function calls :meth:`.processor.BaseProcessor.process`.
        '''
        _logger.debug(__('Begin session for {0} {1}.',
                         url_item.url_record, url_item.url_info))

        yield self._processor.process(url_item)

        _logger.debug(__('End session for {0} {1}.',
                         url_item.url_record, url_item.url_info))
Example #24
    def _read_content(self, response, original_url_info):
        '''Read response and parse the contents into the pool.'''
        data = response.body.read(4096)
        url_info = original_url_info

        try:
            self._robots_txt_pool.load_robots_txt(url_info, data)
        except ValueError:
            _logger.warning(__(
                _('Failed to parse {url} for robots exclusion rules. '
                  'Ignoring.'), url_info.url))
            self._accept_as_blank(url_info)
        else:
            _logger.debug(__('Got a good robots.txt for {0}.',
                             url_info.url))
Example #25
    def _write_warc_metadata(self):
        '''Write the JSON metadata to WARC.

        Uses pywb spec.
        '''
        uri = 'metadata://{}{}'.format(
            self._item_session.url_record.url_info.authority,
            self._item_session.url_record.url_info.resource)

        glob_pattern = self._path_prefix + '*.info.json'
        filenames = list(glob.glob(glob_pattern))

        if not filenames:
            _logger.warning(__(
                _('Could not find external process metadata file: '
                  '{filename}'),
                filename=glob_pattern))
            return

        for filename in filenames:
            record = WARCRecord()
            record.set_common_fields(
                'metadata', 'application/vnd.youtube-dl_formats+json')
            record.fields['WARC-Target-URI'] = uri
            record.block_file = open(filename, 'rb')

            self._warc_recorder.set_length_and_maybe_checksums(record)
            self._warc_recorder.write_record(record)

            record.block_file.close()
Example #26
    def _scrape_document(self, request, response):
        '''Scrape the document for URLs.'''
        demux_info = self._processor.instances\
            .document_scraper.scrape_info(request, response)

        num_inline_urls = 0
        num_linked_urls = 0

        for scraper, scrape_info in demux_info.items():
            new_inline, new_linked = self._process_scrape_info(
                scraper, scrape_info
            )
            num_inline_urls += new_inline
            num_linked_urls += new_linked

        _logger.debug(__('Found URLs: inline={0} linked={1}',
                         num_inline_urls, num_linked_urls
                         ))

        try:
            self._processor.call_hook(
                'scrape_document', request, response, self._url_item
            )
        except HookDisconnected:
            pass
Example #27
    def close(self):
        '''Close all the Host Connection Pools and remove them.'''
        for key in self._pools:
            _logger.debug(__('Closing pool for {0}.', key))
            self._pools[key].close()

        self._pools.clear()
Example #28
    def _process_robots(self):
        '''Process robots.txt.

        Coroutine.
        '''
        try:
            request = self._new_initial_request(with_body=False)
            verdict = (yield From(
                self._should_fetch_reason_with_robots(
                    request, self._url_item.url_record)))[0]
        except REMOTE_ERRORS as error:
            _logger.error(
                __(_('Fetching robots.txt for ‘{url}’ '
                     'encountered an error: {error}'),
                   url=self._next_url_info.url,
                   error=error))
            self._result_rule.handle_error(request, error, self._url_item)

            wait_time = self._result_rule.get_wait_time(
                request, self._url_item.url_record, error=error)

            if wait_time:
                _logger.debug('Sleeping {0}.'.format(wait_time))
                yield From(trollius.sleep(wait_time))

            raise Return(False)
        else:
            if not verdict:
                self._url_item.skip()
                raise Return(False)

        raise Return(True)
Example #29
    def _stream_closed_callback(self):
        _logger.debug(__(
            'Stream closed. active={0} connected={1} closed={2}',
            self._active,
            self.connected,
            self._io_stream.closed(),
        ))
Example #30
    def _connect(self):
        '''Connect the socket if not already connected.'''
        if self.connected:
            # Reset the callback so the context does not leak to another
            self._io_stream.set_close_callback(self._stream_closed_callback)
            return

        yield self._make_socket()

        _logger.debug(__('Connecting to {0}.', self._resolved_address))
        try:
            yield self._io_stream.connect(
                self._resolved_address, timeout=self._params.connect_timeout
            )
        except (tornado.netutil.SSLCertificateError,
                SSLVerficationError) as error:
            raise SSLVerficationError('Certificate error: {error}'.format(
                error=error)) from error
        except (ssl.SSLError, socket.error) as error:
            if error.errno == errno.ECONNREFUSED:
                raise ConnectionRefused('Connection refused: {error}'.format(
                    error=error)) from error
            else:
                raise NetworkError('Connection error: {error}'.format(
                    error=error)) from error
        else:
            _logger.debug('Connected.')
Example #31
    def _handle_error(self, request, url_item, error):
        url_info_dict = self.to_script_native_type(
            request.url_info.to_dict()
        )
        url_record_dict = self.to_script_native_type(
            url_item.url_record.to_dict()
        )
        error_info_dict = self.to_script_native_type({
            'error': error.__class__.__name__,
        })
        action = self.callbacks.call_handle_error(
            url_info_dict, url_record_dict, error_info_dict
        )

        _logger.debug(__('Hooked error returned {0}', action))

        if action == Actions.NORMAL:
            return 'normal'
        elif action == Actions.RETRY:
            return False
        elif action == Actions.FINISH:
            url_item.set_status(Status.done)
            return True
        elif action == Actions.STOP:
            raise HookStop('Script requested immediate stop.')
        else:
            raise NotImplementedError()
Example #32
    def _server_end_response_callback(self, response: Response):
        '''Response callback handler.'''
        request = self._item_session.request
        response = self._item_session.response

        _logger.info(
            __(
                _('Fetched ‘{url}’: {status_code} {reason}. '
                  'Length: {content_length} [{content_type}].'),
                url=request.url,
                status_code=response.status_code,
                reason=wpull.string.printable_str(response.reason),
                content_length=wpull.string.printable_str(
                    response.fields.get('Content-Length', _('none'))),
                content_type=wpull.string.printable_str(
                    response.fields.get('Content-Type', _('none'))),
            ))

        self._result_rule.handle_response(self._item_session)

        if response.status_code in WebProcessor.DOCUMENT_STATUS_CODES:
            filename = self._file_writer_session.save_document(response)
            self._processing_rule.scrape_document(self._item_session)
            self._result_rule.handle_document(self._item_session, filename)

        elif response.status_code in WebProcessor.NO_DOCUMENT_STATUS_CODES:
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_no_document(self._item_session)
        else:
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_document_error(self._item_session)
Example #33
    def _get_next_url_record(self):
        '''Return the next available URL from the URL table.

        This function will return items marked as "todo" and then items
        marked as "error". As a consequence, items experiencing errors will
        be done last.

        Returns:
            :class:`.item.URLRecord`.
        '''
        _logger.debug('Get next URL todo.')

        try:
            url_record = self._url_table.check_out(Status.todo)
        except NotFound:
            url_record = None

        if not url_record:
            try:
                _logger.debug('Get next URL error.')
                url_record = self._url_table.check_out(Status.error)
            except NotFound:
                url_record = None

        _logger.debug(__('Return record {0}.', url_record))

        return url_record
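Checking out "todo" items first and falling back to "error" items is what defers failed URLs to the end of the crawl. A self-contained sketch of the fallback; the table and NotFound are stand-ins:

class NotFound(Exception):
    pass

class URLTable:
    def __init__(self, items):
        self._items = items

    def check_out(self, status):
        if not self._items.get(status):
            raise NotFound(status)
        return self._items[status].pop(0)

url_table = URLTable({'todo': [], 'error': ['http://example.com/']})

try:
    url_record = url_table.check_out('todo')
except NotFound:
    url_record = None

if not url_record:
    try:
        url_record = url_table.check_out('error')
    except NotFound:
        url_record = None

print(url_record)  # http://example.com/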
Example #34
    def resolve(self, host, port=0):
        '''Resolve hostname and return the first result.

        Args:
            host (str): The hostname.
            port (int): The port number.

        Returns:
            tuple: A tuple of length 2 where the first item is the family
            and the second item is a socket address that can be passed
            to :func:`socket.connect`.

            Typically in a socket address, the first item is the IP
            address and the second item is the port number. Note that
            an IPv6 socket address may contain more than 2 items.
        '''
        results = yield From(self.resolve_all(host, port))

        if self._rotate:
            result = random.choice(results)
        else:
            result = results[0]

        family, address = result
        _logger.debug(__('Selected {0} as address.', address))

        assert '.' in address[0] or ':' in address[0], \
            ('Resolve did not return numerical address. Got {}.'
             .format(address[0]))

        raise Return((family, address))
Example #35
    def _run_worker(self):
        '''Run a single consumer.

        Coroutine.
        '''
        _logger.debug('Worker start.')

        while True:
            priority, item = yield From(self._item_queue.get())

            if item == self.POISON_PILL:
                _logger.debug('Worker quitting.')
                return

            else:
                _logger.debug(__('Processing item {0}.', item))
                self._item_get_semaphore.release()
                self._token_queue.get_nowait()
                yield From(self._process_item(item))
                self._token_queue.task_done()

                if os.environ.get('OBJGRAPH_DEBUG'):
                    import gc
                    import objgraph
                    gc.collect()
                    objgraph.show_most_common_types(25)
                if os.environ.get('FILE_LEAK_DEBUG'):
                    import subprocess
                    output = subprocess.check_output(
                        ['lsof', '-p', str(os.getpid()), '-n'])
                    for line in output.decode('ascii', 'replace').split('\n'):
                        if 'REG' in line and \
                                (os.getcwd() in line or '/tmp/' in line):
                            print('FILELEAK', line)
Example #36
    def scrape(self, request, response, link_type=None):
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.sitemap:
            return

        base_url = request.url_info.url
        encoding = self._encoding_override \
            or detect_response_encoding(response)
        link_contexts = set()

        try:
            with wpull.util.reset_file_offset(response.body):
                link_iter = self.iter_processed_links(response.body, encoding,
                                                      base_url)
                for link in link_iter:
                    link_contexts.add(LinkContext(link, linked=True))

        except (UnicodeError, self._html_parser.parser_error) as error:
            _logger.warning(
                __(_('Failed to read document at ‘{url}’: {error}'),
                   url=request.url_info.url,
                   error=error))

        return ScrapeResult(link_contexts, encoding)
Example #37
    def _handle_response(self, request, response, url_item):
        url_info_dict = self.to_script_native_type(
            request.url_info.to_dict()
        )
        url_record_dict = self.to_script_native_type(
            url_item.url_record.to_dict()
        )
        response_info_dict = self.to_script_native_type(response.to_dict())
        action = self.callbacks.call_handle_response(
            url_info_dict, url_record_dict, response_info_dict
        )

        _logger.debug(__('Hooked response returned {0}', action))

        if action == Actions.NORMAL:
            return 'normal'
        elif action == Actions.RETRY:
            return False
        elif action == Actions.FINISH:
            url_item.set_status(Status.done)
            return True
        elif action == Actions.STOP:
            raise HookStop()
        else:
            raise NotImplementedError()
Example #38
    def _server_end_response_callback(self, response: Response):
        '''Response callback handler.'''
        request = self._item_session.request
        response = self._item_session.response

        _logger.info(__(
            _('Fetched ‘{url}’: {status_code} {reason}. '
              'Length: {content_length} [{content_type}].'),
            url=request.url,
            status_code=response.status_code,
            reason=wpull.string.printable_str(response.reason),
            content_length=wpull.string.printable_str(
                response.fields.get('Content-Length', _('none'))),
            content_type=wpull.string.printable_str(
                response.fields.get('Content-Type', _('none'))),
        ))

        self._result_rule.handle_response(self._item_session)

        if response.status_code in WebProcessor.DOCUMENT_STATUS_CODES:
            filename = self._file_writer_session.save_document(response)
            self._processing_rule.scrape_document(self._item_session)
            self._result_rule.handle_document(self._item_session, filename)

        elif response.status_code in WebProcessor.NO_DOCUMENT_STATUS_CODES:
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_no_document(self._item_session)
        else:
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_document_error(self._item_session)
Example #39
    def _build_cookie_jar(cls, session: AppSession):
        '''Build the cookie jar'''

        if not session.args.cookies:
            return

        if session.args.load_cookies or session.args.save_cookies:
            session.factory.set('CookieJar', BetterMozillaCookieJar)

            cookie_jar = session.factory.new('CookieJar')

            if session.args.load_cookies:
                cookie_jar.load(session.args.load_cookies, ignore_discard=True)
        else:
            cookie_jar = session.factory.new('CookieJar')

        policy = session.factory.new('CookiePolicy', cookie_jar=cookie_jar)

        cookie_jar.set_policy(policy)

        _logger.debug(__('Loaded cookies: {0}', list(cookie_jar)))

        cookie_jar_wrapper = session.factory.new(
            'CookieJarWrapper',
            cookie_jar,
            save_filename=session.args.save_cookies,
            keep_session_cookies=session.args.keep_session_cookies,
        )

        return cookie_jar_wrapper
Example #40
    def process_one(self, _worker_id=None):
        item = yield from self._item_queue.get()

        if item == POISON_PILL:
            return item

        _logger.debug(__('Worker id {} Processing item {}', _worker_id, item))

        for task in self._tasks:
            yield from task.process(item)

        _logger.debug(__('Worker id {} Processed item {}', _worker_id, item))

        yield from self._item_queue.item_done()

        return item
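Workers stop when they pull the poison-pill sentinel off the queue. A minimal asyncio sketch of the pattern with stand-in names:

import asyncio

POISON_PILL = object()

async def process_one(item_queue, worker_id):
    item = await item_queue.get()

    if item is POISON_PILL:
        return item

    print('Worker', worker_id, 'processing', item)
    item_queue.task_done()
    return item

async def main():
    queue = asyncio.Queue()

    for item in ('a', 'b', POISON_PILL):
        queue.put_nowait(item)

    while (await process_one(queue, worker_id=1)) is not POISON_PILL:
        pass

asyncio.get_event_loop().run_until_complete(main())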
Example #41
    def set_status(self, status: Status, increment_try_count: bool=True,
                   filename: str=None):
        '''Mark the item with the given status.

        Args:
            status: a value from :class:`Status`.
            increment_try_count: if True, increment the ``try_count``
                value
        '''
        url = self.url_record.url
        assert not self._try_count_incremented, (url, status)

        if increment_try_count:
            self._try_count_incremented = True

        _logger.debug(__('Marking URL {0} status {1}.', url, status))

        url_result = URLResult()
        url_result.filename = filename

        self.app_session.factory['URLTable'].check_in(
            url,
            status,
            increment_try_count=increment_try_count,
            url_result=url_result,
        )

        self._processed = True
Example #42
    def _read_response_by_length(self, response):
        '''Read a response body whose size is given by Content-Length.'''
        _logger.debug('Reading body by length.')

        try:
            body_size = int(response.fields['Content-Length'])

            if body_size < 0:
                raise ValueError('Content length cannot be negative.')

        except ValueError as error:
            _logger.warning(__(
                _('Invalid content length: {error}'), error=error
            ))

            yield self._read_response_until_close(response)
            return

        def callback(data):
            self._events.response_data.fire(data)
            response.body.content_file.write(self._decompress_data(data))

        yield self._io_stream.read_bytes(
            body_size, streaming_callback=callback,
        )

        response.body.content_file.write(self._flush_decompressor())
Example #43
    def scrape_document(self, request, response, url_item):
        '''Process document for links.'''
        try:
            self.call_hook('scrape_document', request, response, url_item)
        except HookDisconnected:
            pass

        if not self._document_scraper:
            return

        demux_info = self._document_scraper.scrape_info(
            request, response, url_item.url_record.link_type)

        num_inline_urls = 0
        num_linked_urls = 0

        for scraper, scrape_result in demux_info.items():
            new_inline, new_linked = self._process_scrape_info(
                scraper, scrape_result, url_item)
            num_inline_urls += new_inline
            num_linked_urls += new_linked

        _logger.debug(
            __('Candidate URLs: inline={0} linked={1}', num_inline_urls,
               num_linked_urls))
Example #44
    def _warn_unsafe_options(cls, args):
        '''Print warnings about any enabled hazardous options.

        This function will print messages complaining about:

        * ``--save-headers``
        * ``--no-iri``
        * ``--output-document``
        * ``--ignore-fatal-errors``
        '''
        enabled_options = []

        for option_name in cls.UNSAFE_OPTIONS:
            if getattr(args, option_name):
                enabled_options.append(option_name)

        if enabled_options:
            _logger.warning(__(
                _('The following unsafe options are enabled: {list}.'),
                list=enabled_options
            ))
            _logger.warning(
                _('The use of unsafe options may lead to unexpected behavior '
                  'or file corruption.'))

        if not args.retr_symlinks:
            _logger.warning(
                _('The --retr-symlinks=off option is a security risk.')
            )
Example #45
    def scrape(self, request, response, link_type=None):
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.css:
            return

        link_contexts = set()
        base_url = request.url_info.url
        encoding = self._encoding_override or \
            detect_response_encoding(response)

        try:
            with wpull.util.reset_file_offset(response.body):
                for link, context in self.iter_processed_links(
                        response.body, encoding, base_url, context=True):
                    if context == 'import':
                        link_type = LinkType.css
                    else:
                        link_type = LinkType.media

                    link_contexts.add(LinkContext(
                        link, inline=True, link_type=link_type))

        except UnicodeError as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(link_contexts, encoding)
Example #46
    def scrape(self, request, response, link_type=None):
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.html:
            return

        base_url = request.url_info.url
        content_file = response.body
        encoding = self._encoding_override \
            or detect_response_encoding(response, is_html=True)
        link_contexts = set()

        try:
            with wpull.util.reset_file_offset(content_file):
                elements = self.iter_elements(content_file, encoding=encoding)

                result_meta_info = self._process_elements(
                    elements, response, base_url, link_contexts
                )

        except (UnicodeError, self._html_parser.parser_error) as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))
            result_meta_info = {}

        if result_meta_info.get('robots_no_follow'):
            # Drop followed links when the document sets robots nofollow.
            link_contexts = {
                context for context in link_contexts
                if not context.linked
            }

        scrape_result = ScrapeResult(link_contexts, encoding)
        scrape_result['base_url'] = base_url
        return scrape_result
Example #47
    def _start_new_warc_file(self, meta=False):
        if self._params.max_size is None:
            sequence_name = ''
        elif meta:
            sequence_name = '-meta'
        else:
            sequence_name = '-{0:05d}'.format(self._sequence_num)

        if self._params.compress:
            extension = 'warc.gz'
        else:
            extension = 'warc'

        self._warc_filename = '{0}{1}.{2}'.format(
            self._prefix_filename, sequence_name, extension
        )

        _logger.debug(__('WARC file at {0}', self._warc_filename))

        if not self._params.appending:
            wpull.util.truncate_file(self._warc_filename)

        self._warcinfo_record = WARCRecord()
        self._populate_warcinfo(self._params.extra_fields)
        self.write_record(self._warcinfo_record)
Example #48
    def connect(self):
        '''Establish a connection.'''
        _logger.debug(__('Connecting to {0}.', self._address))

        if self._state != ConnectionState.ready:
            raise Exception('Closed connection must be reset before reusing.')

        if self._sock:
            connection_future = asyncio.open_connection(
                sock=self._sock, **self._connection_kwargs())
        else:
            # TODO: maybe we don't want to ignore flow-info and scope-id?
            host = self._address[0]
            port = self._address[1]

            connection_future = asyncio.open_connection(
                host, port, **self._connection_kwargs())

        self.reader, self.writer = yield from \
            self.run_network_operation(
                connection_future,
                wait_timeout=self._connect_timeout,
                name='Connect')

        if self._timeout is not None:
            self._close_timer = CloseTimer(self._timeout, self)
        else:
            self._close_timer = DummyCloseTimer()

        self._state = ConnectionState.created
        _logger.debug('Connected.')
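The connection is opened with asyncio.open_connection, and run_network_operation applies the connect timeout around it. A reduced sketch of the same step using asyncio.wait_for directly; host and port are placeholders:

import asyncio

async def connect(host, port, connect_timeout=10):
    reader, writer = await asyncio.wait_for(
        asyncio.open_connection(host, port), connect_timeout)
    return reader, writer

loop = asyncio.get_event_loop()
reader, writer = loop.run_until_complete(connect('example.com', 80))
writer.close()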
Example #49
    def _build_cookie_jar(self):
        '''Build the cookie jar'''

        if not self._args.cookies:
            return

        if self._args.load_cookies or self._args.save_cookies:
            self._factory.set('CookieJar', RelaxedMozillaCookieJar)

            cookie_jar = self._factory.new('CookieJar')

            if self._args.load_cookies:
                cookie_jar.load(self._args.load_cookies, ignore_discard=True)
        else:
            cookie_jar = self._factory.new('CookieJar')

        policy = self._factory.new('CookiePolicy', cookie_jar=cookie_jar)

        cookie_jar.set_policy(policy)

        _logger.debug(__('Loaded cookies: {0}', list(cookie_jar)))

        cookie_jar_wrapper = self._factory.new(
            'CookieJarWrapper',
            cookie_jar,
            save_filename=self._args.save_cookies,
            keep_session_cookies=True,
        )

        return cookie_jar_wrapper
Example #50
    def scrape(self, request, response, link_type=None):
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.javascript:
            return

        link_contexts = set()
        base_url = request.url_info.url
        encoding = self._encoding_override or \
            detect_response_encoding(response)

        try:
            with wpull.util.reset_file_offset(response.body):
                for link, context in self.iter_processed_links(
                        response.body, encoding, base_url, context=True):
                    inline = is_likely_inline(link)

                    if context is True:
                        link_type = None
                    else:
                        link_type = context

                    link_contexts.add(
                        LinkContext(link, inline=inline, linked=not inline,
                                    link_type=link_type)
                    )

        except UnicodeError as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(link_contexts, encoding)
Example #51
    def _read_body_by_length(self, response, file):
        '''Read a response body whose size is given by Content-Length.

        Coroutine.
        '''
        _logger.debug('Reading body by length.')

        file_is_async = hasattr(file, 'drain')

        try:
            body_size = int(response.fields['Content-Length'])

            if body_size < 0:
                raise ValueError('Content length cannot be negative.')

        except ValueError as error:
            _logger.warning(__(
                _('Invalid content length: {error}'), error=error
            ))

            yield From(self._read_body_until_close(response, file))
            return

        bytes_left = body_size

        while bytes_left > 0:
            data = yield From(self._connection.read(self._read_size))

            if not data:
                break

            bytes_left -= len(data)

            if bytes_left < 0:
                data = data[:bytes_left]

                _logger.warning(_('Content overrun.'))
                self.close()

            self._data_observer.notify('response_body', data)

            content_data = self._decompress_data(data)

            if file:
                file.write(content_data)

                if file_is_async:
                    yield From(file.drain())

        if bytes_left > 0:
            raise NetworkError('Connection closed.')

        content_data = self._flush_decompressor()

        if file and content_data:
            file.write(content_data)

            if file_is_async:
                yield From(file.drain())
Example #52
    def _close_servers(self):
        '''Close and wait for servers to close.

        Coroutine.
        '''
        for server in self._servers:
            _logger.debug(__('Closing server {}', server))
            server.close()
            yield From(server.wait_closed())
Example #53
    def _start_servers(self):
        '''Start servers.

        Coroutine.
        '''
        for task in self._server_tasks:
            _logger.debug(__('Starting task {}', task))
            server = yield From(task)
            self._servers.append(server)
Example #54
    def _warn_discarded_items(self):
        _logger.warning(__(
            gettext.ngettext(
                'Discarding {num} unprocessed item.',
                'Discarding {num} unprocessed items.',
                self._item_queue.unfinished_items
            ),
            num=self._item_queue.unfinished_items
        ))
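gettext.ngettext picks the singular or plural template by count and falls back to the English strings passed in when no translation catalog is installed. A short sketch:

import gettext

def warn_discarded(num):
    return gettext.ngettext(
        'Discarding {num} unprocessed item.',
        'Discarding {num} unprocessed items.',
        num
    ).format(num=num)

print(warn_discarded(1))  # Discarding 1 unprocessed item.
print(warn_discarded(3))  # Discarding 3 unprocessed items.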
Example #55
    def start(self, request: Request) -> Response:
        '''Begin an HTTP request.

        Args:
            request: Request information.

        Returns:
            A response populated with the HTTP headers.

        Once the headers are received, call :meth:`download`.

        Coroutine.
        '''
        if self._session_state != SessionState.ready:
            raise RuntimeError('Session already started')

        assert not self._request
        self._request = request
        _logger.debug(__('Client fetch request {0}.', request))

        connection = yield from self._acquire_request_connection(request)
        full_url = connection.proxied and not connection.tunneled

        self._stream = stream = self._stream_factory(connection)

        yield from self._stream.reconnect()

        request.address = connection.address

        self.event_dispatcher.notify(self.Event.begin_request, request)
        write_callback = functools.partial(self.event_dispatcher.notify,
                                           self.Event.request_data)
        stream.data_event_dispatcher.add_write_listener(write_callback)

        yield from stream.write_request(request, full_url=full_url)

        if request.body:
            assert 'Content-Length' in request.fields
            length = int(request.fields['Content-Length'])
            yield from stream.write_body(request.body, length=length)

        stream.data_event_dispatcher.remove_write_listener(write_callback)
        self.event_dispatcher.notify(self.Event.end_request, request)

        read_callback = functools.partial(self.event_dispatcher.notify,
                                          self.Event.response_data)
        stream.data_event_dispatcher.add_read_listener(read_callback)

        self._response = response = yield from stream.read_response()
        response.request = request

        self.event_dispatcher.notify(self.Event.begin_response, response)

        self._session_state = SessionState.request_sent

        return response