def _scrape_document(self, request, response, url_item):
    to_native = self.to_script_native_type
    url_info_dict = to_native(request.url_info.to_dict())
    document_info_dict = to_native(response.body.to_dict())
    filename = to_native(response.body.content_file.name)

    new_url_dicts = self.callbacks.get_urls(
        filename, url_info_dict, document_info_dict)

    _logger.debug(__('Hooked scrape returned {0}', new_url_dicts))

    if not new_url_dicts:
        return

    if to_native(1) in new_url_dicts:
        # Lua tables are 1-indexed mappings, not Python sequences
        for i in itertools.count(1):
            new_url_dict = new_url_dicts[to_native(i)]

            _logger.debug(__('Got lua new url info {0}', new_url_dict))

            if new_url_dict is None:
                break

            self._add_hooked_url(url_item, new_url_dict)
    else:
        for new_url_dict in new_url_dicts:
            self._add_hooked_url(url_item, new_url_dict)
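# Illustration (hypothetical data): a Lua hook hands back a 1-indexed table
# rather than a Python list, which is why the loop above probes key 1 and
# counts upward until a key is missing. A plain dict models that shape:
import itertools

lua_table = {1: {'url': 'http://example.com/a'},
             2: {'url': 'http://example.com/b'}}

for i in itertools.count(1):
    entry = lua_table.get(i)
    if entry is None:
        break
    print(entry['url'])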
def _read_input_urls(cls, session: AppSession, default_scheme='http'):
    '''Read the URLs provided by the user.'''
    url_string_iter = session.args.urls or ()
    # FIXME: url rewriter isn't created yet
    url_rewriter = session.factory.get('URLRewriter')

    if session.args.input_file:
        if session.args.force_html:
            lines = cls._input_file_as_html_links(session)
        else:
            lines = cls._input_file_as_lines(session)

        url_string_iter = itertools.chain(url_string_iter, lines)

    base_url = session.args.base

    for url_string in url_string_iter:
        _logger.debug(__('Parsing URL {0}', url_string))

        if base_url:
            url_string = wpull.url.urljoin(base_url, url_string)

        url_info = wpull.url.URLInfo.parse(
            url_string, default_scheme=default_scheme)

        _logger.debug(__('Parsed URL {0}', url_info))

        if url_rewriter:
            # TODO: this logic should be a hook
            url_info = url_rewriter.rewrite(url_info)
            _logger.debug(__('Rewritten URL {0}', url_info))

        yield url_info
def resolve_all(self, host, port=0):
    '''Resolve hostname and return a list of results.

    Args:
        host (str): The hostname.
        port (int): The port number.

    Returns:
        list: A list of tuples where each tuple contains the family and
        the socket address. See :meth:`resolve` for the socket address
        format.
    '''
    _logger.debug(__('Lookup address {0} {1}.', host, port))

    host = self._lookup_hook(host, port)

    results = None

    if self._cache:
        results = self._get_cache(host, port, self._family)

    if results is None:
        results = yield From(self._resolve_from_network(host, port))

        if self._cache:
            self._put_cache(host, port, results)

    if not results:
        raise DNSNotFound(
            "DNS resolution for {0} did not return any results."
            .format(repr(host))
        )

    _logger.debug(__('Resolved addresses: {0}.', results))

    raise Return(results)
def process(self, session: AppSession):
    self._debug_log_registered_hooks(session)

    internal_plugin_path = get_package_filename(
        os.path.join('application', 'plugins'))
    plugin_locations = [internal_plugin_path]
    plugin_filenames = []

    if session.args.plugin_script:
        plugin_filenames.append(session.args.plugin_script)

    locator = PluginLocator(plugin_locations, plugin_filenames)

    session.plugin_manager = PluginManager(plugin_locator=locator)
    session.plugin_manager.collectPlugins()

    for plugin_info in session.plugin_manager.getAllPlugins():
        if plugin_info.path.startswith(internal_plugin_path):
            _logger.debug(__(
                _('Found plugin {name} from {filename}.'),
                filename=plugin_info.path,
                name=plugin_info.name
            ))
        else:
            _logger.info(__(
                _('Found plugin {name} from {filename}.'),
                filename=plugin_info.path,
                name=plugin_info.name
            ))

        plugin_info.plugin_object.app_session = session

        if plugin_info.plugin_object.should_activate():
            session.plugin_manager.activatePluginByName(plugin_info.name)
            self._connect_plugin_hooks(session, plugin_info.plugin_object)
def process(self, item_session: ItemSession, request, response,
            file_writer_session):
    '''Process PhantomJS.

    Coroutine.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')

    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for dummy in range(attempts):
        try:
            yield from self._run_driver(item_session, request, response)
        except asyncio.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            break
        except PhantomJSCrashed as error:
            _logger.exception(__('PhantomJS crashed: {}', error))
        else:
            break
    else:
        _logger.warning(__(
            _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
            url=request.url_info.url
        ))
def _check_resource_monitor(self):
    if not self._resource_monitor:
        return

    for counter in itertools.count():
        resource_info = self._resource_monitor.check()

        if not resource_info:
            if counter:
                _logger.info(_('Situation cleared.'))
            break

        if counter % 15 == 0:
            if resource_info.path:
                _logger.warning(__(
                    _('Low disk space on {path} ({size} free).'),
                    path=resource_info.path,
                    size=wpull.string.format_size(resource_info.free)
                ))
            else:
                _logger.warning(__(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(resource_info.free)
                ))

            _logger.warning(_('Waiting for operator to clear situation.'))

        yield From(trollius.sleep(60))
def _make_socket(self):
    '''Make and wrap the socket with an IOStream.'''
    host, port = self._original_address
    family, self._resolved_address = yield self._resolver.resolve(
        host, port)
    self._socket = socket.socket(family, socket.SOCK_STREAM)

    _logger.debug(__('Socket to {0}/{1}.', family, self._resolved_address))

    if self._params.bind_address:
        _logger.debug(__(
            'Binding socket to {0}', self._params.bind_address
        ))
        self._socket.bind(self._params.bind_address)

    if self._ssl:
        self._io_stream = SSLIOStream(
            self._socket,
            max_buffer_size=self._params.buffer_size,
            rw_timeout=self._params.read_timeout,
            ssl_options=self._params.ssl_options or {},
            server_hostname=host,
        )
    else:
        self._io_stream = IOStream(
            self._socket,
            rw_timeout=self._params.read_timeout,
            max_buffer_size=self._params.buffer_size,
        )

    self._io_stream.set_close_callback(self._stream_closed_callback)
def process(self, url_item, request, response, file_writer_session):
    '''Process PhantomJS.

    Coroutine.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')

    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for dummy in range(attempts):
        try:
            yield From(self._run_driver(url_item, request, response))
        except trollius.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            break
        except PhantomJSCrashed as error:
            _logger.exception(__('PhantomJS crashed: {}', error))
        else:
            break
    else:
        _logger.warning(__(
            _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
            url=request.url_info.url
        ))
def snapshot(self, remote, html_path=None, render_path=None):
    '''Take HTML and PDF snapshot.'''
    content = yield remote.eval('page.content')
    url = yield remote.eval('page.url')

    if html_path:
        _logger.debug(__('Saving snapshot to {0}.', html_path))

        dir_path = os.path.abspath(os.path.dirname(html_path))

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        with open(html_path, 'wb') as out_file:
            out_file.write(content.encode('utf-8'))

        if self._warc_recorder:
            self._add_warc_snapshot(html_path, 'text/html', url)

    if render_path:
        _logger.debug(__('Saving snapshot to {0}.', render_path))

        yield remote.call('page.render', render_path)

        if self._warc_recorder:
            self._add_warc_snapshot(render_path, 'application/pdf', url)

    raise tornado.gen.Return(content)
def _print_stats(cls, stats: Statistics, human_format_speed: bool=True):
    '''Log the final statistics to the user.'''
    time_length = datetime.timedelta(
        seconds=int(stats.stop_time - stats.start_time)
    )
    file_size = wpull.string.format_size(stats.size)

    if stats.bandwidth_meter.num_samples:
        speed = stats.bandwidth_meter.speed()

        if human_format_speed:
            speed_size_str = wpull.string.format_size(speed)
        else:
            speed_size_str = '{:.1f} b'.format(speed * 8)
    else:
        speed_size_str = _('-- B')

    _logger.info(_('FINISHED.'))
    _logger.info(__(
        _('Duration: {preformatted_timedelta}. '
          'Speed: {preformatted_speed_size}/s.'),
        preformatted_timedelta=time_length,
        preformatted_speed_size=speed_size_str,
    ))
    _logger.info(__(
        gettext.ngettext(
            'Downloaded: {num_files} file, {preformatted_file_size}.',
            'Downloaded: {num_files} files, {preformatted_file_size}.',
            stats.files
        ),
        num_files=stats.files,
        preformatted_file_size=file_size
    ))

    if stats.is_quota_exceeded:
        _logger.info(_('Download quota exceeded.'))
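# Example (hypothetical values): with human_format_speed=False the speed is
# reported in bits per second, hence the multiplication by 8 above.
speed_bytes_per_sec = 125000
print('{:.1f} b'.format(speed_bytes_per_sec * 8))  # -> '1000000.0 b'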
def _polling_sleep(cls, resource_monitor, log=False):
    for counter in itertools.count():
        resource_info = resource_monitor.check()

        if not resource_info:
            if log and counter:
                _logger.info(_('Situation cleared.'))
            break

        if log and counter % 15 == 0:
            if resource_info.path:
                _logger.warning(__(
                    _('Low disk space on {path} ({size} free).'),
                    path=resource_info.path,
                    size=wpull.string.format_size(resource_info.free)
                ))
            else:
                _logger.warning(__(
                    _('Low memory ({size} free).'),
                    size=wpull.string.format_size(resource_info.free)
                ))

            _logger.warning(_('Waiting for operator to clear situation.'))

        yield from asyncio.sleep(60)
def control(self, remote):
    '''Scroll the page.'''
    num_scrolls = self._num_scrolls

    if self._smart_scroll:
        is_page_dynamic = yield remote.call('isPageDynamic')

        if not is_page_dynamic:
            num_scrolls = 0

    url = yield remote.eval('page.url')
    total_scroll_count = 0

    for scroll_count in range(num_scrolls):
        _logger.debug(__('Scrolling page. Count={0}.', scroll_count))

        pre_scroll_counter_values = remote.resource_counter.values()

        scroll_position = yield remote.eval('page.scrollPosition')
        scroll_position['top'] += self._viewport_size[1]

        yield self.scroll_to(remote, 0, scroll_position['top'])

        total_scroll_count += 1

        self._log_action('wait', self._wait_time)
        yield wpull.async.sleep(self._wait_time)

        post_scroll_counter_values = remote.resource_counter.values()

        _logger.debug(__(
            'Counter values pre={0} post={1}',
            pre_scroll_counter_values,
            post_scroll_counter_values
        ))

        if post_scroll_counter_values == pre_scroll_counter_values \
                and self._smart_scroll:
            break

    for dummy in range(remote.resource_counter.pending):
        if remote.resource_counter.pending:
            self._log_action('wait', self._wait_time)
            yield wpull.async.sleep(self._wait_time)
        else:
            break

    yield self.scroll_to(remote, 0, 0)

    _logger.info(__(
        gettext.ngettext(
            'Scrolled page {num} time.',
            'Scrolled page {num} times.',
            total_scroll_count,
        ),
        num=total_scroll_count
    ))

    if self._warc_recorder:
        self._add_warc_action_log(url)
def run(self):
    scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html')
    action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
    event_log_path = self._get_temp_path('phantom-event', suffix='.txt')
    snapshot_paths = [scrape_snapshot_path]
    snapshot_paths.extend(self._get_snapshot_paths())
    url = self._item_session.url_record.url

    driver_params = PhantomJSDriverParams(
        url=url,
        snapshot_paths=snapshot_paths,
        wait_time=self._params.wait_time,
        num_scrolls=self._params.num_scrolls,
        smart_scroll=self._params.smart_scroll,
        snapshot=self._params.snapshot,
        viewport_size=self._params.viewport_size,
        paper_size=self._params.paper_size,
        event_log_filename=event_log_path,
        action_log_filename=action_log_path,
        custom_headers=self._params.custom_headers,
        page_settings=self._params.page_settings,
    )

    driver = self._phantomjs_driver_factory(params=driver_params)

    _logger.info(__(
        _('PhantomJS fetching ‘{url}’.'),
        url=url
    ))

    with contextlib.closing(driver):
        yield from driver.start()

        # FIXME: we don't account that things might be scrolling and
        # downloading so it might not be a good idea to timeout like
        # this
        if self._params.load_time:
            yield from asyncio.wait_for(
                driver.process.wait(), self._params.load_time
            )
        else:
            yield from driver.process.wait()

        if driver.process.returncode != 0:
            raise PhantomJSCrashed(
                'PhantomJS exited with code {}'
                .format(driver.process.returncode)
            )

    if self._warc_recorder:
        self._add_warc_action_log(action_log_path, url)
        for path in snapshot_paths:
            self._add_warc_snapshot(path, url)

    _logger.info(__(
        _('PhantomJS fetched ‘{url}’.'),
        url=url
    ))
def write_record(self, record):
    '''Append the record to the WARC file.'''
    # FIXME: probably not a good idea to modify arguments passed to us
    # TODO: add extra gzip headers that wget uses
    record.fields['WARC-Warcinfo-ID'] = self._warcinfo_record.fields[
        WARCRecord.WARC_RECORD_ID]

    _logger.debug(__('Writing WARC record {0}.',
                     record.fields['WARC-Type']))

    if self._params.compress:
        open_func = gzip.GzipFile
    else:
        open_func = open

    # Use getsize to get actual file size. Avoid tell() because it may
    # not be the raw file position.
    if os.path.exists(self._warc_filename):
        before_offset = os.path.getsize(self._warc_filename)
    else:
        before_offset = 0

    journal_filename = self._warc_filename + '-wpullinc'

    with open(journal_filename, 'w') as file:
        file.write('wpull-journal-version:1\n')
        file.write('offset:{}\n'.format(before_offset))

    try:
        with open_func(self._warc_filename, mode='ab') as out_file:
            for data in record:
                out_file.write(data)
    except (OSError, IOError) as error:
        _logger.info(__(
            _('Rolling back file {filename} to length {length}.'),
            filename=self._warc_filename, length=before_offset
        ))
        # Open in r+b: mode='wb' would truncate the file to zero length
        # before the rollback truncate could run.
        with open(self._warc_filename, mode='r+b') as out_file:
            out_file.truncate(before_offset)

        raise error
    finally:
        os.remove(journal_filename)

    after_offset = os.path.getsize(self._warc_filename)

    if self._cdx_filename:
        raw_file_offset = before_offset
        raw_file_record_size = after_offset - before_offset

        self._write_cdx_field(
            record, raw_file_record_size, raw_file_offset
        )
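# Recovery sketch (hypothetical helper, not part of the class above): a
# leftover '-wpullinc' journal means a write never completed, so the WARC
# file can be truncated back to the offset recorded in the journal.
import os


def recover_from_journal(warc_filename):
    journal_filename = warc_filename + '-wpullinc'

    if not os.path.exists(journal_filename):
        return

    with open(journal_filename, 'r') as journal_file:
        for line in journal_file:
            if line.startswith('offset:'):
                offset = int(line.split(':', 1)[1])
                break
        else:
            raise ValueError('Journal is missing the offset field.')

    with open(warc_filename, 'r+b') as warc_file:
        warc_file.truncate(offset)

    os.remove(journal_filename)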
def _load_ca_certs(cls, session: AppSession, clean: bool=True):
    '''Load the Certificate Authority certificates.'''
    args = session.args

    if session.ca_certs_filename:
        return session.ca_certs_filename

    certs = set()

    if args.use_internal_ca_certs:
        pem_filename = os.path.join(
            os.path.dirname(__file__), '..', '..', 'cert', 'ca-bundle.pem'
        )
        certs.update(cls._read_pem_file(pem_filename, from_package=True))

    if args.ca_directory:
        if os.path.isdir(args.ca_directory):
            for filename in os.listdir(args.ca_directory):
                # os.listdir() returns bare names; join with the directory.
                filename = os.path.join(args.ca_directory, filename)

                if os.path.isfile(filename):
                    certs.update(cls._read_pem_file(filename))
        else:
            _logger.warning(__(
                _('Certificate directory {path} does not exist.'),
                path=args.ca_directory
            ))

    if args.ca_certificate:
        if os.path.isfile(args.ca_certificate):
            certs.update(cls._read_pem_file(args.ca_certificate))
        else:
            _logger.warning(__(
                _('Certificate file {path} does not exist.'),
                path=args.ca_certificate
            ))

    session.ca_certs_filename = certs_filename = tempfile.mkstemp(
        suffix='.pem', prefix='tmp-wpull-')[1]

    def clean_certs_file():
        os.remove(certs_filename)

    if clean:
        atexit.register(clean_certs_file)

    with open(certs_filename, 'w+b') as certs_file:
        for cert in certs:
            certs_file.write(cert)

    _logger.debug('CA certs loaded.')
def run(self):
    scrape_snapshot_path = self._get_temp_path('phantom', suffix='.html')
    action_log_path = self._get_temp_path('phantom-action', suffix='.txt')
    event_log_path = self._get_temp_path('phantom-event', suffix='.txt')
    snapshot_paths = [scrape_snapshot_path]
    snapshot_paths.extend(self._get_snapshot_paths())
    url = self._url_item.url_record.url

    driver_params = PhantomJSDriverParams(
        url=url,
        snapshot_paths=snapshot_paths,
        wait_time=self._params.wait_time,
        num_scrolls=self._params.num_scrolls,
        smart_scroll=self._params.smart_scroll,
        snapshot=self._params.snapshot,
        viewport_size=self._params.viewport_size,
        paper_size=self._params.paper_size,
        event_log_filename=event_log_path,
        action_log_filename=action_log_path,
        custom_headers=self._params.custom_headers,
        page_settings=self._params.page_settings,
    )

    driver = self._phantomjs_driver_factory(params=driver_params)

    _logger.info(__(_('PhantomJS fetching ‘{url}’.'), url=url))

    with contextlib.closing(driver):
        yield From(driver.start())

        # FIXME: we don't account that things might be scrolling and
        # downloading so it might not be a good idea to timeout like
        # this
        if self._params.load_time:
            yield From(trollius.wait_for(
                driver.process.wait(), self._params.load_time
            ))
        else:
            yield From(driver.process.wait())

        if driver.process.returncode != 0:
            raise PhantomJSCrashed(
                'PhantomJS exited with code {}'
                .format(driver.process.returncode)
            )

    if self._warc_recorder:
        self._add_warc_action_log(action_log_path, url)
        for path in snapshot_paths:
            self._add_warc_snapshot(path, url)

    _logger.info(__(_('PhantomJS fetched ‘{url}’.'), url=url))
def _read_content(self, response, original_url_info):
    '''Read response and parse the contents into the pool.'''
    data = response.body.read(4096)
    url_info = original_url_info

    try:
        self._robots_txt_pool.load_robots_txt(url_info, data)
    except ValueError:
        _logger.warning(__(
            _('Failed to parse {url} for robots exclusion rules. '
              'Ignoring.'),
            url=url_info.url
        ))
        self._accept_as_blank(url_info)
    else:
        _logger.debug(__('Got a good robots.txt for {0}.', url_info.url))
def _process_url_item(self, url_item):
    '''Process an item.

    Args:
        url_item (:class:`.item.URLItem`): The item to process.

    This function calls :meth:`.processor.BaseProcessor.process`.
    '''
    _logger.debug(__('Begin session for {0} {1}.',
                     url_item.url_record, url_item.url_info))

    yield self._processor.process(url_item)

    _logger.debug(__('End session for {0} {1}.',
                     url_item.url_record, url_item.url_info))
def _write_warc_metadata(self):
    '''Write the JSON metadata to WARC.

    Uses pywb spec.
    '''
    uri = 'metadata://{}{}'.format(
        self._item_session.url_record.url_info.authority,
        self._item_session.url_record.url_info.resource)

    glob_pattern = self._path_prefix + '*.info.json'
    filenames = list(glob.glob(glob_pattern))

    if not filenames:
        _logger.warning(__(
            _('Could not find external process metadata file: {filename}'),
            filename=glob_pattern
        ))
        return

    for filename in filenames:
        record = WARCRecord()
        record.set_common_fields(
            'metadata', 'application/vnd.youtube-dl_formats+json')
        record.fields['WARC-Target-URI'] = uri
        record.block_file = open(filename, 'rb')

        self._warc_recorder.set_length_and_maybe_checksums(record)
        self._warc_recorder.write_record(record)

        record.block_file.close()
def _scrape_document(self, request, response):
    '''Scrape the document for URLs.'''
    demux_info = self._processor.instances \
        .document_scraper.scrape_info(request, response)

    num_inline_urls = 0
    num_linked_urls = 0

    for scraper, scrape_info in demux_info.items():
        new_inline, new_linked = self._process_scrape_info(
            scraper, scrape_info
        )
        num_inline_urls += new_inline
        num_linked_urls += new_linked

    _logger.debug(__('Found URLs: inline={0} linked={1}',
                     num_inline_urls, num_linked_urls))

    try:
        self._processor.call_hook(
            'scrape_document', request, response, self._url_item
        )
    except HookDisconnected:
        pass
def close(self):
    '''Close all the Host Connection Pools and remove them.'''
    for key in self._pools:
        _logger.debug(__('Closing pool for {0}.', key))
        self._pools[key].close()

    self._pools.clear()
def _process_robots(self):
    '''Process robots.txt.

    Coroutine.
    '''
    try:
        request = self._new_initial_request(with_body=False)
        verdict = (yield From(self._should_fetch_reason_with_robots(
            request, self._url_item.url_record)))[0]
    except REMOTE_ERRORS as error:
        _logger.error(__(
            _('Fetching robots.txt for ‘{url}’ '
              'encountered an error: {error}'),
            url=self._next_url_info.url, error=error
        ))
        self._result_rule.handle_error(request, error, self._url_item)

        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, error=error
        )

        if wait_time:
            _logger.debug('Sleeping {0}.'.format(wait_time))
            yield From(trollius.sleep(wait_time))

        raise Return(False)
    else:
        if not verdict:
            self._url_item.skip()
            raise Return(False)

    raise Return(True)
def _stream_closed_callback(self):
    _logger.debug(__(
        'Stream closed. active={0} connected={1} closed={2}',
        self._active,
        self.connected,
        self._io_stream.closed(),
    ))
def _connect(self):
    '''Connect the socket if not already connected.'''
    if self.connected:
        # Reset the callback so the context does not leak to another
        self._io_stream.set_close_callback(self._stream_closed_callback)
        return

    yield self._make_socket()

    _logger.debug(__('Connecting to {0}.', self._resolved_address))
    try:
        yield self._io_stream.connect(
            self._resolved_address, timeout=self._params.connect_timeout
        )
    except (tornado.netutil.SSLCertificateError,
            SSLVerficationError) as error:
        raise SSLVerficationError('Certificate error: {error}'.format(
            error=error)) from error
    except (ssl.SSLError, socket.error) as error:
        if error.errno == errno.ECONNREFUSED:
            raise ConnectionRefused('Connection refused: {error}'.format(
                error=error)) from error
        else:
            raise NetworkError('Connection error: {error}'.format(
                error=error)) from error
    else:
        _logger.debug('Connected.')
def _handle_error(self, request, url_item, error):
    url_info_dict = self.to_script_native_type(
        request.url_info.to_dict()
    )
    url_record_dict = self.to_script_native_type(
        url_item.url_record.to_dict()
    )
    error_info_dict = self.to_script_native_type({
        'error': error.__class__.__name__,
    })
    action = self.callbacks.call_handle_error(
        url_info_dict, url_record_dict, error_info_dict
    )

    _logger.debug(__('Hooked error returned {0}', action))

    if action == Actions.NORMAL:
        return 'normal'
    elif action == Actions.RETRY:
        return False
    elif action == Actions.FINISH:
        url_item.set_status(Status.done)
        return True
    elif action == Actions.STOP:
        raise HookStop('Script requested immediate stop.')
    else:
        raise NotImplementedError()
def _server_end_response_callback(self, response: Response):
    '''Response callback handler.'''
    request = self._item_session.request
    response = self._item_session.response

    _logger.info(__(
        _('Fetched ‘{url}’: {status_code} {reason}. '
          'Length: {content_length} [{content_type}].'),
        url=request.url,
        status_code=response.status_code,
        reason=wpull.string.printable_str(response.reason),
        content_length=wpull.string.printable_str(
            response.fields.get('Content-Length', _('none'))),
        content_type=wpull.string.printable_str(
            response.fields.get('Content-Type', _('none'))),
    ))

    self._result_rule.handle_response(self._item_session)

    if response.status_code in WebProcessor.DOCUMENT_STATUS_CODES:
        filename = self._file_writer_session.save_document(response)
        self._processing_rule.scrape_document(self._item_session)
        self._result_rule.handle_document(self._item_session, filename)
    elif response.status_code in WebProcessor.NO_DOCUMENT_STATUS_CODES:
        self._file_writer_session.discard_document(response)
        self._result_rule.handle_no_document(self._item_session)
    else:
        self._file_writer_session.discard_document(response)
        self._result_rule.handle_document_error(self._item_session)
def _get_next_url_record(self):
    '''Return the next available URL from the URL table.

    This function will return items marked as "todo" and then items
    marked as "error". As a consequence, items experiencing errors will
    be done last.

    Returns:
        :class:`.item.URLRecord`.
    '''
    _logger.debug('Get next URL todo.')

    try:
        url_record = self._url_table.check_out(Status.todo)
    except NotFound:
        url_record = None

    if not url_record:
        try:
            _logger.debug('Get next URL error.')
            url_record = self._url_table.check_out(Status.error)
        except NotFound:
            url_record = None

    _logger.debug(__('Return record {0}.', url_record))

    return url_record
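# Sketch (hypothetical caller; 'engine' and 'handle' are illustrative names):
# repeatedly check out records until the table is exhausted. Because of the
# order above, "todo" items are always served before retried "error" items.
def drain_url_table(engine, handle):
    while True:
        url_record = engine._get_next_url_record()

        if url_record is None:
            break

        handle(url_record)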
def resolve(self, host, port=0):
    '''Resolve hostname and return the first result.

    Args:
        host (str): The hostname.
        port (int): The port number.

    Returns:
        tuple: A tuple of length 2 where the first item is the family
        and the second item is a socket address that can be passed to
        :func:`socket.connect`.

        Typically in a socket address, the first item is the IP address
        and the second item is the port number. Note that IPv6 may
        return a tuple containing more than 2 items.
    '''
    results = yield From(self.resolve_all(host, port))

    if self._rotate:
        result = random.choice(results)
    else:
        result = results[0]

    family, address = result

    _logger.debug(__('Selected {0} as address.', address))

    assert '.' in address[0] or ':' in address[0], \
        ('Resolve did not return numerical address. Got {}.'
         .format(address[0]))

    raise Return((family, address))
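# Usage sketch (hypothetical driver code): resolve() is a trollius-style
# coroutine that delivers its value through Return(), so callers receive it
# via yield From(...). The (family, address) pair plugs straight into the
# socket module.
import socket

import trollius
from trollius import From


@trollius.coroutine
def connect_to(resolver, hostname, port):
    family, address = yield From(resolver.resolve(hostname, port))

    sock = socket.socket(family, socket.SOCK_STREAM)
    sock.connect(address)  # (ip, port) for IPv4; more items for IPv6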
def _run_worker(self):
    '''Run a single consumer.

    Coroutine.
    '''
    _logger.debug('Worker start.')

    while True:
        priority, item = yield From(self._item_queue.get())

        if item == self.POISON_PILL:
            _logger.debug('Worker quitting.')
            return
        else:
            _logger.debug(__('Processing item {0}.', item))
            self._item_get_semaphore.release()
            self._token_queue.get_nowait()
            yield From(self._process_item(item))
            self._token_queue.task_done()

            if os.environ.get('OBJGRAPH_DEBUG'):
                import gc
                import objgraph
                gc.collect()
                objgraph.show_most_common_types(25)

            if os.environ.get('FILE_LEAK_DEBUG'):
                import subprocess
                output = subprocess.check_output(
                    ['lsof', '-p', str(os.getpid()), '-n'])

                for line in output.decode('ascii', 'replace').split('\n'):
                    if 'REG' in line and \
                            (os.getcwd() in line or '/tmp/' in line):
                        print('FILELEAK', line)
def scrape(self, request, response, link_type=None):
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.sitemap:
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)
    link_contexts = set()

    try:
        with wpull.util.reset_file_offset(response.body):
            link_iter = self.iter_processed_links(
                response.body, encoding, base_url)

            for link in link_iter:
                link_contexts.add(LinkContext(link, linked=True))
    except (UnicodeError, self._html_parser.parser_error) as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
def _handle_response(self, request, response, url_item):
    url_info_dict = self.to_script_native_type(
        request.url_info.to_dict()
    )
    url_record_dict = self.to_script_native_type(
        url_item.url_record.to_dict()
    )
    response_info_dict = self.to_script_native_type(response.to_dict())

    action = self.callbacks.call_handle_response(
        url_info_dict, url_record_dict, response_info_dict
    )

    _logger.debug(__('Hooked response returned {0}', action))

    if action == Actions.NORMAL:
        return 'normal'
    elif action == Actions.RETRY:
        return False
    elif action == Actions.FINISH:
        url_item.set_status(Status.done)
        return True
    elif action == Actions.STOP:
        raise HookStop()
    else:
        raise NotImplementedError()
def _build_cookie_jar(cls, session: AppSession):
    '''Build the cookie jar.'''
    if not session.args.cookies:
        return

    if session.args.load_cookies or session.args.save_cookies:
        session.factory.set('CookieJar', BetterMozillaCookieJar)

        cookie_jar = session.factory.new('CookieJar')

        if session.args.load_cookies:
            cookie_jar.load(session.args.load_cookies, ignore_discard=True)
    else:
        cookie_jar = session.factory.new('CookieJar')

    policy = session.factory.new('CookiePolicy', cookie_jar=cookie_jar)

    cookie_jar.set_policy(policy)

    _logger.debug(__('Loaded cookies: {0}', list(cookie_jar)))

    cookie_jar_wrapper = session.factory.new(
        'CookieJarWrapper',
        cookie_jar,
        save_filename=session.args.save_cookies,
        keep_session_cookies=session.args.keep_session_cookies,
    )

    return cookie_jar_wrapper
def process_one(self, _worker_id=None):
    item = yield from self._item_queue.get()

    if item == POISON_PILL:
        return item

    _logger.debug(__('Worker id {} Processing item {}', _worker_id, item))

    for task in self._tasks:
        yield from task.process(item)

    _logger.debug(__('Worker id {} Processed item {}', _worker_id, item))

    yield from self._item_queue.item_done()

    return item
def set_status(self, status: Status, increment_try_count: bool=True,
               filename: str=None):
    '''Mark the item with the given status.

    Args:
        status: a value from :class:`Status`.
        increment_try_count: if True, increment the ``try_count`` value.
        filename: the filename to record on the :class:`URLResult`.
    '''
    url = self.url_record.url
    assert not self._try_count_incremented, (url, status)

    if increment_try_count:
        self._try_count_incremented = True

    _logger.debug(__('Marking URL {0} status {1}.', url, status))

    url_result = URLResult()
    url_result.filename = filename

    self.app_session.factory['URLTable'].check_in(
        url,
        status,
        increment_try_count=increment_try_count,
        url_result=url_result,
    )

    self._processed = True
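# Usage sketch (hypothetical names): a processor records the terminal status
# exactly once per item; the assertion above guards against a double
# check-in, which would bump try_count twice.
item_session.set_status(Status.done, filename='example.com/index.html')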
def _read_response_by_length(self, response):
    '''Read the connection specified by a length.'''
    _logger.debug('Reading body by length.')

    try:
        body_size = int(response.fields['Content-Length'])

        if body_size < 0:
            raise ValueError('Content length cannot be negative.')
    except ValueError as error:
        _logger.warning(__(
            _('Invalid content length: {error}'), error=error
        ))
        yield self._read_response_until_close(response)
        return

    def callback(data):
        self._events.response_data.fire(data)
        response.body.content_file.write(self._decompress_data(data))

    yield self._io_stream.read_bytes(
        body_size, streaming_callback=callback,
    )

    response.body.content_file.write(self._flush_decompressor())
def scrape_document(self, request, response, url_item):
    '''Process document for links.'''
    try:
        self.call_hook('scrape_document', request, response, url_item)
    except HookDisconnected:
        pass

    if not self._document_scraper:
        return

    demux_info = self._document_scraper.scrape_info(
        request, response, url_item.url_record.link_type)

    num_inline_urls = 0
    num_linked_urls = 0

    for scraper, scrape_result in demux_info.items():
        new_inline, new_linked = self._process_scrape_info(
            scraper, scrape_result, url_item)
        num_inline_urls += new_inline
        num_linked_urls += new_linked

    _logger.debug(__('Candidate URLs: inline={0} linked={1}',
                     num_inline_urls, num_linked_urls))
def _warn_unsafe_options(cls, args):
    '''Print warnings about any enabled hazardous options.

    This function will print messages complaining about:

    * ``--save-headers``
    * ``--no-iri``
    * ``--output-document``
    * ``--ignore-fatal-errors``
    '''
    enabled_options = []

    for option_name in cls.UNSAFE_OPTIONS:
        if getattr(args, option_name):
            enabled_options.append(option_name)

    if enabled_options:
        _logger.warning(__(
            _('The following unsafe options are enabled: {list}.'),
            list=enabled_options
        ))
        _logger.warning(
            _('The use of unsafe options may lead to unexpected behavior '
              'or file corruption.'))

    if not args.retr_symlinks:
        _logger.warning(
            _('The --retr-symlinks=off option is a security risk.')
        )
def scrape(self, request, response, link_type=None):
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.css:
        return

    link_contexts = set()
    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            for link, context in self.iter_processed_links(
                    response.body, encoding, base_url, context=True):
                if context == 'import':
                    link_type = LinkType.css
                else:
                    link_type = LinkType.media

                link_contexts.add(
                    LinkContext(link, inline=True, link_type=link_type)
                )
    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
def scrape(self, request, response, link_type=None):
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.html:
        return

    base_url = request.url_info.url
    content_file = response.body
    encoding = self._encoding_override \
        or detect_response_encoding(response, is_html=True)
    link_contexts = set()

    try:
        with wpull.util.reset_file_offset(content_file):
            elements = self.iter_elements(content_file, encoding=encoding)
            result_meta_info = self._process_elements(
                elements, response, base_url, link_contexts
            )
    except (UnicodeError, self._html_parser.parser_error) as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))
        result_meta_info = {}

    if result_meta_info.get('robots_no_follow'):
        # robots "nofollow": drop the followed links, keep inline resources.
        # (set.discard() on a new frozenset would be a no-op here.)
        link_contexts = {
            context for context in link_contexts if not context.linked
        }

    scrape_result = ScrapeResult(link_contexts, encoding)
    scrape_result['base_url'] = base_url

    return scrape_result
def _start_new_warc_file(self, meta=False):
    if self._params.max_size is None:
        sequence_name = ''
    elif meta:
        sequence_name = '-meta'
    else:
        sequence_name = '-{0:05d}'.format(self._sequence_num)

    if self._params.compress:
        extension = 'warc.gz'
    else:
        extension = 'warc'

    self._warc_filename = '{0}{1}.{2}'.format(
        self._prefix_filename, sequence_name, extension
    )

    _logger.debug(__('WARC file at {0}', self._warc_filename))

    if not self._params.appending:
        wpull.util.truncate_file(self._warc_filename)

    self._warcinfo_record = WARCRecord()
    self._populate_warcinfo(self._params.extra_fields)
    self.write_record(self._warcinfo_record)
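# Filenames implied by the format string above, for an illustrative prefix
# 'crawl' with compression enabled: 'crawl.warc.gz' when max_size is None,
# 'crawl-meta.warc.gz' for the meta file, and a numbered sequence otherwise:
for sequence_num in range(3):
    print('{0}{1}.{2}'.format('crawl', '-{0:05d}'.format(sequence_num),
                              'warc.gz'))
# -> crawl-00000.warc.gz, crawl-00001.warc.gz, crawl-00002.warc.gz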
def connect(self):
    '''Establish a connection.'''
    _logger.debug(__('Connecting to {0}.', self._address))

    if self._state != ConnectionState.ready:
        raise Exception('Closed connection must be reset before reusing.')

    if self._sock:
        connection_future = asyncio.open_connection(
            sock=self._sock, **self._connection_kwargs()
        )
    else:
        # TODO: maybe we don't want to ignore flow-info and scope-id?
        host = self._address[0]
        port = self._address[1]
        connection_future = asyncio.open_connection(
            host, port, **self._connection_kwargs()
        )

    self.reader, self.writer = yield from \
        self.run_network_operation(
            connection_future,
            wait_timeout=self._connect_timeout,
            name='Connect')

    if self._timeout is not None:
        self._close_timer = CloseTimer(self._timeout, self)
    else:
        self._close_timer = DummyCloseTimer()

    self._state = ConnectionState.created

    _logger.debug('Connected.')
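# Usage sketch (assumptions: the object is built around a (host, port)
# address tuple and exposes reader/writer streams after connect(), as the
# assignments above suggest).
import asyncio


@asyncio.coroutine
def connect_example(connection):
    yield from connection.connect()

    connection.writer.write(b'HEAD / HTTP/1.0\r\n\r\n')
    line = yield from connection.reader.readline()
    print(line)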
def _build_cookie_jar(self):
    '''Build the cookie jar.'''
    if not self._args.cookies:
        return

    if self._args.load_cookies or self._args.save_cookies:
        self._factory.set('CookieJar', RelaxedMozillaCookieJar)

        cookie_jar = self._factory.new('CookieJar')

        if self._args.load_cookies:
            cookie_jar.load(self._args.load_cookies, ignore_discard=True)
    else:
        cookie_jar = self._factory.new('CookieJar')

    policy = self._factory.new('CookiePolicy', cookie_jar=cookie_jar)

    cookie_jar.set_policy(policy)

    _logger.debug(__('Loaded cookies: {0}', list(cookie_jar)))

    cookie_jar_wrapper = self._factory.new(
        'CookieJarWrapper',
        cookie_jar,
        save_filename=self._args.save_cookies,
        keep_session_cookies=True,
    )

    return cookie_jar_wrapper
def scrape(self, request, response, link_type=None):
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.javascript:
        return

    link_contexts = set()
    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            for link, context in self.iter_processed_links(
                    response.body, encoding, base_url, context=True):
                inline = is_likely_inline(link)

                if context is True:
                    link_type = None
                else:
                    link_type = context

                link_contexts.add(
                    LinkContext(link, inline=inline, linked=not inline,
                                link_type=link_type)
                )
    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
def _read_body_by_length(self, response, file):
    '''Read the connection specified by a length.

    Coroutine.
    '''
    _logger.debug('Reading body by length.')

    file_is_async = hasattr(file, 'drain')

    try:
        body_size = int(response.fields['Content-Length'])

        if body_size < 0:
            raise ValueError('Content length cannot be negative.')
    except ValueError as error:
        _logger.warning(__(
            _('Invalid content length: {error}'), error=error
        ))

        yield From(self._read_body_until_close(response, file))
        return

    bytes_left = body_size

    while bytes_left > 0:
        data = yield From(self._connection.read(self._read_size))

        if not data:
            break

        bytes_left -= len(data)

        if bytes_left < 0:
            data = data[:bytes_left]

            _logger.warning(_('Content overrun.'))
            self.close()

        self._data_observer.notify('response_body', data)

        content_data = self._decompress_data(data)

        if file:
            file.write(content_data)

            if file_is_async:
                yield From(file.drain())

    if bytes_left > 0:
        raise NetworkError('Connection closed.')

    content_data = self._flush_decompressor()

    if file and content_data:
        file.write(content_data)

        if file_is_async:
            yield From(file.drain())
def _close_servers(self):
    '''Close and wait for servers to close.

    Coroutine.
    '''
    for server in self._servers:
        _logger.debug(__('Closing server {}', server))
        server.close()
        yield From(server.wait_closed())
def _start_servers(self):
    '''Start servers.

    Coroutine.
    '''
    for task in self._server_tasks:
        _logger.debug(__('Starting task {}', task))
        server = yield From(task)
        self._servers.append(server)
def _warn_discarded_items(self):
    _logger.warning(__(
        gettext.ngettext(
            'Discarding {num} unprocessed item.',
            'Discarding {num} unprocessed items.',
            self._item_queue.unfinished_items
        ),
        num=self._item_queue.unfinished_items
    ))
def start(self, request: Request) -> Response:
    '''Begin an HTTP request.

    Args:
        request: Request information.

    Returns:
        A response populated with the HTTP headers.

    Once the headers are received, call :meth:`download`.

    Coroutine.
    '''
    if self._session_state != SessionState.ready:
        raise RuntimeError('Session already started')

    assert not self._request
    self._request = request

    _logger.debug(__('Client fetch request {0}.', request))

    connection = yield from self._acquire_request_connection(request)
    full_url = connection.proxied and not connection.tunneled

    self._stream = stream = self._stream_factory(connection)

    yield from self._stream.reconnect()

    request.address = connection.address

    self.event_dispatcher.notify(self.Event.begin_request, request)

    write_callback = functools.partial(
        self.event_dispatcher.notify, self.Event.request_data)
    stream.data_event_dispatcher.add_write_listener(write_callback)

    yield from stream.write_request(request, full_url=full_url)

    if request.body:
        assert 'Content-Length' in request.fields

        length = int(request.fields['Content-Length'])
        yield from stream.write_body(request.body, length=length)

    stream.data_event_dispatcher.remove_write_listener(write_callback)
    self.event_dispatcher.notify(self.Event.end_request, request)

    read_callback = functools.partial(
        self.event_dispatcher.notify, self.Event.response_data)
    stream.data_event_dispatcher.add_read_listener(read_callback)

    self._response = response = yield from stream.read_response()
    response.request = request

    self.event_dispatcher.notify(self.Event.begin_response, response)

    self._session_state = SessionState.request_sent

    return response
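# Usage sketch (hypothetical driver; per the docstring above, start() returns
# once the headers arrive and a download() coroutine is expected to follow to
# read the body).
import asyncio


@asyncio.coroutine
def fetch(session, request):
    response = yield from session.start(request)
    print(response.status_code, response.reason)
    yield from session.download()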