def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int):
    '''Build the PhantomJS coprocessor.

    Args:
        session: Application session supplying ``args`` (the program
            options), ``factory`` and ``default_user_agent``.
        proxy_port: Port number placed in the ``--proxy`` argument that
            is handed to the PhantomJS driver.

    Returns:
        The object created by ``session.factory.new`` under the name
        ``PhantomJSCoprocessor``.
    '''
    args = session.args

    # Collect the user supplied headers first. PhantomJS only accepts a
    # one-to-one mapping and NameValueRecord.items() keeps only the first
    # value added for each key, so the fallback values are added last and
    # never override what the user asked for.
    header_record = NameValueRecord()
    for raw_header in args.header:
        header_record.parse(raw_header)
    header_record.add('Accept-Language', '*')
    if not args.http_compression:
        header_record.add('Accept-Encoding', 'identity')
    custom_headers = dict(header_record.items())

    page_settings = {}
    read_timeout = args.read_timeout
    if read_timeout:
        # NOTE(review): presumably seconds converted to milliseconds --
        # confirm against the PhantomJS page settings.
        page_settings['resourceTimeout'] = read_timeout * 1000
    page_settings['userAgent'] = (
        args.user_agent or session.default_user_agent
    )

    # Test early for executable
    wpull.driver.phantomjs.get_version(args.phantomjs_exe)

    params = PhantomJSParams(
        wait_time=args.phantomjs_wait,
        num_scrolls=args.phantomjs_scroll,
        smart_scroll=args.phantomjs_smart_scroll,
        snapshot=args.phantomjs_snapshot,
        custom_headers=custom_headers,
        page_settings=page_settings,
        load_time=args.phantomjs_max_time,
    )

    proxy_address = '{}:{}'.format(args.proxy_server_address, proxy_port)
    driver_extra_args = ['--proxy', proxy_address, '--ignore-ssl-errors=true']

    factory = session.factory
    driver_factory = functools.partial(
        factory.class_map['PhantomJSDriver'],
        exe_path=args.phantomjs_exe,
        extra_args=driver_extra_args,
    )

    return factory.new(
        'PhantomJSCoprocessor',
        driver_factory,
        factory['ProcessingRule'],
        params,
        root_path=args.directory_prefix,
        warc_recorder=factory.get('WARCRecorder'),
    )
def _build_phantomjs_controller(self):
    '''Build the proxy server, the PhantomJS client and its controller.

    Returns:
        The ``PhantomJSController`` instance created by the factory, or
        ``None`` when the ``phantomjs`` option is not set.
    '''
    args = self._args

    if not args.phantomjs:
        return None

    # The proxy server listens on an unused port chosen at run time; the
    # PhantomJS client is pointed at that port below.
    proxy_server = self._factory.new(
        'HTTPProxyServer', self.factory['Client'])
    listen_socket, proxy_port = tornado.testing.bind_unused_port()
    proxy_server.add_socket(listen_socket)

    # User supplied headers are parsed first. PhantomJS only accepts a
    # one-to-one mapping and NameValueRecord.items() keeps only the first
    # value added for each key, so the fallback values go in last.
    header_record = NameValueRecord()
    for raw_header in args.header:
        header_record.parse(raw_header)
    header_record.add('Accept-Language', '*')
    if not args.http_compression:
        header_record.add('Accept-Encoding', 'identity')
    header_map = dict(header_record.items())

    page_settings = {}
    read_timeout = args.read_timeout
    if read_timeout:
        # NOTE(review): presumably seconds converted to milliseconds --
        # confirm against the PhantomJS page settings.
        page_settings['resourceTimeout'] = read_timeout * 1000
    page_settings['userAgent'] = args.user_agent or self.default_user_agent

    client = self._factory.new(
        'PhantomJSClient',
        'localhost:{0}'.format(proxy_port),
        page_settings=page_settings,
        default_headers=header_map,
        exe_path=args.phantomjs_exe
    )
    # Check the client executable before the controller is built.
    client.test_client_exe()

    return self._factory.new(
        'PhantomJSController',
        client,
        wait_time=args.phantomjs_wait,
        num_scrolls=args.phantomjs_scroll,
        warc_recorder=self.factory.get('WARCRecorder'),
        smart_scroll=args.phantomjs_smart_scroll,
        snapshot=args.phantomjs_snapshot,
    )
def _build_phantomjs_controller(self):
    '''Build the proxy server, the PhantomJS client and its controller.

    Returns:
        The ``PhantomJSController`` instance created by the factory, or
        ``None`` when the ``phantomjs`` option is not set.
    '''
    options = self._args

    if not options.phantomjs:
        return None

    # Bind the proxy server to an unused port picked at run time; the
    # same port number is given to the PhantomJS client.
    http_proxy = self._factory.new('HTTPProxyServer', self.factory['Client'])
    bound_socket, proxy_port = tornado.testing.bind_unused_port()
    http_proxy.add_socket(bound_socket)

    # Headers given by the user come first. Only a one-to-one mapping can
    # be passed to PhantomJS and NameValueRecord.items() uses only the
    # first value added for each key, so the fallbacks are appended last.
    headers = NameValueRecord()
    for line in options.header:
        headers.parse(line)
    headers.add('Accept-Language', '*')
    if not options.http_compression:
        headers.add('Accept-Encoding', 'identity')
    headers_as_dict = dict(headers.items())

    settings = {}
    timeout = options.read_timeout
    if timeout:
        # NOTE(review): presumably seconds converted to milliseconds --
        # confirm against the PhantomJS page settings.
        settings['resourceTimeout'] = timeout * 1000
    settings['userAgent'] = options.user_agent or self.default_user_agent

    proxy_address = 'localhost:{0}'.format(proxy_port)
    phantomjs_client = self._factory.new(
        'PhantomJSClient',
        proxy_address,
        page_settings=settings,
        default_headers=headers_as_dict,
    )
    # Check the client executable before the controller is built.
    phantomjs_client.test_client_exe()

    controller = self._factory.new(
        'PhantomJSController',
        phantomjs_client,
        wait_time=options.phantomjs_wait,
        num_scrolls=options.phantomjs_scroll,
        warc_recorder=self.factory.get('WARCRecorder'),
        smart_scroll=options.phantomjs_smart_scroll,
    )
    return controller
def _populate_warcinfo(self, extra_fields=None):
    '''Fill in the warcinfo record's fields, block and checksum.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs that
            are appended after the standard fields.
    '''
    record = self._warcinfo_record
    record.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

    fields = NameValueRecord()

    # Fall back to the class default when no software string is configured.
    software = self._params.software_string
    if not software:
        software = self.DEFAULT_SOFTWARE_STRING
    fields['Software'] = software
    fields['format'] = 'WARC File Format 1.0'
    fields['conformsTo'] = (
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
    )

    for name, value in extra_fields or ():
        fields.add(name, value)

    # The block is the serialized fields followed by a CRLF.
    payload = bytes(fields) + b'\r\n'
    record.block_file = io.BytesIO(payload)
    record.compute_checksum()
def _populate_warcinfo(self, extra_fields=None):
    '''Write the metadata fields into the warcinfo record.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs added
            after the standard ``Software``, ``format`` and ``conformsTo``
            fields.
    '''
    warcinfo = self._warcinfo_record
    warcinfo.set_common_fields(
        WARCRecord.WARCINFO,
        WARCRecord.WARC_FIELDS,
    )

    metadata = NameValueRecord()

    # The configured software string wins; the class default is used
    # only when it is empty.
    configured_software = self._params.software_string
    metadata['Software'] = (
        configured_software if configured_software
        else self.DEFAULT_SOFTWARE_STRING
    )
    metadata['format'] = 'WARC File Format 1.0'
    metadata['conformsTo'] = (
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
    )

    if extra_fields:
        for pair in extra_fields:
            field_name, field_value = pair
            metadata.add(field_name, field_value)

    # Serialized fields plus a trailing CRLF make up the record block.
    block_bytes = bytes(metadata) + b'\r\n'
    warcinfo.block_file = io.BytesIO(block_bytes)
    warcinfo.compute_checksum()
def _populate_warcinfo(self, extra_fields=None):
    '''Fill in the warcinfo record's fields, block and checksum.

    The ``Software`` field is built from the Wpull version and the value
    returned by ``wpull.util.python_version()``.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs that
            are appended after the standard fields.
    '''
    record = self._warcinfo_record
    record.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

    fields = NameValueRecord()

    wpull_version = wpull.version.__version__
    python_version = wpull.util.python_version()
    fields['Software'] = 'Wpull/{0} Python/{1}'.format(
        wpull_version, python_version)
    fields['format'] = 'WARC File Format 1.0'
    fields['conformsTo'] = (
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'
    )

    for name, value in extra_fields or ():
        fields.add(name, value)

    # The block is the serialized fields followed by a CRLF.
    payload = bytes(fields) + b'\r\n'
    record.block_file = io.BytesIO(payload)
    record.compute_checksum()
def test_with_normalize_overrides(self):
    '''Names in ``normalize_overrides`` are expected to keep their casing.

    The record is created with ``WARC-Type`` as an override. The test
    expects that spelling to be reported regardless of the casing used
    to access the field, while a name that is not overridden
    (``WARC-Blah``) is expected to be reported as ``Warc-Blah``.
    '''
    record = NameValueRecord(normalize_overrides=['WARC-Type'])

    def check_type_field(expected_value):
        # The field must be visible under the override spelling and hold
        # exactly one value.
        self.assertIn('WARC-Type', record)
        self.assertEqual(expected_value, record['WARC-Type'])
        self.assertEqual(
            [('WARC-Type', expected_value)], list(record.get_all()))
        self.assertEqual([expected_value], record.get_list('Warc-Type'))
        self.assertEqual(['WARC-Type'], list(record.keys()))

    record.add('WARC-Type', 'warcinfo')
    check_type_field('warcinfo')

    # Assigning through a differently cased name replaces the value.
    record['Warc-Type'] = 'resource'
    check_type_field('resource')

    record['WARC-Blah'] = 'blah'
    self.assertEqual(['WARC-Type', 'Warc-Blah'], list(record.keys()))