def test_name_value_fallback(self):
    '''KOI8-R bytes parsed by a default record come back Latin-1 decoded.'''
    # The expected value is the raw KOI8-R bytes reinterpreted as Latin-1.
    expected = 'Кракозябры'.encode('koi8-r').decode('latin1')

    parsed = NameValueRecord()
    parsed.parse('''Name: Кракозябры'''.encode('koi8-r'))

    self.assertEqual(expected, parsed['name'])
def test_name_value_str_format(self):
    '''Check the text produced by ``str()`` for a parsed RECORD_STR_1.'''
    expected_text = (
        'Entry:\r\n'
        'Who: Gilbert, W.S. | Sullivan, Arthur\r\n'
        'What: The Yeomen of the Guard\r\n'
        'When/Created: 1888\r\n'
    )

    parsed = NameValueRecord()
    parsed.parse(self.RECORD_STR_1)

    self.assertEqual(expected_text, str(parsed))
def test_name_value_fallback(self):
    '''Parse KOI8-R bytes without an explicit encoding.

    The stored value is expected to equal the same bytes decoded as
    Latin-1.
    '''
    koi8_line = '''Name: Кракозябры'''.encode('koi8-r')

    fallback_record = NameValueRecord()
    fallback_record.parse(koi8_line)

    latin1_view = 'Кракозябры'.encode('koi8-r').decode('latin1')
    self.assertEqual(latin1_view, fallback_record['name'])
def __init__(self, method=None, resource_path=None, version='HTTP/1.1'):
    '''Store the request-line parts and create an empty field record.

    Args:
        method: The HTTP method, or None if not yet known.
        resource_path: The URL or path, or None if not yet known.
        version: The HTTP version string.
    '''
    super().__init__()
    self.method, self.resource_path, self.version = (
        method, resource_path, version
    )
    # Header fields and the request line both use Latin-1.
    self.fields = NameValueRecord(encoding='latin-1')
    self.body = None
    self.encoding = 'latin-1'
def test_name_value_encoding(self):
    '''A record built with ``encoding='koi8-r'`` decodes KOI8-R bytes.'''
    koi8_line = '''Name: Кракозябры'''.encode('koi8-r')

    koi8_record = NameValueRecord(encoding='koi8-r')
    koi8_record.parse(koi8_line)

    self.assertEqual('Кракозябры', koi8_record['name'])
def test_name_value_record_setters(self):
    '''Exercise item assignment and lookups under different name casings.'''
    fields = NameValueRecord()

    # Nothing has been stored yet.
    self.assertNotIn('cache', fields)
    with self.assertRaises(KeyError):
        fields['cache']

    fields['cache'] = 'value1'

    self.assertEqual('value1', fields['CACHE'])
    self.assertEqual(['value1'], fields.get_list('Cache'))
    stored_pairs = list(fields.get_all())
    self.assertEqual([('Cache', 'value1')], stored_pairs)
def test_name_value_str_format(self):
    '''Parse RECORD_STR_1 and compare its ``str()`` form to fixed text.'''
    record = NameValueRecord()
    record.parse(self.RECORD_STR_1)

    formatted = str(record)

    self.assertEqual(
        'Entry:\r\n'
        'Who: Gilbert, W.S. | Sullivan, Arthur\r\n'
        'What: The Yeomen of the Guard\r\n'
        'When/Created: 1888\r\n',
        formatted,
    )
def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int):
    '''Create the PhantomJS coprocessor for the given session.

    Args:
        session: Application session supplying ``args``, ``factory`` and
            ``default_user_agent``.
        proxy_port: Port number placed in the ``--proxy`` argument handed
            to the PhantomJS driver.

    Returns:
        The object produced by
        ``session.factory.new('PhantomJSCoprocessor', ...)``.
    '''
    args = session.args

    header_record = NameValueRecord()
    for header_string in args.header:
        header_record.parse(header_string)

    # PhantomJS only accepts a one-to-one mapping. These defaults go in
    # last because NameValueRecord.items() uses only the first value added
    # for each key.
    header_record.add('Accept-Language', '*')
    if not args.http_compression:
        header_record.add('Accept-Encoding', 'identity')
    custom_headers = dict(header_record.items())

    page_settings = {}
    if args.read_timeout:
        page_settings['resourceTimeout'] = args.read_timeout * 1000
    page_settings['userAgent'] = \
        args.user_agent or session.default_user_agent

    # Test early for executable
    wpull.driver.phantomjs.get_version(args.phantomjs_exe)

    params = PhantomJSParams(
        wait_time=args.phantomjs_wait,
        num_scrolls=args.phantomjs_scroll,
        smart_scroll=args.phantomjs_smart_scroll,
        snapshot=args.phantomjs_snapshot,
        custom_headers=custom_headers,
        page_settings=page_settings,
        load_time=args.phantomjs_max_time,
    )

    proxy_address = '{}:{}'.format(args.proxy_server_address, proxy_port)
    driver_factory = functools.partial(
        session.factory.class_map['PhantomJSDriver'],
        exe_path=args.phantomjs_exe,
        extra_args=['--proxy', proxy_address, '--ignore-ssl-errors=true'],
    )

    return session.factory.new(
        'PhantomJSCoprocessor',
        driver_factory,
        session.factory['ProcessingRule'],
        params,
        root_path=args.directory_prefix,
        warc_recorder=session.factory.get('WARCRecorder'),
    )
def test_missing_colon(self):
    '''A line without a colon raises by default and is dropped otherwise.'''
    malformed = 'text:hello\nhi\n'

    # Default parsing rejects the input.
    strict_record = NameValueRecord()
    with self.assertRaises(ValueError):
        strict_record.parse(malformed)

    # With strict=False the valid field is kept and the bad line ignored.
    lenient_record = NameValueRecord()
    lenient_record.parse(malformed, strict=False)

    self.assertEqual('hello', lenient_record['text'])
    self.assertNotIn('hi', lenient_record)
def test_name_value_record_setters(self):
    '''Set one field and read it back through each accessor.'''
    record = NameValueRecord()

    self.assertNotIn('cache', record)
    with self.assertRaises(KeyError):
        record['cache']

    record['cache'] = 'value1'

    # Lookups use a different casing from the one used to store the value.
    upper_lookup = record['CACHE']
    self.assertEqual('value1', upper_lookup)

    title_lookup = record.get_list('Cache')
    self.assertEqual(['value1'], title_lookup)

    all_pairs = list(record.get_all())
    self.assertEqual([('Cache', 'value1')], all_pairs)
def test_mixed_line_ending(self):
    '''Every field of MIXED_LINE_ENDING_STR_1 must be parsed correctly.'''
    # (field name, expected value), checked in this order.
    expected_pairs = (
        ('dog', 'woof'),
        ('cat', 'meow'),
        ('bird', 'tweet'),
        ('mouse', 'squeak'),
        ('cow', 'moo'),
        ('frog', 'croak'),
        ('elephant', 'toot'),
        ('duck', 'quack'),
        ('fish', 'blub'),
        ('seal', 'ow ow ow'),
        ('fox', '???'),
    )

    record = NameValueRecord()
    record.parse(self.MIXED_LINE_ENDING_STR_1)

    for animal, sound in expected_pairs:
        self.assertEqual(sound, record[animal])
def __init__(self, version, status_code, status_reason):
    '''Store the status-line parts and create empty fields and body.

    Args:
        version: The HTTP version from the status line.
        status_code: The status code from the status line.
        status_reason: The reason text from the status line.
    '''
    self.version, self.status_code, self.status_reason = (
        version, status_code, status_reason
    )
    self.fields = NameValueRecord()
    self.body = Body()
    # No URL information is attached at construction time.
    self.url_info = None
def _build_phantomjs_controller(self):
    '''Set up the proxy server, PhantomJS client and controller.

    Returns:
        The new ``PhantomJSController`` instance, or None when the
        ``phantomjs`` argument is not set.
    '''
    args = self._args

    if not args.phantomjs:
        return None

    # Start an HTTP proxy on an unused port for PhantomJS to talk through.
    proxy_server = self._factory.new(
        'HTTPProxyServer', self.factory['Client'])
    proxy_socket, proxy_port = tornado.testing.bind_unused_port()
    proxy_server.add_socket(proxy_socket)

    header_record = NameValueRecord()
    for header_string in args.header:
        header_record.parse(header_string)

    # PhantomJS only accepts a one-to-one mapping. These defaults go in
    # last because NameValueRecord.items() uses only the first value added
    # for each key.
    header_record.add('Accept-Language', '*')
    if not args.http_compression:
        header_record.add('Accept-Encoding', 'identity')
    header_mapping = dict(header_record.items())

    page_settings = {}
    if args.read_timeout:
        page_settings['resourceTimeout'] = args.read_timeout * 1000
    page_settings['userAgent'] = args.user_agent or self.default_user_agent

    client = self._factory.new(
        'PhantomJSClient',
        'localhost:{0}'.format(proxy_port),
        page_settings=page_settings,
        default_headers=header_mapping,
    )
    client.test_client_exe()

    return self._factory.new(
        'PhantomJSController',
        client,
        wait_time=args.phantomjs_wait,
        num_scrolls=args.phantomjs_scroll,
        warc_recorder=self.factory.get('WARCRecorder'),
        smart_scroll=args.phantomjs_smart_scroll,
    )
def __init__(self, status_code=None, reason=None, version='HTTP/1.1',
             request=None):
    '''Store the status-line parts and create an empty field record.

    Args:
        status_code: Integer status code, or None if not yet known.
        reason: Reason text; must be given whenever ``status_code`` is.
        version: The HTTP version string.
        request: The corresponding request, if any.
    '''
    code_given = status_code is not None
    if code_given:
        assert isinstance(status_code, int), \
            'Expect int, got {}'.format(type(status_code))
        assert reason is not None

    self.status_code, self.reason, self.version = (
        status_code, reason, version
    )
    # Header fields and the status line both use Latin-1.
    self.fields = NameValueRecord(encoding='latin-1')
    self.body = None
    self.request = request
    self.encoding = 'latin-1'
def __init__(self, method, resource_url, version='HTTP/1.1'):
    '''Store the request-line parts and create empty fields and body.

    Args:
        method: The HTTP method.
        resource_url: The URL of the requested resource.
        version: The HTTP version string.
    '''
    self.method, self.resource_url = method, resource_url
    # URL details and the address are not known at construction time.
    self.url_info = None
    self.version = version
    self.fields = NameValueRecord()
    self.body = Body()
    self.address = None
def _build_phantomjs_controller(self):
    '''Set up the proxy server, PhantomJS client and controller.

    Returns:
        The new ``PhantomJSController`` instance, or None when the
        ``phantomjs`` argument is not set.
    '''
    args = self._args

    if not args.phantomjs:
        return None

    # Start an HTTP proxy on an unused port for PhantomJS to talk through.
    proxy_server = self._factory.new(
        'HTTPProxyServer', self.factory['Client'])
    proxy_socket, proxy_port = tornado.testing.bind_unused_port()
    proxy_server.add_socket(proxy_socket)

    header_record = NameValueRecord()
    for header_string in args.header:
        header_record.parse(header_string)

    # PhantomJS only accepts a one-to-one mapping. These defaults go in
    # last because NameValueRecord.items() uses only the first value added
    # for each key.
    header_record.add('Accept-Language', '*')
    if not args.http_compression:
        header_record.add('Accept-Encoding', 'identity')
    header_mapping = dict(header_record.items())

    page_settings = {}
    if args.read_timeout:
        page_settings['resourceTimeout'] = args.read_timeout * 1000
    page_settings['userAgent'] = args.user_agent or self.default_user_agent

    client = self._factory.new(
        'PhantomJSClient',
        'localhost:{0}'.format(proxy_port),
        page_settings=page_settings,
        default_headers=header_mapping,
        exe_path=args.phantomjs_exe
    )
    # Check the executable before building the controller.
    client.test_client_exe()

    return self._factory.new(
        'PhantomJSController',
        client,
        wait_time=args.phantomjs_wait,
        num_scrolls=args.phantomjs_scroll,
        warc_recorder=self.factory.get('WARCRecorder'),
        smart_scroll=args.phantomjs_smart_scroll,
        snapshot=args.phantomjs_snapshot,
    )
def _populate_warcinfo(self, extra_fields=None):
    '''Fill the warcinfo record with software and format metadata.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs added
            after the standard fields.
    '''
    warcinfo = self._warcinfo_record
    warcinfo.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

    metadata = NameValueRecord()
    metadata['Software'] = 'Wpull/{0} Python/{1}'.format(
        wpull.version.__version__, wpull.util.python_version())
    metadata['format'] = 'WARC File Format 1.0'
    metadata['conformsTo'] = \
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

    for name, value in (extra_fields or ()):
        metadata.add(name, value)

    # The block is the serialized fields followed by a CRLF.
    block_bytes = bytes(metadata) + b'\r\n'
    warcinfo.block_file = io.BytesIO(block_bytes)
    warcinfo.compute_checksum()
def _populate_warcinfo(self, extra_fields=None):
    '''Fill the warcinfo record with software and format metadata.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs added
            after the standard fields.
    '''
    warcinfo = self._warcinfo_record
    warcinfo.set_common_fields(WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

    metadata = NameValueRecord()
    # Use the configured software string, else the class default.
    metadata['Software'] = (
        self._params.software_string or self.DEFAULT_SOFTWARE_STRING
    )
    metadata['format'] = 'WARC File Format 1.0'
    metadata['conformsTo'] = \
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

    for name, value in (extra_fields or ()):
        metadata.add(name, value)

    # The block is the serialized fields followed by a CRLF.
    block_bytes = bytes(metadata) + b'\r\n'
    warcinfo.block_file = io.BytesIO(block_bytes)
    warcinfo.compute_checksum()
def test_wrap_width(self):
    '''A long value is folded onto continuation lines at ``wrap_width``.'''
    wrapped_record = NameValueRecord(wrap_width=24)
    wrapped_record['blah'] = 'hello ' * 10

    expected_text = (
        'Blah: hello hello hello hello\r\n'
        ' hello hello hello \r\n'
        ' hello hello hello \r\n'
    )

    self.assertEqual(expected_text, str(wrapped_record))
def _populate_warcinfo(self, extra_fields=None):
    '''Write the standard metadata into the warcinfo record.

    Args:
        extra_fields: Optional iterable of ``(name, value)`` pairs added
            after the standard fields.
    '''
    target_record = self._warcinfo_record
    target_record.set_common_fields(
        WARCRecord.WARCINFO, WARCRecord.WARC_FIELDS)

    info = NameValueRecord()
    # Use the configured software string, else the class default.
    software = self._params.software_string or self.DEFAULT_SOFTWARE_STRING
    info['Software'] = software
    info['format'] = 'WARC File Format 1.0'
    info['conformsTo'] = \
        'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'

    for field_name, field_value in (extra_fields or ()):
        info.add(field_name, field_value)

    # Serialized fields plus a CRLF form the record block.
    target_record.block_file = io.BytesIO(bytes(info) + b'\r\n')
    target_record.compute_checksum()
def test_with_normalize_overrides(self):
    '''Names listed in ``normalize_overrides`` keep their given casing.'''
    override_name = 'WARC-Type'
    record = NameValueRecord(normalize_overrides=[override_name])

    # Add under the exact override casing.
    record.add('WARC-Type', 'warcinfo')

    self.assertIn('WARC-Type', record)
    self.assertEqual('warcinfo', record['WARC-Type'])
    self.assertEqual([('WARC-Type', 'warcinfo')], list(record.get_all()))
    self.assertEqual(['warcinfo'], record.get_list('Warc-Type'))
    self.assertEqual(['WARC-Type'], list(record.keys()))

    # Replace through a differently cased name; the stored name is kept.
    record['Warc-Type'] = 'resource'

    self.assertIn('WARC-Type', record)
    self.assertEqual('resource', record['WARC-Type'])
    self.assertEqual([('WARC-Type', 'resource')], list(record.get_all()))
    self.assertEqual(['resource'], record.get_list('Warc-Type'))
    self.assertEqual(['WARC-Type'], list(record.keys()))

    # A name that is not in the overrides is normalized.
    record['WARC-Blah'] = 'blah'

    key_names = list(record.keys())
    self.assertEqual(['WARC-Type', 'Warc-Blah'], key_names)
def _new_mock_response(self, response, file_path):
    '''Build a copy of ``response`` whose body is read from a file.

    Args:
        response: The response to copy.
        file_path: Path of the file opened in binary mode for the body.

    Returns:
        A shallow copy of ``response`` with a new body, a fresh copy of
        the fields, and ``Content-Type`` set to UTF-8 HTML.
    '''
    duplicate = copy.copy(response)
    duplicate.body = Body(open(file_path, 'rb'))

    # Give the copy its own field record so the original is not modified.
    copied_fields = NameValueRecord()
    duplicate.fields = copied_fields
    for field_name, field_value in response.fields.get_all():
        copied_fields.add(field_name, field_value)

    copied_fields['Content-Type'] = 'text/html; charset="utf-8"'

    return duplicate
def _new_phantomjs_response(self, response, content):
    '''Build a copy of ``response`` carrying the given text content.

    Args:
        response: The response to copy.
        content (str): Text written UTF-8 encoded into the body file.

    Returns:
        A shallow copy of ``response`` with a fresh copy of the fields
        and ``Content-Type`` set to UTF-8 HTML.
    '''
    duplicate = copy.copy(response)

    # NOTE(review): copy.copy() is shallow, so ``duplicate.body`` is the
    # same object as ``response.body`` and the assignment below also
    # replaces the original's content file -- confirm this is intended.
    shared_body = duplicate.body

    # tempfile needed for scripts that need a on-disk filename
    shared_body.content_file = tempfile.SpooledTemporaryFile(
        max_size=999999999)
    shared_body.content_file.write(content.encode('utf-8'))
    shared_body.content_file.seek(0)

    # Give the copy its own field record so the original is not modified.
    copied_fields = NameValueRecord()
    duplicate.fields = copied_fields
    for field_name, field_value in response.fields.get_all():
        copied_fields.add(field_name, field_value)

    copied_fields['Content-Type'] = 'text/html; charset="utf-8"'

    return duplicate
def test_copy(self):
    '''Deep-copying a populated record must not raise.'''
    populated = NameValueRecord()
    populated['blah'] = 'hello'

    # Check for no crash; the result is intentionally discarded.
    copy.deepcopy(populated)
class Response(SerializableMixin, DictableMixin, ProtocolResponseMixin):
    '''Represents the HTTP response.

    Attributes:
        status_code (int): The status code in the status line.
        reason (str): The status reason string in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.1``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP headers (and trailer, if present).
        body (:class:`.body.Body`, file-like, None): The optional payload
            (without any transfer or content encoding).
        request: The corresponding request.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self, status_code=None, reason=None, version='HTTP/1.1',
                 request=None):
        '''Create a response.

        Args:
            status_code (int, None): The status code, or None if it will
                be filled in later by :meth:`parse`.
            reason (str, None): The reason text; required whenever
                ``status_code`` is given.
            version (str): The HTTP version string.
            request: The corresponding request, if any.
        '''
        if status_code is not None:
            assert isinstance(status_code, int), \
                'Expect int, got {}'.format(type(status_code))
            assert reason is not None

        self.status_code = status_code
        self.reason = reason
        self.version = version
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.request = request
        self.encoding = 'latin-1'

    @property
    def protocol(self):
        '''The protocol name, always ``http``.'''
        return 'http'

    def to_dict(self):
        '''Return the response as a plain dict.

        ``response_code`` and ``response_message`` duplicate
        ``status_code`` and ``reason``.
        '''
        return {
            'protocol': 'http',
            'status_code': self.status_code,
            'reason': self.reason,
            'response_code': self.status_code,
            'response_message': self.reason,
            'version': self.version,
            'fields': list(self.fields.get_all()),
            'body': self.call_to_dict_or_none(self.body),
            'request': self.request.to_dict() if self.request else None,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the status line and fields to bytes.

        The result is the status line, the fields and an empty trailing
        element joined with CRLF.
        '''
        assert self.version
        assert self.status_code is not None
        assert self.reason is not None

        status = '{0} {1} {2}'.format(self.version, self.status_code,
                                      self.reason).encode(self.encoding)
        fields = self.fields.to_bytes(errors='replace')

        return b'\r\n'.join([status, fields, b''])

    def parse(self, data):
        '''Parse header bytes into this response.

        The first line is consumed as the status line only while
        ``status_code`` is still None; the remaining data is always passed
        to the field parser in non-strict mode.

        Args:
            data (bytes): The raw header data.
        '''
        if self.status_code is None:
            line, data = data.split(b'\n', 1)
            self.version, self.status_code, self.reason = \
                self.parse_status_line(line)

        self.fields.parse(data, strict=False)

    @classmethod
    def parse_status_line(cls, data):
        '''Parse the status line bytes.

        Args:
            data (bytes): The status line.

        Returns:
            tuple: A tuple representing the version, code, and reason.

        Raises:
            ProtocolError: The line does not match the expected pattern.
        '''
        match = re.match(
            br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)',
            data
        )

        if match:
            # The pattern has exactly three groups, so a successful match
            # always yields three values.
            version, code, reason = match.groups()

            return wpull.string.to_str(
                (version, int(code), reason),
                encoding='latin-1',
            )

        # ascii() already quotes the bytes, so no extra quotes are added.
        raise ProtocolError(
            'Error parsing status line {line}.'.format(line=ascii(data))
        )

    def __repr__(self):
        return '<Response({version}, {code}, {reason})>'.format(
            version=ascii(self.version), code=self.status_code,
            reason=ascii(self.reason)
        )

    def __str__(self):
        # NOTE(review): the bytes are encoded with ``self.encoding``
        # (latin-1 by default) but decoded here as UTF-8 with replacement
        # -- confirm this is intended for non-ASCII reason text.
        return wpull.string.printable_str(
            self.to_bytes().decode('utf-8', 'replace'), keep_newlines=True
        )

    def response_code(self):
        '''Return the status code.'''
        return self.status_code

    def response_message(self):
        '''Return the reason text.'''
        return self.reason
class RawRequest(SerializableMixin, DictableMixin):
    '''An HTTP request made of a request line, header fields and a body.

    Attributes:
        method (str): The HTTP method in the request line. For example,
            ``GET``, ``POST``.
        resource_path (str): The URL or "path" in the request line.
        version (str): The HTTP version in the request line. For example,
            ``HTTP/1.0``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP header.
        body (:class:`.body.Body`, file-like, None): An optional payload.
        encoding (str): The encoding of the request line.
    '''
    def __init__(self, method=None, resource_path=None, version='HTTP/1.1'):
        super().__init__()
        self.method, self.resource_path, self.version = (
            method, resource_path, version
        )
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.encoding = 'latin-1'

    def to_dict(self):
        '''Return the request as a plain dict.'''
        field_pairs = list(self.fields.get_all())
        body_dict = self.call_to_dict_or_none(self.body)

        return {
            'protocol': 'http',
            'method': self.method,
            'version': self.version,
            'resource_path': self.resource_path,
            'fields': field_pairs,
            'body': body_dict,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the request line and fields, each ended by CRLF.'''
        assert self.method
        assert self.resource_path
        assert self.version

        request_line = '{0} {1} {2}'.format(
            self.method, self.resource_path, self.version)
        encoded_line = request_line.encode(self.encoding)
        encoded_fields = self.fields.to_bytes(errors='replace')

        return b'\r\n'.join((encoded_line, encoded_fields, b''))

    def parse(self, data):
        '''Parse header bytes into this request.

        The first line is consumed as the request line only while
        ``resource_path`` is unset; the rest goes to the field parser in
        non-strict mode.
        '''
        remaining = data

        if not self.resource_path:
            first_line, remaining = data.split(b'\n', 1)
            parsed_line = self.parse_status_line(first_line)
            self.method, self.resource_path, self.version = parsed_line

        self.fields.parse(remaining, strict=False)

    def parse_status_line(self, data):
        '''Parse the request line bytes.

        Returns:
            tuple: A tuple representing the method, URI, and version.

        Raises:
            ProtocolError: The line does not match the expected pattern.
        '''
        match = re.match(
            br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)',
            data
        )

        if not match:
            raise ProtocolError('Error parsing status line.')

        # The pattern defines exactly three groups.
        method, resource, version = match.groups()

        return wpull.string.to_str(
            (method, resource, version),
            encoding=self.encoding,
        )

    def __repr__(self):
        template = '<Request({method}, {url}, {version})>'
        return template.format(
            method=self.method, url=self.resource_path,
            version=self.version
        )

    def copy(self):
        '''Return a deep copy of this request.'''
        return copy.deepcopy(self)

    def set_continue(self, offset):
        '''Turn the request into a range request starting at ``offset``.'''
        assert offset >= 0, offset
        range_value = 'bytes={0}-'.format(offset)
        self.fields['Range'] = range_value
def test_name_value_record_parsing(self):
    '''Parsing RECORD_STR_1 makes the ``who`` field available.'''
    parsed = NameValueRecord()
    parsed.parse(self.RECORD_STR_1)

    self.assertIn('who', parsed)

    who_value = parsed['who']
    self.assertEqual('Gilbert, W.S. | Sullivan, Arthur', who_value)
def __init__(self):
    '''Create a record with empty fields and no block file.'''
    # Names in NAME_OVERRIDES are passed through as normalize overrides.
    overrides = self.NAME_OVERRIDES
    self.fields = NameValueRecord(normalize_overrides=overrides)
    self.block_file = None
def test_name_value_utf8(self):
    '''A non-ASCII text value survives parsing unchanged.'''
    parsed = NameValueRecord()
    parsed.parse('''Name: dogé''')

    self.assertEqual('dogé', parsed['name'])
class RawRequest(BaseRequest, SerializableMixin, DictableMixin):
    '''An HTTP request made of a request line, header fields and a body.

    Attributes:
        method (str): The HTTP method in the request line. For example,
            ``GET``, ``POST``.
        resource_path (str): The URL or "path" in the request line.
        version (str): The HTTP version in the request line. For example,
            ``HTTP/1.0``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP header.
        body (:class:`.body.Body`, file-like, None): An optional payload.
        encoding (str): The encoding of the request line.
    '''
    def __init__(self, method=None, resource_path=None, version='HTTP/1.1'):
        super().__init__()
        self.method, self.resource_path, self.version = (
            method, resource_path, version
        )
        self.fields = NameValueRecord(encoding='latin-1')
        self.body = None
        self.encoding = 'latin-1'

    def to_dict(self):
        '''Return the request as a plain dict.'''
        field_pairs = list(self.fields.get_all())
        body_dict = self.call_to_dict_or_none(self.body)

        return {
            'protocol': 'http',
            'method': self.method,
            'version': self.version,
            'resource_path': self.resource_path,
            'fields': field_pairs,
            'body': body_dict,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the request line and fields, each ended by CRLF.'''
        assert self.method
        assert self.resource_path
        assert self.version

        request_line = '{0} {1} {2}'.format(
            self.method, self.resource_path, self.version)
        encoded_line = request_line.encode(self.encoding)
        encoded_fields = self.fields.to_bytes(errors='replace')

        return b'\r\n'.join((encoded_line, encoded_fields, b''))

    def parse(self, data):
        '''Parse header bytes into this request.

        The first line is consumed as the request line only while
        ``resource_path`` is unset; the rest goes to the field parser in
        non-strict mode.
        '''
        remaining = data

        if not self.resource_path:
            first_line, remaining = data.split(b'\n', 1)
            parsed_line = self.parse_status_line(first_line)
            self.method, self.resource_path, self.version = parsed_line

        self.fields.parse(remaining, strict=False)

    def parse_status_line(self, data):
        '''Parse the request line bytes.

        Returns:
            tuple: A tuple representing the method, URI, and version.

        Raises:
            ProtocolError: The line does not match the expected pattern.
        '''
        match = re.match(
            br'([a-zA-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/\d+\.\d+)', data)

        if not match:
            raise ProtocolError('Error parsing status line.')

        # The pattern defines exactly three groups.
        method, resource, version = match.groups()

        return wpull.string.to_str(
            (method, resource, version),
            encoding=self.encoding,
        )

    def __repr__(self):
        template = '<Request({method}, {url}, {version})>'
        return template.format(
            method=self.method, url=self.resource_path,
            version=self.version)

    def copy(self):
        '''Return a deep copy of this request.'''
        return copy.deepcopy(self)

    def set_continue(self, offset):
        '''Turn the request into a range request starting at ``offset``.'''
        assert offset >= 0, offset
        range_value = 'bytes={0}-'.format(offset)
        self.fields['Range'] = range_value
class Response(BaseResponse, SerializableMixin, DictableMixin):
    '''Represents the HTTP response.

    Attributes:
        status_code (int): The status code in the status line.
        reason (str): The status reason string in the status line.
        version (str): The HTTP version in the status line. For example,
            ``HTTP/1.1``.
        fields (:class:`.namevalue.NameValueRecord`): The fields in
            the HTTP headers (and trailer, if present).
        body (:class:`.body.Body`, file-like, None): The optional payload
            (without any transfer or content encoding).
        request: The corresponding request.
        encoding (str): The encoding of the status line.
    '''
    def __init__(self, status_code=None, reason=None, version='HTTP/1.1',
                 request=None):
        '''Create a response.

        Args:
            status_code (int, None): The status code, or None if it will
                be filled in later by :meth:`parse`.
            reason (str, None): The reason text; required whenever
                ``status_code`` is given.
            version (str): The HTTP version string.
            request: The corresponding request, if any.
        '''
        super().__init__()

        if status_code is not None:
            assert isinstance(status_code, int), \
                'Expect int, got {}'.format(type(status_code))
            assert reason is not None

        self.status_code = status_code
        self.reason = reason
        self.version = version
        self.fields = NameValueRecord(encoding='latin-1')
        self.request = request
        self.encoding = 'latin-1'

    @property
    def protocol(self):
        '''The protocol name, always ``http``.'''
        return 'http'

    def to_dict(self):
        '''Return the response as a plain dict.

        ``response_code`` and ``response_message`` duplicate
        ``status_code`` and ``reason``.
        '''
        return {
            'protocol': 'http',
            'status_code': self.status_code,
            'reason': self.reason,
            'response_code': self.status_code,
            'response_message': self.reason,
            'version': self.version,
            'fields': list(self.fields.get_all()),
            'body': self.call_to_dict_or_none(self.body),
            'request': self.request.to_dict() if self.request else None,
            'encoding': self.encoding,
        }

    def to_bytes(self):
        '''Serialize the status line and fields to bytes.

        The result is the status line, the fields and an empty trailing
        element joined with CRLF.
        '''
        assert self.version
        assert self.status_code is not None
        assert self.reason is not None

        status = '{0} {1} {2}'.format(self.version, self.status_code,
                                      self.reason).encode(self.encoding)
        fields = self.fields.to_bytes(errors='replace')

        return b'\r\n'.join([status, fields, b''])

    def parse(self, data):
        '''Parse header bytes into this response.

        The first line is consumed as the status line only while
        ``status_code`` is still None; the remaining data is always passed
        to the field parser in non-strict mode.

        Args:
            data (bytes): The raw header data.
        '''
        if self.status_code is None:
            line, data = data.split(b'\n', 1)
            self.version, self.status_code, self.reason = \
                self.parse_status_line(line)

        self.fields.parse(data, strict=False)

    @classmethod
    def parse_status_line(cls, data):
        '''Parse the status line bytes.

        Args:
            data (bytes): The status line.

        Returns:
            tuple: A tuple representing the version, code, and reason.

        Raises:
            ProtocolError: The line does not match the expected pattern.
        '''
        match = re.match(
            br'(HTTP/\d+\.\d+)[ \t]+([0-9]{1,3})[ \t]*([^\r\n]*)', data)

        if match:
            # The pattern has exactly three groups, so a successful match
            # always yields three values.
            version, code, reason = match.groups()

            return wpull.string.to_str(
                (version, int(code), reason),
                encoding='latin-1',
            )

        # ascii() already quotes the bytes, so no extra quotes are added.
        raise ProtocolError(
            'Error parsing status line {line}.'.format(line=ascii(data)))

    def __repr__(self):
        return '<Response({version}, {code}, {reason})>'.format(
            version=ascii(self.version), code=self.status_code,
            reason=ascii(self.reason))

    def __str__(self):
        # NOTE(review): the bytes are encoded with ``self.encoding``
        # (latin-1 by default) but decoded here as UTF-8 with replacement
        # -- confirm this is intended for non-ASCII reason text.
        return wpull.string.printable_str(self.to_bytes().decode(
            'utf-8', 'replace'), keep_newlines=True)

    def response_code(self):
        '''Return the status code.'''
        return self.status_code

    def response_message(self):
        '''Return the reason text.'''
        return self.reason