def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in future.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the
    given encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>

    """
    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning)
    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
def __init__(self, settings):
    server = settings.get('KAFKA_LOCATION')
    self.topic_todo = to_bytes(settings.get('OUTGOING_TOPIC', "frontier-todo"))
    self.topic_done = to_bytes(settings.get('INCOMING_TOPIC', "frontier-done"))
    self.topic_scoring = to_bytes(settings.get('SCORING_TOPIC'))
    self.general_group = to_bytes(settings.get('FRONTIER_GROUP', "general"))
    self.sw_group = to_bytes(settings.get('SCORING_GROUP', "strategy-workers"))
    self.spider_partition_id = settings.get('SPIDER_PARTITION_ID')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.hostname_partitioning = settings.get('QUEUE_HOSTNAME_PARTITIONING')

    self.codec = None
    codec = settings.get('KAFKA_CODEC_LEGACY')
    if codec == 'none':
        from kafka.protocol import CODEC_NONE
        self.codec = CODEC_NONE
    if codec == 'snappy':
        from kafka.protocol import CODEC_SNAPPY
        self.codec = CODEC_SNAPPY
    if codec == 'gzip':
        from kafka.protocol import CODEC_GZIP
        self.codec = CODEC_GZIP
    if self.codec is None:
        raise NameError("Non-existent Kafka compression codec.")

    self.conn = KafkaClient(server)
def test_local_hostname_fingerprint_bytes(self):
    assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13'
    assert hostname_local_fingerprint(to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8'
    assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76'
def test_sha1_bytes(self):
    assert sha1(to_bytes(url1)) == b'880c5e7919cb09e182bd639d724bce6d90db71eb'
    assert sha1(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d'
    assert sha1(to_bytes(url3)) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e'
def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in future.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the
    given encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')  # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>

    """
    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning)
    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode URL path component
    which is then quoted. Otherwise, if quote_path is False, path component
    is not encoded or quoted. Given encoding is used for query string
    or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    else:
        path = to_native_str(parts.path)

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        path,

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
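# --- Usage sketch (illustrative, not part of the function above) --------------
# A minimal, hedged example of calling safe_url_string(); the output shown is
# what recent w3lib releases produce and may differ in older versions.
# Non-ASCII path characters are percent-encoded with path_encoding (UTF-8 by
# default), while the query string follows the page `encoding` passed in.
from w3lib.url import safe_url_string

print(safe_url_string(u'http://www.example.com/£?unit=µ', encoding='latin-1'))
# expected: 'http://www.example.com/%C2%A3?unit=%B5'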
def test_request_response_converters():
    spider = TestSpider()
    rc = RequestConverter(spider)
    rsc = ResponseConverter(spider, rc)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url, callback=spider.callback, errback=spider.errback,
                            body=REQUEST_BODY)
    request.meta[b'test_param'] = b'test_value'
    request.headers.appendlist(b"TestKey", b"test value")
    request.cookies[b'MyCookie'] = b'CookieContent'

    frontier_request = rc.to_frontier(request)
    assert frontier_request.meta[b'scrapy_callback'] == b'callback'
    assert frontier_request.meta[b'scrapy_errback'] == b'errback'
    assert frontier_request.body == to_bytes(REQUEST_BODY)
    assert frontier_request.url == url
    assert frontier_request.method == b'GET'
    assert frontier_request.headers[b'Testkey'] == b'test value'
    assert frontier_request.cookies[b'MyCookie'] == b'CookieContent'
    assert b'frontier_request' not in frontier_request.meta[b'scrapy_meta']

    request_converted = rc.from_frontier(frontier_request)
    assert request_converted.meta[b'test_param'] == b'test_value'
    assert request_converted.body == to_bytes(REQUEST_BODY)
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies[b'MyCookie'] == b'CookieContent'
    assert request_converted.headers.get(b'Testkey') == b'test value'
    assert request_converted.callback == spider.callback
    assert request_converted.errback == spider.errback

    # Some middleware could change .meta contents
    request_converted.meta[b'middleware_stuff'] = b'appeared'

    response = ScrapyResponse(url=url, request=request_converted, body=RESPONSE_BODY,
                              headers={b'TestHeader': b'Test value'})

    frontier_response = rsc.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta[b'scrapy_meta'][b'test_param'] == b'test_value'
    assert frontier_response.meta[b'scrapy_meta'][b'middleware_stuff'] == b'appeared'
    assert frontier_response.status_code == 200
    assert b'frontier_request' not in frontier_response.meta[b'scrapy_meta']

    response_converted = rsc.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta[b'test_param'] == b'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers[b'TestHeader'] == b'Test value'

    frontier_request = FrontierRequest(url)
    request_converted = rc.from_frontier(frontier_request)
    assert frontier_request.url == url
def get_meta_refresh(text, baseurl='', encoding='utf-8',
                     ignore_tags=('script', 'noscript')):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an
    integer containing the delay in seconds (or zero if not present) and url
    is a string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.
    """
    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
def __init__(self, connection, partitions, table_name, drop=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)

    if self.table_name not in tables:
        self.connection.create_table(self.table_name,
                                     {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

    class DumbResponse:
        pass

    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
def __init__(self, connection, table_name, cache_size_limit,
             write_log_size, drop_all_tables):
    self.connection = connection
    self._table_name = to_bytes(table_name)
    self.logger = logging.getLogger("hbase.states")
    self._state_batch = self.connection.table(self._table_name).batch(batch_size=write_log_size)
    self._state_stats = defaultdict(int)
    self._state_cache = LRUCacheWithStats(maxsize=cache_size_limit, stats=self._state_stats)
    self._state_last_updates = 0

    tables = set(connection.tables())
    if drop_all_tables and self._table_name in tables:
        connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)

    if self._table_name not in tables:
        schema = {
            's': {
                'max_versions': 1,
                'block_cache_enabled': 1,
                'bloom_filter_type': 'ROW',
                'in_memory': True,
            }
        }
        connection.create_table(self._table_name, schema)
def test_middleware_output(result):
    out = list(result)
    self.assertEquals(len(out), 1)
    self.assertIsInstance(out[0], Request)
    self.assertIn('Referer', out[0].headers)
    self.assertEquals(out[0].headers['Referer'], to_bytes('http://www.scrapy.org'))
def __init__(self, connection, table_name, drop_all_tables, use_snappy,
             batch_size, store_content):
    self._table_name = to_bytes(table_name)
    tables = set(connection.tables())
    if drop_all_tables and self._table_name in tables:
        connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)

    if self._table_name not in tables:
        schema = {
            'm': {'max_versions': 1},
            's': {
                'max_versions': 1,
                'block_cache_enabled': 1,
                'bloom_filter_type': 'ROW',
                'in_memory': True,
            },
            'c': {'max_versions': 1},
        }
        if use_snappy:
            schema['m']['compression'] = 'SNAPPY'
            schema['c']['compression'] = 'SNAPPY'
        connection.create_table(self._table_name, schema)

    table = connection.table(self._table_name)
    self.batch = table.batch(batch_size=batch_size)
    self.store_content = store_content
def _store_item_batch(self, key, value):
    data = {}
    self._key_check(key)
    for k, v in six.iteritems(value):
        if k.startswith('_'):
            continue
        # convert set to list manually for successful serialization
        v = restruct_for_pack(v)
        k = to_bytes(k)
        data[b"m:%s" % k] = packb(v, use_bin_type=True)

    tries = 3
    while data and tries > 0:
        try:
            self._batch.put(key, data)
        except ValueError:
            self.logger.exception("Exception happened during item storing, %d tries left", tries)
            data_lengths = dict((k, len(v)) for k, v in six.iteritems(data))
            self.logger.info("RK %s per-column lengths %s", key, str(data_lengths))
            for k, length in list(data_lengths.items()):
                if length > self.MAX_VALUE_SIZE:
                    self.logger.info("Dropping key %s", k)
                    del data[k]
            tries -= 1
            continue
        else:
            break
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues new batch of requests for crawling.

    :param max_n_requests: maximum number of requests to return
    :param partition_id: partition id
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    results = []
    try:
        for item in self._order_by(self.session.query(self.queue_model)
                                   .filter_by(partition_id=partition_id)).limit(max_n_requests):
            method = item.method or b'GET'
            r = Request(item.url, method=method, meta=item.meta, headers=item.headers,
                        cookies=item.cookies)
            r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
            r.meta[b'score'] = item.score
            results.append(r)
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:
        self.logger.exception(exc)
        self.session.rollback()
    return results
def _store_item_batch(self, key, value):
    data = {}
    self._key_check(key)
    for k, v in six.iteritems(value):
        if k.startswith('_'):
            continue
        # convert set to list manually for successful serialization
        v = restruct_for_pack(v)
        k = to_bytes(k)
        data[b"m:%s" % k] = packb(v, use_bin_type=True)

    tries = 3
    while data and tries > 0:
        try:
            self._batch.put(key, data)
        except ValueError:
            self.logger.exception("Exception happened during item storing, %d tries left", tries)
            data_lengths = dict((k, len(v)) for k, v in six.iteritems(data))
            self.logger.info("RK %s per-column lengths %s", key, str(data_lengths))
            for k, length in data_lengths.items():
                if length > self.MAX_VALUE_SIZE:
                    self.logger.info("Dropping key %s", k)
                    del data[k]
            tries -= 1
            continue
        else:
            break
def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)

    if self.table_name not in tables:
        schema = {'f': {'max_versions': 1}}
        if use_snappy:
            schema['f']['compression'] = 'SNAPPY'
        self.connection.create_table(self.table_name, schema)

    class DumbResponse:
        pass

    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overridden by path_encoding), and given
    encoding is used for query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding, errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def fetch(self, fingerprints):
    to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache]
    self.logger.debug("cache size %s", len(self._cache))
    self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))
    for chunk in chunks(to_fetch, 128):
        for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)):
            self._cache[to_bytes(state.fingerprint)] = state.state
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overridden by path_encoding), and given
    encoding is used for query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding, errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def get_crc32(name):
    """Signed crc32 of bytes or unicode.

    In python 3, return the same number as in python 2, converting to
    [-2**31, 2**31-1] range. This is done to maintain backwards compatibility
    with python 2, since checksums are stored in the database, so this allows
    to keep the same database schema.
    """
    return to_signed32(crc32(to_bytes(name, 'utf-8', 'ignore')))
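# --- Usage sketch (illustrative) -----------------------------------------------
# Assuming the get_crc32() above: bytes and unicode input give the same checksum,
# and the result is folded into the signed 32-bit range so it matches values that
# Python 2 would have stored.
assert get_crc32(u'example.com') == get_crc32(b'example.com')
assert -2**31 <= get_crc32('example.com') <= 2**31 - 1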
def sw_activity(self):
    c = 0
    p = 0
    for m in self.sw_sl_c.get_messages(timeout=1.0, count=512):
        if m.startswith(b'http://helloworld.com/'):
            p += 1
            self.sw_us_p.send(None, b'message' + b'0' + b"," + to_bytes(str(c)))
        c += 1
    assert p > 0
    return c
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars))
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overridden by path_encoding), and given
    encoding is used for query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # it is assumed that a raw bytes input comes from the page
    # corresponding to the encoding
    #
    # Note: if this assumption is wrong, this will fail;
    #       in the general case, users are required to use Unicode
    #       or safe ASCII bytes input
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    )
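# --- Usage sketch (illustrative, assuming the _safe_ParseResult() above) --------
# The helper is typically fed the output of urlparse() and its result recomposed
# with urlunparse(), which is roughly how w3lib's canonicalize_url() uses it.
from urllib.parse import urlparse, urlunparse

parts = urlparse(u'http://www.example.com/a b?q=µ')
print(urlunparse(_safe_ParseResult(parts, encoding='utf8')))
# expected: 'http://www.example.com/a%20b?q=%C2%B5'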
def fetch(self, fingerprints):
    to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache]
    self.logger.debug("cache size %s", len(self._cache))
    self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))
    for chunk in chunks(to_fetch, 128):
        for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)):
            self._cache[to_bytes(state.fingerprint)] = state.state
def __contains__(self, key):
    self._key_check(key)
    self.stats["contains"] += 1
    if super(DomainCache, self).__contains__(key):
        self.stats["contains_in_memory"] += 1
        return True
    if key in self._second_gen:
        self.stats["contains_in_secgen"] += 1
        return True
    if self._table.row(to_bytes(key)):
        self.stats["contains_in_hbase"] += 1
        return True
    self.stats["contains_false"] += 1
    return False
def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''):
    """
    :param string url: URL to send.
    :param string method: HTTP method to use.
    :param dict headers: dictionary of headers to send.
    :param dict cookies: dictionary of cookies to attach to this request.
    :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
    the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items.
    """
    self._url = to_native_str(url)
    self._method = to_bytes((method or b'GET').upper())
    self._headers = headers or {}
    self._cookies = cookies or {}
    self._meta = meta or {b'scrapy_meta': {}}
    self._body = body
def __init__(self, maxsize, connection, table_name, set_fields=None,
             on_get_func=None, batch_size=100):
    super(DomainCache, self).__init__(maxsize)

    self._second_gen = dict()

    table_name = to_bytes(table_name)
    self._table = self._get_domain_table(connection, table_name)
    self._batch = HardenedBatch(self._table, batch_size=batch_size)

    self._set_fields = set(set_fields) if set_fields else set()
    self._on_get_func = on_get_func

    self.logger = logging.getLogger("domain-cache")
    self.stats = defaultdict(int)
    self.next_log = time() + self.LOG_INTERVAL
    self.batch_size = batch_size
def __delitem__(self, key):
    self._key_check(key)
    not_found = True
    if super(DomainCache, self).__contains__(key):
        super(DomainCache, self).__delitem__(key)
        not_found = False
    if key in self._second_gen:
        del self._second_gen[key]
        not_found = False
    rk = to_bytes(key)
    if self._table.row(rk):
        self._table.delete(rk)
        not_found = False
    if not_found:
        raise KeyError
def __init__(self, connection, table_name, cache_size_limit, drop_all_tables):
    self.connection = connection
    self._table_name = to_bytes(table_name)
    self.logger = logging.getLogger("hbase.states")
    self._state_cache = {}
    self._cache_size_limit = cache_size_limit

    tables = set(connection.tables())
    if drop_all_tables and self._table_name in tables:
        connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)

    if self._table_name not in tables:
        schema = {'s': {'max_versions': 1,
                        'block_cache_enabled': 1,
                        'bloom_filter_type': 'ROW',
                        'in_memory': True,
                        }
                  }
        connection.create_table(self._table_name, schema)
def parse_domain_info(self, url, test_mode=False):
    if test_mode:
        # raw string avoids the invalid escape sequence warning for \w
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = to_bytes(match.groups()[0]) if match else b'?'
        scheme = sld = tld = subdomain = b'-'
    else:
        netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url)
    return {
        b'netloc': to_bytes(netloc),
        b'name': to_bytes(name),
        b'scheme': to_bytes(scheme),
        b'sld': to_bytes(sld),
        b'tld': to_bytes(tld),
        b'subdomain': to_bytes(subdomain),
    }
def _parse_domain_info(self, url, test_mode=False):
    if test_mode:
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = to_bytes(match.groups()[0]) if match else b'?'
        scheme = sld = tld = subdomain = b'-'
    else:
        netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url)
    return {
        b'netloc': to_bytes(netloc),
        b'name': to_bytes(name),
        b'scheme': to_bytes(scheme),
        b'sld': to_bytes(sld),
        b'tld': to_bytes(tld),
        b'subdomain': to_bytes(subdomain),
    }
def __init__(self, connection, partitions, table_name, drop=False):
    self.connection = connection
    self.partitions = [i for i in range(0, partitions)]
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)

    if self.table_name not in tables:
        self.connection.create_table(self.table_name,
                                     {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

    class DumbResponse:
        pass

    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
def basic_auth_header(username: AnyStr, password: AnyStr,
                      encoding: str = "ISO-8859-1") -> bytes:
    """
    Return an `Authorization` header field value for
    `HTTP Basic Access Authentication (RFC 2617)`_

    >>> import w3lib.http
    >>> w3lib.http.basic_auth_header('someuser', 'somepass')
    b'Basic c29tZXVzZXI6c29tZXBhc3M='

    .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt
    """
    auth = f"{to_unicode(username)}:{to_unicode(password)}"
    # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
    # seems to be the most widely used encoding here. See also:
    # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
    return b"Basic " + urlsafe_b64encode(to_bytes(auth, encoding=encoding))
def _get_item(self, key):
    self.stats["hbase_gets"] += 1
    hbase_key = to_bytes(key)
    row = self._table.row(hbase_key)
    if not row:
        self.stats["hbase_misses"] += 1
        super(DomainCache, self).__missing__(key)
        raise KeyError
    value = {}
    for k, v in six.iteritems(row):
        cf, _, col = k.partition(b':')
        col = to_native_str(col)
        value[col] = unpackb(v, encoding='utf-8')
        # XXX extract some fields as a set for faster in-checks
        if col in self._set_fields:
            value[col] = set(value[col])
    if self._on_get_func:
        self._on_get_func(value)
    return value
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify
    the document in storage. ``hostname_local_fingerprint`` builds the fingerprint
    from the first 4 bytes of the CRC32 of the host, followed by the MD5 of the rest
    of the URL. The default option is set to make use of the HBase block cache: all
    the documents of an average website are expected to fit within one cache block,
    which can be efficiently read from disk once.

    :param key: str URL
    :return: str 20 bytes hex string
    """
    result = parse_url(key)
    hostname = result.hostname if result.hostname else '-'
    host_checksum = get_crc32(hostname)
    combined = hostname + result.path + ';' + result.params + result.query + result.fragment
    combined = to_bytes(combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return fprint
def _convert_from_saved_type(obj):
    """
    :param obj: object returned by `_convert_and_save_type`

    Restores the original state of the object converted earlier by
    `_convert_and_save_type`. This method considers every first element of the
    nested tuple as the original type information and the second value to be
    the converted value. It applies the original type recursively on the object
    to retrieve the original form of the object.
    """
    assert len(obj) == 2
    obj_type, obj_value = obj
    if obj_type == 'bytes':
        return to_bytes(obj_value)
    elif obj_type == 'dict':
        return dict([(_convert_from_saved_type(k), _convert_from_saved_type(v)) for k, v in obj_value])
    elif obj_type in ['list', 'tuple']:
        _type = list if obj_type == 'list' else tuple
        return _type([_convert_from_saved_type(item) for item in obj_value])
    return obj_value
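# --- Usage sketch (illustrative) -----------------------------------------------
# Hand-built input in the (type-tag, value) shape produced by the companion
# _convert_and_save_type() helper (not shown here); restoring it yields the
# original bytes/str/int values.
saved = ('dict', [(('bytes', 'fingerprint'), ('bytes', 'abc123')),
                  (('str', 'depth'), ('int', 2))])
assert _convert_from_saved_type(saved) == {b'fingerprint': b'abc123', 'depth': 2}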
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify
    the document in storage. ``hostname_local_fingerprint`` builds the fingerprint
    from the first 4 bytes of the CRC32 of the host, followed by the MD5 of the rest
    of the URL. The default option is set to make use of the HBase block cache: all
    the documents of an average website are expected to fit within one cache block,
    which can be efficiently read from disk once.

    :param key: str URL
    :return: str 20 bytes hex string
    """
    result = parse_url(key)
    if not result.hostname:
        return sha1(key)
    host_checksum = get_crc32(result.hostname)
    doc_uri_combined = result.path + ';' + result.params + result.query + result.fragment
    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(doc_uri_combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return to_native_str(fprint, 'utf8')
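# --- Usage sketch (illustrative) -----------------------------------------------
# With the hostname_local_fingerprint() above, the first 4 bytes (8 hex chars) of
# the fingerprint are the CRC32 of the host, so URLs from the same host share a
# prefix and end up close together in HBase.
fp1 = hostname_local_fingerprint('https://news.example.com/article?id=1')
fp2 = hostname_local_fingerprint('https://news.example.com/other')
assert len(fp1) == 40        # 20 bytes, hex-encoded
assert fp1[:8] == fp2[:8]    # same host -> same CRC32 prefix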
def __init__(self, connection, table_name, drop_all_tables, use_snappy,
             batch_size, store_content):
    self._table_name = to_bytes(table_name)
    tables = set(connection.tables())
    if drop_all_tables and self._table_name in tables:
        connection.delete_table(self._table_name, disable=True)
        tables.remove(self._table_name)

    if self._table_name not in tables:
        schema = {'m': {'max_versions': 1},
                  's': {'max_versions': 1,
                        'block_cache_enabled': 1,
                        'bloom_filter_type': 'ROW',
                        'in_memory': True,
                        },
                  'c': {'max_versions': 1}
                  }
        if use_snappy:
            schema['m']['compression'] = 'SNAPPY'
            schema['c']['compression'] = 'SNAPPY'
        connection.create_table(self._table_name, schema)

    table = connection.table(self._table_name)
    self.batch = table.batch(batch_size=batch_size)
    self.store_content = store_content
def get_next_requests(self, max_n_requests, partition_id, score, **kwargs):
    results = []
    try:
        queue = self.queue_model
        query = (self.session.query(queue)
                 .filter(queue.partition_id == partition_id, queue.score >= score)
                 .order_by(queue.created_at)
                 .limit(max_n_requests))
        for item in query:
            method = item.method or b'GET'
            r = Request(item.url, method=method, meta=item.meta,
                        headers=item.headers, cookies=item.cookies)
            fp = item.fingerprint
            msg = f"retrieved request {fp[:6]}...{fp[-6:]}"
            self.logger.info(msg)
            r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
            r.meta[b'score'] = item.score
            results.append(r)
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:
        self.logger.exception(exc)
        self.session.rollback()
    self.logger.info(f"Got {len(results)} next requests with score {score}")
    return results
def get_crc32(name):
    return crc32(to_bytes(name, 'utf-8', 'ignore'))
def md5(key):
    return hashlib.md5(to_bytes(key, 'utf8')).hexdigest()
def sha1(key):
    return hashlib.sha1(to_bytes(key, 'utf8')).hexdigest()
def _find_method(obj, func):
    if obj and hasattr(func, '__self__') and func.__self__ is obj:
        return to_bytes(func.__func__.__name__)
    else:
        raise ValueError("Function %s is not a method of: %s" % (func, obj))
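# --- Usage sketch (illustrative) -----------------------------------------------
# _find_method() maps a bound method back to its name as bytes (e.g. to serialise
# spider callbacks by name); anything that is not a bound method of `obj` raises
# ValueError. The _Spider class below is a hypothetical stand-in for a real spider.
class _Spider(object):
    def parse(self, response):
        pass

_spider = _Spider()
assert _find_method(_spider, _spider.parse) == b'parse'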
def test_middleware_output(result):
    out = list(result)
    self.assertEquals(len(out), 1)
    self.assertIsInstance(out[0], Request)
    self.assertIn('Referer', out[0].headers)
    self.assertEquals(out[0].headers['Referer'], to_bytes(res.url))
def finalize(self):
    """Write all buffered files to archive."""
    for zinfo, contents, compress_type in self._files.values():
        self._writestr(zinfo, to_bytes(contents), compress_type)
    self._files = {}
def test_sha1_bytes(self):
    assert sha1(to_bytes(url1)) == b'880c5e7919cb09e182bd639d724bce6d90db71eb'
    assert sha1(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d'
    assert sha1(to_bytes(url3)) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e'
def test_md5_bytes(self):
    assert md5(to_bytes(url1)) == b'bb82110ce034c1a6ad55a2e73adc322a'
    assert md5(to_bytes(url2)) == b'5b56f40f8828701f97fa4511ddcd25fb'
    assert md5(to_bytes(url3)) == b'5abf5c9aa02d870756032bdec0bd6522'
def test_local_hostname_fingerprint_bytes(self):
    assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13'
    assert hostname_local_fingerprint(to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8'
    assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76'