def schedule(self, batch):
    to_save = []
    for fprint, score, request, schedule in batch:
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            else:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            q = self.queue_model(fingerprint=to_native_str(fprint),
                                 score=score,
                                 url=request.url,
                                 meta=request.meta,
                                 headers=request.headers,
                                 cookies=request.cookies,
                                 method=to_native_str(request.method),
                                 partition_id=partition_id,
                                 host_crc32=host_crc32,
                                 created_at=time() * 1E+6)
            to_save.append(q)
            request.meta[b'state'] = States.QUEUED
    self.session.bulk_save_objects(to_save)
    self.session.commit()
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode the URL path component,
    which is then quoted. Otherwise, if quote_path is False, the path component
    is not encoded or quoted. The given encoding is used for the query string
    or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    else:
        path = to_native_str(parts.path)

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        path,
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
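A brief usage sketch for safe_url_string above. Hedged: it assumes the function is importable from w3lib.url (which ships a function with this signature); outputs are illustrative.

from w3lib.url import safe_url_string  # assumed home of the function above

# Non-ASCII path characters are percent-encoded as UTF-8 by default.
print(safe_url_string(u'http://www.example.com/£'))
# -> http://www.example.com/%C2%A3

# Bytes input is first decoded with the page encoding, so a latin-1 page
# yields the same safe URL.
print(safe_url_string(b'http://www.example.com/\xa3', encoding='latin-1'))
# -> http://www.example.com/%C2%A3

# An already "safe" URL comes back unmodified ('%' is in _safe_chars).
assert safe_url_string('http://example.com/a%20b') == 'http://example.com/a%20b'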
def _request_from_object(self, obj):
    return self._request_model(url=to_native_str(obj[0]),
                               method=obj[1],
                               headers=obj[2],
                               cookies=obj[3],
                               meta=obj[4],
                               body=obj[5])
def read_seeds(self, stream):
    processed, scheduled = 0, 0
    requests = []
    for line in stream:
        url = to_native_str(line.strip())
        if url.startswith("#"):
            continue
        if not url.startswith("http"):
            url = "http://" + url + "/"
        try:
            request = self.create_request(url, meta={b'home': True}, headers=DEFAULT_HEADERS)
            requests.append(request)
            if len(requests) % 40000 == 0:
                scheduled += self._schedule_batch(requests)
                processed += len(requests)
                self.logger.info("Processed %d, scheduled %d urls.", processed, scheduled)
                requests = []
        except Exception:
            self.logger.exception("Error during seeds addition")
    if requests:
        try:
            scheduled += self._schedule_batch(requests)
        except Exception:
            self.logger.exception("Error during seeds addition")
        processed += len(requests)
    self.logger.info("Processed %d, and scheduled %d urls overall.", processed, scheduled)
def _response_from_object(self, obj):
    url = to_native_str(obj[0])
    return self._response_model(url=url,
                                status_code=obj[1],
                                body=obj[3],
                                request=self._request_model(url=url, meta=obj[2]))
def from_frontier(self, frontier_request):
    """request: Frontier > Scrapy"""
    cb = frontier_request.meta.get(b'scrapy_callback', None)
    if cb and self.spider:
        cb = _get_method(self.spider, cb)
    eb = frontier_request.meta.get(b'scrapy_errback', None)
    if eb and self.spider:
        eb = _get_method(self.spider, eb)
    body = frontier_request.meta.get(b'scrapy_body', None)
    meta = frontier_request.meta[b'scrapy_meta']
    meta.pop('cf_store', None)
    for attr, val in frontier_request.meta.get(b'spider_state', []):
        prev_value = getattr(self.spider, attr, None)
        if prev_value is not None and prev_value != val:
            _LOG.error("State change for attribute '%s' from '%s' to '%s' attempted by request <%s>, "
                       "so crawl may lose consistency. Per-request state should be propagated via "
                       "request attributes.", attr, prev_value, val, frontier_request.url)
        elif prev_value != val:
            setattr(self.spider, attr, val)
            _LOG.info("State for attribute '%s' set to %s by request <%s>", attr, val, frontier_request.url)
    return ScrapyRequest(url=frontier_request.url,
                         callback=cb,
                         errback=eb,
                         body=body,
                         method=to_native_str(frontier_request.method),
                         headers=frontier_request.headers,
                         cookies=frontier_request.cookies,
                         meta=meta,
                         dont_filter=True)
def decode_request(self, message):
    obj = dict_to_bytes(super(Decoder, self).decode(message))
    return self._request_model(url=to_native_str(obj[b'url']),
                               method=obj[b'method'],
                               headers=obj[b'headers'],
                               cookies=obj[b'cookies'],
                               meta=obj[b'meta'])
def decode(self, message):
    message = dict_to_bytes(super(Decoder, self).decode(message))
    if message[b'type'] == b'links_extracted':
        request = self._request_from_object(message[b'r'])
        links = [self._request_from_object(link) for link in message[b'links']]
        return ('links_extracted', request, links)
    if message[b'type'] == b'page_crawled':
        response = self._response_from_object(message[b'r'])
        return ('page_crawled', response)
    if message[b'type'] == b'request_error':
        request = self._request_from_object(message[b'r'])
        return ('request_error', request, to_native_str(message[b'error']))
    if message[b'type'] == b'update_score':
        return ('update_score', self._request_from_object(message[b'r']),
                message[b'score'], message[b'schedule'])
    if message[b'type'] == b'add_seeds':
        seeds = []
        for seed in message[b'seeds']:
            request = self._request_from_object(seed)
            seeds.append(request)
        return ('add_seeds', seeds)
    if message[b'type'] == b'new_job_id':
        return ('new_job_id', int(message[b'job_id']))
    if message[b'type'] == b'offset':
        return ('offset', int(message[b'partition_id']), int(message[b'offset']))
    raise TypeError('Unknown message type')
def flush(self, force_clear=False):
    for fingerprint, state_val in six.iteritems(self._cache):
        state = self.model(fingerprint=to_native_str(fingerprint), state=state_val)
        self.session.merge(state)
    self.session.commit()
    self.logger.debug("State cache has been flushed.")
    super(States, self).flush(force_clear)
def _response_from_object(self, obj):
    url = to_native_str(obj[b'url'])
    request = self._request_model(url=url, meta=obj[b'meta'])
    return self._response_model(url=url,
                                status_code=obj[b'status_code'],
                                body=b64decode(obj[b'body']),
                                request=request)
def fetch(self, fingerprints):
    to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache]
    self.logger.debug("cache size %s", len(self._cache))
    self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))
    for chunk in chunks(to_fetch, 128):
        for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)):
            self._cache[to_bytes(state.fingerprint)] = state.state
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding, errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),
        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding, errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def flush(self):
    for fingerprint, state_val in six.iteritems(self._cache):
        state = self.model(fingerprint=to_native_str(fingerprint), state=state_val)
        self.session.merge(state)
    self.session.commit()
    self.logger.debug("State cache has been flushed.")
    super(States, self).flush()
def _create_page(self, obj):
    db_page = self.model()
    db_page.fingerprint = to_native_str(obj.meta[b'fingerprint'])
    db_page.url = obj.url
    db_page.created_at = datetime.utcnow()
    db_page.meta = obj.meta
    db_page.depth = 0
    if isinstance(obj, Request):
        db_page.headers = obj.headers
        db_page.method = to_native_str(obj.method)
        db_page.cookies = obj.cookies
    elif isinstance(obj, Response):
        db_page.headers = obj.request.headers
        db_page.method = to_native_str(obj.request.method)
        db_page.cookies = obj.request.cookies
        db_page.status_code = obj.status_code
    return db_page
def _modify_page(self, obj):
    db_page = self.cache[obj.meta[b'fingerprint']]
    db_page.fetched_at = datetime.utcnow()
    if isinstance(obj, Response):
        db_page.headers = obj.request.headers
        db_page.method = to_native_str(obj.request.method)
        db_page.cookies = obj.request.cookies
        db_page.status_code = obj.status_code
    return db_page
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc
    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),
        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    )
def filter(self, record):
    if isinstance(record.msg, dict):
        for field_name in self.excluded_fields:
            setattr(record, field_name, record.msg.get(field_name, ''))
        record.msg = self.separator.join([to_native_str(value)
                                          for key, value in six.iteritems(record.msg)
                                          if key not in self.excluded_fields])
    if self.msg_max_length and len(record.msg) > self.msg_max_length:
        record.msg = record.msg[0:self.msg_max_length - 3] + "..."
    return True
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # It is assumed that a raw bytes input comes from the page
    # corresponding to the encoding.
    #
    # Note: if this assumption is wrong, this will fail;
    # in the general case, users are required to use Unicode
    # or safe ASCII bytes input.
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),
        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
def _response_from_object(self, obj):
    url = to_native_str(obj[0])
    return self._response_model(url=url,
                                status_code=obj[1],
                                body=obj[4],
                                headers=obj[3],
                                request=self._request_model(url=url,
                                                            meta=obj[2],
                                                            method=obj[5],
                                                            headers=obj[6],
                                                            cookies=obj[7]))
def test_metadata(self):
    connection = Connection(host='hbase-docker', port=9090)
    metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
    metadata.add_seeds([r1, r2, r3])
    resp = Response('https://www.example.com', request=r1)
    metadata.page_crawled(resp)
    metadata.links_extracted(resp.request, [r2, r3])
    metadata.request_error(r4, 'error')
    metadata.frontier_stop()
    table = connection.table('metadata')
    assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
        set([r1.url, r2.url, r3.url])
    self.delete_rows(table, [b'10', b'11', b'12'])
def __init__(self, url, status_code=200, headers=None, body='', request=None):
    """
    :param string url: URL of this response.
    :param int status_code: the HTTP status of the response. Defaults to 200.
    :param dict headers: dictionary of response headers.
    :param str body: the response body.
    :param Request request: the Request object that generated this response.
    """
    self._url = to_native_str(url)
    self._status_code = int(status_code)
    self._headers = headers or {}
    self._body = body
    self._request = request
def __init__(self, url, method='GET', headers=None, cookies=None, meta=None, body=''):
    """
    :param string url: URL to send.
    :param string method: HTTP method to use.
    :param dict headers: dictionary of headers to send.
    :param dict cookies: dictionary of cookies to attach to this request.
    :param dict meta: dictionary that contains arbitrary metadata for this request.
    """
    self._url = url
    self._method = to_native_str(method or 'GET').upper()
    self._headers = headers or {}
    self._cookies = cookies or {}
    self._meta = meta or {'scrapy_meta': {}}
    self._body = body
def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''):
    """
    :param string url: URL to send.
    :param string method: HTTP method to use.
    :param dict headers: dictionary of headers to send.
    :param dict cookies: dictionary of cookies to attach to this request.
    :param dict meta: dictionary that contains arbitrary metadata for this request;
        the keys must be bytes, and the values must be either bytes or serializable
        objects such as lists, tuples, or dictionaries with byte-type items.
    """
    self._url = to_native_str(url)
    self._method = to_bytes((method or b'GET').upper())
    self._headers = headers or {}
    self._cookies = cookies or {}
    self._meta = meta or {b'scrapy_meta': {}}
    self._body = body
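A minimal construction sketch for the request model above. Hedged: the class is assumed to be frontera.core.models.Request (which matches this signature) and to expose the stored values via url/method/meta properties, as Frontera's model does.

from frontera.core.models import Request  # assumed location of the class above

# meta keys are bytes; values are bytes or simple serializable containers
r = Request('http://www.example.com/some/page',
            method=b'get',
            headers={b'User-Agent': b'frontera-example'},
            meta={b'scrapy_meta': {}})
assert r.method == b'GET'  # the method is normalized to upper-case bytes
assert r.meta[b'scrapy_meta'] == {}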
def decode(self, buffer):
    obj = unpackb(buffer, encoding='utf-8')
    if obj[0] == b'pc':
        return ('page_crawled', self._response_from_object(obj[1]))
    if obj[0] == b'le':
        return ('links_extracted', self._request_from_object(obj[1]),
                [self._request_from_object(x) for x in obj[2]])
    if obj[0] == b'us':
        return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3])
    if obj[0] == b're':
        return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2]))
    if obj[0] == b'as':
        return ('add_seeds', [self._request_from_object(x) for x in obj[1]])
    if obj[0] == b'njid':
        return ('new_job_id', int(obj[1]))
    if obj[0] == b'of':
        return ('offset', int(obj[1]), int(obj[2]))
    if obj[0] == b'ou':
        return ('overused', int(obj[1]), [to_native_str(s) for s in obj[2]])
    raise TypeError('Unknown message type')
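A sketch of the wire format this decoder expects. Hedged: only the simple 'of' (offset) tuple is shown, since the 'pc'/'le' payload layouts depend on _request_from_object and _response_from_object, which are defined elsewhere in this section.

from msgpack import packb

# an offset message is packed as (tag, partition_id, offset)
buf = packb([b'of', 2, 12345], use_bin_type=True)
# decoder.decode(buf) -> ('offset', 2, 12345)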
def __init__(self, manager, args, mb_stream, states_context):
    self.logger = logging.getLogger("discovery")
    backend = manager.backend
    self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)
    try:
        psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
    except IOError:
        self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
        raise
    self._suffix_list = PublicSuffixList(psl_file)
    self._states_ctx = states_context
    self.states = backend.states
    self.user_agent = to_native_str(manager.settings.get('USER_AGENT'))
    self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES'))
    super(Discovery, self).__init__(manager, args, mb_stream, states_context)
def _get_item(self, key):
    self.stats["hbase_gets"] += 1
    hbase_key = to_bytes(key)
    row = self._table.row(hbase_key)
    if not row:
        self.stats["hbase_misses"] += 1
        super(DomainCache, self).__missing__(key)
        raise KeyError
    value = {}
    for k, v in six.iteritems(row):
        cf, _, col = k.partition(b':')
        col = to_native_str(col)
        value[col] = unpackb(v, encoding='utf-8')
        # XXX extract some fields as a set for faster in-checks
        if col in self._set_fields:
            value[col] = set(value[col])
    if self._on_get_func:
        self._on_get_func(value)
    return value
def from_frontier(self, frontier_request):
    """request: Frontier > Scrapy"""
    cb = frontier_request.meta.get(b'scrapy_callback', None)
    if cb and self.spider:
        cb = _get_method(self.spider, cb)
    eb = frontier_request.meta.get(b'scrapy_errback', None)
    if eb and self.spider:
        eb = _get_method(self.spider, eb)
    body = frontier_request.body
    meta = frontier_request.meta.get(b'scrapy_meta', {})
    meta[b'frontier_request'] = frontier_request
    return ScrapyRequest(url=frontier_request.url,
                         callback=cb,
                         errback=eb,
                         body=body,
                         method=to_native_str(frontier_request.method),
                         headers=frontier_request.headers,
                         cookies=frontier_request.cookies,
                         meta=meta,
                         dont_filter=True)
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely
    identify the document in storage. ``hostname_local_fingerprint`` builds
    the fingerprint from the CRC32 of the host (first 4 bytes) followed by
    the MD5 of the rest of the URL. The default option is set to make use of
    the HBase block cache: all documents of an average website are expected
    to fit within one cache block, which can be efficiently read from disk
    once.

    :param key: str URL
    :return: str, the 20-byte fingerprint as a hex string
    """
    result = parse_url(key)
    if not result.hostname:
        return sha1(key)
    host_checksum = get_crc32(result.hostname)
    doc_uri_combined = result.path + ';' + result.params + result.query + result.fragment
    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(doc_uri_combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return to_native_str(fprint, 'utf8')
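A short sketch demonstrating the fingerprint layout described in the docstring. Hedged: it assumes the function is importable from frontera.utils.fingerprint, its home in Frontera.

from frontera.utils.fingerprint import hostname_local_fingerprint  # assumed module path

fp1 = hostname_local_fingerprint('http://www.example.com/some/page?q=1')
fp2 = hostname_local_fingerprint('http://www.example.com/other/page')
assert len(fp1) == 40  # 20 bytes, hex-encoded
# The first 4 bytes (8 hex chars) are the CRC32 of the hostname, so URLs from
# the same host share a prefix and land near each other in HBase.
assert fp1[:8] == fp2[:8]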
def decode(self, buffer):
    obj = unpackb(buffer)
    if obj[0] == b'pc':
        return ('page_crawled', self._response_from_object(obj[1]))
    if obj[0] == b'le':
        return ('links_extracted', self._request_from_object(obj[1]),
                [self._request_from_object(x) for x in obj[2]])
    if obj[0] == b'us':
        return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3])
    if obj[0] == b're':
        return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2]))
    if obj[0] == b'as':
        return ('add_seeds', [self._request_from_object(x) for x in obj[1]])
    if obj[0] == b'njid':
        return ('new_job_id', int(obj[1]))
    if obj[0] == b'of':
        return ('offset', int(obj[1]), int(obj[2]))
    raise TypeError('Unknown message type')
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues a new batch of requests for crawling.

    :param max_n_requests: maximum number of requests to return
    :param partition_id: partition id
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    results = []
    try:
        for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                limit(max_n_requests):
            method = item.method or 'GET'
            r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
            r.meta['fingerprint'] = to_native_str(item.fingerprint)
            r.meta['score'] = item.score
            results.append(r)
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:
        self.logger.exception(exc)
        self.session.rollback()
    return results
def _get_method(obj, name):
    name = to_native_str(name)
    try:
        return getattr(obj, name)
    except AttributeError:
        raise ValueError("Method %r not found in: %s" % (name, obj))
def _request_from_object(self, obj):
    return self._request_model(url=to_native_str(obj[b'url']),
                               method=obj[b'method'],
                               headers=obj[b'headers'],
                               cookies=obj[b'cookies'],
                               meta=obj[b'meta'])
def __init__(self, separator=None, excluded_fields=None, msg_max_length=0):
    super(PlainValuesFilter, self).__init__()
    self.separator = to_native_str(separator or " ")
    self.excluded_fields = excluded_fields or []
    self.msg_max_length = msg_max_length
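A usage sketch tying this constructor to the filter method shown earlier. Hedged: the class name PlainValuesFilter is taken from the super() call, and the record construction is purely illustrative.

import logging

f = PlainValuesFilter(separator=" | ", excluded_fields=["event"])
record = logging.LogRecord("test", logging.INFO, __file__, 0,
                           {"event": "crawl", "url": "http://example.com", "status": "200"},
                           None, None)
f.filter(record)
print(record.event)  # -> crawl  (excluded fields become record attributes)
print(record.msg)    # -> http://example.com | 200  (remaining values joined by the separator)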
def update_score(self, batch):
    for fprint, score, request, schedule in batch:
        m = self.model(fingerprint=to_native_str(fprint), score=score)
        self.session.merge(m)
    self.session.commit()
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    return url if isinstance(url, parse.ParseResult) else \
        parse.urlparse(to_native_str(url))
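A quick sketch of parse_url's pass-through behaviour. Hedged: `parse` is assumed to be six.moves.urllib.parse, matching the isinstance check above.

from six.moves.urllib import parse  # assumption: the module referenced above

parts = parse_url('http://www.example.com/path?q=1')
assert parts.hostname == 'www.example.com'
# An already parsed result is returned as-is, not re-parsed.
assert parse_url(parts) is parts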
def test_deprecation(self):
    with deprecated_call():
        to_native_str("")
def _request_from_object(self, obj):
    return self._request_model(url=to_native_str(obj[0]),
                               method=obj[1],
                               headers=obj[2],
                               cookies=obj[3],
                               meta=obj[4])