Example #1
def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in a future release.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>


    """

    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
        DeprecationWarning)

    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
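The deprecated helper simply converts its arguments to bytes and defers to urljoin, so for text input the recommended standard-library replacement behaves the same; a minimal sketch:

from urllib.parse import urljoin

urljoin('http://www.example.com/path/index.html', '/otherpath/index2.html')
# 'http://www.example.com/otherpath/index2.html'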
Example #2
    def __init__(self, settings):
        server = settings.get('KAFKA_LOCATION')
        self.topic_todo = to_bytes(
            settings.get('OUTGOING_TOPIC', "frontier-todo"))
        self.topic_done = to_bytes(
            settings.get('INCOMING_TOPIC', "frontier-done"))
        self.topic_scoring = to_bytes(settings.get('SCORING_TOPIC'))
        self.general_group = to_bytes(settings.get('FRONTIER_GROUP',
                                                   "general"))
        self.sw_group = to_bytes(
            settings.get('SCORING_GROUP', "strategy-workers"))
        self.spider_partition_id = settings.get('SPIDER_PARTITION_ID')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.hostname_partitioning = settings.get(
            'QUEUE_HOSTNAME_PARTITIONING')

        self.codec = None
        codec = settings.get('KAFKA_CODEC_LEGACY')
        if codec == 'none':
            from kafka.protocol import CODEC_NONE
            self.codec = CODEC_NONE
        if codec == 'snappy':
            from kafka.protocol import CODEC_SNAPPY
            self.codec = CODEC_SNAPPY
        if codec == 'gzip':
            from kafka.protocol import CODEC_GZIP
            self.codec = CODEC_GZIP
        if self.codec is None:
            raise NameError("Non-existent Kafka compression codec.")

        self.conn = KafkaClient(server)
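For illustration, a hypothetical settings mapping covering the keys this constructor reads; the broker address and topic names below are assumptions, not defaults of any real deployment:

EXAMPLE_SETTINGS = {
    'KAFKA_LOCATION': 'localhost:9092',  # assumed broker host:port
    'OUTGOING_TOPIC': 'frontier-todo',
    'INCOMING_TOPIC': 'frontier-done',
    'SCORING_TOPIC': 'frontier-score',   # assumed topic name
    'KAFKA_CODEC_LEGACY': 'snappy',      # must be 'none', 'snappy' or 'gzip';
                                         # anything else raises NameError above
}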
Example #3
 def test_local_hostname_fingerprint_bytes(self):
     assert hostname_local_fingerprint(
         to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13'
     assert hostname_local_fingerprint(
         to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8'
     assert hostname_local_fingerprint(
         to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76'
Example #4
 def test_sha1_bytes(self):
     assert sha1(
         to_bytes(url1)) == b'880c5e7919cb09e182bd639d724bce6d90db71eb'
     assert sha1(
         to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d'
     assert sha1(
         to_bytes(url3)) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e'
Example #5
File: url.py Project: wRAR/w3lib
def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in a future release.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>


    """

    warnings.warn(
        "w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
        DeprecationWarning)

    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
Example #6
def safe_url_string(url,
                    encoding='utf8',
                    path_encoding='utf8',
                    quote_path=True):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode the URL path component,
    which is then quoted. Otherwise, if quote_path is False, the path component
    is not encoded or quoted. The given encoding is used for the query string
    and form data.

    When passing an encoding, you should use the encoding of the
    original page (the page the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    else:
        path = to_native_str(parts.path)

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        path,
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
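A quick illustration of the behavior described above, assuming the UTF-8 defaults:

safe_url_string(u'http://www.example.com/résumé?q=résumé')
# 'http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9'

safe_url_string(u'http://www.example.com/résumé', quote_path=False)
# path component left untouched: 'http://www.example.com/résumé'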
Example #7
def test_request_response_converters():
    spider = TestSpider()
    rc = RequestConverter(spider)
    rsc = ResponseConverter(spider, rc)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url, callback=spider.callback, errback=spider.errback,
                            body=REQUEST_BODY)
    request.meta[b'test_param'] = b'test_value'
    request.headers.appendlist(b"TestKey", b"test value")
    request.cookies[b'MyCookie'] = b'CookieContent'

    frontier_request = rc.to_frontier(request)
    assert frontier_request.meta[b'scrapy_callback'] == b'callback'
    assert frontier_request.meta[b'scrapy_errback'] == b'errback'
    assert frontier_request.body == to_bytes(REQUEST_BODY)
    assert frontier_request.url == url
    assert frontier_request.method == b'GET'
    assert frontier_request.headers[b'Testkey'] == b'test value'
    assert frontier_request.cookies[b'MyCookie'] == b'CookieContent'
    assert b'frontier_request' not in frontier_request.meta[b'scrapy_meta']

    request_converted = rc.from_frontier(frontier_request)
    assert request_converted.meta[b'test_param'] == b'test_value'
    assert request_converted.body == to_bytes(REQUEST_BODY)
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies[b'MyCookie'] == b'CookieContent'
    assert request_converted.headers.get(b'Testkey') == b'test value'
    assert request_converted.callback == spider.callback
    assert request_converted.errback == spider.errback

    # Some middleware could change .meta contents
    request_converted.meta[b'middleware_stuff'] = b'appeared'

    response = ScrapyResponse(url=url, request=request_converted, body=RESPONSE_BODY,
                              headers={b'TestHeader': b'Test value'})

    frontier_response = rsc.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta[b'scrapy_meta'][b'test_param'] == b'test_value'
    assert frontier_response.meta[b'scrapy_meta'][b'middleware_stuff'] == b'appeared'
    assert frontier_response.status_code == 200
    assert b'frontier_request' not in frontier_response.meta[b'scrapy_meta']

    response_converted = rsc.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta[b'test_param'] == b'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers[b'TestHeader'] == b'Test value'

    frontier_request = FrontierRequest(url)
    request_converted = rc.from_frontier(frontier_request)
    assert frontier_request.url == url
Example #8
def get_meta_refresh(text,
                     baseurl='',
                     encoding='utf-8',
                     ignore_tags=('script', 'noscript')):
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
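For example, given a typical meta refresh tag (the HTML below is illustrative):

html = u'<html><head><meta http-equiv="refresh" content="5; url=http://example.com/next"></head></html>'
get_meta_refresh(html)
# (5.0, 'http://example.com/next')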
Example #9
    def __init__(self, connection, partitions, table_name, drop=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(
                self.table_name,
                {'f': {
                    'max_versions': 1,
                    'block_cache_enabled': 1
                }})

        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Example #10
    def __init__(self, connection, table_name, cache_size_limit,
                 write_log_size, drop_all_tables):
        self.connection = connection
        self._table_name = to_bytes(table_name)
        self.logger = logging.getLogger("hbase.states")
        self._state_batch = self.connection.table(
            self._table_name).batch(batch_size=write_log_size)
        self._state_stats = defaultdict(int)
        self._state_cache = LRUCacheWithStats(maxsize=cache_size_limit,
                                              stats=self._state_stats)
        self._state_last_updates = 0

        tables = set(connection.tables())
        if drop_all_tables and self._table_name in tables:
            connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {
                's': {
                    'max_versions': 1,
                    'block_cache_enabled': 1,
                    'bloom_filter_type': 'ROW',
                    'in_memory': True,
                }
            }
            connection.create_table(self._table_name, schema)
Example #11
 def test_middleware_output(result):
     out = list(result)
     self.assertEqual(len(out), 1)
     self.assertIsInstance(out[0], Request)
     self.assertIn('Referer', out[0].headers)
     self.assertEqual(out[0].headers['Referer'],
                      to_bytes('http://www.scrapy.org'))
Example #12
    def __init__(self, connection, table_name, drop_all_tables, use_snappy,
                 batch_size, store_content):
        self._table_name = to_bytes(table_name)
        tables = set(connection.tables())
        if drop_all_tables and self._table_name in tables:
            connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {
                'm': {
                    'max_versions': 1
                },
                's': {
                    'max_versions': 1,
                    'block_cache_enabled': 1,
                    'bloom_filter_type': 'ROW',
                    'in_memory': True,
                },
                'c': {
                    'max_versions': 1
                }
            }
            if use_snappy:
                schema['m']['compression'] = 'SNAPPY'
                schema['c']['compression'] = 'SNAPPY'
            connection.create_table(self._table_name, schema)
        table = connection.table(self._table_name)
        self.batch = table.batch(batch_size=batch_size)
        self.store_content = store_content
Example #13
 def _store_item_batch(self, key, value):
     data = {}
     self._key_check(key)
     for k, v in six.iteritems(value):
         if k.startswith('_'):
             continue
         # convert set to list manually for successful serialization
         v = restruct_for_pack(v)
         k = to_bytes(k)
         data[b"m:%s" % k] = packb(v, use_bin_type=True)
     tries = 3
     while data and tries > 0:
         try:
             self._batch.put(key, data)
         except ValueError:
             self.logger.exception("Exception happened during item storing, %d tries left", tries)
             data_lengths = dict((k, len(v)) for k, v in six.iteritems(data))
             self.logger.info("RK %s per-column lengths %s", key, str(data_lengths))
             for k, length in list(data_lengths.items()):
                 if length > self.MAX_VALUE_SIZE:
                     self.logger.info("Dropping key %s", k)
                     del data[k]
             tries -= 1
             continue
         else:
             break
Example #14
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Example #15
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues a new batch of requests for crawling.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        results = []
        try:
            for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                    limit(max_n_requests):
                method = item.method or b'GET'
                r = Request(item.url,
                            method=method,
                            meta=item.meta,
                            headers=item.headers,
                            cookies=item.cookies)
                r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
                r.meta[b'score'] = item.score
                results.append(r)
                self.session.delete(item)
            self.session.commit()
        except Exception as exc:
            self.logger.exception(exc)
            self.session.rollback()
        return results
Example #16
 def _store_item_batch(self, key, value):
     data = {}
     self._key_check(key)
     for k, v in six.iteritems(value):
         if k.startswith('_'):
             continue
         # convert set to list manually for successful serialization
         v = restruct_for_pack(v)
         k = to_bytes(k)
         data[b"m:%s" % k] = packb(v, use_bin_type=True)
     tries = 3
     while data and tries > 0:
         try:
             self._batch.put(key, data)
         except ValueError:
             self.logger.exception("Exception happened during item storing, %d tries left", tries)
             data_lengths = dict((k, len(v)) for k, v in six.iteritems(data))
             self.logger.info("RK %s per-column lengths %s", key, str(data_lengths))
             for k, length in data_lengths.items():
                 if length > self.MAX_VALUE_SIZE:
                     self.logger.info("Dropping key %s", k)
                     del data[k]
             tries -= 1
             continue
         else:
             break
Example #17
    def __init__(self,
                 connection,
                 partitions,
                 table_name,
                 drop=False,
                 use_snappy=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            schema = {'f': {'max_versions': 1}}
            if use_snappy:
                schema['f']['compression'] = 'SNAPPY'
            self.connection.create_table(self.table_name, schema)

        class DumbResponse:
            pass

        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Example #18
File: url.py Project: wRAR/w3lib
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string and form data.
    When passing an encoding, you should use the encoding of the
    original page (the page the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #19
    def fetch(self, fingerprints):
        to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s", len(self._cache))
        self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))

        for chunk in chunks(to_fetch, 128):
            for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)):
                self._cache[to_bytes(state.fingerprint)] = state.state
Example #20
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string and form data.
    When passing an encoding, you should use the encoding of the
    original page (the page the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #21
def get_crc32(name):
    """ signed crc32 of bytes or unicode.
    In python 3, return the same number as in python 2, converting to
    [-2**31, 2**31-1] range. This is done to maintain backwards compatibility
    with python 2, since checksums are stored in the database, so this allows
    to keep the same database schema.
    """
    return to_signed32(crc32(to_bytes(name, 'utf-8', 'ignore')))
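A small sketch of the compatibility property the docstring describes; the host name is illustrative:

checksum = get_crc32(u'example.com')      # unicode input is UTF-8-encoded first
assert checksum == get_crc32(b'example.com')
assert -2**31 <= checksum <= 2**31 - 1    # always in the signed 32-bit range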
Example #23
 def sw_activity(self):
     c = 0
     p = 0
     for m in self.sw_sl_c.get_messages(timeout=1.0, count=512):
         if m.startswith(b'http://helloworld.com/'):
             p += 1
             self.sw_us_p.send(None, b'message' + b'0' + b"," + to_bytes(str(c)))
         c += 1
     assert p > 0
     return c
Example #24
File: url.py Project: wRAR/w3lib
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars))
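This variant operates on the six-tuple returned by urlparse() (note the extra params component); a hedged usage sketch:

from urllib.parse import urlparse, urlunparse

parts = urlparse(u'http://www.example.com/a path;params?q=value#frag')
urlunparse(_safe_ParseResult(parts))
# 'http://www.example.com/a%20path;params?q=value#frag'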
Example #25
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string and form data.
    When passing an encoding, you should use the encoding of the
    original page (the page the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # it is assumed that a raw bytes input comes from the page
    # corresponding to the encoding
    #
    # Note: if this assumption is wrong, this will fail;
    #       in the general case, users are required to use Unicode
    #       or safe ASCII bytes input
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #26
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #27
    def fetch(self, fingerprints):
        to_fetch = [
            to_native_str(f) for f in fingerprints if f not in self._cache
        ]
        self.logger.debug("cache size %s", len(self._cache))
        self.logger.debug("to fetch %d from %d", len(to_fetch),
                          len(fingerprints))

        for chunk in chunks(to_fetch, 128):
            for state in self.session.query(self.model).filter(
                    self.model.fingerprint.in_(chunk)):
                self._cache[to_bytes(state.fingerprint)] = state.state
Example #28
 def __contains__(self, key):
     self._key_check(key)
     self.stats["contains"] += 1
     if super(DomainCache, self).__contains__(key):
         self.stats["contains_in_memory"] += 1
         return True
     if key in self._second_gen:
         self.stats["contains_in_secgen"] += 1
         return True
     if self._table.row(to_bytes(key)):
         self.stats["contains_in_hbase"] += 1
         return True
     self.stats["contains_false"] += 1
     return False
Example #30
 def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
     the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items.
     """
     self._url = to_native_str(url)
     self._method = to_bytes((method or b'GET').upper())
     self._headers = headers or {}
     self._cookies = cookies or {}
     self._meta = meta or {b'scrapy_meta': {}}
     self._body = body
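Constructing such a request might look as follows; the header and meta values are illustrative, following the bytes-key convention documented above:

r = Request('http://example.com/page',
            method='get',                        # normalized to b'GET'
            headers={b'User-Agent': b'frontera-test'},
            meta={b'scrapy_meta': {}})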
Example #31
    def __init__(self, maxsize, connection, table_name, set_fields=None, on_get_func=None, batch_size=100):
        super(DomainCache, self).__init__(maxsize)

        self._second_gen = dict()

        table_name = to_bytes(table_name)
        self._table = self._get_domain_table(connection, table_name)
        self._batch = HardenedBatch(self._table, batch_size=batch_size)
        self._set_fields = set(set_fields) if set_fields else set()
        self._on_get_func = on_get_func

        self.logger = logging.getLogger("domain-cache")
        self.stats = defaultdict(int)
        self.next_log = time() + self.LOG_INTERVAL
        self.batch_size = batch_size
Example #32
 def __delitem__(self, key):
     self._key_check(key)
     not_found = True
     if super(DomainCache, self).__contains__(key):
         super(DomainCache, self).__delitem__(key)
         not_found = False
     if key in self._second_gen:
         del self._second_gen[key]
         not_found = False
     rk = to_bytes(key)
     if self._table.row(rk):
         self._table.delete(rk)
         not_found = False
     if not_found:
         raise KeyError
Example #35
    def __init__(self, connection, table_name, cache_size_limit, drop_all_tables):
        self.connection = connection
        self._table_name = to_bytes(table_name)
        self.logger = logging.getLogger("hbase.states")
        self._state_cache = {}
        self._cache_size_limit = cache_size_limit

        tables = set(connection.tables())
        if drop_all_tables and self._table_name in tables:
            connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {'s': {'max_versions': 1, 'block_cache_enabled': 1,
                            'bloom_filter_type': 'ROW', 'in_memory': True, }
                      }
            connection.create_table(self._table_name, schema)
Example #36
 def parse_domain_info(self, url, test_mode=False):
     if test_mode:
          match = re.match(r'([A-Z])\w+', url)
         netloc = name = to_bytes(match.groups()[0]) if match else b'?'
         scheme = sld = tld = subdomain = b'-'
     else:
         netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url)
     return {
         b'netloc': to_bytes(netloc),
         b'name': to_bytes(name),
         b'scheme': to_bytes(scheme),
         b'sld': to_bytes(sld),
         b'tld': to_bytes(tld),
         b'subdomain': to_bytes(subdomain),
     }
Example #37
 def _parse_domain_info(self, url, test_mode=False):
     if test_mode:
         match = re.match(r'([A-Z])\w+', url)
         netloc = name = to_bytes(match.groups()[0]) if match else b'?'
         scheme = sld = tld = subdomain = b'-'
     else:
         netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(
             url)
     return {
         b'netloc': to_bytes(netloc),
         b'name': to_bytes(name),
         b'scheme': to_bytes(scheme),
         b'sld': to_bytes(sld),
         b'tld': to_bytes(tld),
         b'subdomain': to_bytes(subdomain),
     }
Example #38
    def __init__(self, connection, partitions, table_name, drop=False):
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)

        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)

        if self.table_name not in tables:
            self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

        class DumbResponse:
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)
Example #39
def basic_auth_header(username: AnyStr,
                      password: AnyStr,
                      encoding: str = "ISO-8859-1") -> bytes:
    """
    Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_

    >>> import w3lib.http
    >>> w3lib.http.basic_auth_header('someuser', 'somepass')
    b'Basic c29tZXVzZXI6c29tZXBhc3M='

    .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt

    """

    auth = f"{to_unicode(username)}:{to_unicode(password)}"
    # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
    # seems to be the most widely used encoding here. See also:
    # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
    return b"Basic " + urlsafe_b64encode(to_bytes(auth, encoding=encoding))
Example #40
 def _get_item(self, key):
     self.stats["hbase_gets"] += 1
     hbase_key = to_bytes(key)
     row = self._table.row(hbase_key)
     if not row:
         self.stats["hbase_misses"] += 1
         super(DomainCache, self).__missing__(key)
         raise KeyError
     value = {}
     for k, v in six.iteritems(row):
         cf, _, col = k.partition(b':')
         col = to_native_str(col)
         value[col] = unpackb(v, encoding='utf-8')
         # XXX extract some fields as a set for faster in-checks
         if col in self._set_fields:
             value[col] = set(value[col])
     if self._on_get_func:
         self._on_get_func(value)
     return value
Example #41
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify the document in storage.
    ``hostname_local_fingerprint`` is constructing fingerprint getting first 4 bytes as Crc32 from host, and rest is MD5
    from rest of the URL. Default option is set to make use of HBase block cache. It is expected to fit all the documents
    of average website within one cache block, which can be efficiently read from disk once.

    :param key: str URL
    :return: str 20 bytes hex string
    """
    result = parse_url(key)
    hostname = result.hostname if result.hostname else '-'
    host_checksum = get_crc32(hostname)
    combined = hostname+result.path+';'+result.params+result.query+result.fragment

    combined = to_bytes(combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return fprint
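The fingerprint is deterministic and fixed-width, which is what makes it suitable as an HBase row key; a sketch of those properties with illustrative URLs:

fp = hostname_local_fingerprint('http://www.example.com/some/page')
assert len(fp) == 40                  # 4 CRC32 bytes + 16 MD5 bytes, hex-encoded
fp2 = hostname_local_fingerprint('http://www.example.com/other/page')
assert fp[:8] == fp2[:8]              # same host, same CRC32 prefix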
Example #43
def _convert_from_saved_type(obj):
    """
    :param obj: object returned by `_convert_and_save_type`

    Restores the original state of the object converted
    earlier by `_convert_and_save_type`. This method considers every
    first element of the nested tuple as the original type information and
    the second value to be the converted value. It applies the original type
    recursively on the object to retrieve the original form of the object.
    """
    assert len(obj) == 2
    obj_type, obj_value = obj
    if obj_type == 'bytes':
        return to_bytes(obj_value)
    elif obj_type == 'dict':
        return dict([(_convert_from_saved_type(k), _convert_from_saved_type(v)) for k, v in obj_value])
    elif obj_type in ['list', 'tuple']:
        _type = list if obj_type == 'list' else tuple
        return _type([_convert_from_saved_type(item) for item in obj_value])
    return obj_value
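For illustration, a hand-built value in the saved-type format (normally produced by its counterpart _convert_and_save_type, which is not shown here):

saved = ('dict', [
    (('bytes', 'key'), ('list', [('bytes', 'a'), ('int', 1)])),
])
_convert_from_saved_type(saved)
# {b'key': [b'a', 1]}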
Example #44
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify the document in storage.
    ``hostname_local_fingerprint`` is constructing fingerprint getting first 4 bytes as Crc32 from host, and rest is MD5
    from rest of the URL. Default option is set to make use of HBase block cache. It is expected to fit all the documents
    of average website within one cache block, which can be efficiently read from disk once.

    :param key: str URL
    :return: str 20 bytes hex string
    """
    result = parse_url(key)
    if not result.hostname:
        return sha1(key)
    host_checksum = get_crc32(result.hostname)
    doc_uri_combined = result.path+';'+result.params+result.query+result.fragment

    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(doc_uri_combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return to_native_str(fprint, 'utf8')
Example #46
    def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_size, store_content):
        self._table_name = to_bytes(table_name)
        tables = set(connection.tables())
        if drop_all_tables and self._table_name in tables:
            connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {'m': {'max_versions': 1},
                      's': {'max_versions': 1, 'block_cache_enabled': 1,
                            'bloom_filter_type': 'ROW', 'in_memory': True, },
                      'c': {'max_versions': 1}
                      }
            if use_snappy:
                schema['m']['compression'] = 'SNAPPY'
                schema['c']['compression'] = 'SNAPPY'
            connection.create_table(self._table_name, schema)
        table = connection.table(self._table_name)
        self.batch = table.batch(batch_size=batch_size)
        self.store_content = store_content
Example #47
 def __init__(self,
              url,
              method=b'GET',
              headers=None,
              cookies=None,
              meta=None,
              body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
     the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items.
     """
     self._url = to_native_str(url)
     self._method = to_bytes((method or b'GET').upper())
     self._headers = headers or {}
     self._cookies = cookies or {}
     self._meta = meta or {b'scrapy_meta': {}}
     self._body = body
Example #48
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues a new batch of requests for crawling.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        results = []
        try:
            for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                    limit(max_n_requests):
                method = item.method or b'GET'
                r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
                r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
                r.meta[b'score'] = item.score
                results.append(r)
                self.session.delete(item)
            self.session.commit()
        except Exception as exc:
            self.logger.exception(exc)
            self.session.rollback()
        return results
Example #49
 def get_next_requests(self, max_n_requests, partition_id, score, **kwargs):
     results = []
     try:
         queue = self.queue_model
         query = self.session.query(queue
             ).filter(queue.partition_id == partition_id, queue.score >= score
             ).order_by(queue.created_at
             ).limit(max_n_requests)
         for item in query:
             method = item.method or b'GET'
             r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
             fp = item.fingerprint
             msg = f"retrieved request {fp[:6]}...{fp[-6:]}"
             self.logger.info(msg)
             r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
             r.meta[b'score'] = item.score
             results.append(r)
             self.session.delete(item)
         self.session.commit()
     except Exception as exc:
         self.logger.exception(exc)
         self.session.rollback()
     self.logger.info(f"Got {len(results)} next requests with score {score}")
     return results
Example #50
def get_crc32(name):
    return crc32(to_bytes(name, 'utf-8', 'ignore'))
Example #51
def md5(key):
    return hashlib.md5(to_bytes(key, 'utf8')).hexdigest()
Example #52
def sha1(key):
    return hashlib.sha1(to_bytes(key, 'utf8')).hexdigest()
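Both helpers accept str or bytes and return the hex digest as a native str; for example:

assert md5(u'http://example.com') == md5(b'http://example.com')
assert len(sha1('any key')) == 40   # hex-encoded SHA-1 digest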
Example #53
def _find_method(obj, func):
    if obj and hasattr(func, '__self__') and func.__self__ is obj:
        return to_bytes(func.__func__.__name__)
    else:
        raise ValueError("Function %s is not a method of: %s" % (func, obj))
Example #54
 def test_middleware_output(result):
     out = list(result)
     self.assertEqual(len(out), 1)
     self.assertIsInstance(out[0], Request)
     self.assertIn('Referer', out[0].headers)
     self.assertEqual(out[0].headers['Referer'], to_bytes(res.url))
Example #55
 def finalize(self):
     """Write all buffered files to archive."""
     for zinfo, contents, compress_type in self._files.values():
         self._writestr(zinfo, to_bytes(contents), compress_type)
     self._files = {}
Example #56
 def test_sha1_bytes(self):
     assert sha1(to_bytes(url1)) == b'880c5e7919cb09e182bd639d724bce6d90db71eb'
     assert sha1(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d'
     assert sha1(to_bytes(url3)) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e'
Example #57
 def test_md5_bytes(self):
     assert md5(to_bytes(url1)) == b'bb82110ce034c1a6ad55a2e73adc322a'
     assert md5(to_bytes(url2)) == b'5b56f40f8828701f97fa4511ddcd25fb'
     assert md5(to_bytes(url3)) == b'5abf5c9aa02d870756032bdec0bd6522'
Example #58
 def test_local_hostname_fingerprint_bytes(self):
     assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13'
     assert hostname_local_fingerprint(to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8'
     assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76'