Exemple #1
0
 def schedule(self, batch):
     """Store the schedulable entries of ``batch`` as queue rows.

     ``batch`` is iterated as ``(fingerprint, score, request, schedule)``
     tuples. Entries with a falsy ``schedule`` flag are skipped. For every
     other entry a ``self.queue_model`` row is built and the request's
     ``meta[b'state']`` is set to ``States.QUEUED``. All rows are written
     with one ``bulk_save_objects`` call followed by a commit.
     """
     rows = []
     for fprint, score, request, schedule in batch:
         if not schedule:
             continue
         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(hostname,
                                                       self.partitions)
             host_crc32 = get_crc32(hostname)
         else:
             # No hostname: report it and fall back to the first partition.
             self.logger.error(
                 "Can't get hostname for URL %s, fingerprint %s" %
                 (request.url, fprint))
             partition_id = self.partitions[0]
             host_crc32 = 0
         row = self.queue_model(
             fingerprint=to_native_str(fprint),
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=to_native_str(request.method),
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time() * 1E+6)
         rows.append(row)
         request.meta[b'state'] = States.QUEUED
     self.session.bulk_save_objects(rows)
     self.session.commit()
Exemple #2
0
def safe_url_string(url,
                    encoding='utf8',
                    path_encoding='utf8',
                    quote_path=True):
    """Return a legal URL by escaping unsafe characters (RFC-3986).

    ASCII tabs and newlines are stripped first, as described in
    https://url.spec.whatwg.org/#url-parsing.

    A bytes ``url`` is decoded to text with ``encoding`` (default
    'utf-8'). With ``quote_path`` true (the default) the path component
    is encoded with ``path_encoding`` (default 'utf-8') and then quoted;
    with ``quote_path`` false the path is neither encoded nor quoted.
    The query string and fragment are encoded with ``encoding``.

    When passing an encoding, use the encoding of the page the URL was
    extracted from.

    Calling this function on an already "safe" URL returns it unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # urlsplit() on Python3 cannot handle bytes with non-ASCII characters,
    # so the input is decoded with the page encoding first; bytes that the
    # encoding rejects are percent-encoded by the 'percentencode' handler.
    text = to_unicode(url, encoding=encoding, errors='percentencode')
    text = _ascii_tab_newline_re.sub('', text)
    parts = urlsplit(text)

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com); keep the raw netloc then.
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # The path defaults to UTF-8 and is only quoted on request.
    if not quote_path:
        path = to_native_str(parts.path)
    else:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)

    # quote() follows the input type on Python2 and always returns a
    # native str on Python3.
    scheme = to_native_str(parts.scheme)
    netloc = to_native_str(netloc).rstrip(':')
    # Query and fragment follow the page encoding (or form charset).
    query = quote(to_bytes(parts.query, encoding), _safe_chars)
    fragment = quote(to_bytes(parts.fragment, encoding), _safe_chars)
    return urlunsplit((scheme, netloc, path, query, fragment))
Exemple #3
0
 def _request_from_object(self, obj):
     """Build a ``self._request_model`` instance from an indexable record.

     ``obj`` is read positionally as
     ``(url, method, headers, cookies, meta, body)``; only the URL is
     passed through ``to_native_str``.
     """
     fields = {
         'url': to_native_str(obj[0]),
         'method': obj[1],
         'headers': obj[2],
         'cookies': obj[3],
         'meta': obj[4],
         'body': obj[5],
     }
     return self._request_model(**fields)
Exemple #4
0
 def read_seeds(self, stream):
     """Read seed URLs from ``stream`` and schedule them in batches.

     Every line is stripped and converted with ``to_native_str``. Blank
     lines and lines starting with ``#`` are skipped. A line that does not
     start with ``http`` is rewritten as ``"http://" + line + "/"``.
     A request is created for each remaining URL with
     ``meta={b'home': True}`` and ``DEFAULT_HEADERS``.

     Requests are handed to ``self._schedule_batch`` whenever the buffer
     length is a multiple of 40000, and once more for the remainder after
     the stream is exhausted. Errors raised while creating a request or
     scheduling a batch are logged and do not stop the loop; a batch that
     failed inside the loop stays in the buffer.

     :param stream: iterable of lines, one seed URL per line.
     """
     processed, scheduled = 0, 0
     requests = []
     for line in stream:
         url = to_native_str(line.strip())
         # An empty line would otherwise be turned into the seed "http:///".
         if not url or url.startswith("#"):
             continue
         if not url.startswith("http"):
             url = "http://" + url + "/"
         try:
             request = self.create_request(url,
                                           meta={b'home': True},
                                           headers=DEFAULT_HEADERS)
             requests.append(request)
             if len(requests) % 40000 == 0:
                 scheduled += self._schedule_batch(requests)
                 processed += len(requests)
                 self.logger.info("Processed %d, scheduled %d urls.",
                                  processed, scheduled)
                 requests = []
         except Exception:
             self.logger.exception("Error during seeds addition")
     if requests:
         try:
             scheduled += self._schedule_batch(requests)
         except Exception:
             self.logger.exception("Error during seeds addition")
         # The remainder counts as processed even if scheduling failed.
         processed += len(requests)
     self.logger.info("Processed %d, and scheduled %d urls overall.",
                      processed, scheduled)
Exemple #5
0
 def _response_from_object(self, obj):
     """Build a ``self._response_model`` instance from an indexable record.

     ``obj`` is read positionally as ``(url, status_code, meta, body)``.
     The attached request is a ``self._request_model`` carrying the same
     URL and the record's meta.
     """
     url = to_native_str(obj[0])
     status_code, meta, body = obj[1], obj[2], obj[3]
     origin = self._request_model(url=url, meta=meta)
     return self._response_model(url=url,
                                 status_code=status_code,
                                 body=body,
                                 request=origin)
Exemple #6
0
    def from_frontier(self, frontier_request):
        """request: Frontier > Scrapy

        Build a ``ScrapyRequest`` from ``frontier_request``.

        The callback and errback stored under ``b'scrapy_callback'`` and
        ``b'scrapy_errback'`` in the frontier meta are resolved with
        ``_get_method`` when a spider is set. The body comes from
        ``b'scrapy_body'`` and the Scrapy meta from ``b'scrapy_meta'``
        (a missing ``b'scrapy_meta'`` key raises ``KeyError``); the
        ``'cf_store'`` entry is removed from that meta.

        Each ``(attr, val)`` pair under ``b'spider_state'`` is applied to
        the spider only when the attribute is currently unset (``None``);
        an attempt to change an already set attribute to a different value
        is logged as an error and ignored.
        """
        cb = frontier_request.meta.get(b'scrapy_callback', None)
        if cb and self.spider:
            cb = _get_method(self.spider, cb)
        eb = frontier_request.meta.get(b'scrapy_errback', None)
        if eb and self.spider:
            eb = _get_method(self.spider, eb)
        body = frontier_request.meta.get(b'scrapy_body', None)
        meta = frontier_request.meta[b'scrapy_meta']
        meta.pop('cf_store', None)
        for attr, val in frontier_request.meta.get(b'spider_state', []):
            prev_value = getattr(self.spider, attr, None)
            if prev_value is not None and prev_value != val:
                # Adjacent literals are used instead of a backslash
                # continuation, which embedded the source indentation in the
                # logged message.
                _LOG.error(
                    "State for attribute '%s' change from '%s' to '%s' "
                    "attempted by request <%s> so crawl may lose "
                    "consistency. Per request state should be propagated "
                    "via request attributes.",
                    attr, prev_value, val, frontier_request.url)
            elif prev_value != val:
                setattr(self.spider, attr, val)
                _LOG.info("State for attribute '%s' set to %s by request <%s>",
                          attr, val, frontier_request.url)

        return ScrapyRequest(url=frontier_request.url,
                             callback=cb,
                             errback=eb,
                             body=body,
                             method=to_native_str(frontier_request.method),
                             headers=frontier_request.headers,
                             cookies=frontier_request.cookies,
                             meta=meta,
                             dont_filter=True)
Exemple #7
0
 def decode_request(self, message):
     """Decode ``message`` into a ``self._request_model`` instance.

     The message is decoded by the parent ``decode`` and converted with
     ``dict_to_bytes``; the request is then built from its ``b'url'``,
     ``b'method'``, ``b'headers'``, ``b'cookies'`` and ``b'meta'`` entries.
     """
     decoded = dict_to_bytes(super(Decoder, self).decode(message))
     fields = {
         'url': to_native_str(decoded[b'url']),
         'method': decoded[b'method'],
         'headers': decoded[b'headers'],
         'cookies': decoded[b'cookies'],
         'meta': decoded[b'meta'],
     }
     return self._request_model(**fields)
Exemple #8
0
 def decode(self, message):
     """Decode ``message`` into an event tuple.

     The message is decoded by the parent ``decode`` and converted with
     ``dict_to_bytes``. The first element of the returned tuple is the
     event name, the rest depends on ``message[b'type']``:

     * ``links_extracted``: request, list of link requests
     * ``page_crawled``: response
     * ``request_error``: request, error as native str
     * ``update_score``: request, score, schedule flag
     * ``add_seeds``: list of seed requests
     * ``new_job_id``: job id as int
     * ``offset``: partition id and offset, both as int

     :raises TypeError: if the message type is not one of the above.
     """
     message = dict_to_bytes(super(Decoder, self).decode(message))
     msg_type = message[b'type']
     if msg_type == b'links_extracted':
         request = self._request_from_object(message[b'r'])
         links = [
             self._request_from_object(link) for link in message[b'links']
         ]
         return ('links_extracted', request, links)
     if msg_type == b'page_crawled':
         response = self._response_from_object(message[b'r'])
         return ('page_crawled', response)
     if msg_type == b'request_error':
         request = self._request_from_object(message[b'r'])
         return ('request_error', request, to_native_str(message[b'error']))
     if msg_type == b'update_score':
         return ('update_score', self._request_from_object(message[b'r']),
                 message[b'score'], message[b'schedule'])
     if msg_type == b'add_seeds':
         seeds = [self._request_from_object(seed)
                  for seed in message[b'seeds']]
         return ('add_seeds', seeds)
     if msg_type == b'new_job_id':
         return ('new_job_id', int(message[b'job_id']))
     if msg_type == b'offset':
         return ('offset', int(message[b'partition_id']),
                 int(message[b'offset']))
     # The exception used to be returned instead of raised, so an unknown
     # type was handed to the caller as if it were a decoded event.
     raise TypeError('Unknown message type')
Exemple #9
0
 def _response_from_object(self, obj):
     """Turn a positional record into a ``self._response_model`` instance.

     Index 0 is the URL (converted with ``to_native_str``), index 1 the
     status code, index 2 the meta of the originating request and index 3
     the body.
     """
     native_url = to_native_str(obj[0])
     response_fields = dict(url=native_url, status_code=obj[1], body=obj[3])
     response_fields['request'] = self._request_model(url=native_url,
                                                      meta=obj[2])
     return self._response_model(**response_fields)
Exemple #10
0
 def flush(self, force_clear=False):
     """Write the cached states to the session and flush the parent cache.

     Every ``(fingerprint, state)`` pair of ``self._cache`` is merged into
     the session as a ``self.model`` row, the session is committed, and
     ``force_clear`` is forwarded to the parent ``flush``.
     """
     for fprint, value in six.iteritems(self._cache):
         row = self.model(fingerprint=to_native_str(fprint), state=value)
         self.session.merge(row)
     self.session.commit()
     self.logger.debug("State cache has been flushed.")
     super(States, self).flush(force_clear)
Exemple #11
0
 def decode_request(self, message):
     """Return the request encoded in ``message``.

     ``message`` goes through the parent ``decode`` and ``dict_to_bytes``;
     the resulting mapping supplies the URL (as native str), method,
     headers, cookies and meta of the ``self._request_model`` instance.
     """
     payload = dict_to_bytes(super(Decoder, self).decode(message))
     url = to_native_str(payload[b'url'])
     return self._request_model(
         url=url,
         method=payload[b'method'],
         headers=payload[b'headers'],
         cookies=payload[b'cookies'],
         meta=payload[b'meta'],
     )
Exemple #12
0
 def _response_from_object(self, obj):
     """Build a ``self._response_model`` instance from a bytes-keyed mapping.

     Reads ``b'url'``, ``b'meta'``, ``b'status_code'`` and ``b'body'``;
     the body is base64-decoded. The attached request carries the same URL
     and the mapping's meta.
     """
     url = to_native_str(obj[b'url'])
     origin = self._request_model(url=url, meta=obj[b'meta'])
     fields = {
         'url': url,
         'status_code': obj[b'status_code'],
         'body': b64decode(obj[b'body']),
         'request': origin,
     }
     return self._response_model(**fields)
Exemple #13
0
 def decode(self, message):
     """Decode ``message`` into an event tuple.

     The message is decoded by the parent ``decode`` and converted with
     ``dict_to_bytes``. The returned tuple starts with the event name;
     the remaining items depend on ``message[b'type']``:

     * ``links_extracted``: request, list of link requests
     * ``page_crawled``: response
     * ``request_error``: request, error as native str
     * ``update_score``: request, score, schedule flag
     * ``add_seeds``: list of seed requests
     * ``new_job_id``: job id as int
     * ``offset``: partition id and offset, both as int

     :raises TypeError: if the message type is not one of the above.
     """
     message = dict_to_bytes(super(Decoder, self).decode(message))
     kind = message[b'type']
     if kind == b'links_extracted':
         request = self._request_from_object(message[b'r'])
         links = [self._request_from_object(link) for link in message[b'links']]
         return ('links_extracted', request, links)
     if kind == b'page_crawled':
         return ('page_crawled', self._response_from_object(message[b'r']))
     if kind == b'request_error':
         request = self._request_from_object(message[b'r'])
         return ('request_error', request, to_native_str(message[b'error']))
     if kind == b'update_score':
         return ('update_score', self._request_from_object(message[b'r']), message[b'score'], message[b'schedule'])
     if kind == b'add_seeds':
         return ('add_seeds', [self._request_from_object(seed) for seed in message[b'seeds']])
     if kind == b'new_job_id':
         return ('new_job_id', int(message[b'job_id']))
     if kind == b'offset':
         return ('offset', int(message[b'partition_id']), int(message[b'offset']))
     # Raise rather than return the exception object: returning it made an
     # unknown message look like a successfully decoded event.
     raise TypeError('Unknown message type')
Exemple #14
0
    def fetch(self, fingerprints):
        """Load the states of uncached fingerprints into ``self._cache``.

        Fingerprints already present in the cache are skipped. The rest are
        converted with ``to_native_str`` and queried in chunks of 128; each
        returned row is cached under its fingerprint converted with
        ``to_bytes``.
        """
        missing = []
        for fprint in fingerprints:
            if fprint not in self._cache:
                missing.append(to_native_str(fprint))
        self.logger.debug("cache size %s", len(self._cache))
        self.logger.debug("to fetch %d from %d", len(missing), len(fingerprints))

        for chunk in chunks(missing, 128):
            rows = self.session.query(self.model).filter(self.model.fingerprint.in_(chunk))
            for row in rows:
                self._cache[to_bytes(row.fingerprint)] = row.state
Exemple #15
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Return a legal URL by escaping unsafe characters (RFC-3986).

    A bytes ``url`` is first decoded to text with ``encoding`` (default
    'utf-8'). The path component is encoded with ``path_encoding``
    (default 'utf-8'); the query string and fragment are encoded with
    ``encoding``.
    When passing an encoding, use the encoding of the page the URL was
    extracted from.

    Calling this function on an already "safe" URL returns it unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # urlsplit() on Python3 cannot handle bytes with non-ASCII characters,
    # so the input is decoded with the page encoding first; bytes that the
    # encoding rejects are percent-encoded by the 'percentencode' handler.
    text = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(text)

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com); keep the raw netloc then.
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() follows the input type on Python2 and always returns a
    # native str on Python3.
    scheme = to_native_str(parts.scheme)
    netloc = to_native_str(netloc)
    # The path defaults to UTF-8; query and fragment follow the page
    # encoding (or form charset, if known and passed).
    path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    query = quote(to_bytes(parts.query, encoding), _safe_chars)
    fragment = quote(to_bytes(parts.fragment, encoding), _safe_chars)
    return urlunsplit((scheme, netloc, path, query, fragment))
Exemple #16
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Escape unsafe characters of ``url`` so that it is a legal URL
    according to RFC-3986.

    Bytes input is decoded to text with ``encoding`` ('utf-8' unless
    given). The path is encoded with ``path_encoding`` ('utf-8' unless
    given), while the query string and the fragment use ``encoding``.
    The encoding passed should be the one of the page the URL was
    extracted from.

    A URL that is already "safe" is returned unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Decode first: Python3's urlsplit() chokes on bytes with non-ASCII
    # characters. Raw bytes are assumed to come from a document in the
    # given encoding; bytes it cannot decode are percent-encoded.
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(decoded)

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com).
    try:
        host_part = parts.netloc.encode('idna')
    except UnicodeError:
        host_part = parts.netloc

    # quote() returns the input type on Python2 and a native str on Python3.
    components = (
        to_native_str(parts.scheme),
        # A trailing ':' is dropped from the network location.
        to_native_str(host_part).rstrip(':'),
        # The path defaults to UTF-8.
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        # Query and fragment follow the page encoding or form charset.
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    )
    return urlunsplit(components)
Exemple #17
0
 def _response_from_object(self, obj):
     """Turn a bytes-keyed mapping into a ``self._response_model`` instance.

     The URL is converted with ``to_native_str`` and shared with the
     attached ``self._request_model`` (which also receives ``b'meta'``);
     ``b'body'`` is base64-decoded before being stored.
     """
     native_url = to_native_str(obj[b'url'])
     originating = self._request_model(url=native_url, meta=obj[b'meta'])
     status = obj[b'status_code']
     raw_body = b64decode(obj[b'body'])
     return self._response_model(url=native_url,
                                 status_code=status,
                                 body=raw_body,
                                 request=originating)
Exemple #18
0
 def flush(self):
     """Write the cached states to the session and flush the parent cache.

     Each ``(fingerprint, state)`` pair of ``self._cache`` is merged into
     the session as a ``self.model`` row before the session is committed
     and the parent ``flush`` is called.
     """
     for cached_fprint, cached_state in six.iteritems(self._cache):
         self.session.merge(
             self.model(fingerprint=to_native_str(cached_fprint),
                        state=cached_state))
     self.session.commit()
     self.logger.debug("State cache has been flushed.")
     super(States, self).flush()
Exemple #19
0
    def _create_page(self, obj):
        """Create a new ``self.model`` row from a request or a response.

        Fingerprint, URL, meta, creation time (``datetime.utcnow()``) and a
        depth of 0 are always set. Headers, method and cookies come from
        ``obj`` itself when it is a ``Request`` and from ``obj.request``
        when it is a ``Response``; a response also provides the status
        code. Any other object leaves those attributes untouched.
        """
        db_page = self.model()
        db_page.fingerprint = to_native_str(obj.meta[b'fingerprint'])
        db_page.url = obj.url
        db_page.created_at = datetime.utcnow()
        db_page.meta = obj.meta
        db_page.depth = 0

        is_request = isinstance(obj, Request)
        is_response = not is_request and isinstance(obj, Response)
        if is_request or is_response:
            origin = obj if is_request else obj.request
            db_page.headers = origin.headers
            db_page.method = to_native_str(origin.method)
            db_page.cookies = origin.cookies
            if is_response:
                db_page.status_code = obj.status_code
        return db_page
Exemple #20
0
 def _modify_page(self, obj):
     """Update the cached page that matches ``obj``'s fingerprint.

     ``fetched_at`` is always set to ``datetime.utcnow()``. When ``obj`` is
     a ``Response``, headers, method and cookies are copied from its
     request and the status code from the response itself.
     """
     db_page = self.cache[obj.meta[b'fingerprint']]
     db_page.fetched_at = datetime.utcnow()
     if not isinstance(obj, Response):
         return db_page
     origin = obj.request
     db_page.headers = origin.headers
     db_page.method = to_native_str(origin.method)
     db_page.cookies = origin.cookies
     db_page.status_code = obj.status_code
     return db_page
Exemple #21
0
    def _create_page(self, obj):
        """Return a fresh ``self.model`` row describing ``obj``.

        The row always gets the fingerprint (as native str), URL, meta,
        ``created_at`` (``datetime.utcnow()``) and ``depth`` 0. For a
        ``Request`` the headers, method and cookies are taken from ``obj``;
        for a ``Response`` they are taken from ``obj.request`` and the
        status code is stored as well.
        """
        page = self.model()
        page.fingerprint = to_native_str(obj.meta[b'fingerprint'])
        page.url = obj.url
        page.created_at = datetime.utcnow()
        page.meta = obj.meta
        page.depth = 0

        if isinstance(obj, Request):
            page.headers = obj.headers
            page.method = to_native_str(obj.method)
            page.cookies = obj.cookies
            return page
        if isinstance(obj, Response):
            request = obj.request
            page.headers = request.headers
            page.method = to_native_str(request.method)
            page.cookies = request.cookies
            page.status_code = obj.status_code
        return page
Exemple #22
0
 def _modify_page(self, obj):
     """Refresh the cached page stored under ``obj``'s fingerprint.

     The page's ``fetched_at`` is set to ``datetime.utcnow()``. A
     ``Response`` additionally overwrites headers, method and cookies with
     those of its request, and the status code with its own.
     """
     page = self.cache[obj.meta[b'fingerprint']]
     page.fetched_at = datetime.utcnow()
     if isinstance(obj, Response):
         request = obj.request
         page.headers = request.headers
         page.method = to_native_str(request.method)
         page.cookies = request.cookies
         page.status_code = obj.status_code
     return page
Exemple #23
0
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    """Return the components of ``parts`` as a tuple of safe strings.

    The result is ``(scheme, netloc, path, params, query, fragment)``.
    Path and params are encoded with ``path_encoding`` and quoted; query
    and fragment are encoded with ``encoding`` and quoted.
    """
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com); keep the raw netloc then.
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    scheme = to_native_str(parts.scheme)
    netloc = to_native_str(netloc)
    # The path component defaults to UTF-8.
    path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    params = quote(to_bytes(parts.params, path_encoding), _safe_chars)
    # Query and fragment follow the page encoding or form charset.
    query = quote(to_bytes(parts.query, encoding), _safe_chars)
    fragment = quote(to_bytes(parts.fragment, encoding), _safe_chars)
    return (scheme, netloc, path, params, query, fragment)
Exemple #24
0
    def filter(self, record):
        """Flatten a dict log message into a single string; always pass.

        When ``record.msg`` is a dict, every name in
        ``self.excluded_fields`` is copied onto the record as an attribute
        (``''`` when the key is absent). The remaining values are converted
        with ``to_native_str`` and joined with ``self.separator``. A result
        longer than a truthy ``self.msg_max_length`` is cut and ends with
        ``"..."``. Records with a non-dict message are left untouched.
        """
        if not isinstance(record.msg, dict):
            return True

        for field_name in self.excluded_fields:
            setattr(record, field_name, record.msg.get(field_name, ''))

        pieces = []
        for key, value in six.iteritems(record.msg):
            if key in self.excluded_fields:
                continue
            pieces.append(to_native_str(value))
        record.msg = self.separator.join(pieces)

        limit = self.msg_max_length
        if limit and len(record.msg) > limit:
            # Leave room for the three-character ellipsis.
            record.msg = record.msg[:limit - 3] + "..."
        return True
Exemple #25
0
    def filter(self, record):
        """Turn a dict ``record.msg`` into text and let the record through.

        For a dict message, each name in ``self.excluded_fields`` becomes a
        record attribute (defaulting to ``''``), and the values of all other
        keys are converted with ``to_native_str`` and joined with
        ``self.separator``. If ``self.msg_max_length`` is truthy and the
        text is longer, it is truncated and terminated with ``"..."``.
        The method returns ``True`` in every case.
        """
        if isinstance(record.msg, dict):
            for excluded in self.excluded_fields:
                setattr(record, excluded, record.msg.get(excluded, ''))
            kept_values = (to_native_str(item)
                           for name, item in six.iteritems(record.msg)
                           if name not in self.excluded_fields)
            record.msg = self.separator.join(list(kept_values))
            too_long = (self.msg_max_length
                        and len(record.msg) > self.msg_max_length)
            if too_long:
                cut = self.msg_max_length - 3
                record.msg = record.msg[0:cut] + "..."

        return True
Exemple #26
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overridden by path_encoding), and given
    encoding is used for query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    If the network location cannot be IDNA-encoded, it is kept as parsed
    instead of raising ``UnicodeError``.

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # it is assumed that a raw bytes input comes from the page
    # corresponding to the encoding
    #
    # Note: if this assumption is wrong, decoding will fail;
    #       in the general case, users are required to use Unicode
    #       or safe ASCII bytes input
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com); fall back to the
    # netloc as parsed rather than letting UnicodeError escape.
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Exemple #27
0
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    """Build a 6-tuple of safe URL components from ``parts``.

    Scheme and netloc are converted with ``to_native_str`` (the netloc is
    IDNA-encoded when possible). Path and params are quoted after encoding
    with ``path_encoding``; query and fragment after encoding with
    ``encoding``.
    """
    def _quoted(component, charset):
        # Encode the component to bytes, then percent-quote it.
        return quote(to_bytes(component, charset), _safe_chars)

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com).
    try:
        host_part = parts.netloc.encode('idna')
    except UnicodeError:
        host_part = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(host_part),
        # The path component defaults to UTF-8.
        _quoted(parts.path, path_encoding),
        _quoted(parts.params, path_encoding),
        # Query and fragment follow the page encoding or form charset.
        _quoted(parts.query, encoding),
        _quoted(parts.fragment, encoding),
    )
Exemple #28
0
    def fetch(self, fingerprints):
        """Populate ``self._cache`` with the states of uncached fingerprints.

        Only fingerprints missing from the cache are looked up. They are
        converted with ``to_native_str`` and queried 128 at a time; every
        row found is stored in the cache keyed by ``to_bytes`` of its
        fingerprint.
        """
        pending = [to_native_str(f) for f in fingerprints
                   if f not in self._cache]
        debug = self.logger.debug
        debug("cache size %s", len(self._cache))
        debug("to fetch %d from %d", len(pending), len(fingerprints))

        for batch in chunks(pending, 128):
            condition = self.model.fingerprint.in_(batch)
            for found in self.session.query(self.model).filter(condition):
                self._cache[to_bytes(found.fingerprint)] = found.state
Exemple #29
0
 def _response_from_object(self, obj):
     """Build a ``self._response_model`` instance from an indexable record.

     ``obj`` is read positionally: URL (0), status code (1), request meta
     (2), response headers (3), body (4), request method (5), request
     headers (6) and request cookies (7). The URL is converted with
     ``to_native_str`` and shared by the response and its request.
     """
     url = to_native_str(obj[0])
     status_code = obj[1]
     body = obj[4]
     headers = obj[3]
     origin = self._request_model(url=url,
                                  meta=obj[2],
                                  method=obj[5],
                                  headers=obj[6],
                                  cookies=obj[7])
     return self._response_model(url=url,
                                 status_code=status_code,
                                 body=body,
                                 headers=headers,
                                 request=origin)
Exemple #30
0
 def test_metadata(self):
     """Exercise ``HBaseMetadata`` and check the URLs stored in HBase.

     Connects to the ``hbase-docker`` host on port 9090, feeds seeds, a
     crawled page, extracted links and a request error, then asserts that
     the ``m:url`` values scanned from the ``metadata`` table are exactly
     the URLs of ``r1``, ``r2`` and ``r3``. The rows are deleted afterwards.
     """
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     stored_urls = {to_native_str(data[b'm:url'], 'utf-8')
                    for _, data in table.scan()}
     expected_urls = {r1.url, r2.url, r3.url}
     assert stored_urls == expected_urls
     self.delete_rows(table, [b'10', b'11', b'12'])
Exemple #31
0
 def test_metadata(self):
     """Check that ``HBaseMetadata`` stores the URLs of the seeded requests.

     Runs the metadata component against the ``hbase-docker`` host (port
     9090) with seeds, a crawled page, extracted links and a request
     error, and compares the ``m:url`` column of the ``metadata`` table
     with the URLs of ``r1``, ``r2`` and ``r3`` before cleaning up.
     """
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     seeds = [r1, r2, r3]
     metadata.add_seeds(seeds)
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     found = set()
     for _, data in table.scan():
         found.add(to_native_str(data[b'm:url'], 'utf-8'))
     assert found == set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
Exemple #32
0
    def __init__(self, url, status_code=200, headers=None, body='', request=None):
        """
        :param string url: URL of this response; stored as a native str.
        :param int status_code: the HTTP status of the response. Defaults to 200.
        :param dict headers: dictionary of headers; a falsy value becomes an empty dict.
        :param str body: the response body.
        :param Request request: The Request object that generated this response.
        """

        self._url = to_native_str(url)
        self._status_code = int(status_code)
        self._headers = headers if headers else {}
        self._body, self._request = body, request
Exemple #33
0
 def __init__(self, url, method='GET', headers=None, cookies=None, meta=None, body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use; a falsy value means 'GET'. Stored upper-cased as a native str.
     :param dict headers: dictionary of headers to send; a falsy value becomes an empty dict.
     :param dict cookies: dictionary of cookies to attach to this request; a falsy value becomes an empty dict.
     :param dict meta: dictionary that contains arbitrary metadata for this request; a falsy value becomes
         ``{'scrapy_meta': {}}``.
     :param body: the request body, stored as given.
     """
     self._url = url
     self._method = to_native_str(method if method else 'GET').upper()
     self._headers = headers if headers else {}
     self._cookies = cookies if cookies else {}
     self._meta = meta if meta else {'scrapy_meta': {}}
     self._body = body
Exemple #34
0
 def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''):
     """
     :param string url: URL to send; stored as a native str.
     :param string method: HTTP method to use; a falsy value means b'GET'. Stored upper-cased as bytes.
     :param dict headers: dictionary of headers to send; a falsy value becomes an empty dict.
     :param dict cookies: dictionary of cookies to attach to this request; a falsy value becomes an empty dict.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and
         the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte
         type items. A falsy value becomes ``{b'scrapy_meta': {}}``.
     :param body: the request body, stored as given.
     """
     effective_method = method if method else b'GET'
     self._url = to_native_str(url)
     self._method = to_bytes(effective_method.upper())
     self._headers = headers if headers else {}
     self._cookies = cookies if cookies else {}
     self._meta = meta if meta else {b'scrapy_meta': {}}
     self._body = body
Exemple #35
0
 def decode(self, buffer):
     """Unpack ``buffer`` and return the matching event tuple.

     The first element of the unpacked object is a short type tag; the
     returned tuple starts with the event name:

     * ``b'pc'``: ``('page_crawled', response)``
     * ``b'le'``: ``('links_extracted', request, [link requests])``
     * ``b'us'``: ``('update_score', request, obj[2], obj[3])``
     * ``b're'``: ``('request_error', request, error as native str)``
     * ``b'as'``: ``('add_seeds', [seed requests])``
     * ``b'njid'``: ``('new_job_id', int)``
     * ``b'of'``: ``('offset', int, int)``
     * ``b'ou'``: ``('overused', int, [native str, ...])``

     :raises TypeError: for any other tag.
     """
     obj = unpackb(buffer, encoding='utf-8')
     tag = obj[0]
     to_request = self._request_from_object
     if tag == b'pc':
         return ('page_crawled', self._response_from_object(obj[1]))
     if tag == b'le':
         return ('links_extracted', to_request(obj[1]),
                 [to_request(link) for link in obj[2]])
     if tag == b'us':
         return ('update_score', to_request(obj[1]), obj[2], obj[3])
     if tag == b're':
         return ('request_error', to_request(obj[1]), to_native_str(obj[2]))
     if tag == b'as':
         return ('add_seeds', [to_request(seed) for seed in obj[1]])
     if tag == b'njid':
         return ('new_job_id', int(obj[1]))
     if tag == b'of':
         return ('offset', int(obj[1]), int(obj[2]))
     if tag == b'ou':
         return ('overused', int(obj[1]),
                 [to_native_str(name) for name in obj[2]])
     raise TypeError('Unknown message type')
    def __init__(self, manager, args, mb_stream, states_context):
        """Set up the discovery strategy.

        Creates the ``discovery`` logger, wraps the backend's domain
        metadata in ``DomainCacheProxyWeb``, loads the public suffix list
        from ``public_suffix_list.dat`` in the current working directory and
        reads ``USER_AGENT`` and ``DISCOVERY_MAX_PAGES`` from the manager
        settings before calling the parent initializer.

        :raises IOError: if the public suffix file cannot be opened (the
            error is logged with a hint on where to get the file).
        """
        self.logger = logging.getLogger("discovery")
        backend = manager.backend
        self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)

        try:
            psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
        except IOError:
            self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
            raise
        # Close the handle once the list is built; it used to be left open.
        # NOTE(review): assumes PublicSuffixList reads the whole file inside
        # its constructor - confirm for the installed publicsuffix version.
        try:
            self._suffix_list = PublicSuffixList(psl_file)
        finally:
            psl_file.close()
        self._states_ctx = states_context
        self.states = backend.states

        self.user_agent = to_native_str(manager.settings.get('USER_AGENT'))
        self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES'))
        super(Discovery, self).__init__(manager, args, mb_stream, states_context)
Exemple #37
0
    def __init__(self,
                 url,
                 status_code=200,
                 headers=None,
                 body='',
                 request=None):
        """
        :param string url: URL of this response; converted with ``to_native_str``.
        :param int status_code: the HTTP status of the response, converted with ``int``. Defaults to 200.
        :param dict headers: dictionary of headers; an empty dict is used for a falsy value.
        :param str body: the response body.
        :param Request request: The Request object that generated this response.
        """

        if not headers:
            headers = {}
        self._url = to_native_str(url)
        self._status_code = int(status_code)
        self._headers = headers
        self._body = body
        self._request = request
Exemple #38
0
 def _get_item(self, key):
     """Load the row for ``key`` from the HBase table as a dict.

     Increments ``stats["hbase_gets"]``. If the row is empty,
     ``stats["hbase_misses"]`` is incremented, the parent ``__missing__``
     is called and ``KeyError`` is raised. Otherwise each cell is unpacked
     with ``unpackb`` and stored under the part of its key after the first
     ``b':'`` (as native str); columns listed in ``self._set_fields`` are
     converted to sets. ``self._on_get_func``, when set, is called with the
     resulting dict before it is returned.
     """
     self.stats["hbase_gets"] += 1
     row = self._table.row(to_bytes(key))
     if not row:
         self.stats["hbase_misses"] += 1
         super(DomainCache, self).__missing__(key)
         raise KeyError

     value = {}
     for cell_key, packed in six.iteritems(row):
         col = to_native_str(cell_key.partition(b':')[2])
         unpacked = unpackb(packed, encoding='utf-8')
         # XXX extract some fields as a set for faster in-checks
         value[col] = set(unpacked) if col in self._set_fields else unpacked
     if self._on_get_func:
         self._on_get_func(value)
     return value
Exemple #39
0
 def _get_item(self, key):
     """Return the HBase row stored for ``key`` as a dict of columns.

     ``stats["hbase_gets"]`` is incremented on every call. An empty row
     counts as a miss: ``stats["hbase_misses"]`` is incremented, the parent
     ``__missing__`` is called and ``KeyError`` is raised. For a non-empty
     row, the text after the first ``b':'`` of each cell key becomes the
     dict key (native str) and the cell is unpacked with ``unpackb``;
     values of columns in ``self._set_fields`` are turned into sets. A
     truthy ``self._on_get_func`` is called with the dict before returning.
     """
     self.stats["hbase_gets"] += 1
     hbase_key = to_bytes(key)
     cells = self._table.row(hbase_key)
     if cells:
         result = {}
         for qualified, raw in six.iteritems(cells):
             _, _, column = qualified.partition(b':')
             column = to_native_str(column)
             result[column] = unpackb(raw, encoding='utf-8')
             # XXX extract some fields as a set for faster in-checks
             if column in self._set_fields:
                 result[column] = set(result[column])
         if self._on_get_func:
             self._on_get_func(result)
         return result
     self.stats["hbase_misses"] += 1
     super(DomainCache, self).__missing__(key)
     raise KeyError
Exemple #40
0
 def schedule(self, batch):
     """Save the schedulable requests of ``batch`` to the queue table.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
     only entries with a truthy ``schedule`` are stored. The partition is
     chosen by ``self.partitioner`` from the URL's hostname, or is the
     first partition (with ``host_crc32`` 0 and an error log entry) when no
     hostname can be parsed. Stored requests get
     ``meta[b'state'] = States.QUEUED``. Rows are bulk-saved and committed
     once at the end.
     """
     to_save = []
     for fprint, score, request, schedule in batch:
         if not schedule:
             continue
         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
         if not hostname:
             self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
             partition_id, host_crc32 = self.partitions[0], 0
         else:
             partition_id = self.partitioner.partition(hostname, self.partitions)
             host_crc32 = get_crc32(hostname)
         fields = dict(
             fingerprint=to_native_str(fprint),
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=to_native_str(request.method),
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time()*1E+6,
         )
         to_save.append(self.queue_model(**fields))
         request.meta[b'state'] = States.QUEUED
     self.session.bulk_save_objects(to_save)
     self.session.commit()
Exemple #41
0
 def from_frontier(self, frontier_request):
     """request: Frontier > Scrapy

     Build a ``ScrapyRequest`` from ``frontier_request``. The callback and
     errback stored in the frontier meta are resolved with ``_get_method``
     when a spider is set. The Scrapy meta is taken from
     ``b'scrapy_meta'`` (a new dict when absent) and receives the frontier
     request itself under ``b'frontier_request'``.
     """
     def resolve(handler):
         # Stored handlers are only resolved when a spider is available.
         if handler and self.spider:
             return _get_method(self.spider, handler)
         return handler

     callback = resolve(frontier_request.meta.get(b'scrapy_callback', None))
     errback = resolve(frontier_request.meta.get(b'scrapy_errback', None))
     body = frontier_request.body
     scrapy_meta = frontier_request.meta.get(b'scrapy_meta', {})
     scrapy_meta[b'frontier_request'] = frontier_request
     return ScrapyRequest(url=frontier_request.url,
                          callback=callback,
                          errback=errback,
                          body=body,
                          method=to_native_str(frontier_request.method),
                          headers=frontier_request.headers,
                          cookies=frontier_request.cookies,
                          meta=scrapy_meta,
                          dont_filter=True)
Exemple #42
0
def hostname_local_fingerprint(key):
    """
    Build a URL fingerprint that keeps documents of one host close together.

    The fingerprint is the hex encoding of 20 packed bytes: the first 4 bytes
    are the CRC32 of the hostname, the remaining 16 bytes are the MD5 digest
    of the rest of the URL (path, params, query and fragment). According to
    the original author, this layout is meant to make use of the HBase block
    cache, so that the documents of an average website fit within one cache
    block.

    When the URL has no hostname, ``sha1(key)`` is returned instead.

    :param key: str URL
    :return: str hex string (20 bytes encoded as hex) when a hostname is present
    """
    parsed = parse_url(key)
    hostname = parsed.hostname
    if not hostname:
        return sha1(key)

    host_crc = get_crc32(hostname)

    # Everything after the host: "<path>;<params><query><fragment>".
    local_part = ''.join((parsed.path, ';', parsed.params, parsed.query, parsed.fragment))
    local_digest = hashlib.md5(to_bytes(local_part, 'utf8', 'ignore')).digest()

    # Big-endian signed 32-bit checksum followed by the 16-byte digest.
    packed = pack(">i16s", host_crc, local_digest)
    return to_native_str(hexlify(packed), 'utf8')
Exemple #43
0
 def from_frontier(self, frontier_request):
     """Translate a frontier request into its Scrapy counterpart.

     Callback/errback names found in the frontier meta
     (``b'scrapy_callback'`` and ``b'scrapy_errback'``) are looked up with
     ``_get_method`` if a spider is attached to this converter; without a
     spider the stored values are forwarded as they are.

     :param frontier_request: request object coming from the frontier.
     :return: a ``ScrapyRequest`` built with ``dont_filter=True``.
     """
     source_meta = frontier_request.meta

     on_success = source_meta.get(b'scrapy_callback', None)
     if on_success and self.spider:
         on_success = _get_method(self.spider, on_success)

     on_failure = source_meta.get(b'scrapy_errback', None)
     if on_failure and self.spider:
         on_failure = _get_method(self.spider, on_failure)

     payload = frontier_request.body

     # Keep a reference to the frontier request in the Scrapy meta.
     target_meta = source_meta.get(b'scrapy_meta', {})
     target_meta[b'frontier_request'] = frontier_request

     http_method = to_native_str(frontier_request.method)
     return ScrapyRequest(
         url=frontier_request.url,
         callback=on_success,
         errback=on_failure,
         body=payload,
         method=http_method,
         headers=frontier_request.headers,
         cookies=frontier_request.cookies,
         meta=target_meta,
         dont_filter=True,
     )
Exemple #44
0
 def decode(self, buffer):
     """Decode a packed message into a ``(message_name, ...)`` tuple.

     The buffer is unpacked with ``unpackb``; the first element of the
     unpacked object is a bytes tag selecting the message type:

     * ``b'pc'``   -> ``('page_crawled', response)``
     * ``b'le'``   -> ``('links_extracted', request, [request, ...])``
     * ``b'us'``   -> ``('update_score', request, obj[2], obj[3])``
     * ``b're'``   -> ``('request_error', request, error_str)``
     * ``b'as'``   -> ``('add_seeds', [request, ...])``
     * ``b'njid'`` -> ``('new_job_id', int)``
     * ``b'of'``   -> ``('offset', int, int)``

     :param buffer: packed message as accepted by ``unpackb``.
     :return: tuple whose first item is the message name.
     :raises TypeError: if the message type tag is not recognised.
     """
     obj = unpackb(buffer)
     tag = obj[0]
     if tag == b'pc':
         return ('page_crawled',
                 self._response_from_object(obj[1]))
     if tag == b'le':
         return ('links_extracted',
                 self._request_from_object(obj[1]),
                 [self._request_from_object(x) for x in obj[2]])
     if tag == b'us':
         return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3])
     if tag == b're':
         return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2]))
     if tag == b'as':
         return ('add_seeds', [self._request_from_object(x) for x in obj[1]])
     if tag == b'njid':
         return ('new_job_id', int(obj[1]))
     if tag == b'of':
         return ('offset', int(obj[1]), int(obj[2]))
     # Raise (not return) so an unknown tag cannot be mistaken for a
     # successfully decoded message by the caller.
     raise TypeError('Unknown message type')
Exemple #45
0
 def __init__(self,
              url,
              method=b'GET',
              headers=None,
              cookies=None,
              meta=None,
              body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use; a falsy value falls back to ``b'GET'``. \
     The method is upper-cased and stored as bytes.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
     the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items. \
     A falsy value is replaced by ``{b'scrapy_meta': {}}``.
     :param body: request body, stored as given.
     """
     self._url = to_native_str(url)

     if not method:
         method = b'GET'
     self._method = to_bytes(method.upper())

     # Falsy containers (None or empty) are replaced by fresh defaults.
     self._headers = headers if headers else {}
     self._cookies = cookies if cookies else {}
     self._meta = meta if meta else {b'scrapy_meta': {}}
     self._body = body
Exemple #46
0
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        Queue rows of the given partition are read in the order defined by
        ``self._order_by``, converted to requests and deleted from the queue;
        the deletions are committed once after the loop. Any exception is
        logged and the session is rolled back; the requests collected up to
        that point are still returned.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        batch = []
        try:
            partition_query = self.session.query(self.queue_model).filter_by(partition_id=partition_id)
            ordered = self._order_by(partition_query).limit(max_n_requests)
            for row in ordered:
                request = Request(row.url,
                                  method=row.method or 'GET',
                                  meta=row.meta,
                                  headers=row.headers,
                                  cookies=row.cookies)
                # NOTE(review): these meta keys are native strings, while the
                # Request docstring in this listing asks for bytes keys —
                # confirm what consumers of this queue expect.
                request.meta['fingerprint'] = to_native_str(row.fingerprint)
                request.meta['score'] = row.score
                batch.append(request)
                self.session.delete(row)
            self.session.commit()
        except Exception as exc:
            self.logger.exception(exc)
            self.session.rollback()
        return batch
Exemple #47
0
def _get_method(obj, name):
    """Return the attribute of ``obj`` called ``name``.

    ``name`` is first converted with ``to_native_str``.

    :raises ValueError: if ``obj`` has no attribute with that name.
    """
    attr_name = to_native_str(name)
    try:
        method = getattr(obj, attr_name)
    except AttributeError:
        raise ValueError("Method %r not found in: %s" % (attr_name, obj))
    else:
        return method
Exemple #48
0
 def _request_from_object(self, obj):
     """Rebuild a request model instance from its decoded mapping form.

     ``obj`` is indexed with the bytes keys ``b'url'``, ``b'method'``,
     ``b'headers'``, ``b'cookies'`` and ``b'meta'``; only the URL is
     converted with ``to_native_str``.
     """
     make_request = self._request_model
     url = to_native_str(obj[b'url'])
     method = obj[b'method']
     headers = obj[b'headers']
     cookies = obj[b'cookies']
     meta = obj[b'meta']
     return make_request(url=url, method=method, headers=headers, cookies=cookies, meta=meta)
Exemple #49
0
 def __init__(self, separator=None, excluded_fields=None, msg_max_length=0):
     """Set up the filter.

     :param separator: separator value; a falsy value falls back to a single
         space. Stored after conversion with ``to_native_str``.
     :param excluded_fields: fields to exclude; a falsy value becomes ``[]``.
     :param msg_max_length: stored as given (default ``0``).
     """
     super(PlainValuesFilter, self).__init__()
     if not separator:
         separator = " "
     if not excluded_fields:
         excluded_fields = []
     self.separator = to_native_str(separator)
     self.excluded_fields = excluded_fields
     self.msg_max_length = msg_max_length
Exemple #50
0
 def _request_from_object(self, obj):
     """Create a request model instance from a decoded mapping.

     The mapping is read through the bytes keys ``b'url'``, ``b'method'``,
     ``b'headers'``, ``b'cookies'`` and ``b'meta'``; the URL alone goes
     through ``to_native_str``.
     """
     build = self._request_model
     native_url = to_native_str(obj[b'url'])
     return build(
         url=native_url,
         method=obj[b'method'],
         headers=obj[b'headers'],
         cookies=obj[b'cookies'],
         meta=obj[b'meta'],
     )
Exemple #51
0
def _get_method(obj, name):
    """Fetch the attribute named ``name`` (after ``to_native_str``) from ``obj``.

    :raises ValueError: when the attribute does not exist on ``obj``.
    """
    native_name = to_native_str(name)
    try:
        found = getattr(obj, native_name)
    except AttributeError:
        raise ValueError("Method %r not found in: %s" % (native_name, obj))
    return found
Exemple #52
0
 def update_score(self, batch):
     """Merge a score record for every batch item, then commit once.

     Each item of ``batch`` is unpacked as
     ``(fingerprint, score, request, schedule)``; only the fingerprint and
     the score are used here.
     """
     for fingerprint, new_score, _request, _schedule in batch:
         record = self.model(fingerprint=to_native_str(fingerprint), score=new_score)
         self.session.merge(record)
     self.session.commit()
Exemple #53
0
 def update_score(self, batch):
     """Write the scores carried by ``batch`` through the session.

     Batch items are 4-tuples ``(fingerprint, score, request, schedule)``.
     The request and schedule entries are ignored; a model instance built
     from the fingerprint and score is merged for each item and a single
     commit follows the loop.
     """
     for entry in batch:
         fprint, score, _unused_request, _unused_schedule = entry
         self.session.merge(
             self.model(fingerprint=to_native_str(fprint), score=score)
         )
     self.session.commit()
Exemple #54
0
def parse_url(url, encoding=None):
    """Return the urlparsed form of ``url``.

    An argument that is already a ``parse.ParseResult`` is handed back
    unchanged; anything else is converted with ``to_native_str`` and parsed
    with ``parse.urlparse``.

    ``encoding`` is accepted but not used by this implementation.
    """
    if isinstance(url, parse.ParseResult):
        return url
    return parse.urlparse(to_native_str(url))
Exemple #55
0
 def test_deprecation(self):
     """Calling ``to_native_str`` must happen inside ``deprecated_call()``."""
     empty_text = ""
     with deprecated_call():
         to_native_str(empty_text)
Exemple #56
0
 def __init__(self, separator=None, excluded_fields=None, msg_max_length=0):
     """Initialise the filter's settings.

     :param separator: separator value, defaulting to ``" "`` when falsy;
         it is stored after passing through ``to_native_str``.
     :param excluded_fields: fields to exclude, defaulting to an empty list
         when falsy.
     :param msg_max_length: stored unchanged (default ``0``).
     """
     super(PlainValuesFilter, self).__init__()
     chosen_separator = separator if separator else " "
     self.separator = to_native_str(chosen_separator)
     self.excluded_fields = excluded_fields if excluded_fields else []
     self.msg_max_length = msg_max_length
Exemple #57
0
 def _request_from_object(self, obj):
     """Rebuild a request model instance from its decoded sequence form.

     ``obj`` is read positionally: index 0 is the URL (converted with
     ``to_native_str``), followed by method, headers, cookies and meta at
     indices 1 through 4.
     """
     make_request = self._request_model
     url = to_native_str(obj[0])
     method = obj[1]
     headers = obj[2]
     cookies = obj[3]
     meta = obj[4]
     return make_request(url=url, method=method, headers=headers, cookies=cookies, meta=meta)