Example #1
 def schedule(self, batch):
     to_save = []
     for fprint, score, request, schedule in batch:
         if schedule:
             _, hostname, _, _, _, _ = parse_domain_from_url_fast(
                 request.url)
             if not hostname:
                 self.logger.error(
                     "Can't get hostname for URL %s, fingerprint %s" %
                     (request.url, fprint))
                 partition_id = self.partitions[0]
                 host_crc32 = 0
             else:
                 partition_id = self.partitioner.partition(
                     hostname, self.partitions)
                 host_crc32 = get_crc32(hostname)
             q = self.queue_model(fingerprint=to_native_str(fprint),
                                  score=score,
                                  url=request.url,
                                  meta=request.meta,
                                  headers=request.headers,
                                  cookies=request.cookies,
                                  method=to_native_str(request.method),
                                  partition_id=partition_id,
                                  host_crc32=host_crc32,
                                  created_at=time() * 1E+6)
             to_save.append(q)
             request.meta[b'state'] = States.QUEUED
     self.session.bulk_save_objects(to_save)
     self.session.commit()
Example #2
def safe_url_string(url,
                    encoding='utf8',
                    path_encoding='utf8',
                    quote_path=True):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode the URL path component,
    which is then quoted. Otherwise, if quote_path is False, the path component
    is neither encoded nor quoted. The given encoding is used for the query
    string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    else:
        path = to_native_str(parts.path)

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        path,
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
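A quick usage sketch for the function above; it assumes the function is importable as w3lib.url.safe_url_string, and the example URL is illustrative only.

# Illustrative usage only; assumes safe_url_string is importable from w3lib.url.
from w3lib.url import safe_url_string

url = "https://www.example.com/köln path?q=dübel#frag ment"
safe = safe_url_string(url)          # non-ASCII characters and spaces get percent-encoded
print(safe)

# Per the docstring, an already "safe" URL is returned unmodified.
assert safe_url_string(safe) == safe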
Example #3
 def _request_from_object(self, obj):
     return self._request_model(url=to_native_str(obj[0]),
                                method=obj[1],
                                headers=obj[2],
                                cookies=obj[3],
                                meta=obj[4],
                                body=obj[5])
Example #4
 def read_seeds(self, stream):
     processed, scheduled = 0, 0
     requests = []
     for line in stream:
         url = to_native_str(line.strip())
         if url.startswith("#"):
             continue
         if not url.startswith("http"):
             url = "http://" + url + "/"
         try:
             request = self.create_request(url,
                                           meta={b'home': True},
                                           headers=DEFAULT_HEADERS)
             requests.append(request)
             if len(requests) % 40000 == 0:
                 scheduled += self._schedule_batch(requests)
                 processed += len(requests)
                 self.logger.info("Processed %d, scheduled %d urls.",
                                  processed, scheduled)
                 requests = []
         except Exception:
             self.logger.exception("Error during seeds addition")
     if requests:
         try:
             scheduled += self._schedule_batch(requests)
         except Exception:
             self.logger.exception("Error during seeds addition")
         processed += len(requests)
     self.logger.info("Processed %d, and scheduled %d urls overall.",
                      processed, scheduled)
Example #5
 def _response_from_object(self, obj):
     url = to_native_str(obj[0])
     return self._response_model(url=url,
                                 status_code=obj[1],
                                 body=obj[3],
                                 request=self._request_model(url=url,
                                                             meta=obj[2]))
Example #6
    def from_frontier(self, frontier_request):
        """request: Frontier > Scrapy"""
        cb = frontier_request.meta.get(b'scrapy_callback', None)
        if cb and self.spider:
            cb = _get_method(self.spider, cb)
        eb = frontier_request.meta.get(b'scrapy_errback', None)
        if eb and self.spider:
            eb = _get_method(self.spider, eb)
        body = frontier_request.meta.get(b'scrapy_body', None)
        meta = frontier_request.meta[b'scrapy_meta']
        meta.pop('cf_store', None)
        for attr, val in frontier_request.meta.get(b'spider_state', []):
            prev_value = getattr(self.spider, attr, None)
            if prev_value is not None and prev_value != val:
                _LOG.error(
                    "State change for attribute '%s' from '%s' to '%s' attempted by request <%s>; "
                    "the crawl may lose consistency. Per-request state should be "
                    "propagated via request attributes.",
                    attr, prev_value, val, frontier_request.url)
            elif prev_value != val:
                setattr(self.spider, attr, val)
                _LOG.info("State for attribute '%s' set to %s by request <%s>",
                          attr, val, frontier_request.url)

        return ScrapyRequest(url=frontier_request.url,
                             callback=cb,
                             errback=eb,
                             body=body,
                             method=to_native_str(frontier_request.method),
                             headers=frontier_request.headers,
                             cookies=frontier_request.cookies,
                             meta=meta,
                             dont_filter=True)
Example #7
 def decode_request(self, message):
     obj = dict_to_bytes(super(Decoder, self).decode(message))
     return self._request_model(url=to_native_str(obj[b'url']),
                                method=obj[b'method'],
                                headers=obj[b'headers'],
                                cookies=obj[b'cookies'],
                                meta=obj[b'meta'])
Example #8
 def decode(self, message):
     message = dict_to_bytes(super(Decoder, self).decode(message))
     if message[b'type'] == b'links_extracted':
         request = self._request_from_object(message[b'r'])
         links = [
             self._request_from_object(link) for link in message[b'links']
         ]
         return ('links_extracted', request, links)
     if message[b'type'] == b'page_crawled':
         response = self._response_from_object(message[b'r'])
         return ('page_crawled', response)
     if message[b'type'] == b'request_error':
         request = self._request_from_object(message[b'r'])
         return ('request_error', request, to_native_str(message[b'error']))
     if message[b'type'] == b'update_score':
         return ('update_score', self._request_from_object(message[b'r']),
                 message[b'score'], message[b'schedule'])
     if message[b'type'] == b'add_seeds':
         seeds = []
         for seed in message[b'seeds']:
             request = self._request_from_object(seed)
             seeds.append(request)
         return ('add_seeds', seeds)
     if message[b'type'] == b'new_job_id':
         return ('new_job_id', int(message[b'job_id']))
     if message[b'type'] == b'offset':
         return ('offset', int(message[b'partition_id']),
                 int(message[b'offset']))
     raise TypeError('Unknown message type')
Example #9
 def _response_from_object(self, obj):
     url = to_native_str(obj[0])
     return self._response_model(url=url,
                                 status_code=obj[1],
                                 body=obj[3],
                                 request=self._request_model(url=url,
                                                             meta=obj[2]))
Example #10
 def flush(self, force_clear=False):
     for fingerprint, state_val in six.iteritems(self._cache):
         state = self.model(fingerprint=to_native_str(fingerprint), state=state_val)
         self.session.merge(state)
     self.session.commit()
     self.logger.debug("State cache has been flushed.")
     super(States, self).flush(force_clear)
Example #11
 def decode_request(self, message):
     obj = dict_to_bytes(super(Decoder, self).decode(message))
     return self._request_model(url=to_native_str(obj[b'url']),
                                method=obj[b'method'],
                                headers=obj[b'headers'],
                                cookies=obj[b'cookies'],
                                meta=obj[b'meta'])
Example #12
 def _response_from_object(self, obj):
     url = to_native_str(obj[b'url'])
     request = self._request_model(url=url, meta=obj[b'meta'])
     return self._response_model(url=url,
                                 status_code=obj[b'status_code'],
                                 body=b64decode(obj[b'body']),
                                 request=request)
Example #13
 def decode(self, message):
     message = dict_to_bytes(super(Decoder, self).decode(message))
     if message[b'type'] == b'links_extracted':
         request = self._request_from_object(message[b'r'])
         links = [self._request_from_object(link) for link in message[b'links']]
         return ('links_extracted', request, links)
     if message[b'type'] == b'page_crawled':
         response = self._response_from_object(message[b'r'])
         return ('page_crawled', response)
     if message[b'type'] == b'request_error':
         request = self._request_from_object(message[b'r'])
         return ('request_error', request, to_native_str(message[b'error']))
     if message[b'type'] == b'update_score':
         return ('update_score', self._request_from_object(message[b'r']), message[b'score'], message[b'schedule'])
     if message[b'type'] == b'add_seeds':
         seeds = []
         for seed in message[b'seeds']:
             request = self._request_from_object(seed)
             seeds.append(request)
         return ('add_seeds', seeds)
     if message[b'type'] == b'new_job_id':
         return ('new_job_id', int(message[b'job_id']))
     if message[b'type'] == b'offset':
         return ('offset', int(message[b'partition_id']), int(message[b'offset']))
     raise TypeError('Unknown message type')
Example #14
    def fetch(self, fingerprints):
        to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s", len(self._cache))
        self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))

        for chunk in chunks(to_fetch, 128):
            for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)):
                self._cache[to_bytes(state.fingerprint)] = state.state
Example #15
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #16
File: url.py Project: wRAR/w3lib
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #17
 def _response_from_object(self, obj):
     url = to_native_str(obj[b'url'])
     request = self._request_model(url=url,
                                   meta=obj[b'meta'])
     return self._response_model(url=url,
                                 status_code=obj[b'status_code'],
                                 body=b64decode(obj[b'body']),
                                 request=request)
Example #18
 def flush(self):
     for fingerprint, state_val in six.iteritems(self._cache):
         state = self.model(fingerprint=to_native_str(fingerprint),
                            state=state_val)
         self.session.merge(state)
     self.session.commit()
     self.logger.debug("State cache has been flushed.")
     super(States, self).flush()
Example #19
    def _create_page(self, obj):
        db_page = self.model()
        db_page.fingerprint = to_native_str(obj.meta[b'fingerprint'])
        db_page.url = obj.url
        db_page.created_at = datetime.utcnow()
        db_page.meta = obj.meta
        db_page.depth = 0

        if isinstance(obj, Request):
            db_page.headers = obj.headers
            db_page.method = to_native_str(obj.method)
            db_page.cookies = obj.cookies
        elif isinstance(obj, Response):
            db_page.headers = obj.request.headers
            db_page.method = to_native_str(obj.request.method)
            db_page.cookies = obj.request.cookies
            db_page.status_code = obj.status_code
        return db_page
Example #20
 def _modify_page(self, obj):
     db_page = self.cache[obj.meta[b'fingerprint']]
     db_page.fetched_at = datetime.utcnow()
     if isinstance(obj, Response):
         db_page.headers = obj.request.headers
         db_page.method = to_native_str(obj.request.method)
         db_page.cookies = obj.request.cookies
         db_page.status_code = obj.status_code
     return db_page
Example #21
    def _create_page(self, obj):
        db_page = self.model()
        db_page.fingerprint = to_native_str(obj.meta[b'fingerprint'])
        db_page.url = obj.url
        db_page.created_at = datetime.utcnow()
        db_page.meta = obj.meta
        db_page.depth = 0

        if isinstance(obj, Request):
            db_page.headers = obj.headers
            db_page.method = to_native_str(obj.method)
            db_page.cookies = obj.cookies
        elif isinstance(obj, Response):
            db_page.headers = obj.request.headers
            db_page.method = to_native_str(obj.request.method)
            db_page.cookies = obj.request.cookies
            db_page.status_code = obj.status_code
        return db_page
Example #22
 def _modify_page(self, obj):
     db_page = self.cache[obj.meta[b'fingerprint']]
     db_page.fetched_at = datetime.utcnow()
     if isinstance(obj, Response):
         db_page.headers = obj.request.headers
         db_page.method = to_native_str(obj.request.method)
         db_page.cookies = obj.request.cookies
         db_page.status_code = obj.status_code
     return db_page
Example #23
File: url.py Project: wRAR/w3lib
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars))
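A hedged sketch of how this helper might be exercised inside the same module; urlparse/urlunparse come from urllib.parse, and _safe_ParseResult together with its helpers (quote, to_bytes, to_native_str, _safe_chars) are assumed to be in scope as shown above.

# Illustrative only; assumes _safe_ParseResult and its helpers are in scope (same module).
from urllib.parse import urlparse, urlunparse

parts = urlparse("http://www.example.com/söme path;pärams?q=wärt#frägment")
# The 6-tuple (scheme, netloc, path, params, query, fragment) returned above
# can be reassembled into an RFC-3986-safe URL with urlunparse.
print(urlunparse(_safe_ParseResult(parts, encoding='utf8', path_encoding='utf8')))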
Example #24
    def filter(self, record):
        if isinstance(record.msg, dict):
            for field_name in self.excluded_fields:
                setattr(record, field_name, record.msg.get(field_name, ''))
            record.msg = self.separator.join([to_native_str(value)
                                              for key, value in six.iteritems(record.msg)
                                              if key not in self.excluded_fields])
            if self.msg_max_length and len(record.msg) > self.msg_max_length:
                record.msg = record.msg[0:self.msg_max_length-3] + "..."

        return True
Example #25
    def filter(self, record):
        if isinstance(record.msg, dict):
            for field_name in self.excluded_fields:
                setattr(record, field_name, record.msg.get(field_name, ''))
            record.msg = self.separator.join([to_native_str(value)
                                              for key, value in six.iteritems(record.msg)
                                              if key not in self.excluded_fields])
            if self.msg_max_length and len(record.msg) > self.msg_max_length:
                record.msg = record.msg[0:self.msg_max_length-3] + "..."

        return True
Example #26
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for the
    URL path component (unless overridden by path_encoding), and the given
    encoding is used for the query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # it is assumed that a raw bytes input comes from the page
    # corresponding to the encoding
    #
    # Note: if this assumption is wrong, this will fail;
    #       in the general case, users are required to use Unicode
    #       or safe ASCII bytes input
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Example #27
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )
Example #28
    def fetch(self, fingerprints):
        to_fetch = [
            to_native_str(f) for f in fingerprints if f not in self._cache
        ]
        self.logger.debug("cache size %s", len(self._cache))
        self.logger.debug("to fetch %d from %d", len(to_fetch),
                          len(fingerprints))

        for chunk in chunks(to_fetch, 128):
            for state in self.session.query(self.model).filter(
                    self.model.fingerprint.in_(chunk)):
                self._cache[to_bytes(state.fingerprint)] = state.state
Example #29
 def _response_from_object(self, obj):
     url = to_native_str(obj[0])
     return self._response_model(url=url,
                                 status_code=obj[1],
                                 body=obj[4],
                                 headers=obj[3],
                                 request=self._request_model(
                                     url=url,
                                     meta=obj[2],
                                     method=obj[5],
                                     headers=obj[6],
                                     cookies=obj[7]))
Example #30
 def test_metadata(self):
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
         set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
Example #31
 def test_metadata(self):
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
         set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
Example #32
    def __init__(self, url, status_code=200, headers=None, body='', request=None):
        """
        :param string url: URL of this response.
        :param int status_code: the HTTP status of the response. Defaults to 200.
        :param dict headers: dictionary of headers to send.
        :param str body: the response body.
        :param Request request: The Request object that generated this response.
        """

        self._url = to_native_str(url)
        self._status_code = int(status_code)
        self._headers = headers or {}
        self._body = body
        self._request = request
Example #33
 def __init__(self, url, method='GET', headers=None, cookies=None, meta=None, body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request.
     """
     self._url = url
     self._method = to_native_str(method or 'GET').upper()
     self._headers = headers or {}
     self._cookies = cookies or {}
     self._meta = meta or {'scrapy_meta': {}}
     self._body = body
Example #34
 def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
     the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items.
     """
     self._url = to_native_str(url)
     self._method = to_bytes((method or b'GET').upper())
     self._headers = headers or {}
     self._cookies = cookies or {}
     self._meta = meta or {b'scrapy_meta': {}}
     self._body = body
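A small construction sketch for this request model and the matching response model from Example #32; the import path frontera.core.models and the read-only attributes used below are assumptions based on where these snippets appear to come from.

# Illustrative only; the import path and property names are assumptions.
from frontera.core.models import Request, Response

req = Request("http://www.example.com/",
              method=b'GET',
              headers={b'User-Agent': b'test-agent'},
              meta={b'scrapy_meta': {}})
resp = Response("http://www.example.com/",
                status_code=200,
                body=b"<html></html>",
                request=req)
print(req.url, resp.status_code)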
Example #35
 def decode(self, buffer):
     obj = unpackb(buffer, encoding='utf-8')
     if obj[0] == b'pc':
         return ('page_crawled', self._response_from_object(obj[1]))
     if obj[0] == b'le':
         return ('links_extracted', self._request_from_object(obj[1]),
                 [self._request_from_object(x) for x in obj[2]])
     if obj[0] == b'us':
         return ('update_score', self._request_from_object(obj[1]), obj[2],
                 obj[3])
     if obj[0] == b're':
         return ('request_error', self._request_from_object(obj[1]),
                 to_native_str(obj[2]))
     if obj[0] == b'as':
         return ('add_seeds',
                 [self._request_from_object(x) for x in obj[1]])
     if obj[0] == b'njid':
         return ('new_job_id', int(obj[1]))
     if obj[0] == b'of':
         return ('offset', int(obj[1]), int(obj[2]))
     if obj[0] == b'ou':
         return ('overused', int(obj[1]),
                 [to_native_str(s) for s in obj[2]])
     raise TypeError('Unknown message type')
Example #36
    def __init__(self, manager, args, mb_stream, states_context):
        self.logger = logging.getLogger("discovery")
        backend = manager.backend
        self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)

        try:
            psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
        except IOError:
            self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
            raise
        self._suffix_list = PublicSuffixList(psl_file)
        self._states_ctx = states_context
        self.states = backend.states

        self.user_agent = to_native_str(manager.settings.get('USER_AGENT'))
        self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES'))
        super(Discovery, self).__init__(manager, args, mb_stream, states_context)
Example #37
    def __init__(self,
                 url,
                 status_code=200,
                 headers=None,
                 body='',
                 request=None):
        """
        :param string url: URL of this response.
        :param int status_code: the HTTP status of the response. Defaults to 200.
        :param dict headers: dictionary of headers to send.
        :param str body: the response body.
        :param Request request: The Request object that generated this response.
        """

        self._url = to_native_str(url)
        self._status_code = int(status_code)
        self._headers = headers or {}
        self._body = body
        self._request = request
Example #38
 def _get_item(self, key):
     self.stats["hbase_gets"] += 1
     hbase_key = to_bytes(key)
     row = self._table.row(hbase_key)
     if not row:
         self.stats["hbase_misses"] += 1
         super(DomainCache, self).__missing__(key)
         raise KeyError
     value = {}
     for k, v in six.iteritems(row):
         cf, _, col = k.partition(b':')
         col = to_native_str(col)
         value[col] = unpackb(v, encoding='utf-8')
         # XXX extract some fields as a set for faster in-checks
         if col in self._set_fields:
             value[col] = set(value[col])
     if self._on_get_func:
         self._on_get_func(value)
     return value
Example #39
 def _get_item(self, key):
     self.stats["hbase_gets"] += 1
     hbase_key = to_bytes(key)
     row = self._table.row(hbase_key)
     if not row:
         self.stats["hbase_misses"] += 1
         super(DomainCache, self).__missing__(key)
         raise KeyError
     value = {}
     for k, v in six.iteritems(row):
         cf, _, col = k.partition(b':')
         col = to_native_str(col)
         value[col] = unpackb(v, encoding='utf-8')
         # XXX extract some fields as a set for faster in-checks
         if col in self._set_fields:
             value[col] = set(value[col])
     if self._on_get_func:
         self._on_get_func(value)
     return value
Example #40
 def schedule(self, batch):
     to_save = []
     for fprint, score, request, schedule in batch:
         if schedule:
             _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                 partition_id = self.partitions[0]
                 host_crc32 = 0
             else:
                 partition_id = self.partitioner.partition(hostname, self.partitions)
                 host_crc32 = get_crc32(hostname)
             q = self.queue_model(fingerprint=to_native_str(fprint), score=score, url=request.url, meta=request.meta,
                                  headers=request.headers, cookies=request.cookies, method=to_native_str(request.method),
                                  partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6)
             to_save.append(q)
             request.meta[b'state'] = States.QUEUED
     self.session.bulk_save_objects(to_save)
     self.session.commit()
Example #41
 def from_frontier(self, frontier_request):
     """request: Frontier > Scrapy"""
     cb = frontier_request.meta.get(b'scrapy_callback', None)
     if cb and self.spider:
         cb = _get_method(self.spider, cb)
     eb = frontier_request.meta.get(b'scrapy_errback', None)
     if eb and self.spider:
         eb = _get_method(self.spider, eb)
     body = frontier_request.body
     meta = frontier_request.meta.get(b'scrapy_meta', {})
     meta[b'frontier_request'] = frontier_request
     return ScrapyRequest(url=frontier_request.url,
                          callback=cb,
                          errback=eb,
                          body=body,
                          method=to_native_str(frontier_request.method),
                          headers=frontier_request.headers,
                          cookies=frontier_request.cookies,
                          meta=meta,
                          dont_filter=True)
Example #42
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify a document in storage.
    ``hostname_local_fingerprint`` builds the fingerprint by taking the CRC32 of the host as the first 4 bytes
    and the MD5 of the rest of the URL as the remainder. The default layout is designed to exploit the HBase
    block cache: all documents of an average website are expected to fit within one cache block, which can be
    read from disk efficiently in a single pass.

    :param key: str URL
    :return: str 20 bytes hex string
    """
    result = parse_url(key)
    if not result.hostname:
        return sha1(key)
    host_checksum = get_crc32(result.hostname)
    doc_uri_combined = result.path+';'+result.params+result.query+result.fragment

    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(doc_uri_combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return to_native_str(fprint, 'utf8')
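A short sketch illustrating the fingerprint layout described in the docstring above; the import path frontera.utils.fingerprint is an assumption.

# Illustrative only; the import path is an assumption.
from frontera.utils.fingerprint import hostname_local_fingerprint

fp1 = hostname_local_fingerprint("http://www.example.com/some/path")
fp2 = hostname_local_fingerprint("http://www.example.com/other/path")

assert len(fp1) == 40          # 20 bytes, hex-encoded
assert fp1[:8] == fp2[:8]      # first 4 bytes depend only on the host (CRC32)
print(fp1)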
Example #43
 def from_frontier(self, frontier_request):
     """request: Frontier > Scrapy"""
     cb = frontier_request.meta.get(b'scrapy_callback', None)
     if cb and self.spider:
         cb = _get_method(self.spider, cb)
     eb = frontier_request.meta.get(b'scrapy_errback', None)
     if eb and self.spider:
         eb = _get_method(self.spider, eb)
     body = frontier_request.body
     meta = frontier_request.meta.get(b'scrapy_meta', {})
     meta[b'frontier_request'] = frontier_request
     return ScrapyRequest(url=frontier_request.url,
                          callback=cb,
                          errback=eb,
                          body=body,
                          method=to_native_str(frontier_request.method),
                          headers=frontier_request.headers,
                          cookies=frontier_request.cookies,
                          meta=meta,
                          dont_filter=True)
Example #44
 def decode(self, buffer):
     obj = unpackb(buffer)
     if obj[0] == b'pc':
         return ('page_crawled',
                 self._response_from_object(obj[1]))
     if obj[0] == b'le':
         return ('links_extracted',
                 self._request_from_object(obj[1]),
                 [self._request_from_object(x) for x in obj[2]])
     if obj[0] == b'us':
         return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3])
     if obj[0] == b're':
         return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2]))
     if obj[0] == b'as':
         return ('add_seeds', [self._request_from_object(x) for x in obj[1]])
     if obj[0] == b'njid':
         return ('new_job_id', int(obj[1]))
     if obj[0] == b'of':
         return ('offset', int(obj[1]), int(obj[2]))
     raise TypeError('Unknown message type')
Example #45
 def __init__(self,
              url,
              method=b'GET',
              headers=None,
              cookies=None,
              meta=None,
              body=''):
     """
     :param string url: URL to send.
     :param string method: HTTP method to use.
     :param dict headers: dictionary of headers to send.
     :param dict cookies: dictionary of cookies to attach to this request.
     :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \
     the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items.
     """
     self._url = to_native_str(url)
     self._method = to_bytes((method or b'GET').upper())
     self._headers = headers or {}
     self._cookies = cookies or {}
     self._meta = meta or {b'scrapy_meta': {}}
     self._body = body
Example #46
    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        Dequeues new batch of requests for crawling.

        :param max_n_requests: maximum number of requests to return
        :param partition_id: partition id
        :return: list of :class:`Request <frontera.core.models.Request>` objects.
        """
        results = []
        try:
            for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                    limit(max_n_requests):
                method = item.method or 'GET'
                r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
                r.meta['fingerprint'] = to_native_str(item.fingerprint)
                r.meta['score'] = item.score
                results.append(r)
                self.session.delete(item)
            self.session.commit()
        except Exception as exc:
            self.logger.exception(exc)
            self.session.rollback()
        return results
Example #47
def _get_method(obj, name):
    name = to_native_str(name)
    try:
        return getattr(obj, name)
    except AttributeError:
        raise ValueError("Method %r not found in: %s" % (name, obj))
Example #48
 def _request_from_object(self, obj):
     return self._request_model(url=to_native_str(obj[b'url']),
                                method=obj[b'method'],
                                headers=obj[b'headers'],
                                cookies=obj[b'cookies'],
                                meta=obj[b'meta'])
Example #49
 def __init__(self, separator=None, excluded_fields=None, msg_max_length=0):
     super(PlainValuesFilter, self).__init__()
     self.separator = to_native_str(separator or " ")
     self.excluded_fields = excluded_fields or []
     self.msg_max_length = msg_max_length
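A hedged sketch of wiring this filter into the standard logging module; the import path frontera.logger.filters is an assumption, and the logged dict is purely illustrative.

# Illustrative only; the import path is an assumption.
import logging
from frontera.logger.filters import PlainValuesFilter

handler = logging.StreamHandler()
handler.addFilter(PlainValuesFilter(separator=" ", excluded_fields=["timestamp"]))

logger = logging.getLogger("example")
logger.addHandler(handler)

# The filter joins the non-excluded dict values into record.msg and copies each
# excluded field onto the record as a separate attribute.
logger.error({"timestamp": "2015-01-01T00:00:00", "event": "crawl", "detail": "failed"})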
Example #50
 def _request_from_object(self, obj):
     return self._request_model(url=to_native_str(obj[b'url']),
                                method=obj[b'method'],
                                headers=obj[b'headers'],
                                cookies=obj[b'cookies'],
                                meta=obj[b'meta'])
Example #51
def _get_method(obj, name):
    name = to_native_str(name)
    try:
        return getattr(obj, name)
    except AttributeError:
        raise ValueError("Method %r not found in: %s" % (name, obj))
Example #52
 def update_score(self, batch):
     for fprint, score, request, schedule in batch:
         m = self.model(fingerprint=to_native_str(fprint), score=score)
         self.session.merge(m)
     self.session.commit()
Example #53
 def update_score(self, batch):
     for fprint, score, request, schedule in batch:
         m = self.model(fingerprint=to_native_str(fprint), score=score)
         self.session.merge(m)
     self.session.commit()
Example #54
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    return url if isinstance(url, parse.ParseResult) else \
        parse.urlparse(to_native_str(url))
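A tiny usage sketch, assuming the helper above is in scope together with urllib's parse module.

# Illustrative only; assumes parse_url from the snippet above is in scope.
from urllib import parse

result = parse_url("http://www.example.com/path?q=1")
assert isinstance(result, parse.ParseResult)
assert parse_url(result) is result     # an already parsed URL is returned as-is
print(result.hostname, result.path)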
Example #55
 def test_deprecation(self):
     with deprecated_call():
         to_native_str("")
Example #56
 def __init__(self, separator=None, excluded_fields=None, msg_max_length=0):
     super(PlainValuesFilter, self).__init__()
     self.separator = to_native_str(separator or " ")
     self.excluded_fields = excluded_fields or []
     self.msg_max_length = msg_max_length
Example #57
 def _request_from_object(self, obj):
     return self._request_model(url=to_native_str(obj[0]),
                                method=obj[1],
                                headers=obj[2],
                                cookies=obj[3],
                                meta=obj[4])