Beispiel #1
0
    def _schedule(self, batch, timestamp):
        """
        Group scored requests into HBase queue rows and write them in one batch.

        Row - portion of the queue for each partition id created at some point in time
        Row Key - partition id + score interval + random_str
        Column Qualifier - discrete score (first three digits after dot, e.g. 0.001_0.002, 0.002_0.003, ...)
        Value - QueueCell msgpack blob

        Where score is mapped from 0.0 to 1.0
        score intervals are
          [0.00-0.01)
          [0.01-0.02)
          [0.02-0.03)
         ...
          [0.99-1.00]
        random_str - the time when links were scheduled for retrieval, microsecs

        The score is inverted (``1 - score``) before it is bucketed, because of
        the lexicographical sort in HBase.

        :param batch: iterable of ``(request, score)`` pairs; ``request.meta``
            must contain the ``b'domain'`` and ``b'fingerprint'`` keys
        :param timestamp: stored as ``str(timestamp)`` in the ``f:t`` column of
            every written row
        :return: None
        :raises OverflowError: if an inverted score is outside [0.0, 1.0]
        """
        def get_interval(score, resolution):
            if score < 0.0 or score > 1.0:
                raise OverflowError

            # Only the topmost interval is closed on the right. Clamp the index
            # to the last bucket instead of shifting every tenth bucket down,
            # which used to put e.g. 0.10 into the 0.09_0.10 interval.
            last = int(round(1.0 / resolution)) - 1
            i = min(int(score / resolution), last)
            return (i * resolution, (i + 1) * resolution)

        random_str = int(time() * 1E+6)
        data = dict()
        for request, score in batch:
            domain = request.meta[b'domain']
            fingerprint = request.meta[b'fingerprint']
            key = self.partitioner.get_key(request)
            partition_id = self.partitioner.partition(key)
            # An integer domain is used as the host checksum directly.
            host_crc32 = domain if isinstance(domain, int) else get_crc32(key)
            # The stored item keeps the original (non-inverted) score.
            item = (unhexlify(fingerprint), host_crc32,
                    self.encoder.encode_request(request), score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" %
                               get_interval(score, 0.01), random_str)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            for rk, tuples in six.iteritems(data):
                # Split the row's items into finer-grained score columns.
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)

                final = dict()
                packer = Packer()
                for column, items in six.iteritems(obj):
                    # Each cell is the concatenation of the packed items.
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                final[b'f:t'] = str(timestamp)
                b.put(rk, final)
Beispiel #2
0
 def schedule(self, batch):
     """
     Persist every batch entry flagged for scheduling as a queue row.

     For each entry whose last element is truthy, a ``self.queue_model`` object
     is built and the request's ``meta['state']`` is set to ``States.QUEUED``.
     All objects are bulk-saved and the session is committed once at the end.

     :param batch: iterable of ``(fingerprint, score, request, schedule)`` tuples
     :return: None
     """
     pending = []
     for fprint, score, request, should_queue in batch:
         if not should_queue:
             continue

         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(
                 hostname, self.partitions)
             host_crc32 = get_crc32(hostname)
         else:
             # No hostname: log it and fall back to the first partition.
             self.logger.error(
                 "Can't get hostname for URL %s, fingerprint %s" %
                 (request.url, fprint))
             partition_id = self.partitions[0]
             host_crc32 = 0

         row_fields = dict(
             fingerprint=fprint,
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=request.method,
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time() * 1E+6,
         )
         pending.append(self.queue_model(**row_fields))
         request.meta['state'] = States.QUEUED

     self.session.bulk_save_objects(pending)
     self.session.commit()
Beispiel #3
0
 def schedule(self, batch):
     """
     Persist every batch entry flagged for scheduling as a queue row.

     The fingerprint and HTTP method are passed through ``to_native_str``
     before being handed to ``self.queue_model``. Each stored request gets
     ``meta[b'state']`` set to ``States.QUEUED``. All objects are bulk-saved
     and the session is committed once at the end.

     :param batch: iterable of ``(fingerprint, score, request, schedule)`` tuples
     :return: None
     """
     pending = []
     for fprint, score, request, should_queue in batch:
         if not should_queue:
             continue

         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(
                 hostname, self.partitions)
             host_crc32 = get_crc32(hostname)
         else:
             # No hostname: log it and fall back to the first partition.
             self.logger.error(
                 "Can't get hostname for URL %s, fingerprint %s" %
                 (request.url, fprint))
             partition_id = self.partitions[0]
             host_crc32 = 0

         row_fields = dict(
             fingerprint=to_native_str(fprint),
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=to_native_str(request.method),
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time() * 1E+6,
         )
         pending.append(self.queue_model(**row_fields))
         request.meta[b'state'] = States.QUEUED

     self.session.bulk_save_objects(pending)
     self.session.commit()
Beispiel #4
0
    def _schedule(self, batch):
        """
        Group scored links into HBase queue rows and write them in one batch.

        Row - portion of the queue for each partition id created at some point in time
        Row Key - partition id + score interval + timestamp
        Column Qualifier - discrete score (first three digits after dot, e.g. 0.001_0.002, 0.002_0.003, ...)
        Value - QueueCell msgpack blob

        Where score is mapped from 0.0 to 1.0
        score intervals are
          [0.00-0.01)
          [0.01-0.02)
          [0.02-0.03)
         ...
          [0.99-1.00]
        timestamp - the time when links were scheduled for retrieval, microsecs

        The score is inverted (``1 - score``) before it is bucketed, because of
        the lexicographical sort in HBase.

        :param batch: list of tuples(score, fingerprint, domain, url); ``domain``
            is either a dict with a ``'name'`` key or an int host checksum
        :return: None
        :raises TypeError: if ``domain`` is neither a dict nor an int
        :raises OverflowError: if an inverted score is outside [0.0, 1.0]
        """
        def get_interval(score, resolution):
            if score < 0.0 or score > 1.0:
                raise OverflowError

            # Only the topmost interval is closed on the right. Clamp the index
            # to the last bucket instead of shifting every tenth bucket down,
            # which used to put e.g. 0.10 into the 0.09_0.10 interval.
            last = int(round(1.0 / resolution)) - 1
            i = min(int(score / resolution), last)
            return (i * resolution, (i + 1) * resolution)

        timestamp = int(time() * 1E+6)
        data = dict()
        for score, fingerprint, domain, url in batch:
            if isinstance(domain, dict):
                partition_id = self.partitioner.partition(domain['name'], self.partitions)
                host_crc32 = get_crc32(domain['name'])
            elif isinstance(domain, int):
                # An integer domain is used as the host checksum directly.
                partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            # The stored item keeps the original (non-inverted) score.
            item = (unhexlify(fingerprint), host_crc32, url, score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), timestamp)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
            for rk, tuples in data.items():
                # Split the row's items into finer-grained score columns.
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)

                final = dict()
                packer = Packer()
                for column, items in obj.items():
                    # Each cell is the concatenation of the packed items.
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                b.put(rk, final)
Beispiel #5
0
 def test_crc32_range(self):
     """Check that crc32 values and ``to_signed32`` results fit a signed 32-bit int."""
     lowest = -2**31
     highest = 2**31 - 1

     # Checksums of the MD5 hex digests of the first 10000 integers.
     for number in range(10000):
         digest = hashlib.md5(str(number).encode('ascii')).hexdigest()
         checksum = get_crc32(digest)
         assert lowest <= checksum <= highest

     # Values around both signed bounds and at the top of the unsigned range.
     boundary_values = [lowest, lowest + 1, highest - 1, highest, highest + 1,
                        2**32 - 2, 2**32 - 1]
     for value in boundary_values:
         converted = to_signed32(value)
         assert lowest <= converted <= highest
 def test_crc32_range(self):
     """Check that crc32 values and ``to_signed32`` results fit a signed 32-bit int."""
     lowest = -2**31
     highest = 2**31 - 1

     # Checksums of the MD5 hex digests of the first 10000 integers.
     for number in range(10000):
         digest = hashlib.md5(str(number).encode('ascii')).hexdigest()
         checksum = get_crc32(digest)
         assert lowest <= checksum <= highest

     # Values around both signed bounds and at the top of the unsigned range.
     boundary_values = [lowest, lowest + 1, highest - 1, highest, highest + 1,
                        2**32 - 2, 2**32 - 1]
     for value in boundary_values:
         converted = to_signed32(value)
         assert lowest <= converted <= highest
Beispiel #7
0
    def schedule(self, batch):
        """
        Save one queue object per batch entry flagged for scheduling.

        Each object is built with ``self._create_queue`` and saved right away;
        the request's ``meta['state']`` is then set to ``States.QUEUED``.

        :param batch: iterable of ``(fingerprint, score, request, schedule_at)`` tuples
        :return: None
        """
        for fprint, score, request, schedule_at in batch:
            if not schedule_at:
                continue

            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if hostname:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            else:
                # No hostname: log it and fall back to the first partition.
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0

            # Creation time in microseconds.
            queued_at = time()*1E+6
            entry = self._create_queue(request, fprint, score, partition_id, host_crc32, queued_at)
            entry.save()
            request.meta['state'] = States.QUEUED
Beispiel #8
0
    def schedule(self, batch):
        """
        Insert every batch entry flagged for scheduling into the ``queue`` table.

        Missing ``request.meta`` keys are filled with defaults, one parameter
        tuple is collected per request, and all inserts are run through
        ``execute_concurrent_with_args``. The number of collected rows is then
        reported via ``self.counter_cls.cass_count``.

        :param batch: iterable of ``(fingerprint, score, request, schedule)`` tuples
        :return: None
        """
        insert_stmt = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, "
                                           "created_at, meta, depth, headers, method, cookies) "
                                           "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        rows = []
        for fprint, score, request, should_queue in batch:
            if not should_queue:
                continue

            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if hostname:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            else:
                # No hostname: log it and fall back to the first partition.
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            created_at = time()*1E+6

            # Built per request so the mutable defaults are never shared.
            fallbacks = (
                ("domain", {}),
                ("origin_is_frontier", ''),
                ("scrapy_callback", None),
                ("scrapy_errback", None),
                ("scrapy_meta", {}),
                ("score", 0),
                ("jid", 0),
            )
            for field, default in fallbacks:
                if field not in request.meta:
                    request.meta[field] = default

            meta = Meta(domain=request.meta['domain'],
                        fingerprint=fprint,
                        origin_is_frontier=request.meta['origin_is_frontier'],
                        scrapy_callback=request.meta['scrapy_callback'],
                        scrapy_errback=request.meta['scrapy_errback'],
                        scrapy_meta=request.meta['scrapy_meta'])

            # Positional values in the column order of the prepared statement;
            # depth is always written as 0.
            rows.append((self.crawl_id, fprint, score, partition_id, host_crc32, request.url, created_at, meta, 0,
                         request.headers, request.method, request.cookies))

            request.meta['state'] = States.QUEUED

        execute_concurrent_with_args(self.session, insert_stmt, rows, concurrency=400)
        self.counter_cls.cass_count({"queued_urls": len(rows)})
Beispiel #9
0
def hostname_local_fingerprint(key):
    """
    Build a URL fingerprint whose prefix depends only on the hostname.

    The fingerprint is the hex encoding of 20 bytes: a 4-byte big-endian signed
    CRC32 of the hostname, followed by the 16-byte MD5 digest of the hostname
    concatenated with path, ``;``, params, query and fragment. When the URL has
    no hostname, ``'-'`` is used in its place.

    The original rationale: documents of one host share a fingerprint prefix,
    so an average website is expected to fit within one HBase cache block.

    :param key: str URL
    :return: hex string encoding 20 bytes, as returned by ``hexlify``
    """
    parsed = parse_url(key)
    host = parsed.hostname or '-'
    host_checksum = get_crc32(host)

    url_text = ''.join((host, parsed.path, ';', parsed.params,
                        parsed.query, parsed.fragment))
    url_bytes = to_bytes(url_text, 'utf8', 'ignore')
    digest = hashlib.md5(url_bytes).digest()

    # ">i16s": big-endian signed 32-bit checksum, then the 16 digest bytes.
    return hexlify(pack(">i16s", host_checksum, digest))
Beispiel #10
0
def hostname_local_fingerprint(key):
    """
    Build a URL fingerprint whose prefix depends only on the hostname.

    The fingerprint is the hex encoding of 20 bytes: a 4-byte big-endian signed
    CRC32 of the hostname, followed by the 16-byte MD5 digest of path, ``;``,
    params, query and fragment. A URL without a hostname is fingerprinted with
    ``sha1(key)`` instead.

    The original rationale: documents of one host share a fingerprint prefix,
    so an average website is expected to fit within one HBase cache block.

    :param key: str URL
    :return: native string with the hex fingerprint, or the result of
        ``sha1(key)`` when the URL has no hostname
    """
    parsed = parse_url(key)
    host = parsed.hostname
    if not host:
        return sha1(key)

    host_checksum = get_crc32(host)
    uri_text = ''.join((parsed.path, ';', parsed.params,
                        parsed.query, parsed.fragment))
    uri_bytes = to_bytes(uri_text, 'utf8', 'ignore')
    digest = hashlib.md5(uri_bytes).digest()

    # ">i16s": big-endian signed 32-bit checksum, then the 16 digest bytes.
    packed = pack(">i16s", host_checksum, digest)
    return to_native_str(hexlify(packed), 'utf8')
 def _schedule(self, batch, timestamp):
     """
     Queue scored requests through the Redis pipeline, one ``zadd`` per partition.

     Each request becomes a packed tuple ``(timestamp, fingerprint, host_crc32,
     encoded_request, score)``. The value it is mapped to is
     ``int(self.get_interval_start(score) * 100)``.

     :param batch: iterable of ``(request, score)`` pairs; ``request.meta``
         must contain ``FIELD_DOMAIN`` and ``FIELD_FINGERPRINT``
     :param timestamp: value stored as the first element of every packed item
     :return: None
     :raises TypeError: if the domain is neither a dict nor an int
     """
     per_partition = dict()
     for request, score in batch:
         domain = request.meta[FIELD_DOMAIN]
         fingerprint = request.meta[FIELD_FINGERPRINT]

         if type(domain) == dict:
             partition_id = self._partitioner.partition(
                 domain[FIELD_NAME], self._partitions)
             host_crc32 = get_crc32(domain[FIELD_NAME])
         elif type(domain) == int:
             # An integer domain is used as the host checksum directly.
             partition_id = self._partitioner.partition_by_hash(
                 domain, self._partitions)
             host_crc32 = domain
         else:
             raise TypeError("domain of unknown type.")

         encoded = self._encoder.encode_request(request)
         item = (timestamp, fingerprint, host_crc32, encoded, score)
         rank = int(self.get_interval_start(score) * 100)
         members = per_partition.setdefault(partition_id, {})
         members[packb(item)] = rank

     for partition_key, members in per_partition.items():
         self._redis_pipeline.zadd(partition_key, mapping=members)
     self._redis_pipeline.execute()
Beispiel #12
0
 def test_ascii_unicode(self):
     """crc32 of an ASCII-only unicode string matches the known value."""
     checksum = get_crc32(u'example')
     assert checksum == 1861000095
Beispiel #13
0
 def partition(self, key, partitions=None):
     """
     Pick a partition for ``key`` from its crc32 checksum.

     :param key: value to hash; ``None`` always maps to ``self.partitions[0]``
     :param partitions: optional partition list; ``self.partitions`` is used
         when it is omitted or empty
     :return: the result of ``self.partition_by_hash`` for the key's checksum
     """
     if key is None:
         return self.partitions[0]
     checksum = get_crc32(key)
     candidates = partitions or self.partitions
     return self.partition_by_hash(checksum, candidates)
Beispiel #14
0
 def test_bytes(self):
     """crc32 of an ASCII byte string matches the known value."""
     checksum = get_crc32(b'example')
     assert checksum == 1861000095
Beispiel #15
0
 def test_negative_crc32(self):
     """crc32 of ``b'1'`` is reported as a negative integer."""
     checksum = get_crc32(b'1')
     assert checksum == -2082672713
Beispiel #16
0
 def test_non_ascii_bytes(self):
     """crc32 of UTF-8 encoded non-ASCII text matches the known value."""
     encoded = u'example\u5000'.encode('utf8')
     assert get_crc32(encoded) == 1259721235
Beispiel #17
0
 def test_non_ascii_unicode(self):
     """crc32 of a non-ASCII unicode string matches the known value."""
     checksum = get_crc32(u'example\u5000')
     assert checksum == 1259721235
Beispiel #18
0
 def test_ascii_unicode(self):
     """crc32 of an ASCII-only unicode string matches the known value."""
     checksum = get_crc32(u'example')
     assert checksum == 1861000095
Beispiel #19
0
 def test_bytes(self):
     """crc32 of an ASCII byte string matches the known value."""
     checksum = get_crc32(b'example')
     assert checksum == 1861000095
Beispiel #20
0
 def hash(self, key):
     """
     Return the hash used for ``key``.

     :param key: an int is returned unchanged; anything else is passed to
         ``get_crc32``
     :return: ``key`` itself or its crc32 checksum
     """
     return key if type(key) == int else get_crc32(key)
Beispiel #21
0
 def test_negative_crc32(self):
     """crc32 of ``b'1'`` is reported as a negative integer."""
     checksum = get_crc32(b'1')
     assert checksum == -2082672713
Beispiel #22
0
 def test_non_ascii_bytes(self):
     """crc32 of UTF-8 encoded non-ASCII text matches the known value."""
     encoded = u'example\u5000'.encode('utf8')
     assert get_crc32(encoded) == 1259721235
Beispiel #23
0
 def test_non_ascii_unicode(self):
     """crc32 of a non-ASCII unicode string matches the known value."""
     checksum = get_crc32(u'example\u5000')
     assert checksum == 1259721235
Beispiel #24
0
    def _schedule(self, batch):
        """
        Group scored links into HBase queue rows and write them in one batch.

        Row - portion of the queue for each partition id created at some point in time
        Row Key - partition id + score interval + timestamp
        Column Qualifier - discrete score (first three digits after dot, e.g. 0.001_0.002, 0.002_0.003, ...)
        Value - QueueCell msgpack blob

        Where score is mapped from 0.0 to 1.0
        score intervals are
          [0.00-0.01)
          [0.01-0.02)
          [0.02-0.03)
         ...
          [0.99-1.00]
        timestamp - the time when links were scheduled for retrieval, microsecs

        The score is inverted (``1 - score``) before it is bucketed, because of
        the lexicographical sort in HBase.

        :param batch: list of tuples(score, fingerprint, domain, url); ``domain``
            is either a dict with a ``'name'`` key or an int host checksum
        :return: None
        :raises TypeError: if ``domain`` is neither a dict nor an int
        :raises OverflowError: if an inverted score is outside [0.0, 1.0]
        """
        def get_interval(score, resolution):
            if score < 0.0 or score > 1.0:
                raise OverflowError

            # Only the topmost interval is closed on the right. Clamp the index
            # to the last bucket instead of shifting every tenth bucket down,
            # which used to put e.g. 0.10 into the 0.09_0.10 interval.
            last = int(round(1.0 / resolution)) - 1
            i = min(int(score / resolution), last)
            return (i * resolution, (i + 1) * resolution)

        timestamp = int(time() * 1E+6)
        data = dict()
        for score, fingerprint, domain, url in batch:
            if isinstance(domain, dict):
                partition_id = self.partitioner.partition(
                    domain['name'], self.partitions)
                host_crc32 = get_crc32(domain['name'])
            elif isinstance(domain, int):
                # An integer domain is used as the host checksum directly.
                partition_id = self.partitioner.partition_by_hash(
                    domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            # The stored item keeps the original (non-inverted) score.
            item = (unhexlify(fingerprint), host_crc32, url, score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" %
                               get_interval(score, 0.01), timestamp)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
            for rk, tuples in data.items():
                # Split the row's items into finer-grained score columns.
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)

                final = dict()
                packer = Packer()
                for column, items in obj.items():
                    # Each cell is the concatenation of the packed items.
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                b.put(rk, final)
Beispiel #25
0
 def partition(self, key, partitions=None):
     """
     Pick a partition for ``key`` from its crc32 checksum.

     :param key: value to hash; ``None`` always maps to ``self.partitions[0]``
     :param partitions: optional partition list; ``self.partitions`` is used
         when it is omitted or empty
     :return: the result of ``self.partition_by_hash`` for the key's checksum
     """
     if key is None:
         return self.partitions[0]
     checksum = get_crc32(key)
     candidates = partitions or self.partitions
     return self.partition_by_hash(checksum, candidates)