def _schedule(self, batch, timestamp):
    """
    Row - portion of the queue for each partition id created at some point in time
    Row Key - partition id + score interval + random_str
    Column Qualifier - discrete score (first three digits after dot, e.g. 0.001_0.002, 0.002_0.003, ...)
    Value - QueueCell msgpack blob

    Where score is mapped from 0.0 to 1.0
    score intervals are [0.01-0.02) [0.02-0.03) [0.03-0.04) ... [0.99-1.00]
    random_str - the time when the link was scheduled for retrieval, in microseconds

    :param batch: iterable of Request objects
    :return:
    """
    def get_interval(score, resolution):
        if score < 0.0 or score > 1.0:
            raise OverflowError
        i = int(score / resolution)
        if i % 10 == 0 and i > 0:
            i = i - 1  # last interval is inclusive from the right
        return (i * resolution, (i + 1) * resolution)

    random_str = int(time() * 1E+6)
    data = dict()
    for request, score in batch:
        domain = request.meta[b'domain']
        fingerprint = request.meta[b'fingerprint']
        key = self.partitioner.get_key(request)
        partition_id = self.partitioner.partition(key)
        host_crc32 = domain if type(domain) == int else get_crc32(key)
        item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score)
        score = 1 - score  # because of lexicographical sort in HBase
        rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
        data.setdefault(rk, []).append((score, item))

    table = self.connection.table(self.table_name)
    with table.batch(transaction=True) as b:
        for rk, tuples in six.iteritems(data):
            obj = dict()
            for score, item in tuples:
                column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                obj.setdefault(column, []).append(item)

            final = dict()
            packer = Packer()
            for column, items in six.iteritems(obj):
                stream = BytesIO()
                for item in items:
                    stream.write(packer.pack(item))
                final[column] = stream.getvalue()
            final[b'f:t'] = str(timestamp)
            b.put(rk, final)
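# A minimal standalone sketch of the bucketing above (not part of the class):
# get_interval is copied from _schedule, and the sample scores are hypothetical.
# It shows how a priority score is inverted and mapped to the coarse row-key
# interval (resolution 0.01) and the fine column interval (resolution 0.001).
def get_interval(score, resolution):
    if score < 0.0 or score > 1.0:
        raise OverflowError
    i = int(score / resolution)
    if i % 10 == 0 and i > 0:
        i = i - 1  # last interval is inclusive from the right
    return (i * resolution, (i + 1) * resolution)

for score in (0.0, 0.25, 0.995, 1.0):
    inverted = 1 - score  # high scores sort first in HBase's lexicographic order
    print("score %0.3f -> row-key interval %0.2f_%0.2f, column f:%0.3f_%0.3f"
          % ((score,) + get_interval(inverted, 0.01) + get_interval(inverted, 0.001)))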
def schedule(self, batch):
    to_save = []
    for fprint, score, request, schedule in batch:
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            else:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            q = self.queue_model(fingerprint=fprint, score=score, url=request.url, meta=request.meta,
                                 headers=request.headers, cookies=request.cookies, method=request.method,
                                 partition_id=partition_id, host_crc32=host_crc32, created_at=time() * 1E+6)
            to_save.append(q)
            request.meta['state'] = States.QUEUED
    self.session.bulk_save_objects(to_save)
    self.session.commit()
def schedule(self, batch):
    to_save = []
    for fprint, score, request, schedule in batch:
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            else:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            q = self.queue_model(fingerprint=to_native_str(fprint), score=score, url=request.url,
                                 meta=request.meta, headers=request.headers, cookies=request.cookies,
                                 method=to_native_str(request.method), partition_id=partition_id,
                                 host_crc32=host_crc32, created_at=time() * 1E+6)
            to_save.append(q)
            request.meta[b'state'] = States.QUEUED
    self.session.bulk_save_objects(to_save)
    self.session.commit()
def _schedule(self, batch):
    """
    Row - portion of the queue for each partition id created at some point in time
    Row Key - partition id + score interval + timestamp
    Column Qualifier - discrete score (first three digits after dot, e.g. 0.001_0.002, 0.002_0.003, ...)
    Value - QueueCell msgpack blob

    Where score is mapped from 0.0 to 1.0
    score intervals are [0.01-0.02) [0.02-0.03) [0.03-0.04) ... [0.99-1.00]
    timestamp - the time when the link was scheduled for retrieval, in microseconds

    :param batch: list of tuples(score, fingerprint, domain, url)
    :return:
    """
    def get_interval(score, resolution):
        if score < 0.0 or score > 1.0:
            raise OverflowError
        i = int(score / resolution)
        if i % 10 == 0 and i > 0:
            i = i - 1  # last interval is inclusive from the right
        return (i * resolution, (i + 1) * resolution)

    timestamp = int(time() * 1E+6)
    data = dict()
    for score, fingerprint, domain, url in batch:
        if type(domain) == dict:
            partition_id = self.partitioner.partition(domain['name'], self.partitions)
            host_crc32 = get_crc32(domain['name'])
        elif type(domain) == int:
            partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
            host_crc32 = domain
        else:
            raise TypeError("domain of unknown type.")
        item = (unhexlify(fingerprint), host_crc32, url, score)
        score = 1 - score  # because of lexicographical sort in HBase
        rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), timestamp)
        data.setdefault(rk, []).append((score, item))

    table = self.connection.table(self.table_name)
    with table.batch(transaction=True) as b:
        for rk, tuples in six.iteritems(data):
            obj = dict()
            for score, item in tuples:
                column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                obj.setdefault(column, []).append(item)

            final = dict()
            packer = Packer()
            for column, items in six.iteritems(obj):
                stream = BytesIO()
                for item in items:
                    stream.write(packer.pack(item))
                final[column] = stream.getvalue()
            b.put(rk, final)
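# A hedged sketch of how the two domain representations above can map onto the
# same partition. partition_for and its modulo reduction are assumptions made
# for illustration; the actual partitioner (e.g. Crc32NamePartitioner) may
# reduce the hash differently.
def partition_for(domain, partitions):
    if type(domain) == dict:
        value = get_crc32(domain['name'])   # named domain: hash the hostname
    elif type(domain) == int:
        value = domain                      # already a precomputed host hash
    else:
        raise TypeError("domain of unknown type.")
    return partitions[value % len(partitions)]

partitions = [0, 1, 2, 3]
assert partition_for({'name': 'example.com'}, partitions) == \
    partition_for(get_crc32('example.com'), partitions)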
def test_crc32_range(self):
    left, right = -2**31, 2**31 - 1
    for x in range(10000):
        bytestr = hashlib.md5(str(x).encode('ascii')).hexdigest()
        assert left <= get_crc32(bytestr) <= right
    for x in [left, left + 1, right - 1, right, right + 1, 2**32 - 2, 2**32 - 1]:
        assert left <= to_signed32(x) <= right
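# A sketch of get_crc32/to_signed32 implementations consistent with this test
# and the value tests below (the real library versions may differ in detail):
# zlib.crc32 returns an unsigned 32-bit value on Python 3, so the result is
# folded back into the signed 32-bit range the tests assert.
import zlib

def to_signed32(x):
    # map any 32-bit pattern onto [-2**31, 2**31 - 1]
    return (x & 0x7fffffff) - (x & 0x80000000)

def get_crc32(name):
    if isinstance(name, str):
        name = name.encode('utf-8')  # assumption: unicode input is UTF-8 encoded first
    return to_signed32(zlib.crc32(name))

assert get_crc32(b'1') == -2082672713  # matches test_negative_crc32 below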
def schedule(self, batch):
    for fprint, score, request, schedule_at in batch:
        if schedule_at:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            else:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            created_at = time() * 1E+6
            q = self._create_queue(request, fprint, score, partition_id, host_crc32, created_at)
            q.save()
            request.meta['state'] = States.QUEUED
def schedule(self, batch):
    query = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, "
                                 "created_at, meta, depth, headers, method, cookies) "
                                 "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    cql_items = []
    for fprint, score, request, schedule in batch:
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            else:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            created_at = time() * 1E+6
            # Backfill the meta fields the Meta UDT expects before building it.
            request.meta.setdefault("domain", {})
            request.meta.setdefault("origin_is_frontier", '')
            request.meta.setdefault("scrapy_callback", None)
            request.meta.setdefault("scrapy_errback", None)
            request.meta.setdefault("scrapy_meta", {})
            request.meta.setdefault("score", 0)
            request.meta.setdefault("jid", 0)
            meta = Meta(domain=request.meta['domain'], fingerprint=fprint,
                        origin_is_frontier=request.meta['origin_is_frontier'],
                        scrapy_callback=request.meta['scrapy_callback'],
                        scrapy_errback=request.meta['scrapy_errback'],
                        scrapy_meta=request.meta['scrapy_meta'])
            cql_i = (self.crawl_id, fprint, score, partition_id, host_crc32, request.url, created_at,
                     meta, 0, request.headers, request.method, request.cookies)
            cql_items.append(cql_i)
            request.meta['state'] = States.QUEUED
    execute_concurrent_with_args(self.session, query, cql_items, concurrency=400)
    self.counter_cls.cass_count({"queued_urls": len(cql_items)})
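# A usage note on the bulk insert above: execute_concurrent_with_args is the
# DataStax driver helper from cassandra.concurrent that binds the prepared
# statement once per parameter tuple, keeping up to `concurrency` requests in
# flight. Reusing session/query/cql_items from schedule, per-row error handling
# could look like this (raise_on_first_error=True is the default):
from cassandra.concurrent import execute_concurrent_with_args

results = execute_concurrent_with_args(session, query, cql_items, concurrency=400,
                                       raise_on_first_error=False)
for success, result_or_exc in results:
    if not success:
        logger.error("queue insert failed: %s", result_or_exc)  # `logger` is hypothetical here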
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify
    the document in storage. ``hostname_local_fingerprint`` builds the fingerprint
    from the CRC32 of the host (first 4 bytes) followed by the MD5 of the rest of
    the URL. This default is designed to exploit the HBase block cache: all the
    documents of an average website are expected to fit within one cache block,
    which can then be read from disk once.

    :param key: str URL
    :return: str the 20-byte fingerprint as a hex string
    """
    result = parse_url(key)
    hostname = result.hostname if result.hostname else '-'
    host_checksum = get_crc32(hostname)
    combined = hostname + result.path + ';' + result.params + result.query + result.fragment
    combined = to_bytes(combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return fprint
def hostname_local_fingerprint(key):
    """
    This function is used for URL fingerprinting, which serves to uniquely identify
    the document in storage. ``hostname_local_fingerprint`` builds the fingerprint
    from the CRC32 of the host (first 4 bytes) followed by the MD5 of the rest of
    the URL. This default is designed to exploit the HBase block cache: all the
    documents of an average website are expected to fit within one cache block,
    which can then be read from disk once.

    :param key: str URL
    :return: str the 20-byte fingerprint as a hex string
    """
    result = parse_url(key)
    if not result.hostname:
        return sha1(key)
    host_checksum = get_crc32(result.hostname)
    doc_uri_combined = result.path + ';' + result.params + result.query + result.fragment
    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
    doc_fprint = hashlib.md5(doc_uri_combined).digest()
    fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
    return to_native_str(fprint, 'utf8')
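# A small usage sketch: the fingerprint is hexlify(pack(">i16s", crc32, md5)),
# i.e. 4 bytes of host CRC32 followed by 16 bytes of document MD5, which makes
# 40 hex characters. The URL is only an example; the key point is that every
# page of a host shares the same 8-character prefix, keeping a host's documents
# adjacent in HBase.
fprint = hostname_local_fingerprint('https://example.com/path;params?query#frag')
assert len(fprint) == 40
host_part, doc_part = fprint[:8], fprint[8:]  # CRC32 prefix / MD5 suffix, hex-encoded
assert host_part == hexlify(pack(">i", get_crc32('example.com'))).decode()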
def _schedule(self, batch, timestamp):
    data = dict()
    for request, score in batch:
        domain = request.meta[FIELD_DOMAIN]
        fingerprint = request.meta[FIELD_FINGERPRINT]
        if type(domain) == dict:
            partition_id = self._partitioner.partition(domain[FIELD_NAME], self._partitions)
            host_crc32 = get_crc32(domain[FIELD_NAME])
        elif type(domain) == int:
            partition_id = self._partitioner.partition_by_hash(domain, self._partitions)
            host_crc32 = domain
        else:
            raise TypeError("domain of unknown type.")
        item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score)
        interval_start = self.get_interval_start(score)
        data.setdefault(partition_id, {})[packb(item)] = int(interval_start * 100)
    for key, items in data.items():
        self._redis_pipeline.zadd(key, mapping=items)
    self._redis_pipeline.execute()
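# A hedged sketch of the sorted-set layout above from the consumer side:
# members are msgpack blobs, scores are interval starts times 100, so a
# ZRANGEBYSCORE over one partition key yields a priority band. The connection
# parameters and the sample item are assumptions.
import redis
from msgpack import packb, unpackb

r = redis.Redis()
pipe = r.pipeline()
item = (1234567890, 'fingerprint', 123456, b'<encoded request>', 0.87)
pipe.zadd(0, mapping={packb(item): 87})  # partition 0, interval start 0.87
pipe.execute()
for blob in r.zrangebyscore(0, 80, 100):  # pull the 0.80-1.00 priority band
    timestamp, fingerprint, host_crc32, encoded_request, score = unpackb(blob)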
def test_ascii_unicode(self):
    assert get_crc32(u'example') == 1861000095
def partition(self, key, partitions=None):
    if key is None:
        return self.partitions[0]
    value = get_crc32(key)
    return self.partition_by_hash(value, partitions if partitions else self.partitions)
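# A quick check of the contract above: crc32-based partitioning is
# deterministic, and a None key falls back to the first partition. The
# Crc32NamePartitioner construction here is an assumption for illustration.
partitioner = Crc32NamePartitioner([0, 1, 2, 3])
assert partitioner.partition('example.com') == partitioner.partition('example.com')
assert partitioner.partition(None) == 0  # keyless requests land on the first partition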
def test_bytes(self):
    assert get_crc32(b'example') == 1861000095
def test_negative_crc32(self):
    assert get_crc32(b'1') == -2082672713
def test_non_ascii_bytes(self):
    assert get_crc32(u'example\u5000'.encode('utf8')) == 1259721235
def test_non_ascii_unicode(self):
    assert get_crc32(u'example\u5000') == 1259721235
def hash(self, key):
    if type(key) == int:
        return key
    else:
        return get_crc32(key)