def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False):
    """HBase-backed priority queue.

    :param connection: happybase-style HBase connection
    :param partitions: number of queue partitions
    :param table_name: name of the HBase table backing the queue
    :param drop: when True, drop and recreate the table if it already exists
    :param use_snappy: when True, enable SNAPPY compression on the 'f' family
    """
    self.connection = connection
    # list(range(...)) instead of a manual copy-comprehension — same result
    self.partitions = list(range(partitions))
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        # HBase requires disabling a table before it can be deleted
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        schema = {'f': {'max_versions': 1}}
        if use_snappy:
            schema['f']['compression'] = 'SNAPPY'
        self.connection.create_table(self.table_name, schema)

    class DumbResponse:
        # stand-in response model: this queue only ever decodes requests
        pass
    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
def __init__(self, connection, partitions, table_name, drop=False):
    """HBase-backed priority queue.

    :param connection: happybase-style HBase connection
    :param partitions: number of queue partitions
    :param table_name: name of the HBase table backing the queue
    :param drop: when True, drop and recreate the table if it already exists
    """
    self.connection = connection
    # list(range(...)) instead of a manual copy-comprehension — same result
    self.partitions = list(range(partitions))
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        # HBase requires disabling a table before it can be deleted
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        self.connection.create_table(
            self.table_name, {'f': {
                'max_versions': 1,
                'block_cache_enabled': 1
            }})

    class DumbResponse:
        # stand-in response model: this queue only ever decodes requests
        pass
    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
def __init__(self, settings, strategy_module):
    """Strategy worker: consumes the spider log, scores links, emits to scoring log.

    :param settings: frontier settings object
    :param strategy_module: module providing a ``CrawlingStrategy`` class
    :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # isinstance() is the idiomatic type check; it also rejects None, so the
    # separate None test is folded in. (Unlike type() ==, it accepts bools —
    # a bool partition id is not a realistic input here.)
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")

    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_module.CrawlingStrategy()
    self.states = self._manager.backend.states
    self.stats = {}
    self.cache_flush_counter = 0
    self.job_id = 0
    self.task = LoopingCall(self.work)
def __init__(self, settings, strategy_class):
    """Strategy worker: consumes the spider log, scores links, emits to scoring log.

    :param settings: frontier settings object
    :param strategy_class: crawling strategy class with a ``from_worker`` factory
    :raises AttributeError: if SCORING_PARTITION_ID is unset or not an int
    """
    partition_id = settings.get('SCORING_PARTITION_ID')
    # isinstance() is the idiomatic type check; it also rejects None, so the
    # separate None test is folded in. (Unlike type() ==, it accepts bools —
    # a bool partition id is not a realistic input here.)
    if not isinstance(partition_id, int):
        raise AttributeError("Scoring worker partition id isn't set.")

    messagebus = load_object(settings.get('MESSAGE_BUS'))
    mb = messagebus(settings)
    spider_log = mb.spider_log()
    scoring_log = mb.scoring_log()
    self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
    self.scoring_log_producer = scoring_log.producer()

    self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
    self._encoder = Encoder(self._manager.request_model)

    self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
    self.states_context = StatesContext(self._manager.backend.states)
    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
    self.states = self._manager.backend.states
    self.stats = {'consumed_since_start': 0}
    self.job_id = 0
    self.task = LoopingCall(self.work)
    self._logging_task = LoopingCall(self.log_status)
    logger.info(
        "Strategy worker is initialized and consuming partition %d",
        partition_id)
def __init__(self, settings, no_batches, no_incoming):
    """DB worker: wires message-bus streams, frontier manager and the scheduling slot.

    :param settings: frontier settings object
    :param no_batches: disable new-batch generation in the slot
    :param no_incoming: disable spider-log consumption in the slot
    """
    bus_cls = load_object(settings.get('MESSAGE_BUS'))
    self.mb = bus_cls(settings)
    log_stream = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = log_stream.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()

    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

    # Scoring stream and queue exist only for distributed backends.
    self.strategy_enabled = isinstance(self._backend, DistributedBackend)
    if self.strategy_enabled:
        self.scoring_log_consumer = self.mb.scoring_log().consumer()
        self.queue = self._backend.queue

    self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = 'hostname' if settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'fingerprint'
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                     self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, settings, no_batches, no_incoming):
    """DB worker: wires message-bus streams, frontier manager and the scheduling slot.

    :param settings: frontier settings object
    :param no_batches: disable new-batch generation in the slot
    :param no_incoming: disable spider-log consumption in the slot
    """
    bus_cls = load_object(settings.get('MESSAGE_BUS'))
    self.mb = bus_cls(settings)
    log_stream = self.mb.spider_log()
    self.spider_feed = self.mb.spider_feed()
    self.spider_log_consumer = log_stream.consumer(partition_id=None, type='db')
    self.spider_feed_producer = self.spider_feed.producer()

    self._manager = FrontierManager.from_settings(settings, db_worker=True)
    self._backend = self._manager.backend
    self._encoder = Encoder(self._manager.request_model)
    self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

    # Scoring stream and queue exist only for distributed backends.
    self.strategy_enabled = isinstance(self._backend, DistributedBackend)
    if self.strategy_enabled:
        self.scoring_log_consumer = self.mb.scoring_log().consumer()
        self.queue = self._backend.queue

    self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
    self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
    self.spider_feed_partitioning = 'hostname' if settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'fingerprint'
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                     self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
    self.job_id = 0
    self.stats = {
        'consumed_since_start': 0,
        'consumed_scoring_since_start': 0,
        'pushed_since_start': 0
    }
    self._logging_task = task.LoopingCall(self.log_status)
def __init__(self, manager):
    """Kafka message-bus backend: read settings, open the connection,
    attach producer/consumer and the codec pair.

    :param manager: frontier manager providing settings and request/response models
    """
    self._manager = manager
    settings = manager.settings

    # Kafka connection parameters
    self._server = settings.get('KAFKA_LOCATION')
    self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
    self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
    self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
    self._partition_id = settings.get('SPIDER_PARTITION_ID')

    # Kafka setup: producer/consumer are attached by the _connect_* helpers
    self._conn = KafkaClient(self._server)
    self._prod = None
    self._cons = None
    # route the kafka library's log records to stderr
    getLogger("kafka").addHandler(StreamHandler())
    self._connect_consumer()
    self._connect_producer()

    self._encoder = Encoder(manager.request_model,
                            send_body=settings.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)
def __init__(self, connection, partitions, table_name, drop=False):
    """HBase-backed priority queue.

    :param connection: happybase-style HBase connection
    :param partitions: number of queue partitions
    :param table_name: name of the HBase table backing the queue
    :param drop: when True, drop and recreate the table if it already exists
    """
    self.connection = connection
    # list(range(...)) instead of a manual copy-comprehension — same result
    self.partitions = list(range(partitions))
    self.partitioner = Crc32NamePartitioner(self.partitions)
    self.logger = logging.getLogger("hbase.queue")
    self.table_name = to_bytes(table_name)

    tables = set(self.connection.tables())
    if drop and self.table_name in tables:
        # HBase requires disabling a table before it can be deleted
        self.connection.delete_table(self.table_name, disable=True)
        tables.remove(self.table_name)
    if self.table_name not in tables:
        self.connection.create_table(self.table_name,
                                     {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

    class DumbResponse:
        # stand-in response model: this queue only ever decodes requests
        pass
    self.decoder = Decoder(Request, DumbResponse)
    self.encoder = Encoder(Request)
class HBaseQueue(Queue):
    """Priority queue of requests stored in an HBase table.

    Requests are packed per row with msgpack; row keys encode partition id and
    score interval so a prefix scan returns a partition's batch in priority order.
    """

    # number of progressively deeper scan attempts in get_next_requests()
    GET_RETRIES = 3

    def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False):
        """
        :param connection: happybase-style HBase connection
        :param partitions: number of queue partitions
        :param table_name: name of the HBase table backing the queue
        :param drop: when True, drop and recreate the table if it already exists
        :param use_snappy: when True, enable SNAPPY compression on the 'f' family
        """
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)
        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            # HBase requires disabling a table before it can be deleted
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)
        schema = {'f': {'max_versions': 1}}
        if use_snappy:
            schema['f']['compression'] = 'SNAPPY'
        if self.table_name not in tables:
            self.connection.create_table(self.table_name, schema)

        class DumbResponse:
            # stand-in response model: this queue only ever decodes requests
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)

    def frontier_start(self):
        # nothing to initialize per crawl
        pass

    def frontier_stop(self):
        # nothing to tear down per crawl
        pass

    def schedule(self, batch):
        """Group scheduled requests by timestamp and store each group.

        :param batch: iterable of (fingerprint, score, request, schedule) tuples;
                      only entries with a truthy ``schedule`` flag are stored
        """
        to_schedule = dict()
        now = int(time())
        for fprint, score, request, schedule in batch:
            if schedule:
                if b'domain' not in request.meta:
                    # TODO: this should always be done by DomainMiddleware;
                    # consider requiring DomainMiddleware in HBaseBackend and
                    # removing this fallback.
                    _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                    if not hostname:
                        self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
                    request.meta[b'domain'] = {'name': hostname}
                # a request may carry an explicit future crawl time
                timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now
                to_schedule.setdefault(timestamp, []).append((request, score))
        for timestamp, batch in six.iteritems(to_schedule):
            self._schedule(batch, timestamp)

    def _schedule(self, batch, timestamp):
        """Write one timestamp group of (request, score) pairs to HBase.

        Row key: ``<partition id>_<score interval>_<microsecond timestamp>``.
        Column qualifier: discrete score interval with 0.001 resolution
        (e.g. ``f:0.001_0.002``). Cell value: concatenated msgpack blobs, one
        per request. The scheduling timestamp is stored in column ``f:t``.

        :param batch: iterable of (request, score) pairs, score in [0.0, 1.0]
        :param timestamp: scheduling time (seconds) for the whole group
        """
        def get_interval(score, resolution):
            # maps a score to its half-open [lo, hi) interval at the given
            # resolution; the last interval is inclusive on the right
            if score < 0.0 or score > 1.0:
                raise OverflowError
            i = int(score / resolution)
            if i % 10 == 0 and i > 0:
                i = i - 1  # last interval is inclusive from right
            return (i * resolution, (i + 1) * resolution)

        # microsecond timestamp keeps row keys unique per scheduling moment
        random_str = int(time() * 1E+6)
        data = dict()
        for request, score in batch:
            domain = request.meta[b'domain']
            fingerprint = request.meta[b'fingerprint']
            if type(domain) == dict:
                partition_id = self.partitioner.partition(domain[b'name'], self.partitions)
                host_crc32 = get_crc32(domain[b'name'])
            elif type(domain) == int:
                # domain may already be a crc32 hash of the hostname
                partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            for rk, tuples in six.iteritems(data):
                # group items per fine-grained score column
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)
                final = dict()
                packer = Packer()
                for column, items in six.iteritems(obj):
                    # concatenate msgpack blobs into a single cell value
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                final[b'f:t'] = str(timestamp)
                b.put(rk, final)

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """Pop a batch of requests from the given partition of the queue.

        Makes up to ``GET_RETRIES`` attempts, widening the scan limit each
        time, until the min_requests/min_hosts constraints are satisfied.
        Consumed rows are deleted from the table.

        :param max_n_requests: maximum number of requests
        :param partition_id: partition id to get the batch from
        :param min_requests: minimum number of requests (required kwarg)
        :param min_hosts: minimum number of hosts (optional kwarg)
        :param max_requests_per_host: cap per host (optional kwarg)
        :return: list of :class:`Request <frontera.core.models.Request>` objects
        """
        min_requests = kwargs.pop('min_requests')
        min_hosts = kwargs.pop('min_hosts', None)
        max_requests_per_host = kwargs.pop('max_requests_per_host', None)
        assert(max_n_requests > min_requests)
        table = self.connection.table(self.table_name)

        meta_map = {}   # fingerprint -> [(row key, item), ...]
        queue = {}      # host crc32 -> [fingerprint, ...]
        limit = min_requests
        tries = 0
        count = 0
        prefix = to_bytes('%d_' % partition_id)
        # now_ts = int(time())
        # TODO: figure out how to use filter here, Thrift filter above causes full scan
        # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts)
        while tries < self.GET_RETRIES:
            tries += 1
            # widen the scan on every retry to reach deeper into the queue
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                              tries, limit, count, len(queue.keys()))
            meta_map.clear()
            queue.clear()
            count = 0
            for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix):  # filter=filter
                for cq, buf in six.iteritems(data):
                    if cq == b'f:t':
                        # timestamp column, not a payload cell
                        continue
                    stream = BytesIO(buf)
                    unpacker = Unpacker(stream)
                    for item in unpacker:
                        fprint, host_crc32, _, _ = item
                        if host_crc32 not in queue:
                            queue[host_crc32] = []
                        if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
                            continue
                        queue[host_crc32].append(fprint)
                        count += 1
                        if fprint not in meta_map:
                            meta_map[fprint] = []
                        meta_map[fprint].append((rk, item))
                if count > max_n_requests:
                    break
            # retry with a deeper scan if constraints are not met yet
            if min_hosts is not None and len(queue.keys()) < min_hosts:
                continue
            if count < min_requests:
                continue
            break
        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count)

        # For every fingerprint collect its row keys and return all fingerprints from them
        fprint_map = {}  # row key -> [fingerprint, ...]
        for fprint, meta_list in six.iteritems(meta_map):
            for rk, _ in meta_list:
                fprint_map.setdefault(rk, []).append(fprint)

        results = []
        trash_can = set()  # row keys fully consumed this call, to be deleted
        for _, fprints in six.iteritems(queue):
            for fprint in fprints:
                for rk, _ in meta_map[fprint]:
                    if rk in trash_can:
                        continue
                    # a row is consumed whole: emit every fingerprint stored in it
                    for rk_fprint in fprint_map[rk]:
                        _, item = meta_map[rk_fprint][0]
                        _, _, encoded, score = item
                        request = self.decoder.decode_request(encoded)
                        request.meta[b'score'] = score
                        results.append(request)
                    trash_can.add(rk)

        with table.batch(transaction=True) as b:
            for rk in trash_can:
                b.delete(rk)
        self.logger.debug("%d row keys removed", len(trash_can))
        return results

    def count(self):
        # queue length is not tracked for the HBase backend
        raise NotImplementedError
class HBaseQueue(Queue):
    """Priority queue of requests stored in an HBase table.

    Requests are packed per row with msgpack; row keys encode partition id and
    score interval so a prefix scan returns a partition's batch in priority order.
    """

    # number of progressively deeper scan attempts in get_next_requests()
    GET_RETRIES = 3

    def __init__(self, connection, partitions, table_name, drop=False):
        """
        :param connection: happybase-style HBase connection
        :param partitions: number of queue partitions
        :param table_name: name of the HBase table backing the queue
        :param drop: when True, drop and recreate the table if it already exists
        """
        self.connection = connection
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.logger = logging.getLogger("hbase.queue")
        self.table_name = to_bytes(table_name)
        tables = set(self.connection.tables())
        if drop and self.table_name in tables:
            # HBase requires disabling a table before it can be deleted
            self.connection.delete_table(self.table_name, disable=True)
            tables.remove(self.table_name)
        if self.table_name not in tables:
            self.connection.create_table(self.table_name,
                                         {'f': {'max_versions': 1, 'block_cache_enabled': 1}})

        class DumbResponse:
            # stand-in response model: this queue only ever decodes requests
            pass
        self.decoder = Decoder(Request, DumbResponse)
        self.encoder = Encoder(Request)

    def frontier_start(self):
        # nothing to initialize per crawl
        pass

    def frontier_stop(self):
        # nothing to tear down per crawl
        pass

    def schedule(self, batch):
        """Group scheduled requests by timestamp and store each group.

        :param batch: iterable of (fingerprint, score, request, schedule) tuples;
                      only entries with a truthy ``schedule`` flag are stored
        """
        to_schedule = dict()
        now = int(time())
        for fprint, score, request, schedule in batch:
            if schedule:
                if b'domain' not in request.meta:
                    # TODO: this should always be done by DomainMiddleware;
                    # consider requiring DomainMiddleware in HBaseBackend and
                    # removing this fallback.
                    _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                    if not hostname:
                        self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
                    request.meta[b'domain'] = {'name': hostname}
                # a request may carry an explicit future crawl time
                timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now
                to_schedule.setdefault(timestamp, []).append((request, score))
        for timestamp, batch in six.iteritems(to_schedule):
            self._schedule(batch, timestamp)

    def _schedule(self, batch, timestamp):
        """Write one timestamp group of (request, score) pairs to HBase.

        Row key: ``<partition id>_<score interval>_<microsecond timestamp>``.
        Column qualifier: discrete score interval with 0.001 resolution
        (e.g. ``f:0.001_0.002``). Cell value: concatenated msgpack blobs, one
        per request. The scheduling timestamp is stored in column ``f:t``.

        :param batch: iterable of (request, score) pairs, score in [0.0, 1.0]
        :param timestamp: scheduling time (seconds) for the whole group
        """
        def get_interval(score, resolution):
            # maps a score to its half-open [lo, hi) interval at the given
            # resolution; the last interval is inclusive on the right
            if score < 0.0 or score > 1.0:
                raise OverflowError
            i = int(score / resolution)
            if i % 10 == 0 and i > 0:
                i = i - 1  # last interval is inclusive from right
            return (i * resolution, (i + 1) * resolution)

        # microsecond timestamp keeps row keys unique per scheduling moment
        random_str = int(time() * 1E+6)
        data = dict()
        for request, score in batch:
            domain = request.meta[b'domain']
            fingerprint = request.meta[b'fingerprint']
            if type(domain) == dict:
                partition_id = self.partitioner.partition(domain[b'name'], self.partitions)
                host_crc32 = get_crc32(domain[b'name'])
            elif type(domain) == int:
                # domain may already be a crc32 hash of the hostname
                partition_id = self.partitioner.partition_by_hash(domain, self.partitions)
                host_crc32 = domain
            else:
                raise TypeError("domain of unknown type.")
            item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score)
            score = 1 - score  # because of lexicographical sort in HBase
            rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str)
            data.setdefault(rk, []).append((score, item))

        table = self.connection.table(self.table_name)
        with table.batch(transaction=True) as b:
            for rk, tuples in six.iteritems(data):
                # group items per fine-grained score column
                obj = dict()
                for score, item in tuples:
                    column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001)
                    obj.setdefault(column, []).append(item)
                final = dict()
                packer = Packer()
                for column, items in six.iteritems(obj):
                    # concatenate msgpack blobs into a single cell value
                    stream = BytesIO()
                    for item in items:
                        stream.write(packer.pack(item))
                    final[column] = stream.getvalue()
                final[b'f:t'] = str(timestamp)
                b.put(rk, final)

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """Pop a batch of requests from the given partition of the queue.

        Makes up to ``GET_RETRIES`` attempts, widening the scan limit each
        time, until the min_requests/min_hosts constraints are satisfied.
        Consumed rows are deleted from the table.

        :param max_n_requests: maximum number of requests
        :param partition_id: partition id to get the batch from
        :param min_requests: minimum number of requests (required kwarg)
        :param min_hosts: minimum number of hosts (required kwarg)
        :param max_requests_per_host: cap per host (required kwarg, may be None)
        :return: list of :class:`Request <frontera.core.models.Request>` objects
        """
        min_requests = kwargs.pop('min_requests')
        min_hosts = kwargs.pop('min_hosts')
        max_requests_per_host = kwargs.pop('max_requests_per_host')
        assert(max_n_requests > min_requests)
        table = self.connection.table(self.table_name)

        meta_map = {}   # fingerprint -> [(row key, item), ...]
        queue = {}      # host crc32 -> [fingerprint, ...]
        limit = min_requests
        tries = 0
        count = 0
        prefix = '%d_' % partition_id
        now_ts = int(time())
        # restrict the scan to this partition's rows scheduled at or before now
        filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts)
        while tries < self.GET_RETRIES:
            tries += 1
            # widen the scan on every retry to reach deeper into the queue
            limit *= 5.5 if tries > 1 else 1.0
            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                              tries, limit, count, len(queue.keys()))
            meta_map.clear()
            queue.clear()
            count = 0
            for rk, data in table.scan(limit=int(limit), batch_size=256, filter=filter):
                for cq, buf in six.iteritems(data):
                    if cq == b'f:t':
                        # timestamp column, not a payload cell
                        continue
                    stream = BytesIO(buf)
                    unpacker = Unpacker(stream)
                    for item in unpacker:
                        fprint, host_crc32, _, _ = item
                        if host_crc32 not in queue:
                            queue[host_crc32] = []
                        if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
                            continue
                        queue[host_crc32].append(fprint)
                        count += 1
                        if fprint not in meta_map:
                            meta_map[fprint] = []
                        meta_map[fprint].append((rk, item))
                if count > max_n_requests:
                    break
            # retry with a deeper scan if constraints are not met yet
            if min_hosts is not None and len(queue.keys()) < min_hosts:
                continue
            if count < min_requests:
                continue
            break
        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count)

        # For every fingerprint collect its row keys and return all fingerprints from them
        fprint_map = {}  # row key -> [fingerprint, ...]
        for fprint, meta_list in six.iteritems(meta_map):
            for rk, _ in meta_list:
                fprint_map.setdefault(rk, []).append(fprint)

        results = []
        trash_can = set()  # row keys fully consumed this call, to be deleted
        for _, fprints in six.iteritems(queue):
            for fprint in fprints:
                for rk, _ in meta_map[fprint]:
                    if rk in trash_can:
                        continue
                    # a row is consumed whole: emit every fingerprint stored in it
                    for rk_fprint in fprint_map[rk]:
                        _, item = meta_map[rk_fprint][0]
                        _, _, encoded, score = item
                        request = self.decoder.decode_request(encoded)
                        request.meta[b'score'] = score
                        results.append(request)
                    trash_can.add(rk)

        with table.batch(transaction=True) as b:
            for rk in trash_can:
                b.delete(rk)
        self.logger.debug("%d row keys removed", len(trash_can))
        return results

    def count(self):
        # queue length is not tracked for the HBase backend
        raise NotImplementedError
class DBWorker(object):
    """Worker that moves data between the message bus and the backend:
    consumes the spider log and scoring log, and produces new request
    batches into the spider feed.
    """

    def __init__(self, settings, no_batches, no_incoming):
        """
        :param settings: frontier settings object
        :param no_batches: disable new-batch generation in the slot
        :param no_incoming: disable spider-log consumption in the slot
        """
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()
        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()
        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        # scoring stream and queue exist only for distributed backends
        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False
        self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)

    def set_process_info(self, process_info):
        # identity string supplied by the process manager
        self.process_info = process_info

    def run(self):
        """Start the slot, periodic status logging and the reactor loop."""
        def debug(sig, frame):
            # SIGUSR1 handler: dump the current stack for live debugging
            logger.critical("Signal received: printing stack trace")
            logger.critical(str("").join(format_stack(frame)))

        self.slot.schedule(on_start=True)
        self._logging_task.start(30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def stop(self):
        # reactor shutdown hook
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def log_status(self):
        # periodic dump of the stats counters
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def disable_new_batches(self):
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False

    def consume_incoming(self, *args, **kwargs):
        """Consume one batch from the spider log and apply each message
        to the backend; also tracks producer/consumer offset lag to mark
        spider feed partitions ready or busy.

        :return: number of messages consumed (decoding failures included)
        """
        consumed = 0
        for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                # NOTE(review): local name shadows the builtin `type`
                type = msg[0]
                if type == 'add_seeds':
                    _, seeds = msg
                    logger.info('Adding %i seeds', len(seeds))
                    for seed in seeds:
                        logger.debug('URL: %s', seed.url)
                    self._backend.add_seeds(seeds)
                if type == 'page_crawled':
                    _, response, links = msg
                    logger.debug("Page crawled %s", response.url)
                    # drop events belonging to a previous job
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self._backend.page_crawled(response, links)
                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    logger.debug("Request error %s", request.url)
                    self._backend.request_error(request, error)
                if type == 'offset':
                    _, partition_id, offset = msg
                    try:
                        producer_offset = self.spider_feed_producer.get_offset(partition_id)
                    except KeyError:
                        # no offset produced for this partition yet
                        continue
                    else:
                        lag = producer_offset - offset
                        if lag < 0:
                            # non-sense in general, happens when SW is restarted and not synced yet with Spiders.
                            continue
                        if lag < self.max_next_requests or offset == 0:
                            self.spider_feed.mark_ready(partition_id)
                        else:
                            self.spider_feed.mark_busy(partition_id)
            finally:
                # count the message even when a `continue` above skipped it
                consumed += 1
        """
        # TODO: Think how it should be implemented in DB-worker only mode.
        if not self.strategy_enabled and self._backend.finished():
            logger.info("Crawling is finished.")
            reactor.stop()
        """
        self.stats['consumed_since_start'] += consumed
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.slot.schedule()
        return consumed

    def consume_scoring(self, *args, **kwargs):
        """Consume one batch from the scoring log and schedule the scored
        requests in the queue, de-duplicating by fingerprint within the batch.
        """
        consumed = 0
        seen = set()
        batch = []
        for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                if msg[0] == 'update_score':
                    _, fprint, score, url, schedule = msg
                    # only the first score update per fingerprint is kept
                    if fprint not in seen:
                        batch.append((fprint, score, Request(url), schedule))
                    seen.add(fprint)
                if msg[0] == 'new_job_id':
                    self.job_id = msg[1]
            finally:
                # count the message even when a `continue` above skipped it
                consumed += 1
        self.queue.schedule(batch)

        self.stats['consumed_scoring_since_start'] += consumed
        self.stats['last_consumed_scoring'] = consumed
        self.stats['last_consumption_run_scoring'] = asctime()
        self.slot.schedule()

    def new_batch(self, *args, **kwargs):
        """Pull the next requests from the backend and push them to every
        available spider-feed partition, keyed by hostname or fingerprint
        depending on the configured partitioning.

        :return: number of requests processed (encoding failures included)
        """
        def get_hostname(request):
            # partition key when hostname partitioning is configured
            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" %
                             (e, request.meta['fingerprint'], request.url))
                return None
            else:
                return name.encode('utf-8', 'ignore')

        def get_fingerprint(request):
            # partition key when fingerprint partitioning is configured
            return request.meta['fingerprint']

        partitions = self.spider_feed.available_partitions()
        logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions)))
        if not partitions:
            return 0
        count = 0
        if self.spider_feed_partitioning == 'hostname':
            get_key = get_hostname
        elif self.spider_feed_partitioning == 'fingerprint':
            get_key = get_fingerprint
        else:
            raise Exception("Unexpected value in self.spider_feed_partitioning")

        for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" %
                             (e, request.meta['fingerprint'], request.url))
                continue
            finally:
                # counted even when encoding failed and the send is skipped
                count += 1
            self.spider_feed_producer.send(get_key(request), eo)

        self.stats['pushed_since_start'] += count
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count