class StrategyWorker(object): def __init__(self, settings, strategy_class): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024) self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context) self.states = self._manager.backend.states self.stats = {} self.job_id = 0 self.task = LoopingCall(self.work) def work(self): # Collecting batch to process consumed = 0 batch = [] for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg self.states_context.to_fetch(seeds) continue if type == 'page_crawled': _, response, links = msg self.states_context.to_fetch(response) self.states_context.to_fetch(links) continue if type == 'request_error': _, request, error = msg self.states_context.to_fetch(request) continue if type == 'offset': continue raise TypeError('Unknown message type %s' % type) finally:
class StrategyWorker(object): def __init__(self, settings, strategy_module): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_module.CrawlingStrategy() self.states = self._manager.backend.states self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 self.task = LoopingCall(self.work) def work(self): consumed = 0 batch = [] fingerprints = set() for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update(map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue if type == 'offset': continue raise TypeError('Unknown message type %s' % type) finally:
class StrategyWorker(object): def __init__(self, settings, strategy_module): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_module.CrawlingStrategy() self.states = self._manager.backend.states self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 self.task = LoopingCall(self.work) def work(self): consumed = 0 batch = [] fingerprints = set() for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue if type == 'offset': continue raise TypeError('Unknown message type %s' % type) finally:
class StrategyWorker(object): def __init__(self, settings, strategy_class): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() self.consumer = spider_log.consumer(partition_id=partition_id, type='sw') self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024) self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context) self.states = self._manager.backend.states self.stats = {'consumed_since_start': 0} self.job_id = 0 self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) logger.info( "Strategy worker is initialized and consuming partition %d", partition_id) def work(self): # Collecting batch to process consumed = 0 batch = [] for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error:") logger.exception(e) logger.debug("Message %s", hexlify(m)) continue else: type = msg[0] batch.append(msg) try: if type == 'add_seeds': _, seeds = msg self.states_context.to_fetch(seeds) continue if type == 'page_crawled': _, response, links = msg self.states_context.to_fetch(response) self.states_context.to_fetch(links) continue if type == 'request_error': _, request, error = msg self.states_context.to_fetch(request) continue if type 
== 'offset': continue raise TypeError('Unknown message type %s' % type) except Exception, exc: logger.exception(exc) pass finally:
class StrategyWorker(object):
    """Spider-log consumer that runs the crawling strategy.

    Each instance owns exactly one spider-log partition
    (``SCORING_PARTITION_ID``) and is driven by a Twisted ``LoopingCall``
    invoking :meth:`work` continuously.  Strategy decisions are pushed to
    the scoring log through an ``UpdateScoreStream``; per-request state is
    prefetched and cached via a ``StatesContext``.
    """

    def __init__(self, settings, strategy_class):
        """
        :param settings: frontier settings object (``.get``-style access).
        :param strategy_class: crawling strategy class exposing a
            ``from_worker(manager, update_score, states_context)`` factory.
        :raises AttributeError: when ``SCORING_PARTITION_ID`` is missing or
            not an ``int``.
        """
        partition_id = settings.get('SCORING_PARTITION_ID')
        # NOTE(review): isinstance(partition_id, int) would be the idiomatic
        # check; kept as-is to preserve behavior.
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")

        # Message-bus wiring: consume the spider log, produce into the
        # scoring log.
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        # Score updates are buffered (up to 1024) before being flushed to
        # the scoring log producer.
        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)

    def work(self):
        """Run one consume/process cycle.

        Phase 1 collects a batch from the spider log and registers every
        referenced request with the states context; phase 2 fetches all
        required states in one go; phase 3 dispatches each message to the
        ``on_*`` handlers.  Afterwards score updates are flushed, the state
        cache is released, and the reactor is stopped if the strategy
        reports the crawl finished.
        """
        # Phase 1: collecting batch to process and scheduling state fetches.
        consumed = 0
        batch = []
        for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error:")
                logger.exception(e)
                logger.debug("Message %s", hexlify(m))
                continue
            else:
                type = msg[0]  # NOTE(review): shadows the builtin `type` locally
                batch.append(msg)
                try:
                    if type == 'add_seeds':
                        _, seeds = msg
                        self.states_context.to_fetch(seeds)
                        continue
                    if type == 'page_crawled':
                        _, response, links = msg
                        self.states_context.to_fetch(response)
                        self.states_context.to_fetch(links)
                        continue
                    if type == 'request_error':
                        _, request, error = msg
                        self.states_context.to_fetch(request)
                        continue
                    if type == 'offset':
                        # Offset bookkeeping messages carry no requests.
                        continue
                    raise TypeError('Unknown message type %s' % type)
                except Exception as exc:
                    # Keep the loop alive: one malformed message must not
                    # kill the worker.
                    logger.exception(exc)
                    pass
            finally:
                # Counted for every decoded message, failures included.
                consumed += 1

        # Phase 2: fetching states for everything scheduled above.
        self.states_context.fetch()

        # Phase 3: batch processing through the strategy handlers.
        for msg in batch:
            type = msg[0]
            try:
                if type == 'add_seeds':
                    _, seeds = msg
                    for seed in seeds:
                        # Tag seeds with the current job id so events from
                        # stale jobs can be filtered out later.
                        seed.meta['jid'] = self.job_id
                    self.on_add_seeds(seeds)
                    continue
                if type == 'page_crawled':
                    _, response, links = msg
                    # Drop events tagged with a different (stale) job id.
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self.on_page_crawled(response, links)
                    continue
                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    self.on_request_error(request, error)
                    continue
            except Exception as exc:
                logger.exception(exc)
                pass

        self.update_score.flush()
        self.states_context.release()

        # Exiting, if crawl is finished.
        if self.strategy.finished():
            logger.info("Successfully reached the crawling goal.")
            logger.info("Closing crawling strategy.")
            self.strategy.close()
            logger.info("Finishing.")
            reactor.callFromThread(reactor.stop)
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.stats['consumed_since_start'] += consumed

    def run(self):
        """Start the worker loops and the Twisted reactor (blocking)."""
        def errback(failure):
            # Log the failure and restart the main loop: a LoopingCall that
            # errored stays stopped unless restarted here.
            logger.exception(failure.value)
            if failure.frames:
                logger.critical(str("").join(format_tb(failure.getTracebackObject())))
            self.task.start(interval=0).addErrback(errback)

        def debug(sig, frame):
            # SIGUSR1 handler: dump the current stack for live debugging.
            logger.critical("Signal received: printing stack trace")
            logger.critical(str("").join(format_stack(frame)))

        self.task.start(interval=0).addErrback(errback)
        self._logging_task.start(interval=30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def log_status(self):
        """Log every collected stat (invoked periodically via _logging_task)."""
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def stop(self):
        """Shutdown hook: close the strategy, then the frontier manager."""
        logger.info("Closing crawling strategy.")
        self.strategy.close()
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def on_add_seeds(self, seeds):
        """Seed handler: resolve states, let the strategy score the seeds,
        then cache the updated states."""
        logger.debug('Adding %i seeds', len(seeds))
        self.states.set_states(seeds)
        self.strategy.add_seeds(seeds)
        self.states.update_cache(seeds)

    def on_page_crawled(self, response, links):
        """Crawl-result handler: resolve states for the response and all
        extracted links before passing them to the strategy."""
        logger.debug("Page crawled %s", response.url)
        objs_list = [response]
        objs_list.extend(links)
        self.states.set_states(objs_list)
        self.strategy.page_crawled(response, links)
        self.states.update_cache(links)
        self.states.update_cache(response)

    def on_request_error(self, request, error):
        """Error handler: let the strategy react to a failed request."""
        logger.debug("Page error %s (%s)", request.url, error)
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)
class DBWorker(object): def __init__(self, settings, no_batches, no_incoming): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db') self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend self._encoder = Encoder(self._manager.request_model) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) if isinstance(self._backend, DistributedBackend): scoring_log = self.mb.scoring_log() self.scoring_log_consumer = scoring_log.consumer() self.queue = self._backend.queue self.strategy_enabled = True else: self.strategy_enabled = False self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE') self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' self.max_next_requests = settings.MAX_NEXT_REQUESTS self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming) self.job_id = 0 self.stats = { 'consumed_since_start': 0, 'consumed_scoring_since_start': 0, 'pushed_since_start': 0 } self._logging_task = task.LoopingCall(self.log_status) def set_process_info(self, process_info): self.process_info = process_info def run(self): self.slot.schedule(on_start=True) self._logging_task.start(30) reactor.addSystemEventTrigger('before', 'shutdown', self.stop) reactor.run() def stop(self): logger.info("Stopping frontier manager.") self._manager.stop() def log_status(self): for k, v in self.stats.iteritems(): logger.info("%s=%s", k, v) def disable_new_batches(self): self.slot.disable_new_batches = True def enable_new_batches(self): self.slot.disable_new_batches = False def consume_incoming(self, *args, **kwargs): consumed = 
0 for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.consumer_batch_size): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] if type == 'add_seeds': _, seeds = msg logger.info('Adding %i seeds', len(seeds)) for seed in seeds: logger.debug('URL: %s', seed.url) self._backend.add_seeds(seeds) if type == 'page_crawled': _, response, links = msg logger.debug("Page crawled %s", response.url) if response.meta['jid'] != self.job_id: continue self._backend.page_crawled(response, links) if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue logger.debug("Request error %s", request.url) self._backend.request_error(request, error) if type == 'offset': _, partition_id, offset = msg try: producer_offset = self.spider_feed_producer.get_offset(partition_id) except KeyError: continue else: lag = producer_offset - offset if lag < 0: # non-sense in general, happens when SW is restarted and not synced yet with Spiders. continue if lag < self.max_next_requests or offset == 0: self.spider_feed.mark_ready(partition_id) else: self.spider_feed.mark_busy(partition_id) finally:
class DBWorker(object):
    """Database worker: moves data between the message bus and the backend.

    Consumes the spider log (crawl results) into the backend, optionally
    consumes the scoring log (strategy-worker output) into the backend
    queue, and produces new request batches into the spider feed.  The
    scheduling of these three activities is delegated to a ``Slot``.
    """

    def __init__(self, settings, no_batches, no_incoming):
        """
        :param settings: frontier settings object.
        :param no_batches: disable new-batch generation when true.
        :param no_incoming: disable spider-log consumption when true.
        """
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        # The DB worker consumes all spider-log partitions (partition_id=None).
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        # The scoring log exists only in the distributed (strategy-enabled)
        # setup.
        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
        # Spider-feed partition key: request hostname or request fingerprint.
        self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)

    def set_process_info(self, process_info):
        # Stored for external use; not read elsewhere in this class.
        self.process_info = process_info

    def run(self):
        """Start slot scheduling, stats logging and the reactor (blocking)."""
        def debug(sig, frame):
            # SIGUSR1 handler: dump the current stack for live debugging.
            logger.critical("Signal received: printing stack trace")
            logger.critical(str("").join(format_stack(frame)))

        self.slot.schedule(on_start=True)
        self._logging_task.start(30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def stop(self):
        """Shutdown hook registered with the reactor."""
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def log_status(self):
        """Log every collected stat (invoked periodically via _logging_task)."""
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def disable_new_batches(self):
        # Toggle consumed by the Slot on its next scheduling pass.
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False

    def consume_incoming(self, *args, **kwargs):
        """Consume one batch from the spider log into the backend.

        Handles seeds, crawl results, request errors and per-partition
        offset reports (used for spider-feed back-pressure).

        :return: number of messages consumed (decoding failures included).
        """
        consumed = 0
        for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]  # NOTE(review): shadows the builtin `type` locally
                if type == 'add_seeds':
                    _, seeds = msg
                    logger.info('Adding %i seeds', len(seeds))
                    for seed in seeds:
                        logger.debug('URL: %s', seed.url)
                    self._backend.add_seeds(seeds)
                if type == 'page_crawled':
                    _, response, links = msg
                    logger.debug("Page crawled %s", response.url)
                    # Drop events tagged with a different (stale) job id.
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self._backend.page_crawled(response, links)
                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    logger.debug("Request error %s", request.url)
                    self._backend.request_error(request, error)
                if type == 'offset':
                    # The spider reports how far it has read its feed
                    # partition; compare against our producer offset to
                    # measure lag and apply back-pressure.
                    _, partition_id, offset = msg
                    try:
                        producer_offset = self.spider_feed_producer.get_offset(partition_id)
                    except KeyError:
                        continue
                    else:
                        lag = producer_offset - offset
                        if lag < 0:
                            # non-sense in general, happens when SW is restarted and not synced yet with Spiders.
                            continue
                        if lag < self.max_next_requests or offset == 0:
                            self.spider_feed.mark_ready(partition_id)
                        else:
                            self.spider_feed.mark_busy(partition_id)
            finally:
                # Counted for every message, decoding failures included.
                consumed += 1
        """
        # TODO: Think how it should be implemented in DB-worker only mode.
        if not self.strategy_enabled and self._backend.finished():
            logger.info("Crawling is finished.")
            reactor.stop()
        """
        self.stats['consumed_since_start'] += consumed
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.slot.schedule()
        return consumed

    def consume_scoring(self, *args, **kwargs):
        """Consume one batch from the scoring log into the backend queue.

        Deduplicates score updates by fingerprint within the batch and
        watches for 'new_job_id' announcements.
        """
        consumed = 0
        seen = set()
        batch = []
        for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                if msg[0] == 'update_score':
                    _, fprint, score, url, schedule = msg
                    # First update for a fingerprint wins within this batch.
                    if fprint not in seen:
                        batch.append((fprint, score, Request(url), schedule))
                    seen.add(fprint)
                if msg[0] == 'new_job_id':
                    self.job_id = msg[1]
            finally:
                consumed += 1
        self.queue.schedule(batch)

        self.stats['consumed_scoring_since_start'] += consumed
        self.stats['last_consumed_scoring'] = consumed
        self.stats['last_consumption_run_scoring'] = asctime()
        self.slot.schedule()

    def new_batch(self, *args, **kwargs):
        """Pull requests from the backend and push them to the spider feed.

        Only partitions currently marked ready are served.  The partition
        key is the request hostname or its fingerprint, depending on
        ``spider_feed_partitioning``.

        :return: number of requests processed (encoding failures included).
        """
        def get_hostname(request):
            # Partition key by hostname; None when the URL is unparseable.
            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'], request.url))
                return None
            else:
                return name.encode('utf-8', 'ignore')

        def get_fingerprint(request):
            return request.meta['fingerprint']

        partitions = self.spider_feed.available_partitions()
        logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions)))
        if not partitions:
            return 0
        count = 0
        if self.spider_feed_partitioning == 'hostname':
            get_key = get_hostname
        elif self.spider_feed_partitioning == 'fingerprint':
            get_key = get_fingerprint
        else:
            raise Exception("Unexpected value in self.spider_feed_partitioning")

        for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, request.meta['fingerprint'], request.url))
                continue
            finally:
                # NOTE(review): incremented even when encoding failed, so the
                # pushed_since_start stat may overstate actual sends.
                count += 1
            self.spider_feed_producer.send(get_key(request), eo)
        self.stats['pushed_since_start'] += count
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count