Example #1
class StrategyWorker(object):
    def __init__(self, settings, strategy_class):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set or isn't an integer.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {}
        self.job_id = 0
        self.task = LoopingCall(self.work)

    def work(self):
        # Collecting batch to process
        consumed = 0
        batch = []
        for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                batch.append(msg)
                if type == 'add_seeds':
                    _, seeds = msg
                    self.states_context.to_fetch(seeds)
                    continue

                if type == 'page_crawled':
                    _, response, links = msg
                    self.states_context.to_fetch(response)
                    self.states_context.to_fetch(links)
                    continue

                if type == 'request_error':
                    _, request, error = msg
                    self.states_context.to_fetch(request)
                    continue

                if type == 'offset':
                    continue
                raise TypeError('Unknown message type %s' % type)
            finally:
                consumed += 1
Example #2
class StrategyWorker(object):
    def __init__(self, settings, strategy_module):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set or isn't an integer.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                batch.append(msg)
                if type == 'add_seeds':
                    _, seeds = msg
                    fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds))
                    continue

                if type == 'page_crawled':
                    _, response, links = msg
                    fingerprints.add(response.meta['fingerprint'])
                    fingerprints.update(map(lambda x: x.meta['fingerprint'], links))
                    continue

                if type == 'request_error':
                    _, request, error = msg
                    fingerprints.add(request.meta['fingerprint'])
                    continue

                if type == 'offset':
                    continue
                raise TypeError('Unknown message type %s' % type)
            finally:
                consumed += 1
Example #3
class StrategyWorker(object):
    def __init__(self, settings, strategy_module):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set or isn't an integer.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_module.CrawlingStrategy()
        self.states = self._manager.backend.states
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0
        self.task = LoopingCall(self.work)

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        for m in self.consumer.get_messages(count=self.consumer_batch_size,
                                            timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                batch.append(msg)
                if type == 'add_seeds':
                    _, seeds = msg
                    fingerprints.update(
                        map(lambda x: x.meta['fingerprint'], seeds))
                    continue

                if type == 'page_crawled':
                    _, response, links = msg
                    fingerprints.add(response.meta['fingerprint'])
                    fingerprints.update(
                        map(lambda x: x.meta['fingerprint'], links))
                    continue

                if type == 'request_error':
                    _, request, error = msg
                    fingerprints.add(request.meta['fingerprint'])
                    continue

                if type == 'offset':
                    continue
                raise TypeError('Unknown message type %s' % type)
            finally:
                consumed += 1
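
Examples #2 and #3 collect state fingerprints inline in work(), while Examples #1, #4 and #5 move that bookkeeping behind a StatesContext object with to_fetch(), fetch() and release() methods. A minimal sketch of such a context, inferred from those calls and the inline fingerprint logic above, could look like the following. The method bodies are assumptions, not Frontera's actual StatesContext; in particular, it assumes the backend states object exposes fetch() and flush() in addition to the set_states()/update_cache() calls visible in Example #5.

class StatesContext(object):
    """Plausible sketch: batch fingerprints, then resolve them in one call."""

    def __init__(self, states):
        self._states = states       # backend states object (assumed API)
        self._fingerprints = set()

    def to_fetch(self, requests):
        # The worker passes both single objects and iterables.
        if not hasattr(requests, '__iter__'):
            requests = [requests]
        self._fingerprints.update(r.meta['fingerprint'] for r in requests)

    def fetch(self):
        # Resolve every collected fingerprint against the backend states.
        self._states.fetch(self._fingerprints)
        self._fingerprints.clear()

    def release(self):
        # Flush cached state changes back to the backend.
        self._states.flush()
        self._fingerprints.clear()
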
Example #4
class StrategyWorker(object):
    def __init__(self, settings, strategy_class):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set or isn't an integer.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id,
                                            type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings,
                                                      strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.update_score = UpdateScoreStream(self._encoder,
                                              self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager,
                                                   self.update_score,
                                                   self.states_context)
        self.states = self._manager.backend.states
        self.stats = {'consumed_since_start': 0}
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        logger.info(
            "Strategy worker is initialized and consuming partition %d",
            partition_id)

    def work(self):
        # Collecting batch to process
        consumed = 0
        batch = []
        for m in self.consumer.get_messages(count=self.consumer_batch_size,
                                            timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error:")
                logger.exception(e)
                logger.debug("Message %s", hexlify(m))
                continue
            else:
                type = msg[0]
                batch.append(msg)
                try:
                    if type == 'add_seeds':
                        _, seeds = msg
                        self.states_context.to_fetch(seeds)
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        self.states_context.to_fetch(response)
                        self.states_context.to_fetch(links)
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        self.states_context.to_fetch(request)
                        continue

                    if type == 'offset':
                        continue
                    raise TypeError('Unknown message type %s' % type)
                except Exception as exc:
                    logger.exception(exc)
            finally:
                consumed += 1
Example #5
class StrategyWorker(object):
    def __init__(self, settings, strategy_class):
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set or isn't an integer.")

        messagebus = load_object(settings.get('MESSAGE_BUS'))
        mb = messagebus(settings)
        spider_log = mb.spider_log()
        scoring_log = mb.scoring_log()
        self.consumer = spider_log.consumer(partition_id=partition_id, type='sw')
        self.scoring_log_producer = scoring_log.producer()

        self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024)
        self.states_context = StatesContext(self._manager.backend.states)

        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
        self.states = self._manager.backend.states
        self.stats = {
            'consumed_since_start': 0
        }
        self.job_id = 0
        self.task = LoopingCall(self.work)
        self._logging_task = LoopingCall(self.log_status)
        logger.info("Strategy worker is initialized and consuming partition %d", partition_id)

    def work(self):
        # Collecting batch to process
        consumed = 0
        batch = []
        for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error:")
                logger.exception(e)
                logger.debug("Message %s", hexlify(m))
                continue
            else:
                type = msg[0]
                batch.append(msg)
                try:
                    if type == 'add_seeds':
                        _, seeds = msg
                        self.states_context.to_fetch(seeds)
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        self.states_context.to_fetch(response)
                        self.states_context.to_fetch(links)
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        self.states_context.to_fetch(request)
                        continue

                    if type == 'offset':
                        continue
                    raise TypeError('Unknown message type %s' % type)
                except Exception as exc:
                    logger.exception(exc)
            finally:
                consumed += 1

        # Fetching states
        self.states_context.fetch()

        # Batch processing
        for msg in batch:
            type = msg[0]
            try:
                if type == 'add_seeds':
                    _, seeds = msg
                    for seed in seeds:
                        seed.meta['jid'] = self.job_id
                    self.on_add_seeds(seeds)
                    continue

                if type == 'page_crawled':
                    _, response, links = msg
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self.on_page_crawled(response, links)
                    continue

                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    self.on_request_error(request, error)
                    continue
            except Exception as exc:
                logger.exception(exc)

        self.update_score.flush()
        self.states_context.release()

        # Exiting, if crawl is finished
        if self.strategy.finished():
            logger.info("Successfully reached the crawling goal.")
            logger.info("Closing crawling strategy.")
            self.strategy.close()
            logger.info("Finishing.")
            reactor.callFromThread(reactor.stop)

        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.stats['consumed_since_start'] += consumed

    def run(self):
        def errback(failure):
            logger.exception(failure.value)
            if failure.frames:
                logger.critical("".join(format_tb(failure.getTracebackObject())))
            self.task.start(interval=0).addErrback(errback)

        def debug(sig, frame):
            logger.critical("Signal received: printing stack trace")
            logger.critical("".join(format_stack(frame)))

        self.task.start(interval=0).addErrback(errback)
        self._logging_task.start(interval=30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def log_status(self):
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def stop(self):
        logger.info("Closing crawling strategy.")
        self.strategy.close()
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def on_add_seeds(self, seeds):
        logger.debug('Adding %i seeds', len(seeds))
        self.states.set_states(seeds)
        self.strategy.add_seeds(seeds)
        self.states.update_cache(seeds)

    def on_page_crawled(self, response, links):
        logger.debug("Page crawled %s", response.url)
        objs_list = [response]
        objs_list.extend(links)
        self.states.set_states(objs_list)
        self.strategy.page_crawled(response, links)
        self.states.update_cache(links)
        self.states.update_cache(response)

    def on_request_error(self, request, error):
        logger.debug("Page error %s (%s)", request.url, error)
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)
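
Examples #1, #4 and #5 pin down the strategy contract the worker drives: a from_worker() factory plus add_seeds(), page_crawled(), page_error(), finished() and close(). A minimal strategy honoring that contract might look like the sketch below; the stopping condition and the callback bodies are illustrative assumptions, not Frontera's BaseCrawlingStrategy.

class TenPagesStrategy(object):
    """Hypothetical strategy: stop the crawl after ten crawled pages."""

    def __init__(self, manager, update_score, states_context):
        self.manager = manager
        self.update_score = update_score      # UpdateScoreStream from the worker
        self.states_context = states_context
        self.crawled = 0

    @classmethod
    def from_worker(cls, manager, update_score, states_context):
        # Matches strategy_class.from_worker(...) in StrategyWorker.__init__.
        return cls(manager, update_score, states_context)

    def add_seeds(self, seeds):
        pass  # e.g. emit initial scores for the seeds here

    def page_crawled(self, response, links):
        self.crawled += 1

    def page_error(self, request, error):
        pass  # e.g. deprioritize or retry the failed request

    def finished(self):
        return self.crawled >= 10  # assumed crawling goal

    def close(self):
        pass  # release any strategy-owned resources
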
Example #6
class DBWorker(object):
    def __init__(self, settings, no_batches, no_incoming):
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'hostname' if settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'fingerprint'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)

    def set_process_info(self, process_info):
        self.process_info = process_info

    def run(self):
        self.slot.schedule(on_start=True)
        self._logging_task.start(30)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def stop(self):
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def log_status(self):
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def disable_new_batches(self):
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False

    def consume_incoming(self, *args, **kwargs):
        consumed = 0
        for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                if type == 'add_seeds':
                    _, seeds = msg
                    logger.info('Adding %i seeds', len(seeds))
                    for seed in seeds:
                        logger.debug('URL: %s', seed.url)
                    self._backend.add_seeds(seeds)
                if type == 'page_crawled':
                    _, response, links = msg
                    logger.debug("Page crawled %s", response.url)
                    if response.meta['jid'] != self.job_id:
                        continue
                    self._backend.page_crawled(response, links)
                if type == 'request_error':
                    _, request, error = msg
                    if request.meta['jid'] != self.job_id:
                        continue
                    logger.debug("Request error %s", request.url)
                    self._backend.request_error(request, error)
                if type == 'offset':
                    _, partition_id, offset = msg
                    try:
                        producer_offset = self.spider_feed_producer.get_offset(partition_id)
                    except KeyError:
                        continue
                    else:
                        lag = producer_offset - offset
                        if lag < 0:
                            # A negative lag makes no sense in general; it happens when the SW is restarted and not yet synced with the spiders.
                            continue
                        if lag < self.max_next_requests or offset == 0:
                            self.spider_feed.mark_ready(partition_id)
                        else:
                            self.spider_feed.mark_busy(partition_id)
            finally:
                consumed += 1
Example #7
class DBWorker(object):
    def __init__(self, settings, no_batches, no_incoming):
        messagebus = load_object(settings.get('MESSAGE_BUS'))
        self.mb = messagebus(settings)
        spider_log = self.mb.spider_log()

        self.spider_feed = self.mb.spider_feed()
        self.spider_log_consumer = spider_log.consumer(partition_id=None, type='db')
        self.spider_feed_producer = self.spider_feed.producer()

        self._manager = FrontierManager.from_settings(settings, db_worker=True)
        self._backend = self._manager.backend
        self._encoder = Encoder(self._manager.request_model)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)

        if isinstance(self._backend, DistributedBackend):
            scoring_log = self.mb.scoring_log()
            self.scoring_log_consumer = scoring_log.consumer()
            self.queue = self._backend.queue
            self.strategy_enabled = True
        else:
            self.strategy_enabled = False

        self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
        self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE')
        self.spider_feed_partitioning = 'hostname' if settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'fingerprint'
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches,
                         self.strategy_enabled, settings.get('NEW_BATCH_DELAY'), no_incoming)
        self.job_id = 0
        self.stats = {
            'consumed_since_start': 0,
            'consumed_scoring_since_start': 0,
            'pushed_since_start': 0
        }
        self._logging_task = task.LoopingCall(self.log_status)

    def set_process_info(self, process_info):
        self.process_info = process_info

    def run(self):
        def debug(sig, frame):
            logger.critical("Signal received: printing stack trace")
            logger.critical("".join(format_stack(frame)))

        self.slot.schedule(on_start=True)
        self._logging_task.start(30)
        signal(SIGUSR1, debug)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()

    def stop(self):
        logger.info("Stopping frontier manager.")
        self._manager.stop()

    def log_status(self):
        for k, v in six.iteritems(self.stats):
            logger.info("%s=%s", k, v)

    def disable_new_batches(self):
        self.slot.disable_new_batches = True

    def enable_new_batches(self):
        self.slot.disable_new_batches = False

    def consume_incoming(self, *args, **kwargs):
        consumed = 0
        for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                type = msg[0]
                if type == 'add_seeds':
                    _, seeds = msg
                    logger.info('Adding %i seeds', len(seeds))
                    for seed in seeds:
                        logger.debug('URL: %s', seed.url)
                    self._backend.add_seeds(seeds)
                if type == 'page_crawled':
                    _, response, links = msg
                    logger.debug("Page crawled %s", response.url)
                    if 'jid' not in response.meta or response.meta['jid'] != self.job_id:
                        continue
                    self._backend.page_crawled(response, links)
                if type == 'request_error':
                    _, request, error = msg
                    if 'jid' not in request.meta or request.meta['jid'] != self.job_id:
                        continue
                    logger.debug("Request error %s", request.url)
                    self._backend.request_error(request, error)
                if type == 'offset':
                    _, partition_id, offset = msg
                    try:
                        producer_offset = self.spider_feed_producer.get_offset(partition_id)
                    except KeyError:
                        continue
                    else:
                        lag = producer_offset - offset
                        if lag < 0:
                            # A negative lag makes no sense in general; it happens when the SW is restarted and not yet synced with the spiders.
                            continue
                        if lag < self.max_next_requests or offset == 0:
                            self.spider_feed.mark_ready(partition_id)
                        else:
                            self.spider_feed.mark_busy(partition_id)
            finally:
                consumed += 1
        """
        # TODO: Think how it should be implemented in DB-worker only mode.
        if not self.strategy_enabled and self._backend.finished():
            logger.info("Crawling is finished.")
            reactor.stop()
        """
        self.stats['consumed_since_start'] += consumed
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()
        self.slot.schedule()
        return consumed

    def consume_scoring(self, *args, **kwargs):
        consumed = 0
        seen = set()
        batch = []
        for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size):
            try:
                msg = self._decoder.decode(m)
            except (KeyError, TypeError) as e:
                logger.error("Decoding error: %s", e)
                continue
            else:
                if msg[0] == 'update_score':
                    _, fprint, score, url, schedule = msg
                    if fprint not in seen:
                        batch.append((fprint, score, Request(url), schedule))
                    seen.add(fprint)
                if msg[0] == 'new_job_id':
                    self.job_id = msg[1]
            finally:
                consumed += 1
        self.queue.schedule(batch)

        self.stats['consumed_scoring_since_start'] += consumed
        self.stats['last_consumed_scoring'] = consumed
        self.stats['last_consumption_run_scoring'] = asctime()
        self.slot.schedule()

    def new_batch(self, *args, **kwargs):
        def get_hostname(request):
            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'],
                                                                               request.url))
                return None
            else:
                return name.encode('utf-8', 'ignore')

        def get_fingerprint(request):
            return request.meta['fingerprint']

        partitions = self.spider_feed.available_partitions()
        logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions)))
        if not partitions:
            return 0

        count = 0
        if self.spider_feed_partitioning == 'hostname':
            get_key = get_hostname
        elif self.spider_feed_partitioning == 'fingerprint':
            get_key = get_fingerprint
        else:
            raise Exception("Unexpected value in self.spider_feed_partitioning")

        for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e,
                                                                               request.meta['fingerprint'],
                                                                               request.url))
                continue
            finally:
                count += 1
            self.spider_feed_producer.send(get_key(request), eo)

        self.stats['pushed_since_start'] += count
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count
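
For completeness, a minimal launch sketch for the DBWorker above. It assumes a Frontera Settings object carrying the keys read in __init__ (MESSAGE_BUS, SPIDER_LOG_CONSUMER_BATCH_SIZE, SCORING_LOG_CONSUMER_BATCH_SIZE, MAX_NEXT_REQUESTS, NEW_BATCH_DELAY, QUEUE_HOSTNAME_PARTITIONING); the ZeroMQ message bus path and the numeric values are plausible choices, not the only options.

from frontera.settings import Settings

settings = Settings()
settings.set('MESSAGE_BUS', 'frontera.contrib.messagebus.zeromq.MessageBus')
settings.set('SPIDER_LOG_CONSUMER_BATCH_SIZE', 512)
settings.set('SCORING_LOG_CONSUMER_BATCH_SIZE', 512)
settings.set('MAX_NEXT_REQUESTS', 256)
settings.set('NEW_BATCH_DELAY', 30.0)

# no_batches/no_incoming toggle batch generation and spider log
# consumption, matching the Slot constructor arguments above.
worker = DBWorker(settings, no_batches=False, no_incoming=False)
worker.run()  # blocks in the Twisted reactor until stopped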