Esempio n. 1
0
def setup_environment():
    """Prepare the strategy worker environment from command line options.

    Parses the command line, loads the settings module named by ``--config``,
    resolves the crawling strategy class and the scoring partition id (the
    command line value wins over the settings value for both), stores the
    partition id back into the settings and configures logging.

    Returns:
        A ``(settings, strategy_class)`` tuple.

    Raises:
        ValueError: if no strategy class path can be found, or if the
            partition id is negative or not lower than the
            ``SPIDER_LOG_PARTITIONS`` setting.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    cli.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument(
        '--strategy', type=str,
        help='Crawling strategy class path')
    cli.add_argument(
        '--partition-id', type=int,
        help="Instance partition id.")
    options = cli.parse_args()
    settings = Settings(module=options.config)

    # Strategy class: command line first, settings file as a fallback.
    classpath = options.strategy or settings.get('CRAWLING_STRATEGY')
    if not classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    strategy_class = load_object(classpath)

    # Partition id: an explicit command line value (including 0) wins.
    if options.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = options.partition_id
    partition_count = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partition_count or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Prefer a logging configuration file when one is configured and present.
    config_file = settings.get("LOGGING_CONFIG")
    if config_file and exists(config_file):
        fileConfig(config_file)
        return settings, strategy_class

    logging.basicConfig(level=options.log_level)
    logger.setLevel(options.log_level)
    logger.addHandler(CONSOLE)
    return settings, strategy_class
Esempio n. 2
0
def main():
    """
    Read the command line, load the settings, then create and start the server.

    ``--address`` and ``--port`` override the ``ZMQ_ADDRESS`` and
    ``ZMQ_BASE_PORT`` settings when given; the server logger level is taken
    from ``--log-level``.
    """
    cli = ArgumentParser(description="Crawl frontier worker.")
    cli.add_argument('--config', type=str,
                     help='Settings module name, should be accessible by import.')
    cli.add_argument('--address', type=str,
                     help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1'
                          '. When binding to wildcard it defaults to IPv4.')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is'
                          ' INFO.')
    cli.add_argument('--port', type=int,
                     help='Base port number, server will bind to 6 ports starting from base'
                          '. Default is 5550')
    options = cli.parse_args()

    settings = Settings(module=options.config)
    # Any falsy command line value falls back to the configured setting.
    bind_address = options.address or settings.get("ZMQ_ADDRESS")
    base_port = options.port or settings.get("ZMQ_BASE_PORT")

    server = Server(bind_address, base_port)
    server.logger.setLevel(options.log_level)
    server.start()
Esempio n. 3
0
 def sw_setup_filtered_links(self):
     """Return a StrategyWorker built around ``FilteredLinksCrawlingStrategy``.

     The settings select the SQLAlchemy ``Distributed`` backend, the fake
     message bus from the test mocks and a spider log consumer batch size
     of 100.
     """
     worker_settings = Settings()
     overrides = (
         ('BACKEND', 'frontera.contrib.backends.sqlalchemy.Distributed'),
         ('MESSAGE_BUS', 'tests.mocks.message_bus.FakeMessageBus'),
         ('SPIDER_LOG_CONSUMER_BATCH_SIZE', 100),
     )
     for option, value in overrides:
         setattr(worker_settings, option, value)
     return StrategyWorker(
         worker_settings, FilteredLinksCrawlingStrategy, None, None)
 def sw_setup_add_seeds(self):
     """Return a StrategyWorker whose strategy comes from the settings.

     The settings select the SQLAlchemy ``Distributed`` backend, the fake
     message bus, a spider log consumer batch size of 100 and the mock
     ``CrawlingStrategy``; the worker receives ``True`` as its second
     argument.
     """
     worker_settings = Settings()
     overrides = (
         ('BACKEND', 'frontera.contrib.backends.sqlalchemy.Distributed'),
         ('MESSAGE_BUS', 'tests.mocks.message_bus.FakeMessageBus'),
         ('SPIDER_LOG_CONSUMER_BATCH_SIZE', 100),
         ('STRATEGY', 'tests.mocks.components.CrawlingStrategy'),
     )
     for option, value in overrides:
         setattr(worker_settings, option, value)
     return StrategyWorker(worker_settings, True)
Esempio n. 5
0
    def __init__(self, cls, settings=None):
        """Create the message bus and every producer/consumer used by the tester.

        Args:
            cls: message bus class; it is called with the settings object.
            settings: settings to use. When omitted a fresh ``Settings()`` is
                created for this instance.

        The settings are forced to a single spider feed partition, a single
        spider log partition and the ``Crc32NamePartitioner`` spider feed
        partitioner before the bus is created.
        """
        # A ``Settings()`` default in the signature is evaluated only once, so
        # the ``set`` calls below would mutate one object shared by every
        # tester created without explicit settings.
        if settings is None:
            settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('SPIDER_LOG_PARTITIONS', 1)
        settings.set(
            'SPIDER_FEED_PARTITIONER',
            'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
        self.messagebus = cls(settings)
        spiderlog = self.messagebus.spider_log()

        # sw (presumably the strategy worker side -- confirm against callers)
        self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b'sw')

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): the pauses presumably give the bus time to settle
        # between groups of endpoints -- confirm.
        sleep(0.1)

        # db (presumably the db worker side)
        self.db_sl_c = spiderlog.consumer(partition_id=None, type=b'db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Esempio n. 6
0
    def __init__(self, cls, settings=None):
        """Create the message bus and every producer/consumer used by the tester.

        Args:
            cls: message bus class; it is called with the settings object.
            settings: settings to use. When omitted a fresh ``Settings()`` is
                created for this instance.

        The settings are forced to a single spider feed partition, a single
        spider log partition and ``QUEUE_HOSTNAME_PARTITIONING`` enabled
        before the bus is created.
        """
        # A ``Settings()`` default in the signature is evaluated only once, so
        # the ``set`` calls below would mutate one object shared by every
        # tester created without explicit settings.
        if settings is None:
            settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('SPIDER_LOG_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = cls(settings)
        spiderlog = self.messagebus.spider_log()

        # sw (presumably the strategy worker side -- confirm against callers)
        self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b'sw')

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): the pauses presumably give the bus time to settle
        # between groups of endpoints -- confirm.
        sleep(0.1)

        # db (presumably the db worker side)
        self.db_sl_c = spiderlog.consumer(partition_id=None, type=b'db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Esempio n. 7
0
 def strategy(self):
     """Return a ``TestingCrawlingStrategy`` wired to fake collaborators.

     The strategy is created through ``from_worker`` with a
     ``FakeFrontierManager`` on default settings, a ``MessageBusStream`` and
     a ``StatesContext`` wrapping ``MemoryStates(10)``.
     """
     fake_manager = FakeFrontierManager(Settings())
     # Arguments are evaluated left to right, matching the original
     # construction order: stream first, then the states and their context.
     return TestingCrawlingStrategy.from_worker(
         fake_manager,
         MessageBusStream(),
         StatesContext(MemoryStates(10)))
    def __init__(self):
        """Create a ``MessageBus`` and the producers/consumers used by the tester.

        The bus is built from default settings with a single spider feed
        partition and ``QUEUE_HOSTNAME_PARTITIONING`` enabled.
        """
        bus_settings = Settings()
        bus_settings.set('SPIDER_FEED_PARTITIONS', 1)
        bus_settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.mb = MessageBus(bus_settings)
        spider_log = self.mb.spider_log()

        # sw endpoints: spider log consumer on partition 0 + scoring log producer
        self.sw_sl_c = spider_log.consumer(partition_id=0, type='sw')
        scoring_log = self.mb.scoring_log()
        self.sw_us_p = scoring_log.producer()

        sleep(0.1)

        # db endpoints: spider log / scoring log consumers + spider feed producer
        self.db_sl_c = spider_log.consumer(partition_id=None, type='db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.mb.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider endpoints: spider log producer + spider feed consumer
        self.sp_sl_p = spider_log.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
    def test_blocking_middleware(self):
        """Check that events stop at ``FakeMiddlewareBlocking`` in the chain.

        Seeds, a crawled page, extracted links and a request error are sent
        through a local frontier manager; the assertions verify what each
        middleware, the backend and the canonical solver recorded.
        """
        settings = Settings()
        settings.BACKEND = 'tests.mocks.components.FakeBackend'
        settings.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks'
        ]
        settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        frontier = LocalFrontierManager.from_settings(settings)

        SEEDS_FILE.seek(0)
        frontier.add_seeds(SEEDS_FILE)
        frontier.page_crawled(Response(r1.url, request=r1))
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        # Nothing (seeds, responses, links, errors) reached the backend.
        backend_sizes = [len(items) for items in frontier.backend.lists]
        assert backend_sizes == [0, 0, 0, 0]

        # The 3 seeds reached the middlewares at indexes 2, 3 and 4.
        seed_counts = [len(frontier.middlewares[index].requests)
                       for index in range(2, 5)]
        assert seed_counts == [3, 3, 3]

        # The error, response and link reached those same middlewares.
        event_counts = [
            [len(items) for items in frontier.middlewares[index].lists[1:]]
            for index in range(2, 5)
        ]
        assert event_counts == [[1, 1, 1]] * 3

        # Nothing reached the last 2 middlewares or the canonical solver.
        tail_counts = [
            [len(items) for items in frontier.middlewares[index].lists]
            for index in range(5, 7)
        ]
        assert tail_counts == [[0, 0, 0, 0]] * 2
        solver_sizes = [len(items) for items in frontier.canonicalsolver.lists]
        assert solver_sizes == [0, 0, 0, 0]
Esempio n. 10
0
 def test_close_manager(self):
     """Build a worker manager on the Redis backend, check it, then close it."""
     redis_settings = Settings(module='frontera.settings.default_settings')
     redis_settings.set(
         'BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend')
     frontier = WorkerFrontierManager.from_settings(
         redis_settings, strategy_worker=True)
     backend_class = frontier.backend.__class__
     self.assertEqual(RedisBackend, backend_class)
     frontier.close()
Esempio n. 11
0
 def strategy(self):
     """Return a ``TestingCrawlingStrategy`` backed by a strategy-worker manager.

     The manager is created from settings that select the SQLAlchemy
     ``Distributed`` backend, with ``db_worker=False`` and
     ``strategy_worker=True``.
     """
     worker_settings = Settings()
     worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     frontier = FrontierManager.from_settings(
         worker_settings, db_worker=False, strategy_worker=True)
     # Arguments are evaluated left to right, matching the original
     # construction order: stream first, then the states and their context.
     return TestingCrawlingStrategy.from_worker(
         frontier,
         None,
         MessageBusStream(),
         StatesContext(MemoryStates(10)))
 def mbb_setup(self, settings=None):
     """Return a ``MessageBusBackend`` attached to a minimal stand-in manager.

     ``settings`` (a new ``Settings()`` when falsy) is pointed at the fake
     message bus with ``STORE_CONTENT`` enabled. The manager is a bare class
     exposing only ``settings``, ``request_model`` and ``response_model``.
     """
     manager = type('manager', (object, ), {})
     if not settings:
         settings = Settings()
     settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     settings.STORE_CONTENT = True
     for attribute, value in (('settings', settings),
                              ('request_model', Request),
                              ('response_model', Response)):
         setattr(manager, attribute, value)
     return MessageBusBackend(manager)
Esempio n. 13
0
 def dbw_setup(self, distributed=False):
     """Return a ``DBWorker`` running on fake components.

     ``distributed`` selects ``FakeDistributedBackend`` instead of
     ``FakeBackend``. The settings also use the fake message bus and set
     ``MAX_NEXT_REQUESTS`` to 64.
     """
     worker_settings = Settings()
     worker_settings.MAX_NEXT_REQUESTS = 64
     worker_settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     worker_settings.BACKEND = (
         'tests.mocks.components.FakeDistributedBackend' if distributed
         else 'tests.mocks.components.FakeBackend')
     return DBWorker(worker_settings, False, False, False, partitions="0")
 def strategy(self):
     """Return the strategy instance owned by a strategy-worker manager.

     The manager is created from settings that select the SQLAlchemy
     ``Distributed`` backend and ``DummyCrawlingStrategy``.
     """
     worker_settings = Settings()
     worker_settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     worker_settings.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy'
     frontier = WorkerFrontierManager.from_settings(
         worker_settings, db_worker=False, strategy_worker=True)
     # These objects are built but never used by this method; they are kept
     # because their constructors' side effects are not visible from here.
     _stream = MessageBusStream()
     _states_ctx = StatesContext(MemoryStates(10))
     return frontier.strategy
Esempio n. 15
0
 def test_max_requests_reached(self):
     """With ``MAX_REQUESTS = 2`` only two of three requests are served.

     The first batch must be some pair out of ``r1``, ``r2``, ``r3``; the
     next batch is empty and the manager reports itself finished.
     """
     limited_settings = Settings()
     limited_settings.MAX_REQUESTS = 2
     frontier = self.setup_frontier_manager(limited_settings)
     frontier.backend.put_requests([r1, r2, r3])

     first_batch = set(frontier.get_next_requests(10))
     acceptable_pairs = ({r1, r2}, {r2, r3}, {r1, r3})
     assert any(first_batch == pair for pair in acceptable_pairs)

     assert frontier.get_next_requests(10) == []
     assert frontier.finished is True
Esempio n. 16
0
 def setup_frontier_manager(self, settings=None):
     """Return a ``FrontierManager`` assembled from fake components.

     ``settings`` (a new ``Settings()`` when falsy) is overridden with the
     fake backend, four fake middlewares and the fake canonical solver.
     """
     if not settings:
         settings = Settings()
     mocks = 'tests.mocks.components.'
     settings.BACKEND = mocks + 'FakeBackend'
     settings.MIDDLEWARES = [
         mocks + middleware for middleware in (
             'FakeMiddleware',
             'FakeMiddlewareModifySeeds',
             'FakeMiddlewareModifyResponse',
             'FakeMiddlewareModifyLinks',
         )
     ]
     settings.CANONICAL_SOLVER = mocks + 'FakeCanonicalSolver'
     return FrontierManager.from_settings(settings)
 def mbb_setup(self, settings=None):
     """Return a ``MessageBusBackend`` attached to a minimal stand-in manager.

     ``settings`` (a new ``Settings()`` when falsy) is pointed at the fake
     message bus with ``STORE_CONTENT`` enabled. The manager is a bare class
     exposing only ``settings``, ``request_model`` and ``response_model``.
     """
     manager = type('manager', (object, ), {})
     if not settings:
         settings = Settings()
     settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     settings.STORE_CONTENT = True
     # To exercise the JSON codec instead, enable:
     # settings.MESSAGE_BUS_CODEC='frontera.contrib.backends.remote.codecs.json'
     for attribute, value in (('settings', settings),
                              ('request_model', Request),
                              ('response_model', Response)):
         setattr(manager, attribute, value)
     return MessageBusBackend(manager)
Esempio n. 18
0
    def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero(self):
        """``mbb_setup`` must raise ``ValueError`` for out-of-range partition ids.

        With a single spider feed partition, the ids 2 (greater than the
        partition count), 1 (equal to it) and -1 (negative) are all rejected.
        """
        settings = Settings()
        settings.SPIDER_FEED_PARTITIONS = 1
        for invalid_partition_id in (2, 1, -1):
            settings.SPIDER_PARTITION_ID = invalid_partition_id
            with self.assertRaises(ValueError):
                self.mbb_setup(settings)
Esempio n. 19
0
 def setup_frontier_manager(self, settings=None):
     """Return a ``LocalFrontierManager`` assembled mostly from fake components.

     ``settings`` (a new ``Settings()`` when falsy) is overridden with the
     fake backend, the domain and URL fingerprint middlewares followed by
     four fake middlewares, the fake canonical solver and the mock crawling
     strategy.
     """
     if not settings:
         settings = Settings()
     mocks = 'tests.mocks.components.'
     settings.BACKEND = mocks + 'FakeBackend'
     real_middlewares = [
         'frontera.contrib.middlewares.domain.DomainMiddleware',
         'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
     ]
     fake_middlewares = [
         mocks + 'FakeMiddleware',
         mocks + 'FakeMiddlewareModifySeeds',
         mocks + 'FakeMiddlewareModifyResponse',
         mocks + 'FakeMiddlewareModifyLinks',
     ]
     settings.MIDDLEWARES = real_middlewares + fake_middlewares
     settings.CANONICAL_SOLVER = mocks + 'FakeCanonicalSolver'
     settings.STRATEGY = mocks + 'CrawlingStrategy'
     return LocalFrontierManager.from_settings(settings)
Esempio n. 20
0
 def __init__(self, manager):
     """Connect the backend to the message bus named in the manager settings.

     A new ``Settings`` is built from ``manager.settings.attributes``; the
     class named by ``MESSAGE_BUS`` is loaded and instantiated with it. The
     encoder sends bodies according to ``STORE_CONTENT``, the spider feed
     consumer is bound to ``SPIDER_PARTITION_ID`` and the get timeout is
     read from ``KAFKA_GET_TIMEOUT`` as a float.
     """
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)

     bus_class = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_class(settings)

     # Codecs for messages travelling to and from the bus.
     self._encoder = Encoder(manager.request_model,
                             send_body=settings.get('STORE_CONTENT'))
     self._decoder = Decoder(manager.request_model, manager.response_model)

     # Producer side (spider log), then consumer side (spider feed).
     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = feed.consumer(partition_id=self.partition_id)

     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(
         self._get_next_requests, manager.logger.manager.debug)
Esempio n. 21
0
    def seed_loader_setup(self, seeds_content=None):
        """Write a seeds file and return a ``FileSeedLoader`` pointing at it.

        ``seeds_content`` is written UTF-8 encoded to ``seeds.txt`` inside
        ``self.tmp_path``; when it is falsy a two-URL default is used. The
        loader receives a bare ``crawler`` class carrying only ``settings``.
        """
        if not seeds_content:
            seeds_content = (
                "\n"
                "https://www.example.com\n"
                "https://www.scrapy.org\n"
            )
        seed_path = os.path.join(self.tmp_path, 'seeds.txt')
        with open(seed_path, 'wb') as seed_file:
            seed_file.write(seeds_content.encode('utf-8'))
        # Guards the test setup itself, not the code under test.
        assert os.path.isfile(seed_path)

        loader_settings = Settings()
        loader_settings.SEEDS_SOURCE = seed_path
        crawler = type('crawler', (object, ), {})
        setattr(crawler, 'settings', loader_settings)
        return FileSeedLoader(crawler)
Esempio n. 22
0
    def setUp(self):
        """Prepare logging, Kafka topics and all bus endpoints for a test.

        Ensures the three frontier topics exist on the broker at
        ``127.0.0.1:9092``, then builds a ``KafkaMessageBus`` with one spider
        feed partition, one spider log partition and hostname partitioning,
        and wraps every consumer in ``KafkaConsumerPolling``.
        """
        logging.basicConfig()
        kafka_logger = logging.getLogger("kafka")
        kafka_logger.setLevel(logging.INFO)
        kafka_logger.addHandler(logging.StreamHandler(stdout))

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")

        kafka_location = "127.0.0.1:9092"
        client = KafkaClient(kafka_location)
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
        client.close()

        bus_settings = Settings()
        for option, value in (('KAFKA_LOCATION', kafka_location),
                              ('SPIDER_FEED_PARTITIONS', 1),
                              ('SPIDER_LOG_PARTITIONS', 1),
                              ('QUEUE_HOSTNAME_PARTITIONING', True)):
            bus_settings.set(option, value)
        self.messagebus = KafkaMessageBus(bus_settings)
        spider_log = self.messagebus.spider_log()

        # sw endpoints
        sw_consumer = spider_log.consumer(partition_id=0, type=b'sw')
        self.sw_sl_c = KafkaConsumerPolling(sw_consumer)

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # db endpoints
        db_consumer = spider_log.consumer(partition_id=None, type=b'db')
        self.db_sl_c = KafkaConsumerPolling(db_consumer)
        self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        # spider endpoints
        self.sp_sl_p = spider_log.producer()
        feed_consumer = spider_feed.consumer(partition_id=0)
        self.sp_sf_c = KafkaConsumerPolling(feed_consumer)
        self.logger.debug("init is done")
Esempio n. 23
0
def test_kafka_message_bus_integration():
    """Run the message bus tester against a Kafka broker at 127.0.0.1:9092.

    Ensures the three frontier topics exist, then drives a
    ``MessageBusTester`` built on ``KafkaMessageBus`` and checks the message
    counts reported by each activity.
    """
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # The client is only needed to create the topics; close it so the
        # connection is not leaked, even when topic creation fails.
        client.close()

    logging.basicConfig(level=logging.INFO)
    #kafkabus = logging.getLogger("kafkabus")
    #kafkabus.addHandler(logging.StreamHandler())
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
Esempio n. 24
0
 def from_settings(cls, settings=None):
     """
     Create a :class:`FrontierManager <frontera.core.manager.FrontierManager>` configured from *settings*.

     *settings* may be a string path pointing to a settings file or a
     :class:`Settings <frontera.settings.Settings>` instance. When it is omitted the
     :ref:`frontier default settings <frontier-default-settings>` are used.
     """
     manager_settings = Settings(settings)
     # (constructor keyword, settings attribute) pairs, read in this order.
     option_map = (
         ('request_model', 'REQUEST_MODEL'),
         ('response_model', 'RESPONSE_MODEL'),
         ('backend', 'BACKEND'),
         ('logger', 'LOGGER'),
         ('event_log_manager', 'EVENT_LOG_MANAGER'),
         ('middlewares', 'MIDDLEWARES'),
         ('test_mode', 'TEST_MODE'),
         ('max_requests', 'MAX_REQUESTS'),
         ('max_next_requests', 'MAX_NEXT_REQUESTS'),
         ('auto_start', 'AUTO_START'),
     )
     manager_kwargs = {}
     for keyword, attribute in option_map:
         manager_kwargs[keyword] = getattr(manager_settings, attribute)
     manager_kwargs['settings'] = manager_settings
     return FrontierManager(**manager_kwargs)
Esempio n. 25
0
    def __init__(self, request_model, response_model, settings=None):
        """Store the settings, set up logging and load the model classes.

        Args:
            request_model: object path of the request class; the loaded class
                must subclass ``models.Request``.
            response_model: object path of the response class; the loaded
                class must subclass ``models.Response``.
            settings: settings object; a new ``Settings()`` is used when it
                is falsy.
        """
        self._settings = settings if settings else Settings()

        # Announce start-up on the "manager" logger.
        self._logger = logging.getLogger("manager")
        for line in ('-' * 80, 'Starting Frontier Manager...'):
            self._logger.info(line)

        # Request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), (
            "Request model '%s' must subclass 'Request'" % self._request_model.__name__)

        # Response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), (
            "Response model '%s' must subclass 'Response'" % self._response_model.__name__)
Esempio n. 26
0
    def __init__(self, request_model, response_model, logger, settings=None):
        """Store the settings, build the frontier logger and load the models.

        Args:
            request_model: object path of the request class; the loaded class
                must subclass ``models.Request``.
            response_model: object path of the response class; the loaded
                class must subclass ``models.Response``.
            logger: object path of the logger class; it is instantiated with
                the settings and must yield a ``FrontierLogger``.
            settings: settings object; a new ``Settings()`` is used when it
                is falsy.
        """
        self._settings = settings if settings else Settings()

        # Frontier logger, created from the settings
        logger_factory = load_object(logger)
        self._logger = logger_factory(self._settings)
        assert isinstance(self._logger, FrontierLogger), (
            "logger '%s' must subclass FrontierLogger" % self._logger.__class__.__name__)

        # Announce start-up through the manager channel of the logger.
        self.logger.manager.debug('-' * 80)
        self.logger.manager.debug('Starting Frontier Manager...')

        # Request model
        self._request_model = load_object(request_model)
        assert issubclass(self._request_model, models.Request), (
            "Request model '%s' must subclass 'Request'" % self._request_model.__name__)

        # Response model
        self._response_model = load_object(response_model)
        assert issubclass(self._response_model, models.Response), (
            "Response model '%s' must subclass 'Response'" % self._response_model.__name__)
Esempio n. 27
0
    def setUp(self):
        """Create two local seed files and an ``S3SeedLoader`` for the tests.

        The loader receives a bare ``crawler`` class whose settings point at
        ``s3://some-bucket/seeds-folder`` with placeholder AWS credentials.
        The seed files are written UTF-8 encoded into a fresh temporary
        directory stored in ``self.tmp_path``.
        """
        self.tmp_path = mkdtemp()

        loader_settings = Settings()
        loader_settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
        loader_settings.SEEDS_AWS_ACCESS_KEY = 'access_key'
        loader_settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
        crawler = type('crawler', (object, ), {})
        setattr(crawler, 'settings', loader_settings)

        self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
        self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
        seed_files = (
            (self.seed_path_1,
             "\nhttps://www.example.com\nhttps://www.scrapy.org\n"),
            (self.seed_path_2,
             "\nhttps://www.dmoz.org\nhttps://www.test.com\n"),
        )
        for path, content in seed_files:
            with open(path, 'wb') as seed_file:
                seed_file.write(content.encode('utf-8'))

        self.seed_loader = S3SeedLoader(crawler)
Esempio n. 28
0
                request.meta['fingerprint'], score, request.url, False)
            return [encoded]
        return []


if __name__ == '__main__':
    # Script entry point: parse the command line, configure the module
    # logger, load the settings and the strategy module, then run the worker.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str, required=True,
                        help='Crawling strategy module name')
    args = parser.parse_args()

    # Logging goes to the console handler at the requested level.
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)

    # Both the settings and the strategy are given as importable module names.
    settings = Settings(module=args.config)
    strategy_module = import_module(args.strategy)

    worker = StrategyWorker(settings, strategy_module)
    worker.run()
Esempio n. 29
0
 def from_settings(cls, settings=None):
     """Create a ``BaseContext`` from *settings*.

     *settings* is wrapped in ``Settings``; the request and response models
     are read from its ``REQUEST_MODEL`` and ``RESPONSE_MODEL`` attributes.
     """
     resolved = Settings(settings)
     context_kwargs = {
         'request_model': resolved.REQUEST_MODEL,
         'response_model': resolved.RESPONSE_MODEL,
         'settings': resolved,
     }
     return BaseContext(**context_kwargs)
Esempio n. 30
0
 def __init__(self):
     """Initialise the parent tester with ``ZMQ_ADDRESS`` set to ``::1``."""
     ipv6_settings = Settings()
     ipv6_settings.set('ZMQ_ADDRESS', '::1')
     parent = super(IPv6MessageBusTester, self)
     parent.__init__(ipv6_settings)