def main(): """ Parse arguments, set configuration values, then start the broker """ parser = ArgumentParser(description="Crawl frontier worker.") parser.add_argument( '--config', type=str, help='Settings module name, should be accessible by import.') parser.add_argument( '--address', type=str, help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1' '. When binding to wildcard it defaults to IPv4.') parser.add_argument( '--log-level', '-L', type=str, default='INFO', help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is' ' INFO.') parser.add_argument( '--port', type=int, help='Base port number, server will bind to 6 ports starting from base' '. Default is 5550') args = parser.parse_args() settings = Settings(module=args.config) address = args.address if args.address else settings.get("ZMQ_ADDRESS") port = args.port if args.port else settings.get("ZMQ_BASE_PORT") server = Server(address, port) server.logger.setLevel(args.log_level) server.start()
def sw_setup_filtered_links(self):
    """Build a StrategyWorker wired to a fake message bus and the
    link-filtering crawling strategy (configured via classpath)."""
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy'
    cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    return StrategyWorker(cfg, False)
def test_blocking_middleware(self):
    """A blocking middleware must stop events from propagating to the
    middlewares below it, the canonical solver and the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst`` (behavior unchanged).
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                            'tests.mocks.components.FakeMiddlewareModifySeeds',
                            'tests.mocks.components.FakeMiddlewareBlocking',
                            'tests.mocks.components.FakeMiddlewareModifyResponse',
                            'tests.mocks.components.FakeMiddlewareModifyLinks']
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    fm = FrontierManager.from_settings(settings)
    fm.add_seeds([r1, r2, r3])
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(3, 5)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def __init__(self):
    """Wire spider-log / scoring-log / spider-feed endpoints for the three
    roles: strategy worker (sw), db worker (db) and spider (sp)."""
    cfg = Settings()
    cfg.set("SPIDER_FEED_PARTITIONS", 1)
    cfg.set("QUEUE_HOSTNAME_PARTITIONING", True)
    self.mb = MessageBus(cfg)
    spider_log = self.mb.spider_log()
    # strategy worker endpoints
    self.sw_sl_c = spider_log.consumer(partition_id=0, type="sw")
    scoring_log = self.mb.scoring_log()
    self.sw_us_p = scoring_log.producer()
    sleep(0.1)  # short pause so connections settle before the next role
    # db worker endpoints
    self.db_sl_c = spider_log.consumer(partition_id=None, type="db")
    self.db_us_c = scoring_log.consumer()
    spider_feed = self.mb.spider_feed()
    self.db_sf_p = spider_feed.producer()
    sleep(0.1)
    # spider endpoints
    self.sp_sl_p = spider_log.producer()
    self.sp_sf_c = spider_feed.consumer(0)
    sleep(0.1)
def main():
    """Entry point: parse command-line options and start the broker server."""
    cli = ArgumentParser(description="Crawl frontier worker.")
    cli.add_argument(
        '--config', type=str,
        help='Settings module name, should be accessible by import.')
    cli.add_argument(
        '--hostname', type=str,
        help='Hostname or IP address to bind. Default is 127.0.0.1')
    cli.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    cli.add_argument(
        '--port', type=int,
        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    opts = cli.parse_args()

    cfg = Settings(module=opts.config)
    # Command-line values take precedence over the settings module.
    host = opts.hostname if opts.hostname else cfg.get("ZMQ_HOSTNAME")
    base_port = opts.port if opts.port else cfg.get("ZMQ_BASE_PORT")

    srv = Server(host, base_port)
    srv.logger.setLevel(opts.log_level)
    srv.start()
def sw_setup_filtered_links(self):
    """Build a StrategyWorker over a fake message bus, passing the
    link-filtering strategy class directly (not via settings)."""
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    return StrategyWorker(cfg, FilteredLinksCrawlingStrategy, None, None)
def sw_setup_add_seeds(self):
    """Build a StrategyWorker in add-seeds mode with mocked bus/strategy."""
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
    cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    return StrategyWorker(cfg, True)
def test_blocking_middleware(self):
    """Events must stop at the blocking middleware and never reach the
    middlewares below it, the canonical solver or the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst`` (behavior unchanged).
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware',
                            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
                            'tests.mocks.components.FakeMiddleware',
                            'tests.mocks.components.FakeMiddlewareModifySeeds',
                            'tests.mocks.components.FakeMiddlewareBlocking',
                            'tests.mocks.components.FakeMiddlewareModifyResponse',
                            'tests.mocks.components.FakeMiddlewareModifyLinks']
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    fm = LocalFrontierManager.from_settings(settings)
    SEEDS_FILE.seek(0)
    fm.add_seeds(SEEDS_FILE)
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three fake middlewares (indices 2-4).
    assert [len(fm.middlewares[i].requests) for i in range(2, 5)] == [3] * 3
    # the error, response and link reached the first three fake middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]] for i in range(2, 5)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists] for i in range(5, 7)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def test_blocking_middleware(self):
    """A blocking middleware must prevent events from reaching the
    middlewares after it, the canonical solver and the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst`` (behavior unchanged).
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = [
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks'
    ]
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    fm = FrontierManager.from_settings(settings)
    fm.add_seeds([r1, r2, r3])
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three middlewares.
    assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3
    # the error, response and link reached the first three middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
            for i in range(3)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists]
            for i in range(3, 5)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def test_close_manager(self):
    """A manager configured with RedisBackend must expose it and close cleanly."""
    cfg = Settings(module='frontera.settings.default_settings')
    cfg.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend')
    manager = WorkerFrontierManager.from_settings(cfg, strategy_worker=True)
    self.assertEqual(RedisBackend, manager.backend.__class__)
    manager.close()
def strategy(self):
    """Create a TestingCrawlingStrategy bound to a strategy-worker manager,
    a message-bus stream and an in-memory states context."""
    cfg = Settings()
    cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    manager = FrontierManager.from_settings(cfg, db_worker=False,
                                            strategy_worker=True)
    stream = MessageBusStream()
    state_store = MemoryStates(10)
    state_ctx = StatesContext(state_store)
    return TestingCrawlingStrategy.from_worker(manager, None, stream, state_ctx)
def dbw_setup(self, distributed=False):
    """Build a DBWorker over a fake message bus.

    :param distributed: use the fake distributed backend when True.
    """
    cfg = Settings()
    cfg.MAX_NEXT_REQUESTS = 64
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.BACKEND = ('tests.mocks.components.FakeDistributedBackend'
                   if distributed else
                   'tests.mocks.components.FakeBackend')
    return DBWorker(cfg, True, True, False)
def test_max_requests_reached(self):
    """Once MAX_REQUESTS requests have been served the frontier finishes."""
    cfg = Settings()
    cfg.MAX_REQUESTS = 2
    fm = self.setup_frontier_manager(cfg)
    fm.backend.put_requests([r1, r2, r3])
    served = set(fm.get_next_requests(10))
    # Exactly two of the three queued requests must have been handed out.
    assert served in ({r1, r2}, {r2, r3}, {r1, r3})
    assert fm.get_next_requests(10) == []
    assert fm.finished is True
def strategy(self):
    """Return the crawling strategy of a freshly built strategy-worker manager.

    Fix: removed the dead locals ``stream``, ``states`` and ``states_ctx`` —
    they were constructed but never used by the returned value.
    """
    settings = Settings()
    settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    settings.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy'
    manager = WorkerFrontierManager.from_settings(settings, db_worker=False, strategy_worker=True)
    return manager.strategy
def test_max_requests_reached(self):
    """After MAX_REQUESTS requests are served, the frontier reports finished."""
    cfg = Settings()
    cfg.MAX_REQUESTS = 2
    fm = self.setup_frontier_manager(cfg)
    fm.backend.put_requests([r1, r2, r3])
    got = set(fm.get_next_requests(10))
    # Any 2-element subset of the 3 queued requests is acceptable.
    assert got in ({r1, r2}, {r2, r3}, {r1, r3})
    assert fm.get_next_requests(10) == []
    assert fm.finished is True
def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero(self):
    """Out-of-range SPIDER_PARTITION_ID values must raise ValueError."""
    cfg = Settings()
    cfg.SPIDER_FEED_PARTITIONS = 1
    # partition_id > feed partitions
    cfg.SPIDER_PARTITION_ID = 2
    self.assertRaises(ValueError, self.mbb_setup, cfg)
    # partition_id == feed partitions
    cfg.SPIDER_PARTITION_ID = 1
    self.assertRaises(ValueError, self.mbb_setup, cfg)
    # partition_id < 0
    cfg.SPIDER_PARTITION_ID = -1
    self.assertRaises(ValueError, self.mbb_setup, cfg)
def setup_environment():
    """Parse strategy-worker CLI options, resolve the strategy class and
    partition id, and configure logging.

    :returns: ``(settings, strategy_class)`` tuple.
    :raises ValueError: when no strategy classpath can be resolved, or the
        partition id is out of range.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    args = cli.parse_args()

    settings = Settings(module=args.config)
    # CLI --strategy overrides the CRAWLING_STRATEGY setting.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    if args.partition_id is not None:
        partition_id = args.partition_id
    else:
        partition_id = settings.get('SCORING_PARTITION_ID')
    if not (0 <= partition_id < settings.get('SPIDER_LOG_PARTITIONS')):
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
                         % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Prefer a logging config file when one is configured and present.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
def __init__(self, manager):
    """Message-bus backend: sets up codec, spider-log producer and the
    spider-feed consumer for this partition, and wraps request fetching in
    an overused-domain buffer."""
    self._manager = manager
    settings = Settings(attributes=manager.settings.attributes)
    bus_cls = load_object(settings.get('MESSAGE_BUS'))
    self.mb = bus_cls(settings)
    # Only ship bodies over the bus when STORE_CONTENT is enabled.
    self._encoder = Encoder(manager.request_model,
                            send_body=settings.get('STORE_CONTENT'))
    self._decoder = Decoder(manager.request_model, manager.response_model)
    self.spider_log_producer = self.mb.spider_log().producer()
    feed = self.mb.spider_feed()
    self.partition_id = settings.get('SPIDER_PARTITION_ID')
    self.consumer = feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  manager.logger.manager.debug)
def seed_loader_setup(self, seeds_content=None):
    """Write a seeds file into the temp dir and return a FileSeedLoader for it.

    :param seeds_content: optional file body; a two-URL default is used
        when omitted.
    """
    seed_path = os.path.join(self.tmp_path, 'seeds.txt')
    default_content = """
https://www.example.com
https://www.scrapy.org
"""
    body = seeds_content or default_content
    with open(seed_path, 'wb') as seeds_file:
        seeds_file.write(body.encode('utf-8'))
    assert os.path.isfile(seed_path)  # Failure of test itself
    cfg = Settings()
    cfg.SEEDS_SOURCE = seed_path
    crawler = type('crawler', (object,), {})
    crawler.settings = cfg
    return FileSeedLoader(crawler)
def seed_loader_setup(self, seeds_content=None):
    """Create a seeds file on disk and build a FileSeedLoader reading it."""
    seed_path = os.path.join(self.tmp_path, 'seeds.txt')
    default_content = """
https://www.example.com
https://www.scrapy.org
"""
    if not seeds_content:
        seeds_content = default_content
    with open(seed_path, 'wb') as out:
        out.write(seeds_content.encode('utf-8'))
    assert os.path.isfile(seed_path)  # Failure of test itself
    cfg = Settings()
    cfg.SEEDS_SOURCE = seed_path
    crawler = type('crawler', (object, ), {})
    crawler.settings = cfg
    return FileSeedLoader(crawler)
def strategy(self):
    """Build a DummyCrawlingStrategy from a fake manager, a bus stream and
    in-memory states."""
    cfg = Settings()
    fake_manager = FakeFrontierManager(cfg)
    stream = MessageBusStream()
    states_ctx = StatesContext(MemoryStates(10))
    return DummyCrawlingStrategy.from_worker(fake_manager, stream, states_ctx)
def __init__(self, settings=None):
    """Set up producers/consumers for the sw/db/spider roles over the bus.

    :param settings: optional Settings; a fresh instance is created when
        omitted. Fix: the previous ``settings=Settings()`` default was a
        mutable default argument shared across calls and mutated below.
    """
    if settings is None:
        settings = Settings()
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = MessageBus(settings)
    spiderlog = self.messagebus.spider_log()
    # sw
    self.sw_sl_c = spiderlog.consumer(partition_id=0, type='sw')
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()
    sleep(0.1)
    # db
    self.db_sl_c = spiderlog.consumer(partition_id=None, type='db')
    self.db_us_c = scoring_log.consumer()
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()
    sleep(0.1)
    # spider
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = spider_feed.consumer(0)
    sleep(0.1)
def __init__(self, cls, settings=None):
    """Instantiate message bus `cls` and wire sw/db/spider endpoints.

    :param cls: message bus class to instantiate.
    :param settings: optional Settings; created fresh when omitted.
        Fix: replaced the mutable default argument ``settings=Settings()``,
        which was shared and mutated across instances.
    """
    if settings is None:
        settings = Settings()
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('SPIDER_FEED_PARTITIONER',
                 'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
    self.messagebus = cls(settings)
    spiderlog = self.messagebus.spider_log()
    # sw
    self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b'sw')
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()
    sleep(0.1)
    # db
    self.db_sl_c = spiderlog.consumer(partition_id=None, type=b'db')
    self.db_us_c = scoring_log.consumer()
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()
    sleep(0.1)
    # spider
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = spider_feed.consumer(0)
    sleep(0.1)
def from_settings(cls, settings=None, db_worker=False, strategy_worker=False):
    """
    Returns a :class:`FrontierManager <frontera.core.manager.FrontierManager>` instance initialized with \
    the passed settings argument. If no settings is given,
    :ref:`frontier default settings <frontier-default-settings>` are used.
    """
    cfg = Settings.object_from(settings)
    return FrontierManager(
        request_model=cfg.REQUEST_MODEL,
        response_model=cfg.RESPONSE_MODEL,
        backend=cfg.BACKEND,
        logger=cfg.LOGGER,
        event_log_manager=cfg.EVENT_LOG_MANAGER,
        middlewares=cfg.MIDDLEWARES,
        test_mode=cfg.TEST_MODE,
        max_requests=cfg.MAX_REQUESTS,
        max_next_requests=cfg.MAX_NEXT_REQUESTS,
        auto_start=cfg.AUTO_START,
        settings=cfg,
        canonicalsolver=cfg.CANONICAL_SOLVER,
        db_worker=db_worker,
        strategy_worker=strategy_worker)
def mbb_setup(self, settings=None):
    """Build a MessageBusBackend attached to a stub manager object."""
    stub_manager = type('manager', (object, ), {})
    cfg = settings or Settings()
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.STORE_CONTENT = True
    stub_manager.settings = cfg
    stub_manager.request_model = Request
    stub_manager.response_model = Response
    return MessageBusBackend(stub_manager)
def main():
    """Parse command-line options, load settings and run the broker server."""
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument('--config', type=str,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--hostname', type=str,
                        help='Hostname or IP address to bind. Default is 127.0.0.1')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument('--port', type=int,
                        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    ns = parser.parse_args()

    cfg = Settings(module=ns.config)
    # CLI values win over the settings module.
    host = ns.hostname if ns.hostname else cfg.get("ZMQ_HOSTNAME")
    port = ns.port if ns.port else cfg.get("ZMQ_BASE_PORT")

    srv = Server(host, port)
    srv.logger.setLevel(ns.log_level)
    srv.start()
def setUp(self):
    """Create the Kafka topics and wire polling consumers/producers for the
    strategy worker (sw), db worker (db) and spider (sp) roles."""
    logging.basicConfig()
    handler = logging.StreamHandler(stdout)
    logger = logging.getLogger("kafka")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    self.logger = logging.getLogger("tester")
    self.logger.debug("setup started")
    # Make sure topics exist before any consumer subscribes; assumes a Kafka
    # broker is reachable at this address — TODO confirm in the CI setup.
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    client.ensure_topic_exists("frontier-todo")
    client.ensure_topic_exists("frontier-done")
    client.ensure_topic_exists("frontier-score")
    client.close()
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = KafkaMessageBus(settings)
    spiderlog = self.messagebus.spider_log()
    # sw
    self.sw_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=0, type=b'sw'))
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()
    # db
    self.db_sl_c = KafkaConsumerPolling(
        spiderlog.consumer(partition_id=None, type=b'db'))
    self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()
    # spider
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = KafkaConsumerPolling(
        spider_feed.consumer(partition_id=0))
    self.logger.debug("init is done")
def from_settings(cls, settings=None):
    """Build a SpiderFrontierManager from a Settings object or module path;
    defaults are used when no settings are given."""
    cfg = Settings.object_from(settings)
    return SpiderFrontierManager(request_model=cfg.REQUEST_MODEL,
                                 response_model=cfg.RESPONSE_MODEL,
                                 backend=cfg.BACKEND,
                                 middlewares=cfg.MIDDLEWARES,
                                 max_next_requests=cfg.MAX_NEXT_REQUESTS,
                                 settings=cfg,
                                 canonicalsolver=cfg.CANONICAL_SOLVER)
def mbb_setup(self, settings=None):
    """Return a MessageBusBackend built on a stub manager and a fake bus."""
    fake_manager = type('manager', (object, ), {})
    cfg = settings or Settings()
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    cfg.STORE_CONTENT = True
    fake_manager.settings = cfg
    fake_manager.request_model = Request
    fake_manager.response_model = Response
    return MessageBusBackend(fake_manager)
def setup_frontier_manager(self, settings=None):
    """Create a FrontierManager wired to fake backend/middlewares/solver."""
    cfg = settings or Settings()
    cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    cfg.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                       'tests.mocks.components.FakeMiddlewareModifySeeds',
                       'tests.mocks.components.FakeMiddlewareModifyResponse',
                       'tests.mocks.components.FakeMiddlewareModifyLinks']
    cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    return FrontierManager.from_settings(cfg)
def test_blocking_middleware(self):
    """Events must stop at the blocking middleware and never reach the
    middlewares below it, the canonical solver or the backend.

    Fix: the comprehension variable was named ``list``, shadowing the
    builtin; renamed to ``lst`` (behavior unchanged).
    """
    settings = Settings()
    settings.BACKEND = 'tests.mocks.components.FakeBackend'
    settings.MIDDLEWARES = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareBlocking',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks'
    ]
    settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    fm = LocalFrontierManager.from_settings(settings)
    SEEDS_FILE.seek(0)
    fm.add_seeds(SEEDS_FILE)
    response = Response(r1.url, request=r1)
    fm.page_crawled(response)
    fm.links_extracted(r1, links=[r2])
    fm.request_error(r3, 'error')
    # the seeds, responses, links and errors have not reached the backend.
    assert [len(lst) for lst in fm.backend.lists] == [0] * 4
    # the 3 seeds reach the first three fake middlewares (indices 2-4).
    assert [len(fm.middlewares[i].requests) for i in range(2, 5)] == [3] * 3
    # the error, response and link reached the first three fake middlewares.
    assert [[len(lst) for lst in fm.middlewares[i].lists[1:]]
            for i in range(2, 5)] == [[1] * 3] * 3
    # the values do not reach the bottom 2 middlewares and the canonical solver.
    assert [[len(lst) for lst in fm.middlewares[i].lists]
            for i in range(5, 7)] == [[0] * 4] * 2
    assert [len(lst) for lst in fm.canonicalsolver.lists] == [0] * 4
def setup_frontier_manager(self, settings=None):
    """Create a LocalFrontierManager with the real domain/fingerprint
    middlewares in front of the fake test middlewares."""
    cfg = settings or Settings()
    cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    cfg.MIDDLEWARES = [
        'frontera.contrib.middlewares.domain.DomainMiddleware',
        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
        'tests.mocks.components.FakeMiddleware',
        'tests.mocks.components.FakeMiddlewareModifySeeds',
        'tests.mocks.components.FakeMiddlewareModifyResponse',
        'tests.mocks.components.FakeMiddlewareModifyLinks',
    ]
    cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
    cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
    return LocalFrontierManager.from_settings(cfg)
def dbw_setup(self, distributed=False):
    """Build a DBWorker (all disable-flags off) over a fake message bus.

    :param distributed: choose the fake distributed backend when True.
    """
    cfg = Settings()
    cfg.MAX_NEXT_REQUESTS = 64
    cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
    if distributed:
        cfg.BACKEND = 'tests.mocks.components.FakeDistributedBackend'
    else:
        cfg.BACKEND = 'tests.mocks.components.FakeBackend'
    # NOTE(review): partitions is passed as the string "0" here while other
    # call sites pass ints/lists — confirm DBWorker accepts this form.
    return DBWorker(cfg, False, False, False, partitions="0")
def setUp(self):
    """Create two local seed files and an S3SeedLoader pointed at a bucket."""
    self.tmp_path = mkdtemp()
    cfg = Settings()
    cfg.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
    cfg.SEEDS_AWS_ACCESS_KEY = 'access_key'
    cfg.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
    crawler = type('crawler', (object,), {})
    crawler.settings = cfg
    self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
    self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
    s1_content = """
https://www.example.com
https://www.scrapy.org
"""
    s2_content = """
https://www.dmoz.org
https://www.test.com
"""
    with open(self.seed_path_1, 'wb') as out:
        out.write(s1_content.encode('utf-8'))
    with open(self.seed_path_2, 'wb') as out:
        out.write(s2_content.encode('utf-8'))
    self.seed_loader = S3SeedLoader(crawler)
def setUp(self):
    """Write two seed files into a temp dir and build the S3 seed loader."""
    self.tmp_path = mkdtemp()
    cfg = Settings()
    cfg.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
    cfg.SEEDS_AWS_ACCESS_KEY = 'access_key'
    cfg.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
    crawler = type('crawler', (object, ), {})
    crawler.settings = cfg
    self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
    self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
    s1_content = """
https://www.example.com
https://www.scrapy.org
"""
    s2_content = """
https://www.dmoz.org
https://www.test.com
"""
    for path, body in ((self.seed_path_1, s1_content),
                       (self.seed_path_2, s2_content)):
        with open(path, 'wb') as seeds_file:
            seeds_file.write(body.encode('utf-8'))
    self.seed_loader = S3SeedLoader(crawler)
def setUp(self):
    """Create the Kafka topics and wire polling consumers/producers for the
    strategy worker (sw), db worker (db) and spider (sp) roles."""
    logging.basicConfig()
    handler = logging.StreamHandler(stdout)
    logger = logging.getLogger("kafka")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    self.logger = logging.getLogger("tester")
    self.logger.debug("setup started")
    # Topics must exist before consumers subscribe; assumes a Kafka broker
    # is reachable on localhost — TODO confirm in the test environment.
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    client.ensure_topic_exists("frontier-todo")
    client.ensure_topic_exists("frontier-done")
    client.ensure_topic_exists("frontier-score")
    client.close()
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('SPIDER_LOG_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.messagebus = KafkaMessageBus(settings)
    spiderlog = self.messagebus.spider_log()
    # sw
    self.sw_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=0, type=b'sw'))
    scoring_log = self.messagebus.scoring_log()
    self.sw_us_p = scoring_log.producer()
    # db
    self.db_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=None, type=b'db'))
    self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())
    spider_feed = self.messagebus.spider_feed()
    self.db_sf_p = spider_feed.producer()
    # spider
    self.sp_sl_p = spiderlog.producer()
    self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0))
    self.logger.debug("init is done")
def test_kafka_message_bus_integration():
    """End-to-end flow over a real Kafka broker: spider log -> strategy
    worker -> scoring log -> db worker -> spider feed."""
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    for topic in ("frontier-todo", "frontier-done", "frontier-score"):
        client.ensure_topic_exists(topic)
    logging.basicConfig(level=logging.INFO)
    cfg = Settings()
    cfg.set('KAFKA_LOCATION', kafka_location)
    cfg.set('FRONTIER_GROUP', 'frontier2')
    cfg.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, cfg)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
def from_settings(cls, settings=None):
    """
    Returns a :class:`FrontierManager <frontera.core.manager.FrontierManager>` instance initialized with \
    the passed settings argument. If no settings is given,
    :ref:`frontier default settings <frontier-default-settings>` are used.
    """
    manager_settings = Settings.object_from(settings)
    # Fix: apply the defaults to the resolved settings object. The previous
    # ``settings.set_from_dict(DEFAULT_SETTINGS)`` raised AttributeError in
    # the documented "no settings given" case (settings is None).
    manager_settings.set_from_dict(DEFAULT_SETTINGS)
    return cls(request_model=manager_settings.REQUEST_MODEL,
               response_model=manager_settings.RESPONSE_MODEL,
               backend=manager_settings.BACKEND,
               middlewares=manager_settings.MIDDLEWARES,
               test_mode=manager_settings.TEST_MODE,
               max_requests=manager_settings.MAX_REQUESTS,
               max_next_requests=manager_settings.MAX_NEXT_REQUESTS,
               auto_start=manager_settings.AUTO_START,
               settings=manager_settings,
               canonicalsolver=manager_settings.CANONICAL_SOLVER)
def from_settings(cls, settings=None):
    """
    Returns a :class:`FrontierManager <frontera.core.manager.FrontierManager>` instance initialized with \
    the passed settings argument. Argument value can either be a string path pointing to settings file or a \
    :class:`Settings <frontera.settings.Settings>` object instance. If no settings is given,
    :ref:`frontier default settings <frontier-default-settings>` are used.
    """
    cfg = Settings(settings)
    return FrontierManager(request_model=cfg.REQUEST_MODEL,
                           response_model=cfg.RESPONSE_MODEL,
                           backend=cfg.BACKEND,
                           logger=cfg.LOGGER,
                           event_log_manager=cfg.EVENT_LOG_MANAGER,
                           middlewares=cfg.MIDDLEWARES,
                           test_mode=cfg.TEST_MODE,
                           max_requests=cfg.MAX_REQUESTS,
                           max_next_requests=cfg.MAX_NEXT_REQUESTS,
                           auto_start=cfg.AUTO_START,
                           settings=cfg)
def __init__(self, request_model, response_model, settings=None):
    """Initialize settings/logging and load the model classes.

    :param request_model: dotted path of the Request model class.
    :param response_model: dotted path of the Response model class.
    :param settings: optional Settings instance; defaults when None.
    """
    self._settings = settings or Settings()
    self._logger = logging.getLogger("manager")
    # Visual separator marking the start of a manager run in the log.
    self._logger.info('-' * 80)
    self._logger.info('Starting Frontier Manager...')
    # Both models are loaded from dotted paths and must subclass the
    # frontera base models.
    self._request_model = load_object(request_model)
    assert issubclass(self._request_model, models.Request), \
        "Request model '%s' must subclass 'Request'" % self._request_model.__name__
    self._response_model = load_object(response_model)
    assert issubclass(self._response_model, models.Response), \
        "Response model '%s' must subclass 'Response'" % self._response_model.__name__
def test_invalid_s3_seed_source(self):
    """A non-s3 SEEDS_SOURCE must make S3SeedLoader raise NotConfigured."""
    crawler = type('crawler', (object,), {})
    cfg = Settings()
    cfg.SEEDS_SOURCE = 'invalid_url'
    crawler.settings = cfg
    self.assertRaises(NotConfigured, S3SeedLoader, crawler)
def test_settings_attributes_can_be_assigned():
    """Arbitrary attributes set on a Settings instance are readable back."""
    cfg = Settings()
    cfg.NEW_ATTRIBUTE = 10
    assert cfg.NEW_ATTRIBUTE == 10
        self.states.update_cache(links)
        self.states.update_cache(response)

    def on_request_error(self, request, error):
        """Handle a failed request: sync its state, let the strategy react,
        then persist the updated state back to the cache."""
        logger.debug("Page error %s (%s)", request.url, error)
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    # CLI entry point: resolve settings and strategy class, then run the worker.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    args = parser.parse_args()
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
    settings = Settings(module=args.config)
    # The CLI --strategy option overrides the CRAWLING_STRATEGY setting.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
def from_settings(cls, settings=None):
    """Create a FakeFrontierManager from a settings object or module path."""
    resolved = Settings.object_from(settings)
    return FakeFrontierManager(resolved)
def test_settings_passed_as_attributes_can_be_found():
    """Attributes passed at construction time are retrievable via get()."""
    cfg = Settings(attributes={'SETTING': 'value'})
    assert cfg.get('SETTING') == 'value'
def setup_environment():
    """Parse strategy-worker CLI options and build the runtime configuration.

    Resolves the strategy classpath and scoring partition id (CLI overrides
    settings), parses optional key=value strategy arguments, and configures
    logging from a file when LOGGING_CONFIG points at one.

    :returns: ``(settings, add_seeds_mode, seeds_url)`` tuple.
    :raises ValueError: on a missing strategy classpath or an out-of-range
        partition id.
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    parser.add_argument('--port', type=int,
                        help="Json Rpc service port to listen.")
    parser.add_argument('--args', '-a', nargs='*', type=str,
                        help="Optional arguments for crawling strategy, "
                             "in a form of key=value separated with space")
    parser.add_argument('--add-seeds', action='store_true',
                        help="Run in add seeds mode. Worker finishes after running "
                             "of strategy add_seeds method")
    parser.add_argument('--seeds-url', type=str,
                        help="Seeds url. S3 and native urlopen schemas are currently "
                             "supported, implies add seeds run mode")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_classpath)
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
                         % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)
    if args.port:
        settings.set('JSONRPC_PORT', args.port)
    strategy_args = {}
    if args.args:
        # "key=value" pairs; a bare key (no '=') maps to None.
        for arg in args.args:
            key, _, value = arg.partition("=")
            strategy_args[key] = value if value else None
    settings.set("STRATEGY_ARGS", strategy_args)
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, args.add_seeds, args.seeds_url
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    # CLI entry point for the strategy worker.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    args = parser.parse_args()
    settings = Settings(module=args.config)
    # CLI --strategy overrides the CRAWLING_STRATEGY setting.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)
    # Prefer a logging config file when one is configured and present.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
def run_add_seeds(settings, seeds_file):
    """Feed seed URLs from a local file into a LocalFrontierManager.

    :param settings: Frontera Settings object used to build the manager.
    :param seeds_file: path of the seeds file; opened in binary mode.
    """
    # `with` guarantees the handle is closed even if the manager raises;
    # the original opened the file and never closed it (resource leak).
    with open(seeds_file, "rb") as fh:
        logger.info("Starting local seeds addition from file %s", seeds_file)
        manager = LocalFrontierManager.from_settings(settings)
        manager.add_seeds(fh)
    manager.stop()
    manager.close()
    logger.info("Seeds addition finished")


if __name__ == '__main__':
    # Command-line entry point for the local seed-addition utility.
    parser = ArgumentParser(description="Frontera local add seeds utility")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--seeds-file', type=str, required=True,
                        help="Seeds file path")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    # Prefer a logging config file when configured and present; otherwise
    # fall back to basicConfig plus the console handler.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    run_add_seeds(settings, args.seeds_file)
def test_fallsback_to_frontera_default_settings():
    """With no overrides, Settings exposes frontera's built-in defaults."""
    default_value = Settings().get('MAX_NEXT_REQUESTS')
    assert default_value == 0
def __init__(self):
    """Configure the message bus to bind on the IPv6 loopback address."""
    ipv6_settings = Settings()
    ipv6_settings.set('ZMQ_ADDRESS', '::1')
    super(IPv6MessageBusTester, self).__init__(ipv6_settings)
# NOTE(review): `parser` is created above this chunk — this is the middle of
# the DB worker's command-line entry point.
parser.add_argument('--no-batches', action='store_true',
                    help='Disables generation of new batches.')
parser.add_argument('--no-incoming', action='store_true',
                    help='Disables spider log processing.')
parser.add_argument('--no-scoring', action='store_true',
                    help='Disables scoring log processing.')
parser.add_argument('--partitions', type=int, nargs='*',
                    help='Optional partitions range for batch generator')
parser.add_argument('--config', type=str, required=True,
                    help='Settings module name, should be accessible by import.')
parser.add_argument('--log-level', '-L', type=str, default='INFO',
                    help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
parser.add_argument('--port', type=int,
                    help="Json Rpc service port to listen.")
args = parser.parse_args()
settings = Settings(module=args.config)
if args.port:
    # NOTE(review): the port is wrapped in a single-element list here, while
    # the strategy worker sets a bare int for the same key — confirm which
    # shape WorkerJsonRpcService actually expects.
    settings.set("JSONRPC_PORT", [args.port])
# Prefer a logging config file when one is configured and present;
# otherwise fall back to basicConfig plus the console handler.
logging_config_path = settings.get("LOGGING_CONFIG")
if logging_config_path and os.path.exists(logging_config_path):
    fileConfig(logging_config_path, disable_existing_loggers=False)
else:
    logging.basicConfig(level=args.log_level)
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
# Start the DB worker and expose it over JSON-RPC.
worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring, partitions=args.partitions)
server = WorkerJsonRpcService(worker, settings)
server.start_listening()
def test_settings_on_a_python_module_are_loaded():
    """Loading Settings from a dotted module path picks up its values."""
    loaded = Settings('frontera.tests.scrapy_spider.frontera.settings')
    assert 5 == loaded.get('MAX_REQUESTS')
# NOTE(review): the two statements below are the tail of a method whose
# `def` line falls outside this chunk — presumably the error-handling path:
# hand the error to the strategy, then write the request state back to cache.
self.strategy.page_error(request, error)
self.states.update_cache(request)


if __name__ == '__main__':
    # Command-line entry point for the strategy worker (legacy variant using
    # the CRAWLING_STRATEGY setting). The chunk ends mid-script, so the block
    # below is truncated here.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    # The --strategy flag wins over the CRAWLING_STRATEGY setting.
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)
    # An explicit partition id must fall inside [0, SPIDER_LOG_PARTITIONS).
    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
                         % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)