Example #1
0
def main():
    """
    Command-line entry point for the broker.

    Reads ``--config``, ``--address``, ``--log-level`` and ``--port`` from the
    command line, loads the settings module and starts a ``Server``. An address
    or port given on the command line takes precedence over the ``ZMQ_ADDRESS``
    and ``ZMQ_BASE_PORT`` settings.
    """
    arg_parser = ArgumentParser(description="Crawl frontier worker.")
    arg_parser.add_argument(
        '--config',
        type=str,
        help='Settings module name, should be accessible by import.')
    arg_parser.add_argument(
        '--address',
        type=str,
        help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1. '
             'When binding to wildcard it defaults to IPv4.')
    arg_parser.add_argument(
        '--log-level', '-L',
        type=str,
        default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    arg_parser.add_argument(
        '--port',
        type=int,
        help='Base port number, server will bind to 6 ports starting from base. '
             'Default is 5550')
    options = arg_parser.parse_args()

    config = Settings(module=options.config)
    # Any falsy command-line value (option missing, empty string, port 0)
    # falls back to the value from the settings module.
    bind_address = options.address or config.get("ZMQ_ADDRESS")
    base_port = options.port or config.get("ZMQ_BASE_PORT")

    broker = Server(bind_address, base_port)
    broker.logger.setLevel(options.log_level)
    broker.start()
 def sw_setup_filtered_links(self):
     """
     Build a StrategyWorker that uses FilteredLinksCrawlingStrategy and the fake message bus.
     """
     cfg = Settings()
     cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy'
     cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     worker = StrategyWorker(cfg, False)
     return worker
    def test_blocking_middleware(self):
        """
        Check that nothing travels past FakeMiddlewareBlocking (index 2) in the middleware chain.
        """
        cfg = Settings()
        cfg.BACKEND = 'tests.mocks.components.FakeBackend'
        cfg.MIDDLEWARES = [
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        manager = FrontierManager.from_settings(cfg)

        manager.add_seeds([r1, r2, r3])
        crawled = Response(r1.url, request=r1)
        manager.page_crawled(crawled)
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # Seeds, responses, links and errors never reach the backend.
        backend_sizes = [len(items) for items in manager.backend.lists]
        assert backend_sizes == [0] * 4
        # All 3 seeds are seen by middlewares 0-2.
        seed_counts = [len(manager.middlewares[idx].seeds) for idx in range(3)]
        assert seed_counts == [3] * 3
        # The response, link and error are each seen once by middlewares 0-2.
        event_counts = [[len(items) for items in manager.middlewares[idx].lists[1:]]
                        for idx in range(3)]
        assert event_counts == [[1] * 3] * 3
        # Middlewares 3-4 and the canonical solver receive nothing at all.
        blocked_counts = [[len(items) for items in manager.middlewares[idx].lists]
                          for idx in range(3, 5)]
        assert blocked_counts == [[0] * 4] * 2
        solver_sizes = [len(items) for items in manager.canonicalsolver.lists]
        assert solver_sizes == [0] * 4
Example #4
0
    def __init__(self):
        """
        Create a MessageBus with one spider feed partition and hostname partitioning,
        then open the producers and consumers for the sw, db and spider roles.
        """
        cfg = Settings()
        cfg.set("SPIDER_FEED_PARTITIONS", 1)
        cfg.set("QUEUE_HOSTNAME_PARTITIONING", True)
        self.mb = MessageBus(cfg)
        spider_log = self.mb.spider_log()

        # sw: consumes spider log partition 0, produces to the scoring log
        self.sw_sl_c = spider_log.consumer(partition_id=0, type="sw")
        scoring_log = self.mb.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): the short pauses presumably let the bus endpoints settle - confirm.
        sleep(0.1)

        # db: consumes the spider log (no partition) and scoring log, produces to the spider feed
        self.db_sl_c = spider_log.consumer(partition_id=None, type="db")
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.mb.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider: produces to the spider log, consumes spider feed partition 0
        self.sp_sl_p = spider_log.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Example #5
0
def main():
    """
    Command-line entry point: parse options, load settings and start the server.

    ``--hostname`` and ``--port`` override the ``ZMQ_HOSTNAME`` and
    ``ZMQ_BASE_PORT`` settings when given.
    """
    cli = ArgumentParser(description="Crawl frontier worker.")
    cli.add_argument('--config', type=str,
                     help='Settings module name, should be accessible by import.')
    cli.add_argument('--hostname', type=str,
                     help='Hostname or IP address to bind. Default is 127.0.0.1')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    cli.add_argument('--port', type=int,
                     help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    opts = cli.parse_args()

    config = Settings(module=opts.config)
    # Falsy command-line values fall back to the settings module.
    bind_host = opts.hostname or config.get("ZMQ_HOSTNAME")
    base_port = opts.port or config.get("ZMQ_BASE_PORT")

    srv = Server(bind_host, base_port)
    srv.logger.setLevel(opts.log_level)
    srv.start()
Example #6
0
 def sw_setup_filtered_links(self):
     """
     Build a StrategyWorker that is handed FilteredLinksCrawlingStrategy directly.
     """
     cfg = Settings()
     cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     worker = StrategyWorker(cfg, FilteredLinksCrawlingStrategy, None, None)
     return worker
 def sw_setup_add_seeds(self):
     """
     Build a StrategyWorker configured with the mock CrawlingStrategy and the fake message bus.
     """
     cfg = Settings()
     cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
     worker = StrategyWorker(cfg, True)
     return worker
    def test_blocking_middleware(self):
        """
        Check that nothing travels past FakeMiddlewareBlocking (index 4) in the middleware chain.
        """
        cfg = Settings()
        cfg.BACKEND = 'tests.mocks.components.FakeBackend'
        cfg.MIDDLEWARES = [
            'frontera.contrib.middlewares.domain.DomainMiddleware',
            'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
            'tests.mocks.components.FakeMiddleware',
            'tests.mocks.components.FakeMiddlewareModifySeeds',
            'tests.mocks.components.FakeMiddlewareBlocking',
            'tests.mocks.components.FakeMiddlewareModifyResponse',
            'tests.mocks.components.FakeMiddlewareModifyLinks',
        ]
        cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        manager = LocalFrontierManager.from_settings(cfg)

        SEEDS_FILE.seek(0)
        manager.add_seeds(SEEDS_FILE)
        crawled = Response(r1.url, request=r1)
        manager.page_crawled(crawled)
        manager.links_extracted(r1, links=[r2])
        manager.request_error(r3, 'error')

        # Seeds, responses, links and errors never reach the backend.
        backend_sizes = [len(items) for items in manager.backend.lists]
        assert backend_sizes == [0] * 4
        # 3 requests are recorded by each of the fake middlewares at indices 2-4.
        request_counts = [len(manager.middlewares[idx].requests) for idx in range(2, 5)]
        assert request_counts == [3] * 3
        # The response, link and error are each seen once by middlewares 2-4.
        event_counts = [[len(items) for items in manager.middlewares[idx].lists[1:]]
                        for idx in range(2, 5)]
        assert event_counts == [[1] * 3] * 3
        # Middlewares 5-6 and the canonical solver receive nothing at all.
        blocked_counts = [[len(items) for items in manager.middlewares[idx].lists]
                          for idx in range(5, 7)]
        assert blocked_counts == [[0] * 4] * 2
        solver_sizes = [len(items) for items in manager.canonicalsolver.lists]
        assert solver_sizes == [0] * 4
Example #9
0
def main():
    """
    Parse the command line, resolve the bind address and base port, then start the broker.

    Values given via ``--address`` / ``--port`` win over the ``ZMQ_ADDRESS`` /
    ``ZMQ_BASE_PORT`` settings.
    """
    cli = ArgumentParser(description="Crawl frontier worker.")
    cli.add_argument('--config', type=str,
                     help='Settings module name, should be accessible by import.')
    cli.add_argument('--address', type=str,
                     help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1. When '
                          'binding to wildcard it defaults to IPv4.')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    cli.add_argument('--port', type=int,
                     help='Base port number, server will bind to 6 ports starting from base. Default '
                          'is 5550')
    opts = cli.parse_args()

    config = Settings(module=opts.config)
    # Falsy command-line values fall back to the settings module.
    bind_address = opts.address or config.get("ZMQ_ADDRESS")
    base_port = opts.port or config.get("ZMQ_BASE_PORT")

    broker = Server(bind_address, base_port)
    broker.logger.setLevel(opts.log_level)
    broker.start()
Example #10
0
    def test_blocking_middleware(self):
        """
        A blocking middleware at index 2 must stop every event before later middlewares,
        the canonical solver and the backend.
        """
        config = Settings()
        config.BACKEND = 'tests.mocks.components.FakeBackend'
        config.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                              'tests.mocks.components.FakeMiddlewareModifySeeds',
                              'tests.mocks.components.FakeMiddlewareBlocking',
                              'tests.mocks.components.FakeMiddlewareModifyResponse',
                              'tests.mocks.components.FakeMiddlewareModifyLinks']
        config.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        frontier = FrontierManager.from_settings(config)

        frontier.add_seeds([r1, r2, r3])
        page = Response(r1.url, request=r1)
        frontier.page_crawled(page)
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        middlewares = frontier.middlewares
        # Backend: no seeds, responses, links or errors.
        assert [len(bucket) for bucket in frontier.backend.lists] == [0] * 4
        # Middlewares 0-2: all 3 seeds.
        assert [len(middlewares[pos].seeds) for pos in range(3)] == [3] * 3
        # Middlewares 0-2: one response, one link, one error each.
        seen = [[len(bucket) for bucket in middlewares[pos].lists[1:]] for pos in range(3)]
        assert seen == [[1] * 3] * 3
        # Middlewares 3-4 and the canonical solver: nothing.
        unseen = [[len(bucket) for bucket in middlewares[pos].lists] for pos in range(3, 5)]
        assert unseen == [[0] * 4] * 2
        assert [len(bucket) for bucket in frontier.canonicalsolver.lists] == [0] * 4
Example #11
0
 def test_close_manager(self):
     """
     A strategy-worker manager configured with the Redis backend exposes a RedisBackend and can be closed.
     """
     cfg = Settings(module='frontera.settings.default_settings')
     cfg.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend')
     frontier = WorkerFrontierManager.from_settings(cfg, strategy_worker=True)
     backend_class = frontier.backend.__class__
     self.assertEqual(RedisBackend, backend_class)
     frontier.close()
Example #12
0
 def strategy(self):
     """
     Build a TestingCrawlingStrategy on top of a strategy-worker FrontierManager.
     """
     cfg = Settings()
     cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     frontier = FrontierManager.from_settings(cfg, db_worker=False, strategy_worker=True)
     bus_stream = MessageBusStream()
     states_context = StatesContext(MemoryStates(10))
     return TestingCrawlingStrategy.from_worker(frontier, None, bus_stream, states_context)
Example #13
0
 def dbw_setup(self, distributed=False):
     """
     Build a DBWorker wired to the fake message bus.

     :param distributed: when truthy use FakeDistributedBackend, otherwise FakeBackend.
     """
     cfg = Settings()
     cfg.MAX_NEXT_REQUESTS = 64
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.BACKEND = ('tests.mocks.components.FakeDistributedBackend' if distributed
                    else 'tests.mocks.components.FakeBackend')
     worker = DBWorker(cfg, True, True, False)
     return worker
 def test_max_requests_reached(self):
     """
     With MAX_REQUESTS = 2 only two of the three queued requests are handed out, then the manager finishes.
     """
     cfg = Settings()
     cfg.MAX_REQUESTS = 2
     manager = self.setup_frontier_manager(cfg)
     manager.backend.put_requests([r1, r2, r3])
     first_batch = set(manager.get_next_requests(10))
     # Any pair out of the three requests is acceptable.
     assert (first_batch == {r1, r2}
             or first_batch == {r2, r3}
             or first_batch == {r1, r3})
     assert manager.get_next_requests(10) == []
     assert manager.finished is True
 def strategy(self):
     """
     Return the strategy object owned by a WorkerFrontierManager configured with DummyCrawlingStrategy.
     """
     cfg = Settings()
     cfg.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     cfg.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy'
     frontier = WorkerFrontierManager.from_settings(cfg, db_worker=False, strategy_worker=True)
     # NOTE(review): the stream and states context below are built but never used here;
     # kept as-is in case their construction matters - confirm and remove if not.
     bus_stream = MessageBusStream()
     states_context = StatesContext(MemoryStates(10))
     return frontier.strategy
Example #16
0
 def test_max_requests_reached(self):
     """
     Once MAX_REQUESTS (2) requests were returned, no more are handed out and ``finished`` is True.
     """
     config = Settings()
     config.MAX_REQUESTS = 2
     frontier = self.setup_frontier_manager(config)
     frontier.backend.put_requests([r1, r2, r3])
     returned = set(frontier.get_next_requests(10))
     # Exactly two of the three requests, in any combination.
     assert returned == {r1, r2} or returned == {r2, r3} or returned == {r1, r3}
     assert frontier.get_next_requests(10) == []
     assert frontier.finished is True
    def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero(self):
        """
        mbb_setup must raise ValueError when SPIDER_PARTITION_ID is above, equal to,
        or below the valid range for SPIDER_FEED_PARTITIONS = 1.
        """
        cfg = Settings()
        cfg.SPIDER_FEED_PARTITIONS = 1
        # 2: partition_id > feed_partitions; 1: partition_id == feed_partitions; -1: partition_id < 0
        for invalid_id in (2, 1, -1):
            cfg.SPIDER_PARTITION_ID = invalid_id
            with self.assertRaises(ValueError):
                self.mbb_setup(cfg)
Example #18
0
def setup_environment():
    """
    Prepare settings, strategy class and logging for the strategy worker.

    Parses ``--config``, ``--log-level``, ``--strategy`` and ``--partition-id``; command-line values for
    the strategy and partition id take precedence over the ``CRAWLING_STRATEGY`` and
    ``SCORING_PARTITION_ID`` settings. The resolved partition id is written back to the settings.

    :return: tuple ``(settings, strategy_class)``.
    :raises ValueError: when no strategy class path can be found, or when the partition id is negative
        or not below ``SPIDER_LOG_PARTITIONS``.
    """
    arg_parser = ArgumentParser(description="Frontera strategy worker.")
    arg_parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    arg_parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    arg_parser.add_argument(
        '--strategy', type=str,
        help='Crawling strategy class path')
    arg_parser.add_argument(
        '--partition-id', type=int,
        help="Instance partition id.")
    options = arg_parser.parse_args()
    settings = Settings(module=options.config)

    strategy_classpath = options.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # "is not None" so that an explicit partition id of 0 is honoured.
    if options.partition_id is not None:
        partition_id = options.partition_id
    else:
        partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Use the logging config file when one is configured and present; otherwise log to the console.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if not (logging_config_path and exists(logging_config_path)):
        logging.basicConfig(level=options.log_level)
        logger.setLevel(options.log_level)
        logger.addHandler(CONSOLE)
    else:
        fileConfig(logging_config_path)
    return settings, strategy_class
Example #19
0
 def __init__(self, manager):
     """
     Connect this backend to the message bus named by the manager's MESSAGE_BUS setting.

     :param manager: frontier manager; its ``settings.attributes``, ``request_model``,
         ``response_model`` and ``logger`` are read here.
     """
     self._manager = manager
     config = Settings(attributes=manager.settings.attributes)
     bus_class = load_object(config.get('MESSAGE_BUS'))
     self.mb = bus_class(config)

     # Codecs: the encoder sends bodies only when STORE_CONTENT is set.
     keep_body = config.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=keep_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)

     # Producer for the spider log, consumer for this spider's feed partition.
     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()
     self.partition_id = config.get('SPIDER_PARTITION_ID')
     self.consumer = feed.consumer(partition_id=self.partition_id)

     self._get_timeout = float(config.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(self._get_next_requests, manager.logger.manager.debug)
Example #20
0
    def seed_loader_setup(self, seeds_content=None):
        """
        Write a seeds file under ``self.tmp_path`` and return a FileSeedLoader that points at it.

        :param seeds_content: text to write; when omitted or empty, two example URLs are written.
        :return: FileSeedLoader built from a stand-in crawler whose settings carry SEEDS_SOURCE.
        """
        default_content = "\nhttps://www.example.com\nhttps://www.scrapy.org\n"
        seed_path = os.path.join(self.tmp_path, 'seeds.txt')
        text = seeds_content or default_content
        with open(seed_path, 'wb') as seeds_file:
            seeds_file.write(text.encode('utf-8'))
        assert os.path.isfile(seed_path)  # Failure of test itself

        cfg = Settings()
        cfg.SEEDS_SOURCE = seed_path
        # Minimal crawler stand-in: a bare class carrying only ``settings``.
        crawler = type('crawler', (object,), {})
        crawler.settings = cfg
        return FileSeedLoader(crawler)
Example #21
0
    def seed_loader_setup(self, seeds_content=None):
        """
        Create ``seeds.txt`` in ``self.tmp_path`` and build a FileSeedLoader for it.

        :param seeds_content: seeds text; a falsy value selects the built-in two-URL default.
        :return: FileSeedLoader constructed from a stand-in crawler with SEEDS_SOURCE set to the file.
        """
        fallback = "\nhttps://www.example.com\nhttps://www.scrapy.org\n"
        content = seeds_content if seeds_content else fallback
        target = os.path.join(self.tmp_path, 'seeds.txt')
        with open(target, 'wb') as out:
            out.write(content.encode('utf-8'))
        assert os.path.isfile(target)  # Failure of test itself

        loader_settings = Settings()
        loader_settings.SEEDS_SOURCE = target
        # Bare class used in place of a real crawler; only ``settings`` is attached.
        crawler = type('crawler', (object, ), {})
        crawler.settings = loader_settings
        return FileSeedLoader(crawler)
Example #22
0
 def strategy(self):
     """
     Build a DummyCrawlingStrategy from a FakeFrontierManager, a message bus stream and a states context.
     """
     fake_manager = FakeFrontierManager(Settings())
     bus_stream = MessageBusStream()
     states_context = StatesContext(MemoryStates(10))
     return DummyCrawlingStrategy.from_worker(fake_manager, bus_stream, states_context)
Example #23
0
    def __init__(self, settings=None):
        """
        Create the message bus and open the producers/consumers for the sw, db and spider roles.

        :param settings: Settings object used to configure the bus. When omitted, a fresh
            ``Settings()`` is created for this instance. The object is modified in place:
            SPIDER_FEED_PARTITIONS and QUEUE_HOSTNAME_PARTITIONING are overwritten.
        """
        # A ``Settings()`` default in the signature is evaluated once, at definition time, and
        # would then be shared (and mutated below) by every instance created without an argument.
        if settings is None:
            settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = MessageBus(settings)
        spiderlog = self.messagebus.spider_log()

        # sw
        self.sw_sl_c = spiderlog.consumer(partition_id=0, type='sw')
        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): the short pauses presumably let the bus endpoints settle - confirm.
        sleep(0.1)

        # db
        self.db_sl_c = spiderlog.consumer(partition_id=None, type='db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Example #24
0
    def __init__(self, cls, settings=None):
        """
        Instantiate the message bus class under test and open the producers/consumers
        for the sw, db and spider roles.

        :param cls: message bus class; called as ``cls(settings)``.
        :param settings: Settings object used to configure the bus. When omitted, a fresh
            ``Settings()`` is created for this instance. The object is modified in place:
            SPIDER_FEED_PARTITIONS, SPIDER_LOG_PARTITIONS and SPIDER_FEED_PARTITIONER are overwritten.
        """
        # A ``Settings()`` default in the signature is evaluated once, at definition time, and
        # would then be shared (and mutated below) by every instance created without an argument.
        if settings is None:
            settings = Settings()
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('SPIDER_LOG_PARTITIONS', 1)
        settings.set(
            'SPIDER_FEED_PARTITIONER',
            'frontera.contrib.backends.partitioners.Crc32NamePartitioner')
        self.messagebus = cls(settings)
        spiderlog = self.messagebus.spider_log()

        # sw
        self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b'sw')

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): the short pauses presumably let the bus endpoints settle - confirm.
        sleep(0.1)

        # db
        self.db_sl_c = spiderlog.consumer(partition_id=None, type=b'db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Example #25
0
 def from_settings(cls, settings=None, db_worker=False, strategy_worker=False):
     """
     Build a :class:`FrontierManager <frontera.core.manager.FrontierManager>` from a settings argument.

     The argument is resolved through ``Settings.object_from``; when no settings is given,
     :ref:`frontier default settings <frontier-default-settings>` are used.

     :param settings: settings argument handed to ``Settings.object_from``.
     :param db_worker: forwarded unchanged to the manager constructor.
     :param strategy_worker: forwarded unchanged to the manager constructor.
     :return: the new FrontierManager instance.
     """
     cfg = Settings.object_from(settings)
     manager_kwargs = dict(
         request_model=cfg.REQUEST_MODEL,
         response_model=cfg.RESPONSE_MODEL,
         backend=cfg.BACKEND,
         logger=cfg.LOGGER,
         event_log_manager=cfg.EVENT_LOG_MANAGER,
         middlewares=cfg.MIDDLEWARES,
         test_mode=cfg.TEST_MODE,
         max_requests=cfg.MAX_REQUESTS,
         max_next_requests=cfg.MAX_NEXT_REQUESTS,
         auto_start=cfg.AUTO_START,
         settings=cfg,
         canonicalsolver=cfg.CANONICAL_SOLVER,
         db_worker=db_worker,
         strategy_worker=strategy_worker,
     )
     return FrontierManager(**manager_kwargs)
Example #26
0
 def from_settings(cls, settings=None, db_worker=False, strategy_worker=False):
     """
     Create a :class:`FrontierManager <frontera.core.manager.FrontierManager>` configured from settings.

     ``settings`` is normalised with ``Settings.object_from``; if none is passed,
     :ref:`frontier default settings <frontier-default-settings>` are used. The two worker flags
     are passed through to the manager as given.
     """
     resolved = Settings.object_from(settings)
     # Components and limits are read from the resolved settings object.
     options = {
         'request_model': resolved.REQUEST_MODEL,
         'response_model': resolved.RESPONSE_MODEL,
         'backend': resolved.BACKEND,
         'logger': resolved.LOGGER,
         'event_log_manager': resolved.EVENT_LOG_MANAGER,
         'middlewares': resolved.MIDDLEWARES,
         'test_mode': resolved.TEST_MODE,
         'max_requests': resolved.MAX_REQUESTS,
         'max_next_requests': resolved.MAX_NEXT_REQUESTS,
         'auto_start': resolved.AUTO_START,
         'settings': resolved,
         'canonicalsolver': resolved.CANONICAL_SOLVER,
         'db_worker': db_worker,
         'strategy_worker': strategy_worker,
     }
     return FrontierManager(**options)
Example #27
0
def setup_environment():
    """
    Parse the strategy worker command line and prepare its runtime environment.

    Loads the settings module named by ``--config``, resolves the strategy class (``--strategy`` or the
    ``CRAWLING_STRATEGY`` setting), validates the partition id (``--partition-id`` or the
    ``SCORING_PARTITION_ID`` setting) and stores it in the settings, then configures logging.

    :return: ``(settings, strategy_class)``.
    :raises ValueError: if the strategy class path is missing, or the partition id is below zero or
        not below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    opts = cli.parse_args()
    settings = Settings(module=opts.config)

    # Strategy: the command line wins over the settings module.
    classpath = opts.strategy or settings.get('CRAWLING_STRATEGY')
    if not classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(classpath)

    # Partition id: compare against None so that an explicit 0 is not discarded.
    partition_id = opts.partition_id
    if partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Logging: prefer the configured file if it exists, else fall back to console logging.
    config_file = settings.get("LOGGING_CONFIG")
    if config_file and exists(config_file):
        fileConfig(config_file)
    else:
        level = opts.log_level
        logging.basicConfig(level=level)
        logger.setLevel(level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
 def mbb_setup(self, settings=None):
     """
     Build a MessageBusBackend around a minimal stand-in manager.

     :param settings: Settings to use; a fresh ``Settings()`` is created when omitted or falsy.
         The object is modified in place (MESSAGE_BUS and STORE_CONTENT are overwritten).
     """
     cfg = settings or Settings()
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.STORE_CONTENT = True
     # Bare class carrying only the attributes attached below.
     fake_manager = type('manager', (object, ), {})
     fake_manager.settings = cfg
     fake_manager.request_model = Request
     fake_manager.response_model = Response
     return MessageBusBackend(fake_manager)
Example #29
0
def main():
    """
    Command-line entry point: read options, load the settings module and start the server.

    A hostname or port given on the command line takes precedence over the
    ``ZMQ_HOSTNAME`` and ``ZMQ_BASE_PORT`` settings.
    """
    arg_parser = ArgumentParser(description="Crawl frontier worker.")
    arg_parser.add_argument(
        '--config',
        type=str,
        help='Settings module name, should be accessible by import.')
    arg_parser.add_argument(
        '--hostname',
        type=str,
        help='Hostname or IP address to bind. Default is 127.0.0.1')
    arg_parser.add_argument(
        '--log-level', '-L',
        type=str,
        default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    arg_parser.add_argument(
        '--port',
        type=int,
        help='Base port number, server will bind to 6 ports starting from base. '
             'Default is 5550')
    options = arg_parser.parse_args()

    config = Settings(module=options.config)
    # Falsy command-line values fall back to the settings module.
    bind_host = options.hostname or config.get("ZMQ_HOSTNAME")
    base_port = options.port or config.get("ZMQ_BASE_PORT")

    srv = Server(bind_host, base_port)
    srv.logger.setLevel(options.log_level)
    srv.start()
Example #30
0
    def setUp(self):
        """
        Ensure the frontier topics exist on the Kafka instance at 127.0.0.1:9092, then create
        the KafkaMessageBus and the producers/polling consumers for the sw, db and spider roles.
        """
        logging.basicConfig()
        stream_handler = logging.StreamHandler(stdout)
        kafka_logger = logging.getLogger("kafka")
        kafka_logger.setLevel(logging.INFO)
        kafka_logger.addHandler(stream_handler)

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")

        kafka_location = "127.0.0.1:9092"
        kafka_client = KafkaClient(kafka_location)
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            kafka_client.ensure_topic_exists(topic)
        kafka_client.close()

        cfg = Settings()
        cfg.set('KAFKA_LOCATION', kafka_location)
        cfg.set('SPIDER_FEED_PARTITIONS', 1)
        cfg.set('SPIDER_LOG_PARTITIONS', 1)
        cfg.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = KafkaMessageBus(cfg)
        spider_log = self.messagebus.spider_log()

        # sw
        sw_consumer = spider_log.consumer(partition_id=0, type=b'sw')
        self.sw_sl_c = KafkaConsumerPolling(sw_consumer)

        scoring = self.messagebus.scoring_log()
        self.sw_us_p = scoring.producer()

        # db
        db_consumer = spider_log.consumer(partition_id=None, type=b'db')
        self.db_sl_c = KafkaConsumerPolling(db_consumer)
        self.db_us_c = KafkaConsumerPolling(scoring.consumer())

        feed = self.messagebus.spider_feed()
        self.db_sf_p = feed.producer()

        # spider
        self.sp_sl_p = spider_log.producer()
        feed_consumer = feed.consumer(partition_id=0)
        self.sp_sf_c = KafkaConsumerPolling(feed_consumer)
        self.logger.debug("init is done")
Example #31
0
 def from_settings(cls, settings=None):
     """
     Build a SpiderFrontierManager from a settings argument resolved via ``Settings.object_from``.

     :param settings: settings argument handed to ``Settings.object_from``.
     :return: the new SpiderFrontierManager instance.
     """
     cfg = Settings.object_from(settings)
     manager_kwargs = dict(
         request_model=cfg.REQUEST_MODEL,
         response_model=cfg.RESPONSE_MODEL,
         backend=cfg.BACKEND,
         middlewares=cfg.MIDDLEWARES,
         max_next_requests=cfg.MAX_NEXT_REQUESTS,
         settings=cfg,
         canonicalsolver=cfg.CANONICAL_SOLVER,
     )
     return SpiderFrontierManager(**manager_kwargs)
 def mbb_setup(self, settings=None):
     """
     Return a MessageBusBackend built on a stand-in manager that uses the fake message bus.

     :param settings: optional Settings; replaced by a fresh ``Settings()`` when omitted or falsy.
         MESSAGE_BUS and STORE_CONTENT are overwritten on the object that is used.
     """
     config = settings if settings else Settings()
     config.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     config.STORE_CONTENT = True
     # To exercise the json codec instead, enable:
     # config.MESSAGE_BUS_CODEC='frontera.contrib.backends.remote.codecs.json'
     stub_manager = type('manager', (object, ), {})
     stub_manager.settings = config
     stub_manager.request_model = Request
     stub_manager.response_model = Response
     backend = MessageBusBackend(stub_manager)
     return backend
Example #33
0
 def setup_frontier_manager(self, settings=None):
     """
     Build a FrontierManager that uses the fake backend, fake middlewares and fake canonical solver.

     :param settings: optional Settings; a fresh ``Settings()`` is created when omitted or falsy.
         BACKEND, MIDDLEWARES and CANONICAL_SOLVER are overwritten on the object that is used.
     """
     cfg = settings or Settings()
     cfg.BACKEND = 'tests.mocks.components.FakeBackend'
     cfg.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware',
                        'tests.mocks.components.FakeMiddlewareModifySeeds',
                        'tests.mocks.components.FakeMiddlewareModifyResponse',
                        'tests.mocks.components.FakeMiddlewareModifyLinks']
     cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
     manager = FrontierManager.from_settings(cfg)
     return manager
Example #34
0
 def sw_setup_add_seeds(self):
     """
     Return a StrategyWorker set up with the mock CrawlingStrategy, the fake message bus
     and a spider log consumer batch size of 100.
     """
     config = Settings()
     config.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
     config.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     config.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
     config.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
     strategy_worker = StrategyWorker(config, True)
     return strategy_worker
Example #35
0
    def test_blocking_middleware(self):
        """
        The blocking middleware at index 4 must stop every event before the later middlewares,
        the canonical solver and the backend.
        """
        config = Settings()
        config.BACKEND = 'tests.mocks.components.FakeBackend'
        config.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware',
                              'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
                              'tests.mocks.components.FakeMiddleware',
                              'tests.mocks.components.FakeMiddlewareModifySeeds',
                              'tests.mocks.components.FakeMiddlewareBlocking',
                              'tests.mocks.components.FakeMiddlewareModifyResponse',
                              'tests.mocks.components.FakeMiddlewareModifyLinks']
        config.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
        config.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
        frontier = LocalFrontierManager.from_settings(config)

        SEEDS_FILE.seek(0)
        frontier.add_seeds(SEEDS_FILE)
        page = Response(r1.url, request=r1)
        frontier.page_crawled(page)
        frontier.links_extracted(r1, links=[r2])
        frontier.request_error(r3, 'error')

        middlewares = frontier.middlewares
        # Backend: no seeds, responses, links or errors.
        assert [len(bucket) for bucket in frontier.backend.lists] == [0] * 4
        # Fake middlewares at indices 2-4: 3 requests each.
        assert [len(middlewares[pos].requests) for pos in range(2, 5)] == [3] * 3
        # Fake middlewares at indices 2-4: one response, one link, one error each.
        seen = [[len(bucket) for bucket in middlewares[pos].lists[1:]] for pos in range(2, 5)]
        assert seen == [[1] * 3] * 3
        # Middlewares 5-6 and the canonical solver: nothing.
        unseen = [[len(bucket) for bucket in middlewares[pos].lists] for pos in range(5, 7)]
        assert unseen == [[0] * 4] * 2
        assert [len(bucket) for bucket in frontier.canonicalsolver.lists] == [0] * 4
Example #36
0
 def setup_frontier_manager(self, settings=None):
     """
     Build a LocalFrontierManager with the fake backend, domain/fingerprint plus fake middlewares,
     the fake canonical solver and the mock crawling strategy.

     :param settings: optional Settings; a fresh ``Settings()`` is created when omitted or falsy.
         BACKEND, MIDDLEWARES, CANONICAL_SOLVER and STRATEGY are overwritten on the object used.
     """
     cfg = settings or Settings()
     cfg.BACKEND = 'tests.mocks.components.FakeBackend'
     cfg.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware',
                        'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
                        'tests.mocks.components.FakeMiddleware',
                        'tests.mocks.components.FakeMiddlewareModifySeeds',
                        'tests.mocks.components.FakeMiddlewareModifyResponse',
                        'tests.mocks.components.FakeMiddlewareModifyLinks']
     cfg.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver'
     cfg.STRATEGY = 'tests.mocks.components.CrawlingStrategy'
     manager = LocalFrontierManager.from_settings(cfg)
     return manager
Example #37
0
 def dbw_setup(self, distributed=False):
     """
     Build a DBWorker (``partitions="0"``) wired to the fake message bus.

     :param distributed: when truthy use FakeDistributedBackend, otherwise FakeBackend.
     """
     cfg = Settings()
     cfg.MAX_NEXT_REQUESTS = 64
     cfg.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
     cfg.BACKEND = ('tests.mocks.components.FakeDistributedBackend' if distributed
                    else 'tests.mocks.components.FakeBackend')
     worker = DBWorker(cfg, False, False, False, partitions="0")
     return worker
Example #38
0
    def setUp(self):
        """
        Create a temp directory holding two local seed files and an S3SeedLoader
        configured with an ``s3://`` seeds source and dummy AWS credentials.
        """
        self.tmp_path = mkdtemp()
        cfg = Settings()
        cfg.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
        cfg.SEEDS_AWS_ACCESS_KEY = 'access_key'
        cfg.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
        # Minimal crawler stand-in: a bare class carrying only ``settings``.
        crawler = type('crawler', (object,), {})
        crawler.settings = cfg

        self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
        self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
        seed_files = (
            (self.seed_path_1, "\nhttps://www.example.com\nhttps://www.scrapy.org\n"),
            (self.seed_path_2, "\nhttps://www.dmoz.org\nhttps://www.test.com\n"),
        )
        for path, content in seed_files:
            with open(path, 'wb') as seeds_file:
                seeds_file.write(content.encode('utf-8'))
        self.seed_loader = S3SeedLoader(crawler)
Example #39
0
    def setUp(self):
        """
        Prepare the fixture: a temp directory, two seed files written into it, and an
        S3SeedLoader whose settings point at ``s3://some-bucket/seeds-folder``.
        """
        self.tmp_path = mkdtemp()

        loader_settings = Settings()
        loader_settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
        loader_settings.SEEDS_AWS_ACCESS_KEY = 'access_key'
        loader_settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
        # Bare class used in place of a real crawler; only ``settings`` is attached.
        crawler = type('crawler', (object, ), {})
        crawler.settings = loader_settings

        self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
        self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
        first_seeds = "\nhttps://www.example.com\nhttps://www.scrapy.org\n"
        second_seeds = "\nhttps://www.dmoz.org\nhttps://www.test.com\n"

        with open(self.seed_path_1, 'wb') as out:
            out.write(first_seeds.encode('utf-8'))
        with open(self.seed_path_2, 'wb') as out:
            out.write(second_seeds.encode('utf-8'))

        self.seed_loader = S3SeedLoader(crawler)
Example #40
0
    def setUp(self):
        """
        Set up logging, make sure the three frontier topics exist on the Kafka instance at
        127.0.0.1:9092, and create the message bus endpoints for the sw, db and spider roles.
        """
        logging.basicConfig()
        console = logging.StreamHandler(stdout)
        kafka_log = logging.getLogger("kafka")
        kafka_log.setLevel(logging.INFO)
        kafka_log.addHandler(console)

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")

        kafka_location = "127.0.0.1:9092"
        admin = KafkaClient(kafka_location)
        for topic_name in ("frontier-todo", "frontier-done", "frontier-score"):
            admin.ensure_topic_exists(topic_name)
        admin.close()

        config = Settings()
        for key, value in (('KAFKA_LOCATION', kafka_location),
                           ('SPIDER_FEED_PARTITIONS', 1),
                           ('SPIDER_LOG_PARTITIONS', 1),
                           ('QUEUE_HOSTNAME_PARTITIONING', True)):
            config.set(key, value)
        self.messagebus = KafkaMessageBus(config)
        spiderlog = self.messagebus.spider_log()

        # sw
        self.sw_sl_c = KafkaConsumerPolling(
            spiderlog.consumer(partition_id=0, type=b'sw'))

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # db
        self.db_sl_c = KafkaConsumerPolling(
            spiderlog.consumer(partition_id=None, type=b'db'))
        self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = KafkaConsumerPolling(
            spider_feed.consumer(partition_id=0))
        self.logger.debug("init is done")
Example #41
0
def test_kafka_message_bus_integration():
    """
    Run the MessageBusTester activity checks against a KafkaMessageBus.

    Expects a Kafka broker listening on 127.0.0.1:9092. The frontier topics
    are created beforehand if they do not exist yet.
    """
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # The client is only needed to create the topics; close it so the
        # connection is not left open for the rest of the test run.
        client.close()

    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
Example #42
0
 def from_settings(cls, settings=None):
     """
     Returns a :class:`FrontierManager <frontera.core.manager.FrontierManager>`  instance initialized with \
     the passed settings argument. If no settings is given,
     :ref:`frontier default settings <frontier-default-settings>` are used.

     :param settings: any value accepted by ``Settings.object_from``; may be omitted.
     :returns: a new instance of ``cls`` configured from the resolved settings.
     """
     manager_settings = Settings.object_from(settings)
     # Apply DEFAULT_SETTINGS to the resolved settings object, not to the raw
     # argument: the argument defaults to None, which has no set_from_dict().
     # NOTE(review): assumes Settings.object_from returns the same object when
     # given a Settings instance, so existing callers see no change - confirm.
     manager_settings.set_from_dict(DEFAULT_SETTINGS)
     return cls(request_model=manager_settings.REQUEST_MODEL,
                response_model=manager_settings.RESPONSE_MODEL,
                backend=manager_settings.BACKEND,
                middlewares=manager_settings.MIDDLEWARES,
                test_mode=manager_settings.TEST_MODE,
                max_requests=manager_settings.MAX_REQUESTS,
                max_next_requests=manager_settings.MAX_NEXT_REQUESTS,
                auto_start=manager_settings.AUTO_START,
                settings=manager_settings,
                canonicalsolver=manager_settings.CANONICAL_SOLVER)
    def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero(self):
        """
        mbb_setup must raise ValueError when SPIDER_PARTITION_ID is greater
        than, equal to, or (being negative) below the valid range given by
        SPIDER_FEED_PARTITIONS.
        """
        settings = Settings()
        settings.SPIDER_FEED_PARTITIONS = 1
        # 2 -> above the partition count, 1 -> equal to it, -1 -> negative
        for invalid_partition_id in (2, 1, -1):
            settings.SPIDER_PARTITION_ID = invalid_partition_id
            with self.assertRaises(ValueError):
                self.mbb_setup(settings)
Example #44
0
 def from_settings(cls, settings=None):
     """
     Build a :class:`FrontierManager <frontera.core.manager.FrontierManager>` from ``settings``.

     ``settings`` is handed to the :class:`Settings <frontera.settings.Settings>` constructor, so it
     can be whatever that constructor accepts as its first argument. When it is omitted,
     :ref:`frontier default settings <frontier-default-settings>` are used.
     """
     cfg = Settings(settings)
     manager_kwargs = dict(
         request_model=cfg.REQUEST_MODEL,
         response_model=cfg.RESPONSE_MODEL,
         backend=cfg.BACKEND,
         logger=cfg.LOGGER,
         event_log_manager=cfg.EVENT_LOG_MANAGER,
         middlewares=cfg.MIDDLEWARES,
         test_mode=cfg.TEST_MODE,
         max_requests=cfg.MAX_REQUESTS,
         max_next_requests=cfg.MAX_NEXT_REQUESTS,
         auto_start=cfg.AUTO_START,
         settings=cfg,
     )
     return FrontierManager(**manager_kwargs)
Example #45
0
 def __init__(self, manager):
     """
     Wire this object to the message bus configured in the manager settings.

     :param manager: frontier manager; its settings attributes, request/response
         models and logger are read here.
     """
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)

     # Message bus class is configured by path and instantiated with the settings
     bus_class = load_object(settings.get('MESSAGE_BUS'))
     self.mb = bus_class(settings)

     # Codec pair; STORE_CONTENT controls the encoder's send_body flag
     self._encoder = Encoder(manager.request_model,
                             send_body=settings.get('STORE_CONTENT'))
     self._decoder = Decoder(manager.request_model, manager.response_model)

     # Spider log producer and spider feed consumer for this partition
     self.spider_log_producer = self.mb.spider_log().producer()
     feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = feed.consumer(partition_id=self.partition_id)

     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Example #46
0
    def __init__(self, request_model, response_model, settings=None):
        """
        Store the settings, announce start-up in the "manager" logger and load
        the request/response model classes.

        :param request_model: object path of the request model, resolved with load_object
        :param response_model: object path of the response model, resolved with load_object
        :param settings: settings object; a fresh Settings() is used when falsy
        """
        self._settings = settings if settings else Settings()
        self._logger = logging.getLogger("manager")

        # Start-up banner
        self._logger.info('-' * 80)
        self._logger.info('Starting Frontier Manager...')

        # Request model must derive from models.Request
        request_cls = load_object(request_model)
        self._request_model = request_cls
        assert issubclass(request_cls, models.Request), \
            "Request model '%s' must subclass 'Request'" % request_cls.__name__

        # Response model must derive from models.Response
        response_cls = load_object(response_model)
        self._response_model = response_cls
        assert issubclass(response_cls, models.Response), \
            "Response model '%s' must subclass 'Response'" % response_cls.__name__
Example #47
0
 def test_invalid_s3_seed_source(self):
     """S3SeedLoader must raise NotConfigured when SEEDS_SOURCE is 'invalid_url'."""
     bad_settings = Settings()
     bad_settings.SEEDS_SOURCE = 'invalid_url'

     # Minimal stand-in for a crawler: a bare class carrying only ``settings``
     crawler = type('crawler', (object,), {})
     crawler.settings = bad_settings

     with self.assertRaises(NotConfigured):
         S3SeedLoader(crawler)
Example #48
0
def test_settings_attributes_can_be_assigned():
    """A value assigned to a new Settings attribute can be read back unchanged."""
    expected = 10
    cfg = Settings()
    cfg.NEW_ATTRIBUTE = expected
    assert cfg.NEW_ATTRIBUTE == expected
Example #49
0
        self.states.update_cache(links)
        self.states.update_cache(response)

    def on_request_error(self, request, error):
        """
        Handle a failed request: log it, load its state, notify the strategy
        and write the resulting state back to the cache.
        """
        logger.debug("Page error %s (%s)", request.url, error)
        states = self.states
        states.set_states(request)
        self.strategy.page_error(request, error)
        states.update_cache(request)


if __name__ == '__main__':
    # Script entry point: parse the command line, resolve the crawling
    # strategy class and run the strategy worker.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument(
        '--strategy', type=str,
        help='Crawling strategy class path')
    args = parser.parse_args()

    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)

    settings = Settings(module=args.config)
    # Command line option wins over the settings file
    strategy_classpath = args.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")

    strategy_class = load_object(strategy_classpath)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Example #50
0
 def from_settings(cls, settings=None):
     """Return a FakeFrontierManager built from ``Settings.object_from(settings)``."""
     return FakeFrontierManager(Settings.object_from(settings))
Example #51
0
def test_settings_passed_as_attributes_can_be_found():
    """Values given through the ``attributes`` argument are returned by ``get``."""
    initial = {'SETTING': 'value'}
    cfg = Settings(attributes=initial)
    assert cfg.get('SETTING') == 'value'
Example #52
0
def setup_environment():
    """
    Parse the strategy worker command line and prepare settings and logging.

    The strategy class path, partition id, Json Rpc port and strategy
    arguments taken from the command line are written into the settings
    loaded from the module given with ``--config``.

    :returns: tuple ``(settings, add_seeds, seeds_url)``
    :raises ValueError: when no strategy class path is available, or when the
        partition id is negative or not below SPIDER_LOG_PARTITIONS
    """
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument(
        '--strategy', type=str,
        help='Crawling strategy class path')
    parser.add_argument(
        '--partition-id', type=int,
        help="Instance partition id.")
    parser.add_argument(
        '--port', type=int,
        help="Json Rpc service port to listen.")
    parser.add_argument(
        '--args', '-a', nargs='*', type=str,
        help="Optional arguments for crawling strategy, "
             "in a form of key=value separated with space")
    parser.add_argument(
        '--add-seeds', action='store_true',
        help="Run in add seeds mode. Worker finishes after running "
             "of strategy add_seeds method")
    parser.add_argument(
        '--seeds-url', type=str,
        help="Seeds url. S3 and native urlopen schemas are currently "
             "supported, implies add seeds run mode")
    cli = parser.parse_args()
    settings = Settings(module=cli.config)

    # Strategy: command line option wins over the settings module
    strategy_classpath = cli.strategy or settings.get('STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_classpath)

    # Partition id: fall back to the settings value only when the option is absent
    if cli.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = cli.partition_id
    beyond_last = partition_id >= settings.get('SPIDER_LOG_PARTITIONS')
    if beyond_last or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if cli.port:
        settings.set('JSONRPC_PORT', cli.port)

    # key=value pairs; a missing or empty value is stored as None
    pairs = (item.partition("=") for item in cli.args or ())
    settings.set("STRATEGY_ARGS", {key: value or None for key, _, value in pairs})

    # Logging: use the configured file when it exists, otherwise log to console
    logging_config_path = settings.get("LOGGING_CONFIG")
    use_file_config = logging_config_path and exists(logging_config_path)
    if not use_file_config:
        logging.basicConfig(level=cli.log_level)
        logger.setLevel(cli.log_level)
        logger.addHandler(CONSOLE)
    else:
        fileConfig(logging_config_path, disable_existing_loggers=False)

    return settings, cli.add_seeds, cli.seeds_url
Example #53
0
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    # Script entry point: parse the command line, resolve the crawling
    # strategy class, configure logging and run the strategy worker.
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument(
        '--strategy', type=str,
        help='Crawling strategy class path')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    # Command line option wins over the settings file
    strategy_classpath = args.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    # Logging: use the configured file when it exists, otherwise log to console
    logging_config_path = settings.get("LOGGING_CONFIG")
    if not (logging_config_path and exists(logging_config_path)):
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    else:
        fileConfig(logging_config_path)

    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Example #54
0
def run_add_seeds(settings, seeds_file):
    """
    Add the seeds stored in a local file through a LocalFrontierManager.

    :param settings: settings passed to ``LocalFrontierManager.from_settings``
    :param seeds_file: path of the seeds file; it is opened in binary mode
    """
    # The context manager closes the seeds file even when adding seeds fails;
    # previously the handle was never closed.
    with open(seeds_file, "rb") as fh:
        logger.info("Starting local seeds addition from file %s", seeds_file)

        manager = LocalFrontierManager.from_settings(settings)
        manager.add_seeds(fh)
        manager.stop()
        manager.close()

    logger.info("Seeds addition finished")


if __name__ == '__main__':
    # Script entry point: parse the command line, configure logging and add
    # the seeds from the given file.
    parser = ArgumentParser(description="Frontera local add seeds utility")
    parser.add_argument(
        '--config', type=str, required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument(
        '--seeds-file', type=str, required=True,
        help="Seeds file path")
    args = parser.parse_args()

    settings = Settings(module=args.config)

    # Logging: use the configured file when it exists, otherwise log to console
    logging_config_path = settings.get("LOGGING_CONFIG")
    if not (logging_config_path and exists(logging_config_path)):
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    else:
        fileConfig(logging_config_path, disable_existing_loggers=False)

    run_add_seeds(settings, args.seeds_file)
Example #55
0
def test_fallsback_to_frontera_default_settings():
    """A Settings object created without arguments reports MAX_NEXT_REQUESTS as 0."""
    max_next_requests = Settings().get('MAX_NEXT_REQUESTS')
    assert max_next_requests == 0
Example #56
0
 def __init__(self):
     """Initialise the base tester with ZMQ_ADDRESS set to the IPv6 loopback address."""
     ipv6_settings = Settings()
     ipv6_settings.set('ZMQ_ADDRESS', '::1')
     super(IPv6MessageBusTester, self).__init__(ipv6_settings)
Example #57
0
    # Command line options of the DB worker (the parser itself is created
    # above this excerpt).
    parser.add_argument('--no-batches', action='store_true',
                        help='Disables generation of new batches.')
    parser.add_argument('--no-incoming', action='store_true',
                        help='Disables spider log processing.')
    parser.add_argument('--no-scoring', action='store_true',
                        help='Disables scoring log processing.')
    parser.add_argument('--partitions', type=int, nargs='*',
                        help='Optional partitions range for batch generator')
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument('--port', type=int, help="Json Rpc service port to listen.")
    args = parser.parse_args()

    settings = Settings(module=args.config)
    if args.port:
        # The port is stored as a one-element list.
        # NOTE(review): presumably WorkerJsonRpcService expects a list of ports - confirm.
        settings.set("JSONRPC_PORT", [args.port])

    # Use the logging configuration file when one is configured and exists;
    # otherwise fall back to basic console logging at the requested level.
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and os.path.exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    # Build the worker from the parsed flags and expose it through the Json Rpc service
    worker = DBWorker(settings, args.no_batches, args.no_incoming,
                      args.no_scoring, partitions=args.partitions)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
Example #58
0
def test_settings_on_a_python_module_are_loaded():
    """Settings built from a module path expose the values defined in that module."""
    module_path = 'frontera.tests.scrapy_spider.frontera.settings'
    cfg = Settings(module_path)
    assert cfg.get('MAX_REQUESTS') == 5
Example #59
0
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)