Exemple #1
0
    def __init__(self):
        """Create a message bus plus the producers and consumers used by the tester.

        Endpoints are created in three groups, labelled ``sw``, ``db`` and
        ``spider`` below, and each group is followed by a 0.1 second pause.
        """
        bus_settings = Settings()
        bus_settings.set("SPIDER_FEED_PARTITIONS", 1)
        bus_settings.set("QUEUE_HOSTNAME_PARTITIONING", True)
        self.mb = MessageBus(bus_settings)
        spider_log = self.mb.spider_log()

        # sw: consumes partition 0 of the spider log, produces to the scoring log
        self.sw_sl_c = spider_log.consumer(partition_id=0, type="sw")
        scoring_log = self.mb.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): presumably lets the endpoints finish connecting -- confirm
        sleep(0.1)

        # db: consumes the spider log (no partition id) and the scoring log,
        # produces to the spider feed
        self.db_sl_c = spider_log.consumer(partition_id=None, type="db")
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.mb.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider: produces to the spider log, consumes spider feed partition 0
        self.sp_sl_p = spider_log.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
    def __init__(self):
        """Create a message bus plus the producers and consumers used by the tester.

        Endpoints are created in three groups, labelled ``sw``, ``db`` and
        ``spider`` below, and each group is followed by a 0.1 second pause.
        """
        bus_settings = Settings()
        bus_settings.set('SPIDER_FEED_PARTITIONS', 1)
        bus_settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.mb = MessageBus(bus_settings)
        spider_log = self.mb.spider_log()

        # sw: consumes partition 0 of the spider log, produces to the scoring log
        self.sw_sl_c = spider_log.consumer(partition_id=0, type='sw')
        scoring_log = self.mb.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # NOTE(review): presumably lets the endpoints finish connecting -- confirm
        sleep(0.1)

        # db: consumes the spider log (no partition id) and the scoring log,
        # produces to the spider feed
        self.db_sl_c = spider_log.consumer(partition_id=None, type='db')
        self.db_us_c = scoring_log.consumer()

        spider_feed = self.mb.spider_feed()
        self.db_sf_p = spider_feed.producer()

        sleep(0.1)

        # spider: produces to the spider log, consumes spider feed partition 0
        self.sp_sl_p = spider_log.producer()
        self.sp_sf_c = spider_feed.consumer(0)

        sleep(0.1)
Exemple #3
0
def setup_environment():
    """Parse the command line and prepare settings, strategy class and logging.

    Returns:
        A ``(settings, strategy_class)`` tuple: the loaded settings object
        (with ``SCORING_PARTITION_ID`` updated) and the object loaded from the
        strategy class path.

    Raises:
        ValueError: if no strategy class path is available from the command
            line or the settings, or if the partition id is negative or not
            below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    options = cli.parse_args()

    settings = Settings(module=options.config)

    # The command line option wins; the settings value is the fallback.
    strategy_path = options.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_path:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_path)

    # An explicit --partition-id (including 0) overrides the settings value.
    if options.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = options.partition_id
    partitions_total = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partitions_total or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Use the logging config file when it is configured and present,
    # otherwise fall back to basic console logging at the requested level.
    log_config = settings.get("LOGGING_CONFIG")
    has_config_file = log_config and exists(log_config)
    if has_config_file:
        fileConfig(log_config)
    else:
        level = options.log_level
        logging.basicConfig(level=level)
        logger.setLevel(level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Exemple #4
0
def setup_environment():
    """Parse the command line and prepare settings, strategy class and logging.

    Returns:
        A ``(settings, strategy_class)`` tuple: the loaded settings object
        (with ``SCORING_PARTITION_ID`` updated) and the object loaded from the
        strategy class path.

    Raises:
        ValueError: if no strategy class path is available from the command
            line or the settings, or if the partition id is negative or not
            below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    options = cli.parse_args()

    settings = Settings(module=options.config)

    # The command line option wins; the settings value is the fallback.
    strategy_path = options.strategy or settings.get('CRAWLING_STRATEGY')
    if not strategy_path:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_path)

    # An explicit --partition-id (including 0) overrides the settings value.
    if options.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = options.partition_id
    partitions_total = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partitions_total or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    # Use the logging config file when it is configured and present,
    # otherwise fall back to basic console logging at the requested level.
    log_config = settings.get("LOGGING_CONFIG")
    has_config_file = log_config and exists(log_config)
    if has_config_file:
        fileConfig(log_config)
    else:
        level = options.log_level
        logging.basicConfig(level=level)
        logger.setLevel(level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Exemple #5
0
 def test_close_manager(self):
     """Build a strategy-worker manager with the Redis backend, check it, close it."""
     backend_path = 'frontera.contrib.backends.redis_backend.RedisBackend'
     config = Settings(module='frontera.settings.default_settings')
     config.set('BACKEND', backend_path)
     manager = WorkerFrontierManager.from_settings(config, strategy_worker=True)
     # The configured class path must resolve to the RedisBackend class.
     self.assertEqual(RedisBackend, manager.backend.__class__)
     manager.close()
Exemple #6
0
def setup_environment():
    """Parse the command line and prepare settings and logging for the worker.

    The resolved strategy class path, partition id, optional JSON-RPC port and
    strategy arguments are written back into the settings object.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple, where the last two are
        the values of the ``--add-seeds`` and ``--seeds-url`` options.

    Raises:
        ValueError: if no strategy class path is available from the command
            line or the settings, or if the partition id is negative or not
            below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    cli.add_argument('--port', type=int,
                     help="Json Rpc service port to listen.")
    cli.add_argument('--args', '-a', nargs='*', type=str,
                     help="Optional arguments for crawling strategy, "
                          "in a form of key=value separated with space")
    cli.add_argument('--add-seeds', action='store_true',
                     help="Run in add seeds mode. Worker finishes after running "
                          "of strategy add_seeds method")
    cli.add_argument('--seeds-url', type=str,
                     help="Seeds url. S3 and native urlopen schemas are currently "
                          "supported, implies add seeds run mode")
    options = cli.parse_args()

    settings = Settings(module=options.config)

    # The command line option wins; the settings value is the fallback.
    strategy_path = options.strategy or settings.get('STRATEGY')
    if not strategy_path:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_path)

    # An explicit --partition-id (including 0) overrides the settings value.
    if options.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = options.partition_id
    partitions_total = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partitions_total or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if options.port:
        settings.set('JSONRPC_PORT', options.port)

    # Each "key=value" item becomes a dict entry; a missing or empty value
    # is stored as None.
    strategy_args = {}
    for item in options.args or ():
        name, _, raw_value = item.partition("=")
        strategy_args[name] = raw_value or None
    settings.set("STRATEGY_ARGS", strategy_args)

    # Use the logging config file when it is configured and present,
    # otherwise fall back to basic console logging at the requested level.
    log_config = settings.get("LOGGING_CONFIG")
    has_config_file = log_config and exists(log_config)
    if has_config_file:
        fileConfig(log_config, disable_existing_loggers=False)
    else:
        level = options.log_level
        logging.basicConfig(level=level)
        logger.setLevel(level)
        logger.addHandler(CONSOLE)

    return settings, options.add_seeds, options.seeds_url
Exemple #7
0
    def setUp(self):
        """Prepare logging, the Kafka topics and every bus endpoint for a test.

        Ensures the three ``frontier-*`` topics exist on the broker at
        ``127.0.0.1:9092``, then builds a ``KafkaMessageBus`` and the
        producers and polling consumers for the ``sw``, ``db`` and ``spider``
        groups labelled below.
        """
        logging.basicConfig()
        # Send the "kafka" logger's INFO output to stdout as well.
        kafka_logger = logging.getLogger("kafka")
        kafka_logger.setLevel(logging.INFO)
        kafka_logger.addHandler(logging.StreamHandler(stdout))

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")

        broker = "127.0.0.1:9092"
        topic_client = KafkaClient(broker)
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            topic_client.ensure_topic_exists(topic)
        topic_client.close()

        bus_settings = Settings()
        bus_settings.set('KAFKA_LOCATION', broker)
        bus_settings.set('SPIDER_FEED_PARTITIONS', 1)
        bus_settings.set('SPIDER_LOG_PARTITIONS', 1)
        bus_settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = KafkaMessageBus(bus_settings)
        sl_stream = self.messagebus.spider_log()

        # sw
        sw_consumer = sl_stream.consumer(partition_id=0, type=b'sw')
        self.sw_sl_c = KafkaConsumerPolling(sw_consumer)

        us_stream = self.messagebus.scoring_log()
        self.sw_us_p = us_stream.producer()

        # db
        db_consumer = sl_stream.consumer(partition_id=None, type=b'db')
        self.db_sl_c = KafkaConsumerPolling(db_consumer)
        self.db_us_c = KafkaConsumerPolling(us_stream.consumer())

        sf_stream = self.messagebus.spider_feed()
        self.db_sf_p = sf_stream.producer()

        # spider
        self.sp_sl_p = sl_stream.producer()
        feed_consumer = sf_stream.consumer(partition_id=0)
        self.sp_sf_c = KafkaConsumerPolling(feed_consumer)
        self.logger.debug("init is done")
    def setUp(self):
        """Prepare logging, the Kafka topics and every bus endpoint for a test.

        Ensures the three ``frontier-*`` topics exist on the broker at
        ``127.0.0.1:9092``, then builds a ``KafkaMessageBus`` and the
        producers and polling consumers for the ``sw``, ``db`` and ``spider``
        groups labelled below.
        """
        logging.basicConfig()
        # Send the "kafka" logger's INFO output to stdout as well.
        kafka_logger = logging.getLogger("kafka")
        kafka_logger.setLevel(logging.INFO)
        kafka_logger.addHandler(logging.StreamHandler(stdout))

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")

        broker = "127.0.0.1:9092"
        topic_client = KafkaClient(broker)
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            topic_client.ensure_topic_exists(topic)
        topic_client.close()

        bus_settings = Settings()
        bus_settings.set('KAFKA_LOCATION', broker)
        bus_settings.set('SPIDER_FEED_PARTITIONS', 1)
        bus_settings.set('SPIDER_LOG_PARTITIONS', 1)
        bus_settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = KafkaMessageBus(bus_settings)
        sl_stream = self.messagebus.spider_log()

        # sw
        sw_consumer = sl_stream.consumer(partition_id=0, type=b'sw')
        self.sw_sl_c = KafkaConsumerPolling(sw_consumer)

        us_stream = self.messagebus.scoring_log()
        self.sw_us_p = us_stream.producer()

        # db
        db_consumer = sl_stream.consumer(partition_id=None, type=b'db')
        self.db_sl_c = KafkaConsumerPolling(db_consumer)
        self.db_us_c = KafkaConsumerPolling(us_stream.consumer())

        sf_stream = self.messagebus.spider_feed()
        self.db_sf_p = sf_stream.producer()

        # spider
        self.sp_sl_p = sl_stream.producer()
        feed_consumer = sf_stream.consumer(partition_id=0)
        self.sp_sf_c = KafkaConsumerPolling(feed_consumer)
        self.logger.debug("init is done")
def test_kafka_message_bus_integration():
    """Run the message bus tester end to end against a local Kafka broker.

    Connects to the broker at ``127.0.0.1:9092``, makes sure the three
    ``frontier-*`` topics exist, then drives a ``MessageBusTester`` built on
    ``KafkaMessageBus`` and checks the message counts reported for each
    activity.
    """
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    try:
        for topic in ("frontier-todo", "frontier-done", "frontier-score"):
            client.ensure_topic_exists(topic)
    finally:
        # The client is only used to create topics; close it so its
        # connection is released even if topic creation fails.
        client.close()

    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
Exemple #10
0
 def __init__(self):
     """Initialise the parent tester with ``ZMQ_ADDRESS`` set to ``::1``."""
     ipv6_settings = Settings()
     # '::1' is the IPv6 loopback address.
     ipv6_settings.set('ZMQ_ADDRESS', '::1')
     super(IPv6MessageBusTester, self).__init__(ipv6_settings)
Exemple #11
0
    parser.add_argument('--no-incoming', action='store_true',
                        help='Disables spider log processing.')
    parser.add_argument('--no-scoring', action='store_true',
                        help='Disables scoring log processing.')
    parser.add_argument('--partitions', type=int, nargs='*',
                        help='Optional partitions range for batch generator')
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument('--port', type=int, help="Json Rpc service port to listen.")
    args = parser.parse_args()

    settings = Settings(module=args.config)
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and os.path.exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    worker = DBWorker(settings, args.no_batches, args.no_incoming,
                      args.no_scoring, partitions=args.partitions)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
    worker.run()
 def __init__(self):
     """Initialise the parent tester with ``ZMQ_ADDRESS`` set to ``::1``."""
     ipv6_settings = Settings()
     # '::1' is the IPv6 loopback address.
     ipv6_settings.set('ZMQ_ADDRESS', '::1')
     super(IPv6MessageBusTester, self).__init__(ipv6_settings)
Exemple #13
0
def setup_environment():
    """Parse the command line and prepare settings and logging for the worker.

    The resolved strategy class path, partition id, optional JSON-RPC port and
    strategy arguments are written back into the settings object.

    Returns:
        A ``(settings, add_seeds, seeds_url)`` tuple, where the last two are
        the values of the ``--add-seeds`` and ``--seeds-url`` options.

    Raises:
        ValueError: if no strategy class path is available from the command
            line or the settings, or if the partition id is negative or not
            below ``SPIDER_LOG_PARTITIONS``.
    """
    cli = ArgumentParser(description="Frontera strategy worker.")
    cli.add_argument('--config', type=str, required=True,
                     help='Settings module name, should be accessible by import')
    cli.add_argument('--log-level', '-L', type=str, default='INFO',
                     help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    cli.add_argument('--strategy', type=str,
                     help='Crawling strategy class path')
    cli.add_argument('--partition-id', type=int,
                     help="Instance partition id.")
    cli.add_argument('--port', type=int,
                     help="Json Rpc service port to listen.")
    cli.add_argument('--args', '-a', nargs='*', type=str,
                     help="Optional arguments for crawling strategy, "
                          "in a form of key=value separated with space")
    cli.add_argument('--add-seeds', action='store_true',
                     help="Run in add seeds mode. Worker finishes after running "
                          "of strategy add_seeds method")
    cli.add_argument('--seeds-url', type=str,
                     help="Seeds url. S3 and native urlopen schemas are currently "
                          "supported, implies add seeds run mode")
    options = cli.parse_args()

    settings = Settings(module=options.config)

    # The command line option wins; the settings value is the fallback.
    strategy_path = options.strategy or settings.get('STRATEGY')
    if not strategy_path:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_path)

    # An explicit --partition-id (including 0) overrides the settings value.
    if options.partition_id is None:
        partition_id = settings.get('SCORING_PARTITION_ID')
    else:
        partition_id = options.partition_id
    partitions_total = settings.get('SPIDER_LOG_PARTITIONS')
    if partition_id >= partitions_total or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if options.port:
        settings.set('JSONRPC_PORT', options.port)

    # Each "key=value" item becomes a dict entry; a missing or empty value
    # is stored as None.
    strategy_args = {}
    for item in options.args or ():
        name, _, raw_value = item.partition("=")
        strategy_args[name] = raw_value or None
    settings.set("STRATEGY_ARGS", strategy_args)

    # Use the logging config file when it is configured and present,
    # otherwise fall back to basic console logging at the requested level.
    log_config = settings.get("LOGGING_CONFIG")
    has_config_file = log_config and exists(log_config)
    if has_config_file:
        fileConfig(log_config, disable_existing_loggers=False)
    else:
        level = options.log_level
        logging.basicConfig(level=level)
        logger.setLevel(level)
        logger.addHandler(CONSOLE)

    return settings, options.add_seeds, options.seeds_url
Exemple #14
0
 def setup_subject(partitions):
     """Return ``RedisBackend.db_worker`` built on a db-worker frontier manager.

     ``partitions`` is stored as ``SPIDER_FEED_PARTITIONS``;
     ``REDIS_DROP_ALL_TABLES`` is switched on.
     """
     config = Settings(module='frontera.settings.default_settings')
     config.set('SPIDER_FEED_PARTITIONS', partitions)
     config.set('REDIS_DROP_ALL_TABLES', True)
     manager = WorkerFrontierManager.from_settings(config, db_worker=True)
     return RedisBackend.db_worker(manager)
Exemple #15
0
 def setup_subject(partitions):
     """Return ``RedisBackend.db_worker`` built on a db-worker frontier manager.

     ``partitions`` is stored as ``SPIDER_FEED_PARTITIONS``;
     ``REDIS_DROP_ALL_TABLES`` is switched on.
     """
     config = Settings(module='frontera.settings.default_settings')
     config.set('SPIDER_FEED_PARTITIONS', partitions)
     config.set('REDIS_DROP_ALL_TABLES', True)
     manager = WorkerFrontierManager.from_settings(config, db_worker=True)
     return RedisBackend.db_worker(manager)
Exemple #16
0
        required=True,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument('--port',
                        type=int,
                        help="Json Rpc service port to listen.")
    args = parser.parse_args()

    settings = Settings(module=args.config)
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and os.path.exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    worker = DBWorker(settings,
                      args.no_batches,
                      args.no_incoming,
                      args.no_scoring,
                      partitions=args.partitions)
    server = WorkerJsonRpcService(worker, settings)
Exemple #17
0
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Exemple #18
0
                        type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get(
        'SCORING_PARTITION_ID')
    if partition_id >= settings.get(
            'SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
            % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Exemple #19
0
 def test_close_manager(self):
     """Build a strategy-worker manager with the Redis backend, check it, close it."""
     backend_path = 'frontera.contrib.backends.redis_backend.RedisBackend'
     config = Settings(module='frontera.settings.default_settings')
     config.set('BACKEND', backend_path)
     manager = WorkerFrontierManager.from_settings(config, strategy_worker=True)
     # The configured class path must resolve to the RedisBackend class.
     self.assertEqual(RedisBackend, manager.backend.__class__)
     manager.close()