Esempio n. 1
0
def setup_environment():
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Esempio n. 2
0
def setup_environment():
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    return settings, strategy_class
Esempio n. 3
0
def main():
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument(
        '--config',
        type=str,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--hostname',
        type=str,
        help='Hostname or IP address to bind. Default is 127.0.0.1')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help=
        'Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument(
        '--port',
        type=int,
        help=
        'Base port number, server will bind to 6 ports starting from base. Default is 5550'
    )
    args = parser.parse_args()

    settings = Settings(module=args.config)
    hostname = args.hostname if args.hostname else settings.get("ZMQ_HOSTNAME")
    port = args.port if args.port else settings.get("ZMQ_BASE_PORT")
    server = Server(hostname, port)
    server.logger.setLevel(args.log_level)
    server.start()
Esempio n. 4
0
def main():
    """
    Parse arguments, set configuration values, then start the broker
    """
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument(
        '--config',
        type=str,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--address',
        type=str,
        help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1'
        '. When binding to wildcard it defaults to IPv4.')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is'
        ' INFO.')
    parser.add_argument(
        '--port',
        type=int,
        help='Base port number, server will bind to 6 ports starting from base'
        '. Default is 5550')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    address = args.address if args.address else settings.get("ZMQ_ADDRESS")
    port = args.port if args.port else settings.get("ZMQ_BASE_PORT")
    server = Server(address, port)
    server.logger.setLevel(args.log_level)
    server.start()
Esempio n. 5
0
def main():
    """
    Parse arguments, set configuration values, then start the broker
    """
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument(
        '--config', type=str,
        help='Settings module name, should be accessible by import.')
    parser.add_argument(
        '--address', type=str,
        help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1'
        '. When binding to wildcard it defaults to IPv4.')
    parser.add_argument(
        '--log-level', '-L', type=str, default='INFO',
        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is'
        ' INFO.')
    parser.add_argument(
        '--port', type=int,
        help='Base port number, server will bind to 6 ports starting from base'
        '. Default is 5550')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    address = args.address if args.address else settings.get("ZMQ_ADDRESS")
    port = args.port if args.port else settings.get("ZMQ_BASE_PORT")
    server = Server(address, port)
    server.logger.setLevel(args.log_level)
    server.start()
Esempio n. 6
0
def setup_environment():
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    parser.add_argument('--port', type=int, help="Json Rpc service port to listen.")
    parser.add_argument('--args', '-a', nargs='*', type=str, help="Optional arguments for crawling strategy, "
                                                                  "in a form of key=value separated with space")
    parser.add_argument('--add-seeds', action='store_true', help="Run in add seeds mode. Worker finishes after running "
                                                                 "of strategy add_seeds method")
    parser.add_argument('--seeds-url', type=str, help="Seeds url. S3 and native urlopen schemas are currently "
                                                      "supported, implies add seeds run mode")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    settings.set('STRATEGY', strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if args.port:
        settings.set('JSONRPC_PORT', args.port)

    strategy_args = {}
    if args.args:
        for arg in args.args:
            key, _, value = arg.partition("=")
            strategy_args[key] = value if value else None
    settings.set("STRATEGY_ARGS", strategy_args)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    return settings, args.add_seeds, args.seeds_url
Esempio n. 7
0
 def __init__(self, manager):
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Esempio n. 8
0
 def __init__(self, manager):
     self._manager = manager
     settings = Settings(attributes=manager.settings.attributes)
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = settings.get('SPIDER_PARTITION_ID')
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   manager.logger.manager.debug)
Esempio n. 9
0
def main():
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument('--config', type=str,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--hostname', type=str,
                        help='Hostname or IP address to bind. Default is 127.0.0.1')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument('--port', type=int,
                        help='Base port number, server will bind to 6 ports starting from base. Default is 5550')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    hostname = args.hostname if args.hostname else settings.get("ZMQ_HOSTNAME")
    port = args.port if args.port else settings.get("ZMQ_BASE_PORT")
    server = Server(hostname, port)
    server.logger.setLevel(args.log_level)
    server.start()
Esempio n. 10
0
        self.states.update_cache(request)


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id', type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID')
    if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." %
                         partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
Esempio n. 11
0
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy',
                        type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id',
                        type=int,
                        help="Instance partition id.")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    strategy_class = load_object(strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get(
        'SCORING_PARTITION_ID')
    if partition_id >= settings.get(
            'SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
            % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)
Esempio n. 12
0
def setup_environment():
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config',
        type=str,
        required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy',
                        type=str,
                        help='Crawling strategy class path')
    parser.add_argument('--partition-id',
                        type=int,
                        help="Instance partition id.")
    parser.add_argument('--port',
                        type=int,
                        help="Json Rpc service port to listen.")
    parser.add_argument('--args',
                        '-a',
                        nargs='*',
                        type=str,
                        help="Optional arguments for crawling strategy, "
                        "in a form of key=value separated with space")
    parser.add_argument(
        '--add-seeds',
        action='store_true',
        help="Run in add seeds mode. Worker finishes after running "
        "of strategy add_seeds method")
    parser.add_argument(
        '--seeds-url',
        type=str,
        help="Seeds url. S3 and native urlopen schemas are currently "
        "supported, implies add seeds run mode")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    settings.set('STRATEGY', strategy_classpath)

    partition_id = args.partition_id if args.partition_id is not None else settings.get(
        'SCORING_PARTITION_ID')
    if partition_id >= settings.get(
            'SPIDER_LOG_PARTITIONS') or partition_id < 0:
        raise ValueError(
            "Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS."
            % partition_id)
    settings.set('SCORING_PARTITION_ID', partition_id)

    if args.port:
        settings.set('JSONRPC_PORT', args.port)

    strategy_args = {}
    if args.args:
        for arg in args.args:
            key, _, value = arg.partition("=")
            strategy_args[key] = value if value else None
    settings.set("STRATEGY_ARGS", strategy_args)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    return settings, args.add_seeds, args.seeds_url
Esempio n. 13
0
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.")
    parser.add_argument('--port',
                        type=int,
                        help="Json Rpc service port to listen.")
    args = parser.parse_args()

    settings = Settings(module=args.config)
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and os.path.exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    worker = DBWorker(settings,
                      args.no_batches,
                      args.no_incoming,
                      args.no_scoring,
                      partitions=args.partitions)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
    worker.run()
Esempio n. 14
0
def run_add_seeds(settings, seeds_file):
    fh = open(seeds_file, "rb")

    logger.info("Starting local seeds addition from file %s", seeds_file)

    manager = LocalFrontierManager.from_settings(settings)
    manager.add_seeds(fh)
    manager.stop()
    manager.close()

    logger.info("Seeds addition finished")


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera local add seeds utility")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--seeds-file', type=str, required=True, help="Seeds file path")
    args = parser.parse_args()
    settings = Settings(module=args.config)
    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path, disable_existing_loggers=False)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)

    run_add_seeds(settings, args.seeds_file)
Esempio n. 15
0
if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument(
        '--config',
        type=str,
        required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy',
                        type=str,
                        help='Crawling strategy class path')

    args = parser.parse_args()
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    strategy_class = load_object(strategy_classpath)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Esempio n. 16
0
def test_fallsback_to_frontera_default_settings():
    settings = Settings()
    assert settings.get('MAX_NEXT_REQUESTS') == 64
Esempio n. 17
0
def test_settings_on_a_python_module_are_loaded():
    settings = Settings('tests.scrapy_spider.frontera.settings')
    assert settings.get('MAX_REQUESTS') == 5
Esempio n. 18
0
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')

    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Esempio n. 19
0
def test_settings_passed_as_attributes_can_be_found():
    settings = Settings(attributes={'SETTING': 'value'})
    assert settings.get('SETTING') == 'value'
Esempio n. 20
0
        type=str,
        required=True,
        help='Settings module name, should be accessible by import')
    parser.add_argument(
        '--log-level',
        '-L',
        type=str,
        default='INFO',
        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy',
                        type=str,
                        help='Crawling strategy class path')

    args = parser.parse_args()
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get(
        'CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError(
            "Couldn't locate strategy class path. Please supply it either using command line option or "
            "settings file.")
    strategy_class = load_object(strategy_classpath)

    logging_config_path = settings.get("LOGGING_CONFIG")
    if logging_config_path and exists(logging_config_path):
        fileConfig(logging_config_path)
    else:
        logging.basicConfig(level=args.log_level)
        logger.setLevel(args.log_level)
        logger.addHandler(CONSOLE)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Esempio n. 21
0
        self.states.update_cache(links)
        self.states.update_cache(response)

    def on_request_error(self, request, error):
        logger.debug("Page error %s (%s)", request.url, error)
        self.states.set_states(request)
        self.strategy.page_error(request, error)
        self.states.update_cache(request)


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera strategy worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
    parser.add_argument('--strategy', type=str,
                        help='Crawling strategy class path')

    args = parser.parse_args()
    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
    settings = Settings(module=args.config)
    strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY')
    if not strategy_classpath:
        raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or "
                         "settings file.")
    strategy_class = load_object(strategy_classpath)
    worker = StrategyWorker(settings, strategy_class)
    worker.run()
Esempio n. 22
0
def test_settings_on_a_python_module_are_loaded():
    settings = Settings('frontera.tests.scrapy_spider.frontera.settings')
    assert settings.get('MAX_REQUESTS') == 5
Esempio n. 23
0
def test_fallsback_to_frontera_default_settings():
    settings = Settings()
    assert settings.get('MAX_NEXT_REQUESTS') == 0
Esempio n. 24
0
def test_settings_passed_as_attributes_can_be_found():
    settings = Settings(attributes={'SETTING': 'value'})
    assert settings.get('SETTING') == 'value'