Exemple #1
0
 def __init__(self,
              file: type(defaults.file) = defaults.file,
              index_name: type(defaults.index_name) = defaults.index_name,
              id_col: type(defaults.id_col) = defaults.id_col,
              host: type(defaults.host) = defaults.host,
              port: type(defaults.port) = defaults.port,
              delim: type(defaults.delim) = defaults.delim,
              shards: type(defaults.shards) = defaults.shards,
              verbose: type(defaults.verbose) = defaults.verbose,
              **_):
     """Store indexer configuration and create a class-named logger.

     :param file: path of the delimited file to index
     :param index_name: name of the index
     :param id_col: column number of the id
     :param host: host of the search api server
     :param port: port of the server
     :param delim: column delimiter of the file
     :param shards: number of shards for the index
     :param verbose: enable verbose logging
     """
     self.file = file
     self.index_name = index_name
     self.id_col = id_col
     self.host = host
     self.port = port
     self.delim = delim
     self.shards = shards
     self.logger = set_logger(self.__class__.__name__, verbose=verbose)
Exemple #2
0
 def __init__(self,
              file: str,
              name: str = 'nboost',
              id_col: int = 0,
              field_col: int = 1,
              field_name: str = 'passage',
              host: str = '0.0.0.0',
              port: int = 9200,
              delim: str = '\t',
              shards: int = 3,
              verbose: bool = False,
              **_):
     """Store indexer configuration and create a class-named logger.

     :param file: path of the delimited file to index
     :param name: name of the index
     :param id_col: column number of the id
     :param field_col: column number of the field data
     :param field_name: name of the field
     :param host: host of the search api server
     :param port: port of the server
     :param delim: column delimiter of the file
     :param shards: number of shards for the index
     :param verbose: enable verbose logging
     """
     self.file = file
     self.name = name
     self.id_col = id_col
     self.field_col = field_col
     self.field_name = field_name
     self.host = host
     self.port = port
     self.delim = delim
     self.shards = shards
     self.logger = set_logger(self.__class__.__name__, verbose=verbose)
Exemple #3
0
    def __init__(self,
                 model_dir: str = 'nboost/pt-tinybert-msmarco',
                 verbose: bool = defaults.verbose,
                 max_seq_len: int = defaults.max_seq_len,
                 **kwargs):
        """Load a HuggingFace sequence-classification checkpoint for reranking.

        :param model_dir: checkpoint directory or hub name to load from
        :param verbose: enable verbose logging
        :param max_seq_len: maximum token length for model inputs
        """
        super().__init__(**kwargs)
        self.logger = set_logger(model_dir, verbose=verbose)
        self.max_seq_len = max_seq_len

        self.logger.info('Loading from checkpoint %s' % model_dir)

        # Prefer the GPU whenever torch can see one.
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

        if not use_cuda:
            self.logger.info("RUNNING ON CPU")
        else:
            self.logger.info("RUNNING ON CUDA")
            torch.cuda.synchronize(self.device)

        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

        self.rerank_model.to(self.device, non_blocking=True)
Exemple #4
0
def resolve_model(data_dir: Path, model_dir: str, model_cls: str, **kwargs):
    """Dynamically import a model class from a module in the CLASS_MAP.

    This is used to manage dependencies within nboost. For example, you
    don't necessarily want to import pytorch models everytime you boot up
    tensorflow...

    :param data_dir: local cache directory (created if missing)
    :param model_dir: url, known model name (key of CLASS_MAP), or a
        directory name relative to data_dir
    :param model_cls: explicit plugin class name, used when model_dir is
        not a recognized model name
    :param kwargs: forwarded to the resolved model's constructor
    :return: an instantiated model plugin
    :raises ImportError: when neither model_dir nor model_cls resolves
    """
    logger = set_logger('resolve_model')
    data_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): substring test, so any model_dir merely containing
    # "http" is treated as a url — presumably meant startswith('http');
    # confirm against callers before tightening.
    if 'http' in model_dir:
        module = MODULE_MAP[model_cls]
        model = import_class(module, model_cls)
        return model(model_dir=model_dir)

    model_dir = data_dir.joinpath(model_dir).absolute()

    if model_dir.exists():
        logger.info('Using model cache from %s', model_dir)

        if model_dir.name in CLASS_MAP:
            model_cls = CLASS_MAP[model_dir.name]
        elif model_cls not in MODULE_MAP:
            # fix: the two-placeholder format string was given a single
            # dict_keys argument, which raised TypeError instead of the
            # intended ImportError message
            raise ImportError('Class "%s" not in %s.'
                              % (model_cls, MODULE_MAP.keys()))

        module = MODULE_MAP[model_cls]
        model = import_class(module, model_cls)  # type: Type[ModelPlugin]
        return model(model_dir=str(model_dir), **kwargs)
    else:
        if model_dir.name in CLASS_MAP:
            model_cls = CLASS_MAP[model_dir.name]
            module = MODULE_MAP[model_cls]
            if model_dir.name in URL_MAP:  # DOWNLOAD AND CACHE
                url = URL_MAP[model_dir.name]
                binary_path = data_dir.joinpath(Path(url).name)

                if binary_path.exists():
                    logger.info('Found model cache in %s', binary_path)
                else:
                    logger.info('Downloading "%s" model.', model_dir)
                    download_file(url, binary_path)

                if binary_path.suffixes == ['.tar', '.gz']:
                    logger.info('Extracting "%s" from %s', model_dir,
                                binary_path)
                    extract_tar_gz(binary_path, data_dir)
            else:  # pass along to plugin maybe it can resolve it
                model_dir = model_dir.name

            model = import_class(module, model_cls)  # type: Type[ModelPlugin]
            return model(model_dir=str(model_dir), **kwargs)
        else:
            if model_cls in MODULE_MAP:
                module = MODULE_MAP[model_cls]
                model = import_class(module,
                                     model_cls)  # type: Type[ModelPlugin]
                return model(model_dir=model_dir.name, **kwargs)
            else:
                raise ImportError('model_dir %s not found in %s. You must '
                                  'set --model class to continue.' %
                                  (model_dir.name, CLASS_MAP.keys()))
Exemple #5
0
    def __init__(self, *args, **kwargs):
        """Load the DistilBERT QA model and tokenizer from self.model_dir."""
        super().__init__(*args, **kwargs)
        self.logger = set_logger(self.model_dir, verbose=True)
        self.logger.info('Distil Loading from checkpoint %s' % self.model_dir)

        checkpoint = self.model_dir
        self.model = DistilBertForQuestionAnswering.from_pretrained(checkpoint)
        self.tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
        # run on GPU when available, otherwise fall back to CPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)
Exemple #6
0
 def __init__(self,
              max_query_length: type(
                  defaults.max_query_length) = defaults.max_query_length,
              model_dir: str = defaults.qa_model_dir,
              max_seq_len: int = defaults.max_seq_len,
              **kwargs):
     """Record QA model settings and create the 'qamodel' logger.

     :param max_query_length: maximum token length allowed for the query
     :param model_dir: directory or name of the QA model checkpoint
     :param max_seq_len: maximum token length for model inputs
     """
     super().__init__(**kwargs)
     self.max_query_length = max_query_length
     self.max_seq_len = max_seq_len
     self.model_dir = model_dir
     self.logger = set_logger('qamodel', verbose=True)
Exemple #7
0
    def __init__(self, *args, **kwargs):
        """Load the AllenNLP transformer-qa predictor from a model archive."""
        super().__init__(*args, **kwargs)
        self.logger = set_logger(self.model_dir, verbose=True)
        # -1 keeps AllenNLP on the CPU; 0 selects the first CUDA device
        self.device_id = 0 if torch.cuda.is_available() else -1
        model_path = (Path(self.model_dir) / "model.tar.gz").absolute()

        archive = load_archive(str(model_path), self.device_id)
        self.model = TransformerQAPredictor.from_archive(
            archive, predictor_name="transformer_qa")

        self.logger.info('Loading AllenNLP from %s' % self.model_dir)
Exemple #8
0
 def __init__(self, model_dir: str, lr: float = 10e-3, batch_size: int = 4,
              max_seq_len: int = 128, verbose: bool = False, filter_results: bool = False,  **_):
     """Record model settings and create a model-named logger.

     ``model_dir`` is a full path when the model binary is present
     locally, and just the bare model name otherwise.

     :param model_dir: path or name of the model checkpoint
     :param lr: learning rate (10e-3 == 0.01)
     :param batch_size: number of samples per batch
     :param max_seq_len: maximum token length for model inputs
     :param verbose: enable verbose logging
     :param filter_results: whether to filter results after reranking
     """
     super().__init__()
     self.model_dir = model_dir
     self.lr = lr
     self.batch_size = batch_size
     self.max_seq_len = max_seq_len
     self.filter_results = filter_results
     self.logger = set_logger(model_dir, verbose=verbose)
Exemple #9
0
 def __init__(self,
              host: str = '0.0.0.0',
              port: int = 8000,
              backlog: int = 100,
              workers: int = 10,
              **kwargs):
     """Bind server settings, open the listening socket, and set up logging.

     :param host: interface address to bind on
     :param port: port number to listen on
     :param backlog: maximum number of queued connections
     :param workers: number of workers serving the socket
     """
     super().__init__()
     self.backlog = backlog
     self.workers = workers
     self.address = (host, port)
     # readiness flag, set once the server loop is up
     self.is_ready = Event()
     self.sock = self.set_socket()
     self.logger = set_logger(self.__class__.__name__)
Exemple #10
0
 def __init__(self, host: type(defaults.host) = defaults.host,
              port: type(defaults.port) = defaults.port,
              backlog: type(defaults.backlog) = defaults.backlog,
              workers: type(defaults.workers) = defaults.workers,
              verbose: type(defaults.verbose) = defaults.verbose,
              **kwargs):
     """Resolve the bind address, open the socket, and set up logging.

     :param host: hostname to resolve and bind on
     :param port: port number to listen on
     :param backlog: maximum number of queued connections
     :param workers: number of workers serving the socket
     :param verbose: enable verbose logging
     """
     super().__init__()
     self.backlog = backlog
     self.workers = workers
     # resolve the hostname once so the address tuple holds a plain IP
     resolved_ip = socket.gethostbyname(host)
     self.address = (resolved_ip, port)
     self.is_ready = Event()
     self.sock = self.set_socket()
     self.logger = set_logger(self.__class__.__name__, verbose=verbose)
Exemple #11
0
 def __init__(self,
              model_dir: type(defaults.model_dir) = defaults.model_dir,
              batch_size: type(defaults.batch_size) = defaults.batch_size,
              max_seq_len: type(
                  defaults.max_seq_len) = defaults.max_seq_len,
              verbose: type(defaults.verbose) = defaults.verbose,
              lr: type(defaults.lr) = defaults.lr,
              **_):
     """Record model hyperparameters and create a model-named logger.

     ``model_dir`` is a full path when the model binary is present
     locally, and just the bare model name otherwise.

     :param model_dir: path or name of the model checkpoint
     :param batch_size: number of samples per batch
     :param max_seq_len: maximum token length for model inputs
     :param verbose: enable verbose logging
     :param lr: learning rate used for fine-tuning
     """
     super().__init__()
     self.model_dir = model_dir
     self.batch_size = batch_size
     self.max_seq_len = max_seq_len
     self.lr = lr
     self.logger = set_logger(model_dir, verbose=verbose)
Exemple #12
0
    def __init__(self,
                 model: Type[BaseModel],
                 codex: Type[BaseCodex],
                 uhost: str = '0.0.0.0',
                 uport: int = 9200,
                 bufsize: int = 2048,
                 verbose: bool = False,
                 **kwargs):
        """Instantiate the model and codex components for the proxy.

        :param model: model class, instantiated with the cli args
        :param codex: codex class, instantiated with the cli args
        :param uhost: host of the upstream server
        :param uport: port of the upstream server
        :param bufsize: socket buffer size in bytes
        :param verbose: enable verbose logging
        """
        super().__init__(**kwargs)
        self.kwargs = kwargs
        self.bufsize = bufsize
        self.uaddress = (uhost, uport)
        self.logger = set_logger(model.__name__, verbose=verbose)

        # forward the command line arguments into each component
        self.model = model(verbose=verbose, **kwargs)
        self.codex = codex(verbose=verbose, **kwargs)
Exemple #13
0
 def __init__(self,
              lr: float = 10e-3,
              model_dir: str = 'bert-base-uncased-msmarco',
              data_dir: Path = PKG_PATH.joinpath('.cache'),
              max_seq_len: int = 128,
              batch_size: int = 4,
              **_):
     """Record model hyperparameters and resolve the model directory.

     A ``model_dir`` that already exists on disk is used as-is; otherwise
     it is treated as a name inside ``data_dir``.

     :param lr: learning rate (10e-3 == 0.01)
     :param model_dir: path or name of the model checkpoint
     :param data_dir: cache directory for model binaries
     :param max_seq_len: maximum token length for model inputs
     :param batch_size: number of samples per batch
     """
     super().__init__()
     self.lr = lr
     self.batch_size = batch_size
     self.max_seq_len = max_seq_len
     self.data_dir = data_dir
     if os.path.exists(model_dir):
         self.model_dir = Path(model_dir)
     else:
         self.model_dir = data_dir.joinpath(model_dir).absolute()
     self.logger = set_logger(model_dir)
Exemple #14
0
    def __init__(self,
                 data_dir: Path = PKG_PATH.joinpath('.cache'),
                 model_dir: str = 'pt-bert-base-uncased-msmarco',
                 qa_model_dir: str = 'distilbert-base-uncased-distilled-squad',
                 qa_model: str = str(),
                 model: str = str(),
                 qa: bool = False,
                 config: str = 'elasticsearch',
                 verbose: bool = False,
                 **kwargs):
        """Resolve the rerank (and optional QA) model and build the config.

        :param data_dir: cache directory for models (created if missing)
        :param model_dir: name or path of the rerank model inside data_dir
        :param qa_model_dir: name or path of the QA model inside data_dir
        :param qa_model: explicit class name of the QA model plugin
        :param model: explicit class name of the rerank model plugin
        :param qa: whether to load the question-answering model
        :param config: key into CONFIG_MAP selecting search-api defaults
        :param verbose: enable verbose logging
        """
        super().__init__(**kwargs)
        self.qa = qa
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)

        self.model_dir = data_dir.joinpath(model_dir).absolute()
        self.model = self.resolve_model(self.model_dir,
                                        model,
                                        verbose=verbose,
                                        **kwargs)
        # the logger is named after the concrete model class resolved above
        self.logger = set_logger(self.model.__class__.__name__,
                                 verbose=verbose)

        # qa_model_dir / qa_model attributes only exist when qa is enabled
        if qa:
            self.qa_model_dir = data_dir.joinpath(qa_model_dir).absolute()
            self.qa_model = self.resolve_model(self.qa_model_dir, qa_model,
                                               **kwargs)

        # these are global parameters that are overridden by the nboost
        # json key (later **kwargs win over earlier keys in the merge)
        self.config = {
            'model': self.model.__class__.__name__,
            'model_dir': model_dir,
            'qa_model': self.qa_model.__class__.__name__ if qa else None,
            'qa_model_dir': qa_model_dir if qa else None,
            'data_dir': str(data_dir),
            **CONFIG_MAP[config],
            **kwargs
        }
Exemple #15
0
    def __init__(
            self,
            host: type(defaults.host) = defaults.host,
            port: type(defaults.port) = defaults.port,
            verbose: type(defaults.verbose) = defaults.verbose,
            data_dir: type(defaults.data_dir) = defaults.data_dir,
            no_rerank: type(defaults.no_rerank) = defaults.no_rerank,
            model: type(defaults.model) = defaults.model,
            model_dir: type(defaults.model_dir) = defaults.model_dir,
            qa: type(defaults.qa) = defaults.qa,
            qa_model: type(defaults.qa_model) = defaults.qa_model,
            qa_model_dir: type(defaults.qa_model_dir) = defaults.qa_model_dir,
            search_route: type(defaults.search_route) = defaults.search_route,
            frontend_route: type(
                defaults.frontend_route) = defaults.frontend_route,
            status_route: type(defaults.status_route) = defaults.status_route,
            debug: type(defaults.debug) = defaults.debug,
            prerank: type(defaults.prerank) = defaults.prerank,
            **cli_args):
        """Assemble the plugin pipeline and wire up the Flask proxy app.

        Every request not matching the frontend/status routes is proxied to
        the upstream server; each plugin may rewrite the outgoing request
        and post-process the response. ``self.run`` starts the app.

        NOTE(review): ``search_route`` is accepted but never referenced in
        this body — confirm whether it is consumed elsewhere.
        """
        self.logger = set_logger(self.__class__.__name__, verbose=verbose)
        BackwardsCompatibility().set()
        db = Database()
        plugins = []  # type: List[Plugin]

        # build the plugin list in execution order
        if prerank:
            preRankPlugin = PrerankPlugin()
            plugins.append(preRankPlugin)

        if not no_rerank:
            rerank_model_plugin = resolve_model(
                data_dir=data_dir,
                model_dir=model_dir,
                model_cls=model,
                **cli_args)  # type: RerankModelPlugin

            plugins.append(rerank_model_plugin)

        if qa:
            qa_model_plugin = resolve_model(data_dir=data_dir,
                                            model_dir=qa_model_dir,
                                            model_cls=qa_model,
                                            **cli_args)  # type: QAModelPlugin

            plugins.append(qa_model_plugin)

        if debug:
            debug_plugin = DebugPlugin(**cli_args)
            plugins.append(debug_plugin)

        static_dir = str(PKG_PATH.joinpath('resources/frontend'))
        flask_app = Flask(__name__)

        @flask_app.route(frontend_route, methods=['GET'])
        def frontend_root():
            """Serve the frontend landing page."""
            return send_from_directory(static_dir, 'index.html')

        @flask_app.route(frontend_route + '/<path:path>', methods=['GET'])
        def frontend_path(path):
            """Serve static frontend assets under the frontend route."""
            return send_from_directory(static_dir, path)

        @flask_app.route(frontend_route + status_route)
        def status_path():
            """Report merged plugin configs plus db statistics as json."""
            configs = {}

            for plugin in plugins:
                configs.update(plugin.configs)

            stats = db.get_stats()
            return jsonify({**configs, **stats})

        # catch-all rule: every other path is proxied upstream
        flask_app.url_map.add(Rule('/<path:path>', endpoint='proxy'))

        @flask_app.route('/', defaults={'path': ''})
        @flask_app.endpoint('proxy')
        def proxy_through(path):
            # parse the client request
            dict_request = flask_request_to_dict_request(
                flask_request)  # takes the json
            """Search request."""
            # NOTE(review): the string above is a stray literal after a
            # statement — it has no effect at runtime
            db_row = db.new_row()

            # combine command line args and runtime args sent by request
            query_args = {}
            for key in list(dict_request['url']['query']):
                if key in defaults.__dict__:
                    query_args[key] = dict_request['url']['query'].pop(key)
            json_args = dict_request['body'].pop('nboost', {})
            # precedence: query args > request-body json args > cli args
            args = {**cli_args, **json_args, **query_args}

            # retarget the request at the upstream host/port/scheme
            request = RequestDelegate(dict_request, **args)
            request.dict['headers'].pop('Host', '')
            request.set_path('url.headers.host',
                             '%s:%s' % (request.uhost, request.uport))
            request.set_path('url.netloc',
                             '%s:%s' % (request.uhost, request.uport))
            request.set_path('url.scheme', 'https' if request.ussl else 'http')

            for plugin in plugins:  # type: Plugin
                plugin.on_request(request, db_row)

            # get response from upstream server
            start_time = perf_counter()
            requests_response = dict_request_to_requests_response(dict_request)
            db_row.response_time = perf_counter() - start_time
            try:
                dict_response = requests_response_to_dict_response(
                    requests_response)
            except JSONDecodeError:
                # non-json upstream responses are passed through untouched
                print(requests_response.content)
                return requests_response.content
            response = ResponseDelegate(dict_response, request)
            response.set_path('body.nboost', {})
            db_row.choices = len(response.choices)

            for plugin in plugins:  # type: Plugin
                plugin.on_response(response, db_row)

            # save stats to sql lite
            db.insert(db_row)

            return dict_response_to_flask_response(dict_response)

        @flask_app.errorhandler(Exception)
        def handle_json_response(error):
            """Convert any unhandled exception into a json 500 response."""
            self.logger.error('', exc_info=True)
            return jsonify({
                'type': error.__class__.__name__,
                'doc': error.__class__.__doc__,
                'msg': str(error.args)
            }), 500

        # critical() returns None, so the `or` falls through to run()
        self.run = lambda: (self.logger.critical('LISTENING %s:%s' % (
            host, port)) or flask_app.run(host=host, port=port))
Exemple #16
0
def execute(command: str):
    """Execute *command* in a subprocess shell and return its exit status.

    :param command: shell command line to run
    :return: the subprocess return code (0 on success)
    """
    logger = set_logger('RELEASE')
    logger.info(command)
    # NOTE(review): shell=True runs the raw string through the shell —
    # only ever call this with trusted, internally-built command strings.
    # fix: the exit status was previously discarded; return it so callers
    # can detect failed commands.
    return subprocess.call(command, shell=True)