def __init__(self, config, *args, **kwargs):
        """
        Load a token classification (NER) model from ONNX and create its tokenizer.

        Parameters:
          config -- model configuration; forwarded unchanged, together with
                    *args and **kwargs, to the base-class constructor.
                    Afterwards self.config must expose `type`, `model_path`,
                    ['dataset']['max_seq_length'] and
                    ['tokenizer']['tokenizer_name'].

        Raises:
          ValueError -- if self.config.type is not 'token_classification'
        """
        super(TokenClassificationEngine,
              self).__init__(config, *args, **kwargs)

        if self.config.type != 'token_classification':
            # the original message never closed the "(type ..." parenthesis
            raise ValueError(
                f"{self.config.model_path} isn't a Token Classification model "
                f"(type '{self.config.type}')"
            )

        # load model -- shapes are (batch_size, sequence_length)
        dynamic_shapes = {
            'max': (1, self.config['dataset']['max_seq_length'])
        }

        # nlp_dynamic_shapes is defined outside this method; when truthy a
        # 'min' shape is also supplied (presumably so sequences shorter than
        # max_seq_length are accepted -- confirm against load_model)
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)

        self.model = load_model(self.config, dynamic_shapes)

        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config['tokenizer']['tokenizer_name'])
Exemple #2
0
    def __init__(self, config, *args, **kwargs):
        """
        Load a question answering model from ONNX and create its tokenizer.

        Parameters:
          config -- model configuration; forwarded unchanged, together with
                    *args and **kwargs, to the base-class constructor.
                    Afterwards self.config must expose `type`, `model_path`,
                    ['dataset']['max_seq_length'] and
                    ['tokenizer']['tokenizer_name'].

        Raises:
          ValueError -- if self.config.type is not 'qa'
        """
        super(QuestionAnswerEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'qa':
            # the original message never closed the "(type ..." parenthesis
            raise ValueError(
                f"{self.config.model_path} isn't a Question Answering model "
                f"(type '{self.config.type}')"
            )

        # load model -- shapes are (batch_size, sequence_length)
        dynamic_shapes = {
            'max': (1, self.config['dataset']['max_seq_length'])
        }

        # nlp_dynamic_shapes is defined outside this method; when truthy a
        # 'min' shape is also supplied (presumably so sequences shorter than
        # max_seq_length are accepted -- confirm against load_model)
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)

        self.model = load_model(self.config, dynamic_shapes)

        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config['tokenizer']['tokenizer_name'])

        # True when the tokenizer pads on the right; the comparison already
        # yields a bool, so no extra bool() wrapper is needed
        self.question_first = self.tokenizer.padding_side == "right"
Exemple #3
0
    def __init__(self, config, *args, **kwargs):
        """
        Load an Intent/Slot classification model from ONNX and create its tokenizer.

        Parameters:
          config -- model configuration; forwarded unchanged, together with
                    *args and **kwargs, to the base-class constructor.
                    Afterwards self.config must expose `type`, `model_path`,
                    ['language_model']['max_seq_length'] and
                    ['tokenizer']['tokenizer_name'].

        Raises:
          ValueError -- if self.config.type is not 'intent_slot'
        """
        super(IntentSlotEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'intent_slot':
            # the original message never closed the "(type ..." parenthesis
            raise ValueError(
                f"{self.config.model_path} isn't an Intent/Slot model "
                f"(type '{self.config.type}')"
            )

        # load model -- shapes are (batch_size, sequence_length); note the
        # sequence length is read from 'language_model' here
        dynamic_shapes = {
            'max': (1, self.config['language_model']['max_seq_length'])
        }

        # nlp_dynamic_shapes is defined outside this method; when truthy a
        # 'min' shape is also supplied (presumably so sequences shorter than
        # max_seq_length are accepted -- confirm against load_model)
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)

        self.model = load_model(self.config, dynamic_shapes)

        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config['tokenizer']['tokenizer_name'])

        # self.slot_labels is provided outside this method; the last label is
        # taken as the null slot ('O' in assistant dataset - always the last label?)
        self.null_slot = self.slot_labels[-1]
Exemple #4
0
    def __init__(self, config, *args, **kwargs):
        """
        Load a Text-to-Speech pipeline: a text->MEL generator model and a
        MEL->audio vocoder model, plus the helpers used for text preparation.

        Parameters:
          config -- model configuration; forwarded unchanged, together with
                    *args and **kwargs, to the base-class constructor.
                    Afterwards self.config must expose `type`, `model_path`,
                    `generator`, `vocoder` and `vocoder.features`.

        Raises:
          ValueError -- if self.config.type is not 'tts'
        """
        super(TTSEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'tts':
            # the original message never closed the "(type ..." parenthesis
            raise ValueError(
                f"{self.config.model_path} isn't a Text-to-Speech model "
                f"(type '{self.config.type}')"
            )

        # load text->MEL generator model
        self.generator = load_model(self.config.generator)

        # load MEL->audio vocoder model
        features = self.config.vocoder.features

        # NOTE(review): shapes look like (batch, features, frames) -- confirm
        dynamic_shapes = {
            'min': (1, features, 1),
            'opt': (1, features, 160),  # ~5-6 words
            'max': (1, features, 1024)  # ~20-30 words?
        }

        self.vocoder = load_model(self.config.vocoder,
                                  dynamic_shapes=dynamic_shapes)

        # create map of symbol->ID embeddings; the ID is the symbol's position
        # in the sequence returned by self.get_symbols()
        self.symbol_to_id = {s: i for i, s in enumerate(self.get_symbols())}

        # create operators for num-to-word conversion
        # the pattern matches digit runs with an optional single ",digits"
        # group -- https://stackoverflow.com/a/16321189
        self.number_regex = re.compile(r'\d+(?:,\d+)?')
        self.number_inflect = inflect.engine()
Exemple #5
0
    def __init__(self, config, *args, **kwargs):
        """
        Load a streaming ASR (or ASR classification) model and set up its
        preprocessor, CTC decoder (non-classification models only) and the
        streaming sample buffer.

        Parameters:
          config -- model configuration; forwarded unchanged, together with
                    *args and **kwargs, to the base-class constructor.
          add_punctuation (keyword, optional) -- when present in kwargs it
                    overrides config['ctc_decoder']['add_punctuation']
                    (only applied for non-classification models)

        Raises:
          ValueError -- if self.config.type is neither 'asr' nor
                        'asr_classification'

        Note: self.classification, self.sample_rate, self.frame_length and
        self.frame_overlap are read here but defined outside this method.
        """
        super(ASREngine, self).__init__(config, *args, **kwargs)

        if self.config.type not in ('asr', 'asr_classification'):
            # the original message never closed the "(type ..." parenthesis
            raise ValueError(
                f"{self.config.model_path} isn't an ASR model "
                f"(type '{self.config.type}')"
            )

        # set some default config options that are non-standard in nemo
        if 'streaming' not in self.config:
            self.config['streaming'] = {}

        # duration of signal frame, seconds
        # (TODO shorter defaults for VAD/command classifiers)
        self.config['streaming'].setdefault('frame_length', 1.0)

        # duration of overlap before/after current frame, seconds
        self.config['streaming'].setdefault('frame_overlap', 0.5)

        # some config changes for streaming
        if not self.classification:
            self.config['preprocessor']['dither'] = 0.0
            self.config['preprocessor']['pad_to'] = 0

            if 'ctc_decoder' not in self.config:
                self.config['ctc_decoder'] = {}

            # greedy or beamsearch
            self.config['ctc_decoder'].setdefault('type', 'greedy')

            # add period to the end of sentences
            self.config['ctc_decoder'].setdefault('add_punctuation', True)

            # an explicit keyword argument wins over the config/default value
            if 'add_punctuation' in kwargs:
                self.config['ctc_decoder']['add_punctuation'] = kwargs[
                    'add_punctuation']
                logging.info(f"add_punctuation = {kwargs['add_punctuation']}")

        # fixed per-feature normalization, only available for 64 features
        # (64 values in each list below)
        # TODO normalization coefficients for citrinet (N=80)
        if not self.classification and self.config['preprocessor'][
                'features'] == 64:
            normalization = {}

            normalization['fixed_mean'] = [
                -14.95827016, -12.71798736, -11.76067913, -10.83311182,
                -10.6746914, -10.15163465, -10.05378331, -9.53918999,
                -9.41858904, -9.23382904, -9.46470918, -9.56037, -9.57434245,
                -9.47498732, -9.7635205, -10.08113074, -10.05454561,
                -9.81112681, -9.68673603, -9.83652977, -9.90046248,
                -9.85404766, -9.92560366, -9.95440354, -10.17162966,
                -9.90102482, -9.47471025, -9.54416855, -10.07109475,
                -9.98249912, -9.74359465, -9.55632283, -9.23399915,
                -9.36487649, -9.81791084, -9.56799225, -9.70630899,
                -9.85148006, -9.8594418, -10.01378735, -9.98505315,
                -9.62016094, -10.342285, -10.41070709, -10.10687659,
                -10.14536695, -10.30828702, -10.23542833, -10.88546868,
                -11.31723646, -11.46087382, -11.54877829, -11.62400934,
                -11.92190509, -12.14063815, -11.65130117, -11.58308531,
                -12.22214663, -12.42927197, -12.58039805, -13.10098969,
                -13.14345864, -13.31835645, -14.47345634
            ]

            normalization['fixed_std'] = [
                3.81402054, 4.12647781, 4.05007065, 3.87790987, 3.74721178,
                3.68377423, 3.69344, 3.54001005, 3.59530412, 3.63752368,
                3.62826417, 3.56488469, 3.53740577, 3.68313898, 3.67138151,
                3.55707266, 3.54919572, 3.55721289, 3.56723346, 3.46029304,
                3.44119672, 3.49030548, 3.39328435, 3.28244406, 3.28001423,
                3.26744937, 3.46692348, 3.35378948, 2.96330901, 2.97663111,
                3.04575148, 2.89717604, 2.95659301, 2.90181116, 2.7111687,
                2.93041291, 2.86647897, 2.73473181, 2.71495654, 2.75543763,
                2.79174615, 2.96076456, 2.57376336, 2.68789782, 2.90930817,
                2.90412004, 2.76187531, 2.89905006, 2.65896173, 2.81032176,
                2.87769857, 2.84665271, 2.80863137, 2.80707634, 2.83752184,
                3.01914511, 2.92046439, 2.78461139, 2.90034605, 2.94599508,
                2.99099718, 3.0167554, 3.04649716, 2.94116777
            ]

            self.config['preprocessor']['normalize'] = normalization

        # create preprocessor instance: '_target_' holds a dotted
        # "package.module.ClassName" path that is imported dynamically
        preprocessor_name = self.config['preprocessor']['_target_'].rsplit(
            ".", 1)
        preprocessor_class = getattr(
            importlib.import_module(preprocessor_name[0]),
            preprocessor_name[1])
        logging.debug(f'ASR preprocessor - {preprocessor_class}')

        # work on a copy so that '_target_' stays in self.config; every
        # remaining key is passed to the preprocessor as a keyword argument
        preprocessor_config = self.config['preprocessor'].copy()
        preprocessor_config.pop('_target_')

        self.preprocessor = preprocessor_class(**preprocessor_config)

        # load the model -- classification configs name the feature count
        # 'n_mels', the others name it 'features'
        if self.classification:
            features = self.config.preprocessor.n_mels
        else:
            features = self.config.preprocessor.features

        # rough conversion from samples to MEL spectrogram dims
        time_to_fft = self.sample_rate * (1.0 / 160.0)

        dynamic_shapes = {
            # minimum plausible frame length
            'min': (1, features, int(0.1 * time_to_fft)),
            # default of .5s overlap factor (1,64,121)
            'opt': (1, features, int(1.5 * time_to_fft)),
            # enough for 2s overlap factor
            'max': (1, features, int(3.0 * time_to_fft))
        }

        self.model = load_model(self.config, dynamic_shapes)

        # create CTC decoder
        if not self.classification:
            self.ctc_decoder = CTCDecoder.from_config(
                self.config['ctc_decoder'],
                self.config['decoder']['vocabulary'],
                os.path.dirname(self.config.model_path))

            logging.info(f"CTC decoder type: '{self.ctc_decoder.type}'")

        # create streaming buffer -- lengths below are in samples
        self.n_frame_len = int(self.frame_length * self.sample_rate)
        self.n_frame_overlap = int(self.frame_overlap * self.sample_rate)

        self.buffer_length = self.n_frame_len + self.n_frame_overlap
        self.buffer_duration = self.buffer_length / self.sample_rate  # seconds

        self.buffer = np.zeros(shape=self.buffer_length,
                               dtype=np.float32)  # 2*self.n_frame_overlap