def __init__(self, config, *args, **kwargs):
    """
    Loads a token classification model for NER from ONNX.
    """
    super(TokenClassificationEngine, self).__init__(config, *args, **kwargs)

    if self.config.type != 'token_classification':
        raise ValueError(
            f"{self.config.model_path} isn't a Token Classification model (type '{self.config.type}')"
        )

    # load model
    dynamic_shapes = {
        'max': (1, self.config['dataset']['max_seq_length'])  # (batch_size, sequence_length)
    }

    if nlp_dynamic_shapes:
        dynamic_shapes['min'] = (1, 1)

    self.model = load_model(self.config, dynamic_shapes)

    # create tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.config['tokenizer']['tokenizer_name'])
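# A minimal sketch of the shapes passed to load_model() above (hypothetical
# values, not read from any config in this repo): with nlp_dynamic_shapes
# enabled and max_seq_length=128 in the dataset config, the dict would be
#
#   dynamic_shapes = {
#       'max': (1, 128),   # (batch_size, sequence_length)
#       'min': (1, 1)
#   }
#
# i.e. the engine accepts any sequence length from a single token up to
# max_seq_length, with batch size fixed at 1.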
def __init__(self, config, *args, **kwargs):
    """
    Loads a question answering model from ONNX.
    """
    super(QuestionAnswerEngine, self).__init__(config, *args, **kwargs)

    if self.config.type != 'qa':
        raise ValueError(
            f"{self.config.model_path} isn't a Question Answering model (type '{self.config.type}')"
        )

    # load model
    dynamic_shapes = {
        'max': (1, self.config['dataset']['max_seq_length'])  # (batch_size, sequence_length)
    }

    if nlp_dynamic_shapes:
        dynamic_shapes['min'] = (1, 1)

    self.model = load_model(self.config, dynamic_shapes)

    # create tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.config['tokenizer']['tokenizer_name'])

    self.question_first = bool(self.tokenizer.padding_side == "right")
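# Note on question_first: HuggingFace tokenizers expose padding_side as either
# "right" or "left". A hedged illustration (assumes a BERT-style tokenizer,
# which pads on the right; the model name is only an example):
#
#   tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#   tokenizer.padding_side                                      # -> "right"
#   question_first = bool(tokenizer.padding_side == "right")    # -> True
#
# Presumably the flag controls whether the question or the context is encoded
# first when the (question, context) pair is tokenized downstream.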
def __init__(self, config, *args, **kwargs):
    """
    Loads an Intent/Slot classification model from ONNX.
    """
    super(IntentSlotEngine, self).__init__(config, *args, **kwargs)

    if self.config.type != 'intent_slot':
        raise ValueError(
            f"{self.config.model_path} isn't an Intent/Slot model (type '{self.config.type}')"
        )

    # load model
    dynamic_shapes = {
        'max': (1, self.config['language_model']['max_seq_length'])  # (batch_size, sequence_length)
    }

    if nlp_dynamic_shapes:
        dynamic_shapes['min'] = (1, 1)

    self.model = load_model(self.config, dynamic_shapes)

    # create tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.config['tokenizer']['tokenizer_name'])

    self.null_slot = self.slot_labels[-1]  # 'O' in assistant dataset - always the last label?
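# Illustration of the null_slot assumption (hypothetical label list, not taken
# from any dataset in this repo): if the slot labels were loaded as
#
#   slot_labels = ['weather.location', 'weather.time', 'O']
#
# then self.slot_labels[-1] picks 'O', the "no slot" label. As the trailing
# comment notes, this relies on the dataset always listing the null slot last.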
def __init__(self, config, *args, **kwargs):
    """
    Loads a text-to-speech model from ONNX or serialized TensorRT engine.

    Parameters:
      model (string) -- path to ONNX model or serialized TensorRT engine/plan
      config (string) -- path to model configuration json (will be inferred from model path if empty)
    """
    super(TTSEngine, self).__init__(config, *args, **kwargs)

    if self.config.type != 'tts':
        raise ValueError(
            f"{self.config.model_path} isn't a Text-to-Speech model (type '{self.config.type}')"
        )

    # load text->MEL generator model
    self.generator = load_model(self.config.generator)

    # load MEL->audio vocoder model
    features = self.config.vocoder.features

    dynamic_shapes = {
        'min': (1, features, 1),
        'opt': (1, features, 160),    # ~5-6 words
        'max': (1, features, 1024)    # ~20-30 words?
    }

    self.vocoder = load_model(self.config.vocoder, dynamic_shapes=dynamic_shapes)

    # create map of symbol->ID embeddings
    self.symbol_to_id = {s: i for i, s in enumerate(self.get_symbols())}

    # create operators for num-to-word conversion
    self.number_regex = re.compile(r'\d+(?:,\d+)?')  # https://stackoverflow.com/a/16321189
    self.number_inflect = inflect.engine()
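# Quick sanity check of the number-to-word operators (runnable standalone; the
# sample sentence is illustrative, not from this repo):
#
#   import re, inflect
#   number_regex = re.compile(r'\d+(?:,\d+)?')
#   number_regex.findall("set a timer for 1,000 seconds and another for 25")
#       # -> ['1,000', '25']
#   inflect.engine().number_to_words(25)
#       # -> 'twenty-five'
#
# which is presumably the substitution this engine applies to numerals in the
# input text before looking up symbol IDs.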
def __init__(self, config, *args, **kwargs):
    """
    Loads a streaming ASR model from ONNX or serialized TensorRT engine.

    Parameters:
      model (string) -- path to ONNX model or serialized TensorRT engine/plan
      config (string) -- path to model configuration json (will be inferred from model path if empty)
    """
    super(ASREngine, self).__init__(config, *args, **kwargs)

    if self.config.type != 'asr' and self.config.type != 'asr_classification':
        raise ValueError(
            f"{self.config.model_path} isn't an ASR model (type '{self.config.type}')"
        )

    # set some default config options that are non-standard in nemo
    if 'streaming' not in self.config:
        self.config['streaming'] = {}

    self.config['streaming'].setdefault('frame_length', 1.0)   # duration of signal frame, seconds (TODO shorter defaults for VAD/command classifiers)
    self.config['streaming'].setdefault('frame_overlap', 0.5)  # duration of overlap before/after current frame, seconds

    # some config changes for streaming
    if not self.classification:
        self.config['preprocessor']['dither'] = 0.0
        self.config['preprocessor']['pad_to'] = 0

    if 'ctc_decoder' not in self.config:
        self.config['ctc_decoder'] = {}

    self.config['ctc_decoder'].setdefault('type', 'greedy')          # greedy or beamsearch
    self.config['ctc_decoder'].setdefault('add_punctuation', True)   # add period to the end of sentences

    if 'add_punctuation' in kwargs:
        self.config['ctc_decoder']['add_punctuation'] = kwargs['add_punctuation']
        logging.info(f"add_punctuation = {kwargs['add_punctuation']}")

    if not self.classification and self.config['preprocessor']['features'] == 64:
        # TODO normalization coefficients for citrinet (N=80)
        normalization = {}

        normalization['fixed_mean'] = [
            -14.95827016, -12.71798736, -11.76067913, -10.83311182,
            -10.6746914, -10.15163465, -10.05378331, -9.53918999,
            -9.41858904, -9.23382904, -9.46470918, -9.56037,
            -9.57434245, -9.47498732, -9.7635205, -10.08113074,
            -10.05454561, -9.81112681, -9.68673603, -9.83652977,
            -9.90046248, -9.85404766, -9.92560366, -9.95440354,
            -10.17162966, -9.90102482, -9.47471025, -9.54416855,
            -10.07109475, -9.98249912, -9.74359465, -9.55632283,
            -9.23399915, -9.36487649, -9.81791084, -9.56799225,
            -9.70630899, -9.85148006, -9.8594418, -10.01378735,
            -9.98505315, -9.62016094, -10.342285, -10.41070709,
            -10.10687659, -10.14536695, -10.30828702, -10.23542833,
            -10.88546868, -11.31723646, -11.46087382, -11.54877829,
            -11.62400934, -11.92190509, -12.14063815, -11.65130117,
            -11.58308531, -12.22214663, -12.42927197, -12.58039805,
            -13.10098969, -13.14345864, -13.31835645, -14.47345634
        ]

        normalization['fixed_std'] = [
            3.81402054, 4.12647781, 4.05007065, 3.87790987,
            3.74721178, 3.68377423, 3.69344, 3.54001005,
            3.59530412, 3.63752368, 3.62826417, 3.56488469,
            3.53740577, 3.68313898, 3.67138151, 3.55707266,
            3.54919572, 3.55721289, 3.56723346, 3.46029304,
            3.44119672, 3.49030548, 3.39328435, 3.28244406,
            3.28001423, 3.26744937, 3.46692348, 3.35378948,
            2.96330901, 2.97663111, 3.04575148, 2.89717604,
            2.95659301, 2.90181116, 2.7111687, 2.93041291,
            2.86647897, 2.73473181, 2.71495654, 2.75543763,
            2.79174615, 2.96076456, 2.57376336, 2.68789782,
            2.90930817, 2.90412004, 2.76187531, 2.89905006,
            2.65896173, 2.81032176, 2.87769857, 2.84665271,
            2.80863137, 2.80707634, 2.83752184, 3.01914511,
            2.92046439, 2.78461139, 2.90034605, 2.94599508,
            2.99099718, 3.0167554, 3.04649716, 2.94116777
        ]

        self.config['preprocessor']['normalize'] = normalization

    # create preprocessor instance
    preprocessor_name = self.config['preprocessor']['_target_'].rsplit(".", 1)
    preprocessor_class = getattr(importlib.import_module(preprocessor_name[0]), preprocessor_name[1])

    logging.debug(f'ASR preprocessor - {preprocessor_class}')

    preprocessor_config = self.config['preprocessor'].copy()
    preprocessor_config.pop('_target_')

    self.preprocessor = preprocessor_class(**preprocessor_config)

    # load the model
    features = self.config.preprocessor.n_mels if self.classification else self.config.preprocessor.features
    time_to_fft = self.sample_rate * (1.0 / 160.0)  # rough conversion from samples to MEL spectrogram dims

    dynamic_shapes = {
        'min': (1, features, int(0.1 * time_to_fft)),  # minimum plausible frame length
        'opt': (1, features, int(1.5 * time_to_fft)),  # default of .5s overlap factor (1,64,121)
        'max': (1, features, int(3.0 * time_to_fft))   # enough for 2s overlap factor
    }

    self.model = load_model(self.config, dynamic_shapes)

    # create CTC decoder
    if not self.classification:
        self.ctc_decoder = CTCDecoder.from_config(self.config['ctc_decoder'],
                                                  self.config['decoder']['vocabulary'],
                                                  os.path.dirname(self.config.model_path))
        logging.info(f"CTC decoder type: '{self.ctc_decoder.type}'")

    # create streaming buffer
    self.n_frame_len = int(self.frame_length * self.sample_rate)
    self.n_frame_overlap = int(self.frame_overlap * self.sample_rate)

    self.buffer_length = self.n_frame_len + self.n_frame_overlap
    self.buffer_duration = self.buffer_length / self.sample_rate
    self.buffer = np.zeros(shape=self.buffer_length, dtype=np.float32)  # 2*self.n_frame_overlap
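# Worked example of the streaming buffer sizing (assumes a 16 kHz sample rate,
# which is typical for these ASR models but not asserted anywhere in this file):
#
#   frame_length    = 1.0                 # seconds, default set above
#   frame_overlap   = 0.5                 # seconds, default set above
#   n_frame_len     = int(1.0 * 16000)    # 16000 samples
#   n_frame_overlap = int(0.5 * 16000)    #  8000 samples
#   buffer_length   = 16000 + 8000        # 24000 samples
#   buffer_duration = 24000 / 16000       # 1.5 seconds of audio per inference window
#
# Each streaming step therefore processes the new 1.0 s frame plus 0.5 s of
# retained overlap from the previous frame.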