def test_sequence_mask(self):
    seq_len = torch.tensor([1, 4, 0, 3]).int()
    expected_mask = torch.tensor([
        [1, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        [1, 1, 1, 0],
    ]).bool()
    expected_mask2 = torch.tensor([
        [1, 0, 0, 0, 0],
        [1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0],
    ]).bool()
    generated_mask = utils.sequence_mask(seq_len)
    generated_mask2 = utils.sequence_mask(seq_len, max_len=5)
    self.assertTensorEqual(generated_mask, expected_mask)
    self.assertTensorEqual(generated_mask2, expected_mask2)
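
# The test above exercises a `sequence_mask(lengths, max_len=None)` helper that turns a
# batch of lengths into a boolean mask (True at non-padded positions). A minimal sketch
# of such a helper, under the assumption that this is its contract (not necessarily the
# exact implementation in the utils module):
import torch

def sequence_mask_sketch(sequence_length, max_len=None):
    """Return a bool mask of shape (batch, max_len), True where position < length."""
    if max_len is None:
        max_len = int(sequence_length.max())
    positions = torch.arange(max_len, device=sequence_length.device)
    # broadcast (1, max_len) < (batch, 1) -> (batch, max_len)
    return positions.unsqueeze(0) < sequence_length.unsqueeze(1)

# e.g. sequence_mask_sketch(torch.tensor([1, 4, 0, 3]).int()) reproduces `expected_mask`.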
def forward(self, src_tokens, src_lengths: Tensor, **unused):
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )
    if self.conv_layers_before is not None:
        x, src_lengths, padding_mask = self.conv_layers_before(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
    h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
    for i in range(len(self.lstm)):
        if self.residual and i > 0:  # residual connection starts from the 2nd layer
            prev_x = x
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data)
        # apply LSTM
        packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0)
        if i < len(self.lstm) - 1:  # not applying dropout for the last layer
            x = F.dropout(x, p=self.dropout_out, training=self.training)
        x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    return EncoderOut(
        encoder_out=x,  # T x B x C
        encoder_padding_mask=encoder_padding_mask
        if encoder_padding_mask.any() else None,  # T x B
        encoder_embedding=None,
        encoder_states=None,
        src_tokens=None,
        src_lengths=src_lengths,  # B
    )
def forward(self, src, src_lengths):
    x = src.transpose(1, 2).contiguous()  # B x T x C -> B x C x T
    x = F.relu(self.bn(self.tdnn(x)))
    x = x.transpose(2, 1).contiguous()  # B x C x T -> B x T x C
    x_lengths = self.output_lengths(src_lengths)
    padding_mask = ~speech_utils.sequence_mask(x_lengths, x.size(1))
    if padding_mask.any():
        x = x.masked_fill(padding_mask.unsqueeze(-1), 0.0)

    return x, x_lengths, padding_mask
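
# `self.output_lengths` above is assumed to map input frame counts to the frame counts
# produced by the strided TDNN (Conv1d). A minimal sketch of such a mapping, using the
# standard Conv1d output-length formula (kernel_size, stride, padding, dilation being the
# hyper-parameters of `self.tdnn`; names here are illustrative, not the module's API):
def conv1d_output_lengths(in_lengths, kernel_size, stride=1, padding=0, dilation=1):
    """floor((L + 2*padding - dilation*(kernel_size - 1) - 1) / stride) + 1."""
    return (in_lengths + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1

# Works element-wise on an integer length tensor, e.g.
# conv1d_output_lengths(torch.tensor([100, 87]), kernel_size=3, stride=3) -> tensor([33, 29]).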
def forward(self, src, src_lengths):
    # B x T x C -> B x (input channel num) x T x (C / input channel num)
    x = src.view(
        src.size(0),
        src.size(1),
        self.in_channels,
        src.size(2) // self.in_channels,
    ).transpose(1, 2)
    for conv, bn in zip(self.convolutions, self.batchnorms):
        x = F.relu(bn(conv(x)))
    # B x (output channel num) x T x C' -> B x T x (output channel num) x C'
    x = x.transpose(1, 2)
    # B x T x (output channel num) x C' -> B x T x C
    x = x.contiguous().view(x.size(0), x.size(1), x.size(2) * x.size(3))

    x_lengths = self.output_lengths(src_lengths)
    padding_mask = ~speech_utils.sequence_mask(x_lengths, x.size(1))
    if padding_mask.any():
        x = x.masked_fill(padding_mask.unsqueeze(-1), 0.0)

    return x, x_lengths, padding_mask
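
# A hedged, self-contained walk-through of the reshaping done above, using a dummy
# Conv2d (in_channels=1, out_channels=8, stride 2 on both axes are assumptions for
# illustration, not necessarily this module's configuration):
import torch
import torch.nn as nn

B, T, C, in_channels, out_channels = 3, 20, 40, 1, 8
src = torch.randn(B, T, C)
conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
x = src.view(B, T, in_channels, C // in_channels).transpose(1, 2)  # B x 1 x T x C (NCHW)
x = conv(x)                                                        # B x 8 x T/2 x C/2
x = x.transpose(1, 2)                                              # B x T/2 x 8 x C/2
x = x.contiguous().view(x.size(0), x.size(1), -1)                  # B x T/2 x (8 * C/2)
print(x.shape)  # torch.Size([3, 10, 160])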
def forward(self, src_tokens, src_lengths): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ if self.conv_layers_before is not None: x, src_lengths, encoder_padding_mask = self.conv_layers_before( src_tokens, src_lengths) else: x, encoder_padding_mask = src_tokens, \ ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)) if not encoder_padding_mask.any(): encoder_padding_mask = None x = F.dropout(x, p=self.dropout, training=self.training) if self.fc0 is not None: x = self.fc0(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.layer_norm: x = self.layer_norm(x) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T }
def main(args):
    assert args.path is not None, '--path required for recognition!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionary
    dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )
    for i, m in enumerate(models):
        if hasattr(m, 'is_wordlm') and m.is_wordlm:
            # assume subword LM comes before word LM
            if isinstance(models[i - 1], FairseqLanguageModel):
                models[i - 1] = MultiLevelLanguageModel(
                    m, models[i - 1],
                    subwordlm_weight=args.subwordlm_weight,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                del models[i]
                print('| LM fusion with Multi-level LM')
            else:
                models[i] = TensorizedLookaheadLanguageModel(
                    m, dict,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                print('| LM fusion with Look-ahead Word LM')
        # assume subword LM comes after E2E models
        elif i == len(models) - 1 and isinstance(m, FairseqLanguageModel):
            print('| LM fusion with Subword LM')
    if args.lm_weight != 0.0:
        print('| using LM fusion with lm-weight={:.2f}'.format(args.lm_weight))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, 'encoder')
                else (None, model.max_positions()) for model in models
            ]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    if args.match_source_len:
        print('| The option match_source_len is not applicable to '
              'speech recognition. Ignoring it.')
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute WER
    scorer = wer.Scorer(dict, wer_output_filter=args.wer_output_filter)
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(
                generator, models, sample, prefix_tokens, lm_weight=args.lm_weight,
            )
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            # obtain nonpad mask of encoder output to plot attentions
            if args.print_alignment:
                net_input = sample['net_input']
                src_tokens = net_input['src_tokens']
                output_lengths = models[0].encoder.output_lengths(net_input['src_lengths'])
                nonpad_idxs = sequence_mask(
                    output_lengths, models[0].encoder.output_lengths(src_tokens.size(1)))

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None
                utt_id = sample['utt_id'][i]

                # Retrieve the original sentences
                if has_target:
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_tokens(sample_id)
                    if not args.quiet:
                        target_sent = dict.tokens_to_sentence(
                            target_str, use_unk_sym=False, bpe_symbol=args.remove_bpe,
                        )
                        print('T-{}\t{}'.format(utt_id, target_sent))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_str = dict.string(
                        hypo['tokens'].int().cpu())  # not removing bpe at this point
                    if not args.quiet or i == 0:
                        hypo_sent = dict.tokens_to_sentence(
                            hypo_str, bpe_symbol=args.remove_bpe)

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(utt_id, hypo_sent, hypo['score']))

                    # Score and obtain attention only for the top hypothesis
                    if j == 0:
                        # src_len x tgt_len
                        attention = hypo['attention'][nonpad_idxs[i]].float().cpu() \
                            if args.print_alignment and hypo['attention'] is not None else None
                        if args.print_alignment and attention is not None:
                            save_dir = os.path.join(args.results_path, 'attn_plots')
                            os.makedirs(save_dir, exist_ok=True)
                            plot_attention(attention, hypo_sent, utt_id, save_dir)
                        scorer.add_prediction(utt_id, hypo_str, bpe_symbol=args.remove_bpe)
                        if has_target:
                            scorer.add_evaluation(
                                utt_id, target_str, hypo_str, bpe_symbol=args.remove_bpe)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
            num_sentences, gen_timer.n, gen_timer.sum,
            num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.print_alignment:
        print('| Saved attention plots in ' + save_dir)

    if has_target:
        assert args.test_text_files is not None
        scorer.add_ordered_utt_list(*args.test_text_files)

    os.makedirs(args.results_path, exist_ok=True)

    fn = 'decoded_char_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_char_results())
        print('| Decoded char results saved as ' + f.name)
    fn = 'decoded_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_results())
        print('| Decoded results saved as ' + f.name)

    if has_target:
        header = ' Recognize {} with beam={}: '.format(args.gen_subset, args.beam)
        fn = 'wer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(*(scorer.wer()))
            print('|' + header + res)
            f.write(res + '\n')
            print('| WER saved in ' + f.name)
        fn = 'cer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(*(scorer.cer()))
            print('|' + ' ' * len(header) + res)
            f.write(res + '\n')
            print('| CER saved in ' + f.name)
        fn = 'aligned_results.txt'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            f.write(scorer.print_aligned_results())
            print('| Aligned results saved as ' + f.name)

    return scorer
def forward(
    self,
    src_tokens: Tensor,
    src_lengths: Tensor,
    enforce_sorted: bool = True,
    **unused,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`
        enforce_sorted (bool, optional): if True, `src_tokens` is expected to
            contain sequences sorted by length in a decreasing order. If False,
            this condition is not required. Default: True.
    """
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )

    if self.pre_encoder is not None:
        x, src_lengths, padding_mask = self.pre_encoder(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = self.dropout_in_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    if self.multilayer_rnn_as_single_module:
        state_size = (
            (2 if self.bidirectional else 1) * self.num_layers,
            bsz,
            self.hidden_size,
        )
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x,
            (
                src_lengths.cpu()
                if not self.src_bucketed
                else src_lengths.new_full(src_lengths.size(), x.size(0), device="cpu")
            ),
            enforce_sorted=enforce_sorted,
        )
        # apply LSTM
        packed_outs, (_, _) = self.lstm(packed_x, (h0, c0))
        # unpack outputs
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0
        )
    else:  # for back-compatibility
        state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
        for i in range(len(self.lstm)):
            if self.residual and i > 0:  # residual connection starts from the 2nd layer
                prev_x = x
            # pack embedded source tokens into a PackedSequence
            packed_x = nn.utils.rnn.pack_padded_sequence(
                x,
                (
                    src_lengths.cpu()
                    if not self.src_bucketed
                    else src_lengths.new_full(src_lengths.size(), x.size(0), device="cpu")
                ),
                enforce_sorted=enforce_sorted,
            )
            # apply LSTM
            packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
            # unpack outputs and apply dropout
            x, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outs, padding_value=self.padding_value * 1.0
            )
            if i < len(self.lstm) - 1:  # not applying dropout for the last layer
                x = self.dropout_out_module(x)
            x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    # The Pytorch Mobile lite interpreter does not support returning NamedTuple in
    # `forward` so we use a dictionary instead.
    # TorchScript does not support mixed values so the values are all lists.
    # The empty list is equivalent to None.
    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask]
        if encoder_padding_mask.any()
        else [],  # T x B
        "encoder_embedding": [],
        "encoder_states": [],
        "src_tokens": [],
        "src_lengths": [src_lengths],  # B
    }
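
# The `enforce_sorted` argument above mirrors torch.nn.utils.rnn.pack_padded_sequence.
# A small, self-contained illustration (shapes and hyper-parameters are arbitrary):
# lengths must live on CPU, and with enforce_sorted=False the batch does not need to be
# pre-sorted by length.
import torch
import torch.nn as nn

x = torch.randn(5, 3, 8)                       # T x B x C
lengths = torch.tensor([3, 5, 2])              # unsorted lengths, on CPU
packed = nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False)
lstm = nn.LSTM(8, 16, bidirectional=True)
packed_out, _ = lstm(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out, padding_value=0.0)
print(out.shape, out_lengths)                  # torch.Size([5, 3, 32]) tensor([3, 5, 2])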
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('espresso.speech_recognize')
    if output_file is not sys.stdout:  # also print to stdout
        logger.addHandler(logging.StreamHandler(sys.stdout))

    print_options_meaning_changes(args, logger)

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionary
    dictionary = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )
    for i, m in enumerate(models):
        if hasattr(m, 'is_wordlm') and m.is_wordlm:
            # assume subword LM comes before word LM
            if isinstance(models[i - 1], FairseqLanguageModel):
                models[i - 1] = MultiLevelLanguageModel(
                    m, models[i - 1],
                    subwordlm_weight=args.subwordlm_weight,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                del models[i]
                logger.info('LM fusion with Multi-level LM')
            else:
                models[i] = TensorizedLookaheadLanguageModel(
                    m, dictionary,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                logger.info('LM fusion with Look-ahead Word LM')
        # assume subword LM comes after E2E models
        elif i == len(models) - 1 and isinstance(m, FairseqLanguageModel):
            logger.info('LM fusion with Subword LM')
    if args.lm_weight != 0.0:
        logger.info('using LM fusion with lm-weight={:.2f}'.format(args.lm_weight))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, 'encoder')
                else (None, model.max_positions()) for model in models
            ]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # Initialize generator
    if args.match_source_len:
        logger.warning(
            'The option match_source_len is not applicable to speech recognition. Ignoring it.'
        )
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute WER
    scorer = wer.Scorer(dictionary, wer_output_filter=args.wer_output_filter)
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample['target'][:, :args.prefix_size]

        gen_timer.start()
        hypos = task.inference_step(
            generator, models, sample, prefix_tokens, lm_weight=args.lm_weight,
        )
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        # obtain nonpad mask of encoder output to plot attentions
        if args.print_alignment:
            net_input = sample['net_input']
            src_tokens = net_input['src_tokens']
            output_lengths = models[0].encoder.output_lengths(net_input['src_lengths'])
            nonpad_idxs = sequence_mask(
                output_lengths, models[0].encoder.output_lengths(src_tokens.size(1)))

        for i in range(len(sample['id'])):
            has_target = sample['target'] is not None
            utt_id = sample['utt_id'][i]

            # Retrieve the original sentences
            if has_target:
                target_str = sample['target_raw_text'][i]
                if not args.quiet:
                    detok_target_str = decode_fn(target_str)
                    print('T-{}\t{}'.format(utt_id, detok_target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_str = dictionary.string(
                    hypo['tokens'].int().cpu(),
                    bpe_symbol=None,
                    extra_symbols_to_ignore={dictionary.pad()},
                )  # not removing bpe at this point
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    print('H-{}\t{}\t{}'.format(utt_id, detok_hypo_str, score),
                          file=output_file)

                # Score and obtain attention only for the top hypothesis
                if j == 0:
                    # src_len x tgt_len
                    attention = hypo['attention'][nonpad_idxs[i]].float().cpu() \
                        if args.print_alignment and hypo['attention'] is not None else None
                    if args.print_alignment and attention is not None:
                        save_dir = os.path.join(args.results_path, 'attn_plots')
                        os.makedirs(save_dir, exist_ok=True)
                        plot_attention(attention, detok_hypo_str, utt_id, save_dir)
                    scorer.add_prediction(utt_id, hypo_str)
                    if has_target:
                        scorer.add_evaluation(utt_id, target_str, hypo_str)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
            num_sentences, gen_timer.n, gen_timer.sum,
            num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.print_alignment:
        logger.info('Saved attention plots in ' + save_dir)

    if has_target:
        scorer.add_ordered_utt_list(task.datasets[args.gen_subset].tgt.utt_ids)

    fn = 'decoded_char_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_char_results())
        logger.info('Decoded char results saved as ' + f.name)
    fn = 'decoded_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_results())
        logger.info('Decoded results saved as ' + f.name)

    if has_target:
        header = 'Recognize {} with beam={}: '.format(args.gen_subset, args.beam)
        fn = 'wer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(*(scorer.wer()))
            logger.info(header + res)
            f.write(res + '\n')
            logger.info('WER saved in ' + f.name)
        fn = 'cer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(*(scorer.cer()))
            logger.info(' ' * len(header) + res)
            f.write(res + '\n')
            logger.info('CER saved in ' + f.name)
        fn = 'aligned_results.txt'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            f.write(scorer.print_aligned_results())
            logger.info('Aligned results saved as ' + f.name)

    return scorer
def forward(self, src_tokens, src_lengths):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`

    Returns:
        dict:
            - **encoder_out** (tuple): a tuple with two elements, where the
              first element is the last encoder layer's output and the
              second element is the same quantity summed with the input
              embedding (used for attention). The shape of both tensors is
              `(batch, src_len, embed_dim)`.
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
    """
    if self.conv_layers_before is not None:
        x, src_lengths, encoder_padding_mask = self.conv_layers_before(
            src_tokens, src_lengths)
    else:
        x, encoder_padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    x = F.dropout(x, p=self.dropout, training=self.training)
    if self.fc0 is not None:
        x = self.fc0(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    encoder_padding_mask = encoder_padding_mask.t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)

        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
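
# For an even kernel size, "same"-length padding cannot be symmetric, which is why the
# branch above pads (k-1)//2 frames on the left and k//2 on the right of the time axis.
# A hedged, self-contained illustration of that F.pad call on a T x B x C tensor
# (F.pad pads the trailing dimensions first, so the two leading (0, 0) pairs leave
# C and B untouched):
import torch
import torch.nn.functional as F

T, B, C, k = 7, 2, 4, 4
x = torch.randn(T, B, C)
padding_l, padding_r = (k - 1) // 2, k // 2
x_padded = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
assert x_padded.size(0) == T + k - 1  # a convolution with kernel k then restores length T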
def forward(
    self,
    src_tokens,
    src_lengths,
    return_all_hiddens: bool = False,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (torch.LongTensor): lengths of each source sentence of
            shape `(batch)`
        return_all_hiddens (bool, optional): also return all of the
            intermediate hidden states (default: False).

    Returns:
        dict:
            - **encoder_out** (Tensor): the last encoder layer's output of
              shape `(src_len, batch, embed_dim)`
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
            - **encoder_embedding** (Tensor): the (scaled) embedding lookup
              of shape `(batch, src_len, embed_dim)`
            - **encoder_states** (List[Tensor]): all intermediate hidden
              states of shape `(src_len, batch, embed_dim)`. Only populated
              if *return_all_hiddens* is True.
    """
    if self.conv_layers_before is not None:
        x, src_lengths, encoder_padding_mask = self.conv_layers_before(
            src_tokens, src_lengths)
    else:
        x, encoder_padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    x = self.dropout_module(x)
    if self.fc0 is not None:
        x = self.fc0(x)
        if self.embed_positions is not None:
            # 0s in `~encoder_padding_mask` are used as pad_idx for positional embeddings
            x = x + self.embed_positions((~encoder_padding_mask).int())
        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)
        x = self.dropout_module(x)
    elif self.embed_positions is not None:
        # 0s in `~encoder_padding_mask` are used as pad_idx for positional embeddings
        x = x + self.embed_positions((~encoder_padding_mask).int())
        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    attn_mask = self.get_attn_mask(src_lengths)

    encoder_states = []

    # encoder layers
    for layer in self.layers:
        x = layer(x, encoder_padding_mask, attn_mask=attn_mask)
        if return_all_hiddens:
            assert encoder_states is not None
            encoder_states.append(x)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # The Pytorch Mobile lite interpreter does not support returning NamedTuple in
    # `forward` so we use a dictionary instead.
    # TorchScript does not support mixed values so the values are all lists.
    # The empty list is equivalent to None.
    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask]
        if encoder_padding_mask.any()
        else [],  # B x T
        "encoder_embedding": [],
        "encoder_states": encoder_states,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
    }
def forward(
    self,
    src_tokens: Tensor,
    src_lengths: Tensor,
    enforce_sorted: bool = True,
    **unused,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`
        enforce_sorted (bool, optional): if True, `src_tokens` is expected to
            contain sequences sorted by length in a decreasing order. If False,
            this condition is not required. Default: True.
    """
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )
    if self.conv_layers_before is not None:
        x, src_lengths, padding_mask = self.conv_layers_before(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = self.dropout_in_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
    h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
    for i in range(len(self.lstm)):
        if self.residual and i > 0:  # residual connection starts from the 2nd layer
            prev_x = x
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x,
            (
                src_lengths.cpu() if not self.src_bucketed
                else src_lengths.new_full(src_lengths.size(), x.size(0), device="cpu")
            ),
            enforce_sorted=enforce_sorted,
        )
        # apply LSTM
        packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0)
        if i < len(self.lstm) - 1:  # not applying dropout for the last layer
            x = self.dropout_out_module(x)
        x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    return EncoderOut(
        encoder_out=x,  # T x B x C
        encoder_padding_mask=encoder_padding_mask
        if encoder_padding_mask.any() else None,  # T x B
        encoder_embedding=None,
        encoder_states=None,
        src_tokens=None,
        src_lengths=src_lengths,  # B
    )
def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (torch.LongTensor): lengths of each source sentence of
            shape `(batch)`
        return_all_hiddens (bool, optional): also return all of the
            intermediate hidden states (default: False).

    Returns:
        namedtuple:
            - **encoder_out** (Tensor): the last encoder layer's output of
              shape `(src_len, batch, embed_dim)`
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
            - **encoder_embedding** (Tensor): the (scaled) embedding lookup
              of shape `(batch, src_len, embed_dim)`
            - **encoder_states** (List[Tensor]): all intermediate hidden
              states of shape `(src_len, batch, embed_dim)`. Only populated
              if *return_all_hiddens* is True.
    """
    if self.conv_layers_before is not None:
        x, src_lengths, encoder_padding_mask = self.conv_layers_before(
            src_tokens, src_lengths)
    else:
        x, encoder_padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    x = self.dropout_module(x)
    if self.fc0 is not None:
        x = self.fc0(x)
        if self.embed_positions is not None:
            # 0s in `~encoder_padding_mask` are used as pad_idx for positional embeddings
            x = x + self.embed_positions((~encoder_padding_mask).int())
        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)
        x = self.dropout_module(x)
    elif self.embed_positions is not None:
        # 0s in `~encoder_padding_mask` are used as pad_idx for positional embeddings
        x = x + self.embed_positions((~encoder_padding_mask).int())
        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)

    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    attn_mask = self.get_attn_mask(src_lengths)

    encoder_states = [] if return_all_hiddens else None

    # encoder layers
    for layer in self.layers:
        x = layer(x, encoder_padding_mask, attn_mask=attn_mask)
        if return_all_hiddens:
            assert encoder_states is not None
            encoder_states.append(x)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    return EncoderOut(
        encoder_out=x,  # T x B x C
        encoder_padding_mask=encoder_padding_mask,  # B x T
        encoder_embedding=None,
        encoder_states=encoder_states,  # List[T x B x C]
        src_tokens=None,
        src_lengths=None,
    )
def _main(args, output_file): logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=output_file, ) logger = logging.getLogger("espresso.speech_recognize") if output_file is not sys.stdout: # also print to stdout logger.addHandler(logging.StreamHandler(sys.stdout)) print_options_meaning_changes(args, logger) utils.import_user_module(args) if args.max_tokens is None and args.batch_size is None: args.max_tokens = 12000 logger.info(args) # Fix seed for stochastic decoding if args.seed is not None and not args.no_seed_provided: np.random.seed(args.seed) utils.set_torch_seed(args.seed) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset split task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Set dictionary dictionary = task.target_dictionary overrides = ast.literal_eval(args.model_overrides) # Load ensemble logger.info("loading model(s) from {}".format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=overrides, task=task, suffix=getattr(args, "checkpoint_suffix", ""), strict=(args.checkpoint_shard_count == 1), num_shards=args.checkpoint_shard_count, ) if args.lm_path is not None: overrides["data"] = args.data try: lms, _ = checkpoint_utils.load_model_ensemble( utils.split_paths(args.lm_path), arg_overrides=overrides, task=None, ) except: logger.warning( f"Failed to load language model! Please make sure that the language model dict is the same " f"as target dict and is located in the data dir ({args.data})") raise assert len(lms) == 1 or len(lms) == 2 # Multi-level LM expects two LMs else: lms = [None] for i, m in enumerate(lms): if m is None: continue if hasattr(m, "is_wordlm") and m.is_wordlm: # assume subword LM comes before word LM if i > 0 and isinstance(lms[i - 1], FairseqLanguageModel): lms[i - 1] = MultiLevelLanguageModel( m, lms[i - 1], subwordlm_weight=args.subwordlm_weight, oov_penalty=args.oov_penalty, open_vocab=not args.disable_open_vocab, ) del lms[i] logger.info("LM fusion with Multi-level LM") else: lms[i] = TensorizedLookaheadLanguageModel( m, dictionary, oov_penalty=args.oov_penalty, open_vocab=not args.disable_open_vocab, ) logger.info("LM fusion with Look-ahead Word LM") else: assert isinstance(m, FairseqLanguageModel) logger.info("LM fusion with Subword LM") if args.lm_weight != 0.0: logger.info("using LM fusion with lm-weight={:.2f}".format( args.lm_weight)) # Optimize ensemble for generation for model in chain(models, lms): if model is None: continue if args.fp16: model.half() if use_cuda and not args.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(args) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[ model.max_positions() if hasattr(model, "encoder") else (None, model.max_positions()) for model in models ]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, data_buffer_size=args.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, default_log_format=("tqdm" if not args.no_progress_bar else "none"), ) # 
Initialize generator if args.match_source_len: logger.warning( "The option match_source_len is not applicable to speech recognition. Ignoring it." ) gen_timer = StopwatchMeter() extra_gen_cls_kwargs = { "lm_model": lms[0], "lm_weight": args.lm_weight, "eos_factor": args.eos_factor, } args.score_reference = False # not applicable for ASR temp_val = args.print_alignment args.print_alignment = False # not applicable for ASR generator = task.build_generator(models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs) args.print_alignment = temp_val # Handle tokenization and BPE tokenizer = task.build_tokenizer(args) bpe = task.build_bpe(args) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Generate and compute WER scorer = wer.Scorer(dictionary, wer_output_filter=args.wer_output_filter) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample["target"][:, :args.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) # obtain nonpad mask of encoder output to plot attentions if args.print_alignment: net_input = sample["net_input"] src_tokens = net_input["src_tokens"] output_lengths = models[0].encoder.output_lengths( net_input["src_lengths"]) nonpad_idxs = sequence_mask( output_lengths, models[0].encoder.output_lengths(src_tokens.size(1))) for i in range(len(sample["id"])): has_target = sample["target"] is not None utt_id = sample["utt_id"][i] # Retrieve the original sentences if has_target: target_str = sample["target_raw_text"][i] if not args.quiet: detok_target_str = decode_fn(target_str) print("T-{}\t{}".format(utt_id, detok_target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_str = dictionary.string( hypo["tokens"].int().cpu(), bpe_symbol=None, extra_symbols_to_ignore=get_symbols_to_strip_from_output( generator), ) # not removing bpe at this point detok_hypo_str = decode_fn(hypo_str) if not args.quiet: score = hypo["score"] / math.log(2) # convert to base 2 print("H-{}\t{}\t{}".format(utt_id, detok_hypo_str, score), file=output_file) # Score and obtain attention only the top hypothesis if j == 0: # src_len x tgt_len attention = hypo["attention"][nonpad_idxs[i]].float().cpu() \ if args.print_alignment and hypo["attention"] is not None else None if args.print_alignment and attention is not None: save_dir = os.path.join(args.results_path, "attn_plots") os.makedirs(save_dir, exist_ok=True) plot_attention(attention, detok_hypo_str, utt_id, save_dir) scorer.add_prediction(utt_id, hypo_str) if has_target: scorer.add_evaluation(utt_id, target_str, hypo_str) wps_meter.update(num_generated_tokens) progress.log({"wps": round(wps_meter.avg)}) num_sentences += sample[ "nsentences"] if "nsentences" in sample else sample["id"].numel() logger.info("NOTE: hypothesis and token scores are output in base 2") logger.info( "Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)" .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. 
/ gen_timer.avg)) if args.print_alignment: logger.info("Saved attention plots in " + save_dir) if has_target: scorer.add_ordered_utt_list(task.datasets[args.gen_subset].tgt.utt_ids) fn = "decoded_char_results.txt" with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f: f.write(scorer.print_char_results()) logger.info("Decoded char results saved as " + f.name) fn = "decoded_results.txt" with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f: f.write(scorer.print_results()) logger.info("Decoded results saved as " + f.name) if has_target: header = "Recognize {} with beam={}: ".format(args.gen_subset, args.beam) fn = "wer" with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f: res = "WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format( *(scorer.wer())) logger.info(header + res) f.write(res + "\n") logger.info("WER saved in " + f.name) fn = "cer" with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f: res = "CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format( *(scorer.cer())) logger.info(" " * len(header) + res) f.write(res + "\n") logger.info("CER saved in " + f.name) fn = "aligned_results.txt" with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f: f.write(scorer.print_aligned_results()) logger.info("Aligned results saved as " + f.name) return scorer
def forward_scriptable(
    self,
    src_tokens,
    src_lengths,
    return_all_hiddens: bool = False,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (torch.LongTensor): lengths of each source sentence of
            shape `(batch)`
        return_all_hiddens (bool, optional): also return all of the
            intermediate hidden states (default: False).

    Returns:
        dict:
            - **encoder_out** (Tensor): the last encoder layer's output of
              shape `(src_len, batch, embed_dim)`
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
            - **encoder_embedding** (Tensor): the (scaled) embedding lookup
              of shape `(batch, src_len, embed_dim)`
            - **encoder_states** (List[Tensor]): all intermediate hidden
              states of shape `(src_len, batch, embed_dim)`. Only populated
              if *return_all_hiddens* is True.
    """
    if self.pre_encoder is not None:
        x, src_lengths, encoder_padding_mask = self.pre_encoder(src_tokens, src_lengths)
    else:
        x, encoder_padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )
    has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any()

    if self.fc0 is not None:
        x = self.dropout_module(x)
        x = self.fc0(x)

    x = self.embed_scale * x

    if self.embed_positions is not None:
        # 0s in `~encoder_padding_mask` are used as pad_idx for positional embeddings
        x = x + self.embed_positions((~encoder_padding_mask).int())

    if self.layernorm_embedding is not None:
        x = self.layernorm_embedding(x)
    x = self.dropout_module(x)

    if self.quant_noise is not None:
        x = self.quant_noise(x)

    # account for padding while computing the representation
    if has_pads:
        x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    encoder_states = []
    fc_results = []

    if return_all_hiddens:
        encoder_states.append(x)

    attn_mask = self.get_attn_mask(src_lengths)

    # nested tensor and BT enable
    layer = self.layers[0]
    BT_flag = False
    NT_flag = False
    # torch version check, BT>=1.12.0 and NT>=1.13.0.dev20220613
    # internal format is '1.13.0a0+fb'
    # external format is '1.13.0.dev20220613'(cpu&gpu) for nightly or "1.11.0"(cpu) or '1.11.0+cu102'(gpu) for stable
    BT_version = False
    NT_version = False
    if "fb" in torch.__version__:
        BT_version = True
        NT_version = True
    else:
        if "+" in torch.__version__:
            torch_version = torch.__version__.split("+")[0]
        else:
            torch_version = torch.__version__
        torch_version = torch_version.split(".")
        int_version = (
            int(torch_version[0]) * 1000
            + int(torch_version[1]) * 10
            + int(torch_version[2])
        )
        if len(torch_version) == 3:
            if int_version >= 1120:
                BT_version = True
            if int_version >= 1131:
                NT_version = True
        elif len(torch_version) == 4:
            if int_version >= 1130:
                BT_version = True
            # Consider _nested_tensor_from_mask_left_aligned is landed after "20220613"
            if int_version >= 1131 or (
                int_version == 1130 and torch_version[3][3:] >= "20220613"
            ):
                NT_version = True

    if (
        BT_version
        and x.dim() == 3
        and layer.load_to_BT
        and not layer.return_fc
        and layer.can_use_fastpath
        and not layer.training
        and not layer.ever_training
        and not layer.cfg_checkpoint_activations
    ):
        # Batch first can not be justified but needs user to make sure
        x = x.transpose(0, 1)
        # Check mask conditions for nested tensor
        if NT_version:
            if (
                encoder_padding_mask is not None
                and torch._nested_tensor_from_mask_left_aligned(
                    x, encoder_padding_mask.logical_not()
                )
            ):
                if not torch.is_grad_enabled() or not x.requires_grad:
                    x = torch._nested_tensor_from_mask(
                        x, encoder_padding_mask.logical_not()
                    )
                    NT_flag = True
        BT_flag = True

    # encoder layers
    if NT_flag:
        processing_mask = None
    else:
        processing_mask = encoder_padding_mask
    encoder_padding_mask_out = processing_mask if has_pads else None
    for layer in self.layers:
        lr = layer(
            x, encoder_padding_mask=encoder_padding_mask_out, attn_mask=attn_mask
        )

        if isinstance(lr, tuple) and len(lr) == 2:
            x, fc_result = lr
        else:
            x = lr
            fc_result = None

        if return_all_hiddens and not torch.jit.is_scripting():
            assert encoder_states is not None
            encoder_states.append(x)
            fc_results.append(fc_result)

    # change back to non-nested and Batch second
    if NT_flag:
        x = x.to_padded_tensor(0.0)

    if NT_flag or BT_flag:
        x = x.transpose(0, 1)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # The Pytorch Mobile lite interpreter does not support returning NamedTuple in
    # `forward` so we use a dictionary instead.
    # TorchScript does not support mixed values so the values are all lists.
    # The empty list is equivalent to None.
    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask]
        if encoder_padding_mask.any()
        else [],  # B x T
        "encoder_embedding": [],
        "encoder_states": encoder_states,  # List[T x B x C]
        "fc_results": fc_results,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [src_lengths],  # List[B]
    }
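
# Downstream code is assumed to unpack the dictionary/list convention used by the
# encoders above, where an empty list stands in for None. A minimal, hedged sketch of
# such a consumer (the helper name is illustrative, not part of the encoder's API):
from typing import Dict, List, Optional
from torch import Tensor

def get_encoder_padding_mask(encoder_out: Dict[str, List[Tensor]]) -> Optional[Tensor]:
    """Return the padding mask from an encoder-out dict, or None if there is no padding."""
    masks = encoder_out["encoder_padding_mask"]
    return masks[0] if len(masks) > 0 else None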