def load_openvino_model():
    logdir = os.path.join('logs', FLAGS.name)
    tokenizer = HuggingFaceTokenizer(cache_dir=logdir, vocab_size=FLAGS.bpe_size)
    _, transform, input_size = build_transform(
        feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample, pad_to_divisible=False,
        T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

    ie = IECore()
    encoder_net = ie.read_network(
        model=os.path.join(logdir, 'encoder.xml'),
        weights=os.path.join(logdir, 'encoder.bin'))
    encoder = ie.load_network(network=encoder_net, device_name='CPU')

    decoder_net = ie.read_network(
        model=os.path.join(logdir, 'decoder.xml'),
        weights=os.path.join(logdir, 'decoder.bin'))
    decoder = ie.load_network(network=decoder_net, device_name='CPU')

    joint_net = ie.read_network(
        model=os.path.join(logdir, 'joint.xml'),
        weights=os.path.join(logdir, 'joint.bin'))
    joint = ie.load_network(network=joint_net, device_name='CPU')

    return encoder, decoder, joint, tokenizer, transform
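# Usage sketch (illustrative, not from the source): load_openvino_model() returns
# three OpenVINO ExecutableNetwork objects plus the tokenizer and feature transform.
# ExecutableNetwork.infer() expects a dict keyed by the IR's input blob names, which
# depend on how the encoder/decoder/joint graphs were exported -- inspect the .xml
# files (or the networks' input/output maps) before wiring up real inference.
# The blob name below is a placeholder, not the project's actual input name.
#
# encoder, decoder, joint, tokenizer, transform = load_openvino_model()
# # enc_out = encoder.infer(inputs={'<encoder_input_name>': feature_chunk})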
def __init__(self, FLAGS):
    self.FLAGS = FLAGS
    logdir = os.path.join('logs', FLAGS.name)
    self.tokenizer = HuggingFaceTokenizer(
        cache_dir=logdir, vocab_size=FLAGS.bpe_size)
    _, self.transform, input_size = build_transform(
        feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample, pad_to_divisible=False,
        T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

    ie = IECore()
    encoder_net = ie.read_network(
        model=os.path.join(logdir, 'encoder.xml'),
        weights=os.path.join(logdir, 'encoder.bin'))
    self.encoder = ie.load_network(network=encoder_net, device_name='CPU')

    decoder_net = ie.read_network(
        model=os.path.join(logdir, 'decoder.xml'),
        weights=os.path.join(logdir, 'decoder.bin'))
    self.decoder = ie.load_network(network=decoder_net, device_name='CPU')

    joint_net = ie.read_network(
        model=os.path.join(logdir, 'joint.xml'),
        weights=os.path.join(logdir, 'joint.bin'))
    self.joint = ie.load_network(network=joint_net, device_name='CPU')

    self.reset_profile()
    self.reset()
def __init__(self, FLAGS):
    self.FLAGS = FLAGS
    logdir = os.path.join('logs', FLAGS.name)
    self.tokenizer = HuggingFaceTokenizer(
        cache_dir='BPE-' + str(FLAGS.bpe_size), vocab_size=FLAGS.bpe_size)
    assert self.tokenizer.tokenizer is not None
    _, self.transform, input_size = build_transform(
        feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample, pad_to_divisible=False,
        T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

    model_path = os.path.join(logdir, 'models', FLAGS.model_name)
    if os.path.exists(model_path):
        checkpoint = torch.load(model_path, lambda storage, loc: storage)
    else:
        model_path = os.path.join(logdir, FLAGS.model_name)
        checkpoint = torch.load(model_path, lambda storage, loc: storage)

    transducer = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=self.tokenizer.vocab_size,
        input_size=input_size,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
        output_loss=False,
    )
    transducer.load_state_dict(convert_lightning2normal(checkpoint)['model'])
    transducer.eval()
    self.encoder = transducer.encoder
    self.decoder = transducer.decoder
    self.joint = transducer.joint
    self.reset_profile()
    self.reset()
def main(argv):
    assert FLAGS.step_n_frame % 2 == 0, (
        "step_n_frame must be divisible by reduction_factor of TimeReduction")

    logdir = os.path.join('logs', FLAGS.name)
    tokenizer = HuggingFaceTokenizer(cache_dir=logdir, vocab_size=FLAGS.bpe_size)
    transform_train, transform_test, input_size = build_transform(
        feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample,
        T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

    model_path = os.path.join(logdir, 'models', FLAGS.model_name)
    checkpoint = torch.load(model_path, lambda storage, loc: storage)
    transducer = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=tokenizer.vocab_size,
        input_size=input_size,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
    )
    transducer.load_state_dict(checkpoint['model'])
    transducer.eval()

    export_encoder(transducer, input_size, tokenizer.vocab_size, logdir)
    export_decoder(transducer, input_size, tokenizer.vocab_size, logdir)
    export_join(transducer, input_size, tokenizer.vocab_size, logdir)
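# Conversion sketch (assumption, not from the source): if export_encoder /
# export_decoder / export_join write ONNX graphs into logdir (the .onnx file names
# here are hypothetical), OpenVINO's Model Optimizer can convert them into the
# encoder.xml / encoder.bin IR pairs that load_openvino_model() expects.
#
# import subprocess
# for name in ('encoder', 'decoder', 'joint'):
#     subprocess.run(
#         ['mo', '--input_model', os.path.join(logdir, name + '.onnx'),
#          '--output_dir', logdir],
#         check=True)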
def load_pytorch_model():
    logdir = os.path.join('logs', FLAGS.name)
    tokenizer = HuggingFaceTokenizer(cache_dir=logdir, vocab_size=FLAGS.bpe_size)
    _, transform, input_size = build_transform(
        feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
        n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
        hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
        downsample=FLAGS.downsample, pad_to_divisible=False,
        T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
        F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

    model_path = os.path.join(logdir, 'models', '%d.pt' % FLAGS.step)
    checkpoint = torch.load(model_path, lambda storage, loc: storage)
    transducer = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=tokenizer.vocab_size,
        input_size=input_size,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
    )
    transducer.load_state_dict(checkpoint['model'])
    transducer.eval()
    encoder = transducer.encoder
    decoder = transducer.decoder
    joint = transducer.joint
    return encoder, decoder, joint, tokenizer, transform
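# Sketch of how the three returned modules compose in RNN-T greedy decoding.
# Illustration only: the hidden-state handling and the exact call signatures of
# encoder/decoder/joint are assumptions, not the project's actual interfaces, and
# this simplified variant emits at most one non-blank token per encoder frame
# (the project's greedy_decode may loop within a frame).
#
# enc_out, _ = encoder(features)                      # assumed shape (1, T, H)
# token, dec_state = NUL, None                        # NUL is the blank id
# hypothesis = []
# for t in range(enc_out.size(1)):
#     dec_out, dec_state = decoder(torch.tensor([[token]]), dec_state)
#     logits = joint(enc_out[:, t], dec_out[:, 0])
#     pred = logits.argmax(dim=-1).item()
#     if pred != NUL:
#         hypothesis.append(pred)
#         token = pred
# text = tokenizer.decode(hypothesis)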
class ParallelTraining(pl.LightningModule):
    def __init__(self):
        super(ParallelTraining, self).__init__()
        _, _, input_size = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)
        self.log_path = None
        self.loss_fn = RNNTLoss(blank=NUL)
        if FLAGS.tokenizer == 'char':
            self.tokenizer = CharTokenizer(cache_dir=self.logdir)
        else:
            self.tokenizer = HuggingFaceTokenizer(
                cache_dir='BPE-2048', vocab_size=FLAGS.bpe_size)
        self.vocab_size = self.tokenizer.vocab_size
        print(FLAGS.enc_type)
        self.model = Transducer(
            vocab_embed_size=FLAGS.vocab_embed_size,
            vocab_size=self.vocab_size,
            input_size=input_size,
            enc_hidden_size=FLAGS.enc_hidden_size,
            enc_layers=FLAGS.enc_layers,
            enc_dropout=FLAGS.enc_dropout,
            enc_proj_size=FLAGS.enc_proj_size,
            dec_hidden_size=FLAGS.dec_hidden_size,
            dec_layers=FLAGS.dec_layers,
            dec_dropout=FLAGS.dec_dropout,
            dec_proj_size=FLAGS.dec_proj_size,
            joint_size=FLAGS.joint_size,
            module_type=FLAGS.enc_type,
            output_loss=False,
        )
        self.latest_alignment = None
        self.steps = 0
        self.epoch = 0
        self.best_wer = 1000

    def warmup_optimizer_step(self, steps):
        if steps < FLAGS.warmup_step:
            lr_scale = min(1., float(steps + 1) / FLAGS.warmup_step * 1.0)
            for pg in self.optimizer.param_groups:
                pg['lr'] = lr_scale * FLAGS.lr

    def forward(self, batch):
        xs, ys, xlen, ylen = batch
        # xs, ys, xlen = xs.cuda(), ys, xlen.cuda()
        alignment = self.model(xs, ys, xlen, ylen)
        return alignment

    def training_step(self, batch, batch_nb):
        xs, ys, xlen, ylen = batch
        # xs, ys, xlen = xs.cuda(), ys, xlen.cuda()
        if xs.shape[1] != xlen.max():
            xs = xs[:, :xlen.max()]
            ys = ys[:, :ylen.max()]
        alignment = self.model(xs, ys, xlen, ylen)
        xlen = self.model.scale_length(alignment, xlen)
        loss = self.loss_fn(alignment, ys.int(), xlen, ylen)
        if batch_nb % 100 == 0:
            lr_val = 0
            for param_group in self.optimizer.param_groups:
                lr_val = param_group['lr']
            self.logger.experiment.add_scalar('lr', lr_val, self.steps)
        self.steps += 1
        if self.steps < FLAGS.warmup_step:
            self.warmup_optimizer_step(self.steps)
        return {'loss': loss, 'log': {'loss': loss.item()}}

    def validation_step(self, batch, batch_nb):
        xs, ys, xlen, ylen = batch
        y, nll = self.model.greedy_decode(xs, xlen)
        hypothesis = self.tokenizer.decode_plus(y)
        ground_truth = self.tokenizer.decode_plus(ys.cpu().numpy())
        measures = jiwer.compute_measures(ground_truth, hypothesis)
        return {
            'val_loss': nll.mean().item(),
            'wer': measures['wer'],
            'ground_truth': ground_truth[0],
            'hypothesis': hypothesis[0]
        }

    def validation_end(self, outputs):
        # OPTIONAL
        self.logger.experiment.add_text('test', 'This is test', 0)
        avg_wer = np.mean([x['wer'] for x in outputs])
        ppl = np.mean([x['val_loss'] for x in outputs])
        self.logger.experiment.add_scalar('val/WER', avg_wer, self.steps)
        self.logger.experiment.add_scalar('val/perplexity', ppl, self.steps)
        hypothesis, ground_truth = '', ''
        for idx in range(min(5, len(outputs))):
            hypothesis += outputs[idx]['hypothesis'] + '\n\n'
            ground_truth += outputs[idx]['ground_truth'] + '\n\n'
        self.logger.experiment.add_text('generated', hypothesis, self.steps)
        self.logger.experiment.add_text('ground_truth', ground_truth, self.steps)
        if self.latest_alignment is not None:
            alignment = self.latest_alignment
            idx = random.randint(0, alignment.size(0) - 1)
            alignment = torch.softmax(alignment[idx], dim=-1)
            alignment[:, :, 0] = 0  # ignore blank token
            alignment = alignment.mean(dim=-1)
            self.logger.experiment.add_image(
                "alignment",
                plot_alignment_to_numpy(alignment.data.numpy().T),
                self.steps, dataformats='HWC')
        self.logger.experiment.flush()
        if self.best_wer > avg_wer and self.epoch > 0:
            print('best checkpoint found!')
            # checkpoint = {
            #     'model': self.model.state_dict(),
            #     'optimizer': self.optimizer.state_dict(),
            #     'epoch': self.epoch
            # }
            # if FLAGS.apex:
            #     checkpoint['amp'] = amp.state_dict()
            # torch.save(checkpoint, os.path.join(
            #     self.log_path, str(self.epoch) + 'amp_checkpoint.pt'))
            self.trainer.save_checkpoint(
                os.path.join(self.log_path,
                             str(self.epoch) + 'amp_checkpoint.pt'))
            self.best_wer = avg_wer
        self.plateau_scheduler.step(avg_wer)
        self.epoch += 1
        return {
            'val/WER': torch.tensor(avg_wer),
            'wer': torch.tensor(avg_wer),
            'val/perplexity': torch.tensor(ppl)
        }

    def validation_epoch_end(self, outputs):
        avg_wer = np.mean([x['wer'] for x in outputs])
        ppl = np.mean([x['val_loss'] for x in outputs])
        hypothesis, ground_truth = '', ''
        for idx in range(5):
            hypothesis += outputs[idx]['hypothesis'] + '\n\n'
            ground_truth += outputs[idx]['ground_truth'] + '\n\n'
        writer.add_text('generated', hypothesis, self.steps)
        writer.add_text('ground_truth', ground_truth, self.steps)
        if self.latest_alignment is not None:
            alignment = self.latest_alignment
            idx = random.randint(0, alignment.size(0) - 1)
            alignment = torch.softmax(alignment[idx], dim=-1)
            alignment[:, :, 0] = 0  # ignore blank token
            alignment = alignment.mean(dim=-1)
            writer.add_image(
                "alignment",
                plot_alignment_to_numpy(alignment.data.numpy().T),
                self.steps, dataformats='HWC')
        self.logger.experiment.add_scalar('val/WER', avg_wer, self.steps)
        self.logger.experiment.add_scalar('val/perplexity', ppl, self.steps)
        self.logger.experiment.flush()
        self.plateau_scheduler.step(avg_wer)
        self.epoch += 1
        return {
            'val/WER': torch.tensor(avg_wer),
            'val/perplexity': torch.tensor(ppl)
        }

    def configure_optimizers(self):
        if FLAGS.optim == 'adam':
            self.optimizer = AdamW(
                self.model.parameters(), lr=FLAGS.lr, weight_decay=1e-5)
        elif FLAGS.optim == 'sm3':
            self.optimizer = SM3(
                self.model.parameters(), lr=FLAGS.lr, momentum=0.0)
        else:
            self.optimizer = Novograd(
                self.model.parameters(), lr=FLAGS.lr, weight_decay=1e-3)
        scheduler = []
        if FLAGS.sched:
            self.plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                patience=FLAGS.sched_patience,
                factor=FLAGS.sched_factor,
                min_lr=FLAGS.sched_min_lr,
                verbose=1)
            scheduler = [self.plateau_scheduler]
        self.warmup_optimizer_step(0)
        return [self.optimizer]

    @pl.data_loader
    def train_dataloader(self):
        transform_train, _, _ = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)
        dataloader = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_train_500,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_360,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                # Librispeech(root=FLAGS.LibriSpeech_train_100,
                #             tokenizer=self.tokenizer,
                #             transform=transform_train,
                #             audio_max_length=FLAGS.audio_max_length),
                TEDLIUM(root=FLAGS.TEDLIUM_train,
                        tokenizer=self.tokenizer,
                        transform=transform_train,
                        audio_max_length=FLAGS.audio_max_length),
                CommonVoice(root=FLAGS.CommonVoice, labels='train.tsv',
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length,
                            audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='bloomberg2_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='life_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='news_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
                YoutubeCaption(root='../speech_data/youtube-speech-text/',
                               labels='english2_meta.csv',
                               tokenizer=self.tokenizer,
                               transform=transform_train,
                               audio_max_length=FLAGS.audio_max_length,
                               audio_min_length=1),
            ]),
            batch_size=FLAGS.sub_batch_size,
            shuffle=True,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate,
            drop_last=True)
        return dataloader

    @pl.data_loader
    def val_dataloader(self):
        _, transform_test, _ = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)
        val_dataloader = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_test,
                            tokenizer=self.tokenizer,
                            transform=transform_test,
                            reverse_sorted_by_length=True)
            ]),
            batch_size=FLAGS.eval_batch_size,
            shuffle=False,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate)
        return val_dataloader
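# Launch sketch (assumption: an old pytorch-lightning release matching the hooks
# above, i.e. the @pl.data_loader / validation_end era; Trainer argument names vary
# across versions, and FLAGS.gpus is a hypothetical flag used only for illustration):
#
# def main(argv):
#     model = ParallelTraining()
#     model.log_path = os.path.join('logs', FLAGS.name)
#     trainer = pl.Trainer(
#         gpus=FLAGS.gpus,
#         distributed_backend='ddp',
#         max_epochs=FLAGS.epochs)
#     trainer.fit(model)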
class Trainer:
    def __init__(self):
        self.name = FLAGS.name
        self.logdir = os.path.join('logs', FLAGS.name)
        self.model_dir = os.path.join(self.logdir, 'models')

        # Transform
        transform_train, transform_test, input_size = build_transform(
            feature_type=FLAGS.feature, feature_size=FLAGS.feature_size,
            n_fft=FLAGS.n_fft, win_length=FLAGS.win_length,
            hop_length=FLAGS.hop_length, delta=FLAGS.delta, cmvn=FLAGS.cmvn,
            downsample=FLAGS.downsample,
            T_mask=FLAGS.T_mask, T_num_mask=FLAGS.T_num_mask,
            F_mask=FLAGS.F_mask, F_num_mask=FLAGS.F_num_mask)

        # Tokenizer
        if FLAGS.tokenizer == 'char':
            self.tokenizer = CharTokenizer(cache_dir=self.logdir)
        else:
            self.tokenizer = HuggingFaceTokenizer(
                cache_dir=self.logdir, vocab_size=FLAGS.bpe_size)

        # Dataloader
        self.dataloader_train = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_train_500,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_360,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                Librispeech(root=FLAGS.LibriSpeech_train_100,
                            tokenizer=self.tokenizer,
                            transform=transform_train,
                            audio_max_length=FLAGS.audio_max_length),
                # TEDLIUM(root=FLAGS.TEDLIUM_train,
                #         tokenizer=self.tokenizer,
                #         transform=transform_train,
                #         audio_max_length=FLAGS.audio_max_length),
                # CommonVoice(root=FLAGS.CommonVoice, labels='train.tsv',
                #             tokenizer=self.tokenizer,
                #             transform=transform_train,
                #             audio_max_length=FLAGS.audio_max_length)
            ]),
            batch_size=FLAGS.batch_size,
            shuffle=True,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate,
            drop_last=True)
        self.dataloader_val = DataLoader(
            dataset=MergedDataset([
                Librispeech(root=FLAGS.LibriSpeech_test,
                            tokenizer=self.tokenizer,
                            transform=transform_test,
                            reverse_sorted_by_length=True)
            ]),
            batch_size=FLAGS.eval_batch_size,
            shuffle=False,
            num_workers=FLAGS.num_workers,
            collate_fn=seq_collate)
        self.tokenizer.build(self.dataloader_train.dataset.texts())
        self.vocab_size = self.dataloader_train.dataset.tokenizer.vocab_size

        # Model
        self.model = Transducer(
            vocab_embed_size=FLAGS.vocab_embed_size,
            vocab_size=self.vocab_size,
            input_size=input_size,
            enc_hidden_size=FLAGS.enc_hidden_size,
            enc_layers=FLAGS.enc_layers,
            enc_dropout=FLAGS.enc_dropout,
            enc_proj_size=FLAGS.enc_proj_size,
            dec_hidden_size=FLAGS.dec_hidden_size,
            dec_layers=FLAGS.dec_layers,
            dec_dropout=FLAGS.dec_dropout,
            dec_proj_size=FLAGS.dec_proj_size,
            joint_size=FLAGS.joint_size,
        ).to(device)

        # Optimizer
        if FLAGS.optim == 'adam':
            self.optim = optim.Adam(self.model.parameters(), lr=FLAGS.lr)
        else:
            self.optim = optim.SGD(
                self.model.parameters(), lr=FLAGS.lr, momentum=0.9)

        # Scheduler
        if FLAGS.sched:
            self.sched = optim.lr_scheduler.ReduceLROnPlateau(
                self.optim,
                patience=FLAGS.sched_patience,
                factor=FLAGS.sched_factor,
                min_lr=FLAGS.sched_min_lr,
                verbose=1)

        # Apex
        if FLAGS.apex:
            self.model, self.optim = amp.initialize(
                self.model, self.optim, opt_level=FLAGS.opt_level)

        # Multi GPU
        if FLAGS.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)

    def scale_length(self, prob, xlen):
        scale = (xlen.max().float() / prob.shape[1]).ceil()
        xlen = (xlen / scale).ceil().int()
        return xlen

    def train(self, start_step=1):
        if FLAGS.mode == "resume":
            exist_ok = True
        else:
            exist_ok = False
        os.makedirs(self.model_dir, exist_ok=exist_ok)
        writer = SummaryWriter(self.logdir)
        writer.add_text(
            'flagfile', FLAGS.flags_into_string().replace('\n', '\n\n'))
        FLAGS.append_flags_into_file(os.path.join(self.logdir, 'flagfile.txt'))

        looper = infloop(self.dataloader_train)
        losses = []
        steps = len(self.dataloader_train) * FLAGS.epochs
        with trange(start_step, steps + 1, dynamic_ncols=True) as pbar:
            for step in pbar:
                if step <= FLAGS.warmup_step:
                    scale = step / FLAGS.warmup_step
                    self.optim.param_groups[0]['lr'] = FLAGS.lr * scale
                batch, epoch = next(looper)
                loss = self.train_step(batch)
                losses.append(loss)
                lr = self.optim.param_groups[0]['lr']
                pbar.set_description(
                    'Epoch %d, loss: %.4f, lr: %.3E' % (epoch, loss, lr))

                if step % FLAGS.loss_step == 0:
                    train_loss = torch.stack(losses).mean()
                    losses = []
                    writer.add_scalar('train_loss', train_loss, step)

                if step % FLAGS.save_step == 0:
                    self.save(step)

                if step % FLAGS.eval_step == 0:
                    pbar.set_description('Evaluating ...')
                    val_loss, wer, pred_seqs, true_seqs = self.evaluate()
                    if FLAGS.sched:
                        self.sched.step(val_loss)
                    writer.add_scalar('WER', wer, step)
                    writer.add_scalar('val_loss', val_loss, step)
                    for i in range(FLAGS.sample_size):
                        log = "`%s`\n\n`%s`" % (true_seqs[i], pred_seqs[i])
                        writer.add_text('val/%d' % i, log, step)
                    pbar.write(
                        'Epoch %d, step %d, loss: %.4f, WER: %.4f' %
                        (epoch, step, val_loss, wer))

    def train_step(self, batch):
        sub_losses = []
        start_idxs = range(0, FLAGS.batch_size, FLAGS.sub_batch_size)
        self.optim.zero_grad()
        for sub_batch_idx, start_idx in enumerate(start_idxs):
            sub_slice = slice(start_idx, start_idx + FLAGS.sub_batch_size)
            xs, ys, xlen, ylen = [x[sub_slice].to(device) for x in batch]
            xs = xs[:, :xlen.max()].contiguous()
            ys = ys[:, :ylen.max()].contiguous()
            loss = self.model(xs, ys, xlen, ylen)
            if FLAGS.multi_gpu:
                loss = loss.mean() / len(start_idxs)
            else:
                loss = loss / len(start_idxs)
            if FLAGS.apex:
                delay_unscale = sub_batch_idx < len(start_idxs) - 1
                with amp.scale_loss(
                        loss, self.optim,
                        delay_unscale=delay_unscale) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            sub_losses.append(loss.detach())

        if FLAGS.gradclip is not None:
            if FLAGS.apex:
                parameters = amp.master_params(self.optim)
            else:
                parameters = self.model.parameters()
            torch.nn.utils.clip_grad_norm_(parameters, FLAGS.gradclip)
        self.optim.step()

        loss = torch.stack(sub_losses).sum()
        return loss

    def evaluate(self):
        self.model.eval()
        wers = []
        losses = []
        pred_seqs = []
        true_seqs = []
        with torch.no_grad():
            with tqdm(self.dataloader_val, dynamic_ncols=True) as pbar:
                for batch in pbar:
                    loss, wer, pred_seq, true_seq = self.evaluate_step(batch)
                    wers.append(wer)
                    losses.append(loss)
                    sample_nums = FLAGS.sample_size - len(pred_seqs)
                    pred_seqs.extend(pred_seq[:sample_nums])
                    true_seqs.extend(true_seq[:sample_nums])
                    pbar.set_description('wer: %.4f, loss: %.4f' % (wer, loss))
        loss = np.mean(losses)
        wer = np.mean(wers)
        self.model.train()
        return loss, wer, pred_seqs, true_seqs

    def evaluate_step(self, batch):
        xs, ys, xlen, ylen = [x.to(device) for x in batch]
        xs = xs[:, :xlen.max()]
        ys = ys[:, :ylen.max()].contiguous()
        loss = self.model(xs, ys, xlen, ylen)
        if FLAGS.multi_gpu:
            loss = loss.mean()
        if FLAGS.multi_gpu:
            ys_hat, nll = self.model.module.greedy_decode(xs, xlen)
        else:
            ys_hat, nll = self.model.greedy_decode(xs, xlen)
        pred_seq = self.tokenizer.decode_plus(ys_hat)
        true_seq = self.tokenizer.decode_plus(ys.cpu().numpy())
        wer = jiwer.wer(true_seq, pred_seq)
        return loss.item(), wer, pred_seq, true_seq

    def save(self, step):
        checkpoint = {'optim': self.optim.state_dict()}
        if FLAGS.multi_gpu:
            checkpoint.update({'model': self.model.module.state_dict()})
        else:
            checkpoint.update({'model': self.model.state_dict()})
        if self.sched is not None:
            checkpoint.update({'sched': self.sched.state_dict()})
        if FLAGS.apex:
            checkpoint.update({'amp': amp.state_dict()})
        path = os.path.join(self.model_dir, '%d.pt' % step)
        torch.save(checkpoint, path)

    def load(self, path):
        checkpoint = torch.load(path)
        # self.optim.load_state_dict(checkpoint['optim'])
        if FLAGS.multi_gpu:
            self.model.module.load_state_dict(checkpoint['model'])
        else:
            self.model.load_state_dict(checkpoint['model'])
        if self.sched is not None:
            self.sched.load_state_dict(checkpoint['sched'])
        if FLAGS.apex:
            amp.load_state_dict(checkpoint['amp'])

    def sanity_check(self):
        self.model.eval()
        batch = next(iter(self.dataloader_val))
        self.evaluate_step(batch)
        self.model.train()
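# Entry-point sketch (assumption: the absl.app pattern implied by the main(argv)
# signatures elsewhere in these scripts; the resume path follows Trainer.save's
# '%d.pt' naming and FLAGS.step, and is illustrative rather than the project's
# actual CLI wiring):
#
# def main(argv):
#     trainer = Trainer()
#     if FLAGS.mode == 'resume':
#         trainer.load(os.path.join(trainer.model_dir, '%d.pt' % FLAGS.step))
#         trainer.sanity_check()
#         trainer.train(start_step=FLAGS.step + 1)
#     else:
#         trainer.sanity_check()
#         trainer.train()
#
# if __name__ == '__main__':
#     app.run(main)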
def __init__(self):
    self.name = FLAGS.name
    self.logdir = os.path.join('logs', FLAGS.name)
    self.model_dir = os.path.join(self.logdir, 'models')

    # Transform
    transform = torch.nn.Sequential(
        TrimAudio(sampling_rate=16000,
                  max_audio_length=FLAGS.audio_max_length))
    transform_train, transform_test = transform, transform

    # Tokenizer
    if FLAGS.tokenizer == 'char':
        self.tokenizer = CharTokenizer(cache_dir=self.logdir)
    else:
        self.tokenizer = HuggingFaceTokenizer(
            cache_dir='BPE-2048', vocab_size=FLAGS.bpe_size)

    # Dataloader
    self.dataloader_train = DataLoader(
        dataset=MergedDataset([
            Librispeech(root=FLAGS.LibriSpeech_train_100,
                        tokenizer=self.tokenizer,
                        transform=transform_train,
                        audio_max_length=FLAGS.audio_max_length),
            Librispeech(root=FLAGS.LibriSpeech_dev,
                        tokenizer=self.tokenizer,
                        transform=transform_train,
                        audio_max_length=FLAGS.audio_max_length),
        ]),
        batch_size=FLAGS.batch_size,
        shuffle=True,
        num_workers=FLAGS.num_workers,
        collate_fn=seq_collate,
        drop_last=True)
    self.dataloader_val = DataLoader(
        dataset=MergedDataset([
            Librispeech(root=FLAGS.LibriSpeech_test,
                        tokenizer=self.tokenizer,
                        transform=transform_test,
                        reverse_sorted_by_length=True)
        ]),
        batch_size=FLAGS.eval_batch_size,
        shuffle=False,
        num_workers=FLAGS.num_workers,
        collate_fn=seq_collate)
    self.tokenizer.build(self.dataloader_train.dataset.texts())
    self.vocab_size = self.dataloader_train.dataset.tokenizer.vocab_size

    # Model
    self.frontend = FrontEnd(
        frontend_params=[(10, 5, 32)] + [(3, 2, 128)] * 4 + [(2, 2, 128)] * 3,
        bias=True,
    )
    self.model = Transducer(
        vocab_embed_size=FLAGS.vocab_embed_size,
        vocab_size=self.vocab_size,
        input_size=128,
        enc_hidden_size=FLAGS.enc_hidden_size,
        enc_layers=FLAGS.enc_layers,
        enc_dropout=FLAGS.enc_dropout,
        enc_proj_size=FLAGS.enc_proj_size,
        enc_time_reductions=[],
        dec_hidden_size=FLAGS.dec_hidden_size,
        dec_layers=FLAGS.dec_layers,
        dec_dropout=FLAGS.dec_dropout,
        dec_proj_size=FLAGS.dec_proj_size,
        joint_size=FLAGS.joint_size,
    )
    if FLAGS.use_pretrained:
        self.frontend, self.model = load_pretrained_model(
            self.frontend, self.model)
        print('load pretrained model')
    self.frontend = self.frontend.to(device)
    self.model = self.model.to(device)

    # Optimizer
    if FLAGS.optim == 'adam':
        self.optim = optim.Adam(
            list(self.model.parameters()) + list(self.frontend.parameters()),
            lr=FLAGS.lr)
    else:
        self.optim = optim.SGD(
            list(self.model.parameters()) + list(self.frontend.parameters()),
            lr=FLAGS.lr, momentum=0.9)

    # Scheduler
    if FLAGS.sched:
        self.sched = optim.lr_scheduler.ReduceLROnPlateau(
            self.optim,
            patience=FLAGS.sched_patience,
            factor=FLAGS.sched_factor,
            min_lr=FLAGS.sched_min_lr,
            verbose=1)

    # Apex
    if FLAGS.apex:
        self.model, self.optim = amp.initialize(
            self.model, self.optim, opt_level=FLAGS.opt_level)

    # Multi GPU
    if FLAGS.multi_gpu:
        self.model = torch.nn.DataParallel(self.model)
            reduce=False)
        for key, value in logging_output.items():
            if key not in logging_outputs:
                logging_outputs[key] = []
            if FLAGS.multi_gpu and isinstance(value, torch.Tensor):
                value = value.mean()
            logging_outputs[key].append(value)
    model.train()
    return {key: np.mean(scores) for key, scores in logging_outputs.items()}


if __name__ == '__main__':
    # tokenizer is not needed in this stage
    tokenizer = HuggingFaceTokenizer(cache_dir='BPE-2048', vocab_size=2048)
    transform = torch.nn.Sequential(
        TrimAudio(sampling_rate=16000, max_audio_length=15))
    dataloader = DataLoader(
        dataset=MergedDataset([
            YoutubeCaption(
                '../yt_speech/', labels='news_dummy.csv',
                tokenizer=tokenizer,
                transform=transform,
                audio_max_length=14,
            ),
            YoutubeCaption(
                '../yt_speech/', labels='life_dummy.csv',
def main(argv):
    assert FLAGS.step_n_frame % 2 == 0, (
        "step_n_frame must be divisible by reduction_factor of TimeReduction")

    tokenizer = HuggingFaceTokenizer(
        cache_dir=os.path.join('logs', FLAGS.name), vocab_size=FLAGS.bpe_size)
    dataloader = DataLoader(
        dataset=MergedDataset([
            Librispeech(root=FLAGS.LibriSpeech_test,
                        tokenizer=tokenizer,
                        transform=None,
                        reverse_sorted_by_length=True)
        ]),
        batch_size=1, shuffle=False, num_workers=0)

    pytorch_decoder = PytorchStreamDecoder(FLAGS)
    # pytorch_decoder.reset_profile()
    # wers = []
    # total_time = 0
    # total_frame = 0
    # with tqdm(dataloader, dynamic_ncols=True) as pbar:
    #     pbar.set_description("Pytorch full sequence decode")
    #     for waveform, tokens in pbar:
    #         true_seq = tokenizer.decode(tokens[0].numpy())
    #         # pytorch: Encode waveform at a time
    #         start = time.time()
    #         pred_seq, frames = fullseq_decode(pytorch_decoder, waveform)
    #         # pbar.write(true_seq)
    #         # pbar.write(pred_seq)
    #         elapsed = time.time() - start
    #         total_time += elapsed
    #         total_frame += frames
    #         wer = jiwer.wer(true_seq, pred_seq)
    #         wers.append(wer)
    #         pbar.set_postfix(wer='%.3f' % wer, elapsed='%.3f' % elapsed)
    # wer = np.mean(wers)
    # print('Mean wer: %.3f, Frame: %d, Time: %.3f, FPS: %.3f, speed: %.3f' % (
    #     wer, total_frame, total_time, total_frame / total_time,
    #     total_frame / total_time / 16000))

    pytorch_decoder.reset_profile()
    wers = []
    total_time = 0
    total_frame = 0
    with tqdm(dataloader, dynamic_ncols=True) as pbar:
        pbar.set_description("Pytorch frame wise decode")
        for waveform, tokens in pbar:
            true_seq = tokenizer.decode(tokens[0].numpy())
            # pytorch: encode the waveform frame by frame
            start = time.time()
            pred_seq, frames = stream_decode(pytorch_decoder, waveform)
            elapsed = time.time() - start
            total_time += elapsed
            total_frame += frames
            wer = jiwer.wer(true_seq, pred_seq)
            wers.append(wer)
            pbar.set_postfix(wer='%.3f' % wer, elapsed='%.3f' % elapsed)
    wer = np.mean(wers)
    print('Mean wer: %.3f, Frame: %d, Time: %.3f, FPS: %.3f, speed: %.3f' % (
        wer, total_frame, total_time, total_frame / total_time,
        total_frame / total_time / 16000))
    print("Mean encoding time: %.3f ms" %
          (1000 * np.mean(pytorch_decoder.encoder_elapsed)))
    print("Mean decoding time: %.3f ms" %
          (1000 * np.mean(pytorch_decoder.decoder_elapsed)))
    print("Mean joint time: %.3f ms" %
          (1000 * np.mean(pytorch_decoder.joint_elapsed)))

    openvino_decoder = OpenVINOStreamDecoder(FLAGS)
    openvino_decoder.reset_profile()
    wers = []
    total_time = 0
    total_frame = 0
    with tqdm(dataloader, dynamic_ncols=True) as pbar:
        pbar.set_description("OpenVINO frame wise decode")
        for waveform, tokens in pbar:
            true_seq = tokenizer.decode(tokens[0].numpy())
            # openvino: encode the waveform frame by frame
            start = time.time()
            pred_seq, frames = stream_decode(openvino_decoder, waveform)
            # pbar.write(true_seq)
            # pbar.write(pred_seq)
            elapsed = time.time() - start
            total_time += elapsed
            total_frame += frames
            wer = jiwer.wer(true_seq, pred_seq)
            wers.append(wer)
            pbar.set_postfix(wer='%.3f' % wer, elapsed='%.3f' % elapsed)
    wer = np.mean(wers)
    print('Mean wer: %.3f, Frame: %d, Time: %.3f, FPS: %.3f, speed: %.3f' % (
        wer, total_frame, total_time, total_frame / total_time,
        total_frame / total_time / 16000))
    print("Mean encoding time: %.3f ms" %
          (1000 * np.mean(openvino_decoder.encoder_elapsed)))
    print("Mean decoding time: %.3f ms" %
          (1000 * np.mean(openvino_decoder.decoder_elapsed)))
    print("Mean joint time: %.3f ms" %
          (1000 * np.mean(openvino_decoder.joint_elapsed)))
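# Note (editorial sketch, not part of the original script): averaging per-utterance
# WERs weights every utterance equally regardless of its length. jiwer also accepts
# lists of reference/hypothesis strings and returns a corpus-level WER, which is the
# figure usually reported for LibriSpeech. A minimal variant reusing the same loop
# variables:
#
# all_true, all_pred = [], []
# for waveform, tokens in dataloader:
#     all_true.append(tokenizer.decode(tokens[0].numpy()))
#     all_pred.append(stream_decode(pytorch_decoder, waveform)[0])
# print('Corpus WER: %.3f' % jiwer.wer(all_true, all_pred))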