def _load_or_create_model(epochs, dataset, continue_from, learning_rate, rnn_type, hidden_size, hidden_layers, momentum, cuda, tensorboard_writer):
    """ Loads a checkpointed model (when `continue_from` is set) or constructs a
    fresh one, along with its SGD optimizer and bookkeeping state.

    :param epochs: Total number of epochs; sizes the per-epoch result tensors.
    :param dataset: Dataset name used to resolve the weights directory.
    :param continue_from: Checkpoint number to resume from; falsy to start fresh.
    :param learning_rate: Initial SGD learning rate.
    :param rnn_type: One of the keys of `_model.supported_rnns` (lstm/rnn/gru).
    :param hidden_size: Number of hidden units in the RNN.
    :param hidden_layers: Number of RNN layers.
    :param momentum: Nesterov SGD momentum.
    :param cuda: Move the model to GPU when True.
    :param tensorboard_writer: Writer used to replay past validation scores.
    :returns: (labels, model, optimizer,
               (avg_tr_loss, avg_val_loss, start_iter, start_epoch),
               (tr_loss_results, val_loss_results, cer_results, wer_results))
    """
    weights_dir = _util.getRelWeightsPath(dataset)
    if continue_from:
        continue_from = _get_checkpoint_filepath(weights_dir, continue_from)
        print('Loading checkpoint model {}'.format(continue_from))
        # map_location keeps the load CPU-side regardless of where it was saved.
        package = torch.load(continue_from, map_location=lambda storage, loc: storage)
        model = _model.LipReader.load_model_package(package)
        labels = _model.LipReader.get_labels(model)
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, nesterov=True)
        optimizer.load_state_dict(package['optim_dict'])

        # Index start at 0 for training.
        start_epoch = int(package.get('epoch', 1)) - 1
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # We saved model after epoch finished, start at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1

        # Losses are floats; int() would silently truncate them to 0 or 1.
        avg_tr_loss = float(package.get('avg_tr_loss', 0))
        avg_val_loss = float(package.get('avg_val_loss', 0))
        tr_loss_results = package['tr_loss_results']
        val_loss_results = package['val_loss_results']
        cer_results = package['cer_results']
        wer_results = package['wer_results']

        # Replay previous validation scores into the tensorboard logs.
        if tensorboard_writer and package['tr_loss_results'] is not None and start_epoch > 0:
            # REVIEW josephz: Also include train?
            # package['tr_loss_results']
            # NOTE: reads the same keys this function loaded above ('cer_results',
            # 'wer_results'), matching what `train` passes to serialize().
            for i, (val_loss, wer, cer) in enumerate(
                    zip(package['val_loss_results'], package['cer_results'], package['wer_results'])):
                _tensorboard_log(tensorboard_writer, dataset, i, val_loss, wer, cer, mode="Validation")
    else:
        avg_tr_loss = avg_val_loss = start_iter = start_epoch = 0
        tr_loss_results = torch.Tensor(epochs)
        val_loss_results = torch.Tensor(epochs)
        cer_results = torch.Tensor(epochs)
        wer_results = torch.Tensor(epochs)

        # Prefer labels.json from the weights directory; best-effort fall back
        # to the hardcoded labels when it is missing or malformed.
        labels_path = os.path.join(weights_dir, 'labels.json')
        try:
            with open(labels_path) as label_file:
                labels = str(''.join(json.load(label_file)))
        except (OSError, ValueError):
            # OSError: file missing/unreadable; ValueError: invalid JSON.
            labels = _labels
            _getSharedLogger().warning(
                "Could not open '{}'... using hardcoded labels: '{}'".format(labels_path, labels))

        rnn_type = rnn_type.lower()
        assert rnn_type in _model.supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = _model.LipReader(
            rnn_hidden_size=hidden_size,
            nb_layers=hidden_layers,
            labels=labels,
            rnn_type=_model.supported_rnns[rnn_type])
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, nesterov=True)
    if cuda:
        model.cuda()
    return labels, model, optimizer, \
        (avg_tr_loss, avg_val_loss, start_iter, start_epoch), \
        (tr_loss_results, val_loss_results, cer_results, wer_results)
def train(
        epochs=-1,
        dataset=None,
        batch=-1,
        checkpoint=False,
        train_split=-1.0,
        num_workers=-1,
        hidden_size=-1,
        hidden_layers=-1,
        rnn_type=None,
        cuda=False,
        learning_rate=-1.0,
        momentum=-1.0,
        max_norm=-1,
        annealing=-1.0,
        silent=False,
        tensorboard=False,
        continue_from=-1,
        seed=123456,
):
    """ Runs the primary training loop.

    :param epochs: Number of epochs to train for.
    :param dataset: Location containing dataset generated by 'generate_dataview'.
    :param batch: Number of sequences that are trained concurrently.
    :param checkpoint: Whether or not to save checpoints for each epoch.
    :param train_split: Fraction of videos which will be in the train set, (1 - train_split) will be validation.
    :param num_workers: Number of workers to use during dataset loading.
    :param hidden_size: Number of hidden units in the RNN.
    :param hidden_layers: Number of hiddel layers in RNN.
    :param rnn_type: Type of RNN cell to use; either rnn, gru, or lstm.
    :param cuda: Use CUDA to train this model.
    :param learning_rate: Initial training learning rate.
    :param momentum: Nesterov SGD momentum.
    :param max_norm: L2 norm cutoff to prevent gradient explosion.
    :param annealing: Annealing applied to learning rate every epoch.
    :param silent: Turn off progress tracking per iteration.
    :param tensorboard: Turn on tensorboard graphing.
    :param continue_from: Checkpoint number to start from.
    :param seed: RNG seed for torch (CPU and all CUDA devices).
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    weights_dir = _util.getRelWeightsPath(dataset)
    dataset_dir = _util.getRelDatasetsPath(dataset)
    tensorboard_writer = _get_tensorboard_writer(weights_dir, tensorboard)

    # REVIEW josephz: Can this be further broken down?
    labels, model, optimizer, \
        (avg_tr_loss, avg_val_loss, start_iter, start_epoch), \
        (tr_loss_results, val_loss_results, cer_results, wer_results) = _load_or_create_model(
            epochs, dataset, continue_from, learning_rate, rnn_type, hidden_size,
            hidden_layers, momentum, cuda, tensorboard_writer)
    (train_dataset, train_loader), (test_dataset, test_loader) = _get_datasets(
        dataset_dir, train_split, labels, batch, num_workers)

    best_wer = None
    batch_time, data_time, tr_losses, val_losses = _init_averages()

    print(model)
    print("Number of parameters: %d" % _model.LipReader.get_param_size(model))

    # josephz: CTCLoss, see https://github.com/SeanNaren/warp-ctc
    criterion = CTCLoss()
    decoder = _decoder.GreedyDecoder(labels)

    for epoch in range(start_epoch, epochs):
        model.train()
        epoch_start = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            batch_start = time.time()

            inputs, targets, input_percentages, target_sizes = data
            # Inputs are (batch, seq_len, 68 facial landmarks, xyz).
            assert len(inputs.shape) == 4 and inputs.shape[2:] == (68, 3)
            batch_size, seq_len, num_pts, pts_dim = inputs.shape
            input_sizes = input_percentages.mul(int(inputs.size(1))).int()

            # Measure elapsed data loading time.
            data_time.update(time.time() - batch_start)
            if cuda:
                inputs = inputs.cuda()

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
            # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
            # act_lens: Tensor of size (batch) containing size of each output sequence from the network
            # label_lens: Tensor of (batch) containing label length of each example
            assert len(targets.shape) == 1
            assert len(out.shape) == 3 and out.shape[:2] == (seq_len, batch_size)
            tr_loss = criterion(out, targets, output_sizes, target_sizes)

            # Average loss by minibatch.
            tr_loss /= inputs.size(0)
            val_loss_value = tr_loss.item()
            if val_loss_value == np.inf or val_loss_value == -np.inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                val_loss_value = 0
            avg_tr_loss += val_loss_value
            tr_losses.update(val_loss_value, inputs.size(0))

            # Compute gradient.
            optimizer.zero_grad()
            tr_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            # SDG step!
            optimizer.step()

            # Measure elapsed batch time.
            batch_time.update(time.time() - batch_start)
            if not silent:
                # FIX: the iteration counter and total were previously fused
                # ('[{}{}]'); separate them with '/'.
                print('Epoch[{}][{}/{}]'.format(epoch + 1, i + 1, len(train_loader)), end='\t')
                print('Time {:0.3f} ({:0.3f})'.format(batch_time.val, batch_time.avg), end='\t')
                print('Data {:0.3f} ({:0.3f})'.format(data_time.val, data_time.avg), end='\t')
                print('Loss {:0.4f} ({:0.4f})'.format(tr_losses.val, tr_losses.avg))

        avg_tr_loss /= len(train_loader)
        print('Training Summary Epoch: [{}]'.format(epoch + 1), end='\t')
        # FIX: this line was printed twice.
        print('Time taken (s): {:0.0f}'.format(time.time() - epoch_start))
        print('Average Training Loss: {:0.3f}'.format(avg_tr_loss))

        # Reset start iteration in preparation for next epoch.
        start_iter = 0
        total_cer = total_wer = 0
        model.eval()
        with torch.no_grad():
            for i, (data) in tqdm.tqdm(enumerate(test_loader), total=len(test_loader)):
                inputs, targets, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(1))).int()
                batch_size, seq_len, num_pts, pts_dim = inputs.shape

                # Unflatten targets: split the concatenated label vector back
                # into one label sequence per batch element.
                split_targets = []
                offset = 0
                for size in target_sizes:
                    split_targets.append(targets[offset:offset + size])
                    offset += size

                if cuda:
                    inputs = inputs.cuda()

                out, output_sizes = model(inputs, input_sizes)
                out_loss = out.transpose(0, 1)  # TxNxH
                assert len(targets.shape) == 1
                # out is supposed to be (seqLength x batch x outputDim).
                assert len(out_loss.shape) == 3 and out_loss.shape[:2] == (seq_len, batch_size)
                val_loss = criterion(out_loss, targets, output_sizes, target_sizes)
                val_loss_value = val_loss.item()
                if val_loss_value == np.inf or val_loss_value == -np.inf:
                    print("WARNING: received an inf loss, setting loss value to 0")
                    val_loss_value = 0
                avg_val_loss += val_loss_value
                val_losses.update(val_loss_value, inputs.size(0))

                decoded_output, _ = decoder.decode(out.data, output_sizes)
                target_strings = decoder.convert_to_strings(split_targets)
                for x in range(len(target_strings)):
                    transcript, reference = decoded_output[x][0], target_strings[x][0]
                    total_wer += decoder.wer(transcript, reference) / float(len(reference.split()))
                    total_cer += decoder.cer(transcript, reference) / float(len(reference))

        avg_val_loss /= len(test_loader)
        val_loss_results[epoch] = avg_val_loss
        wer = wer_results[epoch] = 100 * total_wer / len(test_loader.dataset)  # .dataset?
        cer = cer_results[epoch] = 100 * total_cer / len(test_loader.dataset)
        print('Validation Summary Epoch: [{}]'.format(epoch + 1), end='\t')
        print('Average WER: {:0.3f}'.format(wer_results[epoch]), end='\t')
        print('Average CER: {:0.3f}'.format(cer_results[epoch]), end='\t')
        print('Average Validation Loss: {:0.3f}'.format(avg_val_loss))

        if tensorboard:
            _tensorboard_log(tensorboard_writer, dataset, epoch + 1, avg_val_loss, wer, cer, mode="Validation")
        if checkpoint:
            weights_path = _get_checkpoint_filepath(weights_dir, epoch + 1)
            torch.save(
                _model.LipReader.serialize(
                    model,
                    optimizer=optimizer,
                    epoch=epoch,
                    loss_results=val_loss_results,
                    wer_results=wer_results,
                    cer_results=cer_results),
                weights_path)

        # Do annealing.
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] /= annealing
        optimizer.load_state_dict(optim_state)

        if best_wer is None or best_wer > wer_results[epoch]:
            # FIX: '{}'.format was never called (printed the bound method);
            # compute model_path first so it can be interpolated.
            model_path = os.path.join(weights_dir, 'model.pth')
            print('Found better validated model, saving to {}'.format(model_path))
            weights_path = _get_checkpoint_filepath(weights_dir, epoch + 1)
            if os.path.isfile(weights_path):
                # Reuse this epoch's checkpoint rather than re-serializing.
                shutil.copyfile(weights_path, model_path)
            else:
                torch.save(
                    _model.LipReader.serialize(
                        model,
                        optimizer=optimizer,
                        epoch=epoch,
                        loss_results=val_loss_results,
                        wer_results=wer_results,
                        cer_results=cer_results),
                    model_path)
            best_wer = wer

        # FIX: reset both running losses; avg_val_loss previously accumulated
        # across epochs, inflating every epoch's reported validation loss.
        avg_tr_loss = 0
        avg_val_loss = 0
# NOTE(review): script-style fragment — hard-coded hyperparameters for a
# training/debugging run, followed by commented-out experiment code.
# `dataset` is referenced below but never assigned in this fragment; it is
# presumably bound earlier (e.g. a previous notebook cell) — TODO confirm.
hidden_size=800
hidden_layers=5
train_split=0.5
rnn_type="gru"
epochs=70
cuda=False
learning_rate=3e-4
momentum=0.9
max_norm=400
# NOTE(review): named `anneal` here but the `train` entry point takes
# `annealing` — verify this value is actually forwarded.
anneal=1.1
silent=True
checkpoint=True
tensorboard=True
continue_from=0

# Resolve the standard project directories for this dataset.
weights_dir = _util.getRelWeightsPath(dataset)
dataset_dir = _util.getRelDatasetsPath(dataset)
raw_dir = _util.getRelRawPath(dataset)

# tensorboard_writer = _train._get_tensorboard_writer(weights_dir, tensorboard)
#
# labels, model, optimizer, \
#   avg_tr_loss, avg_val_loss, start_iter, start_epoch, \
#   loss_results, val_loss_results, cer_results, wer_results = _train._load_or_create_model(
#     epochs, dataset, continue_from, learning_rate, rnn_type, hidden_size, hidden_layers, momentum, cuda, tensorboard_writer)
#
# (train_dataset, train_loader), (test_dataset, test_loader) = _train._get_datasets(
#   dataset_dir, train_split, labels, batch, num_workers)
#
# for i, (data) in enumerate(train_loader, start=start_iter):
#   assert len(data) == 2
def train(
        data="StephenColbert/medium_no_vtx1",
        labels="labels.json",
        sentence_dataset=False,
        occlussion_threshold=0.8,
        train_split=0.8,
        num_workers=1,
        refresh=False,
        patience=10,
        batch_size=4,
        learning_rate=1e-4,
        annealings=2,
        enable_ctc=False,
        grad_norm=50,
        tr_epochs=50,
        max_tfr=0.9,
        min_tfr=0.0,
        num_layers=1,
        frame_dim=68 * 3,
        hidden_size=700,
        char_dim=300,
        rnn_type='LSTM',
        attention_type='1_layer_nn',
        attn_hidden_size=-1,
        bidirectional=False,
        rnn_dropout=0.0,
        seed=123456,
        cuda=False,
):
    """ Runs the primary training loop.

    :param data: Dataset name under the project data root.
    :param labels: Labels filename used by the dataset loader.
    :param sentence_dataset: Whether to use sentence-level examples.
    :param occlussion_threshold: Occlusion threshold forwarded to _get_datasets.
    :param train_split: Fraction of data assigned to the train set.
    :param num_workers: DataLoader worker count.
    :param patience: Epochs without val-CER improvement before annealing.
    :param batch_size: Sequences per batch.
    :param learning_rate: Initial Adam learning rate (divided by 5 per annealing).
    :param annealings: Number of times to anneal learning rate before training is finished.
    :param enable_ctc: Enable the auxiliary CTC head.
    :param max_tfr: Initial (maximum) teacher-forcing ratio.
    :param grad_norm: Gradient-norm clipping value.
    :param num_layers: RNN layer count.
    :param frame_dim: Flattened input frame dimension (68 landmarks * 3).
    :param hidden_size: RNN hidden size.
    :param char_dim: Character embedding dimension.
    :param rnn_type: RNN cell type.
    :param attention_type: Attention mechanism identifier.
    :param attn_hidden_size: Attention hidden size (-1 for default).
    :param bidirectional: Use a bidirectional encoder.
    :param rnn_dropout: RNN dropout probability.
    :param seed: RNG seed for torch and numpy.
    :param cuda: Run on GPU when True.
    """
    # Setup seed.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    rand = np.random.RandomState(seed=seed)

    # Setup device.
    # REVIEW josephz: Is there a clean way to use multiple or different GPUs?
    device = torch.device('cuda') if cuda else torch.device('cpu')
    print("Device: ", device)

    # Init Data.
    print("Initializing dataset '{}'".format(data))
    train_dataset, val_dataset, test_dataset = _get_datasets(
        data, train_split, sentence_dataset,
        threshold=occlussion_threshold,
        labels=labels,
        rand=rand,
        refresh=refresh,
        include_test=True)
    train_loader = _data.DataLoader(
        train_dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=_data_loader._collate_fn)
    val_loader = _data.DataLoader(
        val_dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=_data_loader._collate_fn)
    test_loader = _data.DataLoader(
        test_dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=_data_loader._collate_fn)

    # Init Models.
    print("Initializing model")
    encoder, decoding_step = _init_models(
        train_dataset.char2idx, num_layers, frame_dim, hidden_size, char_dim,
        enable_ctc, rnn_type, attention_type, attn_hidden_size, bidirectional,
        rnn_dropout, device)

    # Initialize Logging.
    weights_dir = _util.getRelWeightsPath(data, use_existing=False)
    tensorboard_writer = tensorboardX.SummaryWriter(weights_dir)
    _getSharedLogger().info("Writing Tensorboard logs to '%s'", weights_dir)
    print()
    print("Try visualizing by running the following:")
    print(f"\ttensorboard --logdir='{weights_dir}'")
    print(
        "Then open the following URL in your local browser. "
        "\n\tIf you're running on a remote machine see `README_TENSORBOARD.md` for help..."
    )
    # REVIEW josephz: Multi-input support doesn't seem ready yet: https://github.com/lanpa/tensorboardX/issues/256
    # tensorboard_writer.add_graph(encoder,
    #   torch.autograd.Variable(
    #     torch.tensor([torch.zeros(batch_size, 100, 68, 3), torch.zeros(batch_size,))))
    # tensorboard_writer.add_graph(decoding_step,
    #   torch.autograd.Variable(
    #     torch.tensor(torch.zeros(batch_size,), torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(batch_size,), torch.zeros(batch_size, 100,
    #     hidden_size))))

    # Train.
    val_cers = []
    train_decoder_losses = []
    train_ctc_losses = []
    best_val_cer = 1.0
    best_val_cer_idx = -1

    # Initial evaluation
    print("Initial evaluation...")
    decoder_loss, val_correct, val_count = _train.eval(
        encoder, decoding_step, val_loader, device, train_dataset.char2idx)
    val_cer = (val_count - val_correct).float() / val_count
    print("\tCER: ", str(val_cer))

    encoder_path = os.path.join(weights_dir, "best_encoder.pth")
    decoder_path = os.path.join(weights_dir, "best_decoder.pth")

    num_epochs = 0
    num_annealings = 0
    print("Beginning training loop")
    ts = time.time()
    while val_cer < best_val_cer or num_annealings < annealings:
        print("Epoch {}:".format(num_epochs + 1))
        if num_epochs - best_val_cer_idx > patience:
            # If the model does not improve after our set 'patience' number of epochs, we will reduce the learning rate.
            num_annealings += 1
            learning_rate /= 5
            print(f'\tAnnealing to {learning_rate}')
            # Roll back to the best-so-far weights before continuing.
            restore(encoder, encoder_path)
            restore(decoding_step, decoder_path)
            # Must set best val CER to here, or else this will also trigger next loop
            # if val CER does not go down.
            best_val_cer_idx = num_epochs

        # Apply linear teacher-forcing ratio decay.
        curr_tfr = max(min_tfr, max_tfr - num_epochs / tr_epochs)
        assert 0.0 <= curr_tfr <= 1.0
        print(f'\tCurrent Teacher Forcing Ratio: {curr_tfr}')

        avg_decoder_loss, avg_ctc_loss = _train.train(
            encoder, decoding_step, train_loader,
            opt=torch.optim.Adam(
                list(encoder.parameters()) + list(decoding_step.parameters()),
                lr=learning_rate),
            device=device,
            char2idx=train_dataset.char2idx,
            teacher_forcing_ratio=curr_tfr,
            grad_norm=grad_norm)
        print(f'\tAVG Decoder Loss: {avg_decoder_loss}')
        print(f'\tAVG CTC Loss: {avg_ctc_loss}')
        tensorboard_writer.add_scalar(os.path.join(data, 'avg decoder loss'), avg_decoder_loss, global_step=num_epochs)
        tensorboard_writer.add_scalar(os.path.join(data, 'avg CTC loss'), avg_ctc_loss, global_step=num_epochs)

        decoder_loss, val_correct, val_count = _train.eval(
            encoder, decoding_step, val_loader, device, train_dataset.char2idx)
        _, train_correct, train_count = _train.eval(
            encoder, decoding_step, train_loader, device, train_dataset.char2idx)
        val_cer = (val_count - val_correct).float() / val_count
        train_cer = (train_count - train_correct).float() / train_count
        # save_best_model persists only when val_cer improves on the stored best.
        encoder.save_best_model(val_cer, encoder_path)
        decoding_step.save_best_model(val_cer, decoder_path)
        print(f'\tTrain CER: {train_cer}')
        print(f'\tVal CER: {val_cer}')

        # ANALYSIS
        encoder.eval()
        decoding_step.eval()
        with torch.no_grad():
            # CER
            _, test_correct, test_count = _train.eval(
                encoder, decoding_step, test_loader, device, train_dataset.char2idx)
            test_cer = (test_count - test_correct).float() / test_count
            # FIX: previously printed train_cer here, leaving test_cer unused.
            print(f'\tTest CER: {test_cer}')

            # Sample teacher forcing output
            print('Some teacher-forcing outputs:')
            _analysis.print_samples(
                encoder, decoding_step, test_loader, device, train_dataset.char2idx, max_=10)

            # Confusion matrix is best-effort diagnostics; never abort training
            # over it, but don't mask KeyboardInterrupt/SystemExit either.
            print('drawing confusion matrix:')
            try:
                _analysis.get_confusion_matrix(
                    encoder, decoding_step, test_loader, device, test_dataset.char2idx, num_epochs)
            except Exception:
                print('oops something wrong happened in drawing confusion matrix')

            # Inference on the first two test examples with beam search.
            print('Some student-forcing outputs with beam search:')
            for frames, frame_lens, chars, char_lens in test_loader:
                frames, frame_lens, chars, char_lens = frames[:2], frame_lens[:2], chars[:2], char_lens[:2]
                frames, frame_lens, chars, char_lens = frames.to(device), frame_lens.to(device), chars.to(device), char_lens.to(device)
                pred, gt = _analysis.inference(
                    encoder, decoding_step, frames, frame_lens, chars, char_lens,
                    device, test_dataset.char2idx, beam_width=10, max_label_len=100)
                for gt_, pred_ in zip(gt, pred):
                    print(f'GTL\t: {gt_}')
                    print(f'Pred\t: {pred_}')
                break

        tensorboard_writer.add_scalars(
            os.path.join(data, 'CER'), {"Train": train_cer, "Val": val_cer}, global_step=num_epochs)
        tensorboard_writer.add_scalar(
            os.path.join(data, 'learning rate'), learning_rate, global_step=num_epochs)
        val_cers.append(val_cer)
        train_decoder_losses.append(avg_decoder_loss)
        train_ctc_losses.append(avg_ctc_loss)

        if val_cer < best_val_cer:
            best_val_cer = val_cer
            best_val_cer_idx = num_epochs
        num_epochs += 1

    te = time.time()
    total_time = te - ts
    print()
    # NOTE(review): assumes at least one epoch ran (true whenever annealings > 0,
    # since num_annealings starts at 0) — otherwise this would divide by zero.
    print("Training complete: Took '{}' seconds, or '{}' per epoch".format(
        total_time, total_time / num_epochs))
    print("Training Statistics")
    print("\tBest Val CER: '{}'".format(np.min(val_cers)))
    print("\tBest Decoder Loss: '{}'".format(np.min(train_decoder_losses)))
    print("\tBest CTC Loss: '{}'".format(np.min(train_ctc_losses)))
    print()
# NOTE(review): this chunk begins mid-class. `return meta` below is the tail of
# a method whose definition starts before this view; the decorated method that
# follows belongs to the same (unseen) class. Nesting levels reconstructed.
        return meta

    @staticmethod
    def is_parallel(model):
        # True when `model` is wrapped in either torch parallel wrapper
        # (single-node DataParallel or DistributedDataParallel).
        return isinstance(model, torch.nn.parallel.DataParallel) or \
            isinstance(model, torch.nn.parallel.DistributedDataParallel)


if __name__ == '__main__':
    import os.path
    import argparse

    # CLI utility: print architecture information for a serialized checkpoint.
    parser = argparse.ArgumentParser(
        description='LipReading model information')
    parser.add_argument('--model-path',
                        default=_util.getRelWeightsPath("lipreader", "v0"),
                        help='Path to model file created by training')
    args = parser.parse_args()
    # NOTE(review): `package` is loaded but never used below — `load_model`
    # re-reads the file itself; presumably a leftover from an older code path.
    package = torch.load(args.model_path,
                         map_location=lambda storage, loc: storage)
    model = LipReader.load_model(args.model_path)
    print("Model name: ", os.path.basename(args.model_path))
    print("DeepSpeech version: ", model._version)
    print("")
    print("Recurrent Neural Network Properties")
    print(" RNN Type: ", model._rnn_type.__name__.lower())
    print(" RNN Layers: ", model._hidden_layers)
    print(" RNN Size: ", model._hidden_size)
    print(" Classes: ", len(model._labels))
    print("")