Example #1
def train(
        data_layer,
        data_layer_eval,
        model,
        ctc_loss,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        args,
        fn_lr_policy=None):
    """Trains model
    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model ( encapsulates data processing, encoder, decoder)
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function
    """
    def evaluate():
        """Evaluates the model on the evaluation dataset."""
        with torch.no_grad():
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
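                # Move batch tensors to the GPU; pass non-tensor entries through unchanged.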
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)

            print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))

    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch

    while True:
        if multi_gpu:
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        last_epoch_start = time.time()
        batch_counter = 0
        average_loss = 0
        for data in train_dataloader:
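            # Move batch tensors to the GPU; pass non-tensor entries through unchanged.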
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)

            if batch_counter == 0:
                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()

            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            
            t_log_probs_t, t_encoded_len_t = model(x=(t_audio_signal_t, t_a_sig_length_t))
            t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
            if args.gradient_accumulation_steps > 1:
                t_loss_t = t_loss_t / args.gradient_accumulation_steps

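            # Mixed-precision path: amp.scale_loss (Apex-style AMP) rescales the loss
            # so low-precision gradients do not underflow; otherwise plain backward().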
            if optim_level in AmpOptimizations:
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            average_loss += t_loss_t.item()

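            # Step the optimizer only once gradients from
            # args.gradient_accumulation_steps micro-batches have accumulated.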
            if batch_counter % args.gradient_accumulation_steps == 0:
                optimizer.step()

                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)

                    e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                    train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                    print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                    print_once("Step time: {0} seconds".format(time.time() - last_iter_start))

                if step > 0 and step % args.eval_frequency == 0:
                    print_once("Doing Evaluation ....................... ......  ... .. . .")
                    evaluate()
                step += 1
                batch_counter = 0
                average_loss = 0
                if args.num_steps is not None and step >= args.num_steps:
                    break

        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, optimizer, epoch, output_dir=args.output_dir)
        if args.num_steps is None and epoch >= args.num_epochs:
            break
    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ......  ... .. . .")
    evaluate()
    save(model, optimizer, epoch, output_dir=args.output_dir)
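
A minimal sketch of a callable that could be passed as fn_lr_policy above: a step-to-learning-rate function with linear warmup followed by polynomial decay. The factory name and constants are illustrative, not taken from the original script.

def make_lr_policy(base_lr, warmup_steps, total_steps, power=2.0):
    """Returns a step -> learning-rate function: linear warmup, then polynomial decay."""
    def fn_lr_policy(step):
        if step < warmup_steps:
            # ramp linearly from base_lr / warmup_steps up to base_lr
            return base_lr * (step + 1) / warmup_steps
        # decay polynomially toward zero over the remaining steps
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return base_lr * (1.0 - min(progress, 1.0)) ** power
    return fn_lr_policy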
Example #2
def train(
        data_layer,
        model,
        loss_fn,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        data_transforms,
        args,
        evaluation,
        logger,
        fn_lr_policy):
    """Trains model
    Args:
        data_layer: training data layer
        model: model ( encapsulates data processing, encoder, decoder)
        loss_fn: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: function returning lr in given step
    """
    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch
    start_step = step

    if args.ipex:
        if args.bf16:
            print("Running bfloat16 training steps with IPEX\n")
        elif args.fp32:
            print("Running fp32 training steps with IPEX\n")
        total_time = 0
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0
            for data in tqdm(train_dataloader):
                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

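                # Profile post-warmup steps when requested; the else branch below runs
                # the identical forward/backward sequence without the profiler context.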
                if args.profiling and (step - start_step) >= args.warmup:
                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                        # warmup has already passed here, so every profiled step is timed
                        t0 = time.perf_counter()
                        if args.bf16:
                            with torch.cpu.amp.autocast():
                                t_log_probs_t, (x_len, y_len) = model(
                                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                                )
                        elif args.fp32:
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                        if args.bf16:
                            t_log_probs_t = t_log_probs_t.to(torch.float32)
                        t_loss_t = loss_fn(
                            (t_log_probs_t, x_len), (t_transcript_t, y_len)
                        )
                        logger.log_scalar('loss', t_loss_t.item(), step)
                        del t_log_probs_t
                        if args.gradient_accumulation_steps > 1:
                            t_loss_t = t_loss_t / args.gradient_accumulation_steps

                        if args.cuda and optim_level in AmpOptimizations:
                            # an assert would be stripped under `python -O`; raise instead
                            raise RuntimeError("not supported in ipex")
                        else:
                            t_loss_t.backward()
                        t1 = time.perf_counter()
                        total_time += (t1 - t0)
                else:
                    if (step - start_step) >= args.warmup:
                        t0 = time.perf_counter()
                    if args.bf16:
                        with torch.cpu.amp.autocast():
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                    elif args.fp32:
                        t_log_probs_t, (x_len, y_len) = model(
                            ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                        )
                    if args.bf16:
                        t_log_probs_t = t_log_probs_t.to(torch.float32)
                    t_loss_t = loss_fn(
                        (t_log_probs_t, x_len), (t_transcript_t, y_len)
                    )
                    logger.log_scalar('loss', t_loss_t.item(), step)
                    del t_log_probs_t
                    if args.gradient_accumulation_steps > 1:
                        t_loss_t = t_loss_t / args.gradient_accumulation_steps

                    if args.cuda and optim_level in AmpOptimizations:
                        raise RuntimeError("not supported in ipex")
                    else:
                        t_loss_t.backward()
                    t1 = time.perf_counter()
                    if (step - start_step) >= args.warmup:
                        total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        # t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)

                        # e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        # train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        # logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break

            # evaluation(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        if args.profiling:
            # `prof` is only bound if at least one post-warmup step was profiled
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))

        print_once("Done in {0}".format(time.time() - start_time))
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ......  ... .. . .")
        # evaluation()
        save(model, optimizer, epoch, output_dir=args.output_dir)
    else:
        total_time = 0
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0

            for data in train_dataloader:

                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

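                # Time only post-warmup iterations so warmup does not skew throughput.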
                if (step - start_step) >= args.warmup:
                    t0 = time.perf_counter()
                t_log_probs_t, (x_len, y_len) = model(
                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                )

                t_loss_t = loss_fn(
                    (t_log_probs_t, x_len), (t_transcript_t, y_len)
                )
                logger.log_scalar('loss', t_loss_t.item(), step)
                del t_log_probs_t
                if args.gradient_accumulation_steps > 1:
                    t_loss_t = t_loss_t / args.gradient_accumulation_steps

                if args.cuda and optim_level in AmpOptimizations:
                    raise RuntimeError("not supported in ipex")
                else:
                    t_loss_t.backward()
                t1 = time.perf_counter()
                if (step - start_step) >= args.warmup:
                    total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)

                        e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0}  ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break

            # evaluation(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        print_once("Done in {0}".format(time.time() - start_time))
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ......  ... .. . .")
        # evaluation()
        save(model, optimizer, epoch, output_dir=args.output_dir)
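
A stripped-down sketch of the per-step profiling pattern from the IPEX branch above, pulled out of the training loop. The linear model and random input are stand-ins for the real model and batch; the torch.profiler calls mirror Example #2 and write a trace readable by TensorBoard's profiler plugin.

import torch

model = torch.nn.Linear(128, 64)   # stand-in for the real model
inputs = torch.randn(8, 128)

# One profiler context per timed step, as in the loop above; the trace is
# flushed to ./log when the context exits.
with torch.profiler.profile(
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
    loss = model(inputs).sum()
    loss.backward()

# Aggregate per-op CPU time, as printed at the end of the IPEX branch.
print(prof.key_averages().table(sort_by="self_cpu_time_total"))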