Example #1
def run(config, num_checkpoints, cuda=False):

    train_joint_transform_list, train_img_transform, train_label_transform = get_transforms(
        config, mode="train")
    val_joint_transform_list, val_img_transform, val_label_transform = None, None, None

    train_dataset = DataSet(mode="train",
                            joint_transform_list=train_joint_transform_list,
                            img_transform=train_img_transform,
                            label_transform=train_label_transform)
    val_dataset = DataSet(mode="val",
                          joint_transform_list=val_joint_transform_list,
                          img_transform=val_img_transform,
                          label_transform=val_label_transform)

    train_loader = data.DataLoader(train_dataset,
                                   batch_size=config.batch_size,
                                   shuffle=True,
                                   num_workers=config.num_workers,
                                   drop_last=True)
    val_loader = data.DataLoader(val_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers)

    criterion, val_criterion = get_loss(config, cuda=cuda)

    model = get_net(config, criterion, cuda=cuda)

    checkpoints = get_checkpoints(config, num_checkpoints)
    print("[*] Checkpoints as follow:")
    pprint.pprint(checkpoints)

    util_checkpoint.load_checkpoint(model, None, checkpoints[0])
    for i, checkpoint in enumerate(checkpoints[1:]):
        model2 = get_net(config, criterion, cuda=cuda)

        util_checkpoint.load_checkpoint(model2, None, checkpoint)
        swa.moving_average(model, model2, 1. / (i + 2))

    with torch.no_grad():
        swa.update_bn(train_loader, model, cuda=cuda)

    output_name = "model-swa.pth"
    checkpoint_dir = os.path.join(ROOT_DIR, LOG_DIR,
                                  os.path.basename(config.model_dir))
    util_checkpoint.save_checkpoint(checkpoint_dir, output_name, model)
    print(f"[*] Saved SWA model to {output_name}")

    # test the model
    scores = validation(config,
                        val_loader,
                        model,
                        val_criterion,
                        "swa",
                        cuda=cuda,
                        is_record=False)
    print(scores)
    with open(os.path.join(checkpoint_dir, "swa-scores.json"), "w") as f:
        json.dump(scores["FWIOU"], f)
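
A note on the SWA step above: swa.moving_average(model, model2, 1. / (i + 2)) folds the (i + 2)-th checkpoint into a running mean of the weights, and swa.update_bn then recomputes BatchNorm statistics with one pass over the training data. A minimal sketch of the averaging update, assuming the usual SWA convention (not necessarily this repo's exact implementation):

def moving_average(net1, net2, alpha=1.0):
    # Blend net2's weights into net1 in place:
    # net1 <- (1 - alpha) * net1 + alpha * net2.
    for param1, param2 in zip(net1.parameters(), net2.parameters()):
        param1.data.mul_(1.0 - alpha).add_(param2.data, alpha=alpha)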
Example #2
def save_periodic_checkpoint(conf, runner, epoch, best_val_metrics):
    log_file_path = get_periodic_checkpoint_path(conf.run_dir, epoch)
    if not os.path.isdir(os.path.dirname(log_file_path)):
        logging.warning(('Skip saving periodic checkpoint: {} does not '
                         'exist').format(os.path.dirname(log_file_path)))
        return
    logging.info('Saving periodic checkpoint to {}'.format(log_file_path))

    save_checkpoint(log_file_path, conf, runner, epoch, best_val_metrics)

    num_checkpoints = conf.get_attr('num_periodic_checkpoints',
                                    default=DEFAULT_NUM_PERIODIC_CHECKPOINTS)
    prune_checkpoints(os.path.dirname(log_file_path), num_checkpoints)
Example #3
def save_best_checkpoint(best_dir, best_val, conf, runner, epoch,
                         best_val_metrics):
    log_file_path = get_best_checkpoint_path(best_dir, epoch, best_val)
    if not os.path.isdir(os.path.dirname(log_file_path)):
        print(('Skip saving best value checkpoint: {} does not '
               'exist').format(os.path.dirname(log_file_path)))
        return

    print('Saving best value checkpoint to {}'.format(log_file_path))

    save_checkpoint(log_file_path, conf, runner, epoch, best_val_metrics)

    num_checkpoints = conf.get_attr('num_best_checkpoints',
                                    default=DEFAULT_NUM_BEST_CHECKPOINTS)
    prune_checkpoints(os.path.dirname(log_file_path), num_checkpoints)
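
prune_checkpoints is not shown in these examples; a plausible sketch, assuming it simply keeps the newest num_checkpoints files by modification time (the file pattern here is hypothetical):

import glob
import os

def prune_checkpoints(checkpoint_dir, num_checkpoints):
    # Sort checkpoints oldest-first and delete everything beyond the
    # newest num_checkpoints files.
    paths = sorted(glob.glob(os.path.join(checkpoint_dir, '*.pth')),
                   key=os.path.getmtime)
    for path in paths[:-num_checkpoints]:
        os.remove(path)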
Example #4
    def train_session(self, model: Tacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        model.r = session.r
        simple_table([(f'Steps with r={session.r}',
                       str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Outputs/Step (r)', model.r)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens,
                    mel_lens) in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                x, m = x.to(device), m.to(device)

                m1_hat, m2_hat, attention = model(x, m)

                m1_loss = F.l1_loss(m1_hat, m)
                m2_loss = F.l1_loss(m2_hat, m)
                loss = m1_loss + m2_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.tts_checkpoint_every == 0:
                    ckpt_name = f'taco_step{k}K'
                    save_checkpoint('tts',
                                    self.paths,
                                    model,
                                    optimizer,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.tts_plot_every == 0:
                    self.generate_plots(model, session)

                _, att_score = attention_score(attention, mel_lens)
                att_score = torch.mean(att_score)
                self.writer.add_scalar('Attention_Score/train', att_score,
                                       model.get_step())
                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/reduction_factor', session.r,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss, val_att_score = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            self.writer.add_scalar('Attention_Score/val', val_att_score,
                                   model.get_step())
            save_checkpoint('tts',
                            self.paths,
                            model,
                            optimizer,
                            is_silent=True)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
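
The Averager used throughout these sessions is presumably a simple running mean; a minimal sketch under that assumption:

class Averager:
    # Accumulates scalar values and reports their mean; reset() starts a
    # fresh window (here, once per epoch).
    def __init__(self):
        self.count = 0
        self.total = 0.0

    def add(self, value):
        self.total += value
        self.count += 1

    def get(self):
        return self.total / self.count if self.count > 0 else 0.0

    def reset(self):
        self.count = 0
        self.total = 0.0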
Example #5
    def train_session(self, model: ForwardTacotron,
                      optimizer: Optimizer, session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1
            ):
                start = time.time()
                model.train()
                x, m, dur, x_lens, mel_lens, pitch, puncts = (
                    x.to(device),
                    m.to(device),
                    dur.to(device),
                    x_lens.to(device),
                    mel_lens.to(device),
                    pitch.to(device),
                    puncts.to(device),
                )
                # print("*" * 20)
                # print(x)
                # print("*" * 20)
                m1_hat, m2_hat, dur_hat, pitch_hat = model(
                    x, m, dur, mel_lens, pitch, puncts
                )
                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)
                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
                pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
                loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                optimizer.step()
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward', self.paths, model, optimizer,
                                    name=ckpt_name, is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

                stream(msg)

            m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
            save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

            m_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
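
Unlike Example #4, the losses here go through self.l1_loss with an explicit length argument, which suggests a length-masked L1 so that padded frames do not dilute the loss. A self-contained sketch of that idea (the exact masking in the repo may differ):

import torch
import torch.nn.functional as F

def masked_l1_loss(pred, target, lens):
    # pred/target: (batch, channels, time); lens: (batch,) true lengths.
    # Build a mask that is 1 inside each sequence and 0 over padding,
    # then average the L1 only over the valid positions.
    max_len = target.size(2)
    mask = torch.arange(max_len, device=lens.device)[None, :] < lens[:, None]
    mask = mask.unsqueeze(1).expand_as(pred).float()
    return F.l1_loss(pred * mask, target * mask, reduction='sum') / mask.sum()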
Example #6
    def train_session(self, model: WaveRNN, optimizer: Optimizer,
                      session: VocSession, train_gta: bool) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Sequence Length', self.train_cfg['seq_len']),
                      ('GTA Training', train_gta)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters

        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                batch = to_device(batch, device=device)
                x, y = batch['x'], batch['y']
                y_hat = model(x, batch['mel'])
                if model.mode == 'RAW':
                    y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    y = batch['y'].float()
                y = y.unsqueeze(-1)

                loss = self.loss_func(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['gen_samples_every'] == 0:
                    stream(msg + 'generating samples...')
                    gen_result = self.generate_samples(model, session)
                    if gen_result is not None:
                        mel_loss, gen_wav = gen_result
                        self.writer.add_scalar('Loss/generated_mel_l1',
                                               mel_loss, model.get_step())
                        self.track_top_models(mel_loss, gen_wav, model)

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.voc_checkpoints /
                                    f'wavernn_step{k}k.pt')

                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.voc_checkpoints /
                            'latest_model.pt')

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
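
to_device here moves a whole batch dict at once; a minimal sketch, assuming batches are flat dicts whose tensor fields should follow the model's device:

import torch

def to_device(batch, device):
    # Move tensor fields to the target device; pass through everything
    # else (e.g. string ids) unchanged.
    return {key: val.to(device) if torch.is_tensor(val) else val
            for key, val in batch.items()}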
Example #7
def tts_train_loop_af_offline(paths: Paths,
                              model: Tacotron,
                              optimizer,
                              train_set,
                              lr,
                              train_steps,
                              attn_example,
                              hp=None):

    def smooth(d, eps=1e-10):
        # Mix each attention row with a uniform distribution so that
        # downstream log/KL terms never see exact zeros.
        u = 1.0 / float(d.size()[2])
        return eps * u + (1 - eps) * d

    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss_out, running_loss_attn = 0, 0

        # Perform 1 epoch
        for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1):

            x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m, False, attn_ref)
            else:
                m1_hat, m2_hat, attention = model(x,
                                                  m,
                                                  generate_gta=False,
                                                  attn_ref=attn_ref)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean') # 'batchmean'
            attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref))

            loss_out = m1_loss + m2_loss
            loss_attn = attn_loss * hp.attn_loss_coeff
            loss = loss_out + loss_attn

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss_out += loss_out.item()
            avg_loss_out = running_loss_out / i
            running_loss_attn += loss_attn.item()
            avg_loss_attn = running_loss_attn / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attn_ref[idx][:, :160]),
                               paths.tts_attention / f'{step}_tf')
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}_af')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; Loss_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
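
The smooth() helper mixes each attention distribution with a tiny uniform component, which keeps log() finite on exact zeros. That is what makes the commented-out KL variant of the attention loss viable; as a sketch, it would drop in for the L1 line above (same names as in the loop):

import torch
import torch.nn.functional as F

# KL(attn_ref || attention), both epsilon-smoothed; 'batchmean' gives the
# mathematically correct per-sample KL normalization.
attn_loss = F.kl_div(torch.log(smooth(attention)),
                     smooth(attn_ref),
                     reduction='batchmean')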
Example #8
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
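
data_parallel_workaround is referenced by several of these loops; the usual implementation replicates the module manually instead of wrapping it in nn.DataParallel (whose attribute forwarding triggered the Python bug the comments mention). A sketch of that pattern:

import torch

def data_parallel_workaround(model, *inputs):
    # Replicate the module across all visible GPUs, scatter the inputs,
    # run the replicas in parallel, then gather results onto device 0.
    device_ids = list(range(torch.cuda.device_count()))
    replicas = torch.nn.parallel.replicate(model, device_ids)
    scattered = torch.nn.parallel.scatter(inputs, device_ids)
    replicas = replicas[:len(scattered)]
    outputs = torch.nn.parallel.parallel_apply(replicas, scattered)
    return torch.nn.parallel.gather(outputs, device_ids[0])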
Example #9
    def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                batch = to_device(batch, device=device)
                start = time.time()
                model.train()

                pitch_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['pitch_zoneout']
                energy_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['energy_zoneout']

                pitch_target = batch['pitch'].detach().clone()
                energy_target = batch['energy'].detach().clone()
                batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(
                    device).float()
                batch['energy'] = batch['energy'] * energy_zoneout_mask.to(
                    device).float()

                pred = model(batch)

                m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                       batch['mel_len'])
                m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                       batch['mel_len'])

                dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                        batch['dur'].unsqueeze(1),
                                        batch['x_len'])
                pitch_loss = self.l1_loss(pred['pitch'],
                                          pitch_target.unsqueeze(1),
                                          batch['x_len'])
                energy_loss = self.l1_loss(pred['energy'],
                                           energy_target.unsqueeze(1),
                                           batch['x_len'])

                loss = m1_loss + m2_loss \
                       + self.train_cfg['dur_loss_factor'] * dur_loss \
                       + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                       + self.train_cfg['energy_loss_factor'] * energy_loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()

                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.forward_checkpoints /
                                    f'forward_step{k}k.pt')

                if step % self.train_cfg['plot_every'] == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss,
                                       model.get_step())
                self.writer.add_scalar('Energy_Loss/train', energy_loss,
                                       model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_out = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'],
                                   model.get_step())
            self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'],
                                   model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'],
                                   model.get_step())
            self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'],
                                   model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.forward_checkpoints /
                            'latest_model.pt')

            m_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
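
The keyword-style save_checkpoint(model=..., optim=..., config=..., path=...) used in Examples #6 and #9 plausibly bundles everything needed to resume a run into one file; a minimal sketch under that assumption:

import torch

def save_checkpoint(model, optim, config, path):
    # Persist weights, optimizer state, and the config used to build the
    # model, so a single file suffices for resuming or re-instantiation.
    torch.save({'model': model.state_dict(),
                'optim': optim.state_dict(),
                'config': config}, path)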
Example #10
def run(cur_gpu, hparams):
    if hparams.distributed_mode == 'gpus':
        dist.init_process_group(backend=hparams.dist_backend,
                                init_method=hparams.dist_url,
                                world_size=hparams.world_size,
                                rank=cur_gpu)

    model = getattr(models,
                    hparams.model_name)(hparams.n_classes, hparams.n_channels,
                                        hparams.model_version)

    if cur_gpu >= 0:
        torch.cuda.set_device(cur_gpu)
        model.cuda()

    if hparams.fp16:
        model = convert_to_half(model)

    if hparams.distributed_mode == 'gpus':
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[cur_gpu],
                                                    output_device=cur_gpu)

    criterion = cross_entropy

    params_no_bn, params_no_bn_clone = get_parameters(
        model,
        exclude=(nn.BatchNorm2d, nn.SyncBatchNorm, nn.GroupNorm),
        clone=hparams.fp16)
    params_bn, params_bn_clone = get_parameters(model,
                                                include=(nn.BatchNorm2d,
                                                         nn.SyncBatchNorm,
                                                         nn.GroupNorm),
                                                clone=hparams.fp16)
    optimizer = optim.SGD([{
        'params': params_no_bn_clone if hparams.fp16 else params_no_bn,
        'weight_decay': hparams.weight_decay
    }, {
        'params': params_bn_clone if hparams.fp16 else params_bn,
        'weight_decay': 0.0
    }],
                          lr=hparams.initial_learning_rate,
                          momentum=hparams.momentum)

    lr_scheduler = MultiStepLRWithWarmup(optimizer,
                                         hparams.lr_milestones,
                                         hparams.lr_warmup_epochs,
                                         factor_min=hparams.lr_factor_min,
                                         gamma=hparams.lr_decay_rate)

    best_acc1 = 0
    best_acc5 = 0
    start_epoch = hparams.start_epoch
    if hparams.checkpoint and os.path.isfile(hparams.checkpoint):
        start_epoch, model, optimizer, lr_scheduler, best_acc1, best_acc5 = load_checkpoint(
            hparams.checkpoint, cur_gpu, model, optimizer, lr_scheduler)

    torch.backends.cudnn.benchmark = True

    train_loader, train_sampler = get_train_loader(
        hparams.data_dir, hparams.image_size, hparams.per_replica_batch_size,
        hparams.n_data_loading_workers, hparams.distributed_mode,
        hparams.world_size, cur_gpu)
    val_loader = get_val_loader(hparams.data_dir, hparams.image_size,
                                hparams.per_replica_batch_size,
                                hparams.n_data_loading_workers,
                                hparams.distributed_mode, hparams.world_size,
                                cur_gpu)

    if hparams.evaluate:
        return validate(cur_gpu, val_loader, model, criterion, 0, hparams)

    monitor = get_progress_monitor(cur_gpu, hparams.log_dir,
                                   hparams.steps_per_epoch, hparams.epochs,
                                   hparams.print_freq, start_epoch)

    for epoch in range(start_epoch, hparams.epochs):
        monitor and monitor.before_epoch()

        if train_sampler:
            train_sampler.set_epoch(epoch)
        train(cur_gpu, train_loader, model, criterion, optimizer, lr_scheduler,
              params_no_bn + params_bn, params_no_bn_clone + params_bn_clone,
              epoch, hparams, monitor)

        loss, acc1, acc5 = validate(cur_gpu, val_loader, model, criterion,
                                    epoch, hparams)

        monitor and monitor.after_epoch(loss, acc1, acc5)

        if hparams.save_model and cur_gpu in (-1, 0):
            is_best = acc1 > best_acc1
            best_acc1 = acc1 if is_best else best_acc1
            save_checkpoint(hparams.model_dir, epoch, model, optimizer,
                            lr_scheduler, best_acc1, best_acc5, is_best)

    if hparams.distributed_mode == 'gpus':
        dist.destroy_process_group()

    monitor and monitor.end()
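
run(cur_gpu, hparams) takes the process rank as its first argument, which is exactly the signature torch.multiprocessing.spawn expects, so in the 'gpus' mode it is presumably launched once per device along these lines (get_hparams is a hypothetical loader for the hparams object):

import torch.multiprocessing as mp

if __name__ == '__main__':
    hparams = get_hparams()
    if hparams.distributed_mode == 'gpus':
        # spawn calls run(rank, hparams) for rank in 0..world_size-1;
        # the rank doubles as the GPU index (cur_gpu).
        mp.spawn(run, args=(hparams,), nprocs=hparams.world_size)
    else:
        run(-1, hparams)  # cur_gpu < 0 keeps the model on CPU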
Example #11
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, init_lr, final_lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        adjust_learning_rate(optimizer, e, epochs, init_lr,
                             final_lr)  # ramp from the initial to the final learning rate
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(
                device)  # x/y: (Batch, sub_bands, T)

            #########################  MultiBand-WaveRNN   #########################
            if hp.voc_multiband:
                y0 = y[:, 0, :].unsqueeze(-1)  # y0/y1/y2/y3: (Batch, T, 1)
                y1 = y[:, 1, :].unsqueeze(-1)
                y2 = y[:, 2, :].unsqueeze(-1)
                y3 = y[:, 3, :].unsqueeze(-1)

                y_hat = model(x, m)  # (Batch, T, num_classes, sub_bands)

                if model.mode == 'RAW':
                    y_hat0 = y_hat[:, :, :, 0].transpose(1, 2).unsqueeze(
                        -1)  # (Batch, num_classes, T, 1)
                    y_hat1 = y_hat[:, :, :, 1].transpose(1, 2).unsqueeze(-1)
                    y_hat2 = y_hat[:, :, :, 2].transpose(1, 2).unsqueeze(-1)
                    y_hat3 = y_hat[:, :, :, 3].transpose(1, 2).unsqueeze(-1)

                elif model.mode == 'MOL':
                    y0 = y0.float()
                    y1 = y1.float()
                    y2 = y2.float()
                    y3 = y3.float()

                loss = loss_func(y_hat0, y0) + loss_func(
                    y_hat1, y1) + loss_func(y_hat2, y2) + loss_func(
                        y_hat3, y3)

            #########################  MultiBand-WaveRNN   #########################

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm).cpu()
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
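
adjust_learning_rate is not shown; given the init_lr/final_lr pair, a plausible sketch is a linear ramp across the training epochs (the schedule shape is an assumption):

def adjust_learning_rate(optimizer, epoch, total_epochs, init_lr, final_lr):
    # Interpolate from init_lr at the first epoch to final_lr at the last.
    t = (epoch - 1) / max(total_epochs - 1, 1)
    lr = init_lr + t * (final_lr - init_lr)
    for g in optimizer.param_groups:
        g['lr'] = lr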
Example #12
def train_loop(paths: Paths, model, optimizer, train_set, lr, train_steps,
               mel_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0
        dur_running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, mel_len, dur) in enumerate(train_set, 1):

            x, m, dur = x.to(device), m.to(device), dur.to(device)
            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m_hat, m_post_hat, dur_hat = data_parallel_workaround(
                    model, x, m, dur)
            else:
                m_hat, m_post_hat, dur_hat = model(x, m, dur)

            lin_loss = F.l1_loss(m_hat, m)
            post_loss = F.l1_loss(m_post_hat, m)
            dur_loss = F.l1_loss(dur_hat, dur)
            loss = lin_loss + post_loss + dur_loss
            optimizer.zero_grad()

            loss.backward()

            if hp.forward_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.forward_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += post_loss.item()
            avg_loss = running_loss / i
            dur_running_loss += dur_loss.item()
            dur_avg_loss = dur_running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'fast_speech_step{k}K'
                save_checkpoint('forward',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if mel_example in ids:
                idx = ids.index(mel_example)
                try:
                    seq = x[idx].tolist()
                    m_gen = model.generate(seq)
                    save_spectrogram(m_gen,
                                     paths.forward_mel_plot / f'{step}_gen',
                                     600)
                except Exception:
                    traceback.print_exc()
                save_spectrogram(np_now(m_post_hat[idx]),
                                 paths.forward_mel_plot / f'{step}_gta', 600)
                save_spectrogram(np_now(m[idx]),
                                 paths.forward_mel_plot / f'{step}_target',
                                 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {avg_loss:#.4} ' \
                  f'| Duration Loss: {dur_avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)
        model.log(paths.forward_log, msg)

        save_checkpoint('forward', paths, model, optimizer, is_silent=True)
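
The small helpers these loops lean on (np_now for plotting, stream for the single-line progress display) are presumably along these lines:

import sys

import numpy as np
import torch

def np_now(x: torch.Tensor) -> np.ndarray:
    # Detach from the autograd graph and copy to host memory.
    return x.detach().cpu().numpy()

def stream(message: str) -> None:
    # Rewrite the current terminal line in place ('\r', no newline).
    sys.stdout.write(f'\r{message}')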
Example #13
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            orig_attention = attention
            n = int(len(att_guides[0]) / r)
            # reduce the guides along time by the reduction factor r
            ga = [a[t] for a in att_guides for t in range(0, len(a), r)]

            assert n == len(attention[0])
            guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)]

            attention = np_now(attention)
            attention = [
                pad2d_nonzero(x, n, len(att_guides[0][0])) for x in attention
            ]

            guided_attention = torch.tensor(guided_attention)
            guided_attention = guided_attention.to(device)

            attention = torch.tensor(attention)
            attention = attention.to(device)

            # create the attention mask (padded positions were filled with -1)
            attention_masks = torch.ne(attention, -1).type(torch.FloatTensor)
            attention_masks = attention_masks.to(device)

            multiply = torch.abs(
                attention * guided_attention) * attention_masks

            attention_loss = torch.sum(multiply)

            mask_sum = torch.sum(attention_masks)

            attention_loss /= mask_sum

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss + attention_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(orig_attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
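
The att_guides consumed above are precomputed guided-attention weight matrices; following Tachibana et al. (2017), they are typically built so the penalty grows as attention strays from the near-diagonal monotonic text-to-mel alignment. A sketch of that construction (the variance g and the exact orientation are assumptions):

import numpy as np

def build_attention_guide(n_text, n_mel, g=0.2):
    W = np.zeros((n_text, n_mel), dtype=np.float32)
    for n in range(n_text):
        for t in range(n_mel):
            # Zero on the diagonal n/N == t/T, approaching 1 far from it.
            W[n, t] = 1.0 - np.exp(-((n / n_text - t / n_mel) ** 2)
                                   / (2.0 * g * g))
    return W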
Example #14
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / '%s' % (repr1(step)))
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / '%s' % (repr1(step)),
                                 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
Example #15
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    total_number_of_batches = len(train_set)

    writer = SummaryWriter("runs/{0}-{1}".format(
        model_name_prefix,
        datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):

        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(),
                              e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)
        """
        ####################### Testing ############################
        torch.cuda.empty_cache()
        loss_test = 0
        for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
            x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                raise RuntimeError("Unsupported")
            else:
                y_test_hat = model(x_test, m_test)

            if model.mode == 'RAW':
                y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y_test = y_test.float()

            y_test = y_test.unsqueeze(-1)

            loss_test += loss_func(y_test_hat, y_test).item()
        avg_loss_test = loss_test / len(test_set)
        msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        stream("\n")
        stream(msg)

        writer.add_scalar('Test loss', loss_test, e)
        writer.add_scalar('Average test loss', avg_loss_test, e)
        ############################################################
        """

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc',
                        paths,
                        model,
                        optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(
                            model_name_prefix, e, avg_loss),
                        is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
        scheduler.step()
        print('Epoch:', e, 'LR:', scheduler.get_last_lr())
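
For reference, StepLR with step_size=1 and gamma=0.983 decays the learning rate by about 1.7% per epoch, halving it roughly every 40 epochs (0.983 ** 40 ≈ 0.50).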
Example #16
def main():
    # Make some variable global
    global args, train_csv, test_csv, exp_dir, best_result, device, tb_writer, tb_freq

    # Args parser
    args = args_parser()

    start_epoch = 0
    ############ EVALUATE MODE ############
    if args.evaluate:  # Evaluate mode
        print('\n==> Evaluation mode!')

        # Define paths
        chkpt_path = args.evaluate

        # Check that the checkpoint file exist
        assert os.path.isfile(
            chkpt_path), "- No checkpoint found at: {}".format(chkpt_path)

        # Experiment directory
        exp_dir = os.path.dirname(os.path.abspath(chkpt_path))
        sys.path.append(exp_dir)

        # Load checkpoint
        print('- Loading checkpoint:', chkpt_path)

        # Load the checkpoint
        checkpoint = torch.load(chkpt_path)

        # Assign some local variables
        args = checkpoint['args']
        start_epoch = checkpoint['epoch']
        best_result = checkpoint['best_result']
        print('- Checkpoint was loaded successfully.')

        # Compare the checkpoint args with the json file in case I wanted to change some args
        compare_args_w_json(args, exp_dir, start_epoch + 1)
        args.evaluate = chkpt_path

        device = torch.device(
            "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
        model = checkpoint['model'].to(device)

        print_args(args)

        _, val_loader = create_dataloader(args, eval_mode=True)

        loss = get_loss_fn(args).to(device)

        evaluate_epoch(val_loader, model, loss, start_epoch)

        return  # End program

    ############ RESUME MODE ############
    elif args.resume:  # Resume mode
        print('\n==> Resume mode!')

        # Define paths
        chkpt_path = args.resume
        assert os.path.isfile(
            chkpt_path), "- No checkpoint found at: {}".format(chkpt_path)

        # Experiment directory
        exp_dir = os.path.dirname(os.path.abspath(chkpt_path))
        sys.path.append(exp_dir)

        # Load checkpoint
        print('- Loading checkpoint:', chkpt_path)
        checkpoint = torch.load(chkpt_path)
        args = checkpoint['args']
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        print('- Checkpoint ({}) was loaded successfully!\n'.format(
            checkpoint['epoch']))

        # Compare the checkpoint args with the json file in case I wanted to change some args
        compare_args_w_json(args, exp_dir, start_epoch)
        args.resume = chkpt_path

        device = torch.device(
            "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
        model = checkpoint['model'].to(device)
        optimizer = checkpoint['optimizer']

        print_args(args)

        train_loader, val_loader = create_dataloader(args, eval_mode=False)

    ############ NEW EXP MODE ############
    else:  # New Exp
        print('\n==> Starting a new experiment "{}" \n'.format(args.exp))

        # Check if experiment exists
        ws_path = os.path.join('workspace/', args.workspace)
        exp = args.exp
        exp_dir = os.path.join(ws_path, exp)
        assert os.path.isdir(exp_dir), '- Experiment "{}" not found!'.format(
            exp)

        # Which device to use
        device = torch.device(
            "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

        # Add the experiment's folder to python path
        sys.path.append(exp_dir)

        print_args(args)

        # Create dataloader
        train_loader, val_loader = create_dataloader(args, eval_mode=False)

        # import the model
        f = importlib.import_module('network')
        model = f.CNN().to(device)
        print('\n==> Model "{}" was loaded successfully!'.format(
            model.__name__))

        # Optimize only parameters that requires_grad
        parameters = filter(lambda p: p.requires_grad, model.parameters())

        # Create Optimizer
        if args.optimizer.lower() == 'sgd':
            optimizer = SGD(parameters,
                            lr=args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)
        elif args.optimizer.lower() == 'adam':
            optimizer = Adam(parameters,
                             lr=args.lr,
                             weight_decay=args.weight_decay,
                             amsgrad=True)

    ############ IF RESUME/NEW EXP ############
    # Error metrics that are set to the worst
    best_result = create_error_metric(args)
    best_result.set_to_worst()

    # Tensorboard
    tb = args.tb_log if hasattr(args, 'tb_log') else False
    tb_freq = args.tb_freq if hasattr(args, 'tb_freq') else 1000
    tb_writer = None
    if tb:
        tb_writer = SummaryWriter(
            os.path.join(
                exp_dir, 'tb_log',
                datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

    # Create Loss
    loss = get_loss_fn(args).to(device)

    # Define Learning rate decay
    lr_decayer = lr_scheduler.StepLR(optimizer,
                                     step_size=args.lr_decay_step,
                                     gamma=args.lr_decay_factor,
                                     last_epoch=start_epoch - 1)

    # Create or Open Logging files
    train_csv = LogFile(os.path.join(exp_dir, 'train.csv'), args)
    test_csv = LogFile(os.path.join(exp_dir, 'test.csv'), args)
    best_txt = os.path.join(exp_dir, 'best.txt')

    save_args(exp_dir, args)  # Save args to JSON file

    ############ TRAINING LOOP ############
    for epoch in range(start_epoch, args.epochs):
        print('\n==> Training Epoch [{}] (lr={})'.format(
            epoch, optimizer.param_groups[0]['lr']))

        train_err_avg = train_epoch(train_loader, model, optimizer, loss,
                                    epoch)

        # Learning rate scheduler
        lr_decayer.step()

        train_csv.update_log(train_err_avg, epoch)

        # Save checkpoint in case evaluation crashed
        save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, False, epoch, exp_dir)

        # Evaluate the trained epoch
        test_err_avg, out_image = evaluate_epoch(
            val_loader, model, loss, epoch)  # evaluate on validation set

        # Evaluate uncertainty
        ause = None
        ause_fig = None
        if args.eval_uncert:
            if args.loss == 'masked_prob_loss_var':
                ause, ause_fig = eval_ause(model,
                                           val_loader,
                                           args,
                                           epoch,
                                           uncert_type='v')
            else:
                ause, ause_fig = eval_ause(model,
                                           val_loader,
                                           args,
                                           epoch,
                                           uncert_type='c')

        # Log to tensorboard if enabled
        if tb_writer is not None:
            avg_meter = test_err_avg.get_avg()
            tb_writer.add_scalar('Loss/val', avg_meter.loss, epoch)
            tb_writer.add_scalar('MAE/val', avg_meter.metrics['mae'], epoch)
            tb_writer.add_scalar('RMSE/val', avg_meter.metrics['rmse'], epoch)
            if ause is not None:
                tb_writer.add_scalar('AUSE/val', ause, epoch)
            tb_writer.add_images(
                'Prediction', colored_depthmap_tensor(out_image[:, :1, :, :]),
                epoch)
            tb_writer.add_images(
                'Input_Conf_Log_Scale',
                colored_depthmap_tensor(torch.log(out_image[:, 2:, :, :] + 1)),
                epoch)
            tb_writer.add_images(
                'Output_Conf_Log_Scale',
                colored_depthmap_tensor(torch.log(out_image[:, 1:2, :, :] +
                                                  1)), epoch)
            if ause_fig is not None:
                tb_writer.add_figure('Sparsification_Plot', ause_fig, epoch)

        # Update Log files
        test_csv.update_log(test_err_avg, epoch, ause)

        # Save best model
        # TODO: How to decide the best based on dataset?
        is_best = test_err_avg.metrics['rmse'] < best_result.metrics['rmse']
        if is_best:
            best_result = test_err_avg  # Save the new best locally
            test_err_avg.print_to_txt(best_txt, epoch)  # Print to a text file

        # Save the checkpoint again, marked as best if it improved
        save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch, exp_dir)
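
# --- Hedged sketch (not part of the original example): one plausible
# implementation of the save_checkpoint helper called above, matching the
# (state, is_best, epoch, output_dir) signature. The file names below are
# assumptions, not taken from this example.
import os
import shutil
import torch

def save_checkpoint(state, is_best, epoch, output_dir):
    path = os.path.join(output_dir, 'checkpoint-{}.pth.tar'.format(epoch))
    torch.save(state, path)  # serialize the full training state
    if is_best:
        # keep a separate copy of the best checkpoint so far
        shutil.copyfile(path, os.path.join(output_dir, 'best.pth.tar'))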
Example #17
0
    def train_session(self, model_tts: ForwardTacotron,
                      model_asr: Wav2Vec2ForCTC, optimizer_tts: Optimizer,
                      tts_session: ForwardSession, asr_session: ASRSession,
                      asr_trainer, optimizer_asr) -> None:
        asr_trainer_state = {'logs': []}
        current_step = model_tts.get_step()
        tts_training_steps = tts_session.max_step - current_step
        try:
            _, asr_current_step = get_last_checkpoint(
                './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
            asr_training_steps = tts_session.max_step - asr_current_step
        except Exception:  # no ASR checkpoint found yet
            asr_current_step = 0
            asr_training_steps = tts_training_steps

        total_iters = len(tts_session.train_set)
        epochs = tts_training_steps // total_iters + 1
        simple_table([
            ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
            ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
            ('Batch Size TTS', tts_session.bs),
            ('Learning Rate', tts_session.lr)
        ])

        for g in optimizer_tts.param_groups:
            g['lr'] = tts_session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()

        device = next(model_tts.parameters()
                      ).device  # use same device as model parameters
        warnings.filterwarnings('ignore', category=UserWarning)
        for e in range(1, epochs + 1):

            # TTS train loop for this epoch
            for i, (x, m, ids, x_lens, mel_lens,
                    dur) in enumerate(tts_session.train_set, 1):
                start = time.time()
                model_tts.train()
                x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
                                                     x_lens.to(device), mel_lens.to(device)

                m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)

                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                        x_lens)

                tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss
                optimizer_tts.zero_grad()
                tts_s_loss.backward()
                torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer_tts.step()
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model_tts.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)

                speed = 1. / duration_avg.get()
                msg_tts = f'| TTS MODEL (supervised training): '\
                          f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                          f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                          f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward',
                                    self.paths,
                                    model_tts,
                                    optimizer_tts,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model_tts, tts_session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/batch_size', tts_session.bs,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/learning_rate', tts_session.lr,
                                       model_tts.get_step())

                stream(msg_tts)

            # ASR supervised train loop for this epoch
            model_asr.to(device)
            model_asr.train()
            for step, inputs in enumerate(asr_session.train_set):
                optimizer_asr.zero_grad()
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        inputs[k] = v.to(device)
                outputs = model_asr(**inputs)
                asr_s_loss = outputs["loss"] if isinstance(
                    outputs, dict) else outputs[0]
                asr_s_loss.backward()
                optimizer_asr.step()

                msg_asr = f'| ASR MODEL (supervised training) : '\
                          f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) '\
                          f'| Loss ASR: {asr_s_loss:#.4} |'

                stream(msg_asr)

            m_val_loss, dur_val_loss = self.evaluate(model_tts,
                                                     tts_session.val_set)
            eval_tts_msg = f'| TTS MODEL (supervised eval): '\
                           f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                           f'| Dur Val Loss: {dur_val_loss:#.4} |'
            stream(eval_tts_msg)
            tts_eval_loss = m_val_loss + dur_val_loss

            # ASR eval supervised
            print('\nEvaluating ASR model ...')
            asr_eval_loss = 0
            eval_wer = 0

            for step, inputs in enumerate(asr_session.test_set):
                asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                    model_asr, inputs, False)
                asr_eval_loss += asr_eval_loss_i
                logits_a = logits_a.to('cpu')  # .to() is not in-place
                eval_wer_i = asr_trainer.compute_metrics(
                    EvalPrediction(predictions=logits_a, label_ids=labels_a))
                eval_wer += eval_wer_i['wer']
            num_eval_batches = step + 1
            eval_wer = eval_wer / num_eval_batches
            asr_eval_loss = asr_eval_loss / num_eval_batches

            msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |'
            stream(msg_asr_eval)

            # dual transformation loop
            tts_u_loss, asr_u_loss = self.dual_transform(
                model_tts, model_asr, optimizer_tts, optimizer_asr,
                asr_session.test_set, m_loss_avg, dur_loss_avg, device,
                asr_current_step, e, epochs, duration_avg, total_iters,
                tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)
            step += 1  # number of ASR eval batches seen this epoch
            # NOTE: the checkpoint directory name is hard-coded here
            asr_path = 'checkpoint-27364'
            modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
            new_check = os.path.join(modelasr_folder, asr_path)
            os.makedirs(new_check, exist_ok=True)

            save_checkpoint('forward',
                            self.paths,
                            model_tts,
                            optimizer_tts,
                            is_silent=True)

            asr_trainer_state['logs'].append({
                'step': step,
                'epoch': e,
                # store floats: int() would truncate fractional loss values
                'asr_s_loss': float(asr_s_loss),
                'asr_u_loss': float(asr_u_loss),
                'tts_s_loss': float(tts_s_loss),
                'tts_u_loss': float(tts_u_loss),
                'tts_eval_loss': float(tts_eval_loss),
                'asr_eval_loss': float(asr_eval_loss),
                'eval_wer': eval_wer
            })

            with open(os.path.join(new_check, 'dt_trainer_state.json'),
                      'w') as f:
                json.dump(asr_trainer_state, f)

            model_asr.save_pretrained(new_check)
            torch.save(optimizer_asr.state_dict(),
                       os.path.join(new_check, 'optimizer.pt'))

            print("Exiting due to cuda OOM!")
            exit(11)
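
# --- Hedged sketch (not part of the original example): a plausible
# get_last_checkpoint helper matching the calls above. It assumes
# checkpoint directories are named '<prefix>-<step>'; the real naming
# scheme is not shown in this example.
import os
import re

def get_last_checkpoint(folder, prefix):
    best = None
    for name in os.listdir(folder):
        m = re.match(r'{}-(\d+)$'.format(re.escape(prefix)), name)
        if m:
            step = int(m.group(1))
            if best is None or step > best[0]:
                best = (step, name)
    if best is None:
        raise FileNotFoundError(f'no "{prefix}-*" checkpoints in {folder}')
    # return the newest checkpoint path and its step count
    return os.path.join(folder, best[1]), best[0]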
Example #18
0
def run(cur_gpu, hparams):
    if hparams.distributed_mode == 'gpus':
        dist.init_process_group(backend=hparams.dist_backend, init_method=hparams.dist_url,
                                world_size=hparams.world_size, rank=cur_gpu)

    if cur_gpu >= 0:
        torch.cuda.set_device(cur_gpu)
        model = getattr(models, hparams.model_name)(hparams, use_cuda=True, use_fp16=hparams.fp16)
        model.cuda()
    else:
        model = getattr(models, hparams.model_name)(hparams)

    if hparams.fp16:
        model = convert_to_half(model)

    if hparams.distributed_mode == 'gpus':
        model = nn.parallel.DistributedDataParallel(model, device_ids=[cur_gpu], output_device=cur_gpu,
                                                    find_unused_parameters=True)

    criterion = cross_entropy

    params, params_clone = get_parameters(model, clone=hparams.fp16)
    optimizer = optim.SGD([
        {'params': params_clone if hparams.fp16 else params, 'weight_decay': hparams.weight_decay},
    ], lr=hparams.initial_learning_rate, momentum=hparams.momentum)
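    # Note: with fp16, the optimizer presumably steps on fp32 master copies
    # of the weights (params_clone) while the model itself stays in half
    # precision; train() would then copy the updated master weights back
    # into the fp16 params each step.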

    lr_scheduler = get_lr_scheduler(hparams.lr_scheduler, optimizer, hparams)

    best_acc1 = 0
    best_acc5 = 0
    start_epoch = hparams.start_epoch
    if hparams.checkpoint and os.path.isfile(hparams.checkpoint):
        start_epoch, model, optimizer, lr_scheduler, best_acc1, best_acc5 = load_checkpoint(
            hparams.checkpoint, cur_gpu, model, optimizer, lr_scheduler)

    torch.backends.cudnn.benchmark = True
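    # cudnn benchmarking autotunes convolution kernels for the observed
    # input shapes; it helps with fixed-size inputs but can add overhead
    # when shapes vary between batches.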

    train_loader, train_sampler = get_train_loader(hparams.data_dir, hparams.image_size,
                                                   hparams.per_replica_batch_size,
                                                   hparams.n_data_loading_workers,
                                                   hparams.distributed_mode,
                                                   hparams.world_size, cur_gpu)
    val_loader = get_val_loader(hparams.data_dir, hparams.image_size, hparams.per_replica_batch_size,
                                hparams.n_data_loading_workers, hparams.distributed_mode,
                                hparams.world_size, cur_gpu)

    if hparams.evaluate:
        return validate(cur_gpu, val_loader, model, criterion, 0, hparams)

    monitor = get_monitor()
    for epoch in range(start_epoch, hparams.epochs):
        if cur_gpu == -1 or cur_gpu == 0:
            print('Epoch %d\n' % (epoch + 1))
        monitor and monitor.before_epoch()

        if train_sampler:
            train_sampler.set_epoch(epoch)
        train(cur_gpu, train_loader, model, criterion, optimizer, lr_scheduler,
              params, params_clone, epoch, hparams)

        loss, acc1, acc5 = validate(cur_gpu, val_loader, model, criterion, epoch, hparams)
        monitor and monitor.after_epoch(loss, acc1, acc5)

        if hparams.save_model and cur_gpu in (-1, 0):
            is_best = acc1 > best_acc1
            best_acc1 = acc1 if is_best else best_acc1
            # keep the top-5 accuracy in sync with the best top-1 checkpoint
            best_acc5 = acc5 if is_best else best_acc5
            save_checkpoint(hparams.model_dir, epoch, model, optimizer, lr_scheduler,
                            best_acc1, best_acc5, is_best)

    if hparams.distributed_mode == 'gpus':
        dist.destroy_process_group()
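
# --- Hedged sketch (not part of the original example): a load_checkpoint
# counterpart to the save_checkpoint call above, assuming the checkpoint
# stores state_dicts under the keys used below.
import torch

def load_checkpoint(path, cur_gpu, model, optimizer, lr_scheduler):
    # map storages onto the current GPU (or CPU for cur_gpu == -1)
    loc = 'cuda:{}'.format(cur_gpu) if cur_gpu >= 0 else 'cpu'
    ckpt = torch.load(path, map_location=loc)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    lr_scheduler.load_state_dict(ckpt['lr_scheduler'])
    # resume from the epoch after the last completed one
    return (ckpt['epoch'] + 1, model, optimizer, lr_scheduler,
            ckpt.get('best_acc1', 0), ckpt.get('best_acc5', 0))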
Example #19
0
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example, max_y, max_x):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize the model across GPUs (workaround for a Python bug)
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            # Build per-utterance attention guides, padded to match the
            # predicted attention; -1 marks padding and is masked out below
            reduced_guides = []

            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                reduced = att[0::r]  # subsample by the reduction factor

                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]

                padded_guides = pad2d_nonzero(reduced, n_frames, n_phones)
                reduced_guides.append(padded_guides)

            reduced_guides = torch.tensor(reduced_guides)
            # 1 where a guide value exists, 0 at the -1 padding
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)

            padded_guides = [
                pad2d_zero(g, n_frames, n_phones) for g in reduced_guides
            ]
            padded_guides = torch.tensor(padded_guides)
            padded_guides = padded_guides.to(device)
            attention = attention.to(device)
            mask = mask.to(device)
            attention = attention * mask
            print("guide att shape", att.shape)
            print(att)

            print("reduced guide", padded_guides.shape)

            #   print("attention size",n_frames, n_phones)
            print("mask", mask.shape)
            print(mask)

            print(padded_guides.shape, attention.shape, mask.shape)

            print(attention)
            print(padded_guides)

            multiply = torch.pow((attention - padded_guides), 2)
            print(multiply)

            #multiply = torch.pow((pred_attention - padded_guides),2)* mask
            #print(multiply)

            attention_loss = torch.sum(multiply)
            print(attention_loss)
            mask_sum1 = torch.sum(mask)

            attention_loss /= mask_sum1
            print(attention_loss)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss + attention_loss
            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
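
# --- Hedged sketch (not part of the original example): the inline masked
# attention loss above, collected into one function. Guides are assumed to
# be zero at masked positions, as produced by pad2d_zero.
import torch

def guided_attention_loss(attention, guides, mask):
    # mean squared error over the valid (mask == 1) positions only
    sq_err = torch.pow(attention * mask - guides, 2)
    return torch.sum(sq_err) / torch.sum(mask)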