Example #1
    def do_train(self,
                 paths,
                 dataset,
                 optimiser,
                 epochs,
                 batch_size,
                 step,
                 lr=1e-4,
                 valid_index=[],
                 use_half=False):
        if use_half:
            import apex
            optimiser = apex.fp16_utils.FP16_Optimizer(optimiser,
                                                       dynamic_loss_scale=True)
        for p in optimiser.param_groups:
            p['lr'] = lr
        criterion = nn.NLLLoss().cuda()
        k = 0
        saved_k = 0
        print(win_length, hop_length, win_length / hop_length)

        for e in range(epochs):

            # trn_loader = DataLoader(dataset, collate_fn=lambda batch: env.collate(0, int( win_length/hop_length), 0, batch), batch_size=batch_size,
            #                         num_workers=2, shuffle=True, pin_memory=True)
            trn_loader = DataLoader(
                dataset,
                collate_fn=lambda batch: env.collate(0, 16, 0, batch),
                batch_size=batch_size,
                num_workers=2,
                shuffle=True,
                pin_memory=True)

            start = time.time()
            running_loss_c = 0.
            running_loss_f = 0.

            iters = len(trn_loader)

            for i, (mels, coarse, fine, coarse_f,
                    fine_f) in enumerate(trn_loader):
                mels, coarse, fine, coarse_f, fine_f = mels.cuda(
                ), coarse.cuda(), fine.cuda(), coarse_f.cuda(), fine_f.cuda()
                # Trim hop_length samples from the start and hop_length - 1
                # from the end of each sequence.
                coarse, fine, coarse_f, fine_f = [
                    t[:, hop_length:1 - hop_length]
                    for t in [coarse, fine, coarse_f, fine_f]
                ]
                if use_half:
                    mels = mels.half()
                    coarse_f = coarse_f.half()
                    fine_f = fine_f.half()

                # Teacher-forced input: previous coarse, previous fine, and
                # the current coarse value (used when predicting fine).
                x = torch.cat([
                    coarse_f[:, :-1].unsqueeze(-1),
                    fine_f[:, :-1].unsqueeze(-1), coarse_f[:, 1:].unsqueeze(-1)
                ],
                              dim=2)

                p_c, p_f, _h_n = self(x, mels)
                # Targets are the categories one step ahead of the inputs.
                loss_c = criterion(p_c.transpose(1, 2).float(), coarse[:, 1:])
                loss_f = criterion(p_f.transpose(1, 2).float(), fine[:, 1:])
                loss = loss_c + loss_f

                optimiser.zero_grad()
                if use_half:
                    optimiser.backward(loss)
                else:
                    loss.backward()
                optimiser.step()
                running_loss_c += loss_c.item()
                running_loss_f += loss_f.item()

                self.after_update()

                speed = (i + 1) / (time.time() - start)
                avg_loss_c = running_loss_c / (i + 1)
                avg_loss_f = running_loss_f / (i + 1)

                step += 1
                k = step // 1000
                logger.status(
                    f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} -- Speed: {speed:#.4} steps/sec -- Step: {k}k '
                )

            os.makedirs(paths.checkpoint_dir, exist_ok=True)
            torch.save(self.state_dict(), paths.model_path())
            np.save(paths.step_path(), step)
            logger.log_current_status()
            logger.log(
                f' <saved>; w[0][0] = {self.wavernn.gru.weight_ih_l0[0][0]}')
            # Roughly every 50k steps, keep a separate history checkpoint and
            # generate audio for the validation indices.
            if k > saved_k + 50:
                torch.save(self.state_dict(), paths.model_hist_path(step))
                saved_k = k
                self.do_generate(paths,
                                 step,
                                 dataset.path,
                                 valid_index,
                                 use_half=use_half)
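
The x = torch.cat(...) block above builds the teacher-forced input: at every step the network sees the previous coarse and fine values plus the current coarse value, and is trained to predict the categories one step ahead. A minimal, self-contained sketch of that layout on dummy tensors (no model or dataset assumed):

    import torch

    # Dummy normalized coarse/fine streams: batch of 2, 8 samples each.
    coarse_f = torch.rand(2, 8) * 2 - 1
    fine_f = torch.rand(2, 8) * 2 - 1

    x = torch.cat([
        coarse_f[:, :-1].unsqueeze(-1),  # previous coarse
        fine_f[:, :-1].unsqueeze(-1),    # previous fine
        coarse_f[:, 1:].unsqueeze(-1),   # current coarse (teacher forcing)
    ], dim=2)

    print(x.shape)  # torch.Size([2, 7, 3])
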
Example #2
    def do_train(self,
                 paths,
                 dataset,
                 optimiser,
                 epochs,
                 batch_size,
                 step,
                 lr=1e-4,
                 valid_index=[],
                 use_half=False,
                 do_clip=False):

        if use_half:
            import apex
            optimiser = apex.fp16_utils.FP16_Optimizer(optimiser,
                                                       dynamic_loss_scale=True)
        for p in optimiser.param_groups:
            p['lr'] = lr
        criterion = nn.NLLLoss().cuda()
        k = 0
        saved_k = 0
        pad_left = self.pad_left()
        pad_left_encoder = self.pad_left_encoder()
        pad_left_decoder = self.pad_left_decoder()
        if self.noise_x:
            extra_pad_right = 127
        else:
            extra_pad_right = 0
        pad_right = self.pad_right() + extra_pad_right
        window = 16 * self.total_scale()
        logger.log(
            f'pad_left={pad_left_encoder}|{pad_left_decoder}, pad_right={pad_right}, total_scale={self.total_scale()}'
        )

        for e in range(epochs):

            trn_loader = DataLoader(
                dataset,
                collate_fn=lambda batch: env.collate_multispeaker_samples(
                    pad_left, window, pad_right, batch),
                batch_size=batch_size,
                num_workers=2,
                shuffle=True,
                pin_memory=True)

            start = time.time()
            running_loss_c = 0.
            running_loss_f = 0.
            running_loss_vq = 0.
            running_loss_vqc = 0.
            running_entropy = 0.
            running_max_grad = 0.
            running_max_grad_name = ""

            iters = len(trn_loader)

            for i, (speaker, wave16) in enumerate(trn_loader):

                speaker = speaker.cuda()
                wave16 = wave16.cuda()

                # Split each signed 16-bit sample into a coarse (high) byte
                # and a fine (low) byte, then rescale them to [-1, 1].
                coarse = (wave16 + 2**15) // 256
                fine = (wave16 + 2**15) % 256

                coarse_f = coarse.float() / 127.5 - 1.
                fine_f = fine.float() / 127.5 - 1.
                total_f = (wave16.float() + 0.5) / 32767.5

                if self.noise_y:
                    # Augmentation: random per-utterance gain plus a little
                    # additive noise on the waveform fed to the encoder.
                    noisy_f = total_f * (
                        0.02 * torch.randn(total_f.size(0), 1).cuda()
                    ).exp() + 0.003 * torch.randn_like(total_f)
                else:
                    noisy_f = total_f

                if use_half:
                    coarse_f = coarse_f.half()
                    fine_f = fine_f.half()
                    noisy_f = noisy_f.half()

                x = torch.cat([
                    coarse_f[:, pad_left -
                             pad_left_decoder:-pad_right].unsqueeze(-1),
                    fine_f[:, pad_left -
                           pad_left_decoder:-pad_right].unsqueeze(-1),
                    coarse_f[:, pad_left - pad_left_decoder + 1:1 -
                             pad_right].unsqueeze(-1),
                ],
                              dim=2)
                y_coarse = coarse[:, pad_left + 1:1 - pad_right]
                y_fine = fine[:, pad_left + 1:1 - pad_right]

                if self.noise_x:
                    # Randomly translate the input to the encoder to encourage
                    # translational invariance
                    total_len = coarse_f.size(1)
                    translated = []
                    for j in range(coarse_f.size(0)):
                        shift = random.randrange(256) - 128
                        translated.append(
                            noisy_f[j, pad_left - pad_left_encoder +
                                    shift:total_len - extra_pad_right + shift])
                    translated = torch.stack(translated, dim=0)
                else:
                    translated = noisy_f[:, pad_left - pad_left_encoder:]
                p_cf, vq_pen, encoder_pen, entropy = self(
                    speaker, x, translated)
                p_c, p_f = p_cf
                loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
                loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
                # Ramp the encoder penalty weight from 0.001 up to 0.01
                # between roughly step 1.1k and step 2k.
                encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
                loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen

                optimiser.zero_grad()
                if use_half:
                    optimiser.backward(loss)
                    if do_clip:
                        raise RuntimeError(
                            "clipping in half precision is not implemented yet"
                        )
                else:
                    loss.backward()
                    if do_clip:
                        # Track the largest gradient entry; rescale or zero
                        # every gradient if it grows too large.
                        max_grad = 0
                        max_grad_name = ""
                        for name, param in self.named_parameters():
                            if param.grad is not None:
                                param_max_grad = param.grad.data.abs().max()
                                if param_max_grad > max_grad:
                                    max_grad = param_max_grad
                                    max_grad_name = name
                                if 1000000 < param_max_grad:
                                    logger.log(
                                        f'Very large gradient at {name}: {param_max_grad}'
                                    )
                        if 100 < max_grad:
                            for param in self.parameters():
                                if param.grad is not None:
                                    if 1000000 < max_grad:
                                        param.grad.data.zero_()
                                    else:
                                        param.grad.data.mul_(100 / max_grad)
                        if running_max_grad < max_grad:
                            running_max_grad = max_grad
                            running_max_grad_name = max_grad_name

                        if 100000 < max_grad:
                            torch.save(self.state_dict(), "bad_model.pyt")
                            raise RuntimeError(
                                "Aborting due to crazy gradient (model saved to bad_model.pyt)"
                            )
                optimiser.step()
                running_loss_c += loss_c.item()
                running_loss_f += loss_f.item()
                running_loss_vq += vq_pen.item()
                running_loss_vqc += encoder_pen.item()
                running_entropy += entropy

                self.after_update()

                speed = (i + 1) / (time.time() - start)
                avg_loss_c = running_loss_c / (i + 1)
                avg_loss_f = running_loss_f / (i + 1)
                avg_loss_vq = running_loss_vq / (i + 1)
                avg_loss_vqc = running_loss_vqc / (i + 1)
                avg_entropy = running_entropy / (i + 1)

                step += 1
                k = step // 1000
                logger.status(
                    f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} vq={avg_loss_vq:#.4} vqc={avg_loss_vqc:#.4} -- Entropy: {avg_entropy:#.4} -- Grad: {running_max_grad:#.1} {running_max_grad_name} Speed: {speed:#.4} steps/sec -- Step: {k}k '
                )

            os.makedirs(paths.checkpoint_dir, exist_ok=True)
            torch.save(self.state_dict(), paths.model_path())
            np.save(paths.step_path(), step)
            logger.log_current_status()
            logger.log(
                f' <saved>; w[0][0] = {self.overtone.wavernn.gru.weight_ih_l0[0][0]}'
            )
            if k > saved_k + 50:
                torch.save(self.state_dict(), paths.model_hist_path(step))
                saved_k = k
                self.do_generate(paths, step, dataset.path, valid_index)
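
The byte-splitting arithmetic near the top of the batch loop is easy to verify in isolation; the following standalone sketch (dummy data, int64 to avoid overflow) shows the coarse/fine split, the [-1, 1] scaling, and the exact round-trip back to the 16-bit sample:

    import torch

    # A few signed 16-bit sample values.
    wave16 = torch.tensor([-32768, -1, 0, 1, 12345, 32767], dtype=torch.int64)

    coarse = (wave16 + 2**15) // 256   # high byte, in [0, 255]
    fine = (wave16 + 2**15) % 256      # low byte, in [0, 255]

    coarse_f = coarse.float() / 127.5 - 1.   # network inputs in [-1, 1]
    fine_f = fine.float() / 127.5 - 1.

    # The two bytes reconstruct the original sample exactly.
    assert torch.equal(coarse * 256 + fine - 2**15, wave16)
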
Example #3
    def do_train(self,
                 paths,
                 dataset,
                 optimiser,
                 epochs,
                 batch_size,
                 step,
                 lr=1e-4,
                 valid_index=[],
                 use_half=False):

        if use_half:
            import apex
            optimiser = apex.fp16_utils.FP16_Optimizer(optimiser,
                                                       dynamic_loss_scale=True)
        for p in optimiser.param_groups:
            p['lr'] = lr
        criterion = nn.NLLLoss().cuda()
        k = 0
        saved_k = 0
        pad_left = self.overtone.pad()
        time_span = 16 * 64

        for e in range(epochs):

            trn_loader = DataLoader(
                dataset,
                collate_fn=lambda batch: env.collate_samples(
                    pad_left, time_span, 1, batch),
                batch_size=batch_size,
                num_workers=2,
                shuffle=True,
                pin_memory=True)

            start = time.time()
            running_loss_c = 0.
            running_loss_f = 0.
            max_grad = 0.
            max_grad_name = ""

            iters = len(trn_loader)

            for i, wave16 in enumerate(trn_loader):

                wave16 = wave16.to(self.DEVICE)

                coarse = (wave16 + 2**15) // 256
                fine = (wave16 + 2**15) % 256

                coarse_f = coarse.float() / 127.5 - 1.
                fine_f = fine.float() / 127.5 - 1.

                if use_half:
                    coarse_f = coarse_f.half()
                    fine_f = fine_f.half()

                x = torch.cat([
                    coarse_f[:, :-1].unsqueeze(-1),
                    fine_f[:, :-1].unsqueeze(-1),
                    coarse_f[:, 1:].unsqueeze(-1),
                ],
                              dim=2)
                y_coarse = coarse[:, pad_left + 1:]
                y_fine = fine[:, pad_left + 1:]

                p_c, p_f = self(x)
                loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
                loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
                loss = loss_c + loss_f

                optimiser.zero_grad()
                if use_half:
                    optimiser.backward(loss)
                else:
                    loss.backward()
                    # Record the largest gradient entry for logging, then
                    # clip by the infinity norm.
                    for name, param in self.named_parameters():
                        if param.grad is None:
                            continue
                        param_max_grad = param.grad.data.abs().max()
                        if param_max_grad > max_grad:
                            max_grad = param_max_grad
                            max_grad_name = name
                    nn.utils.clip_grad_norm_(self.parameters(), 1, 'inf')
                optimiser.step()
                running_loss_c += loss_c.item()
                running_loss_f += loss_f.item()

                self.after_update()

                speed = (i + 1) / (time.time() - start)
                avg_loss_c = running_loss_c / (i + 1)
                avg_loss_f = running_loss_f / (i + 1)

                step += 1
                k = step // 1000
                logger.status(
                    f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} -- Grad: {max_grad:#.1} {max_grad_name} Speed: {speed:#.4} steps/sec -- Step: {k}k '
                )

            torch.save(self.state_dict(), paths.model_path())
            np.save(paths.step_path(), step)
            logger.log_current_status()
            logger.log(
                f' <saved>; w[0][0] = {self.overtone.wavernn.gru.weight_ih_l0[0][0]}'
            )
            if k > saved_k + 50:
                torch.save(self.state_dict(), paths.model_hist_path(step))
                saved_k = k
                self.do_generate(paths, step, dataset.path, valid_index)
                logger.log('done generation')
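
The gradient bookkeeping in this variant (largest entry logged, then an infinity-norm clip) works just as well on a toy model; a minimal standalone sketch, assuming nothing about the WaveRNN classes:

    import torch
    from torch import nn

    model = nn.Linear(4, 4)
    x, y = torch.randn(8, 4), torch.randn(8, 4)
    nn.functional.mse_loss(model(x), y).backward()

    # Record the largest absolute gradient entry, for logging.
    max_grad, max_grad_name = 0., ""
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        param_max_grad = param.grad.abs().max().item()
        if param_max_grad > max_grad:
            max_grad, max_grad_name = param_max_grad, name
    print(f'largest gradient {max_grad:.3g} in {max_grad_name}')

    # Clip so that no single gradient entry exceeds 1 (infinity norm).
    nn.utils.clip_grad_norm_(model.parameters(), 1.0, norm_type=float('inf'))
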
Example #4
    def generate(self, cond, global_cond, n=None, seq_len=None, verbose=False, use_half=False):
        start = time.time()
        if n is None:
            n = cond.size(0)
        if seq_len is None:
            seq_len = (cond.size(1) - self.cond_pad) * 64
        if use_half:
            std_tensor = torch.tensor([]).cuda().half()
        else:
            std_tensor = torch.tensor([]).cuda()

        # Warmup: run the conditioning convs and RNNs on zeros to build the
        # rolling buffers (c0, c1, c2) and the initial hidden states.
        c0 = self.conv0(std_tensor.new_zeros(n, 10, 1), global_cond).repeat(1, 10, 1)
        c1 = self.conv1(c0, global_cond).repeat(1, 10, 1)
        c2 = self.conv2(c1, global_cond)

        if cond is None:
            pad_cond = None
        else:
            pad_cond = cond[:, :self.cond_pad]
        #logger.log(f'pad_cond: {pad_cond.size()}')
        r0, h0 = self.rnn0(torch.cat(filter_none([c2.repeat(1, 85, 1), pad_cond]), dim=2), global_cond)
        r1, h1 = self.rnn1(torch.cat([c1.repeat(1, 9, 1)[:, :84], r0], dim=2), global_cond)
        r2, h2 = self.rnn2(torch.cat([c0.repeat(1, 8, 1), r1], dim=2), global_cond)
        if global_cond is not None:
            global_cond_1 = global_cond.unsqueeze(1).expand(-1, r2.size(1), -1)
        else:
            global_cond_1 = None
        h3 = self.wavernn(std_tensor.new_zeros(n, 64, 3), torch.cat(filter_none([r2, global_cond_1]), dim=2))[2]

        # Create cells
        cell0 = self.rnn0.to_cell()
        cell1 = self.rnn1.to_cell()
        cell2 = self.rnn2.to_cell()
        wcell = self.wavernn.to_cell()

        # Main loop!
        coarse = std_tensor.new_zeros(n, 10, 1)
        c_val = std_tensor.new_zeros(n)
        f_val = std_tensor.new_zeros(n)
        zero = std_tensor.new_zeros(n)
        output = []
        for t in range(seq_len):
            #logger.log(f't = {t}')
            t0 = t % 4
            ct0 = (-t) % 4

            if t0 == 0:
                t1 = (t // 4) % 4
                ct1 = ((-t) // 4) % 4

                #logger.log(f'written to c0[{-ct1-1}]')
                c0[:, -ct1-1].copy_(self.conv0(coarse, global_cond).squeeze(1))
                coarse[:, :-4].copy_(coarse[:, 4:])

                if t1 == 0:
                    t2 = (t // 16) % 4
                    ct2 = ((-t) // 16) % 4

                    #logger.log('read c0')
                    #logger.log(f'written to c1[{-ct2-1}]')
                    c1[:, -ct2-1].copy_(self.conv1(c0, global_cond).squeeze(1))
                    c0[:, :-4].copy_(c0[:, 4:])

                    if t2 == 0:
                        #logger.log('read c1')
                        #logger.log('written to c2')
                        c2 = self.conv2(c1, global_cond).squeeze(1)
                        c1[:, :-4].copy_(c1[:, 4:])

                        #logger.log('read c2')
                        #logger.log('written to r0')
                        if cond is None:
                            inp0 = c2
                        else:
                            inp0 = torch.cat([c2, cond[:, t // 64 + self.cond_pad]], dim=1)
                        r0, h0 = cell0(inp0, global_cond, h0)

                    #logger.log(f'read r0[{t2}]')
                    #logger.log(f'written to r1')
                    #logger.log(f'c1: {c1.size()} r0: {r0.size()}')
                    r1, h1 = cell1(torch.cat([c1[:, -ct2-1], r0[:, t2]], dim=1), global_cond, h1)

                #logger.log(f'read r1[{t1}]')
                #logger.log(f'written to r2')
                #logger.log(f'c0: {c0.size()} r1: {r1.size()}')
                r2, h2 = cell2(torch.cat([c0[:, -ct1-1], r1[:, t1]], dim=1), global_cond, h2)

            #logger.log(f'read r2[{t0}]')
            wcond = torch.cat(filter_none([r2[:, t0], global_cond]), dim=1)

            # First pass: predict and sample the coarse byte.
            x = torch.stack([c_val, f_val, zero], dim=1)
            o_c = wcell.forward_c(x, wcond, None, None, h3)
            c_cat = utils.nn.sample_softmax(o_c).float()
            c_val_new = (c_cat / 127.5 - 1.0).to(std_tensor)

            # Second pass: sample the fine byte, conditioned on the newly
            # sampled coarse value.
            x = torch.stack([c_val, f_val, c_val_new], dim=1)
            o_f, h3 = wcell.forward_f(x, wcond, None, None, h3)
            f_cat = utils.nn.sample_softmax(o_f).float()
            f_val = (f_cat / 127.5 - 1.0).to(std_tensor)
            c_val = c_val_new

            # Recombine the two bytes into one 16-bit sample in [-1, 1] and
            # write the new coarse value into the rolling input buffer.
            sample = (c_cat * 256 + f_cat) / 32767.5 - 1.0
            coarse[:, 6+t0].copy_(c_val.unsqueeze(1))

            if verbose and t % 10000 < 100:
                logger.log(f'c={c_cat[0]} f={f_cat[0]} sample={sample[0]}')
            output.append(sample)
            if t % 100 == 0:
                speed = int((t + 1) / (time.time() - start))
                logger.status(f'{t+1}/{seq_len} -- Speed: {speed} samples/sec')

        return torch.stack(output, dim=1)
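
Several of the torch.cat calls in this example go through filter_none, which is not shown here; presumably it just drops the None entries so optional conditioning (such as global_cond) can be omitted. A hypothetical equivalent:

    import torch

    def filter_none(xs):
        # Assumed behaviour: keep only the tensors that are present.
        return [x for x in xs if x is not None]

    a = torch.randn(2, 3)
    b = None          # e.g. no global conditioning
    c = torch.randn(2, 1)

    out = torch.cat(filter_none([a, b, c]), dim=1)
    print(out.shape)  # torch.Size([2, 4])
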
Example #5
    def generate(self,
                 feat,
                 aux1=None,
                 aux2=None,
                 aux3=None,
                 deterministic=False,
                 use_half=False,
                 verbose=False,
                 seq_len=None,
                 batch_size=None):
        start = time.time()
        if seq_len is None:
            seq_len = feat.size(1)
        if batch_size is None:
            batch_size = feat.size(0)
        h = torch.zeros(batch_size, self.rnn_dims).cuda()
        if use_half:
            h = h.half()

        c_val = torch.zeros(batch_size).cuda()
        f_val = torch.zeros(batch_size).cuda()
        zero = torch.zeros(batch_size).cuda()
        rnn_cell = self.to_cell()
        output = []

        for i in range(seq_len):
            if feat is None:
                m_t = None
            else:
                m_t = feat[:, i, :]
            if aux1 is None:
                a1_t = None
            else:
                a1_t = aux1[:, i, :]
            if aux2 is None:
                a2_t = None
            else:
                a2_t = aux2[:, i, :]
            if aux3 is None:
                a3_t = None
            else:
                a3_t = aux3[:, i, :]

            x = torch.stack([c_val, f_val, zero], dim=1)
            if use_half:
                x = x.half()
            # Coarse byte: greedy argmax when deterministic, otherwise a
            # sample from the categorical posterior.
            o_c = rnn_cell.forward_c(x, m_t, a1_t, a2_t, h)
            if deterministic:
                c_cat = torch.argmax(o_c, dim=1).to(torch.float32)
            else:
                posterior_c = F.softmax(o_c.float(), dim=1)
                distrib_c = torch.distributions.Categorical(posterior_c)
                c_cat = distrib_c.sample().float()
            c_val_new = c_cat / 127.5 - 1.0

            x = torch.stack([c_val, f_val, c_val_new], dim=1)
            if use_half:
                x = x.half()
            # Fine byte, conditioned on the coarse value just drawn.
            o_f, h = rnn_cell.forward_f(x, m_t, a1_t, a3_t, h)
            if deterministic:
                f_cat = torch.argmax(o_f, dim=1).to(torch.float32)
            else:
                posterior_f = F.softmax(o_f.float(), dim=1)
                distrib_f = torch.distributions.Categorical(posterior_f)
                f_cat = distrib_f.sample().float()
            f_val = f_cat / 127.5 - 1.0

            c_val = c_val_new

            sample = (c_cat * 256 + f_cat) / 32767.5 - 1.0
            if verbose and i % 10000 < 100:
                logger.log(f'c={c_cat[0]} f={f_cat[0]} sample={sample[0]}')
            output.append(sample)
            if i % 100 == 0:
                speed = int((i + 1) / (time.time() - start))
                logger.status(f'{i+1}/{seq_len} -- Speed: {speed} samples/sec')

        return torch.stack(output, dim=1)
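
The deterministic flag switches between greedy decoding and sampling from the categorical posterior. The two modes in isolation, on dummy logits:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    o_c = torch.randn(4, 256)   # dummy logits: batch of 4, 256 categories

    # Greedy: always take the most likely category.
    greedy = torch.argmax(o_c, dim=1).to(torch.float32)

    # Stochastic: sample from the softmax posterior, as generate() does.
    posterior = F.softmax(o_c.float(), dim=1)
    sampled = torch.distributions.Categorical(posterior).sample().float()

    print(greedy, sampled)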