Example 1
    def sample(self, condition):
        x = None
        seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
        input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        audio_lengths = torch.LongTensor([0]).cuda()

        ## Tier 1 ##
        tqdm.write('Tier 1')
        # Autoregressively generate the lowest tier, one new time frame per step.
        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            # Sample each frequency bin of the newly appended frame in turn.
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    mu, std, pi, _ = self.tiers[1](x, seq, input_lengths,
                                                   audio_lengths)
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                temp = sample_gmm(mu, std, pi)
                # Keep only the bin just sampled; earlier bins stay fixed.
                x[:, m, t] = temp[:, m, t]

        ## Tier 2~N ##
        # Each higher tier is sampled in one shot and interleaved with its input.
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x = self.tierutil.interleave(x, temp, tier + 1)

        return x
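
This and the later examples all draw from the predicted mixture with sample_gmm(mu, std, pi), which none of the snippets define. A minimal sketch of such a helper, assuming pi holds the mixture weights along the last axis (the repository's actual implementation may differ):

import torch

def sample_gmm(mu, std, pi):
    # Pick one mixture component per element from the categorical weights,
    # then draw from that component's Gaussian as mu + std * eps.
    k = torch.distributions.Categorical(probs=pi).sample().unsqueeze(-1)
    mu_k = torch.gather(mu, -1, k).squeeze(-1)
    std_k = torch.gather(std, -1, k).squeeze(-1)
    return mu_k + std_k * torch.randn_like(std_k)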
Example 2
def sample_model_with_breakdown(model, condition):
  x = None
  seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
  input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
  audio_lengths = torch.LongTensor([0]).cuda()
  breakdown = {}

  ## Tier 1 ##
  tqdm.write('Tier 1')
  for t in tqdm(range(model.args.timestep // model.t_div)):
      audio_lengths += 1
      if x is None:
          x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
      else:
          x = torch.cat([x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()], dim=-1)
      for m in tqdm(range(model.n_mels // model.f_div)):
          torch.cuda.synchronize()
          if model.infer_hp.conditional:
              mu, std, pi, _ = model.tiers[1](x, seq, input_lengths, audio_lengths)
          else:
              mu, std, pi = model.tiers[1](x, audio_lengths)
          temp = sample_gmm(mu, std, pi)
          x[:, m, t] = temp[:, m, t]
  # Tier 1 has no separate input, so the generated tier is stored as both entries.
  breakdown[1] = (x.clone()[0].cpu().detach().numpy(), x.clone()[0].cpu().detach().numpy())

  ## Tier 2~N ##
  for tier in tqdm(range(2, model.hp.model.tier + 1)):
      tqdm.write('Tier %d' % tier)
      mu, std, pi = model.tiers[tier](x, audio_lengths)
      temp = sample_gmm(mu, std, pi)
      # Record this tier's input and its sampled output before interleaving.
      breakdown[tier] = (x.clone()[0].cpu().detach().numpy(), temp.clone()[0].cpu().detach().numpy())
      x = model.tierutil.interleave(x, temp, tier + 1)

  return breakdown, x
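
A hypothetical usage sketch for the breakdown dict this variant returns; model is assumed to be an already-loaded MelNet as built in Examples 3 and 6, the sentence is a placeholder, and the (source, sampled) layout follows the code above:

# Inspect what each tier consumed and produced.
breakdown, mel = sample_model_with_breakdown(model, 'a test sentence')
for tier, (source, target) in sorted(breakdown.items()):
    print('tier %d: source %s -> sampled %s' % (tier, source.shape, target.shape))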
Example 3
def run_inference_on_tier(source, tier, text, timestep):
  # Returns a tuple, (next_tier, inference):
  #   inference is the sampled inference on the current tier;
  #   next_tier interleaves that inference with the input, forming the next tier's input.
  args = parse_inference_args(['-c', 'config/blizzard_compressed_experiments.yaml', '-p', 'config/inference.yaml', '-t', str(timestep), '-n', 'hw_blizzard_compressed', '-i', text])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  inference_hp = HParam(args.infer_config)

  assert timestep % t_div[hp.model.tier] == 0, \
      "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

  model = MelNet(hp, args, inference_hp).cuda()
  model.load_tiers()
  model.eval()
  audio_lengths = torch.LongTensor([0]).cuda()
  # Tiers above 1 expect audio_lengths to reflect the full generated length.
  if tier > 1:
    for t in tqdm(range(model.args.timestep // model.t_div)):
      audio_lengths += 1

  # source = breakdown[tier][0]
  x = torch.unsqueeze(torch.from_numpy(source), 0)
  mu, std, pi = model.tiers[tier](x, audio_lengths)
  temp = sample_gmm(mu, std, pi)
  next_tier = model.tierutil.interleave(x, temp, tier + 1)
  return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
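
A hypothetical call, with the tier index, timestep, and breakdown dict as placeholders (the breakdown layout follows Example 2):

# Feed tier 3 the source saved for it and collect tier 4's interleaved input.
next_tier, inference = run_inference_on_tier(breakdown[3][0], 3,
                                             'a test sentence', 256)
print(next_tier.shape, inference.shape)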
Example 4
    def sample(self, condition):
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        # input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        ## Tier 1 ##
        tqdm.write('Tier 1')
        # With no timesteps requested, draw one sample over the primed frames only.
        if self.args.timestep == 0:
            mu, std, pi = self.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            return temp

        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # Text-conditional sampling is disabled in this variant.
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                temp = sample_gmm(mu, std, pi)
                # Index via audio_lengths so any priming frames in x are skipped.
                new_idx = audio_lengths.item() - 1
                x[:, m, new_idx] = temp[:, m, new_idx]

        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x)
            temp = sample_gmm(mu, std, pi)
            x = self.tierutil.interleave(x, temp, tier + 1)

        return x
Example 5
    def sample_dependence(self, condition, label, dependence_length):
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        for t in tqdm(range(self.args.timestep // self.t_div)):
            # audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    # Accept either a single class index or a list of indices.
                    class_label = (torch.tensor(label, dtype=torch.long)
                                   if isinstance(label, int)
                                   else torch.LongTensor(label))
                    # On the first bin of each frame, run with save_hidden=True to
                    # cache the recurrent state; later bins reuse it via h_t/h_c.
                    if m == 0:
                        mu, std, pi, h_t, h_c = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=True,
                            hidden_t=None,
                            hidden_c=None)
                    else:
                        mu, std, pi = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=False,
                            hidden_t=h_t,
                            hidden_c=h_c)
                temp = sample_gmm(mu, std, pi)
                new_idx = audio_lengths.item() - 1
                x[:, m, -1] = temp[:, m, new_idx]

        return x
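
A hypothetical usage sketch for this windowed variant; model, the priming mel, the class label, and the window length below are all illustrative placeholders:

# Extend a 100-frame priming mel, conditioning each new frame on only the
# most recent 64 frames and on class label 3.
priming_mel = torch.randn(1, model.n_mels // model.f_div, 100).cuda()
extended = model.sample_dependence(priming_mel, 3, 64)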
Example 6
def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml', '-p', 'config/inference.yaml',
        '-t',
        str(timestep), '-n', 'test_tiers', '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()
    # Advance audio_lengths to the full generated length expected by tiers 2..N.
    audio_lengths = torch.LongTensor([0]).cuda()
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0)
    for tier in tqdm(
            range(model.hp.model.tier + 1 - TESTING_TIERS,
                  model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        # Save original source and inference source
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        current_source = x
        save_image(x.detach().numpy()[0], 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(),
                   'tier_%d_inference_target' % tier)
        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x.detach().numpy()[0],
                   'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)
    reconstructed_mel_tensor = x.detach().numpy()
    return reconstructed_mel_tensor[0]
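
Examples 6 and 7 rely on a save_image helper that is not shown here. A minimal stand-in, assuming it simply renders a 2-D mel array to '<name>.png' (the project's real helper may differ):

import matplotlib.pyplot as plt

def save_image(mel, name):
    # Plot the mel spectrogram with frequency on the vertical axis
    # and write it to disk under the given name.
    plt.figure(figsize=(10, 4))
    plt.imshow(mel, origin='lower', aspect='auto', interpolation='none')
    plt.colorbar()
    plt.title(name)
    plt.savefig('%s.png' % name, bbox_inches='tight')
    plt.close()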
Example 7
def run_inference(sentence, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml', '-p', 'config/inference.yaml',
        '-t',
        str(timestep), '-n', 'test_tiers', '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    x = None
    seq = torch.from_numpy(process_blizzard(sentence)).long().unsqueeze(0)
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()

    actual_target = tier_to_breakdown[1][1]
    # save_image(seq.detach().numpy(), 'tier_1_seq_source')

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(model.n_mels // model.f_div)):
            torch.cuda.synchronize()
            if model.infer_hp.conditional:
                mu, std, pi, _ = model.tiers[1](x, seq, input_lengths,
                                                audio_lengths)
            else:
                mu, std, pi = model.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    save_image(x[0].cpu().detach().numpy(), 'tier_1_inference_target')
    save_image(actual_target, 'tier_1_actual_target')

    # for t in tqdm(range(model.args.timestep // model.t_div)):
    #   audio_lengths += 1
    # ## Tier 2~N ##
    # x = torch.unsqueeze(torch.from_numpy(source), 0)
    # for tier in tqdm(range(model.hp.model.tier + 1 - TESTING_TIERS, model.hp.model.tier + 1)):
    #     tqdm.write('Tier %d' % tier)
    #     # Save original source and inference source
    #     actual_source = tier_to_breakdown[tier][0]
    #     actual_target = tier_to_breakdown[tier][1]
    #     actual_interleaved = tier_to_breakdown[tier+1][0]
    #     current_source = x
    #     save_image(x.detach().numpy()[0], 'tier_%d_inference_source' % tier)
    #     save_image(actual_source, 'tier_%d_actual_source' % tier)
    #     save_image(actual_target, 'tier_%d_actual_target' % tier)
    #     mu, std, pi = model.tiers[tier](x, audio_lengths)
    #     temp = sample_gmm(mu, std, pi)
    #     save_image(temp[0].cpu().detach().numpy(), 'tier_%d_inference_target' % tier)
    #     x = model.tierutil.interleave(x, temp, tier + 1)
    #     save_image(x.detach().numpy()[0], 'tier_%d_inference_interleaved' % tier)
    #     save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)
    reconstructed_mel_tensor = x.detach().numpy()
    return reconstructed_mel_tensor[0]