Example no. 1
def synthesis(text, args):
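    # Autoregressive Transformer-TTS inference: the decoder consumes its own
    # postnet output frame by frame, then a separate PostNet maps the mel
    # spectrogram to a linear magnitude spectrogram, which spectrogram2wav
    # converts to a waveform.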
    m = Model()
    m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    m_post = m_post.cuda()
    m.train(False)
    m_post.train(False)
    
    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([t.zeros([1, 1, 80]).cuda(), postnet_pred], dim=1)

        mag_pred = m_post.forward(postnet_pred)
        
    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    write(hp.sample_path + "/test.wav", hp.sr, wav)
Example no. 2
def synthesis(text, args):
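    # CPU inference for a multilingual / multi-speaker model: text is encoded
    # as raw UTF-8 bytes with SOS (1) and EOS (2) markers, and decoding stops
    # early once the predicted stop token fires.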
    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_path))
    print("[%s][%s] Synthesizing:" % (args.lang, args.spk), text)

    text = np.asarray([1] + list(text.encode('utf-8')) + [2])
    text = t.LongTensor(text).unsqueeze(0)
    mel_input = t.zeros([1, 1, 80])
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    with open(os.path.join(args.data_path, 'lang_id.json')) as f:
        lang_to_id = json.load(f)
    with open(os.path.join(args.data_path, 'spk_id.json')) as f:
        spk_to_id = json.load(f)
    lang_id = lang_to_id[args.lang]
    spk_id = spk_to_id[args.spk]

    lang_id = t.LongTensor([lang_id])
    spk_id = t.LongTensor([spk_id])
    m.train(False)
    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0)
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = \
                m.forward(text, mel_input, pos_text, pos_mel, lang_id, spk_id)
            mel_input = t.cat([mel_input, mel_pred[:, -1:, :]], dim=1)
            if stop_token[:, -1].item() > 0:
                break

    mel = postnet_pred.squeeze(0).cpu().numpy()
    wav = mel2wav(mel)
    np.save(args.out_path + "_mel.npy", mel)
    write(args.out_path + ".wav", hp.sr, wav)
    plot_mel(args.out_path + "_mel.png", mel)
    plot_attn(attn, args.out_path + '_align.png')
Example no. 3
def synthesis(text, num):
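    # Variant without the separate PostNet model: the predicted mel
    # spectrogram is inverted directly with audio.inv_mel_spectrogram.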
    m = Model()
    m.load_state_dict(load_checkpoint(num, "transformer"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    m.train(False)

    with t.no_grad():
        for _ in range(1000):  # fixed decoding length; no stop-token check
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
Example no. 4
def test(path):
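    # YOLOv1-style decoding: each of the S x S grid cells predicts B boxes
    # (x, y, w, h at channels 0..4B), box confidences at channels 8:10 and
    # 20 class scores at channels 10:; only the single most confident box
    # is drawn.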
    model = Model()
    model.to("cuda:0")
    model.eval()
    checkpoint = torch.load("./model.pth")
    model.load_state_dict(checkpoint["model"])
    img = np.array(Image.open(path).resize([448, 448]))[np.newaxis]
    img = np.transpose(img, axes=[0, 3, 1, 2]) / 255
    img = torch.tensor(img, dtype=torch.float32).to("cuda:0")
    with torch.no_grad():
        preds = model(img).cpu().numpy()
    cell_h, cell_w = IMG_H / S, IMG_W / S
    x, y = np.meshgrid(range(S), range(S))
    preds_xywhs = []
    for i in range(B):
        preds_x = (preds[0, :, :, i * 4] + x) * cell_w
        preds_y = (preds[0, :, :, i * 4 + 1] + y) * cell_h
        preds_w = preds[0, :, :, i * 4 + 2] * IMG_W
        preds_h = preds[0, :, :, i * 4 + 3] * IMG_H
        preds_xywh = np.dstack((preds_x, preds_y, preds_w, preds_h))
        preds_xywhs.append(preds_xywh)
    preds_xywhs = np.dstack(preds_xywhs)
    preds_xywhs = np.reshape(preds_xywhs, [-1, 4])
    preds_class = preds[0, :, :, 10:]
    preds_class = np.reshape(preds_class, [-1, 20])
    preds_c = preds[0, :, :, 8:10]
    preds_c = np.reshape(preds_c, [-1, 1])
    max_arg = np.argmax(preds_c, axis=0)
    print("max confidence: %f" % (preds_c[max_arg]))
    max_arg_ = np.argmax(preds_class[int(max_arg // 2)])
    print("class confidence: %f" % (preds_class[max_arg // 2, max_arg_]))
    print("class category: %s" % (CLASSES[int(max_arg_)]))
    Image.fromarray(
        np.uint8(
            draw_bboxes(np.array(Image.open(path).resize([448, 448])),
                        preds_xywhs[max_arg[0]:max_arg[0] + 1]))).show()
Example no. 5
def synthesis(text, args):
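    # Tests the PostNet in isolation: a precomputed mel spectrogram is loaded
    # from disk and converted to a linear magnitude spectrogram, so the
    # Transformer m and the text inputs prepared here are never actually used.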
    m = Model()
    m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(args.step1, "transformer"))
    m_post.load_state_dict(load_checkpoint(args.step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = torch.LongTensor(text).unsqueeze(0)
    text = text.cuda()

    mel_input = np.load('3_0.pt.npy')

    pos_text = torch.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    m_post = m_post.cuda()
    m.train(False)
    m_post.train(False)

    with torch.no_grad():
        mag_pred = m_post.forward(
            torch.from_numpy(mel_input).unsqueeze(0).cuda())

    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    write(hp.sample_path + "/test.wav", hp.sr, wav)
Example no. 6
def synthesis(text, args, num):
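    # Same autoregressive inference as Example no. 1, but feeds back the last
    # postnet frame each step and also saves an attention alignment plot.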
    m = Model()
    m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    m_post = m_post.cuda()
    m.train(False)
    m_post.train(False)

    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

    # The commented-out experiments here fed a ground-truth mel / magnitude
    # spectrogram loaded from disk into the PostNet to test it in isolation,
    # and plotted one alignment per attention head.
    mag_pred = m_post.forward(postnet_pred)

    attn1 = attn[0][1].cpu()
    plot_alignment(attn1,
                   path='./training_loss/' + str(args.restore_step1) + '_' +
                   'S' + str(num) + '.png',
                   title='sentence' + str(num))

    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().detach().numpy())
    write(
        hp.sample_path + '/' + str(args.restore_step1) + '-' + "test" +
        str(num) + ".wav", hp.sr, wav)
Example no. 7
def ensemble(state, X_test, y_test, g):
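    # Weight-space ensemble: average the parameters of four checkpoints into
    # a fifth model and evaluate it. Note that only parameters are averaged,
    # not buffers such as BatchNorm running statistics.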
    mod1 = Model().to(state['device'])
    mod2 = Model().to(state['device'])
    mod3 = Model().to(state['device'])
    mod4 = Model().to(state['device'])
    mod = Model().to(state['device'])
    mod1.load_state_dict(torch.load(state['path1'])['model_state_dict'])
    mod2.load_state_dict(torch.load(state['path2'])['model_state_dict'])
    mod3.load_state_dict(torch.load(state['path3'])['model_state_dict'])
    mod4.load_state_dict(torch.load(state['path4'])['model_state_dict'])

    for p, p1, p2, p3, p4 in zip(mod.parameters(), mod1.parameters(),
                                 mod2.parameters(), mod3.parameters(),
                                 mod4.parameters()):
        p.data.copy_(0.25 * (p1.data + p2.data + p3.data + p4.data))
    acc = test_with_dropout(X_test, y_test, mod, state['device'],
                            state['cuda'])
    path = g + str(state['itr']) + 'epoch.' + str(state['acq']) + 'acq.pth.tar'
    state['rep'] = path
    torch.save({'model_state_dict': mod.state_dict()}, state['rep'])

    return mod, acc
Example no. 8
def model(dataset, model_name=None, device=None, train=True):
    """加载模型"""
    device = device or torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    net = Model(vocab_size=dataset.vocab_size,
                embedding_dim=config.embedding_dim,
                output_size=dataset.target_vocab_size,
                encoder_hidden_size=config.encoder_hidden_size,
                decoder_hidden_size=config.decoder_hidden_size,
                encoder_layers=config.encoder_layers,
                decoder_layers=config.decoder_layers,
                dropout=config.dropout,
                embedding_weights=dataset.vector_weights,
                device=device)
    if model_name:  # if a model name is given, load the corresponding weights
        pre_trained_state_dict = torch.load(FILE_PATH + config.model_path +
                                            model_name,
                                            map_location=device)
        # update() allows a partial checkpoint: keys missing from it keep
        # their freshly initialized values.
        state_dict = net.state_dict()
        state_dict.update(pre_trained_state_dict)
        net.load_state_dict(state_dict)
    net.train() if train else net.eval()
    return net
Example no. 9
# Snippet truncated at the top; as in Example no. 15, this shuffles the
# dataset indices and takes the first `split` of them as the validation set.
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]

# check if dataset load order is correct
# for ind in val_indices:
#     print(ind)
#     data = my_dataset[ind]
#     img = data['image']
#     plt.figure()
#     plt.imshow(img.permute(1,2,0))
#     plt.show()

# load model
model = Model().to(device=device)
model.load_state_dict(torch.load('model_saved.pth'))
model = model.float()
model.eval()

for ind in val_indices:
    data = my_dataset[ind]
    img = data['image']
    img = img.to(device=device)
    img = img.unsqueeze(dim=0)
    position_map, feature_maps = model(img)

    position_map = position_map.squeeze()  # must be (128,128)
    feature_maps = feature_maps.squeeze()  # should be (16,128,128)
    feature_maps = feature_maps.permute(1, 2, 0)  # should be (128,128,16)

    position_map = position_map.detach().cpu().numpy()
Example no. 10
def synthesis(args):
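    # Reference-conditioned Transformer-TTS inference: a separate stop-token
    # model decides when decoding ends, and encoder / decoder / style
    # attention maps are logged via the summary writer for each utterance.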
    m = Model()
    m_post = ModelPostNet()
    m_stop = ModelStopToken()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_stop.load_state_dict(load_checkpoint(args.restore_step3, "stop_token"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    m = m.cuda()
    m_post = m_post.cuda()
    m_stop = m_stop.cuda()
    m.train(False)
    m_post.train(False)
    m_stop.train(False)
    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn_transformer, drop_last=True, num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv)
    ref_dataloader = DataLoader(ref_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn_transformer, drop_last=True, num_workers=1)

    writer = get_writer(hp.checkpoint_path, hp.log_directory)

    ref_dataloader_iter = iter(ref_dataloader)
    for i, data in enumerate(test_dataloader):
        character, mel, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
        ref_character, ref_mel, ref_mel_input, ref_pos_text, ref_pos_mel, ref_text_length, ref_mel_length, ref_fname = next(ref_dataloader_iter)
        stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)  # computed but never used
        mel_input = t.zeros([1, 1, 80]).cuda()
        stop = []
        character = character.cuda()
        mel = mel.cuda()
        mel_input = mel_input.cuda()
        pos_text = pos_text.cuda()
        pos_mel = pos_mel.cuda()
        ref_character = ref_character.cuda()
        ref_mel = ref_mel.cuda()
        ref_mel_input = ref_mel_input.cuda()
        ref_pos_text = ref_pos_text.cuda()
        ref_pos_mel = ref_pos_mel.cuda()

        with t.no_grad():
            start = time.time()
            for step in range(args.max_len):
                pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
                mel_pred, postnet_pred, attn_probs, decoder_output, attns_enc, attns_dec, attns_style = m.forward(character, mel_input, pos_text, pos_mel, ref_mel, ref_pos_mel)
                stop_token = m_stop.forward(decoder_output)
                mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)
                stop.append(t.sigmoid(stop_token).squeeze(-1)[0, -1])
                if stop[-1] > 0.5:
                    print("stop token at " + str(step) + " is :", stop[-1])
                    print("model inference time: ", time.time() - start)
                    break
            if stop[-1] <= 0.5:  # never predicted a stop token; skip this utterance
                continue
            mag_pred = m_post.forward(postnet_pred)
            inf_time = time.time() - start
            print("inference time: ", inf_time)

        wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
        print("rtx : ", (len(wav)/hp.sr) / inf_time)
        wav_path = os.path.join(hp.sample_path, 'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        write(os.path.join(wav_path, "text_{}_ref_{}_synth.wav".format(fname, ref_fname)), hp.sr, wav)
        print("written as text{}_ref_{}_synth.wav".format(fname, ref_fname))
        attns_enc_new = []
        attns_dec_new = []
        attn_probs_new = []
        attns_style_new = []
        for layer in range(len(attns_enc)):
            attns_enc_new.append(attns_enc[layer].unsqueeze(0))
            attns_dec_new.append(attns_dec[layer].unsqueeze(0))
            attn_probs_new.append(attn_probs[layer].unsqueeze(0))
            attns_style_new.append(attns_style[layer].unsqueeze(0))
        attns_enc = t.cat(attns_enc_new, 0)
        attns_dec = t.cat(attns_dec_new, 0)
        attn_probs = t.cat(attn_probs_new, 0)
        attns_style = t.cat(attns_style_new, 0)

        # Stack the per-layer maps to (1, n_layers, n_heads, T_q, T_k) for the writer.
        attns_enc = attns_enc.contiguous().view(attns_enc.size(0), 1, hp.n_heads, attns_enc.size(2), attns_enc.size(3))
        attns_enc = attns_enc.permute(1, 0, 2, 3, 4)
        attns_dec = attns_dec.contiguous().view(attns_dec.size(0), 1, hp.n_heads, attns_dec.size(2), attns_dec.size(3))
        attns_dec = attns_dec.permute(1, 0, 2, 3, 4)
        attn_probs = attn_probs.contiguous().view(attn_probs.size(0), 1, hp.n_heads, attn_probs.size(2), attn_probs.size(3))
        attn_probs = attn_probs.permute(1, 0, 2, 3, 4)
        attns_style = attns_style.contiguous().view(attns_style.size(0), 1, hp.n_heads, attns_style.size(2), attns_style.size(3))
        attns_style = attns_style.permute(1, 0, 2, 3, 4)

        save_dir = os.path.join(hp.sample_path, 'figure', "text_{}_ref_{}_synth.wav".format(fname, ref_fname))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(), attns_dec.detach().cpu(), attn_probs.detach().cpu(), attns_style.detach().cpu(), mel_length, text_length, args.restore_step1, 'Validation', save_dir)
Example no. 11
def synthesis(args):
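    # Inference with a duration predictor / length regulator and a
    # reference-mel style encoder; the decoder still runs autoregressively
    # behind a sliding key-value mask. Waveforms are generated three ways:
    # a flow-based vocoder (SmartVocoder) on the predicted mel, the same
    # vocoder on a mel derived from the predicted linear spectrogram, and
    # HiFi-GAN.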
    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m = m.cuda()
    m.train(False)
    vocoder = SmartVocoder(Hyperparameters(parse_args()))
    vocoder.load_state_dict(
        t.load('./mel2audio/merged_STFT_checkpoint.pth')["state_dict"])
    vocoder = vocoder.cuda()
    vocoder.eval()
    with open('./hifi_gan/config.json') as f:
        data = f.read()
    json_config = json.loads(data)
    h = AttrDict(json_config)
    hifi_gan = Generator(h).cuda()
    state_dict_g = t.load('./hifi_gan/g_00334000', map_location='cuda')
    hifi_gan.load_state_dict(state_dict_g['generator'])
    hifi_gan.eval()
    hifi_gan.remove_weight_norm()

    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 collate_fn=collate_fn_transformer,
                                 drop_last=True,
                                 num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv_shuf)
    ref_dataloader = DataLoader(ref_dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=1)

    writer = get_writer(hp.checkpoint_path, hp.log_directory)

    mel_basis = t.from_numpy(
        librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels,
                            fmin=50, fmax=11000)).unsqueeze(0)  # (1, n_mels, 1 + n_fft // 2)

    ref_dataloader_iter = iter(ref_dataloader)
    _, ref_mel, _, _, _, ref_pos_mel, _, _, ref_fname = next(
        ref_dataloader_iter)

    for i, data in enumerate(test_dataloader):
        character, _, _, _, pos_text, _, text_length, _, fname = data
        mel_input = t.zeros([1, 1, 80]).cuda()
        character = character.cuda()
        ref_mel = ref_mel.cuda()
        mel_input = mel_input.cuda()
        pos_text = pos_text.cuda()
        with t.no_grad():
            start = time.time()
            memory, c_mask, attns_enc, duration_mask = m.encoder(character,
                                                                 pos=pos_text)
            style, coarse_emb = m.ref_encoder(ref_mel)
            memory = t.cat((memory, coarse_emb.expand(-1, memory.size(1), -1)),
                           -1)
            memory = m.memory_coarse_layer(memory)
            duration_predictor_output = m.duration_predictor(
                memory, duration_mask)
            duration = t.ceil(duration_predictor_output)
            duration = duration * duration_mask

            monotonic_interpolation, pos_mel_, weights = m.length_regulator(
                memory, duration, duration_mask)
            kv_mask = t.zeros([1, mel_input.size(1),
                               character.size(1)]).cuda()  # B, t', N
            kv_mask[:, :, :3] = 1
            kv_mask = kv_mask.eq(0)
            stop_flag = False
            ctr = 0
            for j in range(1200):
                pos_mel = t.arange(1,
                                   mel_input.size(1) + 1).unsqueeze(0).cuda()
                mel_pred, postnet_pred, attn_probs, decoder_output, attns_dec, attns_style = m.decoder(
                    memory,
                    style,
                    mel_input,
                    c_mask,
                    pos=pos_mel,
                    ref_pos=ref_pos_mel,
                    mono_inter=monotonic_interpolation[:, :mel_input.shape[1]],
                    kv_mask=kv_mask)
                mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)
                if stop_flag and ctr == 10:  # decode 10 extra frames after the stop flag fires
                    break
                elif stop_flag:
                    ctr += 1
                kv_mask, stop_flag = update_kv_mask(
                    kv_mask, attn_probs)  # B, t', N --> B, t'+1, N
            # pad 5 zero frames at the end before running the postnet
            postnet_pred = t.cat((postnet_pred,
                                  t.zeros(postnet_pred.size(0), 5,
                                          postnet_pred.size(-1)).cuda()), 1)
            gen_length = mel_input.size(1)
            post_linear = m.postnet(postnet_pred)
            post_linear = resample(post_linear,
                                   seq_len=mel_input.size(1),
                                   scale=args.rhythm_scale)
            postnet_pred = resample(mel_input,
                                    seq_len=mel_input.size(1),
                                    scale=args.rhythm_scale)
            inf_time = time.time() - start
            print("inference time: ", inf_time)

            postnet_pred_v = postnet_pred.transpose(2, 1)
            postnet_pred_v = (postnet_pred_v * 100 + 20 - 100) / 20  # undo dB normalization (max_db=100, ref_db=20)
            B, C, T = postnet_pred_v.shape
            z = t.randn(1, 1, T * hp.hop_length).cuda()
            z = z * 0.6  # sampling temperature
            t.cuda.synchronize()
            timestemp = time.time()
            with t.no_grad():
                y_gen = vocoder.reverse(z, postnet_pred_v).squeeze()
            t.cuda.synchronize()
            print('{} seconds'.format(time.time() - timestemp))
            wav = y_gen.to(t.device("cpu")).data.numpy()
            wav = np.pad(
                wav, [0, 4800], mode='constant',
                constant_values=0)  #pad 0 for 0.21 sec silence at the end

            post_linear_v = post_linear.transpose(1, 2)
            post_linear_v = 10**((post_linear_v * 100 + 20 - 100) / 20)  # back to linear magnitude
            mel_basis = mel_basis.repeat(post_linear_v.shape[0], 1, 1)
            post_linear_mel_v = t.log10(t.bmm(mel_basis.cuda(), post_linear_v))
            B, C, T = post_linear_mel_v.shape
            z = t.randn(1, 1, T * hp.hop_length).cuda()
            z = z * 0.6  # sampling temperature
            t.cuda.synchronize()
            timestemp = time.time()
            with t.no_grad():
                y_gen_linear = vocoder.reverse(z, post_linear_mel_v).squeeze()
            t.cuda.synchronize()
            wav_linear = y_gen_linear.to(t.device("cpu")).data.numpy()
            wav_linear = np.pad(
                wav_linear, [0, 4800], mode='constant',
                constant_values=0)  #pad 0 for 0.21 sec silence at the end

            wav_hifi = hifi_gan(post_linear_mel_v).squeeze().clamp(
                -1, 1).detach().cpu().numpy()
            wav_hifi = np.pad(
                wav_hifi, [0, 4800], mode='constant',
                constant_values=0)  #pad 0 for 0.21 sec silence at the end

        mel_path = os.path.join(hp.sample_path + '_' + str(args.rhythm_scale),
                                'mel')
        if not os.path.exists(mel_path):
            os.makedirs(mel_path)
        np.save(
            os.path.join(
                mel_path,
                'text_{}_ref_{}_synth_{}.mel'.format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            postnet_pred.cpu())

        linear_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'linear')
        if not os.path.exists(linear_path):
            os.makedirs(linear_path)
        np.save(
            os.path.join(
                linear_path, 'text_{}_ref_{}_synth_{}.linear'.format(
                    i, ref_fname, str(args.rhythm_scale))), post_linear.cpu())

        wav_path = os.path.join(hp.sample_path + '_' + str(args.rhythm_scale),
                                'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        write(
            os.path.join(
                wav_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav)
        print("rtx : ", (len(wav) / hp.sr) / inf_time)

        wav_linear_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'wav_linear')
        if not os.path.exists(wav_linear_path):
            os.makedirs(wav_linear_path)
        write(
            os.path.join(
                wav_linear_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav_linear)

        wav_hifi_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'wav_hifi')
        if not os.path.exists(wav_hifi_path):
            os.makedirs(wav_hifi_path)
        write(
            os.path.join(
                wav_hifi_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav_hifi)

        show_weights = weights.contiguous().view(weights.size(0), 1, 1,
                                                 weights.size(1),
                                                 weights.size(2))
        attns_enc_new = []
        attns_dec_new = []
        attn_probs_new = []
        attns_style_new = []
        for layer in range(len(attns_enc)):
            attns_enc_new.append(attns_enc[layer].unsqueeze(0))
            attns_dec_new.append(attns_dec[layer].unsqueeze(0))
            attn_probs_new.append(attn_probs[layer].unsqueeze(0))
            attns_style_new.append(attns_style[layer].unsqueeze(0))
        attns_enc = t.cat(attns_enc_new, 0)
        attns_dec = t.cat(attns_dec_new, 0)
        attn_probs = t.cat(attn_probs_new, 0)
        attns_style = t.cat(attns_style_new, 0)

        attns_enc = attns_enc.contiguous().view(attns_enc.size(0), 1,
                                                hp.n_heads, attns_enc.size(2),
                                                attns_enc.size(3))
        attns_enc = attns_enc.permute(1, 0, 2, 3, 4)
        attns_dec = attns_dec.contiguous().view(attns_dec.size(0), 1,
                                                hp.n_heads, attns_dec.size(2),
                                                attns_dec.size(3))
        attns_dec = attns_dec.permute(1, 0, 2, 3, 4)
        attn_probs = attn_probs.contiguous().view(attn_probs.size(0),
                                                  1, hp.n_heads,
                                                  attn_probs.size(2),
                                                  attn_probs.size(3))
        attn_probs = attn_probs.permute(1, 0, 2, 3, 4)
        attns_style = attns_style.contiguous().view(attns_style.size(0), 1,
                                                    hp.n_heads,
                                                    attns_style.size(2),
                                                    attns_style.size(3))
        attns_style = attns_style.permute(1, 0, 2, 3, 4)

        save_dir = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'figure',
            "text_{}_ref_{}_synth_{}.wav".format(fname, ref_fname,
                                                 str(args.rhythm_scale)))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(),
                              attns_dec.detach().cpu(),
                              attn_probs.detach().cpu(),
                              attns_style.detach().cpu(),
                              show_weights.detach().cpu(),
                              [t.tensor(gen_length).type(t.LongTensor)],
                              text_length, args.restore_step1, 'Inference',
                              save_dir)
Example no. 12
def random_run(acquisition_iterations, X_Pool, y_Pool, pool_subset,
               dropout_iterations, nb_classes, Queries, X_test, y_test, rep,
               X_old, y_old, device, itr, cuda, g):
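    # Random-acquisition baseline for active learning: each iteration samples
    # a random pool subset, moves Queries random points into the training set,
    # retrains, and records test accuracy in AA.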
    mod = Model().to(device)
    if cuda:
        cp = torch.load(rep)
        print("\n ********load gpu version******* \n")
    else:
        cp = torch.load(rep, map_location='cpu')
    mod.load_state_dict(cp['model_state_dict'])
    optimizer = optim.Adam(mod.parameters(), lr=0.001, weight_decay=0.5)
    optimizer.load_state_dict(cp['optimizer_state_dict'])
    criterion = nn.CrossEntropyLoss()
    X_train = np.empty([0, 1, 28, 28])
    y_train = np.empty([0])
    AA = []
    losses_train = []
    acc = test(X_test, y_test, mod, device, cuda)
    AA.append(acc)
    print('initial test accuracy: ', acc)
    for i in range(acquisition_iterations):
        pool_subset_dropout = np.asarray(
            random.sample(range(0, X_Pool.shape[0]), pool_subset))
        X_Pool_Dropout = X_Pool[pool_subset_dropout, :, :, :]
        y_Pool_Dropout = y_Pool[pool_subset_dropout]

        x_pool_index = np.random.choice(X_Pool_Dropout.shape[0],
                                        Queries,
                                        replace=False)
        Pooled_X = X_Pool_Dropout[x_pool_index, :, :, :]
        Pooled_Y = y_Pool_Dropout[x_pool_index]

        delete_Pool_X = np.delete(X_Pool, (pool_subset_dropout), axis=0)
        delete_Pool_Y = np.delete(y_Pool, (pool_subset_dropout), axis=0)

        delete_Pool_X_Dropout = np.delete(X_Pool_Dropout, (x_pool_index),
                                          axis=0)
        delete_Pool_Y_Dropout = np.delete(y_Pool_Dropout, (x_pool_index),
                                          axis=0)

        X_Pool = np.concatenate((delete_Pool_X, delete_Pool_X_Dropout), axis=0)
        y_Pool = np.concatenate((delete_Pool_Y, delete_Pool_Y_Dropout), axis=0)
        print('updated pool size is ', X_Pool.shape[0])

        X_train = np.concatenate((X_train, Pooled_X), axis=0)
        y_train = np.concatenate((y_train, Pooled_Y), axis=0)
        print('number of data points from pool', X_train.shape[0])

        batch_size = 100
        X = np.vstack((X_old, Pooled_X))
        y = np.hstack((y_old, Pooled_Y))
        X, y = shuffle(X, y)
        num_batch = X.shape[0] // batch_size
        print("number of batch: ", num_batch)
        mod.train()
        for h in range(itr):
            losses = 0
            for j in range(num_batch):
                slce = get_slice(j, batch_size)
                X_fog_ = torch.from_numpy(X[slce]).float().to(device)
                y_fog_ = torch.from_numpy(y[slce]).long().to(device)
                optimizer.zero_grad()
                out = mod(X_fog_)
                train_loss = criterion(out, y_fog_)
                losses += train_loss.item()  # .item() avoids retaining the autograd graph
                train_loss.backward()
                optimizer.step()
            losses_train.append(losses / num_batch)
        acc = test(X_test, y_test, mod, device, cuda)
        print('test accuracy: ', acc)
        AA.append(acc)
    torch.save(
        {
            'model_state_dict': mod.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, g)
    return AA, mod, X_train, y_train, losses_train, optimizer
Example no. 13
# (snippet truncated at the top: the opening of the `parameters` dict is missing)
        'GPUs': 0
    }
    if isinstance(parameters['GPUs'], int):
        parameters['GPUs'] = (parameters['GPUs'], )

    testset = MyDataset(
        filelist='../dataset/wp1_real.txt',
        input_transform=transforms.Compose([
            Resize((300, 300)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    )

    model = Model(parameters['n_classes'])
    model.load_state_dict(torch.load('wp1-cold.pth'))

    if parameters['GPUs']:
        model = model.cuda(parameters['GPUs'][0])
        if len(parameters['GPUs']) > 1:
            model = nn.DataParallel(model, device_ids=parameters['GPUs'])

    model.eval()

    all_features, all_outputs, all_preds, all_labels = predict(model, testset, **parameters)

    recall = np.sum(all_preds == all_labels) / float(len(testset))  # overall accuracy
    ap = AP(all_outputs, all_labels)
    mean_ap = meanAP(all_outputs, all_labels)

    print('Mean Recall: ', recall)
Example no. 14
import torch
import torchvision
from network import Model
from PIL import Image

PATH = './state_dict.pth'
state_dict = torch.load(PATH, map_location='cpu')  # everything below runs on CPU

model = Model()
model = torch.nn.DataParallel(model)
model.load_state_dict(state_dict)
model.eval()  # inference only: disable dropout / use BatchNorm running stats
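# Decode a batch of 64 random 10-dim latent vectors through fc3/fc4, reshape
# to 10-channel 7x7 feature maps, run the decoder, and save the 28x28 outputs
# as an image grid.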

with torch.no_grad():
    sample = torch.randn(64, 10)
    data = model.module.fc4(model.module.fc3(sample)).view(64, 10, 7, 7)
    data = model.module.decoder(data)
    torchvision.utils.save_image(data.view(64, 1, 28, 28), 'result2.png')

# sample = model.module.decoder(model.module.fc2(sample).view(64, 128, 7, 7)).cpu()
# torchvision.utils.save_image(sample.data.view(64, 1, 28, 28), 'result/sample_' + str(1) + '.png')
Example no. 15
validation_split = .1
shuffle_dataset = True
random_seed = 42

# Creating data indices for training and validation splits:
dataset_size = len(my_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices = indices[split:]

# load model
model = Model().to(device=device)
model.load_state_dict(
    torch.load('stats/model_saved.pth', map_location=torch.device(device)))
model = model.float()
model.eval()  # presumably inference, as in Example no. 9; disable dropout / BN updates

for ind in train_indices:
    data = my_dataset[ind]
    img = data['image']
    position_target = data['point_map']

    img = img.to(device=device)
    position_target = position_target.to(device=device)

    img = img.unsqueeze(dim=0)
    position_target = position_target.unsqueeze(dim=0)

    pred = model(img)
    logits = pred['logits']