Example #1
0
# Read the evaluation manifest: one row per image, with its file name and
# ground-truth label.
data = pd.read_csv(csv_path)
paths = data["ImageId"].values
paths = [os.path.join(img_dir, p) for p in paths]
labels = data["TrueLabel"].values

# Build the autoencoder pair: encoder maps `channels`-channel images to a
# 2048-dim code, decoder maps the code back.  (Architectures defined elsewhere.)
encoder = Encoder(channels, out_ch=2048)
decoder = Decoder(2048, channels)

# Load pretrained weights onto CPU first; models are moved to the target
# device below, so this works on both CPU-only and GPU machines.
encoder.load_state_dict(torch.load(config["encoder"], map_location="cpu"))
decoder.load_state_dict(torch.load(config["decoder"], map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Inference only: switch off dropout/batch-norm training behavior and
# disable gradient tracking for the whole loop.
encoder.eval()
decoder.eval()
x_adv = []  # accumulator; presumably filled later in the loop (continues past this view)
with torch.no_grad():
    bar = tqdm.tqdm(paths)
    for path in bar:
        filename = os.path.basename(path)
        # NOTE(review): this f-string has no placeholder, so `filename` above
        # is unused — likely meant f"processing:{filename}"; confirm intent.
        bar.set_description(f"processing:(unknown)")
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV reads BGR; convert to RGB
        h, w = image.shape[:2]  # original size, presumably kept for resizing back — confirm
        # NOTE(review): this Compose is loop-invariant and could be hoisted
        # above the loop; left in place here to keep behavior byte-identical.
        norm = Compose([
            Resize(img_size, img_size, always_apply=True),
            Normalize(mean=means, std=std, always_apply=True)
        ])
        norm_data = norm(image=image)
Example #2
0
def inference(checkpoint_file, text):
    """Synthesize speech for *text* with a trained Tacotron-style model.

    Loads encoder / attention-decoder / postnet weights from
    ``checkpoint_file``, runs the text through the network autoregressively
    to predict a spectrogram, reconstructs a waveform from it, and writes
    the result to ``demo.wav``.

    Args:
        checkpoint_file: Path to a torch checkpoint containing 'encoder',
            'decoder' and 'postnet' state dicts.
        text: Input text to synthesize.
    """
    ds = tiny_words(max_text_length=hp.max_text_length,
                    max_audio_length=hp.max_audio_length,
                    max_dataset_size=args.data_size)

    print(ds.texts)

    # Prepare input: character indexes + end-of-text token, padded to the
    # fixed text length the model was trained with.
    indexes = indexes_from_text(ds.lang, text)
    indexes.append(EOT_token)
    padded_indexes = pad_indexes(indexes, hp.max_text_length, PAD_token)
    texts_v = Variable(torch.from_numpy(padded_indexes))
    texts_v = texts_v.unsqueeze(0)  # add batch dimension -> (1, max_text_length)

    if hp.use_cuda:
        texts_v = texts_v.cuda()

    encoder = Encoder(ds.lang.num_chars,
                      hp.embedding_dim,
                      hp.encoder_bank_k,
                      hp.encoder_bank_ck,
                      hp.encoder_proj_dims,
                      hp.encoder_highway_layers,
                      hp.encoder_highway_units,
                      hp.encoder_gru_units,
                      dropout=hp.dropout,
                      use_cuda=hp.use_cuda)

    decoder = AttnDecoder(hp.max_text_length,
                          hp.attn_gru_hidden_size,
                          hp.n_mels,
                          hp.rf,
                          hp.decoder_gru_hidden_size,
                          hp.decoder_gru_layers,
                          dropout=hp.dropout,
                          use_cuda=hp.use_cuda)

    postnet = PostNet(hp.n_mels,
                      1 + hp.n_fft // 2,  # linear-spectrogram bins
                      hp.post_bank_k,
                      hp.post_bank_ck,
                      hp.post_proj_dims,
                      hp.post_highway_layers,
                      hp.post_highway_units,
                      hp.post_gru_units,
                      use_cuda=hp.use_cuda)

    # Inference mode: disable dropout etc.
    encoder.eval()
    decoder.eval()
    postnet.eval()

    if hp.use_cuda:
        encoder.cuda()
        decoder.cuda()
        postnet.cuda()

    # Load model weights.  map_location keeps a GPU-saved checkpoint loadable
    # on a CPU-only machine (the original bare torch.load crashed there).
    checkpoint = torch.load(checkpoint_file,
                            map_location=None if hp.use_cuda else "cpu")
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    postnet.load_state_dict(checkpoint['postnet'])

    # Pure inference: run the whole forward pass without building an autograd
    # graph — the original version accumulated gradients history for every
    # decoder step, wasting memory and time.
    with torch.no_grad():
        encoder_out = encoder(texts_v)

        # Start the autoregressive decoder from an all-zero "GO" frame.
        GO_frame = np.zeros((1, hp.n_mels))
        decoder_in = Variable(torch.from_numpy(GO_frame).float())
        if hp.use_cuda:
            decoder_in = decoder_in.cuda()
        h, hs = decoder.init_hiddens(1)

        decoder_outs = []
        for t in range(int(hp.max_audio_length / hp.rf)):
            decoder_out, h, hs, _ = decoder(decoder_in, h, hs, encoder_out)
            decoder_outs.append(decoder_out)
            # Feed the last predicted frame back as the next input
            # (no teacher forcing at inference time).
            decoder_in = decoder_out[:, -1, :].contiguous()

        # Concatenate per-step outputs along time: (batch_size, T, n_mels).
        decoder_outs = torch.cat(decoder_outs, 1)

        # Postnet maps mel frames to the linear spectrogram.
        post_out = postnet(decoder_outs)

    s = post_out[0].cpu().data.numpy()

    print("Reconstructing wav...")
    # Clamp negative magnitudes before the power/Griffin-Lim reconstruction.
    s = np.where(s < 0, 0, s)
    wav = spectrogram2wav(s**hp.power)
    write("demo.wav", hp.sr, wav)