def encode_dataset(cfg):
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(root_path / "test.json") as file:
        metadata = json.load(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])

    encoder.eval()

    if cfg.save_auxiliary:
        auxiliary = []

        def hook(module, input, output):
            auxiliary.append(output.clone())

        encoder.encoder[-1].register_forward_hook(hook)

    for _, _, _, path in tqdm(metadata):
        path = root_path.parent / path
        mel = torch.from_numpy(np.load(
            path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
        with torch.no_grad():
            z, c, indices = encoder.encode(mel)

        z = z.squeeze().cpu().numpy()

        out_path = out_dir / path.stem
        with open(out_path.with_suffix(".txt"), "w") as file:
            np.savetxt(file, z, fmt="%.16f")

        if cfg.save_auxiliary:
            aux_path = out_dir.parent / "auxiliary_embedding1"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            c = c.squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, c, fmt="%.16f")

            aux_path = out_dir.parent / "auxiliary_embedding2"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            aux = auxiliary.pop().squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, aux, fmt="%.16f")
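
The save_auxiliary branch above captures an intermediate activation with a PyTorch forward hook. A minimal self-contained sketch of the same pattern (the toy network here is illustrative, not from this repo):

import torch
import torch.nn as nn

captured = []

def hook(module, inputs, output):
    # Keep a detached copy so the autograd graph is not retained.
    captured.append(output.detach().clone())

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
handle = net[1].register_forward_hook(hook)  # tap the ReLU output

with torch.no_grad():
    net(torch.randn(3, 4))

print(captured[0].shape)  # torch.Size([3, 8])
handle.remove()  # remove the hook once it is no longer needed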
Example #2
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(
            wav_path.with_suffix(".wav"),
            sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32), sr=cfg.preprocessing.sr)
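
convert() calls a preemphasis helper that is not shown. A common definition (an assumption; this repo's version may differ) is a first-order high-pass filter:

from scipy.signal import lfilter

def preemphasis(x, coef=0.97):
    # y[t] = x[t] - coef * x[t - 1]
    return lfilter([1.0, -coef], [1.0], x)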
Example #3
def get_encoder(latent_dim, fckpt=''):
    E = Encoder(latent_dim)
    if fckpt and os.path.exists(fckpt):
        ckpt = torch.load(fckpt)
        loaded_sd = ckpt['E']
        try:
            E.load_state_dict(loaded_sd)
        except RuntimeError:
            curr_params = E.state_dict()
            curr_keys = list(curr_params.keys())

            updated_params = {}
            for k, v in loaded_sd.items():
                if 'bn7' in k:
                    newk = k.replace('bn7', 'conv7')
                else:
                    newk = k
                if newk in curr_keys and loaded_sd[k].shape == curr_params[
                        newk].shape:
                    updated_params[newk] = v
                else:
                    print('Failed to load:', k)
            curr_params.update(updated_params)
            E.load_state_dict(curr_params)
    return E.to(device)
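
Hypothetical usage (the path and latent size are placeholders): the fallback branch kicks in whenever the checkpoint layout has drifted from the current Encoder definition, loading every compatible parameter and reporting the rest.

E = get_encoder(latent_dim=128, fckpt='checkpoints/encoder_latest.pt')
E.eval()  # get_encoder moves the model to `device` but leaves it in train mode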
Example #4
class Model:
    def __init__(self, chpt_enc_path, chpt_dec_path, chpt_stat_path):
        historyLength = 10

        encoder_dim = hiddenDimension
        lstm_input_dim = historyLength + 1
        decoder_dim = hiddenDimension
        attention_dim = hiddenDimension
        output_dim = 1

        self.decodeLength = 20

        self.encoder = Encoder()
        self.decoder = DecoderWithAttention(encoder_dim, lstm_input_dim, decoder_dim, attention_dim, output_dim)

        self.encoder.load_state_dict(torch.load(chpt_enc_path))
        self.decoder.load_state_dict(torch.load(chpt_dec_path))

        self.encoder = self.encoder.to(device)
        self.decoder = self.decoder.to(device)

        self.encoder.eval()
        self.decoder.eval()

        with open(chpt_stat_path, 'rb') as f:
            chpt_stat = pickle.load(f)

        self.cMean = chpt_stat['cMean_tr']
        self.cStd = chpt_stat['cStd_tr']

        self.vMean = chpt_stat['vMean_tr']
        self.vStd = chpt_stat['vStd_tr']

        self.aMean = chpt_stat['aMean_tr']
        self.aStd = chpt_stat['aStd_tr']

        self.mean = torch.Tensor([self.vMean, self.aMean]).to(device)
        self.std = torch.Tensor([self.vStd, self.aStd]).to(device)

    def predict(self, curvatures, currentSpeed, histSpeeds, currentAccelX, histAccelXs):
        curvatures = torch.FloatTensor(curvatures).to(device)

        currentSpeed = torch.FloatTensor([currentSpeed]).to(device)
        histSpeeds = torch.FloatTensor(histSpeeds).to(device)

        currentAccelX = torch.FloatTensor([currentAccelX]).to(device)
        histAccelXs = torch.FloatTensor(histAccelXs).to(device)

        curvatures = (curvatures - self.cMean) / self.cStd
        currentSpeed = (currentSpeed - self.vMean) / self.vStd
        histSpeeds = (histSpeeds - self.vMean) / self.vStd
        currentAccelX = (currentAccelX - self.aMean) / self.aStd
        histAccelXs = (histAccelXs - self.aMean) / self.aStd

        curvatures = self.encoder(curvatures.unsqueeze(dim=0).unsqueeze(dim=0))
        predictions, alphas, alphas_target = self.decoder(curvatures, currentSpeed, histSpeeds.unsqueeze(dim=0), currentAccelX, histAccelXs.unsqueeze(dim=0),
                                    self.decodeLength, self.vMean, self.vStd, self.aMean, self.aStd)

        return (predictions.squeeze()*self.aStd + self.aMean).cpu().detach().numpy(), alphas.squeeze().cpu().detach().numpy()
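
A hypothetical call, assuming the module-level globals (hiddenDimension, device) are defined and with made-up input shapes; the file names are placeholders:

import numpy as np

model = Model('encoder.pth', 'decoder.pth', 'stats.pkl')
accel_pred, attention = model.predict(
    curvatures=np.zeros(200),          # curvature profile ahead (length assumed)
    currentSpeed=15.0, histSpeeds=[15.0] * 10,
    currentAccelX=0.0, histAccelXs=[0.0] * 10)
print(accel_pred.shape)                # expected: (decodeLength,) == (20,)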
Example #5
def initialize_for_test(params):
    data_loader = get_loader(params, mode='test')
    encoder_file = os.path.join(params.encoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    decoder_file = os.path.join(params.decoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = Encoder(params)
    decoder = Decoder(params, vocab_size)
    encoder.eval()
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))
    encoder.to(params.device)
    decoder.to(params.device)
    return data_loader, encoder, decoder
Example #6

def load_encoder(data_root, weight_path, device):
    encoder = Encoder()
    if weight_path:
        weight = torch.load(weight_path)
    else:
        weight = torch.load(get_best_weight(data_root))
    encoder.load_state_dict(weight)

    if device >= 0:
        encoder = encoder.to(f"cuda:{device}")
    encoder.eval()
    return encoder
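
load_encoder falls back to a get_best_weight helper that is not shown. One plausible implementation (an assumption: checkpoints live under data_root with a sortable step number in the filename):

from pathlib import Path

def get_best_weight(data_root):
    # Assumption: weights are saved as e.g. "encoder_000123.pth", so the
    # lexicographically largest name is the latest/best checkpoint.
    weights = sorted(Path(data_root).glob("encoder_*.pth"))
    if not weights:
        raise FileNotFoundError(f"no encoder weights found under {data_root}")
    return str(weights[-1])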
Example #7
def main(data_name):
    dataset = MyDataSet(data_name=data_name, reset=False)
    vocab_size = dataset.vocab_size
    corpus = dataset.corpus
    id2word = {v: k for k, v in corpus.items()}
    train_loader, val_loader = _get_data_loader(dataset, 0.5, batch_size)

    embedding, embed_dim = load_embedding(basic_settings['word2vec'], corpus)

    encoder = Encoder(dataset.feature_dim, output_dim=100)
    decoder = DecoderWithAttention(encoder.get_output_dim(),
                                   decoder_dim=100,
                                   attn_dim=100,
                                   embed_dim=embed_dim,
                                   vocab_size=vocab_size)
    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, decoder.parameters()),
                                         lr=lr)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)

    best_bleu4 = 0
    best_hypos = []
    best_refs = []

    for epoch in range(1, epoches + 1):
        # One epoch's training
        train_epoch(train_loader=train_loader,
                    encoder=encoder,
                    decoder=decoder,
                    criterion=criterion,
                    optimizer=decoder_optimizer,
                    epoch=epoch)

        # One epoch's validation
        bleu4_score, refs, hypos = validate(val_loader=val_loader,
                                            encoder=encoder,
                                            decoder=decoder,
                                            criterion=criterion,
                                            word2id=corpus)
        if bleu4_score > best_bleu4:
            best_bleu4 = bleu4_score
            best_refs = refs
            best_hypos = hypos
    name = data_name + '_' + str(best_bleu4) + '.xlsx'
    save_result(name, best_refs, best_hypos, id2word)
Example #8

def main(args):
    # Image preprocessing
    # In the generation phase we don't need random cropping, just resizing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build model
    encoder = Encoder(embed_size=args.embed_size).eval()
    decoder = Decoder(stateful=False,
                      embed_size=args.embed_size,
                      hidden_size=args.hidden_size,
                      vocab_size=len(vocab),
                      num_layers=args.num_layers).to(device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location=device))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<<end>>':
            break
    sentence = ' '.join(sampled_caption)
    print(sentence)
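
Both captioning examples rely on a load_image helper. A typical definition (an assumption based on the call sites: it returns a batched tensor the encoder can consume):

from PIL import Image

def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    image = image.resize((224, 224), Image.LANCZOS)  # target size assumed
    if transform is not None:
        image = transform(image).unsqueeze(0)  # add a batch dimension
    return image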
Example #9
def load_model(
    encoder_path,
    decoder_path,
    vocab_size,
    layer_type='gru',
    embed_size=256,
    hidden_size=512,
    num_layers=2,
):
    if layer_type == 'lstm':
        from model import Encoder, Decoder
    else:
        from model_gru import Encoder, Decoder

    # eval mode (batchnorm uses moving mean/variance)
    encoder = Encoder(embed_size).eval()
    decoder = Decoder(embed_size, hidden_size, vocab_size, num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))
    return encoder, decoder
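
Hypothetical usage (paths and vocabulary size are placeholders; device is assumed to be defined at module level, as in the snippet). Note that load_model only puts the encoder in eval mode, so the caller should handle the decoder:

encoder, decoder = load_model('encoder.ckpt', 'decoder.ckpt',
                              vocab_size=10000, layer_type='gru')
decoder.eval()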
Example #10
def train_model(cfg):
    tensorboard_path = Path(
        utils.to_absolute_path("tensorboard")) / cfg.checkpoint_dir
    checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir))
    writer = SummaryWriter(tensorboard_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    cpc = CPCLoss(**cfg.model.cpc)
    encoder.to(device)
    cpc.to(device)

    optimizer = optim.Adam(chain(encoder.parameters(), cpc.parameters()),
                           lr=cfg.training.scheduler.initial_lr)
    scheduler = WarmupScheduler(
        optimizer,
        warmup_epochs=cfg.training.scheduler.warmup_epochs,
        initial_lr=cfg.training.scheduler.initial_lr,
        max_lr=cfg.training.scheduler.max_lr,
        milestones=cfg.training.scheduler.milestones,
        gamma=cfg.training.scheduler.gamma)

    if cfg.resume:
        print("Resume checkpoint from: {}:".format(cfg.resume))
        resume_path = utils.to_absolute_path(cfg.resume)
        checkpoint = torch.load(resume_path,
                                map_location=lambda storage, loc: storage)
        encoder.load_state_dict(checkpoint["encoder"])
        cpc.load_state_dict(checkpoint["cpc"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        start_epoch = checkpoint["epoch"]
    else:
        start_epoch = 1

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    dataset = CPCDataset(
        root=root_path,
        n_sample_frames=cfg.training.sample_frames +
        cfg.training.n_prediction_steps,
        n_utterances_per_speaker=cfg.training.n_utterances_per_speaker,
        hop_length=cfg.preprocessing.hop_length,
        sr=cfg.preprocessing.sr)

    dataloader = DataLoader(dataset,
                            batch_size=cfg.training.n_speakers_per_batch,
                            shuffle=True,
                            num_workers=cfg.training.n_workers,
                            pin_memory=True,
                            drop_last=True)

    for epoch in range(start_epoch, cfg.training.n_epochs + 1):
        if epoch % cfg.training.log_interval == 0 or epoch == start_epoch:
            average_cpc_loss = average_vq_loss = average_perplexity = 0
            average_accuracies = np.zeros(cfg.training.n_prediction_steps // 2)

        for i, (mels, _) in enumerate(tqdm(dataloader), 1):
            mels = mels.to(device)
            mels = mels.view(
                cfg.training.n_speakers_per_batch *
                cfg.training.n_utterances_per_speaker,
                cfg.preprocessing.n_mels, -1)

            optimizer.zero_grad()

            z, c, vq_loss, perplexity = encoder(mels)
            cpc_loss, accuracy = cpc(z, c)
            loss = cpc_loss + vq_loss

            loss.backward()
            optimizer.step()

            average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i
            average_vq_loss += (vq_loss.item() - average_vq_loss) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i
            average_accuracies += (np.array(accuracy) - average_accuracies) / i

        scheduler.step()

        if epoch % cfg.training.log_interval == 0 and epoch != start_epoch:
            writer.add_scalar("cpc_loss/train", average_cpc_loss, epoch)
            writer.add_scalar("vq_loss/train", average_vq_loss, epoch)
            writer.add_scalar("perplexity/train", average_perplexity, epoch)

            print(
                "epoch:{}, cpc loss:{:.2E}, vq loss:{:.2E}, perplexity:{:.3f}".
                format(epoch, average_cpc_loss, average_vq_loss,
                       average_perplexity))
            print(100 * average_accuracies)

        if epoch % cfg.training.checkpoint_interval == 0 and epoch != start_epoch:
            save_checkpoint(encoder, cpc, optimizer, scheduler, epoch,
                            checkpoint_dir)
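
train_model depends on a WarmupScheduler that is not defined in the snippet. A minimal sketch with the same constructor signature (the behavior, linear warmup to max_lr followed by milestone decay, is an assumption):

from torch.optim.lr_scheduler import _LRScheduler

class WarmupScheduler(_LRScheduler):
    def __init__(self, optimizer, warmup_epochs, initial_lr, max_lr,
                 milestones, gamma, last_epoch=-1):
        self.warmup_epochs = warmup_epochs
        self.initial_lr = initial_lr
        self.max_lr = max_lr
        self.milestones = sorted(milestones)
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch < self.warmup_epochs:
            # Linear ramp from initial_lr up to max_lr.
            lr = self.initial_lr + (self.max_lr - self.initial_lr) \
                * self.last_epoch / self.warmup_epochs
        else:
            n_decays = sum(m <= self.last_epoch for m in self.milestones)
            lr = self.max_lr * self.gamma ** n_decays
        return [lr for _ in self.optimizer.param_groups]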
Example #11
def main(epoch_num, batch_size, verbose, UNSEEN, SEEN, MODE):
    [
        hownet_file, sememe_file, word_index_file, word_vector_file,
        dictionary_file, word_cilinClass_file
    ] = [
        'hownet.json', 'sememe.json', 'word_index.json', 'word_vector.npy',
        'dictionary_sense.json', 'word_cilinClass.json'
    ]
    word2index, index2word, word2vec, sememe_num, label_size, label_size_chara, word_defi_idx_all = load_data(
        hownet_file, sememe_file, word_index_file, word_vector_file,
        dictionary_file, word_cilinClass_file)
    (word_defi_idx_TrainDev, word_defi_idx_seen, word_defi_idx_test2000,
     word_defi_idx_test200, word_defi_idx_test272) = word_defi_idx_all
    index2word = np.array(index2word)
    length = len(word_defi_idx_TrainDev)
    valid_dataset = MyDataset(word_defi_idx_TrainDev[int(0.9 * length):])
    test_dataset = MyDataset(word_defi_idx_test2000 + word_defi_idx_test200 +
                             word_defi_idx_test272)
    if SEEN:
        mode = 'S_' + MODE
        print('*METHOD: Seen defi.')
        print('*TRAIN: [Train + allSeen(2000+200+272)]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)] +
                                  word_defi_idx_seen)
    elif UNSEEN:
        mode = 'U_' + MODE
        print('*METHOD: Unseen All words and defi.')
        print('*TRAIN: [Train]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)])
    print('*MODE: [%s]' % mode)

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=my_collate_fn_test)

    print('Train dataset: ', len(train_dataset))
    print('Valid dataset: ', len(valid_dataset))
    print('Test dataset: ', len(test_dataset))
    word_defi_idx = word_defi_idx_TrainDev + word_defi_idx_seen

    wd2sem = word2sememe(word_defi_idx, len(word2index), sememe_num)
    wd_sems = label_multihot(wd2sem, sememe_num)
    wd_sems = torch.from_numpy(np.array(wd_sems[:label_size])).to(device)
    wd_POSs = label_multihot(word2POS(word_defi_idx, len(word2index), 13), 13)
    wd_POSs = torch.from_numpy(np.array(wd_POSs[:label_size])).to(device)
    wd_charas = label_multihot(
        word2chara(word_defi_idx, len(word2index), label_size_chara),
        label_size_chara)
    wd_charas = torch.from_numpy(np.array(wd_charas[:label_size])).to(device)
    wd2Cilin1 = word2Cn(word_defi_idx, len(word2index), 'C1', 13)
    wd_C1 = label_multihot(wd2Cilin1, 13)  #13 96 1426 4098
    wd_C1 = torch.from_numpy(np.array(wd_C1[:label_size])).to(device)
    wd_C2 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C2', 96),
                           96)
    wd_C2 = torch.from_numpy(np.array(wd_C2[:label_size])).to(device)
    wd_C3 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C3', 1426),
                           1426)
    wd_C3 = torch.from_numpy(np.array(wd_C3[:label_size])).to(device)
    wd_C4 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C4', 4098),
                           4098)
    wd_C4 = torch.from_numpy(np.array(wd_C4[:label_size])).to(device)
    '''wd2Cilin = word2Cn(word_defi_idx, len(word2index), 'C', 5633)
    wd_C0 = label_multihot(wd2Cilin, 5633) 
    wd_C0 = torch.from_numpy(np.array(wd_C0[:label_size])).to(device)
    wd_C = [wd_C1, wd_C2, wd_C3, wd_C4, wd_C0]
    '''
    wd_C = [wd_C1, wd_C2, wd_C3, wd_C4]
    #----------mask of no sememes
    print('calculating mask of no sememes...')
    mask_s = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        sems = set(wd2sem[i].detach().cpu().numpy().tolist()) - set(
            [sememe_num])
        if len(sems) == 0:
            mask_s[i] = 1

    mask_c = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        cc = set(wd2Cilin1[i].detach().cpu().numpy().tolist()) - set([13])
        if len(cc) == 0:
            mask_c[i] = 1

    model = Encoder(vocab_size=len(word2index),
                    embed_dim=word2vec.shape[1],
                    hidden_dim=200,
                    layers=1,
                    class_num=label_size,
                    sememe_num=sememe_num,
                    chara_num=label_size_chara)
    model.embedding.weight.data = torch.from_numpy(word2vec)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam
    best_valid_accu = 0
    DEF_UPDATE = True
    for epoch in range(epoch_num):
        print('epoch: ', epoch)
        model.train()
        train_loss = 0
        label_list = list()
        pred_list = list()
        for words_t, sememes_t, definition_words_t, POS_t, sememes, POSs, charas_t, C, C_t in tqdm(
                train_dataloader, disable=verbose):
            optimizer.zero_grad()
            loss, _, indices = model('train',
                                     x=definition_words_t,
                                     w=words_t,
                                     ws=wd_sems,
                                     wP=wd_POSs,
                                     wc=wd_charas,
                                     wC=wd_C,
                                     msk_s=mask_s,
                                     msk_c=mask_c,
                                     mode=MODE)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            predicted = indices[:, :100].detach().cpu().numpy().tolist()
            train_loss += loss.item()
            label_list.extend(words_t.detach().cpu().numpy())
            pred_list.extend(predicted)
        train_accu_1, train_accu_10, train_accu_100 = evaluate(
            label_list, pred_list)
        del label_list
        del pred_list
        gc.collect()
        print('train_loss: ', train_loss / len(train_dataset))
        print('train_accu(1/10/100): %.2f %.2f %.2f' %
              (train_accu_1, train_accu_10, train_accu_100))
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            label_list = []
            pred_list = []
            for words_t, sememes_t, definition_words_t, POS_t, sememes, POSs, charas_t, C, C_t in tqdm(
                    valid_dataloader, disable=verbose):
                loss, _, indices = model('train',
                                         x=definition_words_t,
                                         w=words_t,
                                         ws=wd_sems,
                                         wP=wd_POSs,
                                         wc=wd_charas,
                                         wC=wd_C,
                                         msk_s=mask_s,
                                         msk_c=mask_c,
                                         mode=MODE)
                predicted = indices[:, :100].detach().cpu().numpy().tolist()
                valid_loss += loss.item()
                label_list.extend(words_t.detach().cpu().numpy())
                pred_list.extend(predicted)
            valid_accu_1, valid_accu_10, valid_accu_100 = evaluate(
                label_list, pred_list)
            print('valid_loss: ', valid_loss / len(valid_dataset))
            print('valid_accu(1/10/100): %.2f %.2f %.2f' %
                  (valid_accu_1, valid_accu_10, valid_accu_100))
            del label_list
            del pred_list
            gc.collect()

            if valid_accu_10 > best_valid_accu:
                best_valid_accu = valid_accu_10
                print('-----best_valid_accu-----')
                #torch.save(model, 'saved.model')
                label_list = []
                pred_list = []
                for words_t, definition_words_t in tqdm(test_dataloader,
                                                        disable=verbose):
                    indices = model('test',
                                    x=definition_words_t,
                                    w=words_t,
                                    ws=wd_sems,
                                    wP=wd_POSs,
                                    wc=wd_charas,
                                    wC=wd_C,
                                    msk_s=mask_s,
                                    msk_c=mask_c,
                                    mode=MODE)
                    predicted = indices[:, :1000].detach().cpu().numpy(
                    ).tolist()
                    label_list.extend(words_t.detach().cpu().numpy())
                    pred_list.extend(predicted)
                test_accu_1, test_accu_10, test_accu_100, median, variance = evaluate_test(
                    label_list, pred_list)
                print('test_accu(1/10/100): %.2f %.2f %.2f %.1f %.2f' %
                      (test_accu_1, test_accu_10, test_accu_100, median,
                       variance))
                if epoch > 10:
                    json.dump((index2word[label_list]).tolist(),
                              open(mode + '_label_list.json', 'w'))
                    json.dump((index2word[np.array(pred_list)]).tolist(),
                              open(mode + '_pred_list.json', 'w'))
                del label_list
                del pred_list
                gc.collect()
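
This example leans on a label_multihot helper (not shown) that turns per-word index collections into multi-hot rows. A plausible definition consistent with the call sites above (an assumption; an index equal to num_classes is treated as padding):

import numpy as np

def label_multihot(labels, num_classes):
    out = np.zeros((len(labels), num_classes), dtype=np.float32)
    for row, idxs in enumerate(labels):
        for j in np.asarray(idxs).reshape(-1):
            if 0 <= j < num_classes:  # drop the padding index
                out[row, j] = 1.0
    return out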
Example #12

        assert args.snapshot is not None
    else:
        if args.sort_by_freq is False:
            assert args.order_free in ["pla", "mla"]
        else:
            if args.order_free:
                raise ValueError(
                    'Sort by freq and order_free are mutually exclusive.')
    resume = 0
    highest_f1 = 0
    epochs_without_imp = 0
    iterations = 0
    encoder = Encoder(encoder_weights=args.encoder_weights)
    decoder = Decoder(args.hidden_size, args.embed_size, args.attention_size,
                      args.dropout)
    encoder = encoder.to('cuda')
    decoder = decoder.to('cuda')

    snapshot = args.snapshot
    test_model = args.test_model
    train_from_scratch = args.train_from_scratch
    swa_params = eval(args.swa_params)
    finetune_encoder = args.finetune_encoder

    if not test_model:
        if finetune_encoder:
            encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                                 lr=args.encoder_lr)
        decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                             lr=args.decoder_lr)
    else:
Example #13

                      embedding_dimension=opt.embedding_dim,
                      hidden_size=opt.rnn_hidden,
                      num_layer=opt.num_layers)
    optimizer1 = torch.optim.Adam(encoder.parameters(), lr=opt.lr)
    decoder = BahdanauAttnDecoderRNN(opt.rnn_hidden,
                                     opt.embedding_dim,
                                     len(en_config.word2ix),
                                     n_layers=2,
                                     dropout_p=0.1)
    # decoder =
    optimizer2 = torch.optim.Adam(decoder.parameters(), lr=opt.lr)
    if opt.save_path:
        encoder.load_state_dict(torch.load(opt.save_path + 'encoder.pth'))
        decoder.load_state_dict(torch.load(opt.save_path + 'decoder.pth'))
        print('load update model')
    encoder.to(device)
    decoder.to(device)
    loss_meter = AverageValueMeter()
    '''
    for epoch in range(200):
        loss_meter.reset()

        for ii, ((in_lang,in_lengths),(out_lang,out_lengths)) in tqdm(enumerate(train_dataloader)):
            in_lang = in_lang.to(device)
            out_lang = out_lang.to(device)
            optimizer1.zero_grad()
            optimizer2.zero_grad()

            encoder_outputs, encoder_hidden = encoder(in_lang,in_lengths) # MAX_LENGTH, BATCH_SIZE, EMBEDDING DIMENSION // n_layer, BATCH_SIZE, EMBEDDING DIMENSION
            # Prepare input and output variables
            decoder_input = torch.LongTensor([fr_config.word2ix[opt.start]] * in_lang.shape[1]).to(device)
Example #14

class Solver(object):
    """Solver for training and testing"""

    def __init__(self, data_loader, config):
        """Initialize configurations."""

        # Data loader.
        self.data_loader = data_loader

        # Model configurations.
        self.a_dim = config.a_dim
        self.id_dim = config.id_dim

        # Training configurations.
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        self.num_iters_decay = config.num_iters_decay
        self.lr = config.lr
        self.n_critic = config.n_critic
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.resume_iters = config.resume_iters

        # Test configurations.
        self.test_iters = config.test_iters

        # Miscellaneous.
        self.use_tensorboard = config.use_tensorboard
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Directories.
        self.log_dir = config.log_dir
        self.sample_dir = config.sample_dir
        self.model_save_dir = config.model_save_dir
        self.result_dir = config.result_dir

        # Step size.
        self.log_step = config.log_step
        self.sample_step = config.sample_step
        self.model_save_step = config.model_save_step
        self.lr_update_step = config.lr_update_step

        # Build the model and tensorboard.
        self.build_model()
        if self.use_tensorboard:
            self.build_tensorboard()

    def build_model(self):
        """Create a generator and a discriminator."""

        self.G = Generator()
        self.D = Discriminator()
        self.I = Encoder()
        self.C = Encoder()
        self.A = Attribute()
        self.g_optimizer = torch.optim.Adam(self.G.parameters(), self.lr, [self.beta1, self.beta2])
        self.d_optimizer = torch.optim.Adam(self.D.parameters(), self.lr, [self.beta1, self.beta2])
        self.i_optimizer = torch.optim.Adam(self.I.parameters(), self.lr, [self.beta1, self.beta2])
        self.c_optimizer = torch.optim.Adam(self.C.parameters(), self.lr, [self.beta1, self.beta2])
        self.a_optimizer = torch.optim.Adam(self.A.parameters(), self.lr, [self.beta1, self.beta2])


        self.G.to(self.device)
        self.D.to(self.device)
        self.A.to(self.device)
        self.I.to(self.device)
        self.C.to(self.device)

    def print_network(self, model, name):
        """Print out the network information."""
        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print(name)
        print("The number of parameters: {}".format(num_params))

    def restore_model(self, resume_iters):
        """Restore the trained generator and discriminator."""
        print('Loading the trained models from step {}...'.format(resume_iters))
        G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(resume_iters))
        D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(resume_iters))
        A_path = os.path.join(self.model_save_dir, '{}-A.ckpt'.format(resume_iters))
        I_path = os.path.join(self.model_save_dir, '{}-I.ckpt'.format(resume_iters))
        C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(resume_iters))
        self.A.load_state_dict(torch.load(A_path, map_location=lambda storage, loc: storage))
        self.I.load_state_dict(torch.load(I_path, map_location=lambda storage, loc: storage))
        self.C.load_state_dict(torch.load(C_path, map_location=lambda storage, loc: storage))
        self.G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
        self.D.load_state_dict(torch.load(D_path, map_location=lambda storage, loc: storage))

    def build_tensorboard(self):
        """Build a tensorboard logger."""
        from logger import Logger
        self.logger = Logger(self.log_dir)

    def update_lr(self, lr):
        """Decay learning rates of the generator and discriminator."""
        for param_group in self.g_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.d_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.i_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.a_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.c_optimizer.param_groups:
            param_group['lr'] = lr

    def reset_grad(self):
        """Reset the gradient buffers."""
        self.g_optimizer.zero_grad()
        self.d_optimizer.zero_grad()
        self.i_optimizer.zero_grad()
        self.a_optimizer.zero_grad()
        self.c_optimizer.zero_grad()

    def denorm(self, x):
        """Convert the range from [-1, 1] to [0, 1]."""
        out = (x + 1) / 2
        return out.clamp_(0, 1)

    def classification_loss(self, logit, target):
        """Compute binary or softmax cross entropy loss."""

        return F.cross_entropy(logit, target)

    def mse_loss(self, out, gt):
        """Computes the MSE between model output and scalar gt"""
        loss = 0.5 * torch.mean(torch.abs(out - gt)**2)
        return loss

    def L1_loss(self, pred, target):
        """
        Calculate L1 loss
        """
        return torch.mean(torch.abs(pred - target))

    def reparameterization(self, mu, logvar):
        std = torch.exp(logvar / 2)
        sampled_z = torch.FloatTensor(np.random.normal(0, 1, (mu.size(0), 8))).to(self.device)
        z = sampled_z * std + mu
        return z

    def train(self):
        """Train StarGAN within a single dataset."""
        # Set data loader.

        data_loader = self.data_loader

        # Fetch fixed inputs for debugging.
        data_iter = iter(data_loader)
        batch_fixed = next(data_iter)

        for k in batch_fixed:
            batch_fixed[k] = batch_fixed[k].to(self.device)

        # Learning rate cache for decaying.
        lr = self.lr


        # Start training from scratch or resume training.
        start_iters = 0
        if self.resume_iters:
            start_iters = self.resume_iters
            self.restore_model(self.resume_iters)

        # Start training.
        print('Start training...')
        start_time = time.time()
        for i in range(start_iters, self.num_iters):

            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #

            # Fetch real images and labels.
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(data_loader)
                batch = next(data_iter)
            for k in batch:
                batch[k] = batch[k].to(self.device)
            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #
            loss = {}
            # get identity z
            id_z, _ = self.I(batch['img_profile'])
            # get attribute z
            mu, logvar = self.A(batch['img_frontal'])
            a_z = self.reparameterization(mu, logvar)
            # get x'
            x = torch.cat([id_z, a_z], 1)

            x_fake = self.G(x)

            # Get the predicted identity
            id_pred, _ = self.C(batch['img_profile'])
            # distinguish the true and the false
            d_real, _ = self.D(batch['img_frontal'])
            d_fake, _ = self.D(x_fake.detach())
            # train I
            loss_Li = self.classification_loss(id_z, batch['label'])
            # train A
            loss_KL = torch.sum(0.5 * (mu**2 + torch.exp(logvar) - logvar - 1))
            loss_GR = self.mse_loss(batch['img_frontal'], x_fake)
            # train C
            loss_C = self.classification_loss(id_pred, batch['label'])
            # train D
            loss_D = - torch.mean(d_real) + torch.mean(d_fake)
            d_loss = loss_D + loss_C + loss_GR + loss_KL + loss_Li

            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()
            self.c_optimizer.step()
            self.a_optimizer.step()
            self.i_optimizer.step()

            loss['C/loss_C'] = loss_C.item()
            loss['A/loss_GR'] = loss_GR.item()
            loss['I/loss_Li'] = loss_Li.item()
            loss['D/loss_D'] = loss_D.item()

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #

            if (i + 1) % self.n_critic == 0:
                id_z, _ = self.I(batch['img_profile'])
                # get attribute z
                mu, logvar = self.A(batch['img_frontal'])
                a_z = self.reparameterization(mu, logvar)
                # get x'
                x = torch.cat([id_z, a_z], 1)

                x_fake = self.G(x)
                # Get the predicted identity
                _, c_f_s = self.C(batch['img_profile'])
                _, c_f_x = self.C(x_fake)
                # distinguish the true and the false
                d_real, d_f_a = self.D(batch['img_frontal'])
                d_fake, d_f_x = self.D(x_fake)

                loss_GR = self.mse_loss(batch['img_frontal'], x_fake)
                # train C

                loss_GC = self.mse_loss(c_f_x, c_f_s)
                loss_GD = self.mse_loss(d_f_x, d_f_a)
                loss_g = - torch.mean(d_fake)
                g_loss = loss_g + loss_GC + loss_GR + loss_GD

                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss['G/loss_GR'] = loss_GR.item()
                loss['G/loss_GC'] = loss_GC.item()
                loss['G/loss_GD'] = loss_GD.item()
                loss['G/loss_g'] = loss_g.item()

            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #

            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = time.time() - start_time
                et = str(datetime.timedelta(seconds=et))[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

                if self.use_tensorboard:
                    for tag, value in loss.items():
                        self.logger.scalar_summary(tag, value, i + 1)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                for k in batch_fixed:
                    batch_fixed[k] = batch_fixed[k].to(self.device)
                with torch.no_grad():
                    x_fake_list = [batch_fixed['img_profile']]

                    id_z, _ = self.I(batch_fixed['img_profile'])
                    # get attribute z
                    mu, logvar = self.A(batch_fixed['img_frontal'])
                    a_z = self.reparameterization(mu, logvar)
                    # get x'
                    x = torch.cat([id_z, a_z], 1)

                    x_fake = self.G(x)

                    x_fake_list.append(x_fake)
                    x_concat = torch.cat(x_fake_list, dim=3)
                    sample_path = os.path.join(self.sample_dir, '{}-images.jpg'.format(i + 1))
                    save_image(self.denorm(x_concat.data.cpu()), sample_path, nrow=2, padding=5)
                    print('Saved real and fake images into {}...'.format(sample_path))

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
                D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
                torch.save(self.G.state_dict(), G_path)
                torch.save(self.D.state_dict(), D_path)
                print('Saved model checkpoints into {}...'.format(self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
                lr -= (self.lr / float(self.num_iters_decay))
                self.update_lr(lr)
                print('Decayed learning rates, lr: {}'.format(lr))
Example #15
def main():
    epoch = 1000
    batch_size = 64
    hidden_dim = 256

    encoder = Encoder(num_words, hidden_dim)
    if args.attn:
        attn_model = 'dot'
        decoder = LuongAttnDecoderLength(attn_model, hidden_dim, num_words,
                                         MAX_TGT_LEN)
    else:
        decoder = DecoderTask1(hidden_dim, num_words)

    if args.train:
        weight = torch.ones(num_words)
        weight[word2idx_mapping[PAD_TOKEN]] = 0

        encoder = encoder.to(device)
        decoder = decoder.to(device)
        weight = weight.to(device)

        encoder_optimizer = Adam(encoder.parameters(), lr=0.001)
        decoder_optimizer = Adam(decoder.parameters(), lr=0.001)
        criterion = nn.NLLLoss(ignore_index=int(word2idx_mapping[PAD_TOKEN]),
                               size_average=True)
        #criterion = nn.CrossEntropyLoss(weight=weight)

        np.random.seed(1124)
        order = np.arange(len(train_data))

        best_loss = 1e5
        best_epoch = 0

        for e in range(epoch):
            #if e - best_epoch > 20: break

            #np.random.shuffle(order)
            choice = np.random.choice(order, 10000, replace=False)
            shuffled_train_data = train_data[choice]
            train_loss = 0
            valid_loss = 0
            for b in tqdm(range(int(len(choice) // batch_size))):
                batch_x = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) *
                                        batch_size][:, 0].tolist()).t()
                batch_y = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) *
                                        batch_size][:, 1].tolist()).t()

                batch_x, batch_y = batch_x.to(device), batch_y.to(device)

                train_loss += train(batch_x, batch_y, encoder, decoder,
                                    encoder_optimizer, decoder_optimizer,
                                    criterion, False)

            train_loss /= b
            '''
            for b in range(len(valid_data) // batch_size):
                batch_x = torch.LongTensor(valid_data[b*batch_size: (b+1)*batch_size][:, 0].tolist()).t()
                batch_y = torch.LongTensor(valid_data[b*batch_size: (b+1)*batch_size][:, 1].tolist()).t()
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)

                valid_loss += train(batch_x, batch_y, max_seqlen, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, True)
            valid_loss /= b
            '''
            print(
                "epoch {}, train_loss {:.4f}, valid_loss {:.4f}, best_epoch {}, best_loss {:.4f}"
                .format(e, train_loss, valid_loss, best_epoch, best_loss))
            '''
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_epoch = e
                torch.save(encoder.state_dict(), args.encoder_path + '.best')
                torch.save(decoder.state_dict(), args.decoder_path + '.best')
            '''
            torch.save(encoder.state_dict(), args.encoder_path)
            torch.save(decoder.state_dict(), args.decoder_path)
        print(encoder)
        print(decoder)

    else:
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=torch.device(device)))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=torch.device(device)))
        print(encoder)
        print(decoder)
    print("==========================================================")

    predict(encoder, decoder)


Example #16

netE = Encoder(ngpu=1, nz=nz, nc=3)
netD = Res_Discriminator(channel=6)
netG = ResnetGenerator32(z_dim=nz)
netD2 = ResnetDiscriminator32(stack=6, ch=opt.ch)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    netE = nn.DataParallel(netE)
    netD = nn.DataParallel(netD)
    netG = nn.DataParallel(netG)
    netD2 = nn.DataParallel(netD2)

netE.to(device)
netD2.to(device)
netG.to(device)
netD.to(device)

optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(0, .9))
optimizerD2 = optim.Adam(netD2.parameters(), lr=opt.lr, betas=(0, .9))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(0, .9))
optimizerE = optim.Adam(netE.parameters(), lr=opt.lr, betas=(.5, .9))

start = opt.start


def adjust_learning_rate(optimizer, epoch, num_epochs):
    """Linearly decays the learning rate to zero between epoch 45 and num_epochs"""
    lr = opt.lr - opt.lr * (epoch - 45) / (num_epochs - 45)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
Example #17
def train_model(resume):

    with open(Path("./cfg/cfg.json").absolute()) as file:
        para = json.load(file)
    tensorboard_path = Path("./tensorboard/writer").absolute()
    checkpoint_dir = Path("./checkpoint").absolute()
    writer = SummaryWriter(tensorboard_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=para['encoder']['in_channels'], channels=para['encoder']['channels'],
                      n_embeddings=para['encoder']['n_embeddings'], embedding_dim=para['encoder']['embedding_dim'], jitter=para['encoder']['jitter'])
    decoder = Decoder(in_channels=para['decoder']['in_channels'], conditioning_channels=para['decoder']['conditioning_channels'],
                      n_speakers = para['decoder']['n_speakers'], speaker_embedding_dim=para['decoder']['speaker_embedding_dim'],
                      mu_embedding_dim=para['decoder']['mu_embedding_dim'], rnn_channels=para['decoder']['rnn_channels'], fc_channels=para['decoder']['fc_channels'],
                      bits=para['decoder']['bits'], hop_length=para['decoder']['hop_length'])


    encoder.to(device)
    decoder.to(device)



    if resume:

        resume_path = Path("./checkpoint/model.pt").absolute()
        print("Resume checkpoint from: {}:".format(str(resume_path)))
        checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage)
        print(checkpoint.keys())
        encoder.load_state_dict(checkpoint["encoder"])
        decoder.load_state_dict(checkpoint["decoder"])
        optimizer = optim.Adam(
            chain(encoder.parameters(), decoder.parameters()),
            lr=1e-5)

        # [encoder, decoder], optimizer = amp.initialize([encoder, decoder], optimizer, opt_level="O1")
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[300000, 400000],
            gamma=0.5)
        optimizer.load_state_dict(checkpoint["optimizer"])
        #amp.load_state_dict(checkpoint["amp"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        global_step = checkpoint["step"]


    else:
        global_step = 0
        optimizer = optim.Adam(
            chain(encoder.parameters(), decoder.parameters()),
            lr=1e-5)

        # [encoder, decoder], optimizer = amp.initialize([encoder, decoder], optimizer, opt_level="O1")
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[300000, 400000],
            gamma=0.5)


    sdataset = SpeechDataset(
        root='./preprocessed_file/train',
        hop_length=para['preprocess']['hop_length'],
        sr=para['preprocess']['sr'],
        sample_frames=para['preprocess']['sample_frames'])
    print(len(sdataset))
    dataloader = DataLoader(
        dataset=sdataset,
        batch_size=16,
        shuffle=True,
        num_workers=1,
        pin_memory=True,
        drop_last=True)

    print(len(dataloader))
    n_epochs = 1
#    start_epoch = global_step // len(dataloader) + 1

    for epoch in range(global_step, global_step+n_epochs):
        average_recon_loss = average_vq_loss = average_perplexity = 0

        for i, (audio, mels, speakers) in enumerate(tqdm(dataloader), 1):
            audio, mels, speakers = audio.to(device), mels.to(device), speakers.to(device)
            optimizer.zero_grad()
            z, vq_loss, perplexity = encoder(mels)
            output = decoder(audio[:, :-1], z, speakers)
            recon_loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])
            loss = recon_loss + vq_loss

            loss.backward()

            #with amp.scale_loss(loss, optimizer) as scaled_loss:
            #    scaled_loss.backward()

            torch.nn.utils.clip_grad_norm_(
                chain(encoder.parameters(), decoder.parameters()), 1)
            optimizer.step()
            scheduler.step()

            average_recon_loss += (recon_loss.item() - average_recon_loss) / i
            average_vq_loss += (vq_loss.item() - average_vq_loss) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

            global_step += 1


        save_checkpoint(
                encoder, decoder, optimizer, amp,
                scheduler, global_step, checkpoint_dir)

        writer.add_scalar("recon_loss/train", average_recon_loss, global_step)
        writer.add_scalar("vq_loss/train", average_vq_loss, global_step)
        writer.add_scalar("average_perplexity", average_perplexity, global_step)

        print("epoch:{}, recon loss:{:.2E}, vq loss:{:.2E}, perpexlity:{:.3f}"
              .format(epoch, average_recon_loss, average_vq_loss, average_perplexity))
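
save_checkpoint is not shown; here is a sketch consistent with the keys the resume branch reads back ("encoder", "decoder", "optimizer", "scheduler", "step"). Ignoring the amp argument is an assumption, since mixed precision is commented out above:

import torch

def save_checkpoint(encoder, decoder, optimizer, amp, scheduler, step,
                    checkpoint_dir):
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / "model.pt"
    torch.save({
        "encoder": encoder.state_dict(),
        "decoder": decoder.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "step": step,
    }, checkpoint_path)
    print("Saved checkpoint: {}".format(checkpoint_path))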
Example #18
def main():
    input_lang, output_lang, pairs, data1, data2 = read_langs("eng", "fra", True)
    input_tensor = [[input_lang.word2index[s] for s in es.split(' ')] for es in data1]
    target_tensor = [[output_lang.word2index[s] for s in es.split(' ')] for es in data2]
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
    print(len(target_tensor))

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                    target_tensor,
                                                                                                    test_size=0.2)

    # Show length
    print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64
    N_BATCH = BUFFER_SIZE // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_lang.word2index)
    vocab_tar_size = len(output_lang.word2index)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    val_dataset = MyData(input_tensor_val, target_tensor_val)

    dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                         drop_last=True,
                         shuffle=True)

    device = torch.device("cpu")

    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=0.001)

    EPOCHS = 10

    for epoch in range(EPOCHS):
        start = time()

        encoder.train()
        decoder.train()

        total_loss = 0

        for (batch, (inp, targ, inp_len)) in enumerate(dataset):
            loss = 0

            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens, device)
            dec_hidden = enc_hidden
            dec_input = torch.tensor([[output_lang.word2index['<sos>']]] * BATCH_SIZE)

            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                     dec_hidden.to(device),
                                                     enc_output.to(device))
                loss += loss_function(criterion, ys[:, t].to(device), predictions.to(device))
                # loss += loss_
                dec_input = ys[:, t].unsqueeze(1)

            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss

            optimizer.zero_grad()

            loss.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                             batch,
                                                             batch_loss.detach().item()))

        ### TODO: Save checkpoint for model
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time() - start))
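
The training loop assumes sort_batch and loss_function helpers. Plausible definitions (assumptions drawn from the call sites: batches are sorted by decreasing length as packed RNN sequences require, and padding index 0 is masked out of the loss):

import torch.nn.functional as F

def sort_batch(x, y, lengths):
    # Sort the batch by decreasing source length.
    lengths, idx = lengths.sort(dim=0, descending=True)
    return x[idx], y[idx], lengths

def loss_function(criterion, real, pred):
    # `criterion` is kept for signature compatibility; per-element losses are
    # recomputed so padded targets (assumed index 0) can be masked out.
    losses = F.cross_entropy(pred, real, reduction='none')
    mask = real.ne(0).float()
    return (losses * mask).sum() / mask.sum().clamp(min=1.0)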
Example #19
valid_loader = torch.utils.data.DataLoader(CaptionDataset(DATA_FOLDER, 'VAL'),
                                           batch_size=BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=1,
                                           pin_memory=True)

# Optimizer
optimizer = torch.optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

# Parameters check
model_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('\n>> {} parameters\n'.format(params))

encoder = encoder.to(DEVICE)
decoder = decoder.to(DEVICE)

#=========================================================================================================
#=========================================================================================================
#================================ 3. TRAINING

for epoch in range(START_EPOCH, START_EPOCH + N_EPOCHS):
    decoder.train()
    encoder.train()
    epoch_loss = 0.

    time = datetime.now()

    for i, (image, caption, length) in enumerate(tqdm(train_loader)):
Example #20

def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)

    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)

    elif args.model_type == 'transformer':

        model = Transformer(len(vocab), args.embed_size,
                            args.transformer_layers, 8,
                            args.dropout_ratio).eval()

    else:
        print('Select model_type: attention, no_attention, or transformer')
        return

    if args.model_type != 'transformer':
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=torch.device('cpu')))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=torch.device('cpu')))
    else:
        model = model.to(device)
        model.load_state_dict(
            torch.load(args.model_path, map_location=torch.device('cpu')))

    filenames = os.listdir(args.image_dir)

    predicted = {}

    for file in tqdm(filenames):
        if file == '.DS_Store':
            continue
        # Prepare an image
        image = load_image(os.path.join(args.image_dir, file), transform)
        image_tensor = image.to(device)

        if args.model_type == 'attention':
            features = encoder(image_tensor)
            sampled_ids, _ = decoder.sample(features)
            sampled_ids = sampled_ids[0].cpu().numpy()
            sampled_caption = ['<start>']
        elif args.model_type == 'no_attention':
            features = encoder(image_tensor)
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids[0].cpu().numpy()
            sampled_caption = ['<start>']

        elif args.model_type == 'transformer':
            e_outputs = model.encoder(image_tensor)
            max_seq_length = 20
            sampled_ids = torch.zeros(max_seq_length,
                                      dtype=torch.long,
                                      device=device)
            sampled_ids[0] = vocab.word2idx['<start>']

            for i in range(1, max_seq_length):

                trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
                trg_mask = Variable(torch.from_numpy(trg_mask) == 0).to(device)

                out = model.decoder(sampled_ids[:i].unsqueeze(0), e_outputs,
                                    trg_mask)

                out = model.out(out)
                out = F.softmax(out, dim=-1)
                val, ix = out[:, -1].data.topk(1)
                sampled_ids[i] = ix[0][0]

            sampled_ids = sampled_ids.cpu().numpy()
            sampled_caption = []

        # Convert word_ids to words
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        predicted[file] = sentence

    json.dump(predicted, open(args.predict_json, 'w'))
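load_image above is an assumed helper; a minimal sketch that opens the image, resizes it, applies the transform, and adds a batch dimension (the 224x224 size is an assumption):

from PIL import Image

def load_image(image_path, transform=None):
    # Hypothetical helper matching the call above: returns a [1, C, H, W] tensor.
    image = Image.open(image_path).convert('RGB')
    image = image.resize((224, 224), Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image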
Exemple #21
0
s_testloader = DataLoader(s_testset, batch_size=batch_size, shuffle=True)
t_trainset, t_testset = load_usps(data_per_class)  # specifying a transform here is not allowed
t_trainloader = DataLoader(t_trainset, batch_size=batch_size, shuffle=True)
t_testloader = DataLoader(t_testset, batch_size=64, shuffle=True)

net_g = Encoder()
net_h = classifier()
net_DCD = DCD()
loss_func = torch.nn.CrossEntropyLoss()  # loss function shared by all stages

# train g and h on the source domain
print("part 1 : initial training for g and h")
optimizer = torch.optim.Adam(list(net_g.parameters()) +
                             list(net_h.parameters()),
                             lr=0.001)  # a single optimizer updates both networks
net_g = net_g.to(device)
net_h = net_h.to(device)
net_DCD = net_DCD.to(device)
if device != "cpu":
    net_g = nn.DataParallel(net_g)
    net_h = nn.DataParallel(net_h)
    net_DCD = nn.DataParallel(net_DCD)

for epoch in range(num_ep_init_gh):
    for data, label in s_trainloader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        pred = net_h(net_g(data))
        loss = loss_func(pred, label)
        loss.backward()
        optimizer.step()
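The snippet stops after this first training stage; a short evaluation sketch (not in the source) for checking the trained classifier on the target test loader:

import torch

def evaluate(net_g, net_h, loader, device):
    # Sketch only: classification accuracy of the g -> h pipeline.
    net_g.eval()
    net_h.eval()
    correct = total = 0
    with torch.no_grad():
        for data, label in loader:
            data, label = data.to(device), label.to(device)
            pred = net_h(net_g(data)).argmax(dim=1)
            correct += (pred == label).sum().item()
            total += label.size(0)
    return correct / total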
Exemple #22
0
def main():

    device = torch.device('cuda:0')
    n_features = 256
    n_epochs = 40
    batch_size = 64
    skip_training = False

    # prepare data first: the vocabulary sizes below come from the dataset
    data_dir = tools.select_data_dir()
    trainset = TranslationDataset(data_dir, train=True)
    trainloader = DataLoader(dataset=trainset,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=collate,
                             pin_memory=True)

    # Create the transformer model
    encoder = Encoder(src_vocab_size=trainset.input_lang.n_words,
                      n_blocks=3,
                      n_features=n_features,
                      n_heads=16,
                      n_hidden=1024)
    decoder = Decoder(tgt_vocab_size=trainset.output_lang.n_words,
                      n_blocks=3,
                      n_features=n_features,
                      n_heads=16,
                      n_hidden=1024)
    encoder.to(device)
    decoder.to(device)

    # define training loop parameters
    parameters = list(encoder.parameters()) + list(decoder.parameters())
    adam = torch.optim.Adam(parameters, lr=0, betas=(0.9, 0.98), eps=1e-9)
    optimizer = NoamOptimizer(n_features, 2, 10000, adam)  # Noam schedule sets the actual LR
    loss_method = nn.NLLLoss(ignore_index=0, reduction='mean')

    # training
    if not skip_training:
        for epoch in range(n_epochs):
            loss = training_loop(encoder, decoder, optimizer, loss_method,
                                 trainloader)
            print(f'Train Epoch {epoch+1}: Loss: {loss}')

        # save the trained model
        tools.save_model(encoder, 'tr_encoder.pth')
        tools.save_model(decoder, 'tr_decoder.pth')
    else:
        encoder = Encoder(src_vocab_size=trainset.input_lang.n_words,
                          n_blocks=3,
                          n_features=256,
                          n_heads=16,
                          n_hidden=1024)
        tools.load_model(encoder, 'tr_encoder.pth', device)

        decoder = Decoder(tgt_vocab_size=trainset.output_lang.n_words,
                          n_blocks=3,
                          n_features=256,
                          n_heads=16,
                          n_hidden=1024)
        tools.load_model(decoder, 'tr_decoder.pth', device)

    # Generate translations with the trained model

    # translate sentences from the training set
    print('Translate training data:')
    print('-----------------------------')
    for i in range(5):
        src_sentence, tgt_sentence = trainset[np.random.choice(len(trainset))]
        print(
            '>', ' '.join(trainset.input_lang.index2word[i.item()]
                          for i in src_sentence))
        print(
            '=', ' '.join(trainset.output_lang.index2word[i.item()]
                          for i in tgt_sentence))
        out_sentence = translate(encoder, decoder, src_sentence)
        print(
            '<', ' '.join(trainset.output_lang.index2word[i.item()]
                          for i in out_sentence), '\n')

    # translate sentences from the test set
    testset = TranslationDataset(data_dir, train=False)
    print('Translate test data:')
    print('-----------------------------')
    for i in range(5):
        input_sentence, target_sentence = testset[np.random.choice(
            len(testset))]
        print(
            '>', ' '.join(testset.input_lang.index2word[i.item()]
                          for i in input_sentence))
        print(
            '=', ' '.join(testset.output_lang.index2word[i.item()]
                          for i in target_sentence))
        output_sentence = translate(encoder, decoder, input_sentence)
        print(
            '<', ' '.join(testset.output_lang.index2word[i.item()]
                          for i in output_sentence), '\n')
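The collate function given to the DataLoader is assumed; a sketch that zero-pads variable-length index sequences, consistent with NLLLoss(ignore_index=0) above:

from torch.nn.utils.rnn import pad_sequence

def collate(batch):
    # Hypothetical collate_fn: batch is a list of (src, tgt) index-tensor pairs.
    # The padding value 0 is then skipped by NLLLoss(ignore_index=0).
    src, tgt = zip(*batch)
    return (pad_sequence(src, padding_value=0),
            pad_sequence(tgt, padding_value=0))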
Exemple #23
0
def DDF(cfg):

    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)
    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    #---------------------------------------
    if cfg.privacy_preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            # librosa.load returns the audio time series and its sampling rate
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            path = out_dir / out_filename

            # to return raw recording in mel-spectrogram without any filtering
            if cfg.output_type == "Embedding":
                mel = librosa.feature.melspectrogram(
                    preemphasis(wav, cfg.preprocessing.preemph),
                    sr=cfg.preprocessing.sr,
                    n_fft=cfg.preprocessing.n_fft,
                    n_mels=cfg.preprocessing.n_mels,
                    hop_length=cfg.preprocessing.hop_length,
                    win_length=cfg.preprocessing.win_length,
                    fmin=cfg.preprocessing.fmin,
                    power=1)
                logmel = librosa.amplitude_to_db(
                    mel, top_db=cfg.preprocessing.top_db)
                logmel = logmel / cfg.preprocessing.top_db + 1
                mel = torch.FloatTensor(logmel).squeeze().cpu().numpy()  # move to CPU before numpy()
                np.savetxt(path.with_suffix(".mel.txt"), mel)

            # to return raw recording in waveform without any filtering
            if cfg.output_type == "Recording":
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         wav.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

    #---------------------------------------
    if cfg.privacy_preference == "Moderate":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel,
                                             top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    speaker = decoder.speaker(speaker)
                vq = vq.squeeze().cpu().numpy()  # tensors must be on CPU for numpy()
                speaker = speaker.squeeze().cpu().numpy()

                np.savetxt(path.with_suffix(".vq.txt"), vq)
                np.savetxt(path.with_suffix(".speaker.txt"), speaker)

    #---------------------------------------
    if cfg.privacy_preference == "High":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel,
                                             top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                vq = vq.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
Exemple #24
0
    embedding_sd = checkpoint['embedding']

    embedding = nn.Embedding(Config.vocab_size, Config.hidden_size)
    embedding.load_state_dict(embedding_sd)

    encoder = Encoder(embedding)
    attn_model = 'dot'
    decoder = Decoder(
        attn_model,
        embedding,
    )

    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

    encoder = encoder.to(Config.device)
    decoder = decoder.to(Config.device)

    # Set dropout layers to eval mode
    encoder.eval()
    decoder.eval()

    # Initialize search module
    searcher = GreedySearchDecoder(encoder, decoder)

    vocab2id = json.load(open('./data/vocab2id.json', 'r'))
    id2vocab = json.load(open('./data/id2vocab.json', 'r'))
    print(id2vocab)

    # Begin chatting (uncomment and run the following line to begin)
    evaluateInput(encoder, decoder, searcher, vocab2id, id2vocab)
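GreedySearchDecoder is assumed here; a minimal sketch in the style of the PyTorch chatbot tutorial, where the decoder runs one step at a time and the argmax token is fed back. The SOS id, the n_layers attribute, and the decoder call signature are assumptions:

import torch
import torch.nn as nn

class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder, sos_id=1):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.sos_id = sos_id

    def forward(self, input_seq, input_length, max_length):
        enc_out, enc_hidden = self.encoder(input_seq, input_length)
        dec_hidden = enc_hidden[:self.decoder.n_layers]  # assumes an n_layers attribute
        dec_input = torch.full((1, 1), self.sos_id, dtype=torch.long)
        tokens, scores = [], []
        for _ in range(max_length):
            dec_out, dec_hidden = self.decoder(dec_input, dec_hidden, enc_out)
            score, idx = torch.max(dec_out, dim=1)  # greedy: take the best token
            tokens.append(idx.item())
            scores.append(score.item())
            dec_input = idx.unsqueeze(0)
        return torch.tensor(tokens), torch.tensor(scores)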
Exemple #25
0
def train(save_path, checkpoint, data_root, batch_size, dataset):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    transform = transforms.Compose(
        [transforms.Resize((128, 128)),
         transforms.ToTensor()])
    target_transform = transforms.Compose(
        [transforms.Resize((128, 128)),
         ToTensor()])
    if dataset == 'cityscapes':
        train_data = Cityscapes(str(data_root),
                                split='train',
                                mode='fine',
                                target_type='semantic',
                                transform=transform,
                                target_transform=transform)
        eG = 35
        dG = [35, 35, 20, 14, 10, 4, 1]
        eC = 8
        dC = 280
        n_classes = len(Cityscapes.classes)
        update_lr = update_lr_default
        n_epochs = 200
    else:
        train_data = Deepfashion(str(data_root),
                                 split='train',
                                 transform=transform,
                                 target_transform=transform)
        n_classes = len(Deepfashion.eclasses)
        eG = 8
        eC = 64
        dG = [8, 8, 4, 4, 2, 2, 1]
        dC = 160
        update_lr = update_lr_deepfashion
        n_epochs = 100
    data_loader = torch.utils.data.DataLoader(train_data,
                                              batch_size=batch_size,
                                              num_workers=1)

    save_path = Path(save_path)  # the `/` joins below need a Path, not a str
    os.makedirs(save_path, exist_ok=True)

    n_channels = 3
    encoder = Encoder(n_classes * n_channels, C=eC, G=eG)
    decoder = Decoder(8 * eG, n_channels, n_classes, C=dC, Gs=dG)
    discriminator = Discriminator(n_classes + n_channels)
    vgg = Vgg19().eval()

    encoder = torch.nn.DataParallel(encoder)
    decoder = torch.nn.DataParallel(decoder)
    discriminator = torch.nn.DataParallel(discriminator)
    vgg = torch.nn.DataParallel(vgg)

    gen_opt = optim.Adam(list(encoder.parameters()) +
                         list(decoder.parameters()),
                         lr=0.0001,
                         betas=(0, 0.9))
    dis_opt = optim.Adam(discriminator.parameters(), lr=0.0004, betas=(0, 0.9))
    gen_scheduler = optim.lr_scheduler.LambdaLR(gen_opt, update_lr)
    dis_scheduler = optim.lr_scheduler.LambdaLR(dis_opt, update_lr)  # was gen_opt: must wrap dis_opt
    params = [
        'encoder', 'decoder', 'discriminator', 'gen_opt', 'dis_opt',
        'gen_scheduler', 'dis_scheduler'
    ]

    if os.path.exists(checkpoint):
        cp = torch.load(checkpoint)
        print(f'Load checkpoint: {checkpoint}')
        for param in params:
            eval(param).load_state_dict(cp[param])

    def to_device_optimizer(opt):
        for state in opt.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    to_device_optimizer(gen_opt)
    to_device_optimizer(dis_opt)

    encoder = encoder.to(device)
    decoder = decoder.to(device)
    discriminator = discriminator.to(device)
    vgg = vgg.to(device)
    print(len(data_loader))
    for epoch in range(n_epochs):
        e_g_loss = []
        e_d_loss = []
        for i, batch in tqdm(enumerate(data_loader)):
            x, sem = batch
            x = x.to(device)
            sem = sem.to(device)
            sem = sem * 255.0
            sem = sem.long()
            s = split_class(x, sem, n_classes)
            sem_target = sem.clone()
            del sem
            sem = torch.zeros(x.size()[0],
                              n_classes,
                              sem_target.size()[2],
                              sem_target.size()[3],
                              device=x.device)
            sem.scatter_(1, sem_target, 1)
            s = s.detach()
            s = s.to(device)
            mu, sigma = encoder(s)
            # reparameterization trick: sample standard-normal noise (randn, not rand)
            z = mu + torch.exp(0.5 * sigma) * torch.randn(mu.size(),
                                                          device=mu.device)
            gen = decoder(z, sem)
            d_fake = discriminator(gen, sem)
            d_real = discriminator(x, sem)
            l1loss = nn.L1Loss()
            gen_opt.zero_grad()
            loss_gen = 0.5 * d_fake[0][-1].mean() + 0.5 * d_fake[1][-1].mean()
            loss_fm = sum([
                sum([l1loss(f, g) for f, g in zip(fs, rs)])
                for fs, rs in zip(d_fake, d_real)
            ]).mean()

            f_fake = vgg(gen)
            f_real = vgg(x)
            loss_p = 1.0 / 32 * l1loss(f_fake[0], f_real[0]) + \
                1.0 / 16 * l1loss(f_fake[1], f_real[1]) + \
                1.0 / 8 * l1loss(f_fake[2], f_real[2]) + \
                1.0 / 4 * l1loss(f_fake[3], f_real[3]) + \
                l1loss(f_fake[4], f_real[4])
            loss_kl = -0.5 * torch.sum(1 + sigma - mu * mu - torch.exp(sigma))
            loss = loss_gen + 10.0 * loss_fm + 10.0 * loss_p + 0.05 * loss_kl
            loss.backward(retain_graph=True)
            gen_opt.step()

            dis_opt.zero_grad()
            loss_dis = torch.mean(-torch.mean(torch.min(d_real[0][-1] - 1, torch.zeros_like(d_real[0][-1]))) +
                                  -torch.mean(torch.min(-d_fake[0][-1] - 1, torch.zeros_like(d_fake[0][-1])))) + \
                                  torch.mean(-torch.mean(torch.min(d_real[1][-1] - 1, torch.zeros_like(d_real[1][-1]))) +
                                  -torch.mean(torch.min(-d_fake[1][-1] - 1, torch.zeros_like(d_fake[1][-1]))))
            loss_dis.backward()
            dis_opt.step()

            e_g_loss.append(loss.item())
            e_d_loss.append(loss_dis.item())
            os.makedirs(save_path / str(epoch), exist_ok=True)

            Image.fromarray((gen.detach().cpu().numpy()[0].transpose(1, 2, 0) *
                             255.0).astype(np.uint8)).save(
                                 save_path / str(epoch) / f'{i}.png')
        print('g_loss', np.mean(e_g_loss), 'd_loss', np.mean(e_d_loss))

        # save
        cp = {}
        for param in params:
            cp[param] = eval(param).state_dict()
        torch.save(cp, save_path / 'latest.pth')
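update_lr_default and update_lr_deepfashion are assumed LambdaLR multiplier functions; a sketch of a common GAN schedule (constant for the first half of training, then linear decay to zero; the split point is an assumption):

def update_lr_default(epoch, total_epochs=200):
    # LambdaLR multiplies the base learning rate by this factor each epoch.
    half = total_epochs // 2
    return 1.0 if epoch < half else max(0.0, 1.0 - (epoch - half) / half)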
Exemple #26
0
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path(
        "datasets")) / cfg.dataset.path  # zerospeech/datasets/2019/english
    with open(dataset_path / "speakers.json") as file:  # file listing the speaker names
        speakers = sorted(json.load(file))  # kept as the sorted list `speakers`

    synthesis_list_path = Path(utils.to_absolute_path(
        cfg.synthesis_list))  # marked ??? in the config, so pass it when running
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(
            file)  # see synthesis.json under datasets/2019/english

    in_dir = Path(utils.to_absolute_path(
        cfg.in_dir))  # ??? in the config; point it at the zerospeech folder (./)
    out_dir = Path(utils.to_absolute_path(
        cfg.out_dir))  # ??? in the config; directory for the converted results
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU without a GPU

    encoder = Encoder(
        **cfg.model.encoder)  # encoder config from ZeroSpeech/config/model/default
    decoder = Decoder(
        **cfg.model.decoder)  # decoder config from ZeroSpeech/config/model/default
    encoder.to(device)  # cpu or gpu
    decoder.to(device)  # cpu or gpu

    print("Load checkpoint from: {}:".format(
        cfg.checkpoint))  # ??? in the config; point it at a pretrained or trained checkpoint
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage
                            )  # load the weights stored in the checkpoint
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(
        cfg.preprocessing.sr
    )  # loudness meter at sr=16000; see https://www.christiansteinmetz.com/projects-blog/pyloudnorm

    for wav_path, speaker_id, out_filename in tqdm(
            synthesis_list
    ):  # e.g. "english/test/S002_0379088085", "V002", "V002_0379088085"
        wav_path = in_dir / wav_path  # ./english/test/S002_0379088085
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)  # measure the input loudness
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        # unsqueeze() inserts a new (batch) dimension at the given position:
        # https://subinium.github.io/pytorch-Tensor-Variable/#%EB%8D%94%EB%AF%B8-%EC%B0%A8%EC%9B%90-%EC%B6%94%EA%B0%80%EC%99%80-%EC%82%AD%EC%A0%9C--squeeze--unsqueeze
        # https://datascienceschool.net/view-notebook/4f3606fd839f4320a4120a56eec1e228/
        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)

        # Tensors are typed: torch.FloatTensor holds 32-bit floats, torch.LongTensor
        # holds 64-bit signed integers, and torch.cuda.FloatTensor is the GPU
        # counterpart. The mel spectrogram is floating point; the speaker index is not.
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)

        with torch.no_grad():  # disable autograd tracking for inference
            # https://bob3rdnewbie.tistory.com/315
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        output_loudness = meter.integrated_loudness(output)  # measure the output loudness
        # match the output loudness to that of the input wav
        output = pyloudnorm.normalize.loudness(output, output_loudness,
                                               ref_loudness)
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
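For reference, the loudness-matching pattern used above, isolated into a helper (pyloudnorm measures integrated loudness in LUFS and rescales the output to the input's level):

import pyloudnorm

def match_loudness(output, reference, sr):
    # Measure both signals, then rescale `output` to the reference loudness.
    meter = pyloudnorm.Meter(sr)
    ref_loudness = meter.integrated_loudness(reference)
    out_loudness = meter.integrated_loudness(output)
    return pyloudnorm.normalize.loudness(output, out_loudness, ref_loudness)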
Exemple #27
0
def test(args):
    '''
    Compute the BLEU score for every image and average the results.
    '''
    train_json_path = './data/annotations/captions_train2014.json'
    test_json_path = './data/annotations/captions_val2014.json'
    train_image_dir = './data/train2014'
    test_image_dir = './data/val2014'

    if args.eval == 'eval':
        print('eval bleu')
        jsonPath = test_json_path
        image_dir = test_image_dir
    else:
        print('train bleu')
        jsonPath = train_json_path
        image_dir = train_image_dir

    # Image preprocessing
    # At generation time we must not random-crop, only resize
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build model
    encoder = Encoder(embed_size=args.embed_size).eval()
    decoder = Decoder(stateful=False,
                      embed_size=args.embed_size,
                      hidden_size=args.hidden_size,
                      vocab_size=len(vocab),
                      num_layers=args.num_layers).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location=device))

    name_caption_frame = get_image_name(jsonPath)
    unique_image_names = pd.unique(name_caption_frame['file_name'])

    # Add image directory path train2014 or val2014
    unique_image_names = [
        os.path.join(image_dir, image_name)
        for image_name in unique_image_names
    ]

    total_generated_score4 = []
    total_theoretical_score4 = []

    total_generated_score3 = []
    total_theoretical_score3 = []

    total_generated_score2 = []
    total_theoretical_score2 = []

    total_generated_score1 = []
    total_theoretical_score1 = []

    # Score every unique image (pd.Series.apply runs this sequentially)
    def score_helper(image_path):
        caption = generate_caption(image_path, vocab, encoder, decoder,
                                   transform)

        generated_score4, theoretical_score4 = bleu4_score(
            image_path, caption, name_caption_frame)
        total_generated_score4.append(generated_score4)
        total_theoretical_score4.append(theoretical_score4)

        generated_score3, theoretical_score3 = bleu3_score(
            image_path, caption, name_caption_frame)
        total_generated_score3.append(generated_score3)
        total_theoretical_score3.append(theoretical_score3)

        generated_score2, theoretical_score2 = bleu2_score(
            image_path, caption, name_caption_frame)
        total_generated_score2.append(generated_score2)
        total_theoretical_score2.append(theoretical_score2)

        generated_score1, theoretical_score1 = bleu1_score(
            image_path, caption, name_caption_frame)
        total_generated_score1.append(generated_score1)
        total_theoretical_score1.append(theoretical_score1)

    _ = pd.Series(unique_image_names).apply(score_helper)

    print('Average bleu-4 score:',
          sum(total_generated_score4) / len(total_generated_score4),
          ' | Average theoretical bleu-4 score:',
          sum(total_theoretical_score4) / len(total_theoretical_score4))

    print('Average bleu-3 score:',
          sum(total_generated_score3) / len(total_generated_score3),
          ' | Average theoretical bleu-3 score:',
          sum(total_theoretical_score3) / len(total_theoretical_score3))

    print('Average bleu-2 score:',
          sum(total_generated_score2) / len(total_generated_score2),
          ' | Average theoretical bleu-2 score:',
          sum(total_theoretical_score2) / len(total_theoretical_score2))

    print('Average bleu-1 score:',
          sum(total_generated_score1) / len(total_generated_score1),
          ' | Average theoretical bleu-1 score:',
          sum(total_theoretical_score1) / len(total_theoretical_score1))
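The bleuN_score helpers are assumed; a sketch of bleu4_score with NLTK. It assumes name_caption_frame has 'file_name' and 'caption' columns, and that the "theoretical" score means the best leave-one-out score among the human references themselves:

import os
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def bleu4_score(image_path, caption, frame):
    # Hypothetical helper: references are the human captions for this image.
    name = os.path.basename(image_path)
    refs = [c.split() for c in frame[frame['file_name'] == name]['caption']]
    smooth = SmoothingFunction().method1
    w = (0.25, 0.25, 0.25, 0.25)
    generated = sentence_bleu(refs, caption.split(), weights=w,
                              smoothing_function=smooth)
    # Assumed "theoretical" baseline: each reference scored against the others.
    theoretical = generated if len(refs) < 2 else max(
        sentence_bleu(refs[:i] + refs[i + 1:], r, weights=w,
                      smoothing_function=smooth)
        for i, r in enumerate(refs))
    return generated, theoretical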
Exemple #28
0
    start_epoch = checkpoint['epoch'] + 1
    epochs_since_improvement = checkpoint['epochs_since_improvement']
    best_bleu4 = checkpoint['bleu-4']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    if fine_tune_encoder and encoder_optimizer is None:
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr)

# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

# Loss function
criterion = nn.CrossEntropyLoss().to(device)

# Custom dataloaders

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(data_folder,
                   data_name,
                   'TRAIN',
                   transform=transforms.Compose([normalize])),
    batch_size=batch_size,
    shuffle=True,
Exemple #29
0
def main():
    global epochs_since_improvement, best_loss_tr

    encoder = Encoder()
    decoder = DecoderWithAttention(encoder_dim, lstm_input_dim, decoder_dim,
                                   attention_dim, output_dim)

    encoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, encoder.parameters()),
                                         lr=encoder_lr)
    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, decoder.parameters()),
                                         lr=decoder_lr)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    trainLoader = torch.utils.data.DataLoader(Dataset(driver, circuit_tr,
                                                      curvatureLength,
                                                      historyLength,
                                                      predLength),
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=workers,
                                              pin_memory=True)

    cMean_tr = trainLoader.dataset.cMean
    cStd_tr = trainLoader.dataset.cStd
    vMean_tr = trainLoader.dataset.vMean
    vStd_tr = trainLoader.dataset.vStd
    aMean_tr = trainLoader.dataset.aMean
    aStd_tr = trainLoader.dataset.aStd

    validLoader = torch.utils.data.DataLoader(Dataset(driver,
                                                      circuit_vl,
                                                      curvatureLength,
                                                      historyLength,
                                                      predLength,
                                                      cMean=cMean_tr,
                                                      cStd=cStd_tr,
                                                      vMean=vMean_tr,
                                                      vStd=vStd_tr,
                                                      aMean=aMean_tr,
                                                      aStd=aStd_tr),
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=workers,
                                              pin_memory=True)

    print('Training version.{} (A->V)'.format(vNumber))
    print('Training data ({} - {})'.format(driver, circuit_tr))
    print('Validation data ({} - {})'.format(driver, circuit_vl))
    print('curvature len {}'.format(curvatureLength))
    print('history len {}'.format(historyLength))
    print('pred len {}'.format(predLength))
    print('hiddenDimension {}'.format(hiddenDimension))

    print('\nTraining...\n')

    for epoch in tqdm(range(start_epoch, epochs)):

        loss, vMape, vRmse, vCorr, aCorr = train(
            trainLoader=trainLoader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            encoder_optimizer=encoder_optimizer,
            decoder_optimizer=decoder_optimizer,
            epoch=epoch)

        writer.add_scalars('Loss', {'tr': loss}, epoch)
        writer.add_scalars('MAPE', {'tr': vMape}, epoch)
        writer.add_scalars('RMSE', {'tr': vRmse}, epoch)
        writer.add_scalars('vCorr', {'tr': vCorr}, epoch)
        writer.add_scalars('aCorr', {'tr': aCorr}, epoch)

        is_best = loss < best_loss_tr
        best_loss_tr = min(loss, best_loss_tr)
        if not is_best:
            epochs_since_improvement += 1
            print('\nEpoch {}: epochs since last improvement: {}\n'.format(
                epoch, epochs_since_improvement))
        else:
            epochs_since_improvement = 0

        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(epoch, encoder_optimizer, 0.8)
            adjust_learning_rate(epoch, decoder_optimizer, 0.8)

        if epoch % 5 == 0:
            loss_vl, vMape_vl, vRmse_vl, vCorr_vl, aCorr_vl = validate(
                validLoader=validLoader,
                encoder=encoder,
                decoder=decoder,
                criterion=criterion)
            writer.add_scalars('Loss', {'vl': loss_vl}, epoch)
            writer.add_scalars('MAPE', {'vl': vMape_vl}, epoch)
            writer.add_scalars('RMSE', {'vl': vRmse_vl}, epoch)
            writer.add_scalars('vCorr', {'vl': vCorr_vl}, epoch)
            writer.add_scalars('aCorr', {'vl': aCorr_vl}, epoch)

        if epoch % 10 == 0:
            save_checkpoint(chptFolderPath, encoder, decoder, epoch, cMean_tr,
                            cStd_tr, vMean_tr, vStd_tr, aMean_tr, aStd_tr,
                            curvatureLength, historyLength)
    writer.close()
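adjust_learning_rate is an assumed helper; the usual implementation multiplies every parameter group's learning rate by a shrink factor:

def adjust_learning_rate(epoch, optimizer, shrink_factor):
    # Scale the learning rate of every parameter group in place.
    for param_group in optimizer.param_groups:
        param_group['lr'] *= shrink_factor
    print('Epoch {}: learning rate set to {:.6f}'.format(
        epoch, optimizer.param_groups[0]['lr']))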
Exemple #30
0
def main():
    epoch = 1000
    batch_size = 256
    hidden_dim = 128

    encoder = Encoder(num_words,
                      hidden_dim,
                      n_layers=args.n_layers,
                      bidirectional=args.bidirectional).to(device)
    if args.attn:
        decoder = AttnDecoder(hidden_dim,
                              num_words,
                              max_seqlen,
                              n_layers=args.n_layers).to(device)
    else:
        decoder = Decoder(hidden_dim, num_words,
                          n_layers=args.n_layers).to(device)

    if args.train:
        weight = torch.ones(num_words)
        weight[word2idx[PAD_TOKEN]] = 0
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        weight = weight.to(device)
        encoder_optimizer = Adam(encoder.parameters(), lr=0.001)
        decoder_optimizer = Adam(decoder.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss(ignore_index=word2idx[PAD_TOKEN])

        np.random.seed(1124)
        order = np.arange(len(train_X))

        best_loss = 1e10
        best_percentage = 0
        best_percentage_epoch = 0
        best_epoch = 0
        start_epoch = 0
        if args.resume:
            start_epoch, best_loss = load_checkpoint(args.model_path, encoder,
                                                     encoder_optimizer,
                                                     decoder,
                                                     decoder_optimizer)

        for e in range(start_epoch, start_epoch + epoch):
            # early stopping: no hit-rate improvement for more than 2 epochs
            if e - best_percentage_epoch > 2:
                break

            np.random.shuffle(order)
            shuffled_train_X = train_X[order]
            shuffled_train_Y = train_Y[order]
            train_loss = 0
            valid_loss = 0

            for b in tqdm(range(int(len(order) // batch_size))):
                batch_x = torch.LongTensor(
                    shuffled_train_X[b * batch_size:(b + 1) *
                                     batch_size].tolist()).t()
                batch_y = torch.LongTensor(
                    shuffled_train_Y[b * batch_size:(b + 1) *
                                     batch_size].tolist()).t()

                batch_x, batch_y = batch_x.to(device), batch_y.to(device)

                train_loss += train(batch_x, batch_y, encoder, decoder,
                                    encoder_optimizer, decoder_optimizer,
                                    criterion)

            train_loss /= (b + 1)  # b is the last batch index, so the batch count is b + 1

            all_control_cnt, all_hit_cnt = [], []
            for b in range(len(valid_X) // batch_size):
                batch_x = torch.LongTensor(valid_X[b * batch_size:(b + 1) *
                                                   batch_size].tolist()).t()
                batch_y = torch.LongTensor(valid_Y[b * batch_size:(b + 1) *
                                                   batch_size].tolist()).t()
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)

                val_loss, control_cnt, hit_cnt = valid(batch_x, batch_y,
                                                       encoder, decoder,
                                                       encoder_optimizer,
                                                       decoder_optimizer,
                                                       criterion)
                valid_loss += val_loss
                all_control_cnt.extend(control_cnt)
                all_hit_cnt.extend(hit_cnt)
            valid_loss /= (b + 1)
            all_control_cnt = np.array(all_control_cnt)
            all_hit_cnt = np.array(all_hit_cnt)
            nonzero = all_control_cnt != 0
            all_control_cnt = all_control_cnt[nonzero]
            all_hit_cnt = all_hit_cnt[nonzero]
            percentage = np.mean(all_hit_cnt / all_control_cnt)
            logger.info(
                "epoch {}, train_loss {:.4f}, valid_loss {:.4f}, best_epoch {}, best_loss {:.4f}, control_cnt {}, hit_cnt {}, percentage {:.4f}"
                .format(e, train_loss, valid_loss, best_epoch, best_loss,
                        np.sum(all_control_cnt), np.sum(all_hit_cnt),
                        percentage))

            if percentage > best_percentage:
                best_percentage = percentage
                best_percentage_epoch = e
                torch.save(
                    {
                        'encoder_state_dict': encoder.state_dict(),
                        'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                        'decoder_state_dict': decoder.state_dict(),
                        'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                        'epoch': e,
                        'loss': valid_loss,
                        'percentage': best_percentage,
                    }, args.model_path)

            if valid_loss < best_loss:
                best_loss = valid_loss
                best_epoch = e
                torch.save(
                    {
                        'encoder_state_dict': encoder.state_dict(),
                        'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                        'decoder_state_dict': decoder.state_dict(),
                        'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
                        'epoch': e,
                        'loss': valid_loss,
                    }, args.model_path)

        batch_x = torch.LongTensor(valid_X[:batch_size].tolist()).t()
        batch_y = torch.LongTensor(valid_Y[:batch_size].tolist()).t()
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        input_chinese, output_chinese = predict(batch_x, batch_y, encoder,
                                                decoder, encoder_optimizer,
                                                decoder_optimizer, criterion,
                                                20)

        logger.info('*** Results ***')
        logger.info('Best Hit Accuracy: {}'.format(best_percentage))
        logger.info(
            'Best Hit Accuracy Epoch: {}'.format(best_percentage_epoch))
        for inp, out in zip(input_chinese, output_chinese):
            logger.info('{}\t||\t{}'.format(inp, out))
        logger.info(encoder)
        logger.info(decoder)
        logger.info('\n\n' + '=' * 100 + '\n\n')

    else:
        print(encoder)
        print(decoder)
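load_checkpoint above is assumed to mirror the dictionaries written by torch.save later in the loop; a sketch under that assumption, resuming from the epoch after the saved one:

import torch

def load_checkpoint(path, encoder, encoder_optimizer, decoder, decoder_optimizer):
    # Restore the state dicts saved under the keys used in this example.
    cp = torch.load(path, map_location='cpu')
    encoder.load_state_dict(cp['encoder_state_dict'])
    encoder_optimizer.load_state_dict(cp['encoder_optimizer_state_dict'])
    decoder.load_state_dict(cp['decoder_state_dict'])
    decoder_optimizer.load_state_dict(cp['decoder_optimizer_state_dict'])
    return cp['epoch'] + 1, cp['loss']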