Example no. 1
class Tester(object):
    def __init__(self, config):

        self.content_images = glob.glob((config.exp_content_dir + '/*/*.jpg')) #+ '_resized/*'))

    
        self.encoder = Encoder().cuda()
        self.decoder = Decoder()
        self.keyencoder = KeyEncoder().cuda()

        self.decoder.load_state_dict(torch.load('./decoder.pth'))
        self.decoder = self.decoder.cuda()
        self.keyencoder.load_state_dict(torch.load('./key.pth'))
        self.keyencoder = self.keyencoder.cuda()

        if config.attention == 'soft':
            self.AsyAtt = AsyAtt()
        else:
            self.AsyAtt = AsyAttHard()


        S_path = os.path.join(config.style_dir, str(config.S))
        style_images = glob.glob((S_path + '/*.jpg'))
        s = Image.open(style_images[0])
        s = trans(s).cuda()
        self.style_image = s.unsqueeze(0)
        self.style_target = torch.stack([s for i in range(config.batch_size)],0)
    

    def test(self):

        self.encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            style_val = self.encoder(self.style_image)
            style_key = self.keyencoder(style_val)

            for filename in self.content_images:
                name = str(filename).split("test_images")[-1][1:].replace("\\", "-")
                name = name.replace("/", "-")

                
                c = Image.open(filename)
                c_tensor = trans(c).unsqueeze(0).cuda()
                val = self.encoder(c_tensor)
                key = self.keyencoder(val)

                content_feature = self.AsyAtt(style_key[0], style_val[0], key, val)
                out = self.decoder(content_feature)

                out = denorm(out).to('cpu')[0]
                c_tensor = denorm(c_tensor).to('cpu')[0]

                # zero-pad the shorter tensor along the height dimension so the two images can be concatenated side by side
                if out.shape[1] > c_tensor.shape[1]:
                    c_tensor = torch.cat([c_tensor, torch.zeros([c_tensor.shape[0], out.shape[1] - c_tensor.shape[1], c_tensor.shape[2]])], 1)
                elif out.shape[1] < c_tensor.shape[1]:
                    out = torch.cat([out, torch.zeros([out.shape[0], c_tensor.shape[1] - out.shape[1], out.shape[2]])], 1)

                save_image(torch.cat([out, c_tensor], 2), os.path.join('./logs/test', name))
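For context, a minimal hypothetical driver for the Tester class above; the argparse fields mirror the config attributes the class reads (exp_content_dir, style_dir, S, batch_size, attention), while the default values and the __main__ wrapper are assumptions, not part of the original example.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_content_dir', default='./test_images')  # assumed layout
    parser.add_argument('--style_dir', default='./style')              # assumed layout
    parser.add_argument('--S', type=int, default=0)                    # style folder index
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--attention', default='soft', choices=['soft', 'hard'])
    config = parser.parse_args()

    Tester(config).test()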
Example no. 2
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(
            wav_path.with_suffix(".wav"),
            sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1  # rescale the log-mel roughly to [0, 1]

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32), sr=cfg.preprocessing.sr)
Example no. 3
def main():
    args = check_argv()

    # Code indices
    code_indices_fn = Path(args.code_indices_fn)
    print("Reading: {}".format(code_indices_fn))
    code_indices = np.loadtxt(code_indices_fn, dtype=int)

    # Speakers
    with open(Path("datasets/2019/english/speakers.json")) as f:
        speakers = sorted(json.load(f))

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(in_channels=80,
                      channels=768,
                      n_embeddings=512,
                      embedding_dim=64,
                      jitter=0.5)
    decoder = Decoder(
        in_channels=64,
        conditioning_channels=128,
        n_speakers=102,
        speaker_embedding_dim=64,
        mu_embedding_dim=256,
        rnn_channels=896,
        fc_channels=256,
        bits=8,
        hop_length=160,
    )
    decoder.to(device)

    print("Reading: {}".format(args.checkpoint))
    checkpoint_path = args.checkpoint
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()

    # Codes
    embedding = encoder.codebook.embedding.cpu().numpy()
    codes = np.array([embedding[code_indices]])

    # Synthesize
    z = torch.FloatTensor(codes).to(device)
    speaker = torch.LongTensor([speakers.index(args.speaker)]).to(device)
    with torch.no_grad():
        output = decoder.generate(z, speaker)

    wav_fn = Path(code_indices_fn.stem).with_suffix(".wav")
    print("Writing: {}".format(wav_fn))
    librosa.output.write_wav(wav_fn, output.astype(np.float32), sr=16000)
Example no. 4
def sample(load_dir: str, save_dir: str, use_gpu: bool) -> None:
    '''
    Sample the FantasyMapGAN for new maps.
    Saves the generated images to `save_dir`.

    Parameters
    ----------
    load_dir: str
        folder to load network weights from
    save_dir: str
        folder to save the generated images to
    use_gpu: bool
        Set to True to run sampling on the GPU, otherwise run on the CPU
    '''
    # Network
    model = Decoder()
    model = model.eval()
    if use_gpu:
        model = model.cuda()
    if load_dir:
        fs = glob(os.path.join(load_dir, '*_dec.pth'))
        fs.sort(key=os.path.getmtime)
        model.load_state_dict(torch.load(fs[-1]))
    
    # Generate
    with torch.no_grad():
        z = torch.randn((1, model.latent_dim))
        if use_gpu:
            z = z.cuda()
        x = model(z)

    # Save
    save_path = os.path.join(save_dir, str(uuid.uuid1()) + '.png')
    save_image(x.squeeze(), save_path)
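A minimal way to exercise sample() might look like the following; the directory names and the direct call (instead of whatever CLI the project actually uses) are assumptions for illustration only.

import os

if __name__ == '__main__':
    os.makedirs('samples', exist_ok=True)
    # picks up the newest '*_dec.pth' checkpoint in 'checkpoints/' and writes one PNG into 'samples/'
    sample(load_dir='checkpoints', save_dir='samples', use_gpu=False)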
Example no. 5
def initialize_for_test(params):
    data_loader = get_loader(params, mode='test')
    encoder_file = os.path.join(params.encoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    decoder_file = os.path.join(params.decoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = Encoder(params)
    decoder = Decoder(params, vocab_size)
    encoder.eval()
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))
    encoder.to(params.device)
    decoder.to(params.device)
    return data_loader, encoder, decoder
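A sketch of how the returned triple might be consumed; the params fields and the decoder.sample() greedy-decoding call are assumptions inferred from the neighbouring examples, not guaranteed by this snippet.

import torch

def run_inference(params):
    data_loader, encoder, decoder = initialize_for_test(params)
    with torch.no_grad():
        for images, captions, lengths in data_loader:
            features = encoder(images.to(params.device))
            sampled_ids = decoder.sample(features)  # assumed greedy decoding helper
            yield sampled_ids.cpu()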
Example no. 6
def main(test_img_path):
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for checkpoint_path in options.checkpoint:
        checkpoint_name, _ = os.path.splitext(
            os.path.basename(checkpoint_path))
        checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                      if checkpoint_path else default_checkpoint)
        encoder_checkpoint = checkpoint["model"].get("encoder")
        decoder_checkpoint = checkpoint["model"].get("decoder")

        test_img = Image.open(test_img_path)
        test_img = test_img.convert("RGB")

        enc = Encoder(img_channels=3, checkpoint=encoder_checkpoint).to(device)
        dec = Decoder(
            1,
            low_res_shape,
            high_res_shape,
            checkpoint=decoder_checkpoint,
            device=device,
        ).to(device)
        enc.eval()
        dec.eval()

        result = evaluate(
            enc,
            dec,
            test_img=test_img,
            device=device,
            checkpoint=checkpoint,
            beam_width=options.beam_width,
            prefix=options.prefix,
        )
        print(result)
Example no. 7
def main():
    data_set = F2EDataSet(max_length=max_seq_len)
    loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    encoder = Encoder(data_set.in_lang.token_n,
                      embed_size=embed_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      drop_prob=drop_prob).to(device)
    decoder = Decoder(vocab_size=data_set.out_lang.token_n,
                      embed_size=embed_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      attention_size=attention_size,
                      drop_prob=drop_prob).to(device)
    enc_optimizer = optim.Adam(encoder.parameters(), lr=lr)
    dec_optimizer = optim.Adam(decoder.parameters(), lr=lr)
    criteon = nn.CrossEntropyLoss(reduction='none').to(device)
    random_sample_sentences = data_set.random_sample(k=random_sample_k)
    sample_in_indices = []
    for in_sentence, out_sentence in random_sample_sentences:
        sample_in_indices.append(
            data_set.convert_token_to_index(data_set.in_lang, in_sentence))
    # sample_in_indices: shape[random_sample_k, max_len], dtype: int64
    sample_in_indices = torch.LongTensor(sample_in_indices).to(device)
    # sample_in_indices: [random_sample_k, 1, max_len]
    sample_in_indices = torch.unsqueeze(sample_in_indices, dim=1)
    for epoch in range(num_epochs):
        total_loss = 0
        encoder.train()
        decoder.train()
        for batch_idx, (in_seq, out_seq) in enumerate(loader):
            this_batch_size = in_seq.shape[0]
            # in_seq, out_seq shape: [batch_size, max_len], dtype = int64
            in_seq, out_seq = in_seq.to(device), out_seq.to(device)
            # enc_outputs of shape (seq_len, batch, num_directions * hidden_size)
            # enc_hidden of shape (num_layers * num_directions, batch, hidden_size)
            enc_outputs, enc_hidden = encoder(
                in_seq, encoder.init_hidden(this_batch_size, device=device))
            # the decoder's input at the first time step is BOS
            # dec_input: [batch_size, 1]
            dec_input = decoder.init_input(this_batch_size, device=device)
            # initialize hidden state of decoder
            # dec_hidden: [num_layers, batch_size, hidden_size]
            dec_hidden = decoder.init_hidden(enc_hidden)
            # mask [batch_size]
            mask = torch.ones(this_batch_size, device=device)
            eos = torch.LongTensor([2] * this_batch_size).to(device)
            pad = torch.zeros(this_batch_size).to(device)
            num_not_pad_tokens = 0
            loss = 0
            for y in torch.transpose(out_seq, 0, 1):
                dec_output, dec_hidden = decoder(dec_input, dec_hidden,
                                                 enc_outputs)
                loss += torch.sum((criteon(dec_output, y) * mask), dim=0)
                # y: [batch_size] => [batch_size, 1]
                dec_input = torch.unsqueeze(y, dim=1)
                num_not_pad_tokens += torch.sum(mask, dim=0)
                # once EOS is reached, the remaining tokens are all PAD, so the mask at those positions is set to 0
                mask = torch.where(y != eos, mask, pad)
            loss /= num_not_pad_tokens
            total_loss += loss.item()
            enc_optimizer.zero_grad()
            dec_optimizer.zero_grad()
            loss.backward()
            enc_optimizer.step()
            dec_optimizer.step()
        decoder.eval()
        encoder.eval()
        print(f"epoch {epoch+1}, loss = {total_loss/data_set.__len__()}")
        if epoch % 10 == 0:
            translate(data_set, random_sample_sentences, sample_in_indices,
                      encoder, decoder, device)
    translate(data_set, random_sample_sentences, sample_in_indices, encoder,
              decoder, device)
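The per-time-step masking above (zeroing the loss contribution once a sequence has emitted EOS) is easier to see in isolation; below is a minimal, self-contained sketch of the same trick with a hypothetical batch of two target sequences, using the same EOS id (2) as the loop above. It is only an illustration, not part of the training code.

import torch

out_seq = torch.LongTensor([[5, 2, 0],
                            [7, 8, 2]])       # two target sequences; trailing 0s are padding
batch_size = out_seq.shape[0]
mask = torch.ones(batch_size)
eos = torch.LongTensor([2] * batch_size)
pad = torch.zeros(batch_size)
num_not_pad_tokens = 0
for y in torch.transpose(out_seq, 0, 1):      # iterate over time steps
    step_loss = torch.ones(batch_size)        # stand-in for criteon(dec_output, y)
    masked_loss = torch.sum(step_loss * mask) # padded positions contribute nothing
    num_not_pad_tokens += torch.sum(mask)
    mask = torch.where(y != eos, mask, pad)   # once EOS is seen, mask that sequence from now on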
Example no. 8
def main(args):

    #create a writer
    writer = SummaryWriter('loss_plot_' + args.mode, comment='test')
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    val_length = len(os.listdir(args.image_dir_val))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    data_loader_val = get_loader(args.image_dir_val,
                                 args.caption_path_val,
                                 vocab,
                                 transform,
                                 args.batch_size,
                                 shuffle=True,
                                 num_workers=args.num_workers)

    # Build the model
    # if no-attention model is chosen:
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)
        criterion = nn.CrossEntropyLoss()

    # if attention model is chosen:
    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)

    # if transformer model is chosen:
    elif args.model_type == 'transformer':
        model = Transformer(len(vocab), args.embed_size,
                            args.transformer_layers, 8,
                            args.dropout_ratio).to(device)

        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.encoder.parameters()),
                                             lr=args.learning_rate_enc)
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.decoder.parameters()),
                                             lr=args.learning_rate_dec)
        criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])

    else:
        print('Select model_type: attention, no_attention or transformer')

    # if model is not transformer: additional step in encoder is needed: freeze lower layers of resnet if args.fine_tune == True
    if args.model_type != 'transformer':
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()),
                                             lr=args.learning_rate_dec)
        encoder.fine_tune(args.fine_tune)
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, encoder.parameters()),
                                             lr=args.learning_rate_enc)

    # initialize lists to store results:
    loss_train = []
    loss_val = []
    loss_val_epoch = []
    loss_train_epoch = []

    bleu_res_list = []
    cider_res_list = []
    rouge_res_list = []

    results = {}

    # calculate total steps for train and validation
    total_step = len(data_loader)
    total_step_val = len(data_loader_val)

    #For each epoch
    for epoch in tqdm(range(args.num_epochs)):

        loss_val_iter = []
        loss_train_iter = []

        # set model to train mode
        if args.model_type != 'transformer':
            encoder.train()
            decoder.train()
        else:
            model.train()

        # for each entry in data_loader
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader)):
            # load images and captions to device
            images = images.to(device)
            captions = captions.to(device)
            # Forward, backward and optimize

            # the forward and backward pass differs depending on the model type:
            if args.model_type == 'no_attention':
                # get features from encoder
                features = encoder(images)
                # pack the padded target captions
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # get output from decoder
                outputs = decoder(features, captions, lengths)
                # calculate loss
                loss = criterion(outputs, targets)

                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'attention':

                # get features from encoder
                features = encoder(images)

                # get targets - starting from the 2nd word in each caption
                # (the model is not sequential, so targets are predicted in parallel; no need to predict the first word)

                targets = captions[:, 1:]
                # decode length = length-1 for each caption
                decode_lengths = [length - 1 for length in lengths]
                #flatten targets
                targets = targets.reshape(targets.shape[0] * targets.shape[1])

                sampled_caption = []

                # get scores and alphas from decoder
                scores, alphas = decoder(features, captions, decode_lengths)

                scores = scores.view(-1, scores.shape[-1])

                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)

                # calculate loss
                loss = decoder.loss(scores, targets, alphas)

                # optimizer and backward step
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            elif args.model_type == 'transformer':

                # input is captions without last word
                trg_input = captions[:, :-1]
                # create mask
                trg_mask = create_masks(trg_input)

                # get scores from model
                scores = model(images, trg_input, trg_mask)
                scores = scores.view(-1, scores.shape[-1])

                # get targets - starting from the 2nd word in captions
                targets = captions[:, 1:]

                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)

                # calculate loss
                loss = criterion(
                    scores,
                    targets.reshape(targets.shape[0] * targets.shape[1]))

                #forward and backward path
                decoder_optimizer.zero_grad()
                encoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()

            else:
                print('Select model_type: attention, no_attention or transformer')

            # append results to loss lists and writer
            loss_train_iter.append(loss.item())
            loss_train.append(loss.item())
            writer.add_scalar('Loss/train/iterations', loss.item(), i + 1)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'.
              format(epoch, args.num_epochs, i, total_step, loss.item(),
                     np.exp(loss.item())))

        #append mean of last 10 batches as approximate epoch loss
        loss_train_epoch.append(np.mean(loss_train_iter[-10:]))

        writer.add_scalar('Loss/train/epoch', np.mean(loss_train_iter[-10:]),
                          epoch + 1)

        #save model
        if args.model_type != 'transformer':
            torch.save(
                decoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'decoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
            torch.save(
                encoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'encoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))

        else:
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.model_path,
                    'model_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        np.save(
            os.path.join(args.predict_json,
                         'loss_train_temp_' + args.mode + '.npy'), loss_train)

        #validate model:
        # set model to eval mode:
        if args.model_type != 'transformer':
            encoder.eval()
            decoder.eval()
        else:
            model.eval()
        total_step = len(data_loader_val)

        # set no_grad mode:
        with torch.no_grad():
            # for each entry in data_loader
            for i, (images, captions,
                    lengths) in tqdm(enumerate(data_loader_val)):
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                images = images.to(device)
                captions = captions.to(device)

                # the forward and backward pass differs depending on the model type:
                if args.model_type == 'no_attention':
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)
                    loss = criterion(outputs, targets)

                elif args.model_type == 'attention':

                    features = encoder(images)
                    sampled_caption = []
                    targets = captions[:, 1:]
                    decode_lengths = [length - 1 for length in lengths]
                    targets = targets.reshape(targets.shape[0] *
                                              targets.shape[1])

                    scores, alphas = decoder(features, captions,
                                             decode_lengths)

                    _, predicted = torch.max(scores, dim=1)

                    scores = scores.view(-1, scores.shape[-1])

                    sampled_caption = []

                    loss = decoder.loss(scores, targets, alphas)

                elif args.model_type == 'transformer':

                    trg_input = captions[:, :-1]
                    trg_mask = create_masks(trg_input)
                    scores = model(images, trg_input, trg_mask)
                    scores = scores.view(-1, scores.shape[-1])
                    targets = captions[:, 1:]

                    _, predicted = torch.max(scores, dim=1)

                    loss = criterion(
                        scores,
                        targets.reshape(targets.shape[0] * targets.shape[1]))

                #display results
                if i % args.log_step == 0:
                    print(
                        'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
                        .format(epoch, args.num_epochs, i, total_step_val,
                                loss.item(), np.exp(loss.item())))

                # append results to loss lists and writer
                loss_val.append(loss.item())
                loss_val_iter.append(loss.item())

                writer.add_scalar('Loss/validation/iterations', loss.item(),
                                  i + 1)

        np.save(
            os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)

        print(
            'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
            .format(epoch, args.num_epochs, i, total_step_val, loss.item(),
                    np.exp(loss.item())))

        # results: epoch validation loss

        loss_val_epoch.append(np.mean(loss_val_iter))
        writer.add_scalar('Loss/validation/epoch', np.mean(loss_val_iter),
                          epoch + 1)

        #predict captions:
        filenames = os.listdir(args.image_dir_val)

        predicted = {}

        for file in tqdm(filenames):
            if file == '.DS_Store':
                continue
            # Prepare an image
            image = load_image(os.path.join(args.image_dir_val, file),
                               transform)
            image_tensor = image.to(device)

            # Generate caption starting with <start> word

            # procedure is different for each model type
            if args.model_type == 'attention':

                features = encoder(image_tensor)
                sampled_ids, _ = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                #start sampled_caption with <start>
                sampled_caption = ['<start>']

            elif args.model_type == 'no_attention':
                features = encoder(image_tensor)
                sampled_ids = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                sampled_caption = ['<start>']

            elif args.model_type == 'transformer':

                e_outputs = model.encoder(image_tensor)
                max_seq_length = 20
                sampled_ids = torch.zeros(max_seq_length, dtype=torch.long)
                sampled_ids[0] = torch.LongTensor([[vocab.word2idx['<start>']]
                                                   ]).to(device)

                for i in range(1, max_seq_length):

                    trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
                    trg_mask = Variable(
                        torch.from_numpy(trg_mask) == 0).to(device)

                    out = model.decoder(sampled_ids[:i].unsqueeze(0),
                                        e_outputs, trg_mask)

                    out = model.out(out)
                    out = F.softmax(out, dim=-1)
                    val, ix = out[:, -1].data.topk(1)
                    sampled_ids[i] = ix[0][0]

                sampled_ids = sampled_ids.cpu().numpy()
                sampled_caption = []

            # Convert word_ids to words
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                # break at <end> of the sentence
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            predicted[file] = sentence

        # save predictions to json file:
        json.dump(
            predicted,
            open(
                os.path.join(
                    args.predict_json,
                    'predicted_' + args.mode + '_' + str(epoch) + '.json'),
                'w'))

        #validate model
        with open(args.caption_path_val, 'r') as file:
            captions = json.load(file)

        res = {}
        for r in predicted:
            res[r] = [predicted[r].replace('<start> ', '').replace(' <end>', '')]

        images = captions['images']
        caps = captions['annotations']
        gts = {}
        for image in images:
            image_id = image['id']
            file_name = image['file_name']
            list_cap = []
            for cap in caps:
                if cap['image_id'] == image_id:
                    list_cap.append(cap['caption'])
            gts[file_name] = list_cap

        # calculate BLEU, CIDEr and ROUGE metrics from the reference and generated captions
        bleu_res = bleu(gts, res)
        cider_res = cider(gts, res)
        rouge_res = rouge(gts, res)

        # append results to result lists
        bleu_res_list.append(bleu_res)
        cider_res_list.append(cider_res)
        rouge_res_list.append(rouge_res)

        # write results to writer
        writer.add_scalar('BLEU1/validation/epoch', bleu_res[0], epoch + 1)
        writer.add_scalar('BLEU2/validation/epoch', bleu_res[1], epoch + 1)
        writer.add_scalar('BLEU3/validation/epoch', bleu_res[2], epoch + 1)
        writer.add_scalar('BLEU4/validation/epoch', bleu_res[3], epoch + 1)
        writer.add_scalar('CIDEr/validation/epoch', cider_res, epoch + 1)
        writer.add_scalar('ROUGE/validation/epoch', rouge_res, epoch + 1)

    results['bleu'] = bleu_res_list
    results['cider'] = cider_res_list
    results['rouge'] = rouge_res_list

    json.dump(
        results,
        open(os.path.join(args.predict_json, 'results_' + args.mode + '.json'),
             'w'))
    np.save(
        os.path.join(args.predict_json, 'loss_train_' + args.mode + '.npy'),
        loss_train)
    np.save(os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)
Example no. 9
def train_dynamics(env, args, writer=None):
    """
    Trains the Dynamics module. Supervised.

    Arguments:
    env: the initialized environment (rllab/gym)
    args: input arguments
    writer: initialized summary writer for tensorboard
    """
    args.action_space = env.action_space

    # Initialize models
    enc = Encoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)

    if args.from_checkpoint is not None:
        results_dict = torch.load(args.from_checkpoint)
        enc.load_state_dict(results_dict['enc'])
        dec.load_state_dict(results_dict['dec'])
        d_module.load_state_dict(results_dict['d_module'])

    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters())

    if args.transfer:
        for p in enc.parameters():
            p.requires_grad = False

        for p in dec.parameters():
            p.requires_grad = False
        all_params = d_module.parameters()

    optimizer = torch.optim.Adam(all_params,
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    if args.gpu:
        enc = enc.cuda()
        dec = dec.cuda()
        d_module = d_module.cuda()

    # Initialize datasets
    val_loader = None
    train_dataset = DynamicsDataset(args.train_set,
                                    args.train_size,
                                    batch=args.train_batch,
                                    rollout=args.rollout)
    val_dataset = DynamicsDataset(args.test_set,
                                  5000,
                                  batch=args.test_batch,
                                  rollout=args.rollout)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    results_dict = {
        'dec_losses': [],
        'forward_losses': [],
        'inverse_losses': [],
        'total_losses': [],
        'enc': None,
        'dec': None,
        'd_module': None,
        'd_init': None,
        'args': args
    }

    total_action_taken = 0
    correct_predicted_a_hat = 0

    # create the mask here for re-weighting
    dec_mask = None
    if args.dec_mask is not None:
        dec_mask = torch.ones(9)
        game_vocab = dict([
            (b, a)
            for a, b in enumerate(sorted(env.game.all_possible_features()))
        ])
        dec_mask[game_vocab['Agent']] = args.dec_mask
        dec_mask[game_vocab['Goal']] = args.dec_mask
        dec_mask = dec_mask.expand(args.batch_size, args.maze_length,
                                   args.maze_length, 9).contiguous().view(-1)
        dec_mask = Variable(dec_mask, requires_grad=False)
        if args.gpu:
            dec_mask = dec_mask.cuda()

    for epoch in range(1, args.num_epochs + 1):
        enc.train()
        dec.train()
        d_module.train()

        if args.framework == "mazebase":
            d_init.train()

        # for measuring the accuracy
        train_acc = 0
        current_epoch_actions = 0
        current_epoch_predicted_a_hat = 0

        start = time.time()
        for i, (states, target_actions) in enumerate(train_loader):

            optimizer.zero_grad()

            if args.framework != "mazebase":
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
            else:
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, current_epoch_predicted_a_hat, current_epoch_actions = multiple_forward(
                    i, states, target_actions, enc, dec, d_module, args,
                    d_init, dec_mask)

            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                        args.dec_loss_coef * dec_loss

            if i % args.log_interval == 0:
                log(
                    'Epoch [{}/{}]\tIter [{}/{}]\t'.format(
                        epoch, args.num_epochs, i+1, len(
                        train_dataset)//args.batch_size) + \
                    'Time: {:.2f}\t'.format(time.time() - start) + \
                    'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0]) + \
                    'Forward Loss: {:.2f}\t'.format(forward_loss.data[0] ) + \
                    'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0]) + \
                    'Loss: {:.2f}\t'.format(loss.data[0]))

                results_dict['dec_losses'].append(dec_loss.data[0])
                results_dict['forward_losses'].append(forward_loss.data[0])
                results_dict['inverse_losses'].append(inv_loss.data[0])
                results_dict['total_losses'].append(loss.data[0])

                # write the summaries here
                if writer:
                    writer.add_scalar('dynamics/total_loss', loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/decoder', dec_loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/reconstruction_loss',
                                      recon_loss.data[0], epoch)
                    writer.add_scalar('dynamics/next_state_prediction_loss',
                                      model_loss.data[0], epoch)
                    writer.add_scalar('dynamics/inv_loss', inv_loss.data[0],
                                      epoch)
                    writer.add_scalar('dynamics/forward_loss',
                                      forward_loss.data[0], epoch)

                    writer.add_scalars(
                        'dynamics/all_losses', {
                            "total_loss": loss.data[0],
                            "reconstruction_loss": recon_loss.data[0],
                            "next_state_prediction_loss": model_loss.data[0],
                            "decoder_loss": dec_loss.data[0],
                            "inv_loss": inv_loss.data[0],
                            "forward_loss": forward_loss.data[0],
                        }, epoch)

            loss.backward()

            correct_predicted_a_hat += current_epoch_predicted_a_hat
            total_action_taken += current_epoch_actions

            # does it not work at all without grad clipping ?
            torch.nn.utils.clip_grad_norm(all_params, args.max_grad_norm)
            optimizer.step()

            # maybe add the generated image to the logs
            # writer.add_image()

        # Run validation
        if val_loader is not None:
            enc.eval()
            dec.eval()
            d_module.eval()
            forward_loss, inv_loss, dec_loss = 0, 0, 0
            for i, (states, target_actions) in enumerate(val_loader):
                f_loss, i_loss, d_loss, _, _, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
                forward_loss += f_loss
                inv_loss += i_loss
                dec_loss += d_loss
            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                    args.dec_loss_coef * dec_loss
            if writer:
                writer.add_scalar('val/forward_loss', forward_loss.data[0] / i,
                                  epoch)
                writer.add_scalar('val/inverse_loss', inv_loss.data[0] / i,
                                  epoch)
                writer.add_scalar('val/decoder_loss', dec_loss.data[0] / i,
                                  epoch)
            log(
                '[Validation]\t' + \
                'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0] / i) + \
                'Forward Loss: {:.2f}\t'.format(forward_loss.data[0] / i) + \
                'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0] / i) + \
                'Loss: {:.2f}\t'.format(loss.data[0] / i))
        if epoch % args.checkpoint == 0:
            results_dict['enc'] = enc.state_dict()
            results_dict['dec'] = dec.state_dict()
            results_dict['d_module'] = d_module.state_dict()
            if args.framework == "mazebase":
                results_dict['d_init'] = d_init.state_dict()
            torch.save(
                results_dict,
                os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
            log('Saved model %s' % epoch)

    results_dict['enc'] = enc.state_dict()
    results_dict['dec'] = dec.state_dict()
    results_dict['d_module'] = d_module.state_dict()
    torch.save(results_dict,
               os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
    print(os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
Example no. 10
def train(config):
    train_config = config['train']

    global device
    device = train_config['device']
    if not torch.cuda.is_available(): device = 'cpu'
    tqdm.write('Training on {}'.format(device))
    writer = SummaryWriter('log')

    train_dataset, test_dataset = create_datasets(**config['dataset'])

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_config['batch_size'],
                                  shuffle=True,
                                  collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=train_config['batch_size'],
                                 shuffle=False,
                                 collate_fn=collate_fn)

    encoder = Encoder(vocab_size=len(train_dataset.lang1),
                      **config['encoder'],
                      device=device).to(device)
    decoder = Decoder(vocab_size=len(train_dataset.lang2),
                      **config['decoder']).to(device)

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=train_config['lr'])
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=train_config['lr'])

    criterion = nn.NLLLoss()

    tqdm.write('[-] Start training! ')
    epoch_bar = tqdm(range(train_config['n_epochs']),
                     desc='[Total progress]',
                     leave=True,
                     position=0,
                     dynamic_ncols=True)
    for epoch in epoch_bar:
        batch_bar = tqdm(range(len(train_dataloader)),
                         desc='[Train epoch {:2}]'.format(epoch),
                         leave=True,
                         position=0,
                         dynamic_ncols=True)
        encoder.train()
        decoder.train()
        train_loss = 0
        train_iter = iter(train_dataloader)  # single iterator per epoch, instead of re-creating it every step
        for batch in batch_bar:
            (source, target_bos, target_eos) = next(train_iter)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            source, target_bos, target_eos = source.to(device), target_bos.to(
                device), target_eos.to(device)
            encoder_output, encoder_hidden = encoder(source)
            decoder_output = decoder(target_bos, encoder_hidden)

            loss = criterion(decoder_output.view(-1, decoder_output.size(-1)),
                             target_eos.view(-1))
            train_loss += loss.item()
            n_hit, n_total = hitRate(decoder_output, target_eos)
            loss.backward()
            #print(loss.item())

            encoder_optimizer.step()
            decoder_optimizer.step()

            batch_bar.set_description(
                '[Train epoch {:2} | Loss: {:.2f} | Hit: {}/{}]'.format(
                    epoch, loss, n_hit, n_total))
        train_loss /= len(train_dataloader)

        batch_bar = tqdm(range(len(test_dataloader)),
                         desc='[Test epoch {:2}]'.format(epoch),
                         leave=True,
                         position=0,
                         dynamic_ncols=True)
        encoder.eval()
        decoder.eval()
        test_loss = 0
        test_iter = iter(test_dataloader)  # iterate over the whole test set, not just its first batch
        for batch in batch_bar:
            (source, target_bos, target_eos) = next(test_iter)
            source, target_bos, target_eos = source.to(device), target_bos.to(
                device), target_eos.to(device)

            with torch.no_grad():
                encoder_output, encoder_hidden = encoder(source)
                decoder_output = decoder(target_bos, encoder_hidden)
                loss = criterion(
                    decoder_output.view(-1, decoder_output.size(-1)),
                    target_eos.view(-1))
                test_loss += loss.item()
                n_hit, n_total = hitRate(decoder_output, target_eos)
                batch_bar.set_description(
                    '[Test epoch {:2} | Loss: {:.2f} | Hit: {}/{}]'.format(
                        epoch, loss, n_hit, n_total))

        test_loss /= len(test_dataloader)
        writer.add_scalars('Loss', {
            'train': train_loss,
            'test': test_loss
        }, epoch)
        sample(test_dataset, encoder, decoder)

    tqdm.write('[-] Done!')
Example no. 11
class Solver(object):
    def __init__(self, hps, data_loader, log_dir='./log/'):
        self.hps = hps
        self.data_loader = data_loader
        self.model_kept = []
        self.max_keep = 20
        self.build_model()
        self.logger = Logger(log_dir)

    def build_model(self):
        hps = self.hps
        ns = self.hps.ns
        emb_size = self.hps.emb_size
        self.Encoder = Encoder(ns=ns, dp=hps.enc_dp)
        self.Decoder = Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size)
        self.Generator = Decoder(ns=ns, c_a=hps.n_speakers, emb_size=emb_size)
        self.LatentDiscriminator = LatentDiscriminator(ns=ns, dp=hps.dis_dp)
        self.PatchDiscriminator = PatchDiscriminator(ns=ns,
                                                     n_class=hps.n_speakers)
        if torch.cuda.is_available():
            self.Encoder.cuda()
            self.Decoder.cuda()
            self.Generator.cuda()
            self.LatentDiscriminator.cuda()
            self.PatchDiscriminator.cuda()
        betas = (0.5, 0.9)
        params = list(self.Encoder.parameters()) + list(
            self.Decoder.parameters())
        self.ae_opt = optim.Adam(params, lr=self.hps.lr, betas=betas)
        self.gen_opt = optim.Adam(self.Generator.parameters(),
                                  lr=self.hps.lr,
                                  betas=betas)
        self.lat_opt = optim.Adam(self.LatentDiscriminator.parameters(),
                                  lr=self.hps.lr,
                                  betas=betas)
        self.patch_opt = optim.Adam(self.PatchDiscriminator.parameters(),
                                    lr=self.hps.lr,
                                    betas=betas)
        # decoder_opt is stepped in train(); defined here so the patch stage can update the Decoder alone
        self.decoder_opt = optim.Adam(self.Decoder.parameters(), lr=self.hps.lr, betas=betas)

    def save_model(self, model_path, iteration, enc_only=True):
        if not enc_only:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
                'latent_discriminator': self.LatentDiscriminator.state_dict(),
                'patch_discriminator': self.PatchDiscriminator.state_dict(),
            }
        else:
            all_model = {
                'encoder': self.Encoder.state_dict(),
                'decoder': self.Decoder.state_dict(),
                'generator': self.Generator.state_dict(),
            }
        new_model_path = '{}-{}'.format(model_path, iteration)
        with open(new_model_path, 'wb') as f_out:
            torch.save(all_model, f_out)
        self.model_kept.append(new_model_path)

        if len(self.model_kept) >= self.max_keep:
            os.remove(self.model_kept[0])
            self.model_kept.pop(0)

    def load_model(self, model_path, enc_only=True):
        print('load model from {}'.format(model_path))
        with open(model_path, 'rb') as f_in:
            all_model = torch.load(f_in)
            self.Encoder.load_state_dict(all_model['encoder'])
            self.Decoder.load_state_dict(all_model['decoder'])
            #self.Genrator.load_state_dict(all_model['generator'])
            if not enc_only:
                self.LatentDiscriminator.load_state_dict(
                    all_model['latent_discriminator'])
                self.PatchDiscriminator.load_state_dict(
                    all_model['patch_discriminator'])

    def set_eval(self):
        self.Encoder.eval()
        self.Decoder.eval()
        self.Generator.eval()
        #self.LatentDiscriminator.eval()

    def test_step(self, x, c):
        self.set_eval()
        x = to_var(x).permute(0, 2, 1)
        enc = self.Encoder(x)
        x_tilde = self.Decoder(enc, c)
        return x_tilde.data.cpu().numpy()

    def permute_data(self, data):
        C = [to_var(c, requires_grad=False) for c in data[:2]]
        X = [to_var(x).permute(0, 2, 1) for x in data[2:]]
        return C, X

    def sample_c(self, size):
        c_sample = Variable(torch.multinomial(torch.ones(8),
                                              num_samples=size,
                                              replacement=True),
                            requires_grad=False)
        c_sample = c_sample.cuda() if torch.cuda.is_available() else c_sample
        return c_sample

    def cal_acc(self, logits, y_true):
        _, ind = torch.max(logits, dim=1)
        acc = torch.sum(
            (ind == y_true).type(torch.FloatTensor)) / y_true.size(0)
        return acc

    def encode_step(self, *args):
        enc_list = []
        for x in args:
            enc = self.Encoder(x)
            enc_list.append(enc)
        return tuple(enc_list)

    def decode_step(self, enc, c):
        x_tilde = self.Decoder(enc, c)
        return x_tilde

    def latent_discriminate_step(self,
                                 enc_i_t,
                                 enc_i_tk,
                                 enc_i_prime,
                                 enc_j,
                                 is_dis=True):
        same_pair = torch.cat([enc_i_t, enc_i_tk], dim=1)
        diff_pair = torch.cat([enc_i_prime, enc_j], dim=1)
        if is_dis:
            same_val = self.LatentDiscriminator(same_pair)
            diff_val = self.LatentDiscriminator(diff_pair)
            w_dis = torch.mean(same_val - diff_val)
            gp = calculate_gradients_penalty(self.LatentDiscriminator,
                                             same_pair, diff_pair)
            return w_dis, gp
        else:
            diff_val = self.LatentDiscriminator(diff_pair)
            loss_adv = -torch.mean(diff_val)
            return loss_adv

    def patch_discriminate_step(self, x, x_tilde, cal_gp=True):
        # w-distance
        D_real, real_logits = self.PatchDiscriminator(x, classify=True)
        D_fake, fake_logits = self.PatchDiscriminator(x_tilde, classify=True)
        w_dis = torch.mean(D_real - D_fake)
        if cal_gp:
            gp = calculate_gradients_penalty(self.PatchDiscriminator, x,
                                             x_tilde)
            return w_dis, real_logits, fake_logits, gp
        else:
            return w_dis, real_logits, fake_logits

    # backup
    #def classify():
    #    # aux clssify loss
    #    criterion = nn.NLLLoss()
    #    c_loss = criterion(real_logits, c) + criterion(fake_logits, c_sample)
    #    real_acc = self.cal_acc(real_logits, c)
    #    fake_acc = self.cal_acc(fake_logits, c_sample)

    def train(self, model_path, flag='train'):
        # load hyperparams
        hps = self.hps
        for iteration in range(hps.iters):
            # calculate current alpha
            if iteration + 1 < hps.lat_sched_iters and iteration >= hps.enc_pretrain_iters:
                current_alpha = hps.alpha_enc * (
                    iteration + 1 - hps.enc_pretrain_iters) / (
                        hps.lat_sched_iters - hps.enc_pretrain_iters)
            else:
                current_alpha = 0
            if iteration >= hps.enc_pretrain_iters:
                n_latent_steps = hps.n_latent_steps \
                    if iteration > hps.enc_pretrain_iters else hps.dis_pretrain_iters
                for step in range(n_latent_steps):
                    #===================== Train latent discriminator =====================#
                    data = next(self.data_loader)
                    (c_i, c_j), (x_i_t, x_i_tk, x_i_prime,
                                 x_j) = self.permute_data(data)
                    # encode
                    enc_i_t, enc_i_tk, enc_i_prime, enc_j = self.encode_step(
                        x_i_t, x_i_tk, x_i_prime, x_j)
                    # latent discriminate
                    latent_w_dis, latent_gp = self.latent_discriminate_step(
                        enc_i_t, enc_i_tk, enc_i_prime, enc_j)
                    lat_loss = -hps.alpha_dis * latent_w_dis + hps.lambda_ * latent_gp
                    reset_grad([self.LatentDiscriminator])
                    lat_loss.backward()
                    grad_clip([self.LatentDiscriminator],
                              self.hps.max_grad_norm)
                    self.lat_opt.step()
                    # print info
                    info = {
                        f'{flag}/D_latent_w_dis': latent_w_dis.data[0],
                        f'{flag}/latent_gp': latent_gp.data[0],
                    }
                    slot_value = (step, iteration + 1, hps.iters) + \
                            tuple([value for value in info.values()])
                    log = 'lat_D-%d:[%06d/%06d], w_dis=%.3f, gp=%.2f'
                    print(log % slot_value)
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration)
            # two stage training
            if iteration >= hps.patch_start_iter:
                for step in range(hps.n_patch_steps):
                    #===================== Train patch discriminator =====================#
                    data = next(self.data_loader)
                    (c_i, _), (x_i_t, _, _, _) = self.permute_data(data)
                    # encode
                    enc_i_t, = self.encode_step(x_i_t)
                    c_sample = self.sample_c(x_i_t.size(0))
                    x_tilde = self.decode_step(enc_i_t, c_i)
                    # Aux classify loss (c_loss and accuracies reconstructed from the commented-out classify() backup above; the label pairing is an assumption)
                    patch_w_dis, real_logits, fake_logits, patch_gp = \
                            self.patch_discriminate_step(x_i_t, x_tilde, cal_gp=True)
                    criterion = nn.NLLLoss()
                    c_loss = criterion(real_logits, c_i) + criterion(fake_logits, c_sample)
                    real_acc, fake_acc = self.cal_acc(real_logits, c_i), self.cal_acc(fake_logits, c_sample)
                    patch_loss = -hps.beta_dis * patch_w_dis + hps.lambda_ * patch_gp + hps.beta_clf * c_loss
                    reset_grad([self.PatchDiscriminator])
                    patch_loss.backward()
                    grad_clip([self.PatchDiscriminator],
                              self.hps.max_grad_norm)
                    self.patch_opt.step()
                    # print info
                    info = {
                        f'{flag}/D_patch_w_dis': patch_w_dis.data[0],
                        f'{flag}/patch_gp': patch_gp.data[0],
                        f'{flag}/c_loss': c_loss.data[0],
                        f'{flag}/real_acc': real_acc,
                        f'{flag}/fake_acc': fake_acc,
                    }
                    slot_value = (step, iteration + 1, hps.iters) + \
                            tuple([value for value in info.values()])
                    log = 'patch_D-%d:[%06d/%06d], w_dis=%.3f, gp=%.2f, c_loss=%.3f, real_acc=%.2f, fake_acc=%.2f'
                    print(log % slot_value)
                    for tag, value in info.items():
                        self.logger.scalar_summary(tag, value, iteration)
            #===================== Train G =====================#
            data = next(self.data_loader)
            (c_i, c_j), (x_i_t, x_i_tk, x_i_prime,
                         x_j) = self.permute_data(data)
            # encode
            enc_i_t, enc_i_tk, enc_i_prime, enc_j = self.encode_step(
                x_i_t, x_i_tk, x_i_prime, x_j)
            # decode
            x_tilde = self.decode_step(enc_i_t, c_i)
            loss_rec = torch.mean(torch.abs(x_tilde - x_i_t))
            # latent discriminate
            loss_adv = self.latent_discriminate_step(enc_i_t,
                                                     enc_i_tk,
                                                     enc_i_prime,
                                                     enc_j,
                                                     is_dis=False)
            ae_loss = loss_rec + current_alpha * loss_adv
            reset_grad([self.Encoder, self.Decoder])
            retain_graph = True if hps.n_patch_steps > 0 else False
            ae_loss.backward(retain_graph=retain_graph)
            grad_clip([self.Encoder, self.Decoder], self.hps.max_grad_norm)
            self.ae_opt.step()
            info = {
                f'{flag}/loss_rec': loss_rec.data[0],
                f'{flag}/loss_adv': loss_adv.data[0],
                f'{flag}/alpha': current_alpha,
            }
            slot_value = (iteration + 1, hps.iters) + tuple(
                [value for value in info.values()])
            log = 'G:[%06d/%06d], loss_rec=%.2f, loss_adv=%.2f, alpha=%.2e'
            print(log % slot_value)
            for tag, value in info.items():
                self.logger.scalar_summary(tag, value, iteration + 1)
            # patch discriminate
            if hps.n_patch_steps > 0 and iteration >= hps.patch_start_iter:
                c_sample = self.sample_c(x_i_t.size(0))
                x_tilde = self.decode_step(enc_i_t, c_sample)
                patch_w_dis, real_logits, fake_logits = \
                        self.patch_discriminate_step(x_i_t, x_tilde, cal_gp=False)
                # c_loss and accuracies reconstructed from the commented-out classify() backup above
                criterion = nn.NLLLoss()
                c_loss = criterion(real_logits, c_i) + criterion(fake_logits, c_sample)
                real_acc, fake_acc = self.cal_acc(real_logits, c_i), self.cal_acc(fake_logits, c_sample)
                patch_loss = hps.beta_dec * patch_w_dis + hps.beta_clf * c_loss
                reset_grad([self.Decoder])
                patch_loss.backward()
                grad_clip([self.Decoder], self.hps.max_grad_norm)
                self.decoder_opt.step()
                info = {
                    f'{flag}/G_patch_w_dis': patch_w_dis.item(),
                    f'{flag}/c_loss': c_loss.item(),
                    f'{flag}/real_acc': real_acc,
                    f'{flag}/fake_acc': fake_acc,
                }
                slot_value = (iteration + 1, hps.iters) + tuple(
                    [value for value in info.values()])
                log = 'G:[%06d/%06d]: patch_w_dis=%.2f, c_loss=%.2f, real_acc=%.2f, fake_acc=%.2f'
                print(log % slot_value)
                for tag, value in info.items():
                    self.logger.scalar_summary(tag, value, iteration + 1)
            if iteration % 1000 == 0 or iteration + 1 == hps.iters:
                self.save_model(model_path, iteration)
Esempio n. 12
0
File: test.py Progetto: shauray8/CTW
import torch
from torchvision import transforms, datasets, models
from torchvision.utils import save_image
from model import Encoder, Decoder

input_nc = 1
output_nc = 1
enc = Encoder(input_nc, output_nc)
dec = Decoder(input_nc, output_nc)
enc.load_state_dict(torch.load("pretrained/enc.pth"))
dec.load_state_dict(torch.load("pretrained/dec.pth"))

enc.eval()
dec.eval()

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize(mean=[0.5], std=[0.5])])

dataset = datasets.MNIST(root='./data',
                         transform=transform,
                         download=True,
                         train=False)
dataset_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=96,
    shuffle=True,
)

for i, (image, _) in enumerate(dataset_loader):
    encoded = enc(image)
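    # The source truncates this example here. A minimal, assumed continuation would
    # reconstruct the batch with the decoder and save an image grid for inspection:
    with torch.no_grad():
        decoded = dec(encoded)
    save_image(decoded, f"reconstruction_{i}.png", normalize=True)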
Esempio n. 13
0
def main(args):
    # constant definition
    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    a_dim = 512
    h_dim = 512
    attn_dim = 512
    embed_dim = 512
    regularize_constant = 1.  # lambda * L => lambda = 1/L

    vocabulary = torch.load(args.voca_path)
    vocab_size = len(vocabulary)

    device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    encoder = Encoder().to(device)
    decoder = Decoder(a_dim, h_dim, attn_dim, vocab_size, embed_dim).to(device)

    # We do not train the encoder
    encoder.eval()

    if not args.test:
        # train
        validation_term = 1
        best_bleu = 0.
        num_of_epochs_since_improvement = 0
        early_stop_criterion = 20

        train_loader = get_train_data_loader(args.path, args.token_path,
                                             args.voca_path, args.batch_size,
                                             pad_idx)
        valid_loader = get_test_data_loader(args.path,
                                            args.token_path,
                                            args.voca_path,
                                            args.batch_size,
                                            pad_idx,
                                            dataset_type='valid')

        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
        optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)

        print('Start training ...')
        for epoch in range(args.epochs):

            # early stopping
            if num_of_epochs_since_improvement > early_stop_criterion:
                print("No improvement in BLEU score for %d epochs" %
                      (num_of_epochs_since_improvement))
                print("Stop Training")
                break

            start_epoch = time.time()
            i = 0

            ############################################################################################################################################
            # training
            decoder.train()
            for src_batch, trg_batch in train_loader:
                batch_start = time.time()

                src_batch = src_batch.to(device)
                trg_batch = torch.tensor(trg_batch).to(device)

                trg_input = trg_batch[:, :-1]
                trg_output = trg_batch[:, 1:].contiguous().view(-1)

                a = encoder(src_batch)
                preds, alphas = decoder(
                    a, trg_input)  # [batch, C, vocab_size], [batch, C, L]

                optimizer.zero_grad()

                loss = criterion(preds.view(-1, preds.size(-1)),
                                 trg_output)  # NLL loss
                regularize_term = regularize_constant * (
                    (1. - torch.sum(alphas, dim=1))**2).mean()
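                # Doubly stochastic attention regularization (as in "Show, Attend and Tell"):
                # penalizes attention weights whose sums over decoding steps deviate from 1,
                # encouraging the decoder to attend to every spatial location of the image.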

                total_loss = loss + regularize_term
                total_loss.backward()

                optimizer.step()

                i = i + 1

                # flush the GPU cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                batch_time = time.time() - batch_start
                print(
                    '[%d/%d][%d/%d] train loss : %.4f (%.4f / %.4f) | time : %.2fs'
                    % (epoch + 1, args.epochs, i, train_loader.size //
                       args.batch_size + 1, total_loss.item(), loss.item(),
                       regularize_term.item(), batch_time))

            epoch_time = time.time() - start_epoch
            print('Time taken for %d epoch : %.2fs' % (epoch + 1, epoch_time))

            ############################################################################################################################################
            # validation
            if i % validation_term == 0:
                decoder.eval()
                j = 0
                pred, ref = [], []

                for src_batch, trg_batch in valid_loader:
                    start = time.time()
                    batch_size = src_batch.size(0)

                    src_batch = src_batch.to(device)  # [batch, 3, 244, 244]
                    trg_batch = torch.tensor(trg_batch).to(
                        device)  # [batch * 5, C]
                    trg_batch = torch.split(trg_batch, 5)

                    batches = []
                    for k in range(batch_size):
                        batches.append(trg_batch[k].unsqueeze(0))

                    trg_batch = torch.cat(batches, dim=0)  # [batch, 5, C]

                    max_length = trg_batch.size(-1)

                    pred_batch = torch.zeros(batch_size, 1, dtype=int).to(
                        device)  # [batch, 1] = [[0],[0],...,[0]]

                    # eos_mask[i] = 1 means i-th sentence has eos
                    eos_mask = torch.zeros(batch_size, dtype=int)

                    a = encoder(src_batch)

                    for _ in range(max_length):

                        output, _ = decoder(
                            a, pred_batch)  # [batch, _+1, vocab_size]

                        # greedy search
                        output = torch.argmax(F.softmax(output, dim=-1),
                                              dim=-1)  # [batch_size, _+1]
                        predictions = output[:, -1].unsqueeze(1)
                        pred_batch = torch.cat([pred_batch, predictions],
                                               dim=-1)

                        for l in range(batch_size):
                            if predictions[l] == eos_idx:
                                eos_mask[l] = 1

                        # every sentence has eos
                        if eos_mask.sum() == batch_size:
                            break

                    # flush the GPU cache
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    pred += seq2sen(pred_batch.cpu().numpy().tolist(),
                                    vocabulary)
                    for m in range(batch_size):
                        ref += [
                            seq2sen(trg_batch[m].cpu().numpy().tolist(),
                                    vocabulary)
                        ]

                    t = time.time() - start
                    j += 1
                    print("[%d/%d] prediction done | time : %.2fs" %
                          (j, valid_loader.size // args.batch_size + 1, t))

                bleu_1 = corpus_bleu(ref, pred, weights=(1. / 1., )) * 100
                bleu_2 = corpus_bleu(ref, pred, weights=(
                    1. / 2.,
                    1. / 2.,
                )) * 100
                bleu_3 = corpus_bleu(
                    ref, pred, weights=(
                        1. / 3.,
                        1. / 3.,
                        1. / 3.,
                    )) * 100
                bleu_4 = corpus_bleu(
                    ref, pred, weights=(
                        1. / 4.,
                        1. / 4.,
                        1. / 4.,
                        1. / 4.,
                    )) * 100
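                # corpus_bleu with uniform weights over n-gram orders 1..n yields BLEU-n;
                # scores are scaled to percentages for readability.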

                print(f'BLEU-1: {bleu_1:.2f}')
                print(f'BLEU-2: {bleu_2:.2f}')
                print(f'BLEU-3: {bleu_3:.2f}')
                print(f'BLEU-4: {bleu_4:.2f}')

                if bleu_1 > best_bleu:
                    num_of_epochs_since_improvement = 0

                    best_bleu = bleu_1
                    print('Best BLEU-1 has been updated : %.2f' % (best_bleu))
                    save_checkpoint(decoder, 'checkpoints/best')
                else:
                    num_of_epochs_since_improvement += validation_term
                    print(
                        "No improvement in BLEU score for %d epochs"
                        % (num_of_epochs_since_improvement))

            ################################################################################################################################################################
        print('End of the training')
    else:
        if os.path.exists(args.checkpoint):
            decoder_checkpoint = torch.load(args.checkpoint)
            decoder.load_state_dict(decoder_checkpoint['state_dict'])
            print("trained decoder " + args.checkpoint + " is loaded")

        decoder.eval()

        # test
        test_loader = get_test_data_loader(args.path, args.token_path,
                                           args.voca_path, args.batch_size,
                                           pad_idx)

        j = 0
        pred, ref = [], []
        for src_batch, trg_batch in test_loader:
            # predict pred_batch from src_batch with your model.
            # every sentences in pred_batch should start with <sos> token (index: 0) and end with <eos> token (index: 1).
            # every <pad> token (index: 2) should be located after <eos> token (index: 1).
            # example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            start = time.time()
            batch_size = src_batch.size(0)

            src_batch = src_batch.to(device)  # [batch, 3, 244, 244]
            trg_batch = torch.tensor(trg_batch).to(device)  # [batch * 5, C]
            trg_batch = torch.split(trg_batch, 5)

            batches = []
            for k in range(batch_size):
                batches.append(trg_batch[k].unsqueeze(0))

            trg_batch = torch.cat(batches, dim=0)  # [batch, 5, C]

            max_length = trg_batch.size(-1)

            pred_batch = torch.zeros(batch_size, 1, dtype=int).to(
                device)  # [batch, 1] = [[0],[0],...,[0]]

            # eos_mask[i] = 1 means i-th sentence has eos
            eos_mask = torch.zeros(batch_size, dtype=int)

            a = encoder(src_batch)

            for _ in range(max_length):

                output, _ = decoder(a, pred_batch)  # [batch, _+1, vocab_size]

                # greedy search
                output = torch.argmax(F.softmax(output, dim=-1),
                                      dim=-1)  # [batch_size, _+1]
                predictions = output[:, -1].unsqueeze(1)
                pred_batch = torch.cat([pred_batch, predictions], dim=-1)

                for l in range(batch_size):
                    if predictions[l] == eos_idx:
                        eos_mask[l] = 1

                # every sentence has eos
                if eos_mask.sum() == batch_size:
                    break

            # flush the GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            pred += seq2sen(pred_batch.cpu().numpy().tolist(), vocabulary)
            for m in range(batch_size):
                ref += [
                    seq2sen(trg_batch[m].cpu().numpy().tolist(), vocabulary)
                ]

            t = time.time() - start
            j += 1
            print("[%d/%d] prediction done | time : %.2fs" %
                  (j, test_loader.size // args.batch_size + 1, t))

        bleu_1 = corpus_bleu(ref, pred, weights=(1. / 1., )) * 100
        bleu_2 = corpus_bleu(ref, pred, weights=(
            1. / 2.,
            1. / 2.,
        )) * 100
        bleu_3 = corpus_bleu(ref, pred, weights=(
            1. / 3.,
            1. / 3.,
            1. / 3.,
        )) * 100
        bleu_4 = corpus_bleu(
            ref, pred, weights=(
                1. / 4.,
                1. / 4.,
                1. / 4.,
                1. / 4.,
            )) * 100

        print(f'BLEU-1: {bleu_1:.2f}')
        print(f'BLEU-2: {bleu_2:.2f}')
        print(f'BLEU-3: {bleu_3:.2f}')
        print(f'BLEU-4: {bleu_4:.2f}')

        with open('results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        with open('results/ref.txt', 'w') as f:
            for lines in ref:
                for line in lines:
                    f.write('{}\n'.format(line))
                f.write('_' * 50 + '\n')
Esempio n. 14
0
def main():
    # Praise argparser!
    parser = argparse.ArgumentParser(
        description=
        "Inference script for performing joint tasks on ATIS datasets.")
    parser.add_argument("--train_path",
                        type=str,
                        help="path of train dataset.")
    parser.add_argument("--test_path", type=str, help="path of test dataset.")
    parser.add_argument("--model_dir",
                        type=str,
                        default="./models/",
                        help='path for saved trained models.')

    parser.add_argument('--max_length',
                        type=int,
                        default=60,
                        help='max sequence length')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=100,
                        help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=50,
                        help='dimension of lstm hidden states')

    args = parser.parse_args()

    # Load data
    print("Loading data...")
    _, word2index, tag2index, intent2index = preprocessing(
        args.train_path, args.max_length)
    index2tag = {v: k for k, v in tag2index.items()}
    index2intent = {v: k for k, v in intent2index.items()}

    # Load model
    print("Loading model...")
    encoder = Encoder(len(word2index), args.embedding_size, args.hidden_size)
    decoder = Decoder(len(tag2index), len(intent2index),
                      len(tag2index) // 3, args.hidden_size * 2)
    encoder.load_state_dict(
        torch.load(os.path.join(args.model_dir, 'jointnlu-encoder.pkl'),
                   map_location=None if USE_CUDA else "cpu"))
    decoder.load_state_dict(
        torch.load(os.path.join(args.model_dir, 'jointnlu-decoder.pkl'),
                   map_location=None if USE_CUDA else "cpu"))

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # Switch to evaluation mode
    encoder.eval()
    decoder.eval()

    # Preprocess test data
    test = open(args.test_path, "r").readlines()
    test = [t[:-1] for t in test]
    test = [[
        t.split("\t")[0].split(" "),
        t.split("\t")[1].split(" ")[:-1],
        t.split("\t")[1].split(" ")[-1]
    ] for t in test]
    test = [
        [t[0][1:-1], t[1][1:], t[2].split("#")[0]] for t in test
    ]  # Note: compound intent labels (joined with "#") are split and only the first
    # label is kept, which can lower the error rate.

    slot_f1 = []
    intent_err = []

    # Test cases.
    for index in range(len(test)):
        test_raw = test[index][0]
        test_in = prepare_sequence(test_raw, word2index).to("cpu")
        test_mask = Variable(
            torch.BoolTensor(tuple(map(
                lambda s: s == 0,
                test_in.data)))).cuda() if USE_CUDA else Variable(
                    torch.BoolTensor(tuple(map(lambda s: s == 0,
                                               test_in.data)))).view(1, -1)
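        # Note: the CUDA branch above omits the .view(1, -1) applied in the CPU branch;
        # it appears to rely on the unsqueeze(0) below to add the batch dimension.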

        if USE_CUDA:
            start_decode = Variable(
                torch.LongTensor([[word2index['<SOS>']] * 1
                                  ])).cuda().transpose(1, 0)
        else:
            start_decode = Variable(
                torch.LongTensor([[word2index['<SOS>']] * 1])).transpose(1, 0)

        output, hidden_c = encoder(test_in.unsqueeze(0),
                                   test_mask.unsqueeze(0))
        tag_score, intent_score = decoder(start_decode, hidden_c, output,
                                          test_mask)

        v, i = torch.max(tag_score, 1)
        slot_pred = list(map(lambda ii: index2tag[ii], i.data.tolist()))
        slot_gt = test[index][1]
        # Calculate f1_micro with sklearn. Pretty handy.
        slot_f1.append(f1_score(slot_gt, slot_pred, average="micro"))

        v, i = torch.max(intent_score, 1)
        intent_pred = index2intent[i.data.tolist()[0]]
        intent_gt = test[index][2]
        if intent_pred != intent_gt:
            intent_err.append([test[index][0], intent_gt, intent_pred])

        # Print our results.
        print("Input Sentence\t: ", *test[index][0])

        print("Truth\t\t: ", *slot_gt)
        print("Prediction\t: ", *slot_pred)

        print("Truth\t\t: ", intent_gt)
        print("Prediction\t: ", intent_pred)

        print()

    # Print out everything I need to finish my report.

    # print("Got slot err ", len(slot_err[0]))
    # print(*slot_err, sep="\n")
    print("Got intent err ", len(intent_err))
    print("--- BEGIN ERR PRINT ---")
    for case in intent_err:
        print("Input  : ", *case[0])
        print("Truth  : ", case[1])
        print("Predict: ", case[2])
        print()
    print("--- ENDOF ERR PRINT ---")
    print("Total ", len(test))
    print("Slot f1_micro avg %f" % np.average(slot_f1))
    print("Intent acc %f" % (1 - len(intent_err) / len(test)))
Esempio n. 15
0
def main(args):
    device = Config.device
    print("PyTorch running with device {0}".format(device))

    if args.download:
        print("Downloading data")
        download_required_data()

    if args.lemmatize:
        caption_file = 'data/Flickr_Data/Flickr_TextData/Flickr8k.lemma.token.txt'
    else:
        caption_file = 'data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'

    print("Generating word2id")
    word2id = generate_word2id(caption_file)
    id2word = dict([(v, k) for k, v in word2id.items()])

    print("Loading Encoder and Decoder")
    encoder = Encoder(Config.encoded_size, Config.encoder_finetune)
    decoder = Decoder(Config.encoder_dim,
                      Config.decoder_dim,
                      Config.attention_dim,
                      Config.embed_dim,
                      vocab_size=len(word2id),
                      dropout=Config.dropout,
                      embedding_finetune=Config.embedding_finetune)

    if args.model_path:
        print("Loading model from model_path")
        load_model(encoder, decoder, args.model_path)
    else:
        # no model path, so load pretrained embedding
        print("Generating embedding from pretrained embedding file")
        embedding = load_pretrained_embedding(
            'data/glove.6B.{}d.txt'.format(Config.embed_dim), word2id,
            Config.embed_dim)
        decoder.load_embedding(embedding)

    if not args.test:
        # train
        print("Loading DataLoader and Trainer")
        dloader = DataLoader(caption_file, 'data/Flickr_Data/Images')
        trainer = Trainer(encoder, decoder, dloader)

        print("Start Training")
        loss_history = trainer.train(Config.num_epochs)
        plt.plot(np.arange(len(loss_history)), loss_history, label='Loss')
        plt.legend()
        plt.show()

    else:
        # test
        assert args.image_path

        encoder.eval()
        decoder.eval()

        transform = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor()
        ])

        image = transform(Image.open(args.image_path))
        image = image.unsqueeze(0)

        # TODO
        # generate caption from an image
        encoder_output = encoder(image)
        captions, alphas = decoder.generate_caption_greedily(encoder_output)

        caption_in_word = ' '.join(list(map(id2word.get, captions[1:])))
        plt.imshow(image[0].numpy().transpose(1, 2, 0))
        plt.title(caption_in_word)
        plt.axis('off')
        plt.show()

        print(caption_in_word)
Esempio n. 16
0
def main(args):
    src, tgt = load_data(args.path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    src_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    N = 6
    dim = 512

    # MODEL Construction
    encoder = Encoder(N, dim, pad_idx, src_vocab_size, device).to(device)
    decoder = Decoder(N, dim, pad_idx, tgt_vocab_size, device).to(device)

    if args.model_load:
        ckpt = torch.load("drive/My Drive/checkpoint/best.ckpt")
        encoder.load_state_dict(ckpt["encoder"])
        decoder.load_state_dict(ckpt["decoder"])

    params = list(encoder.parameters()) + list(decoder.parameters())

    if not args.test:
        train_loader = get_loader(src['train'],
                                  tgt['train'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size,
                                  shuffle=True)
        valid_loader = get_loader(src['valid'],
                                  tgt['valid'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size)

        warmup = 4000
        steps = 1
        lr = 1. * (dim**-0.5) * min(steps**-0.5, steps * (warmup**-1.5))
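        # Noam learning-rate schedule from "Attention Is All You Need":
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5),
        # i.e. linear warmup for `warmup` steps followed by inverse-sqrt decay.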
        optimizer = torch.optim.Adam(params,
                                     lr=lr,
                                     betas=(0.9, 0.98),
                                     eps=1e-09)

        train_losses = []
        val_losses = []
        latest = 1e08  # best (lowest) validation loss seen so far, used to decide when to checkpoint

        start_epoch = 0

        if (args.model_load):
            start_epoch = ckpt["epoch"]
            optimizer.load_state_dict(ckpt["optim"])
            steps = start_epoch * 30

        for epoch in range(start_epoch, args.epochs):

            for src_batch, tgt_batch in train_loader:
                encoder.train()
                decoder.train()
                optimizer.zero_grad()
                tgt_batch = torch.LongTensor(tgt_batch)

                src_batch = Variable(torch.LongTensor(src_batch)).to(device)
                gt = Variable(tgt_batch[:, 1:]).to(device)
                tgt_batch = Variable(tgt_batch[:, :-1]).to(device)

                enc_output, seq_mask = encoder(src_batch)
                dec_output = decoder(tgt_batch, enc_output, seq_mask)

                gt = gt.view(-1)
                dec_output = dec_output.view(gt.size()[0], -1)

                loss = F.cross_entropy(dec_output, gt, ignore_index=pad_idx)
                loss.backward()
                train_losses.append(loss.item())
                optimizer.step()

                steps += 1
                lr = (dim**-0.5) * min(steps**-0.5, steps * (warmup**-1.5))
                update_lr(optimizer, lr)

                if (steps % 10 == 0):
                    print("loss : %f" % loss.item())

            for src_batch, tgt_batch in valid_loader:
                encoder.eval()
                decoder.eval()

                src_batch = Variable(torch.LongTensor(src_batch)).to(device)
                tgt_batch = torch.LongTensor(tgt_batch)
                gt = Variable(tgt_batch[:, 1:]).to(device)
                tgt_batch = Variable(tgt_batch[:, :-1]).to(device)

                enc_output, seq_mask = encoder(src_batch)
                dec_output = decoder(tgt_batch, enc_output, seq_mask)

                gt = gt.view(-1)
                dec_output = dec_output.view(gt.size()[0], -1)

                loss = F.cross_entropy(dec_output, gt, ignore_index=pad_idx)

                val_losses.append(loss.item())
            print("[EPOCH %d] Loss %f" % (epoch, loss.item()))

            if (val_losses[-1] <= latest):
                checkpoint = {'encoder':encoder.state_dict(), 'decoder':decoder.state_dict(), \
                    'optim':optimizer.state_dict(), 'epoch':epoch}
                torch.save(checkpoint, "drive/My Drive/checkpoint/best.ckpt")
                latest = val_losses[-1]

            if (epoch % 20 == 0):
                plt.figure()
                plt.plot(val_losses)
                plt.xlabel("epoch")
                plt.ylabel("model loss")
                plt.show()

    else:
        # test
        test_loader = get_loader(src['test'],
                                 tgt['test'],
                                 src_vocab,
                                 tgt_vocab,
                                 batch_size=args.batch_size)

        # LOAD CHECKPOINT

        pred = []
        for src_batch, tgt_batch in test_loader:
            encoder.eval()
            decoder.eval()

            b_s = min(args.batch_size, len(src_batch))
            tgt_batch = torch.zeros(b_s, 1).to(device).long()
            src_batch = Variable(torch.LongTensor(src_batch)).to(device)

            enc_output, seq_mask = encoder(src_batch)
            pred_batch = decoder(tgt_batch, enc_output, seq_mask)
            _, pred_batch = torch.max(pred_batch, 2)

            while (not is_finished(pred_batch, max_length, eos_idx)):
                # do something
                next_input = torch.cat((tgt_batch, pred_batch.long()), 1)
                pred_batch = decoder(next_input, enc_output, seq_mask)
                _, pred_batch = torch.max(pred_batch, 2)
            # every sentences in pred_batch should start with <sos> token (index: 0) and end with <eos> token (index: 1).
            # every <pad> token (index: 2) should be located after <eos> token (index: 1).
            # example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            pred_batch = pred_batch.tolist()
            for line in pred_batch:
                line[-1] = 1
            pred += seq2sen(pred_batch, tgt_vocab)
            # print(pred)

        with open('results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        os.system(
            'bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
Esempio n. 17
0
def predict(region):
    np.random.seed(0)
    torch.manual_seed(0)

    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    force_teacher = 0.8

    train_dataset, test_dataset, train_max, train_min = create_dataset(
        input_len, predict_len, region)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len, encoder_rnn_layers,
                  encoder_dropout)
    dec = Decoder(encoder_units * 2, decoder_units, input_len, input_len,
                  decoder_dropout, output_size)
    enc.load_state_dict(torch.load(f"models/{region}_enc.pth"))
    dec.load_state_dict(torch.load(f"models/{region}_dec.pth"))

    test_loader = DataLoader(test_dataset,
                             batch_size=1,
                             shuffle=False,
                             drop_last=False)

    rmse = 0
    p = 4
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    for encoder_input, decoder_input, target in test_loader:
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, 0].item()]
    predicted = np.array(predicted).reshape(1, -1)
    # undo the min-max scaling the same way train() does (x * (max - min) + min)
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min
    rmse, pearson_r = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {pearson_r[0]}")
    predicted = predicted.reshape(-1)
    true_target = true_target.reshape(-1)
    x = list(range(len(predicted)))
    plt.plot(x, predicted)
    plt.plot(x, true_target)
    plt.show()
    return f"{region} RMSE {rmse} r {pearson_r[0]}"
Esempio n. 18
0
def train(region):
    np.random.seed(0)
    torch.manual_seed(0)

    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    epochs = 500
    force_teacher = 0.8

    train_dataset, test_dataset, train_max, train_min = create_dataset(
        input_len, predict_len, region)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len,
                  encoder_rnn_layers, encoder_dropout)
    dec = Decoder(encoder_units*2, decoder_units, input_len,
                  input_len, decoder_dropout, output_size)

    optimizer = AdaBound(list(enc.parameters()) +
                         list(dec.parameters()), 0.01, final_lr=0.1)
    # optimizer = optim.Adam(list(enc.parameters()) + list(dec.parameters()), 0.01)
    criterion = nn.MSELoss()

    mb = master_bar(range(epochs))
    for ep in mb:
        train_loss = 0
        enc.train()
        dec.train()
        for encoder_input, decoder_input, target in progress_bar(train_loader, parent=mb):
            optimizer.zero_grad()
            enc_vec = enc(encoder_input)
            h = enc_vec[:, -1, :]
            _, c = dec.initHidden(batch_size)
            x = decoder_input[:, 0]
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                rand = np.random.random()
                pred += [x]
                if rand < force_teacher:
                    x = decoder_input[:, pi]
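                # Teacher forcing: with probability `force_teacher` the ground-truth value
                # decoder_input[:, pi] is fed as the next input instead of the model's own prediction.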
            pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        test_loss = 0
        enc.eval()
        dec.eval()
        for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
            with torch.no_grad():
                enc_vec = enc(encoder_input)
                h = enc_vec[:, -1, :]
                _, c = dec.initHidden(batch_size)
                x = decoder_input[:, 0]
                pred = []
                for pi in range(predict_len):
                    x, h, c = dec(x, h, c, enc_vec)
                    pred += [x]
                pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            test_loss += loss.item()
        print(
            f"Epoch {ep} Train Loss {train_loss/len(train_loader)} Test Loss {test_loss/len(test_loader)}")

    if not os.path.exists("models"):
        os.mkdir("models")
    torch.save(enc.state_dict(), f"models/{region}_enc.pth")
    torch.save(dec.state_dict(), f"models/{region}_dec.pth")

    test_loader = DataLoader(test_dataset, batch_size=1,
                             shuffle=False, drop_last=False)

    rmse = 0
    p = 0
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, p].item()]
    predicted = np.array(predicted).reshape(1, -1)
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min
    rmse, pearson_r = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {pearson_r[0]}")
    return f"{region} RMSE {rmse} r {pearson_r[0]}"
Esempio n. 19
0
def main(params):
    try:
        output_dir = os.path.join(
            params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M"))
        os.makedirs(output_dir)
    except OSError:
        pass

    if torch.cuda.is_available() and not params['cuda']:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    writer = SummaryWriter(output_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    SOS_token = '<sos>'
    EOS_token = '<eos>'
    PAD_token = '<pad>'

    TEXT = Field(sequential=True,
                 use_vocab=True,
                 tokenize=tokenizer,
                 lower=True,
                 batch_first=True,
                 init_token=SOS_token,
                 eos_token=EOS_token)
    # LABEL = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True, batch_first=True, init_token='#', eos_token='$')
    IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True)

    fields = {
        'ans': ('ans', TEXT),
        'img_ind': ('img_ind', IMG_IND),
        'question': ('question', TEXT)
    }

    train, val = TabularDataset.splits(
        path=params['dataroot'],
        train=params['input_train'],
        validation=params['input_test'],
        format='csv',
        skip_header=False,
        fields=fields,
    )

    print("Train data")
    print(train[0].__dict__.keys())
    print(train[0].ans, train[0].img_ind, train[0].question)

    print("Validation data")
    print(val[0].__dict__.keys())
    print(val[0].ans, val[0].img_ind, val[0].question)

    print("Building Vocabulary ..")
    TEXT.build_vocab(train, vectors='glove.6B.100d')
    vocab = TEXT.vocab

    PAD_token_ind = vocab.stoi[PAD_token]
    SOS_token_ind = vocab.stoi[SOS_token]
    EOS_token_ind = vocab.stoi[EOS_token]

    print("Creating Embedding from vocab vectors ..")
    txt_embed = nn.Embedding.from_pretrained(vocab.vectors)
    print("Text Embeddings are generated of size ", txt_embed.weight.size())

    print("Loading Image embeddings ..")
    with open(params['image_embeddings'], 'rb') as f:
        img_embs = pkl.load(f)['image_features']

    img_embed = nn.Embedding.from_pretrained(torch.FloatTensor(img_embs))

    print("Creating Encoder_attn ..")
    encoder = Encoder_attn(img_embed, txt_embed, params)
    print(encoder)

    print("Creating Decoder ..")
    decoder = Decoder(txt_embed, params)
    print(decoder)

    criterion = torch.nn.PairwiseDistance(keepdim=False)
    criterion.to(device)
    encoder.to(device)
    decoder.to(device)

    ## [Completed] TODO(Jay) : Remove this check and use .to(device)
    # if params['cuda']:
    #     encoder.cuda()
    #     decoder.cuda()
    #     criterion.cuda()

    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=params['lr'],
                                         weight_decay=1e-5,
                                         amsgrad=True)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=params['lr'],
                                         weight_decay=1e-5,
                                         amsgrad=True)

    encoder_LR_scheduler = ReduceLROnPlateau(encoder_optimizer,
                                             'min',
                                             patience=1)
    decoder_LR_scheduler = ReduceLROnPlateau(decoder_optimizer,
                                             'min',
                                             patience=1)

    if params['use_checkpoint']:
        checkpoint = torch.load(params['enc_dec_model'])
        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder_optimizer.load_state_dict(
            checkpoint['encoder_optimizer_state_dict'])
        decoder_optimizer.load_state_dict(
            checkpoint['decoder_optimizer_state_dict'])
        encoder_LR_scheduler.load_state_dict(
            checkpoint['encoder_LR_scheduler'])
        decoder_LR_scheduler.load_state_dict(
            checkpoint['decoder_LR_scheduler'])

    for epoch in range(params['niter']):

        train_iter, val_iter = Iterator.splits(
            (train, val),
            batch_sizes=(params['batch_size'], params['batch_size']),
            sort=False,
            shuffle=True,
            device=device)

        for is_train in (True, False):
            print('Is Training: ', is_train)
            if is_train:
                encoder.train()
                decoder.train()
                data_iter = train_iter
            else:
                encoder.eval()
                decoder.eval()
                data_iter = val_iter

            total_loss = 0
            total_acc = 0

            with torch.set_grad_enabled(is_train):

                for i, row in enumerate(data_iter, 1):

                    if len(row) < params['batch_size']:
                        continue

                    encoder.zero_grad()
                    decoder.zero_grad()

                    ans, img_ind, question = row.ans, row.img_ind, row.question
                    batch_size = params['batch_size']

                    ## target_length-1 since we are not predicting SOS token
                    target_length = ans.shape[1] - 1

                    encoder.hidden = encoder.init_hidden(params)

                    ans = ans.to(device)
                    img_ind = img_ind.to(device)
                    question = question.to(device)
                    encoder.hidden = (encoder.hidden[0].to(device),
                                      encoder.hidden[1].to(device))

                    ans_embed = txt_embed(ans)

                    encoder_output = encoder(img_ind, question)

                    decoder_input = ans_embed[:, 0].reshape(
                        (batch_size, 1, -1))  ## (batch_size, 1) check again
                    ans_embed = ans_embed[:, 1:]  ## removed the SOS token
                    ans = ans[:, 1:]  ## removed the SOS token

                    decoder_hidden = decoder.init_hidden(
                        encoder_output, params)

                    if params['cuda']:
                        decoder_hidden = (decoder_hidden[0].cuda(),
                                          decoder_hidden[1].cuda())

                    outputs = torch.zeros(batch_size, target_length,
                                          params['txt_emb_size'])

                    ## [Completed] TODO(Jay) : remove the sos token from the ans and ans_embed before calc loss and acc
                    for di in range(target_length - 1):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden)

                        ## TODO(Jay) : Detach the input from history
                        decoder_input = decoder_output
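                        ## Feeding the raw output back in keeps the full autoregressive graph;
                        ## detaching it here (per the TODO above) would stop gradients from
                        ## flowing through earlier decoding steps.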

                        outputs[:, di, :] = decoder_output.reshape(
                            batch_size, -1)

                    filtered_labels, filtered_label_embeds, filtered_outputs = filterOutput(
                        outputs.reshape(batch_size * target_length, -1),
                        ans.reshape(batch_size * target_length, -1),
                        ans_embed.reshape(batch_size * target_length, -1),
                        PAD_token_ind)

                    filtered_label_embeds = filtered_label_embeds.to(device)
                    filtered_outputs = filtered_outputs.to(device)

                    batch_loss = maskedLoss(filtered_label_embeds,
                                            filtered_outputs, criterion)

                    batch_acc = word_accuracy(filtered_outputs,
                                              vocab.vectors.to(device),
                                              filtered_labels)

                    total_loss += batch_loss.item()
                    total_acc += batch_acc

                    if is_train:
                        if i % 1000 == 0:
                            print(
                                '[%d/%d][%d/%d] train_loss: %.4f, Accuracy: %.4f'
                                % (epoch, params['niter'], i, len(data_iter),
                                   total_loss / i, total_acc / i))

                        batch_loss.backward()
                        encoder_optimizer.step()
                        decoder_optimizer.step()

                avg_loss = total_loss / len(data_iter)
                avg_acc = total_acc / len(data_iter)

                if is_train:
                    PATH = os.path.join(output_dir, 'enc_dec_model.pth')
                    torch.save(
                        {
                            'encoder_state_dict':
                            encoder.state_dict(),
                            'decoder_state_dict':
                            decoder.state_dict(),
                            'encoder_optimizer_state_dict':
                            encoder_optimizer.state_dict(),
                            'decoder_optimizer_state_dict':
                            decoder_optimizer.state_dict(),
                            'encoder_LR_scheduler':
                            encoder_LR_scheduler.state_dict(),
                            'decoder_LR_scheduler':
                            decoder_LR_scheduler.state_dict(),
                        }, PATH)

                    writer.add_scalars('data', {
                        'train_loss': avg_loss,
                        'train_acc': avg_acc
                    }, epoch)
                else:
                    print('Calculating Validation loss')
                    print('val_loss: %.4f, Accuracy: %.4f' %
                          (avg_loss, avg_acc))

                    encoder_LR_scheduler.step(avg_loss)
                    decoder_LR_scheduler.step(avg_loss)

                    writer.add_scalars('data', {
                        'val_loss': avg_loss,
                        'val_acc': avg_acc
                    }, epoch)

    writer.close()
Esempio n. 20
0
def main():
    options = parse_args()
    is_cuda = use_cuda and not options.no_cuda
    hardware = "cuda" if is_cuda else "cpu"
    device = torch.device(hardware)

    for dataset_name in options.dataset:
        results = {"best": {}, "mean": {}, "highest_prob": {}}
        for checkpoint_path in options.checkpoint:
            checkpoint_name, _ = os.path.splitext(
                os.path.basename(checkpoint_path))
            checkpoint = (load_checkpoint(checkpoint_path, cuda=is_cuda)
                          if checkpoint_path else default_checkpoint)
            encoder_checkpoint = checkpoint["model"].get("encoder")
            decoder_checkpoint = checkpoint["model"].get("decoder")

            test_set = test_sets[dataset_name]
            dataset = CrohmeDataset(
                test_set["groundtruth"],
                tokensfile,
                root=test_set["root"],
                transform=transformers,
            )
            data_loader = DataLoader(
                dataset,
                batch_size=options.batch_size,
                shuffle=False,
                num_workers=options.num_workers,
                collate_fn=collate_batch,
            )

            enc = Encoder(img_channels=3,
                          checkpoint=encoder_checkpoint).to(device)
            dec = Decoder(
                len(dataset.id_to_token),
                low_res_shape,
                high_res_shape,
                checkpoint=decoder_checkpoint,
                device=device,
            ).to(device)
            enc.eval()
            dec.eval()

            result = evaluate(
                enc,
                dec,
                data_loader=data_loader,
                device=device,
                checkpoint=checkpoint,
                beam_width=options.beam_width,
                prefix=options.prefix,
            )
            results["best"][checkpoint_name] = result["best"]
            results["mean"][checkpoint_name] = result["mean"]
            results["highest_prob"][checkpoint_name] = result["highest_prob"]

        highest_prob_err_table, highest_prob_correct_table = create_markdown_tables(
            results["highest_prob"])
        best_err_table, best_correct_table = create_markdown_tables(
            results["best"])
        mean_err_table, mean_correct_table = create_markdown_tables(
            results["mean"])
        print(("\n# Dataset {name}\n\n"
               "Beam width: {beam_width}\n\n"
               "## Highest Probability\n\n{highest_prob_err_table}\n\n"
               "{highest_prob_correct_table}\n\n"
               "## Best\n\n{best_err_table}\n\n{best_correct_table}\n\n"
               "## Mean\n\n{mean_err_table}\n\n{mean_correct_table}").format(
                   name=dataset_name,
                   beam_width=options.beam_width,
                   highest_prob_err_table=highest_prob_err_table,
                   highest_prob_correct_table=highest_prob_correct_table,
                   best_err_table=best_err_table,
                   best_correct_table=best_correct_table,
                   mean_err_table=mean_err_table,
                   mean_correct_table=mean_correct_table,
               ))
Esempio n. 21
0
def DDF(cfg):

    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)
    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    #---------------------------------------
    if cfg.privacy_preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            # librosa.load returns the audio time series and its sampling rate
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            path = out_dir / out_filename

            # return the raw recording as a log-mel spectrogram, without any filtering
            if cfg.output_type == "Embedding":
                mel = librosa.feature.melspectrogram(
                    preemphasis(wav, cfg.preprocessing.preemph),
                    sr=cfg.preprocessing.sr,
                    n_fft=cfg.preprocessing.n_fft,
                    n_mels=cfg.preprocessing.n_mels,
                    hop_length=cfg.preprocessing.hop_length,
                    win_length=cfg.preprocessing.win_length,
                    fmin=cfg.preprocessing.fmin,
                    power=1)
                logmel = librosa.amplitude_to_db(
                    mel, top_db=cfg.preprocessing.top_db)
                logmel = logmel / cfg.preprocessing.top_db + 1
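                # amplitude_to_db clips to [-top_db, 0]; dividing by top_db and adding 1
                # roughly rescales the log-mel spectrogram to the [0, 1] range.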
                # move back to CPU before converting to NumPy (.numpy() fails on CUDA tensors)
                mel = torch.FloatTensor(logmel).squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".mel.txt"), mel)

            # return the raw recording as a waveform, without any filtering
            if cfg.output_type == "Recording":
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         wav.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

    #---------------------------------------
    if cfg.privacy_preference == "Moderate":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel,
                                             top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    speaker = decoder.speaker(speaker)
                # move back to CPU before converting to NumPy (.numpy() fails on CUDA tensors)
                vq = vq.squeeze().cpu().numpy()
                speaker = speaker.squeeze().cpu().numpy()

                np.savetxt(path.with_suffix(".vq.txt"), vq)
                np.savetxt(path.with_suffix(".speaker.txt"), speaker)

    #---------------------------------------
    if cfg.privacy_preference == "High":
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                                  sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel,
                                             top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1
            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                vq = vq.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
Esempio n. 22
0
                swa = SWA(number_swa_models=number_swa_models)
                scheduler_decoder.curr_iter = iterations
                if finetune_encoder:
                    scheduler_encoder.curr_iter = iterations
                print(scheduler_decoder.get_lr()[0])
                print("SWA decoder curr lr", scheduler_decoder.print_lr()[0])
            else:
                swa = SWA(number_swa_models=0)
                print("# of SWA models 0")
        else:
            swa = SWA(number_swa_models=0)
            print("# of SWA models 0")

    encoder.eval()
    decoder.eval()
    if swa_params:
        encoder_swa.eval()
        decoder_swa.eval()

    criterion = nn.CrossEntropyLoss()

    dataset = COCOMultiLabel(train=True,
                             classification=False,
                             image_path=args.image_path,
                             sort_by_freq=args.sort_by_freq)
    dataset_val = COCOMultiLabel(train=False,
                                 classification=False,
                                 image_path=args.image_path,
                                 sort_by_freq=args.sort_by_freq)
    dataloader = DataLoader(dataset,
Esempio n. 23
0
class FNM(object):
    def __init__(self, args):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_id
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.profile_list_path = args.profile_list
        self.front_list_path = args.front_list
        self.profile_path = args.profile_path
        self.front_path = args.front_path
        self.test_path = args.test_path
        self.test_list = args.test_list

        self.crop_size = args.ori_height
        self.image_size = args.height
        self.res_n = args.res_n
        self.is_finetune = args.is_finetune
        self.result_name = args.result_name
        self.summary_dir = args.summary_dir
        self.iteration = args.iteration
        self.weight_decay = args.weight_decay
        self.decay_flag = args.decay_flag
        self.print_freq = args.print_freq
        self.save_freq = args.save_freq
        self.img_size = args.width
        self.model_name = args.model_name

        # For hyper parameters
        self.lambda_l1 = args.lambda_l1
        self.lambda_fea = args.lambda_fea
        self.lambda_reg = args.lambda_reg
        self.lambda_gan = args.lambda_gan
        self.lambda_gp = args.lambda_gp

        self.channel = args.channel
        self.device = torch.device("cuda:{}".format(args.device_id))
        self.make_dirs()
        self.build_model()
        """Define Loss"""
        self.L1_loss = nn.L1Loss().to(self.device)
        self.L2_loss = nn.MSELoss().to(self.device)

    def make_dirs(self):
        check_folder(self.summary_dir)
        check_folder(os.path.join("results", self.result_name, "model"))
        check_folder(os.path.join("results", self.result_name, "img"))

    def build_model(self):
        self.expert_net = se50_net(
            "./other_models/arcface_se50/model_ir_se50.pth").to(self.device)
        for param in self.expert_net.parameters():
            param.requires_grad = False
        #self.dataset = sample_dataset(self.profile_list_path, self.front_list_path, self.profile_path, self.front_path, self.crop_size, self.image_size)
        self.front_loader = get_loader(self.front_list_path,
                                       self.front_path,
                                       self.crop_size,
                                       self.image_size,
                                       self.batch_size,
                                       mode="train",
                                       num_workers=8)

        self.profile_loader = get_loader(self.profile_list_path,
                                         self.profile_path,
                                         self.crop_size,
                                         self.image_size,
                                         self.batch_size,
                                         mode="train",
                                         num_workers=8)

        self.test_loader = get_loader(self.test_list,
                                      self.test_path,
                                      self.crop_size,
                                      self.image_size,
                                      self.batch_size,
                                      mode="test",
                                      num_workers=8)

        #self.front_loader = iter(self.front_loader)
        #self.profile_loader = iter(self.profile_loader)
        #resnet_blocks
        resnet_block_list = []
        for i in range(self.res_n):
            resnet_block_list.append(ResnetBlock(512, use_bias=False))

        self.body = nn.Sequential(*resnet_block_list).to(self.device)
        #[b, 512, 7, 7]
        self.decoder = Decoder().to(self.device)
        self.dis = Discriminator(self.channel).to(self.device)

        self.G_optim = torch.optim.Adam(itertools.chain(
            self.body.parameters(), self.decoder.parameters()),
                                        lr=self.lr,
                                        betas=(0.5, 0.999),
                                        weight_decay=self.weight_decay)

        self.D_optim = torch.optim.Adam(itertools.chain(self.dis.parameters()),
                                        lr=self.lr,
                                        betas=(0.5, 0.999),
                                        weight_decay=self.weight_decay)

        self.downsample112x112 = nn.Upsample(size=(112, 112), mode='bilinear')

    def update_lr(self, start_iter):
        if self.decay_flag and start_iter > (self.iteration // 2):
            self.G_optim.param_groups[0]['lr'] -= (
                self.lr /
                (self.iteration // 2)) * (start_iter - self.iteration // 2)
            self.D_optim.param_groups[0]['lr'] -= (
                self.lr /
                (self.iteration // 2)) * (start_iter - self.iteration // 2)

    def train(self):
        self.body.train(), self.decoder.train(), self.dis.train()
        start_iter = 1
        if self.is_finetune:
            model_list = glob(
                os.path.join("results", self.result_name, "model", "*.pt"))
            if not len(model_list) == 0:
                model_list.sort()
                start_iter = int(model_list[-1].split('_')[-1].split('.')[0])
                self.load(os.path.join("results", self.result_name, 'model'),
                          start_iter)
                print(" [*] Load SUCCESS")
                self.update_lr(start_iter)
        print("training start...")
        start_time = time.time()
        for step in range(start_iter, self.iteration + 1):
            self.update_lr(start_iter)
            try:
                front_224, front_112 = next(front_iter)
                if front_224.shape[0] != self.batch_size:
                    raise Exception
            except:
                front_iter = iter(self.front_loader)
                front_224, front_112 = next(front_iter)
            try:
                profile_224, profile_112 = next(profile_iter)
                if profile_224.shape[0] != self.batch_size:
                    raise Exception
            except:
                profile_iter = iter(self.profile_loader)
                profile_224, profile_112 = next(profile_iter)

            profile_224, front_224, profile_112, front_112 = profile_224.to(
                self.device), front_224.to(self.device), profile_112.to(
                    self.device), front_112.to(self.device)

            # Update D
            self.D_optim.zero_grad()

            feature_p = self.expert_net.get_feature(profile_112)
            feature_f = self.expert_net.get_feature(front_112)
            gen_p = self.decoder(self.body(feature_p))
            gen_f = self.decoder(self.body(feature_f))
            feature_gen_p = self.expert_net.get_feature(
                self.downsample112x112(gen_p))
            feature_gen_f = self.expert_net.get_feature(
                self.downsample112x112(gen_f))
            d_f = self.dis(front_224)
            d_gen_p = self.dis(gen_p)
            d_gen_f = self.dis(gen_f)

            D_adv_loss = torch.mean(
                tensor_tuple_sum(d_gen_f) * 0.5 +
                tensor_tuple_sum(d_gen_p) * 0.5 - tensor_tuple_sum(d_f)) / 5

            alpha = torch.rand(gen_p.size(0), 1, 1, 1).to(self.device)
            inter = (alpha * front_224.data +
                     (1 - alpha) * gen_p.data).requires_grad_(True)
            out_inter = self.dis(inter)
            gradient_penalty_loss = (
                gradient_penalty(out_inter[0], inter, self.device) +
                gradient_penalty(out_inter[1], inter, self.device) +
                gradient_penalty(out_inter[2], inter, self.device) +
                gradient_penalty(out_inter[3], inter, self.device) +
                gradient_penalty(out_inter[4], inter, self.device)) / 5
            #print("gradient_penalty_loss:{}".format(gradient_penalty_loss))
            d_loss = self.lambda_gan * D_adv_loss + self.lambda_gp * gradient_penalty_loss
            d_loss.backward(retain_graph=True)
            self.D_optim.step()
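            # The repository's gradient_penalty() used above is not shown here; a
            # minimal WGAN-GP style sketch of what such a helper typically computes
            # (an assumption, not the repo's exact implementation):
            def _gradient_penalty_sketch(d_out, x_interp):
                # gradient of the critic output w.r.t. the interpolated images
                grad = torch.autograd.grad(outputs=d_out.sum(), inputs=x_interp,
                                           create_graph=True, retain_graph=True)[0]
                grad = grad.view(grad.size(0), -1)
                # push the per-sample gradient norm towards 1
                return ((grad.norm(2, dim=1) - 1) ** 2).mean()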

            # Update G
            self.G_optim.zero_grad()
            try:
                front_224, front_112 = next(front_iter)
                if front_224.shape[0] != self.batch_size:
                    raise Exception
            except:
                front_iter = iter(self.front_loader)
                front_224, front_112 = next(front_iter)
            try:
                profile_224, profile_112 = next(profile_iter)
                if profile_224.shape[0] != self.batch_size:
                    raise Exception
            except:
                profile_iter = iter(self.profile_loader)
                profile_224, profile_112 = next(profile_iter)

            profile_224, front_224, profile_112, front_112 = profile_224.to(
                self.device), front_224.to(self.device), profile_112.to(
                    self.device), front_112.to(self.device)

            feature_p = self.expert_net.get_feature(profile_112)
            feature_f = self.expert_net.get_feature(front_112)
            gen_p = self.decoder(self.body(feature_p))
            gen_f = self.decoder(self.body(feature_f))
            feature_gen_p = self.expert_net.get_feature(
                self.downsample112x112(gen_p))
            feature_gen_f = self.expert_net.get_feature(
                self.downsample112x112(gen_f))
            d_f = self.dis(front_224)
            d_gen_p = self.dis(gen_p)
            d_gen_f = self.dis(gen_f)

            pixel_loss = torch.mean(self.L1_loss(front_224, gen_f))

            feature_p_norm = l2_norm(feature_p)
            feature_f_norm = l2_norm(feature_f)
            feature_gen_p_norm = l2_norm(feature_gen_p)
            feature_gen_f_norm = l2_norm(feature_gen_f)

            perceptual_loss = torch.mean(
                0.5 *
                (1 - torch.sum(torch.mul(feature_p_norm, feature_gen_p_norm),
                               dim=(1, 2, 3))) + 0.5 *
                (1 - torch.sum(torch.mul(feature_f_norm, feature_gen_f_norm),
                               dim=(1, 2, 3))))

            G_adv_loss = -torch.mean(
                tensor_tuple_sum(d_gen_f) * 0.5 +
                tensor_tuple_sum(d_gen_p) * 0.5) / 5
            g_loss = self.lambda_gan * G_adv_loss + self.lambda_l1 * pixel_loss + self.lambda_fea * perceptual_loss
            g_loss.backward()
            self.G_optim.step()
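            # l2_norm() above comes from the repository's utilities and is not shown;
            # a plausible sketch (assumption): normalise each sample's feature map to
            # unit L2 norm so that the perceptual term reduces to 1 - cosine
            # similarity between expert features of real and generated faces.
            def _l2_norm_sketch(x, eps=1e-8):
                return x / (x.pow(2).sum(dim=(1, 2, 3), keepdim=True).sqrt() + eps)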

            print("[%5d/%5d] time: %4.4f d_loss: %.8f, g_loss: %.8f" %
                  (step, self.iteration, time.time() - start_time, d_loss,
                   g_loss))
            print("D_adv_loss : %.8f" % (self.lambda_gan * D_adv_loss))
            print("G_adv_loss : %.8f" % (self.lambda_gan * G_adv_loss))
            print("pixel_loss : %.8f" % (self.lambda_l1 * pixel_loss))
            print("perceptual_loss : %.8f" %
                  (self.lambda_fea * perceptual_loss))
            print("gp_loss : %.8f" % (self.lambda_gp * gradient_penalty_loss))

            with torch.no_grad():
                if step % self.print_freq == 0:
                    train_sample_num = 5
                    test_sample_num = 5
                    A2B = np.zeros((self.img_size * 4, 0, 3))
                    self.body.eval(), self.decoder.eval(), self.dis.eval()
                    for _ in range(train_sample_num):
                        try:
                            front_224, front_112 = next(front_iter)
                            if front_224.shape[0] != self.batch_size:
                                raise Exception
                        except:
                            front_iter = iter(self.front_loader)
                            front_224, front_112 = next(front_iter)
                        try:
                            profile_224, profile_112 = next(profile_iter)
                            if profile_224.shape[0] != self.batch_size:
                                raise Exception
                        except:
                            profile_iter = iter(self.profile_loader)
                            profile_224, profile_112 = next(profile_iter)

                        profile_224, front_224, profile_112, front_112 = profile_224.to(
                            self.device), front_224.to(
                                self.device), profile_112.to(
                                    self.device), front_112.to(self.device)

                        feature_p = self.expert_net.get_feature(profile_112)
                        feature_f = self.expert_net.get_feature(front_112)
                        gen_p = self.decoder(self.body(feature_p))
                        gen_f = self.decoder(self.body(feature_f))

                        A2B = np.concatenate(
                            (A2B,
                             np.concatenate(
                                 (RGB2BGR(tensor2numpy(denorm(
                                     profile_224[0]))),
                                  RGB2BGR(tensor2numpy(denorm(gen_p[0]))),
                                  RGB2BGR(tensor2numpy(denorm(front_224[0]))),
                                  RGB2BGR(tensor2numpy(denorm(gen_f[0])))),
                                 0)), 1)

                    for _ in range(train_sample_num):
                        show_list = []
                        for i in range(2):
                            try:
                                test_profile_224, test_profile_112 = next(test_iter)
                                if test_profile_224.shape[0] != self.batch_size:
                                    raise Exception
                            except:
                                test_iter = iter(self.test_loader)
                                test_profile_224, test_profile_112 = next(test_iter)
                            test_profile_224, test_profile_112 = test_profile_224.to(
                                self.device), test_profile_112.to(self.device)
                            test_feature_p = self.expert_net.get_feature(
                                test_profile_112)
                            test_gen_p = self.decoder(
                                self.body(test_feature_p))
                            show_list.append(test_profile_224[0])
                            show_list.append(test_gen_p[0])

                        A2B = np.concatenate(
                            (A2B,
                             np.concatenate(
                                 (RGB2BGR(tensor2numpy(denorm(show_list[0]))),
                                  RGB2BGR(tensor2numpy(denorm(show_list[1]))),
                                  RGB2BGR(tensor2numpy(denorm(show_list[2]))),
                                  RGB2BGR(tensor2numpy(denorm(show_list[3])))),
                                 0)), 1)

                    cv2.imwrite(
                        os.path.join("results", self.result_name, 'img',
                                     'A2B_%07d.png' % step), A2B * 255.0)
                    self.body.train(), self.decoder.train(), self.dis.train()

                if step % self.save_freq == 0:
                    self.save(
                        os.path.join("results", self.result_name, "model"),
                        step)

                if step % 1000 == 0:
                    params = {}
                    params['body'] = self.body.state_dict()
                    params['decoder'] = self.decoder.state_dict()
                    params['dis'] = self.dis.state_dict()
                    torch.save(
                        params,
                        os.path.join("results", self.result_name,
                                     self.model_name + "_params_latest.pt"))

    def load(self, dir, step):
        params = torch.load(
            os.path.join(dir, self.model_name + '_params_%07d.pt' % step))
        self.body.load_state_dict(params['body'])
        self.decoder.load_state_dict(params['decoder'])
        self.dis.load_state_dict(params['dis'])

    def save(self, dir, step):
        params = {}
        params['body'] = self.body.state_dict()
        params['decoder'] = self.decoder.state_dict()
        params['dis'] = self.dis.state_dict()
        torch.save(
            params,
            os.path.join(dir, self.model_name + '_params_%07d.pt' % step))

    def demo(self):
        try:
            front_224, front_112 = next(front_iter)
            if front_224.shape[0] != self.batch_size:
                raise Exception
        except:
            front_iter = iter(self.front_loader)
            front_224, front_112 = next(front_iter)
        try:
            profile_224, profile_112 = next(profile_iter)
            if profile_224.shape[0] != self.batch_size:
                raise Exception
        except:
            profile_iter = iter(self.profile_loader)
            profile_224, profile_112 = next(profile_iter)

        profile_224, front_224, profile_112, front_112 = profile_224.to(
            self.device), front_224.to(self.device), profile_112.to(
                self.device), front_112.to(self.device)
        D_face, D_eye, D_nose, D_mouth, D_map = self.dis(profile_224)
        '''
        print("D_face.shape:", D_face.shape)
        print("D_eye.shape:", D_eye.shape)
        print("D_nose.shape:", D_nose.shape)
        print("D_mouth.shape:", D_mouth.shape)
        '''
        cv2.imwrite("profile.jpg",
                    cv2.cvtColor(tensor2im(profile_112), cv2.COLOR_BGR2RGB))
        cv2.imwrite("front.jpg",
                    cv2.cvtColor(tensor2im(front_112), cv2.COLOR_BGR2RGB))
        feature = self.expert_net.get_feature(profile_224)
        print(feature.shape)
        '''
Esempio n. 24
0
        print(translate(captions))


with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

vocab_size = len(vocab)
print('vocab_size:', vocab_size)

dataloader = get_loader(image_dir,
                        caption_path,
                        vocab,
                        batch_size,
                        crop_size,
                        shuffle=True,
                        num_workers=num_workers)

encoder = Encoder().to(device)
encoder.fine_tune(fine_tune_encoder)
decoder = Decoder(attention_dim, embedding_size, lstm_size,
                  vocab_size).to(device)

print('Start loading models.')
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))
encoder.eval()
decoder.eval()

sample('data/surf.jpg', vocab, dataloader, encoder, decoder)
sample('data/giraffe.png', vocab, dataloader, encoder, decoder)
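# sample() is defined elsewhere in that repository; a minimal, self-contained sketch
# of the same idea (hypothetical helper -- the transform, vocab.idx2word and the
# decoder.generate() interface are assumptions, not the repo's exact API):
import torch
from PIL import Image
from torchvision import transforms

def greedy_caption_sketch(image_path, vocab, encoder, decoder, device, image_size=224):
    tf = transforms.Compose([transforms.Resize((image_size, image_size)),
                             transforms.ToTensor()])
    image = tf(Image.open(image_path).convert('RGB')).unsqueeze(0).to(device)
    with torch.no_grad():
        features = encoder(image)
        word_ids = decoder.generate(features)  # assumed greedy decoding entry point
    return ' '.join(vocab.idx2word[i] for i in word_ids)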
Esempio n. 25
0
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path(
        "datasets")) / cfg.dataset.path  # zerospeech/datasets/2019/english
    with open(dataset_path / "speakers.json") as file:  # file listing the speaker names
        speakers = sorted(json.load(file))  # stored as the `speakers` list

    synthesis_list_path = Path(utils.to_absolute_path(
        cfg.synthesis_list))  # "???" in the config, so it has to be set when running the script
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(
            file)  # see synthesis.json under datasets/2019/english

    in_dir = Path(utils.to_absolute_path(
        cfg.in_dir))  # "???" in the config; pointing it at the zerospeech folder (./) should work
    out_dir = Path(utils.to_absolute_path(
        cfg.out_dir))  # "???" in the config; directory for the converted voice outputs
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU if no GPU is available

    encoder = Encoder(
        **cfg.model.encoder)  # encoder settings from ZeroSpeech/config/model/default
    decoder = Decoder(
        **cfg.model.decoder)  # decoder settings from ZeroSpeech/config/model/default
    encoder.to(device)  # cpu or gpu
    decoder.to(device)  # cpu or gpu

    print("Load checkpoint from: {}:".format(cfg.checkpoint)
          )  ### ???로 되어있는데 pretrained, 혹은 checkpoint까지 학습된 모델 있으면 그 모델의 위치로 지정
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage
                            )  # checkpoint에 지정된 weight들을 불러옵니다
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(
        cfg.preprocessing.sr
    )  # loudness meter at sr=16000; see https://www.christiansteinmetz.com/projects-blog/pyloudnorm

    for wav_path, speaker_id, out_filename in tqdm(
            synthesis_list
    ):  # e.g. "english/test/S002_0379088085", "V002", "V002_0379088085"
        wav_path = in_dir / wav_path  # ./english/test/S002_0379088085
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)  # measure the loudness of the input
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(
            device)  # unsqueeze() inserts a new dimension at the given position

        #https://subinium.github.io/pytorch-Tensor-Variable/#%EB%8D%94%EB%AF%B8-%EC%B0%A8%EC%9B%90-%EC%B6%94%EA%B0%80%EC%99%80-%EC%82%AD%EC%A0%9C--squeeze--unsqueeze

        #https://datascienceschool.net/view-notebook/4f3606fd839f4320a4120a56eec1e228/

        speaker = torch.LongTensor([speakers.index(speaker_id)
                                    ]).to(device)  # likewise wrapped in a tensor

        # Tensors have dtypes: 32-bit floats use torch.FloatTensor and 64-bit signed
        # integers use torch.LongTensor; there are also GPU variants such as torch.cuda.FloatTensor.

        # In short, mel holds real values while speaker holds an integer index.
        with torch.no_grad(
        ):  # disable autograd tracking for these ops; see https://bob3rdnewbie.tistory.com/315
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        output_loudness = meter.integrated_loudness(output)  # measure the loudness of the output
        output = pyloudnorm.normalize.loudness(output, output_loudness,
                                               ref_loudness)
        # rescale the output to the same loudness as the input wav
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
Esempio n. 26
0
def main(args):

    # ==============================
    # Create some folders or files for saving
    # ==============================

    if not os.path.exists(args.root_folder):
        os.mkdir(args.root_folder)

    loss_path = args.loss_path
    mertics_path = args.mertics_path
    epoch_model_path = args.epoch_model_path
    best_model_path = args.best_model_path
    generated_captions_path = args.generated_captions_folder_path
    sentences_show_path = args.sentences_show_path

    # Transform the format of images
    # This function in utils.general_tools.py
    train_transform = get_train_transform()
    val_transform = get_val_trainsform()

    # Load vocabulary
    print("*** Load Vocabulary ***")
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Create data sets
    # This function in data_load.py
    train_data = train_load(root=args.train_image_dir,
                            json=args.train_caption_path,
                            vocab=vocab,
                            transform=train_transform,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers)

    val_data = val_load(root=args.val_image_dir,
                        json=args.val_caption_path,
                        transform=val_transform,
                        batch_size=1,
                        shuffle=False,
                        num_workers=args.num_workers)

    # Build model
    encoder = Encoder(args.hidden_dim, args.fine_tuning).to(device)
    decoder = Decoder(args.embedding_dim, args.hidden_dim, vocab, len(vocab),
                      args.max_seq_length).to(device)

    # Select loss function
    criterion = nn.CrossEntropyLoss().to(device)

    if args.fine_tuning == True:
        params = list(decoder.parameters()) + list(encoder.parameters())
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)
    else:
        params = decoder.parameters()
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)

    # Load pretrained model
    if args.resume == True:
        checkpoint = torch.load(best_model_path)
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        if args.fine_tuning == False:
            optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']
        best_epoch = checkpoint['best_epoch']

    # New epoch and score
    else:
        start_epoch = 1
        best_score = 0
        best_epoch = 0

    for epoch in range(start_epoch, 10000):

        print("-" * 20)
        print("epoch:{}".format(epoch))

        # Adjust the learning rate when the gap between the current epoch and the best epoch is a positive multiple of 4
        if (epoch - best_epoch) > 0 and (epoch - best_epoch) % 4 == 0:
            # This function in utils.general_tools.py
            adjust_lr(optimizer, args.shrink_factor)
        if (epoch - best_epoch) > 10:
            print("*** Training complete ***")
            break

        # =============
        # Training
        # =============

        print(" *** Training ***")
        decoder.train()
        encoder.train()
        total_step = len(train_data)
        epoch_loss = 0
        for (images, captions, lengths, img_ids) in tqdm(train_data):
            images = images.to(device)
            captions = captions.to(device)
            # Lengths are reduced by 1 and captions are sliced from index 1
            # because the beginning symbol <start> is not predicted.
            lengths = list(np.array(lengths) - 1)

            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]
            features = encoder(images)
            predictions = decoder(features, captions, lengths)
            predictions = pack_padded_sequence(predictions,
                                               lengths,
                                               batch_first=True)[0]

            loss = criterion(predictions, targets)
            epoch_loss += loss.item()
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

        # Save loss information
        # This function in utils.save_tools.py
        save_loss(round(epoch_loss / total_step, 3), epoch, loss_path)

        # =============
        # Evaluating
        # =============

        print("*** Evaluating ***")
        encoder.eval()
        decoder.eval()
        generated_captions = []
        for image, img_id in tqdm(val_data):

            image = image.to(device)
            img_id = img_id[0]

            features = encoder(image)
            sentence = decoder.generate(features)
            sentence = ' '.join(sentence)
            item = {'image_id': int(img_id), 'caption': sentence}
            generated_captions.append(item)
            j = random.randint(1, 100)

        print('*** Computing metrics ***')

        # Save current generated captions
        # This function in utils.save_tools.py

        captions_json_path = save_generated_captions(generated_captions, epoch,
                                                     generated_captions_path,
                                                     args.fine_tuning)

        # Compute score of metrics
        # This function in utils.general_tools.py
        results = coco_metrics(args.val_caption_path, captions_json_path,
                               epoch, sentences_show_path)

        # Save metrics results
        # This function in utils.save_tools.py
        epoch_score = save_metrics(results, epoch, mertics_path)

        # Update the best score
        if best_score < epoch_score:

            best_score = epoch_score
            best_epoch = epoch

            save_best_model(encoder, decoder, optimizer, epoch, best_score,
                            best_epoch, best_model_path)

        print("*** Best score:{} Best epoch:{} ***".format(
            best_score, best_epoch))
        # Save every epoch model
        save_epoch_model(encoder, decoder, optimizer, epoch, best_score,
                         best_epoch, epoch_model_path, args.fine_tuning)
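# A standalone sketch of the pack_padded_sequence alignment used in the training loop
# above: with batch_first=True, packing keeps only the valid time steps, so packing the
# predictions and the targets with the same lengths lines them up token for token.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

preds = torch.randn(2, 3, 5)                      # (batch, max_len, vocab_size)
lengths = [3, 1]                                  # valid tokens per caption (descending)
packed = pack_padded_sequence(preds, lengths, batch_first=True)[0]
print(packed.shape)                               # torch.Size([4, 5]) -> 3 + 1 valid steps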
Esempio n. 27
0
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)

    # Load the vocabularies.
    src_vocab = Vocab.load(config['data']['src']['vocab'])
    tgt_vocab = Vocab.load(config['data']['tgt']['vocab'])

    # Load the training and dev datasets.
    test_data = ShakespeareDataset('test', config, src_vocab, tgt_vocab)

    # Restore the model.
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    encoder = Encoder(src_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'], config['model']['layer'],
                      config['model']['mode'])
    decoder = Decoder(tgt_vocab_size, config['model']['embedding_dim'],
                      config['model']['bidirection'],
                      config['model']['dropout'], config['model']['layer'],
                      config['model']['mode'])

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    ckpt_path = os.path.join(config['data']['ckpt'], config['experiment_name'],
                             'model.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        encoder.load_state_dict(ckpt['encoder'])
        decoder.load_state_dict(ckpt['decoder'])
    else:
        print('Unable to find checkpoint. Terminating.')
        sys.exit(1)
    encoder.eval()
    decoder.eval()

    # Initialize translator.
    greedy_translator = GreedyTranslator(encoder, decoder, tgt_vocab)

    # Qualitative evaluation - print translations for first couple sentences in
    # test corpus.
    for i in range(10):
        src, tgt = test_data[i]
        translation = greedy_translator(src)
        src_sentence = [src_vocab.id2word(id) for id in src.data.cpu().numpy()]
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        print('---')
        print('Source: %s' % ' '.join(src_sentence))
        print('Ground truth: %s' % ' '.join(tgt_sentence))
        print('Model output: %s' % ' '.join(translated_sentence))
    print('---')

    # Quantitative evaluation - compute corpus level BLEU scores.
    hypotheses = []
    references = []
    for src, tgt in test_data:
        translation = greedy_translator(src)
        tgt_sentence = [tgt_vocab.id2word(id) for id in tgt.data.cpu().numpy()]
        translated_sentence = [tgt_vocab.id2word(id) for id in translation]
        # Remove start and end of sentence tokens.
        tgt_sentence = tgt_sentence[1:-1]
        translated_sentence = translated_sentence[1:-1]
        hypotheses.append(translated_sentence)
        references.append([tgt_sentence])
    print("Corpus BLEU score: %0.4f" % corpus_bleu(references, hypotheses))
Esempio n. 28
0
def eval_reward(args, shared_model, writer_dir=None):
    """
	For evaluation

	Arguments:
	- writer: the tensorboard summary writer directory (note: can't get it working directly with the SummaryWriter object)
	"""
    writer = SummaryWriter(log_dir=os.path.join(
        writer_dir, 'eval')) if writer_dir is not None else None

    # current episode stats
    episode_reward = episode_value_mse = episode_td_error = episode_pg_loss = episode_length = 0

    # global stats
    i_episode = 0
    total_episode = total_steps = 0
    num_goals_achieved = 0

    # intilialize the env and models
    torch.manual_seed(args.seed)
    env = create_env(args.env_name, framework=args.framework, args=args)
    set_seed(args.seed, env, args.framework)

    shared_enc, shared_dec, shared_d_module, shared_r_module = shared_model

    enc = Encoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0],
                  args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    r_module = R_Module(env.action_space.shape[0],
                        args.dim,
                        discrete=args.discrete,
                        baseline=False,
                        state_space=env.observation_space.shape[0])

    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters(), r_module.parameters())

    if args.from_checkpoint is not None:
        model_state, _ = torch.load(args.from_checkpoint)
        model.load_state_dict(model_state)
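        # Note: `model` is not defined in this function; if the checkpoint stores one
        # state dict per module, applying it might look like the following (an
        # assumption about the checkpoint layout, not the repository's format):
        # enc.load_state_dict(model_state["enc"])
        # dec.load_state_dict(model_state["dec"])
        # d_module.load_state_dict(model_state["d_module"])
        # r_module.load_state_dict(model_state["r_module"])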

    # set the model to evaluation mode
    enc.eval()
    dec.eval()
    d_module.eval()
    r_module.eval()

    # reset the state
    state = env.reset()
    state = Variable(torch.from_numpy(state).float())

    start = time.time()

    while total_episode < args.num_episodes:

        # Sync with the shared model
        r_module.load_state_dict(shared_r_module.state_dict())
        d_module.load_state_dict(shared_d_module.state_dict())
        enc.load_state_dict(shared_enc.state_dict())
        dec.load_state_dict(shared_dec.state_dict())

        # reset stuff
        cd_p = Variable(torch.zeros(1, args.lstm_dim))
        hd_p = Variable(torch.zeros(1, args.lstm_dim))

        # for the reward
        cr_p = Variable(torch.zeros(1, args.lstm_dim))
        hr_p = Variable(torch.zeros(1, args.lstm_dim))

        i_episode += 1
        episode_length = 0
        episode_reward = 0
        args.local = True
        args.d = 0
        succ, _, episode_reward, episode_length = test(1, args, args, args,
                                                       d_module, r_module, enc)
        log("Eval: succ {:.2f}, reward {:.2f}, length {:.2f}".format(
            succ, episode_reward, episode_length))
        # Episode has ended, write the summaries here
        if writer_dir is not None:
            # current episode stats
            writer.add_scalar('eval/episode_reward', episode_reward, i_episode)
            writer.add_scalar('eval/episode_length', episode_length, i_episode)
            writer.add_scalar('eval/success', succ, i_episode)

        time.sleep(args.eval_every)
        print("sleep")
Esempio n. 29
0
def convert():
    '''
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))
    '''
    dataset_path = Path('./cfg').absolute()
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))
    with open(Path("./cfg/cfg.json").absolute()) as file:
        para = json.load(file)

    synthesis_list_path = Path('./dataset/english/synthesis.txt').absolute()
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)
    in_dir = Path('./dataset/english').absolute()
    out_dir = Path('./output').absolute()
    out_dir.mkdir(exist_ok=True, parents=True)
    print(synthesis_list)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=para['encoder']['in_channels'],
                      channels=para['encoder']['channels'],
                      n_embeddings=para['encoder']['n_embeddings'],
                      embedding_dim=para['encoder']['embedding_dim'],
                      jitter=para['encoder']['jitter'])
    decoder = Decoder(
        in_channels=para['decoder']['in_channels'],
        conditioning_channels=para['decoder']['conditioning_channels'],
        n_speakers=para['decoder']['n_speakers'],
        speaker_embedding_dim=para['decoder']['speaker_embedding_dim'],
        mu_embedding_dim=para['decoder']['mu_embedding_dim'],
        rnn_channels=para['decoder']['rnn_channels'],
        fc_channels=para['decoder']['fc_channels'],
        bits=para['decoder']['bits'],
        hop_length=para['decoder']['hop_length'])
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format('./checkpoint/model.pt'))
    checkpoint_path = Path('./checkpoint/model.pt').absolute()
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    #meter = pyloudnorm.Meter(160000)
    print('load finish')
    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=para['preprocess']['sr'])
        #ref_loudness = meter.integrated_loudness(wav)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, para['preprocess']['preemph']),
            sr=para['preprocess']['sr'],
            n_fft=para['preprocess']['n_fft'],
            n_mels=para['preprocess']['n_mels'],
            hop_length=para['preprocess']['hop_length'],
            win_length=para['preprocess']['win_length'],
            fmin=para['preprocess']['fmin'],
            power=1)
        logmel = librosa.amplitude_to_db(mel,
                                         top_db=para['preprocess']['top_db'])
        logmel = logmel / para['preprocess']['top_db'] + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        #output_loudness = meter.integrated_loudness(output)
        #output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=para['preprocess']['sr'])
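# librosa.output.write_wav (used above) was removed in librosa 0.8; on newer installs
# the same write can be done with the soundfile package, e.g.:
import numpy as np
import soundfile as sf

def write_wav_sketch(path, audio, sr):
    sf.write(str(path), audio.astype(np.float32), sr)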