Example #1
0
    def __init__(self, config, vocab_size):
        """Build the DeepAPI seq2seq model: embedders, encoder, decoder, optimizer.

        :param config: dict of hyperparameters — reads 'maxlen', 'clip', 'temp',
            'emb_size', 'n_hidden', 'n_layers', 'noise_radius', 'use_attention',
            'dropout' and 'lr_ae'.
        :param vocab_size: single vocabulary size shared by both embedders.
        """
        super(DeepAPI, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']  # maximum sequence length
        self.clip = config['clip']      # gradient-clipping threshold
        self.temp = config['temp']      # sampling temperature

        # Separate embedding tables for descriptions and API tokens; both
        # share vocab_size and ignore PAD_ID positions.
        self.desc_embedder = nn.Embedding(vocab_size,
                                          config['emb_size'],
                                          padding_idx=PAD_ID)
        self.api_embedder = nn.Embedding(vocab_size,
                                         config['emb_size'],
                                         padding_idx=PAD_ID)
        # utter encoder: encode response to vector
        self.encoder = Encoder(self.desc_embedder, config['emb_size'],
                               config['n_hidden'], True, config['n_layers'],
                               config['noise_radius'])
        # Decoder hidden size is 2*n_hidden (matches the bidirectional flag
        # passed to Encoder above — TODO confirm against Encoder's definition).
        self.decoder = Decoder(self.api_embedder, config['emb_size'],
                               config['n_hidden'] * 2, vocab_size,
                               config['use_attention'], 1,
                               config['dropout'])  # utter decoder: P(x|c,z)
        # One Adadelta optimizer over encoder + decoder parameters jointly.
        self.optimizer = optim.Adadelta(list(self.encoder.parameters()) +
                                        list(self.decoder.parameters()),
                                        lr=config['lr_ae'],
                                        rho=0.95)
        self.criterion_ce = nn.CrossEntropyLoss()
    def __init__(self, config):
        """Assemble a Transformer encoder-decoder from config.

        :param config: dict — reads 'head', 'emb_dim', 'd_ff', 'drop_out',
            'N_layers' and 'vocab_size'.
        """
        super(Transformer_EncoderDecoder, self).__init__()
        # Shorthand: each layer gets its own deep copy of the shared
        # attention / feed-forward / positional-encoding prototypes.
        c = copy.deepcopy
        self.attn = MultiHeadedAttention(config['head'], config['emb_dim'])
        self.ff = PositionwiseFeedForward(config['emb_dim'], config['d_ff'],
                                          config['drop_out'])
        self.position = PositionalEncoding(config['emb_dim'],
                                           config['drop_out'])
        self.encoder = Encoder(
            EncoderLayer(config['emb_dim'], c(self.attn), c(self.ff),
                         config['drop_out']), config['N_layers'])
        # Decoder layer takes two attention copies (self-attn + src-attn).
        self.decoder = Decoder(
            DecoderLayer(config['emb_dim'], c(self.attn), c(self.attn),
                         c(self.ff), config['drop_out']), config['N_layers'])
        # Source and target embeddings each followed by positional encoding.
        self.src_embed = nn.Sequential(
            Embeddings(config['emb_dim'], config['vocab_size']),
            c(self.position))
        self.tgt_embed = nn.Sequential(
            Embeddings(config['emb_dim'], config['vocab_size']),
            c(self.position))
        self.generator = Generator(config['emb_dim'], config['vocab_size'])
        # Extra projection to vocab logits alongside the generator.
        self.fc_out = nn.Linear(config['emb_dim'], config['vocab_size'])

        self.model = EncoderDecoder(self.encoder, self.decoder, self.src_embed,
                                    self.tgt_embed, self.generator)
Example #3
0
def da_rnn(train_data: TrainData, n_targs: int, encoder_hidden_size=64, decoder_hidden_size=64,
           T=10, learning_rate=0.01, batch_size=128):
    """Build a DA-RNN: training config, encoder/decoder (kwargs persisted to
    JSON under data/), and one Adam optimizer per module.

    Returns a (TrainConfig, DaRnnNet) pair.
    """
    # 70/30 train split on the feature rows.
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7), batch_size, nn.MSELoss())
    logger.info(f"Training size: {train_cfg.train_size:d}.")

    def _dump(kwargs, fname):
        # Persist constructor kwargs so the model can be rebuilt later.
        with open(os.path.join("data", fname), "w") as fp:
            json.dump(kwargs, fp, indent=4)

    enc_kwargs = {"input_size": train_data.feats.shape[1], "hidden_size": encoder_hidden_size, "T": T}
    _dump(enc_kwargs, "enc_kwargs.json")
    encoder = Encoder(**enc_kwargs).to(device)

    dec_kwargs = {"encoder_hidden_size": encoder_hidden_size,
                  "decoder_hidden_size": decoder_hidden_size, "T": T, "out_feats": n_targs}
    _dump(dec_kwargs, "dec_kwargs.json")
    decoder = Decoder(**dec_kwargs).to(device)

    def _optimizer(module):
        # Optimize only parameters that require gradients.
        trainable = [p for p in module.parameters() if p.requires_grad]
        return optim.Adam(params=trainable, lr=learning_rate)

    net = DaRnnNet(encoder, decoder, _optimizer(encoder), _optimizer(decoder))
    return train_cfg, net
Example #4
0
class RNN(object):
    """Wrapper pairing an Encoder/Decoder with optimizers for seq2seq
    training and greedy evaluation.
    """

    def __init__(self, input_size, output_size):
        """Create encoder/decoder, loss, optimizers and SOS/EOS marker tensors."""
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)

        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())

        # 1x1 long tensors holding the start (0) and end (1) token ids.
        sos, eos = torch.LongTensor(1, 1).zero_(), torch.LongTensor(1,
                                                                    1).zero_()
        sos[0, 0], eos[0, 0] = 0, 1

        self.sos, self.eos = sos, eos

    def train(self, input, target):
        """One training step over a single (input, target) sequence pair.

        :param input: iterable of input vectors fed to the encoder.
        :param target: list of target tensors; mutated in place (SOS
            prepended, EOS appended).
        :returns: accumulated loss tensor.
        """
        # NOTE: parameter name `input` shadows the builtin; kept for API
        # compatibility.
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec),
                                                   hidden_state)

        # Decoder
        # Teacher forcing: feed target[i], score against target[i + 1].
        target.insert(0, self.sos)
        target.append(self.eos)
        total_loss = 0
        for i in range(len(target) - 1):
            _, softmax, hidden_state = self.decoder.forward(
                target[i], hidden_state)
            total_loss += self.loss(softmax, Variable(target[i + 1][0]))

        total_loss.backward()

        self.decoder_optimizer.step()
        self.encoder_optimizer.step()

        return total_loss

    def eval(self, input):
        """Greedy decoding: run the encoder, then decode until EOS.

        NOTE(review): `output is not self.eos` is an identity test; unless
        the decoder returns the very same tensor object as self.eos this
        never becomes False — looks like a potential infinite loop. An
        equality check on the token value is probably intended; confirm
        against Decoder.forward's return contract.
        """
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(ivec, hidden_state)

        outputs = []
        output = self.sos
        # Decoder
        while output is not self.eos:
            output, _, hidden_state = self.decoder.forward(
                output, hidden_state)
            # NOTE(review): `+=` extends the list with the tensor's rows;
            # if a list of whole outputs is intended, append() would be used.
            outputs += output

        return outputs
Example #5
0
def da_rnn(train_data,
           n_targs: int,
           encoder_hidden_size=64,
           decoder_hidden_size=64,
           T=10,
           learning_rate=0.01,
           batch_size=128):
    """Build a DA-RNN with cosine-annealed learning rates.

    Constructor parameters are round-tripped through pandas DataFrames and
    saved as CSVs under results/<save_name>/ for reproducibility.

    NOTE(review): relies on module-level globals `save_name` and `args`
    (wdecay, min_lr) — confirm they are defined where this is called.

    :returns: (TrainConfig, DaRnnNet-like model) pair.
    """

    # 70/30 train split on the feature rows.
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7),
                            batch_size, nn.MSELoss())
    logging.info(f"Training size: {train_cfg.train_size:d}.")

    enc_params = pd.DataFrame([{
        'input_size': train_data.feats.shape[1],
        'hidden_size': encoder_hidden_size,
        'T': T
    }])
    enc_params.to_csv(os.path.join('results', save_name, 'enc_params.csv'))

    # .item() unwraps the numpy scalars read back from the DataFrame.
    encoder = Encoder(input_size=enc_params['input_size'][0].item(),
                      hidden_size=enc_params['hidden_size'][0].item(),
                      T=enc_params['T'][0].item()).cuda()

    dec_params = pd.DataFrame([{
        'encoder_hidden_size': encoder_hidden_size,
        'decoder_hidden_size': decoder_hidden_size,
        'T': T,
        'out_feats': n_targs
    }])
    dec_params.to_csv(os.path.join('results', save_name, 'dec_params.csv'))

    decoder = Decoder(
        encoder_hidden_size=dec_params['encoder_hidden_size'][0].item(),
        decoder_hidden_size=dec_params['decoder_hidden_size'][0].item(),
        T=dec_params['T'][0].item(),
        out_feats=dec_params['out_feats'][0].item()).cuda()

    # One optimizer per module, over trainable parameters only.
    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=args.wdecay)

    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=args.wdecay)

    # Annealing period is the number of feature rows — TODO confirm this is
    # intended rather than a number of epochs (cf. the sibling variant that
    # uses args.epochs).
    encoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        encoder_optimizer, train_data.feats.shape[0], eta_min=args.min_lr)
    decoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        decoder_optimizer, train_data.feats.shape[0], eta_min=args.min_lr)

    model = DaRnnNet(encoder, decoder, encoder_optimizer, decoder_optimizer,
                     encoder_scheduler, decoder_scheduler)

    return train_cfg, model
Example #6
0
    def __init__(self, input_size, output_size, resume=False):
        """Set up encoder/decoder with Adam optimizers; optionally restore
        both modules from checkpoints under models/.

        :param input_size: encoder input dimensionality.
        :param output_size: decoder output dimensionality.
        :param resume: when True, load models/encoder.ckpt and
            models/decoder.ckpt state dicts.
        """
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())

        self.decoder = Decoder(output_size)
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())

        self.loss = nn.CrossEntropyLoss()

        if resume:
            for module, ckpt in ((self.encoder, "models/encoder.ckpt"),
                                 (self.decoder, "models/decoder.ckpt")):
                module.load_state_dict(torch.load(ckpt))
Example #7
0
def da_rnn(train_data,
           n_targs: int,
           encoder_hidden_size=64,
           decoder_hidden_size=64,
           T=10,
           learning_rate=0.01,
           batch_size=128):
    """Build a DA-RNN with per-module Adam optimizers and cosine-annealed
    learning-rate schedulers; constructor kwargs are saved to JSON under
    data/ for later model reconstruction.

    NOTE(review): relies on module-level global `args` (wdecay, epochs,
    min_lr) — confirm it is defined where this is called.

    :returns: (TrainConfig, DaRnnNet) pair.
    """

    # 70/30 train split on the feature rows.
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7),
                            batch_size, nn.MSELoss())
    logging.info(f"Training size: {train_cfg.train_size:d}.")

    enc_kwargs = {
        "input_size": train_data.feats.shape[1],
        "hidden_size": encoder_hidden_size,
        "T": T
    }
    encoder = Encoder(**enc_kwargs).cuda()
    with open(os.path.join("data", "enc_kwargs.json"), "w") as fi:
        json.dump(enc_kwargs, fi, indent=4)

    dec_kwargs = {
        "encoder_hidden_size": encoder_hidden_size,
        "decoder_hidden_size": decoder_hidden_size,
        "T": T,
        "out_feats": n_targs
    }
    decoder = Decoder(**dec_kwargs).cuda()
    with open(os.path.join("data", "dec_kwargs.json"), "w") as fi:
        json.dump(dec_kwargs, fi, indent=4)

    # Optimize only parameters that require gradients.
    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=args.wdecay)

    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=args.wdecay)

    # Anneal over the full training run (args.epochs) down to args.min_lr.
    encoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        encoder_optimizer, args.epochs, eta_min=args.min_lr)
    decoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        decoder_optimizer, args.epochs, eta_min=args.min_lr)

    da_rnn_net = DaRnnNet(encoder, decoder, encoder_optimizer,
                          decoder_optimizer, encoder_scheduler,
                          decoder_scheduler)

    return train_cfg, da_rnn_net
Example #8
0
    def __init__(self, X_dim, Y_dim, encoder_hidden_size=64, decoder_hidden_size=64,
                 linear_dropout=0, T=10, learning_rate=1e-5, batch_size=128, decay_rate=0.95):
        """Wire up the DA-RNN encoder/decoder pair, their Adam optimizers
        and the MSE training loss.

        :param X_dim: input feature dimensionality.
        :param Y_dim: target dimensionality.
        :param T: window length passed to both modules.
        """
        # Plain hyperparameter bookkeeping.
        self.X_dim, self.Y_dim = X_dim, Y_dim
        self.T, self.batch_size, self.decay_rate = T, batch_size, decay_rate

        self.encoder = Encoder(X_dim, encoder_hidden_size, T, linear_dropout).to(device)
        self.decoder = Decoder(encoder_hidden_size, decoder_hidden_size, T, linear_dropout, Y_dim).to(device)

        self.loss_func = torch.nn.MSELoss()
        self.encoder_optim = torch.optim.Adam(params=self.encoder.parameters(), lr=learning_rate)
        self.decoder_optim = torch.optim.Adam(params=self.decoder.parameters(), lr=learning_rate)
Example #9
0
    def __init__(self, input_size, output_size):
        """Create encoder/decoder, loss, per-module Adam optimizers and the
        SOS/EOS marker tensors.

        :param input_size: encoder input dimensionality.
        :param output_size: decoder output dimensionality.
        """
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)

        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())
        self.loss = nn.CrossEntropyLoss()

        # 1x1 long tensors: token id 0 marks start-of-sequence, 1 marks end.
        self.sos = torch.zeros(1, 1, dtype=torch.long)
        self.eos = torch.ones(1, 1, dtype=torch.long)
Example #10
0
    def __init__(self, obs, nums, glimpse_size=(20, 20),
                 inpt_encoder_hidden=[256]*2,
                 glimpse_encoder_hidden=[256]*2,
                 glimpse_decoder_hidden=[252]*2,
                 transform_estimator_hidden=[256]*2,
                 steps_pred_hidden=[50]*1,
                 baseline_hidden=[256, 128]*1,
                 transform_var_bias=-2.,
                 step_bias=0.,
                 *args, **kwargs):
        """AIR model specialised for MNIST: builds the sub-network factories
        and forwards them to the base-class constructor.

        NOTE(review): the list defaults ([256]*2 etc.) are mutable defaults
        shared across calls — safe only if never mutated; confirm.

        :param obs: observations tensor/placeholder passed through to the base.
        :param nums: number-of-objects supervision passed through to the base.
        :param glimpse_size: (height, width) of attention glimpses.
        :param transform_var_bias: scale bias for the transform estimator.
        :param step_bias: bias applied to the steps predictor.
        """

        # Built eagerly (before super().__init__) unlike the lambdas below.
        self.baseline = BaselineMLP(baseline_hidden)

        def _make_transform_estimator(x):
            # Factory: transform-parameter net with the configured scale bias.
            est = StochasticTransformParam(transform_estimator_hidden, x, scale_bias=transform_var_bias)
            return est

        # Sub-networks are passed as factories so the base class controls
        # when (and how often) they are instantiated.
        super(AIRonMNIST, self).__init__(
            *args,
            obs=obs,
            nums=nums,
            glimpse_size=glimpse_size,
            n_appearance=50,
            transition=snt.LSTM(256),
            input_encoder=(lambda: Encoder(inpt_encoder_hidden)),
            glimpse_encoder=(lambda: Encoder(glimpse_encoder_hidden)),
            glimpse_decoder=(lambda x: Decoder(glimpse_decoder_hidden, x)),
            transform_estimator=_make_transform_estimator,
            steps_predictor=(lambda: StepsPredictor(steps_pred_hidden, step_bias)),
            output_std=.3,
            **kwargs
        )
Example #11
0
    def __init__(self, args):
        """State-space model: encoder/decoder/prior/posterior wrapped in
        DataParallel, plus a joint Adam optimizer.

        :param args: namespace with s_dim, a_dim, o_dim, h_dim and device
            (a list of devices; the first one hosts the modules).
        """
        super(SSM, self).__init__()

        self.s_dim = s_dim = args.s_dim  # state dim
        self.a_dim = a_dim = args.a_dim  # action dim
        self.o_dim = o_dim = args.o_dim  # observation dim
        self.h_dim = h_dim = args.h_dim  # hidden dim
        self.device = args.device
        self.args = args

        # Each module lives on device[0] and is replicated across self.device.
        self.encoder = torch.nn.DataParallel(
            Encoder(o_dim, h_dim).to(self.device[0]), self.device)
        self.decoder = torch.nn.DataParallel(
            Decoder(s_dim, o_dim).to(self.device[0]), self.device)
        self.prior = torch.nn.DataParallel(
            Prior(s_dim, a_dim).to(self.device[0]), self.device)
        # Posterior receives the prior module itself as its first argument.
        self.posterior = torch.nn.DataParallel(
            Posterior(self.prior, s_dim, a_dim, h_dim).to(self.device[0]),
            self.device)

        # Registering in a ModuleList makes all parameters visible to the
        # single optimizer below.
        self.distributions = nn.ModuleList(
            [self.prior, self.posterior, self.encoder, self.decoder])
        init_weights(self.distributions)

        # for s_aux_loss
        self.prior01 = Normal(torch.tensor(0.), scale=torch.tensor(1.))

        self.g_optimizer = optim.Adam(self.distributions.parameters())
Example #12
0
def darnn(train_data: TrainingData, n_targets: int, encoder_hidden_size: int,
          decoder_hidden_size: int, T: int, learning_rate=0.002, batch_size=32):
    """Assemble a DA-RNN: training config, encoder/decoder and one Adam
    optimizer per module.

    :returns: (TrainingConfig, Darnn_Net) pair.
    """
    # 70/30 train split on the feature rows.
    train_cfg = TrainingConfig(T, int(train_data.features.shape[0] * 0.7), batch_size, nn.MSELoss())
    print(f"Training size: {train_cfg.train_size:d}.")

    encoder = Encoder(input_size=train_data.features.shape[1],
                      hidden_size=encoder_hidden_size, T=T).to(device)
    decoder = Decoder(encoder_hidden_size=encoder_hidden_size,
                      decoder_hidden_size=decoder_hidden_size, T=T,
                      out_features=n_targets).to(device)

    def _adam(module):
        # Optimize only parameters that require gradients.
        return optim.Adam(params=[p for p in module.parameters() if p.requires_grad],
                          lr=learning_rate)

    return train_cfg, Darnn_Net(encoder, decoder, _adam(encoder), _adam(decoder))
Example #13
0
 def __init__(self, vocab, feats_size, kernel_size, rec_field, attn_size,
              hidden_size, mid_layer, dropout, which):
     """Text normalizer: convolutional encoder + attention decoder.

     :param vocab: token vocabulary; len(vocab) sizes both modules.
     :param which: selector forwarded to Encoder — TODO confirm its values.
     """
     super(TextNormalizer, self).__init__()
     self.vocab = vocab
     self.encoder = Encoder(len(vocab), feats_size, kernel_size, rec_field,
                            dropout, which)
     self.decoder = Decoder(len(vocab), feats_size, attn_size, hidden_size,
                            mid_layer, dropout)
     # Produces the decoder's initial hidden weights.
     self.init_hidden = InitialWeights(hidden_size, mid_layer, 4)
Example #14
0
def TCHA(train_data: TrainData, n_targs: int, bidirec=False, num_layer=1, encoder_hidden_size=64, decoder_hidden_size=64,
         T=10, learning_rate=0.01, batch_size=128, interval=1, split=0.7, isMean=False):
    """Build a TCHA network: training config, encoder/decoder (optionally
    bidirectional / multi-layer) and one Adam optimizer per module.

    :param split: fraction of feature rows used for training.
    :param interval: sampling interval stored in the TrainConfig.
    :param isMean: flag stored in the TrainConfig — TODO confirm semantics.
    :returns: (TrainConfig, TCHA_Net) pair.
    """
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * split), batch_size, nn.MSELoss(), interval, T, isMean)
    logger.info(f"Training size: {train_cfg.train_size:d}.")

    enc_args = {"input_size": train_data.feats.shape[1], "hidden_size": encoder_hidden_size, "T": T,
                  "bidirec": bidirec, "num_layer": num_layer}
    encoder = Encoder(**enc_args).to(device)

    dec_args = {"encoder_hidden_size": encoder_hidden_size, "decoder_hidden_size": decoder_hidden_size, "T": T,
                  "out_feats": n_targs, "bidirec": bidirec, "num_layer": num_layer}
    decoder = Decoder(**dec_args).to(device)

    # Optimize only parameters that require gradients.
    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate)
    tcha = TCHA_Net(encoder, decoder, encoder_optimizer, decoder_optimizer)

    return train_cfg, tcha
Example #15
0
        def set_params(train_data, device, **da_rnn_kwargs):
            """Build the DA-RNN training config, encoder/decoder and their
            Adam optimizers.

            Closure: reads ``self.predict_size`` from the enclosing scope to
            rescale the time step.

            :param train_data: 2-D array-like (rows, features).
            :param device: torch device the modules are moved to.
            :param da_rnn_kwargs: requires time_step, batch_size,
                en_hidden_size, de_hidden_size, learning_rate, target_cols.
            :returns: (TrainConfig, DaRnnNet) pair.
            """
            # 95/5 train split on the rows.
            train_configs = TrainConfig(da_rnn_kwargs["time_step"],
                                        int(train_data.shape[0] * 0.95),
                                        da_rnn_kwargs["batch_size"],
                                        nn.MSELoss())

            enc_kwargs = {
                "input_size": train_data.shape[1],
                "hidden_size": da_rnn_kwargs["en_hidden_size"],
                "time_step":
                int(da_rnn_kwargs["time_step"] / self.predict_size)
            }
            dec_kwargs = {
                "encoder_hidden_size": da_rnn_kwargs["en_hidden_size"],
                "decoder_hidden_size": da_rnn_kwargs["de_hidden_size"],
                "time_step":
                int(da_rnn_kwargs["time_step"] / self.predict_size),
                "out_feats": da_rnn_kwargs["target_cols"]
            }
            encoder = Encoder(**enc_kwargs).to(device)
            decoder = Decoder(**dec_kwargs).to(device)

            # Optimize only parameters that require gradients; betas/eps are
            # the Adam defaults, spelled out explicitly.
            encoder_optimizer = optim.Adam(
                params=[p for p in encoder.parameters() if p.requires_grad],
                lr=da_rnn_kwargs["learning_rate"],
                betas=(0.9, 0.999),
                eps=1e-08)
            decoder_optimizer = optim.Adam(
                params=[p for p in decoder.parameters() if p.requires_grad],
                lr=da_rnn_kwargs["learning_rate"],
                betas=(0.9, 0.999),
                eps=1e-08)
            da_rnn_net = DaRnnNet(encoder, decoder, encoder_optimizer,
                                  decoder_optimizer)

            return train_configs, da_rnn_net
Example #16
0
    def __init__(self, temp, latent_num, latent_dim):
        """Relaxed-categorical latent-variable model: encoder, decoder and a
        (possibly Exp-) relaxed categorical prior.

        :param temp: relaxation temperature; scalars are coerced to a tensor.
        :param latent_num: number of latent variables.
        :param latent_dim: number of categories per latent variable.
        """
        super(Model, self).__init__()
        # Coerce plain numbers so the distribution constructors get a tensor.
        if type(temp) != torch.Tensor:
            temp = torch.tensor(temp)
        # Name-mangled: subclasses see this as _Model__temp.
        self.__temp = temp
        self.latent_num = latent_num
        self.latent_dim = latent_dim
        self.encoder = Encoder(latent_num=latent_num, latent_dim=latent_dim)
        self.decoder = Decoder(latent_num=latent_num, latent_dim=latent_dim)
        # NOTE(review): dispatching on the class-name string is fragile —
        # an isinstance check or subclass hook would be safer; confirm before
        # changing since subclass names are not visible here.
        if 'ExpTDModel' in  str(self.__class__):
            self.prior = ExpRelaxedCategorical(temp, probs=torch.ones(latent_dim).cuda())
        else:
            self.prior = dist.RelaxedOneHotCategorical(temp, probs=torch.ones(latent_dim).cuda())
        self.initialize()

        self.softmax = nn.Softmax(dim=-1)
 def __init__(self):
     """CTC trainer: model, loss, greedy decoder, ASGD optimizer and an LR
     scheduler that reduces on plateau of the validation metric.
     """
     self.model = get_model().cuda()
     self.ctc_loss = CTCLoss(size_average=True)
     self.decoder = Decoder()
     # ASGD with L2 weight decay from the global configs.
     self.optimizer = optim.ASGD(self.model.parameters(),
                                 lr=configs.lr,
                                 weight_decay=configs.l2_weight_decay)
     # Reduce LR when the monitored quantity ('min' mode) stops improving.
     self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
         self.optimizer,
         'min',
         patience=configs.lr_scheduler_patience,
         factor=configs.lr_scheduler_factor,
         verbose=True)
     self.epoch_idx = 0
     # Best (lowest) average edit distance seen so far; sentinel start value.
     self.min_avg_dist = 1000.
Example #18
0
    def __init__(self,
                 word2idx,
                 emb_size,
                 hidden_sizes,
                 dropout,
                 rnn_type="LSTM",
                 pretrained_embs=None,
                 fixed_embs=False,
                 tied=None):
        """RNN language model: embedding encoder, stacked RNN, and a decoder
        that can tie its weights to the encoder.

        :param word2idx: token-to-index mapping; its length is the vocab size.
        :param hidden_sizes: per-layer hidden sizes; the last one feeds the
            decoder.
        :param tied: weight-tying option forwarded to Decoder.
        """
        super(RNNLanguageModel, self).__init__()

        # Decoder receives the encoder so it can tie embedding weights.
        self.encoder = Encoder(word2idx, emb_size, pretrained_embs, fixed_embs)
        self.decoder = Decoder(len(word2idx), hidden_sizes[-1], tied,
                               self.encoder)

        self.rnn = StackedRNN(rnn_type, emb_size, hidden_sizes, dropout)
        self.drop = nn.Dropout(dropout)
    def __init__(self,
                 input_dim_encoder: int,
                 hidden_dim_encoder: int,
                 output_dim_encoder: int,
                 dropout_p_encoder: float,
                 output_dim_h_decoder: int,
                 nb_classes: int,
                 dropout_p_decoder: float,
                 max_out_t_steps: int) \
            -> None:
        """Baseline audio-captioning method for the Clotho dataset.

        Args:
            input_dim_encoder: Input dimensionality of the encoder.
            hidden_dim_encoder: Hidden dimensionality of the encoder.
            output_dim_encoder: Output dimensionality of the encoder.
            dropout_p_encoder: Encoder RNN dropout.
            output_dim_h_decoder: Hidden output dimensionality of the decoder.
            nb_classes: Amount of output classes.
            dropout_p_decoder: Decoder RNN dropout.
            max_out_t_steps: Maximum output time-steps of the decoder.
        """
        super().__init__()

        self.max_out_t_steps: int = max_out_t_steps

        self.encoder: Module = Encoder(
            input_dim=input_dim_encoder,
            hidden_dim=hidden_dim_encoder,
            output_dim=output_dim_encoder,
            dropout_p=dropout_p_encoder)

        # Decoder input is twice the encoder output dimensionality.
        self.decoder: Module = Decoder(
            input_dim=output_dim_encoder * 2,
            output_dim=output_dim_h_decoder,
            nb_classes=nb_classes,
            dropout_p=dropout_p_decoder)
Example #20
0
    def __init__(self,
                 word2idx,
                 emb_size,
                 hidden_sizes,
                 dropout,
                 rnn_type="LSTM",
                 pretrained_embs=None,
                 fixed_embs=False,
                 tied=None):
        """Bidirectional language model: shared encoder/decoder plus separate
        forward and backward stacked RNNs.

        :param word2idx: token-to-index mapping; its length is the vocab size.
        :param hidden_sizes: per-layer hidden sizes; the last one feeds the
            decoder, and their count is the layer count.
        :param tied: weight-tying option forwarded to Decoder.
        """
        super(BidirectionalLanguageModel, self).__init__()
        self.drop = nn.Dropout(dropout)

        # Decoder receives the encoder so it can tie embedding weights.
        self.encoder = Encoder(word2idx, emb_size, pretrained_embs, fixed_embs)
        self.decoder = Decoder(len(word2idx), hidden_sizes[-1], tied,
                               self.encoder)

        # One stacked RNN per reading direction.
        self.forward_lstm = StackedRNN(rnn_type, emb_size, hidden_sizes,
                                       dropout)
        self.backward_lstm = StackedRNN(rnn_type, emb_size, hidden_sizes,
                                        dropout)

        self.rnn_type = rnn_type
        self.hidden_sizes = hidden_sizes
        self.nlayers = len(hidden_sizes)
Example #21
0
    def __init__(self, embed_dim=300, hidden_dim=256, inner_dim=2048,
                 n_head=2, N_en=6, N_de=6, dropout=0.1,
                 vocab_size=5000, sos_idx=2, eos_idx=3, pad_idx=0, unk_idx=1,
                 max_src_len=100, max_tgt_len=20, args=False):
        """Transformer for sequence generation: shared embedding, N_en
        encoder layers, N_de decoder layers, final vocab projection and an
        NLL loss.

        :param N_en: number of encoder layers; N_de: number of decoder layers.
        :param sos_idx/eos_idx/pad_idx/unk_idx: special token indices.
        :param max_src_len/max_tgt_len: length caps for source/target.
        """
        super(Transformer, self).__init__()

        #===Test the GPU availability
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        #--Token indexes & Properties
        self.sos, self.eos, self.pad, self.unk = sos_idx, eos_idx, pad_idx, unk_idx
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        # Embedding scale factor sqrt(embed_dim).
        self.scale = embed_dim ** 0.5

        #===Base model (attn, enc, dec, ff)
        # One shared positional cap; the decoder layer gets masking (True).
        max_len = max(max_src_len, max_tgt_len)
        attn_enc_layer = ATTNLayer(
            embed_dim, n_head, hidden_dim, inner_dim, dropout, max_len, False)
        attn_dec_layer = ATTNLayer(
            embed_dim, n_head, hidden_dim, inner_dim, dropout, max_len, True)


        #===Main architecture (enc, dec)
        self.encoder = Encoder(attn_enc_layer, N_en, True)
        self.decoder = Decoder(attn_dec_layer, N_de, True)

        #===Embedding (shared between src and tgt)
        self.embed = nn.Embedding(vocab_size, embed_dim)

        #===Final FC (logit2vocab)
        self.final = nn.Linear(embed_dim, vocab_size)

        #===Loss
        self.NLL = nn.NLLLoss(reduction='sum')
Example #22
0
    def __init__(self, config, api, PAD_token=0, pretrain_weight=None):
        """Poem Wasserstein auto-encoder: embedder, sequence encoder,
        prior/posterior variation nets with generators, decoder,
        discriminator, and the three optimizers (AE / G / D).

        :param config: hyperparameter namespace (maxlen, clip, lambda_gp,
            lr_gan_g, lr_gan_d, n_d_loss, temp, init_weight, emb_size,
            n_hidden, n_layers, noise_radius, dropout, z_size, lr_ae).
        :param api: corpus object exposing ``vocab`` and ``rev_vocab``.
        :param PAD_token: padding index for the embedding table.
        :param pretrain_weight: optional numpy array of pretrained embeddings.
        """
        super(PoemWAE, self).__init__()
        self.vocab = api.vocab
        self.vocab_size = len(self.vocab)
        self.rev_vocab = api.rev_vocab
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.maxlen = config.maxlen
        self.clip = config.clip            # gradient-clipping threshold
        self.lambda_gp = config.lambda_gp  # gradient-penalty weight
        self.lr_gan_g = config.lr_gan_g
        self.lr_gan_d = config.lr_gan_d
        self.n_d_loss = config.n_d_loss
        self.temp = config.temp
        self.init_w = config.init_weight

        self.embedder = nn.Embedding(self.vocab_size,
                                     config.emb_size,
                                     padding_idx=PAD_token)
        if pretrain_weight is not None:
            self.embedder.weight.data.copy_(torch.from_numpy(pretrain_weight))
        # The same seq_encoder encodes the title and the neighbouring lines.
        self.seq_encoder = Encoder(self.embedder, config.emb_size,
                                   config.n_hidden, True, config.n_layers,
                                   config.noise_radius)
        # For Poem the context is the direct concatenation of the bi-GRU
        # encodings of title and last sentence, hence 4*hidden.
        # Note: the Poemwar_gmp subclass replaces prior_net with a
        # Gaussian-mixture prior.
        self.prior_net = Variation(config.n_hidden * 4,
                                   config.z_size,
                                   dropout_rate=config.dropout,
                                   init_weight=self.init_w)  # p(e|c)

        # Originally 3*hidden for the Dialog task. On the Poem dataset the
        # title, the previous line and x are each bi-GRU encoded and
        # concatenated, hence 6*hidden.
        self.post_net = Variation(config.n_hidden * 6,
                                  config.z_size,
                                  dropout_rate=config.dropout,
                                  init_weight=self.init_w)

        # MLP mapping posterior samples into the latent code space.
        self.post_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.post_generator.apply(self.init_weights)

        # Same architecture for the prior side.
        self.prior_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.prior_generator.apply(self.init_weights)

        # Maps [context (4*hidden) ; z] to the decoder's initial hidden state.
        self.init_decoder_hidden = nn.Sequential(
            nn.Linear(config.n_hidden * 4 + config.z_size,
                      config.n_hidden * 4),
            nn.BatchNorm1d(config.n_hidden * 4, eps=1e-05, momentum=0.1),
            nn.ReLU())

        # For Poem the context is the concatenation of the bi-GRU encodings
        # of title and last sentence, so the conditioning size would be
        # z_size + 4*hidden; change: the decoder hidden size stays n_hidden
        # and the init_decoder_hidden MLP above maps the concatenation down.
        self.decoder = Decoder(self.embedder,
                               config.emb_size,
                               config.n_hidden * 4,
                               self.vocab_size,
                               n_layers=1)

        self.discriminator = nn.Sequential(
            # Poem concatenates two bidirectional encodings, so the input is
            # 4*n_hidden + z_size here.
            nn.Linear(config.n_hidden * 4 + config.z_size,
                      config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        # Optimizers: one per training stage (AE / G / D) — note all three
        # differ in both algorithm/parameters and learning rate.
        self.optimizer_AE = optim.SGD(
            list(self.seq_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.init_decoder_hidden.parameters()) +
            list(self.decoder.parameters()),
            lr=config.lr_ae)
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=self.lr_gan_g)
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=self.lr_gan_d)

        # Decay the AE learning rate by 0.8 every 10 steps of the scheduler.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.8)

        self.criterion_ce = nn.CrossEntropyLoss()
Example #23
0
class PoemWAE(nn.Module):
    """Wasserstein auto-encoder for four-line poem generation.

    A single shared sentence encoder embeds the title, the previous line and
    the target line; prior/posterior variation networks map the context to a
    latent code z, and the decoder generates the next line conditioned on
    cat(z, c).  Training alternates three stages with separate optimizers:
    the auto-encoder (train_AE), the generator (train_G) and the WGAN-GP
    critic (train_D).
    """

    def __init__(self, config, api, PAD_token=0, pretrain_weight=None):
        """Build encoders, prior/posterior nets, decoder, critic, optimizers.

        Args:
            config: hyper-parameter namespace (maxlen, clip, n_hidden, ...).
            api: corpus object exposing vocab / rev_vocab mappings.
            PAD_token: padding index for the embedding (default 0).
            pretrain_weight: optional numpy array of pretrained embeddings.
        """
        super(PoemWAE, self).__init__()
        self.vocab = api.vocab
        self.vocab_size = len(self.vocab)
        self.rev_vocab = api.rev_vocab
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.maxlen = config.maxlen
        self.clip = config.clip
        self.lambda_gp = config.lambda_gp
        self.lr_gan_g = config.lr_gan_g
        self.lr_gan_d = config.lr_gan_d
        self.n_d_loss = config.n_d_loss
        self.temp = config.temp
        self.init_w = config.init_weight

        self.embedder = nn.Embedding(self.vocab_size,
                                     config.emb_size,
                                     padding_idx=PAD_token)
        if pretrain_weight is not None:
            self.embedder.weight.data.copy_(torch.from_numpy(pretrain_weight))
        # The same seq_encoder is used to encode the title and the
        # previous/next lines.
        self.seq_encoder = Encoder(self.embedder, config.emb_size,
                                   config.n_hidden, True, config.n_layers,
                                   config.noise_radius)
        # For poems the context is the direct cat of the bidirectional GRU
        # encodings of the title and the last line, hence 4*hidden.
        # NOTE: the PoemWAE_GMP variant overrides prior_net in a subclass with
        # a Gaussian-mixture prior.
        self.prior_net = Variation(config.n_hidden * 4,
                                   config.z_size,
                                   dropout_rate=config.dropout,
                                   init_weight=self.init_w)  # p(e|c)

        # The 3*hidden sizing was for the dialog task; on the poem data the
        # title, the previous line and x are each bidirectionally encoded and
        # concatenated, hence 6*hidden.
        self.post_net = Variation(config.n_hidden * 6,
                                  config.z_size,
                                  dropout_rate=config.dropout,
                                  init_weight=self.init_w)

        # Small MLP that refines the posterior sample (q side of the WAE).
        self.post_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.post_generator.apply(self.init_weights)

        # Mirror network for the prior sample (p side of the WAE).
        self.prior_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1), nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.prior_generator.apply(self.init_weights)

        # MLP mapping cat(z, c) to the decoder's initial hidden state.
        self.init_decoder_hidden = nn.Sequential(
            nn.Linear(config.n_hidden * 4 + config.z_size,
                      config.n_hidden * 4),
            nn.BatchNorm1d(config.n_hidden * 4, eps=1e-05, momentum=0.1),
            nn.ReLU())

        # Since the poem context is the cat of two bidirectional encodings,
        # the raw decoder input would be z_size + 4*hidden.
        # Changed: keep the decoder hidden size fixed and let
        # init_decoder_hidden's MLP map the cat down to it.
        self.decoder = Decoder(self.embedder,
                               config.emb_size,
                               config.n_hidden * 4,
                               self.vocab_size,
                               n_layers=1)

        self.discriminator = nn.Sequential(
            # With the two bidirectional encodings cat'ed, the critic input is
            # 4*n_hidden + z_size.
            nn.Linear(config.n_hidden * 4 + config.z_size,
                      config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        # Optimizers: one per training stage.  Note the three stages use
        # DIFFERENT optimizers over different parameter subsets.
        self.optimizer_AE = optim.SGD(
            list(self.seq_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.init_decoder_hidden.parameters()) +
            list(self.decoder.parameters()),
            lr=config.lr_ae)
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=self.lr_gan_g)
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=self.lr_gan_d)

        # Decay the AE learning rate by 0.8 every 10 scheduler steps.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.8)

        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        """Uniformly initialize Linear layers in [-init_w, init_w], zero bias."""
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-self.init_w, self.init_w)
            m.bias.data.fill_(0)

    # x: (batch, 2*n_hidden)
    # c: (batch, 2*2*n_hidden)
    def sample_code_post(self, x, c):
        """Posterior latent code z ~ q(z|x,c), refined by post_generator."""
        z, _, _ = self.post_net(torch.cat((x, c),
                                          1))  # input: (batch, 3*2*n_hidden)
        z = self.post_generator(z)
        return z

    def sample_code_prior_sentiment(self, c, align):
        """Prior statistics conditioned on sentiment alignment.

        NOTE(review): the base-class prior_net (Variation) takes a single
        argument; this two-argument call matches the GMP subclass override —
        confirm this method is only used with that subclass.
        """
        choice_statistic = self.prior_net(c, align)  # e: (batch, z_size)
        return choice_statistic

    def sample_code_prior(self, c):
        """Prior latent code z ~ p(z|c), refined by prior_generator."""
        z, _, _ = self.prior_net(c)  # e: (batch, z_size)
        z = self.prior_generator(z)  # z: (batch, z_size)
        return z

    # Inputs: title, context, target, target_lens.
    # c is the concat of the encoded title and the encoded context.
    def train_AE(self, title, context, target, target_lens):
        """One auto-encoder step: reconstruct target from (z_post, c).

        Returns a [('train_loss_AE', loss)] stats list.
        """
        self.seq_encoder.train()
        self.decoder.train()
        # (batch, 2 * hidden_size)
        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)

        # (batch, 2 * hidden_size); drop the leading <s> from target
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)
        # context embedding
        c = torch.cat((title_last_hidden, context_last_hidden),
                      1)  # (batch, 2 * hidden_size * 2)
        z = self.sample_code_post(x, c)  # (batch, z_size)

        # Standard auto-encoder decode: the decoder's initial state is built
        # from cat(z, c) and the target is fed shifted by one position.
        # output: (batch, len, vocab_size); len is 9, i.e. 7 chars + punctuation + </s>

        output = self.decoder(self.init_decoder_hidden(torch.cat((z, c), 1)),
                              None, target[:, :-1], target_lens - 1)
        flattened_output = output.view(-1, self.vocab_size)

        dec_target = target[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # True where target token is not PAD (id 0)
        masked_target = dec_target.masked_select(mask)  # keep non-pad targets
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz * seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)

        self.optimizer_AE.zero_grad()
        loss = self.criterion_ce(masked_output / self.temp, masked_target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            list(self.seq_encoder.parameters()) +
            list(self.decoder.parameters()), self.clip)
        self.optimizer_AE.step()

        return [('train_loss_AE', loss.item())]

    # G shrinks the W-distance — analogous to minimizing the KL term in a VAE.
    def train_G(self,
                title,
                context,
                target,
                target_lens,
                sentiment_mask=None,
                mask_type=None):
        """One generator step: move q(z|x,c) and p(z|c) toward each other
        under the (frozen) critic.  Returns [('train_loss_G', cost)].
        """
        self.seq_encoder.eval()
        self.optimizer_G.zero_grad()

        # Freeze the critic while updating the generator.
        for p in self.discriminator.parameters():
            p.requires_grad = False
        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        c = torch.cat((title_last_hidden, context_last_hidden),
                      1)  # (batch, 2 * hidden_size * 2)

        # -----------------posterior samples ---------------------------
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)
        z_post = self.sample_code_post(
            x.detach(), c.detach())  # detached: block gradients into the encoder (batch, z_size)

        errG_post = torch.mean(
            self.discriminator(torch.cat(
                (z_post, c.detach()),
                1))) * self.n_d_loss  # (batch, z_size + 4 * hidden)
        # `one`/`minus_one` are module-level tensors (presumably +1/-1) giving
        # the WGAN ascent/descent directions — confirm where they are defined.
        errG_post.backward(minus_one)

        # ----------------- prior samples ---------------------------
        prior_z = self.sample_code_prior(c.detach())
        errG_prior = torch.mean(
            self.discriminator(torch.cat(
                (prior_z, c.detach()), 1))) * self.n_d_loss
        errG_prior.backward(one)
        self.optimizer_G.step()

        # Unfreeze the critic for its own training stage.
        for p in self.discriminator.parameters():
            p.requires_grad = True

        costG = errG_prior - errG_post

        return [('train_loss_G', costG.item())]

    # D approximates the W-distance: a falling loss means a better fit, and a
    # stronger gradient penalty can improve that fit to some extent.
    # The more critic iterations (n_iters_n), the better the approximation.
    def train_D(self, title, context, target, target_lens):
        """One WGAN-GP critic step with gradient penalty on interpolates.

        Returns [('train_loss_D', cost)].
        """
        self.seq_encoder.eval()
        self.discriminator.train()
        self.optimizer_D.zero_grad()

        batch_size = context.size(0)

        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        c = torch.cat((title_last_hidden, context_last_hidden),
                      1)  # (batch, 2, hidden_size * 2)
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)
        post_z = self.sample_code_post(x, c)
        errD_post = torch.mean(
            self.discriminator(torch.cat(
                (post_z.detach(), c.detach()), 1))) * self.n_d_loss
        errD_post.backward(one)

        prior_z = self.sample_code_prior(c)
        errD_prior = torch.mean(
            self.discriminator(torch.cat(
                (prior_z.detach(), c.detach()), 1))) * self.n_d_loss
        errD_prior.backward(minus_one)

        # Gradient penalty on random interpolates between prior and posterior
        # samples (WGAN-GP).
        alpha = to_tensor(torch.rand(batch_size, 1))
        alpha = alpha.expand(prior_z.size())
        interpolates = alpha * prior_z.data + ((1 - alpha) * post_z.data)
        interpolates = Variable(interpolates, requires_grad=True)

        d_input = torch.cat((interpolates, c.detach()), 1)
        disc_interpolates = torch.mean(self.discriminator(d_input))
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=to_tensor(torch.ones(disc_interpolates.size())),
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        gradient_penalty = (
            (gradients.contiguous().view(gradients.size(0), -1).norm(2, dim=1)
             - 1)**2).mean() * self.lambda_gp
        gradient_penalty.backward()

        self.optimizer_D.step()
        costD = -(errD_prior - errD_post) + gradient_penalty
        return [('train_loss_D', costD.item())]

    def valid(self, title, context, target, target_lens, sentiment_mask=None):
        """Evaluate AE/G/D losses on one batch without parameter updates."""
        self.seq_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()

        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        c = torch.cat((title_last_hidden, context_last_hidden),
                      1)  # (batch, 2 * hidden_size * 2)
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)

        post_z = self.sample_code_post(x, c)
        prior_z = self.sample_code_prior(c)
        errD_post = torch.mean(self.discriminator(torch.cat((post_z, c), 1)))
        errD_prior = torch.mean(self.discriminator(torch.cat((prior_z, c), 1)))
        costD = -(errD_prior - errD_post)
        costG = -costD

        dec_target = target[:, 1:].contiguous().view(-1)  # (batch_size * len)
        mask = dec_target.gt(0)  # True where target token is not PAD (id 0)
        masked_target = dec_target.masked_select(mask)  # keep non-pad targets
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)

        output = self.decoder(
            self.init_decoder_hidden(torch.cat((post_z, c), 1)), None,
            target[:, :-1], (target_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item()), ('valid_loss_D', costD.item())]

    # As in the paper: at test time draw noise from the prior network, let G
    # produce prior_z (sample_code_prior(c)), then feed cat(prior_z, c) to the
    # decoder.  (Slightly different from the paper, which feeds prior_z only.)
    # batch_size is 1 — one line is generated at a time.

    # title: the poem title
    # context: the previous line
    def test(self, title_tensor, title_words, headers):
        """Generate a full four-line poem for one title.

        Returns the poem as a single string, one space-joined line per row.
        """
        self.seq_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()

        # tem is initialized to [2, 3, 0, 0, 0, 0, 0, 0, 0]
        tem = [[2, 3] + [0] * (self.maxlen - 2)]
        pred_poems = []

        # Strip pad/<s>/</s> ids from the title before echoing it back.
        title_tokens = [
            self.vocab[e] for e in title_words[0].tolist()
            if e not in [0, self.eos_id, self.go_id]
        ]
        pred_poems.append(title_tokens)
        for sent_id in range(4):
            tem = to_tensor(np.array(tem))
            context = tem

            title_last_hidden, _ = self.seq_encoder(
                title_tensor)  # (batch=1, 2*hidden)
            # The first line conditions on the title itself; later lines
            # condition on the previously generated line.
            if sent_id == 0:
                context_last_hidden, _ = self.seq_encoder(
                    title_tensor)  # (batch=1, 2*hidden)
            else:
                context_last_hidden, _ = self.seq_encoder(
                    context)  # (batch=1, 2*hidden)
            c = torch.cat((title_last_hidden, context_last_hidden),
                          1)  # (batch, 4*hidden_size)
            # Only one poem at a time (batch_size = 1), so no repeat needed.
            prior_z = self.sample_code_prior(c)

            # decode_words is one complete poem line
            decode_words = self.decoder.testing(
                init_hidden=self.init_decoder_hidden(torch.cat((prior_z, c),
                                                               1)),
                maxlen=self.maxlen,
                go_id=self.go_id,
                mode="greedy",
                header=headers[sent_id])

            decode_words = decode_words[0].tolist()
            # Left-pad (or truncate) the generated line to maxlen so it can be
            # re-encoded as the next step's context.
            if len(decode_words) > self.maxlen:
                tem = [decode_words[0:self.maxlen]]
            else:
                tem = [[0] * (self.maxlen - len(decode_words)) + decode_words]

            pred_tokens = [
                self.vocab[e] for e in decode_words[:-1]
                if e != self.eos_id and e != 0
            ]
            pred_poems.append(pred_tokens)

        gen = ''
        for line in pred_poems:
            true_str = " ".join(line)
            gen = gen + true_str + '\n'

        return gen

    def sample(self, title, context, repeat, go_id, end_id):
        """Sample `repeat` candidate next lines for one (title, context) pair.

        Returns (sample_words, sample_lens) from the decoder's sampler.
        """
        self.seq_encoder.eval()
        self.decoder.eval()

        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        c = torch.cat((title_last_hidden, context_last_hidden),
                      1)  # (batch, 2 * hidden_size * 2)

        # Input batch_size is 1; replicate `repeat` times for the later BLEU
        # computation.
        c_repeated = c.expand(repeat, -1)

        prior_z = self.sample_code_prior(
            c_repeated)  # c_repeated: (batch_size=repeat, 4*hidden_size)

        # (batch, max_len, 1)  (batch_size, 1)
        sample_words, sample_lens = self.decoder.sampling(
            self.init_decoder_hidden(torch.cat((prior_z, c_repeated), 1)),
            self.maxlen, go_id, end_id, "greedy")
        return sample_words, sample_lens
Example #24
0
    def __init__(self,
                 num_steps,
                 x_size,
                 window_size,
                 z_what_size,
                 rnn_hidden_size,
                 encoder_net=None,
                 decoder_net=None,
                 predict_net=None,
                 embed_net=None,
                 bl_predict_net=None,
                 non_linearity='ReLU',
                 decoder_output_bias=None,
                 decoder_output_use_sigmoid=False,
                 use_masking=True,
                 use_baselines=True,
                 baseline_scalar=None,
                 scale_prior_mean=3.0,
                 scale_prior_sd=0.1,
                 pos_prior_mean=0.0,
                 pos_prior_sd=1.0,
                 likelihood_sd=0.3,
                 use_cuda=False):
        """Build the AIR model: recurrent inference nets, decoder, baselines.

        Args:
            num_steps: maximum number of inference steps (objects).
            x_size: side length of the (square) input image.
            window_size: side length of the attention window.
            z_what_size: dimensionality of the per-object appearance latent.
            rnn_hidden_size: hidden size of both LSTM cells.
            encoder_net / decoder_net / predict_net / bl_predict_net: hidden
                layer sizes for the corresponding MLPs (empty list = linear).
            embed_net: optional MLP sizes for embedding the image; when None
                the raw flattened image is used.
            non_linearity: name of an `nn` activation class, e.g. 'ReLU'.
            decoder_output_bias / decoder_output_use_sigmoid: decoder output
                head configuration.
            use_masking / use_baselines / baseline_scalar: training options.
            scale_prior_* / pos_prior_*: z_where prior parameters.
            likelihood_sd: sd of the image likelihood.
            use_cuda: move the module to the GPU when True.
        """
        super(AIR, self).__init__()

        # Bug fix: the original defaults were shared mutable lists (`[]`),
        # which can leak state between instances.  Use None sentinels and
        # create fresh lists here; observable behavior is unchanged.
        encoder_net = [] if encoder_net is None else encoder_net
        decoder_net = [] if decoder_net is None else decoder_net
        predict_net = [] if predict_net is None else predict_net
        bl_predict_net = [] if bl_predict_net is None else bl_predict_net

        self.num_steps = num_steps
        self.x_size = x_size
        self.window_size = window_size
        self.z_what_size = z_what_size
        self.rnn_hidden_size = rnn_hidden_size
        self.use_masking = use_masking
        self.use_baselines = use_baselines
        self.baseline_scalar = baseline_scalar
        self.likelihood_sd = likelihood_sd
        self.use_cuda = use_cuda
        # Probe tensor used only to record the dtype/device new tensors get.
        prototype = torch.tensor(0.).cuda() if use_cuda else torch.tensor(0.)
        self.options = dict(dtype=prototype.dtype, device=prototype.device)

        self.z_pres_size = 1
        self.z_where_size = 3
        # By making these parameters they will be moved to the gpu
        # when necessary. (They are not registered with pyro for
        # optimization.)
        self.z_where_loc_prior = nn.Parameter(torch.FloatTensor(
            [scale_prior_mean, pos_prior_mean, pos_prior_mean]),
                                              requires_grad=False)
        self.z_where_scale_prior = nn.Parameter(torch.FloatTensor(
            [scale_prior_sd, pos_prior_sd, pos_prior_sd]),
                                                requires_grad=False)

        # Create nn modules.
        rnn_input_size = x_size**2 if embed_net is None else embed_net[-1]
        rnn_input_size += self.z_where_size + z_what_size + self.z_pres_size
        nl = getattr(nn, non_linearity)

        self.rnn = nn.LSTMCell(rnn_input_size, rnn_hidden_size)
        self.encode = Encoder(window_size**2, encoder_net, z_what_size, nl)
        self.decode = Decoder(window_size**2, decoder_net, z_what_size,
                              decoder_output_bias, decoder_output_use_sigmoid,
                              nl)
        self.predict = Predict(rnn_hidden_size, predict_net, self.z_pres_size,
                               self.z_where_size, nl)
        self.embed = Identity() if embed_net is None else MLP(
            x_size**2, embed_net, nl, True)

        # Baseline networks (value estimates) for REINFORCE variance reduction.
        self.bl_rnn = nn.LSTMCell(rnn_input_size, rnn_hidden_size)
        self.bl_predict = MLP(rnn_hidden_size, bl_predict_net + [1], nl)
        self.bl_embed = Identity() if embed_net is None else MLP(
            x_size**2, embed_net, nl, True)

        # Create parameters: learned initial RNN and latent states.
        self.h_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
        self.c_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
        self.bl_h_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
        self.bl_c_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
        self.z_where_init = nn.Parameter(torch.zeros(1, self.z_where_size))
        self.z_what_init = nn.Parameter(torch.zeros(1, self.z_what_size))

        if use_cuda:
            self.cuda()
    def __init__(self, config, vocab_size, PAD_token=0):
        """Build the flow-based dialog VAE: utterance/context encoders,
        prior/posterior variation nets, stacked IAF flows, decoder and
        the AE / generator optimizers.

        Args:
            config: dict of hyper-parameters (emb_size, n_hidden, z_size, ...).
            vocab_size: size of the shared vocabulary.
            PAD_token: padding index for the embedding (default 0).
        """
        super(DFVAE, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.lambda_gp = config['lambda_gp']
        self.temp = config['temp']

        self.embedder = nn.Embedding(vocab_size,
                                     config['emb_size'],
                                     padding_idx=PAD_token)
        self.utt_encoder = Encoder(self.embedder, config['emb_size'],
                                   config['n_hidden'], True,
                                   config['n_layers'], config['noise_radius'])
        # Context encoder consumes the utterance encoding (2*n_hidden, plus 2
        # extra features — presumably floor flags; confirm against caller).
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              config['n_hidden'] * 2 + 2,
                                              config['n_hidden'], 1,
                                              config['noise_radius'])
        self.prior_net = Variation(config['n_hidden'],
                                   config['z_size'])  # p(e|c)
        self.post_net = Variation(config['n_hidden'] * 3,
                                  config['z_size'])  # q(e|c,x)

        # Three stacked IAF steps for the posterior flow and three for the
        # prior flow.
        self.postflow1 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.postflow2 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.postflow3 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.priorflow1 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)
        self.priorflow2 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)
        self.priorflow3 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)

        self.post_generator = nn_.SequentialFlow(self.postflow1,
                                                 self.postflow2,
                                                 self.postflow3)
        self.prior_generator = nn_.SequentialFlow(self.priorflow1,
                                                  self.priorflow2,
                                                  self.priorflow3)

        # Decoder conditions on cat(z, c): n_hidden + z_size.
        self.decoder = Decoder(self.embedder,
                               config['emb_size'],
                               config['n_hidden'] + config['z_size'],
                               vocab_size,
                               n_layers=1)

        self.optimizer_AE = optim.SGD(
            list(self.context_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.decoder.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_gan_g'])

        # Decay the AE learning rate by 0.6 every 10 scheduler steps.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)

        self.criterion_ce = nn.CrossEntropyLoss()
class DFVAE(nn.Module):
    def __init__(self, config, vocab_size, PAD_token=0):
        """Assemble the flow-based dialog VAE.

        Creates the shared embedding, utterance/context encoders, the prior
        and posterior variation networks, three stacked IAF steps per flow,
        the decoder, and the AE / generator optimizers with their scheduler.

        Args:
            config: dict of hyper-parameters (emb_size, n_hidden, z_size, ...).
            vocab_size: size of the shared vocabulary.
            PAD_token: padding index for the embedding (default 0).
        """
        super(DFVAE, self).__init__()
        emb_size = config['emb_size']
        n_hidden = config['n_hidden']
        z_size = config['z_size']

        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.lambda_gp = config['lambda_gp']
        self.temp = config['temp']

        self.embedder = nn.Embedding(vocab_size, emb_size,
                                     padding_idx=PAD_token)
        self.utt_encoder = Encoder(self.embedder, emb_size, n_hidden, True,
                                   config['n_layers'],
                                   config['noise_radius'])
        # The context encoder consumes the bi-directional utterance encoding
        # plus two extra input features.
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              n_hidden * 2 + 2, n_hidden, 1,
                                              config['noise_radius'])
        self.prior_net = Variation(n_hidden, z_size)  # p(e|c)
        self.post_net = Variation(n_hidden * 3, z_size)  # q(e|c,x)

        # Three IAF steps for the posterior flow, three for the prior flow.
        self.postflow1 = flow.myIAF(z_size, z_size * 2, n_hidden, 3)
        self.postflow2 = flow.myIAF(z_size, z_size * 2, n_hidden, 3)
        self.postflow3 = flow.myIAF(z_size, z_size * 2, n_hidden, 3)
        self.priorflow1 = flow.IAF(z_size, z_size * 2, n_hidden, 3)
        self.priorflow2 = flow.IAF(z_size, z_size * 2, n_hidden, 3)
        self.priorflow3 = flow.IAF(z_size, z_size * 2, n_hidden, 3)

        self.post_generator = nn_.SequentialFlow(self.postflow1,
                                                 self.postflow2,
                                                 self.postflow3)
        self.prior_generator = nn_.SequentialFlow(self.priorflow1,
                                                  self.priorflow2,
                                                  self.priorflow3)

        # The decoder conditions on cat(z, c), hence n_hidden + z_size.
        self.decoder = Decoder(self.embedder,
                               emb_size,
                               n_hidden + z_size,
                               vocab_size,
                               n_layers=1)

        self.optimizer_AE = optim.SGD(
            list(self.context_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.decoder.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_gan_g'])

        # Multiply the AE learning rate by 0.6 every 10 scheduler steps.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)

        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-0.02, 0.02)
            m.bias.data.fill_(0)

    def sample_post(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        #h_post = self.post_highway(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        #h_prior = self.prior_highway(c)
        tilde_z, det_g, _ = self.prior_generator((z, det_f, c))
        return tilde_z, z, mu, log_s, det_f, det_g

    def sample_code_post(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        #h_post = self.post_highway(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        #h_prior = self.prior_highway(c)
        tilde_z, det_g, _ = self.prior_generator((z, det_f, c))
        return tilde_z, mu, log_s, det_f, det_g

    def sample_post2(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        #h_post = self.post_highway(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        return e, mu, log_s, z, det_f

    def sample_code_prior(self, c):
        e, mu, log_s = self.prior_net(c)
        #z = self.prior_generator(e)
        #h_prior = self.prior_highway(c)
        #tilde_z, det_g, _ = self.prior_generator((e, 0, h_prior))
        return e, mu, log_s  #, det_g

    def sample_prior(self, c):
        e, mu, log_s = self.prior_net(c)
        #h_prior = self.prior_highway(c)
        z, det_prior, _ = self.prior_generator((e, 0, c))
        return z, det_prior

    def train_AE(self, context, context_lens, utt_lens, floors, response,
                 res_lens):
        """One auto-encoder step: reconstruct the response from (z_post, c).

        Encodes the dialog context and the response, samples a posterior
        latent, decodes the shifted response, and takes one SGD step on the
        masked cross-entropy.  Returns [('train_loss_AE', loss)].
        """
        self.context_encoder.train()
        self.decoder.train()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        # Encode the response without its leading <s> token.
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        # NOTE(review): this first sample is discarded; the call is repeated
        # below.  It still consumes randomness — confirm whether it is needed.
        z, _, _, _, _ = self.sample_code_post(x, c)
        z_post, mu_post, log_s_post, det_f, det_g = self.sample_code_post(x, c)
        # Decode conditioned on cat(z_post, c); target fed shifted by one.
        output = self.decoder(torch.cat((z_post, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)

        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]; True where not PAD
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz*seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        self.optimizer_AE.zero_grad()
        AE_term = self.criterion_ce(masked_output / self.temp, masked_target)
        loss = AE_term
        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            list(self.context_encoder.parameters()) +
            list(self.decoder.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_generator.parameters()) +
            list(self.post_net.parameters()), self.clip)
        self.optimizer_AE.step()

        return [
            ('train_loss_AE', AE_term.item())
        ]

    def train_G(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        """One generator step: minimize the KL term between the flow
        posterior q(z|c,x) and the prior p(z|c).

        The context encoder runs in eval mode and its outputs are detached,
        so only the posterior/prior generator networks and the posterior
        net receive gradients.

        Returns a list of (metric_name, value) tuples for logging.
        """
        self.context_encoder.eval()
        self.optimizer_G.zero_grad()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        # ----------------- posterior samples ---------------------------
        # Encode the response without its leading SOS token.
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z_0, mu_post, log_s_post, z_post, weight = self.sample_post2(
            x.detach(), c.detach())
        # ----------------- prior samples ---------------------------
        prior_z, mu_prior, log_s_prior = self.sample_code_prior(c.detach())
        # Closed-form Gaussian KL with a flow-weight correction term.
        # NOTE(review): the trailing "- 100" offset looks like an
        # experimentation artifact -- confirm it is intentional.
        KL_loss = torch.sum(
            log_s_prior - log_s_post + torch.exp(log_s_post) /
            torch.exp(log_s_prior) * torch.sum(weight**2, dim=2) +
            (mu_post)**2 / torch.exp(log_s_prior), 1) / 2 - 100
        loss = KL_loss
        loss.mean().backward()
        # Fix: the original parameter list contained
        # prior_generator.parameters() twice, so those gradients were both
        # double-counted in the total-norm computation and rescaled twice.
        torch.nn.utils.clip_grad_norm_(
            list(self.post_generator.parameters()) +
            list(self.prior_generator.parameters()) +
            list(self.post_net.parameters()), self.clip)
        self.optimizer_G.step()
        return [('KL_loss', KL_loss.mean().item())]

    def valid(self, context, context_lens, utt_lens, floors, response,
              res_lens):
        """Validation pass: compute the KL objective and the masked
        reconstruction (cross-entropy) loss on a held-out batch.

        Returns a list of (metric_name, value) tuples.
        """
        self.context_encoder.eval()
        #self.discriminator.eval()
        self.decoder.eval()

        # Encode the context, then the response without its SOS token.
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z, mu_post, log_s_post, det_f, det_g = self.sample_code_post(x, c)
        prior_z, mu_prior, log_s_prior = self.sample_code_prior(c)
        #errD_post = torch.mean(self.discriminator(torch.cat((post_z, c),1)))
        #errD_prior = torch.mean(self.discriminator(torch.cat((prior_z, c),1)))
        # Closed-form Gaussian KL between posterior and prior (per example,
        # summed over latent dims). det_f/det_g are unused here.
        KL_loss = torch.sum(
            log_s_prior - log_s_post +
            (torch.exp(log_s_post) +
             (mu_post)**2) / torch.exp(log_s_prior), 1) / 2
        #KL_loss = log_Normal_diag(post_z, mu_post, log_s_post) - log_Normal_diag(prior_z, mu_prior, log_s_prior)
        #KL_loss2 = torch.sum((prior_z - mu_post)**2 / (2 * torch.exp(log_s_post)),1)
        loss = KL_loss  # -det_f
        costG = loss.sum()
        # Reconstruction loss is computed over non-padding target tokens
        # only: select the rows of the flattened logits where target > 0.
        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)
        # Decode conditioned on the posterior latent and the context.
        output = self.decoder(torch.cat((post_z, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        # Temperature-scaled logits feed the cross-entropy criterion.
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item())]

    def sample(self, context, context_lens, utt_lens, floors, repeat, SOS_tok,
               EOS_tok):
        """Greedily decode `repeat` responses from prior latent samples."""
        self.context_encoder.eval()
        self.decoder.eval()

        # Encode the dialogue context, then tile it so each latent sample
        # is paired with its own copy of the context vector.
        ctx = self.context_encoder(context, context_lens, utt_lens, floors)
        ctx_rep = ctx.expand(repeat, -1)
        z_prior, _ = self.sample_prior(ctx_rep)
        dec_init = torch.cat((z_prior, ctx_rep), 1)
        words, lens = self.decoder.sampling(dec_init, None, self.maxlen,
                                            SOS_tok, EOS_tok, "greedy")
        return words, lens

    def gen(self, context, prior_z, context_lens, utt_lens, floors, repeat,
            SOS_tok, EOS_tok):
        """Greedily decode responses from a caller-supplied latent `prior_z`."""
        self.context_encoder.eval()
        self.decoder.eval()
        ctx = self.context_encoder(context, context_lens, utt_lens, floors)
        # Tile the context so it lines up with the `repeat` latent samples.
        ctx_rep = ctx.expand(repeat, -1)
        dec_init = torch.cat((prior_z, ctx_rep), 1)
        words, lens = self.decoder.sampling(dec_init, None, self.maxlen,
                                            SOS_tok, EOS_tok, "greedy")
        return words, lens

    def sample_latent(self, context, context_lens, utt_lens, floors, repeat,
                      SOS_tok, EOS_tok):
        """Draw `repeat` prior latents for the given context.

        Returns the generated prior code and the raw epsilon it came from.
        """
        self.context_encoder.eval()
        ctx = self.context_encoder(context, context_lens, utt_lens, floors)
        ctx_rep = ctx.expand(repeat, -1)
        eps, _, _ = self.sample_code_prior(ctx_rep)
        z_prior, _, _ = self.prior_generator((eps, 0, ctx_rep))
        return z_prior, eps

    def sample_latent_post(self, context, context_lens, utt_lens, floors,
                           response, res_lens, repeat):
        """Draw `repeat` posterior latents for a (context, response) pair."""
        self.context_encoder.eval()
        ctx = self.context_encoder(context, context_lens, utt_lens, floors)
        # Encode the response without its leading SOS token.
        resp_enc, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        # Tile both encodings to match the number of requested samples.
        ctx_rep = ctx.expand(repeat, -1)
        resp_rep = resp_enc.expand(repeat, -1)
        z_post, z, mu_post, log_s_post, det_f, det_g = self.sample_post(
            resp_rep, ctx_rep)
        return z_post, z

    def adjust_lr(self):
        """Advance the AE learning-rate scheduler by one epoch step."""
        self.lr_scheduler_AE.step()
Example #27
0
class RNN(object):
    """Minimal seq2seq trainer pairing an Encoder and a Decoder RNN.

    Token id 0 is SOS and token id 1 is EOS. Training uses teacher forcing;
    `eval` decodes greedily until EOS is produced.
    """

    def __init__(self, input_size, output_size):
        super(RNN, self).__init__()

        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)

        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())

        # 1x1 LongTensors holding the SOS (0) and EOS (1) token ids.
        sos, eos = torch.LongTensor(1, 1).zero_(), torch.LongTensor(1, 1).zero_()
        sos[0, 0], eos[0, 0] = 0, 1

        self.sos, self.eos = sos, eos

    def train(self, input, target):
        """Run one teacher-forced training step.

        `input` and `target` are sequences of 1x1 LongTensors. Returns the
        mean per-step loss value and the greedy output token of each step.
        """
        # Fix: frame a *copy* with SOS/EOS instead of mutating the caller's
        # list in place -- the original insert/append accumulated extra
        # SOS/EOS tokens every time the same list was passed again.
        target = [self.sos] + list(target) + [self.eos]

        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encoder: fold the whole input sequence into a hidden state.
        hidden_state = self.encoder.first_hidden()
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        # Decoder: teacher forcing -- feed target[i], score target[i+1].
        total_loss, outputs = 0, []
        for i in range(len(target) - 1):
            _, softmax, hidden_state = self.decoder.forward(Variable(target[i]), hidden_state)

            outputs.append(np.argmax(softmax.data.numpy(), 1)[:, np.newaxis])
            total_loss += self.loss(softmax, Variable(target[i+1][0]))

        total_loss /= len(outputs)
        total_loss.backward()

        self.decoder_optimizer.step()
        self.encoder_optimizer.step()

        # .item() matches the PyTorch >= 0.4 API used elsewhere in this file
        # (the old `.data[0]` indexing fails on 0-dim tensors there).
        return total_loss.item(), outputs

    def eval(self, input):
        """Greedy-decode a token sequence for `input` until EOS (id 1)."""
        hidden_state = self.encoder.first_hidden()

        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        sentence = []
        input = self.sos
        # Decoder: feed the argmax token back in until EOS appears.
        # NOTE(review): there is no max-length guard, so a model that never
        # emits EOS loops forever -- confirm callers accept that.
        while input.data[0, 0] != 1:
            output, _, hidden_state = self.decoder.forward(input, hidden_state)
            word = np.argmax(output.data.numpy()).reshape((1, 1))
            input = Variable(torch.LongTensor(word))
            sentence.append(word)

        return sentence

    def save(self):
        """Persist encoder/decoder weights under the models/ directory."""
        torch.save(self.encoder.state_dict(), "models/encoder.ckpt")
        torch.save(self.decoder.state_dict(), "models/decoder.ckpt")
Example #28
0
class DeepAPI(nn.Module):
    """Seq2seq model translating a natural-language description into an
    API-call sequence, trained with a masked cross-entropy loss."""

    def __init__(self, config, vocab_size):
        super(DeepAPI, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.temp = config['temp']

        emb_size = config['emb_size']
        # Separate embedding tables for description and API tokens.
        self.desc_embedder = nn.Embedding(vocab_size, emb_size,
                                          padding_idx=PAD_ID)
        self.api_embedder = nn.Embedding(vocab_size, emb_size,
                                         padding_idx=PAD_ID)
        # Description encoder (bidirectional, so its state is 2*n_hidden).
        self.encoder = Encoder(self.desc_embedder, emb_size,
                               config['n_hidden'], True, config['n_layers'],
                               config['noise_radius'])
        # API-sequence decoder: P(x|c,z).
        self.decoder = Decoder(self.api_embedder, emb_size,
                               config['n_hidden'] * 2, vocab_size,
                               config['use_attention'], 1, config['dropout'])
        trainable = list(self.encoder.parameters()) + list(
            self.decoder.parameters())
        self.optimizer = optim.Adadelta(trainable, lr=config['lr_ae'],
                                        rho=0.95)
        self.criterion_ce = nn.CrossEntropyLoss()

    def forward(self, descs, desc_lens, apiseqs, api_lens):
        """Teacher-forced masked cross-entropy loss for one batch."""
        c, hids = self.encoder(descs, desc_lens)
        logits, _ = self.decoder(c, hids, None, apiseqs[:, :-1],
                                 (api_lens - 1))
        logits = logits.view(-1, self.vocab_size)  # [batch*seq_len x n_tokens]

        # Targets are the API tokens shifted left by one (SOS dropped).
        targets = apiseqs[:, 1:].contiguous().view(-1)
        keep = targets.gt(0)  # drop padding positions (index 0)
        kept_targets = targets.masked_select(keep)
        keep_rows = keep.unsqueeze(1).expand(keep.size(0), self.vocab_size)
        kept_logits = logits.masked_select(keep_rows).view(-1, self.vocab_size)
        # Temperature-scaled logits feed the cross-entropy criterion.
        return self.criterion_ce(kept_logits / self.temp, kept_targets)

    def train_AE(self, descs, desc_lens, apiseqs, api_lens):
        """One optimizer step on the reconstruction loss."""
        self.encoder.train()
        self.decoder.train()

        loss = self.forward(descs, desc_lens, apiseqs, api_lens)

        self.optimizer.zero_grad()
        loss.backward()
        # Clip gradients to guard against RNN gradient explosion.
        torch.nn.utils.clip_grad_norm_(
            list(self.encoder.parameters()) + list(self.decoder.parameters()),
            self.clip)
        self.optimizer.step()
        return {'train_loss': loss.item()}

    def valid(self, descs, desc_lens, apiseqs, api_lens):
        """Evaluate the reconstruction loss without updating weights."""
        self.encoder.eval()
        self.decoder.eval()
        loss = self.forward(descs, desc_lens, apiseqs, api_lens)
        return {'valid_loss': loss.item()}

    def sample(self, descs, desc_lens, n_samples, mode='beamsearch'):
        """Decode `n_samples` API sequences per description.

        `mode` selects beam search (width 12) or the decoder's sampling
        strategies.
        """
        self.encoder.eval()
        self.decoder.eval()
        c, hids = self.encoder(descs, desc_lens)
        if mode != 'beamsearch':
            return self.decoder.sampling(c, hids, None, n_samples,
                                         self.maxlen, mode)
        # beam_decode yields [batch_size x n_samples x seq_len]; keep only
        # the first batch element.
        words, lens, _ = self.decoder.beam_decode(c, hids, None, 12,
                                                  self.maxlen, n_samples)
        return words[0], lens[0]

    def adjust_lr(self):
        """No-op: this model keeps a fixed learning rate."""
        return None
Example #29
0
    def __init__(self, config, vocab_size, PAD_token=0):
        """Build encoders, latent networks, decoder, discriminator and
        their optimizers from the `config` dict."""
        super(DialogWAE, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.lambda_gp = config['lambda_gp']
        self.temp = config['temp']

        emb_size = config['emb_size']
        n_hidden = config['n_hidden']
        z_size = config['z_size']

        self.embedder = nn.Embedding(vocab_size, emb_size,
                                     padding_idx=PAD_token)
        # Utterance-level encoder (bidirectional) feeding the context RNN.
        self.utt_encoder = Encoder(self.embedder, emb_size, n_hidden, True,
                                   config['n_layers'],
                                   config['noise_radius'])
        # The +2 matches the extra floor features appended per utterance.
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              n_hidden * 2 + 2, n_hidden, 1,
                                              config['noise_radius'])
        self.prior_net = Variation(n_hidden, z_size)  # p(e|c)
        self.post_net = Variation(n_hidden * 3, z_size)  # q(e|c,x)

        def build_generator():
            # Three Linear layers with BatchNorm+ReLU between them; the
            # posterior and prior generators share this architecture.
            return nn.Sequential(
                nn.Linear(z_size, z_size),
                nn.BatchNorm1d(z_size, eps=1e-05, momentum=0.1),
                nn.ReLU(),
                nn.Linear(z_size, z_size),
                nn.BatchNorm1d(z_size, eps=1e-05, momentum=0.1),
                nn.ReLU(),
                nn.Linear(z_size, z_size))

        self.post_generator = build_generator()
        self.post_generator.apply(self.init_weights)

        self.prior_generator = build_generator()
        self.prior_generator.apply(self.init_weights)

        self.decoder = Decoder(self.embedder, emb_size, n_hidden + z_size,
                               vocab_size, n_layers=1)

        # WGAN critic scoring concatenated (z, c) pairs.
        self.discriminator = nn.Sequential(
            nn.Linear(n_hidden + z_size, n_hidden * 2),
            nn.BatchNorm1d(n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(n_hidden * 2, n_hidden * 2),
            nn.BatchNorm1d(n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(n_hidden * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        # Auto-encoder path: context encoder + posterior nets + decoder.
        self.optimizer_AE = optim.SGD(list(self.context_encoder.parameters()) +
                                      list(self.post_net.parameters()) +
                                      list(self.post_generator.parameters()) +
                                      list(self.decoder.parameters()),
                                      lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_gan_g'])
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=config['lr_gan_d'])

        # Decay the AE learning rate by a factor of 0.6 every 10 steps.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)

        self.criterion_ce = nn.CrossEntropyLoss()
Example #30
0
class DialogWAE(nn.Module):
    """Conditional Wasserstein auto-encoder for dialogue response generation.

    A context encoder summarizes the dialogue history into `c`. A posterior
    network q(e|c,x) and a prior network p(e|c) each emit a noise code `e`
    that a generator MLP maps to a latent `z`; a critic on (z, c) pairs is
    trained with the WGAN gradient penalty to match the two latent
    distributions, while the decoder reconstructs the response from (z, c).
    """
    def __init__(self, config, vocab_size, PAD_token=0):
        super(DialogWAE, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']  # gradient-clipping threshold
        self.lambda_gp = config['lambda_gp']  # gradient-penalty weight
        self.temp = config['temp']  # softmax temperature for the CE loss

        self.embedder = nn.Embedding(vocab_size,
                                     config['emb_size'],
                                     padding_idx=PAD_token)
        # Utterance-level encoder (bidirectional) reused by the context RNN.
        self.utt_encoder = Encoder(self.embedder, config['emb_size'],
                                   config['n_hidden'], True,
                                   config['n_layers'], config['noise_radius'])
        # The +2 matches the extra floor features appended per utterance.
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              config['n_hidden'] * 2 + 2,
                                              config['n_hidden'], 1,
                                              config['noise_radius'])
        self.prior_net = Variation(config['n_hidden'],
                                   config['z_size'])  # p(e|c)
        self.post_net = Variation(config['n_hidden'] * 3,
                                  config['z_size'])  # q(e|c,x)

        # MLP mapping the posterior noise code e to the latent z.
        self.post_generator = nn.Sequential(
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05,
                           momentum=0.1), nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(), nn.Linear(config['z_size'], config['z_size']))
        self.post_generator.apply(self.init_weights)

        # MLP mapping the prior noise code e to the latent z (same shape).
        self.prior_generator = nn.Sequential(
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05,
                           momentum=0.1), nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(), nn.Linear(config['z_size'], config['z_size']))
        self.prior_generator.apply(self.init_weights)

        self.decoder = Decoder(self.embedder,
                               config['emb_size'],
                               config['n_hidden'] + config['z_size'],
                               vocab_size,
                               n_layers=1)

        # WGAN critic scoring concatenated (z, c) pairs.
        self.discriminator = nn.Sequential(
            nn.Linear(config['n_hidden'] + config['z_size'],
                      config['n_hidden'] * 2),
            nn.BatchNorm1d(config['n_hidden'] * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config['n_hidden'] * 2, config['n_hidden'] * 2),
            nn.BatchNorm1d(config['n_hidden'] * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config['n_hidden'] * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        # Auto-encoder path: context encoder + posterior nets + decoder.
        self.optimizer_AE = optim.SGD(list(self.context_encoder.parameters()) +
                                      list(self.post_net.parameters()) +
                                      list(self.post_generator.parameters()) +
                                      list(self.decoder.parameters()),
                                      lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_gan_g'])
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=config['lr_gan_d'])

        # Decay the AE learning rate by a factor of 0.6 every 10 steps.
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)

        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        """Uniformly initialize Linear weights in [-0.02, 0.02], biases 0."""
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-0.02, 0.02)
            m.bias.data.fill_(0)

    def sample_code_post(self, x, c):
        """Sample a posterior latent z from the response/context encodings."""
        e, _, _ = self.post_net(torch.cat((x, c), 1))
        z = self.post_generator(e)
        return z

    def sample_code_prior(self, c):
        """Sample a prior latent z from the context encoding alone."""
        e, _, _ = self.prior_net(c)
        z = self.prior_generator(e)
        return z

    def train_AE(self, context, context_lens, utt_lens, floors, response,
                 res_lens):
        """One auto-encoder step: reconstruct the response from (z_post, c).

        Returns a list of (metric_name, value) tuples for logging.
        """
        self.context_encoder.train()
        self.decoder.train()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        # Encode the response without its leading SOS token.
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z = self.sample_code_post(x, c)
        output = self.decoder(torch.cat((z, c), 1), None, response[:, :-1],
                              (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)

        # Cross-entropy over non-padding target positions only.
        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)  #
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz*seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)

        self.optimizer_AE.zero_grad()
        loss = self.criterion_ce(masked_output / self.temp, masked_target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(
            list(self.context_encoder.parameters()) +
            list(self.decoder.parameters()), self.clip)
        self.optimizer_AE.step()

        return [('train_loss_AE', loss.item())]

    def train_G(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        """One generator step of the WGAN game.

        Pushes the critic score of posterior samples up and of prior
        samples down (via the sign of the gradient passed to backward),
        with the critic itself frozen.
        """
        self.context_encoder.eval()
        self.optimizer_G.zero_grad()

        # Freeze the critic so only the generator networks get gradients.
        for p in self.discriminator.parameters():
            p.requires_grad = False

        c = self.context_encoder(context, context_lens, utt_lens, floors)
        # -----------------posterior samples ---------------------------
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z_post = self.sample_code_post(x.detach(), c.detach())
        errG_post = torch.mean(
            self.discriminator(torch.cat((z_post, c.detach()), 1)))
        # `minus_one`/`one` are presumably module-level +/-1 tensors used to
        # flip the gradient sign (maximize vs. minimize) -- defined above
        # this excerpt; verify at the module top.
        errG_post.backward(minus_one)

        # ----------------- prior samples ---------------------------
        prior_z = self.sample_code_prior(c.detach())
        errG_prior = torch.mean(
            self.discriminator(torch.cat((prior_z, c.detach()), 1)))
        errG_prior.backward(one)

        self.optimizer_G.step()

        # Unfreeze the critic for the subsequent train_D step.
        for p in self.discriminator.parameters():
            p.requires_grad = True

        costG = errG_prior - errG_post
        return [('train_loss_G', costG.item())]

    def train_D(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        """One critic step: WGAN loss with gradient penalty (WGAN-GP).

        Latent samples are detached so only the critic is updated.
        """
        self.context_encoder.eval()
        self.discriminator.train()

        self.optimizer_D.zero_grad()

        batch_size = context.size(0)

        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z = self.sample_code_post(x, c)
        errD_post = torch.mean(
            self.discriminator(torch.cat((post_z.detach(), c.detach()), 1)))
        errD_post.backward(one)

        prior_z = self.sample_code_prior(c)
        errD_prior = torch.mean(
            self.discriminator(torch.cat((prior_z.detach(), c.detach()), 1)))
        errD_prior.backward(minus_one)

        # Gradient penalty on random interpolations between prior and
        # posterior samples. `gData` presumably moves the tensor to the
        # active device -- defined above this excerpt; verify.
        alpha = gData(torch.rand(batch_size, 1))
        alpha = alpha.expand(prior_z.size())
        interpolates = alpha * prior_z.data + ((1 - alpha) * post_z.data)
        interpolates = Variable(interpolates, requires_grad=True)
        d_input = torch.cat((interpolates, c.detach()), 1)
        disc_interpolates = torch.mean(self.discriminator(d_input))
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=gData(torch.ones(disc_interpolates.size())),
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        # Penalize deviation of the critic's gradient norm from 1.
        gradient_penalty = (
            (gradients.contiguous().view(gradients.size(0), -1).norm(2, dim=1)
             - 1)**2).mean() * self.lambda_gp
        gradient_penalty.backward()

        self.optimizer_D.step()
        costD = -(errD_prior - errD_post) + gradient_penalty
        return [('train_loss_D', costD.item())]

    def valid(self, context, context_lens, utt_lens, floors, response,
              res_lens):
        """Validation pass: critic gap and masked reconstruction loss."""
        self.context_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()

        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z = self.sample_code_post(x, c)
        prior_z = self.sample_code_prior(c)
        errD_post = torch.mean(self.discriminator(torch.cat((post_z, c), 1)))
        errD_prior = torch.mean(self.discriminator(torch.cat((prior_z, c), 1)))
        costD = -(errD_prior - errD_post)
        costG = -costD

        # Reconstruction loss over non-padding target positions only.
        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)
        output = self.decoder(torch.cat((post_z, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item()), ('valid_loss_D', costD.item())]

    def sample(self, context, context_lens, utt_lens, floors, repeat, SOS_tok,
               EOS_tok):
        """Greedily decode `repeat` responses from prior latent samples."""
        self.context_encoder.eval()
        self.decoder.eval()

        c = self.context_encoder(context, context_lens, utt_lens,
                                 floors)  # encode context into embedding
        # Tile the context so each latent sample gets its own copy.
        c_repeated = c.expand(repeat, -1)
        prior_z = self.sample_code_prior(c_repeated)
        #         print(prior_z.shape)
        #         print(prior_z)
        sample_words, sample_lens = self.decoder.sampling(
            torch.cat((prior_z, c_repeated), 1), None, self.maxlen, SOS_tok,
            EOS_tok, "greedy")
        return sample_words, sample_lens

    def adjust_lr(self):
        """Advance the AE learning-rate scheduler by one epoch step."""
        self.lr_scheduler_AE.step()