def __init__(self, config):
    super(Transformer_EncoderDecoder, self).__init__()
    c = copy.deepcopy
    self.attn = MultiHeadedAttention(config['head'], config['emb_dim'])
    self.ff = PositionwiseFeedForward(config['emb_dim'], config['d_ff'],
                                      config['drop_out'])
    self.position = PositionalEncoding(config['emb_dim'], config['drop_out'])
    self.encoder = Encoder(
        EncoderLayer(config['emb_dim'], c(self.attn), c(self.ff),
                     config['drop_out']),
        config['N_layers'])
    self.decoder = Decoder(
        DecoderLayer(config['emb_dim'], c(self.attn), c(self.attn), c(self.ff),
                     config['drop_out']),
        config['N_layers'])
    self.src_embed = nn.Sequential(
        Embeddings(config['emb_dim'], config['vocab_size']), c(self.position))
    self.tgt_embed = nn.Sequential(
        Embeddings(config['emb_dim'], config['vocab_size']), c(self.position))
    self.generator = Generator(config['emb_dim'], config['vocab_size'])
    self.fc_out = nn.Linear(config['emb_dim'], config['vocab_size'])
    self.model = EncoderDecoder(self.encoder, self.decoder, self.src_embed,
                                self.tgt_embed, self.generator)
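# Why `c = copy.deepcopy` above: each encoder/decoder layer must own an
# independent copy of the attention and feed-forward modules; passing the same
# module object into every layer would tie their weights. A self-contained
# sketch of the difference (plain nn.Linear stands in for the attention/FF
# blocks; this is an illustration, not part of the class above):
import copy
import torch.nn as nn

proto = nn.Linear(4, 4)
tied = nn.ModuleList([proto, proto])                               # one shared weight set
untied = nn.ModuleList([copy.deepcopy(proto) for _ in range(2)])   # two independent sets
assert tied[0] is tied[1]
assert untied[0] is not untied[1]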
def da_rnn(train_data: TrainData, n_targs: int, encoder_hidden_size=64,
           decoder_hidden_size=64, T=10, learning_rate=0.01, batch_size=128):
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7),
                            batch_size, nn.MSELoss())
    logger.info(f"Training size: {train_cfg.train_size:d}.")

    enc_kwargs = {"input_size": train_data.feats.shape[1],
                  "hidden_size": encoder_hidden_size, "T": T}
    encoder = Encoder(**enc_kwargs).to(device)
    with open(os.path.join("data", "enc_kwargs.json"), "w") as fi:
        json.dump(enc_kwargs, fi, indent=4)

    dec_kwargs = {"encoder_hidden_size": encoder_hidden_size,
                  "decoder_hidden_size": decoder_hidden_size,
                  "T": T, "out_feats": n_targs}
    decoder = Decoder(**dec_kwargs).to(device)
    with open(os.path.join("data", "dec_kwargs.json"), "w") as fi:
        json.dump(dec_kwargs, fi, indent=4)

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate)
    da_rnn_net = DaRnnNet(encoder, decoder, encoder_optimizer,
                          decoder_optimizer)

    return train_cfg, da_rnn_net
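# Minimal usage sketch for da_rnn() above. The TrainData fields and the 2-D
# feats/targs layout are inferred from the shape accesses inside the function;
# the keyword constructor and the random arrays are assumptions, not real data.
import numpy as np

feats = np.random.randn(1000, 10).astype(np.float32)   # 1000 steps, 10 driving series
targs = np.random.randn(1000, 1).astype(np.float32)    # 1 target series
train_data = TrainData(feats=feats, targs=targs)       # assumed constructor
train_cfg, net = da_rnn(train_data, n_targs=1, T=10, batch_size=128)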
class RNN(object):
    def __init__(self, input_size, output_size):
        super(RNN, self).__init__()
        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)
        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())
        sos, eos = torch.LongTensor(1, 1).zero_(), torch.LongTensor(1, 1).zero_()
        sos[0, 0], eos[0, 0] = 0, 1
        self.sos, self.eos = sos, eos

    def train(self, input, target):
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encoder: fold the input sequence into the hidden state
        hidden_state = self.encoder.first_hidden()
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        # Decoder: teacher-forced, with <sos>/<eos> framing the target
        target.insert(0, self.sos)
        target.append(self.eos)
        total_loss = 0
        for i in range(len(target) - 1):
            _, softmax, hidden_state = self.decoder.forward(
                target[i], hidden_state)
            total_loss += self.loss(softmax, Variable(target[i + 1][0]))

        total_loss.backward()
        self.decoder_optimizer.step()
        self.encoder_optimizer.step()
        return total_loss

    def eval(self, input):
        hidden_state = self.encoder.first_hidden()
        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(ivec, hidden_state)

        outputs = []
        output = self.sos
        # Decoder: the original `while output is not self.eos` compared object
        # identity (never true for freshly created tensors) and fed raw logits
        # back as input; feed the argmax token instead and stop on <eos>.
        while not torch.equal(output, self.eos):
            logits, _, hidden_state = self.decoder.forward(output, hidden_state)
            word = np.argmax(logits.data.numpy()).reshape((1, 1))
            output = torch.LongTensor(word)
            outputs.append(output)
        return outputs
def da_rnn(train_data, n_targs: int, encoder_hidden_size=64,
           decoder_hidden_size=64, T=10, learning_rate=0.01, batch_size=128):
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7),
                            batch_size, nn.MSELoss())
    logging.info(f"Training size: {train_cfg.train_size:d}.")

    enc_params = pd.DataFrame([{
        'input_size': train_data.feats.shape[1],
        'hidden_size': encoder_hidden_size,
        'T': T
    }])
    enc_params.to_csv(os.path.join('results', save_name, 'enc_params.csv'))
    encoder = Encoder(input_size=enc_params['input_size'][0].item(),
                      hidden_size=enc_params['hidden_size'][0].item(),
                      T=enc_params['T'][0].item()).cuda()

    dec_params = pd.DataFrame([{
        'encoder_hidden_size': encoder_hidden_size,
        'decoder_hidden_size': decoder_hidden_size,
        'T': T,
        'out_feats': n_targs
    }])
    dec_params.to_csv(os.path.join('results', save_name, 'dec_params.csv'))
    decoder = Decoder(
        encoder_hidden_size=dec_params['encoder_hidden_size'][0].item(),
        decoder_hidden_size=dec_params['decoder_hidden_size'][0].item(),
        T=dec_params['T'][0].item(),
        out_feats=dec_params['out_feats'][0].item()).cuda()

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate, weight_decay=args.wdecay)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate, weight_decay=args.wdecay)
    encoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        encoder_optimizer, train_data.feats.shape[0], eta_min=args.min_lr)
    decoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        decoder_optimizer, train_data.feats.shape[0], eta_min=args.min_lr)

    model = DaRnnNet(encoder, decoder, encoder_optimizer, decoder_optimizer,
                     encoder_scheduler, decoder_scheduler)
    return train_cfg, model
def __init__(self, input_size, output_size, resume=False):
    super(RNN, self).__init__()
    self.encoder = Encoder(input_size)
    self.decoder = Decoder(output_size)
    self.loss = nn.CrossEntropyLoss()
    self.encoder_optimizer = optim.Adam(self.encoder.parameters())
    self.decoder_optimizer = optim.Adam(self.decoder.parameters())
    if resume:
        self.encoder.load_state_dict(torch.load("models/encoder.ckpt"))
        self.decoder.load_state_dict(torch.load("models/decoder.ckpt"))
def da_rnn(train_data, n_targs: int, encoder_hidden_size=64,
           decoder_hidden_size=64, T=10, learning_rate=0.01, batch_size=128):
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7),
                            batch_size, nn.MSELoss())
    logging.info(f"Training size: {train_cfg.train_size:d}.")

    enc_kwargs = {
        "input_size": train_data.feats.shape[1],
        "hidden_size": encoder_hidden_size,
        "T": T
    }
    encoder = Encoder(**enc_kwargs).cuda()
    with open(os.path.join("data", "enc_kwargs.json"), "w") as fi:
        json.dump(enc_kwargs, fi, indent=4)

    dec_kwargs = {
        "encoder_hidden_size": encoder_hidden_size,
        "decoder_hidden_size": decoder_hidden_size,
        "T": T,
        "out_feats": n_targs
    }
    decoder = Decoder(**dec_kwargs).cuda()
    with open(os.path.join("data", "dec_kwargs.json"), "w") as fi:
        json.dump(dec_kwargs, fi, indent=4)

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate, weight_decay=args.wdecay)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate, weight_decay=args.wdecay)
    encoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        encoder_optimizer, args.epochs, eta_min=args.min_lr)
    decoder_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        decoder_optimizer, args.epochs, eta_min=args.min_lr)

    da_rnn_net = DaRnnNet(encoder, decoder, encoder_optimizer,
                          decoder_optimizer, encoder_scheduler,
                          decoder_scheduler)
    return train_cfg, da_rnn_net
def __init__(self, X_dim, Y_dim, encoder_hidden_size=64,
             decoder_hidden_size=64, linear_dropout=0, T=10,
             learning_rate=1e-5, batch_size=128, decay_rate=0.95):
    self.T = T
    self.decay_rate = decay_rate
    self.batch_size = batch_size
    self.X_dim = X_dim
    self.Y_dim = Y_dim

    self.encoder = Encoder(X_dim, encoder_hidden_size, T,
                           linear_dropout).to(device)
    self.decoder = Decoder(encoder_hidden_size, decoder_hidden_size, T,
                           linear_dropout, Y_dim).to(device)

    self.encoder_optim = torch.optim.Adam(params=self.encoder.parameters(),
                                          lr=learning_rate)
    self.decoder_optim = torch.optim.Adam(params=self.decoder.parameters(),
                                          lr=learning_rate)
    self.loss_func = torch.nn.MSELoss()
def __init__(self, obs, nums, glimpse_size=(20, 20),
             inpt_encoder_hidden=[256] * 2, glimpse_encoder_hidden=[256] * 2,
             glimpse_decoder_hidden=[252] * 2,
             transform_estimator_hidden=[256] * 2,
             steps_pred_hidden=[50] * 1, baseline_hidden=[256, 128] * 1,
             transform_var_bias=-2., step_bias=0., *args, **kwargs):
    self.baseline = BaselineMLP(baseline_hidden)

    def _make_transform_estimator(x):
        est = StochasticTransformParam(transform_estimator_hidden, x,
                                       scale_bias=transform_var_bias)
        return est

    super(AIRonMNIST, self).__init__(
        *args,
        obs=obs,
        nums=nums,
        glimpse_size=glimpse_size,
        n_appearance=50,
        transition=snt.LSTM(256),
        input_encoder=(lambda: Encoder(inpt_encoder_hidden)),
        glimpse_encoder=(lambda: Encoder(glimpse_encoder_hidden)),
        glimpse_decoder=(lambda x: Decoder(glimpse_decoder_hidden, x)),
        transform_estimator=_make_transform_estimator,
        steps_predictor=(lambda: StepsPredictor(steps_pred_hidden, step_bias)),
        output_std=.3,
        **kwargs)
def __init__(self, args):
    super(SSM, self).__init__()
    self.s_dim = s_dim = args.s_dim
    self.a_dim = a_dim = args.a_dim
    self.o_dim = o_dim = args.o_dim
    self.h_dim = h_dim = args.h_dim
    self.device = args.device
    self.args = args

    self.encoder = torch.nn.DataParallel(
        Encoder(o_dim, h_dim).to(self.device[0]), self.device)
    self.decoder = torch.nn.DataParallel(
        Decoder(s_dim, o_dim).to(self.device[0]), self.device)
    self.prior = torch.nn.DataParallel(
        Prior(s_dim, a_dim).to(self.device[0]), self.device)
    self.posterior = torch.nn.DataParallel(
        Posterior(self.prior, s_dim, a_dim, h_dim).to(self.device[0]),
        self.device)

    self.distributions = nn.ModuleList(
        [self.prior, self.posterior, self.encoder, self.decoder])
    init_weights(self.distributions)

    # standard normal prior, used for the s_aux_loss
    self.prior01 = Normal(torch.tensor(0.), scale=torch.tensor(1.))
    self.g_optimizer = optim.Adam(self.distributions.parameters())
def darnn(train_data: TrainingData, n_targets: int, encoder_hidden_size: int,
          decoder_hidden_size: int, T: int, learning_rate=0.002,
          batch_size=32):
    train_cfg = TrainingConfig(T, int(train_data.features.shape[0] * 0.7),
                               batch_size, nn.MSELoss())
    print(f"Training size: {train_cfg.train_size:d}.")

    enc_kwargs = {"input_size": train_data.features.shape[1],
                  "hidden_size": encoder_hidden_size, "T": T}
    encoder = Encoder(**enc_kwargs).to(device)

    dec_kwargs = {"encoder_hidden_size": encoder_hidden_size,
                  "decoder_hidden_size": decoder_hidden_size,
                  "T": T, "out_features": n_targets}
    decoder = Decoder(**dec_kwargs).to(device)

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate)

    da_rnn_net = Darnn_Net(encoder, decoder, encoder_optimizer,
                           decoder_optimizer)
    return train_cfg, da_rnn_net
def __init__(self, vocab, feats_size, kernel_size, rec_field, attn_size,
             hidden_size, mid_layer, dropout, which):
    super(TextNormalizer, self).__init__()
    self.vocab = vocab
    self.encoder = Encoder(len(vocab), feats_size, kernel_size, rec_field,
                           dropout, which)
    self.decoder = Decoder(len(vocab), feats_size, attn_size, hidden_size,
                           mid_layer, dropout)
    self.init_hidden = InitialWeights(hidden_size, mid_layer, 4)
def TCHA(train_data: TrainData, n_targs: int, bidirec=False, num_layer=1,
         encoder_hidden_size=64, decoder_hidden_size=64, T=10,
         learning_rate=0.01, batch_size=128, interval=1, split=0.7,
         isMean=False):
    train_cfg = TrainConfig(T, int(train_data.feats.shape[0] * split),
                            batch_size, nn.MSELoss(), interval, T, isMean)
    logger.info(f"Training size: {train_cfg.train_size:d}.")

    enc_args = {"input_size": train_data.feats.shape[1],
                "hidden_size": encoder_hidden_size, "T": T,
                "bidirec": bidirec, "num_layer": num_layer}
    encoder = Encoder(**enc_args).to(device)

    dec_args = {"encoder_hidden_size": encoder_hidden_size,
                "decoder_hidden_size": decoder_hidden_size, "T": T,
                "out_feats": n_targs, "bidirec": bidirec,
                "num_layer": num_layer}
    decoder = Decoder(**dec_args).to(device)

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=learning_rate)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=learning_rate)

    tcha = TCHA_Net(encoder, decoder, encoder_optimizer, decoder_optimizer)
    return train_cfg, tcha
def set_params(train_data, device, predict_size=1, **da_rnn_kwargs):
    # The original referenced `self.predict_size` although this is a plain
    # function; it is taken here as an explicit `predict_size` argument.
    train_configs = TrainConfig(da_rnn_kwargs["time_step"],
                                int(train_data.shape[0] * 0.95),
                                da_rnn_kwargs["batch_size"], nn.MSELoss())

    enc_kwargs = {
        "input_size": train_data.shape[1],
        "hidden_size": da_rnn_kwargs["en_hidden_size"],
        "time_step": int(da_rnn_kwargs["time_step"] / predict_size)
    }
    dec_kwargs = {
        "encoder_hidden_size": da_rnn_kwargs["en_hidden_size"],
        "decoder_hidden_size": da_rnn_kwargs["de_hidden_size"],
        "time_step": int(da_rnn_kwargs["time_step"] / predict_size),
        "out_feats": da_rnn_kwargs["target_cols"]
    }
    encoder = Encoder(**enc_kwargs).to(device)
    decoder = Decoder(**dec_kwargs).to(device)

    encoder_optimizer = optim.Adam(
        params=[p for p in encoder.parameters() if p.requires_grad],
        lr=da_rnn_kwargs["learning_rate"], betas=(0.9, 0.999), eps=1e-08)
    decoder_optimizer = optim.Adam(
        params=[p for p in decoder.parameters() if p.requires_grad],
        lr=da_rnn_kwargs["learning_rate"], betas=(0.9, 0.999), eps=1e-08)

    da_rnn_net = DaRnnNet(encoder, decoder, encoder_optimizer,
                          decoder_optimizer)
    return train_configs, da_rnn_net
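# All the DA-RNN builders above share one training-step shape: both optimizers
# are stepped together on a single MSE loss. A minimal sketch, assuming the
# encoder returns (attention-weighted inputs, encoded inputs) and the decoder
# maps the encoding plus the target history `y_hist` to a prediction, and that
# the net exposes `enc_opt`/`dec_opt`; exact signatures vary between the repos.
def train_iteration(net, loss_func, X, y_hist, y_target):
    net.enc_opt.zero_grad()
    net.dec_opt.zero_grad()
    input_weighted, input_encoded = net.encoder(X)   # assumed Encoder API
    y_pred = net.decoder(input_encoded, y_hist)      # assumed Decoder API
    loss = loss_func(y_pred, y_target)
    loss.backward()
    net.enc_opt.step()
    net.dec_opt.step()
    return loss.item()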
def __init__(self, temp, latent_num, latent_dim):
    super(Model, self).__init__()
    if type(temp) != torch.Tensor:
        temp = torch.tensor(temp)
    self.__temp = temp
    self.latent_num = latent_num
    self.latent_dim = latent_dim
    self.encoder = Encoder(latent_num=latent_num, latent_dim=latent_dim)
    self.decoder = Decoder(latent_num=latent_num, latent_dim=latent_dim)
    if 'ExpTDModel' in str(self.__class__):
        self.prior = ExpRelaxedCategorical(temp,
                                           probs=torch.ones(latent_dim).cuda())
    else:
        self.prior = dist.RelaxedOneHotCategorical(
            temp, probs=torch.ones(latent_dim).cuda())
    self.initialize()
    self.softmax = nn.Softmax(dim=-1)
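# A quick, self-contained illustration of the relaxed-categorical prior used
# above: samples lie on the probability simplex and sharpen toward one-hot as
# the temperature drops (CPU tensors here; the class builds them on CUDA).
import torch
from torch import distributions as dist

for temp in (5.0, 0.1):
    prior = dist.RelaxedOneHotCategorical(torch.tensor(temp),
                                          probs=torch.ones(8))
    z = prior.sample()
    print(temp, z.sum().item(), z.max().item())  # sums to 1; max -> 1 as temp -> 0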
def __init__(self):
    self.model = get_model().cuda()
    self.ctc_loss = CTCLoss(size_average=True)
    self.decoder = Decoder()
    # self.optimizer = optim.Adam(self.model.parameters(), lr=configs.lr,
    #                             weight_decay=configs.l2_weight_decay)
    self.optimizer = optim.ASGD(self.model.parameters(), lr=configs.lr,
                                weight_decay=configs.l2_weight_decay)
    self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer, 'min',
        patience=configs.lr_scheduler_patience,
        factor=configs.lr_scheduler_factor,
        verbose=True)
    self.epoch_idx = 0
    self.min_avg_dist = 1000.
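# Unlike the epoch-driven schedulers elsewhere in this listing,
# ReduceLROnPlateau must be fed the monitored metric on every step. A toy,
# self-contained illustration (the constant metric stands in for a validation
# distance that has plateaued):
import torch
from torch import optim
from torch.optim import lr_scheduler

param = torch.nn.Parameter(torch.zeros(1))
opt = optim.ASGD([param], lr=0.1)
sched = lr_scheduler.ReduceLROnPlateau(opt, 'min', patience=2, factor=0.5)
for epoch in range(10):
    val_dist = 1.0                 # stand-in for a real validation metric
    sched.step(val_dist)           # LR halves after `patience` bad epochs
print(opt.param_groups[0]['lr'])   # reduced several times by now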
def __init__(self, word2idx, emb_size, hidden_sizes, dropout, rnn_type="LSTM",
             pretrained_embs=None, fixed_embs=False, tied=None):
    super(RNNLanguageModel, self).__init__()
    self.encoder = Encoder(word2idx, emb_size, pretrained_embs, fixed_embs)
    self.decoder = Decoder(len(word2idx), hidden_sizes[-1], tied, self.encoder)
    self.rnn = StackedRNN(rnn_type, emb_size, hidden_sizes, dropout)
    self.drop = nn.Dropout(dropout)
def __init__(self, input_dim_encoder: int, hidden_dim_encoder: int,
             output_dim_encoder: int, dropout_p_encoder: float,
             output_dim_h_decoder: int, nb_classes: int,
             dropout_p_decoder: float, max_out_t_steps: int) -> None:
    """Baseline method for audio captioning with Clotho dataset.

    :param input_dim_encoder: Input dimensionality of the encoder.
    :type input_dim_encoder: int
    :param hidden_dim_encoder: Hidden dimensionality of the encoder.
    :type hidden_dim_encoder: int
    :param output_dim_encoder: Output dimensionality of the encoder.
    :type output_dim_encoder: int
    :param dropout_p_encoder: Encoder RNN dropout.
    :type dropout_p_encoder: float
    :param output_dim_h_decoder: Hidden output dimensionality of the decoder.
    :type output_dim_h_decoder: int
    :param nb_classes: Amount of output classes.
    :type nb_classes: int
    :param dropout_p_decoder: Decoder RNN dropout.
    :type dropout_p_decoder: float
    :param max_out_t_steps: Maximum output time-steps of the decoder.
    :type max_out_t_steps: int
    """
    super().__init__()
    self.max_out_t_steps: int = max_out_t_steps
    self.encoder: Module = Encoder(input_dim=input_dim_encoder,
                                   hidden_dim=hidden_dim_encoder,
                                   output_dim=output_dim_encoder,
                                   dropout_p=dropout_p_encoder)
    self.decoder: Module = Decoder(input_dim=output_dim_encoder * 2,
                                   output_dim=output_dim_h_decoder,
                                   nb_classes=nb_classes,
                                   dropout_p=dropout_p_decoder)
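# Constructor usage sketch for the captioning baseline above. `BaselineModel`
# is a hypothetical class name (only the __init__ is shown here) and all the
# numbers are illustrative (e.g. 64 log-mel bands as encoder input), not values
# prescribed by the class. Note the decoder's input_dim is derived internally
# as output_dim_encoder * 2, which suggests a bidirectional encoder whose two
# directions are concatenated.
model = BaselineModel(input_dim_encoder=64, hidden_dim_encoder=256,
                      output_dim_encoder=256, dropout_p_encoder=0.25,
                      output_dim_h_decoder=256, nb_classes=4000,
                      dropout_p_decoder=0.25, max_out_t_steps=22)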
def __init__(self, word2idx, emb_size, hidden_sizes, dropout, rnn_type="LSTM",
             pretrained_embs=None, fixed_embs=False, tied=None):
    super(BidirectionalLanguageModel, self).__init__()
    self.drop = nn.Dropout(dropout)
    self.encoder = Encoder(word2idx, emb_size, pretrained_embs, fixed_embs)
    self.decoder = Decoder(len(word2idx), hidden_sizes[-1], tied, self.encoder)
    self.forward_lstm = StackedRNN(rnn_type, emb_size, hidden_sizes, dropout)
    self.backward_lstm = StackedRNN(rnn_type, emb_size, hidden_sizes, dropout)
    self.rnn_type = rnn_type
    self.hidden_sizes = hidden_sizes
    self.nlayers = len(hidden_sizes)
def __init__(self, embed_dim=300, hidden_dim=256, inner_dim=2048, n_head=2,
             N_en=6, N_de=6, dropout=0.1, vocab_size=5000, sos_idx=2,
             eos_idx=3, pad_idx=0, unk_idx=1, max_src_len=100, max_tgt_len=20,
             args=False):
    super(Transformer, self).__init__()
    # === Test the GPU availability
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # --- Token indexes & properties
    self.sos, self.eos, self.pad, self.unk = sos_idx, eos_idx, pad_idx, unk_idx
    self.max_src_len = max_src_len
    self.max_tgt_len = max_tgt_len
    self.scale = embed_dim ** 0.5
    # === Base model (attn, enc, dec, ff)
    max_len = max(max_src_len, max_tgt_len)
    attn_enc_layer = ATTNLayer(embed_dim, n_head, hidden_dim, inner_dim,
                               dropout, max_len, False)
    attn_dec_layer = ATTNLayer(embed_dim, n_head, hidden_dim, inner_dim,
                               dropout, max_len, True)
    # === Main architecture (enc, dec)
    self.encoder = Encoder(attn_enc_layer, N_en, True)
    self.decoder = Decoder(attn_dec_layer, N_de, True)
    # === Embedding setting (src, tgt)
    self.embed = nn.Embedding(vocab_size, embed_dim)
    # === Final FC (logits to vocab)
    self.final = nn.Linear(embed_dim, vocab_size)
    # === Loss
    self.NLL = nn.NLLLoss(reduction='sum')
class PoemWAE(nn.Module):
    def __init__(self, config, api, PAD_token=0, pretrain_weight=None):
        super(PoemWAE, self).__init__()
        self.vocab = api.vocab
        self.vocab_size = len(self.vocab)
        self.rev_vocab = api.rev_vocab
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.maxlen = config.maxlen
        self.clip = config.clip
        self.lambda_gp = config.lambda_gp
        self.lr_gan_g = config.lr_gan_g
        self.lr_gan_d = config.lr_gan_d
        self.n_d_loss = config.n_d_loss
        self.temp = config.temp
        self.init_w = config.init_weight

        self.embedder = nn.Embedding(self.vocab_size, config.emb_size,
                                     padding_idx=PAD_token)
        if pretrain_weight is not None:
            self.embedder.weight.data.copy_(torch.from_numpy(pretrain_weight))

        # The same seq_encoder encodes the title and the neighbouring lines.
        self.seq_encoder = Encoder(self.embedder, config.emb_size,
                                   config.n_hidden, True, config.n_layers,
                                   config.noise_radius)

        # For poems, the context is the direct concat of the bi-GRU encodings
        # of the title and the last line, hence 4*hidden.
        # Note: Poemwar_gmp overrides prior_net in a subclass with a
        # Gaussian-mixture prior.
        self.prior_net = Variation(config.n_hidden * 4, config.z_size,
                                   dropout_rate=config.dropout,
                                   init_weight=self.init_w)  # p(e|c)

        # Originally sized for the Dialog task (3*hidden). On the poem data,
        # the title, the previous line, and x are each bi-GRU encoded and
        # concatenated, hence 6*hidden.
        self.post_net = Variation(config.n_hidden * 6, config.z_size,
                                  dropout_rate=config.dropout,
                                  init_weight=self.init_w)

        self.post_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.post_generator.apply(self.init_weights)

        self.prior_generator = nn.Sequential(
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config.z_size, config.z_size),
            nn.BatchNorm1d(config.z_size, eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config.z_size, config.z_size))
        self.prior_generator.apply(self.init_weights)

        self.init_decoder_hidden = nn.Sequential(
            nn.Linear(config.n_hidden * 4 + config.z_size, config.n_hidden * 4),
            nn.BatchNorm1d(config.n_hidden * 4, eps=1e-05, momentum=0.1),
            nn.ReLU())

        # Since the context is the concat of two bidirectional encodings, the
        # raw decoder input would be z_size + 4*hidden. Change: keep the
        # decoder hidden size at n_hidden and let init_decoder_hidden (an MLP)
        # map the concat down to it.
        self.decoder = Decoder(self.embedder, config.emb_size,
                               config.n_hidden * 4, self.vocab_size,
                               n_layers=1)

        self.discriminator = nn.Sequential(
            # two bidirectional encodings are concatenated, hence
            # 4*n_hidden + z_size
            nn.Linear(config.n_hidden * 4 + config.z_size, config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, config.n_hidden * 2),
            nn.BatchNorm1d(config.n_hidden * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config.n_hidden * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        # Optimizer definitions, one per training module.
        # Note: the three modules use different optimizers.
        self.optimizer_AE = optim.SGD(
            list(self.seq_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.init_decoder_hidden.parameters()) +
            list(self.decoder.parameters()),
            lr=config.lr_ae)
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=self.lr_gan_g)
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=self.lr_gan_d)
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.8)
        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-self.init_w, self.init_w)
            m.bias.data.fill_(0)

    # x: (batch, 2*n_hidden)
    # c: (batch, 2*2*n_hidden)
    def sample_code_post(self, x, c):
        z, _, _ = self.post_net(torch.cat((x, c), 1))  # input: (batch, 3*2*n_hidden)
        z = self.post_generator(z)
        return z

    def sample_code_prior_sentiment(self, c, align):
        choice_statistic = self.prior_net(c, align)  # e: (batch, z_size)
        return choice_statistic

    def sample_code_prior(self, c):
        z, _, _ = self.prior_net(c)  # e: (batch, z_size)
        z = self.prior_generator(z)  # z: (batch, z_size)
        return z

    # Inputs: title, context, target, target_lens.
    # c is the concat of the encoded hidden states of title and context.
    def train_AE(self, title, context, target, target_lens):
        self.seq_encoder.train()
        self.decoder.train()
        # (batch, 2 * hidden_size)
        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        # (batch, 2 * hidden_size)
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)
        # context embedding: (batch, 2 * hidden_size * 2)
        c = torch.cat((title_last_hidden, context_last_hidden), 1)
        z = self.sample_code_post(x, c)  # (batch, z_size)

        # Standard autoencoder decoding: the decoder starts from the concat of
        # z and c, and the target is fed in shifted by one (teacher forcing).
        # output: (batch, len, vocab_size); len is 9, i.e. 7 characters +
        # punctuation + </s>
        output = self.decoder(self.init_decoder_hidden(torch.cat((z, c), 1)),
                              None, target[:, :-1], target_lens - 1)
        flattened_output = output.view(-1, self.vocab_size)
        dec_target = target[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # mask out the PAD (0) tokens in the target
        masked_target = dec_target.masked_select(mask)  # non-pad entries only
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz * seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)

        self.optimizer_AE.zero_grad()
        loss = self.criterion_ce(masked_output / self.temp, masked_target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            list(self.seq_encoder.parameters()) +
            list(self.decoder.parameters()), self.clip)
        self.optimizer_AE.step()
        return [('train_loss_AE', loss.item())]

    # G shrinks the Wasserstein distance, analogous to minimizing the KL term
    # in a VAE.
    def train_G(self, title, context, target, target_lens,
                sentiment_mask=None, mask_type=None):
        self.seq_encoder.eval()
        self.optimizer_G.zero_grad()
        for p in self.discriminator.parameters():
            p.requires_grad = False

        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        # (batch, 2 * hidden_size * 2)
        c = torch.cat((title_last_hidden, context_last_hidden), 1)

        # ----------------- posterior samples ---------------------------
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)
        # detach to keep gradients out of the encoder; (batch, z_size)
        z_post = self.sample_code_post(x.detach(), c.detach())
        errG_post = torch.mean(
            self.discriminator(
                torch.cat((z_post, c.detach()), 1))) * self.n_d_loss
        errG_post.backward(minus_one)

        # ----------------- prior samples ---------------------------
        prior_z = self.sample_code_prior(c.detach())
        errG_prior = torch.mean(
            self.discriminator(
                torch.cat((prior_z, c.detach()), 1))) * self.n_d_loss
        errG_prior.backward(one)

        self.optimizer_G.step()
        for p in self.discriminator.parameters():
            p.requires_grad = True
        costG = errG_prior - errG_post
        return [('train_loss_G', costG.item())]

    # D fits the Wasserstein distance: a falling loss means a better fit, and
    # increasing the gradient penalty can improve the fit to a degree. The
    # larger n_iters_n, the more often D is trained and the better the fit.
    def train_D(self, title, context, target, target_lens):
        self.seq_encoder.eval()
        self.discriminator.train()
        self.optimizer_D.zero_grad()

        batch_size = context.size(0)
        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        # (batch, 2 * hidden_size * 2)
        c = torch.cat((title_last_hidden, context_last_hidden), 1)
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)

        post_z = self.sample_code_post(x, c)
        errD_post = torch.mean(
            self.discriminator(
                torch.cat((post_z.detach(), c.detach()), 1))) * self.n_d_loss
        errD_post.backward(one)

        prior_z = self.sample_code_prior(c)
        errD_prior = torch.mean(
            self.discriminator(
                torch.cat((prior_z.detach(), c.detach()), 1))) * self.n_d_loss
        errD_prior.backward(minus_one)

        # WGAN-GP: penalize the discriminator gradient on interpolates
        alpha = to_tensor(torch.rand(batch_size, 1))
        alpha = alpha.expand(prior_z.size())
        interpolates = alpha * prior_z.data + ((1 - alpha) * post_z.data)
        interpolates = Variable(interpolates, requires_grad=True)
        d_input = torch.cat((interpolates, c.detach()), 1)
        disc_interpolates = torch.mean(self.discriminator(d_input))
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=to_tensor(torch.ones(disc_interpolates.size())),
            create_graph=True, retain_graph=True, only_inputs=True)[0]
        gradient_penalty = ((gradients.contiguous().view(gradients.size(0), -1)
                             .norm(2, dim=1) - 1)**2).mean() * self.lambda_gp
        gradient_penalty.backward()

        self.optimizer_D.step()
        costD = -(errD_prior - errD_post) + gradient_penalty
        return [('train_loss_D', costD.item())]

    def valid(self, title, context, target, target_lens, sentiment_mask=None):
        self.seq_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()

        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        # (batch, 2 * hidden_size * 2)
        c = torch.cat((title_last_hidden, context_last_hidden), 1)
        x, _ = self.seq_encoder(target[:, 1:], target_lens - 1)

        post_z = self.sample_code_post(x, c)
        prior_z = self.sample_code_prior(c)
        errD_post = torch.mean(self.discriminator(torch.cat((post_z, c), 1)))
        errD_prior = torch.mean(self.discriminator(torch.cat((prior_z, c), 1)))
        costD = -(errD_prior - errD_post)
        costG = -costD

        dec_target = target[:, 1:].contiguous().view(-1)  # (batch_size * len)
        mask = dec_target.gt(0)  # mask out the PAD (0) tokens in the target
        masked_target = dec_target.masked_select(mask)  # non-pad entries only
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)
        output = self.decoder(
            self.init_decoder_hidden(torch.cat((post_z, c), 1)), None,
            target[:, :-1], (target_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item()),
                ('valid_loss_D', costD.item())]

    # As in the paper: at generation time, noise from the prior network is
    # mapped by G to prior_z (sample_code_prior(c) in the code). The decoder
    # then takes the concat of prior_z and c as input and decodes the line
    # (slightly different from the paper, which feeds only prior_z).
    # batch_size is 1: one line is generated per step.
    # title: the poem title; context: the previous line.
    def test(self, title_tensor, title_words, headers):
        self.seq_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()
        # tem is initialized to [2, 3, 0, 0, 0, 0, 0, 0, 0]
        tem = [[2, 3] + [0] * (self.maxlen - 2)]
        pred_poems = []
        title_tokens = [
            self.vocab[e] for e in title_words[0].tolist()
            if e not in [0, self.eos_id, self.go_id]
        ]
        pred_poems.append(title_tokens)
        for sent_id in range(4):
            tem = to_tensor(np.array(tem))
            context = tem
            title_last_hidden, _ = self.seq_encoder(title_tensor)  # (batch=1, 2*hidden)
            if sent_id == 0:
                context_last_hidden, _ = self.seq_encoder(title_tensor)  # (batch=1, 2*hidden)
            else:
                context_last_hidden, _ = self.seq_encoder(context)  # (batch=1, 2*hidden)
            # (batch, 4*hidden_size)
            c = torch.cat((title_last_hidden, context_last_hidden), 1)
            # Only one poem at a time (batch_size = 1), so no repeat is needed.
            prior_z = self.sample_code_prior(c)
            # decode_words is a complete poem line
            decode_words = self.decoder.testing(
                init_hidden=self.init_decoder_hidden(
                    torch.cat((prior_z, c), 1)),
                maxlen=self.maxlen, go_id=self.go_id, mode="greedy",
                header=headers[sent_id])
            decode_words = decode_words[0].tolist()
            if len(decode_words) > self.maxlen:
                tem = [decode_words[0:self.maxlen]]
            else:
                tem = [[0] * (self.maxlen - len(decode_words)) + decode_words]
            pred_tokens = [
                self.vocab[e] for e in decode_words[:-1]
                if e != self.eos_id and e != 0
            ]
            pred_poems.append(pred_tokens)
        gen = ''
        for line in pred_poems:
            true_str = " ".join(line)
            gen = gen + true_str + '\n'
        return gen

    def sample(self, title, context, repeat, go_id, end_id):
        self.seq_encoder.eval()
        self.decoder.eval()
        title_last_hidden, _ = self.seq_encoder(title)
        context_last_hidden, _ = self.seq_encoder(context)
        # (batch, 2 * hidden_size * 2)
        c = torch.cat((title_last_hidden, context_last_hidden), 1)
        # The input batch_size is 1; replicate it `repeat` times for the
        # downstream BLEU computation.
        c_repeated = c.expand(repeat, -1)
        # c_repeated: (batch_size=repeat, 4*hidden_size)
        prior_z = self.sample_code_prior(c_repeated)
        # sample_words: (batch, max_len, 1); sample_lens: (batch_size, 1)
        sample_words, sample_lens = self.decoder.sampling(
            self.init_decoder_hidden(torch.cat((prior_z, c_repeated), 1)),
            self.maxlen, go_id, end_id, "greedy")
        return sample_words, sample_lens
def __init__(self, num_steps, x_size, window_size, z_what_size,
             rnn_hidden_size, encoder_net=[], decoder_net=[], predict_net=[],
             embed_net=None, bl_predict_net=[], non_linearity='ReLU',
             decoder_output_bias=None, decoder_output_use_sigmoid=False,
             use_masking=True, use_baselines=True, baseline_scalar=None,
             scale_prior_mean=3.0, scale_prior_sd=0.1, pos_prior_mean=0.0,
             pos_prior_sd=1.0, likelihood_sd=0.3, use_cuda=False):
    super(AIR, self).__init__()
    self.num_steps = num_steps
    self.x_size = x_size
    self.window_size = window_size
    self.z_what_size = z_what_size
    self.rnn_hidden_size = rnn_hidden_size
    self.use_masking = use_masking
    self.use_baselines = use_baselines
    self.baseline_scalar = baseline_scalar
    self.likelihood_sd = likelihood_sd
    self.use_cuda = use_cuda
    prototype = torch.tensor(0.).cuda() if use_cuda else torch.tensor(0.)
    self.options = dict(dtype=prototype.dtype, device=prototype.device)
    self.z_pres_size = 1
    self.z_where_size = 3
    # By making these parameters they will be moved to the gpu when
    # necessary. (They are not registered with pyro for optimization.)
    self.z_where_loc_prior = nn.Parameter(
        torch.FloatTensor([scale_prior_mean, pos_prior_mean, pos_prior_mean]),
        requires_grad=False)
    self.z_where_scale_prior = nn.Parameter(
        torch.FloatTensor([scale_prior_sd, pos_prior_sd, pos_prior_sd]),
        requires_grad=False)

    # Create nn modules.
    rnn_input_size = x_size**2 if embed_net is None else embed_net[-1]
    rnn_input_size += self.z_where_size + z_what_size + self.z_pres_size
    nl = getattr(nn, non_linearity)

    self.rnn = nn.LSTMCell(rnn_input_size, rnn_hidden_size)
    self.encode = Encoder(window_size**2, encoder_net, z_what_size, nl)
    self.decode = Decoder(window_size**2, decoder_net, z_what_size,
                          decoder_output_bias, decoder_output_use_sigmoid, nl)
    self.predict = Predict(rnn_hidden_size, predict_net, self.z_pres_size,
                           self.z_where_size, nl)
    self.embed = Identity() if embed_net is None else MLP(x_size**2,
                                                          embed_net, nl, True)

    self.bl_rnn = nn.LSTMCell(rnn_input_size, rnn_hidden_size)
    self.bl_predict = MLP(rnn_hidden_size, bl_predict_net + [1], nl)
    self.bl_embed = Identity() if embed_net is None else MLP(x_size**2,
                                                             embed_net, nl,
                                                             True)

    # Create parameters.
    self.h_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
    self.c_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
    self.bl_h_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
    self.bl_c_init = nn.Parameter(torch.zeros(1, rnn_hidden_size))
    self.z_where_init = nn.Parameter(torch.zeros(1, self.z_where_size))
    self.z_what_init = nn.Parameter(torch.zeros(1, self.z_what_size))

    if use_cuda:
        self.cuda()
class DFVAE(nn.Module):
    def __init__(self, config, vocab_size, PAD_token=0):
        super(DFVAE, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.lambda_gp = config['lambda_gp']
        self.temp = config['temp']

        self.embedder = nn.Embedding(vocab_size, config['emb_size'],
                                     padding_idx=PAD_token)
        self.utt_encoder = Encoder(self.embedder, config['emb_size'],
                                   config['n_hidden'], True,
                                   config['n_layers'], config['noise_radius'])
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              config['n_hidden'] * 2 + 2,
                                              config['n_hidden'], 1,
                                              config['noise_radius'])
        self.prior_net = Variation(config['n_hidden'], config['z_size'])  # p(e|c)
        self.post_net = Variation(config['n_hidden'] * 3, config['z_size'])  # q(e|c,x)

        self.postflow1 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.postflow2 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.postflow3 = flow.myIAF(config['z_size'], config['z_size'] * 2,
                                    config['n_hidden'], 3)
        self.priorflow1 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)
        self.priorflow2 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)
        self.priorflow3 = flow.IAF(config['z_size'], config['z_size'] * 2,
                                   config['n_hidden'], 3)
        self.post_generator = nn_.SequentialFlow(self.postflow1,
                                                 self.postflow2,
                                                 self.postflow3)
        self.prior_generator = nn_.SequentialFlow(self.priorflow1,
                                                  self.priorflow2,
                                                  self.priorflow3)

        self.decoder = Decoder(self.embedder, config['emb_size'],
                               config['n_hidden'] + config['z_size'],
                               vocab_size, n_layers=1)

        self.optimizer_AE = optim.SGD(
            list(self.context_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.decoder.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()),
            lr=config['lr_gan_g'])

        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)
        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-0.02, 0.02)
            m.bias.data.fill_(0)

    def sample_post(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        tilde_z, det_g, _ = self.prior_generator((z, det_f, c))
        return tilde_z, z, mu, log_s, det_f, det_g

    def sample_code_post(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        tilde_z, det_g, _ = self.prior_generator((z, det_f, c))
        return tilde_z, mu, log_s, det_f, det_g

    def sample_post2(self, x, c):
        xc = torch.cat((x, c), 1)
        e, mu, log_s = self.post_net(xc)
        z, det_f, _, _ = self.post_generator((e, torch.eye(e.shape[1]), c, mu))
        return e, mu, log_s, z, det_f

    def sample_code_prior(self, c):
        e, mu, log_s = self.prior_net(c)
        return e, mu, log_s

    def sample_prior(self, c):
        e, mu, log_s = self.prior_net(c)
        z, det_prior, _ = self.prior_generator((e, 0, c))
        return z, det_prior

    def train_AE(self, context, context_lens, utt_lens, floors, response,
                 res_lens):
        self.context_encoder.train()
        self.decoder.train()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        # (the original sampled the posterior twice; once is enough)
        z_post, mu_post, log_s_post, det_f, det_g = self.sample_code_post(x, c)

        output = self.decoder(torch.cat((z_post, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz*seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)

        self.optimizer_AE.zero_grad()
        AE_term = self.criterion_ce(masked_output / self.temp, masked_target)
        loss = AE_term
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            list(self.context_encoder.parameters()) +
            list(self.decoder.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_generator.parameters()) +
            list(self.post_net.parameters()), self.clip)
        self.optimizer_AE.step()
        return [('train_loss_AE', AE_term.item())]

    def train_G(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        self.context_encoder.eval()
        self.optimizer_G.zero_grad()
        c = self.context_encoder(context, context_lens, utt_lens, floors)

        # ----------------- posterior samples ---------------------------
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z_0, mu_post, log_s_post, z_post, weight = self.sample_post2(
            x.detach(), c.detach())

        # ----------------- prior samples ---------------------------
        prior_z, mu_prior, log_s_prior = self.sample_code_prior(c.detach())
        KL_loss = torch.sum(
            log_s_prior - log_s_post +
            torch.exp(log_s_post) / torch.exp(log_s_prior) *
            torch.sum(weight**2, dim=2) +
            (mu_post)**2 / torch.exp(log_s_prior), 1) / 2 - 100

        loss = KL_loss
        loss.mean().backward()
        torch.nn.utils.clip_grad_norm_(
            list(self.post_generator.parameters()) +
            list(self.prior_generator.parameters()) +
            list(self.post_net.parameters()), self.clip)
        self.optimizer_G.step()
        return [('KL_loss', KL_loss.mean().item())]

    def valid(self, context, context_lens, utt_lens, floors, response,
              res_lens):
        self.context_encoder.eval()
        self.decoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z, mu_post, log_s_post, det_f, det_g = self.sample_code_post(x, c)
        prior_z, mu_prior, log_s_prior = self.sample_code_prior(c)

        KL_loss = torch.sum(
            log_s_prior - log_s_post +
            (torch.exp(log_s_post) + (mu_post)**2) / torch.exp(log_s_prior),
            1) / 2
        loss = KL_loss
        costG = loss.sum()

        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)
        output = self.decoder(torch.cat((post_z, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item())]

    def sample(self, context, context_lens, utt_lens, floors, repeat,
               SOS_tok, EOS_tok):
        self.context_encoder.eval()
        self.decoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        c_repeated = c.expand(repeat, -1)
        prior_z, _ = self.sample_prior(c_repeated)
        sample_words, sample_lens = self.decoder.sampling(
            torch.cat((prior_z, c_repeated), 1), None, self.maxlen,
            SOS_tok, EOS_tok, "greedy")
        return sample_words, sample_lens

    def gen(self, context, prior_z, context_lens, utt_lens, floors, repeat,
            SOS_tok, EOS_tok):
        self.context_encoder.eval()
        self.decoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        c_repeated = c.expand(repeat, -1)
        sample_words, sample_lens = self.decoder.sampling(
            torch.cat((prior_z, c_repeated), 1), None, self.maxlen,
            SOS_tok, EOS_tok, "greedy")
        return sample_words, sample_lens

    def sample_latent(self, context, context_lens, utt_lens, floors, repeat,
                      SOS_tok, EOS_tok):
        self.context_encoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        c_repeated = c.expand(repeat, -1)
        e, _, _ = self.sample_code_prior(c_repeated)
        prior_z, _, _ = self.prior_generator((e, 0, c_repeated))
        return prior_z, e

    def sample_latent_post(self, context, context_lens, utt_lens, floors,
                           response, res_lens, repeat):
        self.context_encoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        c_repeated = c.expand(repeat, -1)
        x_repeated = x.expand(repeat, -1)
        z_post, z, mu_post, log_s_post, det_f, det_g = self.sample_post(
            x_repeated, c_repeated)
        return z_post, z

    def adjust_lr(self):
        self.lr_scheduler_AE.step()
class RNN(object):
    def __init__(self, input_size, output_size):
        super(RNN, self).__init__()
        self.encoder = Encoder(input_size)
        self.decoder = Decoder(output_size)
        self.loss = nn.CrossEntropyLoss()
        self.encoder_optimizer = optim.Adam(self.encoder.parameters())
        self.decoder_optimizer = optim.Adam(self.decoder.parameters())
        sos, eos = torch.LongTensor(1, 1).zero_(), torch.LongTensor(1, 1).zero_()
        sos[0, 0], eos[0, 0] = 0, 1
        self.sos, self.eos = sos, eos

    def train(self, input, target):
        target.insert(0, self.sos)
        target.append(self.eos)
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encoder
        hidden_state = self.encoder.first_hidden()
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        # Decoder
        total_loss, outputs = 0, []
        for i in range(len(target) - 1):
            _, softmax, hidden_state = self.decoder.forward(
                Variable(target[i]), hidden_state)
            outputs.append(np.argmax(softmax.data.numpy(), 1)[:, np.newaxis])
            total_loss += self.loss(softmax, Variable(target[i + 1][0]))
        total_loss /= len(outputs)

        total_loss.backward()
        self.decoder_optimizer.step()
        self.encoder_optimizer.step()
        # use total_loss.data[0] for version 0.3.0_4 and below, .item() for 0.4.0
        return total_loss.data[0], outputs

    def eval(self, input):
        hidden_state = self.encoder.first_hidden()
        # Encoder
        for ivec in input:
            _, hidden_state = self.encoder.forward(Variable(ivec), hidden_state)

        sentence = []
        input = self.sos
        # Decoder: feed the greedy argmax token back in until <eos> (index 1)
        while input.data[0, 0] != 1:
            output, _, hidden_state = self.decoder.forward(input, hidden_state)
            word = np.argmax(output.data.numpy()).reshape((1, 1))
            input = Variable(torch.LongTensor(word))
            sentence.append(word)
        return sentence

    def save(self):
        torch.save(self.encoder.state_dict(), "models/encoder.ckpt")
        torch.save(self.decoder.state_dict(), "models/decoder.ckpt")
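# Minimal usage sketch for the RNN wrapper above (toy sizes; the Encoder /
# Decoder definitions and the exact tensor shapes they expect are assumptions,
# since they are not shown in this snippet).
import torch

model = RNN(input_size=10, output_size=10)
src = [torch.LongTensor([[3]]), torch.LongTensor([[7]])]   # input token ids
tgt = [torch.LongTensor([[5]]), torch.LongTensor([[8]])]   # target token ids
loss, outputs = model.train(src, tgt)   # <sos>/<eos> are added internally
model.save()                            # writes models/{encoder,decoder}.ckpt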
class DeepAPI(nn.Module):
    '''model.'''

    def __init__(self, config, vocab_size):
        super(DeepAPI, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.temp = config['temp']
        self.desc_embedder = nn.Embedding(vocab_size, config['emb_size'],
                                          padding_idx=PAD_ID)
        self.api_embedder = nn.Embedding(vocab_size, config['emb_size'],
                                         padding_idx=PAD_ID)
        # utterance encoder: encode the description to a vector
        self.encoder = Encoder(self.desc_embedder, config['emb_size'],
                               config['n_hidden'], True, config['n_layers'],
                               config['noise_radius'])
        # utterance decoder: P(x|c,z)
        self.decoder = Decoder(self.api_embedder, config['emb_size'],
                               config['n_hidden'] * 2, vocab_size,
                               config['use_attention'], 1, config['dropout'])
        self.optimizer = optim.Adadelta(
            list(self.encoder.parameters()) + list(self.decoder.parameters()),
            lr=config['lr_ae'], rho=0.95)
        self.criterion_ce = nn.CrossEntropyLoss()

    def forward(self, descs, desc_lens, apiseqs, api_lens):
        c, hids = self.encoder(descs, desc_lens)
        # decode from z, c; output: [batch x seq_len x n_tokens]
        output, _ = self.decoder(c, hids, None, apiseqs[:, :-1],
                                 (api_lens - 1))
        output = output.view(-1, self.vocab_size)  # [batch*seq_len x n_tokens]
        dec_target = apiseqs[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz*seq_len) x n_tokens]
        masked_output = output.masked_select(output_mask).view(
            -1, self.vocab_size)
        loss = self.criterion_ce(masked_output / self.temp, masked_target)
        return loss

    def train_AE(self, descs, desc_lens, apiseqs, api_lens):
        self.encoder.train()
        self.decoder.train()
        loss = self.forward(descs, desc_lens, apiseqs, api_lens)
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` to prevent exploding gradients in RNNs / LSTMs
        torch.nn.utils.clip_grad_norm_(
            list(self.encoder.parameters()) + list(self.decoder.parameters()),
            self.clip)
        self.optimizer.step()
        return {'train_loss': loss.item()}

    def valid(self, descs, desc_lens, apiseqs, api_lens):
        self.encoder.eval()
        self.decoder.eval()
        loss = self.forward(descs, desc_lens, apiseqs, api_lens)
        return {'valid_loss': loss.item()}

    def sample(self, descs, desc_lens, n_samples, mode='beamsearch'):
        self.encoder.eval()
        self.decoder.eval()
        c, hids = self.encoder(descs, desc_lens)
        if mode == 'beamsearch':
            # [batch_size x n_samples x seq_len]
            sample_words, sample_lens, _ = self.decoder.beam_decode(
                c, hids, None, 12, self.maxlen, n_samples)
            sample_words, sample_lens = sample_words[0], sample_lens[0]
        else:
            sample_words, sample_lens = self.decoder.sampling(
                c, hids, None, n_samples, self.maxlen, mode)
        return sample_words, sample_lens

    def adjust_lr(self):
        # no LR scheduler is used with Adadelta
        return None
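# Sketch of driving DeepAPI. The config keys mirror those read in __init__,
# but the values are illustrative, and the random id tensors are placeholders
# for a real batch of padded description / API-sequence token ids.
import torch

config = {'maxlen': 50, 'clip': 5.0, 'temp': 1.0, 'emb_size': 120,
          'n_hidden': 512, 'n_layers': 1, 'noise_radius': 0.0,
          'use_attention': True, 'dropout': 0.5, 'lr_ae': 1.0}
model = DeepAPI(config, vocab_size=10000)
descs = torch.randint(1, 10000, (32, 50))    # [batch x desc_len]
apiseqs = torch.randint(1, 10000, (32, 30))  # [batch x api_len]
desc_lens = torch.full((32,), 50, dtype=torch.long)
api_lens = torch.full((32,), 30, dtype=torch.long)
print(model.train_AE(descs, desc_lens, apiseqs, api_lens))  # {'train_loss': ...}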
class DialogWAE(nn.Module):
    def __init__(self, config, vocab_size, PAD_token=0):
        super(DialogWAE, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.lambda_gp = config['lambda_gp']
        self.temp = config['temp']

        self.embedder = nn.Embedding(vocab_size, config['emb_size'],
                                     padding_idx=PAD_token)
        self.utt_encoder = Encoder(self.embedder, config['emb_size'],
                                   config['n_hidden'], True,
                                   config['n_layers'], config['noise_radius'])
        self.context_encoder = ContextEncoder(self.utt_encoder,
                                              config['n_hidden'] * 2 + 2,
                                              config['n_hidden'], 1,
                                              config['noise_radius'])
        self.prior_net = Variation(config['n_hidden'],
                                   config['z_size'])  # p(e|c)
        self.post_net = Variation(config['n_hidden'] * 3,
                                  config['z_size'])  # q(e|c,x)

        self.post_generator = nn.Sequential(
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']))
        self.post_generator.apply(self.init_weights)

        self.prior_generator = nn.Sequential(
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']),
            nn.BatchNorm1d(config['z_size'], eps=1e-05, momentum=0.1),
            nn.ReLU(),
            nn.Linear(config['z_size'], config['z_size']))
        self.prior_generator.apply(self.init_weights)

        self.decoder = Decoder(self.embedder, config['emb_size'],
                               config['n_hidden'] + config['z_size'],
                               vocab_size, n_layers=1)

        self.discriminator = nn.Sequential(
            nn.Linear(config['n_hidden'] + config['z_size'],
                      config['n_hidden'] * 2),
            nn.BatchNorm1d(config['n_hidden'] * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config['n_hidden'] * 2, config['n_hidden'] * 2),
            nn.BatchNorm1d(config['n_hidden'] * 2, eps=1e-05, momentum=0.1),
            nn.LeakyReLU(0.2),
            nn.Linear(config['n_hidden'] * 2, 1),
        )
        self.discriminator.apply(self.init_weights)

        self.optimizer_AE = optim.SGD(
            list(self.context_encoder.parameters()) +
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.decoder.parameters()), lr=config['lr_ae'])
        self.optimizer_G = optim.RMSprop(
            list(self.post_net.parameters()) +
            list(self.post_generator.parameters()) +
            list(self.prior_net.parameters()) +
            list(self.prior_generator.parameters()), lr=config['lr_gan_g'])
        self.optimizer_D = optim.RMSprop(self.discriminator.parameters(),
                                         lr=config['lr_gan_d'])
        self.lr_scheduler_AE = optim.lr_scheduler.StepLR(self.optimizer_AE,
                                                         step_size=10,
                                                         gamma=0.6)
        self.criterion_ce = nn.CrossEntropyLoss()

    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            m.weight.data.uniform_(-0.02, 0.02)
            m.bias.data.fill_(0)

    def sample_code_post(self, x, c):
        e, _, _ = self.post_net(torch.cat((x, c), 1))
        z = self.post_generator(e)
        return z

    def sample_code_prior(self, c):
        e, _, _ = self.prior_net(c)
        z = self.prior_generator(e)
        return z

    def train_AE(self, context, context_lens, utt_lens, floors, response,
                 res_lens):
        self.context_encoder.train()
        self.decoder.train()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z = self.sample_code_post(x, c)
        output = self.decoder(torch.cat((z, c), 1), None, response[:, :-1],
                              (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        dec_target = response[:, 1:].contiguous().view(-1)
        # drop padding positions from both logits and targets
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(
            mask.size(0), self.vocab_size)  # [(batch_sz*seq_len) x n_tokens]
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        self.optimizer_AE.zero_grad()
        loss = self.criterion_ce(masked_output / self.temp, masked_target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            list(self.context_encoder.parameters()) +
            list(self.decoder.parameters()), self.clip)
        self.optimizer_AE.step()
        return [('train_loss_AE', loss.item())]

    def train_G(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        self.context_encoder.eval()
        self.optimizer_G.zero_grad()
        # freeze the discriminator while updating the generators
        for p in self.discriminator.parameters():
            p.requires_grad = False
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        # ----------------- posterior samples ---------------------------
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        z_post = self.sample_code_post(x.detach(), c.detach())
        errG_post = torch.mean(
            self.discriminator(torch.cat((z_post, c.detach()), 1)))
        errG_post.backward(minus_one)
        # ----------------- prior samples --------------------------------
        prior_z = self.sample_code_prior(c.detach())
        errG_prior = torch.mean(
            self.discriminator(torch.cat((prior_z, c.detach()), 1)))
        errG_prior.backward(one)
        self.optimizer_G.step()
        for p in self.discriminator.parameters():
            p.requires_grad = True
        costG = errG_prior - errG_post
        return [('train_loss_G', costG.item())]

    def train_D(self, context, context_lens, utt_lens, floors, response,
                res_lens):
        self.context_encoder.eval()
        self.discriminator.train()
        self.optimizer_D.zero_grad()
        batch_size = context.size(0)
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z = self.sample_code_post(x, c)
        errD_post = torch.mean(
            self.discriminator(torch.cat((post_z.detach(), c.detach()), 1)))
        errD_post.backward(one)
        prior_z = self.sample_code_prior(c)
        errD_prior = torch.mean(
            self.discriminator(torch.cat((prior_z.detach(), c.detach()), 1)))
        errD_prior.backward(minus_one)
        # WGAN-GP: penalize the critic's gradient norm on codes interpolated
        # between prior and posterior samples
        alpha = gData(torch.rand(batch_size, 1))
        alpha = alpha.expand(prior_z.size())
        interpolates = alpha * prior_z.data + ((1 - alpha) * post_z.data)
        interpolates = Variable(interpolates, requires_grad=True)
        d_input = torch.cat((interpolates, c.detach()), 1)
        disc_interpolates = torch.mean(self.discriminator(d_input))
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=gData(torch.ones(disc_interpolates.size())),
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        gradient_penalty = ((gradients.contiguous().view(
            gradients.size(0), -1).norm(2, dim=1) - 1)**2).mean() \
            * self.lambda_gp
        gradient_penalty.backward()
        self.optimizer_D.step()
        costD = -(errD_prior - errD_post) + gradient_penalty
        return [('train_loss_D', costD.item())]

    def valid(self, context, context_lens, utt_lens, floors, response,
              res_lens):
        self.context_encoder.eval()
        self.discriminator.eval()
        self.decoder.eval()
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        x, _ = self.utt_encoder(response[:, 1:], res_lens - 1)
        post_z = self.sample_code_post(x, c)
        prior_z = self.sample_code_prior(c)
        errD_post = torch.mean(self.discriminator(torch.cat((post_z, c), 1)))
        errD_prior = torch.mean(self.discriminator(torch.cat((prior_z, c), 1)))
        costD = -(errD_prior - errD_post)
        costG = -costD
        dec_target = response[:, 1:].contiguous().view(-1)
        mask = dec_target.gt(0)  # [(batch_sz*seq_len)]
        masked_target = dec_target.masked_select(mask)
        output_mask = mask.unsqueeze(1).expand(mask.size(0), self.vocab_size)
        output = self.decoder(torch.cat((post_z, c), 1), None,
                              response[:, :-1], (res_lens - 1))
        flattened_output = output.view(-1, self.vocab_size)
        masked_output = flattened_output.masked_select(output_mask).view(
            -1, self.vocab_size)
        lossAE = self.criterion_ce(masked_output / self.temp, masked_target)
        return [('valid_loss_AE', lossAE.item()),
                ('valid_loss_G', costG.item()),
                ('valid_loss_D', costD.item())]

    def sample(self, context, context_lens, utt_lens, floors, repeat,
               SOS_tok, EOS_tok):
        self.context_encoder.eval()
        self.decoder.eval()
        # encode the dialogue context into an embedding
        c = self.context_encoder(context, context_lens, utt_lens, floors)
        c_repeated = c.expand(repeat, -1)
        prior_z = self.sample_code_prior(c_repeated)
        sample_words, sample_lens = self.decoder.sampling(
            torch.cat((prior_z, c_repeated), 1), None, self.maxlen,
            SOS_tok, EOS_tok, "greedy")
        return sample_words, sample_lens

    def adjust_lr(self):
        self.lr_scheduler_AE.step()
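# A self-contained sketch (not part of the original model) of the WGAN-GP
# gradient penalty computed in DialogWAE.train_D: the critic's gradient norm
# is pushed toward 1 on random interpolations between "real" (posterior) and
# "fake" (prior) latent codes. The tiny critic, the code size, and lambda_gp
# here are illustrative assumptions; the real model also conditions the
# critic on the context vector c.
import torch
import torch.nn as nn

torch.manual_seed(0)
batch, z_size, lambda_gp = 4, 8, 10.0
critic = nn.Sequential(nn.Linear(z_size, 16), nn.LeakyReLU(0.2),
                       nn.Linear(16, 1))
post_z = torch.randn(batch, z_size)   # stands in for posterior codes
prior_z = torch.randn(batch, z_size)  # stands in for prior codes

# interpolate per example with a random mixing coefficient
alpha = torch.rand(batch, 1).expand_as(prior_z)
interpolates = (alpha * prior_z + (1 - alpha) * post_z).requires_grad_(True)
disc_out = critic(interpolates).mean()

# gradient of the critic output w.r.t. the interpolated inputs;
# create_graph=True keeps this differentiable so the penalty can backprop
gradients = torch.autograd.grad(outputs=disc_out, inputs=interpolates,
                                create_graph=True, retain_graph=True,
                                only_inputs=True)[0]
gradient_penalty = ((gradients.reshape(batch, -1).norm(2, dim=1) - 1) ** 2
                    ).mean() * lambda_gp
gradient_penalty.backward()  # accumulates penalty gradients into the critic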