def main(args):
    """Train the configured model on the configured dataset, then evaluate.

    Each epoch prints the epoch index alongside the annealing-schedule value,
    the mean per-batch training loss, and then sample-based coverage and
    visualization metrics both with and without extrapolation.
    """
    dataset = get_dataset()
    model = get_model(dataset).to(_device())
    loader = torch_data.DataLoader(
        dataset,
        batch_size=FLAGS.n_batch,
        collate_fn=dataset.collate,
        shuffle=True,
    )
    opt = optim.Adam(model.parameters(), lr=0.001)
    for i_epoch in range(FLAGS.n_epochs):
        running_loss = 0
        for batch_idx, (target, exemplars) in enumerate(loader):
            target = torch.tensor(target, device=_device())
            exemplars = torch.tensor(exemplars, device=_device())
            # Unpacking doubles as a rank check on the exemplar batch.
            n_seq, n_batch, n_ex = exemplars.shape
            loss = model(target, exemplars, epoch=i_epoch)
            opt.zero_grad()
            loss.backward()
            opt.step()
            running_loss += loss.item()
        # Report the sigmoid annealing schedule value for this epoch
        # (midpoint at epoch 50), then the mean loss over all batches.
        print(i_epoch, 1 / (1 + np.exp(-(i_epoch / 5 - 10))))
        print(running_loss / (batch_idx + 1))
        print()
        for extrapolate in (True, False):
            eval_seqs, eval_batch = dataset.eval_batch(100, extrapolate)
            samples = model.sample(torch.tensor(eval_batch, device=_device()), 100)
            print(f"extrapolate={extrapolate}")
            evaluation.visualize(eval_seqs[:5], samples[:5], dataset)
            print(evaluation.compute_coverage(samples, dataset))
            print()
            print()
def forward(self, target, exemplars, epoch, **kwargs):
    # Training loss for the analogy ("vector offset") model: combines
    # (a) a representation loss pulling the analogy-predicted encoding toward
    # the target's true encoding and (b) a token-level reconstruction loss.
    #
    # target: (n_seq, n_batch) token ids (shapes established by the unpacks
    # below); exemplars: (n_ex_seq, n_batch, n_ex) with exactly 3 exemplars
    # per target. epoch drives the annealing schedule; **kwargs is ignored.
    #
    # The state-slicing below only works for a single-layer encoder.
    assert FLAGS.n_layers == 1
    n_seq, n_batch = target.shape
    n_ex_seq, _, n_ex = exemplars.shape
    assert n_ex == 3
    # Fold the exemplar axis into the batch so one encoder pass covers all.
    exemplars = exemplars.view(n_ex_seq, n_batch * n_ex)
    ex_embedding = self.embed(exemplars)
    # NOTE(review): the (h, c) unpack assumes self.enc is LSTM-like — confirm.
    _, (ex_encoding, _) = self.enc(ex_embedding)
    ex_encoding = ex_encoding.view(n_batch, n_ex, FLAGS.n_hidden)
    # Analogy arithmetic (word2vec style): predicted target ≈ ex1 + ex2 - ex0.
    pred_tgt = ex_encoding[:, 1, :] + ex_encoding[:, 2, :] - ex_encoding[:, 0, :]
    pred_tgt = pred_tgt.unsqueeze(0)
    # Sigmoid schedule over epochs: ~0 early, ramping toward 0.5 with the
    # midpoint at epoch 50 (where epoch / 5 == 10). Matches the value that
    # main() prints each epoch.
    recon_scale = 0.5 * 1 / (1 + np.exp(-(epoch / 5 - 10)))
    tgt_embedding = self.embed(target)
    _, (tgt_encoding, _) = self.enc(tgt_embedding)
    # Per-batch-element Bernoulli(recon_scale) mask: decode from the
    # analogy-predicted encoding where masked, from the true encoding
    # otherwise (scheduled substitution).
    mask = (torch.rand(1, n_batch, 1, device=_device()) < recon_scale).expand_as(pred_tgt).float()
    rep = mask * pred_tgt + (1 - mask) * tgt_encoding
    # Use the mixed representation as decoder h, with a zero cell state.
    hid = (rep, torch.zeros_like(rep))
    # Teacher forcing: inputs are target[:-1], labels are target[1:].
    tgt_decoding, _ = self.dec(tgt_embedding[:-1, :, :], hid)
    tgt_pred = self.pred(tgt_decoding).view((n_seq - 1) * n_batch, self.n_tokens)
    # Representation term is annealed by recon_scale; reconstruction is not.
    return (
        recon_scale * self.representation_loss(pred_tgt, tgt_encoding)
        + self.reconstruction_loss(tgt_pred, target[1:, :].view(
            (n_seq - 1) * n_batch)))
def forward(self, target, exemplars, epoch, **kwargs):
    # Sequence-autoencoder loss with a (partially ablated) VAE objective.
    # target: (n_seq, n_batch) token ids; exemplars are unused by this model.
    # epoch drives KL annealing; **kwargs is ignored.
    del exemplars
    n_seq, n_batch = target.shape
    # Teacher forcing: inputs are target[:-1], labels are target[1:].
    inp = target[:-1, :]
    out = target[1:, :].view((n_seq - 1) * n_batch)
    enc_embedding = self.embed(target)
    # NOTE(review): the (h, c) unpack assumes self.rnn is LSTM-like — confirm.
    _, (enc_encoding, _) = self.rnn(enc_embedding)
    mean = self.mean(enc_encoding)
    log_std = self.log_std(enc_encoding)
    # NOTE(review): std is computed but unused below — both lines that used
    # it are commented out, so the posterior std is effectively fixed at 1.
    # This looks like a deliberate ablation; confirm before "fixing".
    std = torch.exp(log_std)
    # Full analytic Gaussian KL (commented out) vs. the ablation in use:
    # only the mean term is penalized.
    #prior_kl = ((mean ** 2 + std ** 2 - 2 * log_std - 1) / 2).mean()
    prior_kl = ((mean**2) / 2).mean()
    noise = torch.normal(mean=0, std=1, size=mean.shape, device=_device())
    # Reparameterization with fixed unit std (learned-std path commented out).
    # The sampled code is the decoder h; the cell state starts at zero.
    #encoding = (mean + std * noise, torch.zeros_like(mean))
    encoding = (mean + noise, torch.zeros_like(mean))
    dec_embedding = self.embed(inp)
    dec_representation, _ = self.rnn(dec_embedding, encoding)
    prediction = self.predict(dec_representation)
    prediction = prediction.view((n_seq - 1) * n_batch, self.n_tokens)
    pred_nlprob = self.loss(prediction, out)
    # KL annealing: weight ramps from ~0 toward 10, midpoint at epoch 5.
    kl_weight = 10 * 1 / (1 + np.exp(-(epoch - 5)))
    return kl_weight * prior_kl + pred_nlprob
def sample(self, exemplars, count):
    """Decode `count` sequences autoregressively from a zero initial state.

    Args:
        exemplars: accepted only for interface compatibility with the other
            models' sample() methods; this unconditional model ignores it.
        count: number of independent sequences to draw.

    Returns:
        Whatever _sample returns: a list of `count` token-id lists, each
        starting with token 1 and truncated at stop token 10.
    """
    del exemplars  # unused; kept for a uniform sample() interface
    # Allocate the (h, c) state directly on the target device instead of
    # building it on CPU and copying tensor-by-tensor with .to().
    init_state = [
        torch.zeros(1, count, FLAGS.n_hidden, device=_device()),
        torch.zeros(1, count, FLAGS.n_hidden, device=_device()),
    ]
    return _sample(self.embed, self.rnn, self.predict, init_state,
                   init_token=1, stop_token=10, count=count)
def _sample(embed, rnn, predict, init_state, init_token, stop_token, count=1, max_len=40, greedy=False): assert init_state[0].shape[1] == count with torch.no_grad(): out = [[init_token] for _ in range(count)] last_state = init_state last_token = init_token * torch.ones( (1, count), dtype=torch.int64, device=_device()) for i in range(max_len): hidden, next_state = rnn(embed(last_token), last_state) probs = F.softmax(predict(hidden), dim=2).detach().cpu().numpy() next_token = [] for j in range(count): if greedy: token = np.argmax(probs[0, j, :]) else: token = np.random.choice(probs.shape[2], p=probs[0, j, :]) out[j].append(token) next_token.append(token) last_state = next_state last_token = torch.tensor([next_token], dtype=torch.int64, device=_device()) out_clean = [] for seq in out: if stop_token in seq: seq = seq[:seq.index(stop_token) + 1] seq = [t for t in seq if t != 0] out_clean.append(seq) return out_clean
def sample(self, exemplars, count):
    """Sample `count` sequences by decoding from a standard-normal prior.

    Currently disabled. The original guard was a bare `assert False`, which
    is stripped under `python -O` and would silently enable the untested
    path below; raise explicitly instead. AssertionError is kept (rather
    than NotImplementedError) so any caller catching the old assert's
    exception type still behaves the same.

    Raises:
        AssertionError: always.
    """
    raise AssertionError("prior sampling is disabled for this model")
    # Intended implementation, unreachable until the guard above is removed:
    # draw the decoder's initial h from N(0, I), zero cell state.
    enc = torch.normal(mean=0, std=1, size=(1, count, FLAGS.n_hidden), device=_device())
    init_state = (enc, torch.zeros_like(enc))
    return _sample(self.embed, self.dec_rnn, self.predict, init_state,
                   init_token=1, stop_token=10, count=count)
def forward(self, target, exemplars, **kwargs):
    # Conditional VAE loss where the latent code's mean/std are produced
    # from an attention-weighted mixture of exemplar encodings.
    # target: (n_seq, n_batch) token ids; exemplars: (n_ex_seq, n_batch, n_ex)
    # token ids (shapes established by the unpacks below). **kwargs ignored.
    n_seq, n_batch = target.shape
    n_ex_seq, _, n_ex = exemplars.shape
    # Teacher forcing: inputs are target[:-1], labels are target[1:].
    inp = target[:-1, :]
    out = target[1:, :].view((n_seq - 1) * n_batch)
    # Fold the exemplar axis into the batch so one encoder pass covers all.
    exemplars = exemplars.view(n_ex_seq, n_batch * n_ex)
    ex_embedding = self.embed(exemplars)
    # NOTE(review): the (h, c) unpack assumes enc_rnn is LSTM-like — confirm.
    _, (ex_encoding, _) = self.enc_rnn(ex_embedding)
    ex_encoding = ex_encoding.view(n_batch, n_ex, FLAGS.n_hidden)
    enc_embedding = self.embed(target)
    _, (enc_encoding, _) = self.enc_rnn(enc_embedding)
    # Broadcast the target encoding against every exemplar:
    # (n_batch, n_ex, n_hidden), matching ex_encoding.
    enc_encoding = enc_encoding.squeeze(0).unsqueeze(1).expand_as(
        ex_encoding)
    # Dot-product attention scores, softmax-normalized over the exemplar axis.
    attention_weights = (enc_encoding * ex_encoding).sum(dim=2, keepdim=True)
    attention_weights = F.softmax(attention_weights, dim=1)
    # Attention-weighted exemplar summary, reshaped to (1, n_batch, n_hidden).
    weighted_ex = (ex_encoding * attention_weights.expand_as(ex_encoding)).sum(
        dim=1).unsqueeze(0)
    mean = self.mean(weighted_ex)
    log_std = self.log_std(weighted_ex)
    std = torch.exp(log_std)
    # Analytic KL( N(mean, std) || N(0, 1) ), averaged over all elements.
    prior_kl = ((mean**2 + std**2 - 2 * log_std - 1) / 2).mean()
    noise = torch.normal(mean=0, std=1, size=mean.shape, device=_device())
    # Reparameterized sample is the decoder h; cell state starts at zero.
    encoding = (mean + std * noise, torch.zeros_like(mean))
    dec_embedding = self.embed(inp)
    dec_representation, _ = self.dec_rnn(dec_embedding, encoding)
    prediction = self.predict(dec_representation)
    prediction = prediction.view((n_seq - 1) * n_batch, self.n_tokens)
    pred_nlprob = self.loss(prediction, out)
    # Unannealed ELBO: KL term plus reconstruction negative log-probability.
    return prior_kl + pred_nlprob