def generate_ngram_naive_bayes_model(training_iter, alpha):
    # Per-class label counts, initialized to zero.
    labelCounts = ntorch.ones(len(LABEL.vocab), names=("class",)).cuda() * 0
    # Per-(ngram, class) counts, initialized with the smoothing prior alpha.
    vocabCounts = ntorch.tensor(
        [alpha[f[0]] for f in NGRAMS.vocab.itos],
        names=("vocab",)).cuda() * ntorch.ones(len(LABEL.vocab), names=("class",)).cuda()
    classes = ntorch.tensor(torch.eye(len(LABEL.vocab)),
                            names=("class", "classIndex")).cuda()
    encoding = ntorch.tensor(torch.eye(len(NGRAMS.vocab)),
                             names=("vocab", "vocabIndex")).cuda()

    # Accumulate label counts and set-of-ngrams counts per class.
    for batch in training_iter:
        oneHot = encoding.index_select("vocabIndex", batch.text)
        setofwords, _ = oneHot.max("ngramlen")
        classRep = classes.index_select("classIndex", batch.label.long())
        labelCounts += classRep.sum("batch")
        vocabCounts += setofwords.dot("batch", classRep)

    # Log-count ratio weights and class-prior bias.
    p = vocabCounts.get("class", 1)
    q = vocabCounts.get("class", 0)
    r = ((p * q.sum()) / (q * p.sum())).log()  # r = (p/q).log()
    weight = r
    b = (labelCounts.get("class", 1) / labelCounts.get("class", 0)).log()

    def naive_bayes(test_batch):
        oneHotTest = encoding.index_select("vocabIndex", test_batch.cuda())
        setofwords, _ = oneHotTest.max("seqlen")
        y = (weight.dot("vocab", setofwords) + b).sigmoid()
        return (y - 0.5) * ntorch.tensor([-1., 1.], names=("class",)).cuda() + 0.5

    return naive_bayes
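# A hedged usage sketch for the factory above. `train_iter`, `test_iter`, and the
# add-one smoothing value are assumptions for illustration, not values from the source.
alpha = defaultdict(lambda: 1.0)                       # assumed add-one smoothing per n-gram
naive_bayes = generate_ngram_naive_bayes_model(train_iter, alpha)
for batch in test_iter:
    probs = naive_bayes(batch.text)                    # named tensor with a "class" dim
    _, preds = probs.max("class")                      # predicted class indices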
def forward(self, seq):
    seq_len = seq.shape["seqlen"]
    batch_size = seq.shape["batch"]

    # Pad both ends of the sequence so every position has a full n-gram window.
    pad_token = self.text.vocab.stoi["<pad>"]
    additional_padding = ntorch.ones(batch_size, self.longest_n,
                                     names=("batch", "seqlen")).long().to(self.device)
    additional_padding *= pad_token
    seq = ntorch.cat([additional_padding, seq, additional_padding], dim="seqlen")

    amino_acids = self.codon_to_aa[seq.values]
    return_ar = ntorch.zeros(seq_len, batch_size, self.out_vocab,
                             names=("seqlen", "batch", "vocablen"))

    # convert to numpy to leave GPU
    amino_acids = amino_acids.detach().cpu().numpy()
    for batch_item in range(batch_size):
        # start at n, end at seq_len - n
        for seq_item in range(self.longest_n, seq_len - self.longest_n):
            # Must iterate over all dictionaries
            for weight, n, ngram_dict in zip(self.weight_list, self.n_list, self.dict_list):
                # N-gram is a 2d numpy array containing an amino acid embedding in each row
                n_gram = amino_acids[batch_item, seq_item - n:seq_item + n + 1]
                # Note: we want to populate return_ar before padding!
                return_ar[{"seqlen": seq_item - self.longest_n, "batch": batch_item}] += \
                    weight * ngram_dict[str(n_gram)].float()
    return return_ar.to(self.device)
def make_n_gram_dict(train_iter, n, amino_acid_conversion, TEXT, AA_LABEL):
    '''
    Helper function to create a frequency default dictionary.

    Args:
        train_iter: Training bucket iterator
        n: Number of amino acids to each side of the AA (e.g. 0 is unigram, 1 is trigram)
        amino_acid_conversion: index table converting the codon index to the AA index
        TEXT: torchtext field for the nucleotide vocab
        AA_LABEL: torchtext field for the amino acid vocab

    Returns:
        default_dict: dictionary mapping a sequence of amino acids to a probability
            distribution over codons

    TODO: Make this faster
    '''
    default_obj = lambda: torch.tensor(np.zeros(len(TEXT.vocab.stoi)))
    default_dict = defaultdict(default_obj)
    with torch.no_grad():
        ident_mat = np.eye(len(TEXT.vocab.stoi))
        ident_mat_aa = np.eye(len(AA_LABEL.vocab))
        for i, batch in enumerate(train_iter):
            # Select for all non-zero tensors
            # Use this to find all indices that aren't padding
            seq_len = batch.sequence.shape["seqlen"]
            batch_size = batch.sequence.shape["batch"]

            # Pad amino acids and seq with the <pad> token
            pad_token = TEXT.vocab.stoi["<pad>"]
            additional_padding = ntorch.ones(batch_size, n, names=("batch", "seqlen")).long()
            additional_padding *= pad_token
            seq = ntorch.cat([additional_padding, batch.sequence, additional_padding],
                             dim="seqlen")

            # Now one hots..
            amino_acids = amino_acid_conversion[seq.values].detach().cpu().numpy()

            # Note: we should assert that start and pad are treated the same
            # This is because at test time, presumably we narrow the start for the AA..
            if i == 0:
                assert (amino_acids[0, n] == amino_acids[0, 0]).all()

            seq = seq.detach().cpu().numpy()
            for batch_item in range(batch_size):
                # start at n, end at seq_len - n
                for seq_item in range(n, seq_len - n):
                    # Middle token is a discrete number representing the codon (0 to 66)
                    middle_token = seq[batch_item, seq_item]
                    # N-gram is a 2d numpy array containing an amino acid embedding in each row
                    n_gram = amino_acids[batch_item, seq_item - n:seq_item + n + 1]
                    default_dict[str(n_gram)][middle_token] += 1

    # Normalize counts into per-context probability distributions over codons
    for key in default_dict:
        default_dict[key] /= default_dict[key].sum()
    return default_dict
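# A hedged sketch of how these per-n tables might be assembled for the interpolated
# model whose forward() appears above. The n values, interpolation weights, and the
# codon_to_aa table name are assumptions for illustration only.
n_list = [0, 1, 2]                                     # unigram, trigram, pentagram contexts
weight_list = [0.2, 0.3, 0.5]                          # interpolation weights (assumed)
dict_list = [make_n_gram_dict(train_iter, n, codon_to_aa, TEXT, AA_LABEL) for n in n_list]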
def reinforce(self, premise, hypothesis, label):
    # REINFORCE
    q = self.q(premise, hypothesis, label).rename('label', 'latent')
    latent_dist = nds.Categorical(logits=q, dim_logit='latent')

    # Sample to approximate E[]
    samples = latent_dist.sample([self.num_samples], names=('samples',))

    # Batch premises and hypotheses
    batches = defaultdict(list)
    premise_n = premise.unbind('batch')
    hypothesis_n = hypothesis.unbind('batch')

    # Get some samples
    samples_n = samples.transpose('batch', 'samples').tolist()

    # Bucket data points by the model index that was sampled for them
    for i, batch in enumerate(samples_n):
        p = premise_n[i]
        h = hypothesis_n[i]
        for sample in batch:
            batches[sample].append((i, p, h))

    # Can now evaluate sampled models with batching
    batch_size = premise.shape['batch']
    counts = [0] * batch_size
    res = [None] * (self.num_samples * batch_size)
    correct = label.tolist()
    for c, items in batches.items():
        batch_p = ntorch.stack([p for _, p, _ in items], 'batch')
        batch_h = ntorch.stack([h for _, _, h in items], 'batch')
        batch_i = [i for i, _, _ in items]

        # Evaluate model per batch, then update
        preds = self.models[c](batch_p, batch_h)
        for i, log_probs in zip(batch_i, preds.unbind('batch')):
            res[self.num_samples * i + counts[i]] = log_probs.values[correct[i]]
            counts[i] += 1

    # Finally average the results per sample
    res = torch.stack(res, dim=0).reshape(batch_size, self.num_samples)
    res = ntorch.tensor(res, names=('batch', 'sample'))

    # Onward to estimating the gradient + calculating the loss
    surrogate = (latent_dist.log_prob(samples) * res.detach() + res).mean('sample')
    prior = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
    prior = nds.Categorical(logits=prior, dim_logit='latent')
    KLD = nds.kl_divergence(latent_dist, prior) * self.kl_weight

    loss = (KLD - surrogate._tensor).mean()  # -(surrogate - KLD)
    elbo = (KLD.detach() - res.detach().mean('sample')._tensor).mean()
    return loss, elbo
def elbo_reinforce(self, premise, hypothesis, label):
    # computing the q distribution: p(c | a, b, y)
    q = self.q(premise, hypothesis, label).rename('label', 'latent')
    latent_dist = ds.Categorical(logits=q, dim_logit='latent')

    # generating some samples
    samples = latent_dist.sample([self.sample_size], names=('samples',))

    # bucketing samples by the sampled model to maximize efficiency
    buckets = defaultdict(list)
    premise_lst = premise.unbind('batch')
    hypothesis_lst = hypothesis.unbind('batch')
    samples_list = samples.transpose('batch', 'samples').tolist()
    for i, batch in enumerate(samples_list):
        p, h = premise_lst[i], hypothesis_lst[i]
        for sample in batch:
            buckets[sample].append((i, p, h))

    # evaluating the sampled models efficiently using batching
    orig_batch_size = premise.shape['batch']
    counts = [0] * orig_batch_size
    res = [None] * (self.sample_size * orig_batch_size)
    correct = label.tolist()
    for c, items in buckets.items():
        # stacking data points into batches
        batch_premise = ntorch.stack([p for _, p, _ in items], 'batch')
        batch_hypothesis = ntorch.stack([h for _, _, h in items], 'batch')
        ids = [i for i, _, _ in items]

        # evaluating the model on that batch
        predictions = self.models[c](batch_premise, batch_hypothesis)

        # updating the result at the appropriate index
        for i, log_probs in zip(ids, predictions.unbind('batch')):
            res[self.sample_size * i + counts[i]] = log_probs.values[correct[i]]
            counts[i] += 1

    # reforming and averaging the results for each sample
    res = torch.stack(res, dim=0).reshape(orig_batch_size, self.sample_size)
    res = ntorch.tensor(res, names=('batch', 'sample'))

    # computing a surrogate objective for REINFORCE
    # https://pyro.ai/examples/svi_part_iii.html
    q_log_prob = latent_dist.log_prob(samples)
    surrogate_objective = (q_log_prob * res.detach() + res).mean('sample')

    # adding on the KL regularizing term
    ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
    uniform_dist = ds.Categorical(logits=ones, dim_logit='latent')
    kl = ds.kl_divergence(latent_dist, uniform_dist) * self.kl_importance

    # reporting the surrogate objective as well as the actual elbo
    loss = -(surrogate_objective - kl).mean()
    elbo = -(res.detach().mean('sample') - kl.detach()).mean()
    return loss, elbo
def decode(self, premise, hypothesis):
    label = ntorch.ones(premise.shape['batch'], names=('batch',))
    preds = 0
    for i in range(1, self.num_samples + 1):
        q = self.q(premise, hypothesis, label * i).rename('label', 'latent').exp()
        for c in range(len(self.models)):
            log_probs = self.models[c](premise, hypothesis)
            preds += log_probs * q.get('latent', c) / len(self.models)
    return preds / self.num_samples, q
def loss_function(recon_x, x, var_posterior):
    # Reconstruction term: binary cross entropy summed over pixels and batch
    BCE = recon_x.reduce2(
        x.stack(h=("ch", "height", "width")),
        lambda x, y: F.binary_cross_entropy(x, y, reduction="sum"),
        ("batch", "x"),
    )
    # KL term: divergence between the variational posterior and a standard normal prior
    prior = ndistributions.Normal(ntorch.zeros(dict(batch=1, z=1)),
                                  ntorch.ones(dict(batch=1, z=1)))
    KLD = ndistributions.kl_divergence(var_posterior, prior).sum()
    return BCE + KLD
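# A hedged training-step sketch using the loss above. `model`, `optimizer`, and `loader`
# are assumed names, and the model is assumed to return (recon_x, var_posterior).
for x, _ in loader:
    optimizer.zero_grad()
    recon_x, var_posterior = model(x)
    loss = loss_function(recon_x, x, var_posterior)
    # unwrap to the underlying tensor in case the loss comes back as a NamedTensor
    total = loss.values if hasattr(loss, "values") else loss
    total.backward()
    optimizer.step()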
def infer(self, premise, hypothesis):
    label = ntorch.ones(premise.shape['batch'], names=('batch',)).long()
    predictions = 0
    for i in range(1, 4):
        q = self.q(premise, hypothesis, label * i).rename('label', 'latent').exp()
        for c in range(len(self.models)):
            log_probs = self.models[c](premise, hypothesis)
            predictions += log_probs * q.get('latent', c) / len(self.models)
    return predictions / 3
def elbo_exact(self, premise, hypothesis, label):
    # computing the q distribution: p(c | a, b, y)
    q = self.q(premise, hypothesis, label).rename('label', 'latent')
    latent_dist = ds.Categorical(logits=q, dim_logit='latent')

    one_hot_label = torch.eye(4).index_select(0, label.values)
    one_hot_label = ntorch.tensor(one_hot_label, names=('batch', 'label'))

    # computing p(y | a, b, c) for every c
    objective = 0
    q = q.exp()
    for c in range(len(self.models)):
        log_probs = self.models[c](premise, hypothesis)
        model_probs = q.get('latent', c)
        objective += (log_probs * one_hot_label).sum('label') * model_probs

    # adding on the KL regularizing term
    ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
    uniform_dist = ds.Categorical(logits=ones, dim_logit='latent')
    kl = ds.kl_divergence(latent_dist, uniform_dist) * self.kl_importance

    loss = -(objective.mean() - kl.mean())
    return loss, loss.detach()
def exact(self, premise, hypothesis, label):
    q = self.q(premise, hypothesis, label).rename('label', 'latent')
    latent_dist = nds.Categorical(logits=q, dim_logit='latent')

    one_hot = torch.eye(4, out=torch.cuda.FloatTensor()).index_select(0, label.values)
    one_hot = ntorch.tensor(one_hot, names=('batch', 'label'))

    # Calculate p(y | a, b, c) across all K models
    surrogate = 0
    q = q.exp()
    for c in range(len(self.models)):
        log_probs = self.models[c](premise, hypothesis)
        model_probs = q.get('latent', c)
        surrogate += (log_probs * one_hot).sum('label') * model_probs

    # KL regularization
    ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
    prior = nds.Categorical(logits=ones, dim_logit='latent')
    KLD = nds.kl_divergence(latent_dist, prior) * self.kl_weight

    loss = KLD.mean() - surrogate._tensor.mean()  # -(surrogate.mean() - KLD.mean())
    return loss, loss.detach()
def forward(self, source, target=None, teacher_forcing=1., max_length=20, encode_only=False):
    if target is not None:
        max_length = target.shape["trgSeqlen"]

    # Encode the source sequence
    x = self.in_embedding(source)
    out, (h, c) = self.encoder(x)

    # Concatenate the final states of the two encoder layers
    h = ntorch.cat((h[{"layers": slice(0, 1)}], h[{"layers": slice(1, 2)}]),
                   dim="rnnOutput")
    c = ntorch.cat((c[{"layers": slice(0, 1)}], c[{"layers": slice(1, 2)}]),
                   dim="rnnOutput")

    if self.attention:
        def attend(x_t):
            alpha = out.dot("rnnOutput", x_t).softmax("srcSeqlen")
            context = alpha.dot("srcSeqlen", out)
            return context

    batch_size = source.shape["batch"]
    output_dists = ntorch.zeros((batch_size, max_length, self.out_vocab_size),
                                names=("batch", "trgSeqlen", "outVocab"),
                                device=device)
    output_seq = ntorch.zeros((batch_size, max_length),
                              names=("batch", "trgSeqlen"),
                              device=device)
    # for the above, should set the zeroth index to SOS
    score = ntorch.zeros((batch_size, max_length),
                         names=("batch", "trgSeqlen"),
                         device=device)

    if encode_only:
        return score, out, (h, c), output_seq

    for t in range(max_length - 1):
        if t == 0:
            # always start with the SOS token
            next_input = ntorch.ones((batch_size, 1),
                                     names=("batch", "trgSeqlen"),
                                     device=device).long()
            next_input *= EN.vocab.stoi["<s>"]
        elif np.random.random() < teacher_forcing and target is not None:
            # teacher forcing: feed the gold token
            next_input = target[{"trgSeqlen": slice(t, t + 1)}]
        else:
            next_input = sample

        x_t, (h, c) = self.decoder(self.out_embedding(next_input), (h, c))

        # separate syntax decoder, initialized fresh at t == 0
        if t == 0:
            syntax_out, (s_h, s_c) = self.syntax_decoder(self.out_embedding(next_input))
        else:
            syntax_out, (s_h, s_c) = self.syntax_decoder(self.out_embedding(next_input),
                                                         (s_h, s_c))

        if self.attention:
            fc = self.fc(ntorch.cat([attend(x_t), x_t], dim="rnnOutput"))
        else:
            fc = self.fc(x_t)

        s_fc = self.syntax_fc(syntax_out).sum("trgSeqlen")
        s_fc = s_fc.log_softmax("outVocab")

        dist = ntorch.distributions.Categorical(logits=fc, dim_logit="outVocab")
        sample = dist.sample()
        fc = fc.sum("trgSeqlen")

        next_token = sample if target is None else target[{"trgSeqlen": slice(t + 1, t + 2)}]  # TODO

        # this is the line where the syntax decoder's distribution is combined in
        fc = fc.log_softmax("outVocab") + s_fc

        indices = next_token.sum("trgSeqlen").rename("batch", "indices")
        batch_indices = ntorch.tensor(
            torch.tensor(np.arange(fc.shape["batch"]), device=device),
            ("batchIndices",))
        newsc = fc.index_select("outVocab", indices).index_select(
            "indices", batch_indices).get("batchIndices", 0)

        score[{"trgSeqlen": t + 1}] = newsc
        output_seq[{"trgSeqlen": t + 1}] = next_token.sum("trgSeqlen")  # TODO
        output_dists[{"trgSeqlen": t + 1}] = fc  # TODO

    return output_seq, output_dists, score
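# A hedged call sketch for the forward pass above; `model`, `src_batch`, and `trg_batch`
# are assumed names from the surrounding training/evaluation script.
output_seq, output_dists, score = model(src_batch, target=trg_batch, teacher_forcing=0.5)
# sampling-based decoding at inference time (no target, no teacher forcing):
output_seq, _, score = model(src_batch, target=None, teacher_forcing=0.0, max_length=30)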