def score_candidates(self, batch, cand_vecs, cand_encs=None):
    """Score the candidates for every example in ``batch``.

    :param batch: batch object; ``text_vec`` and ``memory_vecs`` are read.
    :param cand_vecs: vectorized candidates handed to the model. Ignored
        (the model receives ``cands=None``) when ``cand_encs`` is given.
    :param cand_encs: optional pre-computed candidate encodings. When not
        None they are scored in place of the model's own candidate output.
    :return: the result of ``self._score(context_h, cands_h)``.
    """
    memories = None
    if self.opt['use_memories'] and batch.memory_vecs is not None:
        # Only pad the memories when at least one of them is non-empty.
        if sum(len(mem) for mem in batch.memory_vecs):
            memories = padded_3d(
                batch.memory_vecs,
                use_cuda=self.use_cuda,
                pad_idx=self.NULL_IDX,
            )

    pre_encoded = cand_encs is not None
    context_h, cands_h = self.model(
        xs=batch.text_vec,
        mems=memories,
        # Candidates were encoded ahead of time: do not re-encode them.
        cands=None if pre_encoded else cand_vecs,
    )
    return self._score(context_h, cand_encs if pre_encoded else cands_h)
def make_candidate_vecs(self, cands):
    """Vectorize a fixed candidate list in chunks of up to 512.

    :param cands: sliceable sequence of candidates; each chunk is passed to
        ``self.vectorize_fixed_candidates``.
    :return: ``padded_3d([vectors], ...)`` with the leading dimension
        squeezed away, using the dtype of the first vector.
    """
    chunk_starts = range(0, len(cands), 512)
    chunks = [cands[start:start + 512] for start in chunk_starts]
    print("[ Vectorizing fixed candidate set ({} batch(es) of up to 512) ]"
          "".format(len(chunks)))

    vectors = []
    for chunk in tqdm(chunks):
        vectors += self.vectorize_fixed_candidates(chunk)

    # NOTE(review): an empty `cands` fails on vectors[0] below, exactly as
    # in the previous implementation.
    stacked = padded_3d([vectors], dtype=vectors[0].dtype)
    return stacked.squeeze(0)
def set_vocab_candidates(self, shared):
    """Load the tokens from the vocab as candidates.

    Sets three attributes:

    * ``self.vocab_candidates``: list of token strings
    * ``self.vocab_candidate_vecs``: padded tensor of the vectorized tokens
    * ``self.vocab_candidate_encs``: encodings of those vectors, loaded from
      ``<model_file>.vocab.encs`` when that file exists, otherwise computed
      in batches of up to 512 and saved to that path.

    All three are taken from ``shared`` when it is truthy, and all three are
    set to None when neither ``opt['candidates']`` nor
    ``opt['eval_candidates']`` equals ``'vocab'``.

    :param shared: mapping with the three ``vocab_candidate*`` keys, or a
        falsy value to build them here.
    """
    self.opt['encode_candidate_vecs'] = True

    if shared:
        self.vocab_candidates = shared['vocab_candidates']
        self.vocab_candidate_vecs = shared['vocab_candidate_vecs']
        self.vocab_candidate_encs = shared['vocab_candidate_encs']
        return

    wanted = (self.opt['candidates'], self.opt['eval_candidates'])
    if 'vocab' not in wanted:
        self.vocab_candidates = None
        self.vocab_candidate_vecs = None
        self.vocab_candidate_encs = None
        return

    # Dictionary index 0 is deliberately skipped.
    tokens, token_vecs = [], []
    for idx in range(1, len(self.dict)):
        token = self.dict[idx]
        tokens.append(token)
        token_vecs.append(
            self._vectorize_text(
                token,
                add_start=True,
                add_end=True,
                truncate=self.label_truncate,
            )
        )
    self.vocab_candidates = tokens
    self.vocab_candidate_vecs = padded_3d([token_vecs]).squeeze(0)
    print("[ Loaded fixed candidate set (n = {}) from vocabulary ]"
          "".format(len(self.vocab_candidates)))

    enc_path = self.opt.get('model_file') + '.vocab.encs'
    if os.path.isfile(enc_path):
        self.vocab_candidate_encs = self.load_candidates(
            enc_path, cand_type='vocab encodings')
    else:
        starts = range(0, len(self.vocab_candidate_vecs), 512)
        vec_batches = [self.vocab_candidate_vecs[s:s + 512] for s in starts]
        print("[ Vectorizing vocab candidates ({} batch(es) of up "
              "to 512) ]".format(len(vec_batches)))
        encoded = []
        for vec_batch in tqdm(vec_batches):
            encoded.append(self.encode_candidates(vec_batch))
        self.vocab_candidate_encs = torch.cat(encoded, 0)
        self.save_candidates(self.vocab_candidate_encs, enc_path,
                             cand_type='vocab encodings')

    if self.use_cuda:
        self.vocab_candidate_vecs = self.vocab_candidate_vecs.cuda()
        self.vocab_candidate_encs = self.vocab_candidate_encs.cuda()
def _build_label_cands(self, batch): """Convert batch.candidate_vecs to 3D padded vector.""" if not batch.candidates: return None, None cand_inds = [i for i in range(len(batch.candidates)) if batch.candidates[i]] cands = padded_3d(batch.candidate_vecs, pad_idx=self.NULL_IDX, use_cuda=self.use_cuda) return cands, cand_inds
def score_candidates(self, batch, cand_vecs):
    """Run the model on the batch text, memories and candidates.

    :param batch: batch object; ``text_vec`` and ``memory_vecs`` are read.
    :param cand_vecs: vectorized candidates, forwarded as ``cands``.
    :return: whatever ``self.model(xs=..., mems=..., cands=...)`` returns.
    """
    memories = None
    if self.opt['use_memories'] and batch.memory_vecs is not None:
        # Memories are only padded when at least one of them is non-empty.
        if sum(len(mem) for mem in batch.memory_vecs):
            memories = padded_3d(batch.memory_vecs, use_cuda=self.use_cuda)

    return self.model(xs=batch.text_vec, mems=memories, cands=cand_vecs)
def eval_step(self, batch):
    """Predict one token per example with the recurrent entity network.

    When the number of padded contexts differs from ``self.batch_size`` the
    network is not run; token ids are instead drawn at random with
    ``np.random.choice(self.dictionnary_size, ...)``.

    :param batch: batch object; ``text_vec`` and ``memory_vecs`` are read.
    :return: ``Output`` wrapping the predicted text split on single spaces.
    """
    # NOTE(review): the network is not switched to eval mode and autograd is
    # not disabled here -- confirm this is intended.
    questions, contexts = batch.text_vec, padded_3d(batch.memory_vecs)
    n_examples = contexts.shape[0]

    if n_examples == self.batch_size:
        logits = self.recurrent_entity_network(questions, contexts)
        token_ids = logits.argmax(dim=1)
    else:
        # Batch size mismatch: fall back to random token ids.
        token_ids = np.random.choice(self.dictionnary_size, size=n_examples)

    return Output(self.dict.vec2txt(token_ids).split(" "))
def make_candidate_vecs(self, cands):
    """Encode a fixed candidate list with the model, 200 at a time.

    Each chunk is vectorized with ``self._vectorize_text``, padded, converted
    with ``to_bert_input`` and passed through ``self.model``; the second
    element of the model output is kept as the candidate embedding.

    The forward pass runs under ``torch.no_grad()``: the embeddings are
    detached and moved to CPU straight away, so building an autograd graph
    for every chunk would only cost memory.

    :param cands: sliceable sequence of candidate texts.
    :return: all chunk embeddings concatenated along dimension 0 (on CPU).
    """
    cand_batches = [cands[i:i + 200] for i in range(0, len(cands), 200)]
    cand_vecs = []
    progress = tqdm.tqdm(
        cand_batches,
        desc="[ Vectorizing fixed candidates set from "
             "({} batch(es) of up to 200) ]"
             "".format(len(cand_batches)),
    )
    for batch in progress:
        token_idx = [
            self._vectorize_text(
                cand,
                add_start=True,
                add_end=True,
                truncate=self.opt["label_truncate"],
            )
            for cand in batch
        ]
        padded_input = padded_3d([token_idx]).squeeze(0)
        token_idx_cands, segment_idx_cands, mask_cands = to_bert_input(
            padded_input, self.NULL_IDX)
        with torch.no_grad():
            # Only the candidate side of the model is used here.
            _, embedding_cands = self.model(
                None, None, None,
                token_idx_cands, segment_idx_cands, mask_cands)
        cand_vecs.append(embedding_cands.cpu().detach())
    # NOTE(review): with an empty `cands` this concatenates an empty list,
    # as the previous implementation did.
    return torch.cat(cand_vecs, 0)
def train_step(self, batch):
    """Run one optimization step of the recurrent entity network.

    Computes the loss with ``self.criterion``, logs the loss and a histogram
    of every named parameter to ``self.writer``, back-propagates, steps the
    optimizer and increments ``self.batch_iter``.

    :param batch: batch object; ``text_vec``, ``label_vec`` and
        ``memory_vecs`` are read.
    :return: ``Output`` wrapping the predicted text split on single spaces.
    """
    network = self.recurrent_entity_network
    network.train()

    questions = batch.text_vec
    answers = batch.label_vec
    contexts = padded_3d(batch.memory_vecs)

    self.optimizer.zero_grad()
    logits = network(questions, contexts)
    pred = logits.argmax(dim=1)
    loss = self.criterion(logits, answers.squeeze(1))

    # NOTE(review): `losses` is not defined in this method; presumably a
    # module-level list -- confirm.
    losses.append(loss.item())

    # Logging happens before backward(), so only parameter values (not
    # gradients) are recorded for this step.
    self.writer.add_scalar("data/loss", loss, self.batch_iter)
    for name, param in network.named_parameters():
        values = param.clone().cpu().data.numpy()
        self.writer.add_histogram(name, values, self.batch_iter)

    loss.backward(retain_graph=True)
    self.optimizer.step()
    self.batch_iter += 1

    return Output(self.dict.vec2txt(pred).split(" "))
def _build_candidates(self, batch, source, mode): """Build a candidate set for this batch :param batch: a Batch object (defined in torch_agent.py) :param source: the source from which candidates should be built, one of ['batch', 'inline', 'fixed'] :param mode: 'train' or 'eval' :return: tuple of tensors (label_inds, cands, cand_vecs) label_inds: A [bsz] LongTensor of the indices of the labels for each example from its respective candidate set cands: A [num_cands] list of (text) candidates OR a [batchsize] list of such lists if source=='inline' cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized candidates OR a [batchsize, num_cands, seqlen] LongTensor if source=='inline' Possible sources of candidates: * batch: the set of all labels in this batch Use all labels in the batch as the candidate set (with all but the example's label being treated as negatives). Note: with this setting, the candidate set is identical for all examples in a batch. This option may be undesirable if it is possible for duplicate labels to occur in a batch, since the second instance of the correct label will be treated as a negative. * inline: batch_size lists, one list per example If each example comes with a list of possible candidates, use those. Note: With this setting, each example will have its own candidate set. * fixed: one global candidate list, provided in a file from the user If self.fixed_candidates is not None, use a set of fixed candidates for all examples. Note: this setting is not recommended for training unless the universe of possible candidates is very small. * vocab: one global candidate list, extracted from the vocabulary with the exception of self.NULL_IDX. """ label_vecs = batch.label_vec # [bsz] list of lists of LongTensors label_inds = None batchsize = batch.text_vec.shape[0] if label_vecs is not None: assert label_vecs.dim() == 2 if source == 'batch': warn_once( '[ Executing {} mode with batch labels as set of candidates. 
]' ''.format(mode)) if batchsize == 1: warn_once( "[ Warning: using candidate source 'batch' and observed a " "batch of size 1. This may be due to uneven batch sizes at " "the end of an epoch. ]") if label_vecs is None: raise ValueError( "If using candidate source 'batch', then batch.label_vec cannot be " "None.") cands = batch.labels cand_vecs = label_vecs label_inds = label_vecs.new_tensor(range(batchsize)) elif source == 'inline': warn_once( '[ Executing {} mode with provided inline set of candidates ]' ''.format(mode)) if batch.candidate_vecs is None: raise ValueError( "If using candidate source 'inline', then batch.candidate_vecs " "cannot be None. If your task does not have inline candidates, " "consider using one of --{m}={{'batch','fixed','vocab'}}." "".format(m='candidates' if mode == 'train' else 'eval-candidates')) cands = batch.candidates cand_vecs = padded_3d(batch.candidate_vecs, use_cuda=self.use_cuda) if label_vecs is not None: label_inds = label_vecs.new_empty((batchsize)) for i, label_vec in enumerate(label_vecs): label_vec_pad = label_vec.new_zeros(cand_vecs[i].size(1)) label_vec_pad[0:label_vec.size(0)] = label_vec label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad) elif source == 'fixed': warn_once( "[ Executing {} mode with a common set of fixed candidates " "(n = {}). ]".format(mode, len(self.fixed_candidates))) if self.fixed_candidates is None: raise ValueError( "If using candidate source 'fixed', then you must provide the path " "to a file of candidates with the flag --fixed-candidates-path" ) cands = self.fixed_candidates cand_vecs = self.fixed_candidate_vecs if label_vecs is not None: label_inds = label_vecs.new_empty((batchsize)) for i, label_vec in enumerate(label_vecs): label_inds[i] = self._find_match(cand_vecs, label_vec) elif source == 'vocab': warn_once( '[ Executing {} mode with tokens from vocabulary as candidates. 
]' ''.format(mode)) cands = self.vocab_candidates cand_vecs = self.vocab_candidate_vecs if label_vecs is not None: label_inds = label_vecs.new_empty((batchsize)) for i, label_vec in enumerate(label_vecs): label_inds[i] = self._find_match(cand_vecs, label_vec) return (cands, cand_vecs, label_inds)
def _build_candidates(self, batch, source, mode): """ Build a candidate set for this batch. :param batch: a Batch object (defined in torch_agent.py) :param source: the source from which candidates should be built, one of ['batch', 'batch-all-cands', 'inline', 'fixed'] :param mode: 'train' or 'eval' :return: tuple of tensors (label_inds, cands, cand_vecs) label_inds: A [bsz] LongTensor of the indices of the labels for each example from its respective candidate set cands: A [num_cands] list of (text) candidates OR a [batchsize] list of such lists if source=='inline' cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized candidates OR a [batchsize, num_cands, seqlen] LongTensor if source=='inline' Possible sources of candidates: * batch: the set of all labels in this batch Use all labels in the batch as the candidate set (with all but the example's label being treated as negatives). Note: with this setting, the candidate set is identical for all examples in a batch. This option may be undesirable if it is possible for duplicate labels to occur in a batch, since the second instance of the correct label will be treated as a negative. * batch-all-cands: the set of all candidates in this batch Use all candidates in the batch as candidate set. Note 1: This can result in a very large number of candidates. Note 2: In this case we will deduplicate candidates. Note 3: just like with 'batch' the candidate set is identical for all examples in a batch. * inline: batch_size lists, one list per example If each example comes with a list of possible candidates, use those. Note: With this setting, each example will have its own candidate set. * fixed: one global candidate list, provided in a file from the user If self.fixed_candidates is not None, use a set of fixed candidates for all examples. Note: this setting is not recommended for training unless the universe of possible candidates is very small. 
* vocab: one global candidate list, extracted from the vocabulary with the exception of self.NULL_IDX. """ label_vecs = batch.label_vec # [bsz] list of lists of LongTensors label_inds = None batchsize = batch.text_vec.shape[0] if label_vecs is not None: assert label_vecs.dim() == 2 if source == 'batch': warn_once( '[ Executing {} mode with batch labels as set of candidates. ]' ''.format(mode)) if batchsize == 1: warn_once( "[ Warning: using candidate source 'batch' and observed a " "batch of size 1. This may be due to uneven batch sizes at " "the end of an epoch. ]") if label_vecs is None: raise ValueError( "If using candidate source 'batch', then batch.label_vec cannot be " "None.") cands = batch.labels cand_vecs = label_vecs label_inds = label_vecs.new_tensor(range(batchsize)) elif source == 'batch-all-cands': warn_once( '[ Executing {} mode with all candidates provided in the batch ]' ''.format(mode)) if batch.candidate_vecs is None: raise ValueError( "If using candidate source 'batch-all-cands', then batch." "candidate_vecs cannot be None. If your task does not have " "inline candidates, consider using one of " "--{m}={{'batch','fixed','vocab'}}." 
"".format(m='candidates' if mode == 'train' else 'eval-candidates')) # initialize the list of cands with the labels cands = [] all_cands_vecs = [] # dictionary used for deduplication cands_to_id = {} for i, cands_for_sample in enumerate(batch.candidates): for j, cand in enumerate(cands_for_sample): if cand not in cands_to_id: cands.append(cand) cands_to_id[cand] = len(cands_to_id) all_cands_vecs.append(batch.candidate_vecs[i][j]) cand_vecs, _ = padded_tensor( all_cands_vecs, self.NULL_IDX, use_cuda=self.use_cuda, fp16friendly=self.fp16, ) label_inds = label_vecs.new_tensor( [cands_to_id[label] for label in batch.labels]) elif source == 'inline': warn_once( '[ Executing {} mode with provided inline set of candidates ]' ''.format(mode)) if batch.candidate_vecs is None: raise ValueError( "If using candidate source 'inline', then batch.candidate_vecs " "cannot be None. If your task does not have inline candidates, " "consider using one of --{m}={{'batch','fixed','vocab'}}." "".format(m='candidates' if mode == 'train' else 'eval-candidates')) cands = batch.candidates cand_vecs = padded_3d( batch.candidate_vecs, self.NULL_IDX, use_cuda=self.use_cuda, fp16friendly=self.fp16, ) if label_vecs is not None: label_inds = label_vecs.new_empty((batchsize)) bad_batch = False for i, label_vec in enumerate(label_vecs): label_vec_pad = label_vec.new_zeros( cand_vecs[i].size(1)).fill_(self.NULL_IDX) if cand_vecs[i].size(1) < len(label_vec): label_vec = label_vec[0:cand_vecs[i].size(1)] label_vec_pad[0:label_vec.size(0)] = label_vec label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad) if label_inds[i] == -1: bad_batch = True if bad_batch: if self.ignore_bad_candidates and not self.is_training: label_inds = None else: raise RuntimeError( 'At least one of your examples has a set of label candidates ' 'that does not contain the label. 
To ignore this error ' 'set `--ignore-bad-candidates True`.') elif source == 'fixed': warn_once( "[ Executing {} mode with a common set of fixed candidates " "(n = {}). ]".format(mode, len(self.fixed_candidates))) if self.fixed_candidates is None: raise ValueError( "If using candidate source 'fixed', then you must provide the path " "to a file of candidates with the flag --fixed-candidates-path" ) cands = self.fixed_candidates cand_vecs = self.fixed_candidate_vecs if label_vecs is not None: label_inds = label_vecs.new_empty((batchsize)) bad_batch = False for i, label_vec in enumerate(label_vecs): label_vec_pad = label_vec.new_zeros( cand_vecs[i].size(0)).fill_(self.NULL_IDX) if cand_vecs[i].size(0) < len(label_vec): label_vec = label_vec[0:cand_vecs[i].size(1)] label_vec_pad[0:label_vec.size(0)] = label_vec label_inds[i] = self._find_match(cand_vecs, label_vec_pad) if label_inds[i] == -1: bad_batch = True if bad_batch: if self.ignore_bad_candidates and not self.is_training: label_inds = None else: raise RuntimeError( 'At least one of your examples has a set of label candidates ' 'that does not contain the label. To ignore this error ' 'set `--ignore-bad-candidates True`.') elif source == 'vocab': warn_once( '[ Executing {} mode with tokens from vocabulary as candidates. ]' ''.format(mode)) cands = self.vocab_candidates cand_vecs = self.vocab_candidate_vecs # NOTE: label_inds is None here, as we will not find the label in # the set of vocab candidates else: raise Exception("Unrecognized source: %s" % source) return (cands, cand_vecs, label_inds)
def batchify(self, obs_batch, sort=False):
    """
    Create a batch of valid observations from an unchecked batch.

    A valid observation is one for which ``self.is_valid(ex)`` is true.

    Returns a namedtuple Batch. See original definition above for in-depth
    explanation of each field.

    If you want to include additional fields in the batch, you can subclass
    this function and return your own "Batch" namedtuple: copy the Batch
    namedtuple at the top of this class, and then add whatever additional
    fields that you want to be able to access. You can then call
    super().batchify(...) to set up the original fields and then set up the
    additional fields in your subclass and return that batch instead.

    :param obs_batch: List of vectorized observations
    :param sort: Accepted for interface compatibility but ignored: this
        implementation never reorders the examples.
    :return: an empty ``Batch()`` when there are no (valid) observations,
        otherwise a ``Batch`` with the text, label, candidate and image
        fields filled in (fields with no data are None).
    """
    if len(obs_batch) == 0:
        return Batch()

    valid_obs = [(i, ex) for i, ex in enumerate(obs_batch)
                 if self.is_valid(ex)]

    if not valid_obs:
        return Batch()

    valid_inds, exs = zip(*valid_obs)

    # TEXT
    xs, x_lens, context_lens, floors = None, None, None, None
    if any('text_vec' in ex for ex in exs):
        _xs = [ex.get('text_vec', [self.EMPTY]) for ex in exs]
        xs = padded_3d(
            _xs, self.NULL_IDX, self.use_cuda,
            fp16friendly=self.opt.get('fp16'),
        )
        # Non-padding tokens per utterance.
        x_lens = (xs != self.NULL_IDX).sum(dim=-1)  # bsz, context_len
        # Non-empty utterances per example.
        context_lens = (x_lens != 0).sum(dim=-1)  # bsz
        floors, _ = padded_tensor(
            [make_floor(c_len.item()) for c_len in context_lens],
            use_cuda=self.use_cuda)

    # The text tensor is [bsz, context_len, utt_len] in this agent, so
    # examples are not sorted by text length.

    # LABELS
    labels_avail = any('labels_vec' in ex for ex in exs)
    some_labels_avail = (labels_avail or
                         any('eval_labels_vec' in ex for ex in exs))

    ys, y_lens, labels = None, None, None
    if some_labels_avail:
        field = 'labels' if labels_avail else 'eval_labels'

        label_vecs = [ex.get(field + '_vec', self.EMPTY) for ex in exs]
        labels = [ex.get(field + '_choice') for ex in exs]

        # padded_tensor returns the lengths alongside the padded labels, so
        # they are not computed separately beforehand.
        ys, y_lens = padded_tensor(label_vecs, self.NULL_IDX, self.use_cuda,
                                   fp16friendly=self.opt.get('fp16'))
        y_lens = torch.LongTensor(y_lens)
        if self.use_cuda:
            y_lens = y_lens.cuda()
        # Examples are not sorted by label length for this agent either.

    # LABEL_CANDIDATES
    cands, cand_vecs = None, None
    if any('label_candidates_vecs' in ex for ex in exs):
        cands = [ex.get('label_candidates', None) for ex in exs]
        cand_vecs = [ex.get('label_candidates_vecs', None) for ex in exs]

    # IMAGE
    imgs = None
    if any('image' in ex for ex in exs):
        imgs = [ex.get('image', None) for ex in exs]

    return Batch(text_vec=xs, text_lengths=x_lens,
                 context_lens=context_lens, floors=floors,
                 label_vec=ys, label_lengths=y_lens, labels=labels,
                 valid_indices=valid_inds,
                 candidates=cands, candidate_vecs=cand_vecs,
                 image=imgs,
                 observations=exs)