def __call__(self, samples):
    '''
    Run BART conditional generation over a list of samples, batch by batch.

    samples: [dict]: [{'input_text':'text to condition on'}]
    returns: [dict]: [{'input_text':'text to condition on',
                       'pred_text':"text from BARTs decoder"}]
    '''
    samples = self.BART_numericalise_transform(samples)
    # BUGFIX: always split samples into chunks. Previously, when
    # self.show_tqdm was False the loop iterated over the raw sample
    # dicts (pbar = samples), so each "chunk" was a single dict and the
    # inner comprehension iterated its keys instead of sample objects.
    batches = list(chunks(samples, self.chunk_size))
    if self.show_tqdm:
        batches = tqdm(batches, desc="BART is thinking:")
    for chunk in batches:
        # pad_sequence stacks along dim 0 -> (seq_len, batch); .T yields
        # the (batch, seq_len) layout generate() expects.
        input_tensor = torch.nn.utils.rnn.pad_sequence(
            [
                torch.tensor(sample_obj["input_ids"], dtype=torch.long)
                for sample_obj in chunk
            ],
            padding_value=self.PAD).T.to(self.device)
        attention_mask = (input_tensor != self.PAD).type(torch.float).to(
            self.device)
        output_ids = self.BART_conditional_generator.generate(
            input_tensor,
            attention_mask=attention_mask,
            pad_token_id=self.PAD,
            num_beams=4,
            max_length=512,
            early_stopping=False)
        # attach the generated token ids to each sample in the chunk
        for sample_obj, single_out in zip(chunk, output_ids):
            sample_obj["pred_ids"] = single_out.tolist()
        # release per-chunk tensors promptly to keep device memory flat
        del input_tensor
        del attention_mask
        del output_ids
    samples = self.BART_denumericalise_transform(samples)
    return samples
def __call__(self, samples):
    '''
    Score each sample with the BERT reranker, in batches of
    self.batch_size, attaching the relevance logit as "score".

    samples: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1]}]
    returns: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1],
                       "score":0.56}]
    '''
    # NOTE: removed dead `all_scores = torch.zeros((0, 1), ...)` — it was
    # allocated on the device every call and never read.
    for sample_obj_batch in chunks(samples, self.batch_size):
        with torch.no_grad():
            # (batch, seq_len) padded input ids
            input_tensor = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.tensor(sample_obj["input_ids"],
                                 dtype=torch.long,
                                 device=self.device)
                    for sample_obj in sample_obj_batch
                ],
                padding_value=self.PAD).T
            type_ids = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.tensor(sample_obj["type_ids"], dtype=torch.long)
                    for sample_obj in sample_obj_batch
                ],
                padding_value=self.PAD).T.to(self.device)
            attention_mask = (input_tensor != self.PAD).type(
                torch.float).to(self.device)
            # [0][:, 1]: take the logits tensor from the model output and
            # keep the score of class 1 (the "relevant" class).
            scores = self.BERT_Reranker(
                input_tensor,
                attention_mask=attention_mask,
                token_type_ids=type_ids)[0][:, 1].tolist()
            for sample_obj, score in zip(sample_obj_batch, scores):
                sample_obj["score"] = score
    return samples
def __call__(self, samples):
    '''
    Score each pairwise sample with the duoBERT reranker. The score is
    the likelihood that document A is more relevant than document B, so
    a higher score is favorable for A.

    samples: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1], ...}]
    returns: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1],
                       'score':0.95, ...}]
    '''
    for sample_obj_batch in chunks(samples, self.batch_size):
        with torch.no_grad():
            # (batch, seq_len) padded input ids
            input_tensor = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.tensor(sample_obj["input_ids"], dtype=torch.long)
                    for sample_obj in sample_obj_batch
                ],
                padding_value=self.PAD).T.to(self.device)
            type_ids = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.tensor(sample_obj["type_ids"], dtype=torch.long)
                    for sample_obj in sample_obj_batch
                ],
                padding_value=self.PAD).T.to(self.device)
            attention_mask = (input_tensor != self.PAD).type(
                torch.float).to(self.device)
            # [0][:, 1]: logits tensor from the model output, class-1
            # ("A more relevant than B") score per pair.
            # (Removed the redundant `scores = outputs = ...` alias —
            # `outputs` was never used.)
            scores = self.duoBERT_Reranker(
                input_tensor,
                attention_mask=attention_mask,
                token_type_ids=type_ids)[0][:, 1].tolist()
            for sample_obj, score in zip(sample_obj_batch, scores):
                sample_obj["score"] = score
    return samples
def __init__(self, samples, slow_pipe, real_time_pipe, valid_sample_fn=None,
             sort_key_fn=None, batch_bucket_size=1, shuffle=False, **kwargs):
    '''
    Apply the slow (one-off) pipeline to `samples`, then optionally sort
    them by `sort_key_fn` (descending) and group the sorted order into
    buckets of `batch_bucket_size` for length-bucketed batching.

    samples: list of sample dicts fed through `slow_pipe`.
    slow_pipe: iterable of transforms applied once, here in __init__.
    real_time_pipe: transforms applied per-item later (stored only).
    valid_sample_fn: optional predicate; samples for which it returns a
        falsy value are dropped (only when sort_key_fn is given).
    sort_key_fn: optional key function used to sort samples.
    batch_bucket_size: bucket size for the sorted index chunks.
    shuffle: shuffle the middle buckets (first/last buckets stay put so
        the largest batch comes first, e.g. to fail fast on OOM).
    '''
    self.real_time_pipe = real_time_pipe
    self.PAD = 0
    pbar = tqdm(slow_pipe)
    for transform in pbar:
        pbar.set_description(transform.__class__.__name__)
        samples = transform(samples)
    self.samples = samples
    if sort_key_fn:
        assert batch_bucket_size < len(
            self.samples), 'Bucket size too large'
        # set, not list: membership is tested once per index below, and a
        # list made that filter O(n^2) over the whole dataset.
        flag_not_valid = set()
        items_keys = []
        for i in tqdm(range(len(self.samples)), desc='pre-sort processing'):
            sample = self.__getitem__(i)
            if valid_sample_fn is not None and not valid_sample_fn(sample):
                flag_not_valid.add(i)
            items_keys.append(sort_key_fn(sample))
        # descending order: largest key first
        sort_idxs = np.argsort(items_keys)[::-1]
        sort_idxs = [idx for idx in sort_idxs if idx not in flag_not_valid]
        idx_chunks = list(chunks(sort_idxs, batch_bucket_size))
        first_idx_batch_largest = idx_chunks[0]
        even_chunks = idx_chunks[1:-1]
        # guard: after invalid-sample filtering only one chunk may remain,
        # in which case idx_chunks[0] IS idx_chunks[-1]; without the guard
        # every index in it would be emitted twice.
        last_chunk = idx_chunks[-1] if len(idx_chunks) > 1 else []
        if shuffle:
            random.shuffle(even_chunks)
        bucketed_idxs = list(first_idx_batch_largest) + [
            item for sublist in even_chunks for item in sublist
        ] + list(last_chunk)
        self.samples = [self.samples[i] for i in bucketed_idxs]
    super().__init__()