import logging
import os

import numpy as np
import torch

# list2str, get_stats, tensor2list, and compute_metrics are project helper
# utilities assumed to be importable from elsewhere in the codebase.


def get_features(all_sents, tokenizer, max_seq_length, labels=None, verbose=True):
    """Encode a list of sentences into a list of (input_ids, segment_ids, input_mask, label) features."""
    features = []
    sent_lengths = []
    too_long_count = 0
    for sent_id, sent in enumerate(all_sents):
        if sent_id % 1000 == 0:
            logging.debug(f"Encoding sentence {sent_id}/{len(all_sents)}")
        sent_subtokens = [tokenizer.cls_token]
        for word in sent:
            word_tokens = tokenizer.text_to_tokens(word)
            sent_subtokens.extend(word_tokens)
        # Truncate, reserving one position for the [SEP] token appended below.
        if max_seq_length > 0 and len(sent_subtokens) + 1 > max_seq_length:
            sent_subtokens = sent_subtokens[: max_seq_length - 1]
            too_long_count += 1
        sent_subtokens.append(tokenizer.sep_token)
        sent_lengths.append(len(sent_subtokens))

        input_ids = [tokenizer.tokens_to_ids(t) for t in sent_subtokens]

        # The mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        input_mask = [1] * len(input_ids)
        segment_ids = [0] * len(input_ids)

        if verbose and sent_id < 2:
            logging.info("*** Example ***")
            logging.info(f"example {sent_id}: {sent}")
            logging.info("subtokens: %s" % " ".join(sent_subtokens))
            logging.info("input_ids: %s" % list2str(input_ids))
            logging.info("segment_ids: %s" % list2str(segment_ids))
            logging.info("input_mask: %s" % list2str(input_mask))
            # Parenthesize the conditional so the fallback string is what gets
            # formatted, not the whole log message.
            logging.info("label: %s" % (labels[sent_id] if labels else "**Not Provided**"))

        label = labels[sent_id] if labels else -1
        features.append([np.asarray(input_ids), np.asarray(segment_ids), np.asarray(input_mask), label])

    if max_seq_length > -1 and too_long_count > 0:
        logging.warning(
            f'Found {too_long_count} out of {len(all_sents)} sentences with more than {max_seq_length} subtokens. '
            f'Truncated long sentences from the end.'
        )
    if verbose:
        get_stats(sent_lengths)
    return features
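# --- Usage sketch for get_features (illustrative only) -------------------------
# The tokenizer below is a hypothetical stand-in for a NeMo-style tokenizer; it
# implements only the attributes/methods get_features() relies on (cls_token,
# sep_token, text_to_tokens, tokens_to_ids). A real run would use a trained
# subword tokenizer instead.
class _ToyTokenizer:
    cls_token = "[CLS]"
    sep_token = "[SEP]"

    def __init__(self):
        self._vocab = {}

    def text_to_tokens(self, text):
        # Naive whitespace tokenization stands in for real subword tokenization.
        return text.lower().split()

    def tokens_to_ids(self, token):
        # Assign ids on first sight; a real tokenizer maps into a fixed vocabulary.
        return self._vocab.setdefault(token, len(self._vocab))


# Each sentence is a list of words. verbose=False sidesteps the list2str and
# get_stats helpers, which are not defined in this sketch.
# feats = get_features(
#     [["hello", "world"], ["a", "longer", "example", "sentence"]],
#     _ToyTokenizer(),
#     max_seq_length=8,
#     labels=[0, 1],
#     verbose=False,
# )
# input_ids, segment_ids, input_mask, label = feats[0]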
def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
    """
    Called at the end of validation to aggregate outputs.

    Args:
        outputs: list of individual outputs of each validation step.
    """
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    preds = torch.cat([x['eval_tensors']['preds'] for x in outputs])
    labels = torch.cat([x['eval_tensors']['labels'] for x in outputs])

    all_preds = []
    all_labels = []
    if torch.distributed.is_initialized():
        # Gather predictions and labels from every worker so that rank 0 can
        # compute metrics over the full validation set.
        world_size = torch.distributed.get_world_size()
        for _ in range(world_size):
            all_preds.append(torch.empty_like(preds))
            all_labels.append(torch.empty_like(labels))
        torch.distributed.all_gather(all_preds, preds)
        torch.distributed.all_gather(all_labels, labels)
    else:
        all_preds.append(preds)
        all_labels.append(labels)

    tensorboard_logs = {}
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        preds = []
        labels = []
        for p in all_preds:
            preds.extend(tensor2list(p))
        for l in all_labels:
            labels.extend(tensor2list(l))
        tensorboard_logs = compute_metrics(self.task_name, np.array(preds), np.array(labels))
        val_name = self._validation_names[dataloader_idx].upper()
        logging.info(f'{val_name} evaluation: {tensorboard_logs}')

        # Write labels and predictions to a file if output_dir is specified in the config.
        output_dir = self._cfg.output_dir
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            filename = os.path.join(output_dir, f'{self.task_name}_{val_name}.txt')
            logging.info(f'Saving labels and predictions to {filename}')
            with open(filename, 'w') as f:
                f.write('labels\t' + list2str(labels) + '\n')
                f.write('preds\t' + list2str(preds) + '\n')

    tensorboard_logs['val_loss'] = avg_loss
    for key in tensorboard_logs:
        self.log(f'{key}', tensorboard_logs[key], prog_bar=True)
    return {'val_loss': avg_loss, 'log': tensorboard_logs}
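# --- Illustrative sketch of the `outputs` payload -------------------------------
# Each element mirrors what a single validation step is assumed to return for
# this method: a scalar 'val_loss' plus prediction/label tensors under
# 'eval_tensors'. Key names follow the code above, not an official API; `model`
# below is a hypothetical instance of the class defining multi_validation_epoch_end.
example_outputs = [
    {
        'val_loss': torch.tensor(0.42),
        'eval_tensors': {'preds': torch.tensor([0, 1, 1]), 'labels': torch.tensor([0, 1, 0])},
    },
    {
        'val_loss': torch.tensor(0.38),
        'eval_tensors': {'preds': torch.tensor([1]), 'labels': torch.tensor([1])},
    },
]
# model.multi_validation_epoch_end(example_outputs, dataloader_idx=0)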