def mask_and_tensorize(self, batch):
    batch = list(batch)
    if not batch:
        return torch.Tensor()
    masked_sources = []
    masked_targets = []
    for tokens in batch:
        dec_source, dec_target = self.mask.gen_masked_source_target(
            tokens, vocab=self.vocab
        )
        masked_sources.append(dec_source)
        masked_targets.append(dec_target)
    return (
        cuda.tensor(
            pad(masked_sources, pad_token=self.vocab.get_pad_index()),
            dtype=torch.long,
        ),
        cuda.tensor(
            pad(masked_targets, pad_token=self.vocab.get_pad_index()),
            dtype=torch.long,
        ),
    )
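# Illustrative sketch (assumed convention, not from this module): one common
# masked-LM layout for the (source, target) pair that gen_masked_source_target
# may produce. The ids, the mask index, and the use of the pad index at
# unmasked target positions are hypothetical; the real layout depends on
# self.mask and self.vocab.
tokens = [101, 7592, 2088, 102]
mask_idx, pad_idx = 4, 0
dec_source = [101, 7592, mask_idx, 102]  # position 2 replaced by the mask index
dec_target = [pad_idx, pad_idx, 2088, pad_idx]  # original id kept only where masked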
def get_weights_context(self, tensor_dict):
    batch_size = tensor_dict["doc_labels"].size()[0]
    return {
        "doc_weight": tensor_dict.get("doc_weight")
        or cuda_util.tensor(
            [self.default_doc_loss_weight] * batch_size, dtype=torch.float
        ),
        "word_weight": tensor_dict.get("word_weight")
        or cuda_util.tensor(
            [self.default_word_loss_weight] * batch_size, dtype=torch.float
        ),
    }
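# Illustrative sketch (hypothetical inputs, plain torch instead of cuda_util):
# when "doc_weight" is missing from tensor_dict, .get() returns None and the
# `or` fallback builds a per-example default weight vector sized from the
# doc_labels batch dimension.
import torch

default_doc_loss_weight = 1.0
tensor_dict = {"doc_labels": torch.zeros(4, dtype=torch.long)}  # batch of 4, no weights
batch_size = tensor_dict["doc_labels"].size()[0]
doc_weight = tensor_dict.get("doc_weight") or torch.tensor(
    [default_doc_loss_weight] * batch_size, dtype=torch.float
)
# doc_weight -> tensor([1., 1., 1., 1.])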
def pad_and_tensorize(batch, pad_token=0, pad_shape=None, dtype=torch.long):
    batch = list(batch)
    if not batch:
        return torch.Tensor()
    return cuda.tensor(
        pad(batch, pad_token=pad_token, pad_shape=pad_shape), dtype=dtype
    )
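# Illustrative sketch (plain torch, hypothetical data): an approximation of
# what pad() plus cuda.tensor produce for a ragged batch, assuming pad()
# right-pads every row to the longest length with pad_token.
import torch

ragged_batch = [[1, 2, 3], [4]]
max_len = max(len(row) for row in ragged_batch)
padded = torch.tensor(
    [row + [0] * (max_len - len(row)) for row in ragged_batch], dtype=torch.long
)
# padded -> tensor([[1, 2, 3],
#                   [4, 0, 0]])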
def tensorize(self, batch):
    # Pad a minibatch of dictionary features to be
    # batch_size * max_number_of_words * max_number_of_features
    # unpack the minibatch
    feats, weights, lengths = zip(*batch)
    lengths_flattened = [l for l_list in lengths for l in l_list]
    seq_lens = [len(l_list) for l_list in lengths]
    max_ex_len = max(seq_lens)
    max_feat_len = max(lengths_flattened)
    all_lengths, all_feats, all_weights = [], [], []
    for i, seq_len in enumerate(seq_lens):
        ex_feats, ex_weights, ex_lengths = [], [], []
        feats_lengths, feats_vals, feats_weights = lengths[i], feats[i], weights[i]
        max_feat_len_example = max(feats_lengths)
        r_offset = 0
        for _ in feats_lengths:
            # The dict feats obtained from the featurizer will have necessary
            # padding at the utterance level. Therefore we move the offset by
            # max feature length in the example.
            ex_feats.extend(feats_vals[r_offset : r_offset + max_feat_len_example])
            ex_feats.extend(
                [self.vocab.get_pad_index()] * (max_feat_len - max_feat_len_example)
            )
            ex_weights.extend(
                feats_weights[r_offset : r_offset + max_feat_len_example]
            )
            ex_weights.extend([0.0] * (max_feat_len - max_feat_len_example))
            r_offset += max_feat_len_example
        ex_lengths.extend(feats_lengths)
        # Pad examples
        ex_padding = (max_ex_len - seq_len) * max_feat_len
        ex_feats.extend([self.vocab.get_pad_index()] * ex_padding)
        ex_weights.extend([0.0] * ex_padding)
        ex_lengths.extend([1] * (max_ex_len - seq_len))
        all_feats.append(ex_feats)
        all_weights.append(ex_weights)
        all_lengths.append(ex_lengths)
    return (
        cuda.tensor(all_feats, torch.long),
        cuda.tensor(all_weights, torch.float),
        cuda.tensor(all_lengths, torch.long),
    )
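# Illustrative sketch (hypothetical data): the flattened layout tensorize()
# builds for dictionary features. Two examples with 3 and 2 words, and at most
# 2 features per word, become rows of length max_words * max_feats, with pad
# ids filling missing features and whole padded word slots for the shorter
# example.
import torch

pad_idx = 0
max_words, max_feats = 3, 2
ex0_feats = [11, 12, 13, pad_idx, 14, pad_idx]          # 3 words x 2 feature slots
ex1_feats = [21, pad_idx, 22, pad_idx] + [pad_idx] * 2  # 2 words + 1 padded word slot
all_feats = torch.tensor([ex0_feats, ex1_feats], dtype=torch.long)
# all_feats.shape -> torch.Size([2, 6]) == (batch_size, max_words * max_feats)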
def get_label_weights(vocab_dict: Dict[str, int], label_weights: Dict[str, float]):
    # prune the label_weights to remove the labels that do not exist in the dataset
    pruned_label_weights = {
        vocab_dict[k]: v for (k, v) in label_weights.items() if k in vocab_dict
    }
    if len(pruned_label_weights) != len(label_weights):
        filtered_labels = [k for k in label_weights if k not in vocab_dict]
        print(
            "Warning: these labels are filtered from the original label weights: "
            f"{filtered_labels}"
        )
    if len(pruned_label_weights) == 0:
        return None

    # All unspecified classes will get a weight of 1
    weights_tensor = [1] * len(vocab_dict)
    for k, v in pruned_label_weights.items():
        weights_tensor[k] = v
    return tensor(weights_tensor, dtype=torch.float)
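# Illustrative sketch (hypothetical labels, plain torch): labels absent from
# the vocabulary are dropped with a warning, and every class without an
# explicit weight keeps the default weight of 1.
import torch

vocab_dict = {"negative": 0, "neutral": 1, "positive": 2}
label_weights = {"positive": 2.0, "unknown": 5.0}  # "unknown" is not in the vocab

weights = [1.0] * len(vocab_dict)
for label, w in label_weights.items():
    if label in vocab_dict:
        weights[vocab_dict[label]] = w
expected = torch.tensor(weights, dtype=torch.float)
# expected -> tensor([1., 1., 2.])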
def report_realtime_metric(self, stage):
    if stage != Stage.TRAIN:
        return

    if cuda.DISTRIBUTED_WORLD_SIZE > 1:
        all_reduce_stats = cuda.tensor(
            [
                self.last_batch_tps,
                self.last_batch_loss,
                self.aggregate_loss,
                self.total_masked_tokens,
                self.realtime_meters["tps"].n,
            ],
            dtype=torch.float32,
        )
        total_elapsed_time = self.realtime_meters["tps"].elapsed_time

        torch.distributed.all_reduce(all_reduce_stats)
        # average last_batch_loss by distributed_world_size
        all_reduce_stats[1:2].div_(cuda.DISTRIBUTED_WORLD_SIZE)
        [
            last_batch_tps,
            last_batch_loss,
            aggregate_loss,
            total_masked_tokens,
            total_tokens,
        ] = all_reduce_stats.tolist()
        tps = total_tokens / total_elapsed_time
    else:
        last_batch_tps = self.last_batch_tps
        last_batch_loss = self.last_batch_loss
        aggregate_loss = self.aggregate_loss
        total_masked_tokens = self.total_masked_tokens
        tps = self.realtime_meters["tps"].avg

    print(
        f"Tokens/s: {last_batch_tps:.0f}, "
        f"batch ppl: {math.exp(last_batch_loss):.2f}, "
        f"agg ppl: {math.exp(self._calculate_loss(aggregate_loss, total_masked_tokens)):.2f}, "
        f"number of batches: {self.total_batches:.0f}, "
        f"accumulated tokens/s: {tps:.0f}",
        flush=True,
    )
    # TODO: remove GPU0 report
    print(
        f"GPU-0 tokens/s: {self.last_batch_tps:.0f}, "
        f"batch ppl: {math.exp(self.last_batch_loss):.2f}, "
        f"agg ppl: {math.exp(self.calculate_loss()):.2f}, "
        f"number of batches: {self.total_batches}, "
        f"accumulated tokens/s: {self.realtime_meters['tps'].avg:.0f}",
        flush=True,
    )
    if self.pep_format:
        # used for pep regression benchmark
        print(
            "PyTorchObserver "
            + json.dumps(
                {
                    "type": "MLM",
                    "metric": "tps",
                    "unit": "token/sec",
                    "value": f"{tps:.0f}",
                }
            ),
            flush=True,
        )
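# Illustrative sketch (hypothetical values): the aggregate perplexity printed
# above, assuming _calculate_loss averages the summed loss over the number of
# masked tokens it was computed on (perplexity = exp of the mean per-token loss).
import math

aggregate_loss = 5400.0      # sum of masked-LM losses accumulated so far
total_masked_tokens = 1800   # masked tokens those losses cover
agg_ppl = math.exp(aggregate_loss / total_masked_tokens)
# agg_ppl -> math.exp(3.0) ~= 20.09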
def tensorize(self, batch):
    return cuda.tensor(batch, torch.float)