def test_distributed_covariance(self):
    """Covariance aggregated over two workers should match the single-process value."""
    batch_size = 10
    num_labels = 10
    predictions = torch.randn(batch_size, num_labels)
    # Correlate labels with predictions so the covariance is non-trivial.
    labels = 0.5 * predictions + torch.randn(batch_size, num_labels)
    # Random binary mask selecting which entries contribute.
    mask = torch.randint(0, 2, size=(batch_size, num_labels)).bool()
    expected_covariance = np.cov(
        predictions.view(-1).cpu().numpy(),
        labels.view(-1).cpu().numpy(),
        fweights=mask.view(-1).cpu().numpy(),
    )[0, 1]
    # Split every input in half: one shard per simulated worker.
    predictions = [predictions[:5], predictions[5:]]
    labels = [labels[:5], labels[5:]]
    mask = [mask[:5], mask[5:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": labels, "mask": mask}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        Covariance(),
        metric_kwargs,
        expected_covariance,
        exact=(0.0001, 1e-01),
    )
def test_distributed_metric_values(self):
    """ConllCorefScores aggregated across two workers yields the expected P/R/F1."""
    top_spans = torch.tensor([[[0, 1], [4, 6], [8, 9]]])
    antecedent_indices = torch.tensor([[[-1, -1, -1], [0, -1, -1], [0, 1, -1]]])
    predicted_antecedents = torch.tensor([[-1, -1, 1]])
    # Each worker sees the same predictions but a different gold clustering.
    metadata_list = [
        [{"clusters": [((4, 6), (8, 9))]}],
        [{"clusters": [((0, 1), (4, 6))]}],
    ]
    metric_kwargs = {
        "top_spans": [top_spans, top_spans],
        "antecedent_indices": [antecedent_indices, antecedent_indices],
        "predicted_antecedents": [predicted_antecedents, predicted_antecedents],
        "metadata_list": metadata_list,
    }
    # (precision, recall, f1) expected after aggregation.
    desired_values = (0.625, 0.625, 0.625)
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        ConllCorefScores(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_auc_unequal_batches(self):
    """Auc is expected to fail when workers receive different-sized batches."""
    predictions = torch.randn(8)
    labels = torch.randint(3, 5, (8,), dtype=torch.long)
    # We make sure that the positive label is always present.
    labels[0] = 4
    labels[4] = 4
    false_positive_rates, true_positive_rates, _ = metrics.roc_curve(
        labels.cpu().numpy(), predictions.cpu().numpy(), pos_label=4
    )
    # Deliberately unequal shards: 2 items for one worker, 6 for the other.
    predictions = [predictions[:2], predictions[2:]]
    labels = [labels[:2], labels[2:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": labels}
    desired_auc = metrics.auc(false_positive_rates, true_positive_rates)
    # The distributed run should raise because of the batch-size mismatch.
    with pytest.raises(Exception) as _:
        run_distributed_test(
            [-1, -1],
            global_distributed_metric,
            Auc(positive_label=4),
            metric_kwargs,
            desired_auc,
            exact=False,
        )
def test_multiple_distributed_runs(self):
    """BLEU accumulated over repeated distributed runs matches the analytic value."""
    predictions = [
        torch.tensor([[1, 0, 0], [1, 1, 0]]),
        torch.tensor([[1, 1, 1]]),
    ]
    gold_targets = [
        torch.tensor([[2, 0, 0], [1, 0, 0]]),
        torch.tensor([[1, 1, 2]]),
    ]
    # Weighted geometric mean of the modified 1-gram and 2-gram precisions.
    check = math.exp(
        0.5 * (math.log(3) - math.log(6)) + 0.5 * (math.log(1) - math.log(3))
    )
    metric_kwargs = {"predictions": predictions, "gold_targets": gold_targets}
    desired_values = {"BLEU": check}
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        BLEU(ngram_weights=(0.5, 0.5), exclude_indices={0}),
        metric_kwargs,
        desired_values,
        exact=False,
    )
def test_distributed_accuracy(self):
    """UnigramRecall aggregated over two workers matches the hand-computed value."""
    gold = torch.tensor([[2, 4, 8], [1, 2, 3], [7, 1, 1], [11, 14, 17]])
    # Two candidate sequences per instance; per-row recall noted alongside.
    predictions = torch.tensor(
        [
            [[2, 4, 8], [2, 5, 9]],  # 3/3
            [[-1, 2, 4], [3, 8, -1]],  # 2/2
            [[-1, -1, -1], [7, 2, -1]],  # 1/2
            [[12, 13, 17], [11, 13, 18]],  # 2/2
        ]
    )
    mask = torch.tensor(
        [
            [True, True, True],
            [False, True, True],
            [True, True, False],
            [True, False, True],
        ]
    )
    # First two rows go to worker 0, the rest to worker 1.
    gold = [gold[:2], gold[2:]]
    predictions = [predictions[:2], predictions[2:]]
    mask = [mask[:2], mask[2:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": gold, "mask": mask}
    desired_values = {"unigram_recall": 7 / 8}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        UnigramRecall(),
        metric_kwargs,
        desired_values,
        exact=False,
    )
def test_multiple_distributed_runs(self):
    """FBetaMultiLabelMeasure over repeated distributed runs matches the fixtures."""
    predictions = [
        torch.tensor(
            [
                [0.55, 0.25, 0.10, 0.10, 0.20],
                [0.10, 0.60, 0.10, 0.95, 0.00],
                [0.90, 0.80, 0.75, 0.80, 0.00],
            ]
        ),
        torch.tensor(
            [
                [0.49, 0.50, 0.95, 0.55, 0.00],
                [0.60, 0.49, 0.60, 0.65, 0.85],
                [0.85, 0.40, 0.10, 0.20, 0.00],
            ]
        ),
    ]
    targets = [
        torch.tensor([[1, 1, 0, 0, 0], [0, 1, 0, 1, 0], [1, 1, 0, 1, 0]]),
        torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 0], [0, 0, 0, 0, 0]]),
    ]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    # Expected per-class values come from the fixtures on this test class.
    desired_metrics = {
        "precision": self.desired_precisions,
        "recall": self.desired_recalls,
        "fscore": self.desired_fscores,
    }
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        FBetaMultiLabelMeasure(),
        metric_kwargs,
        desired_metrics,
        exact=False,
    )
def test_multiple_distributed_runs(self):
    """AttachmentScores over repeated distributed runs yields exact UAS/LAS/UEM/LEM."""
    # Head predictions match gold exactly, so UAS and UEM are perfect.
    predictions = [torch.Tensor([[0, 1, 3, 5, 2, 4]]), torch.Tensor([[0, 3, 2, 1, 0, 0]])]
    gold_indices = [torch.Tensor([[0, 1, 3, 5, 2, 4]]), torch.Tensor([[0, 3, 2, 1, 0, 0]])]
    # Label predictions only partially match gold, so LAS < 1 and LEM = 0.
    label_predictions = [
        torch.Tensor([[0, 5, 2, 3, 3, 3]]),
        torch.Tensor([[7, 4, 8, 2, 0, 0]]),
    ]
    gold_labels = [torch.Tensor([[0, 5, 2, 1, 4, 2]]), torch.Tensor([[0, 4, 8, 2, 0, 0]])]
    mask = [
        torch.tensor([[True, True, True, True, True, True]]),
        torch.tensor([[True, True, True, True, False, False]]),
    ]
    metric_kwargs = {
        "predicted_indices": predictions,
        "gold_indices": gold_indices,
        "predicted_labels": label_predictions,
        "gold_labels": gold_labels,
        "mask": mask,
    }
    desired_metrics = {
        "UAS": 1.0,
        "LAS": 0.6,
        "UEM": 1.0,
        "LEM": 0.0,
    }
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        AttachmentScores(),
        metric_kwargs,
        desired_metrics,
        exact=True,
    )
def test_distributed_sequence_accuracy(self):
    """SequenceAccuracy aggregated over two workers matches the expected 3/4."""
    gold = torch.tensor([[1, 2, 3], [2, 4, 8], [0, 1, 1], [11, 13, 17]])
    # Two candidate sequences per instance; a row scores if any candidate
    # matches gold under the mask.
    predictions = torch.tensor(
        [
            [[1, 2, 3], [1, 2, -1]],
            [[2, 4, 8], [2, 5, 9]],
            [[-1, -1, -1], [0, 1, -1]],
            [[12, 13, 17], [11, 13, 18]],
        ]
    )
    mask = torch.tensor(
        [
            [False, True, True],
            [True, True, True],
            [True, True, False],
            [True, False, True],
        ],
    )
    # Shard the first two rows to worker 0 and the rest to worker 1.
    gold = [gold[:2], gold[2:]]
    predictions = [predictions[:2], predictions[2:]]
    mask = [mask[:2], mask[2:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": gold, "mask": mask}
    desired_values = {"accuracy": 3 / 4}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        SequenceAccuracy(),
        metric_kwargs,
        desired_values,
        exact=False,
    )
def test_distributed_npmixy_masked_computation(self):
    """Masked npmixy gaps (one-vs-all and pairwise) aggregate correctly over workers."""
    Y = torch.ones(3, 3).long()
    X = torch.eye(3).long()
    # Fully-on mask: every prediction participates.
    mask = torch.ones_like(Y).bool()
    metric_kwargs = {"predicted_labels": Y, "protected_variable_labels": X, "mask": mask}
    # One-vs-all gaps per protected-variable value.
    expected_ova_npmixy_gaps = {
        0: [np.nan, 0.0],
        1: [np.nan, 0.0],
    }
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        AssociationWithoutGroundTruth(2, 2, "npmixy", "ova"),
        metric_kwargs,
        expected_ova_npmixy_gaps,
        exact=True,
    )
    # Pairwise gaps between every pair of protected-variable values.
    expected_pairwise_npmixy_gaps = {
        0: {0: [np.nan, 0.0], 1: [np.nan, 0.0]},
        1: {0: [np.nan, 0.0], 1: [np.nan, 0.0]},
    }
    metric_kwargs = {"predicted_labels": Y, "protected_variable_labels": X, "mask": mask}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        AssociationWithoutGroundTruth(2, 2, "npmixy", "pairwise"),
        metric_kwargs,
        expected_pairwise_npmixy_gaps,
        exact=True,
    )
def test_multiple_distributed_runs(self):
    """BooleanAccuracy over repeated distributed runs is exactly 0.5."""
    # One of the two rows in each shard differs from gold.
    predictions = [torch.tensor([[0, 1], [2, 3]]), torch.tensor([[4, 5], [6, 7]])]
    targets = [torch.tensor([[0, 1], [2, 2]]), torch.tensor([[4, 5], [7, 7]])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    desired_values = 0.5
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        BooleanAccuracy(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_entropy(self):
    """Entropy of a uniform 4-way distribution (ln 4) aggregates across workers."""
    logits = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 1]], dtype=torch.float)
    # One row of uniform logits per worker.
    logits = [logits[0], logits[1]]
    metric_kwargs = {"logits": logits}
    desired_values = {"entropy": 1.38629436}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        Entropy(),
        metric_kwargs,
        desired_values,
        exact=False,
    )
def test_distributed_average(self, device: str):
    """Average of 1.0 and 2.0 across two workers is exactly 1.5 on CPU or GPU."""
    # -1 means CPU-backed gloo; otherwise use the two GPU device ids.
    device_ids = [-1, -1] if device == "cpu" else [0, 1]
    metric_kwargs = {
        "value": [1.0, 2.0],
    }
    run_distributed_test(
        device_ids,
        global_distributed_metric,
        self.metric,
        metric_kwargs,
        1.5,
        exact=True,
    )
def test_distributed_accuracy_unequal_batches(self):
    """BooleanAccuracy handles workers with different batch sizes (3 vs. 1)."""
    predictions = [torch.tensor([[0, 1], [2, 3], [4, 5]]), torch.tensor([[6, 7]])]
    targets = [torch.tensor([[0, 1], [2, 2], [4, 5]]), torch.tensor([[7, 7]])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    # 2 of the 4 total rows match their gold rows exactly.
    desired_values = 0.5
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        BooleanAccuracy(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_independence_masked_computation(self):
    """Masked Independence KL divergences aggregate correctly over two workers."""
    A = torch.eye(3).long()
    # Predicted labels are twice the protected labels, giving values in {0, 2}.
    C = 2 * A
    # Fully-on mask: every entry participates.
    mask = torch.ones_like(C).bool()
    expected_kl_divs = {0: 0.4055, 1: 1.0986}
    metric_kwargs = {"predicted_labels": C, "protected_variable_labels": A, "mask": mask}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        Independence(4, 2),
        metric_kwargs,
        expected_kl_divs,
        exact=False,
    )
def test_multiple_distributed_runs(self):
    """CategoricalAccuracy over repeated distributed runs is exactly 0.5."""
    # Worker 0's argmax (0) matches its target; worker 1's argmax (1) does not.
    predictions = [
        torch.tensor([[0.35, 0.25, 0.1, 0.1, 0.2]]),
        torch.tensor([[0.1, 0.6, 0.1, 0.2, 0.0]]),
    ]
    targets = [torch.tensor([0]), torch.tensor([3])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    desired_accuracy = 0.5
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        CategoricalAccuracy(),
        metric_kwargs,
        desired_accuracy,
        exact=True,
    )
def test_distributed_spearman(self):
    """SpearmanCorrelation aggregated over two workers matches the reference formula."""
    batch_size = 10
    num_labels = 10
    predictions = torch.randn(batch_size, num_labels)
    # Correlate labels with predictions so the rank correlation is non-trivial.
    labels = 0.5 * predictions + torch.randn(batch_size, num_labels)
    desired_spearman = spearman_formula(predictions.reshape(-1), labels.reshape(-1))
    # Split the batch in half: one shard per simulated worker.
    predictions = [predictions[:5], predictions[5:]]
    labels = [labels[:5], labels[5:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": labels}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        SpearmanCorrelation(),
        metric_kwargs,
        desired_spearman,
        exact=False,
    )
def test_distributed_accuracy_unequal_batches(self):
    """Top-2 CategoricalAccuracy with a mask handles unequal batches (2 vs. 1)."""
    predictions = [
        torch.tensor([[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0]]),
        torch.tensor([[0.1, 0.2, 0.5, 0.2, 0.0]]),
    ]
    targets = [torch.tensor([0, 3]), torch.tensor([0])]
    # The first instance is masked out, leaving two scored instances.
    mask = [torch.tensor([False, True]), torch.tensor([True])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets, "mask": mask}
    desired_accuracy = 0.5
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        CategoricalAccuracy(top_k=2),
        metric_kwargs,
        desired_accuracy,
        exact=False,
    )
def test_distributed_nli(self):
    """NaturalLanguageInference scores aggregate correctly across two workers."""
    # Diagonal probabilities: exactly one entry per row is 0.6, the rest 0.
    nli_probabilities = 0.6 * torch.eye(3)
    expected_scores = {
        "net_neutral": 0.6 / 3,
        "fraction_neutral": 1 / 3,
        "threshold_0.5": 1 / 3,
        "threshold_0.7": 0.0,
    }
    # Each worker gets an identical copy of the probabilities.
    metric_kwargs = {"nli_probabilities": [nli_probabilities, nli_probabilities]}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        NaturalLanguageInference(0),
        metric_kwargs,
        expected_scores,
        exact=False,
    )
def test_multiple_distributed_runs(self):
    """MeanAbsoluteError over repeated distributed runs is exactly 21/12."""
    predictions = [
        torch.tensor([[1.0, 1.5, 1.0], [2.0, 3.0, 3.5]]),
        torch.tensor([[4.0, 5.0, 5.5], [6.0, 7.0, 7.5]]),
    ]
    targets = [
        torch.tensor([[0.0, 1.0, 0.0], [2.0, 2.0, 0.0]]),
        torch.tensor([[4.0, 5.0, 0.0], [7.0, 7.0, 0.0]]),
    ]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    # Total absolute error 21.0 over 12 elements.
    desired_values = {"mae": 21.0 / 12.0}
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        MeanAbsoluteError(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_multiple_distributed_runs(self):
    """FBetaMeasure over repeated distributed runs matches the class fixtures."""
    predictions = [
        torch.tensor(
            [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]]
        ),
        torch.tensor(
            [[0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]]
        ),
    ]
    targets = [torch.tensor([0, 4, 1]), torch.tensor([0, 3, 0])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    # Expected per-class values come from the fixtures on this test class.
    desired_metrics = {
        "precision": self.desired_precisions,
        "recall": self.desired_recalls,
        "fscore": self.desired_fscores,
    }
    run_distributed_test(
        [-1, -1],
        multiple_runs,
        FBetaMeasure(),
        metric_kwargs,
        desired_metrics,
        exact=False,
    )
def test_distributed_setting_throws_an_error(self):
    """SrlEvalScorer must refuse to run in a distributed setting."""
    from allennlp_models.structured_prediction.models.srl import (
        convert_bio_tags_to_conll_format,
    )

    batch_verb_indices = [2]
    batch_sentences = [["The", "cat", "loves", "hats", "."]]
    batch_bio_predicted_tags = [["B-ARG0", "B-ARG1", "B-V", "B-ARG1", "O"]]
    batch_conll_predicted_tags = [
        convert_bio_tags_to_conll_format(tags) for tags in batch_bio_predicted_tags
    ]
    batch_bio_gold_tags = [["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "O"]]
    batch_conll_gold_tags = [
        convert_bio_tags_to_conll_format(tags) for tags in batch_bio_gold_tags
    ]
    # Identical inputs for both workers; the failure is about distribution itself.
    metric_kwargs = {
        "batch_verb_indices": [batch_verb_indices, batch_verb_indices],
        "batch_sentences": [batch_sentences, batch_sentences],
        "batch_conll_formatted_predicted_tags": [
            batch_conll_predicted_tags,
            batch_conll_predicted_tags,
        ],
        "batch_conll_formatted_gold_tags": [batch_conll_gold_tags, batch_conll_gold_tags],
    }
    desired_values = {}  # it does not matter, we expect the run to fail.
    with pytest.raises(Exception) as exc:
        run_distributed_test(
            [-1, -1],
            global_distributed_metric,
            SrlEvalScorer(ignore_classes=["V"]),
            metric_kwargs,
            desired_values,
            exact=True,
        )
    assert (
        "RuntimeError: Distributed aggregation for `SrlEvalScorer` is currently not supported."
        in str(exc.value)
    )
def test_distributed_squad_em_and_f1(self):
    """SquadEmAndF1 (exact match, F1) aggregates correctly across two workers."""
    # One predicted span per worker.
    best_span_string = ["this is the best span", "this is another span"]
    answer_strings = [
        ["this is a good span", "something irrelevant"],
        ["this is another span", "this one is less perfect"],
    ]
    metric_kwargs = {"best_span_string": best_span_string, "answer_strings": answer_strings}
    # One exact match out of two; F1 averages 0.75 and 1.0.
    desired_values = (1 / 2, 1.75 / 2)
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        SquadEmAndF1(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_accuracy(self):
    """VqaMeasure with label weights aggregates correctly across two workers."""
    logits = [
        torch.tensor([[0.35, 0.25, 0.1, 0.1, 0.2]]),
        torch.tensor([[0.1, 0.6, 0.1, 0.2, 0.0]]),
    ]
    labels = [torch.tensor([[0]]), torch.tensor([[3]])]
    # Only worker 0's answer matches, with weight 1/3.
    label_weights = [torch.tensor([[1 / 3]]), torch.tensor([[2 / 3]])]
    metric_kwargs = {"logits": logits, "labels": labels, "label_weights": label_weights}
    desired_accuracy = {"score": (1 / 3) / 2}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        VqaMeasure(),
        metric_kwargs,
        desired_accuracy,
        exact=False,
    )
def test_distributed_sufficiency_masked_computation(self):
    """Masked Sufficiency KL divergences aggregate correctly over two workers."""
    C = torch.zeros(3, 3).long()
    Y = torch.eye(3).long()
    # Protected variable coincides with the gold labels.
    A = Y
    # Fully-on mask: every entry participates.
    mask = torch.ones_like(C).bool()
    # NaN entries mark (gold, protected) combinations with no support.
    expected_kl_divs = {0: {0: 0.4055, 1: 1.0986}, 1: {0: np.nan, 1: np.nan}}
    metric_kwargs = {
        "predicted_labels": C,
        "gold_labels": Y,
        "protected_variable_labels": A,
        "mask": mask,
    }
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        Sufficiency(2, 2),
        metric_kwargs,
        expected_kl_divs,
        exact=False,
    )
def test_distributed_pearson(self):
    """PearsonCorrelation aggregated over two workers matches the reference value."""
    batch_size = 10
    num_labels = 10
    predictions = torch.randn(batch_size, num_labels)
    # Correlate labels with predictions so the correlation is non-trivial.
    labels = 0.5 * predictions + torch.randn(batch_size, num_labels)
    expected_pearson_correlation = pearson_corrcoef(
        predictions.view(-1).cpu().numpy(),
        labels.view(-1).cpu().numpy(),
    )
    # Split the batch in half: one shard per simulated worker.
    predictions = [predictions[:5], predictions[5:]]
    labels = [labels[:5], labels[5:]]
    metric_kwargs = {"predictions": predictions, "gold_labels": labels}
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        PearsonCorrelation(),
        metric_kwargs,
        expected_pearson_correlation,
        exact=(0.0001, 1e-01),
    )
def test_distributed_fbeta_measure(self):
    """F1Measure for the positive label aggregates correctly over two workers."""
    predictions = [
        torch.tensor(
            [[0.35, 0.25, 0.1, 0.1, 0.2], [0.1, 0.6, 0.1, 0.2, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]]
        ),
        torch.tensor(
            [[0.1, 0.5, 0.1, 0.2, 0.0], [0.1, 0.2, 0.1, 0.7, 0.0], [0.1, 0.6, 0.1, 0.2, 0.0]]
        ),
    ]
    targets = [torch.tensor([0, 4, 1]), torch.tensor([0, 3, 0])]
    metric_kwargs = {"predictions": predictions, "gold_labels": targets}
    # Class 0: one true positive, no false positives, two false negatives.
    desired_metrics = {
        "precision": 1.0,
        "recall": 0.333333333,
        "f1": 0.499999999,
    }
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        F1Measure(positive_label=0),
        metric_kwargs,
        desired_metrics,
        exact=False,
    )
def test_distributed_loading_and_training(self, mixed_precision, flatten_parameters):
    """Train with FSDP on two GPUs, then verify the consolidated sharded state
    matches the original unsharded state."""
    run_distributed_test(
        [0, 1],
        func=_dist_load_and_train,
        test_dir=self.TEST_DIR,
        mixed_precision=mixed_precision,
        flatten_parameters=flatten_parameters,
    )
    # Now make sure the sharded saved state is exactly the same as the original
    # state when consolidated.
    original_state = torch.load(self.TEST_DIR / "state.pt", map_location="cpu")
    consolidated_state = FairScaleFsdpWrappedModel.consolidate_sharded_state(
        [
            self.TEST_DIR / "state_worker0.pt",
            self.TEST_DIR / "state_worker1.pt",
        ]
    )
    assert set(original_state.keys()) - set(consolidated_state.keys()) == {
        "decoder.linear.weight"  # won't be in the state dict since param is tied to embedding.weight
    }
    for key, tensor0 in original_state.items():
        if key not in consolidated_state:
            continue
        # Need to give extra tolerance for buffers when `mixed_precision` is `True`.
        tolerance = None if not mixed_precision or "buffer" not in key else 1e-3
        tensor1 = consolidated_state[key]
        assert_allclose(
            tensor0,
            tensor1,
            msg=f"{key} is off in consolidated state.\nExpected:\n{tensor0}\nGot:\n{tensor1}",
            atol=tolerance,
            rtol=tolerance,
        )
def test_distributed_evalb(self):
    """EvalbBracketingScorer aggregates bracketing scores across two workers."""
    # tree1 differs from the gold tree in one constituent; tree2 is the gold tree.
    tree1 = Tree.fromstring(
        "(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))"
    )
    tree2 = Tree.fromstring(
        "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
    )
    # Worker 0 predicts the imperfect tree; worker 1 predicts gold exactly.
    predicted_trees = [[tree1], [tree2]]
    gold_trees = [[tree2], [tree2]]
    metric_kwargs = {"predicted_trees": predicted_trees, "gold_trees": gold_trees}
    desired_values = {
        "evalb_recall": 0.875,
        "evalb_precision": 0.875,
        "evalb_f1_measure": 0.875,
    }
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        EvalbBracketingScorer(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_mention_recall(self):
    """MentionRecall aggregated over two workers is exactly 0.75."""
    batched_top_spans = [
        torch.tensor([[[2, 4], [1, 3]]]),
        torch.tensor([[[5, 6], [7, 8]]]),
    ]
    # Worker 0 recovers 1 of its 2 gold mentions; worker 1 recovers both.
    batched_metadata = [
        [{"clusters": [[(2, 4), (3, 5)]]}],
        [{"clusters": [[(5, 6), (7, 8)]]}],
    ]
    metric_kwargs = {
        "batched_top_spans": batched_top_spans,
        "batched_metadata": batched_metadata,
    }
    desired_values = 0.75
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        MentionRecall(),
        metric_kwargs,
        desired_values,
        exact=True,
    )
def test_distributed_drop_em_and_f1(self):
    """DropEmAndF1 (exact match, F1) aggregates correctly across two workers."""
    prediction = ["this is the best span", "this is another span"]
    # Each worker's ground truths are a list of answer-annotation dicts.
    ground_truths = [
        [{"spans": ["this is a good span", "something irrelevant"]}],
        [{"spans": ["this is another span"]}],
    ]
    metric_kwargs = {"prediction": prediction, "ground_truths": ground_truths}
    # One exact match out of two; F1 averages 0.38 and 1.0.
    desired_values = (1 / 2, 1.38 / 2)
    run_distributed_test(
        [-1, -1],
        global_distributed_metric,
        DropEmAndF1(),
        metric_kwargs,
        desired_values,
        exact=True,
    )