import pandas as pd
import torch

from snorkel.classification import DictDataLoader, DictDataset


def create_dataloader(df: pd.DataFrame, split: str, task_name: str) -> DictDataLoader:
    dataset = DictDataset(
        name="TestData",
        split=split,
        X_dict={
            "coordinates": torch.stack(
                (torch.tensor(df["x1"]), torch.tensor(df["x2"])), dim=1
            )
        },
        Y_dict={task_name: torch.tensor(df["y"], dtype=torch.long)},
    )
    dataloader = DictDataLoader(
        dataset=dataset, batch_size=4, shuffle=(dataset.split == "train")
    )
    return dataloader
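# A minimal usage sketch for the helper above, assuming a toy DataFrame with
# "x1", "x2", and "y" columns (the column names match what the function reads;
# the values below are purely illustrative).
df = pd.DataFrame(
    {"x1": [0.0, 1.0, 2.0, 3.0], "x2": [1.0, 0.0, 1.0, 0.0], "y": [0, 1, 0, 1]}
)
train_dl = create_dataloader(df, split="train", task_name="task1")
batch_X, batch_Y = next(iter(train_dl))
# batch_X["coordinates"] has shape [4, 2]; batch_Y["task1"] has shape [4]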
def test_remapped_labels(self):
    # Test additional label keys in the Y_dict
    # Without remapping, the model should ignore them
    task_name = self.task1.name
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    Y_dict = {task_name: Y, "other_task": Y}
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict=Y_dict
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

    model = MultitaskClassifier([self.task1])
    loss_dict, count_dict = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
    self.assertIn("task1", loss_dict)

    # Predict and score without remapping: the extra labelset is ignored
    results = model.predict(dataloader)
    self.assertIn("task1", results["golds"])
    self.assertNotIn("other_task", results["golds"])
    scores = model.score([dataloader])
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertNotIn("other_task/dataset/train/accuracy", scores)

    # With remapping, the extra labelset is evaluated against task1's head
    results = model.predict(dataloader, remap_labels={"other_task": task_name})
    self.assertIn("task1", results["golds"])
    self.assertIn("other_task", results["golds"])
    results = model.score([dataloader], remap_labels={"other_task": task_name})
    self.assertIn("task1/dataset/train/accuracy", results)
    self.assertIn("other_task/dataset/train/accuracy", results)
def test_make_slice_dataloader(self):
    # Test correct construction
    dataloader = self.slice_model.make_slice_dataloader(
        dataset=self.datasets[0], S=self.S
    )
    Y_dict = dataloader.dataset.Y_dict
    self.assertEqual(len(Y_dict), 7)
    self.assertIn("test_task", Y_dict)
    self.assertIn("test_task_slice:base_pred", Y_dict)
    self.assertIn("test_task_slice:base_ind", Y_dict)
    self.assertIn("test_task_slice:f_pred", Y_dict)
    self.assertIn("test_task_slice:f_ind", Y_dict)
    self.assertIn("test_task_slice:g_pred", Y_dict)
    self.assertIn("test_task_slice:g_ind", Y_dict)

    # Test bad data input
    bad_data_dataset = DictDataset(
        name="test_data",
        split="train",
        X_dict={self.data_name: self.X},
        Y_dict={"bad_labels": self.Y},
    )
    with self.assertRaisesRegex(ValueError, "labels missing"):
        self.slice_model.make_slice_dataloader(dataset=bad_data_dataset, S=self.S)
# Get the number of classes in the dataset, for use in the task-specific head later
num_classes = len(output_label_to_int_dict)

# Define the dictionary keys for this task's data, labels, and dataset
task_data_name = f"{task_name}_data"
task_formal_name = f"{task_name}_task"
task_dataset_name = f"{task_name}Dataset"

for split, X, Y in (
    ("train", train_X, train_y),
    ("valid", dev_X, dev_y),
    ("test", test_X, test_y),
):
    X_dict = {task_data_name: torch.tensor(X, dtype=torch.long)}
    Y_dict = {task_formal_name: torch.tensor(Y, dtype=torch.long)}
    dataset = DictDataset(task_dataset_name, split, X_dict, Y_dict)
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
    dataloaders.append(dataloader)

# Define a one-layer prediction "head" module specific to each task
head_module = task_type_function_mapping[task_type]["head_module"](
    hidden_layer_size, num_classes
)
task_head_name = f"{task_name}_head_module"

# The module pool contains all the modules this task uses
module_pool = nn.ModuleDict(
    {"bert_module": bert_module, task_head_name: head_module}
)
def test_classifier_dataloader(self):
    """Unit test of DictDataLoader"""
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]
    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = DictDataset(
        name="new_data",
        split="train",
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"task1": y1, "task2": y2},
    )

    dataloader1 = DictDataLoader(dataset=dataset, batch_size=2)
    x_batch, y_batch = next(iter(dataloader1))

    # Check that the dataloader is correctly constructed
    self.assertEqual(dataloader1.dataset.split, "train")
    self.assertTrue(torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]])))
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1])))

    dataloader2 = DictDataLoader(dataset=dataset, batch_size=3)
    x_batch, y_batch = next(iter(dataloader2))

    # Check that a dataloader with a different batch size is correctly constructed
    self.assertEqual(dataloader2.dataset.split, "train")
    self.assertTrue(
        torch.equal(
            x_batch["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
        )
    )
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1, 1])))

    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]
    dataset.Y_dict["task2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check that the dataloader reflects the updated dataset
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2]])))

    x_batch, y_batch = next(iter(dataloader2))
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2], [2]])))
# `DictDataLoader` is a wrapper for `torch.utils.data.DataLoader`, which handles
# the collate function for `DictDataset` appropriately.

# %%
import torch

from snorkel.classification import DictDataLoader, DictDataset

dataloaders = []
for task_name in ["circle", "square"]:
    for split, X, Y in (
        ("train", X_train, Y_train),
        ("valid", X_valid, Y_valid),
        ("test", X_test, Y_test),
    ):
        X_dict = {f"{task_name}_data": torch.FloatTensor(X[task_name])}
        Y_dict = {f"{task_name}_task": torch.LongTensor(Y[task_name])}
        dataset = DictDataset(f"{task_name}Dataset", split, X_dict, Y_dict)
        dataloader = DictDataLoader(dataset, batch_size=32)
        dataloaders.append(dataloader)

# %% [markdown]
# We now have six dataloaders, one for each split (`train`, `valid`, `test`) of
# each task (`circle_task` and `square_task`).

# %% [markdown]
# ## Define Model

# %% [markdown]
# Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task
# classifier. We'll instantiate it from a list of `Tasks`.

# %% [markdown]
# ### Tasks
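# %% [markdown]
# As a rough sketch of what a `Task` looks like (module sizes and names here are
# illustrative assumptions, not the tutorial's exact code): a `Task` bundles a
# name, a `module_pool` of PyTorch modules, and an `op_sequence` describing how a
# batch flows through those modules. The exact `Operation` inputs format can vary
# across snorkel versions.

# %%
import torch.nn as nn

from snorkel.analysis import Scorer
from snorkel.classification import Operation, Task

# Shared trunk and a task-specific head (layer sizes are illustrative)
base_mlp = nn.Sequential(nn.Linear(2, 8), nn.ReLU(), nn.Linear(8, 8), nn.ReLU())
circle_head = nn.Linear(8, 2)

circle_task = Task(
    name="circle_task",
    module_pool=nn.ModuleDict({"base_mlp": base_mlp, "circle_head": circle_head}),
    op_sequence=[
        # Pull the "circle_data" field out of X_dict and feed it to the trunk...
        Operation(
            name="base_mlp", module_name="base_mlp", inputs=[("_input_", "circle_data")]
        ),
        # ...then pass the trunk's output through the task-specific head
        Operation(name="circle_head", module_name="circle_head", inputs=["base_mlp"]),
    ],
    scorer=Scorer(metrics=["accuracy"]),
)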
import random
import unittest

import numpy as np
import torch

from snorkel.classification import DictDataLoader, DictDataset
from snorkel.classification.training.schedulers import (
    SequentialScheduler,
    ShuffledScheduler,
)

dataset1 = DictDataset(
    "d1",
    "train",
    X_dict={"data": [0, 1, 2, 3, 4]},
    Y_dict={"labels": torch.LongTensor([1, 1, 1, 1, 1])},
)
dataset2 = DictDataset(
    "d2",
    "train",
    X_dict={"data": [5, 6, 7, 8, 9]},
    Y_dict={"labels": torch.LongTensor([2, 2, 2, 2, 2])},
)

dataloader1 = DictDataLoader(dataset1, batch_size=2)
dataloader2 = DictDataLoader(dataset2, batch_size=2)
dataloaders = [dataloader1, dataloader2]


class SequentialTest(unittest.TestCase):
from snorkel.classification import DictDataset


def create_dataset(X, Y, split, dataset_name, input_name, task_name):
    return DictDataset(
        name=dataset_name,
        split=split,
        X_dict={input_name: X},
        Y_dict={task_name: Y},
    )
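# A minimal usage sketch for the helper above; the tensors and names below are
# hypothetical (any tensors and key names would do).
import torch

X = torch.rand(5, 2)  # five 2-d points
Y = torch.randint(0, 2, (5,))  # binary labels
dataset = create_dataset(
    X, Y, split="train", dataset_name="toy_dataset", input_name="data", task_name="task1"
)
assert set(dataset.X_dict) == {"data"} and set(dataset.Y_dict) == {"task1"}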