def test_classifier_dataset(self):
    """Unit test of DictDataset"""
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])

    dataset = DictDataset(
        X_dict={"data1": x1}, Y_dict={"task1": y1}, name="new_data", split="train"
    )

    # Check that the dataset is correctly constructed
    self.assertTrue(torch.equal(dataset[0][0]["data1"], x1[0]))
    self.assertTrue(torch.equal(dataset[0][1]["task1"], y1[0]))
    self.assertEqual(
        repr(dataset),
        "DictDataset(name=new_data, X_keys=['data1'], Y_keys=['task1'])",
    )

    # Test that from_tensors initializes with default values
    dataset = DictDataset.from_tensors(x1, y1, "train")
    self.assertEqual(
        repr(dataset),
        f"DictDataset(name={DEFAULT_DATASET_NAME}, "
        f"X_keys=['{DEFAULT_INPUT_DATA_KEY}'], Y_keys=['{DEFAULT_TASK_NAME}'])",
    )
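# The DEFAULT_* constants referenced above live in Snorkel's data module; the
# test presumably imports them along these lines (module path assumed from
# snorkel v0.9):
from snorkel.classification.data import (
    DEFAULT_DATASET_NAME,
    DEFAULT_INPUT_DATA_KEY,
    DEFAULT_TASK_NAME,
)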
def create_dataloader(task_name="task", split="train"):
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    dataset = DictDataset(
        name="dataset", split=split, X_dict={"data": X}, Y_dict={task_name: Y}
    )

    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
    return dataloader
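# A hypothetical use of the helper above with Snorkel's Trainer (the `model`
# here is assumed to be a MultitaskClassifier constructed elsewhere):
from snorkel.classification import Trainer

train_dl = create_dataloader(task_name="task1", split="train")
trainer = Trainer(n_epochs=1, progress_bar=False)
trainer.fit(model, [train_dl])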
def test_score_shuffled(self):
    # Test that scoring is invariant to dataset shuffling
    set_seed(123)

    class SimpleVoter(nn.Module):
        def forward(self, x):
            """Vote for class 0 if x is even, class 1 otherwise."""
            mask = x % 2 == 0
            out = torch.zeros(x.shape[0], 2)
            out[mask, 0] = 1  # class 0
            out[~mask, 1] = 1  # class 1
            return out

    # Create model
    task_name = "VotingTask"
    module_name = "simple_voter"
    module_pool = nn.ModuleDict({module_name: SimpleVoter()})
    op0 = Operation(
        module_name=module_name, inputs=[("_input_", "data")], name="op0"
    )
    op_sequence = [op0]
    task = Task(name=task_name, module_pool=module_pool, op_sequence=op_sequence)
    model = MultitaskClassifier([task])

    # Create dataset
    y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    x_list = [i for i in range(len(y_list))]
    Y = torch.LongTensor(y_list * 100)
    X = torch.FloatTensor(x_list * 100)
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict={task_name: Y}
    )

    # Create dataloaders
    dataloader = DictDataLoader(dataset, batch_size=2, shuffle=False)
    scores = model.score([dataloader])
    self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

    dataloader_shuffled = DictDataLoader(dataset, batch_size=2, shuffle=True)
    scores_shuffled = model.score([dataloader_shuffled])
    self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"], 0.6)
def test_add_slice_labels(self):
    # Create dummy data
    # Given slicing function f(), we expect the first two entries to be active
    x = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.5])
    y = torch.Tensor([0, 1, 1, 0, 1]).long()
    dataset = DictDataset(
        name="TestData", split="train", X_dict={"data": x}, Y_dict={"TestTask": y}
    )

    # Ensure that we start with 1 labelset
    self.assertEqual(len(dataset.Y_dict), 1)

    # Apply SFs with PandasSFApplier
    df = pd.DataFrame({"val": x, "y": y})
    slicing_functions = [f]
    applier = PandasSFApplier(slicing_functions)
    S = applier.apply(df, progress_bar=False)

    dataloader = DictDataLoader(dataset)
    dummy_task = create_dummy_task(task_name="TestTask")
    add_slice_labels(dataloader, dummy_task, S)

    # Ensure that all the fields are present
    labelsets = dataloader.dataset.Y_dict
    self.assertIn("TestTask", labelsets)
    self.assertIn("TestTask_slice:base_ind", labelsets)
    self.assertIn("TestTask_slice:base_pred", labelsets)
    self.assertIn("TestTask_slice:f_ind", labelsets)
    self.assertIn("TestTask_slice:f_pred", labelsets)
    self.assertEqual(len(labelsets), 5)

    # Ensure "ind" contains the slice membership mask
    self.assertEqual(
        labelsets["TestTask_slice:f_ind"].numpy().tolist(), [1, 1, 0, 0, 0]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_ind"].numpy().tolist(), [1, 1, 1, 1, 1]
    )

    # Ensure "pred" masks out labels for examples outside the slice
    self.assertEqual(
        labelsets["TestTask_slice:f_pred"].numpy().tolist(), [0, 1, -1, -1, -1]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_pred"].numpy().tolist(), [0, 1, 1, 0, 1]
    )
    self.assertEqual(labelsets["TestTask"].numpy().tolist(), [0, 1, 1, 0, 1])
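# For reference, `f` is defined elsewhere in the test module. A minimal sketch
# consistent with the expected `f_ind` mask above (active exactly for the
# first two rows) might look like:
from snorkel.slicing import slicing_function


@slicing_function()
def f(x):
    # Active only for small values of "val"; with
    # val = [0.1, 0.2, 0.3, 0.4, 0.5] this selects the first two rows.
    return x.val < 0.25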
def create_dataloader(df: pd.DataFrame, split: str) -> DictDataLoader:
    dataset = DictDataset(
        name="TestData",
        split=split,
        X_dict={
            "coordinates": torch.stack(
                (torch.tensor(df["x1"]), torch.tensor(df["x2"])), dim=1
            )
        },
        Y_dict={"task": torch.tensor(df["y"], dtype=torch.long)},
    )

    dataloader = DictDataLoader(
        dataset=dataset, batch_size=4, shuffle=(dataset.split == "train")
    )
    return dataloader
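# A hypothetical call to the helper above, using a toy DataFrame with the
# expected "x1", "x2", and "y" columns:
df = pd.DataFrame(
    {"x1": [0.0, 1.0, 2.0, 3.0], "x2": [3.0, 2.0, 1.0, 0.0], "y": [0, 1, 0, 1]}
)
train_dl = create_dataloader(df, "train")  # shuffled, since split == "train"
valid_dl = create_dataloader(df, "valid")  # deterministic order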
def test_remapped_labels(self):
    # Test additional label keys in the Y_dict
    # Without remapping, the model should ignore them
    task_name = self.task1.name
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    Y_dict = {task_name: Y, "other_task": Y}
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict=Y_dict
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

    model = MultitaskClassifier([self.task1])
    loss_dict, count_dict = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
    self.assertIn("task1", loss_dict)

    # Test prediction and scoring without remapping
    results = model.predict(dataloader)
    self.assertIn("task1", results["golds"])
    self.assertNotIn("other_task", results["golds"])
    scores = model.score([dataloader])
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertNotIn("other_task/dataset/train/accuracy", scores)

    # Test remapped labelsets
    results = model.predict(dataloader, remap_labels={"other_task": task_name})
    self.assertIn("task1", results["golds"])
    self.assertIn("other_task", results["golds"])

    results = model.score([dataloader], remap_labels={"other_task": task_name})
    self.assertIn("task1/dataset/train/accuracy", results)
    self.assertIn("other_task/dataset/train/accuracy", results)
def test_make_slice_dataloader(self):
    # Test correct construction
    dataloader = self.slice_model.make_slice_dataloader(
        dataset=self.datasets[0], S=self.S
    )
    Y_dict = dataloader.dataset.Y_dict
    self.assertEqual(len(Y_dict), 7)
    self.assertIn("test_task", Y_dict)
    self.assertIn("test_task_slice:base_pred", Y_dict)
    self.assertIn("test_task_slice:base_ind", Y_dict)
    self.assertIn("test_task_slice:f_pred", Y_dict)
    self.assertIn("test_task_slice:f_ind", Y_dict)
    self.assertIn("test_task_slice:g_pred", Y_dict)
    self.assertIn("test_task_slice:g_ind", Y_dict)

    # Test bad data input
    bad_data_dataset = DictDataset(
        name="test_data",
        split="train",
        X_dict={self.data_name: self.X},
        Y_dict={"bad_labels": self.Y},
    )
    with self.assertRaisesRegex(ValueError, "labels missing"):
        self.slice_model.make_slice_dataloader(dataset=bad_data_dataset, S=self.S)
# Get the number of classes in the dataset, used to size the task-specific head later
num_classes = len(output_label_to_int_dict.keys())

# Define dictionary keys for the data, dataset, and task of the given task
task_data_name = f"{task_name}_data"
task_formal_name = f"{task_name}_task"
task_dataset_name = f"{task_name}Dataset"

for split, X, Y in (
    ("train", train_X, train_y),
    ("valid", dev_X, dev_y),
    ("test", test_X, test_y),
):
    X_dict = {task_data_name: torch.tensor(X, dtype=torch.long)}
    Y_dict = {task_formal_name: torch.tensor(Y, dtype=torch.long)}
    dataset = DictDataset(task_dataset_name, split, X_dict, Y_dict)
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
    dataloaders.append(dataloader)

# Define a one-layer prediction "head" module specific to each task
head_module = task_type_function_mapping[task_type]["head_module"](
    hidden_layer_size, num_classes
)
task_head_name = f"{task_name}_head_module"

# The module pool contains all the modules this task uses
module_pool = nn.ModuleDict(
    {"bert_module": bert_module, task_head_name: head_module}
)
def test_classifier_dataloader(self):
    """Unit test of DictDataLoader"""
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]
    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = DictDataset(
        name="new_data",
        split="train",
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"task1": y1, "task2": y2},
    )

    dataloader1 = DictDataLoader(dataset=dataset, batch_size=2)
    x_batch, y_batch = next(iter(dataloader1))

    # Check that the dataloader is correctly constructed
    # (variable-length inputs are zero-padded to the batch maximum)
    self.assertEqual(dataloader1.dataset.split, "train")
    self.assertTrue(torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]])))
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1])))

    dataloader2 = DictDataLoader(dataset=dataset, batch_size=3)
    x_batch, y_batch = next(iter(dataloader2))

    # Check that a dataloader with a different batch size is correctly constructed
    self.assertEqual(dataloader2.dataset.split, "train")
    self.assertTrue(
        torch.equal(
            x_batch["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
        )
    )
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1, 1])))

    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]
    dataset.Y_dict["task2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check that the dataloader reflects the updated dataset
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2]])))

    x_batch, y_batch = next(iter(dataloader2))
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2], [2]])))
# `DictDataLoader` is a wrapper around `torch.utils.data.DataLoader` that
# handles the collate function for `DictDataset` appropriately.

# %%
import torch

from snorkel.classification import DictDataset, DictDataLoader

dataloaders = []
for task_name in ["circle", "square"]:
    for split, X, Y in (
        ("train", X_train, Y_train),
        ("valid", X_valid, Y_valid),
        ("test", X_test, Y_test),
    ):
        X_dict = {f"{task_name}_data": torch.FloatTensor(X[task_name])}
        Y_dict = {f"{task_name}_task": torch.LongTensor(Y[task_name])}
        dataset = DictDataset(f"{task_name}Dataset", split, X_dict, Y_dict)
        dataloader = DictDataLoader(dataset, batch_size=32)
        dataloaders.append(dataloader)

# %% [markdown]
# We now have six data loaders, one for each split (`train`, `valid`, `test`)
# of each task (`circle_task` and `square_task`).

# %% [markdown]
# ## Define Model

# %% [markdown]
# Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task classifier.
# We'll instantiate it from a list of `Tasks`.

# %% [markdown]
# ### Tasks
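# %% [markdown]
# As a preview of the next step, here is a minimal sketch of what one such
# `Task` might look like. The head module and operation name below are
# illustrative placeholders, not the tutorial's actual definitions.

# %%
from torch import nn

from snorkel.classification import MultitaskClassifier, Operation, Task

# Hypothetical single-task example: a linear head over the 2-d circle features
module_pool = nn.ModuleDict({"circle_head": nn.Linear(2, 2)})
op_sequence = [
    Operation(
        name="circle_head_op",
        module_name="circle_head",
        inputs=[("_input_", "circle_data")],
    )
]
circle_task = Task(
    name="circle_task", module_pool=module_pool, op_sequence=op_sequence
)
model = MultitaskClassifier([circle_task])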
import random
import unittest

import numpy as np
import torch

from snorkel.classification import DictDataLoader, DictDataset
from snorkel.classification.training.schedulers import (
    SequentialScheduler,
    ShuffledScheduler,
)

dataset1 = DictDataset(
    "d1",
    "train",
    X_dict={"data": [0, 1, 2, 3, 4]},
    Y_dict={"labels": torch.LongTensor([1, 1, 1, 1, 1])},
)

dataset2 = DictDataset(
    "d2",
    "train",
    X_dict={"data": [5, 6, 7, 8, 9]},
    Y_dict={"labels": torch.LongTensor([2, 2, 2, 2, 2])},
)

dataloader1 = DictDataLoader(dataset1, batch_size=2)
dataloader2 = DictDataLoader(dataset2, batch_size=2)
dataloaders = [dataloader1, dataloader2]


class SequentialTest(unittest.TestCase):
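    def test_sequential_order(self):
        # Hypothetical sketch of the kind of check this class performs,
        # assuming the schedulers expose a `get_batches(dataloaders)` iterator
        # yielding (batch, dataloader) pairs, as in Snorkel's Trainer loop.
        # SequentialScheduler should exhaust dataloader1 before dataloader2,
        # so the "data" values should come out in sorted order.
        scheduler = SequentialScheduler()
        data = []
        for batch, _ in scheduler.get_batches(dataloaders):
            X_dict, _ = batch
            data.extend(X_dict["data"])
        self.assertEqual(data, sorted(data))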
def create_dataset(X, Y, split, dataset_name, input_name, task_name):
    return DictDataset(
        name=dataset_name,
        split=split,
        X_dict={input_name: X},
        Y_dict={task_name: Y},
    )