def test_score_shuffled(self):
    # Test scoring with a shuffled dataset

    set_seed(123)

    class SimpleVoter(nn.Module):
        def forward(self, x):
            """Vote class 0 if x is even and class 1 otherwise."""
            mask = x % 2 == 0
            out = torch.zeros(x.shape[0], 2)
            out[mask, 0] = 1  # class 0
            out[~mask, 1] = 1  # class 1
            return out

    # Create model
    task_name = "VotingTask"
    module_name = "simple_voter"
    module_pool = nn.ModuleDict({module_name: SimpleVoter()})
    op0 = Operation(
        module_name=module_name, inputs=[("_input_", "data")], name="op0"
    )
    op_sequence = [op0]
    task = Task(name=task_name, module_pool=module_pool, op_sequence=op_sequence)
    model = MultitaskClassifier([task])

    # Create dataset
    y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    x_list = list(range(len(y_list)))
    Y = torch.LongTensor(y_list * 100)
    X = torch.FloatTensor(x_list * 100)
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict={task_name: Y}
    )

    # Create dataloaders
    dataloader = DictDataLoader(dataset, batch_size=2, shuffle=False)
    scores = model.score([dataloader])

    # The voter is correct on 6 of every 10 examples
    # (x = 0, 2, 4 predicted 0 with gold 0; x = 5, 7, 9 predicted 1 with gold 1),
    # so accuracy is 0.6 with or without shuffling.
    self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

    dataloader_shuffled = DictDataLoader(dataset, batch_size=2, shuffle=True)
    scores_shuffled = model.score([dataloader_shuffled])

    self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"], 0.6)
def make_slice_dataloader(
    self, dataset: DictDataset, S: np.recarray, **dataloader_kwargs: Any
) -> DictDataLoader:
    """Create DictDataLoader with slice labels, initialized from specified dataset.

    Slice names are taken from the field names of ``S``.

    Parameters
    ----------
    dataset
        A DictDataset that will be converted into a slice-aware dataloader
    S
        A [num_examples, num_slices] slice matrix indicating whether
        each example is in every slice
    dataloader_kwargs
        Arbitrary kwargs to be passed to DictDataLoader
        See ``DictDataLoader.__init__``.
    """
    # Base task must have corresponding labels in dataset
    if self.base_task.name not in dataset.Y_dict:  # type: ignore
        raise ValueError(
            f"Base task ({self.base_task.name}) labels missing from {dataset}"
        )

    # Initialize dataloader
    dataloader = DictDataLoader(dataset, **dataloader_kwargs)

    # Make dataloader slice-aware
    add_slice_labels(dataloader, self.base_task, S)
    return dataloader
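# A hedged usage sketch for the method above. The names are illustrative, not
# from this repo: `slice_model` is a SliceAwareClassifier, `valid_ds` a
# DictDataset whose Y_dict carries the base task's labels, and `S_valid` a
# slice recarray produced by an SF applier.
valid_dl = slice_model.make_slice_dataloader(
    valid_ds,  # base-task labels must be present in valid_ds.Y_dict
    S_valid,   # recarray from an SF applier; its field names become slice names
    batch_size=32,
    shuffle=False,
)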
def create_dataloader(task_name="task", split="train"):
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    dataset = DictDataset(
        name="dataset", split=split, X_dict={"data": X}, Y_dict={task_name: Y}
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
    return dataloader
def test_add_slice_labels(self):
    # Create dummy data
    # Given slicing function f(), we expect the first two entries to be active
    x = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.5])
    y = torch.Tensor([0, 1, 1, 0, 1]).long()
    dataset = DictDataset(
        name="TestData", split="train", X_dict={"data": x}, Y_dict={"TestTask": y}
    )

    # Ensure that we start with 1 labelset
    self.assertEqual(len(dataset.Y_dict), 1)

    # Apply SFs with PandasSFApplier
    df = pd.DataFrame({"val": x, "y": y})
    slicing_functions = [f]
    applier = PandasSFApplier(slicing_functions)
    S = applier.apply(df, progress_bar=False)

    dataloader = DictDataLoader(dataset)
    dummy_task = create_dummy_task(task_name="TestTask")
    add_slice_labels(dataloader, dummy_task, S)

    # Ensure that all the fields are present
    labelsets = dataloader.dataset.Y_dict
    self.assertIn("TestTask", labelsets)
    self.assertIn("TestTask_slice:base_ind", labelsets)
    self.assertIn("TestTask_slice:base_pred", labelsets)
    self.assertIn("TestTask_slice:f_ind", labelsets)
    self.assertIn("TestTask_slice:f_pred", labelsets)
    self.assertEqual(len(labelsets), 5)

    # Ensure "ind" contains mask
    self.assertEqual(
        labelsets["TestTask_slice:f_ind"].numpy().tolist(), [1, 1, 0, 0, 0]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_ind"].numpy().tolist(), [1, 1, 1, 1, 1]
    )

    # Ensure "pred" contains masked elements
    self.assertEqual(
        labelsets["TestTask_slice:f_pred"].numpy().tolist(), [0, 1, -1, -1, -1]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_pred"].numpy().tolist(), [0, 1, 1, 0, 1]
    )
    self.assertEqual(labelsets["TestTask"].numpy().tolist(), [0, 1, 1, 0, 1])
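# The slicing function `f` and the helper `create_dummy_task` used above are
# defined elsewhere in the test module. A minimal sketch of `f` consistent with
# the expected labelsets (the 0.25 threshold is an assumption, chosen so that
# exactly the first two rows, val = 0.1 and 0.2, are active):
from snorkel.slicing import slicing_function


@slicing_function()
def f(x):
    # Active iff the example's value falls below the (assumed) threshold
    return x.val < 0.25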
def create_dataloader(df: pd.DataFrame, split: str) -> DictDataLoader:
    dataset = DictDataset(
        name="TestData",
        split=split,
        X_dict={
            "coordinates": torch.stack(
                (torch.tensor(df["x1"]), torch.tensor(df["x2"])), dim=1
            )
        },
        Y_dict={"task": torch.tensor(df["y"], dtype=torch.long)},
    )

    dataloader = DictDataLoader(
        dataset=dataset, batch_size=4, shuffle=(dataset.split == "train")
    )
    return dataloader
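# Typical usage, assuming a DataFrame with "x1", "x2", and "y" columns.
# This toy frame is illustrative only; the tests build their own data.
df_train = pd.DataFrame(
    {"x1": [0.0, 1.0, 2.0, 3.0], "x2": [1.0, 0.0, 1.0, 0.0], "y": [0, 1, 0, 1]}
)
train_dl = create_dataloader(df_train, "train")  # shuffled, since split == "train"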
def test_remapped_labels(self):
    # Test additional label keys in the Y_dict
    # Without remapping, model should ignore them
    task_name = self.task1.name
    X = torch.FloatTensor([[i, i] for i in range(NUM_EXAMPLES)])
    Y = torch.ones(NUM_EXAMPLES).long()

    Y_dict = {task_name: Y, "other_task": Y}
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict=Y_dict
    )
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)

    model = MultitaskClassifier([self.task1])
    loss_dict, count_dict = model.calculate_loss(dataset.X_dict, dataset.Y_dict)
    self.assertIn("task1", loss_dict)

    # Test prediction and scoring without remapping
    results = model.predict(dataloader)
    self.assertIn("task1", results["golds"])
    self.assertNotIn("other_task", results["golds"])
    scores = model.score([dataloader])
    self.assertIn("task1/dataset/train/accuracy", scores)
    self.assertNotIn("other_task/dataset/train/accuracy", scores)

    # Test remapped labelsets
    results = model.predict(dataloader, remap_labels={"other_task": task_name})
    self.assertIn("task1", results["golds"])
    self.assertIn("other_task", results["golds"])
    results = model.score([dataloader], remap_labels={"other_task": task_name})
    self.assertIn("task1/dataset/train/accuracy", results)
    self.assertIn("other_task/dataset/train/accuracy", results)
num_classes = len(output_label_to_int_dict.keys())

# Define dictionary keys for the data, dataset, and task of the given task
task_data_name = f"{task_name}_data"
task_formal_name = f"{task_name}_task"
task_dataset_name = f"{task_name}Dataset"

for split, X, Y in (
    ("train", train_X, train_y),
    ("valid", dev_X, dev_y),
    ("test", test_X, test_y),
):
    X_dict = {task_data_name: torch.tensor(X, dtype=torch.long)}
    Y_dict = {task_formal_name: torch.tensor(Y, dtype=torch.long)}
    dataset = DictDataset(task_dataset_name, split, X_dict, Y_dict)
    dataloader = DictDataLoader(dataset, batch_size=BATCH_SIZE)
    dataloaders.append(dataloader)

# Define a one-layer prediction "head" module specific to each task
head_module = task_type_function_mapping[task_type]["head_module"](
    hidden_layer_size, num_classes
)
task_head_name = f"{task_name}_head_module"

# The module pool contains all the modules this task uses
module_pool = nn.ModuleDict({"bert_module": bert_module, task_head_name: head_module})

# The first operation keeps the same name across all tasks, since it wraps the
# shared bert_module (sketched below)
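# A hedged sketch of the operations the comment above describes. The shared
# BERT operation is named identically in every task so the module is reused;
# the exact inputs below are assumptions, following the Operation pattern used
# elsewhere in this repo.
bert_op = Operation(
    name="bert_module",
    module_name="bert_module",
    inputs=[("_input_", task_data_name)],
)
head_op = Operation(
    name=task_head_name,
    module_name=task_head_name,
    inputs=[bert_op.name],  # the head consumes the shared BERT output
)
op_sequence = [bert_op, head_op]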
def test_classifier_dataloader(self):
    """Unit test of DictDataLoader"""
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]
    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = DictDataset(
        name="new_data",
        split="train",
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"task1": y1, "task2": y2},
    )

    dataloader1 = DictDataLoader(dataset=dataset, batch_size=2)
    x_batch, y_batch = next(iter(dataloader1))

    # Check that the dataloader is correctly constructed
    self.assertEqual(dataloader1.dataset.split, "train")
    self.assertTrue(torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]])))
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1])))

    dataloader2 = DictDataLoader(dataset=dataset, batch_size=3)
    x_batch, y_batch = next(iter(dataloader2))

    # Check that a dataloader with a different batch size is correctly constructed
    self.assertEqual(dataloader2.dataset.split, "train")
    self.assertTrue(
        torch.equal(
            x_batch["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
        )
    )
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task1"], torch.Tensor([0, 0, 0])))
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([1, 1, 1])))

    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]
    dataset.Y_dict["task2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check that the dataloader reflects the updated dataset
    self.assertTrue(
        torch.equal(
            x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2]])))

    x_batch, y_batch = next(iter(dataloader2))
    self.assertTrue(
        torch.equal(
            x_batch["data2"],
            torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
        )
    )
    self.assertTrue(torch.equal(y_batch["task2"], torch.Tensor([[2], [2], [2]])))
# %%
import torch
from snorkel.classification import DictDataset, DictDataLoader

dataloaders = []
for task_name in ["circle", "square"]:
    for split, X, Y in (
        ("train", X_train, Y_train),
        ("valid", X_valid, Y_valid),
        ("test", X_test, Y_test),
    ):
        X_dict = {f"{task_name}_data": torch.FloatTensor(X[task_name])}
        Y_dict = {f"{task_name}_task": torch.LongTensor(Y[task_name])}
        dataset = DictDataset(f"{task_name}Dataset", split, X_dict, Y_dict)
        dataloader = DictDataLoader(dataset, batch_size=32)
        dataloaders.append(dataloader)

# %% [markdown]
# We now have 6 data loaders, one for each split (`train`, `valid`, `test`) of each task (`circle_task` and `square_task`).

# %% [markdown]
# ## Define Model

# %% [markdown]
# Now we'll define the `MultitaskClassifier` model, a PyTorch multi-task classifier.
# We'll instantiate it from a list of `Tasks`.

# %% [markdown]
# ### Tasks
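# %% [markdown]
# As a minimal sketch, a `Task` for the circle data might look like the cell
# below. The module names and layer sizes are illustrative assumptions, not the
# tutorial's actual definitions, which follow.

# %%
from torch import nn

from snorkel.analysis import Scorer
from snorkel.classification import Operation, Task

# Hypothetical module names and sizes, for illustration only.
circle_module_pool = nn.ModuleDict(
    {
        "base_mlp": nn.Sequential(nn.Linear(2, 8), nn.ReLU()),
        "circle_head": nn.Linear(8, 2),
    }
)
circle_task = Task(
    name="circle_task",
    module_pool=circle_module_pool,
    op_sequence=[
        Operation(
            name="base", module_name="base_mlp", inputs=[("_input_", "circle_data")]
        ),
        Operation(name="head", module_name="circle_head", inputs=["base"]),
    ],
    scorer=Scorer(metrics=["accuracy"]),
)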
# #### Create DataLoaders for Classifier

# %%
from snorkel.classification import DictDataLoader
from model import SceneGraphDataset, create_model

df_train["labels"] = label_model.predict(L_train)

if sample:
    TRAIN_DIR = "data/VRD/sg_dataset/samples"
else:
    TRAIN_DIR = "data/VRD/sg_dataset/sg_train_images"

dl_train = DictDataLoader(
    SceneGraphDataset("train_dataset", "train", TRAIN_DIR, df_train),
    batch_size=16,
    shuffle=True,
)

dl_valid = DictDataLoader(
    SceneGraphDataset("valid_dataset", "valid", TRAIN_DIR, df_valid),
    batch_size=16,
    shuffle=False,
)

# %% [markdown]
# #### Define Model Architecture

# %%
import torchvision.models as models
)
dataset1 = DictDataset(
    "d1",
    "train",
    X_dict={"data": [0, 1, 2, 3, 4]},
    Y_dict={"labels": torch.LongTensor([1, 1, 1, 1, 1])},
)
dataset2 = DictDataset(
    "d2",
    "train",
    X_dict={"data": [5, 6, 7, 8, 9]},
    Y_dict={"labels": torch.LongTensor([2, 2, 2, 2, 2])},
)

dataloader1 = DictDataLoader(dataset1, batch_size=2)
dataloader2 = DictDataLoader(dataset2, batch_size=2)
dataloaders = [dataloader1, dataloader2]


class SequentialTest(unittest.TestCase):
    def test_sequential(self):
        scheduler = SequentialScheduler()
        data = []
        for batch, dl in scheduler.get_batches(dataloaders):
            X_dict, Y_dict = batch
            data.extend(X_dict["data"])
        self.assertEqual(data, sorted(data))

    def test_shuffled(self):
        random.seed(123)