def test_shuffled(self):
    set_seed(123)
    scheduler = ShuffledScheduler()
    data = []
    for batch, dl in scheduler.get_batches(dataloaders):
        X_dict, Y_dict = batch
        data.extend(X_dict["data"])
    self.assertNotEqual(data, sorted(data))
@classmethod
def setUpClass(cls):
    # Ensure deterministic runs
    set_seed(123)

    # Create raw data
    cls.N_TRAIN = 1500
    cls.cardinality = 2
    cls.df_train = create_data(cls.N_TRAIN)
@classmethod
def setUpClass(cls):
    # Ensure deterministic runs
    set_seed(123)

    # Create raw data
    cls.N_TRAIN = 1500
    cls.N_VALID = 300
    cls.df_train = create_data(cls.N_TRAIN)
    cls.df_valid = create_data(cls.N_VALID)
def test_score_shuffled(self):
    # Test scoring with a shuffled dataset
    set_seed(123)

    class SimpleVoter(nn.Module):
        def forward(self, x):
            """Vote for class 0 if x is even and class 1 otherwise."""
            mask = x % 2 == 0
            out = torch.zeros(x.shape[0], 2)
            out[mask, 0] = 1  # class 0
            out[~mask, 1] = 1  # class 1
            return out

    # Create model
    task_name = "VotingTask"
    module_name = "simple_voter"
    module_pool = nn.ModuleDict({module_name: SimpleVoter()})
    op0 = Operation(module_name=module_name, inputs=[("_input_", "data")], name="op0")
    op_sequence = [op0]
    task = Task(name=task_name, module_pool=module_pool, op_sequence=op_sequence)
    model = MultitaskClassifier([task])

    # Create dataset
    y_list = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    x_list = list(range(len(y_list)))
    Y = torch.LongTensor(y_list * 100)
    X = torch.FloatTensor(x_list * 100)
    dataset = DictDataset(
        name="dataset", split="train", X_dict={"data": X}, Y_dict={task_name: Y}
    )

    # Create dataloaders
    dataloader = DictDataLoader(dataset, batch_size=2, shuffle=False)
    scores = model.score([dataloader])
    self.assertEqual(scores["VotingTask/dataset/train/accuracy"], 0.6)

    dataloader_shuffled = DictDataLoader(dataset, batch_size=2, shuffle=True)
    scores_shuffled = model.score([dataloader_shuffled])
    self.assertEqual(scores_shuffled["VotingTask/dataset/train/accuracy"], 0.6)
def fit(
    self,
    L_train: np.ndarray,
    Y_dev: Optional[np.ndarray] = None,
    class_balance: Optional[List[float]] = None,
    **kwargs: Any,
) -> None:
    """Train label model.

    Train label model to estimate mu, the parameters used to combine LFs.

    Parameters
    ----------
    L_train
        An [n,m] matrix with values in {-1,0,1,...,k-1}
    Y_dev
        Gold labels for dev set for estimating class_balance, by default None
    class_balance
        Each class's percentage of the population, by default None
    **kwargs
        Arguments for changing train config defaults

    Raises
    ------
    Exception
        If loss is NaN

    Examples
    --------
    >>> L = np.array([[0, 0, -1], [-1, 0, 1], [1, -1, 0]])
    >>> Y_dev = [0, 1, 0]
    >>> label_model = LabelModel(verbose=False)
    >>> label_model.fit(L)
    >>> label_model.fit(L, Y_dev=Y_dev)
    >>> label_model.fit(L, class_balance=[0.7, 0.3])
    """
    # Update base config so that it includes all parameters
    self.train_config: TrainConfig = merge_config(  # type:ignore
        TrainConfig(), kwargs  # type:ignore
    )
    # Set random seed
    set_seed(self.train_config.seed)

    L_shift = L_train + 1  # convert to {0, 1, ..., k}
    if L_shift.max() > self.cardinality:
        raise ValueError(
            f"L_train has cardinality {L_shift.max()}, cardinality={self.cardinality} passed in."
        )

    self._set_class_balance(class_balance, Y_dev)
    self._set_constants(L_shift)
    self._create_tree()
    lf_analysis = LFAnalysis(L_train)
    self.coverage = lf_analysis.lf_coverages()

    # Compute O and initialize params
    if self.config.verbose:  # pragma: no cover
        logging.info("Computing O...")
    self._generate_O(L_shift)
    self._init_params()

    # Estimate \mu
    if self.config.verbose:  # pragma: no cover
        logging.info(r"Estimating \mu...")

    # Set model to train mode
    self.train()

    # Move model to GPU
    if self.config.verbose and self.config.device != "cpu":  # pragma: no cover
        logging.info("Using GPU...")
    self.to(self.config.device)

    # Set training components
    self._set_logger()
    self._set_optimizer()
    self._set_lr_scheduler()

    # Restore model if necessary
    start_iteration = 0

    # Train the model
    metrics_hist = {}  # The most recently seen value for all metrics
    for epoch in range(start_iteration, self.train_config.n_epochs):
        self.running_loss = 0.0
        self.running_examples = 0

        # Zero the parameter gradients
        self.optimizer.zero_grad()

        # Forward pass to calculate the average loss per example
        loss = self._loss_mu(l2=self.train_config.l2)
        if torch.isnan(loss):
            msg = "Loss is NaN. Consider reducing learning rate."
            raise Exception(msg)

        # Backward pass to calculate gradients
        # Loss is an average loss per example
        loss.backward()

        # Perform optimizer step
        self.optimizer.step()

        # Calculate metrics, log, and checkpoint as necessary
        metrics_dict = self._execute_logging(loss)
        metrics_hist.update(metrics_dict)

        # Update learning rate
        self._update_lr_scheduler(epoch)

    self.eval()

    if self.config.verbose:  # pragma: no cover
        logging.info("Finished Training")
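# For reference, a minimal usage sketch of fit (not part of the source file above):
# it assumes snorkel's public LabelModel API (fit/predict) and standard train config
# kwargs such as n_epochs and seed.
import numpy as np

from snorkel.labeling import LabelModel

L_train = np.array([[0, 0, -1], [-1, 0, 1], [1, -1, 0], [0, 1, 0]])
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L_train, n_epochs=100, seed=123)
preds = label_model.predict(L_train)  # {0, 1} labels, or -1 where the model abstains on ties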
# The purpose of this tutorial is to introduce the basic interfaces and flow of multi-task learning tools within Snorkel.
# We assume that you have prior experience with multi-task learning (MTL), so we don't motivate or explain it at large here.
#
# In this notebook, we will start by looking at a simple MTL model with only two tasks, each having distinct data and only one set of ground truth labels ("gold" labels).
# For simplicity, we'll also use a dataset whose raw data can be used directly as features (unlike text data, which would first need to be tokenized and converted into token ids).
# At the end, you'll fill in the missing details to add a third task to the model.

# %% [markdown] {"tags": ["md-exclude"]}
# ## Environment Setup

# %% {"tags": ["md-exclude"]}
# %matplotlib inline

from snorkel.utils import set_seed

SEED = 123
set_seed(SEED)

# %% [markdown]
# ## Create Toy Data

# %% [markdown]
# We'll now create a toy dataset to work with.
# Our data points are 2D points in a square centered on the origin.
# Our tasks will be classifying whether these points are:
#
# 1. Inside a **unit circle** centered on the origin (label 0 = `False`, label 1 = `True`)
# 2. Inside a **unit square** centered on the origin (label 0 = `False`, label 1 = `True`)

# %% [markdown] {"tags": ["md-exclude"]}
# We'll visualize these decision boundaries in a few cells.
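# %% [markdown]
# As a rough sketch of what generating such data could look like (this is illustrative, not the tutorial's actual generation code; the helper name `make_toy_data`, the sampling range, and the exact boundary sizes are assumptions), we can sample uniform 2D points and label them against each boundary:

# %%
import numpy as np


def make_toy_data(n, seed=SEED):
    """Sample n 2D points and label them for both tasks (hypothetical helper)."""
    rng = np.random.RandomState(seed)
    X = rng.uniform(-1.5, 1.5, size=(n, 2))  # assumed sampling square
    # Task 1: inside the unit circle, i.e. x^2 + y^2 < 1
    y_circle = (np.sum(X ** 2, axis=1) < 1).astype(int)
    # Task 2: inside the unit square, here read literally as side length 1
    # (|x|, |y| < 0.5) -- this interpretation is an assumption
    y_square = (np.abs(X).max(axis=1) < 0.5).astype(int)
    return X, y_circle, y_square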
# In this tutorial, we:
# 1. **Introduce _Slicing Functions (SFs)_** as a programming interface
# 2. **Monitor** application-critical data subsets
# 3. **Improve model performance** on slices

# %% [markdown] {"tags": ["md-exclude"]}
# First, we'll set up our notebook for reproducibility and proper logging.

# %% {"tags": ["md-exclude"]}
import logging
import os

from snorkel.utils import set_seed

# For reproducibility
os.environ["PYTHONHASHSEED"] = "0"
set_seed(111)

# Make sure we're running from the spam/ directory
if os.path.basename(os.getcwd()) == "snorkel-tutorials":
    os.chdir("spam")

# To visualize logs
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# %% [markdown] {"tags": ["md-exclude"]}
# If you want to display all comment text untruncated, change `DISPLAY_ALL_TEXT` to `True` below.

# %% {"tags": ["md-exclude"]}
import pandas as pd
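# %% {"tags": ["md-exclude"]}
# A minimal sketch of the toggle described above. The `DISPLAY_ALL_TEXT` name comes
# from the prose; wiring it to pandas' standard `display.max_colwidth` option is an
# assumption about how the tutorial implements it.
DISPLAY_ALL_TEXT = False

pd.set_option("display.max_colwidth", None if DISPLAY_ALL_TEXT else 50)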
def setUp(self):
    set_seed(123)
@classmethod
def setUpClass(cls):
    # Ensure deterministic runs
    set_seed(123)
@classmethod
def setUpClass(cls):
    set_seed(123)