def test_abstain_labels(self) -> None:
    # We abstain on the last example by convention (label=-1)
    golds = np.array([1, 0, 1, 0, -1])
    preds = np.array([1, 0, 1, 1, 0])
    probs = np.array([0.8, 0.6, 0.9, 0.7, 0.4])

    # Test no abstain
    scorer = Scorer(metrics=["accuracy"], abstain_label=None)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.6)
    self.assertEqual(results, results_expected)

    # Test abstain=-1 for gold
    scorer = Scorer(metrics=["accuracy"], abstain_label=-1)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.75)
    self.assertEqual(results, results_expected)

    # Test abstain=-1 for preds and gold
    abstain_preds = np.array([-1, -1, 1, 1, 0])
    results = scorer.score(golds, abstain_preds)
    results_expected = dict(accuracy=0.5)
    self.assertEqual(results, results_expected)

    # Test abstain set to different value
    scorer = Scorer(metrics=["accuracy"], abstain_label=10)
    results = scorer.score(golds, preds, probs)
    results_expected = dict(accuracy=0.6)
    self.assertEqual(results, results_expected)
def test_dict_metric(self) -> None:
    def dict_metric(golds, preds, probs):
        return dict(a=1, b=2)

    scorer = Scorer(custom_metric_funcs=dict(dict_metric=dict_metric))
    results = scorer.score(*self._get_labels())
    results_expected = dict(a=1, b=2)
    self.assertEqual(results, results_expected)
def test_scorer(self) -> None:
    def pred_sum(golds, preds, probs):
        return np.sum(preds)

    scorer = Scorer(
        metrics=["accuracy", "f1"], custom_metric_funcs=dict(pred_sum=pred_sum)
    )
    results = scorer.score(*self._get_labels())
    results_expected = dict(accuracy=0.6, f1=2 / 3, pred_sum=3)
    self.assertEqual(results, results_expected)
def score(
    self,
    L: np.ndarray,
    Y: np.ndarray,
    metrics: Optional[List[str]] = ["accuracy"],
    tie_break_policy: str = "abstain",
) -> Dict[str, float]:
    """Calculate one or more scores from user-specified and/or user-defined metrics.

    Parameters
    ----------
    L
        An [n,m] matrix with values in {-1,0,1,...,k-1}
    Y
        Gold labels associated with data points in L
    metrics
        A list of metric names
    tie_break_policy
        Policy to break ties when converting probabilistic labels to predictions

    Returns
    -------
    Dict[str, float]
        A dictionary mapping metric names to metric scores

    Example
    -------
    >>> L = np.array([[1, 1, -1], [0, 0, -1], [1, 1, -1]])
    >>> label_model = LabelModel(verbose=False)
    >>> label_model.fit(L)
    >>> label_model.score(L, Y=np.array([1, 1, 1]))
    {'accuracy': 0.6666666666666666}
    >>> label_model.score(L, Y=np.array([1, 1, 1]), metrics=["f1"])
    {'f1': 0.8}
    """
    if tie_break_policy == "abstain":  # pragma: no cover
        logging.warning(
            "Metrics calculated over data points with non-abstain labels only"
        )

    Y_pred, Y_prob = self.predict(
        L, return_probs=True, tie_break_policy=tie_break_policy
    )

    scorer = Scorer(metrics=metrics)
    results = scorer.score(Y, Y_pred, Y_prob)
    return results
def create_model(resnet_cnn):
    # freeze the resnet weights
    for param in resnet_cnn.parameters():
        param.requires_grad = False

    # define input features
    in_features = resnet_cnn.fc.in_features
    feature_extractor = nn.Sequential(*list(resnet_cnn.children())[:-1])

    # initialize FC layer: maps 3 sets of image features to class logits
    WEMB_SIZE = 100
    fc = nn.Linear(in_features * 3 + 2 * WEMB_SIZE, 3)
    init_fc(fc)

    # define layers
    module_pool = nn.ModuleDict(
        {
            "feat_extractor": feature_extractor,
            "prediction_head": fc,
            "feat_concat": FlatConcat(),
            "word_emb": WordEmb(),
        }
    )

    # define task flow through modules
    op_sequence = get_op_sequence()
    pred_cls_task = Task(
        name="visual_relation_task",
        module_pool=module_pool,
        op_sequence=op_sequence,
        scorer=Scorer(metrics=["f1_micro"]),
    )
    return MultitaskClassifier([pred_cls_task])
def create_task(task_name: str, module_suffixes: List[str]) -> Task:
    module1_name = f"linear1{module_suffixes[0]}"
    module2_name = f"linear2{module_suffixes[1]}"

    module_pool = nn.ModuleDict(
        {
            module1_name: nn.Sequential(nn.Linear(2, 20), nn.ReLU()),
            module2_name: nn.Linear(20, 2),
        }
    )

    op1 = Operation(module_name=module1_name, inputs=[("_input_", "coordinates")])
    op2 = Operation(module_name=module2_name, inputs=[op1.name])
    op_sequence = [op1, op2]

    task = Task(
        name=task_name,
        module_pool=module_pool,
        op_sequence=op_sequence,
        scorer=Scorer(metrics=["accuracy"]),
    )
    return task
def __init__(
    self,
    base_architecture: nn.Module,
    head_dim: int,
    slice_names: List[str],
    input_data_key: str = DEFAULT_INPUT_DATA_KEY,
    task_name: str = DEFAULT_TASK_NAME,
    scorer: Scorer = Scorer(metrics=["accuracy", "f1"]),
    **multitask_kwargs: Any,
) -> None:
    # Initialize module_pool with 1) base_architecture and 2) prediction_head
    # Assuming `head_dim` can be used to map base_architecture to prediction_head
    module_pool = nn.ModuleDict(
        {
            "base_architecture": base_architecture,
            "prediction_head": nn.Linear(head_dim, 2),
        }
    )

    # Create op_sequence from base_architecture -> prediction_head
    op_sequence = [
        Operation(
            name="input_op",
            module_name="base_architecture",
            inputs=[("_input_", input_data_key)],
        ),
        Operation(name="head_op", module_name="prediction_head", inputs=["input_op"]),
    ]

    # Initialize base_task using specified base_architecture
    self.base_task = Task(
        name=task_name,
        module_pool=module_pool,
        op_sequence=op_sequence,
        scorer=scorer,
    )

    # Convert base_task to associated slice_tasks
    slice_tasks = convert_to_slice_tasks(self.base_task, slice_names)

    # Initialize a MultitaskClassifier with all slice_tasks
    model_name = f"{task_name}_slicing_classifier"
    super().__init__(tasks=slice_tasks, name=model_name, **multitask_kwargs)
    self.slice_names = slice_names
def __init__(
    self,
    name: str,
    module_pool: nn.ModuleDict,
    op_sequence: Sequence[Operation],
    scorer: Scorer = Scorer(metrics=["accuracy"]),
    loss_func: Optional[Callable[..., torch.Tensor]] = None,
    output_func: Optional[Callable[..., torch.Tensor]] = None,
) -> None:
    self.name = name
    self.module_pool = module_pool
    self.op_sequence = op_sequence
    self.loss_func = loss_func or F.cross_entropy
    self.output_func = output_func or partial(F.softmax, dim=1)
    self.scorer = scorer
    logging.info(f"Created task: {self.name}")
def test_score_slices(self):
    DATA = [5, 10, 19, 22, 25]

    @slicing_function()
    def sf(x):
        return x.num < 20

    # We expect 3/5 correct -> 0.6 accuracy
    golds = np.array([0, 1, 0, 1, 0])
    preds = np.array([0, 0, 0, 0, 0])
    probs = preds_to_probs(preds, 2)

    # In the slice, we expect the last 2 elements to be masked
    # We expect 2/3 correct -> 0.666 accuracy
    data = [SimpleNamespace(num=x) for x in DATA]
    S = SFApplier([sf]).apply(data)

    scorer = Scorer(metrics=["accuracy"])

    # Test normal score
    metrics = scorer.score(golds=golds, preds=preds, probs=probs)
    self.assertEqual(metrics["accuracy"], 0.6)

    # Test score_slices
    slice_metrics = scorer.score_slices(S=S, golds=golds, preds=preds, probs=probs)
    self.assertEqual(slice_metrics["overall"]["accuracy"], 0.6)
    self.assertEqual(slice_metrics["sf"]["accuracy"], 2.0 / 3.0)

    # Test as_dataframe=True
    metrics_df = scorer.score_slices(
        S=S, golds=golds, preds=preds, probs=probs, as_dataframe=True
    )
    self.assertTrue(isinstance(metrics_df, pd.DataFrame))
    self.assertEqual(metrics_df["accuracy"]["overall"], 0.6)
    self.assertEqual(metrics_df["accuracy"]["sf"], 2.0 / 3.0)

    # Test wrong shapes
    with self.assertRaisesRegex(ValueError, "must have the same number of elements"):
        scorer.score_slices(
            S=S, golds=golds[:1], preds=preds, probs=probs, as_dataframe=True
        )
def setUp(self):
    # Define S_matrix
    data_points = [SimpleNamespace(num=num) for num in DATA]
    applier = SFApplier([f, g])
    self.S = applier.apply(data_points, progress_bar=False)

    # Define base architecture
    self.hidden_dim = 10
    self.mlp = nn.Sequential(
        nn.Linear(2, self.hidden_dim),
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.ReLU(),
    )

    # Define model parameters
    self.data_name = "test_data"
    self.task_name = "test_task"

    # Define datasets
    # Repeated data value for [N x 2] dim Tensor
    self.X = torch.FloatTensor([(x, x) for x in DATA])
    # Alternating labels
    self.Y = torch.LongTensor([int(i % 2 == 0) for i in range(len(DATA))])

    dataset_name = "test_dataset"
    splits = ["train", "valid"]
    self.datasets = [
        create_dataset(
            self.X, self.Y, split, dataset_name, self.data_name, self.task_name
        )
        for split in splits
    ]

    self.slice_model = SliceAwareClassifier(
        base_architecture=self.mlp,
        head_dim=self.hidden_dim,
        slice_names=[sf.name for sf in sfs],
        input_data_key=self.data_name,
        task_name=self.task_name,
        scorer=Scorer(metrics=["f1"]),
    )
# Other parameters
n_epochs = 100
lr = 0.01
sig = 0.05
policy = "new"

# Copy data from notebook
L_data_global = np.copy(L_alarms[:, :57])
Y_data_global = np.copy(Y_alarms)

# Set up Scorer
my_metrics = {
    "abstain rate": lambda golds, preds, probs: np.sum(preds == ABSTAIN) / len(preds)
}
scorer = Scorer(metrics=["accuracy", "f1"], custom_metric_funcs=my_metrics)

# Define the experiment
results_mtx = np.empty((n_exps, 4, n_iters), dtype=float)
results_mtx[:] = np.nan


def thread_experiment(exp, L_data, Y_data):
    for iter in range(n_iters):
        # Randomly sample J sets of K LFs
        subsets = np.random.choice(
            L_data.shape[1],
            size=(n_subsets, subset_size),
            replace=with_replacement,
        )

        # Define a new LF for each of the J sets as the prediction of a
        # dependency-informed Snorkel model with the K LFs
        L_train, L_dev = train_test_split(L_data, test_size=0.2, shuffle=True)
default="512", help='Max size of the input in tokens') parser.add_argument('--batch_size', default="32", help='Batch size of every dataset') args = parser.parse_args() MAX_SEQ_LENGTH = int(args.max_seq_length) BATCH_SIZE = int(args.batch_size) task_type_function_mapping = { "Classification_Tasks": { "data_handler": Classification_Task_Data_Handler, "head_module": ClassificationLinearLayer, "loss_function": F.cross_entropy, "scorer": Scorer(metrics=["accuracy"]) }, "Tagging_Tasks": { "data_handler": Tagging_Task_Data_Handler, "head_module": TaggingLinearLayer, "loss_function": tagging_cross_entropy, "scorer": Scorer(custom_metric_funcs={"Tag_accuracy": tag_accuracy_scorer}) } } # Get the absolute current working directory of the project cwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) # Create empty list to hold every Dataloader object dataloaders = []
def test_no_probs(self) -> None:
    scorer = Scorer()
    golds, preds, probs = self._get_labels()
    self.assertEqual(scorer.score(golds, preds), scorer.score(golds, preds, probs))
def test_no_labels(self) -> None:
    scorer = Scorer()
    with self.assertRaisesRegex(ValueError, "Cannot score"):
        scorer.score([], [], [])
def test_no_metrics(self) -> None:
    scorer = Scorer()
    self.assertEqual(scorer.score(*self._get_labels()), {})
def test_invalid_metric(self) -> None:
    with self.assertRaisesRegex(ValueError, "Unrecognized metric"):
        Scorer(metrics=["accuracy", "f2"])
def slicing_evaluation(df_train, df_test, train_model=None):
    if train_model is None:
        train_model = "mlp"

    sfs = [
        SlicingFunction.short_comment,
        SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re,
        SlicingFunction.industry_keyword,
    ]
    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        return ft.get_sentence_vector(" ".join([w for w in jieba.lcut(text.strip())]))

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]),
        )
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(
            S=S_test,
            golds=Y_test,
            preds=preds_test,
            probs=probs_test,
            as_dataframe=True,
        )
        return analysis

    if train_model == "mlp":
        # Define model architecture
        bow_dim = X_train.shape[1]
        hidden_dim = bow_dim
        mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

        # Initialize slice model
        slice_model = SliceAwareClassifier(
            base_architecture=mlp,
            head_dim=hidden_dim,
            slice_names=slice_names,
            scorer=scorer,
        )

        # Generate the remaining S matrices with the new set of slicing functions
        applier = PandasSFApplier(sfs)
        S_train = applier.apply(df_train)
        S_test = applier.apply(df_test)

        # Add slice labels to an existing dataloader
        BATCH_SIZE = 64
        train_dl = create_dict_dataloader(X_train, Y_train, "train")
        train_dl_slice = slice_model.make_slice_dataloader(
            train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE
        )
        test_dl = create_dict_dataloader(X_test, Y_test, "train")
        test_dl_slice = slice_model.make_slice_dataloader(
            test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE
        )

        # Fit our classifier with the training set dataloader
        trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
        trainer.fit(slice_model, [train_dl_slice])

        analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
        return analysis
# For our data format, we leverage the [`PandasSFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/slicing/snorkel.slicing.PandasSFApplier.html#snorkel.slicing.PandasSFApplier).
# The output of the `applier` is an [`np.recarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html) which stores vectors in named fields indicating whether each of $n$ data points belongs to the corresponding slice.

# %% {"tags": ["md-exclude-output"]}
from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

# %% [markdown]
# Now, we initialize a [`Scorer`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer) using the desired `metrics`.

# %%
from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1"])

# %% [markdown]
# Using the [`score_slices`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer.score_slices) method, we can see both `overall` and slice-specific performance.

# %%
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

# %% [markdown]
# Despite high overall performance, the `short_comment` slice performs poorly here!

# %% [markdown]
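# As a quick sanity check (not part of the original tutorial), we can make the
# recarray structure concrete: each slicing function contributes a named 0/1
# field indicating slice membership.

# %%
print(S_test.dtype.names)  # one field per slicing function, e.g. ('short_comment', ...)
print(S_test[:5])  # 0/1 slice membership for the first five test points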
def convert_to_slice_tasks(base_task: Task, slice_names: List[str]) -> List[Task]:
    """Add slice labels to the dataloader and create new slice tasks (including a base slice).

    Each slice will get two slice-specific heads:
    - an indicator head that learns to identify when DataPoints are in that slice
    - a predictor head that is trained on only members of that slice

    The base task's head is replaced by a master head that makes predictions based
    on a combination of the predictor heads' predictions that are weighted by the
    indicator heads' prediction confidences.

    WARNING: The current implementation pollutes the module_pool---the indicator
    task's module_pool includes predictor modules and vice versa since both are
    modified in place. This does not affect the result because the op sequences
    dictate which modules get used, and those do not include the extra modules.
    An alternative would be to make separate copies of the module pool for each,
    but that wastes time and memory on extra copies of (potentially very large)
    modules that will be merged away a moment later in the model, since they have
    the same name. We leave resolution of this issue for a future release.

    Parameters
    ----------
    base_task
        Task for which we are adding slice tasks. As noted in the WARNING, this
        task's module_pool will currently be modified in place for efficiency
        purposes.
    slice_names
        List of slice names corresponding to the columns of the slice matrix.

    Returns
    -------
    List[Task]
        Contains the original base_task, pred/ind tasks for the base slice, and
        pred/ind tasks for each of the specified slice_names
    """
    if "base" not in slice_names:
        slice_names = slice_names + ["base"]

    slice_tasks: List[Task] = []

    # Keep track of all operations related to slice tasks
    slice_task_ops: List[Operation] = []

    # NOTE: We assume here that the last operation uses the head module
    # Identify base task head module
    head_module_op = base_task.op_sequence[-1]
    head_module = base_task.module_pool[head_module_op.module_name]
    original_loss_func = base_task.loss_func

    if isinstance(head_module, nn.DataParallel):
        head_module = head_module.module

    neck_size = head_module.in_features
    assert isinstance(neck_size, int)
    base_task_cardinality = head_module.out_features
    assert isinstance(base_task_cardinality, int)

    # Remove the slice-unaware head module from module pool and op sequence
    del base_task.module_pool[head_module_op.module_name]
    body_flow = base_task.op_sequence[:-1]

    # Create slice indicator tasks
    for slice_name in slice_names:
        ind_task_name = f"{base_task.name}_slice:{slice_name}_ind"

        ind_head_module_name = f"{ind_task_name}_head"
        # Indicator head always predicts "in the slice or not", so is always binary
        ind_head_module = nn.Linear(neck_size, 2)

        # Create module_pool
        ind_module_pool = base_task.module_pool
        ind_module_pool[ind_head_module_name] = ind_head_module

        # Define operations for task head
        ind_head_op = Operation(
            module_name=ind_head_module_name, inputs=head_module_op.inputs
        )
        ind_task_ops = [ind_head_op]
        slice_task_ops.extend(ind_task_ops)

        # Create op sequence
        ind_op_sequence = list(body_flow) + list(ind_task_ops)

        # Create ind task
        ind_task = Task(
            name=ind_task_name,
            module_pool=ind_module_pool,
            op_sequence=ind_op_sequence,
            # NOTE: F1 by default because indicator task is often class imbalanced
            scorer=Scorer(metrics=["f1"]),
        )
        slice_tasks.append(ind_task)

    # Create slice predictor tasks
    shared_pred_head_module = nn.Linear(neck_size, base_task_cardinality)
    for slice_name in slice_names:
        pred_task_name = f"{base_task.name}_slice:{slice_name}_pred"

        pred_head_module_name = f"{pred_task_name}_head"
        pred_transform_module_name = f"{pred_task_name}_transform"
        pred_transform_module = nn.Linear(neck_size, neck_size)

        # Create module_pool
        # NOTE: See note in doc string about module_pool pollution
        pred_module_pool = base_task.module_pool
        pred_module_pool[pred_transform_module_name] = pred_transform_module
        pred_module_pool[pred_head_module_name] = shared_pred_head_module

        # Define operations for task head
        pred_transform_op = Operation(
            module_name=pred_transform_module_name, inputs=head_module_op.inputs
        )
        pred_head_op = Operation(
            module_name=pred_head_module_name, inputs=[pred_transform_op.name]
        )
        pred_task_ops = [pred_transform_op, pred_head_op]
        slice_task_ops.extend(pred_task_ops)

        # Create op sequence
        pred_op_sequence = list(body_flow) + list(pred_task_ops)

        # Create pred task
        pred_task = Task(
            name=pred_task_name,
            module_pool=pred_module_pool,
            op_sequence=pred_op_sequence,
            scorer=base_task.scorer,
        )
        slice_tasks.append(pred_task)

    # Create master task
    master_task_name = base_task.name
    master_combiner_module_name = f"{base_task.name}_master_combiner"
    master_combiner_module = SliceCombinerModule()
    master_head_module_name = f"{base_task.name}_master_head"
    master_head_module = head_module

    # Create module_pool
    master_module_pool = nn.ModuleDict(
        {
            master_combiner_module_name: master_combiner_module,
            master_head_module_name: master_head_module,
        }
    )

    master_combiner_op = Operation(module_name=master_combiner_module_name, inputs=[])
    master_head_op = Operation(
        module_name=master_head_module_name, inputs=[master_combiner_op.name]
    )

    # NOTE: See note in doc string about module_pool pollution
    # Create op_sequence
    master_op_sequence = (
        list(body_flow) + list(slice_task_ops) + [master_combiner_op, master_head_op]
    )

    master_task = Task(
        name=master_task_name,
        module_pool=master_module_pool,
        op_sequence=master_op_sequence,
        scorer=base_task.scorer,
        loss_func=original_loss_func,
    )

    return slice_tasks + [master_task]
# %%
from functools import partial

import torch.nn.functional as F

from snorkel.analysis import Scorer
from snorkel.classification import Task

circle_task = Task(
    name="circle_task",
    module_pool=module_pool,
    op_sequence=op_sequence,
    loss_func=F.cross_entropy,
    output_func=partial(F.softmax, dim=1),
    scorer=Scorer(metrics=["accuracy"]),
)

# %% [markdown]
# Note that `Task` objects are not dependent on a particular dataset; multiple datasets can be passed through the same modules for pre-training or co-training.

# %% [markdown]
# ### Again, but faster

# %% [markdown]
# We'll now define the square task, but more succinctly—for example, using the fact that the default name for an `Operation` is its `module_name` (since most tasks only use their modules once per forward pass).
#
# We'll also define the square task to share the first module in its task flow (`base_mlp`) with the circle task to demonstrate how to share modules. (Note that this is purely for illustrative purposes; for this toy task, it is quite possible that this is not the optimal arrangement of modules.)
#
# Finally, the most common task definitions we see in practice are classification tasks with cross-entropy loss and softmax on the output of the last module, and accuracy is most often the primary metric of interest. Since these are all the default values, we can drop them here for brevity, as in the sketch below.
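# %% [markdown]
# A minimal sketch of that succinct definition. The `square_head` dimensions and
# the `"square_data"` input key are assumptions for illustration; `base_mlp` is
# the shared module described above.

# %%
import torch.nn as nn

from snorkel.classification import Operation

square_task = Task(
    name="square_task",
    module_pool=nn.ModuleDict(
        # Head dims assumed: base_mlp output size 4, binary classification
        {"base_mlp": base_mlp, "square_head": nn.Linear(4, 2)}
    ),
    op_sequence=[
        # The default Operation name is its module_name, so `name=` is omitted
        Operation("base_mlp", [("_input_", "square_data")]),
        Operation("square_head", ["base_mlp"]),
    ],
)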
# Extract parameters from arguments
n_epochs = int(sys.argv[1])
lr = float(sys.argv[2])
abstain_rate = float(sys.argv[3])  # if < 0 then no abstain rate requested

# Other parameters
n_folds = 5

# Extract relevant data
L_data_local = np.copy(L_data[:, :57])
Y_data_local = np.copy(Y_data)

# Set up Scorer
my_metrics = {
    "abstain rate": lambda golds, preds, probs: np.sum(preds == ABSTAIN) / len(preds)
}
scorer = Scorer(metrics=["accuracy", "f1"], custom_metric_funcs=my_metrics)

# Cross validation
all_scores = []
kf = KFold(n_splits=n_folds, shuffle=True)
for i, (train_idx, test_idx) in enumerate(kf.split(L_data_local)):
    # Define train dataset
    L_train = L_data_local[train_idx]
    Y_train = Y_data_local[train_idx]

    # Define test dataset
    L_test = L_data_local[test_idx]
    Y_test = Y_data_local[test_idx]

    # Evaluate a dependency-informed Snorkel model
    l_model = LabelModel(cardinality=2, verbose=False)