def test_performance(self):
    """Test slicing performance with 2 corresponding slice tasks that
    represent roughly <10% of the data."""
    dataloaders = []
    for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]:
        dataloader = create_dataloader(df, split)
        dataloaders.append(dataloader)

    base_task = create_task("task", module_suffixes=["A", "B"])

    # Apply SFs
    slicing_functions = [f, g]  # low-coverage slices
    slice_names = [sf.name for sf in slicing_functions]
    applier = PandasSFApplier(slicing_functions)
    S_train = applier.apply(self.df_train, progress_bar=False)
    S_valid = applier.apply(self.df_valid, progress_bar=False)

    # Add slice labels
    add_slice_labels(dataloaders[0], base_task, S_train)
    add_slice_labels(dataloaders[1], base_task, S_valid)

    # Convert to slice tasks
    tasks = convert_to_slice_tasks(base_task, slice_names)
    model = MultitaskClassifier(tasks=tasks)

    # Train
    # NOTE: Needs more epochs to converge with more heads
    trainer = Trainer(lr=0.001, n_epochs=65, progress_bar=False)
    trainer.fit(model, dataloaders)
    scores = model.score(dataloaders)

    # Confirm reasonably high slice scores
    # Check train scores
    self.assertGreater(scores["task/TestData/train/f1"], 0.9)
    self.assertGreater(scores["task_slice:f_pred/TestData/train/f1"], 0.9)
    self.assertGreater(scores["task_slice:f_ind/TestData/train/f1"], 0.9)
    self.assertGreater(scores["task_slice:g_pred/TestData/train/f1"], 0.9)
    self.assertGreater(scores["task_slice:g_ind/TestData/train/f1"], 0.9)
    self.assertGreater(scores["task_slice:base_pred/TestData/train/f1"], 0.9)
    self.assertEqual(scores["task_slice:base_ind/TestData/train/f1"], 1.0)

    # Check valid scores
    self.assertGreater(scores["task/TestData/valid/f1"], 0.9)
    self.assertGreater(scores["task_slice:f_pred/TestData/valid/f1"], 0.9)
    self.assertGreater(scores["task_slice:f_ind/TestData/valid/f1"], 0.9)
    self.assertGreater(scores["task_slice:g_pred/TestData/valid/f1"], 0.9)
    self.assertGreater(scores["task_slice:g_ind/TestData/valid/f1"], 0.9)
    self.assertGreater(scores["task_slice:base_pred/TestData/valid/f1"], 0.9)
    # base_ind is trivial: all labels are positive
    self.assertEqual(scores["task_slice:base_ind/TestData/valid/f1"], 1.0)
def test_convergence(self):
    """Test slicing convergence with 1 slice task that represents ~25% of the data."""
    dataloaders = []
    for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]:
        dataloader = create_dataloader(df, split)
        dataloaders.append(dataloader)

    base_task = create_task("task", module_suffixes=["A", "B"])

    # Apply SFs
    slicing_functions = [h]  # high-coverage slice
    slice_names = [sf.name for sf in slicing_functions]
    applier = PandasSFApplier(slicing_functions)
    S_train = applier.apply(self.df_train, progress_bar=False)
    S_valid = applier.apply(self.df_valid, progress_bar=False)
    self.assertEqual(S_train.shape, (self.N_TRAIN,))
    self.assertEqual(S_valid.shape, (self.N_VALID,))
    self.assertIn("h", S_train.dtype.names)

    # Add slice labels
    add_slice_labels(dataloaders[0], base_task, S_train)
    add_slice_labels(dataloaders[1], base_task, S_valid)

    # Convert to slice tasks
    tasks = convert_to_slice_tasks(base_task, slice_names)
    model = MultitaskClassifier(tasks=tasks)

    # Train
    trainer = Trainer(lr=0.001, n_epochs=50, progress_bar=False)
    trainer.fit(model, dataloaders)
    scores = model.score(dataloaders)

    # Confirm near-perfect scores
    self.assertGreater(scores["task/TestData/valid/accuracy"], 0.94)
    self.assertGreater(scores["task_slice:h_pred/TestData/valid/accuracy"], 0.94)
    self.assertGreater(scores["task_slice:h_ind/TestData/valid/f1"], 0.94)

    # Calculate/check train/val loss
    train_dataset = dataloaders[0].dataset
    train_loss_output = model.calculate_loss(
        train_dataset.X_dict, train_dataset.Y_dict
    )
    train_loss = train_loss_output[0]["task"].item()
    self.assertLess(train_loss, 0.1)

    val_dataset = dataloaders[1].dataset
    val_loss_output = model.calculate_loss(val_dataset.X_dict, val_dataset.Y_dict)
    val_loss = val_loss_output[0]["task"].item()
    self.assertLess(val_loss, 0.1)
def test_add_slice_labels(self):
    # Create dummy data
    # Given slicing function f(), we expect the first two entries to be active
    x = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.5])
    y = torch.Tensor([0, 1, 1, 0, 1]).long()
    dataset = DictDataset(
        name="TestData", split="train", X_dict={"data": x}, Y_dict={"TestTask": y}
    )

    # Ensure that we start with 1 labelset
    self.assertEqual(len(dataset.Y_dict), 1)

    # Apply SFs with PandasSFApplier
    df = pd.DataFrame({"val": x, "y": y})
    slicing_functions = [f]
    applier = PandasSFApplier(slicing_functions)
    S = applier.apply(df, progress_bar=False)

    dataloader = DictDataLoader(dataset)
    dummy_task = create_dummy_task(task_name="TestTask")
    add_slice_labels(dataloader, dummy_task, S)

    # Ensure that all the fields are present
    labelsets = dataloader.dataset.Y_dict
    self.assertIn("TestTask", labelsets)
    self.assertIn("TestTask_slice:base_ind", labelsets)
    self.assertIn("TestTask_slice:base_pred", labelsets)
    self.assertIn("TestTask_slice:f_ind", labelsets)
    self.assertIn("TestTask_slice:f_pred", labelsets)
    self.assertEqual(len(labelsets), 5)

    # Ensure "ind" contains mask
    self.assertEqual(
        labelsets["TestTask_slice:f_ind"].numpy().tolist(), [1, 1, 0, 0, 0]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_ind"].numpy().tolist(), [1, 1, 1, 1, 1]
    )

    # Ensure "pred" contains masked elements
    self.assertEqual(
        labelsets["TestTask_slice:f_pred"].numpy().tolist(), [0, 1, -1, -1, -1]
    )
    self.assertEqual(
        labelsets["TestTask_slice:base_pred"].numpy().tolist(), [0, 1, 1, 0, 1]
    )
    self.assertEqual(labelsets["TestTask"].numpy().tolist(), [0, 1, 1, 0, 1])
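# For context: the test above assumes a module-level slicing function `f` whose
# mask over the dummy data is [1, 1, 0, 0, 0]. A minimal sketch consistent with
# that expectation (the threshold 0.25 is an assumption, not the actual fixture):
from snorkel.slicing import slicing_function


@slicing_function()
def f(x):
    # Fires only for the first two rows of the dummy data (val = 0.1, 0.2)
    return x.val < 0.25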
def slice_dataframe(
    df: pd.DataFrame, slicing_function: SlicingFunction
) -> pd.DataFrame:
    """Return a dataframe with examples corresponding to specified ``SlicingFunction``.

    Parameters
    ----------
    df
        A pandas DataFrame that will be sliced
    slicing_function
        SlicingFunction that operates over ``df``; only examples for which the
        ``slicing_function`` output is True are returned

    Returns
    -------
    pd.DataFrame
        A DataFrame containing only the examples belonging to the slice defined
        by ``slicing_function``
    """
    S = PandasSFApplier([slicing_function]).apply(df)

    # Index into the SF labels by name
    df_idx = np.where(S[slicing_function.name])[0]  # type: ignore
    return df.iloc[df_idx]
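# A quick usage sketch for slice_dataframe. The SF and the DataFrame column
# below are illustrative assumptions, not part of the library:
import pandas as pd
from snorkel.slicing import slicing_function


@slicing_function()
def short_text(x):
    # Hypothetical slice: rows with fewer than 5 words of text
    return len(x.text.split()) < 5


df = pd.DataFrame({"text": ["hi there", "this is a much longer example sentence"]})
df_short = slice_dataframe(df, short_text)  # keeps only rows where the SF fires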
from sklearn.metrics import f1_score

print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")

# %% [markdown]
# ### Store slice metadata in `S`

# %% [markdown]
# We apply our list of `sfs` to the data using an SF applier.
#
# For our data format, we leverage the [`PandasSFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/slicing/snorkel.slicing.PandasSFApplier.html#snorkel.slicing.PandasSFApplier).
# The output of the `applier` is an [`np.recarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html) which stores vectors in named fields indicating whether each of $n$ data points belongs to the corresponding slice.

# %% {"tags": ["md-exclude-output"]}
from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

# %% [markdown]
# Now, we initialize a [`Scorer`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer) using the desired `metrics`.

# %%
from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1"])

# %% [markdown]
# Using the [`score_slices`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer.score_slices) method, we can see both `overall` and slice-specific performance.

# %%
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)
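# %% [markdown]
# Because `S_test` is a structured array keyed by SF name, slice membership can be
# inspected directly. The `"short_comment"` field below is a hypothetical SF name,
# shown only to illustrate the indexing pattern.

# %%
print(S_test.dtype.names)  # field names match the slicing function names
short_mask = S_test["short_comment"].astype(bool)  # hypothetical SF name
print(f"Slice coverage: {short_mask.mean():.1%}")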
def get_performance(
    y_true: np.ndarray, y_pred: np.ndarray, classes: List, df: pd.DataFrame = None
) -> Dict:
    """Per-class performance metrics.

    Args:
        y_true (np.ndarray): True class labels.
        y_pred (np.ndarray): Predicted class labels.
        classes (List): List of all unique classes.
        df (pd.DataFrame, optional): Dataframe used for slicing.

    Returns:
        Dictionary of overall and per-class performance metrics.
    """
    # Performance
    performance = {"overall": {}, "class": {}}

    # Overall performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance["overall"]["precision"] = metrics[0]
    performance["overall"]["recall"] = metrics[1]
    performance["overall"]["f1"] = metrics[2]
    performance["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance["class"][classes[i]] = {
            "precision": metrics[0][i],
            "recall": metrics[1][i],
            "f1": metrics[2][i],
            "num_samples": np.float64(metrics[3][i]),
        }

    # Slicing performance
    if df is not None:
        # Slices
        slicing_functions = [cv_transformers, short_text]
        applier = PandasSFApplier(slicing_functions)
        slices = applier.apply(df)

        # Score slices
        # Use snorkel.analysis.Scorer for multiclass tasks
        # Naive implementation for our multilabel task
        # based on snorkel.analysis.Scorer
        performance["slices"] = {}
        for slice_name in slices.dtype.names:
            mask = slices[slice_name].astype(bool)
            metrics = precision_recall_fscore_support(
                y_true[mask], y_pred[mask], average="micro"
            )
            performance["slices"][slice_name] = {}
            performance["slices"][slice_name]["precision"] = metrics[0]
            performance["slices"][slice_name]["recall"] = metrics[1]
            performance["slices"][slice_name]["f1"] = metrics[2]
            performance["slices"][slice_name]["num_samples"] = len(y_true[mask])

        # Weighted slice f1
        performance["slices"]["f1"] = np.mean(
            list(
                itertools.chain.from_iterable(
                    [
                        [performance["slices"][slice_name]["f1"]]
                        * performance["slices"][slice_name]["num_samples"]
                        for slice_name in performance["slices"]
                    ]
                )
            )
        )

    return performance
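# The slicing functions `cv_transformers` and `short_text` referenced above are
# defined elsewhere in the project. A hedged sketch of what they might look like;
# the column names (`tags`, `text`) and the thresholds are assumptions:
from snorkel.slicing import slicing_function


@slicing_function()
def cv_transformers(x):
    # Hypothetical: projects tagged with both computer vision and transformers
    return "computer-vision" in x.tags and "transformers" in x.tags


@slicing_function()
def short_text(x):
    # Hypothetical: examples with very little text to learn from
    return len(x.text.split()) < 7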
def slicing_evaluation(df_train, df_test, train_model=None):
    if train_model is None:
        train_model = "mlp"

    sfs = [
        SlicingFunction.short_comment,
        SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re,
        SlicingFunction.industry_keyword,
    ]
    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        return ft.get_sentence_vector(
            " ".join([w for w in jieba.lcut(text.strip())])
        )

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]),
        )
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(
            S=S_test,
            golds=Y_test,
            preds=preds_test,
            probs=probs_test,
            as_dataframe=True,
        )
        return analysis

    if train_model == "mlp":
        # Define model architecture
        bow_dim = X_train.shape[1]
        hidden_dim = bow_dim
        mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

        # Initialize slice model
        slice_model = SliceAwareClassifier(
            base_architecture=mlp,
            head_dim=hidden_dim,
            slice_names=slice_names,
            scorer=scorer,
        )

        # generate the remaining S matrices with the new set of slicing functions
        applier = PandasSFApplier(sfs)
        S_train = applier.apply(df_train)
        S_test = applier.apply(df_test)

        # add slice labels to an existing dataloader
        BATCH_SIZE = 64
        train_dl = create_dict_dataloader(X_train, Y_train, "train")
        train_dl_slice = slice_model.make_slice_dataloader(
            train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE
        )
        test_dl = create_dict_dataloader(X_test, Y_test, "train")
        test_dl_slice = slice_model.make_slice_dataloader(
            test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE
        )

        # fit our classifier with the training set dataloader
        trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
        trainer.fit(slice_model, [train_dl_slice])

        analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
        return analysis
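# A hedged usage sketch for slicing_evaluation, assuming `df_train` and `df_test`
# carry the `text` and `label` columns the function expects:
analysis_mlp = slicing_evaluation(df_train, df_test)  # default slice-aware MLP
analysis_lr = slicing_evaluation(df_train, df_test, train_model="lr")  # LR baseline
print(analysis_lr)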
def get_metrics(
    y_true: np.ndarray, y_pred: np.ndarray, classes: List, df: pd.DataFrame = None
) -> Dict:
    """Calculate metrics for fine-grained performance evaluation.

    Args:
        y_true (np.ndarray): True class labels.
        y_pred (np.ndarray): Predicted class labels.
        classes (List): List of all unique classes.
        df (pd.DataFrame, optional): Dataframe used for slicing.

    Returns:
        Dictionary of fine-grained performance metrics.
    """
    # Performance
    metrics = {"overall": {}, "class": {}}

    # Overall metrics
    overall_metrics = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    metrics["overall"]["precision"] = overall_metrics[0]
    metrics["overall"]["recall"] = overall_metrics[1]
    metrics["overall"]["f1"] = overall_metrics[2]
    metrics["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class metrics
    class_metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        metrics["class"][classes[i]] = {
            "precision": class_metrics[0][i],
            "recall": class_metrics[1][i],
            "f1": class_metrics[2][i],
            "num_samples": np.float64(class_metrics[3][i]),
        }

    # Slicing metrics
    if df is not None:
        # Slices
        slicing_functions = [cv_transformers, short_text]
        applier = PandasSFApplier(slicing_functions)
        slices = applier.apply(df)

        # Score slices
        # Use snorkel.analysis.Scorer for multiclass tasks
        # Naive implementation for our multilabel task
        # based on snorkel.analysis.Scorer
        metrics["slices"] = {}
        metrics["slices"]["class"] = {}
        for slice_name in slices.dtype.names:
            mask = slices[slice_name].astype(bool)
            if sum(mask):  # pragma: no cover, test set may not have enough samples for slicing
                slice_metrics = precision_recall_fscore_support(
                    y_true[mask], y_pred[mask], average="micro"
                )
                metrics["slices"]["class"][slice_name] = {}
                metrics["slices"]["class"][slice_name]["precision"] = slice_metrics[0]
                metrics["slices"]["class"][slice_name]["recall"] = slice_metrics[1]
                metrics["slices"]["class"][slice_name]["f1"] = slice_metrics[2]
                metrics["slices"]["class"][slice_name]["num_samples"] = len(
                    y_true[mask]
                )

        # Weighted overall slice metrics
        metrics["slices"]["overall"] = {}
        for metric in ["precision", "recall", "f1"]:
            metrics["slices"]["overall"][metric] = np.mean(
                list(
                    itertools.chain.from_iterable(
                        [
                            [metrics["slices"]["class"][slice_name][metric]]
                            * metrics["slices"]["class"][slice_name]["num_samples"]
                            for slice_name in metrics["slices"]["class"]
                        ]
                    )
                )
            )

    return metrics
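# The repeat-and-mean idiom above is simply a sample-weighted average of the
# per-slice metrics. An equivalent, self-contained illustration (the slice names
# and numbers are made up) using np.average with explicit weights:
import numpy as np

# Two hypothetical slices with their f1 scores and sample counts
slice_stats = {
    "cv_transformers": {"f1": 0.80, "num_samples": 20},
    "short_text": {"f1": 0.60, "num_samples": 5},
}
f1s = [v["f1"] for v in slice_stats.values()]
sizes = [v["num_samples"] for v in slice_stats.values()]
weighted_f1 = np.average(f1s, weights=sizes)  # 0.76, same as repeat-and-mean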
# We define a `LogisticRegression` model from `sklearn` and show how we might
# visualize these slice-specific scores.
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
sklearn_model.fit(X=X_train, y=Y_train)
print(f"Test set accuracy: {100 * sklearn_model.score(X_test, Y_test):.1f}%")

from snorkel.utils import preds_to_probs

preds_test = sklearn_model.predict(X_test)
probs_test = preds_to_probs(preds_test, 2)

from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

from snorkel.analysis import Scorer

scorer = Scorer(metrics=["accuracy", "f1"])
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

# ### Write additional slicing functions (SFs)
# Slices are dynamic: as monitoring needs grow or change with new data
# distributions or application needs, an ML pipeline might require dozens, or
# even hundreds, of slices.

from snorkel.slicing import SlicingFunction, slicing_function
from snorkel.preprocess import preprocessor
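# A hedged sketch of one additional SF built from the imported `slicing_function`
# decorator and `preprocessor`. The `text` field, the lowercasing preprocessor,
# and the "refund" keyword are illustrative assumptions, not part of the tutorial:


@preprocessor(memoize=True)
def lowercase_text(x):
    # Hypothetical preprocessor: attach a lowercased copy of the text field
    x.text_lower = x.text.lower()
    return x


@slicing_function(pre=[lowercase_text])
def mentions_refund(x):
    # Hypothetical slice: comments that mention refunds
    return "refund" in x.text_lower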