def test_score_slices(self):
    """Verify Scorer.score_slices overall and per-slice accuracy on a toy dataset."""
    values = [5, 10, 19, 22, 25]

    @slicing_function()
    def sf(x):
        return x.num < 20

    # Overall: golds [0,1,0,1,0] vs. all-zero preds -> 3/5 correct -> 0.6 accuracy.
    golds = np.array([0, 1, 0, 1, 0])
    preds = np.array([0, 0, 0, 0, 0])
    probs = preds_to_probs(preds, 2)

    # Slice membership is num < 20, so the last 2 elements are masked out;
    # within the slice we expect 2/3 correct -> 0.666... accuracy.
    examples = [SimpleNamespace(num=v) for v in values]
    S = SFApplier([sf]).apply(examples)

    scorer = Scorer(metrics=["accuracy"])

    # Plain score: overall accuracy only.
    metrics = scorer.score(golds=golds, preds=preds, probs=probs)
    self.assertEqual(metrics["accuracy"], 0.6)

    # score_slices: overall plus one entry per slicing function.
    slice_metrics = scorer.score_slices(S=S, golds=golds, preds=preds, probs=probs)
    self.assertEqual(slice_metrics["overall"]["accuracy"], 0.6)
    self.assertEqual(slice_metrics["sf"]["accuracy"], 2.0 / 3.0)

    # as_dataframe=True returns the same numbers as a pandas DataFrame.
    metrics_df = scorer.score_slices(
        S=S, golds=golds, preds=preds, probs=probs, as_dataframe=True
    )
    self.assertTrue(isinstance(metrics_df, pd.DataFrame))
    self.assertEqual(metrics_df["accuracy"]["overall"], 0.6)
    self.assertEqual(metrics_df["accuracy"]["sf"], 2.0 / 3.0)

    # Mismatched gold/pred lengths must be rejected.
    with self.assertRaisesRegex(ValueError, "must have the same number of elements"):
        scorer.score_slices(
            S=S, golds=golds[:1], preds=preds, probs=probs, as_dataframe=True
        )
# %% [markdown]
# Now, we initialize a [`Scorer`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer) using the desired `metrics`.

# %%
from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1"])

# %% [markdown]
# Using the [`score_slices`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer.score_slices) method, we can see both `overall` and slice-specific performance.

# %%
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

# %% [markdown]
# Despite high overall performance, the `short_comment` slice performs poorly here!

# %% [markdown]
# ### Write additional slicing functions (SFs)
#
# Slices are dynamic — as monitoring needs grow or change with new data distributions or application needs, an ML pipeline might require dozens, or even hundreds, of slices.
#
# We'll take inspiration from the labeling tutorial to write additional slicing functions.
# We demonstrate how the same powerful preprocessors and utilities available for labeling functions can be leveraged for slicing functions.

# %%
def slicing_evaluation(df_train, df_test, train_model=None):
    """Train a classifier and report overall and per-slice F1 on the test set.

    Args:
        df_train: training DataFrame with ``text`` and ``label`` columns.
        df_test: test DataFrame with ``text`` and ``label`` columns.
        train_model: ``"lr"`` for a logistic-regression baseline or ``"mlp"``
            for a slice-aware MLP classifier. Defaults to ``"mlp"``.

    Returns:
        A pandas DataFrame of overall and per-slice metrics.

    Raises:
        ValueError: if ``train_model`` is not ``"lr"`` or ``"mlp"``.
            (Previously an unrecognized value silently returned ``None``.)
    """
    if train_model is None:
        train_model = "mlp"
    # Fail fast on unsupported values instead of falling through both branches
    # and implicitly returning None.
    if train_model not in ("lr", "mlp"):
        raise ValueError(f"train_model must be 'lr' or 'mlp', got {train_model!r}")

    sfs = [
        SlicingFunction.short_comment,
        SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re,
        SlicingFunction.industry_keyword,
    ]
    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    # Sentence embeddings: jieba-tokenize each text, then look up a pretrained
    # fastText sentence vector.
    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        return ft.get_sentence_vector(' '.join(jieba.lcut(text.strip())))

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        # NOTE(review): assumes every non-dunder attribute of Polarity is a
        # class label — confirm dir(Polarity) lists only label members.
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]))
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(
            S=S_test, golds=Y_test, preds=preds_test, probs=probs_test,
            as_dataframe=True)
        return analysis

    # train_model == "mlp"
    # Define model architecture.
    bow_dim = X_train.shape[1]
    hidden_dim = bow_dim
    mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

    # Initialize the slice-aware model with one head per slice.
    slice_model = SliceAwareClassifier(
        base_architecture=mlp,
        head_dim=hidden_dim,
        slice_names=slice_names,
        scorer=scorer,
    )

    # Generate the S matrices with the new set of slicing functions.
    applier = PandasSFApplier(sfs)
    S_train = applier.apply(df_train)
    S_test = applier.apply(df_test)

    # Add slice labels to the existing dataloaders.
    BATCH_SIZE = 64
    train_dl = create_dict_dataloader(X_train, Y_train, "train")
    train_dl_slice = slice_model.make_slice_dataloader(
        train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE)
    # NOTE(review): the split tag here is "train" even though this wraps the
    # test data — verify this is intentional for create_dict_dataloader.
    test_dl = create_dict_dataloader(X_test, Y_test, "train")
    test_dl_slice = slice_model.make_slice_dataloader(
        test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE)

    # Fit the classifier on the slice-augmented training dataloader.
    trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
    trainer.fit(slice_model, [train_dl_slice])

    analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
    return analysis