Example 1
    def test_performance(self):
        """Test slicing performance with 2 corresponding slice tasks that
        represent roughly <10% of the data."""

        dataloaders = []
        for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]:
            dataloader = create_dataloader(df, split)
            dataloaders.append(dataloader)

        base_task = create_task("task", module_suffixes=["A", "B"])

        # Apply SFs
        slicing_functions = [f, g]  # low-coverage slices
        slice_names = [sf.name for sf in slicing_functions]
        applier = PandasSFApplier(slicing_functions)
        S_train = applier.apply(self.df_train, progress_bar=False)
        S_valid = applier.apply(self.df_valid, progress_bar=False)

        # Add slice labels
        add_slice_labels(dataloaders[0], base_task, S_train)
        add_slice_labels(dataloaders[1], base_task, S_valid)

        # Convert to slice tasks
        tasks = convert_to_slice_tasks(base_task, slice_names)
        model = MultitaskClassifier(tasks=tasks)

        # Train
        # NOTE: Needs more epochs to converge with more heads
        trainer = Trainer(lr=0.001, n_epochs=65, progress_bar=False)
        trainer.fit(model, dataloaders)
        scores = model.score(dataloaders)

        # Confirm reasonably high slice scores
        # Check train scores
        self.assertGreater(scores["task/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_pred/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_ind/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_pred/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_ind/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:base_pred/TestData/train/f1"],
                           0.9)
        self.assertEqual(scores["task_slice:base_ind/TestData/train/f1"], 1.0)

        # Check valid scores
        self.assertGreater(scores["task/TestData/valid/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_pred/TestData/valid/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_ind/TestData/valid/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_pred/TestData/valid/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_ind/TestData/valid/f1"], 0.9)
        self.assertGreater(scores["task_slice:base_pred/TestData/valid/f1"],
                           0.9)
        # base_ind is trivial: all labels are positive
        self.assertEqual(scores["task_slice:base_ind/TestData/valid/f1"], 1.0)
Example 2
    def test_convergence(self):
        """Test slicing convergence with 1 slice task that represents ~25% of
        the data."""

        dataloaders = []
        for df, split in [(self.df_train, "train"), (self.df_valid, "valid")]:
            dataloader = create_dataloader(df, split)
            dataloaders.append(dataloader)

        base_task = create_task("task", module_suffixes=["A", "B"])

        # Apply SFs
        slicing_functions = [h]  # high coverage slice
        slice_names = [sf.name for sf in slicing_functions]
        applier = PandasSFApplier(slicing_functions)
        S_train = applier.apply(self.df_train, progress_bar=False)
        S_valid = applier.apply(self.df_valid, progress_bar=False)

        self.assertEqual(S_train.shape, (self.N_TRAIN, ))
        self.assertEqual(S_valid.shape, (self.N_VALID, ))
        self.assertIn("h", S_train.dtype.names)

        # Add slice labels
        add_slice_labels(dataloaders[0], base_task, S_train)
        add_slice_labels(dataloaders[1], base_task, S_valid)

        # Convert to slice tasks
        tasks = convert_to_slice_tasks(base_task, slice_names)
        model = MultitaskClassifier(tasks=tasks)

        # Train
        trainer = Trainer(lr=0.001, n_epochs=50, progress_bar=False)
        trainer.fit(model, dataloaders)
        scores = model.score(dataloaders)

        # Confirm near perfect scores
        self.assertGreater(scores["task/TestData/valid/accuracy"], 0.94)
        self.assertGreater(scores["task_slice:h_pred/TestData/valid/accuracy"],
                           0.94)
        self.assertGreater(scores["task_slice:h_ind/TestData/valid/f1"], 0.94)

        # Calculate/check train/val loss
        train_dataset = dataloaders[0].dataset
        train_loss_output = model.calculate_loss(train_dataset.X_dict,
                                                 train_dataset.Y_dict)
        train_loss = train_loss_output[0]["task"].item()
        self.assertLess(train_loss, 0.1)

        val_dataset = dataloaders[1].dataset
        val_loss_output = model.calculate_loss(val_dataset.X_dict,
                                               val_dataset.Y_dict)
        val_loss = val_loss_output[0]["task"].item()
        self.assertLess(val_loss, 0.1)
Example 3
    def test_add_slice_labels(self):
        # Create dummy data
        # Given slicing function f(), we expect the first two entries to be active
        x = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.5])
        y = torch.Tensor([0, 1, 1, 0, 1]).long()
        dataset = DictDataset(
            name="TestData", split="train", X_dict={"data": x}, Y_dict={"TestTask": y}
        )

        # Ensure that we start with 1 labelset
        self.assertEqual(len(dataset.Y_dict), 1)

        # Apply SFs with PandasSFApplier
        df = pd.DataFrame({"val": x, "y": y})
        slicing_functions = [f]
        applier = PandasSFApplier(slicing_functions)
        S = applier.apply(df, progress_bar=False)

        dataloader = DictDataLoader(dataset)

        dummy_task = create_dummy_task(task_name="TestTask")
        add_slice_labels(dataloader, dummy_task, S)

        # Ensure that all the fields are present
        labelsets = dataloader.dataset.Y_dict
        self.assertIn("TestTask", labelsets)
        self.assertIn("TestTask_slice:base_ind", labelsets)
        self.assertIn("TestTask_slice:base_pred", labelsets)
        self.assertIn("TestTask_slice:f_ind", labelsets)
        self.assertIn("TestTask_slice:f_pred", labelsets)
        self.assertEqual(len(labelsets), 5)

        # Ensure "ind" contains mask
        self.assertEqual(
            labelsets["TestTask_slice:f_ind"].numpy().tolist(), [1, 1, 0, 0, 0]
        )
        self.assertEqual(
            labelsets["TestTask_slice:base_ind"].numpy().tolist(), [1, 1, 1, 1, 1]
        )

        # Ensure "pred" contains masked elements
        self.assertEqual(
            labelsets["TestTask_slice:f_pred"].numpy().tolist(), [0, 1, -1, -1, -1]
        )
        self.assertEqual(
            labelsets["TestTask_slice:base_pred"].numpy().tolist(), [0, 1, 1, 0, 1]
        )
        self.assertEqual(labelsets["TestTask"].numpy().tolist(), [0, 1, 1, 0, 1])
Example 4
def slice_dataframe(df: pd.DataFrame,
                    slicing_function: SlicingFunction) -> pd.DataFrame:
    """Return a dataframe with examples corresponding to specified ``SlicingFunction``.

    Parameters
    ----------
    df
        A pandas DataFrame that will be sliced
    slicing_function
        SlicingFunction that operates over ``df``; the returned subset contains
        only the rows for which ``slicing_function`` outputs True

    Returns
    -------
    pd.DataFrame
        A DataFrame including only examples belonging to the slice defined by
        ``slicing_function``
    """

    S = PandasSFApplier([slicing_function]).apply(df)

    # Index into the SF labels by name
    df_idx = np.where(S[slicing_function.name])[0]  # type: ignore
    return df.iloc[df_idx]
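
A minimal usage sketch (the toy DataFrame and the `short_text` SF below are hypothetical, for illustration only):

import pandas as pd
from snorkel.slicing import slicing_function


@slicing_function()
def short_text(x):
    # Keep rows whose text has fewer than three tokens
    return len(x.text.split()) < 3


df = pd.DataFrame({"text": ["hi", "a much longer example sentence", "ok then"]})
df_short = slice_dataframe(df, short_text)  # keeps rows 0 and 2
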
Example 5
from sklearn.metrics import f1_score

print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")

# %% [markdown]
# ### Store slice metadata in `S`

# %% [markdown]
# We apply our list of `sfs` to the data using an SF applier.
# For our data format, we leverage the [`PandasSFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/slicing/snorkel.slicing.PandasSFApplier.html#snorkel.slicing.PandasSFApplier).
# The output of the `applier` is an [`np.recarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.recarray.html) which stores vectors in named fields indicating whether each of $n$ data points belongs to the corresponding slice.

# %% {"tags": ["md-exclude-output"]}
from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)
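
# %% [markdown]
# As a quick sanity check (field names depend on the `sfs` defined earlier), we can list the fields of `S_test` and inspect one slice's indicator vector.

# %%
print(S_test.dtype.names)  # one field per slicing function
print(S_test[S_test.dtype.names[0]][:10])  # 0/1 indicators for the first slice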

# %% [markdown]
# Now, we initialize a [`Scorer`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer) using the desired `metrics`.

# %%
from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1"])

# %% [markdown]
# Using the [`score_slices`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/analysis/snorkel.analysis.Scorer.html#snorkel.analysis.Scorer.score_slices) method, we can see both `overall` and slice-specific performance.

# %%
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)
Example 6
def get_performance(y_true: np.ndarray,
                    y_pred: np.ndarray,
                    classes: List,
                    df: pd.DataFrame = None) -> Dict:
    """Per-class performance metrics.

    Args:
        y_true (np.ndarray): True class labels.
        y_pred (np.ndarray): Predicted class labels.
        classes (List): List of all unique classes.
        df (pd.DataFrame, optional): dataframe used for slicing.

    Returns:
        Dictionary of overall and per-class performance metrics.
    """
    # Performance
    performance = {"overall": {}, "class": {}}

    # Overall performance
    metrics = precision_recall_fscore_support(y_true,
                                              y_pred,
                                              average="weighted")
    performance["overall"]["precision"] = metrics[0]
    performance["overall"]["recall"] = metrics[1]
    performance["overall"]["f1"] = metrics[2]
    performance["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance["class"][classes[i]] = {
            "precision": metrics[0][i],
            "recall": metrics[1][i],
            "f1": metrics[2][i],
            "num_samples": np.float64(metrics[3][i]),
        }

    # Slicing performance
    if df is not None:
        # Slices
        slicing_functions = [cv_transformers, short_text]
        applier = PandasSFApplier(slicing_functions)
        slices = applier.apply(df)

        # Score slices
        # For multiclass tasks, snorkel.analysis.Scorer can be used directly;
        # this is a naive implementation for our multilabel task,
        # modeled on snorkel.analysis.Scorer
        performance["slices"] = {}
        for slice_name in slices.dtype.names:
            mask = slices[slice_name].astype(bool)
            metrics = precision_recall_fscore_support(y_true[mask],
                                                      y_pred[mask],
                                                      average="micro")
            performance["slices"][slice_name] = {}
            performance["slices"][slice_name]["precision"] = metrics[0]
            performance["slices"][slice_name]["recall"] = metrics[1]
            performance["slices"][slice_name]["f1"] = metrics[2]
            performance["slices"][slice_name]["num_samples"] = len(
                y_true[mask])

        # Weighted slice f1
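        # Support-weighted average: repeat each slice's f1 by its sample count,
        # then take the mean of the flattened list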
        performance["slices"]["f1"] = np.mean(
            list(
                itertools.chain.from_iterable(
                    [[performance["slices"][slice_name]["f1"]] *
                     performance["slices"][slice_name]["num_samples"]
                     for slice_name in performance["slices"]])))

    return performance
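
A minimal usage sketch (the label arrays and class list are made up for illustration; leaving `df` unset skips the slicing branch, and the function's own imports are assumed to be in scope):

y_true = np.array([0, 1, 2, 2, 1])
y_pred = np.array([0, 1, 1, 2, 1])
performance = get_performance(y_true=y_true, y_pred=y_pred, classes=[0, 1, 2])
print(performance["overall"]["f1"])
print(performance["class"][2])
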
Example 7
def slicing_evaluation(df_train, df_test, train_model=None):
    if train_model is None:
        train_model = "mlp"

    sfs = [
        SlicingFunction.short_comment, SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re, SlicingFunction.industry_keyword
    ]

    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        return ft.get_sentence_vector(' '.join(jieba.lcut(text.strip())))

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]))
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(S=S_test,
                                       golds=Y_test,
                                       preds=preds_test,
                                       probs=probs_test,
                                       as_dataframe=True)
        return analysis

    if train_model == "mlp":
        # Define model architecture
        bow_dim = X_train.shape[1]
        hidden_dim = bow_dim
        mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

        # Initialize slice model
        slice_model = SliceAwareClassifier(
            base_architecture=mlp,
            head_dim=hidden_dim,
            slice_names=slice_names,
            scorer=scorer,
        )

        # generate the remaining S matrices with the new set of slicing functions
        applier = PandasSFApplier(sfs)
        S_train = applier.apply(df_train)
        S_test = applier.apply(df_test)

        # add slice labels to an existing dataloader
        BATCH_SIZE = 64

        train_dl = create_dict_dataloader(X_train, Y_train, "train")
        train_dl_slice = slice_model.make_slice_dataloader(
            train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE)
        test_dl = create_dict_dataloader(X_test, Y_test, "test")
        test_dl_slice = slice_model.make_slice_dataloader(
            test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE)

        # fit our classifier with the training set dataloader
        trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
        trainer.fit(slice_model, [train_dl_slice])

        analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
        return analysis
Example 8
def get_metrics(y_true: np.ndarray,
                y_pred: np.ndarray,
                classes: List,
                df: pd.DataFrame = None) -> Dict:
    """Calculate metrics for fine-grained performance evaluation.

    Args:
        y_true (np.ndarray): True class labels.
        y_pred (np.ndarray): Predicted class labels.
        classes (List): List of all unique classes.
        df (pd.DataFrame, optional): dataframe used for slicing.

    Returns:
        Dictionary of fine-grained performance metrics.
    """
    # Performance
    metrics = {"overall": {}, "class": {}}

    # Overall metrics
    overall_metrics = precision_recall_fscore_support(y_true,
                                                      y_pred,
                                                      average="weighted")
    metrics["overall"]["precision"] = overall_metrics[0]
    metrics["overall"]["recall"] = overall_metrics[1]
    metrics["overall"]["f1"] = overall_metrics[2]
    metrics["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class metrics
    class_metrics = precision_recall_fscore_support(y_true,
                                                    y_pred,
                                                    average=None)
    for i in range(len(classes)):
        metrics["class"][classes[i]] = {
            "precision": class_metrics[0][i],
            "recall": class_metrics[1][i],
            "f1": class_metrics[2][i],
            "num_samples": np.float64(class_metrics[3][i]),
        }

    # Slicing metrics
    if df is not None:
        # Slices
        slicing_functions = [cv_transformers, short_text]
        applier = PandasSFApplier(slicing_functions)
        slices = applier.apply(df)

        # Score slices
        # For multiclass tasks, snorkel.analysis.Scorer can be used directly;
        # this is a naive implementation for our multilabel task,
        # modeled on snorkel.analysis.Scorer
        metrics["slices"] = {}
        metrics["slices"]["class"] = {}
        for slice_name in slices.dtype.names:
            mask = slices[slice_name].astype(bool)
            # The test set may not have enough samples for slicing
            if sum(mask):  # pragma: no cover
                slice_metrics = precision_recall_fscore_support(
                    y_true[mask], y_pred[mask], average="micro")
                metrics["slices"]["class"][slice_name] = {
                    "precision": slice_metrics[0],
                    "recall": slice_metrics[1],
                    "f1": slice_metrics[2],
                    "num_samples": len(y_true[mask]),
                }

        # Weighted overall slice metrics
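        # Support-weighted average of each per-slice metric: repeat each value
        # by its sample count, then take the mean of the flattened list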
        metrics["slices"]["overall"] = {}
        for metric in ["precision", "recall", "f1"]:
            metrics["slices"]["overall"][metric] = np.mean(
                list(
                    itertools.chain.from_iterable(
                        [[metrics["slices"]["class"][slice_name][metric]] *
                         metrics["slices"]["class"][slice_name]["num_samples"]
                         for slice_name in metrics["slices"]["class"]])))

    return metrics
Example 9
# We define a `LogisticRegression` model from `sklearn` and show how we might visualize these slice-specific scores.

from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
sklearn_model.fit(X=X_train, y=Y_train)
print(f"Test set accuracy: {100 * sklearn_model.score(X_test, Y_test):.1f}%")

from snorkel.utils import preds_to_probs

preds_test = sklearn_model.predict(X_test)
probs_test = preds_to_probs(preds_test, 2)

from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

from snorkel.analysis import Scorer

scorer = Scorer(metrics=["accuracy", "f1"])

scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

# ### Write additional slicing functions (SFs)
# Slices are dynamic — as monitoring needs grow or change with new data distributions or application needs, an ML pipeline might require dozens, or even hundreds, of slices.

from snorkel.slicing import SlicingFunction, slicing_function
from snorkel.preprocess import preprocessor
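
# As a minimal sketch (the `text` field is an assumption about this dataset's
# schema, and the 5-word threshold is arbitrary), an SF is a function that
# returns True or False for each example, wrapped in the `slicing_function`
# decorator:


@slicing_function()
def short_comment(x):
    """Comments with fewer than 5 words."""
    return len(x.text.split()) < 5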