    def test_ignores(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([0, -1, 0, 1, 0])
        score = metric_score(golds, preds, probs=None, metric="accuracy")
        self.assertAlmostEqual(score, 0.6)
        score = metric_score(
            golds, preds, probs=None, metric="accuracy", filter_dict={"preds": [-1]}
        )
        self.assertAlmostEqual(score, 0.75)
        score = metric_score(
            golds, preds, probs=None, metric="accuracy", filter_dict={"golds": [0]}
        )
        self.assertAlmostEqual(score, 0.5)
        score = metric_score(
            golds,
            preds,
            probs=None,
            metric="accuracy",
            filter_dict={"golds": [1], "preds": [-1]},
        )
        self.assertAlmostEqual(score, 1.0)

    def test_roc_auc(self):
        golds = np.array([0, 0, 0, 0, 1])
        probs = preds_to_probs(golds, 2)
        probs_nonbinary = np.array([
            [1.0, 0.0, 0.0],
            [0.7, 0.0, 0.3],
            [0.8, 0.0, 0.2],
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ])

        roc_auc = metric_score(golds,
                               preds=None,
                               probs=probs,
                               metric="roc_auc")
        self.assertAlmostEqual(roc_auc, 1.0)
        probs = np.fliplr(probs)
        roc_auc = metric_score(golds,
                               preds=None,
                               probs=probs,
                               metric="roc_auc")
        self.assertAlmostEqual(roc_auc, 0.0)

        with self.assertRaisesRegex(
                ValueError,
                "Metric roc_auc is currently only defined for binary"):
            metric_score(golds,
                         preds=None,
                         probs=probs_nonbinary,
                         metric="roc_auc")

    def test_f1_multiclass(self):
        golds = np.array([0, 0, 1, 1, 2])
        preds = np.array([1, 1, 0, 1, 2])
        score = metric_score(golds, preds, probs=None, metric="f1_micro")
        self.assertAlmostEqual(score, 0.4)

        score = metric_score(golds, preds, probs=None, metric="f1_macro")
        self.assertAlmostEqual(score, 0.47, 2)

    def test_f1(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([1, 1, 0, 0, 1])
        score = metric_score(golds, preds, probs=None, metric="f1")
        self.assertAlmostEqual(score, 0.4)

        golds = np.array([0, 0, 1, 1, 2])
        preds = np.array([1, 1, 0, 1, 2])
        with self.assertRaisesRegex(ValueError,
                                    "f1 not supported for multiclass"):
            score = metric_score(golds, preds, probs=None, metric="f1")

    def test_coverage(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([-1, -1, 0, 0, 0])
        score = metric_score(golds, preds, probs=None, metric="coverage")
        self.assertAlmostEqual(score, 0.6)
        score = metric_score(
            golds, preds, probs=None, filter_dict={"golds": [1]}, metric="coverage"
        )
        self.assertAlmostEqual(score, 0.5)

    def test_fbeta(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([1, 1, 0, 0, 1])
        pre = metric_score(golds, preds, probs=None, metric="precision")
        rec = metric_score(golds, preds, probs=None, metric="recall")
        self.assertAlmostEqual(
            pre,
            metric_score(golds, preds, probs=None, metric="fbeta", beta=1e-6),
            places=2,
        )
        self.assertAlmostEqual(
            rec,
            metric_score(golds, preds, probs=None, metric="fbeta", beta=1e6),
            places=2,
        )

    def test_matthews(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([1, 0, 0, 0, 0])
        mcc = metric_score(golds,
                           preds,
                           probs=None,
                           metric="matthews_corrcoef")
        self.assertAlmostEqual(mcc, -0.25)

        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([0, 0, 0, 0, 1])
        mcc = metric_score(golds,
                           preds,
                           probs=None,
                           metric="matthews_corrcoef")
        self.assertAlmostEqual(mcc, 1.0)

    def test_bad_inputs(self):
        golds = np.array([0, 0, 0, 1, 1])
        pred1 = np.array([0, 0, 0, 1, 0.5])
        pred2 = np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]])
        with self.assertRaisesRegex(ValueError,
                                    "Input contains at least one non-integer"):
            metric_score(golds, pred1, probs=None, metric="accuracy")

        with self.assertRaisesRegex(ValueError,
                                    "Input could not be converted"):
            metric_score(golds, pred2, probs=None, metric="accuracy")

        with self.assertRaisesRegex(ValueError, "The metric you provided"):
            metric_score(golds, pred2, probs=None, metric="bad_metric")

        with self.assertRaisesRegex(ValueError,
                                    "filter_dict must only include keys in"):
            metric_score(
                golds,
                golds,
                probs=None,
                metric="accuracy",
                filter_dict={"bad_map": [0]},
            )
Example #9
# Define a vanilla logistic regression model with Keras
keras_model = get_keras_logreg(input_dim=X_train.shape[1])

keras_model.fit(
    x=X_train,
    y=probs_train_filtered,
    validation_data=(X_valid, preds_to_probs(Y_valid, 2)),
    callbacks=[get_keras_early_stopping()],
    epochs=50,
    verbose=0,
)

# %%
preds_test = keras_model.predict(x=X_test).argmax(axis=1)
test_acc = metric_score(golds=Y_test, preds=preds_test, metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

# %% [markdown]
# **We observe an additional boost in accuracy of several points over the `LabelModel`!
# By using the label model to transfer the domain knowledge encoded in our LFs to the discriminative model,
# we are able to generalize beyond the noisy labeling heuristics.**

# %% [markdown]
# We can compare this to the score we could have gotten if we had used our small labeled `dev` set directly as training data instead of using it to guide the creation of LFs.

# %% {"tags": ["md-exclude-output"]}
keras_dev_model = get_keras_logreg(input_dim=X_train.shape[1], output_dim=1)

keras_dev_model.fit(
    x=X_dev,
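
# %% [markdown]
# The cell above is cut off mid-call in this snippet. As a rough sketch (not the original
# continuation), the dev-set baseline might be completed and scored as shown below.
# `Y_dev` (the gold labels for the dev split) is assumed to exist alongside `X_dev`, and
# `get_keras_logreg(..., output_dim=1)` is assumed to return a single-output sigmoid model
# compiled for binary labels.

# %%
keras_dev_model.fit(
    x=X_dev,
    y=Y_dev,  # assumed: gold labels for the dev split
    validation_data=(X_valid, Y_valid),
    callbacks=[get_keras_early_stopping()],
    epochs=50,
    verbose=0,
)

# Threshold the sigmoid outputs to get hard predictions and score on the test set.
preds_dev_baseline = (keras_dev_model.predict(x=X_test) > 0.5).astype(int).flatten()
dev_only_acc = metric_score(golds=Y_test, preds=preds_dev_baseline, metric="accuracy")
print(f"Dev-only baseline test accuracy: {dev_only_acc * 100:.1f}%")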
Example #10

if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()
    lfs = [lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name,
           lf_married, lf_familial_relationship, lf_family_left_window,
           lf_other_relationship, lf_distant_supervision, lf_distant_supervision_last_names]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print("Label model F1: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='roc_auc')))

    def test_accuracy_basic(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([0, 0, 0, 1, 0])
        score = metric_score(golds, preds, probs=None, metric="accuracy")
        self.assertAlmostEqual(score, 0.8)

    def test_recall(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([1, 1, 0, 0, 1])
        score = metric_score(golds, preds, probs=None, metric="recall")
        self.assertAlmostEqual(score, 0.5)

    def test_precision(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([1, 1, 0, 0, 1])
        score = metric_score(golds, preds, probs=None, metric="precision")
        self.assertAlmostEqual(score, 0.333, places=2)

    def test_missing_preds(self):
        golds = np.array([0, 0, 1, 1])
        with self.assertRaisesRegex(ValueError, "requires access to"):
            metric_score(golds=golds, metric="accuracy")
Example #15
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LabelModel

# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)

# %% [markdown]
# As a spot-check for the quality of our LabelModel, we'll score it on the dev set.

# %%
from snorkel.analysis import metric_score

preds_dev = label_model.predict(L_dev)

acc = metric_score(Y_dev, preds_dev, probs=None, metric="accuracy")
print(f"LabelModel Accuracy: {acc:.3f}")

# %% [markdown]
# We see that we get very high accuracy on the development set.
# This is due to the abundance of high-quality crowdworker labels.
# **Since we don't have these high-quality crowdsourcing labels for the
# test set or new incoming data points, we can't use the LabelModel reliably
# at inference time.**
# In order to run inference on new incoming data points, we need to train a
# discriminative model over the tweets themselves.
# Let's generate a set of labels for that training set.

# %%
preds_train = label_model.predict(L_train)
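
# %% [markdown]
# As a minimal illustration of that last step (a sketch, not part of the original tutorial),
# the cell below trains a simple bag-of-words logistic regression on the labels produced by
# the `LabelModel`. The name `train_texts` (the raw tweet text for the training split) is an
# assumption and is not defined in this snippet; any featurization and classifier could be
# substituted.

# %%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Featurize the (assumed) raw tweet texts with unigram/bigram counts.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bow = vectorizer.fit_transform(train_texts)

# Drop any data points the LabelModel abstained on (predicted -1).
mask = preds_train != -1

# Fit a discriminative model on the generated labels; unlike the LabelModel,
# it can be applied to new tweets at inference time, where no crowdworker labels exist.
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X_train_bow[mask], preds_train[mask])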