def test_ignores(self):
    golds = np.array([0, 0, 0, 1, 1])
    preds = np.array([0, -1, 0, 1, 0])
    score = metric_score(golds, preds, probs=None, metric="accuracy")
    self.assertAlmostEqual(score, 0.6)
    score = metric_score(
        golds, preds, probs=None, metric="accuracy", filter_dict={"preds": [-1]}
    )
    self.assertAlmostEqual(score, 0.75)
    score = metric_score(
        golds, preds, probs=None, metric="accuracy", filter_dict={"golds": [0]}
    )
    self.assertAlmostEqual(score, 0.5)
    score = metric_score(
        golds,
        preds,
        probs=None,
        metric="accuracy",
        filter_dict={"golds": [1], "preds": [-1]},
    )
    self.assertAlmostEqual(score, 1.0)
def test_roc_auc(self):
    golds = np.array([0, 0, 0, 0, 1])
    probs = preds_to_probs(golds, 2)
    probs_nonbinary = np.array(
        [
            [1.0, 0.0, 0.0],
            [0.7, 0.0, 0.3],
            [0.8, 0.0, 0.2],
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ]
    )
    roc_auc = metric_score(golds, preds=None, probs=probs, metric="roc_auc")
    self.assertAlmostEqual(roc_auc, 1.0)
    probs = np.fliplr(probs)
    roc_auc = metric_score(golds, preds=None, probs=probs, metric="roc_auc")
    self.assertAlmostEqual(roc_auc, 0.0)
    with self.assertRaisesRegex(
        ValueError, "Metric roc_auc is currently only defined for binary"
    ):
        metric_score(golds, preds=None, probs=probs_nonbinary, metric="roc_auc")
def test_f1_multiclass(self):
    golds = np.array([0, 0, 1, 1, 2])
    preds = np.array([1, 1, 0, 1, 2])
    score = metric_score(golds, preds, probs=None, metric="f1_micro")
    self.assertAlmostEqual(score, 0.4)
    score = metric_score(golds, preds, probs=None, metric="f1_macro")
    self.assertAlmostEqual(score, 0.47, places=2)
def test_f1(self):
    golds = np.array([0, 0, 0, 1, 1])
    preds = np.array([1, 1, 0, 0, 1])
    score = metric_score(golds, preds, probs=None, metric="f1")
    self.assertAlmostEqual(score, 0.4)
    golds = np.array([0, 0, 1, 1, 2])
    preds = np.array([1, 1, 0, 1, 2])
    with self.assertRaisesRegex(ValueError, "f1 not supported for multiclass"):
        score = metric_score(golds, preds, probs=None, metric="f1")
def test_coverage(self):
    golds = np.array([0, 0, 0, 0, 1])
    preds = np.array([-1, -1, 0, 0, 0])
    score = metric_score(golds, preds, probs=None, metric="coverage")
    self.assertAlmostEqual(score, 0.6)
    score = metric_score(
        golds, preds, probs=None, filter_dict={"golds": [1]}, metric="coverage"
    )
    self.assertAlmostEqual(score, 0.5)
def test_fbeta(self):
    golds = np.array([0, 0, 0, 0, 1])
    preds = np.array([1, 1, 0, 0, 1])
    pre = metric_score(golds, preds, probs=None, metric="precision")
    rec = metric_score(golds, preds, probs=None, metric="recall")
    # F_beta approaches precision as beta -> 0 and recall as beta -> inf.
    self.assertAlmostEqual(
        pre,
        metric_score(golds, preds, probs=None, metric="fbeta", beta=1e-6),
        places=2,
    )
    self.assertAlmostEqual(
        rec,
        metric_score(golds, preds, probs=None, metric="fbeta", beta=1e6),
        places=2,
    )
def test_matthews(self):
    golds = np.array([0, 0, 0, 0, 1])
    preds = np.array([1, 0, 0, 0, 0])
    mcc = metric_score(golds, preds, probs=None, metric="matthews_corrcoef")
    self.assertAlmostEqual(mcc, -0.25)
    golds = np.array([0, 0, 0, 0, 1])
    preds = np.array([0, 0, 0, 0, 1])
    mcc = metric_score(golds, preds, probs=None, metric="matthews_corrcoef")
    self.assertAlmostEqual(mcc, 1.0)
def test_bad_inputs(self):
    golds = np.array([0, 0, 0, 1, 1])
    pred1 = np.array([0, 0, 0, 1, 0.5])
    pred2 = np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]])
    with self.assertRaisesRegex(
        ValueError, "Input contains at least one non-integer"
    ):
        metric_score(golds, pred1, probs=None, metric="accuracy")
    with self.assertRaisesRegex(ValueError, "Input could not be converted"):
        metric_score(golds, pred2, probs=None, metric="accuracy")
    with self.assertRaisesRegex(ValueError, "The metric you provided"):
        metric_score(golds, pred2, probs=None, metric="bad_metric")
    with self.assertRaisesRegex(ValueError, "filter_dict must only include keys in"):
        metric_score(
            golds,
            golds,
            probs=None,
            metric="accuracy",
            filter_dict={"bad_map": [0]},
        )
# Define a vanilla logistic regression model with Keras
keras_model = get_keras_logreg(input_dim=X_train.shape[1])

keras_model.fit(
    x=X_train,
    y=probs_train_filtered,
    validation_data=(X_valid, preds_to_probs(Y_valid, 2)),
    callbacks=[get_keras_early_stopping()],
    epochs=50,
    verbose=0,
)

# %%
preds_test = keras_model.predict(x=X_test).argmax(axis=1)
test_acc = metric_score(golds=Y_test, preds=preds_test, metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

# %% [markdown]
# **We observe an additional boost in accuracy over the `LabelModel` by multiple points!
# By using the label model to transfer the domain knowledge encoded in our LFs to the discriminative model,
# we were able to generalize beyond the noisy labeling heuristics.**

# %% [markdown]
# We can compare this to the score we could have gotten if we had used our small labeled `dev` set directly as training data instead of using it to guide the creation of LFs.

# %% {"tags": ["md-exclude-output"]}
keras_dev_model = get_keras_logreg(input_dim=X_train.shape[1], output_dim=1)
keras_dev_model.fit(
    x=X_dev,
if __name__ == "__main__":
    warnings.filterwarnings("ignore")

    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()

    # Apply the labeling functions to the dev and train splits.
    lfs = [
        lf_husband_wife,
        lf_husband_wife_left_window,
        lf_same_last_name,
        lf_married,
        lf_familial_relationship,
        lf_family_left_window,
        lf_other_relationship,
        lf_distant_supervision,
        lf_distant_supervision_last_names,
    ]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))

    # Train the label model and evaluate it on the dev set.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        "Label model F1: {f}".format(
            f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric="f1")
        )
    )
    print(
        "Label model AUC: {f}".format(
            f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric="roc_auc")
        )
    )

    # Train the discriminative end model on the probabilistic training labels,
    # keeping only data points covered by at least one labeling function.
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)

    # Evaluate the end model on the test set.
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print(
        "End model F1: {f}".format(
            f=metric_score(Y_test, preds_test, probs=probs_test, metric="f1")
        )
    )
    print(
        "End model AUC: {f}".format(
            f=metric_score(Y_test, preds_test, probs=probs_test, metric="roc_auc")
        )
    )
def test_accuracy_basic(self):
    golds = np.array([0, 0, 0, 1, 1])
    preds = np.array([0, 0, 0, 1, 0])
    score = metric_score(golds, preds, probs=None, metric="accuracy")
    self.assertAlmostEqual(score, 0.8)
def test_recall(self):
    golds = np.array([0, 0, 0, 1, 1])
    preds = np.array([1, 1, 0, 0, 1])
    score = metric_score(golds, preds, probs=None, metric="recall")
    self.assertAlmostEqual(score, 0.5)
def test_precision(self):
    golds = np.array([0, 0, 0, 1, 1])
    preds = np.array([1, 1, 0, 0, 1])
    score = metric_score(golds, preds, probs=None, metric="precision")
    self.assertAlmostEqual(score, 0.333, places=2)
def test_missing_preds(self):
    golds = np.array([0, 0, 1, 1])
    with self.assertRaisesRegex(ValueError, "requires access to"):
        metric_score(golds=golds, metric="accuracy")
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LabelModel

# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)

# %% [markdown]
# As a spot-check for the quality of our LabelModel, we'll score it on the dev set.

# %%
from snorkel.analysis import metric_score

preds_dev = label_model.predict(L_dev)
acc = metric_score(Y_dev, preds_dev, probs=None, metric="accuracy")
print(f"LabelModel Accuracy: {acc:.3f}")

# %% [markdown]
# We see that we get very high accuracy on the development set.
# This is due to the abundance of high quality crowdworker labels.
# **Since we don't have these high quality crowdsourcing labels for the
# test set or new incoming data points, we can't use the LabelModel reliably
# at inference time.**
# In order to run inference on new incoming data points, we need to train a
# discriminative model over the tweets themselves.
# Let's generate a set of labels for that training set.

# %%
preds_train = label_model.predict(L_train)
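# %% [markdown]
# As a purely illustrative sketch (not part of the original tutorial), these
# `LabelModel`-generated training labels can be fed to any off-the-shelf classifier.
# The snippet below assumes `df_train` holds the raw tweets in a `tweet_text` column
# and that scikit-learn is installed; the column name and model choice are
# placeholders to adapt to your own setup.

# %%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from snorkel.labeling import filter_unlabeled_dataframe

# Drop data points on which every labeling source abstained (label -1),
# since they carry no training signal.
df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=preds_train, L=L_train
)

# Featurize the tweets with bag-of-words counts (assumed "tweet_text" column)
# and fit a simple logistic regression on the generated labels.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_text = vectorizer.fit_transform(df_train_filtered["tweet_text"])
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train_text, preds_train_filtered)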