Code Example #1
    def test_mv_default(self):
        # fewer than 2 LFs have overlaps
        label_model = LabelModel(cardinality=2, verbose=False)
        L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 0]))

        # fewer than 2 LFs have conflicts
        L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, 1, 1]))
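The expected outputs above match a simple majority vote over the non-abstain votes in each row. A minimal, self-contained sketch of that check (assuming a snorkel 0.9.x version where MajorityLabelVoter imports from snorkel.labeling, like LabelModel elsewhere in these examples):

import numpy as np
from snorkel.labeling import LabelModel, MajorityLabelVoter

L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])

# Majority-vote baseline: each row gets the most common non-abstain vote.
majority_preds = MajorityLabelVoter(cardinality=2).predict(L)

# On a simple matrix like this, LabelModel should agree with the baseline.
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L, n_epochs=100)
np.testing.assert_array_equal(label_model.predict(L), majority_preds)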
Code Example #2
    def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This tests the common setting where LFs abstain most of the time, which
        can cause issues, for example if parameter clamping is set too high
        (see Issue #1422).
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(self.n,
                                               self.m,
                                               self.cardinality,
                                               abstain_multiplier=1000.0)

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels *only on non-abstained data points*
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        idx, = np.where(Y_pred != -1)
        acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue #1422
        self.assertEqual(len(idx),
                         np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
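The final assertion counts rows with at least one non-abstain vote via (L + 1).sum(axis=1) != 0, which works because abstains are encoded as -1. An equivalent and arguably more direct NumPy expression:

# Rows where at least one LF voted (L uses -1 for abstain):
n_covered = int((L != -1).any(axis=1).sum())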
Code Example #3
    def test_save_and_load(self):
        L = np.array([[0, -1, 0], [0, 1, 1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        original_preds = label_model.predict(L)

        dir_path = tempfile.mkdtemp()
        save_path = os.path.join(dir_path, "label_model.pkl")  # keep the file inside dir_path so rmtree cleans it up
        label_model.save(save_path)

        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
        shutil.rmtree(dir_path)

        np.testing.assert_array_equal(loaded_preds, original_preds)
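A cleanup-safe variant of the same round trip using the standard library's context manager (a sketch; label_model as fit above):

import os
import tempfile

with tempfile.TemporaryDirectory() as dir_path:
    save_path = os.path.join(dir_path, "label_model.pkl")
    label_model.save(save_path)
    label_model_new = LabelModel(cardinality=2, verbose=False)
    label_model_new.load(save_path)
    # The directory and the pickle are removed automatically on exit.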
Code Example #4
def snorkel_process(keylist, dataframe, allweaklabf):
    def func(x):
        # Zero out every entry except the row's maximum: argsort of -x lists
        # indices from largest to smallest value, and [1:] drops the top one.
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)
    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)
    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabel = train[keylist].values
    trainlabe2 = trainlabel.copy()
    trainlabe2 = np.apply_along_axis(func, 1, trainlabe2)  # zero all but the top score per row
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)  # binarize to a one-hot row
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = testlabel.copy()
    testlabe2 = np.apply_along_axis(func, 1, testlabe2)  # zero all but the top score per row
    testlabe2 = np.where(testlabe2 > 0, 1, 0)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
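The inner func keeps only each row's largest score; a quick illustration of its effect on one probability row:

import numpy as np

row = np.array([0.2, 0.5, 0.3])
idx = (-row).argsort()[1:]  # indices of everything except the maximum: [2, 0]
row[idx] = 0
print(row)  # [0.  0.5 0. ]  -> np.where(row > 0, 1, 0) gives the one-hot [0, 1, 0]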
Code Example #5
    def test_score(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([1, -1, 1]))

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))

        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)

        results = label_model.score(L=L,
                                    Y=np.array([1, 0]),
                                    metrics=["accuracy", "f1"])
        results_expected = dict(accuracy=0.5, f1=2 / 3)
        self.assertEqual(results, results_expected)
Code Example #6
def generate_labels_with_snorkel(dataframe):
    """
    Labels the full data using Snorkel
    :param dataframe: Pandas dataframe containing all data
    :return: dataframe extended with a label column
    """

    # Define the set of labeling functions (LFs)
    lfs = [
        lf_ubo_is_company, lf_troika_company, lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company
    ]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(dataframe)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    dataframe["label"] = label_model.predict(L=L_train,
                                             tie_break_policy="abstain")

    # Filter out the abstain data points
    dataframe = dataframe[dataframe.label != ABSTAIN]

    return dataframe
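Filtering on the predicted label works; snorkel also ships a helper for the related case of dropping rows where every LF abstained. A sketch of that variant (assuming filter_unlabeled_dataframe from snorkel 0.9.x, which filters on the label matrix rather than on the prediction):

from snorkel.labeling import filter_unlabeled_dataframe

probs_train = label_model.predict_proba(L_train)
df_filtered, probs_filtered = filter_unlabeled_dataframe(
    X=dataframe, y=probs_train, L=L_train
)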
Code Example #7
    def test_predict(self):
        # 3 LFs that always disagree/abstain lead to all abstains
        L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        np.testing.assert_array_almost_equal(label_model.predict(L),
                                             np.array([-1, -1, -1]))

        L = np.array([[0, 1, 0], [0, 1, 0]])
        label_model = self._set_up_model(L)

        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(
            0.01, 0.99))
        preds = label_model.predict(L)

        true_preds = np.array([0, 0])
        np.testing.assert_array_equal(preds, true_preds)

        preds, probs = label_model.predict(L, return_probs=True)
        true_probs = np.array([[0.99, 0.01], [0.99, 0.01]])
        np.testing.assert_array_almost_equal(probs, true_probs)
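predict(..., return_probs=True) is a convenience over a two-step form; a sketch of the equivalent calls (assuming probs_to_preds from snorkel.utils, with the tie-break policy made explicit since the two functions have different defaults):

from snorkel.utils import probs_to_preds

probs = label_model.predict_proba(L)
preds = probs_to_preds(probs, tie_break_policy="abstain")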
Code Example #8
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')
    #from utils import load_unlabeled_spam_dataset
    #df_train = load_unlabeled_spam_dataset()

    # Define the set of labeling functions (LFs)
    #lfs = [lf_keyword_wateroverlast,lf_keyword_voertuig,lf_keyword_aanrijding,lf_keyword_te_water,lf_keyword_persoon,lf_keyword_brand,lf_keyword_mps,lf_keyword_kps,lf_keyword_luchtdr]

    #lfs = [lf_keyword_keywords]

    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")
    #tie_break_policy="true-random"
    #tie_break_policy="abstain"
    counter = 0
    for i in range(len(df_train["label"])):
        if df_train["label"][i] == WATER:
            print()
            print(df_train["text"][i])
            print(df_train["label"][i])
            print()
            counter += 1

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    #df_train = df_train[df_train.label != ABSTAIN]

    twitter_curated = df_train[df_train.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
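Per-tweet printing aside, the tally in the loop above can be computed in a single vectorized expression (same result, same WATER constant from the surrounding module):

counter = int((df_train["label"] == WATER).sum())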
Code Example #9
test_unfired_idx = [i for i, item in enumerate(test_m) if sum(item) == 0]
test_fired_idx = [i for i, item in enumerate(test_m) if sum(item) != 0]  # rows where at least one rule fired
targets_test = test_L[test_fired_idx]

# Majority voting using Snorkel's majority-vote model
maj_preds_test = majority_model.predict(L=test_lsnork[test_fired_idx])
maj_precision_test, maj_recall_test, maj_f1_score_test, maj_support_test = precision_recall_fscore_support(targets_test, maj_preds_test)
maj_accuracy_test = compute_accuracy(maj_support_test, maj_recall_test)

print("precision on *** RULE COVERD TEST SET ***   of MAJORITY VOTING: {}".format(maj_precision_test))
print("recall on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of MAJORITY VOTING: {}".format(maj_f1_score_test))
print("support on *** RULE COVERED TEST SET ***  of MAJORITY VOTING: {}".format(maj_support_test))
print("accuracy on *** RULE COVERED TEST SET ***   of MAJORITY VOTING: {}".format(maj_accuracy_test))


# Now train Snorkel's LabelModel
print("Training Snorkel's LabelModel")
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=U_lsnork, n_epochs=1000, lr=0.001, log_freq=100, seed=123)
label_model.save(os.path.join(path_dir,"saved_label_model"))



snork_preds_test = label_model.predict(L=test_lsnork[test_fired_idx])
snork_precision_test, snork_recall_test, snork_f1_score_test, snork_support_test = precision_recall_fscore_support(targets_test, snork_preds_test)
snork_accuracy_test = compute_accuracy(snork_support_test, snork_recall_test)
print("precision on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_precision_test))
print("recall on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_recall_test))
print("f1_score on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_f1_score_test))
print("support on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_support_test))
print("accuracy on *** RULE COVERED TEST SET *** of SNORKEL VOTING: {}".format(snork_accuracy_test))
Code Example #10
lfs = [
    lf.lf_spacy_words_sexism, lf.lf_keyword_raicism, lf.lf_spacy_words_gpe,
    lf.lf_keyword_shaming, lf.lf_spacy_threat, lf.lf_spacy_terrorism,
    lf.lf_neg_nonehumansubject
]
# Unused ones :
# lf.lf_spacy_animals, lf.lf_spacy_politics,  # giving false positives

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
# Cardinality was 2. Got : ValueError: L_train has cardinality 3, cardinality=2 passed in.
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

#output
df_train.to_csv('labelledDataset.csv', index=None, header=True)

# Filter out useless data
df_train = df_train[df_train.label != ABSTAIN]
print("Useful data remaining: " + str(df_train.shape[0]))

# Ignoring Transformation Functions for Data Augmentation for now...
# TODO: create transformation functions for different categories of hatespeech

# Ignoring slicing, don't think we need it

# Training a Classifier
docs = df_train.iloc[:, 0].tolist()  # first column of the data frame
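A minimal sketch of the classifier step the comment announces, using a scikit-learn bag-of-words pipeline (the vectorizer and model choice here are assumptions, not part of the original script):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(docs)

# liblinear handles the 3-class problem via one-vs-rest.
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train, df_train["label"].values)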
Code Example #11
# ## Train LabelModel And Generate Probabilistic Labels

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LabelModel

# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)

# %% [markdown]
# As a spot-check for the quality of our LabelModel, we'll score it on the dev set.

# %%
from snorkel.analysis import metric_score

preds_dev = label_model.predict(L_dev)

acc = metric_score(Y_dev, preds_dev, probs=None, metric="accuracy")
print(f"LabelModel Accuracy: {acc:.3f}")

# %% [markdown]
# We see that we get very high accuracy on the development set.
# This is due to the abundance of high-quality crowdworker labels.
# **Since we don't have these high-quality crowdsourcing labels for the
# test set or new incoming data points, we can't use the LabelModel reliably
# at inference time.**
# In order to run inference on new incoming data points, we need to train a
# discriminative model over the tweets themselves.
# Let's generate a set of labels for that training set.

# %%
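# A minimal sketch of that step (hypothetical continuation; the original
# notebook cell is empty at this point): generate training-set labels with
# the fit LabelModel, abstaining on ties.
preds_train = label_model.predict(L_train, tie_break_policy="abstain")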
Code Example #12
# %%
label_model.score(L_valid, Y_valid, metrics=["f1_micro"])

# %% [markdown]
# ## 4. Train a Classifier
# You can then use these training labels to train any standard discriminative model, such as [an off-the-shelf ResNet](https://github.com/KaimingHe/deep-residual-networks), which should learn to generalize beyond the LFs we've developed!

# %% [markdown]
# #### Create DataLoaders for Classifier

# %%
from snorkel.classification import DictDataLoader
from model import SceneGraphDataset, create_model

df_train["labels"] = label_model.predict(L_train)

if sample:
    TRAIN_DIR = "data/VRD/sg_dataset/samples"
else:
    TRAIN_DIR = "data/VRD/sg_dataset/sg_train_images"

dl_train = DictDataLoader(
    SceneGraphDataset("train_dataset", "train", TRAIN_DIR, df_train),
    batch_size=16,
    shuffle=True,
)

dl_valid = DictDataLoader(
    SceneGraphDataset("valid_dataset", "valid", TRAIN_DIR, df_valid),
    batch_size=16,
    shuffle=False,  # assumed: validation loaders are typically not shuffled
)
Code Example #13
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

# In[12]:

print(
    f"Training set coverage: {100 * LFAnalysis(L_train).label_coverage(): 0.1f}%"
)

# In[15]:

from snorkel.labeling import LabelModel

# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=150, seed=125, log_freq=30, l2=0.1, lr=0.01)

# In[16]:

label = label_model.predict(L_train)

# In[18]:

len(label)

# In[ ]:

import pickle

with open('flabel.pkl', 'wb') as f:
    pickle.dump(label, f)
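A quick round-trip check of the pickled labels (a sketch; same label array as above):

with open('flabel.pkl', 'rb') as f:
    label_loaded = pickle.load(f)

assert (label_loaded == label).all()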