def label_model_trainer(label_model, L_train, df_train, df_test, Y_test):
    """
    Train the end (extraction) model.
    We first obtain the probabilities of the two classes (True and False) from the label model,
    then use those probabilistic labels to train the end model.
    """

    # Extract the probabilities for the training set using the label model
    probs_train = label_model.predict_proba(L_train)

    # Data points that did not receive a label from any labeling function cannot be used,
    # so we filter them out and keep only the covered data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    X_train = uniform_length(df_train_filtered)
    model = rnn_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=50)

    X_test = uniform_length(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print(
        f"Test F1 when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='f1')}"
    )
    print(
        f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
    )
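The helpers uniform_length and rnn_model are not defined in this example. Below is a minimal sketch of what they might look like; the "tokens" column, vocabulary size, and architecture are assumptions, included only to show that a softmax output trained with categorical cross-entropy can be fit directly on the probabilistic labels.

import tensorflow as tf

def uniform_length(df, max_len=100):
    # Assumption: the dataframe has a "tokens" column of integer token ids;
    # pad/truncate every sequence to a fixed length
    return tf.keras.preprocessing.sequence.pad_sequences(
        df["tokens"].tolist(), maxlen=max_len, padding="post", truncating="post")

def rnn_model(vocab_size=20000):
    # Softmax over the two classes so the model can be fit on soft (probabilistic) labels
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(2, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy")
    return model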
Example #2
    def train(self, dataset):
        # Apply labeling functions to training set
        lfs_applier = PandasLFApplier(lfs=self.lfs)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            lfs_train = lfs_applier.apply(df=dataset)

        # Build probabilistic label model
        label_model = LabelModel(cardinality=3, verbose=True)
        label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
        label_probs = label_model.predict_proba(lfs_train)

        # Filter unlabeled data points
        df_filtered, probs_filtered = filter_unlabeled_dataframe(X=dataset,
                                                                 y=label_probs,
                                                                 L=lfs_train)

        # Featurize data using scikit
        self.vectorizer = CountVectorizer(ngram_range=(1, 5))
        dataset_train = self.vectorizer.fit_transform(
            df_filtered.sentence.tolist())

        # Replace probabilistic labels with most likely label
        preds_filtered = probs_to_preds(probs=probs_filtered)

        # Train scikit model
        self.model = LogisticRegression(C=1e3,
                                        solver="liblinear",
                                        multi_class='auto')
        self.model.fit(X=dataset_train, y=preds_filtered)
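A matching inference step is not shown in this example; a minimal companion sketch (method name assumed) would reuse the fitted vectorizer and model:

    def predict(self, sentences):
        # Sketch only: vectorize raw sentences with the fitted CountVectorizer,
        # then return hard labels from the trained LogisticRegression model
        X = self.vectorizer.transform(sentences)
        return self.model.predict(X)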
Example #3
    def predict(
        self,
        L: np.ndarray,
        return_probs: Optional[bool] = False,
        tie_break_policy: str = "random",
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """Return predicted labels, with ties broken according to policy.

        Policies to break ties include:
        "abstain": return an abstain vote (-1)
        "true-random": randomly choose among the tied options
        "random": randomly choose among tied option using deterministic hash

        NOTE: if tie_break_policy="true-random", repeated runs may produce slightly different
        results due to differences in how ties are broken


        Parameters
        ----------
        L
            An [n,m] matrix with values in {-1,0,1,...,k-1}
        return_probs
            Whether to return probs along with preds
        tie_break_policy
            Policy to break ties when converting probabilistic labels to predictions

        Returns
        -------
        np.ndarray
            An [n,1] array of integer labels

        (np.ndarray, np.ndarray)
            An [n,1] array of integer labels and an [n,k] array of probabilistic labels


        Example
        -------
        >>> L = np.array([[0, 0, -1], [1, 1, -1], [0, 0, -1]])
        >>> label_model = LabelModel(verbose=False)
        >>> label_model.fit(L)
        >>> label_model.predict(L)
        array([0, 1, 0])
        """
        Y_probs = self.predict_proba(L)
        Y_p = probs_to_preds(Y_probs, tie_break_policy)
        if return_probs:
            return Y_p, Y_probs
        return Y_p
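As a small illustration of the tie-break behavior (values made up for this note, not from the source): with perfectly tied probabilities, the "abstain" policy yields -1 instead of picking a class.

import numpy as np
from snorkel.utils import probs_to_preds

probs_to_preds(np.array([[0.5, 0.5]]), tie_break_policy="abstain")  # array([-1])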
Example #4
    def predict(
        self,
        cliquesets: CliqueSetList,
        return_probs: Optional[bool] = False,
        tie_break_policy: str = "abstain",
    ) -> Union[CliqueSetProbs, CliqueSetProbsAndPreds]:
        r"""Run prediction on a ```CliqueSetList```.

        A ```LabelModel's``` output is determined by the "Events" that cooccured, which we call a CliqueSet.
        This accepts an iterable of CliqueSets and runs prediction on each. In practice, the number of unique CliqueSets
        present in the data set is order of magnitude smaller than the total number of CliqueSets as well the number of
        distinct examples. Hence, this method runs inference once per inputed CliqueSet and returns the CliqueSets as well
        as the predections, allowing the user to join back on the original data at reduced computational cost.

        Parameters
        ----------
        cliquesets
            An iterable of CliqueSets
        return_probs
            Whether to return probs along with preds
        tie_break_policy
            Policy to break ties when converting probabilistic labels to predictions

        Returns
        -------
        CliqueSetProbs
            A 2-tuple whose first element is a list of CliqueSets and whose second element is a [len(cliquesets), k] array
            such that ar[i] gives the probabilities for cliqueset i

        CliqueSetProbsAndPreds
            A 3-tuple whose first element is a list of CliqueSets, whose second element is a [len(cliquesets), k] array,
            and whose third element is a [len(cliquesets), 1] array such that ar_1[i] gives the probabilities for
            cliqueset i and ar_2[i] is the predicted class for that cliqueset.

        """
        # The user's cliquesets may be an unordered iterable (e.g. a set), so we take the ordered list
        cliqsets_list, Y_probs = self.predict_proba_from_cliqueset(cliquesets)
        if return_probs:
            Y_p = probs_to_preds(Y_probs, tie_break_policy)
            return (cliqsets_list, Y_probs, Y_p)
        else:
            return (cliqsets_list, Y_probs)
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):

    # Accumulate all the labeling functions for supply
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer, lf_acquisition,
        lf_people, lf_sold, lf_relation, lf_competition
    ]

    # Create an applier to run the labeling functions over Pandas DataFrames
    applier = PandasLFApplier(supply_lfs)

    # Apply the labeling functions to the development, train, and test sets
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    L_test = applier.apply(df_test)

    # cardinality: 2 (True and False)
    label_model = LabelModel(cardinality=2, verbose=True)

    # Fit the label_model
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # accuracy for the label model using the test set
    label_model_acc = label_model.score(L=L_test,
                                        Y=Y_test,
                                        tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # Check the F1 and ROC-AUC scores on the development set
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )

    return label_model, L_train
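A sketch of how this helper chains with label_model_trainer from the first example (assuming a load_data helper like the one used in Example #13):

((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()
label_model, L_train = label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test)
label_model_trainer(label_model, L_train, df_train, df_test, Y_test)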
Example #6
    def train(self):
        '''
        Train the logistic regression discriminative model
        '''
        # We pull out the label vectors for ease of use later
        Y_test = self.df_test.label.values

        applier = PandasLFApplier(lfs=self.lfs)
        L_train = applier.apply(df=self.df_train)

        # Use the LabelModel to combine the labeling function outputs
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

        # Make predictions
        probs_train = label_model.predict_proba(L=L_train)

        # Filter abstained inputs
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=L_train)

        # Represent each data point as a one-hot vector
        vectorizer = CountVectorizer(ngram_range=(1, 5))
        X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        # Turn probs into preds
        preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

        # Train logistic regression model
        sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
        sklearn_model.fit(X=X_train, y=preds_train_filtered)

        print(
            f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
        )
        dump(sklearn_model, 'sklearn_model.joblib')
        dump(vectorizer, 'vectorizer.joblib')
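A minimal sketch of reloading the persisted artifacts for later inference (file names taken from the dump calls above; the input text is made up):

from joblib import load

vectorizer = load('vectorizer.joblib')
sklearn_model = load('sklearn_model.joblib')
print(sklearn_model.predict(vectorizer.transform(["some unseen text"])))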
Example #7
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(cardinality=len(
            [c for c in dir(Polarity) if not c.startswith("__")]),
                                 verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
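The two modes return differently shaped tuples; a usage sketch (data loading assumed):

# Majority-vote baseline: hard labels only
df_filtered, preds_filtered, analysis = labeling_evaluation(df_train, df_test, label_model="majority")

# Weighted LabelModel: probabilistic labels plus their argmax predictions
df_filtered, probs_filtered, preds_filtered, analysis = labeling_evaluation(df_train, df_test, label_model="weighted")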
Example #8
    def test_probs_to_preds(self):
        np.testing.assert_array_equal(probs_to_preds(PROBS), PREDS)

        # abstains with ties
        probs = np.array([[0.33, 0.33, 0.33]])
        preds = probs_to_preds(probs, tie_break_policy="abstain")
        true_preds = np.array([-1])
        np.testing.assert_array_equal(preds, true_preds)

        # true random with ties
        probs = np.array([[0.33, 0.33, 0.33]])
        random_preds = []
        for seed in range(10):
            preds = probs_to_preds(probs, tie_break_policy="true-random")
            random_preds.append(preds[0])

        # check predicted labels within range
        self.assertLessEqual(max(random_preds), 2)
        self.assertGreaterEqual(min(random_preds), 0)

        # deterministic random with ties
        probs = np.array(
            [[0.33, 0.33, 0.33], [0.0, 0.5, 0.5], [0.33, 0.33, 0.33], [0.5, 0.5, 0]]
        )
        random_preds = []
        for _ in range(10):
            preds = probs_to_preds(probs, tie_break_policy="random")
            random_preds.append(preds)

        # check labels are same across seeds
        for i in range(len(random_preds) - 1):
            np.testing.assert_array_equal(random_preds[i], random_preds[i + 1])

        # check predicted labels within range (only one instance since should all be same)
        self.assertLessEqual(max(random_preds[0]), 2)
        self.assertGreaterEqual(min(random_preds[0]), 0)

        # check invalid policy
        with self.assertRaisesRegex(ValueError, "policy not recognized"):
            preds = probs_to_preds(probs, tie_break_policy="negative")

        # check invalid input
        with self.assertRaisesRegex(ValueError, "probs must have probabilities"):
            preds = probs_to_preds(np.array([[0.33], [0.33]]))
Example #9
    def predict(
        self,
        dataloader: DictDataLoader,
        return_preds: bool = False,
        remap_labels: Dict[str, Optional[str]] = {},
    ) -> Dict[str, Dict[str, torch.Tensor]]:
        """Calculate probabilities, (optionally) predictions, and pull out gold labels.

        Parameters
        ----------
        dataloader
            A DictDataLoader to make predictions for
        return_preds
            If True, include predictions in the return dict (not just probabilities)
        remap_labels
            A dict specifying which labels in the dataset's Y_dict (key)
            to remap to a new task (value)

        Returns
        -------
        Dict[str, Dict[str, torch.Tensor]]
            A dictionary mapping label type ('golds', 'probs', 'preds') to values
        """
        self.eval()

        gold_dict_list: Dict[str, List[torch.Tensor]] = defaultdict(list)
        prob_dict_list: Dict[str, List[torch.Tensor]] = defaultdict(list)

        labels_to_tasks = self._get_labels_to_tasks(
            label_names=dataloader.dataset.Y_dict.keys(),  # type: ignore
            remap_labels=remap_labels,
        )
        for batch_num, (X_batch_dict, Y_batch_dict) in enumerate(dataloader):
            prob_batch_dict = self._calculate_probs(X_batch_dict,
                                                    labels_to_tasks.values())
            for label_name in labels_to_tasks:
                task_name = labels_to_tasks[label_name]
                Y = Y_batch_dict[label_name]

                # Note: store results under label_name
                # but retrieve from pre-computed results using task_name
                prob_dict_list[label_name].extend(prob_batch_dict[task_name])
                gold_dict_list[label_name].extend(Y.cpu().numpy())

        gold_dict: Dict[str, np.ndarray] = {}
        prob_dict: Dict[str, np.ndarray] = {}

        for task_name in gold_dict_list:
            gold_dict[task_name] = np.array(gold_dict_list[task_name])
            prob_dict[task_name] = np.array(prob_dict_list[task_name])

        if return_preds:
            pred_dict: Dict[str, np.ndarray] = defaultdict(list)
            for task_name, probs in prob_dict.items():
                pred_dict[task_name] = probs_to_preds(probs)

        results = {"golds": gold_dict, "probs": prob_dict}

        if return_preds:
            results["preds"] = pred_dict

        return results
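A sketch of consuming the returned dictionary (model and dataloader names assumed):

results = model.predict(valid_dataloader, return_preds=True)
for label_name in results["preds"]:
    acc = (results["preds"][label_name] == results["golds"][label_name]).mean()
    print(f"{label_name}: accuracy={acc:.3f}")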
Example #10
test_acc = metric_score(golds=Y_test, preds=preds_test_dev, metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

# %% [markdown]
# ### Scikit-Learn with Rounded Labels

# %% [markdown]
# If we want to use a library or model that doesn't accept probabilistic labels, we can replace each label distribution with the label of the class that has the maximum probability.
# This can easily be done using the
# [`probs_to_preds` helper method](https://snorkel.readthedocs.io/en/master/packages/_autosummary/utils/snorkel.utils.probs_to_preds.html#snorkel.utils.probs_to_preds).
# It's important to note that this transformation is lossy, as we no longer have values for our confidence in each label.

# %%
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(probs=probs_train_filtered)
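
# %% [markdown]
# As a small illustration with made-up values (not from this dataset): `probs_to_preds` takes the argmax of each row, so the difference between a 0.9-confidence and a 0.6-confidence prediction is discarded.

# %%
import numpy as np

probs_to_preds(np.array([[0.9, 0.1], [0.4, 0.6]]))  # array([0, 1])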

# %% [markdown]
# For example, this allows us to use standard models from Scikit-Learn.

# %% {"tags": ["md-exclude-output"]}
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

# %%
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

# %% [markdown]
# ## Summary
Example #11
probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=probs_train, L=L_train)

print('setting up Label Model')
stop_words = config['stop_words']
custom_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
# vectorizer = CountVectorizer(ngram_range=(1, 5))
vectorizer = TfidfVectorizer(stop_words=custom_stop_words).fit(
    X_train.text.tolist())
X_train_vectorized = vectorizer.transform(X_train.text.tolist())
X_train_filtered_vectorized = vectorizer.transform(
    df_train_filtered.text.tolist())
preds_train_filtered = probs_to_preds(
    probs=probs_train_filtered
)  # using weak labels generated by Label Model to train downstream classifier
X_dev = vectorizer.transform(X_dev.text.tolist())

print('training Logistic Regression model...')
log_param('model', 'log_reg_cv_10')
sklearn_model_weak_sup = LogisticRegressionCV(max_iter=500,
                                              cv=10,
                                              random_state=0,
                                              solver='liblinear').fit(
                                                  X_train_filtered_vectorized,
                                                  preds_train_filtered)
sklearn_model_full_sup = LogisticRegressionCV(max_iter=500,
                                              cv=10,
                                              random_state=0,
                                              solver='liblinear').fit(
Example #12
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# results using label model
label_model_acc = label_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

# representing each data point using "bag of n-gram" feature
probs_train = label_model.predict_proba(L=L_train)
vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(data_train.text.tolist())
X_test = vectorizer.transform(data_test.text.tolist())

# replace each label distribution with the label having maximum probability
preds_train = probs_to_preds(probs=probs_train)

# train a Scikit-Learn classifier
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train)
# result of the classifier accuracy
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")
Example #13

if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()
    lfs = [lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name,
           lf_married, lf_familial_relationship, lf_family_left_window,
           lf_other_relationship, lf_distant_supervision, lf_distant_supervision_last_names]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(X=df_train, y=probs_train, L=L_train)
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print("Label model F1: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='roc_auc')))

Example #14
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

    L_test = applier.apply(test_df)
    # to_numerical = lambda x: x=='leave'
    # Y_test = [to_numerical(item) for item in test_df.label]
    Y_test = []
    for item in test_df.label:
        if item == 'stay':
            Y_test.append(STAY)
        else:
            Y_test.append(LEAVE)

    Y_test = np.asarray(Y_test)
    label_model_performance = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random",
                                                metrics=['accuracy', 'precision', 'recall', 'f1'])
    print(f"Label Model Accuracy: {label_model_performance['accuracy'] * 100:.1f}%")
    predict_probs = label_model.predict_proba(L_unlabeled)
    preds = probs_to_preds(predict_probs)
    pred_labels = []
    for i in range(len(preds)):
        if preds[i]:
            pred_labels.append('leave')
        else:
            pred_labels.append('stay')
    unlabeled_data['label'] = pred_labels
    unlabeled_data.to_csv(os.path.join(data_dir, 'snorkel_labeled_data.csv'), sep=',', index=False)


X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data()

applier = PandasLFApplier(label_functions.lfs)

df_train = pd.DataFrame(list(zip(*data_handler.get_training_data())), columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])

L_train = applier.apply(df_train)

label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)

label_model.save('./models/LabelModel.model')

train_probs = label_model.predict_proba(L_train)
train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')

df_train = df_train.join(pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))}))

# Map abstains (-1) to the otherwiseRelated class
df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated']

# Downsample otherwiseRelated
dropNum = len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]) - int(df_train['preds'].value_counts().mean())
df_train = df_train.drop(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']].sample(dropNum).index)

cnts = {}
for x in df_train['preds']:
    name = rel_names.rels_int_to_text[x]
    if name not in cnts:
        cnts[name] = 0