def label_model_trainer(label_model, L_train, df_train):
    """Train the end (extraction) model on soft labels and print test metrics.

    The label model converts the label matrix into per-class probabilities
    for the binary choices True/False. Data points that received no label
    from any labeling function are removed with
    ``filter_unlabeled_dataframe``, and the remaining probabilistic labels
    are used as training targets for the model built by ``rnn_model``.

    Parameters
    ----------
    label_model
        Label model exposing ``predict_proba``.
    L_train
        Label matrix for the training set.
    df_train
        Training data frame, filtered together with ``L_train``.

    Notes
    -----
    ``df_test`` and ``Y_test`` are not parameters; they are resolved from the
    enclosing (module) scope when the function runs. Nothing is returned:
    the F1 and ROC-AUC scores on the test set are printed.
    """
    # Soft labels for every training row, straight from the label model.
    soft_labels = label_model.predict_proba(L_train)

    # Rows no labeling function covered carry no signal, so drop them.
    covered_df, covered_probs = filter_unlabeled_dataframe(
        X=df_train, y=soft_labels, L=L_train
    )

    train_features = uniform_length(covered_df)
    end_model = rnn_model()
    end_model.fit(train_features, covered_probs, batch_size=64, epochs=50)

    # Evaluate on the test split taken from the enclosing scope.
    test_probs = end_model.predict(uniform_length(df_test))
    test_preds = probs_to_preds(test_probs)

    f1 = metric_score(Y_test, preds=test_preds, metric='f1')
    print(f"Test F1 when trained with soft labels: {f1}")
    roc_auc = metric_score(Y_test, probs=test_probs, metric='roc_auc')
    print(f"Test ROC-AUC when trained with soft labels: {roc_auc}")
def train(self, dataset):
    """Fit a logistic-regression classifier on weakly labelled sentences.

    The labeling functions in ``self.lfs`` are applied to ``dataset``, a
    three-class ``LabelModel`` is fitted on the resulting label matrix, and
    its most likely label per covered row becomes the training target for a
    bag-of-n-grams (1 to 5) ``LogisticRegression``.

    Parameters
    ----------
    dataset
        Data frame to label and train on; the filtered frame must expose a
        ``sentence`` column.

    Side effects
    ------------
    Sets ``self.vectorizer`` (fitted ``CountVectorizer``) and ``self.model``
    (fitted ``LogisticRegression``). Returns ``None``.
    """
    applier = PandasLFApplier(lfs=self.lfs)
    # Warnings raised while applying the labeling functions are silenced
    # only for the duration of this block.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        label_matrix = applier.apply(df=dataset)

    # Probabilistic label model over three classes.
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=label_matrix, n_epochs=500, log_freq=100, seed=42)
    all_probs = label_model.predict_proba(label_matrix)

    # Keep only the rows that received at least one label.
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=dataset, y=all_probs, L=label_matrix
    )

    # Bag-of-n-grams features, fitted on the filtered sentences.
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    sentences = df_filtered.sentence.tolist()
    features = self.vectorizer.fit_transform(sentences)

    # scikit-learn needs hard labels, so collapse each distribution.
    hard_labels = probs_to_preds(probs=probs_filtered)

    self.model = LogisticRegression(
        C=1e3, solver="liblinear", multi_class='auto'
    )
    self.model.fit(X=features, y=hard_labels)
def predict(
    self,
    L: np.ndarray,
    return_probs: Optional[bool] = False,
    tie_break_policy: str = "random",
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Predict integer labels for a label matrix, breaking ties by policy.

    Probabilistic labels are obtained from ``self.predict_proba`` and then
    converted to hard predictions with ``probs_to_preds``.

    Supported tie-break policies:
        "abstain": return an abstain vote (-1)
        "true-random": randomly choose among the tied options
        "random": randomly choose among tied options using a deterministic hash

    NOTE: with tie_break_policy="true-random", repeated runs may give
    slightly different results because ties are broken differently.

    Parameters
    ----------
    L
        An [n,m] matrix with values in {-1,0,1,...,k-1}
    return_probs
        If truthy, also return the probabilistic labels
    tie_break_policy
        Policy used to break ties when turning probabilities into predictions

    Returns
    -------
    np.ndarray
        An [n,1] array of integer labels

    (np.ndarray, np.ndarray)
        An [n,1] array of integer labels and an [n,k] array of probabilistic labels

    Example
    -------
    >>> L = np.array([[0, 0, -1], [1, 1, -1], [0, 0, -1]])
    >>> label_model = LabelModel(verbose=False)
    >>> label_model.fit(L)
    >>> label_model.predict(L)
    array([0, 1, 0])
    """
    probs = self.predict_proba(L)
    preds = probs_to_preds(probs, tie_break_policy)
    return (preds, probs) if return_probs else preds
def predict(
    self,
    cliquesets: CliqueSetList,
    return_probs: Optional[bool] = False,
    tie_break_policy: str = "abstain",
) -> Union[CliqueSetProbs, CliqueSetProbsAndPreds]:
    r"""Run prediction on a ```CliqueSetList```.

    A ```LabelModel's``` output is determined by the "Events" that
    co-occurred, which we call a CliqueSet. This method accepts an iterable
    of CliqueSets and runs prediction on each.

    In practice, the number of unique CliqueSets present in the data set is
    orders of magnitude smaller than the total number of CliqueSets as well
    as the number of distinct examples. Hence, this method runs inference
    once per supplied CliqueSet and returns the CliqueSets together with the
    predictions, allowing the user to join back on the original data at
    reduced computational cost.

    Parameters
    ----------
    cliquesets
        An iterable of CliqueSets
    return_probs
        When falsy, only the CliqueSets and their probabilities are
        returned; when truthy, hard predictions are computed as well and
        appended as a third element
    tie_break_policy
        Policy to break ties when converting probabilistic labels to predictions

    Returns
    -------
    CliqueSetProbs
        A 2-tuple whose first element is a list of CliqueSets and whose
        second element is an [len(cliquesets),k] array such that ar[i] are
        the probabilities for cliqueset i

    CliqueSetProbsAndPreds
        A 3-tuple whose first element is a list of CliqueSets, whose second
        element is an [len(cliquesets),k] array and whose third element is
        an [len(cliquesets),1] array, such that ar_1[i] are the
        probabilities for cliqueset i and ar_2[i] is the predicted class
        for that cliqueset.
    """
    # The caller's cliquesets may be an unordered iterable (e.g. a set), so
    # the ordered list produced during inference is what gets returned.
    ordered_cliquesets, probs = self.predict_proba_from_cliqueset(cliquesets)

    if not return_probs:
        return (ordered_cliquesets, probs)

    preds = probs_to_preds(probs, tie_break_policy)
    return (ordered_cliquesets, probs, preds)
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):
    """Fit a binary label model on the supply labeling functions.

    The labeling functions are applied to the development, training and
    test frames. The label model is fitted on the training label matrix
    (with ``Y_dev`` passed to ``fit``), its accuracy on the test split and
    its F1 / ROC-AUC on the development split are printed.

    Parameters
    ----------
    df_dev, Y_dev
        Development data frame and its gold labels.
    df_train
        Training data frame.
    df_test, Y_test
        Test data frame and its gold labels.

    Returns
    -------
    tuple
        ``(label_model, L_train)``: the fitted label model and the training
        label matrix.
    """
    # Every labeling function that votes on the supply relation.
    supply_lfs = [
        lf_supply,
        lf_customer,
        lf_sales_to,
        lf_our_customer,
        lf_acquisition,
        lf_people,
        lf_sold,
        lf_relation,
        lf_competition,
    ]

    # One applier, run over the three splits in dev / train / test order.
    applier = PandasLFApplier(supply_lfs)
    L_dev, L_train, L_test = (
        applier.apply(frame) for frame in (df_dev, df_train, df_test)
    )

    # cardinality 2: True and False
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # Accuracy of the label model on the test split.
    test_scores = label_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random"
    )
    label_model_acc = test_scores["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # F1 and ROC-AUC of the label model on the development split.
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    dev_f1 = metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')
    print(f"Label model f1 score: {dev_f1}")
    dev_auc = metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')
    print(f"Label model roc-auc: {dev_auc}")

    return label_model, L_train
def train(self):
    """Train the logistic regression discriminative model.

    Applies ``self.lfs`` to ``self.df_train``, fits a binary ``LabelModel``,
    drops the rows no labeling function voted on, and trains a
    ``LogisticRegression`` on n-gram (1 to 5) count features using the
    label model's most likely label per row. Test accuracy on
    ``self.df_test`` is printed, and the classifier and vectorizer are
    written with ``dump`` to ``sklearn_model.joblib`` and
    ``vectorizer.joblib``. Returns ``None``.
    """
    # Gold test labels, pulled out once for the final evaluation.
    gold_test = self.df_test.label.values

    label_matrix = PandasLFApplier(lfs=self.lfs).apply(df=self.df_train)

    # Combine the labeling-function votes with the label model.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=label_matrix, n_epochs=500, log_freq=100, seed=123)
    train_probs = label_model.predict_proba(L=label_matrix)

    # Rows on which every labeling function abstained are filtered out.
    covered_df, covered_probs = filter_unlabeled_dataframe(
        X=self.df_train, y=train_probs, L=label_matrix
    )

    # n-gram count features; the vocabulary comes from the filtered
    # training text only.
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    train_features = vectorizer.fit_transform(covered_df.text.tolist())
    test_features = vectorizer.transform(self.df_test.text.tolist())

    # Hard labels for scikit-learn.
    train_targets = probs_to_preds(probs=covered_probs)

    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=train_features, y=train_targets)

    test_accuracy = sklearn_model.score(X=test_features, y=gold_test)
    print(f"Test Accuracy: {test_accuracy * 100:.1f}%")

    dump(sklearn_model, 'sklearn_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')
def labeling_evaluation(df_train, df_test, label_model):
    """Label the training set with weak supervision and report test accuracy.

    The labeling functions are applied to both frames, a summary of their
    behaviour on the training set is computed, and the votes are combined
    with the strategy named by ``label_model``. Rows of ``df_train`` that no
    labeling function voted on are filtered out of the returned data.

    Parameters
    ----------
    df_train
        Training data frame to label.
    df_test
        Test data frame; must expose a ``label`` column with gold labels.
    label_model
        ``"majority"`` to combine votes with ``MajorityLabelVoter`` or
        ``"weighted"`` to fit a ``LabelModel``.

    Returns
    -------
    tuple
        For ``"majority"``:
        ``(df_train_filtered, preds_train_filtered, analysis)``.
        For ``"weighted"``:
        ``(df_train_filtered, probs_train_filtered, preds_train_filtered,
        analysis)``.

    Raises
    ------
    ValueError
        If ``label_model`` is neither ``"majority"`` nor ``"weighted"``.
        Previously such a value fell through every branch and the function
        returned ``None`` after doing all the labeling work.
    """
    # Validate first so a typo fails immediately instead of after applying
    # every labeling function to both frames.
    if label_model not in ("majority", "weighted"):
        raise ValueError(
            f"label_model must be 'majority' or 'weighted', got {label_model!r}"
        )

    lfs = [
        LabelingFunction.lf_ind_keyword,
        LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re,
        LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re,
        LabelingFunction.industry_cls,
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random"
        )["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train
        )
        return df_train_filtered, preds_train_filtered, analysis

    # label_model == "weighted"
    # The number of classes is the number of non-dunder attributes on
    # Polarity.
    cardinality = len([c for c in dir(Polarity) if not c.startswith("__")])
    # Use a separate local name so the string parameter is not shadowed.
    weighted_model = LabelModel(cardinality=cardinality, verbose=True)
    weighted_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    probs_train = weighted_model.predict_proba(L_train)
    label_model_acc = weighted_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random"
    )["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )
    preds_train_filtered = probs_to_preds(probs_train_filtered)
    return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def test_probs_to_preds(self):
    """Exercise ``probs_to_preds`` under each tie-break policy and bad input."""
    np.testing.assert_array_equal(probs_to_preds(PROBS), PREDS)

    # "abstain": a three-way tie must produce the abstain label.
    tied = np.array([[0.33, 0.33, 0.33]])
    np.testing.assert_array_equal(
        probs_to_preds(tied, tie_break_policy="abstain"), np.array([-1])
    )

    # "true-random": any of the tied classes may be chosen, so only the
    # range of the sampled labels is checked.
    tied = np.array([[0.33, 0.33, 0.33]])
    sampled = [
        probs_to_preds(tied, tie_break_policy="true-random")[0]
        for _ in range(10)
    ]
    self.assertLessEqual(max(sampled), 2)
    self.assertGreaterEqual(min(sampled), 0)

    # "random": deterministic, so repeated calls must agree with each other.
    probs = np.array(
        [[0.33, 0.33, 0.33], [0.0, 0.5, 0.5], [0.33, 0.33, 0.33], [0.5, 0.5, 0]]
    )
    runs = [probs_to_preds(probs, tie_break_policy="random") for _ in range(10)]
    for earlier, later in zip(runs, runs[1:]):
        np.testing.assert_array_equal(earlier, later)

    # All runs are equal, so checking the label range of the first suffices.
    self.assertLessEqual(max(runs[0]), 2)
    self.assertGreaterEqual(min(runs[0]), 0)

    # An unknown policy is rejected.
    with self.assertRaisesRegex(ValueError, "policy not recognized"):
        probs_to_preds(probs, tie_break_policy="negative")

    # A single-column probability matrix is rejected.
    with self.assertRaisesRegex(ValueError, "probs must have probabilities"):
        probs_to_preds(np.array([[0.33], [0.33]]))
def predict(
    self,
    dataloader: DictDataLoader,
    return_preds: bool = False,
    remap_labels: Optional[Dict[str, Optional[str]]] = None,
) -> Dict[str, Dict[str, torch.Tensor]]:
    """Calculate probabilities, (optionally) predictions, and pull out gold labels.

    Parameters
    ----------
    dataloader
        A DictDataLoader to make predictions for
    return_preds
        If True, include predictions in the return dict (not just probabilities)
    remap_labels
        A dict specifying which labels in the dataset's Y_dict (key)
        to remap to a new task (value); ``None`` (the default) means no
        remapping and is treated as an empty dict

    Returns
    -------
    Dict[str, Dict[str, torch.Tensor]]
        A dictionary mapping label type ('golds', 'probs', 'preds') to
        values; 'preds' is present only when ``return_preds`` is True. The
        per-label values are built with ``np.array``.
    """
    # A literal ``{}`` default is created once and shared by every call;
    # use a fresh dict per call instead.
    if remap_labels is None:
        remap_labels = {}

    self.eval()

    gold_dict_list: Dict[str, List[torch.Tensor]] = defaultdict(list)
    prob_dict_list: Dict[str, List[torch.Tensor]] = defaultdict(list)

    labels_to_tasks = self._get_labels_to_tasks(
        label_names=dataloader.dataset.Y_dict.keys(),  # type: ignore
        remap_labels=remap_labels,
    )
    for X_batch_dict, Y_batch_dict in dataloader:
        prob_batch_dict = self._calculate_probs(
            X_batch_dict, labels_to_tasks.values()
        )
        for label_name in labels_to_tasks:
            task_name = labels_to_tasks[label_name]
            Y = Y_batch_dict[label_name]

            # Note: store results under label_name
            # but retrieve from pre-computed results using task_name
            prob_dict_list[label_name].extend(prob_batch_dict[task_name])
            gold_dict_list[label_name].extend(Y.cpu().numpy())

    # Collapse the per-batch lists into one array per label.
    gold_dict: Dict[str, np.ndarray] = {}
    prob_dict: Dict[str, np.ndarray] = {}
    for task_name in gold_dict_list:
        gold_dict[task_name] = np.array(gold_dict_list[task_name])
        prob_dict[task_name] = np.array(prob_dict_list[task_name])

    results = {"golds": gold_dict, "probs": prob_dict}

    if return_preds:
        # Kept as a defaultdict so the returned object behaves as before
        # for callers that index a label with no predictions.
        pred_dict: Dict[str, np.ndarray] = defaultdict(list)
        for task_name, probs in prob_dict.items():
            pred_dict[task_name] = probs_to_preds(probs)
        results["preds"] = pred_dict

    return results
# Accuracy of the predictions held in `preds_test_dev` against the gold test labels.
test_acc = metric_score(golds=Y_test, preds=preds_test_dev, metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

# %% [markdown]
# ### Scikit-Learn with Rounded Labels

# %% [markdown]
# If we want to use a library or model that doesn't accept probabilistic labels, we can replace each label distribution with the label of the class that has the maximum probability.
# This can easily be done using the
# [`probs_to_preds` helper method](https://snorkel.readthedocs.io/en/master/packages/_autosummary/utils/snorkel.utils.probs_to_preds.html#snorkel.utils.probs_to_preds).
# It's important to note that this transformation is lossy, as we no longer have values for our confidence in each label.

# %%
from snorkel.utils import probs_to_preds

# Collapse each filtered label distribution to a single hard label.
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

# %% [markdown]
# For example, this allows us to use standard models from Scikit-Learn.

# %% {"tags": ["md-exclude-output"]}
from sklearn.linear_model import LogisticRegression

# Train on the rounded labels; X_train is defined earlier in the notebook.
sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

# %%
# Evaluate the scikit-learn classifier on the held-out test split.
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

# %% [markdown]
# ## Summary
probs_train = label_model.predict_proba(L=L_train) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=X_train, y=probs_train, L=L_train) print('setting up Label Model') stop_words = config['stop_words'] custom_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words) # vectorizer = CountVectorizer(ngram_range=(1, 5)) vectorizer = TfidfVectorizer(stop_words=custom_stop_words).fit( X_train.text.tolist()) X_train_vectorized = vectorizer.transform(X_train.text.tolist()) X_train_filtered_vectorized = vectorizer.transform( df_train_filtered.text.tolist()) preds_train_filtered = probs_to_preds( probs=probs_train_filtered ) # using weak labels generated by Label Model to train downstream classifier X_dev = vectorizer.transform(X_dev.text.tolist()) print('training Logistic Regression model...') log_param('model', 'log_reg_cv_10') sklearn_model_weak_sup = LogisticRegressionCV(max_iter=500, cv=10, random_state=0, solver='liblinear').fit( X_train_filtered_vectorized, preds_train_filtered) sklearn_model_full_sup = LogisticRegressionCV(max_iter=500, cv=10, random_state=0, solver='liblinear').fit(
# Fit a binary label model on the training label matrix (L_train is built
# earlier in the script).
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model (majority_model is created earlier in
# the script)
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# results using label model
label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

# representing each data point using "bag of n-gram" feature
# NOTE(review): no filter_unlabeled_dataframe call appears in this fragment,
# so every training row is used, including any that received no labeling
# function vote. Confirm this is intended.
probs_train = label_model.predict_proba(L=L_train)
vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(data_train.text.tolist())
X_test = vectorizer.transform(data_test.text.tolist())

# replace each label distribution with the label having maximum probability
preds_train = probs_to_preds(probs=probs_train)

# train a Scikit-Learn classifier
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train)

# result of the classifier accuracy
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")
if __name__ == "__main__":
    # End-to-end run: label the training set with weak supervision, report
    # the label model's quality on the dev set, then train the end model on
    # the soft labels and report its quality on the test set.
    warnings.filterwarnings("ignore")

    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()

    lfs = [
        lf_husband_wife,
        lf_husband_wife_left_window,
        lf_same_last_name,
        lf_married,
        lf_familial_relationship,
        lf_family_left_window,
        lf_other_relationship,
        lf_distant_supervision,
        lf_distant_supervision_last_names,
    ]
    applier = PandasLFApplier(lfs)
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))

    # Binary label model, fitted on the training label matrix.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

    # Label model quality on the development set.
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))

    # Soft labels for the training rows that received at least one vote.
    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train
    )

    # Train the end model on the soft labels.
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=100)

    # These metrics come from the end model's predictions on the test set,
    # not from the label model. The messages used to say "Label model",
    # which made the two result pairs indistinguishable in the output.
    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    print("End model F1: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='f1')))
    print("End model AUC: {f}".format(f=metric_score(Y_test, preds_test, probs=probs_test, metric='roc_auc')))
# Fit a binary label model on the training label matrix (L_train and applier
# are created earlier in the script).
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

L_test = applier.apply(test_df)

# Encode the gold string labels: 'stay' maps to STAY, anything else to LEAVE.
Y_test = np.asarray(
    [STAY if item == 'stay' else LEAVE for item in test_df.label]
)

label_model_performance = label_model.score(
    L=L_test,
    Y=Y_test,
    tie_break_policy="random",
    metrics=['accuracy', 'precision', 'recall', 'f1'],
)
print(f"Label Model Accuracy: {label_model_performance['accuracy'] * 100:.1f}%")

# Label the unlabeled pool with the fitted label model.
predict_probs = label_model.predict_proba(L_unlabeled)
preds = probs_to_preds(predict_probs)

# Decode back to strings: any truthy prediction is written as 'leave', a
# falsy one as 'stay'.
pred_labels = ['leave' if pred else 'stay' for pred in preds]

unlabeled_data['label'] = pred_labels
unlabeled_data.to_csv(
    os.path.join(data_dir, 'snorkel_labeled_data.csv'), sep=',', index=False
)
X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data() applier = PandasLFApplier(label_functions.lfs) df_train = pd.DataFrame(list(zip(*data_handler.get_training_data())), columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt']) L_train = applier.apply(df_train) label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True) label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123) label_model.save('./models/LabelModel.model') train_probs = label_model.predict_proba(L_train) train_preds = probs_to_preds(train_probs, tie_break_policy='abstain') df_train = df_train.join(pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))})) # -1 to otherwiseRelated df_train.loc[df_train.preds == -1, 'preds'] = rel_names.rels_txt_to_int['otherwiseRelated'] # Downsample otherwiseRelated dropNum = len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]) - int(df_train['preds'].value_counts().mean()) df_train = df_train.drop(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']].sample(dropNum).index) cnts = {} for x in df_train['preds']: name = rel_names.rels_int_to_text[x] if name not in cnts: cnts[name] = 0