def startSnorkelLabeling(df, keyword_groups=None, label=IRRELEVANT, l_type='SnorkelFilter'):
    '''
    Function: Label tweets with keyword-based Snorkel labeling functions

    Inputs:
    - df: tweets DataFrame (columns: [id, text]); new columns are added to it in place
    - keyword_groups: Keyword group and its relevant keywords (None means no groups)
        E.g. {'usps': ['postal service', 'usps'], 'invest': ['invest','portfolio','stock']}
    - label: label handed to every keyword labeling function
    - l_type: 'SnorkelFilter' writes one 'relevance' column,
        'SnorkelCategorise' writes one column per keyword group;
        any other value adds no column
    Outputs:
    - df: the input DataFrame with the added column(s)
        (e.g. columns = ['id', 'tweets', 'Refund', 'COVID'])
    - analysis: Snorkel Labeling Function statistics
    '''
    # A literal {} default would be one dict shared by every call; resolve it here.
    if keyword_groups is None:
        keyword_groups = {}

    # One labeling function per keyword group, named after the group.
    lfs = [
        make_keyword_lf(lf_name=name, keywords=keywords, label=label)
        for name, keywords in keyword_groups.items()
    ]

    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df)

    if l_type == 'SnorkelFilter':
        # For spam detection (Step 2)
        df['relevance'] = get_L_final_filter(L_train)
    elif l_type == 'SnorkelCategorise':
        # For categorising tweets (Step 3)
        # presumably get_L_final_categorise yields one sequence per group,
        # in the same order as keyword_groups - verify against the helper
        L_final = get_L_final_categorise(L_train)
        for name, L_values in zip(keyword_groups, L_final):
            df[name] = L_values

    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    return df, analysis
def train(self, dataset):
    """Fit the vectorizer and classifier from labeling-function votes.

    The labeling functions in ``self.lfs`` are applied to ``dataset``, a
    ``LabelModel`` converts the votes into class probabilities, the rows
    removed by ``filter_unlabeled_dataframe`` are dropped, and a logistic
    regression is fitted on n-gram counts of the remaining ``sentence``
    values. The fitted objects are stored on ``self.vectorizer`` and
    ``self.model``; nothing is returned.
    """
    applier = PandasLFApplier(lfs=self.lfs)
    # Every warning raised while the labeling functions run is suppressed.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        votes = applier.apply(df=dataset)

    # Probabilistic label model over the votes.
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=votes, n_epochs=500, log_freq=100, seed=42)
    class_probs = label_model.predict_proba(votes)

    # Keep only the rows the filter helper retains.
    kept_rows, kept_probs = filter_unlabeled_dataframe(
        X=dataset, y=class_probs, L=votes)

    # Count features over 1- to 5-grams of the sentences.
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    sentences = kept_rows.sentence.tolist()
    features = self.vectorizer.fit_transform(sentences)

    # Collapse probabilities to a single predicted label per row.
    hard_labels = probs_to_preds(probs=kept_probs)

    self.model = LogisticRegression(C=1e3, solver="liblinear",
                                    multi_class='auto')
    self.model.fit(X=features, y=hard_labels)
def snorkel_process(keylist, dataframe, allweaklabf):
    """Weakly label ``dataframe`` and return one-hot train/test label arrays.

    Steps: apply the labeling functions, print their summary, fit a
    ``LabelModel`` with one class per entry of ``keylist``, attach the
    predicted label (column ``L_label``) and the per-class probabilities
    (one column per key) to the data, drop rows whose ``L_label`` is
    negative, split 80/20 and binarise the probabilities so that only the
    highest-probability class of each row is 1.

    :param keylist: class names; also used as probability column names
    :param dataframe: input frame with a ``sent`` column; an ``L_label``
        column is added to it in place
    :param allweaklabf: labeling functions to apply
    :return: (train sentences, train one-hot labels, test sentences,
        test one-hot labels, keylist, LF summary report)
    """
    def func(x):
        # argsort of -x is descending, so everything after position 0 is
        # a non-maximal entry and gets zeroed.
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    def _one_hot(frame):
        # Work on a copy and use the returned array instead of relying on
        # apply_along_axis mutating its input through views.
        probs = frame[keylist].values.copy()
        top_only = np.apply_along_axis(func, 1, probs)
        return np.where(top_only > 0, 1, 0)

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)

    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)

    keylist1 = keylist.copy()
    # The probabilities are row-aligned with `dataframe`, so they must carry
    # its index; a default RangeIndex would mis-join any frame whose index
    # is not 0..n-1.
    predt2 = pd.DataFrame(predt1, columns=keylist1, index=dataframe.index)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)
    trainsent = train.sent.values
    trainlabe2 = _one_hot(train)
    testsent = test.sent.values
    testlabe2 = _one_hot(test)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
def generate_labels_with_snorkel(dataframe):
    """
    Label the full data set with Snorkel and drop the abstained rows.

    A ``label`` column is written onto the passed dataframe itself before
    the filtered selection is returned.

    :param dataframe: Pandas dataframe containing all data
    :return: the rows of the dataframe whose label is not ABSTAIN,
        including the new ``label`` column
    """
    # Labeling functions that vote on every row
    labeling_functions = [
        lf_ubo_is_company,
        lf_troika_company,
        lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company,
    ]

    # Label matrix for the unlabeled data
    label_matrix = PandasLFApplier(labeling_functions).apply(dataframe)

    # Fit the label model, then predict; ties are resolved by abstaining
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(label_matrix, n_epochs=500, log_freq=50, seed=123)
    predictions = label_model.predict(L=label_matrix,
                                      tie_break_policy="abstain")
    dataframe["label"] = predictions

    # Keep only rows that received a label
    has_label = dataframe.label != ABSTAIN
    return dataframe[has_label]
def test_lf_applier_pandas(self) -> None:
    """The applier returns L_EXPECTED with the progress bar off and on."""
    frame = pd.DataFrame({"num": DATA})
    applier = PandasLFApplier([f, g])
    for show_progress in (False, True):
        labels = applier.apply(frame, progress_bar=show_progress)
        np.testing.assert_equal(labels, L_EXPECTED)
def lf_examples(self, lf_id, n=5):
    """Return up to ``n`` training texts that labeling function ``lf_id`` voted on.

    :param lf_id: key/index into ``self.lfs``
    :param n: maximum number of examples to return
    :return: list of ``{"text": ...}`` dicts sampled with a fixed
        ``random_state`` of 13
    """
    lf = self.lfs[lf_id]
    applier = PandasLFApplier(lfs=[lf])
    # With a single LF the label matrix has exactly one column; take it so
    # the row mask below is 1-D (one boolean per row of df_train) instead
    # of a 2-D matrix.
    votes = applier.apply(df=self.df_train)[:, 0]
    # -1 is treated as "no vote" here (presumably Snorkel's ABSTAIN).
    labeled_examples = self.df_train[votes != -1]
    samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                      random_state=13)
    return [{"text": t} for t in samples["text"].values]
def train_model(self,
                df_train: pd.DataFrame,
                application_area_lfs: list,
                analysis_path: str = "output",
                label_output_path: str = "labels.jsonl",
                save_model_path: str = None):
    """Fit a label model on the labeling-function votes and export its labels.

    Two artefacts are written: a JSON-lines file with one record per kept
    row (``id``, ``is_rel``, ``p_rel``) and a CSV with the labeling-function
    summary sorted by coverage.

    :param df_train: The training data for the model; must contain a
        ``paperid`` column
    :type df_train: pd.DataFrame
    :param application_area_lfs: A list of labeling functions to use in
        training the Label Model
    :type application_area_lfs: list
    :param analysis_path: File-name prefix of the summary CSV, which is
        written to ``{PROJECT_ROOT}/output/{analysis_path}_{timestamp}.csv``,
        defaults to "output"
    :type analysis_path: str, optional
    :param label_output_path: Path to the file where the labels generated
        by the model are stored, defaults to "labels.jsonl"
    :type label_output_path: str, optional
    :param save_model_path: Where to save the Label Model. If no path is
        provided, the model is not saved
    :type save_model_path: str, optional
    """
    file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    label_matrix = PandasLFApplier(lfs=application_area_lfs).apply(
        df=df_train)

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(L_train=label_matrix, n_epochs=800, log_freq=100)
    if save_model_path is not None:
        model.save(save_model_path)

    int_labels, prob_labels = model.predict(L=label_matrix,
                                            return_probs=True,
                                            tie_break_policy="abstain")

    # The same filter is run twice, once per label kind.
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=label_matrix)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=label_matrix)

    # Both filtered frames must list the same papers in the same order,
    # because the records below pair them up by position.
    kept_ids = list(probs_df_train_filtered["paperid"])
    assert kept_ids == list(int_df_train_filtered["paperid"])

    # write out both labels.
    # In the probability outputs, p_rel is the second probability listed
    with open(f"{label_output_path}", mode="w") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            record = {
                "id": paper_id,
                # cast to int and float to get rid of nonserializable numpy types
                "is_rel": int(int_train_filtered[idx]),
                "p_rel": float(probs_train_filtered[idx][1]),
            }
            out.write(json.dumps(record) + "\n")

    # output LF analysis to csv file sorted by coverage
    lf_analysis = LFAnalysis(L=label_matrix,
                             lfs=application_area_lfs).lf_summary()
    with open(
            f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv",
            "w") as outfile:
        lf_analysis = lf_analysis.sort_values("Coverage")
        lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
def lf_mistakes(self, lf_id, n=5):
    """Return up to ``n`` dev texts where labeling function ``lf_id`` voted wrongly.

    A row counts as a mistake when the function's vote is not -1 and
    differs from the row's ``label`` value. Sampling uses a fixed
    ``random_state`` of 13.
    """
    single_lf_applier = PandasLFApplier(lfs=[self.lfs[lf_id]])
    votes = single_lf_applier.apply(df=self.df_dev).squeeze()

    voted = votes != -1
    disagrees = votes != self.df_dev["label"]
    wrong_rows = self.df_dev[voted & disagrees]

    sample_size = min(n, len(wrong_rows))
    picked = wrong_rows.sample(sample_size, random_state=13)
    return [{"text": text} for text in picked["text"].values]
def test_lf_applier_pandas(self) -> None:
    """The applier returns L_EXPECTED in every mode and empty metadata on request."""
    frame = pd.DataFrame({"num": DATA})
    applier = PandasLFApplier([f, g])

    # Progress bar off, then on: the label matrix must not change.
    for show_progress in (False, True):
        labels = applier.apply(frame, progress_bar=show_progress)
        np.testing.assert_equal(labels, L_EXPECTED)

    # With return_meta the call yields (labels, metadata).
    labels, meta = applier.apply(frame, return_meta=True)
    np.testing.assert_equal(labels, L_EXPECTED)
    self.assertEqual(meta, ApplierMetadata({}))
def apply_lfs_to_dataset(
    lfs: List[LabelingFunction],
    artifact_df: pd.DataFrame,
    save_to: AbsolutePath,
) -> np.ndarray:
    """Apply ``lfs`` to ``artifact_df``, pickle the result and return it.

    The label matrix is wrapped in a DataFrame whose columns are the
    labeling-function names and pickled to ``save_to``; the raw matrix is
    what gets returned.
    """
    label_matrix = PandasLFApplier(lfs=lfs).apply(df=artifact_df)
    column_names = [lf.name for lf in lfs]
    pd.DataFrame(label_matrix, columns=column_names).to_pickle(str(save_to))
    return label_matrix
def get_snorkel_labels(train_df, lfs, labels):
    """Fit a LabelModel on ``train_df`` and return the kept rows with their probabilities.

    :param train_df: frame the labeling functions are applied to
    :param lfs: plain functions; each is wrapped into a Snorkel labeling
        function named after the function itself
    :param labels: the label set; only its length is used (as cardinality)
    :return: (filtered frame, filtered class probabilities) as produced by
        ``filter_unlabeled_dataframe``
    """
    wrapped_lfs = []
    for fn in lfs:
        wrapped_lfs.append(labeling_function(name=fn.__name__)(fn))
    applier = PandasLFApplier(wrapped_lfs)

    label_model = LabelModel(cardinality=len(labels), verbose=True)
    votes = applier.apply(df=train_df)
    label_model.fit(votes, n_epochs=500, lr=0.001, log_freq=100, seed=123)

    class_probs = label_model.predict_proba(L=votes)
    kept_rows, kept_probs = filter_unlabeled_dataframe(X=train_df,
                                                       y=class_probs,
                                                       L=votes)
    return kept_rows, kept_probs
def get_majority_vote_label(train_df, lfs, labels):
    """Label ``train_df`` by majority vote and return the rows that got a label.

    :param train_df: frame the labeling functions are applied to
    :param lfs: plain functions; each is wrapped into a Snorkel labeling
        function named after the function itself
    :param labels: the label set; only its length is used (as cardinality)
    :return: (filtered frame, predicted labels) for the rows whose
        prediction is >= 0. Note the second element holds hard
        predictions, not probabilities.
    """
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    L_train = applier.apply(df=train_df)

    # Only the majority voter is needed here; the LabelModel that used to
    # be constructed alongside it was never used.
    majority_model = MajorityLabelVoter(cardinality=len(labels))
    preds_train = majority_model.predict(L=L_train)

    # Negative predictions are dropped (presumably Snorkel's abstain value).
    non_abstain_idxs = np.argwhere(preds_train >= 0).flatten()
    df_filtered = train_df.iloc[non_abstain_idxs]
    probs_filtered = preds_train[non_abstain_idxs]
    return df_filtered, probs_filtered
def LF_applier(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Build the keyword labeling functions and apply them to both data sets

    :param df_train: The training dataset
    :type df_train: pd.DataFrame
    :param df_test: The test dataset the same labeling functions are applied to
    :type df_test: pd.DataFrame
    :return: ``[L_train, L_test, lfns]`` - the two label matrices and the
        list of labeling functions that produced them
    :rtype: list
    """
    # Keyword groups created with make_keyword_lf's default label.
    default_label_groups = {
        "vehicle_detection": ["vehicle detection", "vehicle detector"],
        "driver_identification":
        ["driver identification", "driver identifier"],
        "human_detection": ["human detection", "human detector"],
        "license_info": ["license plate", "license number"],
        "vehicle_recognition":
        ["vehicle recognition", "vehicle identification"],
        "driving_system": ["driving system"],
    }
    # Keyword groups created with label=IRRELEVANT.
    irrelevant_groups = {
        "autonomous_vehicle": ["autonomous vehicle"],
        "driverless_vehicle":
        ["driverless cars", "driverless vehicle", "unmanned vehicle"],
        "lidar": ["lidar", "laser detection"],
        "radar": ["radar", "vehicle radar"],
        "computer_vision": ["computer vision", "opencv"],
    }

    keyword_lfs = {}
    for group, keywords in default_label_groups.items():
        keyword_lfs[group] = make_keyword_lf(keywords=keywords)
    for group, keywords in irrelevant_groups.items():
        keyword_lfs[group] = make_keyword_lf(keywords=keywords,
                                             label=IRRELEVANT)

    # Apply LFs
    # NOTE(review): the "license_info" labeling function is built above but
    # is not part of the applied list - confirm whether that is intentional.
    applied_order = [
        "vehicle_detection", "human_detection", "driver_identification",
        "vehicle_recognition", "driving_system", "autonomous_vehicle",
        "driverless_vehicle", "lidar", "radar", "computer_vision"
    ]
    lfns = [keyword_lfs[group] for group in applied_order]
    applier = PandasLFApplier(lfs=lfns)

    # Only the training application is timed and reported.
    apply_train_time_start = time()
    L_train = applier.apply(df=df_train)
    apply_train_time_end = time()
    print(
        f"LF Application Time: {apply_train_time_end - apply_train_time_start} seconds"
    )

    L_test = applier.apply(df=df_test)
    return [L_train, L_test, lfns]
def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
    """Labeling functions that read the spaCy ``doc`` field yield L_TEXT_EXPECTED."""
    spacy = SpacyPreprocessor(text_field="text", doc_field="doc")

    @labeling_function(pre=[spacy])
    def first_is_name(x: DataPoint) -> int:
        # Votes 0 when the first token is tagged as a proper noun.
        if x.doc[0].pos_ == "PROPN":
            return 0
        return -1

    @labeling_function(pre=[spacy])
    def has_verb(x: DataPoint) -> int:
        # Votes 0 when at least one token is tagged as a verb.
        if any(token.pos_ == "VERB" for token in x.doc):
            return 0
        return -1

    frame = pd.DataFrame({"text": TEXT_DATA})
    labels = PandasLFApplier([first_is_name, has_verb]).apply(
        frame, progress_bar=False)
    np.testing.assert_equal(labels, L_TEXT_EXPECTED)
def test_lf_applier_pandas_preprocessor_memoized(self) -> None:
    """A memoized preprocessor yields the expected labels and 4 tracker hits."""
    square_hit_tracker = SquareHitTracker()

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        # Every real (non-cached) call goes through the hit tracker.
        x.num_squared = square_hit_tracker(x.num)
        return x

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        if x.num_squared > 42:
            return 0
        return -1

    frame = pd.DataFrame({"num": DATA})
    labels = PandasLFApplier([f, fp_memoized]).apply(frame,
                                                     progress_bar=False)
    np.testing.assert_equal(labels, L_PREPROCESS_EXPECTED)
    self.assertEqual(square_hit_tracker.n_hits, 4)
def get_snorkel_labels(frame_to_train, pkl_name):
    """Label ``frame_to_train`` with a pickled label model and print LF coverage.

    The module-level ``lfs`` are applied, per-function coverage is printed,
    the model stored at ``pickle_files/{pkl_name}.pkl`` is loaded and its
    predictions are written to ``label_number``; ``pred_names`` maps those
    numbers through ``inv_et_dct``.

    The frame is modified in place: ``word_id``, ``text`` and
    ``label_number`` are renamed to ``word_tokens``, ``ocr`` and ``preds``
    before the new columns are added.

    :param frame_to_train: DataFrame to label
    :param pkl_name: base name (without extension) of the pickled model
    :return: the same DataFrame object, with the added columns
    :raises ValueError: from the unpacking below if ``lfs`` does not
        contain exactly ten labeling functions
    """
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)

    # One coverage fraction per labeling function, in the order of `lfs`.
    (date_parser_coverage, currency_coverage, zipcode_coverage,
     state_coverage, quntity_coverage, phonenumber_coverage, SSN_coverage,
     first_name_coverage, last_name_coverage,
     percent_coverge) = (L_train != ABSTAIN).mean(axis=0)

    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    },
                          inplace=True)
    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    print(f"date_parser_coverage: {date_parser_coverage * 100:.1f}%")
    print(f"currency_coverage: {currency_coverage * 100:.1f}%")
    print(f"zipcode_coverage: {zipcode_coverage * 100:.1f}%")
    print(f"state_coverage: {state_coverage * 100:.1f}%")
    print(f"quntity_coverage: {quntity_coverage * 100:.1f}%")
    print(f"phonenumber_coverage: {phonenumber_coverage * 100:.1f}%")
    print(f"SSN_coverage: {SSN_coverage * 100:.1f}%")
    print(f"first_name_coverage: {first_name_coverage * 100:.1f}%")
    print(f"last_name_coverage: {last_name_coverage * 100:.1f}%")

    model_path = f"pickle_files/{pkl_name}.pkl"
    print("File name I got:", model_path)
    print(f"percent_coverage: {percent_coverge * 100:.1f}%")

    # SECURITY NOTE: pickle.load executes arbitrary code from the file.
    # Only load model files produced by this project; never pass a
    # pkl_name that comes from untrusted input.
    with open(model_path, 'rb') as f:
        label_model = pickle.load(f)

    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    # Assign the filled column back: calling fillna(inplace=True) on the
    # attribute-accessed column is chained assignment and is not guaranteed
    # to update the frame.
    frame_to_train["label_number"] = frame_to_train["label_number"].fillna(0)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
def apply_lf_on_data(df_train, df_dev, sentences_number):
    """
    Apply the labeling functions (from labeled_function.py) to the given
    train data frame and tag every n-gram by majority vote.

    Within each sentence, a positively tagged n-gram whose text is
    contained in another positively tagged n-gram is removed.

    Other parameters: df_dev (kept for further development of the LFs; not
    read here) and sentences_number (number of sentence indices to scan).
    A 'tag' column is written onto the passed df_train.
    Returns the tagged train df without the abstained rows and without the
    'sentence_index' and 'n_gram_id' columns.
    """
    print("")
    print("Labeling Functions:")
    lfs = [
        labeled_function.masechet_then_parans,
        labeled_function.perek_then_parans,
        labeled_function.daf_in_parntes,
        labeled_function.no_double_parans,
        labeled_function.no_mishna,
    ]
    applier = PandasLFApplier(lfs=lfs)

    print("-Applying the labeling functions...")
    l_train = applier.apply(df=df_train)
    print_analysis(l_train, lfs)

    print("-Applying the MajorityLabelVoter...")
    preds_train = MajorityLabelVoter().predict(L=l_train)

    # put predicted labels in df train
    print("-Removing unnecessary n-grams...")
    df_train['tag'] = preds_train
    for sentence_idx in range(sentences_number):
        in_sentence = df_train.loc[df_train['sentence_index'] == sentence_idx]
        positives = in_sentence.loc[in_sentence['tag'] == 1]
        ngram_ids = positives['n_gram_id']
        texts = positives['text']
        # Drop a positive n-gram as soon as one *other* positive n-gram of
        # the same sentence contains its text.
        for checked in positives.index:
            is_nested = any(
                ngram_ids[checked] != ngram_ids[other]
                and texts[checked] in texts[other]
                for other in positives.index)
            if is_nested:
                df_train = df_train[df_train.n_gram_id != ngram_ids[checked]]

    print("-Dropping the abstained and extra columns...")
    df_train = df_train.drop(["sentence_index", "n_gram_id"], axis=1)
    df_train = df_train[df_train['tag'] != ABSTAIN]
    print("DONE")
    return df_train
def test_labeling_convergence(self) -> None:
    """End-to-end check: LF application plus LabelModel stays under 5% error."""
    # f plus one positive and one negative LF for every divisor 2..8
    labeling_functions = [f]
    labeling_functions.extend(
        get_positive_labeling_function(divisor) for divisor in range(2, 9))
    labeling_functions.extend(
        get_negative_labeling_function(divisor) for divisor in range(2, 9))

    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    expected_shape = (self.N_TRAIN, len(labeling_functions))
    self.assertEqual(L_train.shape, expected_shape)

    # Fit the label model and take the most probable class per row
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    predicted = label_model.predict_proba(L_train).argmax(axis=1)

    # Fraction of rows whose prediction differs from the true y
    truth = self.df_train.y
    mismatches = np.where(truth != predicted, 1, 0)
    err = mismatches.sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv',
                   source_name='../../pandafied_data/pandafied_twitter.csv'):
    """Keep only the tweets the label model marks as WATER and save them.

    The tweets are read from ``source_name``, labelled by a ``LabelModel``
    fitted on the ``lf_keyword_wateroverlast`` labeling function, every
    WATER tweet is printed, and the WATER rows (without the ``label``
    column) are written to ``save_name``.

    :param save_name: path of the CSV to write
    :param source_name: path of the CSV to read; must have a ``text`` column
    """
    df_train = pd.read_csv(source_name)

    # Define the set of labeling functions (LFs)
    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train,
                                            tie_break_policy="abstain")

    # Select the WATER rows once and reuse the selection for printing,
    # counting and saving (no positional loop over label-based lookups).
    water_rows = df_train[df_train.label == WATER]
    for text, label in zip(water_rows["text"], water_rows["label"]):
        print()
        print(text)
        print(label)
        print()

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(len(water_rows)))

    twitter_curated = water_rows.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
def apply_metamap(self, dataset: DataFrame, max_rows: int = 10):
    """
    Apply the ``metamap`` labeling function to the head of each split.

    :param dataset: frame with a ``split`` column holding "train"/"test"
    :param max_rows: number of leading rows of each split to label
        (defaults to 10)
    :return: (train label matrix, test label matrix)
    """
    train_data = dataset.loc[dataset["split"] == "train"]
    test_data = dataset.loc[dataset["split"] == "test"]

    applier = PandasLFApplier(lfs=[metamap])
    l_metamap_train = applier.apply(df=train_data.head(max_rows))
    l_metamap_test = applier.apply(df=test_data.head(max_rows))
    return l_metamap_train, l_metamap_test
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):
    """Fit a binary LabelModel on the supply labeling functions and report its scores.

    Accuracy is printed for the test set; F1 and ROC-AUC are printed for
    the development set.

    :return: (fitted label model, label matrix of the training set)
    """
    # All labeling functions for the supply relation
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer,
        lf_acquisition, lf_people, lf_sold, lf_relation, lf_competition
    ]

    # One applier, used for the development, train and test frames in turn
    applier = PandasLFApplier(supply_lfs)
    L_dev, L_train, L_test = (applier.apply(frame)
                              for frame in (df_dev, df_train, df_test))

    # cardinality: 2 (True and False)
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # Accuracy of the label model on the test set
    test_scores = label_model.score(L=L_test,
                                    Y=Y_test,
                                    tie_break_policy="random")
    label_model_acc = test_scores["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # F-1 and ROC_AUC on the development set
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )
    return label_model, L_train
def train(self):
    '''
    Train the logistic regression discriminative model

    Labels come from a LabelModel fitted on the votes of self.lfs over
    self.df_train. The test accuracy is printed, and the classifier and
    vectorizer are dumped to 'sklearn_model.joblib' and
    'vectorizer.joblib'.
    '''
    # Gold labels of the test set, needed for scoring at the end
    Y_test = self.df_test.label.values

    votes = PandasLFApplier(lfs=self.lfs).apply(df=self.df_train)

    # Combine the votes with a label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=votes, n_epochs=500, log_freq=100, seed=123)
    train_probs = label_model.predict_proba(L=votes)

    # Keep the rows retained by the filter helper
    kept_rows, kept_probs = filter_unlabeled_dataframe(X=self.df_train,
                                                       y=train_probs,
                                                       L=votes)

    # Count features over 1- to 5-grams, fitted on the kept training text
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    X_train = vectorizer.fit_transform(kept_rows.text.tolist())
    X_test = vectorizer.transform(self.df_test.text.tolist())

    # Probabilities -> hard labels
    train_labels = probs_to_preds(probs=kept_probs)

    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=X_train, y=train_labels)
    print(
        f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
    )

    # Persist both fitted objects
    for fitted, target in ((sklearn_model, 'sklearn_model.joblib'),
                           (vectorizer, 'vectorizer.joblib')):
        dump(fitted, target)
def labeling_evaluation(df_train, df_test, label_model):
    """Label ``df_train`` with the chosen aggregation model and report test accuracy.

    :param df_train: frame to label
    :param df_test: frame with a ``label`` column used for the accuracy
    :param label_model: ``"majority"`` (MajorityLabelVoter) or
        ``"weighted"`` (LabelModel)
    :return: for ``"majority"``:
        ``(df_train_filtered, preds_train_filtered, analysis)``;
        for ``"weighted"``:
        ``(df_train_filtered, probs_train_filtered, preds_train_filtered,
        analysis)``
    :raises ValueError: if ``label_model`` is neither of the two names
        (previously the function fell through and returned ``None``)
    """
    if label_model not in ("majority", "weighted"):
        raise ValueError(
            f"label_model must be 'majority' or 'weighted', got {label_model!r}")

    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    # label_model == "weighted"
    # Cardinality is the number of non-dunder attributes of Polarity.
    n_classes = len([c for c in dir(Polarity) if not c.startswith("__")])
    # A separate name keeps the string parameter from being shadowed.
    weighted_model = LabelModel(cardinality=n_classes, verbose=True)
    weighted_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    probs_train = weighted_model.predict_proba(L_train)
    label_model_acc = weighted_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)
    preds_train_filtered = probs_to_preds(probs_train_filtered)
    return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def weak_supervisor(dataframe, model_type):
    """Add a ``weak_labels`` column and return the rows that received a label.

    :param dataframe: frame to label; the ``weak_labels`` column is written
        onto it in place
    :param model_type: ``"label_model"`` fits a LabelModel; any other value
        uses a MajorityLabelVoter
    :return: the rows whose ``weak_labels`` value is not -1
    """
    labeling_functions = [
        positive_labeling_function,
        positive1_labeling_function,
        negative_labeling_function,
        negative1_labeling_function,
    ]
    label_training_matrix = PandasLFApplier(lfs=labeling_functions).apply(
        df=dataframe)

    # Pick the aggregator; only the label model needs fitting.
    if model_type == "label_model":
        aggregator = LabelModel(cardinality=2, verbose=True)
        aggregator.fit(L_train=label_training_matrix,
                       n_epochs=300,
                       log_freq=50,
                       seed=123)
    else:
        aggregator = MajorityLabelVoter()

    dataframe["weak_labels"] = aggregator.predict(L=label_training_matrix)
    print("dataframe shape: ", dataframe.shape)

    labeled_rows = dataframe[dataframe["weak_labels"] != -1]
    print("dataframe shape after filtering: ", labeled_rows.shape)
    return labeled_rows
def getTrainedModel2(self):
    """Label ``self.train`` with a LabelModel, fit a classifier and return its probabilities.

    Writes a ``resolution`` column onto ``self.train``, fits a logistic
    regression on 1-2-gram counts of the non-abstained ``comments`` and
    returns ``clf.predict_proba(self.test)``. An ``EndModel`` is also
    trained, but it is neither returned nor stored.
    """
    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(self.LFs)
    # NOTE(review): only the 'comments' column (a Series) is handed to the
    # applier here - confirm PandasLFApplier.apply accepts that, or whether
    # the whole frame was intended.
    L_train = applier.apply(self.train['comments'])

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    self.train['resolution'] = label_model.predict(
        L=L_train, tie_break_policy="abstain")

    # Drop the rows on which the label model abstained
    df_train = self.train[self.train.resolution != self.ABSTAIN]
    train_text = df_train.comments.tolist()
    # The fitted vectorizer is not kept, so it cannot be reused later.
    X_train = CountVectorizer(ngram_range=(1, 2)).fit_transform(train_text)
    clf = LogisticRegression(solver="lbfgs")
    clf.fit(X=X_train, y=df_train.resolution.values)
    # NOTE(review): the classifier was fitted on n-gram counts, but
    # self.test is passed here without going through a vectorizer -
    # confirm what self.test holds at this point.
    prob = clf.predict_proba(self.test)

    # Use the GPU when one is available
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    end_model = EndModel([1000, 10, 2], seed=123, device=device)
    # NOTE(review): the training pair is (train comments, test comments) and
    # valid_data is (train resolution, test comments) - this looks like a
    # mix-up of features and labels; verify against the EndModel API.
    end_model.train_model(
        (self.train['comments'], self.test['comments']),
        valid_data=(self.train['resolution'], self.test['comments']),
        lr=0.01,
        l2=0.01,
        batch_size=256,
        n_epochs=5,
        checkpoint_metric='accuracy',
        checkpoint_metric_mode='max')
    return prob
def test_lf_applier_pandas_fault(self) -> None:
    """A failing LF raises by default and is counted when fault_tolerant is set."""
    frame = pd.DataFrame({"num": DATA})
    applier = PandasLFApplier([f, f_bad])

    # Default mode: the error from f_bad propagates.
    with self.assertRaises(AttributeError):
        applier.apply(frame, progress_bar=False)

    tolerant = dict(progress_bar=False, fault_tolerant=True)

    # Fault-tolerant mode: labels only.
    labels = applier.apply(frame, **tolerant)
    np.testing.assert_equal(labels, L_EXPECTED_BAD)

    # Fault-tolerant mode with metadata: f_bad is reported with 5 faults.
    labels, meta = applier.apply(frame, return_meta=True, **tolerant)
    np.testing.assert_equal(labels, L_EXPECTED_BAD)
    self.assertEqual(meta, ApplierMetadata({"f_bad": 5}))
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    """Annotate ``documents`` with event trigger and role probabilities.

    :param documents: documents to label; default events are added when
        neither ``event_triggers`` nor ``event_roles`` is present
    :param trigger_label_model: label model for the trigger label matrix
    :param role_label_model: label model for the role label matrix
    :return: the result of ``ace_formatter.snorkel_to_ace_format`` on a
        copy of the documents whose ``event_triggers`` / ``event_roles``
        columns hold the merged predictions
    """
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)

    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)

    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()
    # Make sure to remove event_triggers and roles that were built per default.
    # This has to be a column assignment: rows yielded by iterrows() are
    # copies, so writing to them leaves the frame unchanged. Each row gets
    # its own fresh list.
    for column in ('event_triggers', 'event_roles'):
        labeled_documents[column] = pd.Series(
            [[] for _ in range(len(labeled_documents))],
            index=labeled_documents.index,
            dtype=object)

    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]

    labeled_documents.update(triggers)
    labeled_documents.update(roles)

    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
def apply_heuristics(args) -> Dict[str, Any]:
    """Apply all bug / non-bug heuristics to the train and test sets and collect statistics.

    Both label matrices are dumped to the paths given in ``args``.

    :param args: namespace providing ``dataset_path_train``,
        ``dataset_path_test``, ``rows_train``, ``rows_test``,
        ``save_heuristics_matrix_train_to`` and
        ``save_heuristics_matrix_test_to``
    :return: dict with the number of training commits, the label
        fractions, the number of labeling functions, the coverage and the
        majority-vote accuracy for both sets
    """
    def _coverage(label_matrix):
        # Share of rows on which at least one heuristic returned something
        # other than -1; counted with the array's own sum, not a Python loop.
        return (label_matrix != -1).any(axis=1).sum() / len(label_matrix)

    stats: Dict[str, Any] = {}

    train_filename = PROJECT_DIR / args.dataset_path_train
    test_filename = PROJECT_DIR / args.dataset_path_test

    df_train = pd.read_csv(train_filename, sep=';', index_col=0,
                           nrows=args.rows_train)
    df_test = pd.read_csv(test_filename, sep=';', index_col=0,
                          nrows=args.rows_test)
    stats['commits_train'] = len(df_train.index)
    stats['bugs_fraction'] = (df_train.label.values == BUG).mean()
    stats['nonbugs_fraction'] = (df_train.label.values == BUGLESS).mean()

    clean_text_columns(df_train)
    clean_text_columns(df_test)

    lfs = bugs.heuristics + nonbugs.heuristics
    stats['n_labeling_functions'] = len(lfs)

    # One applier serves both data sets.
    applier = PandasLFApplier(lfs=lfs)

    L_dev = applier.apply(df=df_train)
    L_dev.dump(PROJECT_DIR / args.save_heuristics_matrix_train_to)

    L_test = applier.apply(df=df_test)
    L_test.dump(PROJECT_DIR / args.save_heuristics_matrix_test_to)

    stats['coverage_train'] = _coverage(L_dev)
    stats['coverage_test'] = _coverage(L_test)

    stats['majority_accuracy_train'] = majority_acc(L_dev, df_train)
    stats['majority_accuracy_test'] = majority_acc(L_test, df_test)

    return stats
# Our next step is to apply the labeling functions we wrote to the unlabeled training data. # The result is a *label matrix*, `L_train`, where each row corresponds to a data point and each column corresponds to a labeling function. # Since the labeling functions have unknown accuracies and correlations, their output labels may overlap and conflict. # We use the `LabelModel` to automatically estimate their accuracies and correlations, reweight and combine their labels, and produce our final set of clean, integrated training labels: # %% from snorkel.labeling.model import LabelModel from snorkel.labeling import PandasLFApplier # Define the set of labeling functions (LFs) lfs = [ lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity ] # Apply the LFs to the unlabeled training data applier = PandasLFApplier(lfs) L_train = applier.apply(df_train) # Train the label model and compute the training labels label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123) df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain") # %% [markdown] # Note that we used the `LabelModel` to label data; however, on many data points, all the labeling functions abstain, and so the `LabelModel` abstains as well. # We'll filter these data points out of our training set now: # %% df_train = df_train[df_train.label != ABSTAIN] # %% [markdown]
# [`PandasLFApplier`](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.PandasLFApplier.html). # Correspondingly, a single data point `x` that's passed into our LFs will be a [Pandas `Series` object](https://pandas.pydata.org/pandas-docs/stable/reference/series.html). # # It's important to note that these LFs will work for any object with an attribute named `text`, not just Pandas objects. # Snorkel has several other appliers for different data point collection types which you can browse in the [API documentation](https://snorkel.readthedocs.io/en/master/packages/labeling.html). # # The output of the `apply(...)` method is a ***label matrix***, a fundamental concept in Snorkel. # It's a NumPy array `L` with one column for each LF and one row for each data point, where `L[i, j]` is the label that the `j`th labeling function output for the `i`th data point. # We'll create one label matrix for the `train` set and one for the `dev` set. # %% {"tags": ["md-exclude-output"]} from snorkel.labeling import PandasLFApplier lfs = [check_out, check] applier = PandasLFApplier(lfs=lfs) L_train = applier.apply(df=df_train) L_dev = applier.apply(df=df_dev) # %% L_train # %% [markdown] # ### c) Evaluate performance on training and development sets # %% [markdown] # We can easily calculate the coverage of these LFs (i.e., the percentage of the dataset that they label) as follows: # %% coverage_check_out, coverage_check = (L_train != ABSTAIN).mean(axis=0) print(f"check_out coverage: {coverage_check_out * 100:.1f}%")