def labeling_evaluation(df_train, df_test, label_model):
    """Apply the project's labeling functions and train/evaluate a label model.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Unlabeled training data consumable by the labeling functions.
    df_test : pandas.DataFrame
        Held-out data; must expose a ``label`` column used as ground truth.
    label_model : str
        ``"majority"`` for per-point majority vote, ``"weighted"`` for a
        trained Snorkel ``LabelModel``.

    Returns
    -------
    tuple
        ``"majority"``: ``(df_train_filtered, preds_train_filtered, analysis)``.
        ``"weighted"``: ``(df_train_filtered, probs_train_filtered,
        preds_train_filtered, analysis)``.

    Raises
    ------
    ValueError
        If ``label_model`` is neither ``"majority"`` nor ``"weighted"``
        (previously this case silently returned ``None``).
    """
    lfs = [
        LabelingFunction.lf_ind_keyword,
        LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re,
        LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re,
        LabelingFunction.industry_cls,
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    # Per-LF summary statistics (coverage, overlap, conflicts) on the train matrix.
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        # Drop rows on which every LF abstained.
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        # Cardinality = number of non-dunder attributes on Polarity, i.e. the
        # declared polarity classes.
        cardinality = len(
            [c for c in dir(Polarity) if not c.startswith("__")])
        # Use a distinct local name: the original rebound the *parameter*
        # `label_model` to the model instance, shadowing the mode string.
        weighted_model = LabelModel(cardinality=cardinality, verbose=True)
        weighted_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = weighted_model.predict_proba(L_train)
        label_model_acc = weighted_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
        # Drop rows on which every LF abstained.
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return (df_train_filtered, probs_train_filtered,
                preds_train_filtered, analysis)

    raise ValueError(
        f"label_model must be 'majority' or 'weighted', got {label_model!r}")
def weak_supervisor(dataframe, model_type):
    """Weakly label *dataframe* with the module's labeling functions.

    Adds a ``weak_labels`` column produced by either a trained Snorkel
    ``LabelModel`` (``model_type == "label_model"``) or a
    ``MajorityLabelVoter`` (any other value), then drops rows where every
    labeling function abstained (label ``-1``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input rows; mutated in place by the column assignment before the
        filtered copy is returned.
    model_type : str
        ``"label_model"`` selects the probabilistic model; anything else
        falls back to majority vote.

    Returns
    -------
    pandas.DataFrame
        The rows that received a non-abstain weak label.
    """
    labeling_functions = [
        positive_labeling_function,
        positive1_labeling_function,
        negative_labeling_function,
        negative1_labeling_function,
    ]
    applier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = applier.apply(df=dataframe)

    # Only the model construction differs between the two modes; the
    # predict/filter tail was previously duplicated in both branches.
    if model_type == "label_model":
        # constructing a probabilistic label model
        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=label_training_matrix,
                  n_epochs=300, log_freq=50, seed=123)
    else:
        model = MajorityLabelVoter()

    dataframe["weak_labels"] = model.predict(L=label_training_matrix)
    print("dataframe shape: ", dataframe.shape)
    # -1 marks rows on which all labeling functions abstained.
    dataframe = dataframe[dataframe["weak_labels"] != -1]
    print("dataframe shape after filtering: ", dataframe.shape)
    return dataframe
]
# Apply the labeling functions to both splits.
applier = PandasLFApplier(lfs=lfs)
# Create a label matrix for the training set.
L_train = applier.apply(df=data_train)
# Create a label matrix for the test set.
L_test = applier.apply(df=data_test)
# Summary statistics (coverage/overlap/conflict) for the LFs.
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)
# Take the majority vote on a per-data-point basis.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)
# Use LabelModel to produce (probabilistic) training labels.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
# Accuracy of the majority-vote model on the test split.
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
# Accuracy of the trained label model (call continues past this view).
label_model_acc = label_model.score(L=L_test, Y=Y_test,