def test_majority_label_vote(self):
    L = np.array([[0, 1, 0], [0, 1, 0], [1, 0, 0], [-1, -1, 1]])
    ml_voter = MajorityLabelVoter()
    Y_p = ml_voter.predict_proba(L)
    Y_p_true = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    np.testing.assert_array_almost_equal(Y_p, Y_p_true)
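# A minimal sketch of the voter's tie behavior (assumed inputs, not part of
# the test above): on an exact tie the probability mass is split evenly, and
# predict() abstains by default.
L_tie = np.array([[0, 1, -1]])
mv = MajorityLabelVoter()
print(mv.predict_proba(L_tie))  # [[0.5, 0.5]]
print(mv.predict(L_tie))        # [-1] (default tie_break_policy="abstain")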
def labeling_evaluation(df_train, df_test, label_model):
    lfs = [
        LabelingFunction.lf_ind_keyword,
        LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re,
        LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re,
        LabelingFunction.industry_cls
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        label_model = LabelModel(
            cardinality=len([c for c in dir(Polarity) if not c.startswith("__")]),
            verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
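# Hypothetical call site (a sketch; assumes df_train/df_test carry the fields
# the LFs read and df_test has a gold "label" column). Note the "weighted"
# branch returns one extra element, the filtered probabilistic labels:
df_filtered, preds_filtered, analysis = labeling_evaluation(
    df_train, df_test, label_model="majority")
print(analysis)  # per-LF polarity, coverage, overlaps, and conflicts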
def weak_supervisor(dataframe, model_type):
    labeling_functions = [positive_labeling_function, positive1_labeling_function,
                          negative_labeling_function, negative1_labeling_function]
    pandasApplier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = pandasApplier.apply(df=dataframe)

    if model_type == "label_model":
        # constructing a probabilistic label model
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
        dataframe["weak_labels"] = label_model.predict(L=label_training_matrix)
    else:
        majorityLabelVoter = MajorityLabelVoter()
        dataframe["weak_labels"] = majorityLabelVoter.predict(L=label_training_matrix)

    print("dataframe shape: ", dataframe.shape)
    dataframe = dataframe[dataframe["weak_labels"] != -1]
    print("dataframe shape after filtering: ", dataframe.shape)
    return dataframe
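# Hypothetical usage (assumes `df` carries the text field the labeling
# functions inspect); rows where the model abstains (-1) are dropped inside
# the function:
labeled_df = weak_supervisor(df, model_type="label_model")
print(labeled_df["weak_labels"].value_counts())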
def model_analysis(label_model: LabelModel, training_set: pd.DataFrame,
                   L_train: np.ndarray, L_test: np.ndarray, Y_test: np.ndarray,
                   lfs: list, output_file="output") -> None:
    # TODO: consider using **kwargs instead of this painful list of arguments
    """Output analysis for the label model to a file

    :param label_model: The current label model which we want to output analysis for
    :type label_model: LabelModel
    :param training_set: A dataframe containing the training dataset
    :type training_set: pd.DataFrame
    :param L_train: The matrix of labels generated by the labeling functions on the training data
    :type L_train: np.ndarray
    :param L_test: The matrix of labels generated by the labeling functions on the testing data
    :type L_test: np.ndarray
    :param Y_test: Gold labels associated with data points in L_test
    :type Y_test: np.ndarray
    :param lfs: List of labeling functions
    :type lfs: list
    :param output_file: A path where the output file should be written to, defaults to `PROJECT_ROOT/output`
    :type output_file: str, optional
    """
    Y_train = label_model.predict_proba(L=L_train)
    Y_pred = label_model.predict(L=L_test, tie_break_policy="abstain")

    lf_analysis_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    # TODO: Write this df to an output file. Ask Jennifer about how to handle this
    print(lf_analysis_train)

    # build majority label voter model
    majority_model = MajorityLabelVoter()
    majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])
    label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain",
                                        metrics=["f1", "accuracy"])

    # get precision and recall scores
    p_score = precision_score(y_true=Y_test, y_pred=Y_pred, average='weighted')
    r_score = recall_score(y_true=Y_test, y_pred=Y_pred, average='weighted',
                           labels=np.unique(Y_pred))

    # how many documents abstained: filter_unlabeled_dataframe keeps only rows
    # that received at least one LF label, so the abstained count is the
    # difference in length
    probs_train = majority_model.predict_proba(L=L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=training_set, y=probs_train, L=L_train)
    n_abstained = len(training_set) - len(df_train_filtered)

    # bucket test predictions against gold labels; buckets are keyed by
    # (y_true, y_pred), so (1, 0) is a false negative and (0, 1) a false positive
    buckets = get_label_buckets(Y_test, Y_pred)
    true_positives = buckets.get((1, 1))
    false_positives = buckets.get((0, 1))
    true_negatives = buckets.get((0, 0))
    false_negatives = buckets.get((1, 0))
    abstained_positives = buckets.get((1, -1))
    abstained_negatives = buckets.get((0, -1))

    # write analysis to file
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"../output/logs/{output_file}_run_{timestamp}.txt", "w") as out:
        out.write(f"{'Majority Vote Accuracy:':<25} {majority_acc['accuracy'] * 100:.2f}%")
        out.write(f"\n{'Majority Vote F1 Score:':<25} {majority_acc['f1'] * 100:.2f}%")
        out.write(f"\n{'Label Model Accuracy:':<25} {label_model_acc['accuracy'] * 100:.2f}%")
        out.write(f"\n{'Label Model F1 Score:':<25} {label_model_acc['f1'] * 100:.2f}%")
        out.write(f"\n{'Precision Score:':<25} {p_score * 100:.2f}%")
        out.write(f"\n{'Recall Score:':<25} {r_score * 100:.2f}%")
        out.write(f"\n{'Abstained Data Points:':<25} {n_abstained}")
        out.write(f"\n{'True Positives:':<25} {len(true_positives) if true_positives is not None else 0}")
        out.write(f"\n{'False Positives:':<25} {len(false_positives) if false_positives is not None else 0}")
        out.write(f"\n{'False Negatives:':<25} {len(false_negatives) if false_negatives is not None else 0}")
        out.write(f"\n{'True Negatives:':<25} {len(true_negatives) if true_negatives is not None else 0}")
        out.write(f"\n{'Abstained Positives:':<25} {len(abstained_positives) if abstained_positives is not None else 0}")
        out.write(f"\n{'Abstained Negatives:':<25} {len(abstained_negatives) if abstained_negatives is not None else 0}")
print('applying labelling functions to data...')
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_dev = applier.apply(df=X_dev)

print('fitting Label Model')
label_model = LabelModel(cardinality=config['cardinality'], verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
label_model_acc = label_model.score(L=L_dev, Y=y_dev,
                                    tie_break_policy="random")["accuracy"]
print(f'label model acc: {label_model_acc}')

print('fitting Majority Label Voter model')
majority_model = MajorityLabelVoter(cardinality=config['cardinality'])
# preds_train = majority_model.predict(L=L_train)
majority_acc = majority_model.score(L=L_dev, Y=np.array(y_dev),
                                    tie_break_policy="random")["accuracy"]
print(f'majority_label_acc: {majority_acc}')

log_metric('majority_label_acc', majority_acc)
log_metric('label_model_acc', label_model_acc)

probs_train = label_model.predict_proba(L=L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=probs_train, L=L_train)

print('setting up Label Model')
stop_words = config['stop_words']
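# Typical follow-up (a sketch): convert the filtered probabilistic labels to
# hard labels before training a downstream classifier.
from snorkel.utils import probs_to_preds
preds_train_filtered = probs_to_preds(probs_train_filtered)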
    keyword_verb
]

# apply label functions
applier = PandasLFApplier(lfs=lfs)
# create a label matrix for the training set
L_train = applier.apply(df=data_train)
# create a label matrix for the test set
L_test = applier.apply(df=data_test)

# summary statistics for the LFs
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

# take the majority vote on a per-data point basis
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

# use LabelModel to produce training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

# result using majority-vote model
Y_test = data_test.label.values
majority_acc = majority_model.score(L=L_test, Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# results using label model
label_model_acc = label_model.score(L=L_test, Y=Y_test,
                                    tie_break_policy="random")["accuracy"]
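# Common next step (a sketch): report the label model score and keep only
# rows that received at least one LF label before training an end model.
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
from snorkel.labeling import filter_unlabeled_dataframe
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=data_train, y=label_model.predict_proba(L=L_train), L=L_train)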
lfs += allKeywordLFGemeentenBE
lfs += allKeywordLFGemeentenNL
lfs += allKeywordNamedEntBE
lfs += allKeywordNamedEntNL

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

result = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(result)

from snorkel.labeling.model import MajorityLabelVoter
majority_model = MajorityLabelVoter(cardinality=2)
preds_train_majority = majority_model.predict(L=L_train)

from snorkel.labeling.model import LabelModel
label_model = LabelModel(cardinality=2, verbose=True, device='cuda')
# according to location data, BE tweets = 10-15%
label_model.fit(L_train=L_train, n_epochs=500, class_balance=[0.15, 0.85],
                log_freq=100, seed=82)
preds_train_label = label_model.predict(L=L_train)

L_dev = applier.apply(df=df_dev)
mapping = {'BE': 0, 'NL': 1}
Y_dev = np.array([mapping[i] for i in df_dev['label']])
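# Natural next step (a sketch): score both models on the dev split against
# the mapped gold labels constructed above.
majority_dev_acc = majority_model.score(L=L_dev, Y=Y_dev,
                                        tie_break_policy="random")["accuracy"]
label_model_dev_acc = label_model.score(L=L_dev, Y=Y_dev,
                                        tie_break_policy="random")["accuracy"]
print(f"majority: {majority_dev_acc:.3f}  label model: {label_model_dev_acc:.3f}")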
                lr=0.05,
                class_balance=[0.7, 0.3],
                n_epochs=100)

# %%
Y_probs_valid = label_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %% [markdown]
# **Majority Vote**

# %%
from snorkel.labeling.model import MajorityLabelVoter

mv_model = MajorityLabelVoter()
Y_probs_valid = mv_model.predict_proba(L_valid)
Y_preds_valid = probs_to_preds(Y_probs_valid)
metric_score(Y_valid, Y_preds_valid, probs=None, metric="f1")

# %%
# from metal.tuners import RandomSearchTuner

# # Creating search space
# search_space = {
#     "l2": {"range": [0.0001, 0.1], "scale": "log"},  # linear range
#     "lr": {"range": [0.0001, 0.1], "scale": "log"},  # log range
# }

# searcher = RandomSearchTuner(LabelModel, log_dir="./run_logs", log_writer_class=None)
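# %%
# The MeTaL tuner above is commented out; a minimal stand-in (a sketch,
# assuming L_train, L_valid, and Y_valid from the earlier cells) is a plain
# grid search over the LabelModel learning rate:
best_f1, best_lr = -1.0, None
for lr in (0.01, 0.05, 0.1):
    lm = LabelModel(cardinality=2, verbose=False)
    lm.fit(L_train=L_train, lr=lr, class_balance=[0.7, 0.3], n_epochs=100, seed=123)
    f1 = metric_score(Y_valid, probs_to_preds(lm.predict_proba(L_valid)),
                      probs=None, metric="f1")
    if f1 > best_f1:
        best_f1, best_lr = f1, lr
print(best_lr, best_f1)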
def majority_acc(line: np.ndarray, label_series: Series) -> float:
    majority_model = MajorityLabelVoter()
    maj_model_train_acc = majority_model.score(
        L=line, Y=label_series.values, tie_break_policy="random")["accuracy"]
    return maj_model_train_acc
def majority_acc(L: np.ndarray, df: pd.DataFrame) -> float:
    majority_model = MajorityLabelVoter()
    maj_model_train_acc = majority_model.score(
        L=L, Y=df.label.values, tie_break_policy="random")["accuracy"]
    return maj_model_train_acc
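# Example call (a sketch; assumes L_dev came from an LFApplier over df_dev
# and df_dev carries a gold "label" column):
acc = majority_acc(L_dev, df_dev)
print(f"majority vote accuracy: {acc:.3f}")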