def get_L_final_filter(L_train, method='model'): L_final = [] if len(L_train[0]) < 3: method = 'absolute' else: method = 'model' ## TEMPORARY MEASURE method = 'absolute' ## if method == 'absolute': ## Absolute Method: Any 'irrelevant' keywords matched will be flagged as irrelevant for array in L_train: if 0 in array: L_final.append(0) else: L_final.append(1) else: ## Label Model label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123) L_final = label_model.predict(L=L_train,return_probs=False) return L_final
def test_label_model_sparse(self) -> None:
    """Check the LabelModel's P and Y estimates on a sparse synthetic dataset.

    Covers the common regime where LFs abstain most of the time, which
    previously broke when parameter clamping was set too high (see
    Issue #1422).
    """
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        self.n, self.m, self.cardinality, abstain_multiplier=1000.0
    )

    # Fit the model on the sparse matrix.
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

    # Estimated LF conditional probabilities should match the truth.
    np.testing.assert_array_almost_equal(
        P, label_model.get_conditional_probs(), decimal=2
    )

    # Judge accuracy only on points where the model did not abstain.
    Y_pred = label_model.predict(L, tie_break_policy="abstain")
    (idx,) = np.where(Y_pred != -1)
    acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
    self.assertGreaterEqual(acc, 0.65)

    # Per issue #1422: never output abstain on a point where an LF voted.
    self.assertEqual(len(idx), np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
def test_score(self):
    """Verify score() metrics against hand-computed expectations."""
    L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    Y = np.array([1, 0, 1])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=100)

    # The all-abstain row predicts -1, so coverage is 2/3.
    results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
    np.testing.assert_array_almost_equal(
        label_model.predict(L), np.array([1, -1, 1])
    )
    self.assertEqual(results, dict(accuracy=1.0, coverage=2 / 3))

    # With clamped parameters, accuracy depends on which gold labels
    # we score against.
    L = np.array([[1, 0, 1], [1, 0, 1]])
    label_model = self._set_up_model(L)
    label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))

    results = label_model.score(L, Y=np.array([0, 1]))
    self.assertEqual(results, dict(accuracy=0.5))

    results = label_model.score(L=L, Y=np.array([1, 0]), metrics=["accuracy", "f1"])
    self.assertEqual(results, dict(accuracy=0.5, f1=2 / 3))
def main(data_path, output_path):
    """Label a parquet dataset with probabilistic labels and write it back out."""
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Apply the labeling functions across the Spark RDD.
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    L = SparkLFApplier(lfs).apply(data.rdd)

    # Fit the generative label model on the label matrix.
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Attach P(y=1) to every row and persist the labeled dataset.
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
def labelmodel_predict(L_train, y_true, L_test, return_probs=False, **kwargs):
    """Fit a Snorkel LabelModel on L_train and predict labels for L_test.

    `y_true` supplies both the cardinality (number of distinct non-NaN
    gold values) and the dev labels passed to fit(). Extra **kwargs are
    forwarded to LabelModel.fit().
    """
    kwargs.setdefault('n_epochs', 500)
    kwargs.setdefault('log_freq', 100)
    from snorkel.labeling.model import LabelModel
    # Cardinality = number of distinct non-NaN gold label values.
    n = len(set(y_true[~y_true.isna()].values))
    log.info('y_true values: %s', set(y_true[~y_true.isna()].values))
    label_model = LabelModel(cardinality=n, verbose=True)
    # Diagnostic logging: the label vocabularies of L_train and y_true
    # should largely overlap; differences are logged for review.
    L_train_val = set(L_train.values.flatten())
    y_true_val = set(y_true.values.flatten())
    log.info('Values in L_train but not y_true: %s', L_train_val - y_true_val)
    log.info('Values in y_true but not L_train: %s', y_true_val - L_train_val)
    # to_numbered presumably maps raw label values onto the contiguous
    # integer codes Snorkel expects — TODO confirm against its definition.
    L_train, Y_dev = to_numbered(L_train, y_true)
    log.info('L_train values: %s, %s', set(L_train.flatten()), type(L_train))
    log.info('Y_dev values: %s, %s', set(Y_dev.flatten()), type(Y_dev))
    log.info('kwargs: %s', kwargs)
    # NOTE(review): Y_dev is filtered to drop -1 entries (abstain/missing
    # under the numbering scheme) — verify this matches fit()'s contract.
    label_model.fit(L_train=L_train, Y_dev=Y_dev[Y_dev != -1], **kwargs)
    y_pred = label_model.predict(to_numbered(L_test, y_true)[0],
                                 return_probs=return_probs)
    if return_probs:
        y_pred, y_score = y_pred
    # Map integer codes back onto the original label values.
    y_pred = from_numbered(L_test, y_true, y_pred)
    return (y_pred, y_score) if return_probs else y_pred
def test_sparse_and_regular_make_same_probs(self) -> None:
    """Sparse and dense label models should learn the same conditional probs."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        self.known_dimensions.num_examples,
        self.known_dimensions.num_functions,
        self.known_dimensions.num_classes,
    )

    # Convert the dense matrix into per-example event-id lists,
    # skipping abstains (-1).
    num_classes = self.known_dimensions.num_classes
    example_event_lists: List[ExampleEventListOccurence] = []
    for example in L:
        events = [
            func_id * num_classes + cls_id
            for func_id, cls_id in enumerate(example)
            if cls_id > -1
        ]
        example_event_lists.append(ExampleEventListOccurence(events))

    sparse_model = SparseExampleEventListLabelModel()
    sparse_model.fit_from_sparse_example_event_list(
        example_event_list=example_event_lists,
        known_dimensions=self.known_dimensions,
        n_epochs=200,
        lr=0.01,
        seed=123,
    )

    label_model = LabelModel(cardinality=num_classes)
    label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Both training paths must land on the same conditional probabilities.
    np.testing.assert_array_almost_equal(
        sparse_model.get_conditional_probs(),
        label_model.get_conditional_probs(),
    )
def train(self, dataset):
    """Fit the weak-supervision pipeline: LFs -> LabelModel -> LogisticRegression."""
    # Apply the labeling functions to the training set.
    lfs_applier = PandasLFApplier(lfs=self.lfs)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        lfs_train = lfs_applier.apply(df=dataset)

    # Train the probabilistic label model and get soft labels.
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=lfs_train, n_epochs=500, log_freq=100, seed=42)
    label_probs = label_model.predict_proba(lfs_train)

    # Drop data points on which every LF abstained.
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=dataset, y=label_probs, L=lfs_train)

    # Featurize sentences with n-gram counts (scikit-learn).
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    dataset_train = self.vectorizer.fit_transform(df_filtered.sentence.tolist())

    # Collapse probabilistic labels to hard predictions, then fit the
    # discriminative end model.
    preds_filtered = probs_to_preds(probs=probs_filtered)
    self.model = LogisticRegression(C=1e3, solver="liblinear", multi_class='auto')
    self.model.fit(X=dataset_train, y=preds_filtered)
def train_model(self, df_train: pd.DataFrame, application_area_lfs: list, analysis_path: str = "output", label_output_path: str = "labels.jsonl", save_model_path: str = None):
    """Train a probabilistic Label Model over our labeling functions and
    write out weak labels plus an LF analysis.

    :param df_train: The training data for the model
    :type df_train: pd.DataFrame
    :param application_area_lfs: Labeling functions used to train the Label Model
    :type application_area_lfs: list
    :param analysis_path: Folder path where the model output is stored, defaults to `PROJECT_ROOT/output`
    :type analysis_path: str, optional
    :param label_output_path: File where the generated labels are written, defaults to "labels.jsonl"
    :type label_output_path: str, optional
    :param save_model_path: Where to save the Label Model; if None the model is not saved
    :type save_model_path: str, optional
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Build the label matrix and fit the generative model.
    L_train = PandasLFApplier(lfs=application_area_lfs).apply(df=df_train)
    model = LabelModel(cardinality=2, verbose=True)
    model.fit(L_train=L_train, n_epochs=800, log_freq=100)
    if save_model_path is not None:
        model.save(save_model_path)

    # Hard labels plus per-class probabilities in one call.
    int_labels, prob_labels = model.predict(
        L=L_train, return_probs=True, tie_break_policy="abstain")

    # Keep only rows where at least one LF fired, for each label form.
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=L_train)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=L_train)

    # Write out both labels. In the probability outputs, p_rel is the
    # second probability listed.
    assert list(probs_df_train_filtered["paperid"]) == list(
        int_df_train_filtered["paperid"])
    with open(f"{label_output_path}", mode="w") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            record = {
                "id": paper_id,
                # cast to int and float to get rid of nonserializable numpy types
                "is_rel": int(int_train_filtered[idx]),
                "p_rel": float(probs_train_filtered[idx][1]),
            }
            out.write(json.dumps(record) + "\n")

    # Output LF analysis to a csv file, sorted by coverage.
    lf_analysis = LFAnalysis(L=L_train, lfs=application_area_lfs).lf_summary()
    with open(
            f"{self.PROJECT_ROOT}/output/{analysis_path}_{timestamp}.csv",
            "w") as outfile:
        lf_analysis.sort_values("Coverage").to_csv(
            outfile, encoding="utf-8", index=True)
def test_set_mu_eps(self):
    """Estimated mu values should be floored at the supplied mu_eps."""
    mu_eps = 0.0123
    # This matrix makes P(\lambda_1 = 0 | Y) = 0.0, so the estimate
    # must hit the mu_eps floor.
    L = np.array([[1, 1, 1], [1, 1, 1]])
    model = LabelModel(verbose=False)
    model.fit(L, mu_eps=mu_eps)
    self.assertAlmostEqual(model.get_conditional_probs()[0, 1, 0], mu_eps)
def test_loss(self):
    """Check the l2 and mu losses against hand-derived values."""
    L = np.array([[0, -1, 0], [0, 1, -1]])
    model = LabelModel(cardinality=2, verbose=False)
    model.fit(L, n_epochs=1)
    # Perturb mu away from its init so both losses are nonzero.
    model.mu = nn.Parameter(model.mu_init.clone() + 0.05)

    # l2_loss = l2*M*K*||mu - mu_init||_2 = 3*2*(0.05^2) = 0.03,
    # for both a scalar and a per-entry l2 weight.
    self.assertAlmostEqual(model._loss_l2(l2=1.0).item(), 0.03)
    self.assertAlmostEqual(model._loss_l2(l2=np.ones(6)).item(), 0.03)

    # mu_loss = ||O - \mu^T P \mu||_2 + ||\mu^T P - diag(O)||_2
    self.assertAlmostEqual(model._loss_mu().item(), 0.675, 3)
def test_progress_bar(self):
    """Fitting with progress_bar=False must still converge normally."""
    L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    Y = np.array([1, 0, 1])
    model = LabelModel(cardinality=2, verbose=False)
    model.fit(L, n_epochs=100, progress_bar=False)

    results = model.score(L, Y, metrics=["accuracy", "coverage"])
    # The all-abstain row predicts -1, hence coverage 2/3.
    np.testing.assert_array_almost_equal(model.predict(L), np.array([1, -1, 1]))
    self.assertEqual(results, dict(accuracy=1.0, coverage=2 / 3))
def test_mv_default(self):
    """Degenerate matrices should fall back to majority-vote-like output."""
    model = LabelModel(cardinality=2, verbose=False)

    # Fewer than 2 LFs have overlaps.
    L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
    model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(model.predict(L), np.array([1, 1, 0]))

    # Fewer than 2 LFs have conflicts.
    L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
    model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(model.predict(L), np.array([1, 1, 1]))
def main():
    """Run the toy labeling pipeline end to end and print each stage."""
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    labels = LFApplier(lfs).apply(src)
    print(labels)
    print(LFAnalysis(labels, lfs).lf_summary())

    # Inspect where the first two LFs agree and disagree.
    buckets = get_label_buckets(labels[:, 0], labels[:, 1])
    print(buckets)

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    pred_labels = model.predict(L=labels, tie_break_policy="abstain")
    print(pred_labels)
def generative_model(L_train, n_epochs=500, print_every=100):
    """Fit a binary LabelModel on L_train and return its probabilistic labels."""
    model = LabelModel(cardinality=2)
    logger.info("Training generative model...")
    model.fit(L_train=L_train, n_epochs=n_epochs, seed=1234, log_freq=print_every)
    logger.info("Done.")
    return model.predict_proba(L_train)
def test_L_form(self):
    """Validate shape/cardinality checks on the input label matrix."""
    model = LabelModel(cardinality=2, verbose=False)

    # n and m are read off the matrix dimensions.
    L = np.array([[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]])
    model._set_constants(L)
    self.assertEqual(model.n, 4)
    self.assertEqual(model.m, 3)

    # A label value outside the declared cardinality is rejected.
    L = np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]])
    with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
        model.fit(L, n_epochs=1)

    # Fewer than 3 labeling functions is rejected.
    L = np.array([[0, 1], [1, 1], [0, 1]])
    with self.assertRaisesRegex(ValueError, "L_train should have at least 3"):
        model.fit(L, n_epochs=1)
def test_optimizer(self):
    """Every supported optimizer name fits; an unknown name raises."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel(cardinality=2, verbose=False)
    for opt in ("sgd", "adam", "adamax"):
        model.fit(L, n_epochs=1, optimizer=opt)
    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
        model.fit(L, n_epochs=1, optimizer="bad_opt")
def test_save_and_load(self):
    """A model saved to disk and reloaded must produce identical predictions."""
    import os

    L = np.array([[0, -1, 0], [0, 1, 1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    original_preds = label_model.predict(L)

    dir_path = tempfile.mkdtemp()
    try:
        # Bug fix: the old `dir_path + "label_model.pkl"` had no path
        # separator, so the model was written as a *sibling* of the temp
        # dir and shutil.rmtree(dir_path) leaked that file on every run.
        save_path = os.path.join(dir_path, "label_model.pkl")
        label_model.save(save_path)

        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
    finally:
        # Clean up even if save/load raises.
        shutil.rmtree(dir_path)

    np.testing.assert_array_equal(loaded_preds, original_preds)
def get_snorkel_labels(frame_to_train, pkl_name):
    """Label `frame_to_train` with the module-level LFs via a Snorkel LabelModel.

    Renames columns to the downstream schema, prints per-LF coverage
    statistics, and attaches integer predictions (`label_number`) plus
    their mapped names (`pred_names`).

    Fixes: removed the unreachable duplicate `return frame_to_train`
    that followed the first return, and deleted dead commented-out
    pickle-loading code.

    :param frame_to_train: dataframe of tokens/OCR text to label
    :param pkl_name: unused; retained for backward compatibility with callers
    :return: the mutated dataframe with prediction columns added
    """
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)

    # Per-LF coverage: fraction of rows on which each LF did not abstain.
    date_parser_coverage, currency_coverage,\
        zipcode_coverage, state_coverage,\
        quntity_coverage, phonenumber_coverage, SSN_coverage,\
        first_name_coverage, last_name_coverage, percent_coverge = (L_train != ABSTAIN).mean(axis=0)

    # Rename to the schema expected downstream.
    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    }, inplace=True)

    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    print(f"date_parser_coverage: {date_parser_coverage * 100:.1f}%")
    print(f"currency_coverage: {currency_coverage * 100:.1f}%")
    print(f"zipcode_coverage: {zipcode_coverage * 100:.1f}%")
    print(f"state_coverage: {state_coverage * 100:.1f}%")
    print(f"quntity_coverage: {quntity_coverage * 100:.1f}%")
    print(f"phonenumber_coverage: {phonenumber_coverage * 100:.1f}%")
    print(f"SSN_coverage: {SSN_coverage * 100:.1f}%")
    print(f"first_name_coverage: {first_name_coverage * 100:.1f}%")
    print(f"last_name_coverage: {last_name_coverage * 100:.1f}%")

    # Fit the generative model and predict; abstains come back as -1.
    label_model = LabelModel(cardinality=15, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    # Defensive: replace any missing predictions with the 0 class.
    frame_to_train.label_number.fillna(0, inplace=True)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
def test_label_model_basic(self) -> None:
    """LabelModel should recover P and Y on an easy synthetic dataset."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

    model = LabelModel(cardinality=self.cardinality, verbose=False)
    model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Estimated LF conditional probabilities match the generating ones.
    np.testing.assert_array_almost_equal(P, model.get_conditional_probs(), decimal=2)

    # Predicted labels are accurate against ground truth.
    self.assertGreaterEqual(model.score(L, Y)["accuracy"], 0.9)
def test_label_model_basic(self) -> None:
    """LabelModel should recover P (by mean L1 error) and Y on synthetic data."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

    model = LabelModel(cardinality=self.cardinality, verbose=False)
    model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Mean absolute error of the conditional-probability estimates.
    P_lm = model.get_conditional_probs()
    conditional_probs_err = np.linalg.norm(P.flatten() - P_lm.flatten(), ord=1) / P.size
    self.assertLessEqual(conditional_probs_err, 0.01)

    # Predicted labels are accurate against ground truth.
    self.assertGreaterEqual(model.score(L, Y)["accuracy"], 0.9)
def test_labeling_convergence(self) -> None:
    """Test convergence of the end-to-end labeling pipeline."""
    # Apply the LFs: the base function plus positive/negative divisor LFs.
    divisors = range(2, 9)
    labeling_functions = (
        [f]
        + [get_positive_labeling_function(d) for d in divisors]
        + [get_negative_labeling_function(d) for d in divisors]
    )
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train the label model, then compare hard labels to ground truth.
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
    Y = self.df_train.y
    err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
def test_predict(self):
    """predict() abstains on all-conflict rows and returns probs on request."""
    # 3 LFs that always disagree/abstain -> every row abstains.
    L = np.array([[-1, 1, 0], [0, -1, 1], [1, 0, -1]])
    model = LabelModel(cardinality=2, verbose=False)
    model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(model.predict(L), np.array([-1, -1, -1]))

    # With clamped parameters, unanimous LF votes give confident predictions.
    L = np.array([[0, 1, 0], [0, 1, 0]])
    model = self._set_up_model(L)
    model.mu = nn.Parameter(model.mu_init.clone().clamp(0.01, 0.99))

    np.testing.assert_array_equal(model.predict(L), np.array([0, 0]))

    # The probabilistic form should agree with the hard predictions.
    preds, probs = model.predict(L, return_probs=True)
    np.testing.assert_array_almost_equal(
        probs, np.array([[0.99, 0.01], [0.99, 0.01]])
    )
def test_get_weight(self):
    """Learned LF weights should approximate the true accuracies."""
    # Build a random L matrix from known per-LF accuracy and coverage.
    true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
    coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
    num_points = 1000
    L = -1 * np.ones((num_points, len(true_accs)))
    Y = np.zeros(num_points)
    for i in range(num_points):
        Y[i] = 1 if np.random.rand() <= 0.5 else 0
        for j in range(5):
            if np.random.rand() <= coverage[j]:
                # Vote correctly with probability true_accs[j], else flip.
                L[i, j] = (Y[i] if np.random.rand() <= true_accs[j]
                           else np.abs(Y[i] - 1))

    label_model = LabelModel(cardinality=2)
    label_model.fit(L, n_epochs=1000, seed=123)

    for learned, expected in zip(label_model.get_weights(), true_accs):
        self.assertAlmostEqual(learned, expected, delta=0.1)
def label_model_creator(df_dev, Y_dev, df_train, df_test, Y_test):
    """Build and evaluate a LabelModel for the supply-relation task.

    Returns the fitted model and the training label matrix.
    """
    # All supply labeling functions.
    supply_lfs = [
        lf_supply, lf_customer, lf_sales_to, lf_our_customer, lf_acquisition,
        lf_people, lf_sold, lf_relation, lf_competition
    ]
    applier = PandasLFApplier(supply_lfs)

    # Label matrices for the dev, train, and test splits.
    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)
    L_test = applier.apply(df_test)

    # Binary task (True / False).
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500)

    # Accuracy on the held-out test set.
    label_model_acc = label_model.score(
        L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

    # F1 and ROC-AUC on the dev split.
    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print(
        f"Label model f1 score: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')}"
    )
    print(
        f"Label model roc-auc: {metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')}"
    )

    return label_model, L_train
def test_optimizer_init(self):
    """fit() should instantiate the optimizer class matching the option string."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel()

    for name, cls in (("sgd", optim.SGD),
                      ("adam", optim.Adam),
                      ("adamax", optim.Adamax)):
        model.fit(L, optimizer=name, n_epochs=1)
        self.assertIsInstance(model.optimizer, cls)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
        model.fit(L, optimizer="bad_optimizer", n_epochs=1)
def train(self):
    '''Train the logistic regression discriminative model.'''
    # Gold labels for the held-out test set.
    Y_test = self.df_test.label.values

    applier = PandasLFApplier(lfs=self.lfs)
    L_train = applier.apply(df=self.df_train)

    # Combine the LF outputs with the generative label model.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    probs_train = label_model.predict_proba(L=L_train)

    # Drop rows on which every LF abstained.
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=L_train)

    # Bag-of-ngrams featurization (one-hot counts).
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
    X_test = vectorizer.transform(self.df_test.text.tolist())

    # Collapse probabilistic labels to hard predictions.
    preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

    # Fit and evaluate the discriminative model.
    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=X_train, y=preds_train_filtered)
    print(
        f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
    )

    dump(sklearn_model, 'sklearn_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')
def labeling_evaluation(df_train, df_test, label_model):
    """Apply the LFs and aggregate labels with the requested model.

    `label_model` selects "majority" (MajorityLabelVoter) or "weighted"
    (Snorkel LabelModel). Returns filtered training data, labels, and the
    LF analysis; the weighted variant also returns probabilistic labels.
    """
    lfs = [
        LabelingFunction.lf_ind_keyword, LabelingFunction.lf_short,
        LabelingFunction.lf_cmp_re, LabelingFunction.lf_industry_keyword,
        LabelingFunction.lf_surname_re, LabelingFunction.industry_cls
    ]
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train)
    L_test = applier.apply(df=df_test)
    Y_test = df_test.label.values
    analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

    if label_model == "majority":
        majority_model = MajorityLabelVoter()
        preds_train = majority_model.predict(L=L_train)
        majority_acc = majority_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
        df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=preds_train, L=L_train)
        return df_train_filtered, preds_train_filtered, analysis

    if label_model == "weighted":
        # Cardinality = number of non-dunder members of the Polarity enum.
        cardinality = len([c for c in dir(Polarity) if not c.startswith("__")])
        label_model = LabelModel(cardinality=cardinality, verbose=True)
        label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
        probs_train = label_model.predict_proba(L_train)
        label_model_acc = label_model.score(
            L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
        preds_train_filtered = probs_to_preds(probs_train_filtered)
        return df_train_filtered, probs_train_filtered, preds_train_filtered, analysis
def weak_supervisor(dataframe, model_type):
    """Attach weak labels to `dataframe` and drop abstained (-1) rows.

    Fix: the attach/print/filter logic was duplicated verbatim in both
    branches; it is now shared via `_attach_and_filter`, so only the
    model construction differs per branch.

    :param dataframe: input data the labeling functions are applied to
    :param model_type: "label_model" for the probabilistic LabelModel,
        anything else for a majority vote
    :return: the filtered dataframe with a `weak_labels` column
    """
    labeling_functions = [positive_labeling_function, positive1_labeling_function,
                          negative_labeling_function, negative1_labeling_function]
    applier = PandasLFApplier(lfs=labeling_functions)
    label_training_matrix = applier.apply(df=dataframe)

    if model_type == "label_model":
        # Probabilistic generative model over the LF votes.
        model = LabelModel(cardinality=2, verbose=True)
        model.fit(L_train=label_training_matrix, n_epochs=300, log_freq=50, seed=123)
    else:
        model = MajorityLabelVoter()

    return _attach_and_filter(dataframe, model.predict(L=label_training_matrix))


def _attach_and_filter(dataframe, weak_labels):
    """Store `weak_labels` on the frame and drop rows where the model abstained."""
    dataframe["weak_labels"] = weak_labels
    print("dataframe shape: ", dataframe.shape)
    dataframe = dataframe[dataframe["weak_labels"] != -1]
    print("dataframe shape after filtering: ", dataframe.shape)
    return dataframe
def train_model(training_data: pd.DataFrame, testing_data: pd.DataFrame, L_train: np.ndarray, save_model=True) -> LabelModel:
    """Train a label model on the label matrix produced by the labeling functions.

    :param training_data: Dataframe of training data
    :type training_data: pd.DataFrame
    :param testing_data: Dataframe of testing data
    :type testing_data: pd.DataFrame
    :param L_train: Matrix of labels generated by the LFs on the training data
    :type L_train: np.ndarray
    :param save_model: Set `True` to save the model to disk, defaults to `True`
    :type save_model: bool, optional
    :return: A label model
    :rtype: LabelModel
    """
    # Build the noise-aware generative model over the LF votes.
    model = LabelModel(cardinality=2, verbose=True)
    # A known class balance (e.g. [0.673, 0.327]) could be passed to fit().
    model.fit(L_train=L_train, n_epochs=800, log_freq=100)

    if save_model:
        model.save("../output/model_export/saved_label_model.pkl")
    return model
def test_scheduler_init(self):
    """fit() should construct the LR scheduler matching the option string."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    model = LabelModel()

    # "constant" means no scheduler at all.
    model.fit(L, lr_scheduler="constant", n_epochs=1)
    self.assertIsNone(model.lr_scheduler)

    model.fit(L, lr_scheduler="linear", n_epochs=1)
    self.assertIsInstance(model.lr_scheduler, optim.lr_scheduler.LambdaLR)

    model.fit(L, lr_scheduler="exponential", n_epochs=1)
    self.assertIsInstance(model.lr_scheduler, optim.lr_scheduler.ExponentialLR)

    model.fit(L, lr_scheduler="step", n_epochs=1)
    self.assertIsInstance(model.lr_scheduler, optim.lr_scheduler.StepLR)