def labelmodel_predict(L_train, y_true, L_test, return_probs=False, **kwargs):
    """Fit a ``LabelModel`` on ``L_train`` and predict labels for ``L_test``.

    Args:
        L_train: Label matrix used for fitting; must expose ``.values``.
        y_true: Series of reference labels. Its distinct non-missing values
            determine the model cardinality.
        L_test: Label matrix to predict on.
        return_probs: When true, also return the second value produced by
            ``LabelModel.predict(..., return_probs=True)``.
        **kwargs: Forwarded to ``LabelModel.fit``. ``n_epochs`` defaults to
            500 and ``log_freq`` to 100 when not supplied.

    Returns:
        ``y_pred`` mapped back through ``from_numbered``, or the tuple
        ``(y_pred, y_score)`` when ``return_probs`` is true.
    """
    kwargs.setdefault('n_epochs', 500)
    kwargs.setdefault('log_freq', 100)
    from snorkel.labeling.model import LabelModel

    # Distinct, non-missing reference labels define the number of classes.
    known_labels = set(y_true[~y_true.isna()].values)
    log.info('y_true values: %s', known_labels)
    label_model = LabelModel(cardinality=len(known_labels), verbose=True)

    # Diagnostic only: report values that appear on one side but not the other.
    train_values = set(L_train.values.flatten())
    truth_values = set(y_true.values.flatten())
    log.info('Values in L_train but not y_true: %s', train_values - truth_values)
    log.info('Values in y_true but not L_train: %s', truth_values - train_values)

    # NOTE(review): to_numbered presumably re-encodes labels as integers with
    # -1 for "missing" -- confirm against its definition.
    L_train, Y_dev = to_numbered(L_train, y_true)
    log.info('L_train values: %s, %s', set(L_train.flatten()), type(L_train))
    log.info('Y_dev values: %s, %s', set(Y_dev.flatten()), type(Y_dev))
    log.info('kwargs: %s', kwargs)

    # Entries equal to -1 are dropped from the dev labels before fitting.
    label_model.fit(L_train=L_train, Y_dev=Y_dev[Y_dev != -1], **kwargs)

    numbered_test = to_numbered(L_test, y_true)[0]
    outcome = label_model.predict(numbered_test, return_probs=return_probs)
    if not return_probs:
        return from_numbered(L_test, y_true, outcome)

    y_pred, y_score = outcome
    return from_numbered(L_test, y_true, y_pred), y_score
def main(data_path, output_path):
    """Label a parquet dataset with a label model and write the result.

    Reads ``data_path`` as parquet, applies three labeling functions through
    ``SparkLFApplier``, fits a binary ``LabelModel`` on the resulting label
    matrix and writes the data with an added ``y_prob`` column to
    ``output_path`` (overwriting any existing output).

    Args:
        data_path: Location of the input parquet data.
        output_path: Location the labeled parquet data is written to.
    """
    # Read data
    logging.info(f"Reading data from {data_path}")
    spark_context = SparkContext()
    sql_context = SQLContext(spark_context)
    data = sql_context.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    applier = SparkLFApplier(
        [article_mentions_person, body_contains_fortune, person_in_db]
    )
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels: keep only the probability of class 1.
    logging.info("Generating probabilistic labels")
    class_one_probs = label_model.predict_proba(L)[:, 1]
    # NOTE(review): this builds one array literal holding every probability and
    # attaches that same array to each row -- confirm that is intended rather
    # than one probability per row.
    literal_columns = [F.lit(prob) for prob in class_one_probs]
    data_labeled = data.withColumn("y_prob", F.array(literal_columns))
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
def test_sparse_and_regular_make_same_probs(self) -> None:
    """Check the sparse and dense label models learn the same conditional probs.

    A synthetic label matrix is converted to per-example event lists, the
    sparse model is fit on those and the regular ``LabelModel`` on the dense
    matrix, both with identical hyper-parameters and seed.
    """
    dims = self.known_dimensions
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        dims.num_examples,
        dims.num_functions,
        dims.num_classes,
    )

    # One event id per non-abstaining vote: func_id * num_classes + cls_id.
    example_event_lists: List[ExampleEventListOccurence] = []
    for votes in L:
        events = [
            func_id * dims.num_classes + cls_id
            for func_id, cls_id in enumerate(votes)
            if cls_id > -1
        ]
        example_event_lists.append(ExampleEventListOccurence(events))

    sparse_model = SparseExampleEventListLabelModel()
    sparse_model.fit_from_sparse_example_event_list(
        example_event_list=example_event_lists,
        known_dimensions=dims,
        n_epochs=200,
        lr=0.01,
        seed=123,
    )

    label_model = LabelModel(cardinality=dims.num_classes)
    label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

    P_lm = label_model.get_conditional_probs()
    P_slm = sparse_model.get_conditional_probs()
    np.testing.assert_array_almost_equal(P_slm, P_lm)
def train(self, dataset):
    """Train the vectorizer and classifier from weakly labeled ``dataset``.

    Applies ``self.lfs`` to ``dataset``, fits a 3-class ``LabelModel`` on the
    resulting label matrix, filters the data with
    ``filter_unlabeled_dataframe``, and fits a bag-of-n-grams logistic
    regression on the remaining sentences. The fitted objects are stored on
    ``self.vectorizer`` and ``self.model``.

    Args:
        dataset: DataFrame with a ``sentence`` column.
    """
    # Apply labeling functions to the training set, silencing warnings raised
    # while doing so.
    applier = PandasLFApplier(lfs=self.lfs)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        label_matrix = applier.apply(df=dataset)

    # Build the probabilistic label model.
    label_model = LabelModel(cardinality=3, verbose=True)
    label_model.fit(L_train=label_matrix, n_epochs=500, log_freq=100, seed=42)
    label_probs = label_model.predict_proba(label_matrix)

    # Filter unlabeled data points.
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=dataset, y=label_probs, L=label_matrix
    )

    # Featurize the remaining sentences as 1- to 5-gram counts.
    self.vectorizer = CountVectorizer(ngram_range=(1, 5))
    sentences = df_filtered.sentence.tolist()
    features = self.vectorizer.fit_transform(sentences)

    # Replace probabilistic labels with the most likely label.
    hard_labels = probs_to_preds(probs=probs_filtered)

    # Train the scikit-learn classifier.
    self.model = LogisticRegression(
        C=1e3, solver="liblinear", multi_class='auto'
    )
    self.model.fit(X=features, y=hard_labels)
def get_L_final_filter(L_train, method='model'):
    """Collapse a label matrix into one final 0/1 label per row.

    Args:
        L_train: Sequence of rows (list of lists or 2-D array), one row of
            labeling-function votes per data point.
        method: Accepted for interface compatibility. It is currently
            overridden: the function always uses the 'absolute' rule (see the
            temporary measure below).

    Returns:
        A list with one entry per row of ``L_train``: 0 when the row contains
        a 0 vote, otherwise 1. An empty ``L_train`` yields an empty list
        (previously this raised ``IndexError`` on ``L_train[0]``).
    """
    # Nothing to label; also avoids indexing the first row of an empty input.
    if len(L_train) == 0:
        return []

    # Intended selection rule: too few labeling functions for a label model
    # means falling back to the absolute rule.
    method = 'absolute' if len(L_train[0]) < 3 else 'model'

    ## TEMPORARY MEASURE: force the absolute rule regardless of the above.
    method = 'absolute'

    if method == 'absolute':
        # Absolute method: any matched 'irrelevant' keyword (a 0 vote) flags
        # the whole row as irrelevant.
        return [0 if 0 in row else 1 for row in L_train]

    # Label-model method (unreachable while the temporary measure is active).
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    return label_model.predict(L=L_train, return_probs=False)
def test_optimizer(self):
    """Fit with each supported optimizer name and reject an unknown one."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)

    for optimizer_name in ("sgd", "adam", "adamax"):
        label_model.fit(L, n_epochs=1, optimizer=optimizer_name)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
        label_model.fit(L, n_epochs=1, optimizer="bad_opt")
def test_set_mu_eps(self):
    """Check that a custom ``mu_eps`` floor shows up in the conditional probs."""
    mu_eps = 0.0123

    # Every labeling function always votes 1, so P(lambda_1 = 0 | Y) = 0.0 and
    # the estimate is expected to hit the mu_eps floor.
    all_ones = np.array([[1, 1, 1], [1, 1, 1]])
    label_model = LabelModel(verbose=False)
    label_model.fit(all_ones, mu_eps=mu_eps)

    floored = label_model.get_conditional_probs()[0, 1, 0]
    self.assertAlmostEqual(floored, mu_eps)
def test_score(self):
    """Exercise ``LabelModel.score`` with explicit and default metrics."""
    # Fitted model: accuracy and coverage on a matrix with one all-abstain row.
    votes = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    gold = np.array([1, 0, 1])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(votes, n_epochs=100)
    results = label_model.score(votes, gold, metrics=["accuracy", "coverage"])
    np.testing.assert_array_almost_equal(
        label_model.predict(votes), np.array([1, -1, 1])
    )
    self.assertEqual(results, {"accuracy": 1.0, "coverage": 2 / 3})

    # Model with mu set to the clamped initial value, scored on two
    # identical rows.
    L = np.array([[1, 0, 1], [1, 0, 1]])
    label_model = self._set_up_model(L)
    clamped_mu = label_model.mu_init.clone().clamp(0.01, 0.99)
    label_model.mu = nn.Parameter(clamped_mu)

    results = label_model.score(L, Y=np.array([0, 1]))
    self.assertEqual(results, {"accuracy": 0.5})

    results = label_model.score(
        L=L, Y=np.array([1, 0]), metrics=["accuracy", "f1"]
    )
    self.assertEqual(results, {"accuracy": 0.5, "f1": 2 / 3})
def main():
    """Apply three labeling functions to ``src`` and print the results.

    Prints, in order: the label matrix, the labeling-function summary, the
    label buckets of the first two columns, and the label-model predictions
    (ties resolved by abstaining).
    """
    lfs = [lf_contains_link, lf_contains_co, lf_contains_sub]
    labels = LFApplier(lfs).apply(src)
    print(labels)

    summary = LFAnalysis(labels, lfs).lf_summary()
    print(summary)

    # Compare the votes of the first two labeling functions.
    print(get_label_buckets(labels[:, 0], labels[:, 1]))

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(labels, n_epochs=500, log_freq=50, seed=123)
    print(label_model.predict(L=labels, tie_break_policy="abstain"))
def generative_model(L_train, n_epochs=500, print_every=100):
    """Fit a binary ``LabelModel`` on ``L_train`` and return its marginals.

    Args:
        L_train: Label matrix to fit on and to predict probabilities for.
        n_epochs: Number of training epochs passed to ``fit``.
        print_every: Passed to ``fit`` as ``log_freq``.

    Returns:
        The output of ``predict_proba(L_train)``.
    """
    label_model = LabelModel(cardinality=2)
    logger.info("Training generative model...")
    fit_options = {"n_epochs": n_epochs, "seed": 1234, "log_freq": print_every}
    label_model.fit(L_train=L_train, **fit_options)
    logger.info("Done.")
    return label_model.predict_proba(L_train)
def calculate_metrics(
    label_model: LabelModel,
    dataset_name: str,
    true_labels: np.ndarray,
    save_to: AbsolutePath,
) -> Dict[str, float]:
    """Compute accuracy, AUC, F1 and MSE of ``label_model`` on one dataset.

    The label matrix is loaded from
    ``save_to / "heuristic_matrix_<dataset_name>.pkl"``. AUC is reported as
    the string "n/a" when scoring raises ``ValueError``.

    >>> from collections import namedtuple; import tempfile
    >>> def mocked_predictions(l,return_probs,tie_break_policy): return np.array([1, 0, 1]), np.array([[0.1, 0.9], [0.8, 0.2], [0.25, 0.75]])
    >>> def mocked_scores(L,Y,tie_break_policy,metrics):
    ...     return {"f1": 1.0} if metrics == ['f1'] else {"roc_auc": 0.78}
    >>> lm = namedtuple('LM', ['predict', 'score'])(mocked_predictions, mocked_scores)
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([1, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.333, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.404}
    >>> with tempfile.TemporaryDirectory() as tmpdirname:
    ...     np.ndarray([]).dump(f"{tmpdirname}/heuristic_matrix_test_set.pkl")
    ...     calculate_metrics(lm, "test_set", np.array([0, 1, 0]), Path(tmpdirname))
    {'label_model_accuracy_test_set': 0.0, 'label_model_auc_test_set': 0.78, 'label_model_f1_test_set': 1.0, 'label_model_mse_test_set': 0.671}
    """
    # NOTE(review): allow_pickle=True unpickles the file; only load matrices
    # from trusted locations.
    matrix_path = save_to / f"heuristic_matrix_{dataset_name}.pkl"
    lines = np.load(str(matrix_path), allow_pickle=True)

    tie_break_policy = "random"
    Y_pred, Y_prob = label_model.predict(
        lines, return_probs=True, tie_break_policy=tie_break_policy
    )

    try:
        auc_scores = label_model.score(
            L=lines,
            Y=true_labels,
            tie_break_policy=tie_break_policy,
            metrics=["roc_auc"],
        )
        auc = round(auc_scores["roc_auc"], 3)
    except ValueError:
        auc = "n/a"

    f1_scores = label_model.score(
        L=lines,
        Y=true_labels,
        tie_break_policy=tie_break_policy,
        metrics=["f1"],
    )
    f1 = f1_scores["f1"]

    n_correct = sum(Y_pred == true_labels)
    accuracy = n_correct / float(len(Y_pred))
    # Squared error between the class-1 probability and the reference label.
    squared_errors = (Y_prob[:, 1] - true_labels)**2
    mse = np.mean(squared_errors)

    return {
        f"label_model_accuracy_{dataset_name}": round(accuracy, 3),
        f"label_model_auc_{dataset_name}": auc,
        f"label_model_f1_{dataset_name}": round(f1, 3),
        f"label_model_mse_{dataset_name}": round(mse, 3),
    }
def test_model_loss(self):
    """Loss must not increase with more epochs; a huge lr must raise on NaN."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)

    def fit_and_measure(epochs):
        # Refit for ``epochs`` epochs and report the resulting mu loss.
        label_model.fit(L, n_epochs=epochs)
        return label_model._loss_mu().item()

    init_loss = fit_and_measure(1)
    next_loss = fit_and_measure(10)
    self.assertLessEqual(next_loss, init_loss)

    with self.assertRaisesRegex(Exception, "Loss is NaN."):
        label_model.fit(L, n_epochs=10, lr=1e8)
def test_optimizer_init(self):
    """Each optimizer name must instantiate the matching ``optim`` class."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    expected_classes = (
        ("sgd", optim.SGD),
        ("adam", optim.Adam),
        ("adamax", optim.Adamax),
    )
    for optimizer_name, optimizer_cls in expected_classes:
        label_model.fit(L, optimizer=optimizer_name, n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optimizer_cls)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
        label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
def test_save_with_conda_yaml(tmp_path: Path, setup_common_components: Dict):
    """Test if a model can be saved with a conda yaml file."""
    kwargs = setup_common_components
    labeler = Labeler(None, [PartTemp])
    # Replace get_keys() with a mock returning a single label key.
    labeler.get_keys = MagicMock(return_value=[LabelKey(name="key1")])
    lfs = [[LF_storage_row]]
    label_models = [LabelModel()]

    # Create a conda yaml file.
    conda_yaml = tmp_path.joinpath("my_conda.yaml")
    with open(conda_yaml, "w") as f:
        yaml.dump(_get_default_conda_env(), f)

    # Save a model with a conda yaml file.
    model_dir = os.path.join(tmp_path, artifact_path)
    save_model(
        HardwareFonduerModel(),
        model_dir,
        **kwargs,
        conda_env=conda_yaml,
        # Pass a directory name to preserve the directory hierarchy.
        code_paths=["tests"],
        model_type="label",
        labeler=labeler,
        lfs=lfs,
        label_models=label_models,
    )

    # Your conda yaml file is saved as "conda.yaml".
    assert os.path.exists(os.path.join(model_dir, "conda.yaml"))
def get_snorkel_labels(frame_to_train, pkl_name):
    """Label ``frame_to_train`` with the module-level labeling functions.

    Applies ``lfs`` through ``PandasLFApplier``, prints per-function coverage,
    fits a 15-class ``LabelModel`` on the label matrix and stores its
    predictions on the frame.

    Args:
        frame_to_train: DataFrame to label. It is modified in place: columns
            ``word_id``/``text``/``label_number`` are renamed to
            ``word_tokens``/``ocr``/``preds``, and new ``label_number`` and
            ``pred_names`` columns are added.
        pkl_name: Not used by this function.

    Returns:
        The same ``frame_to_train`` object.

    Raises:
        ValueError: If the label matrix does not have exactly ten columns,
            because the per-column coverage is unpacked into ten names.
    """
    print(
        "==============================Labeling is now started======================================="
    )
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=frame_to_train)

    # Fraction of rows on which each labeling function did not abstain.
    # The tenth value (percent coverage) is computed but not reported.
    (date_parser_coverage, currency_coverage,
     zipcode_coverage, state_coverage,
     quntity_coverage, phonenumber_coverage, SSN_coverage,
     first_name_coverage, last_name_coverage,
     _percent_coverage) = (L_train != ABSTAIN).mean(axis=0)

    frame_to_train.rename(columns={
        "word_id": "word_tokens",
        "text": "ocr",
        "label_number": "preds"
    }, inplace=True)
    print(
        "==============================Labeling is now complete======================================="
    )
    print(
        "==============================Summary Stats=================================================="
    )
    reported_coverage = (
        ("date_parser_coverage", date_parser_coverage),
        ("currency_coverage", currency_coverage),
        ("zipcode_coverage", zipcode_coverage),
        ("state_coverage", state_coverage),
        ("quntity_coverage", quntity_coverage),
        ("phonenumber_coverage", phonenumber_coverage),
        ("SSN_coverage", SSN_coverage),
        ("first_name_coverage", first_name_coverage),
        ("last_name_coverage", last_name_coverage),
    )
    for coverage_name, coverage_value in reported_coverage:
        print(f"{coverage_name}: {coverage_value * 100:.1f}%")

    label_model = LabelModel(cardinality=15, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)
    frame_to_train["label_number"] = label_model.predict(
        L=L_train, tie_break_policy="abstain")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the attribute-accessed column: the chained in-place form does not update
    # the frame under pandas copy-on-write.
    frame_to_train["label_number"] = frame_to_train["label_number"].fillna(0)
    frame_to_train['pred_names'] = frame_to_train.label_number.map(inv_et_dct)
    return frame_to_train
def test_scheduler_init(self):
    """Each ``lr_scheduler`` name must produce the expected scheduler object."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    # The constant schedule leaves no scheduler object behind.
    label_model.fit(L, lr_scheduler="constant", n_epochs=1)
    self.assertIsNone(label_model.lr_scheduler)

    schedulers = optim.lr_scheduler
    expected_classes = (
        ("linear", schedulers.LambdaLR),
        ("exponential", schedulers.ExponentialLR),
        ("step", schedulers.StepLR),
    )
    for scheduler_name, scheduler_cls in expected_classes:
        label_model.fit(L, lr_scheduler=scheduler_name, n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, scheduler_cls)
def test():
    """Fit a label model on thresholded images and save per-case predictions.

    For every directory matched by the hard-coded glob, the patient image and
    each (file, labeling function) combination are loaded; the flattened
    outputs form the columns of one stacked label matrix. A binary
    ``LabelModel`` is fit on that matrix and its prediction for each case is
    reshaped, post-processed and written to ``temp/<directory name>.nii``.
    """
    patient = 'maskedLiverIso.nii'
    filenames = ['antiga.nii', 'jerman.nii']
    labeling_functions = [li_thresholding, otsu_thresholding]
    path = 'F:/Deep Learning/Data/snorkel/*'

    dataset = []
    size = 0
    for dir_path in sorted(glob(str(Path(path))), key=sorting):
        case_dir = Path(dir_path)
        ip: Path = case_dir / patient
        arr, im = io_load_image(str(ip))
        shape = arr.shape
        arr = arr.flatten()

        # One label vector per (file, labeling function) pair.
        labels = []
        for fn in filenames:
            for func in labeling_functions:
                array, image = io_load_image(str(case_dir / fn))
                array = func(array.flatten())
                size += array.shape[-1]
                labels.append(array)
        dataset.append([im, arr, labels, ip.parts[-2], shape])

    # Rows: all elements of all cases; columns: one per label source.
    n_rows = size // len(filenames) // len(labeling_functions)
    n_cols = len(filenames) * len(labeling_functions)
    array_reshape = (n_rows, n_cols)
    print(array_reshape)
    lab: np.ndarray = np.zeros((size), dtype='float16').reshape(array_reshape)
    print(size, lab.shape)

    # Stack each case's transposed labels into consecutive row ranges.
    offset = 0
    for _, _, label, _, _ in dataset:
        case_votes: np.ndarray = np.array(label).T
        n_case_rows = case_votes.shape[0]
        lab[offset:offset + n_case_rows, :] = case_votes
        offset += n_case_rows

    label_model: LabelModel = LabelModel(cardinality=2, verbose=True,
                                         device='cuda')
    label_model.fit(lab, seed=12345, log_freq=1, n_epochs=100,
                    class_balance=[0.985, 0.015])

    for im, arr, label, fn, shape in dataset:
        print(fn)
        case_votes = np.array(label).T
        p: np.ndarray = label_model.predict(case_votes)
        p = p.reshape(shape)
        p = getLargestCC(p)
        # Turn every positive entry into 255 before the uint8 cast.
        p[p > 0] = 255
        p = np.array(p, dtype='uint8')
        io_save_image('temp/' + fn + '.nii', p, im)
def test_augmented_L_construction(self):
    """Check shape, sum and entries of the augmented label matrix."""
    # 3 data points, 5 LFs, 2 classes
    k = 2
    L = np.array([[0, 0, 0, 1, 0], [0, 1, 1, 0, -1], [0, 0, 0, 0, -1]])
    L_shift = L + 1

    lm = LabelModel(cardinality=k, verbose=False)
    lm._set_constants(L_shift)
    lm._create_tree()
    L_aug = lm._get_augmented_label_matrix(L_shift, higher_order=True)

    # Should have 10 columns:
    # - 5 * 2 = 10 for the sources
    self.assertEqual(L_aug.shape, (3, 10))

    # 13 total nonzero entries
    self.assertEqual(L_aug.sum(), 13)

    # Next, check the singleton entries: every non-abstain vote sets the
    # column of its (LF, class) pair.
    for i, j in np.argwhere(L_shift > 0):
        self.assertEqual(L_aug[i, j * k + L_shift[i, j] - 1], 1)

    # Finally, check the clique entries
    # Singleton clique 1
    clique_one = lm.c_tree.node[1]
    self.assertEqual(len(clique_one["members"]), 1)
    self.assertEqual(L_aug[0, clique_one["start_index"]], 1)

    # Singleton clique 2
    clique_two = lm.c_tree.node[2]
    self.assertEqual(len(clique_two["members"]), 1)
    self.assertEqual(L_aug[0, clique_two["start_index"] + 1], 0)
def test_sparse_and_regular_make_same_objective(self):
    """The sparse event-pair model must build the same objective as LabelModel.

    The co-occurrence matrix of the dense model's indicator matrix is turned
    into a list of event co-occurrences, from which the sparse model prepares
    its objective; that objective is compared with the dense model's ``O``.
    """
    dims = self.known_dimensions
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        dims.num_examples,
        dims.num_functions,
        dims.num_classes,
    )

    label_model = LabelModel(cardinality=dims.num_classes)
    label_model._set_constants(L)
    L_shift = L + 1
    indicator = label_model._create_L_ind(L_shift)
    co_oc_matrix = indicator.T @ indicator

    # One entry per (row event, column event) cell of the matrix.
    sparse_event_occurence: List[EventCooccurence] = [
        EventCooccurence(a_id, b_id, frequency=freq)
        for a_id, cols in enumerate(co_oc_matrix)
        for b_id, freq in enumerate(cols)
    ]

    sparse_model = SparseEventPairLabelModel()
    sparse_model._set_constants(known_dimensions=dims)
    sparse_model_objective = (
        sparse_model._prepare_objective_from_sparse_event_cooccurence(
            known_dimensions=dims,
            sparse_event_occurence=sparse_event_occurence,
        )
    )

    for attribute in ("n", "m", "cardinality"):
        self.assertEqual(
            getattr(label_model, attribute), getattr(sparse_model, attribute)
        )

    label_model._generate_O(L_shift)
    label_model_O = label_model.O.detach().numpy()
    np.testing.assert_almost_equal(label_model_O, sparse_model_objective)
def test_loss(self):
    """Check the l2 and mu losses after shifting mu by 0.05 from mu_init."""
    L = np.array([[0, -1, 0], [0, 1, -1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)

    shifted_mu = label_model.mu_init.clone() + 0.05
    label_model.mu = nn.Parameter(shifted_mu)

    # l2_loss = l2*M*K*||mu - mu_init||_2 = 3*2*(0.05^2) = 0.03
    # Scalar and per-entry l2 weights must agree.
    for l2_weight in (1.0, np.ones(6)):
        self.assertAlmostEqual(
            label_model._loss_l2(l2=l2_weight).item(), 0.03
        )

    # mu_loss = ||O - mu^T P mu||_2 + ||mu^T P - diag(O)||_2
    mu_loss = label_model._loss_mu().item()
    self.assertAlmostEqual(mu_loss, 0.675, 3)
def train_model(self,
                df_train: pd.DataFrame,
                application_area_lfs: list,
                analysis_path: str = "output",
                label_output_path: str = "labels.jsonl",
                save_model_path: str = None):
    """Using our labeling functions, we can train a probabilistic model which is able to generate weak labels for our data points

    :param df_train: The training data for the model; must contain a `paperid` column
    :type df_train: pd.DataFrame
    :param application_area_lfs: A list of labeling functions to use in training the Label Model
    :type application_area_lfs: list
    :param analysis_path: File-name prefix of the labeling-function analysis CSV, which is written to
        `PROJECT_ROOT/output/<analysis_path>_<timestamp>.csv`, defaults to "output"
    :type analysis_path: str, optional
    :param label_output_path: Path to file where probabilistic labels generated by the model should be stored, defaults to "labels.jsonl"
    :type label_output_path: str, optional
    :param save_model_path: A path to where the Label Model should be saved. If no path is provided, the model is not saved
    :type save_model_path: str, optional
    """
    file_name_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    applier = PandasLFApplier(lfs=application_area_lfs)
    L_train = applier.apply(df=df_train)

    model = LabelModel(cardinality=2, verbose=True)
    model.fit(L_train=L_train, n_epochs=800, log_freq=100)
    if save_model_path is not None:
        model.save(save_model_path)

    int_labels, prob_labels = model.predict(L=L_train,
                                            return_probs=True,
                                            tie_break_policy="abstain")
    probs_df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=prob_labels, L=L_train)
    int_df_train_filtered, int_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=int_labels, L=L_train)

    # Write out both labels. In the probability outputs, p_rel is the second
    # probability listed. Both filtered frames must list the same papers in
    # the same order, because the records below are paired by position.
    assert list(probs_df_train_filtered["paperid"]) == list(
        int_df_train_filtered["paperid"])
    with open(label_output_path, mode="w", encoding="utf-8") as out:
        for idx, paper_id in enumerate(probs_df_train_filtered["paperid"]):
            record = {
                "id": paper_id,
                # cast to int and float to get rid of nonserializable numpy types
                "is_rel": int(int_train_filtered[idx]),
                "p_rel": float(probs_train_filtered[idx][1])
            }
            out.write(json.dumps(record) + "\n")

    # Output LF analysis to a csv file sorted by coverage.
    lf_analysis = LFAnalysis(L=L_train,
                             lfs=application_area_lfs).lf_summary()
    csv_path = f"{self.PROJECT_ROOT}/output/{analysis_path}_{file_name_timestamp}.csv"
    # The handle itself must carry the encoding: to_csv's `encoding` argument
    # does not apply to an already-open text handle. newline="" stops the text
    # layer from translating line endings a second time.
    with open(csv_path, "w", encoding="utf-8", newline="") as outfile:
        lf_analysis = lf_analysis.sort_values("Coverage")
        lf_analysis.to_csv(outfile, encoding="utf-8", index=True)
def test_mv_default(self):
    """Check predictions on matrices with few overlaps or few conflicts."""
    label_model = LabelModel(cardinality=2, verbose=False)

    cases = (
        # less than 2 LFs have overlaps
        (np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]]),
         np.array([1, 1, 0])),
        # less than 2 LFs have conflicts
        (np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]]),
         np.array([1, 1, 1])),
    )
    for votes, expected in cases:
        label_model.fit(votes, n_epochs=100)
        np.testing.assert_array_almost_equal(
            label_model.predict(votes), expected
        )
def test_class_balance(self):
    """Check class-balance estimation from ``Y_dev`` and its validation errors."""
    label_model = LabelModel(cardinality=2, verbose=False)

    # Balance estimated from dev labels: six 0s and four 1s.
    Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
    label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
    np.testing.assert_array_almost_equal(label_model.p, np.array([0.6, 0.4]))

    # Explicit priors that must be rejected, with the expected message.
    rejected_priors = (
        (np.array([0.0, 1.0]), "Class balance prior is 0"),
        (np.array([0.0]), "class_balance has 1 entries."),
    )
    for class_balance, message in rejected_priors:
        with self.assertRaisesRegex(ValueError, message):
            label_model._set_class_balance(
                class_balance=class_balance, Y_dev=Y_dev
            )

    # Dev labels covering a single class do not match cardinality 2.
    Y_dev_one_class = np.array([0, 0, 0])
    with self.assertRaisesRegex(
        ValueError, "Does not match LabelModel cardinality"
    ):
        label_model._set_class_balance(
            class_balance=None, Y_dev=Y_dev_one_class
        )
def test_get_weight(self):
    """Learned weights must be within 0.1 of the simulated LF accuracies."""
    # set up L matrix
    true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
    coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
    n_points = 1000
    L = -1 * np.ones((n_points, len(true_accs)))
    Y = np.zeros(n_points)

    # The order of the random draws is significant: one draw for the label,
    # then per LF one draw for coverage and, if it votes, one for correctness.
    for row in range(n_points):
        Y[row] = 1 if np.random.rand() <= 0.5 else 0
        for col in range(5):
            if np.random.rand() > coverage[col]:
                continue
            if np.random.rand() <= true_accs[col]:
                L[row, col] = Y[row]
            else:
                L[row, col] = np.abs(Y[row] - 1)

    label_model = LabelModel(cardinality=2)
    label_model.fit(L, n_epochs=1000, seed=123)

    accs = label_model.get_weights()
    for index, learned_acc in enumerate(accs):
        self.assertAlmostEqual(learned_acc, true_accs[index], delta=0.1)
def test_labeling_convergence(self) -> None:
    """Test convergence of end to end labeling pipeline."""
    # Apply LFs: ``f`` plus positive and negative LFs for divisors 2..8.
    labeling_functions = [f]
    labeling_functions += [
        get_positive_labeling_function(divisor) for divisor in range(2, 9)
    ]
    labeling_functions += [
        get_negative_labeling_function(divisor) for divisor in range(2, 9)
    ]
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)

    expected_shape = (self.N_TRAIN, len(labeling_functions))
    self.assertEqual(L_train.shape, expected_shape)

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)

    # Fraction of training points whose most likely label differs from y.
    Y = self.df_train.y
    mismatches = np.where(Y != Y_lm, 1, 0).sum()
    err = mismatches / self.N_TRAIN
    self.assertLess(err, 0.05)
def label_dataset(
    task: Task,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
    debug: bool = False,
):
    """Label ``dataset`` with the stored label model of ``task`` and save a CSV.

    The heuristic matrix and the label model are read from
    ``path_config.generated / task.name``; the labeled frame is written to
    ``path_config.labeled_data / task.name / "<dataset.name>.labeled.csv"``.

    Args:
        task: Task whose ``name`` selects the directories and whose ``labels``
            are passed to ``do_labeling``.
        dataset: Dataset to load and label; its ``name`` selects the files.
        path_config: Path configuration; ``PathConfig.load()`` is used when
            omitted.
        debug: When true, add an ``lfs`` column listing, per row, the names of
            the heuristics whose value is 0 or 1, joined with ";".
    """
    path_config = path_config or PathConfig.load()
    task_dir = path_config.generated / task.name

    # NOTE(review): read_pickle unpickles the file; only load matrices from
    # trusted locations.
    applied_heuristics_df = pd.read_pickle(
        str(task_dir / f"heuristic_matrix_{dataset.name}.pkl"))

    label_model = LabelModel()
    label_model.load(str(task_dir / "label_model.pkl"))

    df = dataset.load()
    df_labeled = do_labeling(label_model, applied_heuristics_df.to_numpy(),
                             df, task.labels)

    if debug:
        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # equivalent available in all versions.
        for (
                heuristic_name,
                applied_heuristic_series,
        ) in applied_heuristics_df.items():
            # Replace votes by the heuristic's name, and -1 by "".
            applied_heuristics_df[
                heuristic_name] = applied_heuristic_series.map({
                    0: heuristic_name,
                    1: heuristic_name,
                    -1: ""
                })
        col_lfs = applied_heuristics_df.apply(
            lambda row: ";".join([elm for elm in row if elm]), axis=1)
        df_labeled["lfs"] = col_lfs

    labeled_data_path = path_config.labeled_data / task.name
    labeled_data_path.mkdir(parents=True, exist_ok=True)
    target_file = labeled_data_path / f"{dataset.name}.labeled.csv"
    df_labeled.to_csv(target_file, index=False)
    print(f"Labeled dataset has been written to {target_file}.")
def test_label_model_sparse(self) -> None:
    """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

    This tests the common setting where LFs abstain most of the time, which
    can cause issues for example if parameter clamping set too high (e.g. see
    Issue #1422).
    """
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        self.n, self.m, self.cardinality, abstain_multiplier=1000.0
    )

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model.get_conditional_probs()
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels *only on non-abstained data points*
    Y_pred = label_model.predict(L, tie_break_policy="abstain")
    (idx, ) = np.where(Y_pred != -1)
    n_correct = np.where(Y_pred[idx] == Y[idx], 1, 0).sum()
    acc = n_correct / len(idx)
    self.assertGreaterEqual(acc, 0.65)

    # Make sure that we don't output abstain when an LF votes, per issue #1422
    n_rows_with_vote = np.where((L + 1).sum(axis=1) != 0, 1, 0).sum()
    self.assertEqual(len(idx), n_rows_with_vote)
def test_L_form(self):
    """Check the constants derived from L and rejection of malformed matrices."""
    label_model = LabelModel(cardinality=2, verbose=False)

    # A 4 x 3 matrix sets n = 4 and m = 3.
    well_formed = np.array([[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]])
    label_model._set_constants(well_formed)
    self.assertEqual(label_model.n, 4)
    self.assertEqual(label_model.m, 3)

    malformed = (
        # Contains the label 2 although the model has cardinality 2.
        (np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]]),
         "L_train has cardinality"),
        # Only two columns.
        (np.array([[0, 1], [1, 1], [0, 1]]),
         "L_train should have at least 3"),
    )
    for L, message in malformed:
        with self.assertRaisesRegex(ValueError, message):
            label_model.fit(L, n_epochs=1)
def test_warmup(self):
    """Warmup given in epochs or as a percentage must yield 3 warmup steps."""
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    # 3 epochs directly, and 3/5 of the 5 training epochs.
    accepted_configs = (
        {"warmup_steps": 3, "warmup_unit": "epochs"},
        {"warmup_percentage": 3 / 5},
    )
    for lr_scheduler_config in accepted_configs:
        label_model.fit(
            L, lr_scheduler_config=lr_scheduler_config, n_epochs=5
        )
        self.assertEqual(label_model.warmup_steps, 3)

    # The "batches" warmup unit is rejected.
    batch_config = {"warmup_steps": 1, "warmup_unit": "batches"}
    with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
        label_model.fit(L, lr_scheduler_config=batch_config)
def train(self):
    '''
    Train the logistic regression discriminative model.

    Applies ``self.lfs`` to ``self.df_train``, fits a binary ``LabelModel``,
    trains a logistic regression on the filtered training text, prints its
    accuracy on ``self.df_test`` and dumps the classifier and vectorizer to
    ``sklearn_model.joblib`` and ``vectorizer.joblib``.
    '''
    # Pull out the test label vector for ease of use later.
    Y_test = self.df_test.label.values

    label_matrix = PandasLFApplier(lfs=self.lfs).apply(df=self.df_train)

    # Use the label model to combine the labeling-function votes.
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=label_matrix, n_epochs=500, log_freq=100, seed=123)

    # Make predictions
    probs_train = label_model.predict_proba(L=label_matrix)

    # Filter abstained inputs
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=self.df_train, y=probs_train, L=label_matrix)

    # Represent each text as 1- to 5-gram counts; the test text reuses the
    # vocabulary fitted on the training text.
    vectorizer = CountVectorizer(ngram_range=(1, 5))
    train_texts = df_train_filtered.text.tolist()
    X_train = vectorizer.fit_transform(train_texts)
    test_texts = self.df_test.text.tolist()
    X_test = vectorizer.transform(test_texts)

    # Turn probs into preds
    preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

    # Train logistic regression model
    sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
    sklearn_model.fit(X=X_train, y=preds_train_filtered)

    print(
        f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%"
    )

    dump(sklearn_model, 'sklearn_model.joblib')
    dump(vectorizer, 'vectorizer.joblib')