def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
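
# A minimal sketch of one of the LFs the Spark pipeline above references. The name
# body_contains_fortune comes from the pipeline; the `body` field and the keyword
# heuristic are illustrative assumptions, not the pipeline's actual logic. With
# SparkLFApplier each LF receives one Row of data.rdd.
from snorkel.labeling import labeling_function

ABSTAIN = -1
POSITIVE = 1

@labeling_function()
def body_contains_fortune(x):
    # Vote POSITIVE when the article body mentions the Fortune 500 list
    return POSITIVE if "fortune 500" in x.body.lower() else ABSTAIN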
def generate_labels_with_snorkel(dataframe):
    """
    Labels the full data using Snorkel

    :param dataframe: Pandas dataframe containing all data
    :return: dataframe extended with a label column
    """
    # Define the set of labeling functions (LFs)
    lfs = [
        lf_ubo_is_company,
        lf_troika_company,
        lf_uk_blacklisted_company,
        lf_non_uk_blacklisted_company
    ]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(dataframe)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    dataframe["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

    # Filter out the abstain data points
    dataframe = dataframe[dataframe.label != ABSTAIN]
    return dataframe
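
# Hypothetical invocation of generate_labels_with_snorkel on a toy frame. ABSTAIN
# follows the Snorkel convention (-1); the column names and sample rows are made up,
# and the four lf_* heuristics above must be defined for this to run.
import pandas as pd

ABSTAIN = -1

companies = pd.DataFrame({
    "company_name": ["Acme Ltd", "Globex GmbH"],
    "country": ["UK", "DE"],
})
labeled_companies = generate_labels_with_snorkel(companies)
print(labeled_companies["label"].value_counts())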
def test_optimizer(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1, optimizer="sgd")
    label_model.fit(L, n_epochs=1, optimizer="adam")
    label_model.fit(L, n_epochs=1, optimizer="adamax")
    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
        label_model.fit(L, n_epochs=1, optimizer="bad_opt")
def test_set_mu_eps(self):
    mu_eps = 0.0123

    # Construct a label matrix such that P(\lambda_1 = 0 | Y) = 0.0, so it will hit
    # the mu_eps floor
    L = np.array([[1, 1, 1], [1, 1, 1]])
    label_model = LabelModel(verbose=False)
    label_model.fit(L, mu_eps=mu_eps)
    self.assertAlmostEqual(label_model.get_conditional_probs()[0, 1, 0], mu_eps)
def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    for sample_idx, lf_columns in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        hyper_grid_results = {}
        for param in regularization_grid:
            label_model = LabelModel(cardinality=2)
            label_model.fit(
                train_matrix[:, lf_columns],
                n_epochs=1000,
                seed=100,
                lr=0.01,
                l2=param,
            )
            # Get marginals for each parameter
            hyper_grid_results[str(param)] = roc_curve(
                dev_labels,
                label_model.predict_proba(dev_matrix[:, lf_columns])[:, 1])

        # Convert marginals into AUROCs
        hyper_grid_results = {
            param: auc(hyper_grid_results[param][0], hyper_grid_results[param][1])
            for param in hyper_grid_results
        }

        # Select the parameter with the highest AUROC
        best_param = float(
            max(hyper_grid_results.items(), key=operator.itemgetter(1))[0])

        # Re-fit the model
        label_model.fit(
            train_matrix[:, lf_columns],
            n_epochs=1000,
            seed=100,
            lr=0.01,
            l2=best_param,
        )

        # Save marginals for output
        key = f'{sample_idx}:{",".join(map(str, lf_columns))}'
        train_grid_results[key] = label_model.predict_proba(train_matrix[:, lf_columns])
        dev_grid_results[key] = label_model.predict_proba(dev_matrix[:, lf_columns])
        test_grid_results[key] = label_model.predict_proba(test_matrix[:, lf_columns])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
def test_score(self):
    L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
    Y = np.array([1, 0, 1])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=100)
    results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
    np.testing.assert_array_almost_equal(label_model.predict(L), np.array([1, -1, 1]))
    results_expected = dict(accuracy=1.0, coverage=2 / 3)
    self.assertEqual(results, results_expected)

    L = np.array([[1, 0, 1], [1, 0, 1]])
    label_model = self._set_up_model(L)
    label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))

    results = label_model.score(L, Y=np.array([0, 1]))
    results_expected = dict(accuracy=0.5)
    self.assertEqual(results, results_expected)

    results = label_model.score(L=L, Y=np.array([1, 0]), metrics=["accuracy", "f1"])
    results_expected = dict(accuracy=0.5, f1=2 / 3)
    self.assertEqual(results, results_expected)
def load_snorkel_ee_components(save_path: Union[str, Path]) \
        -> Tuple[LabelModel, LabelModel]:
    save_path = Path(save_path)
    assert save_path.exists(), f"Save path does not exist: {save_path}"

    trigger_label_model: LabelModel = LabelModel()
    trigger_label_model.load(save_path.joinpath('trigger_lm.pt'))

    role_label_model: LabelModel = LabelModel()
    role_label_model.load(save_path.joinpath('role_lm.pt'))

    return trigger_label_model, role_label_model
def get_snorkel_labels(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)
    df_filtered, probs_filtered = filter_unlabeled_dataframe(
        X=train_df, y=L_probs, L=L_train)
    return df_filtered, probs_filtered
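
# For reference, the Snorkel imports get_snorkel_labels depends on.
# filter_unlabeled_dataframe drops the rows on which every LF abstained, so
# probs_filtered lines up with df_filtered row for row.
from snorkel.labeling import PandasLFApplier, filter_unlabeled_dataframe, labeling_function
from snorkel.labeling.model import LabelModel
# e.g. df_filtered, probs_filtered = get_snorkel_labels(train_df, [is_spam], ["HAM", "SPAM"])
# (is_spam being some plain-function heuristic to wrap as an LF)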
def test_model_loss(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)

    label_model.fit(L, n_epochs=1)
    init_loss = label_model._loss_mu().item()

    label_model.fit(L, n_epochs=10)
    next_loss = label_model._loss_mu().item()

    self.assertLessEqual(next_loss, init_loss)

    with self.assertRaisesRegex(Exception, "Loss is NaN."):
        label_model.fit(L, n_epochs=10, lr=1e8)
def __init__(
    self,
    cardinality: int = 2,
    verbose: bool = True,
    device: str = "cpu",
    metric: str = "accuracy",
    tie_break_policy: str = "abstain",
    n_epochs: int = 100,
    lr: float = 0.01,
    l2: float = 0.0,
    optimizer: str = "sgd",
    optimizer_config: Optional[OptimizerConfig] = None,
    lr_scheduler: str = "constant",
    lr_scheduler_config: Optional[LRSchedulerConfig] = None,
    prec_init: float = 0.7,
    # NB: this default is evaluated once, at function-definition time, so every
    # instance created without an explicit seed shares the same one
    seed: int = np.random.randint(1e6),
    log_freq: int = 10,
    mu_eps: Optional[float] = None,
    class_balance: Optional[List[float]] = None,
    **kwargs: Any,
) -> None:
    self.cardinality = cardinality
    self.verbose = verbose
    self.device = device
    self.metric = metric
    self.tie_break_policy = tie_break_policy
    self.n_epochs = n_epochs
    self.lr = lr
    self.l2 = l2
    self.optimizer = optimizer
    self.optimizer_config = (
        optimizer_config if optimizer_config is not None
        else OptimizerConfig()  # type: ignore
    )
    self.lr_scheduler = lr_scheduler
    self.lr_scheduler_config = (
        lr_scheduler_config if lr_scheduler_config is not None
        else LRSchedulerConfig()  # type: ignore
    )
    self.prec_init = prec_init
    self.seed = seed
    self.log_freq = log_freq
    self.mu_eps = mu_eps
    self.class_balance = class_balance
    self.label_model = LabelModel(cardinality=self.cardinality,
                                  verbose=self.verbose,
                                  device=self.device)
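
# A sketch of the companion fit()/predict() methods such a wrapper would plausibly
# expose, delegating to the wrapped snorkel LabelModel with the stored training
# config. The enclosing class is the one whose __init__ appears above; these
# method bodies are assumptions, not the source's.
def fit(self, L_train, Y_dev=None):
    self.label_model.fit(
        L_train,
        Y_dev=Y_dev,
        class_balance=self.class_balance,
        n_epochs=self.n_epochs,
        lr=self.lr,
        l2=self.l2,
        optimizer=self.optimizer,
        lr_scheduler=self.lr_scheduler,
        seed=self.seed,
        log_freq=self.log_freq,
        mu_eps=self.mu_eps,
    )
    return self

def predict(self, L):
    return self.label_model.predict(L, tie_break_policy=self.tie_break_policy)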
def __init__(self, df_train, df_dev, df_valid, df_test, df_heldout,
             lfs=None, label_model=None):
    df_train["seen"] = 0
    self.df_train = df_train.reset_index()
    self.df_dev = df_dev
    self.df_valid = df_valid
    self.df_test = df_test
    self.df_heldout = df_heldout

    #self.Y_train = df_train.label.values
    self.Y_dev = df_dev.label.values
    self.Y_valid = df_valid.label.values
    self.Y_test = df_test.label.values
    self.Y_heldout = df_heldout.label.values

    # Avoid a mutable default argument: fall back to an empty dict here instead
    self.lfs = lfs if lfs is not None else {}
    self.L_train = None
    self.L_dev = None
    self.L_valid = None
    self.L_heldout = None

    cardinality = len(df_valid.label.unique())

    # for DEMOing purposes
    self.first_text_indices = [
        1262,  # "check out" "youtube"
        1892,  # I love
        1117,  # url concept
        1706,  # emoji concept
        952,   # "nice"
        971,   # positive concept
        958,   # actually use emoji concept
    ]

    self.count = 0

    if label_model is None:
        self.label_model = LabelModel(cardinality=cardinality, verbose=True)
    else:
        self.label_model = label_model

    self.vectorizer = CountVectorizer(ngram_range=(1, 2))
    self.vectorizer.fit(df_train.text.tolist())
def test_optimizer_init(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    label_model.fit(L, optimizer="sgd", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.SGD)

    label_model.fit(L, optimizer="adam", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.Adam)

    label_model.fit(L, optimizer="adamax", n_epochs=1)
    self.assertIsInstance(label_model.optimizer, optim.Adamax)

    with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
        label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
def load(self, dir_name):
    with open(os.path.join(dir_name, 'model_lfs.pkl'), "rb") as file:
        lfs = pickle.load(file)
    # snorkel's LabelModel.load is an instance method that restores state in place
    # and returns None, so instantiate first rather than assigning its return value
    label_model = LabelModel()
    label_model.load(os.path.join(dir_name, 'label_model.pkl'))
    self.lfs = lfs
    self.label_model = label_model
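
# A matching save() sketch for the load() above, assuming the same directory layout;
# the source does not show this method. snorkel's LabelModel.save takes a destination
# path and pickles the model state there.
def save(self, dir_name):
    os.makedirs(dir_name, exist_ok=True)
    with open(os.path.join(dir_name, 'model_lfs.pkl'), "wb") as file:
        pickle.dump(self.lfs, file)
    self.label_model.save(os.path.join(dir_name, 'label_model.pkl'))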
def test_save_and_load(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)

    dir_path = tempfile.mkdtemp()
    # Join the path so the file lands inside the temp dir and rmtree cleans it up
    save_path = os.path.join(dir_path, "label_model.pkl")
    label_model.save(save_path)
    label_model.load(save_path)
    shutil.rmtree(dir_path)
def test_scheduler_init(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    label_model.fit(L, lr_scheduler="constant", n_epochs=1)
    self.assertIsNone(label_model.lr_scheduler)

    label_model.fit(L, lr_scheduler="linear", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

    label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR)

    label_model.fit(L, lr_scheduler="step", n_epochs=1)
    self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
def test_augmented_L_construction(self):
    # 5 LFs
    n = 3
    m = 5
    k = 2
    L = np.array([[0, 0, 0, 1, 0], [0, 1, 1, 0, -1], [0, 0, 0, 0, -1]])
    L_shift = L + 1
    lm = LabelModel(cardinality=k, verbose=False)
    lm._set_constants(L_shift)
    lm._create_tree()
    L_aug = lm._get_augmented_label_matrix(L_shift, higher_order=True)

    # Should have 10 columns:
    # - 5 * 2 = 10 for the sources
    self.assertEqual(L_aug.shape, (3, 10))

    # 13 total nonzero entries
    self.assertEqual(L_aug.sum(), 13)

    # Next, check the singleton entries
    for i in range(n):
        for j in range(m):
            if L_shift[i, j] > 0:
                self.assertEqual(L_aug[i, j * k + L_shift[i, j] - 1], 1)

    # Finally, check the clique entries (accessed via c_tree.nodes; the older
    # Graph.node attribute was removed in networkx 2.4)
    # Singleton clique 1
    self.assertEqual(len(lm.c_tree.nodes[1]["members"]), 1)
    j = lm.c_tree.nodes[1]["start_index"]
    self.assertEqual(L_aug[0, j], 1)

    # Singleton clique 2
    self.assertEqual(len(lm.c_tree.nodes[2]["members"]), 1)
    j = lm.c_tree.nodes[2]["start_index"]
    self.assertEqual(L_aug[0, j + 1], 0)
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)
    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)
    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()

    # Make sure to remove event_triggers and roles that were built per default.
    # Assign whole columns here: mutating the rows yielded by iterrows() would
    # only modify copies and leave the frame unchanged.
    labeled_documents['event_triggers'] = [[] for _ in range(len(labeled_documents))]
    labeled_documents['event_roles'] = [[] for _ in range(len(labeled_documents))]

    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]
    labeled_documents.update(triggers)
    labeled_documents.update(roles)
    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
def test_loss(self):
    L = np.array([[0, -1, 0], [0, 1, -1]])
    label_model = LabelModel(cardinality=2, verbose=False)
    label_model.fit(L, n_epochs=1)
    label_model.mu = nn.Parameter(label_model.mu_init.clone() + 0.05)

    # l2_loss = l2 * ||mu - mu_init||_F^2; mu has (m*k) x k = (3*2) x 2 = 12 entries,
    # each perturbed by 0.05, so the loss is 12 * 0.05^2 = 0.03
    self.assertAlmostEqual(label_model._loss_l2(l2=1.0).item(), 0.03)
    self.assertAlmostEqual(label_model._loss_l2(l2=np.ones(6)).item(), 0.03)

    # mu_loss = ||O - \mu P \mu^T||_2 + ||\mu^T P - diag(O)||_2
    self.assertAlmostEqual(label_model._loss_mu().item(), 0.675, 3)
def test_class_balance(self):
    label_model = LabelModel(cardinality=2, verbose=False)

    # Test class balance
    Y_dev = np.array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1])
    label_model._set_class_balance(class_balance=None, Y_dev=Y_dev)
    np.testing.assert_array_almost_equal(label_model.p, np.array([0.6, 0.4]))

    class_balance = np.array([0.0, 1.0])
    with self.assertRaisesRegex(ValueError, "Class balance prior is 0"):
        label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

    class_balance = np.array([0.0])
    with self.assertRaisesRegex(ValueError, "class_balance has 1 entries."):
        label_model._set_class_balance(class_balance=class_balance, Y_dev=Y_dev)

    Y_dev_one_class = np.array([0, 0, 0])
    with self.assertRaisesRegex(ValueError, "Does not match LabelModel cardinality"):
        label_model._set_class_balance(class_balance=None, Y_dev=Y_dev_one_class)
def test_mv_default(self):
    # less than 2 LFs have overlaps
    label_model = LabelModel(cardinality=2, verbose=False)
    L = np.array([[-1, -1, 1], [-1, 1, -1], [0, -1, -1]])
    label_model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(label_model.predict(L), np.array([1, 1, 0]))

    # less than 2 LFs have conflicts
    L = np.array([[-1, -1, 1], [-1, 1, 1], [1, 1, 1]])
    label_model.fit(L, n_epochs=100)
    np.testing.assert_array_almost_equal(label_model.predict(L), np.array([1, 1, 1]))
def test_labeling_convergence(self) -> None:
    """Test convergence of end to end labeling pipeline."""
    # Apply LFs
    labeling_functions = (
        [f]
        + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
        + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
    )
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
    Y = self.df_train.y
    err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
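
# A plausible sketch of the LF factories the test references (the real ones, like
# the lone LF `f`, live in the test's fixtures): each returns a LabelingFunction
# that votes for one class when x.num is divisible by the divisor and abstains
# otherwise. The field name `num` and the label values are assumptions.
from snorkel.labeling import LabelingFunction

def get_positive_labeling_function(divisor: int) -> LabelingFunction:
    # Vote 1 on multiples of divisor, abstain (-1) elsewhere
    return LabelingFunction(
        name=f"lf_pos_{divisor}",
        f=lambda x: 1 if x.num % divisor == 0 else -1,
    )

def get_negative_labeling_function(divisor: int) -> LabelingFunction:
    # Vote 0 on non-multiples of divisor, abstain (-1) elsewhere
    return LabelingFunction(
        name=f"lf_neg_{divisor}",
        f=lambda x: 0 if x.num % divisor != 0 else -1,
    )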
def test_get_weight(self):
    # set up L matrix
    true_accs = [0.95, 0.6, 0.7, 0.55, 0.8]
    coverage = [1.0, 0.8, 1.0, 1.0, 1.0]
    L = -1 * np.ones((1000, len(true_accs)))
    Y = np.zeros(1000)

    for i in range(1000):
        Y[i] = 1 if np.random.rand() <= 0.5 else 0
        for j in range(5):
            if np.random.rand() <= coverage[j]:
                L[i, j] = Y[i] if np.random.rand() <= true_accs[j] else np.abs(Y[i] - 1)

    label_model = LabelModel(cardinality=2)
    label_model.fit(L, n_epochs=1000, seed=123)

    accs = label_model.get_weights()
    for i in range(len(accs)):
        true_acc = true_accs[i]
        self.assertAlmostEqual(accs[i], true_acc, delta=0.1)
def get_majority_vote_label(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    L_train = applier.apply(df=train_df)
    majority_model = MajorityLabelVoter(cardinality=len(labels))
    preds_train = majority_model.predict(L=L_train)

    # Keep only the data points on which the majority vote did not abstain
    non_abstain_idxs = np.argwhere(preds_train >= 0).flatten()
    df_filtered = train_df.iloc[non_abstain_idxs]
    probs_filtered = preds_train[non_abstain_idxs]
    return df_filtered, probs_filtered
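
# The two helpers above are interchangeable front ends over the same LF matrix; a
# hypothetical side-by-side call, assuming train_df, lfs, and labels are defined.
# Note that despite its name, probs_filtered from get_majority_vote_label holds
# hard majority-vote labels, while get_snorkel_labels returns LabelModel
# probabilities.
df_mv, preds_mv = get_majority_vote_label(train_df, lfs, labels)
df_lm, probs_lm = get_snorkel_labels(train_df, lfs, labels)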
def test_label_model_sparse(self) -> None:
    """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

    This tests the common setting where LFs abstain most of the time, which can
    cause issues for example if parameter clamping is set too high (e.g. see
    Issue #1422).
    """
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(
        self.n, self.m, self.cardinality, abstain_multiplier=1000.0)

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model.get_conditional_probs()
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels *only on non-abstained data points*
    Y_pred = label_model.predict(L, tie_break_policy="abstain")
    (idx,) = np.where(Y_pred != -1)
    acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
    self.assertGreaterEqual(acc, 0.65)

    # Make sure that we don't output abstain when an LF votes, per issue #1422
    self.assertEqual(len(idx), np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
def snorkel_process(keylist, dataframe, allweaklabf):
    def func(x):
        # Zero out everything except the largest entry, keeping only the top class
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)

    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)

    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)

    trainsent = train.sent.values
    trainlabel = train[keylist].values
    trainlabe2 = trainlabel.copy()
    # np.apply_along_axis returns a new array, so assign the result back rather
    # than relying on in-place mutation of the row slices
    trainlabe2 = np.apply_along_axis(func, 1, trainlabe2)
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)

    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = testlabel.copy()
    testlabe2 = np.apply_along_axis(func, 1, testlabe2)
    testlabe2 = np.where(testlabe2 > 0, 1, 0)

    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
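
# Hypothetical call: df only needs a `sent` text column, since the per-key
# probability columns that train[keylist] reads are added by the join inside the
# function; weak_lfs would be a list of @labeling_function heuristics. The key
# names are made up.
trainsent, trainlab, testsent, testlab, keys, report = snorkel_process(
    keylist=["sports", "politics"], dataframe=df, allweaklabf=weak_lfs)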
def curate_twitter(save_name='../../pandafied_data/curated_twitter.csv'):
    df_train = pd.read_csv('../../pandafied_data/pandafied_twitter.csv')
    #from utils import load_unlabeled_spam_dataset
    #df_train = load_unlabeled_spam_dataset()

    # Define the set of labeling functions (LFs)
    #lfs = [lf_keyword_wateroverlast, lf_keyword_voertuig, lf_keyword_aanrijding,
    #       lf_keyword_te_water, lf_keyword_persoon, lf_keyword_brand,
    #       lf_keyword_mps, lf_keyword_kps, lf_keyword_luchtdr]
    #lfs = [lf_keyword_keywords]
    lfs = [lf_keyword_wateroverlast]

    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(lfs)
    L_train = applier.apply(df_train)

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")
    #tie_break_policy="true-random"
    #tie_break_policy="abstain"

    counter = 0
    for i in range(len(df_train["label"])):
        if df_train["label"][i] == WATER:
            print()
            print(df_train["text"][i])
            print(df_train["label"][i])
            print()
            counter += 1

    print("num entries total: " + str(len(df_train["label"])))
    print("num entries water: " + str(counter))

    #df_train = df_train[df_train.label != ABSTAIN]
    twitter_curated = df_train[df_train.label == WATER]
    twitter_curated = twitter_curated.drop(columns='label')
    twitter_curated.to_csv(save_name, index=False)
def test_L_form(self):
    label_model = LabelModel(cardinality=2, verbose=False)

    L = np.array([[-1, 1, -1], [-1, 1, -1], [1, -1, -1], [-1, 1, -1]])
    label_model._set_constants(L)
    self.assertEqual(label_model.n, 4)
    self.assertEqual(label_model.m, 3)

    L = np.array([[-1, 0, 1], [-1, 0, 2], [0, -1, 2], [-1, 0, -1]])
    with self.assertRaisesRegex(ValueError, "L_train has cardinality"):
        label_model.fit(L, n_epochs=1)

    L = np.array([[0, 1], [1, 1], [0, 1]])
    with self.assertRaisesRegex(ValueError, "L_train should have at least 3"):
        label_model.fit(L, n_epochs=1)
def test_warmup(self):
    L = np.array([[0, -1, 0], [0, 1, 0]])
    label_model = LabelModel()

    lr_scheduler_config = {"warmup_steps": 3, "warmup_unit": "epochs"}
    label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
    self.assertEqual(label_model.warmup_steps, 3)

    lr_scheduler_config = {"warmup_percentage": 3 / 5}
    label_model.fit(L, lr_scheduler_config=lr_scheduler_config, n_epochs=5)
    self.assertEqual(label_model.warmup_steps, 3)

    with self.assertRaisesRegex(ValueError, "LabelModel does not support"):
        lr_scheduler_config = {"warmup_steps": 1, "warmup_unit": "batches"}
        label_model.fit(L, lr_scheduler_config=lr_scheduler_config)
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
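
# A hypothetical CLI entry point for the Dask pipeline above; the argument names
# simply mirror main()'s signature and are not part of the source.
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument("data_path")
    parser.add_argument("output_path")
    args = parser.parse_args()
    main(args.data_path, args.output_path)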