import operator
from collections import defaultdict

from sklearn.metrics import auc, roc_curve
from snorkel.labeling.model import LabelModel
from tqdm import tqdm_notebook


def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    models = defaultdict(dict)

    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        # Reset per LF sample so parameters from the previous sample don't leak
        hyper_grid_results = {}
        for param in regularization_grid:
            label_model = LabelModel(cardinality=2)
            label_model.fit(
                train_matrix[:, lf_sample[1]],
                n_epochs=1000,
                seed=100,
                lr=0.01,
                l2=param,
            )

            # Get marginals for each parameter
            hyper_grid_results[str(param)] = roc_curve(
                dev_labels,
                label_model.predict_proba(dev_matrix[:, lf_sample[1]])[:, 1])

        # Convert marginals into AUROCs
        hyper_grid_results = {
            param: auc(hyper_grid_results[param][0], hyper_grid_results[param][1])
            for param in hyper_grid_results
        }

        # Select the parameter with the highest AUROC
        best_param = float(
            max(hyper_grid_results.items(), key=operator.itemgetter(1))[0])

        # Re-fit the model
        label_model.fit(
            train_matrix[:, lf_sample[1]],
            n_epochs=1000,
            seed=100,
            lr=0.01,
            l2=best_param,
        )

        # Save marginals for output
        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(
            train_matrix[:, lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(
            dev_matrix[:, lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(
            test_matrix[:, lf_sample[1]])
        models[key] = label_model

    return train_grid_results, dev_grid_results, test_grid_results, models
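# --- Usage sketch (illustrative, not from the original notebook) ---
# `randomly_sampled_lfs` is assumed to be a sequence of LF column-index lists,
# since the function indexes matrices with lf_sample[1]. The .npy artifact
# names below are placeholders for whatever the surrounding notebook defines.
import numpy as np

L_train = np.load("L_train.npy")  # (n_train, n_lfs) label matrix, placeholder path
L_dev = np.load("L_dev.npy")
L_test = np.load("L_test.npy")
Y_dev = np.load("Y_dev.npy")

sampled_lfs = [
    sorted(np.random.choice(L_train.shape[1], size=5, replace=False))
    for _ in range(10)
]  # assumes at least 5 LF columns
reg_grid = [1e-3, 1e-2, 1e-1, 1.0]
train_res, dev_res, test_res, models = train_model_random_lfs(
    sampled_lfs, L_train, L_dev, Y_dev, L_test, reg_grid)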
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from snorkel.labeling import LFAnalysis, PandasLFApplier
from snorkel.labeling.model import LabelModel


def snorkel_process(keylist, dataframe, allweaklabf):
    # Keep only the largest entry in each probability row: (-x).argsort()[1:]
    # lists every index except the argmax, and those positions are zeroed.
    def func(x):
        idx = (-x).argsort()[1:]
        x[idx] = 0
        return x

    cardinalitynu = len(keylist)
    applier = PandasLFApplier(lfs=allweaklabf)
    all_train_l = applier.apply(df=dataframe)
    report = LFAnalysis(L=all_train_l, lfs=allweaklabf).lf_summary()
    print(report)

    label_model = LabelModel(cardinality=cardinalitynu, verbose=False)
    label_model.fit(all_train_l)
    predt = label_model.predict(all_train_l)
    predt1 = label_model.predict_proba(all_train_l)

    keylist1 = keylist.copy()
    #keylist1.append('Not_relevent')
    predt2 = pd.DataFrame(predt1, columns=keylist1)
    dataframe['L_label'] = predt
    dataframe1 = dataframe.join(predt2, how='outer')
    dataframe1 = dataframe1[dataframe1.L_label >= 0]

    train, test = train_test_split(dataframe1, test_size=0.2)
    trainsent = train.sent.values
    trainlabel = train[keylist].values
    # NOTE: apply_along_axis works on copies of the rows, so the result must be
    # reassigned; discarding it (as the original code did) leaves the array unchanged.
    trainlabe2 = np.apply_along_axis(func, 1, trainlabel.copy())
    trainlabe2 = np.where(trainlabe2 > 0, 1, 0)
    testsent = test.sent.values
    testlabel = test[keylist].values
    testlabe2 = np.apply_along_axis(func, 1, testlabel.copy())
    testlabe2 = np.where(testlabe2 > 0, 1, 0)
    return trainsent, trainlabe2, testsent, testlabe2, keylist, report
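# --- Quick demonstration of the masking trick used in `func` above ---
# (-x).argsort() orders indices from largest to smallest value, so slicing off
# the first index leaves everything except the argmax; zeroing those positions
# turns a probability row into a one-hot-style vector for the top class.
import numpy as np

x = np.array([0.2, 0.7, 0.1])
idx = (-x).argsort()[1:]   # -> indices [0, 2], i.e. everything but the argmax
x[idx] = 0
print(x)                   # [0.  0.7 0. ]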
def main(data_path, output_path): # Read data logging.info(f"Reading data from {data_path}") sc = SparkContext() sql = SQLContext(sc) data = sql.read.parquet(data_path) # Build label matrix logging.info("Applying LFs") lfs = [article_mentions_person, body_contains_fortune, person_in_db] applier = SparkLFApplier(lfs) L = applier.apply(data.rdd) # Train label model logging.info("Training label model") label_model = LabelModel(cardinality=2) label_model.fit(L) # Generate training labels logging.info("Generating probabilistic labels") y_prob = label_model.predict_proba(L)[:, 1] y_prob_sql_array = F.array([F.lit(y) for y in y_prob]) data_labeled = data.withColumn("y_prob", y_prob_sql_array) data_labeled.write.mode("overwrite").parquet(output_path) logging.info(f"Labels saved to {output_path}")
def predict_documents(documents: pd.DataFrame, trigger_label_model: LabelModel,
                      role_label_model: LabelModel):
    if 'event_triggers' not in documents and 'event_roles' not in documents:
        documents = documents.apply(pipeline.add_default_events, axis=1)

    # 1. Get trigger probabilities
    df_predict_triggers, _ = pipeline.build_event_trigger_examples(documents)
    trigger_lf_applier = PandasLFApplier(pipeline.get_trigger_list_lfs())
    L_predict_triggers = trigger_lf_applier.apply(df_predict_triggers)
    event_trigger_probs = trigger_label_model.predict_proba(L_predict_triggers)
    merged_event_trigger_examples = pipeline.merge_event_trigger_examples(
        df_predict_triggers,
        utils.zero_out_abstains(event_trigger_probs, L_predict_triggers))

    # 2. Get role probabilities
    df_predict_roles, _ = pipeline.build_event_role_examples(documents)
    role_lf_applier = PandasLFApplier(pipeline.get_role_list_lfs())
    L_predict_roles = role_lf_applier.apply(df_predict_roles)
    event_roles_probs = role_label_model.predict_proba(L_predict_roles)
    merged_event_role_examples = pipeline.merge_event_role_examples(
        df_predict_roles,
        utils.zero_out_abstains(event_roles_probs, L_predict_roles))

    # 3. Update documents with trigger & role probabilities
    labeled_documents: pd.DataFrame = documents.copy()

    # Make sure to remove event_triggers and roles that were built per default.
    # NOTE: iterrows() yields copies, so assign via .at to actually clear the
    # columns (assigning to the yielded row would silently do nothing).
    for idx, row in labeled_documents.iterrows():
        labeled_documents.at[idx, 'event_triggers'] = []
        labeled_documents.at[idx, 'event_roles'] = []

    if 'id' in labeled_documents:
        labeled_documents.set_index('id', inplace=True)

    triggers = merged_event_trigger_examples[['event_triggers']]
    roles = merged_event_role_examples[['event_roles']]
    labeled_documents.update(triggers)
    labeled_documents.update(roles)
    labeled_documents.reset_index(level=0, inplace=True)

    # 4. Add ACE events
    labeled_documents = ace_formatter.snorkel_to_ace_format(labeled_documents)
    return labeled_documents
from snorkel.labeling import (PandasLFApplier, filter_unlabeled_dataframe,
                              labeling_function)
from snorkel.labeling.model import LabelModel


def get_snorkel_labels(train_df, lfs, labels):
    applier = PandasLFApplier(
        [labeling_function(name=lf.__name__)(lf) for lf in lfs])
    label_model = LabelModel(cardinality=len(labels), verbose=True)
    L_train = applier.apply(df=train_df)
    label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)
    L_probs = label_model.predict_proba(L=L_train)
    df_filtered, probs_filtered = filter_unlabeled_dataframe(X=train_df,
                                                             y=L_probs,
                                                             L=L_train)
    return df_filtered, probs_filtered
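# --- Usage sketch (illustrative) ---
# The wrapper above promotes plain Python functions to Snorkel LFs via
# labeling_function(name=...), so call sites stay decorator-free. The column
# name `text` and the keyword rules below are invented for illustration.
import pandas as pd

ABSTAIN, HAM, SPAM = -1, 0, 1

def lf_contains_offer(x):
    return SPAM if "offer" in x.text.lower() else ABSTAIN

def lf_short_message(x):
    return HAM if len(x.text.split()) < 4 else ABSTAIN

train_df = pd.DataFrame(
    {"text": ["limited offer now", "ok thanks", "free offer click here"]})
df_filtered, probs_filtered = get_snorkel_labels(
    train_df, [lf_contains_offer, lf_short_message], labels=[HAM, SPAM])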
def test_label_model(self) -> None:
    """Test the LabelModel's estimate of P and Y."""
    np.random.seed(123)
    P, Y, L = generate_simple_label_matrix(self.n, self.m, self.cardinality)

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L, n_epochs=200, lr=0.01, seed=123)

    # Test estimated LF conditional probabilities
    P_lm = label_model._get_conditional_probs().reshape(
        (self.m, self.cardinality + 1, -1))
    np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

    # Test predicted labels
    Y_lm = label_model.predict_proba(L).argmax(axis=1)
    err = np.where(Y != Y_lm, 1, 0).sum() / self.n
    self.assertLess(err, 0.1)
def test_labeling_convergence(self) -> None:
    """Test convergence of end to end labeling pipeline."""
    # Apply LFs
    labeling_functions = (
        [f]
        + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
        + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
    )
    applier = PandasLFApplier(labeling_functions)
    L_train = applier.apply(self.df_train, progress_bar=False)
    self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

    # Train LabelModel
    label_model = LabelModel(cardinality=self.cardinality, verbose=False)
    label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
    Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
    Y = self.df_train.y
    err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
    self.assertLess(err, 0.05)
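# The divisor-based LF factories (and the plain LF `f`) used above are defined
# elsewhere in the test module. A plausible sketch of their shape follows; the
# label values, the `x.num` attribute, and the exact predicates are assumptions
# for illustration, not the suite's actual definitions.
from snorkel.labeling import LabelingFunction


def get_positive_labeling_function(divisor: int) -> LabelingFunction:
    # Vote positive (1) on multiples of `divisor`, abstain (-1) otherwise.
    return LabelingFunction(
        f"lf_pos_multiple_of_{divisor}",
        lambda x, d=divisor: 1 if x.num % d == 0 else -1,
    )


def get_negative_labeling_function(divisor: int) -> LabelingFunction:
    # Vote negative (0) on non-multiples of `divisor`, abstain otherwise.
    return LabelingFunction(
        f"lf_neg_multiple_of_{divisor}",
        lambda x, d=divisor: 0 if x.num % d != 0 else -1,
    )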
def main(data_path, output_path): # Read data logging.info(f"Reading data from {data_path}") data = dd.read_parquet(data_path) data = data.repartition(npartitions=2) # Build label matrix logging.info("Applying LFs") lfs = [article_mentions_person, body_contains_fortune, person_in_db] applier = DaskLFApplier(lfs) L = applier.apply(data) # Train label model logging.info("Training label model") label_model = LabelModel(cardinality=2) label_model.fit(L) # Generate training labels logging.info("Generating probabilistic labels") y_prob = label_model.predict_proba(L)[:, 1] data = data.reset_index().set_index("index") data_labeled = data.assign(y_prob=dd.from_array(y_prob)) dd.to_parquet(data_labeled, output_path) logging.info(f"Labels saved to {output_path}")
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4
    max_docs = 12
    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )
    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())
    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt],
        throttlers=[temp_throttler, volt_throttler])

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name ==
            "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259

    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {
            "verbose": False
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.0
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2

    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7
def train_f_on_d_U(self, datafeeder, num_epochs, loss_type):
    sess = self.hls.sess
    total_batch = datafeeder.get_batches_per_epoch(f_d_U)
    batch_size = datafeeder.get_batch_size(f_d_U)

    if loss_type == 'pure-likelihood':
        train_op = self.hls.f_d_U_pure_likelihood_op
        loss_op = self.hls.f_d_U_pure_likelihood_loss
    elif loss_type == 'implication':
        train_op = self.hls.f_d_U_implication_op
        loss_op = self.hls.f_d_U_implication_loss
    elif loss_type == 'pr_loss':
        train_op = self.hls.pr_train_op
        loss_op = self.hls.pr_loss
    elif loss_type == 'gcross':
        train_op = self.hls.gcross_train_op
        loss_op = self.hls.gcross_loss
    elif loss_type == 'gcross_snorkel':
        train_op = self.hls.snork_gcross_train_op
        loss_op = self.hls.snork_gcross_loss
    elif loss_type == 'learn2reweight':
        train_op = self.hls.l2r_train_op
        loss_op = self.hls.l2r_loss
    elif loss_type == 'label_snorkel':
        train_op = self.hls.label_snorkel_train_op
        loss_op = self.hls.label_snorkel_loss
    elif loss_type == 'pure_snorkel':
        train_op = self.hls.pure_snorkel_train_op
        loss_op = self.hls.pure_snorkel_loss
    else:
        raise ValueError('Invalid loss type %s' % loss_type)

    best_saver_f_d_U = self.hls.best_savers.get_best_saver(f_d_U)
    metrics_dict = {}  # {'config': self.config}

    if self.config.mode in ('label_snorkel', 'pure_snorkel', 'gcross_snorkel'):
        label_model = LabelModel(cardinality=self.hls.num_classes,
                                 verbose=True)
        if os.path.isfile(
                os.path.join(self.config.data_dir, "saved_label_model")):
            label_model = label_model.load(
                os.path.join(self.config.data_dir, "saved_label_model"))
        else:
            print("LABEL MODEL NOT SAVED")
            exit()

    if 'gcross' in self.config.mode or 'learn2reweight' in self.config.mode:
        majority_model = MajorityLabelVoter(cardinality=self.hls.num_classes)

    with sess.as_default():
        print("Optimization started for f_d_U with %s loss!" % loss_type)
        print("Batch size: %d!" % batch_size)
        print("Batches per epoch : %d!" % total_batch)
        print("Number of epochs: %d!" % num_epochs)

        # Training cycle
        iteration = 0
        global_step = 0
        patience = 0
        for epoch in range(num_epochs):
            avg_epoch_cost = 0.
            for i in range(total_batch):
                batch_x, batch_l, batch_m, batch_L, batch_d, batch_r = \
                    datafeeder.get_f_d_U_next_batch()
                feed_dict = {
                    self.hls.f_d_U_adam_lr: self.config.f_d_U_adam_lr,
                    self.hls.f_d_U_x: batch_x,
                    self.hls.f_d_U_l: batch_l,
                    self.hls.f_d_U_m: batch_m,
                    self.hls.f_d_U_L: batch_L,
                    self.hls.f_d_U_d: batch_d,
                    self.hls.f_d_U_r: batch_r
                }
                batch_lsnork = conv_l_to_lsnork(batch_l, batch_m)

                if self.config.mode in ('label_snorkel', 'pure_snorkel',
                                        'gcross_snorkel'):
                    batch_snork_L = label_model.predict_proba(
                        L=batch_lsnork)  # snorkel probs
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                if self.config.mode in ('gcross', 'learn2reweight'):
                    batch_snork_L = majority_model.predict(
                        L=batch_lsnork)  # majority votes
                    batch_snork_L = np.eye(
                        self.hls.num_classes)[batch_snork_L]  # one-hot rep
                    feed_dict[self.hls.f_d_U_snork_L] = batch_snork_L

                merge_dict_a_into_b(self.hls.dropout_train_dict, feed_dict)
                # Run optimization op (backprop) and cost op (to get loss value)
                _, cost, num_d, f_d_U_global_step = sess.run(
                    [
                        train_op, loss_op, self.hls.f_d_U_num_d,
                        self.hls.f_d_U_global_step
                    ],
                    feed_dict=feed_dict)
                global_epoch = f_d_U_global_step / total_batch

                # This assertion is valid only if true U labels are available
                # but not being used, such as for synthetic data.
                assert np.all(batch_L <= self.hls.num_classes)

                avg_epoch_cost += cost / total_batch
                cost1 = (avg_epoch_cost * total_batch) / (i + 1)
                global_step += 1

            # Compute and report metrics, update checkpoints after each epoch
            print("\n========== epoch : {} ============\n".format(epoch))
            print("cost: {}\n".format(cost1))
            print("patience: {}\n".format(patience))

            precision, recall, f1_score, support = self.hls.test.test_f(
                datafeeder)
            self.compute_f_d_metrics(metrics_dict, precision, recall,
                                     f1_score, support, global_epoch,
                                     f_d_U_global_step)
            print("\nmetrics_dict: ", metrics_dict)
            print()
            self.report_f_d_perfs_to_tensorboard(cost1, metrics_dict,
                                                 global_step)

            did_improve = self.maybe_save_metrics_dict(f_d_U, metrics_dict)
            if did_improve:
                patience = 0  # reset patience if primary metric improved
            else:
                patience += 1
                if patience > self.config.early_stopping_p:
                    print("bye! stopping early!......")
                    break

            # Save checkpoint
            print()
            self.hls.mru_saver.save(global_step)
            print()
            best_saver_f_d_U.save_if_best(
                metrics_dict[self.config.f_d_primary_metric])
            print()
            global_step += 1
        print("Optimization Finished for f_d_U!")
Y_data = df.bm25_relevant.values
print(df.shape)

lfs = [
    lf.has_type_diap_medd_or_bhvr, lf.is_doctor_reply, lf.has_votes,
    lf.enity_overlap_jacc, lf.same_author, lf.number_relations_total,
    lf.entity_types
]

applier = PandasLFApplier(lfs)
L_data = applier.apply(df=df)

label_model = LabelModel(cardinality=2, verbose=True)
label_model.load("trained_model_ehf.lbm")

valid_probabilities = label_model.predict_proba(L=L_data)

if 'predicted_prob' in df:
    del df['predicted_prob']
df['predicted_prob'] = valid_probabilities[:, 1]

PROBABILITY_CUTOFF = 0.5
df['predicted_label'] = df['predicted_prob'] >= PROBABILITY_CUTOFF

df_out = df[df['predicted_label'] == int(RELEVANT)][[
    'query_id', 'document_id'
]]

with open(qrels_path, 'a+', encoding='utf8') as output_file:
    for index, row in df_out.iterrows():
        # Original line was truncated here; the trailing relevance column
        # is completed below on the assumption of standard qrels format.
        output_file.write(
            str(row['query_id']) + '\t0\t' + str(row['document_id']) +
            '\t1\n')
def main(train_path, output_dir, label_dir):
    # Get all data
    df = pd.read_csv(train_path)

    # Get human labels
    human_labels = read_human_labels(label_dir)

    # df_test and lab_test: the set of all human-labeled notes, and their labels
    df_test = df.merge(human_labels, on=['record_number'])
    lab_test = df_test.human_label
    del df_test['human_label']

    # df_train: formed by removing all patients from df with a human-labeled note
    df_train = df.merge(df_test.mr, indicator=True, how='left', on=['mr'])
    df_train = df_train.query('_merge=="left_only"').drop('_merge', axis=1)

    # Generate label matrix
    L_train = PandasLFApplier(lfs=lfs).apply(df=df_train)
    L_test = PandasLFApplier(lfs=lfs).apply(df=df_test)

    # Summarize LFs
    output_train = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
    #print(output_train)
    output_test = LFAnalysis(L=L_test, lfs=lfs).lf_summary(Y=lab_test.values)
    #print(output_test)

    # Save LF analysis
    path = os.path.join(output_dir, 'LF_analysis_train.csv')
    output_train.to_csv(path, index=True)
    path = os.path.join(output_dir, 'LF_analysis_test.csv')
    output_test.to_csv(path, index=True)

    # Create label model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123,
                    class_balance=[0.3, 0.7])

    # Evaluate the label model using the labeled test set
    for metric in ['recall', 'precision', 'f1', 'accuracy']:
        label_model_acc = label_model.score(
            L=L_test, Y=lab_test, metrics=[metric],
            tie_break_policy="random")[metric]
        print("%-15s %.2f%%" % (metric + ":", label_model_acc * 100))
    null_f1 = f1_score(lab_test.values, np.ones((df_test.shape[0],)))
    print("%-15s %.2f%%" % ("null f1:", null_f1 * 100))
    print("%-15s %.2f%%" %
          ("null accuracy:",
           np.maximum(1 - np.mean(lab_test), np.mean(lab_test)) * 100))

    # Save error analysis
    preds = label_model.predict_proba(L_test)
    error_analysis(df_test, L_test, lfs, preds[:, 1], lab_test, output_dir)

    # Get labels on train
    probs_train = label_model.predict_proba(L_train)

    # Filter out unlabeled data points
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)

    # Save filtered training set
    df_train_filtered['prob'] = probs_train_filtered[:, 1]
    path = os.path.join(output_dir, 'df_train_filtered.csv')
    df_train_filtered.to_csv(path, index=False)

    # Save label probs
    path = os.path.join(output_dir, 'probs_train_filtered')
    np.save(path, probs_train_filtered[:, 1])

    # Save test data set and labels
    assert len(df_test) == len(lab_test)
    df_test['human_label'] = lab_test
    path = os.path.join(output_dir, 'df_test.csv')
    df_test.to_csv(path, index=False)
    path = os.path.join(output_dir, 'lab_test')
    np.save(path, lab_test)
def label_user(inp_path, prefix=""): df_train = pd.read_pickle(inp_path) ########## threshold on word similarity take_first = 100 overall_first = 10000 global thresh_by_value, overall_thresh df_train['root_value'] = df_train['value'].swifter.set_dask_threshold( dask_threshold=0.001).allow_dask_on_strings().apply( lambda x: syn_to_hob[x]) thresh_by_value = df_train.groupby( ["root_value"]).apply(lambda x: np.partition( x['lexicon_counts'], max(len(x['lexicon_counts']) - take_first, 0) )[max(len(x['lexicon_counts']) - take_first, 0)]).to_dict() overall_thresh = np.partition(df_train["lexicon_counts"].to_numpy(), max(len(df_train) - overall_first, 0))[max( len(df_train) - overall_first, 0)] print(overall_thresh) ############################# # separately loose - strict, pos - neg, period - without names_pool = [ "context:2_count_pos", "context:3_count_pos", "context:100_count_pos", "context:2_period_count_pos", "context:3_period_count_pos", "context:100_period_count_pos", "context:2_count_neg", "context:3_count_neg", "context:100_count_neg", "context:2_period_count_neg", "context:3_period_count_neg", "context:100_period_count_neg" ] for f_name in names_pool: curr_cols = [x for x in df_train.columns if f_name in x] df_train['total_' + f_name] = df_train[curr_cols].swifter.apply(sum, axis=1) df_train = df_train.drop(curr_cols, axis=1) for p in ["pos", "neg"]: df_train["new_total_context:100_count_" + p] = df_train[[ "total_context:100_count_" + p, "total_context:3_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:100_count_" + p] - x["total_context:3_count_" + p]), axis=1) df_train["new_total_context:3_count_" + p] = df_train[[ "total_context:3_count_" + p, "total_context:2_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:3_count_" + p] - x["total_context:2_count_" + p ]), axis=1) df_train["new_total_context:100_period_count_" + p] = df_train[[ "total_context:3_period_count_" + p, "total_context:100_period_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:100_period_count_" + p] - x[ "total_context:3_period_count_" + p]), axis=1) df_train["new_total_context:3_period_count_" + p] = df_train[[ "total_context:3_period_count_" + p, "total_context:2_period_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:3_period_count_" + p] - x[ "total_context:2_period_count_" + p]), axis=1) df_train["new_total_context:2_count_" + p] = df_train[[ "total_context:100_period_count_" + p, "total_context:2_count_" + p ]].swifter.apply(lambda x: max( 0, x["total_context:2_count_" + p] - x[ "total_context:100_period_count_" + p]), axis=1) df_train = df_train.drop( ["total_" + x for x in names_pool if "2_period_count" not in x], axis=1) lfs = [val_in_name, positive_lexicon_overall, positive_lexicon_pervalue] num_of_thesholds = 3 step = 100 // num_of_thesholds for col in df_train: if col not in ["author", "value", "idd", "root_value"]: if col not in [ "pos_prob_mean", "neg_prob_mean", "num_good_posts" ]: # , "lexicon_counts", "subreddit_counts", "name_in_subr_count"]: thresholds = [0] if "lexicon" in col and "unique" not in col: continue if True: # col in ["lexicon_counts", "unique_lexicon_counts"]: vals = df_train[col].to_numpy() thresholds = np.percentile( vals, list(range(0 + step, 99 + step, step))).astype(int) thresholds = sorted(list(set(thresholds))) if len(thresholds) > 1: thresholds = thresholds[:-1] if "lexicon" in col: thresholds = [3] # max_val = max(vals) # thresholds = list(range(0, int(max_val), int(max_val/5) + 1)) # elif col == "pos_prob_mean": # 
thresholds = [0.5 + 0.1 * x for x in range(5)] for i in range(len(thresholds)): thresh = thresholds[i] next_threshold = sys.maxsize if i == len( thresholds) - 1 else thresholds[i + 1] previous_threshold = -sys.maxsize if i == 0 else thresholds[ i - 1] if "lexicon_counts" not in col: lfs.append( make_thresold_lf(thresh=thresh, col_name=col, next_threshold=next_threshold)) else: lfs.append( make_lexicon_lf( thresh=thresh, pref=col, previous_threshold=previous_threshold)) num_annotators = 0 if num_annotators > 0: for i in range(1, num_annotators + 1): lfs.append(make_annotator_lf(worker_index=i)) lfs = [ x for x in lfs if any(y in str(x) for y in ["less", "context:2", "worker", "lexicon"]) ] print("created lfs their number", len(lfs)) print("\n".join(str(x) for x in lfs)) #### validation ##### do_val = False if do_val: df_golden = pd.read_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_dev.csv" ) name_val = list(df_golden["auth_val"]) # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) df_train["auth_val"] = df_train[["author", "value"]].swifter.apply( lambda x: x["author"] + "+++" + x["value"], axis=1) df_val = df_train[df_train.auth_val.isin(name_val)] df_dev = df_train[~df_train.auth_val.isin(name_val)] print("Number val", df_val.shape) print("Number dev", df_dev.shape) df_val = df_val.merge(df_golden, on="auth_val") y_val = np.array(df_val["final"]) df_val = df_val.drop(labels="final", axis=1) # create test set as well with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_val = applier.apply(df=df_val, n_parallel=num_cpu) L_dev = applier.apply(df=df_dev, n_parallel=num_cpu) dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary() analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(y_val) analysis.to_csv("/home/tigunova/val_analysis.csv") dev_analysis.to_csv("/home/tigunova/dev_analysis.csv") print(analysis) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_dev) #, Y_dev=y_val) model_stat = label_model.score(L=L_val, Y=y_val) print(model_stat) exit(0) ########### #### picking threshold ##### do_threshold = False if do_threshold: df_golden = pd.read_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/gold_validation.csv" ) name_val = list(df_golden["auth_val"]) # df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) df_train["auth_val"] = df_train[["author", "value"]].swifter.apply( lambda x: x["author"] + "+++" + x["value"], axis=1) df_val = df_train[df_train.auth_val.isin(name_val)] df_dev = df_train[~df_train.auth_val.isin(name_val)] pop_size = df_dev.shape[0] print("Number val", df_val.shape) print("Number dev", df_dev.shape) applier = PandasParallelLFApplier(lfs=lfs) df_val = df_val.merge(df_golden, on="auth_val") L_val = applier.apply(df=df_val, n_parallel=num_cpu) val_thresholds = [0.01 * x for x in range(100)] label_model = LabelModel(cardinality=2, verbose=True) with TQDMDaskProgressBar(desc="Dask Apply"): L_dev = applier.apply(df=df_dev, n_parallel=num_cpu) label_model.fit(L_dev, class_balance=[0.5, 0.5]) # , Y_dev=y_val) wghts = label_model.get_weights() print("\n".join(str(x) for x in zip(lfs, wghts))) probs_val = label_model.predict_proba(L=L_val) probs_df = pd.DataFrame(probs_val, columns=["neg_prob", "pos_prob"]) df_val = pd.concat([df_val.reset_index(), probs_df], axis=1) probs_dev = label_model.predict_proba(L=L_dev) probs_df = pd.DataFrame(probs_dev, columns=["neg_prob", "pos_prob"]) df_dev = 
pd.concat([df_dev.reset_index(), probs_df], axis=1) y_true = np.array(df_val["final"]) for th in val_thresholds: y_pred = np.array( df_val["pos_prob"].apply(lambda x: 1 if x > th else 0)) #print("true negatives") #print(df_val[df_val["final"] == 1][df_val["pos_prob"] <= th][["auth_val", "text"]]) prec = precision_score(y_true, y_pred) pred_labels = y_pred true_labels = y_true # True Positive (TP): we predict a label of 1 (positive), and the true label is 1. TP = np.sum(np.logical_and(pred_labels == 1, true_labels == 1)) # True Negative (TN): we predict a label of 0 (negative), and the true label is 0. TN = np.sum(np.logical_and(pred_labels == 0, true_labels == 0)) # False Positive (FP): we predict a label of 1 (positive), but the true label is 0. FP = np.sum(np.logical_and(pred_labels == 1, true_labels == 0)) # False Negative (FN): we predict a label of 0 (negative), but the true label is 1. FN = np.sum(np.logical_and(pred_labels == 0, true_labels == 1)) print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP, FP, TN, FN)) # print(list(zip(label_model.predict(L=L_val_curr), y_val_curr))) # print("******************************") print("threshold %s, proportion population %.4f, precision %s" % (str(th), df_dev[df_dev["pos_prob"] > th].shape[0] / pop_size, str(prec))) exit(0) ########### with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_train = applier.apply(df=df_train, n_parallel=num_cpu) analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary() print(analysis) df_l_train = pd.DataFrame( L_train, columns=["llf_" + str(x).split(",")[0] for x in lfs]) print(df_train.shape) print(df_l_train.shape) df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1) print(df_train.shape) print("********************************************") t4 = time.time() label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123, class_balance=[0.3, 0.7]) probs_train = label_model.predict_proba(L=L_train) print("labeling model work ", (time.time() - t4) / 60) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=df_train, y=probs_train, L=L_train) probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"]) print(df_train_filtered.shape) print(probs_df.shape) result_filtered = pd.concat([ df_train_filtered[['author', 'value', 'idd']].reset_index(), probs_df ], axis=1) print(result_filtered.shape) print("****************************************************") result_filtered.to_csv("/home/tigunova/some_result_" + prefix + ".csv") print(df_train_filtered.shape) print(probs_df.shape) df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1) df_train_filtered = df_train_filtered.drop(["index"], axis=1) print(df_train_filtered.shape) df_train_filtered.to_pickle( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" + prefix + ".pkl") df_train_filtered.to_csv( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/user_" + prefix + ".csv") # df_train.iloc[L_train[:, 1] == POS].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/user_" + prefix + ".csv") ### write dict output_threshold = 0.63 output_dict = defaultdict(list) auth_hobby_dict = defaultdict(list) for index, row in result_filtered.iterrows(): if row.value == row.value and row.author == row.author: auth_hobby_dict[row.author].append([row.value, row.pos_prob]) allowed_labels = [] for index, row in df_train_filtered.iterrows(): if row.value == row.value and 
row.author == row.author: if row.pos_prob > output_threshold: output_dict[row.author].append([row.value] + row.idd + [row.pos_prob]) allowed_labels.append(syn_to_hob[row.value]) print("\n".join([ str(y) for y in sorted(dict(Counter(allowed_labels)).items(), key=lambda x: x[1]) ])) print( "After cropping", sum([ x if x < 500 else 500 for x in dict(Counter(allowed_labels)).values() ])) print("users in total", len(output_dict)) for auth, stuffs in output_dict.items(): prof = ":::".join(set([x[0] for x in stuffs])) prob = ":::".join([str(x[-1]) for x in stuffs]) msgs = set([x for l in stuffs for x in l[1:-1]]) output_dict[auth] = [prof] + list(msgs) + [prob] with open( "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/final_author_dict_" + prefix + ".txt", "w") as f_out: f_out.write(repr(dict(auth_hobby_dict))) with open("/home/tigunova/users_profession1.txt", "w") as f_out: f_out.write(repr(dict(output_dict)))
def label_post(inp_path, prefix = ""): #lfs = [job_inpost, check_subreddit, check_iama] lfs = [job_inpost, check_iama] context_lens = [100, 3, 2] for with_per in [True, False]: for clen in context_lens: for kw in patterns: lfs.append(make_keyword_lf(keyword=kw, context_len=clen, with_period=with_per)) print("created lfs, their count", len(lfs)) df_train = pd.read_pickle(inp_path) df_train['texts'] = df_train['text'].swifter.apply(lambda x: [y.lower() for y in tokenize.sent_tokenize(x)]) df_train['root_value'] = df_train['value'].swifter.apply(lambda x: syn_to_hob[x]) #df_train['containing_sentences'] = df_train[['texts', 'value']].swifter.apply(lambda y: find_val(y['texts'], y['value']), axis=1) print("loaded dataset") t1 = time.time() with TQDMDaskProgressBar(desc="Dask Apply"): applier = PandasParallelLFApplier(lfs=lfs) L_train = applier.apply(df=df_train, n_parallel=num_cpu) print("time mins ", (time.time() - t1) / 60) print(LFAnalysis(L=L_train, lfs=lfs).lf_summary()) df_l_train = pd.DataFrame(L_train, columns=[str(x).split(",")[0] for x in lfs]) print(df_train.shape) print(df_l_train.shape) df_train = pd.concat([df_train.reset_index(), df_l_train], axis=1) print(df_train.shape) print("*************************************************") df_train = df_train.drop(["index"], axis=1) label_model = LabelModel(cardinality=2, verbose=True) label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123) probs_train = label_model.predict_proba(L=L_train) df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe( X=df_train, y=probs_train, L=L_train ) print("the length of unfiltered posts", len(set(df_train['author'] + "+++++" + df_train['value']))) print("the length of filtered posts", len(set(df_train_filtered['author'] + "+++++" + df_train_filtered['value']))) probs_df = pd.DataFrame(probs_train_filtered, columns=["neg_prob", "pos_prob"]) print(df_train_filtered.shape) print(probs_df.shape) df_train_filtered = pd.concat([df_train_filtered.reset_index(), probs_df], axis=1) print(df_train_filtered.shape) df_train_filtered.to_pickle("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".pkl") df_train_filtered.to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/train_post_" + prefix + ".csv") #df_train.iloc[L_train[:, 1] != ABSTAIN].to_csv("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/intr_train_post_tmp.csv") verbose = True if verbose: for i in range(len(lfs)): ppath = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/interesting_datasets/" + str(lfs[i]).split(",")[0] + ".csv" df_train.iloc[L_train[:, i] != ABSTAIN].to_csv(ppath) auth_hobby_dict = defaultdict(set) for index, row in df_train.iterrows(): if row.value == row.value and row.author == row.author: auth_hobby_dict[row.author].add(row.value) with open("/home/tigunova/PycharmProjects/snorkel_labels/data/profession/sources/author_profession_dict_" + prefix + ".txt", "w") as f_out: f_out.write(repr(dict(auth_hobby_dict)))
class Modeler:
    def __init__(self, df_train, df_dev, df_valid, df_test, df_heldout,
                 lfs={}, label_model=None):
        df_train["seen"] = 0
        self.df_train = df_train.reset_index()
        self.df_dev = df_dev
        self.df_valid = df_valid
        self.df_test = df_test
        self.df_heldout = df_heldout

        #self.Y_train = df_train.label.values
        self.Y_dev = df_dev.label.values
        self.Y_valid = df_valid.label.values
        self.Y_test = df_test.label.values
        self.Y_heldout = df_heldout.label.values

        self.lfs = lfs
        self.L_train = None
        self.L_dev = None
        self.L_valid = None
        self.L_heldout = None

        cardinality = len(df_valid.label.unique())

        # for DEMOing purposes
        self.first_text_indices = [
            1262,  # "check out" "youtube"
            1892,  # I love
            1117,  # url concept
            1706,  # emoji concept
            952,   # "nice"
            971,   # positive concept
            958,   # actually use emoji concept
        ]

        self.count = 0

        if label_model is None:
            self.label_model = LabelModel(cardinality=cardinality,
                                          verbose=True)
        else:
            self.label_model = label_model

        self.vectorizer = CountVectorizer(ngram_range=(1, 2))
        self.vectorizer.fit(df_train.text.tolist())

    def get_lfs(self):
        return list(self.lfs.values())

    def add_lfs(self, new_lfs: dict):
        self.lfs.update(new_lfs)

    def remove_lfs(self, old_lf_ids: list):
        for lf_id in old_lf_ids:
            del self.lfs[lf_id]
        return len(self.lfs)

    def apply_lfs(self):
        applier = PandasLFApplier(lfs=self.get_lfs())
        self.L_train = applier.apply(df=self.df_train)
        self.L_dev = applier.apply(df=self.df_dev)
        self.L_heldout = applier.apply(df=self.df_heldout)
        #self.L_valid = applier.apply(df=self.df_valid)

    def find_duplicate_signature(self):
        label_matrix = np.vstack([self.L_train, self.L_dev])
        seen_signatures = {}
        dupes = {}
        lfs = self.get_lfs()
        signatures = [
            hash(label_matrix[:, i].tostring()) for i in range(len(lfs))
        ]
        for i, s in enumerate(signatures):
            lf = lfs[i]
            if s in seen_signatures:
                dupes[lf.name] = seen_signatures[s]
            else:
                seen_signatures[s] = lf.name
        return dupes

    def lf_examples(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        L_train = applier.apply(df=self.df_train)
        labeled_examples = self.df_train[L_train != -1]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def lf_mistakes(self, lf_id, n=5):
        lf = self.lfs[lf_id]
        applier = PandasLFApplier(lfs=[lf])
        L_dev = applier.apply(df=self.df_dev).squeeze()
        labeled_examples = self.df_dev[(L_dev != -1)
                                       & (L_dev != self.df_dev["label"])]
        samples = labeled_examples.sample(min(n, len(labeled_examples)),
                                          random_state=13)
        return [{"text": t} for t in samples["text"].values]

    def fit_label_model(self):
        assert self.L_train is not None
        self.label_model.fit(L_train=self.L_train, n_epochs=1000, lr=0.001,
                             log_freq=100, seed=123)

    def analyze_lfs(self):
        if len(self.lfs) > 0:
            df = LFAnalysis(L=self.L_train, lfs=self.get_lfs()).lf_summary()
            dev_df = LFAnalysis(L=self.L_dev,
                                lfs=self.get_lfs()).lf_summary(Y=self.Y_dev)
            df = df.merge(dev_df,
                          how="outer",
                          suffixes=(" Training", " Dev."),
                          left_index=True,
                          right_index=True)
            df["Weight"] = self.label_model.get_weights()
            df["Duplicate"] = None
            for dupe, OG in self.find_duplicate_signature().items():
                print("Duplicate labeling signature detected")
                print(dupe, OG)
                df.at[dupe, "Duplicate"] = OG
            return df
        return None

    def get_label_model_stats(self):
        result = self.label_model.score(
            L=self.L_dev, Y=self.Y_dev,
            metrics=["f1", "precision", "recall"])
        probs_train = self.label_model.predict_proba(L=self.L_train)
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)
        result["training_label_coverage"] = len(probs_train_filtered) / len(
            probs_train)
        # Guard the empty case before dividing (the original divided first,
        # which would raise ZeroDivisionError when nothing is covered)
        if len(probs_train_filtered) == 0:
            result["class_0_ratio"] = 0
        else:
            result["class_0_ratio"] = (
                probs_train_filtered[:, 0] > 0.5).sum() / len(
                    probs_train_filtered)
        return result

    def get_heldout_stats(self):
        if self.L_heldout is not None:
            return self.label_model.score(
                L=self.L_heldout, Y=self.Y_heldout,
                metrics=["f1", "precision", "recall"])
        return {}

    def train(self):
        probs_train = self.label_model.predict_proba(L=self.L_train)
        df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=self.df_train, y=probs_train, L=self.L_train)
        if len(df_train_filtered) == 0:
            print("Labeling functions cover none of the training examples!",
                  file=sys.stderr)
            return {"micro_f1": 0}

        #from tensorflow.keras.utils import to_categorical
        #df_train_filtered, probs_train_filtered = self.df_dev, to_categorical(self.df_dev["label"].values)

        vectorizer = self.vectorizer
        X_train = vectorizer.transform(df_train_filtered.text.tolist())
        X_dev = vectorizer.transform(self.df_dev.text.tolist())
        X_valid = vectorizer.transform(self.df_valid.text.tolist())
        X_test = vectorizer.transform(self.df_test.text.tolist())

        self.keras_model = get_keras_logreg(input_dim=X_train.shape[1])
        self.keras_model.fit(
            x=X_train,
            y=probs_train_filtered,
            validation_data=(X_valid, preds_to_probs(self.Y_valid, 2)),
            callbacks=[get_keras_early_stopping()],
            epochs=20,
            verbose=0,
        )
        preds_test = self.keras_model.predict(x=X_test).argmax(axis=1)
        #return preds_test
        return self.get_stats(self.Y_test, preds_test)

    def get_heldout_lr_stats(self):
        X_heldout = self.vectorizer.transform(self.df_heldout.text.tolist())
        preds_test = self.keras_model.predict(x=X_heldout).argmax(axis=1)
        return self.get_stats(self.Y_heldout, preds_test)

    def get_stats(self, Y_test, preds_test):
        label_classes = np.unique(self.Y_test)
        accuracy = metrics.accuracy_score(Y_test, preds_test)
        precision_0, precision_1 = metrics.precision_score(
            Y_test, preds_test, labels=label_classes, average=None)
        recall_0, recall_1 = metrics.recall_score(Y_test,
                                                  preds_test,
                                                  labels=label_classes,
                                                  average=None)
        test_f1 = metrics.f1_score(Y_test, preds_test, labels=label_classes)
        #recall_0, recall_1 = metrics.precision_recall_fscore_support(self.Y_test, preds_test, labels=label_classes)["recall"]
        return {
            "micro_f1": test_f1,
            "recall_0": recall_0,
            "precision_0": precision_0,
            "accuracy": accuracy,
            "recall_1": recall_1,
            "precision_1": precision_1
        }

    def entropy(self, prob_dist):
        #return(-(L_row_i==-1).sum())
        return -sum([x * log(x) for x in prob_dist])

    def save(self, dir_name):
        self.label_model.save(os.path.join(dir_name, 'label_model.pkl'))
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "wb+") as file:
            pickle.dump(self.lfs, file)

    def load(self, dir_name):
        with open(os.path.join(dir_name, 'model_lfs.pkl'), "rb") as file:
            lfs = pickle.load(file)
        # LabelModel.load populates an instance in place, so construct one first
        label_model = LabelModel()
        label_model.load(os.path.join(dir_name, 'label_model.pkl'))
        self.lfs = lfs
        self.label_model = label_model
def run_snorkel_labelling_classification(labeling_functions, file, l_train,
                                         l_valid):
    lfs = labeling_functions
    # lfs = [lf.is_same_thread, lf.has_entities, lf.enity_overlap_jacc, lf.entity_type_overlap_jacc]
    # lfs = [is_same_thread, enity_overlap, entity_types, entity_type_overlap]
    # lfs = [is_long, has_votes, is_doctor_reply, is_same_thread, enity_overlap, has_type_dsyn, has_type_patf, has_type_sosy,
    #        has_type_dora, has_type_fndg, has_type_menp, has_type_chem, has_type_orch, has_type_horm, has_type_phsu,
    #        has_type_medd, has_type_bhvr, has_type_diap, has_type_bacs, has_type_enzy, has_type_inpo, has_type_elii]
    # lfs = [has_votes, is_doctor_reply, is_same_thread, enity_overlap]
    # lfs = [is_same_thread, enity_overlap, is_doctor_reply]

    # analysis = LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=Y_train)
    # print(analysis)
    # print(analysis['Conflicts'])
    # print(analysis['Overlaps'])

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=l_train, n_epochs=20000, lr=0.0001, log_freq=10,
                    seed=2345)
    # label_model.fit(L_train=L_train, n_epochs=20, lr=0.0001, log_freq=10, seed=81794)
    print("Model weights: " + str(label_model.get_weights()))

    valid_probabilities = label_model.predict_proba(L=l_valid)
    if 'predicted_prob' in df_valid:
        # df_valid.drop(columns=['predicted_prob'], axis=1)
        del df_valid['predicted_prob']
    df_valid.insert(50, 'predicted_prob', valid_probabilities[:, 1])
    # df_valid.to_csv("/container/filip/json/ehealthforum/trac/validation_df2.txt", sep="\t", header=True)
    # df_valid = pd.read_csv("/filip/json/ehealthforum/trac/validation_df.txt", sep="\t")

    def compute_precision_at_k(l, k):
        l = l[:k]
        return sum(l) / k

    PROBABILITY_CUTOFF = 0.5
    df_valid['predicted_label'] = (
        df_valid['predicted_prob'] >= PROBABILITY_CUTOFF)

    true_positive_ratio = (
        df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant'] /
        df_valid[df_valid.predicted_label == 1].count()['predicted_label'])
    print("Number of True relevant: " +
          str(df_valid[df_valid.bm25_relevant == 1].count()['bm25_relevant']))
    print("Number of Predicted relevant: " +
          str(df_valid[df_valid.predicted_label == 1].count()
              ['predicted_label']) + '\n')
    print('True positive ratio: ' + str(true_positive_ratio) + '\n')

    df_tru = df_valid.groupby(['query_thread']).head(10)['bm25_relevant']
    df_pred = df_valid.groupby(['query_thread']).head(10)['predicted_label']

    overall_precision = []
    for query, group in df_valid.groupby(['query_thread']):
        precision = compute_precision_at_k(
            group['predicted_label'].head(10).tolist(), 10)
        overall_precision.append(precision)
    print('Overall precision: ' +
          str(sum(overall_precision) / len(overall_precision)))
    print("Accuracy: " + str(accuracy_score(df_tru, df_pred)))

    label_model_acc = label_model.score(L=l_valid, Y=Y_valid)["accuracy"]
    print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")
# %% [markdown] {"tags": ["md-exclude"]}
# Let's briefly confirm that the labels the `LabelModel` produces are probabilistic in nature.
# The following histogram shows the confidences we have that each data point has the label SPAM.
# The points we are least certain about will have labels close to 0.5.

# %% {"tags": ["md-exclude"]}
def plot_probabilities_histogram(Y):
    plt.hist(Y, bins=10)
    plt.xlabel("Probability of SPAM")
    plt.ylabel("Number of data points")
    plt.show()


probs_train = label_model.predict_proba(L=L_train)
plot_probabilities_histogram(probs_train[:, SPAM])

# %% [markdown]
# ### Filtering out unlabeled data points

# %% [markdown]
# As we saw earlier, some of the data points in our `train` set received no labels from any of our LFs.
# These data points convey no supervision signal and tend to hurt performance, so we filter them out before training using a
# [built-in utility](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.filter_unlabeled_dataframe.html#snorkel.labeling.filter_unlabeled_dataframe).

# %%
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)
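# %% [markdown]
# A quick sanity check (an illustrative aside, not part of the original cell):
# comparing row counts before and after filtering shows exactly how many data
# points had no LF coverage and were dropped.

# %%
n_total = len(df_train)
n_kept = len(df_train_filtered)
print(f"Kept {n_kept}/{n_total} data points ({n_total - n_kept} had no LF votes)")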
    return ABSTAIN


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    ((df_dev, Y_dev), df_train, (df_test, Y_test)) = load_data()

    lfs = [
        lf_husband_wife, lf_husband_wife_left_window, lf_same_last_name,
        lf_married, lf_familial_relationship, lf_family_left_window,
        lf_other_relationship, lf_distant_supervision,
        lf_distant_supervision_last_names
    ]
    applier = PandasLFApplier(lfs)

    L_dev = applier.apply(df_dev)
    L_train = applier.apply(df_train)

    print(LFAnalysis(L_dev, lfs).lf_summary(Y_dev))

    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

    probs_dev = label_model.predict_proba(L_dev)
    preds_dev = probs_to_preds(probs_dev)
    print("Label model F1: {f}".format(
        f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='f1')))
    print("Label model AUC: {f}".format(
        f=metric_score(Y_dev, preds_dev, probs=probs_dev, metric='roc_auc')))

    probs_train = label_model.predict_proba(L_train)
    df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
        X=df_train, y=probs_train, L=L_train)
    X_train = get_feature_arrays(df_train_filtered)
    model = get_model()
    batch_size = 64
    model.fit(X_train, probs_train_filtered, batch_size=batch_size,
              epochs=100)

    X_test = get_feature_arrays(df_test)
    probs_test = model.predict(X_test)
    preds_test = probs_to_preds(probs_test)
    # These scores are for the trained end model, not the label model
    print("End model F1: {f}".format(
        f=metric_score(Y_test, preds_test, probs=probs_test, metric='f1')))
    print("End model AUC: {f}".format(
        f=metric_score(Y_test, preds_test, probs=probs_test,
                       metric='roc_auc')))
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain."""
    PARALLEL = 4
    max_docs = 12
    fonduer.init_logging(
        log_dir="log_folder",
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )
    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs
    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())
    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    # Check sentence, table, figure, and caption counts for each of the
    # 12 test documents.
    expected_sentences = [799, 663, 784, 661, 513, 700, 528, 161, 228, 511, 331, 528]
    expected_tables = [9, 9, 14, 11, 11, 10, 10, 2, 7, 10, 6, 9]
    expected_figures = [32, 11, 38, 31, 7, 38, 10, 31, 4, 27, 5, 27]
    for doc, n_sents, n_tables, n_figures in zip(
        docs, expected_sentences, expected_tables, expected_figures
    ):
        assert len(doc.sentences) == n_sents
        assert len(doc.tables) == n_tables
        assert len(doc.figures) == n_figures
        assert len(doc.captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.
    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")
    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1432
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(
        ["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp, PartVolt]
    )
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    assert session.query(FeatureKey).count() == 1259

    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]),
        train_marginals,
        X_dev=(train_cands[0], F_train[0]),
        Y_dev=L_train_gold[0].reshape(-1),
        b=0.6,
        pos_label=TRUE,
        n_epochs=5,
        lr=0.001,
    )

    test_score = disc_model.predict((test_cands[0], F_test[0]), b=0.6, pos_label=TRUE)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )
    tp_len, fp_len, fn_len = len(TP), len(FP), len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert 0.3 < f1 < 0.7

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]

    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)
    train_marginals = gen_model.predict_proba(L_train[0])

    def entity_level_scores(disc_model):
        """Predict with disc_model and return its entity-level F1 on the test set."""
        test_score = disc_model.predict(
            (test_cands[0], F_test[0]), b=0.6, pos_label=TRUE
        )
        true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score == TRUE))]
        (TP, FP, FN) = entity_level_f1(
            true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
        )
        tp_len, fp_len, fn_len = len(TP), len(FP), len(FN)
        prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
        rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
        f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")
        logger.info(f"prec: {prec}")
        logger.info(f"rec: {rec}")
        logger.info(f"f1: {f1}")
        return f1

    # The same train/predict/score cycle is run for each discriminative model:
    # LogisticRegression, LSTM, SparseLogisticRegression, and SparseLSTM.
    for model_class in [LogisticRegression, LSTM, SparseLogisticRegression, SparseLSTM]:
        disc_model = model_class()
        disc_model.train(
            (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
        )
        assert entity_level_scores(disc_model) > 0.7

    # Evaluate mention level scores (using the last-trained model, SparseLSTM)
    L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold")
    Y_test = L_test_gold[0].reshape(-1)
    scores = disc_model.score((test_cands[0], F_test[0]), Y_test, b=0.6, pos_label=TRUE)
    logger.info(scores)
    assert scores["f1"] > 0.6
    X_gold_sent, X_gold_shortest_path, X_gold_src, X_gold_tgt, \
        X_gold_src_txt, X_gold_tgt_txt, y_gold = data_handler.get_test_data()
    X_val_sent, X_val_shortest_path, X_val_src, X_val_tgt, \
        X_val_src_txt, X_val_tgt_txt, y_val = data_handler.get_validation_data()

    applier = PandasLFApplier(label_functions.lfs)
    df_train = pd.DataFrame(
        list(zip(*data_handler.get_training_data())),
        columns=['shortest_path', 'sent', 'src', 'tgt', 'src_txt', 'tgt_txt'])
    L_train = applier.apply(df_train)

    label_model = LabelModel(cardinality=len(rel_names.rels_txt_to_int), verbose=True)
    label_model.fit(L_train, n_epochs=1000, lr=0.01, log_freq=100, seed=123)
    label_model.save('./models/LabelModel.model')

    train_probs = label_model.predict_proba(L_train)
    train_preds = probs_to_preds(train_probs, tie_break_policy='abstain')
    df_train = df_train.join(
        pd.DataFrame({'preds': train_preds, 'probs': list(map(max, train_probs))}))

    # Map abstains (-1) to otherwiseRelated
    df_train.loc[df_train.preds == -1, 'preds'] = \
        rel_names.rels_txt_to_int['otherwiseRelated']

    # Downsample otherwiseRelated to roughly the mean class count
    dropNum = len(df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]) \
        - int(df_train['preds'].value_counts().mean())
    df_train = df_train.drop(
        df_train[df_train.preds == rel_names.rels_txt_to_int['otherwiseRelated']]
        .sample(dropNum).index)

    # Count training examples per relation name
    cnts = {}
    for x in df_train['preds']:
        name = rel_names.rels_int_to_text[x]
        if name not in cnts:
            cnts[name] = 0
        cnts[name] += 1
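    # To eyeball the class balance after downsampling, the per-relation counts
    # gathered above can be printed sorted by frequency. A small sketch using
    # only the `cnts` dict from the loop above.
    for name, count in sorted(cnts.items(), key=lambda kv: -kv[1]):
        print(f"{name:<25} {count}")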