def _test_biluov_task():
    import es_core_news_md
    from scripts.utils import Sentence

    def forward(tokensxsentence, entitiesxsentence):
        labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
        return [
            from_biluov(biluov, sentence, spans=True)
            for biluov, sentence in zip(labelsxsentence, tokensxsentence)
        ]

    training = Collection().load(Path("data/training/scenario.txt"))
    nlp = es_core_news_md.load()

    def per_label(label):
        tokensxsentence = [nlp(s.text) for s in training.sentences]
        entitiesxsentence = [
            [k.spans for k in s.keyphrases if k.label == label]
            for s in training.sentences
        ]
        decoded = forward(tokensxsentence, entitiesxsentence)
        return decoded

    collection = Collection([Sentence(s.text) for s in training.sentences])
    for label in ENTITIES:
        decoded = per_label(label)
        for entities, sentence in zip(decoded, collection.sentences):
            for spans in entities:
                keyphrase = Keyphrase(sentence, label, -1, spans)
                sentence.keyphrases.append(keyphrase)

    collection.fix_ids()
    output = Path("data/submissions/forward-biluov/train/run1/scenario2-taskA/")
    output.mkdir(parents=True, exist_ok=True)
    collection.dump(output / "scenario.txt", skip_empty_sentences=False)
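# Usage sketch (toy data, hedged): a minimal round-trip through the BILUOV
# encoding exercised by `_test_biluov_task` above. In the test, tokens come
# from a spaCy pipeline and spans from `Keyphrase.spans`; the plain strings
# and character spans below are illustrative and may need adapting.
def _biluov_roundtrip_sketch():
    tokens = ["asma", "bronquial", "severa"]   # one tokenized sentence (toy)
    entities = [[(0, 4)], [(0, 4), (5, 14)]]   # character spans per keyphrase (toy)
    labelsxsentence, _ = to_biluov([tokens], [entities])
    decoded = from_biluov(labelsxsentence[0], tokens, spans=True)
    print(decoded)  # should recover the original entity spans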
def main(gold_input, submit_input, skip_A, skip_B, verbose, skip_C=True):
    gold = Collection()
    gold.load(gold_input)

    submit = Collection()
    submit.load(submit_input)

    data = OrderedDict()

    # Subtask A matches are always computed because subtask B scoring
    # depends on them, even when the A report itself is skipped.
    dataA = subtaskA(gold, submit, verbose)
    data.update(dataA)
    if not skip_A:
        report(dataA, verbose)

    if not skip_B:
        dataB = subtaskB(gold, submit, dataA, verbose)
        data.update(dataB)
        report(dataB, verbose)

    if not skip_C:
        dataC = subtaskC(gold, submit, data, verbose)
        data.update(dataC)
        report(dataC, verbose)

    print("-" * 20)

    metrics = compute_metrics(data, skip_A, skip_B, skip_C)
    for key, value in metrics.items():
        print("{0}: {1:0.4}".format(key, value))

    return data
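# Usage sketch: driving the scoring entry point above. The submission path is
# illustrative; `main` prints each metric and returns the raw match data for
# further inspection.
if __name__ == "__main__":
    data = main(
        gold_input=Path("data/development/main/scenario.txt"),
        submit_input=Path("data/submissions/baseline/dev/run1/scenario.txt"),
        skip_A=False,
        skip_B=False,
        verbose=False,
    )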
def _load_collection(self, scenario):
    gold = self.gold.format(scenario)
    gold = Path(gold)
    loader = Collection().load_dir if gold.is_dir() else Collection().load
    return loader(
        gold,
        legacy=False,
        keyphrases=scenario.endswith("-taskB"),
        relations=False,
        attributes=False,
    )
def get_clean_collection(anns_path: Path, select: str) -> Collection:
    collection = Collection()
    for file in sorted((anns_path / select).iterdir()):
        if file.suffix == ".txt":
            collection.load(file, attributes=False)

    for s in collection.sentences:
        overlaps = s.overlapping_keyphrases()
        if overlaps:
            print("Found overlapping:", overlaps)
            s.merge_overlapping_keyphrases()
            overlaps = s.overlapping_keyphrases()

        dups = s.dup_relations()
        if dups:
            print(
                "Found duplicated relations %r in sentence '%s'"
                % ([v[0] for v in dups.values()], s.text)
            )
            s.remove_dup_relations()
            dups = s.dup_relations()

        assert not overlaps
        assert not dups

    return collection
def load_corpus(anns_path: Path, clean=True) -> Collection:
    collection = Collection()
    for file in sorted(anns_path.iterdir()):
        if file.name.endswith(".txt"):
            collection.load(file)

    if clean:
        for s in collection.sentences:
            overlaps = s.overlapping_keyphrases()
            if overlaps:
                print("Found overlapping:", overlaps)
                s.merge_overlapping_keyphrases()
                overlaps = s.overlapping_keyphrases()

            dups = s.dup_relations()
            if dups:
                print(
                    "Found duplicated relations %r in sentence '%s'"
                    % ([v[0] for v in dups.values()], s.text)
                )
                s.remove_dup_relations()
                dups = s.dup_relations()

            assert not overlaps
            assert not dups

    return collection
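# Usage sketch: loading a directory of .txt annotation files through
# `load_corpus` above. The path is illustrative; with `clean=True`,
# overlapping keyphrases are merged and duplicated relations removed before
# the collection is returned.
corpus = load_corpus(Path("data/training"), clean=True)
print(f"Loaded {len(corpus)} sentences")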
def count_labels_based_on(path: Path, reference: Path):
    collection = load_corpus(path)
    reference = CollectionV1Handler.load_dir(Collection(), reference)

    # Keep only the sentences of `collection` whose text also appears in the
    # reference collection, preserving the reference order.
    sentences = []
    for ref_sent in reference.sentences:
        for sent in collection.sentences:
            if sent.text == ref_sent.text:
                sentences.append(sent)
                break

    print(len(collection))
    print(len(reference))
    print(len(sentences))

    return count_labels_on(Collection(sentences))
def main():
    baseline = Baseline()
    baseline.train(Path("data/training/"))
    collection = CollectionV1Handler.load_dir(
        Collection(), Path("data/testing/scenario2-taskA/")
    )
    output = baseline.run(collection, taskA=True, taskB=False)
    CollectionV1Handler.dump(output, Path("pepe/input_scenario2.txt"), False)
def load_and_dump_from_corpus(path2sentences, path2corpus, path2output):
    sentences = path2sentences.read_text().splitlines()
    print(len(sentences))
    collection = Collection().load_dir(path2corpus, legacy=True, attributes=False)
    print(len(collection))
    collection = filter(collection, sentences)
    print(len(collection))
    collection.dump(path2output)
def test(self, finput: Path, skip_A=False, skip_B=False):
    collection = Collection()
    collection.load(finput)
    self.predict_entities(collection)
    return collection
def load_training_data(corpus) -> Collection:
    packs = Path("/data") / corpus / "packs/submitted/"
    collection = Collection()
    for filename in packs.glob("*.txt"):
        collection.load(filename)
    return collection
def task_annotate_relations(corpus: str, pack: str):
    model = get_model(corpus)
    text_path = Path("/data") / corpus / "packs" / "open" / pack / "pack.txt"
    collection = Collection().load(text_path)
    collection = model.predict_relations(collection)
    collection.dump(text_path, skip_empty_sentences=False)
    return {"reload": True}
def get_train_valid_set(self, finput_train: Path, finput_valid: Path = None):
    # TRAIN SET
    finput_train = Path(finput_train)
    collection_train = (
        Collection().load_dir(finput_train)
        if finput_train.is_dir()
        else Collection().load(finput_train)
    )

    # VALIDATION SET
    if finput_valid:
        finput_valid = Path(finput_valid)
        collection_valid = (
            Collection().load_dir(finput_valid)
            if finput_valid.is_dir()
            else Collection().load(finput_valid)
        )
    else:
        collection_valid = None

    return collection_train, collection_valid
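# Usage sketch: the loader above accepts either a single file or a directory
# for each split. `trainer` is a hypothetical instance of the owning class,
# and both paths are illustrative.
train_set, valid_set = trainer.get_train_valid_set(
    Path("data/training/scenario.txt"),
    Path("data/development/main/scenario.txt"),
)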
def filter(collection: Collection, sentences):
    # return Collection([s for s in collection.sentences if s.text in sentences])
    def find(text):
        for s in collection.sentences:
            if s.text == text:
                return s
        raise Exception("Not found! " + text)

    return Collection([find(text) for text in sentences])
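# Usage sketch: `filter` (deliberately shadowing the builtin) rebuilds a
# collection in the order of a reference list of sentence texts and raises if
# any text is missing. The slicing below is illustrative.
full = Collection().load(Path("data/training/scenario.txt"))
wanted = [s.text for s in full.sentences[:10]]
subset = filter(full, wanted)
assert len(subset) == 10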
def _load_collection(self, scenario):
    gold = self.gold.format(scenario)
    return Collection().load(
        Path(gold),
        legacy=False,
        keyphrases=scenario.endswith("-taskB"),
        relations=False,
        attributes=False,
    )
def train(self, finput: Path):
    collection = (
        Collection().load_dir(finput)
        if finput.is_dir()
        else Collection().load(finput)
    )

    # The "model" is just a pair of lookup tables (surface text -> label),
    # bound both to `self.model` and to local names via chained assignment.
    self.model = keyphrases, relations = {}, {}

    for sentence in collection.sentences:
        for keyphrase in sentence.keyphrases:
            text = keyphrase.text.lower()
            keyphrases[text] = keyphrase.label

    for sentence in collection.sentences:
        for relation in sentence.relations:
            origin = relation.from_phrase
            origin_text = origin.text.lower()
            destination = relation.to_phrase
            destination_text = destination.text.lower()
            relations[
                origin_text, origin.label, destination_text, destination.label
            ] = relation.label
def task_clear_all(corpus: str, pack: str):
    path = Path("/data") / corpus / "packs" / "open" / pack / "pack.txt"
    collection = Collection()
    collection.load(path)
    for sentence in collection.sentences:
        sentence.relations = []
    collection.dump(path)
    return {"reload": True}
def test(self, finput: Path, skip_A, skip_B):
    collection = Collection()
    if skip_A:
        collection.load_keyphrases(finput)
    else:
        collection.load_input(finput)
        self.predict_entities(collection)
    if not skip_B:
        for sentence in collection.sentences:
            self.predict_relations(sentence)
            sentence.remove_dup_relations()
    return collection
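# Usage sketch: exercising the pipeline's `test` above on a task-A scenario
# (entities only, so subtask B is skipped). `pipeline` is a hypothetical
# instance of the owning class and the path is illustrative.
predicted = pipeline.test(
    Path("data/testing/scenario2-taskA/scenario.txt"), skip_A=False, skip_B=True
)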
def main(anns_path: Path, training_path, develop_path, test_path, public):
    random.seed(42)  # default seed, but each generator should use its own

    # dump training and development collections ----------------------------------
    train_develop_sentences = get_training_and_development(anns_path)

    #### training
    training = Collection(train_develop_sentences[:800])
    training.dump(training_path / "scenario.txt")

    #### development/main
    develop = Collection(train_develop_sentences[800:])
    develop.dump(develop_path / "main" / "scenario.txt")

    # dump test collection (per scenario) ----------------------------------------
    test_sentences = get_test(anns_path)
    extra_sentences_main = get_extra(
        anns_path, "main", train_develop_sentences, test_sentences
    )
    extra_sentences_transfer = get_extra(
        anns_path, "transfer", train_develop_sentences, test_sentences
    )

    #### test/scenario3
    scn3 = Collection(test_sentences[200:])
    clean(scn3, public, remove_keyphrases=False)
    scn3.dump(test_path / "scenario3-taskB" / "scenario.txt", False)

    #### test/scenario2
    scn2 = Collection(test_sentences[100:200])
    clean(scn2, public)
    scn2.dump(test_path / "scenario2-taskA" / "scenario.txt", False)

    #### test/scenario1
    scn1 = Collection(shuffle(extra_sentences_main[:4900], test_sentences[:100]))
    clean(scn1, public)
    scn1.dump(test_path / "scenario1-main" / "scenario.txt", False)

    # dump transfer learning collections ----------------------------------------
    transfer_sentences = get_transfer(anns_path)

    #### development/transfer
    develop_transfer = Collection(transfer_sentences[:100])
    develop_transfer.dump(develop_path / "transfer" / "scenario.txt")

    #### test/scenario4
    scn4 = Collection(
        shuffle(extra_sentences_transfer[:1400], transfer_sentences[100:])
    )
    clean(scn4, public)
    scn4.dump(test_path / "scenario4-transfer" / "scenario.txt", False)
def predict_entities(self, sentences):
    if isinstance(sentences[0], Sentence):
        sentences = [s.text for s in sentences]

    result = []
    nlp = spacy_model("es")
    for i, sentence in enumerate(sentences):
        if self.callback:
            self.callback(
                msg="Processing sentence", current=i, total=len(sentences)
            )
        doc, xs = self.feature_sentence(sentence)
        sentence = self.predict_single(doc, xs)
        result.append(sentence)

    return Collection(sentences=result)
def evaluate_scenario(submit_path: Path, gold: Collection, scenario: int):
    submit_file = submit_path / "scenario.txt"
    if not submit_file.exists():
        warnings.warn("Input file not found in '%s'" % submit_path)
        return {}

    submit = Collection().load(submit_file)

    resultA = subtaskA(gold, submit)
    resultB = subtaskB(gold, submit, resultA)

    results = {}
    for k, v in list(resultA.items()) + list(resultB.items()):
        results[k] = len(v)

    metrics = compute_metrics(
        dict(resultA, **resultB), skipA=scenario == 3, skipB=scenario == 2
    )
    results.update(metrics)

    return results
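# Usage sketch: scoring one submitted scenario folder against its gold
# collection. Per the flags above, scenario 2 skips subtask B metrics and
# scenario 3 skips subtask A. Paths are illustrative.
gold2 = Collection().load(Path("data/testing/scenario2-taskA/scenario.txt"))
scores = evaluate_scenario(
    Path("data/submissions/baseline/test/run1/scenario2-taskA"), gold2, scenario=2
)
print(scores)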
def train(self, finput):
    collection = Collection()
    collection.load(finput)

    self.keyphrases.clear()
    for sentence in collection.sentences:
        for keyphrase in sentence.keyphrases:
            text = keyphrase.text.lower()
            self.keyphrases[text] = keyphrase.label

    self.relations.clear()
    for sentence in collection.sentences:
        for relation in sentence.relations:
            origin = relation.from_phrase
            origin_text = origin.text.lower()
            destination = relation.to_phrase
            destination_text = destination.text.lower()
            self.relations[
                origin_text, origin.label, destination_text, destination.label
            ] = relation.label
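# Usage sketch: how the lexicons built by `train` above can be queried at
# prediction time. These lookups are plausible counterparts written for
# illustration, not the repo's actual prediction code.
def lookup_keyphrase_label(self, text):
    # Returns the memorized label for a surface form, or None if unseen.
    return self.keyphrases.get(text.lower())

def lookup_relation_label(self, origin, destination):
    # Keys mirror the tuples stored by `train`: (text, label, text, label).
    key = (origin.text.lower(), origin.label,
           destination.text.lower(), destination.label)
    return self.relations.get(key)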
def main(anns_path: Path, training_path, develop_path, test_path):
    random.seed(42)  # default seed, but each generator should use its own

    # dump training and development collections ----------------------------------
    train_develop_sentences = get_training_and_development(anns_path)

    #### training
    training = Collection(train_develop_sentences[:800])
    training.dump(training_path / "scenario.txt")

    #### development/main
    develop = Collection(train_develop_sentences[800:])
    develop.dump(develop_path / "main" / "scenario.txt")

    # dump test collection (per scenario) ----------------------------------------
    test_sentences = get_test(anns_path)
    extra_sentences = get_extra(anns_path, train_develop_sentences, test_sentences)

    #### test/scenario3
    scn3 = Collection(test_sentences[200:])
    scn3.dump(test_path / "scenario3-taskB" / "scenario.txt")

    #### test/scenario2
    scn2 = Collection(test_sentences[100:200])
    scn2.dump(test_path / "scenario2-taskA" / "scenario.txt")

    #### test/scenario1
    scn1 = Collection(
        extra_sentences[:4567] + test_sentences[:100] + extra_sentences[4567:]
    )
    scn1.dump(test_path / "scenario1-main" / "scenario.txt", False)

    # dump transfer learning collections ----------------------------------------
    transfer_sentences = get_transfer(anns_path)

    #### development/transfer
    develop_transfer = Collection(transfer_sentences[:100])
    develop_transfer.dump(develop_path / "transfer" / "scenario.txt")

    #### test/scenario4
    scn4 = Collection(transfer_sentences[100:])
    scn4.dump(test_path / "scenario4-transfer" / "scenario.txt")
def train(self, finput: Path):
    collection = Collection()
    collection = CollectionV1Handler.load(
        collection, finput / "input_training.txt"
    )
    # collection = CollectionV2Handler.load(collection, finput / 'ensemble.txt')

    self.model = keyphrases, relations = {}, {}

    for sentence in collection.sentences:
        for keyphrase in sentence.keyphrases:
            text = keyphrase.text.lower()
            keyphrases[text] = keyphrase.label

    for sentence in collection.sentences:
        for relation in sentence.relations:
            origin = relation.from_phrase
            origin_text = origin.text.lower()
            destination = relation.to_phrase
            destination_text = destination.text.lower()
            relations[
                origin_text, origin.label, destination_text, destination.label
            ] = relation.label
def train(self, finput):
    collection = Collection()
    collection.load(finput)

    """
    full_text = ""
    for sentence in collection.sentences:
        full_text += sentence.text
        full_text += "\n"
    doc = es_pipeline.nlp(full_text)
    """

    x_train, y_train = self.preprocess(collection, True)
    # print('Counter train: %s' % Counter(y_train))

    fit_result = self.clf.fit(x_train, y_train)
    print("Success at training!\n")

    self.label_set = list(set(y_train))
    return fit_result
def count_labels(path: Path, handler=None):
    corpus = handler.load_dir(Collection(), path) if handler else load_corpus(path)
    return count_labels_on(corpus)
def main( mode="test", best=False, single=False, csv=False, pretty=False, final=False, plain=False, compact=False, gold="data", submit="data/submissions", ignore=False, ): users = collections.defaultdict(list) if csv and not best: raise ValueError("Error: --csv implies --best") if final and (not csv or not best): raise ValueError("Error: --final implies --csv and --best") if mode == "test": test_gold = Path(gold) gold_scenarios = [ Collection().load(test_gold / "testing/scenario1-main/scenario.txt"), Collection().load(test_gold / "testing/scenario2-taskA/scenario.txt"), Collection().load(test_gold / "testing/scenario3-taskB/scenario.txt"), Collection().load(test_gold / "testing/scenario4-transfer/scenario.txt"), ] elif mode == "dev": dev_gold = Path(gold) gold_scenarios = [ Collection().load(dev_gold / "development/main/scenario.txt"), Collection().load(dev_gold / "development/main/scenario.txt"), Collection().load(dev_gold / "development/main/scenario.txt"), Collection().load(dev_gold / "development/transfer/scenario.txt"), ] elif mode == "train": dev_gold = Path(gold) gold_scenarios = [ Collection().load(dev_gold / "training/scenario.txt"), Collection().load(dev_gold / "training/scenario.txt"), Collection().load(dev_gold / "training/scenario.txt"), ] else: raise ValueError("Unexpected mode: {0}".format(mode)) submits = Path(submit) if single: submits = submits / single runs = submits / mode if not runs.exists(): msg = "Directory {0} not found. Check --mode and --single options.".format( runs) raise ValueError(msg) ensure_number_of_runs(runs) for subfolder in runs.iterdir(): users[submits.name].append(evaluate_one(subfolder, *gold_scenarios)) else: for userfolder in submits.iterdir(): if not userfolder.is_dir(): continue runs = userfolder / mode if not runs.exists(): msg = "Directory {0} not found. Did you mean to use --single? 
Check --mode option.".format( runs) if ignore: warnings.warn(msg) continue else: raise ValueError(msg) ensure_number_of_runs(runs) for subfolder in runs.iterdir(): users[userfolder.name].append( evaluate_one(subfolder, *gold_scenarios)) results = dict(users) if best: results = filter_best(results) if csv: import pandas as pd items = [] for user, data in results.items(): userdata = dict(name=user) for k, metrics in data.items(): userdata.update( {"%s-%s" % (k, m): v for m, v in metrics.items()}) items.append(userdata) df = pd.DataFrame(items) df = df.set_index("name").sort_index().transpose() if final: df1 = df.transpose()[[ "scenario1-f1", "scenario1-precision", "scenario1-recall" ]] df1 = df1.sort_values("scenario1-f1", ascending=False) df2 = df.transpose()[[ "scenario2-f1", "scenario2-precision", "scenario2-recall" ]] df2 = df2.sort_values("scenario2-f1", ascending=False) df3 = df.transpose()[[ "scenario3-f1", "scenario3-precision", "scenario3-recall" ]] df3 = df3.sort_values("scenario3-f1", ascending=False) df4 = df.transpose()[[ "scenario4-f1", "scenario4-precision", "scenario4-recall" ]] df4 = df4.sort_values("scenario4-f1", ascending=False) if pretty: print(df1.round(3).to_markdown() + "\n") print(df2.round(3).to_markdown() + "\n") print(df3.round(3).to_markdown() + "\n") print(df4.round(3).to_markdown() + "\n") else: print(df1.to_csv()) print(df2.to_csv()) print(df3.to_csv()) print(df4.to_csv()) else: print(df.to_csv()) elif plain: for user, info in results.items(): print(50 * "=") print(" {0} ".format(user).center(50, ":").upper()) print(50 * "=") for run in info: print("[ {0} ]".format(run["submit"]).center(50, "-")) for scenario, data in run.items(): if scenario == "submit": continue print("> {0} ".format(scenario)) for metric, value in data.items(): if metric == "submit": continue metric = "{0}".format(metric).ljust(15) if isinstance(value, float): print(" {0} ~ {1:0.4}".format(metric, value)) else: print(" {0} = {1}".format(metric, value)) elif compact: if not single: raise ValueError("--compact requires --single") if not best: raise ValueError("--compact requires --best") results = results[single] for scn, metrics in results.items(): for m in ["f1", "precision", "recall"]: print(f"{scn}-{m}: {metrics[m]:0.5}") else: print(json.dumps(results, sort_keys=True, indent=2 if pretty else None))
def main( mode="test", best=False, single=False, csv=False, pretty=False, final=False, plain=False, ): users = collections.defaultdict(list) if csv and not best: raise ValueError("Error: --csv implies --best") if final and (not csv or not best): raise ValueError("Error: --final implies --csv and --best") if mode == "test": scn1_gold = Collection().load( Path("data/testing/scenario1-main/scenario.txt")) scn2_gold = Collection().load( Path("data/testing/scenario2-taskA/scenario.txt")) scn3_gold = Collection().load( Path("data/testing/scenario3-taskB/scenario.txt")) scn4_gold = Collection().load( Path("data/testing/scenario4-transfer/scenario.txt")) elif mode == "dev": scn1_gold = Collection().load( Path("data/development/main/scenario.txt")) scn2_gold = Collection().load( Path("data/development/main/scenario.txt")) scn3_gold = Collection().load( Path("data/development/main/scenario.txt")) scn4_gold = Collection().load( Path("data/development/transfer/scenario.txt")) else: raise ValueError("Unexpected mode: {0}".format(mode)) submits = Path("data/submissions/") if single: submits = submits / single runs = submits / mode if not runs.exists(): raise ValueError( "Directory {0} not found. Check --mode and --single options.") ensure_number_of_runs(runs) for subfolder in runs.iterdir(): users[submits.name].append( evaluate_one( subfolder, scn1_gold, scn2_gold, scn3_gold, scn4_gold, )) else: for userfolder in submits.iterdir(): if not userfolder.is_dir(): continue runs = userfolder / mode if not runs.exists(): raise ValueError( "Directory {0} not found. Did you mean to use --single? Check --mode option." ) ensure_number_of_runs(runs) for subfolder in runs.iterdir(): users[userfolder.name].append( evaluate_one( subfolder, scn1_gold, scn2_gold, scn3_gold, scn4_gold, )) results = dict(users) if best: results = filter_best(results) if csv: import pandas as pd items = [] for user, data in results.items(): userdata = dict(name=user) for k, metrics in data.items(): userdata.update( {"%s-%s" % (k, m): v for m, v in metrics.items()}) items.append(userdata) df = pd.DataFrame(items) df = df.set_index("name").sort_index().transpose() if final: df1 = df.transpose()[[ "scenario1-f1", "scenario1-precision", "scenario1-recall" ]] df1 = df1.sort_values("scenario1-f1", ascending=False).to_csv() df2 = df.transpose()[[ "scenario2-f1", "scenario2-precision", "scenario2-recall" ]] df2 = df2.sort_values("scenario2-f1", ascending=False).to_csv() df3 = df.transpose()[[ "scenario3-f1", "scenario3-precision", "scenario3-recall" ]] df3 = df3.sort_values("scenario3-f1", ascending=False).to_csv() df4 = df.transpose()[[ "scenario4-f1", "scenario4-precision", "scenario4-recall" ]] df4 = df4.sort_values("scenario4-f1", ascending=False).to_csv() print(df1) print(df2) print(df3) print(df4) elif pretty: print(df.to_html()) else: print(df.to_csv()) elif plain: for user, info in results.items(): print(50 * "=") print(" {0} ".format(user).center(50, ":").upper()) print(50 * "=") for run in info: print("[ {0} ]".format(run["submit"]).center(50, "-")) for scenario, data in run.items(): if scenario == "submit": continue print("> {0} ".format(scenario)) for metric, value in data.items(): if metric == "submit": continue metric = "{0}".format(metric).ljust(15) if isinstance(value, float): print(" {0} ~ {1:0.4}".format(metric, value)) else: print(" {0} = {1}".format(metric, value)) else: print(json.dumps(results, sort_keys=True, indent=2 if pretty else None))
def _training_task(
    n_epochs,
    *,
    bert_mode,
    cnet_mode,
    ignore_path,
    inclusion=1.1,
    task=None,
    jointly=True,
    early_stopping=None,
    use_crf=True,
    weight=True,
    only_bert=False,
    reduce=False,
    split_relations="both",
    straight_forward_encoding=False,
    dropout=False,
    stacked_layers=1,
):
    if split_relations not in ("both", "pair", "seq"):
        raise ValueError()

    training = Collection().load(Path("data/training/scenario.txt"))
    validation = Collection().load(Path("data/development/main/scenario.txt"))

    early_stopping = early_stopping or dict(wait=5, delta=0.0)

    train_pairs = (
        TAXONOMIC_RELS
        if split_relations == "both"
        else RELATIONS
        if split_relations == "pair"
        else None
    )
    train_seqs = (
        CONTEXT_RELS
        if split_relations == "both"
        else RELATIONS
        if split_relations == "seq"
        else None
    )

    algorithm = eHealth20Model(
        bert_mode=bert_mode,
        only_bert=only_bert,
        cnet_mode=cnet_mode,
        ignore_path=ignore_path,
    )

    if task is None:
        algorithm.train(
            training,
            validation,
            jointly=jointly,
            inclusion=inclusion,
            n_epochs=n_epochs,
            save_to=name_to_path,
            early_stopping=early_stopping,
            use_crf=use_crf,
            weight=weight,
            train_pairs=train_pairs,
            train_seqs=train_seqs,
            straight_forward_encoding=straight_forward_encoding,
            reduce=reduce,
            dropout=dropout,
            stacked_layers=stacked_layers,
        )
    elif task == "A":
        algorithm.train_taskA(
            training,
            validation,
            jointly=jointly,
            n_epochs=n_epochs,
            save_to=name_to_path,
            early_stopping=early_stopping,
            use_crf=use_crf,
            weight=weight,
            dropout=dropout,
            stacked_layers=stacked_layers,
        )
    elif task == "B":
        # load A
        if jointly:
            taskA_models = {}
            for label in ENTITIES:
                checkpoint = torch.load(f"trained/taskA-{label}.pt")
                _ensure_bert(bert_mode, checkpoint)
                model = checkpoint["model"]
                taskA_models[label] = model
                model.eval()
            algorithm.taskA_models = taskA_models
        algorithm.train_taskB(
            training,
            validation,
            jointly=jointly,
            inclusion=inclusion,
            n_epochs=n_epochs,
            save_to=name_to_path,
            early_stopping=early_stopping,
            weight=weight,
            use_crf=use_crf,
            train_pairs=train_pairs,
            train_seqs=train_seqs,
            straight_forward_encoding=straight_forward_encoding,
            reduce=reduce,
            dropout=dropout,
        )
from pathlib import Path

from scripts.utils import Collection, CollectionV1Handler, CollectionV2Handler

talp = CollectionV1Handler.load(
    Collection(),
    Path("data/training/talp-576640/scenario1-main/input_scenario1.txt"),
)
print(f"Talp: {len(talp)}")

ensemble = CollectionV2Handler.load(Collection(), Path("data/training/ensemble.txt"))
print(f"Ensemble: {len(ensemble)}")

sentences = {s.text for s in ensemble.sentences}
selection = Collection([s for s in talp.sentences if s.text in sentences])
print(f"Selection: {len(selection)}")

output = Path("data/training/talp.txt")
output.parent.mkdir(exist_ok=True)
CollectionV2Handler.dump(selection, output, skip_empty_sentences=False)
from pathlib import Path

import streamlit as st
from streamlit.ScriptRunner import StopException

from autobrat.classifier import Model
from scripts.score import compute_metrics, subtaskA, subtaskB
from scripts.utils import (
    ENTITIES,
    RELATIONS,
    Collection,
    CollectionV1Handler,
    CollectionV2Handler,
    Keyphrase,
    Relation,
    Sentence,
)

c = Collection()

if st.sidebar.checkbox("Original Data", value=False):
    c = CollectionV1Handler.load(c, Path("data/training/input_training.txt"))

if st.sidebar.checkbox("Ensemble Data", value=False):
    old_size = len(c)
    c = CollectionV2Handler.load(c, Path("data/training/ensemble.txt"))
    ensemble_size = len(c) - old_size
    top_agreement = st.sidebar.number_input(
        "Number of sentences (Ensemble)", 0, ensemble_size, ensemble_size
    )
    c.sentences = c.sentences[: old_size + top_agreement]

if st.sidebar.checkbox("Talp Data", value=False):
    old_size = len(c)
    c = CollectionV2Handler.load(c, Path("data/training/talp.txt"))