def test_lowercase_augmenter(nlp, doc):
    augmenter = create_lower_casing_augmenter(level=1.0)
    with make_docbin([doc]) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        corpus = list(reader(nlp))
    eg = corpus[0]
    assert eg.reference.text == doc.text.lower()
    assert eg.predicted.text == doc.text.lower()
    ents = [(e.start, e.end, e.label) for e in doc.ents]
    assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents
    for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents):
        assert ref_ent.text == orig_ent.text.lower()
    assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]

    # check that augmentation works when lowercasing leads to different
    # predicted tokenization
    words = ["A", "B", "CCC."]
    doc = Doc(nlp.vocab, words=words)
    with make_docbin([doc]) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        corpus = list(reader(nlp))
    eg = corpus[0]
    assert eg.reference.text == doc.text.lower()
    assert eg.predicted.text == doc.text.lower()
    assert [t.text for t in eg.reference] == [t.lower() for t in words]
    assert [t.text for t in eg.predicted] == [
        t.text for t in nlp.make_doc(doc.text.lower())
    ]
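# Usage sketch (not from the original tests): outside a test, the lowercasing
# augmenter is passed to a Corpus the same way. The path and level below are
# illustrative assumptions, not values taken from this code.
from spacy.training import Corpus
from spacy.training.augment import create_lower_casing_augmenter

train_corpus = Corpus(
    "corpus/train.spacy",  # hypothetical path to a serialized DocBin
    augmenter=create_lower_casing_augmenter(level=0.3),  # lowercase ~30% of examples
)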
from json import loads
from os import path
from os.path import exists
from pathlib import Path

from spacy.tokens import DocBin
from spacy.training import Corpus, biluo_tags_to_spans, offsets_to_biluo_tags


def generate_corpus(nlp):
    # Note: `file_name` is expected to be defined elsewhere in this module.
    directory_path = path.join('data')
    corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
    raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

    # Reuse a previously serialized corpus if it exists.
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)

    vulnerabilities = []
    with open(raw_path) as file:
        for line in file:
            vulnerability = loads(line)
            vulnerabilities.append({
                'description': vulnerability['data'],
                'entities': vulnerability.get('label', []),
            })

    corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
    for vulnerability in vulnerabilities:
        document = nlp.make_doc(vulnerability['description'].lower())
        # doccano annotates labels at the character level, while nlp.make_doc
        # produces tokens: convert the character offsets to token-level spans
        # via BILUO tags before attaching the entities.
        tags = offsets_to_biluo_tags(document, vulnerability['entities'])
        entities = biluo_tags_to_spans(document, tags)
        document.set_ents(entities)
        corpus.add(document)

    corpus.to_disk(corpus_path)
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
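# Illustrative sketch (not part of the original code): the character-offset to
# token-span conversion that generate_corpus relies on, shown on a toy example.
# The text, offsets, and "SOFTWARE" label below are made up for demonstration.
import spacy
from spacy.training import biluo_tags_to_spans, offsets_to_biluo_tags

_nlp = spacy.blank("en")
_doc = _nlp.make_doc("apache log4j is vulnerable")
_offsets = [(0, 12, "SOFTWARE")]  # doccano-style (start_char, end_char, label)
_tags = offsets_to_biluo_tags(_doc, _offsets)  # ['B-SOFTWARE', 'L-SOFTWARE', 'O', 'O']
_doc.set_ents(biluo_tags_to_spans(_doc, _tags))
assert [(e.text, e.label_) for e in _doc.ents] == [("apache log4j", "SOFTWARE")]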
def test_make_orth_variants(nlp):
    single = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
    # fmt: off
    words = ["\n\n", "A", "\t", "B", "a", "b", "…", "...", "-", "—", "–", "--", "---", "——"]
    tags = ["_SP", "NN", "\t", "NN", "NN", "NN", "NFP", "NFP", ":", ":", ":", ":", ":", ":"]
    # fmt: on
    spaces = [True] * len(words)
    spaces[0] = False
    spaces[2] = False
    doc = Doc(nlp.vocab, words=words, spaces=spaces, tags=tags)
    augmenter = create_orth_variants_augmenter(
        level=0.2, lower=0.5, orth_variants={"single": single}
    )
    with make_docbin([doc] * 10) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        # Due to randomness, only test that it works without errors
        list(reader(nlp))

    # check that the following settings lowercase everything
    augmenter = create_orth_variants_augmenter(
        level=1.0, lower=1.0, orth_variants={"single": single}
    )
    with make_docbin([doc] * 10) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        for example in reader(nlp):
            for token in example.reference:
                assert token.text == token.text.lower()

    # check that lowercasing is applied without tags
    doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words))
    augmenter = create_orth_variants_augmenter(
        level=1.0, lower=1.0, orth_variants={"single": single}
    )
    with make_docbin([doc] * 10) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        for example in reader(nlp):
            for ex_token, doc_token in zip(example.reference, doc):
                assert ex_token.text == doc_token.text.lower()

    # check that no lowercasing is applied with lower=0.0
    doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words))
    augmenter = create_orth_variants_augmenter(
        level=1.0, lower=0.0, orth_variants={"single": single}
    )
    with make_docbin([doc] * 10) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        for example in reader(nlp):
            for ex_token, doc_token in zip(example.reference, doc):
                assert ex_token.text == doc_token.text
def test_custom_data_augmentation(nlp, doc):
    def create_spongebob_augmenter(randomize: bool = False):
        def augment(nlp, example):
            text = example.text
            if randomize:
                ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
            else:
                ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
            example_dict = example.to_dict()
            doc = nlp.make_doc("".join(ch))
            example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
            yield example
            yield example.from_dict(doc, example_dict)

        return augment

    with make_docbin([doc]) as output_file:
        reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
        corpus = list(reader(nlp))
    orig_text = "Sarah 's sister flew to Silicon Valley via London . "
    augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
    assert corpus[0].text == orig_text
    assert corpus[0].reference.text == orig_text
    assert corpus[0].predicted.text == orig_text
    assert corpus[1].text == augmented
    assert corpus[1].reference.text == augmented
    assert corpus[1].predicted.text == augmented
    ents = [(e.start, e.end, e.label) for e in doc.ents]
    assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents
    assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents
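# Sketch (not part of the test): to use a custom augmenter like the one above
# from a training config, it would typically be registered under a name. The
# registry name "spongebob_augmenter.v1" is just an example.
import spacy


@spacy.registry.augmenters("spongebob_augmenter.v1")
def spongebob_augmenter_v1():
    def augment(nlp, example):
        # Alternate upper-/lowercase characters and keep the original example too.
        text = "".join(
            c.lower() if i % 2 else c.upper() for i, c in enumerate(example.text)
        )
        example_dict = example.to_dict()
        doc = nlp.make_doc(text)
        example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
        yield example
        yield example.from_dict(doc, example_dict)

    return augment


# It could then be referenced from the training config, e.g.:
# [corpora.train.augmenter]
# @augmenters = "spongebob_augmenter.v1"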
def test_make_orth_variants(nlp, doc):
    single = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
    augmenter = create_orth_variants_augmenter(
        level=0.2, lower=0.5, orth_variants={"single": single}
    )
    with make_docbin([doc]) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        # Due to randomness, only test that it works without errors for now
        list(reader(nlp))
def test_lowercase_augmenter(nlp, doc):
    augmenter = create_lower_casing_augmenter(level=1.0)
    with make_docbin([doc]) as output_file:
        reader = Corpus(output_file, augmenter=augmenter)
        corpus = list(reader(nlp))
    eg = corpus[0]
    assert eg.reference.text == doc.text.lower()
    assert eg.predicted.text == doc.text.lower()
    ents = [(e.start, e.end, e.label) for e in doc.ents]
    assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents
    for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents):
        assert ref_ent.text == orig_ent.text.lower()
    assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]
def parser_tagger_data(
    path: Path,
    mixin_data_path: Optional[Path],
    mixin_data_percent: float,
    gold_preproc: bool,
    max_length: int = 0,
    limit: int = 0,
    augmenter: Optional[Callable] = None,
    seed: int = 0,
) -> Callable[[Language], Iterator[Example]]:
    random.seed(seed)
    main_corpus = Corpus(
        path,
        gold_preproc=gold_preproc,
        max_length=max_length,
        limit=limit,
        augmenter=augmenter,
    )
    if mixin_data_path is not None:
        mixin_corpus = Corpus(
            mixin_data_path,
            gold_preproc=gold_preproc,
            max_length=max_length,
            limit=limit,
            augmenter=augmenter,
        )

    def mixed_corpus(nlp: Language) -> Iterator[Example]:
        if mixin_data_path is not None:
            main_examples = main_corpus(nlp)
            mixin_examples = iter_sample(mixin_corpus(nlp), mixin_data_percent)
            return itertools.chain(main_examples, mixin_examples)
        else:
            return main_corpus(nlp)

    return mixed_corpus
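# iter_sample is referenced above but not defined in this snippet. A minimal
# compatible sketch, assuming mixin_data_percent is a keep-probability in [0, 1]:
import random
from typing import Iterable, Iterator

from spacy.training import Example


def iter_sample(examples: Iterable[Example], percent: float) -> Iterator[Example]:
    # Yield each example with probability `percent`.
    for example in examples:
        if random.random() < percent:
            yield example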
def test_roundtrip_docs_to_docbin(doc):
    text = doc.text
    idx = [t.idx for t in doc]
    tags = [t.tag_ for t in doc]
    pos = [t.pos_ for t in doc]
    morphs = [str(t.morph) for t in doc]
    lemmas = [t.lemma_ for t in doc]
    deps = [t.dep_ for t in doc]
    heads = [t.head.i for t in doc]
    cats = doc.cats
    ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    # roundtrip to DocBin
    with make_tempdir() as tmpdir:
        # use a separate vocab to test that all labels are added
        reloaded_nlp = English()
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        output_file = tmpdir / "roundtrip.spacy"
        DocBin(docs=[doc]).to_disk(output_file)
        reader = Corpus(output_file)
        reloaded_examples = list(reader(reloaded_nlp))
        assert len(doc) == sum(len(eg) for eg in reloaded_examples)
        reloaded_example = reloaded_examples[0]
        assert text == reloaded_example.reference.text
        assert idx == [t.idx for t in reloaded_example.reference]
        assert tags == [t.tag_ for t in reloaded_example.reference]
        assert pos == [t.pos_ for t in reloaded_example.reference]
        assert morphs == [str(t.morph) for t in reloaded_example.reference]
        assert lemmas == [t.lemma_ for t in reloaded_example.reference]
        assert deps == [t.dep_ for t in reloaded_example.reference]
        assert heads == [t.head.i for t in reloaded_example.reference]
        assert ents == [
            (e.start_char, e.end_char, e.label_)
            for e in reloaded_example.reference.ents
        ]
        assert "TRAVEL" in reloaded_example.reference.cats
        assert "BAKING" in reloaded_example.reference.cats
        assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
        assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        assert len(train_data) == 2
        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4