def test_cli_converters_conllu_to_docs_name_ner_map(lines): input_data = "\n".join(lines) converted_docs = list( conllu_to_docs(input_data, n_sents=1, ner_map={ "PER": "PERSON", "BAD": "" })) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 assert converted[0]["paragraphs"][0][ "raw"] == "Dommer FinnEilertsen avstår. " assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 5 tokens = sent["tokens"] assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu_to_docs(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", ] input_data = "\n".join(lines) converted_docs = list(conllu_to_docs(input_data, n_sents=1)) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 4 tokens = sent["tokens"] assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu_to_docs_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_", "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER", "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O", "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) converted_docs = list( conllu_to_docs(input_data, n_sents=1, merge_subtokens=True, append_morphology=True)) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. " assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 4 tokens = sent["tokens"] print(tokens) assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."] assert [t["tag"] for t in tokens] == [ "NOUN__Definite=Ind|Gender=Masc|Number=Sing", "PROPN_X__Gender=Fem,Masc|Tense=past", "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin", "PUNCT", ] assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"] assert [t["morph"] for t in tokens] == [ "Definite=Ind|Gender=Masc|Number=Sing", "Gender=Fem,Masc|Tense=past", "Mood=Ind|Tense=Pres|VerbForm=Fin", "", ] assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "U-PER", "O", "O"]
def test_roundtrip_docs_to_docbin(doc): text = doc.text idx = [t.idx for t in doc] tags = [t.tag_ for t in doc] pos = [t.pos_ for t in doc] morphs = [str(t.morph) for t in doc] lemmas = [t.lemma_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] cats = doc.cats ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] # roundtrip to DocBin with make_tempdir() as tmpdir: # use a separate vocab to test that all labels are added reloaded_nlp = English() json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) output_file = tmpdir / "roundtrip.spacy" DocBin(docs=[doc]).to_disk(output_file) reader = Corpus(output_file) reloaded_examples = list(reader(reloaded_nlp)) assert len(doc) == sum(len(eg) for eg in reloaded_examples) reloaded_example = reloaded_examples[0] assert text == reloaded_example.reference.text assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference] assert morphs == [str(t.morph) for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference] assert ents == [ (e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents ] assert "TRAVEL" in reloaded_example.reference.cats assert "BAKING" in reloaded_example.reference.cats assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
def convert(model="en", input_file=None, output_dir=None, n_texts=0): # Load model with tokenizer + sentencizer only nlp = spacy.load(model) nlp.select_pipes(disable=nlp.pipe_names) sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer, first=True) texts = [] cats = [] count = 0 if not input_file.exists(): print("Input file not found:", input_file) sys.exit(1) else: with open(input_file) as fileh: for line in fileh: data = srsly.json_loads(line) texts.append(data["text"]) cats.append(data["cats"]) if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() else: output_dir = Path(".") docs = [] for i, doc in enumerate(nlp.pipe(texts)): doc.cats = cats[i] docs.append(doc) if n_texts > 0 and count == n_texts: break count += 1 srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
def test_cli_converters_iob_to_docs(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", ] input_data = "\n".join(lines) converted_docs = list(iob_to_docs(input_data, n_sents=10)) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 assert len(converted["paragraphs"]) == 1 assert len(converted["paragraphs"][0]["sentences"]) == 4 for i in range(0, 4): sent = converted["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] expected = ["I", "like", "London", "and", "New", "York", "City", "."] assert [t["orth"] for t in tokens] == expected assert len(converted_docs[0].ents) == 8 for ent in converted_docs[0].ents: assert ent.text in ["New York City", "London"]
def test_cli_converters_conll_ner_to_docs(): lines = [ "-DOCSTART- -X- O O", "", "I\tO", "like\tO", "London\tB-GPE", "and\tO", "New\tB-GPE", "York\tI-GPE", "City\tI-GPE", ".\tO", "", "I O", "like O", "London B-GPE", "and O", "New B-GPE", "York I-GPE", "City I-GPE", ". O", "", "I PRP O", "like VBP O", "London NNP B-GPE", "and CC O", "New NNP B-GPE", "York NNP I-GPE", "City NNP I-GPE", ". . O", "", "I PRP _ O", "like VBP _ O", "London NNP _ B-GPE", "and CC _ O", "New NNP _ B-GPE", "York NNP _ I-GPE", "City NNP _ I-GPE", ". . _ O", "", "I\tPRP\t_\tO", "like\tVBP\t_\tO", "London\tNNP\t_\tB-GPE", "and\tCC\t_\tO", "New\tNNP\t_\tB-GPE", "York\tNNP\t_\tI-GPE", "City\tNNP\t_\tI-GPE", ".\t.\t_\tO", ] input_data = "\n".join(lines) converted_docs = list(conll_ner_to_docs(input_data, n_sents=10)) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 assert len(converted["paragraphs"]) == 1 assert len(converted["paragraphs"][0]["sentences"]) == 5 for i in range(0, 5): sent = converted["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] # fmt: off assert [t["orth"] for t in tokens] == [ "I", "like", "London", "and", "New", "York", "City", "." ] # fmt: on assert len(converted_docs[0].ents) == 10 for ent in converted_docs[0].ents: assert ent.text in ["New York City", "London"]