# Imports assumed by the tests below (spaCy v3 API):
import srsly

from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.training import Corpus, Example
from spacy.training.converters import json_to_docs
from spacy.util import make_tempdir


def test_json_to_docs_no_ner(en_vocab):
    data = [
        {
            "id": 1,
            "paragraphs": [
                {
                    "sentences": [
                        {
                            "tokens": [
                                {"dep": "nn", "head": 1, "tag": "NNP", "orth": "Ms."},
                                {"dep": "nsubj", "head": 1, "tag": "NNP", "orth": "Haag"},
                                {"dep": "ROOT", "head": 0, "tag": "VBZ", "orth": "plays"},
                                {"dep": "dobj", "head": -1, "tag": "NNP", "orth": "Elianti"},
                                {"dep": "punct", "head": -2, "tag": ".", "orth": "."},
                            ]
                        }
                    ]
                }
            ],
        }
    ]
    docs = list(json_to_docs(data))
    assert len(docs) == 1
    for doc in docs:
        # The JSON input has no "ner" fields, so the converted Doc should
        # carry no entity annotation at all.
        assert not doc.has_annotation("ENT_IOB")
        for token in doc:
            assert token.ent_iob == 0
        # Align against a fresh, unannotated Doc over the same words: the
        # aligned NER tags should be None (missing), not "O".
        eg = Example(
            Doc(
                doc.vocab,
                words=[w.text for w in doc],
                spaces=[bool(w.whitespace_) for w in doc],
            ),
            doc,
        )
        ner_tags = eg.get_aligned_ner()
        assert ner_tags == [None, None, None, None, None]
def json_path_to_examples(data_path, NLP):
    data = srsly.read_json(data_path)
    # No good way to convert with a specified vocab, so convert, then reload
    # through DocBin with the right vocab.
    docs = json_to_docs(data)
    docbin = DocBin()
    for doc in docs:
        docbin.add(doc)
    docs = docbin.get_docs(NLP.vocab)
    examples = [Example(NLP.make_doc(doc.text), doc) for doc in docs]
    return examples
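
# A minimal round-trip sketch for json_path_to_examples (illustrative only,
# not part of the original suite; the file name and single-token document are
# assumptions). It writes v2-style JSON to disk, converts it back through the
# helper above, and checks that one Example comes out per document.
def test_json_path_to_examples_roundtrip():
    data = [
        {
            "id": 1,
            "paragraphs": [
                {
                    "sentences": [
                        {
                            "tokens": [
                                {"dep": "ROOT", "head": 0, "tag": "VBZ", "orth": "plays"}
                            ]
                        }
                    ]
                }
            ],
        }
    ]
    nlp = English()
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "docs.json"
        srsly.write_json(json_file, data)
        examples = json_path_to_examples(json_file, nlp)
        assert len(examples) == 1
        # The predicted side is a fresh, unannotated Doc over the same text.
        assert examples[0].predicted.text == examples[0].reference.text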
def test_issue4402():
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        assert len(train_data) == 2
        # Two paragraphs with two sentences each should split into four
        # sentence-level examples.
        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4