def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    header = True
    with open(input_path, "r") as in_f:
        for line in tqdm(in_f):
            # skip the CSV header row
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line),
                                           header=None,
                                           usecols=[0, 1]).values[0]
            # the token column is stored as a serialized Python literal
            # (ast.literal_eval would be a safer choice than eval here)
            tokens = eval(tokens)
            eg = line_to_dict(sentence, tokens)

            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
Example #2
def test_create_from_words_and_text(vocab):
    # no whitespace in words
    words = ["'", "dogs", "'", "run"]
    text = "  'dogs'\n\nrun  "
    (words, spaces) = util.get_words_and_spaces(words, text)
    doc = Doc(vocab, words=words, spaces=spaces)
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
    assert [t.text for t in doc if not t.text.isspace()] == [
        word for word in words if not word.isspace()
    ]

    # partial whitespace in words
    words = ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    text = "  'dogs'\n\nrun  "
    (words, spaces) = util.get_words_and_spaces(words, text)
    doc = Doc(vocab, words=words, spaces=spaces)
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
    assert [t.text for t in doc if not t.text.isspace()] == [
        word for word in words if not word.isspace()
    ]

    # non-standard whitespace tokens
    words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
    text = "  'dogs'\n\nrun  "
    (words, spaces) = util.get_words_and_spaces(words, text)
    doc = Doc(vocab, words=words, spaces=spaces)
    assert [t.text for t in doc] == ["  ", "'", "dogs", "'", "\n\n", "run", " "]
    assert [t.whitespace_ for t in doc] == ["", "", "", "", "", " ", ""]
    assert doc.text == text
    assert [t.text for t in doc if not t.text.isspace()] == [
        word for word in words if not word.isspace()
    ]

    # mismatch between words and text
    with pytest.raises(ValueError):
        words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
        text = "  'dogs'\n\nrun  "
        (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
Example #3
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
    # additional whitespace tokens in GoldParse words
    words, spaces = get_words_and_spaces(
        ["I", "flew", "to", "San Francisco", "Valley", "."],
        "I flew  to San Francisco Valley.",
    )
    doc = Doc(en_vocab, words=words, spaces=spaces)
    prefix = "I flew  to "
    entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
    gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
    gold_spaces = [True, True, False, True, False, False]
    example = Example.from_dict(
        doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
    )
    ner_tags = example.get_aligned_ner()
    assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
Example #4
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example #5
def main(input_path: Path = typer.Argument(..., exists=True, dir_okay=False)):
    print("Read params.yaml...")
    with open("params.yaml", "r") as fd:
        params = yaml.safe_load(fd)
    dev_size = params["train"]["corpora"]["dev_size"]
    shuffle_seed = params["train"]["corpora"]["shuffle_seed"]
    print(f"...read dev_size={dev_size}, shuffle_seed={shuffle_seed}")

    print("Read annotations...")
    corpus = list(srsly.read_jsonl(input_path))
    print(f"...read {len(corpus)} texts")

    print("Convert into documents...")
    docs = []
    nlp = spacy.blank("en")
    for eg in corpus:
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        docs.append(doc)
    print(f"...converted {len(docs)} documents")

    print("Split into train and dev...")
    train, dev = train_test_split(docs,
                                  test_size=dev_size,
                                  random_state=shuffle_seed,
                                  shuffle=True)
    print(f"...split into {len(train)} train and {len(dev)} dev documents")

    print("Write serialized documents...")
    for split, data in [("train", train), ("dev", dev)]:
        output_path = input_path.with_suffix(f".{split}.spacy")
        doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], docs=data)
        doc_bin.to_disk(output_path)
        print(f"...wrote {output_path}")
Example #6
def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
    # one-to-many
    words = ["I", "flew to", "San Francisco Valley", "."]
    spaces = [True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"),
                 "LOC")]
    gp = GoldParse(
        doc,
        words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
        entities=entities,
    )
    assert gp.ner == ["O", "O", "U-LOC", "O"]

    # many-to-one
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"),
                 "LOC")]
    gp = GoldParse(doc,
                   words=["I", "flew to", "San Francisco Valley", "."],
                   entities=entities)
    assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

    # misaligned
    words = ["I flew", "to", "San Francisco", "Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"),
                 "LOC")]
    gp = GoldParse(
        doc,
        words=["I", "flew to", "San", "Francisco Valley", "."],
        entities=entities,
    )
    assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"]

    # additional whitespace tokens in GoldParse words
    words, spaces = get_words_and_spaces(
        ["I", "flew", "to", "San Francisco", "Valley", "."],
        "I flew  to San Francisco Valley.",
    )
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew  to "), len("I flew  to San Francisco Valley"),
                 "LOC")]
    gp = GoldParse(
        doc,
        words=["I", "flew", " ", "to", "San Francisco Valley", "."],
        entities=entities,
    )
    assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

    # from issue #4791
    data = (
        "I'll return the ₹54 amount",
        {
            "words": [
                "I",
                "'ll",
                "return",
                "the",
                "₹",
                "54",
                "amount",
            ],
            "entities": [(16, 19, "MONEY")],
        },
    )
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]

    data = (
        "I'll return the $54 amount",
        {
            "words": [
                "I",
                "'ll",
                "return",
                "the",
                "$",
                "54",
                "amount",
            ],
            "entities": [(16, 19, "MONEY")],
        },
    )
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
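GoldParse is the spaCy v2 API; in v3 the same alignment check is written with Example.from_dict and get_aligned_ner, as in Example #3 above. A rough sketch of the one-to-many case under that API (the aligned tags are printed rather than asserted, since v3 alignment can differ in detail):

# Illustrative v3 rewrite of the one-to-many case; a sketch, not the original test.
import spacy
from spacy.tokens import Doc
from spacy.training import Example

vocab = spacy.blank("en").vocab
doc = Doc(vocab, words=["I", "flew to", "San Francisco Valley", "."],
          spaces=[True, True, False, False])
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
example = Example.from_dict(
    doc,
    {"words": ["I", "flew", "to", "San", "Francisco", "Valley", "."],
     "entities": entities},
)
print(example.get_aligned_ner())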