Example no. 1
def test_gold_biluo_overlap(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [
        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
        (len("I flew to "), len("I flew to San Francisco"), "LOC"),
    ]
    with pytest.raises(ValueError):
        offsets_to_biluo_tags(doc, entities)
Example no. 2
def test_cli_converters_conllu_to_docs_name_ner_map(lines):
    input_data = "\n".join(lines)
    converted_docs = list(
        conllu_to_docs(input_data,
                       n_sents=1,
                       ner_map={
                           "PER": "PERSON",
                           "BAD": ""
                       }))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert converted[0]["paragraphs"][0][
        "raw"] == "Dommer FinnEilertsen avstår. "
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 5
    tokens = sent["tokens"]
    assert [t["orth"]
            for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår", "."]
    assert [t["tag"]
            for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
    assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
    assert [t["dep"]
            for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
    ent_offsets = [(e[0], e[1], e[2])
                   for e in converted[0]["paragraphs"][0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0],
                                       ent_offsets,
                                       missing="O")
    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
Example no. 3
def test_cli_converters_conllu_to_docs():
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted_docs = list(conllu_to_docs(input_data, n_sents=1))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    assert [t["orth"]
            for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"]
    assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
    assert [t["head"] for t in tokens] == [1, 2, -1, 0]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
    ent_offsets = [(e[0], e[1], e[2])
                   for e in converted[0]["paragraphs"][0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0],
                                       ent_offsets,
                                       missing="O")
    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
Example no. 4
def test_gold_biluo_BIL(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
Example no. 5
def test_gold_biluo_U(en_vocab):
    words = ["I", "flew", "to", "London", "."]
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "U-LOC", "O"]
Example no. 6
def tags_from_offsets(
    doc: Doc,
    offsets: List[Dict],
    label_encoding: Optional[str] = "BIOUL",
) -> List[str]:
    """Converts offsets to BIOUL or BIO tags using spacy's `offsets_to_biluo_tags`.

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    offsets
        A list of dicts with the start and end character indices (relative to the doc) and the span label:
        `{"start": int, "end": int, "label": str}`
    label_encoding
        The label encoding to be used: BIOUL or BIO

    Returns
    -------
    tags (BIOUL or BIO)
    """
    tags = offsets_to_biluo_tags(
        doc, [(offset["start"], offset["end"], offset["label"]) for offset in offsets]
    )
    if label_encoding == "BIO":
        tags = bioul_tags_to_bio_tags(tags)
    return tags
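
A minimal usage sketch for the function above (the blank English pipeline and the example offsets are illustrative assumptions, not part of the original source; `bioul_tags_to_bio_tags` is the module's own BIOUL-to-BIO helper):

import spacy

nlp = spacy.blank("en")
doc = nlp("I flew to London.")
offsets = [{"start": 10, "end": 16, "label": "LOC"}]
print(tags_from_offsets(doc, offsets))         # ['O', 'O', 'O', 'U-LOC', 'O']
print(tags_from_offsets(doc, offsets, "BIO"))  # ['O', 'O', 'O', 'B-LOC', 'O']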
Example no. 7
def test_gold_biluo_misalign(en_vocab):
    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    with pytest.warns(UserWarning):
        tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
Example no. 8
    def generate_corpus(nlp):
        # NOTE: `file_name` is assumed to be defined in the enclosing scope.
        directory_path = path.join('data')

        corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
        raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

        # Reuse a previously serialized corpus if one exists.
        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)

        # Each .jsonl line is a doccano-style record:
        # {"data": <text>, "label": [[start, end, label], ...]}
        vulnerabilities = []
        with open(raw_path) as file:
            for line in file:
                vulnerability = loads(line)
                vulnerabilities.append({
                    'description': vulnerability['data'],
                    'entities': vulnerability.get('label', [])
                })

        corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])

        for vulnerability in vulnerabilities:
            document = nlp.make_doc(vulnerability['description'].lower())

            # Problem: doccano annotates labels at the character level, while
            # nlp.make_doc produces tokens; offsets_to_biluo_tags aligns the
            # character offsets to the token boundaries.
            tags = offsets_to_biluo_tags(document, vulnerability['entities'])
            entities = biluo_tags_to_spans(document, tags)
            document.set_ents(entities)

            corpus.add(document)

        corpus.to_disk(corpus_path)

        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)
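
A hedged call sketch for generate_corpus (the pipeline and file name are assumptions, not part of the original; `file_name` must be bound in the scope where the function is defined):

import spacy

file_name = "vulnerabilities"  # hypothetical: resolves to data/vulnerabilities.jsonl
nlp = spacy.blank("en")
examples = generate_corpus(nlp)
# The first call builds and serializes data/vulnerabilities.spacy; later calls
# load it through Corpus, which yields spacy.training.Example objects.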
Example no. 9
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    text = "I flew to Silicon Valley via London."
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
    assert biluo_tags_converted == biluo_tags
    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
    offsets_converted = [ent for ent in offsets_converted if ent[2]]
    assert offsets_converted == offsets
Example no. 10
def test_cli_converters_conllu_to_docs_subtokens():
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    input_data = "\n".join(lines)
    converted_docs = list(
        conllu_to_docs(input_data,
                       n_sents=1,
                       merge_subtokens=True,
                       append_morphology=True))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]

    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    print(tokens)
    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
    assert [t["tag"] for t in tokens] == [
        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
        "PROPN_X__Gender=Fem,Masc|Tense=past",
        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
        "PUNCT",
    ]
    assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
    assert [t["morph"] for t in tokens] == [
        "Definite=Ind|Gender=Masc|Number=Sing",
        "Gender=Fem,Masc|Tense=past",
        "Mood=Ind|Tense=Pres|VerbForm=Fin",
        "",
    ]
    assert [t["lemma"]
            for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
    ent_offsets = [(e[0], e[1], e[2])
                   for e in converted[0]["paragraphs"][0]["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0],
                                       ent_offsets,
                                       missing="O")
    assert biluo_tags == ["O", "U-PER", "O", "O"]
Example no. 11
import os
import re

import pandas as pd
import spacy
from spacy.training import biluo_tags_to_spans, offsets_to_biluo_tags


def make_docs(folder, doc_list):
    """
    Take a list of texts and their annotations and turn them into spaCy
    documents.

    folder: folder containing .txt and .out files (for this function to work
    you should have the same folder name in the ../annotated directory)
    doc_list: list to which the resulting documents are appended
    """
    nlp = spacy.load('ru_core_news_lg')
    for filename in os.listdir(
            'data/bsnlp2021_train_r1/raw/{folder}/ru'.format(folder=folder)):

        # The annotation file shares the raw file's base name: foo.txt -> foo.out.
        df = pd.read_csv(
            'data/bsnlp2021_train_r1/annotated/{folder}/ru/{filename}{out}'.format(
                folder=folder, filename=filename[:-3], out='out'),
            skiprows=1,
            header=None,
            sep='\t',
            encoding='utf8',
            on_bad_lines='skip',  # use error_bad_lines=False on pandas < 1.3
            engine='python')
        with open('data/bsnlp2021_train_r1/raw/{folder}/ru/{filename}'.format(
                folder=folder, filename=filename),
                encoding='utf8') as raw_file:
            text = raw_file.read()
        list_words = df.iloc[:, 0].tolist()
        labels = df.iloc[:, 2].tolist()
        # Locate every occurrence of each annotated surface form in the raw
        # text; re.escape guards against regex metacharacters in the words.
        entities = []
        for n in range(len(list_words)):
            for m in re.finditer(re.escape(list_words[n].strip()), text):
                entities.append([m.start(), m.end(), labels[n]])

        # Resolve conflicts between candidate spans: if two spans share a
        # start or end boundary, keep the longer one; if a span starts
        # strictly inside another, drop it. Dropped entries are blanked to
        # '' and filtered out below.
        for i in range(len(entities)):
            if len(entities[i]) != 3:
                continue
            for j in range(i + 1, len(entities)):
                if len(entities[j]) == 3 and len(entities[i]) == 3:
                    if (entities[i][0] == entities[j][0]
                            or entities[i][1] == entities[j][1]):
                        if (entities[i][1] - entities[i][0]) >= (
                                entities[j][1] - entities[j][0]):
                            entities[j] = ''
                        else:
                            entities[i] = ''
                if len(entities[j]) == 3 and len(entities[i]) == 3:
                    if entities[i][0] in range(entities[j][0] + 1,
                                               entities[j][1]):
                        entities[i] = ''
                    elif entities[j][0] in range(entities[i][0] + 1,
                                                 entities[i][1]):
                        entities[j] = ''

        entities_cleared = [e for e in entities if len(e) == 3]
        doc = nlp(text)
        # Align the character-level offsets to token boundaries, then turn
        # the BILUO tags back into Span objects and attach them to the doc.
        tags = offsets_to_biluo_tags(doc, entities_cleared)
        doc.ents = biluo_tags_to_spans(doc, tags)
        doc_list.append(doc)
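
A minimal, hedged driver for make_docs (the topic folder name is hypothetical; the directory layout is taken from the paths hard-coded above):

docs = []
make_docs('brexit', docs)  # expects data/bsnlp2021_train_r1/raw/brexit/ru/*.txt
print(len(docs), 'docs,', sum(len(d.ents) for d in docs), 'entities')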