Example #1
def test_correct_biluo_tags_random(nlp):
    ntests = 100
    for _ in range(ntests):
        length = 10
        text = ("foo " * length).strip()
        doc = nlp(text)
        tags = create_tags_sample(length)
        corrected_tags, _ = correct_biluo_tags(tags)
        # Should not raise: the corrected tags must form a valid BILUO sequence
        spans_from_biluo_tags(doc, corrected_tags)
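The helpers `create_tags_sample` and `correct_biluo_tags` come from the snippet's own project and are not shown. A minimal sketch of what the random tag generator might look like, purely as an assumption:

# Hypothetical helper (not from the original project): build a random,
# possibly invalid BILUO tag sequence of the given length.
import random

def create_tags_sample(length, labels=("LOC", "GPE")):
    prefixes = ["B", "I", "L", "U", "O"]
    tags = []
    for _ in range(length):
        prefix = random.choice(prefixes)
        if prefix == "O":
            tags.append("O")
        else:
            tags.append(f"{prefix}-{random.choice(labels)}")
    return tags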
Example #2
def format_data_to_jsonl(data, file_path, print_label=False):
    result = []
    labels = set()
    i = 0

    data = tqdm.tqdm(data, leave=False)

    with file_path.open("w", encoding="utf-8") as f:
        for d in data:
            text = d['text']
            ents = []
            label_data = d["label"]
            for label, labeled_spans in label_data.items():
                labels.add(label)
                label_ent_array = []
                for text_labeled, ent_arrays in labeled_spans.items():
                    start_char, end_char = ent_arrays[0]
                    # Source offsets appear to be end-inclusive; spaCy expects exclusive ends
                    label_ent_array.append((start_char, end_char + 1, label))
                # Only the first entity found for each label is kept
                ents.append(label_ent_array[0])

            if diff_contain_overlapping(ents):
                i += 1

                doc = nlp(text)
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)

                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")

    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")
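The nested `label` structure this function consumes isn't shown. From the loop above it appears to map each label to surface strings and end-inclusive character offsets (hence the `end_char + 1`); a sketch of one input record, as an assumption:

# Assumed input shape (one record), inferred from the loop above:
# label -> surface text -> list of [start, end] offsets, end inclusive.
sample = {
    "text": "Apple is in Cupertino",
    "label": {
        "ORG": {"Apple": [[0, 4]]},
        "GPE": {"Cupertino": [[12, 20]]},
    },
}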
Example #3
def spacy_doc_from_sentences(sentences: List[List[str]], labels: List[str],
                             nlp: Language) -> Doc:
    # Create initial doc
    all_tokens = list(chain.from_iterable(sentences))
    # Mark every token as being followed by a space
    spaces = [True] * len(all_tokens)
    doc = Doc(nlp.vocab, words=all_tokens, spaces=spaces)

    # Set sentence boundaries
    tok_idx = 0
    for sentence in sentences:
        for sentence_idx in range(len(sentence)):
            # The first token of each sentence has is_sent_start True, all others False
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1

    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )

        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))

    return doc
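A quick usage sketch, assuming spaCy 2.x (where `iob_to_biluo` and `spans_from_biluo_tags` are imported from `spacy.gold`):

import spacy

nlp = spacy.blank("en")
doc = spacy_doc_from_sentences(
    [["Alice", "works", "at", "Acme", "."], ["She", "lives", "in", "Paris", "."]],
    ["B-PER", "O", "O", "B-ORG", "O", "O", "O", "O", "B-LOC", "O"],
    nlp,
)
print([(ent.text, ent.label_) for ent in doc.ents])
# [('Alice', 'PER'), ('Acme', 'ORG'), ('Paris', 'LOC')]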
Example #4
def test_biluo_spans(en_tokenizer):
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    spans = spans_from_biluo_tags(doc, biluo_tags)
    assert len(spans) == 2
    assert spans[0].text == "Silicon Valley"
    assert spans[0].label_ == "LOC"
    assert spans[1].text == "London"
    assert spans[1].label_ == "GPE"
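The inverse helper, `biluo_tags_from_offsets`, converts character offsets into BILUO tags, so the two round-trip; a companion check under the same fixture, as a sketch:

from spacy.gold import biluo_tags_from_offsets

def test_biluo_tags_roundtrip(en_tokenizer):
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    assert biluo_tags_from_offsets(doc, offsets) == [
        "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"
    ]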
Example #6
    def set_annotations(
        self, docs: Iterable[Doc], logits: torch.Tensor
    ) -> Iterable[Doc]:
        assert len(logits.shape) == 3  # (batch, length, nclass)
        id2label = self.labels

        for doc, logit in zip(docs, cast(Iterable, logits)):
            doc._.set("tokens_logit", logit)
            best_tags = get_best_tags(logit, id2label, self.k_beam)
            # Map subword-level tags back to spaCy tokens via the stored alignment;
            # tokens with no aligned piece fall back to "O"
            ents = [best_tags[a[0]] if len(a) else "O" for a in doc._.get(ATTRS.align)]
            biluo_ents = iob_to_biluo(ents)
            doc.ents = tuple(
                spacy.util.filter_spans(
                    doc.ents + tuple(spans_from_biluo_tags(doc, biluo_ents))
                )
            )
        return docs
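`spacy.util.filter_spans` resolves overlaps by preferring the longest span (and, on ties, the earliest), which is how the newly decoded entities are merged with any already on the doc. A standalone illustration:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("New York City is big")
# Two overlapping candidates: the longer "New York City" wins
candidates = [Span(doc, 0, 2, label="GPE"), Span(doc, 0, 3, label="GPE")]
print(spacy.util.filter_spans(candidates))  # [New York City]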
Example #7
def main(textfile, output, dummymodel, labellist):
    # A dummy model is needed to create an nlp object, which is used to transform the txt file to JSON
    nlp = spacy.load(dummymodel)
    sr_transform = load_SRs_file(textfile)

    sr_transform_string = eval(spacy_format(sr_transform, labellist))
    docs = []
    for text, annot in sr_transform_string:
        doc = nlp(text)
        doc.is_parsed = True
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)
    # Create the JSON file in the same directory as textfile

    mkdir_p(os.path.split(output)[0])
    srsly.write_json(output, [spacy.gold.docs_to_json(docs)])
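`mkdir_p` isn't defined in the snippet; a common implementation (an assumption here) is a thin wrapper over `os.makedirs`:

import os

def mkdir_p(path):
    # Create the directory and any missing parents; do nothing if it already exists
    os.makedirs(path, exist_ok=True)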
Example #8
def ls_to_spacy_json(ls_completions):
    nlp = spacy.load('en_core_web_sm')

    # Load the Label Studio completions
    with ZipFile(ls_completions, 'r') as zf:
        result_file = zf.read('result.json')
        label_studio_json = json.loads(result_file)

    gold_docs = []
    entity_cnt = 0
    for task in label_studio_json:
        completions = task['completions']

        # don't include skipped tasks or tasks with multiple completions
        if len(completions) == 1:
            completion = completions[0]
            if 'was_cancelled' in completion:
                continue

            raw_text = task['data']['reddit']
            annotated_entities = []
            for result in completion['result']:
                ent = result['value']
                start_char_offset = ent['start']
                end_char_offset = ent['end']
                ent_label = ent['labels'][0]
                entity = (start_char_offset, end_char_offset, ent_label)
                annotated_entities.append(entity)

            doc = nlp(raw_text)
            tags = biluo_tags_from_offsets(doc, annotated_entities)
            entities = spans_from_biluo_tags(doc, tags)
            doc.ents = entities
            gold_docs.append(doc)
            entity_cnt += len(annotated_entities)

    print(f"{entity_cnt} entities in {len(gold_docs)} docs.")
    return gold_docs
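One caveat when converting character offsets this way: annotations that don't align with spaCy's tokenization come back as "-" tags, which `spans_from_biluo_tags` then rejects. A minimal illustration, assuming spaCy 2.x:

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("don't panic")
# (0, 3) cuts through the tokens "do" / "n't", so the overlapping
# tokens are tagged "-" rather than valid BILUO tags
print(biluo_tags_from_offsets(doc, [(0, 3, "X")]))  # ['-', '-', 'O']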
Example #9
def test_roundtrip_docs_to_json():
    text = "I flew to Silicon Valley via London."
    tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    heads = [1, 1, 1, 4, 2, 1, 5, 1]
    deps = [
        "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"
    ]
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    for i in range(len(tags)):
        doc[i].tag_ = tags[i]
        doc[i].dep_ = deps[i]
        doc[i].head = doc[heads[i]]
    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
    doc.cats = cats
    doc.is_tagged = True
    doc.is_parsed = True

    # roundtrip to JSON
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(json_file), str(json_file))

    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

    assert len(doc) == goldcorpus.count_train()
    assert text == reloaded_doc.text
    assert tags == goldparse.tags
    assert deps == goldparse.labels
    assert heads == goldparse.heads
    assert biluo_tags == goldparse.ner
    assert "TRAVEL" in goldparse.cats
    assert "BAKING" in goldparse.cats
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
    assert cats["BAKING"] == goldparse.cats["BAKING"]

    # roundtrip to JSONL train dicts
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

    assert len(doc) == goldcorpus.count_train()
    assert text == reloaded_doc.text
    assert tags == goldparse.tags
    assert deps == goldparse.labels
    assert heads == goldparse.heads
    assert biluo_tags == goldparse.ner
    assert "TRAVEL" in goldparse.cats
    assert "BAKING" in goldparse.cats
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
    assert cats["BAKING"] == goldparse.cats["BAKING"]

    # roundtrip to JSONL tuples
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        # write to JSONL train dicts
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
        # load and rewrite as JSONL tuples
        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

    assert len(doc) == goldcorpus.count_train()
    assert text == reloaded_doc.text
    assert tags == goldparse.tags
    assert deps == goldparse.labels
    assert heads == goldparse.heads
    assert biluo_tags == goldparse.ner
    assert "TRAVEL" in goldparse.cats
    assert "BAKING" in goldparse.cats
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
    assert cats["BAKING"] == goldparse.cats["BAKING"]
Example #10
import spacy
import srsly
import json
from spacy.gold import docs_to_json, biluo_tags_from_offsets, spans_from_biluo_tags

nlp = spacy.load('en_core_web_lg')
for i in range(114):
    train_data = json.load(
        open(
            f"/home/marco/Scrivania/tirocinio-unicredit/news/final_attempt/training_data/sector/cli/train_placeholder/{i}.json"
        ))

    docs = []
    c = 0
    for kgid, text, annot in train_data:
        c += 1
        print(c)
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        entities = spans_from_biluo_tags(doc, tags)
        doc.ents = entities
        docs.append(doc)

    srsly.write_json(
        f"/home/marco/Scrivania/tirocinio-unicredit/news/final_attempt/training_data/sector/cli/train_placeholder/gold/{i}.json",
        [docs_to_json(docs)])
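Note that all of these snippets target the spaCy 2.x API, where these helpers live in spacy.gold. In spaCy 3.x the same functionality moved to spacy.training under new names, e.g.:

# spaCy 3.x equivalents of the spacy.gold helpers used above
from spacy.training import biluo_tags_to_spans, offsets_to_biluo_tags, iob_to_biluo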