def test_correct_biluo_tags_random(nlp):
    """Smoke-test tag correction on sequences from ``create_tags_sample``.

    Every round builds a short doc, samples a tag sequence, passes it through
    ``correct_biluo_tags`` and hands the corrected tags to
    ``spans_from_biluo_tags``. There are no explicit assertions: the test
    passes as long as none of these calls raises.
    """
    rounds = 100
    length = 10
    for _ in range(rounds):
        # "foo" repeated `length` times, separated by single spaces.
        text = " ".join(["foo"] * length)
        doc = nlp(text)
        sampled_tags = create_tags_sample(length)
        # Only the corrected tags are needed; the second return value is ignored.
        corrected_tags, _ignored = correct_biluo_tags(sampled_tags)
        spans_from_biluo_tags(doc, corrected_tags)
def format_data_to_jsonl(data, file_path, print_label=False):
    """Convert annotated records to spaCy JSON and write them as JSON lines.

    Args:
        data: Iterable of records. Each record is indexed with ``'text'`` and
            ``"label"``; ``record["label"]`` maps a label name to a mapping
            whose values are sequences of ``(start, end)`` pairs.
        file_path: Object with a ``pathlib``-style ``open`` method; it is
            opened for writing as UTF-8.
        print_label: When true, report the set of label names seen.

    Only records for which ``diff_contain_overlapping(ents)`` equals ``True``
    are written. The number of written rows is reported through ``msg.good``.
    """
    seen_labels = set()
    written = 0
    progress = tqdm.tqdm(data, leave=False)
    with file_path.open("w", encoding="utf-8") as out:
        for record in progress:
            text = record['text']
            ents = []
            for label, mentions in record["label"].items():
                seen_labels.add(label)
                candidates = []
                for _mention_text, offsets in mentions.items():
                    # Only the first offset pair of each mention is used.
                    start_char, end_char = offsets[0]
                    # NOTE(review): the +1 presumably turns an inclusive end
                    # offset into an exclusive one -- confirm against the data.
                    candidates.append((start_char, end_char + 1, label))
                # Keep a single entity per label: the first one collected.
                ents.append(candidates[0])

            # The explicit comparison with True is kept on purpose: only a
            # result equal to True lets the record through.
            keep = True == diff_contain_overlapping(ents)
            if not keep:
                continue

            written = written + 1
            doc = nlp(text)
            tags = biluo_tags_from_offsets(doc, ents)
            doc.ents = spans_from_biluo_tags(doc, tags)
            line = docs_to_json([doc])
            out.write(json_dumps(line) + "\n")

    msg.good(f"Finished {file_path} :: {written} rows")
    if print_label:
        msg.info(f"{seen_labels}")
def spacy_doc_from_sentences(sentences: List[List[str]], labels: List[str], nlp: Language) -> Doc:
    """Build a ``Doc`` from pre-tokenized sentences, optionally with entities.

    Args:
        sentences: Sentences, each given as a list of token strings.
        labels: One IOB tag per token across all sentences. When empty or
            ``None``, no entities are set.
        nlp: Pipeline whose ``vocab`` backs the new ``Doc``.

    Returns:
        A ``Doc`` over the flattened tokens, with sentence starts marked and,
        if ``labels`` is given, ``doc.ents`` populated.

    Raises:
        ValueError: If ``labels`` is non-empty and its length differs from
            the total number of tokens.
    """
    # Flatten the sentences; every token is marked as followed by a space.
    all_tokens = list(chain.from_iterable(sentences))
    doc = Doc(nlp.vocab, words=all_tokens, spaces=[True] * len(all_tokens))

    # Sentence boundaries: True on the first token of a sentence, False elsewhere.
    token_index = 0
    for sentence in sentences:
        for position, _word in enumerate(sentence):
            doc[token_index].is_sent_start = position == 0
            token_index += 1

    if not labels:
        return doc

    if len(labels) != len(all_tokens):
        raise ValueError(
            f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
        )
    # Entities are created after converting IOB (actually BIO) to BILUO.
    biluo_labels = iob_to_biluo(labels)
    doc.ents = spans_from_biluo_tags(doc, biluo_labels)
    return doc
def test_biluo_spans(en_tokenizer):
    """Check that BILUO tags are turned into the expected entity spans.

    A B-/L- pair must yield one two-token span and a U- tag a single-token
    span, each carrying the label taken from its tag.
    """
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]

    spans = spans_from_biluo_tags(doc, biluo_tags)

    assert len(spans) == 2
    location, city = spans[0], spans[1]
    assert location.text == "Silicon Valley"
    assert location.label_ == "LOC"
    assert city.text == "London"
    assert city.label_ == "GPE"
def test_biluo_spans(en_tokenizer):
    """Verify the spans produced by ``spans_from_biluo_tags`` for a sample doc.

    The tag sequence marks "Silicon Valley" as LOC (B-/L- pair) and "London"
    as GPE (U- tag); exactly those two spans are expected, in that order.
    """
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    spans = spans_from_biluo_tags(doc, biluo_tags)

    expected = [("Silicon Valley", "LOC"), ("London", "GPE")]
    assert len(spans) == 2
    for span, (expected_text, expected_label) in zip(spans, expected):
        assert span.text == expected_text
        assert span.label_ == expected_label
def set_annotations(
    self, docs: Iterable[Doc], logits: torch.Tensor
) -> Iterable[Doc]:
    """Attach logits to each doc and merge the predicted entities into it.

    Args:
        docs: Documents to annotate, paired in order with the first axis of
            ``logits``.
        logits: Rank-3 tensor, (batch, length, nclass).

    Returns:
        The same ``docs`` iterable that was passed in.

    For every doc the matching logit slice is stored in the ``tokens_logit``
    extension, tags are decoded with ``get_best_tags``, and the resulting
    spans are combined with the existing ``doc.ents`` through
    ``spacy.util.filter_spans``.
    """
    assert len(logits.shape) == 3  # (batch, length, nclass)
    id2label = self.labels
    for doc, logit in zip(docs, cast(Iterable, logits)):
        doc._.set("tokens_logit", logit)
        best_tags = get_best_tags(logit, id2label, self.k_beam)

        # One tag per alignment entry: the tag at the entry's first index,
        # or "O" when the entry is empty.
        # NOTE(review): presumably each entry lists the model-side positions
        # aligned to one spaCy token -- confirm against ATTRS.align.
        token_tags = []
        for aligned in doc._.get(ATTRS.align):
            token_tags.append(best_tags[aligned[0]] if len(aligned) else "O")

        biluo_tags = iob_to_biluo(token_tags)
        existing = doc.ents
        predicted = tuple(spans_from_biluo_tags(doc, biluo_tags))
        merged = spacy.util.filter_spans(existing + predicted)
        doc.ents = tuple(merged)
    return docs
def main(textfile, output, dummymodel, labellist):
    """Convert an annotated text file into a spaCy JSON training file.

    Args:
        textfile: Path passed to ``load_SRs_file``.
        output: Path of the JSON file to write; its parent directory is
            created through ``mkdir_p``.
        dummymodel: Name or path of the spaCy model to load. It is only
            needed to obtain an ``nlp`` object for building docs.
        labellist: Labels forwarded to ``spacy_format``.
    """
    # A dummy model is needed to get an nlp object that turns text into docs.
    nlp = spacy.load(dummymodel)
    loaded = load_SRs_file(textfile)
    # NOTE(review): eval() executes whatever string spacy_format returns. If
    # textfile can come from an untrusted source this is a code-execution
    # risk; consider ast.literal_eval if the string is a plain literal.
    examples = eval(spacy_format(loaded, labellist))

    docs = []
    for text, annot in examples:
        doc = nlp(text)
        doc.is_parsed = True
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        doc.ents = spans_from_biluo_tags(doc, tags)
        docs.append(doc)

    # Make sure the directory that will hold the output file exists.
    target_dir = os.path.dirname(output)
    mkdir_p(target_dir)
    srsly.write_json(output, [spacy.gold.docs_to_json(docs)])
def ls_to_spacy_json(ls_completions):
    """Turn a Label Studio export archive into a list of annotated docs.

    Args:
        ls_completions: Path (or file object) of a zip archive containing
            ``result.json``.

    Returns:
        A list of docs, one per task that has exactly one completion and no
        ``was_cancelled`` key in that completion, with ``doc.ents`` set from
        the annotated character offsets.

    The entity and doc counts are printed before returning.
    """
    nlp = spacy.load('en_core_web_sm')

    # Load the Label Studio completions
    with ZipFile(ls_completions, 'r') as archive:
        raw_results = archive.read('result.json')
    label_studio_json = json.loads(raw_results)

    gold_docs = []
    entity_cnt = 0
    for task in label_studio_json:
        completions = task['completions']
        # don't include skipped tasks or tasks with multiple completions
        if len(completions) != 1:
            continue
        completion = completions[0]
        if 'was_cancelled' in completion:
            continue

        raw_text = task['data']['reddit']
        # (start, end, label) triples; only the first label of each result is used.
        annotated_entities = [
            (
                result['value']['start'],
                result['value']['end'],
                result['value']['labels'][0],
            )
            for result in completion['result']
        ]

        doc = nlp(raw_text)
        tags = biluo_tags_from_offsets(doc, annotated_entities)
        doc.ents = spans_from_biluo_tags(doc, tags)
        gold_docs.append(doc)
        entity_cnt += len(annotated_entities)

    print("{} entities in {} docs.".format(str(entity_cnt), len(gold_docs)))
    return gold_docs
def test_roundtrip_docs_to_json():
    """Round-trip an annotated doc through three on-disk training formats.

    The doc is serialised with ``docs_to_json`` as a JSON file, as JSONL
    train dicts, and as JSONL tuples (by rewriting ``train_tuples``). After
    each write it is reloaded through ``GoldCorpus`` and the text, tags,
    dependencies, heads, entity tags and categories are compared with the
    originals.
    """
    text = "I flew to Silicon Valley via London."
    tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    heads = [1, 1, 1, 4, 2, 1, 5, 1]
    deps = [
        "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"
    ]
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}

    nlp = English()
    doc = nlp(text)
    for index, (tag, dep, head) in enumerate(zip(tags, deps, heads)):
        token = doc[index]
        token.tag_ = tag
        token.dep_ = dep
        token.head = doc[head]
    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
    doc.cats = cats
    doc.is_tagged = True
    doc.is_parsed = True

    def check_reloaded(goldcorpus):
        # Reload the first training doc and compare it with the original.
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert tags == goldparse.tags
        assert deps == goldparse.labels
        assert heads == goldparse.heads
        assert biluo_tags == goldparse.ner
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]

    # roundtrip to JSON
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        check_reloaded(GoldCorpus(str(json_file), str(json_file)))

    # roundtrip to JSONL train dicts
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        check_reloaded(GoldCorpus(str(jsonl_file), str(jsonl_file)))

    # roundtrip to JSONL tuples
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        # write to JSONL train dicts
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
        # load and rewrite as JSONL tuples
        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
        check_reloaded(GoldCorpus(str(jsonl_file), str(jsonl_file)))
import json

import spacy
import srsly
from spacy.gold import docs_to_json, biluo_tags_from_offsets, spans_from_biluo_tags

# Directory holding the numbered input files; converted files are written to
# its "gold" subdirectory under the same file name.
TRAIN_DIR = "/home/marco/Scrivania/tirocinio-unicredit/news/final_attempt/training_data/sector/cli/train_placeholder"
# Input files are named 0.json .. (N_FILES - 1).json.
N_FILES = 114

nlp = spacy.load('en_core_web_lg')

for i in range(N_FILES):
    # Use a context manager so each input file is closed right after parsing
    # instead of leaking one handle per iteration.
    with open(f"{TRAIN_DIR}/{i}.json") as train_file:
        train_data = json.load(train_file)

    docs = []
    # Each record is a (kgid, text, annotation) triple; only the text and the
    # annotation's 'entities' offsets are used. The counter is printed as a
    # progress indicator.
    for c, (_kgid, text, annot) in enumerate(train_data, start=1):
        print(c)
        doc = nlp(text)
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        doc.ents = spans_from_biluo_tags(doc, tags)
        docs.append(doc)

    srsly.write_json(f"{TRAIN_DIR}/gold/{i}.json", [docs_to_json(docs)])