import random

import srsly
from spacy.gold import docs_to_json


def save_train_dev_data(gold_docs, split, train_file, dev_file):
    # shuffle the docs
    random.seed(27)
    random.shuffle(gold_docs)
    # split the gold data into training and evaluation;
    # `split` is the training share as a percentage (e.g. 80)
    num_training_tasks = round(len(gold_docs) * split / 100)
    train_docs = gold_docs[:num_training_tasks]
    dev_docs = gold_docs[num_training_tasks:]
    # entity_count is a helper defined elsewhere in the project
    print("{} training entities".format(entity_count(train_docs)))
    print("{} dev entities".format(entity_count(dev_docs)))
    srsly.write_json(train_file, [docs_to_json(train_docs)])
    srsly.write_json(dev_file, [docs_to_json(dev_docs)])
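# --- usage sketch (not from the original source) ---
# A minimal, hedged example: entity_count below is a hypothetical stand-in
# for the project's helper of the same name, and the pipeline is a blank
# English model with a sentencizer (docs_to_json needs sentence boundaries).
import spacy


def entity_count(docs):
    # hypothetical stand-in: count gold entities across docs
    return sum(len(doc.ents) for doc in docs)


nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))
gold_docs = [nlp(t) for t in ["Alice met Bob.", "Carol flew to Paris."]]
save_train_dev_data(gold_docs, split=80,
                    train_file="train.json", dev_file="dev.json")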
import tqdm
from srsly import json_dumps
from wasabi import msg
from spacy.gold import docs_to_json, biluo_tags_from_offsets, spans_from_biluo_tags

# `nlp` (a loaded pipeline) and `diff_contain_overlapping` (an overlap check
# on entity spans) are assumed to be defined elsewhere in the project.


def format_data_to_jsonl(data, file_path, print_label=False):
    labels = set()
    i = 0
    data = tqdm.tqdm(data, leave=False)
    with file_path.open("w", encoding="utf-8") as f:
        for d in data:
            text = d["text"]
            ents = []
            label_data = d["label"]
            for l, label_l in label_data.items():
                labels.update([l])
                label_ent_array = []
                for text_labeled, ent_arrays in label_l.items():
                    start_char, end_char = ent_arrays[0]
                    # offsets in the source data are end-inclusive
                    label_ent_array.append((start_char, end_char + 1, l))
                ents.append(label_ent_array[0])
            # only keep rows whose entity spans pass the overlap check
            if diff_contain_overlapping(ents):
                i += 1
                doc = nlp(text)
                tags = biluo_tags_from_offsets(doc, ents)
                doc.ents = spans_from_biluo_tags(doc, tags)
                line = docs_to_json([doc])
                f.write(json_dumps(line) + "\n")
    msg.good(f"Finished {file_path} :: {i} rows")
    if print_label:
        msg.info(f"{labels}")
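# --- usage sketch (not from the original source) ---
# The shape of `data` is inferred from the loop above: each record maps a
# label to {surface_string: [(start_char, end_char_inclusive)]}. The `nlp`
# and `diff_contain_overlapping` stand-ins below are hypothetical.
from pathlib import Path

import spacy


def diff_contain_overlapping(ents):
    # hypothetical stand-in: True when no two spans overlap
    ents = sorted(ents)
    return all(prev[1] <= cur[0] for prev, cur in zip(ents, ents[1:]))


nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))

sample = [{
    "text": "Alice met Bob in Berlin.",
    "label": {
        "PERSON": {"Alice": [(0, 4)]},
        "GPE": {"Berlin": [(17, 22)]},
    },
}]
format_data_to_jsonl(sample, Path("sample.jsonl"), print_label=True)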
import json

from spacy import displacy, gold


# Method from a character-matching class; prepare_list_for_ratios,
# recognize_person_entities and write_list_to_file are project helpers.
# Note that `doc` is only bound when tests_variant=False, so save_doc and
# displacy_option should not be combined with tests_variant=True.
def match_names_for_text(self, characters, text, results_dir, filename=None,
                         tests_variant=False, displacy_option=False,
                         save_ratios=False, save_doc=False):
    if tests_variant:
        train_data = []
        matches_table = prepare_list_for_ratios(characters)
        for sentence in text:
            matches_table_row, data_for_sentence, _ = \
                self.recognize_person_entities(sentence, characters)
            train_data.append(data_for_sentence[0])
            matches_table.extend(matches_table_row[1:])
    else:
        matches_table, train_data, doc = \
            self.recognize_person_entities(text, characters)
    if filename is not None:
        if save_doc:
            json_data = gold.docs_to_json(doc)
            with open(results_dir + "\\docs\\" + filename, 'w') as result:
                json.dump(json_data, result)
        if save_ratios:
            write_list_to_file(results_dir + "\\ratios\\" + filename,
                               matches_table)
        if tests_variant:
            with open(results_dir + filename, 'w', encoding='utf8') as result:
                json.dump(train_data, result, ensure_ascii=False)
        else:
            with open(results_dir + filename, 'w') as result:
                json.dump(train_data, result)
    if displacy_option:
        displacy.serve(doc, style="ent")
import json

from spacy.gold import docs_to_json

# get_labels and yield_docs are project helpers that read the raw
# annotations and yield annotated Doc objects.


def convert_to_spacy_format(data_path, output_path):
    labels = get_labels(data_path)  # was args.data; use the parameter instead
    with open(output_path, "w") as f_o:
        json_format = []
        for i, doc in enumerate(yield_docs(data_path, labels)):
            print(i)
            json_format.append(docs_to_json(doc, id=i))
        f_o.write(json.dumps(json_format))
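# --- illustration (not from the original source) ---
# For reference, docs_to_json returns a dict with an "id" and a list of
# "paragraphs", each holding the "raw" text plus tokenized "sentences".
# A quick way to inspect the structure (sentence boundaries are required,
# hence the sentencizer):
import spacy
from spacy.gold import docs_to_json

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))
print(docs_to_json(nlp("Hello world."), id=0)["paragraphs"][0]["raw"])
# -> Hello world.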
import srsly
from spacy.gold import GoldCorpus, docs_to_json
from spacy.lang.en import English

# make_tempdir is a helper from spaCy's own test suite (spacy/tests/util.py)


def test_roundtrip_docs_to_json():
    text = "I flew to Silicon Valley via London."
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    doc.cats = cats
    # mark sentence boundaries by hand: one sentence spanning the whole doc
    doc[0].is_sent_start = True
    for i in range(1, len(doc)):
        doc[i].is_sent_start = False
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(json_file), str(json_file))
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]
import srsly
from spacy.gold import GoldCorpus, docs_to_json, spans_from_biluo_tags
from spacy.lang.en import English

# Extended round-trip test: also checks tags, heads, deps and NER, and
# round-trips through the JSONL formats. make_tempdir is a helper from
# spaCy's own test suite (spacy/tests/util.py).


def test_roundtrip_docs_to_json():
    text = "I flew to Silicon Valley via London."
    tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    heads = [1, 1, 1, 4, 2, 1, 5, 1]
    deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    for i in range(len(tags)):
        doc[i].tag_ = tags[i]
        doc[i].dep_ = deps[i]
        doc[i].head = doc[heads[i]]
    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
    doc.cats = cats
    doc.is_tagged = True
    doc.is_parsed = True

    # roundtrip to JSON
    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(json_file), str(json_file))
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert tags == goldparse.tags
        assert deps == goldparse.labels
        assert heads == goldparse.heads
        assert biluo_tags == goldparse.ner
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]

    # roundtrip to JSONL train dicts
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert tags == goldparse.tags
        assert deps == goldparse.labels
        assert heads == goldparse.heads
        assert biluo_tags == goldparse.ner
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]

    # roundtrip to JSONL tuples
    with make_tempdir() as tmpdir:
        jsonl_file = tmpdir / "roundtrip.jsonl"
        # write to JSONL train dicts
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
        # load and rewrite as JSONL tuples
        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
        reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
        assert len(doc) == goldcorpus.count_train()
        assert text == reloaded_doc.text
        assert tags == goldparse.tags
        assert deps == goldparse.labels
        assert heads == goldparse.heads
        assert biluo_tags == goldparse.ner
        assert "TRAVEL" in goldparse.cats
        assert "BAKING" in goldparse.cats
        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
        assert cats["BAKING"] == goldparse.cats["BAKING"]
import spacy
import srsly
from spacy.gold import docs_to_json

# `nlp` is assumed to be a loaded pipeline, e.g. spacy.load("en_core_web_sm"),
# and `my_text` a string defined earlier.
texts = []
texts.append(my_text)
# (strings could alternatively be read line by line from a .txt file)

# naming of files
fname = "example"
json_file = fname + ".json"

docs = []  # list to be populated with spaCy Doc objects
for text in texts:
    doc = nlp(text)  # convert each string into a spaCy Doc object
    docs.append(doc)

json_data = docs_to_json(docs)  # convert the Docs to spaCy's JSON format
srsly.write_json(json_file, [json_data])
import json

import spacy
import srsly
from spacy.gold import docs_to_json, biluo_tags_from_offsets, spans_from_biluo_tags

nlp = spacy.load('en_core_web_lg')

base = "/home/marco/Scrivania/tirocinio-unicredit/news/final_attempt/training_data/sector/cli/train_placeholder"

for i in range(114):
    with open(f"{base}/{i}.json") as f:
        train_data = json.load(f)
    docs = []
    c = 0
    for kgid, text, annot in train_data:
        c += 1
        print(c)
        doc = nlp(text)
        # align the character-offset entities to tokens via BILUO tags
        tags = biluo_tags_from_offsets(doc, annot['entities'])
        doc.ents = spans_from_biluo_tags(doc, tags)
        docs.append(doc)
    srsly.write_json(f"{base}/gold/{i}.json", [docs_to_json(docs)])
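# --- optional sanity check (not from the original source) ---
# A hedged sketch: reload one of the gold files written above with
# GoldCorpus to confirm spaCy can parse it.
from spacy.gold import GoldCorpus

gold_file = f"{base}/gold/0.json"
corpus = GoldCorpus(gold_file, gold_file)
print(corpus.count_train(), "training examples")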