def test_get_paragraph():
    """Check how many paragraphs are read from the sample XML file.

    The same file is parsed twice: once keeping the paragraphs that carry no
    annotation, once dropping them.
    """
    sample_xml = "./resources/test/test.xml"
    with_unannotated = get_paragraph_from_file(
        path=sample_xml, keep_paragraph_without_annotation=True)
    annotated_only = get_paragraph_from_file(
        path=sample_xml, keep_paragraph_without_annotation=False)
    # Expected counts for the bundled test resource.
    assert len(with_unannotated) == 27
    assert len(annotated_only) == 3
def annotate(model_dir_path: str, files_dir_path: List[str], out_dir_path: str) -> None:
    """
    Load the NER model and read the cases to annotate.

    :param model_dir_path: the directory of the Spacy model
    :param files_dir_path: paths of the input files; entries ending in ".xml"
        are parsed with get_paragraph_from_file, entries ending in ".txt" are
        read line by line, any other entry is ignored
    :param out_dir_path: the directory where to write the annotations
        (not referenced by the code shown here)
    """
    logging.info("Loading NER model…")
    nlp = get_empty_model(load_labels_for_training=False).from_disk(model_dir_path)

    # TODO remove when we have retrained
    custom_infixes = nlp.Defaults.infixes + [r':', r"(?<=[\W\d_])-|-(?=[\W\d_])"]
    nlp.tokenizer.infix_finditer = spacy.util.compile_infix_regex(custom_infixes).finditer
    # end of deletion above

    entity_typename_builder = EntityTypename()

    logging.info("Loading cases…")
    cases: List[Case] = []
    for path in files_dir_path:
        if path.endswith(".xml"):
            cases.append(
                get_paragraph_from_file(path=path,
                                        keep_paragraph_without_annotation=True))
        elif path.endswith(".txt"):
            with open(path) as txt_file:
                stripped_lines = [raw.strip() for raw in txt_file.readlines()]
            # The case name is the file name up to its first dot.
            case_name = os.path.basename(path).split(".")[0]
            # One paragraph per line holding more than one character once stripped.
            cases.append([
                Paragraph(case_name, text, list(), list())
                for text in stripped_lines
                if len(text) > 1
            ])
def main(data_folder: str, model_folder: str, top_n: int) -> None:
    """
    Run the saved sequence tagger on the first paragraphs of each XML file and
    write the rendered predictions to "sentence.html".

    :param data_folder: folder listed for files ending in ".xml"
    :param model_folder: folder containing the "best-model.pt" tagger file
    :param top_n: number of leading paragraphs kept per file; a file
        contributes only when it holds more than top_n paragraphs
    :raises Exception: when no sentence could be built from the folder
    """
    print(f"keep only top {top_n} examples per file")
    nlp: Language = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)
    tokenizer = build_spacy_tokenizer(nlp)

    filenames = [name for name in os.listdir(data_folder) if name.endswith(".xml")]

    sentences: List[Sentence] = []
    with tqdm(total=len(filenames), unit=" XML", desc="Parsing XML") as progress_bar:
        for filename in filenames:
            paragraphs: List[Paragraph] = get_paragraph_from_file(
                path=os.path.join(data_folder, filename),
                keep_paragraph_without_annotation=True)
            if len(paragraphs) > top_n:
                # Empty paragraphs are not turned into sentences.
                sentences.extend(
                    Sentence(text=paragraph.text, tokenizer=tokenizer)
                    for paragraph in paragraphs[:top_n]
                    if len(paragraph.text) > 0)
            progress_bar.update()

    if not sentences:
        raise Exception(
            "No example loaded, causes: no cases in provided path or sample size is to high"
        )

    model_path = os.path.join(model_folder, 'best-model.pt')
    tagger: SequenceTagger = SequenceTagger.load(model_path)
    # The return value is discarded; the sentences are rendered afterwards.
    _ = tagger.predict(sentences=sentences,
                       mini_batch_size=32,
                       verbose=True,
                       embedding_storage_mode="cpu")

    print("prepare html")
    page_html = render_ner_html(sentences, colors=colors)
    print("write html")
    with open("sentence.html", "w") as writer:
        writer.write(page_html)
# Show how many command-line arguments were received.
print(len(sys.argv))
# At most one optional mode argument is accepted after the script name.
assert len(sys.argv) <= 2
if len(sys.argv) == 2:
    param = sys.argv[1]
    train_dataset = param == "train_data_set"
    export_dataset = param == "export_dataset"
else:
    train_dataset = export_dataset = False

# Collect the paragraphs of every ".xml" file found in the training folder.
TRAIN_DATA: List[Paragraph] = []
for filename in os.listdir(xml_train_path):
    if not filename.endswith(".xml"):
        continue
    current_path = os.path.join(xml_train_path, filename)
    TRAIN_DATA.extend(
        get_paragraph_from_file(path=current_path,
                                keep_paragraph_without_annotation=True))

if not (train_dataset or export_dataset):
    # No mode requested: keep only the leading paragraphs for display.
    print("Prepare training set for display")
    TRAIN_DATA = TRAIN_DATA[:number_of_paragraph_to_display]
elif train_dataset:
    print("Train model")
else:
    print("Save recurrent entities")

print("folder_path:{}".format(xml_train_path))
case_header_content = parse_xml_headers(folder_path=xml_train_path)
print("header_content:{}".format(case_header_content))

# Per-case accumulators, starting empty.
current_case_paragraphs = []
current_case_offsets = []
previous_case_id: Optional[str] = None
current_item_header = None
] random.shuffle(files) with open("./resources/training_data/generated_annotations.txt", mode='w') as generated_text: with open("./resources/training_data/generated_annotations.ent", mode='w') as generated_entities: for filename in files: if filename.endswith(".xml"): try: print(f"--- {filename} ---") text_lines: List[str] = list() offset_lines: List[str] = list() print("read XML") generated_paragraphs: List[Sentence] = list() paragraphs = get_paragraph_from_file( path=filename, keep_paragraph_without_annotation=True) if len(paragraphs) > 50000: for paragraph in paragraphs: # type: Paragraph if len(paragraph.text) > 0: s = Sentence(text=paragraph.text, tokenizer=tokenizer) generated_paragraphs.append(s) generated_paragraphs = tagger.predict( sentences=generated_paragraphs, mini_batch_size=32, verbose=True, embedding_storage_mode="none") for sentence in generated_paragraphs: text_lines.append(sentence.original_text)
# KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from xml_extractions.extract_node_values import get_paragraph_from_file from ner.model_factory import get_empty_model from resources.config_provider import get_config_default from viewer.spacy_viewer import view_spacy_docs config_training = get_config_default() model_dir_path = config_training["model_dir_path"] xml_dev_path = config_training["xml_dev_path"] number_of_paragraph_to_display = int( config_training["number_of_paragraph_to_display"]) DEV_DATA = get_paragraph_from_file(xml_dev_path, keep_paragraph_without_annotation=True) DEV_DATA = list(DEV_DATA)[:number_of_paragraph_to_display] doc_annotated = list() nlp = get_empty_model(load_labels_for_training=True) # nlp = nlp.from_disk(model_dir_path) for current_case_id, xml_paragraph, xml_extracted_text, xml_offset in DEV_DATA: spacy_matcher_offset = list() doc = nlp.make_doc(xml_paragraph) for start_offset, end_offset, type_name in xml_offset: # https://spacy.io/usage/linguistic-features#section-named-entities span_doc = doc.char_span(start_offset, end_offset, label=type_name) if span_doc is not None: # span will be none if the word is incomplete