Example #1
model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
number_of_paragraph_to_display = int(
    config_training["number_of_paragraph_to_display"])

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)
DEV_DATA = list(DEV_DATA)[:number_of_paragraph_to_display]

doc_annotated = list()

nlp = get_empty_model(load_labels_for_training=True)
# nlp = nlp.from_disk(model_dir_path)

for current_case_id, xml_paragraph, xml_extracted_text, xml_offset in DEV_DATA:
    spacy_matcher_offset = list()
    doc = nlp.make_doc(xml_paragraph)
    for start_offset, end_offset, type_name in xml_offset:
        # https://spacy.io/usage/linguistic-features#section-named-entities
        span_doc = doc.char_span(start_offset, end_offset, label=type_name)
        if span_doc is not None:
            # char_span returns None when the offsets do not align with token boundaries
            spacy_matcher_offset.append(span_doc)
        else:
            print("ERROR char offset", doc.text[start_offset:end_offset])
    doc.ents = spacy_matcher_offset
    doc_annotated.append(doc)

# docs = convert_offsets_to_spacy_docs(doc_annotated)
view_spacy_docs(doc_annotated, port=5000)
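
The conversion above relies on Doc.char_span, which returns None whenever the character offsets do not line up with token boundaries. A minimal, self-contained sketch of the same offset-to-entity conversion with a blank spaCy pipeline; the text, offsets, and labels below are made up for illustration:

import spacy

# Blank French pipeline: tokenizer only, no trained components.
nlp = spacy.blank("fr")

text = "Mme Dupont habite 12 rue de la Paix."
offsets = [(4, 10, "PERS"), (18, 35, "ADDRESS")]  # hypothetical annotations

doc = nlp.make_doc(text)
spans = []
for start, end, label in offsets:
    span = doc.char_span(start, end, label=label)
    if span is None:
        # Offsets that cut through a token cannot be converted to a Span.
        print("misaligned offset:", text[start:end])
    else:
        spans.append(span)
doc.ents = spans

print([(ent.text, ent.label_) for ent in doc.ents])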
Example #2
            # NOTE: this snippet starts inside a loop that groups annotated
            # paragraphs and their offsets by case_id.
            current_case_offsets.clear()
            previous_case_id = paragraph.case_id
            current_item_header = case_header_content[paragraph.case_id]

            headers_matcher = MatchValuesFromHeaders(
                current_header=current_item_header, threshold_size=3)

        current_case_paragraphs.append(paragraph.text)
        current_case_offsets.append(paragraph.offsets)

print("Number of tags:",
      sum([len(offsets) for _, _, offsets in doc_annotated]))

if train_dataset:
    train_model(data=doc_annotated,
                folder_to_save_model=model_dir_path,
                n_iter=n_iter,
                batch_size=batch_size,
                dropout_rate=dropout_rate)
elif export_dataset:
    with open(training_set_export_path, 'wb') as export_training_set_file:
        pickle.dump(obj=doc_annotated,
                    file=export_training_set_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Display training set
    docs = convert_offsets_to_spacy_docs(doc_annotated)
    view_spacy_docs(docs, port=5000)
    print("view result on browser (localhost - port 5000)")
Example #3
from typing import List

from spacy.tokens import Doc
from tqdm import tqdm

# nlp, DEV_DATA, number_of_paragraph_to_display, EntityTypename and
# complete_case_annotations come from the surrounding project code.
all_docs_to_view: List[Doc] = list()
# last_case_spans = dict()
last_case_docs: List[Doc] = list()
former_case_id = None
entity_typename_builder = EntityTypename()

with tqdm(total=len(DEV_DATA[:number_of_paragraph_to_display]),
          unit=" paragraphs",
          desc="Find entities") as progress_bar:
    for (case_id, original_text, _,
         _) in DEV_DATA[:number_of_paragraph_to_display]:
        if case_id != former_case_id:
            spans = entity_typename_builder.get_dict()
            complete_case_annotations(last_case_docs, spans)

            all_docs_to_view.extend(last_case_docs)
            last_case_docs.clear()
            entity_typename_builder.clear()
            former_case_id = case_id
        spacy_doc: Doc = nlp(original_text)
        # doc.user_data['title'] = case_id
        last_case_docs.append(spacy_doc)
        # entities_span = [(ent.text.lower(), ent.label_) for ent in spacy_doc.ents]
        # last_case_spans.update(entities_span)
        entity_typename_builder.add_spacy_entities(spacy_doc=spacy_doc)
        progress_bar.update()

print("Generate HTML")
view_spacy_docs(all_docs_to_view, port=5000)
print("view result on browser (port 5000)")