Example 1
from typing import Dict, List

from spacy.tokens import Doc, Span

# AcoraMatcher, find_address_in_block_of_paragraphs, normalize_offsets and
# logger are helpers from the surrounding project (not shown here).
def complete_case_annotations(spacy_docs: List[Doc],
                              entity_typename: Dict[str, str]) -> List[Doc]:
    """
    Complete/Normalize annotations from the spacy tagger.

    :param spacy_docs: the spaCy annotations
    :param entity_typename: the dictionary mapping each occurrence to its type
    :returns: the updated spacy_docs (for convenience only, as the update is in place)
    """

    if len(spacy_docs) > 0:
        matcher = AcoraMatcher(content=list(entity_typename.keys()),
                               ignore_case=True)

        doc_text, empty_offsets = zip(*[(spacy_doc.text, [])
                                        for spacy_doc in spacy_docs])
        document_addresses_offsets = find_address_in_block_of_paragraphs(
            texts=list(doc_text), offsets=list(empty_offsets))

        for spacy_doc, doc_address_offset in zip(spacy_docs,
                                                 document_addresses_offsets):

            matches = matcher.get_matches(text=spacy_doc.text, tag="UNKNOWN")
            matcher_offsets = list()
            for offset in matches:
                span_text = spacy_doc.text[offset.start:offset.end]
                logger.debug(span_text)
                offset.type = entity_typename[span_text.lower()]
                matcher_offsets.append(offset)

            matcher_offsets_normalized = normalize_offsets(
                offsets=matcher_offsets + doc_address_offset)

            spacy_matcher_offset: List[Span] = list()
            for offset in matcher_offsets_normalized:
                # https://spacy.io/usage/linguistic-features#section-named-entities
                span_doc: Span = spacy_doc.char_span(offset.start,
                                                     offset.end,
                                                     label=offset.type)
                if span_doc is not None:
                    # char_span returns None when the offsets do not align with token boundaries
                    spacy_matcher_offset.append(span_doc)

                else:
                    logger.error(
                        f"ERROR char offset [{spacy_doc.text[offset.start:offset.end]}] "
                        f"from [{spacy_doc.text}]")

            spacy_doc.ents = spacy_matcher_offset

    return spacy_docs
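
A minimal usage sketch for the function above, assuming the project's helpers (AcoraMatcher, find_address_in_block_of_paragraphs, normalize_offsets) are importable; the model name and dictionary entries are hypothetical. Note that lookups use span_text.lower(), so the dictionary keys must be lowercase:

import spacy

nlp = spacy.load("fr_core_news_sm")  # assumed French pipeline
docs = list(nlp.pipe(["Me Dupont représente la SARL Exemple."]))  # hypothetical text
entity_typename = {"me dupont": "LAWYER", "sarl exemple": "ORGANIZATION"}  # lowercase keys

docs = complete_case_annotations(docs, entity_typename)
for ent in docs[0].ents:
    print(ent.text, ent.label_)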
Example 2
from typing import List, Tuple

from spacy.gold import GoldParse  # spaCy 2.x API
from spacy.language import Language
from spacy.tokens import Doc

# Offset and normalize_offsets are helpers from the surrounding project.
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and spaCy uses BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace("L-", "E-") for a in annotations]
        annotations = [a.replace("U-", "S-") for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append("\n")
    return result
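
The three list comprehensions above amount to a tag-by-tag mapping; a quick illustration on a made-up BILUO sequence:

biluo = ["B-PERS", "I-PERS", "L-PERS", "U-DATE", "O", "-"]
bioes = ["O" if a == "-" else a.replace("L-", "E-").replace("U-", "S-") for a in biluo]
print(bioes)  # ['B-PERS', 'I-PERS', 'E-PERS', 'S-DATE', 'O', 'O']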
Example 3
                        # clerk_names +
                        # lawyer_names +
                        # partie_pp +
                        # postal_code_matches +
                        # # frequent_entities +
                        # court_name +
                        # court_names_matches +
                        case_dates)  #+
                    # bar +
                    # rg_from_regex +
                    # licence_plate_number +
                    # phone_numbers +
                    # addresses)

                    if len(all_matches) > 0:
                        normalized_offsets = normalize_offsets(all_matches)
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append(normalized_offsets)

                    elif current_paragraph.isupper() and len(current_paragraph) > 10:
                        # add empty title paragraph to avoid fake solution
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append([])

            assert len(last_document_texts) == len(last_document_offsets)

            last_document_offsets_with_addresses = find_address_in_block_of_paragraphs(
                texts=last_document_texts, offsets=last_document_offsets)

            last_document_offsets_with_cleaned_addresses = clean_address_offsets(
Example 4
def test_normalize_offsets():
    data1 = [Offset(5, 10, "LAWYER"), Offset(18, 24, "ORGANIZATION"), Offset(22, 24, "ORGANIZATION"), Offset(120, 133, "LAWYER")]
    assert normalize_offsets(data1) == [Offset(5, 10, "LAWYER"), Offset(18, 24, "ORGANIZATION"), Offset(120, 133, "LAWYER")]
    data2 = [Offset(71, 75, "PERS"), Offset(76, 85, "PERS")]
    assert normalize_offsets(data2) == [Offset(71, 85, "PERS")]
    data3 = [Offset(5, 10, "LAWYER"), Offset(71, 75, "PERS"), Offset(76, 85, "PERS"), Offset(120, 133, "LAWYER")]
    assert normalize_offsets(data3) == [Offset(5, 10, "LAWYER"), Offset(71, 85, "PERS"), Offset(120, 133, "LAWYER")]
    data4 = [Offset(5, 10, "LAWYER"), Offset(77, 85, "PERS"), Offset(77, 85, "PERS"), Offset(120, 133, "LAWYER")]
    assert normalize_offsets(data4) == [Offset(5, 10, "LAWYER"), Offset(77, 85, "PERS"), Offset(120, 133, "LAWYER")]
    data5 = [Offset(16, 20, "PERS"), Offset(21, 30, "PERS")]
    assert normalize_offsets(data5) == [Offset(16, 30, "PERS")]
    data6 = [Offset(10, 21, "PERS"), Offset(22, 30, "PERS")]
    assert normalize_offsets(data6) == [Offset(10, 30, "PERS")]
    data7 = []
    assert normalize_offsets(data7) == []
    data8 = [Offset(1, 1, "PERS")]
    assert normalize_offsets(data8) == []
    data9 = [Offset(1, 3, "PERS")]
    assert normalize_offsets(data9) == []
    data10 = [Offset(1, 10, "PERS"), Offset(1, 10, "PERS"), Offset(3, 10, "PERS")]
    assert normalize_offsets(data10) == [Offset(1, 10, "PERS")]
    data11 = [Offset(0, 34, "ORGANIZATION"), Offset(0, 8, "ORGANIZATION")]
    assert normalize_offsets(data11) == [Offset(0, 34, "ORGANIZATION")]
    data12 = [Offset(1, 10, "PERS"), Offset(1, 10, "ORGANIZATION_1")]
    assert normalize_offsets(data12) == [Offset(1, 10, "ORGANIZATION")]
    data13 = [Offset(1, 10, "PERS"), Offset(5, 10, "ORGANIZATION_1")]
    assert normalize_offsets(data13) == [Offset(1, 10, "ORGANIZATION")]
    data14 = [Offset(21, 33, "DATE"), Offset(35, 55, "PERS")]
    assert normalize_offsets(data14) == data14
    data15 = [Offset(start=21, end=37, type='DATE'), Offset(start=45, end=47, type='ORGANIZATION')]
    assert normalize_offsets(data15) == [Offset(21, 37, "DATE")]
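
These tests assume an Offset value type with start, end and type fields, value equality, and (per Example 2) a to_tuple method. The project's actual class may differ; a minimal sketch:

from dataclasses import dataclass
from typing import Tuple

@dataclass
class Offset:
    start: int
    end: int
    type: str

    def to_tuple(self) -> Tuple[int, int, str]:
        # (start, end, type), as consumed by GoldParse in Example 2
        return self.start, self.end, self.type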
Example 5
                        # clerk_names +
                        # lawyer_names +
                        # partie_pp +
                        # postal_code_matches +
                        # # frequent_entities +
                        # court_name +
                        # court_names_matches +
                        case_dates)  #+
                    # bar +
                    # rg_from_regex +
                    # licence_plate_number +
                    # phone_numbers +
                    # addresses)

                    if len(all_matches) > 0:
                        normalized_offsets = normalize_offsets(all_matches)
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append(normalized_offsets)

                    elif current_paragraph.isupper() and len(current_paragraph) > 10:
                        # add empty title paragraph to avoid fake solution
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append([])

                assert len(last_document_texts) == len(last_document_offsets)

                # last_document_offsets_with_addresses = find_address_in_block_of_paragraphs(texts=last_document_texts,
                #                                                                            offsets=last_document_offsets)
                #
                # last_document_offsets_with_cleaned_addresses = clean_address_offsets(texts=last_document_texts,
Example 6
def test_normalize_offsets():
    data1 = [(5, 10, "LAWYER"), (18, 24, "ORGANIZATION"),
             (22, 24, "ORGANIZATION"), (120, 133, "LAWYER")]
    assert normalize_offsets(data1) == [(5, 10, "LAWYER"),
                                        (18, 24, "ORGANIZATION"),
                                        (120, 133, "LAWYER")]
    data2 = [(71, 75, "PERS"), (76, 85, "PERS")]
    assert normalize_offsets(data2) == [(71, 85, "PERS")]
    data3 = [(5, 10, "LAWYER"), (71, 75, "PERS"), (76, 85, "PERS"),
             (120, 133, "LAWYER")]
    assert normalize_offsets(data3) == [(5, 10, "LAWYER"), (71, 85, "PERS"),
                                        (120, 133, "LAWYER")]
    data4 = [(5, 10, "LAWYER"), (77, 85, "PERS"), (77, 85, "PERS"),
             (120, 133, "LAWYER")]
    assert normalize_offsets(data4) == [(5, 10, "LAWYER"), (77, 85, "PERS"),
                                        (120, 133, "LAWYER")]
    data5 = [(16, 20, "PERS"), (21, 30, "PERS")]
    assert normalize_offsets(data5) == [(16, 30, "PERS")]
    data6 = [(10, 21, "PERS"), (22, 30, "PERS")]
    assert normalize_offsets(data6) == [(10, 30, "PERS")]
    data7 = []
    assert normalize_offsets(data7) == []
    data8 = [(1, 1, "PERS")]
    assert normalize_offsets(data8) == []
    data9 = [(1, 3, "PERS")]
    assert normalize_offsets(data9) == []
    data10 = [(1, 10, "PERS"), (1, 10, "PERS"), (3, 10, "PERS")]
    assert normalize_offsets(data10) == [(1, 10, "PERS")]
    data11 = [(0, 34, "ORGANIZATION"), (0, 8, "ORGANIZATION")]
    assert normalize_offsets(data11) == [(0, 34, "ORGANIZATION")]
    data12 = [(1, 10, "PERS"), (1, 10, "ORGANIZATION_1")]
    assert normalize_offsets(data12) == [(1, 10, "ORGANIZATION")]
    data13 = [(1, 10, "PERS"), (5, 10, "ORGANIZATION_1")]
    assert normalize_offsets(data13) == [(1, 10, "ORGANIZATION")]
    data14 = [(21, 33, "DATE"), (35, 55, "PERS")]
    assert normalize_offsets(data14) == data14
Example 7
                                   lawyer_names +
                                   partie_pp +
                                   postal_code_matches +
                                   # frequent_entities +
                                   court_name +
                                   court_names_matches +
                                   case_dates +
                                   bar +
                                   rg_from_regex +
                                   licence_plate_number +
                                   phone_numbers +
                                   addresses)

                    if len(all_matches) > 0:

                        normalized_offsets = normalize_offsets(all_matches)
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append(normalized_offsets)

                    elif current_paragraph.isupper() and len(current_paragraph) > 10:
                        # add empty title paragraph to avoid fake solution
                        last_document_texts.append(current_paragraph)
                        last_document_offsets.append([])

            assert len(last_document_texts) == len(last_document_offsets)

            last_document_offsets_with_addresses = find_address_in_block_of_paragraphs(texts=last_document_texts,
                                                                                       offsets=last_document_offsets)

            last_document_offsets_with_cleaned_addresses = clean_address_offsets(texts=last_document_texts,
                                                                                 offsets=last_document_offsets_with_addresses)
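
The fragment above keeps two parallel lists in sync: paragraphs with at least one match are stored with their normalized offsets, while long all-uppercase paragraphs (likely section titles) are stored with an empty offset list so they still reach the downstream address detection. A standalone, hypothetical restatement of that filter:

from typing import List, Tuple

def keep_paragraphs(paragraphs: List[Tuple[str, list]]) -> Tuple[List[str], List[list]]:
    texts: List[str] = []
    offsets: List[list] = []
    for paragraph, matches in paragraphs:
        if len(matches) > 0:
            texts.append(paragraph)
            offsets.append(matches)  # the original applies normalize_offsets here
        elif paragraph.isupper() and len(paragraph) > 10:
            # keep entity-free uppercase titles to avoid fake solutions downstream
            texts.append(paragraph)
            offsets.append([])
    assert len(texts) == len(offsets)
    return texts, offsets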
Example 8
def test_normalize_offsets():
    data1 = [
        Offset(5, 10, "LAWYER"),
        Offset(18, 24, "ORGANIZATION"),
        Offset(22, 24, "ORGANIZATION"),
        Offset(120, 133, "LAWYER"),
    ]
    assert normalize_offsets(data1) == [
        Offset(5, 10, "LAWYER"),
        Offset(18, 24, "ORGANIZATION"),
        Offset(120, 133, "LAWYER"),
    ]
    data2 = [Offset(71, 75, "PERS"), Offset(76, 85, "PERS")]
    assert normalize_offsets(data2) == [Offset(71, 85, "PERS")]
    data3 = [
        Offset(5, 10, "LAWYER"),
        Offset(71, 75, "PERS"),
        Offset(76, 85, "PERS"),
        Offset(120, 133, "LAWYER")
    ]
    assert normalize_offsets(data3) == [
        Offset(5, 10, "LAWYER"),
        Offset(71, 85, "PERS"),
        Offset(120, 133, "LAWYER")
    ]
    data4 = [
        Offset(5, 10, "LAWYER"),
        Offset(77, 85, "PERS"),
        Offset(77, 85, "PERS"),
        Offset(120, 133, "LAWYER")
    ]
    assert normalize_offsets(data4) == [
        Offset(5, 10, "LAWYER"),
        Offset(77, 85, "PERS"),
        Offset(120, 133, "LAWYER")
    ]
    data5 = [Offset(16, 20, "PERS"), Offset(21, 30, "PERS")]
    assert normalize_offsets(data5) == [Offset(16, 30, "PERS")]
    data6 = [Offset(10, 21, "PERS"), Offset(22, 30, "PERS")]
    assert normalize_offsets(data6) == [Offset(10, 30, "PERS")]
    data7 = []
    assert normalize_offsets(data7) == []
    data8 = [Offset(1, 1, "PERS")]
    assert normalize_offsets(data8) == []
    data9 = [Offset(1, 3, "PERS")]
    assert normalize_offsets(data9, min_offset_size=2) == []
    data10 = [
        Offset(1, 10, "PERS"),
        Offset(1, 10, "PERS"),
        Offset(3, 10, "PERS")
    ]
    assert normalize_offsets(data10) == [Offset(1, 10, "PERS")]
    data11 = [Offset(0, 34, "ORGANIZATION"), Offset(0, 8, "ORGANIZATION")]
    assert normalize_offsets(data11) == [Offset(0, 34, "ORGANIZATION")]
    data14 = [Offset(21, 33, "DATE"), Offset(35, 55, "PERS")]
    assert normalize_offsets(data14) == data14
    data15 = [
        Offset(start=21, end=37, type="DATE"),
        Offset(start=45, end=47, type="ORGANIZATION")
    ]
    assert normalize_offsets(data15,
                             min_offset_size=2) == [Offset(21, 37, "DATE")]
    data16 = [Offset(1, 3, "PERS")]
    # unlike the min_offset_size=2 call above, the default keeps this 2-character span
    assert normalize_offsets(data16) == data16
    data17 = [Offset(1, 5, "PERS"), Offset(6, 10, "PERS")]
    assert normalize_offsets(data17) == [Offset(1, 10, "PERS")]
    data18 = [Offset(1, 5, "PERS"), Offset(6, 10, "ORGANIZATION")]
    assert normalize_offsets(data18) == data18
    data19 = [Offset(1, 5, "PERS"), Offset(7, 10, "ORGANIZATION")]
    assert normalize_offsets(data19) == data19
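
normalize_offsets itself is not shown in these examples, but the tests pin down much of its behaviour. The sketch below (named merge_offsets to avoid implying it is the project's implementation) covers only the sorting, containment, deduplication, gap-of-one merging and min_offset_size filtering exercised here; it omits the type-priority rules behind data12/data13 of Examples 4 and 6, and follows this example's default of keeping 2-character spans, which differs from Examples 4 and 6:

from dataclasses import dataclass
from typing import List

@dataclass
class Offset:
    start: int
    end: int
    type: str

def merge_offsets(offsets: List[Offset], min_offset_size: int = 0) -> List[Offset]:
    # drop empty or too-short spans, then sort left-to-right, widest first
    kept = sorted((o for o in offsets if o.end - o.start > min_offset_size),
                  key=lambda o: (o.start, -o.end))
    result: List[Offset] = []
    for offset in kept:
        last = result[-1] if result else None
        if last is not None and last.type == offset.type and offset.start - last.end <= 1:
            # same type, overlapping or separated by at most one character: merge
            result[-1] = Offset(last.start, max(last.end, offset.end), last.type)
        elif last is not None and last.start <= offset.start and offset.end <= last.end:
            # fully contained in the previous span: drop
            continue
        else:
            result.append(offset)
    return result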