Beispiel #1
0
def legal_generator(doc):
    """Yield (start, end, "LAW") token-offset triples for legal mentions.

    Candidates come from two places: spans of the "proper2_detector" and
    "nnp_detector" layers whose last token (title-cased, trailing "s"
    removed) belongs to LEGAL, and explicit references made of a heading
    word followed by a number, such as "Article 5" or "Sections 3 to 7".
    The candidates are passed through utils.merge_contiguous_spans before
    being yielded.
    """

    def _looks_like_number(token):
        # A reference number either starts with a digit or is a roman numeral
        text = token.text
        return text[0].isdigit() or text in ROMAN_NUMERALS

    candidates = []

    # Proper-name spans ending with a legal keyword
    for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        final_token = doc[span.end - 1]
        if utils.is_likely_proper(final_token):
            keyword = final_token.text.title().rstrip("s")
            if keyword in LEGAL:
                candidates.append((span.start, span.end, "LAW"))

    # Handling legal references such as Article 5
    headings = {"Article", "Paragraph", "Section", "Chapter", "§"}
    connectors = {"-", "to", "and"}
    nb_tokens = len(doc)
    for idx in range(nb_tokens - 1):
        if doc[idx].text.rstrip("s") not in headings:
            continue
        if not _looks_like_number(doc[idx + 1]):
            continue

        stop = idx + 2
        # Extend over a range or enumeration ("Article 5 to 7") when the two
        # following tokens exist and have the expected form
        if (idx + 3 < nb_tokens and doc[idx + 2].text in connectors
                and _looks_like_number(doc[idx + 3])):
            stop = idx + 4
        candidates.append((idx, stop, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    merged = utils.merge_contiguous_spans(candidates, doc)
    for span_start, span_end, span_label in merged:
        yield span_start, span_end, span_label
Beispiel #2
0
def get_entities(doc: Doc, layer=None):
    """Return the entities annotated in a spaCy document as a plain dict,
    based on the provided annotation layer(s).

    Args:
        doc: the annotated document.
        layer: selects where the spans are read from:
            - None: the entities in doc.ents;
            - a list of layer names: the spans returned by utils.get_spans
              for these layers;
            - a string: the single span group doc.spans[layer] or, if the
              string contains "*", every layer of doc.spans whose name
              matches it, with "*" standing for any run of characters
              (all other characters are matched literally).

    Returns:
        A dict {"text": doc.text, "entities": [...]} where each entity is a
        dict with the character offsets "start" and "end", the "label" and
        the covered text "term". Spans sharing the same offsets are merged
        into one entity whose label joins the distinct labels with "+".

    Raises:
        RuntimeError: if layer is neither None, a list nor a string.
    """
    if layer is None:
        spans = doc.ents
    elif isinstance(layer, list):
        spans = utils.get_spans(doc, layer)
    elif isinstance(layer, str):
        if "*" in layer:
            # Escape every literal part so that characters such as "+" or "."
            # in a layer name are not interpreted as regex operators
            pattern = re.compile(".*".join(
                re.escape(part) for part in layer.split("*")))
            matched_layers = [
                name for name in doc.spans if pattern.fullmatch(name)
            ]
            spans = utils.get_spans(doc, matched_layers)
        else:
            spans = doc.spans[layer]
    else:
        raise RuntimeError("Layer type not accepted")

    # Distinct labels per (start_char, end_char), in order of first appearance
    labels_by_offsets = {}
    for span in spans:
        first_token = doc[span.start]
        last_token = doc[span.end - 1]
        offsets = (first_token.idx, last_token.idx + len(last_token))

        # Compare whole labels: a substring test on the joined string would
        # wrongly drop e.g. "PER" when "PERSON" is already recorded
        labels = labels_by_offsets.setdefault(offsets, [])
        if span.label_ not in labels:
            labels.append(span.label_)

    text = doc.text
    # If we have several alternative labels for a span, join them with +
    entities = [{
        "start": start,
        "end": end,
        "label": "+".join(labels),
        "term": text[start:end]
    } for (start, end), labels in labels_by_offsets.items()]

    return {"text": text, "entities": entities}
Beispiel #3
0
def test_spans(doc):
    """Check, for each of the IO, BIO and BILUO prefix schemes, that the
    token-level observations of the "name", "org" and "place" sources can
    be converted back into the spans stored for that source in the document.
    """
    print(doc)
    print(doc.spans)
    for scheme in ("IO", "BIO", "BILUO"):
        aggregator = aggregation.BaseAggregator(
            "", ["GPE", "NORP", "ORG", "PERSON"], prefixes=scheme)
        observations = aggregator.get_observation_df(doc)
        print(observations)
        for source in ("name", "org", "place"):
            # Decode the column of token-level values back into spans
            decoded = utils.token_array_to_spans(observations[source].values,
                                                 aggregator.out_labels)
            rebuilt = []
            for (start, end), label in decoded.items():
                rebuilt.append(Span(doc, start, end, label=label))

            expected = utils.get_spans(doc, [source])

            assert rebuilt == expected
Beispiel #4
0
def test_get_spans(nlp_small):
    """Check the spans returned by utils.get_spans when several layers,
    some of them with overlapping spans, are requested together.
    """
    text = ("This is just a small test for checking that the method works "
            "correctly")
    doc = nlp_small(text)

    doc.spans["source1"] = [
        Span(doc, 0, 2, label="LABEL1"),
        Span(doc, 4, 5, label="LABEL2"),
    ]
    doc.spans["source2"] = [
        Span(doc, 0, 1, label="LABEL3"),
        Span(doc, 2, 6, label="LABEL2"),
    ]
    doc.spans["source4"] = [Span(doc, 0, 2, label="LABEL2")]
    doc.spans["source3"] = [
        Span(doc, 7, 9, label="LABEL2"),
        Span(doc, 1, 4, label="LABEL1"),
    ]

    def boundaries(layers):
        # Token offsets of the spans returned for the given layers
        return {(span.start, span.end)
                for span in utils.get_spans(doc, layers)}

    assert boundaries(["source1", "source2"]) == {(0, 2), (2, 6)}
    assert boundaries(["source1", "source3"]) == {(1, 4), (4, 5), (7, 9)}

    # Here the labels of the returned spans are checked as well
    labelled = {}
    for span in utils.get_spans(doc, ["source1", "source4"]):
        labelled[(span.start, span.end)] = span.label_
    assert labelled == {(0, 2): "LABEL2", (4, 5): "LABEL2"}

    assert boundaries(["source2", "source3"]) == {(0, 1), (2, 6), (7, 9)}