import re

from spacy.tokens import Doc, Span

# `utils`, `aggregation`, and the `LEGAL` / `ROMAN_NUMERALS` gazetteers are
# assumed to be defined elsewhere in the repository.


def legal_generator(doc):
    """Labelling function that yields (start, end, "LAW") spans covering
    legal entities and legal references such as "Article 5"."""
    legal_spans = []
    for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        if not utils.is_likely_proper(doc[span.end - 1]):
            continue
        # Proper-noun spans whose head word is a legal term (e.g. "Act")
        last_token = doc[span.end - 1].text.title().rstrip("s")
        if last_token in LEGAL:
            legal_spans.append((span.start, span.end, "LAW"))

    # Handling legal references such as "Article 5"
    for i in range(len(doc) - 1):
        if doc[i].text.rstrip("s") in {"Article", "Paragraph",
                                       "Section", "Chapter", "§"}:
            if doc[i + 1].text[0].isdigit() or doc[i + 1].text in ROMAN_NUMERALS:
                start, end = i, i + 2
                # Extend over ranges such as "Articles 8 to 11"
                if (i < len(doc) - 3 and doc[i + 2].text in {"-", "to", "and"}
                        and (doc[i + 3].text[0].isdigit()
                             or doc[i + 3].text in ROMAN_NUMERALS)):
                    end = i + 4
                legal_spans.append((start, end, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    legal_spans = utils.merge_contiguous_spans(legal_spans, doc)
    for start, end, label in legal_spans:
        yield start, end, label

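# Illustrative usage sketch (not part of the original source). Assuming these
# labelling functions follow the skweak library's conventions (which the
# helpers above appear to), the generator can be wrapped in a
# heuristics.FunctionAnnotator; the annotator stores the yielded
# (start, end, label) triples under doc.spans["legal"]. The demo function
# name below is made up for the example.
def demo_legal_generator():
    import spacy
    from skweak import heuristics

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The applicant relies on Article 5 and Articles 8 to 11.")
    # In the real pipeline these two layers are produced by upstream labelling
    # functions; empty lists suffice here to exercise the "Article ..."
    # branch, which scans the tokens directly.
    doc.spans["proper2_detector"] = []
    doc.spans["nnp_detector"] = []
    legal_lf = heuristics.FunctionAnnotator("legal", legal_generator)
    doc = legal_lf(doc)
    print(doc.spans["legal"])  # e.g. [Article 5, Articles 8 to 11]
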
def get_entities(doc: Doc, layer=None):
    """Return the entities annotated in a spaCy document, based on the
    provided annotation layer(s). If layer is None, the entities from
    spaCy's own NER (doc.ents) are used. A string layer may contain a
    "*" wildcard to match several layer names at once."""
    if layer is None:
        spans = doc.ents
    elif isinstance(layer, list):
        spans = utils.get_spans(doc, layer)
    elif isinstance(layer, str):
        if "*" in layer:
            # Interpret "*" as a (non-greedy) wildcard over layer names
            matched_layers = [l for l in doc.spans
                              if re.match(layer.replace("*", ".*?") + "$", l)]
            spans = utils.get_spans(doc, matched_layers)
        else:
            spans = doc.spans[layer]
    else:
        raise RuntimeError("Layer type not accepted")

    entities = {}
    for span in spans:
        start_char = doc[span.start].idx
        end_char = doc[span.end - 1].idx + len(doc[span.end - 1])
        if (start_char, end_char) not in entities:
            entities[(start_char, end_char)] = span.label_
        # If we have several alternative labels for a span, join them with +
        elif span.label_ not in entities[(start_char, end_char)]:
            entities[(start_char, end_char)] += "+" + span.label_

    entities = [{"start": start, "end": end, "label": label}
                for (start, end), label in entities.items()]
    for item in entities:
        item["term"] = doc.text[item["start"]:item["end"]]
    return {"text": doc.text, "entities": entities}

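# Illustrative usage sketch (not part of the original source): the sentence,
# the "names" layer, and the demo function name are invented for the example.
def demo_get_entities():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Pierre Vinken will join the board of Elsevier in November.")
    doc.spans["names"] = [Span(doc, 0, 2, label="PERSON")]

    # layer=None falls back to spaCy's own NER (doc.ents); a layer name or
    # a "*" wildcard selects custom span groups instead.
    print(get_entities(doc))
    print(get_entities(doc, layer="names"))
    # -> {'text': '...', 'entities': [{'start': 0, 'end': 13,
    #     'label': 'PERSON', 'term': 'Pierre Vinken'}]}
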
def test_spans(doc):
    print(doc)
    print(doc.spans)
    for encoding in ["IO", "BIO", "BILUO"]:
        aggregator = aggregation.BaseAggregator(
            "", ["GPE", "NORP", "ORG", "PERSON"], prefixes=encoding)
        obs = aggregator.get_observation_df(doc)
        print(obs)
        # Converting the token-level label arrays back to spans must
        # reproduce the spans of each labelling source.
        for source in ["name", "org", "place"]:
            spans = utils.token_array_to_spans(obs[source].values,
                                               aggregator.out_labels)
            spans = [Span(doc, start, end, label=label)
                     for (start, end), label in spans.items()]
            all_spans = utils.get_spans(doc, [source])
            assert spans == all_spans

def test_get_spans(nlp_small):
    doc = nlp_small("This is just a small test for checking that the "
                    "method works correctly")
    doc.spans["source1"] = [Span(doc, 0, 2, label="LABEL1"),
                            Span(doc, 4, 5, label="LABEL2")]
    doc.spans["source2"] = [Span(doc, 0, 1, label="LABEL3"),
                            Span(doc, 2, 6, label="LABEL2")]
    doc.spans["source4"] = [Span(doc, 0, 2, label="LABEL2")]
    doc.spans["source3"] = [Span(doc, 7, 9, label="LABEL2"),
                            Span(doc, 1, 4, label="LABEL1")]

    assert set((span.start, span.end) for span in
               utils.get_spans(doc, ["source1", "source2"])) == \
        {(0, 2), (2, 6)}
    assert set((span.start, span.end) for span in
               utils.get_spans(doc, ["source1", "source3"])) == \
        {(1, 4), (4, 5), (7, 9)}
    assert {(span.start, span.end): span.label_ for span in
            utils.get_spans(doc, ["source1", "source4"])} == \
        {(0, 2): "LABEL2", (4, 5): "LABEL2"}
    assert set((span.start, span.end) for span in
               utils.get_spans(doc, ["source2", "source3"])) == \
        {(0, 1), (2, 6), (7, 9)}