def legal_generator(doc):
    """Yield spans covering legal names and numbered legal references.

    Two kinds of candidates are collected:

    * spans returned by ``utils.get_spans`` for ``"proper2_detector"`` and
      ``"nnp_detector"`` whose last token passes ``utils.is_likely_proper``
      and whose title-cased text, with trailing ``"s"`` characters
      stripped, is found in ``LEGAL``;
    * a heading word (``Article``, ``Paragraph``, ``Section``, ``Chapter``
      or ``§``, trailing ``"s"`` ignored) followed by a token that starts
      with a digit or is in ``ROMAN_NUMERALS`` -- e.g. "Article 5" --
      optionally extended over a ``-``/``to``/``and`` connector and a
      second such token.

    The candidates are passed through ``utils.merge_contiguous_spans`` and
    each resulting ``(start, end, label)`` triple is yielded; every
    candidate is created with the label ``"LAW"``.
    """

    def _is_numbering(token):
        # A reference number: starts with a digit or is a roman numeral.
        text = token.text
        return text[0].isdigit() or text in ROMAN_NUMERALS

    candidates = []

    # Proper-name spans whose head (last) token is a known legal term.
    for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        final_token = doc[span.end - 1]
        if utils.is_likely_proper(final_token):
            head = final_token.text.title().rstrip("s")
            if head in LEGAL:
                candidates.append((span.start, span.end, "LAW"))

    # Handling legal references such as Article 5
    n_tokens = len(doc)
    for idx in range(n_tokens - 1):
        heading = doc[idx].text.rstrip("s")
        if heading not in {"Article", "Paragraph", "Section", "Chapter", "§"}:
            continue
        if not _is_numbering(doc[idx + 1]):
            continue

        stop = idx + 2
        # Ranges/enumerations such as "Articles 5 to 7" or "Sections 2 and 3".
        if (idx + 3 < n_tokens
                and doc[idx + 2].text in {"-", "to", "and"}
                and _is_numbering(doc[idx + 3])):
            stop = idx + 4
        candidates.append((idx, stop, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    merged = utils.merge_contiguous_spans(candidates, doc)
    for first, last, tag in merged:
        yield first, last, tag
def date_generator(doc):
    """Searches for occurrences of date patterns in text.

    The tokens of ``doc`` are scanned left to right and a candidate span is
    recorded for:

    * a token whose lemma is in ``DAYS`` or ``DAYS_ABBRV``;
    * an all-digit token whose value lies strictly between 1920 and 2040
      (treated as a year);
    * a token whose lemma is in ``MONTHS`` or ``MONTHS_ABBRV`` and whose tag
      is not ``"MD"``, widened to include a neighbouring day number when
      present: ``5 May``, ``5th of May`` or ``May 5``.

    Candidates are merged with ``utils.merge_contiguous_spans`` and each
    resulting ``(start, end, label)`` triple is yielded; every candidate is
    created with the label ``"DATE"``.
    """
    # Loop-invariant lookups, built once per call instead of once per token.
    # NOTE(review): DAYS/MONTHS and their *_ABBRV variants are combined with
    # ``|``, so they are presumably sets -- confirm at their definition.
    day_names = DAYS | DAYS_ABBRV
    month_names = MONTHS | MONTHS_ABBRV
    number = re.compile(r"\d+$")
    ordinal = re.compile(r"\d+(?:st|nd|rd|th)$")

    def _is_day_number(token):
        # Plain number small enough to be a day of the month.
        return number.match(token.text) and int(token.text) < 32

    spans = []
    n_tokens = len(doc)

    # A ``while`` loop is needed because the "May 5" case consumes the
    # following token as well.
    i = 0
    while i < n_tokens:
        tok = doc[i]
        if tok.lemma_ in day_names:
            spans.append((i, i + 1, "DATE"))
        elif (tok.is_digit and number.match(tok.text)
              and 1920 < int(tok.text) < 2040):
            spans.append((i, i + 1, "DATE"))
        # The tag test skips "May" used as auxiliary
        elif tok.lemma_ in month_names and tok.tag_ != "MD":
            if i > 0 and _is_day_number(doc[i - 1]):
                # "5 May"
                spans.append((i - 1, i + 1, "DATE"))
            elif (i > 1 and ordinal.match(doc[i - 2].text)
                  and doc[i - 1].lower_ == "of"):
                # "5th of May"
                spans.append((i - 2, i + 1, "DATE"))
            elif i + 1 < n_tokens and _is_day_number(doc[i + 1]):
                # "May 5": the day token is part of this span, so skip it.
                spans.append((i, i + 2, "DATE"))
                i += 1
            else:
                spans.append((i, i + 1, "DATE"))
        i += 1

    for start, end, content in utils.merge_contiguous_spans(spans, doc):
        yield start, end, content