def add_shallow(self):
    """Adds shallow annotation functions"""

    # Detection of dates, time, money, and numbers
    self.add_annotator(FunctionAnnotator("date_detector", date_generator))
    self.add_annotator(FunctionAnnotator("time_detector", time_generator))
    self.add_annotator(FunctionAnnotator("money_detector", money_generator))

    # Detection based on casing
    proper_detector = TokenConstraintAnnotator("proper_detector",
                                               lambda tok: utils.is_likely_proper(tok), "ENT")

    # Detection based on casing, but allowing some lowercased tokens
    proper2_detector = TokenConstraintAnnotator("proper2_detector",
                                                lambda tok: utils.is_likely_proper(tok), "ENT")
    proper2_detector.add_gap_tokens(data_utils.LOWERCASED_TOKENS | data_utils.NAME_PREFIXES)

    # Detection based on part-of-speech tags
    nnp_detector = TokenConstraintAnnotator("nnp_detector",
                                            lambda tok: tok.tag_ == "NNP", "ENT")

    # Detection based on dependency relations (compound phrases)
    compound = lambda tok: utils.is_likely_proper(tok) and utils.in_compound(tok)
    compound_detector = TokenConstraintAnnotator("compound_detector", compound, "ENT")

    exclusives = ["date_detector", "time_detector", "money_detector"]
    for annotator in [proper_detector, proper2_detector, nnp_detector, compound_detector]:
        annotator.add_incompatible_sources(exclusives)
        annotator.add_gap_tokens(["'s", "-"])
        self.add_annotator(annotator)

        # We add one variant for each NE detector, looking at infrequent tokens
        infrequent_name = "infrequent_%s" % annotator.name
        self.add_annotator(SpanConstraintAnnotator(infrequent_name,
                                                   annotator.name, utils.is_infrequent))

    # Other types (legal references etc.)
    misc_detector = FunctionAnnotator("misc_detector", misc_generator)
    legal_detector = FunctionAnnotator("legal_detector", legal_generator)

    # Detection of companies with a legal type
    ends_with_legal_suffix = lambda x: x[-1].lower_.rstrip(".") in data_utils.LEGAL_SUFFIXES
    company_type_detector = SpanConstraintAnnotator("company_type_detector",
                                                    "proper2_detector",
                                                    ends_with_legal_suffix, "COMPANY")

    # Detection of person names
    full_name_detector = SpanConstraintAnnotator("full_name_detector",
                                                 "proper2_detector", FullNameDetector(), "PERSON")
    name_detector2 = SpanConstraintAnnotator("name_detector", "proper_detector",
                                             constraint=name_detector, label="PERSON")

    for annotator in [misc_detector, legal_detector, company_type_detector,
                      full_name_detector, name_detector2]:
        annotator.add_incompatible_sources(exclusives)
        self.add_annotator(annotator)

    # General number detector
    number_detector = FunctionAnnotator("number_detector", number_generator)
    number_detector.add_incompatible_sources(exclusives + ["legal_detector", "company_type_detector"])
    self.add_annotator(number_detector)

    self.add_annotator(SnipsAnnotator("snips"))
    return self
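# Minimal usage sketch for add_shallow. The aggregate class name ("ShallowAnnotator"),
# the spaCy model name, and the assumption that the combined annotator can be applied
# directly to a Doc are all illustrative assumptions, not part of the code above.
#
# import spacy
#
# nlp = spacy.load("en_core_web_md")
# doc = nlp("Acme Corp. paid 500,000 USD to Pierre Lison on 3 March 2020.")
#
# annotator = ShallowAnnotator()     # hypothetical aggregate exposing add_shallow()
# annotator.add_shallow()            # registers all detectors defined above
# doc = annotator(doc)               # assumed to run every registered annotator
#
# # Each detector is expected to write its spans into its own layer of doc.spans:
# for name in ["date_detector", "money_detector", "proper2_detector"]:
#     print(name, [(s.text, s.label_) for s in doc.spans.get(name, [])])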
def legal_generator(doc):

    legal_spans = []
    for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        if not utils.is_likely_proper(doc[span.end - 1]):
            continue

        last_token = doc[span.end - 1].text.title().rstrip("s")
        if last_token in LEGAL:
            legal_spans.append((span.start, span.end, "LAW"))

    # Handling legal references such as Article 5
    for i in range(len(doc) - 1):
        if doc[i].text.rstrip("s") in {"Article", "Paragraph", "Section", "Chapter", "§"}:
            if doc[i + 1].text[0].isdigit() or doc[i + 1].text in ROMAN_NUMERALS:
                start, end = i, i + 2
                if (i < len(doc) - 3 and doc[i + 2].text in {"-", "to", "and"}
                        and (doc[i + 3].text[0].isdigit() or doc[i + 3].text in ROMAN_NUMERALS)):
                    end = i + 4
                legal_spans.append((start, end, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    legal_spans = utils.merge_contiguous_spans(legal_spans, doc)
    for start, end, label in legal_spans:
        yield start, end, label
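# A small sketch of the "Article 5"-style branch of legal_generator, assuming a plain
# spaCy pipeline (model name is an assumption). The first loop only contributes if the
# "proper2_detector" / "nnp_detector" layers are already populated, so they are set to
# empty lists here to keep the example self-contained.
#
# import spacy
#
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Article 5, Paragraph 3 of the Treaty applies here.")
# doc.spans["proper2_detector"] = []
# doc.spans["nnp_detector"] = []
#
# # Expected output (after merge_contiguous_spans): roughly one LAW span covering
# # "Article 5, Paragraph 3", provided the helper merges the two adjacent references.
# for start, end, label in legal_generator(doc):
#     print(doc[start:end].text, label)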
def test_likely_proper(nlp_small, nlp):
    for nlpx in [nlp_small, nlp]:
        doc = nlpx("This is a test. Please tell me that is works.")
        for tok in doc:
            assert not utils.is_likely_proper(tok)

        doc = nlpx("Pierre Lison is living in Oslo.")
        for i, tok in enumerate(doc):
            assert utils.is_likely_proper(tok) == (i in {0, 1, 5})

        doc = nlpx("Short sentence. But here, Beyond can be an organisation.")
        for i, tok in enumerate(doc):
            assert utils.is_likely_proper(tok) == (i in {6})

    doc = nlp_small("Buying an iPad makes you ekrjøewlkrj in the USA.")
    for i, tok in enumerate(doc):
        assert utils.is_likely_proper(tok) == (i in {2, 8})

    doc = nlp("Buying an iPad makes you ekrjøewlkrj in the USA.")
    for i, tok in enumerate(doc):
        assert utils.is_likely_proper(tok) == (i in {2, 8, 5})
def get_first_mentions(self, doc) -> Dict[Tuple[str, ...], Span]:
    """Returns a dictionary mapping the first mention of each entity (as a tuple of
    token texts) to the corresponding Span, according to the "other_name" layer.
    The first mentions also include subsequences: for instance, a named entity
    "Pierre Lison" will also yield first mentions for ("Pierre",) and ("Lison",).
    """

    if self.other_name not in doc.spans:
        return {}

    first_observed = {}
    for span in doc.spans[self.other_name]:

        # NB: we only consider entities with at least two tokens
        if span.label_ not in self.labels or len(span) < 2:
            continue

        # We also extract subsequences
        for length in range(1, len(span) + 1):
            for i in range(length, len(span) + 1):
                start2 = span.start + i - length
                end2 = span.start + i
                subseq = tuple(tok.text for tok in doc[start2:end2])

                # We only consider first mentions
                if subseq in first_observed:
                    continue

                # To avoid too many FPs, the mention must have at least 4 characters
                if sum(len(tok) for tok in subseq) < 4:
                    continue

                # And if the span looks like a proper name, then at least one
                # token in the subsequence must look like a proper name too
                if (any(utils.is_likely_proper(tok) for tok in span)
                        and not any(utils.is_likely_proper(tok) for tok in doc[start2:end2])):
                    continue

                first_observed[subseq] = Span(doc, start2, end2, span.label_)

    return first_observed
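# A sketch of the subsequence behaviour described in the docstring. The annotator
# instance, its other_name layer ("all_names"), the label set, and the spaCy model
# name are illustrative assumptions.
#
# import spacy
# from spacy.tokens import Span
#
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Pierre Lison works in Oslo. Lison also teaches.")
# doc.spans["all_names"] = [Span(doc, 0, 2, "PERSON")]   # "Pierre Lison"
#
# first_mentions = annotator.get_first_mentions(doc)     # 'annotator' is hypothetical
#
# # Expected keys: ("Pierre", "Lison"), ("Pierre",) and ("Lison",), each mapped to the
# # matching Span inside the first occurrence of "Pierre Lison"; the later standalone
# # "Lison" is not a first mention and is therefore ignored.
# for subseq, span in first_mentions.items():
#     print(subseq, "->", span.start, span.end, span.label_)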