Example #1
    def add_shallow(self):
        """Adds shallow annotation functions"""
        
        # Detection of dates, time, money, and numbers
        self.add_annotator(FunctionAnnotator("date_detector", date_generator))
        self.add_annotator(FunctionAnnotator("time_detector", time_generator))
        self.add_annotator(FunctionAnnotator("money_detector", money_generator))
        
        # Detection based on casing
        proper_detector = TokenConstraintAnnotator("proper_detector", lambda tok: utils.is_likely_proper(tok), "ENT")
    
        # Detection based on casing, but allowing some lowercased tokens
        proper2_detector = TokenConstraintAnnotator("proper2_detector", lambda tok: utils.is_likely_proper(tok), "ENT")
        proper2_detector.add_gap_tokens(data_utils.LOWERCASED_TOKENS | data_utils.NAME_PREFIXES)
        
        # Detection based on part-of-speech tags
        nnp_detector = TokenConstraintAnnotator("nnp_detector", lambda tok: tok.tag_=="NNP", "ENT")
        
        # Detection based on dependency relations (compound phrases)
        compound = lambda tok: utils.is_likely_proper(tok) and utils.in_compound(tok)
        compound_detector = TokenConstraintAnnotator("compound_detector", compound, "ENT")
 
        exclusives = ["date_detector", "time_detector", "money_detector"]
        for annotator in [proper_detector, proper2_detector, nnp_detector, compound_detector]:
            annotator.add_incompatible_sources(exclusives)
            annotator.add_gap_tokens(["'s", "-"])
            self.add_annotator(annotator)

            # We add one variant for each NE detector, looking at infrequent tokens
            infrequent_name = "infrequent_%s" % annotator.name
            self.add_annotator(SpanConstraintAnnotator(infrequent_name, annotator.name, utils.is_infrequent))
        
        # Other types (legal references etc.)      
        misc_detector = FunctionAnnotator("misc_detector", misc_generator)
        legal_detector = FunctionAnnotator("legal_detector", legal_generator)
        
        # Detection of companies with a legal type
        ends_with_legal_suffix = lambda x: x[-1].lower_.rstrip(".") in data_utils.LEGAL_SUFFIXES
        company_type_detector = SpanConstraintAnnotator("company_type_detector", "proper2_detector", 
                                                        ends_with_legal_suffix, "COMPANY")

        # Detection of person names
        full_name_detector = SpanConstraintAnnotator("full_name_detector", "proper2_detector", 
                                                     FullNameDetector(), "PERSON")
        # ("name_detector" takes a constraint function assumed to be defined
        # elsewhere in the module, analogous to FullNameDetector above)
        name_detector2 = SpanConstraintAnnotator("name_detector", "proper_detector", 
                                                 constraint=name_detector, label="PERSON")
        
        for annotator in [misc_detector, legal_detector, company_type_detector, 
                          full_name_detector, name_detector2]:
            annotator.add_incompatible_sources(exclusives)
            self.add_annotator(annotator)
         
        # General number detector
        number_detector = FunctionAnnotator("number_detector", number_generator)
        number_detector.add_incompatible_sources(exclusives + ["legal_detector", "company_type_detector"])
        self.add_annotator(number_detector)  
            
        # Snips rule-based parser (dates, times, numbers, etc.)
        self.add_annotator(SnipsAnnotator("snips"))
        return self
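
These snippets follow the interfaces of the skweak weak-supervision library (FunctionAnnotator, SpanConstraintAnnotator, CombinedAnnotator). As a minimal usage sketch of the pattern above: year_generator below is a toy stand-in for generator functions such as date_generator (not shown here), and the setup assumes skweak's base.CombinedAnnotator and heuristics.FunctionAnnotator.

import re
import spacy
from skweak import base, heuristics

# Toy labelling function in the same style as date_generator above:
# it yields (start, end, label) triples over the document's tokens.
def year_generator(doc):
    for tok in doc:
        if re.fullmatch(r"(19|20)\d{2}", tok.text):
            yield tok.i, tok.i + 1, "DATE"

nlp = spacy.load("en_core_web_sm")
doc = nlp("The agreement was signed in 2014 and renewed in 2021.")

pipeline = base.CombinedAnnotator()
pipeline.add_annotator(heuristics.FunctionAnnotator("year_detector", year_generator))
doc = pipeline(doc)

# Each annotator writes its spans into doc.spans[<annotator name>]
print([(s.text, s.label_) for s in doc.spans["year_detector"]])
# -> [('2014', 'DATE'), ('2021', 'DATE')]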
Example #2
def legal_generator(doc):
    legal_spans = []
    for span in utils.get_spans(doc, ["proper2_detector", "nnp_detector"]):
        if not utils.is_likely_proper(doc[span.end - 1]):
            continue
        last_token = doc[span.end - 1].text.title().rstrip("s")

        if last_token in LEGAL:
            legal_spans.append((span.start, span.end, "LAW"))

    # Handling legal references such as "Article 5"
    for i in range(len(doc) - 1):
        if doc[i].text.rstrip("s") in {"Article", "Paragraph", "Section", "Chapter", "§"}:
            if doc[i + 1].text[0].isdigit() or doc[i + 1].text in ROMAN_NUMERALS:
                start, end = i, i + 2
                # Extend the span over ranges such as "Articles 5 to 7"
                if (i < len(doc) - 3 and doc[i + 2].text in {"-", "to", "and"}
                        and (doc[i + 3].text[0].isdigit()
                             or doc[i + 3].text in ROMAN_NUMERALS)):
                    end = i + 4
                legal_spans.append((start, end, "LAW"))

    # Merge contiguous spans of legal references ("Article 5, Paragraph 3")
    legal_spans = utils.merge_contiguous_spans(legal_spans, doc)
    for start, end, label in legal_spans:
        yield start, end, label
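
Since legal_generator yields (start, end, label) triples, it can be wrapped in a FunctionAnnotator exactly like the detectors in Example #1. A sketch, assuming skweak's heuristics module and a document that has already been annotated by the upstream proper2_detector and nnp_detector layers this function reads from:

from skweak.heuristics import FunctionAnnotator

# legal_generator reads the "proper2_detector" and "nnp_detector" span layers,
# so those annotators must have been applied to the document first.
legal_detector = FunctionAnnotator("legal_detector", legal_generator)
doc = legal_detector(doc)

print([(span.text, span.label_) for span in doc.spans["legal_detector"]])
# e.g. [('Article 5, Paragraph 3', 'LAW')]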
Example #3
def test_likely_proper(nlp_small, nlp):
    for nlpx in [nlp_small, nlp]:
        doc = nlpx("This is a test. Please tell me that is works.")
        for tok in doc:
            assert not utils.is_likely_proper(tok)
        doc = nlpx("Pierre Lison is living in Oslo.")
        for i, tok in enumerate(doc):
            assert utils.is_likely_proper(tok) == (i in {0, 1, 5})
        doc = nlpx("Short sentence. But here, Beyond can be an organisation.")
        for i, tok in enumerate(doc):
            assert utils.is_likely_proper(tok) == (i in {6})

    doc = nlp_small("Buying an iPad makes you ekrjøewlkrj in the USA.")
    for i, tok in enumerate(doc):
        assert utils.is_likely_proper(tok) == (i in {2, 8})
    doc = nlp("Buying an iPad makes you ekrjøewlkrj in the USA.")
    for i, tok in enumerate(doc):
        assert utils.is_likely_proper(tok) == (i in {2, 5, 8})
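
For reference, a simplified sketch of the kind of casing heuristic these tests pin down. This is not skweak's actual utils.is_likely_proper, which also uses model-dependent signals (which is why nlp_small and nlp disagree on the out-of-vocabulary token "ekrjøewlkrj" above):

def is_likely_proper_sketch(tok) -> bool:
    """Simplified casing-based approximation of is_likely_proper."""
    text = tok.text
    if len(text) < 2:
        return False
    # Sentence-initial capitalisation alone is uninformative
    if tok.is_sent_start:
        return tok.tag_ == "NNP"
    return (text.istitle()                                                  # "Oslo"
            or text.isupper()                                               # "USA"
            or (text[0].islower() and any(c.isupper() for c in text[1:])))  # "iPad"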
Example #4
    def get_first_mentions(self, doc) -> Dict[Tuple[str, ...], Span]:
        """Returns a dictionary mapping token sequences (as tuples of strings)
        to the first mention of each entity, as a Span, according to the
        "other_name" span layer.

        The first mentions also include subsequences: for instance, a named
        entity "Pierre Lison" will also yield first mentions for ("Pierre",)
        and ("Lison",).
        """
        if self.other_name not in doc.spans:
            return {}

        first_observed = {}
        for span in doc.spans[self.other_name]:

            # NB: We only consider entities with at least two tokens
            if span.label_ not in self.labels or len(span) < 2:
                continue

            # We also extract all contiguous subsequences of the span,
            # from single tokens up to the full span
            for length in range(1, len(span) + 1):
                for i in range(length, len(span) + 1):

                    start2 = span.start + i - length
                    end2 = span.start + i
                    subseq = tuple(tok.text for tok in doc[start2:end2])

                    # We only consider first mentions
                    if subseq in first_observed:
                        continue

                    # To avoid too many FPs, the mention must have at least 4 characters
                    if sum(len(tok) for tok in subseq) < 4:
                        continue
                    
                    # And if the span looks like a proper name, then at least one 
                    # token in the subsequence must look like a proper name too 
                    if (any(utils.is_likely_proper(tok) for tok in span) and not 
                          any(utils.is_likely_proper(tok) for tok in doc[start2:end2])):
                        continue
                        
                    first_observed[subseq] = Span(doc, start2, end2, span.label_)

        return first_observed
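
A sketch of how the returned dictionary might be consumed, for instance to propagate the label of a first mention to later occurrences of the same token sequence (the usual document-level consistency pattern); annotator and doc are assumed to be set up as in the earlier examples:

first_mentions = annotator.get_first_mentions(doc)

# Scan the rest of the document for recurrences of each recorded token
# sequence and transfer the label of the first mention onto them.
for subseq, first_span in first_mentions.items():
    n = len(subseq)
    for start in range(first_span.end, len(doc) - n + 1):
        if tuple(tok.text for tok in doc[start:start + n]) == subseq:
            print(doc[start:start + n].text, "->", first_span.label_)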