Example #1
0
def is_likely_proper(tok: Token, min_rank=200) -> bool:
    """Returns true if the spacy token is a likely proper name, based on its form.

    NB: this method should only be used for languages that have a distinction between
    lowercase and uppercase (so called bicameral scripts)."""

    # Tokens shorter than two characters are never treated as proper names
    if len(tok) < 2:
        return False

    lemma = tok.lemma_
    text = tok.text

    # A titled lemma longer than two characters is a strong signal
    if lemma.istitle() and len(lemma) > 2:
        return True
    # So is a fully uppercase lemma (excluding spacy's old "-PRON-" placeholder)
    if lemma.isupper() and len(lemma) > 2 and lemma != "-PRON-":
        return True
    # No lemma available, but the surface form is all uppercase
    if lemma == "" and tok.is_upper:
        return True

    # The min_rank most frequent words are never considered proper names
    if (lemma.islower() and tok.lemma in tok.vocab.strings
            and tok.vocab[tok.lemma].rank < min_rank):
        return False

    # Mixed-case forms such as "iPad" (lowercase first letter, uppercase second)
    if len(tok) > 2 and text[0].islower() and text[1].isupper():
        return True

    # Mixed-case forms such as "IceFog" (uppercase start, then both cases)
    if (len(tok) > 2 and text[0].isupper()
            and any(ch.islower() for ch in text[1:])
            and any(ch.isupper() for ch in text[1:])):
        return True

    # A titled surface form counts, unless it is sentence-initial or directly
    # follows a quote, separator, newline, or an abbreviation dot
    # NB: This should be commented out for languages such as German
    if (tok.i > 0 and tok.is_title and not tok.is_sent_start and
            tok.nbor(-1).text not in {'\'', '"', '‘', '“', '”', '’', "\n", "|"}
            and not tok.nbor(-1).text.endswith(".")):
        return True

    # Trust the part-of-speech tagger when it says proper noun
    if tok.pos_ == "PROPN":
        return True

    # A longer lowercase token that is out-of-vocabulary (and the model does
    # have vectors to judge that) is also likely a name
    if (len(tok) > 3 and tok.is_lower
            and len(tok.vocab.vectors) > 0 and tok.is_oov):
        return True

    return False
Example #2
0
    def _get_lookahead(self, token: Token, next_sentence_boundary: int) -> int:
        """Return the length of the longest span that may start at `token`.

        The span length is capped at self.lookahead, is zero when the token
        is punctuation or sits inside a compound phrase headed by itself,
        and never crosses the next sentence boundary."""

        # Punctuation can never start a span
        if token.is_punct:
            return 0

        # Neither can the second part of a compound headed by this token
        if token.i > 0:
            previous = token.nbor(-1)
            if previous.dep_ == "compound" and previous.head == token:
                return 0

        # Cap the lookahead window at the next sentence boundary
        remaining = next_sentence_boundary - token.i
        return min(remaining, self.lookahead)
Example #3
0
def in_compound(tok: Token) -> bool:
    """Returns true if the spacy token is part of a compound phrase.

    A token is part of a compound phrase either when it carries the
    "compound" dependency label itself, or when it directly follows a
    token that does (e.g. "market" in "stock market").
    """
    # -> bool annotation added for consistency with is_likely_proper.
    if tok.dep_ == "compound":
        return True
    # The head of the compound follows the dependent, so also look one
    # token back (guarding against the document-initial position).
    return tok.i > 0 and tok.nbor(-1).dep_ == "compound"
Example #4
0
def last_token_of_entity(doc: Doc, token: Token) -> Token:
    """Given a token inside an entity, walks to the right until reaching the
    last token whose IOB tag is still Inside (I), and returns it.

    Example: for the entity "HMGB1-induced", returns the token "induced".

    NB: `doc` is kept for backward compatibility with existing callers; the
    walk only uses the token's own neighbourhood.
    """
    # Original implementation was recursive; an iterative walk is equivalent
    # and avoids call-stack growth on long entities. (The second, dead string
    # literal that followed the docstring has been folded into it.)
    current = token
    # ent_iob == 1 means the *next* token is still Inside the entity
    while current.nbor().ent_iob == 1:
        current = current.nbor()
    return current