def is_likely_proper(tok: Token, min_rank=200) -> bool:
    """Heuristically decide, from its surface form, whether a spaCy token
    looks like a proper name.

    The checks are applied in order and the first one that matches decides
    the outcome. ``min_rank`` is the vocabulary rank below which a lowercase
    lemma is considered too common to be a proper name.

    NB: only meaningful for bicameral scripts, i.e. languages that
    distinguish lowercase from uppercase letters.
    """
    n_chars = len(tok)

    # Tokens shorter than two characters are never proper names
    if n_chars < 2:
        return False

    lemma = tok.lemma_
    long_lemma = len(lemma) > 2

    # Titled or fully uppercase lemmas (except the pronoun placeholder)
    if long_lemma and (lemma.istitle()
                       or (lemma.isupper() and lemma != "-PRON-")):
        return True

    # No lemma available, but the token itself is in uppercase
    if lemma == "" and tok.is_upper:
        return True

    # Very frequent lowercase lemmas (rank below min_rank) are rejected
    if (lemma.islower()
            and tok.lemma in tok.vocab.strings
            and tok.vocab[tok.lemma].rank < min_rank):
        return False

    if n_chars > 2:
        surface = tok.text
        # Lowercase first letter followed by an uppercase one, e.g. iPad
        if surface[0].islower() and surface[1].isupper():
            return True
        # Uppercase first letter with mixed case afterwards, e.g. IceFog
        if surface[0].isupper():
            tail = surface[1:]
            if (any(ch.islower() for ch in tail)
                    and any(ch.isupper() for ch in tail)):
                return True

    # Titled surface form that is not sentence-initial and does not follow
    # a quote, newline, pipe or a token ending with a period
    # NB: This should be commented out for languages such as German
    if tok.i > 0 and tok.is_title and not tok.is_sent_start:
        previous_text = tok.nbor(-1).text
        if (previous_text not in {'\'', '"', '‘', '“', '”', '’', "\n", "|"}
                and not previous_text.endswith(".")):
            return True

    # The tagger marked the token as a proper noun
    if tok.pos_ == "PROPN":
        return True

    # Lowercase but out-of-vocabulary token (only when vectors are loaded)
    if (n_chars > 3 and tok.is_lower
            and len(tok.vocab.vectors) > 0 and tok.is_oov):
        return True

    return False
def _get_lookahead(self, token: Token, next_sentence_boundary: int) -> int:
    """Compute the length of the longest span that may start at ``token``.

    The result obeys three constraints:
    - it never exceeds self.lookahead
    - it is 0 when the token is punctuation, or when the preceding token is
      a compound modifier whose head is this token (i.e. we are inside a
      compound phrase)
    - it never extends past ``next_sentence_boundary``
    """
    # Spans may not begin with punctuation
    if token.is_punct:
        return 0

    # Spans may not begin in the middle of a compound phrase
    if token.i > 0:
        previous = token.nbor(-1)
        if previous.dep_ == "compound" and previous.head == token:
            return 0

    tokens_left_in_sentence = next_sentence_boundary - token.i
    return min(tokens_left_in_sentence, self.lookahead)
def in_compound(tok: Token):
    """Tell whether the spaCy token belongs to a compound phrase.

    This holds when the token itself has the "compound" dependency label,
    or when the token immediately before it does.
    """
    return tok.dep_ == "compound" or (
        tok.i > 0 and tok.nbor(-1).dep_ == "compound")
def last_token_of_entity(doc: Doc, token: Token) -> Token:
    """Return the last token of the entity that ``token`` is part of.

    Starting at ``token``, moves to the right for as long as the following
    token has an IOB code of 1 (Inside), and returns the last token reached.
    Example: HMGB1-induced -> induced

    Unlike a naive ``token.nbor()`` walk, this stops cleanly when the entity
    runs up to the very last token of ``doc`` instead of raising an
    IndexError, and it iterates rather than recursing so that long entities
    cannot exhaust the recursion limit.

    Args:
        doc: the document containing ``token``.
        token: a token located inside an entity.

    Returns:
        The right-most token of the entity; ``token`` itself if the next
        token is not marked Inside or if ``token`` is the last in ``doc``.
    """
    current = token
    doc_length = len(doc)

    # Only look at the right neighbour while one actually exists
    while current.i + 1 < doc_length:
        following = doc[current.i + 1]
        if following.ent_iob != 1:
            break
        current = following

    return current