def test_synonyms_count_none():
    drop_caches()
    # Calling with no word yields the full lemma inventory; check its size.
    assert len(SubstitutionFeaturesMixin._synonyms_count()) == 147306
    # Each lemma is lowercase, unless it starts or ends with a digit
    # (e.g. ordinals like '21st').
    for lemma in SubstitutionFeaturesMixin._synonyms_count():
        assert lemma.islower() or is_int(lemma[0]) or is_int(lemma[-1])
def test_is_int():
    # Only strings of digits count as integers.
    assert is_int('20')
    # Decimals, mixed alphanumerics, and non-string values are rejected —
    # even actual ints and floats.
    for value in ('20.0', '20.1', '2a', '21st', None, 1, 1.0, 1.2):
        assert not is_int(value)
def _syllables_count(cls, word=None):
    """Compute the number of syllables in `word`.

    Parameters
    ----------
    word : str, optional
        Word to count syllables for. If None (default), return the
        iterable of all words for which a count is available.

    Returns
    -------
    float
        Mean syllable count over all known pronunciations of `word`,
        or `np.nan` if `word` has no known pronunciation. If `word` is
        None, the keys view of the pronunciations mapping instead.
    """
    pronunciations = _get_pronunciations()
    if word is None:
        # No word given: expose the set of words we can handle.
        return pronunciations.keys()
    if word not in pronunciations:
        return np.nan
    # A phoneme ending in a digit marks a (stressed) vowel — presumably
    # CMU/ARPAbet-style pronunciations; TODO confirm with
    # _get_pronunciations(). Counting those phonemes counts syllables;
    # average across the word's pronunciation variants.
    # (Generators instead of inner lists: no throwaway allocations.)
    return np.mean([sum(is_int(ph[-1]) for ph in pronunciation)
                    for pronunciation in pronunciations[word]])
def test_frequency_none_with_computed():
    drop_caches()
    # Each lemma is lowercase, digit-edged (e.g. '21st'), or one of the
    # two tolerated symbols.
    for lemma in SubstitutionFeaturesMixin._frequency():
        if lemma in ["%", "!"]:
            continue
        assert lemma.islower() or is_int(lemma[0]) or is_int(lemma[-1])
def test_letters_count_none_with_computed():
    drop_caches()
    # Each lemma is lowercase, unless it starts or ends with a digit.
    for lemma in SubstitutionFeaturesMixin._letters_count():
        lemma_ok = (lemma.islower()
                    or is_int(lemma[0])
                    or is_int(lemma[-1]))
        assert lemma_ok
def validate(self):
    """Check whether or not this substitution is worth keeping.

    Runs the candidate substitution through a series of rejection
    filters (non-words, abbreviations, spelling variants, stopwords,
    word deletions/insertions, words stuck together or separated) and
    returns False at the first filter that matches, True if none do.
    """
    token1, token2 = self.tokens
    lem1, lem2 = self.lemmas
    tokens1, tokens2 = self.source.tokens, self.destination.tokens
    lemmas1, lemmas2 = self.source.lemmas, self.destination.lemmas
    # Only real-word lemmas.
    wordnet_words = _get_wordnet_words()
    if lem1 not in wordnet_words or lem2 not in wordnet_words:
        return False
    # '21st'/'twenty-first', etc.
    if (is_int(token1[0]) or is_int(token2[0])
            or is_int(lem1[0]) or is_int(lem2[0])):
        return False
    # 'sen'/'senator', 'gov'/'governor', 'nov'/'november', etc.
    if (token1 == token2[:3] or token2 == token1[:3]
            or lem1 == lem2[:3] or lem2 == lem1[:3]):
        return False
    # 'programme'/'program', etc.
    if (token1[:-2] == token2 or token2[:-2] == token1
            or lem1[:-2] == lem2 or lem2[:-2] == lem1):
        return False
    # 'centre'/'center', etc.
    if is_same_ending_us_uk_spelling(token1, token2):
        return False
    if is_same_ending_us_uk_spelling(lem1, lem2):
        return False
    # stopwords
    if (token1 in stopwords or token2 in stopwords
            or lem1 in stopwords or lem2 in stopwords):
        return False
    # Other minor spelling changes, also catching cases where tokens are
    # not different but lemmas are (because of lemmatization
    # fluctuations).
    if levenshtein(token1, token2) <= 1:
        return False
    if levenshtein(lem1, lem2) <= 1:
        return False
    # NOTE: below, `self.start + self.position` indexes the substituted
    # word in the source sentence, and `self.position` indexes it in the
    # destination sentence; neighbours are probed at +-1 and +-2.
    # Word deletion ('high school' -> 'school')
    if (self.start + self.position > 0
            and (token2 == tokens1[self.start + self.position - 1]
                 or lem2 == lemmas1[self.start + self.position - 1])):
        return False
    if (self.start + self.position < len(tokens1) - 1
            and (token2 == tokens1[self.start + self.position + 1]
                 or lem2 == lemmas1[self.start + self.position + 1])):
        return False
    # Word insertion ('school' -> 'high school')
    if (self.position > 0
            and (token1 == tokens2[self.position - 1]
                 or lem1 == lemmas2[self.position - 1])):
        return False
    if (self.position < len(tokens2) - 1
            and (token1 == tokens2[self.position + 1]
                 or lem1 == lemmas2[self.position + 1])):
        return False
    # Two words deletion ('supply of energy' -> 'supply')
    if (self.start + self.position > 1
            and (token2 == tokens1[self.start + self.position - 2]
                 or lem2 == lemmas1[self.start + self.position - 2])):
        return False
    if (self.start + self.position < len(tokens1) - 2
            and (token2 == tokens1[self.start + self.position + 2]
                 or lem2 == lemmas1[self.start + self.position + 2])):
        return False
    # Words stuck together ('policy maker' -> 'policymaker'
    # or 'policy-maker')
    if (self.start + self.position > 0
            and (token2 == tokens1[self.start + self.position - 1] + token1
                 or token2 == (tokens1[self.start + self.position - 1]
                               + '-' + token1)
                 or lem2 == lemmas1[self.start + self.position - 1] + lem1
                 or lem2 == (lemmas1[self.start + self.position - 1]
                             + '-' + lem1))):
        return False
    if (self.start + self.position < len(tokens1) - 1
            and (token2 == token1 + tokens1[self.start + self.position + 1]
                 or token2 == (token1 + '-'
                               + tokens1[self.start + self.position + 1])
                 or lem2 == lem1 + lemmas1[self.start + self.position + 1]
                 or lem2 == (lem1 + '-'
                             + lemmas1[self.start + self.position + 1]))):
        return False
    # Words separated ('policymaker' or 'policy-maker' -> 'policy maker')
    if (self.position > 0
            and (token1 == tokens2[self.position - 1] + token2
                 or token1 == tokens2[self.position - 1] + '-' + token2
                 or lem1 == lemmas2[self.position - 1] + lem2
                 or lem1 == lemmas2[self.position - 1] + '-' + lem2)):
        return False
    if (self.position < len(tokens2) - 1
            and (token1 == token2 + tokens2[self.position + 1]
                 or token1 == token2 + '-' + tokens2[self.position + 1]
                 or lem1 == lem2 + lemmas2[self.position + 1]
                 or lem1 == lem2 + '-' + lemmas2[self.position + 1])):
        return False
    # We need 2 extra checks compared to the words-stuck-together
    # situation, to detect the second substitution appearing because of
    # word separation. Indeed in this case, contrary to
    # words-stuck-together, we can't rely on word shifts always being
    # present, since the destination can be cut shorter. In other words,
    # in the following case:
    # (1) i'll come anytime there
    # (2) i'll come any time
    # these checks let us exclude 'there' -> 'time' as a substitution
    # (in the words-stuck-together case, the word 'there' would be
    # present in both sentences, shifted).
    if (self.position > 0
            and (tokens1[self.start + self.position - 1]
                 == tokens2[self.position - 1] + token2
                 or tokens1[self.start + self.position - 1]
                 == tokens2[self.position - 1] + '-' + token2
                 or lemmas1[self.start + self.position - 1]
                 == lemmas2[self.position - 1] + lem2
                 or lemmas1[self.start + self.position - 1]
                 == lemmas2[self.position - 1] + '-' + lem2)):
        return False
    if (self.position < len(tokens2) - 1
            and (tokens1[self.start + self.position + 1]
                 == token2 + tokens2[self.position + 1]
                 or tokens1[self.start + self.position + 1]
                 == token2 + '-' + tokens2[self.position + 1]
                 or lemmas1[self.start + self.position + 1]
                 == lem2 + lemmas2[self.position + 1]
                 or lemmas1[self.start + self.position + 1]
                 == lem2 + '-' + lemmas2[self.position + 1])):
        return False
    return True