def test_synonyms_count_none():
    drop_caches()
    # Lemmas are properly counted.
    assert len(SubstitutionFeaturesMixin._synonyms_count()) == 147306
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._synonyms_count():
        assert word.islower() or is_int(word[0]) or is_int(word[-1])
Example #2
def test_is_int():
    assert is_int('20')
    assert not is_int('20.0')
    assert not is_int('20.1')
    assert not is_int('2a')
    assert not is_int('21st')
    assert not is_int(None)
    assert not is_int(1)
    assert not is_int(1.0)
    assert not is_int(1.2)
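Taken together, these assertions pin down a strict contract for is_int: only strings that parse as a plain integer pass, while float-looking strings, mixed alphanumerics, None, and even genuine numeric types are all rejected. A minimal sketch of a compatible implementation (hypothetical, not necessarily the project's own helper):

def is_int(value):
    """Return True only for strings representing a plain integer."""
    if not isinstance(value, str):
        # The contract is strings only: None, 1 and 1.0 are all rejected.
        return False
    try:
        int(value)
        return True
    except ValueError:
        return False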
Example #3
    @classmethod
    def _syllables_count(cls, word=None):
        """<#syllables>"""
        pronunciations = _get_pronunciations()
        if word is None:
            # With no word given, return all the words we have
            # pronunciations for.
            return pronunciations.keys()
        if word not in pronunciations:
            return np.nan
        # CMUdict vowel phonemes end with a stress digit (0, 1 or 2), so
        # counting integer-terminated phonemes counts syllables; average
        # over a word's alternative pronunciations.
        return np.mean([sum(is_int(ph[-1]) for ph in pronunciation)
                        for pronunciation in pronunciations[word]])
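The trick here is that CMU-style phoneme transcriptions mark vowels with a trailing stress digit (0, 1 or 2), so is_int(ph[-1]) selects exactly the vowel phonemes, and np.mean averages over a word's alternative pronunciations. A self-contained illustration, using str.isdigit in place of is_int and a hardcoded CMUdict-style entry for 'tomato' (assumed here for illustration):

import numpy as np

# Two CMUdict-style pronunciation variants for 'tomato'.
pronunciations = [['T', 'AH0', 'M', 'EY1', 'T', 'OW2'],
                  ['T', 'AH0', 'M', 'AA1', 'T', 'OW2']]
# Each variant contains 3 phonemes ending in a stress digit.
print(np.mean([sum(ph[-1].isdigit() for ph in pronunciation)
               for pronunciation in pronunciations]))  # 3.0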
def test_frequency_none_with_computed():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._frequency():
        assert (word.islower() or is_int(word[0]) or is_int(word[-1]) or
                word in ["%", "!"])
def test_letters_count_none_with_computed():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._letters_count():
        assert word.islower() or is_int(word[0]) or is_int(word[-1])
Example #6
    def validate(self):
        """Check whether or not this substitution is worth keeping."""

        token1, token2 = self.tokens
        lem1, lem2 = self.lemmas
        tokens1, tokens2 = self.source.tokens, self.destination.tokens
        lemmas1, lemmas2 = self.source.lemmas, self.destination.lemmas

        # Only real-word lemmas.
        wordnet_words = _get_wordnet_words()
        if lem1 not in wordnet_words or lem2 not in wordnet_words:
            return False
        # '21st'/'twenty-first', etc.
        if (is_int(token1[0]) or is_int(token2[0]) or
                is_int(lem1[0]) or is_int(lem2[0])):
            return False
        # 'sen'/'senator', 'gov'/'governor', 'nov'/'november', etc.
        if (token1 == token2[:3] or token2 == token1[:3] or
                lem1 == lem2[:3] or lem2 == lem1[:3]):
            return False
        # 'programme'/'program', etc.
        if (token1[:-2] == token2 or token2[:-2] == token1 or
                lem1[:-2] == lem2 or lem2[:-2] == lem1):
            return False
        # 'centre'/'center', etc.
        if is_same_ending_us_uk_spelling(token1, token2):
            return False
        if is_same_ending_us_uk_spelling(lem1, lem2):
            return False
        # stopwords
        if (token1 in stopwords or token2 in stopwords or
                lem1 in stopwords or lem2 in stopwords):
            return False
        # Other minor spelling changes, also catching cases where tokens are
        # not different but lemmas are (because of lemmatization fluctuations).
        if levenshtein(token1, token2) <= 1:
            return False
        if levenshtein(lem1, lem2) <= 1:
            return False
        # Word deletion ('high school' -> 'school')
        if (self.start + self.position > 0 and
            (token2 == tokens1[self.start + self.position - 1] or
             lem2 == lemmas1[self.start + self.position - 1])):
            return False
        if (self.start + self.position < len(tokens1) - 1 and
            (token2 == tokens1[self.start + self.position + 1] or
             lem2 == lemmas1[self.start + self.position + 1])):
            return False
        # Word insertion ('school' -> 'high school')
        if (self.position > 0 and
            (token1 == tokens2[self.position - 1] or
             lem1 == lemmas2[self.position - 1])):
            return False
        if (self.position < len(tokens2) - 1 and
            (token1 == tokens2[self.position + 1] or
             lem1 == lemmas2[self.position + 1])):
            return False
        # Two words deletion ('supply of energy' -> 'supply')
        if (self.start + self.position > 1 and
            (token2 == tokens1[self.start + self.position - 2] or
             lem2 == lemmas1[self.start + self.position - 2])):
            return False
        if (self.start + self.position < len(tokens1) - 2 and
            (token2 == tokens1[self.start + self.position + 2] or
             lem2 == lemmas1[self.start + self.position + 2])):
            return False
        # Words stuck together ('policy maker' -> 'policymaker'
        # or 'policy-maker')
        if (self.start + self.position > 0 and
            (token2 == tokens1[self.start + self.position - 1] + token1 or
             token2 == tokens1[self.start + self.position - 1] +
                '-' + token1 or
             lem2 == lemmas1[self.start + self.position - 1] + lem1 or
             lem2 == lemmas1[self.start + self.position - 1] + '-' + lem1)):
            return False
        if (self.start + self.position < len(tokens1) - 1 and
            (token2 == token1 + tokens1[self.start + self.position + 1] or
             token2 == token1 + '-' +
                tokens1[self.start + self.position + 1] or
             lem2 == lem1 + lemmas1[self.start + self.position + 1] or
             lem2 == lem1 + '-' + lemmas1[self.start + self.position + 1])):
            return False
        # Words separated ('policymaker' or 'policy-maker' -> 'policy maker')
        if (self.position > 0 and
            (token1 == tokens2[self.position - 1] + token2 or
             token1 == tokens2[self.position - 1] + '-' + token2 or
             lem1 == lemmas2[self.position - 1] + lem2 or
             lem1 == lemmas2[self.position - 1] + '-' + lem2)):
            return False
        if (self.position < len(tokens2) - 1 and
            (token1 == token2 + tokens2[self.position + 1] or
             token1 == token2 + '-' + tokens2[self.position + 1] or
             lem1 == lem2 + lemmas2[self.position + 1] or
             lem1 == lem2 + '-' + lemmas2[self.position + 1])):
            return False
        # We need two extra checks compared to the words-stuck-together
        # situation, to detect the second substitution appearing because of
        # word separation. Indeed, in this case, contrary to
        # words-stuck-together, we can't rely on word shifts always being
        # present, since the destination can be cut shorter. In other words,
        # in the following case:
        # (1) i'll come anytime there
        # (2) i'll come any time
        # these checks let us exclude 'there' -> 'time' as a substitution (in
        # the words-stuck-together case, the word 'there' would be present in
        # both sentences, shifted).
        if (self.position > 0 and
            (tokens1[self.start + self.position - 1] ==
                tokens2[self.position - 1] + token2 or
             tokens1[self.start + self.position - 1] ==
                tokens2[self.position - 1] + '-' + token2 or
             lemmas1[self.start + self.position - 1] ==
                lemmas2[self.position - 1] + lem2 or
             lemmas1[self.start + self.position - 1] ==
                lemmas2[self.position - 1] + '-' + lem2)):
            return False
        if (self.position < len(tokens2) - 1 and
            (tokens1[self.start + self.position + 1] ==
                token2 + tokens2[self.position + 1] or
             tokens1[self.start + self.position + 1] ==
                token2 + '-' + tokens2[self.position + 1] or
             lemmas1[self.start + self.position + 1] ==
                lem2 + lemmas2[self.position + 1] or
             lemmas1[self.start + self.position + 1] ==
                lem2 + '-' + lemmas2[self.position + 1])):
            return False

        return True
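For reference, the levenshtein helper used in the minor-spelling-changes check computes plain edit distance (insertions, deletions, substitutions). The project's own implementation isn't shown here; a standard dynamic-programming sketch matching that usage:

def levenshtein(s1, s2):
    """Edit distance between s1 and s2."""
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current = [i + 1]
        for j, c2 in enumerate(s2):
            current.append(min(previous[j + 1] + 1,        # deletion
                               current[j] + 1,             # insertion
                               previous[j] + (c1 != c2)))  # substitution
        previous = current
    return previous[-1]

With this, levenshtein('gave', 'give') == 1, so the pair is dropped as a minor spelling change, while levenshtein('center', 'centre') == 2 is caught separately by is_same_ending_us_uk_spelling.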