def test_memoization_works(self):
        """A second lookup for the same tolerance must be far faster than the first.

        The automaton file is (re)generated up front so the first call pays the
        full load/construction cost; the second call should hit the cache.
        """
        # The file needs to be generated beforehand.
        clean_generated_dir()
        generate_automaton_to_file(2)

        manager = Manager()
        t0 = time()
        manager.get_for_tolerance(2)
        first_time = time() - t0

        t0 = time()
        manager.get_for_tolerance(2)
        second_time = time() - t0

        # Guard: on coarse clocks the memoized call can measure as exactly 0s,
        # which would make the ratio raise ZeroDivisionError. A zero second
        # measurement already proves the speedup, so accept it outright.
        assert second_time == 0 or first_time / second_time > 100
        clean_generated_dir()
    def profane_word_dictionaries(self) -> ProfaneWordDictionaries:
        """Return a deep copy of the profane word dictionaries.

        Custom dictionaries take precedence over the lazily loaded built-in
        ones; per-language extra words are merged in either way.  When deep
        analysis is enabled, per-language tries and the distance automaton
        files are (re)built as a side effect.
        """
        if self.custom_profane_word_dictionaries:
            dictionaries = deepcopy(self.custom_profane_word_dictionaries)
        else:
            # Built-in dictionaries are loaded on demand.
            self._load_profane_word_dictionaries()
            dictionaries = deepcopy(self._censor_dictionaries)

        # Fold the extra words into each language's set.
        for lang in self.languages:
            dictionaries[lang] |= self.extra_profane_word_dictionaries[lang]

        if self.deep_analysis:
            tries = {}
            for lang in self.languages:
                tries[lang] = Trie(words=dictionaries[lang], alphabet=self._alphabet)
            self._trie = tries
            for distance in range(self._MAX_MAX_DISTANCE + 1):
                generate_automaton_to_file(distance)

        return dictionaries
# Example #3
def test_find_all_words_within_tolerance_advanced():
    """Check trie/automaton intersection against a pre-computed CSV fixture.

    The fixture file stores triples of rows per case: (query, tolerance),
    the matched words, and their edit distances.  Each query is run through
    a Levenshtein automaton intersected with a trie built from the full
    English word list, and the resulting (distance, word) pairs must match
    the fixture exactly.
    """
    generate_automaton_to_file(0)
    generate_automaton_to_file(1)
    generate_automaton_to_file(2)

    # Context managers ensure the fixture files are closed even on failure
    # (the originals were never closed).
    with open(get_asset_file('english_words.txt')) as fa:
        words = [w.rstrip() for w in fa.readlines()]

    alphabet = set()
    trie = Trie(words, alphabet)

    with open(get_asset_file('english_words_matched.txt')) as fb:
        matches_reader = csv.reader(fb)
        try:
            while True:
                query, tolerance = next(matches_reader)
                tolerance = int(tolerance)

                matches = next(matches_reader)
                distances = [int(d) for d in next(matches_reader)]
                expected = set(zip(distances, matches))

                automaton = LevenshteinAutomaton(tolerance, query, alphabet)
                result = trie_automaton_intersection(automaton, trie, True)
                assert set(result) == expected
        except StopIteration:
            # Raised by next() at end of fixture: all cases consumed.
            pass
# Example #4
def test_find_all_words_within_tolerance():
    """Spot-check find_all_words_within_tolerance on a small dictionary."""
    generate_automaton_to_file(0)
    generate_automaton_to_file(1)
    generate_automaton_to_file(2)

    dictionary = [
        'car', 'rat', 'bat', 'carp', 'caterpillar', 'kangaroo', 'camel', 'dog',
        'hog', 'snake', 'crow', 'raven', 'mate'
    ]

    # (query, tolerance) -> expected match set.  The original asserted the
    # ('dog', 1) case twice verbatim; the duplicate is dropped here.
    # Note 'cat' itself is not in the dictionary, so tolerance 0 matches nothing.
    cases = [
        ('dog', 0, {'dog'}),
        ('dog', 1, {'dog', 'hog'}),
        ('dog', 2, {'dog', 'hog'}),
        ('cat', 0, set()),
        ('cat', 1, {'car', 'rat', 'bat'}),
        ('cat', 2, {'car', 'rat', 'bat', 'carp', 'mate'}),
    ]
    for query, tolerance, expected in cases:
        result = find_all_words_within_tolerance(query, dictionary, tolerance)
        assert set(result) == expected

    clean_generated_dir()
# Example #5
def main(tolerance):
    generate_automaton_to_file(tolerance)