Example #1
    def test_autocomplete(self, items, size, expected_results):
        lfucache = LFUCache(size)
        for item in items:
            lfucache.set(item, f'{item}_cached')
        results = lfucache.get_sorted_cache_keys()
        diff = DeepDiff(expected_results, results)
        assert not diff
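
A minimal sketch of the LFUCache API this test exercises, based only on the set/get calls shown in these examples (get returns -1 on a miss, per the usage in Example #4 and Example #5; the keys below are illustrative):

from fast_autocomplete.lfucache import LFUCache

cache = LFUCache(2)               # capacity: two entries
cache.set('alfa', 'alfa_cached')
cache.set('bmw', 'bmw_cached')
cache.get('alfa')                 # hit; bumps the use count of 'alfa'
cache.set('gmc', 'gmc_cached')    # evicts the least frequently used entry
print(cache.get('bmw'))           # -1 if 'bmw' was the entry evicted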
Example #2
    def __init__(
            self,
            words,
            synonyms=None,
            full_stop_words=None,
            logger=None,
            valid_chars_for_string=None,
            valid_chars_for_integer=None,
            valid_chars_for_node_name=None,
    ):
        """
        Initializes the Autocomplete module

        :param words: A dictionary of words mapped to their context
        :param synonyms: (optional) A dictionary of words to their synonyms.
                         Synonym words should appear only here and not be repeated in the words parameter.
        """
        self._lock = Lock()
        self._dwg = None
        self._raw_synonyms = synonyms or {}
        self._lfu_cache = LFUCache(self.CACHE_SIZE)
        self._clean_synonyms, self._partial_synonyms = self._get_clean_and_partial_synonyms()
        self._reverse_synonyms = self._get_reverse_synonyms(self._clean_synonyms)
        self._full_stop_words = set(full_stop_words) if full_stop_words else None
        self.logger = logger
        self.words = words
        self.normalizer = Normalizer(
            valid_chars_for_string=valid_chars_for_string,
            valid_chars_for_integer=valid_chars_for_integer,
            valid_chars_for_node_name=valid_chars_for_node_name,
        )
        new_words = self._get_partial_synonyms_to_words()
        self.words.update(new_words)
        self._populate_dwg()
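
As a quick usage sketch of this constructor (illustrative data; assumes the top-level fast_autocomplete export and the supporting imports this excerpt omits):

from fast_autocomplete import AutoComplete

words = {'bmw': {'count': 20}, 'alfa romeo': {'count': 5}}
synonyms = {'bmw': ['beemer'], 'alfa romeo': ['alfa']}

autocomplete = AutoComplete(words=words, synonyms=synonyms)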
Example #3
    def test_get_multithreading(self):
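        # the key string is heavily skewed towards 'a', so most operations contend on one hot key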
        keys = 'aaaaaaaaaaaaaaaaaaaaaaaaaaabbc'
        lfucache = LFUCache(2)

        def _do_set(cache, key):
            cache.set(key, f'{key}_cached')

        def _do_get(cache, key):
            return cache.get(key)

        def _key_gen():
            i = 0
            while i < 30000:
                i += 1
                yield random.choice(keys)

        def _random_func(cache, key):
            return random.choice([_do_get, _do_get, _do_set])(cache, key)

        with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
            futures = (executor.submit(_random_func, lfucache, key)
                       for key in _key_gen())
            for future in concurrent.futures.as_completed(futures):
                future.result()
Example #4
import string
from fast_autocomplete.lfucache import LFUCache

valid_chars_for_string = set(string.ascii_lowercase)
valid_chars_for_integer = set(string.digits)
valid_chars_for_node_name = {
    ' ', '-', ':'
} | valid_chars_for_string | valid_chars_for_integer

NORMALIZED_CACHE_SIZE = 2048
MAX_WORD_LENGTH = 40

_normalized_lfu_cache = LFUCache(NORMALIZED_CACHE_SIZE)


def normalize_node_name(name):
    name = name[:MAX_WORD_LENGTH]
    result = _normalized_lfu_cache.get(name)
    if result == -1:
        result = _get_normalized_node_name(name)
        _normalized_lfu_cache.set(name, result)
    return result


def _remove_invalid_chars(x):
    # keep only valid node-name characters; collapse '--' runs into '-'
    result = x in valid_chars_for_node_name
    if x == '-' == _remove_invalid_chars.prev_x:
        result = False
    _remove_invalid_chars.prev_x = x
    return result


# the predicate keeps its state on the function object; initialize it so
# the first call does not raise AttributeError
_remove_invalid_chars.prev_x = ''
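
_get_normalized_node_name is referenced above but not shown in this excerpt. A hypothetical wiring that is consistent with the _remove_invalid_chars predicate (an assumption, not the library's actual body):

def _get_normalized_node_name(name):
    # hypothetical sketch: lower-case, then keep only the characters the
    # stateful predicate accepts (invalid characters and repeated dashes drop out)
    _remove_invalid_chars.prev_x = ''  # reset the dash-collapsing state for each call
    return ''.join(filter(_remove_invalid_chars, name.lower()))


print(normalize_node_name('Alfa--Romeo 4C!'))  # -> 'alfa-romeo 4c'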
Example #5
class AutoComplete:

    CACHE_SIZE = 2048
    SHOULD_INCLUDE_COUNT = True

    def __init__(
            self,
            words,
            synonyms=None,
            full_stop_words=None,
            logger=None,
            valid_chars_for_string=None,
            valid_chars_for_integer=None,
            valid_chars_for_node_name=None,
    ):
        """
        Initializes the Autocomplete module

        :param words: A dictionary of words mapped to their context
        :param synonyms: (optional) A dictionary of words to their synonyms.
                         Synonym words should appear only here and not be repeated in the words parameter.
        """
        self._lock = Lock()
        self._dwg = None
        self._raw_synonyms = synonyms or {}
        self._lfu_cache = LFUCache(self.CACHE_SIZE)
        self._clean_synonyms, self._partial_synonyms = self._get_clean_and_partial_synonyms()
        self._reverse_synonyms = self._get_reverse_synonyms(self._clean_synonyms)
        self._full_stop_words = set(full_stop_words) if full_stop_words else None
        self.logger = logger
        self.words = words
        self.normalizer = Normalizer(
            valid_chars_for_string=valid_chars_for_string,
            valid_chars_for_integer=valid_chars_for_integer,
            valid_chars_for_node_name=valid_chars_for_node_name,
        )
        new_words = self._get_partial_synonyms_to_words()
        self.words.update(new_words)
        self._populate_dwg()

    def _get_clean_and_partial_synonyms(self):
        """
        Synonyms are words that should produce the same results.

        - For example `beemer` and `bmw` should both give you `bmw`.
        - `alfa` and `alfa romeo` should both give you `alfa romeo`.

        The synonyms get divided into 2 groups:

        1. Clean synonyms: the two words share little or no text. For example `beemer` vs. `bmw`.
        2. Partial synonyms: the synonym is a prefix of the main word. For example `alfa` vs. `alfa romeo`, or `gm` vs. `gmc`.

        """
        clean_synonyms = {}
        partial_synonyms = {}

        for key, synonyms in self._raw_synonyms.items():
            key = key.strip().lower()
            _clean = []
            _partial = []
            for syn in synonyms:
                syn = syn.strip().lower()
                if key.startswith(syn):
                    _partial.append(syn)
                else:
                    _clean.append(syn)
            if _clean:
                clean_synonyms[key] = _clean
            if _partial:
                partial_synonyms[key] = _partial

        return clean_synonyms, partial_synonyms

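    # Illustration of the split above with assumed data:
    #   {'alfa romeo': ['alfa', 'bella macchina'], 'bmw': ['beemer']}
    # splits into:
    #   partial: {'alfa romeo': ['alfa']}    (since 'alfa romeo'.startswith('alfa'))
    #   clean:   {'alfa romeo': ['bella macchina'], 'bmw': ['beemer']}
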
    def _get_reverse_synonyms(self, synonyms):
        result = {}
        if synonyms:
            for key, value in synonyms.items():
                for item in value:
                    result[item] = key
        return result

    def _get_partial_synonyms_to_words(self):
        new_words = {}
        for key, value in self.words.items():
            # values are usually mutable dicts, so copy before mutating
            try:
                value = value.copy()
            # no copy() method means the value must be a namedtuple; rebuild it
            except Exception:
                new_value = value._asdict()
                new_value[ORIGINAL_KEY] = key
                value = type(value)(**new_value)
            else:
                value[ORIGINAL_KEY] = key
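            # e.g. with partial synonyms {'alfa romeo': ['alfa']} (assumed data),
            # the words key 'alfa romeo giulia' is also registered as 'alfa giulia'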
            for syn_key, syns in self._partial_synonyms.items():
                if key.startswith(syn_key):
                    for syn in syns:
                        new_key = key.replace(syn_key, syn)
                        new_words[new_key] = value
        return new_words

    def _populate_dwg(self):
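        # double-checked locking: only the first caller builds the dawg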
        if not self._dwg:
            with self._lock:
                if not self._dwg:
                    self._dwg = _DawgNode()
                    for word, value in self.words.items():
                        original_key = value.get(ORIGINAL_KEY)
                        word = word.strip().lower()
                        count = value.get('count', 0)
                        leaf_node = self.insert_word_branch(
                            word,
                            original_key=original_key,
                            count=count
                        )
                        if self._clean_synonyms:
                            for synonym in self._clean_synonyms.get(word, []):
                                self.insert_word_branch(
                                    synonym,
                                    leaf_node=leaf_node,
                                    add_word=False,
                                    count=count
                                )

    def insert_word_callback(self, word):
        """
        Hook that runs after a word is inserted. Override in subclasses if needed.
        """
        pass

    def insert_word_branch(self, word, leaf_node=None, add_word=True, original_key=None, count=0):
        """
        Inserts a word into the Dawg.

        :param word: The word to be inserted as a branch of the dawg
        :param leaf_node: (optional) The leaf node to merge the branch into in the dawg.
        :param add_word: (Boolean, default: True) Add the word itself at the end of the branch.
                          Usually this is set to False if we are merging into a leaf node and do not
                          want to add the actual word there.
        :param original_key: If the word that is being added was originally another word.
                             For example with synonyms, you might be inserting the word `beemer` but the
                             original key is `bmw`. This parameter might be removed in the future.

        """
        if leaf_node:
            temp_leaf_node = self._dwg.insert(
                word[:-1],
                add_word=add_word,
                original_key=original_key,
                count=count,
                insert_count=self.SHOULD_INCLUDE_COUNT
            )
            temp_leaf_node.children[word[-1]] = leaf_node
        else:
            leaf_node = self._dwg.insert(
                word,
                original_key=original_key,
                count=count,
                insert_count=self.SHOULD_INCLUDE_COUNT
            )
        self.insert_word_callback(word)
        return leaf_node

    def _find_and_sort(self, word, max_cost, size):
        output_keys_set = set()
        results, find_steps = self._find(word, max_cost, size)
        results_keys = list(results.keys())
        results_keys.sort()
        for key in results_keys:
            for output_items in results[key]:
                for i, item in enumerate(output_items):
                    reversed_item = self._reverse_synonyms.get(item)
                    if reversed_item:
                        output_items[i] = reversed_item
                    elif item not in self.words:
                        output_items[i] = item
                output_items_str = DELIMITER.join(output_items)
                if output_items and output_items_str not in output_keys_set:
                    output_keys_set.add(output_items_str)
                    yield output_items
                    if len(output_keys_set) >= size:
                        return

    def get_tokens_flat_list(self, word, max_cost=3, size=10):
        """
        Gets a flat list of tokens.
        This deliberately runs this class's original search function rather
        than a subclass override.
        """
        result = AutoComplete.search(self, word, max_cost=max_cost, size=size)
        return [item for sublist in result for item in sublist]

    def get_word_context(self, word):
        """
        Gets the word's context from the words dictionary
        """
        word = self.normalizer.normalize_node_name(word)
        return self.words.get(word)

    def search(self, word, max_cost=2, size=5):
        """
        :param word: The word to return autocomplete results for
        :param max_cost: Maximum Levenshtein edit distance to consider when calculating results
        :param size: The maximum number of results to return
        """
        word = self.normalizer.normalize_node_name(word)
        if not word:
            return []
        key = f'{word}-{max_cost}-{size}'
        result = self._lfu_cache.get(key)
        if result == -1:
            result = list(self._find_and_sort(word, max_cost, size))
            self._lfu_cache.set(key, result)
        return result

    @staticmethod
    def _len_results(results):
        return sum(map(len, results.values()))

    @staticmethod
    def _is_enough_results(results, size):
        return AutoComplete._len_results(results) >= size

    def _is_stop_word_condition(self, matched_words, matched_prefix_of_last_word):
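        # a full stop word matched exactly as the last word ends the search early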
        return (
            self._full_stop_words
            and matched_words
            and matched_words[-1] in self._full_stop_words
            and not matched_prefix_of_last_word
        )

    def _find(self, word, max_cost, size, call_count=0):
        """
        Returns a dict that maps edit distance to lists of matched word
        sequences within the given maximum distance of the target word,
        along with the list of find steps taken.
        """
        results = defaultdict(list)
        fuzzy_matches = defaultdict(list)
        rest_of_results = {}
        fuzzy_matches_len = 0

        fuzzy_min_distance = min_distance = INF
        matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=word)

        last_word = matched_prefix_of_last_word + rest_of_word

        if matched_words:
            results[0] = [matched_words.copy()]
            min_distance = 0
            # under certain conditions, when a full stop word has been matched, do not bother finding more matches
            if self._is_stop_word_condition(matched_words, matched_prefix_of_last_word):
                find_steps = [FindStep.start]
                return results, find_steps
        if len(rest_of_word) < 3:
            find_steps = [FindStep.descendants_only]
            self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=1)
        else:
            find_steps = [FindStep.fuzzy_try]
            word_chunks = deque(filter(lambda x: x, last_word.split(' ')))
            new_word = word_chunks.popleft()

            # TODO: experiment with the number here
            # 'in los angeles' gets cut into `in los` so it becomes a closer match to `in lodi`
            # but if the number was bigger, we could have matched with `in los angeles`
            while len(new_word) < 5 and word_chunks:
                new_word = f'{new_word} {word_chunks.popleft()}'
            fuzzy_rest_of_word = ' '.join(word_chunks)

            for _word in self.words:
                if abs(len(_word) - len(new_word)) > max_cost:
                    continue
                dist = levenshtein_distance(new_word, _word)
                if dist < max_cost:
                    fuzzy_matches_len += 1
                    _value = self.words[_word].get(ORIGINAL_KEY, _word)
                    fuzzy_matches[dist].append(_value)
                    fuzzy_min_distance = min(fuzzy_min_distance, dist)
                    if fuzzy_matches_len >= size or dist < 2:
                        break
            if fuzzy_matches_len:
                find_steps.append(FindStep.fuzzy_found)
                if fuzzy_rest_of_word:
                    call_count += 1
                    if call_count < 2:
                        rest_of_results, rest_find_steps = self._find(word=fuzzy_rest_of_word, max_cost=max_cost, size=size, call_count=call_count)
                        find_steps.append({FindStep.rest_of_fuzzy_round2: rest_find_steps})
                for _word in fuzzy_matches[fuzzy_min_distance]:
                    if rest_of_results:
                        rest_of_results_min_key = min(rest_of_results.keys())
                        for _rest_of_matched_word in rest_of_results[rest_of_results_min_key]:
                            results[fuzzy_min_distance].append(matched_words + [_word] + _rest_of_matched_word)
                    else:
                        results[fuzzy_min_distance].append(matched_words + [_word])
                        _matched_prefix_of_last_word_b, not_used_rest_of_word, fuzzy_new_node, _matched_words_b = self._prefix_autofill(word=_word)
                        if self._is_stop_word_condition(matched_words=_matched_words_b, matched_prefix_of_last_word=_matched_prefix_of_last_word_b):
                            break
                        self._add_descendants_words_to_results(node=fuzzy_new_node, size=size, matched_words=matched_words, results=results, distance=fuzzy_min_distance)

            if matched_words and not self._is_enough_results(results, size):
                find_steps.append(FindStep.not_enough_results_add_some_descandants)
                total_min_distance = min(min_distance, fuzzy_min_distance)
                self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=total_min_distance+1)

        return results, find_steps

    def _prefix_autofill(self, word, node=None):
        len_prev_rest_of_last_word = INF
        matched_words = []
        matched_words_set = set()

        def _add_words(words):
            is_added = False
            for word in words:
                if word not in matched_words_set:
                    matched_words.append(word)
                    matched_words_set.add(word)
                    is_added = True
            return is_added

        matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node)
        _add_words(matched_words_part)
        result = (matched_prefix_of_last_word, rest_of_word, node, matched_words)
        len_rest_of_last_word = len(rest_of_word)

        while len_rest_of_last_word and len_rest_of_last_word < len_prev_rest_of_last_word:
            word = matched_prefix_of_last_word + rest_of_word
            word = word.strip()
            len_prev_rest_of_last_word = len_rest_of_last_word
            matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node=self._dwg, matched_condition_ever=matched_condition_ever, matched_condition_in_branch=matched_condition_in_branch)
            is_added = _add_words(matched_words_part)
            if is_added is False:
                break
            len_rest_of_last_word = len(rest_of_word)
            result = (matched_prefix_of_last_word, rest_of_word, node, matched_words)

        return result

    def prefix_autofill_part_condition(self, node):
        # overridable hook used together with PREFIX_AUTOFILL_PART_CONDITION_SUFFIX;
        # the base implementation matches nothing
        pass

    PREFIX_AUTOFILL_PART_CONDITION_SUFFIX = ''

    def _add_to_matched_words(self, node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word):
        if matched_words:
            last_matched_word = matched_words[-1].replace(self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX, '')
            if node.value.startswith(last_matched_word):
                matched_words.pop()
        value = node.value
        if self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX:
            if self._node_word_info_matches_condition(node, self.prefix_autofill_part_condition):
                matched_condition_in_branch = True
                if matched_condition_ever and matched_prefix_of_last_word:
                    value = f"{matched_prefix_of_last_word}{self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX}"
        matched_words.append(value)
        return matched_words, matched_condition_in_branch

    def _prefix_autofill_part(self, word, node=None, matched_condition_ever=False, matched_condition_in_branch=False):
        node = node or self._dwg
        que = deque(word)

        matched_prefix_of_last_word = ''
        matched_words = []
        nodes_that_words_were_extracted = set()

        while que:
            char = que.popleft()

            if node.children:
                if char not in node.children:
                    space_child = node.children.get(' ')
                    if space_child and char in space_child.children:
                        node = space_child
                    else:
                        que.appendleft(char)
                        break
                node = node.children[char]
                if char != ' ' or matched_prefix_of_last_word:
                    matched_prefix_of_last_word += char
                if node.word:
                    if que:
                        next_char = que[0]
                        if next_char != ' ':
                            continue
                    matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word)
                    nodes_that_words_were_extracted.add(node)
                    matched_prefix_of_last_word = ''
            else:
                if char == ' ':
                    node = self._dwg
                    if matched_condition_in_branch:
                        matched_condition_ever = True
                else:
                    que.appendleft(char)
                    break

        if not que and node.word and node not in nodes_that_words_were_extracted:
            matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word)
            matched_prefix_of_last_word = ''

        rest_of_word = "".join(que)
        if matched_condition_in_branch:
            matched_condition_ever = True

        return matched_prefix_of_last_word, rest_of_word, node, matched_words, matched_condition_ever, matched_condition_in_branch

    def _add_descendants_words_to_results(self, node, size, matched_words, results, distance, should_traverse=True):
        descendant_words = list(node.get_descendants_words(size, should_traverse, full_stop_words=self._full_stop_words))
        extended = _extend_and_repeat(matched_words, descendant_words)
        if extended:
            results[distance].extend(extended)
        return distance

    def _node_word_info_matches_condition(self, node, condition):
        _word = node.word
        word_info = self.words.get(_word)
        if word_info:
            return condition(word_info)
        else:
            return False

    def get_all_descendent_words_for_condition(self, word, size, condition):
        new_tokens = []

        matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
        if not rest_of_word and self._node_word_info_matches_condition(node, condition):
            found_nodes_gen = node.get_descendants_nodes(size, insert_count=self.SHOULD_INCLUDE_COUNT)
            for node in found_nodes_gen:
                if self._node_word_info_matches_condition(node, condition):
                    new_tokens.append(node.word)
        return new_tokens

    def update_count_of_word(self, word, count=None, offset=None):
        """
        Update the count attribute of a node in the dwg. This only affects the autocomplete
        object and not the original count of the node in the data that was fed into fast_autocomplete.
        """
        matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
        if node:
            if offset:
                with self._lock:
                    node.count += offset
            elif count:
                with self._lock:
                    node.count = count
        else:
            raise NodeNotFound(f'Unable to find a node for word {word}')
        return node.count

    def get_count_of_word(self, word):
        return self.update_count_of_word(word)
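
Finally, an end-to-end sketch of the class above (illustrative data and outputs; assumes the supporting imports such as Lock, LFUCache, Normalizer, and the dawg node class that this excerpt omits):

words = {
    'bmw': {'count': 20},
    'alfa romeo': {'count': 5},
    'alfa romeo giulia': {'count': 7},
}
synonyms = {'bmw': ['beemer']}

autocomplete = AutoComplete(words=words, synonyms=synonyms)
print(autocomplete.search(word='beemer', max_cost=2, size=3))    # e.g. [['bmw']]
print(autocomplete.search(word='alfa rom', max_cost=2, size=3))  # the alfa romeo entries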