Example #1
    def exact_suggest(self, term):
        """
        Suggest matching objects exactly matching the given term
        """
        providers = self._get_all_providers_by_autocompleter()
        if providers is None:
            return []

        # If we have a cached version of the search results available, return it!
        cache_key = EXACT_CACHE_BASE_NAME % (self.name, term,)
        if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
            return self.__class__._deserialize_data(REDIS.get(cache_key))
        provider_results = OrderedDict()

        # Get the normalized term variations we need to search for. A single term
        # could turn into multiple terms we need to search.
        norm_terms = utils.get_norm_term_variations(term)
        if len(norm_terms) == 0:
            return []

        # Get the matched result IDs
        pipe = REDIS.pipeline()
        for provider in providers:
            provider_name = provider.provider_name

            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            # Do not attempt zunionstore on empty list because redis errors out.
            if len(keys) == 0:
                continue
            pipe.zunionstore("djac.results", keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
        results = [i for i in pipe.execute() if isinstance(i, list)]

        # Create a dict mapping provider to result IDs
        for provider in providers:
            provider_name = provider.provider_name

            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            exact_ids = results.pop(0)
            provider_results[provider_name] = exact_ids[:MAX_RESULTS]

        results = self._get_results_from_ids(provider_results)

        # If told to, cache the final results for CACHE_TIMEOUT seconds
        if settings.CACHE_TIMEOUT:
            REDIS.setex(cache_key, settings.CACHE_TIMEOUT, self.__class__._serialize_data(results))
        return results
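
The core pattern in exact_suggest is a ZUNIONSTORE with aggregate='MIN' across one exact-match sorted set per normalized term variation, followed by a ZRANGE for the best-scored IDs. Here is a minimal standalone sketch of that pattern, assuming a local Redis and hypothetical key names (not the library's own):

    # Minimal sketch of the union-then-range pattern; key names are hypothetical
    # and a local Redis server is assumed.
    import redis

    r = redis.Redis()
    # One exact-match sorted set per normalized variation of the term.
    r.zadd('sketch.exact.stock.apple', {'obj:1': 1, 'obj:2': 2})
    r.zadd('sketch.exact.stock.appl', {'obj:2': 5, 'obj:3': 3})

    pipe = r.pipeline()
    # MIN keeps the best (lowest) score for IDs present in more than one set.
    pipe.zunionstore('sketch.results',
                     ['sketch.exact.stock.apple', 'sketch.exact.stock.appl'],
                     aggregate='MIN')
    pipe.zrange('sketch.results', 0, 9)  # top 10 by ascending score
    _, ids = pipe.execute()
    print(ids)  # [b'obj:1', b'obj:2', b'obj:3']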
Example #2
    def suggest(self, term, facets=None):
        """
        Suggest matching objects, given a term
        """
        facets = facets or []
        providers = self._get_all_providers_by_autocompleter()
        if providers is None:
            return []

        # If we have a cached version of the search results available, return it!
        hashed_facets = self.hash_facets(facets)
        cache_key = CACHE_BASE_NAME % \
            (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS), hashed_facets)
        if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
            return self.__class__._deserialize_data(REDIS.get(cache_key))

        # Get the normalized term variations we need to search for each term. A single term
        # could turn into multiple terms we need to search.
        norm_terms = utils.get_norm_term_variations(term)
        if len(norm_terms) == 0:
            return []

        provider_results = OrderedDict()

        # Generate a unique identifier to be used for storing intermediate results. This is to
        # prevent redis key collisions between competing suggest / exact_suggest calls.
        base_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
        base_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
        # Same idea as the base_result_key, but for when we are using facets in the suggest call.
        facet_final_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
        facet_final_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
        # As we search, we may store a number of intermediate result sets. We keep track
        # of every key we create and delete them all at the end so nothing is left over.
        # We initialize with the base keys, all of which could end up being used.
        keys_to_delete = {
            base_result_key, base_exact_match_key, facet_final_result_key,
            facet_final_exact_match_key
        }

        facet_keys_set = set()
        if len(facets) > 0:
            # use from_iterable to flatten the nested facet lists into a single iterable
            sub_facets = itertools.chain.from_iterable(
                [facet['facets'] for facet in facets])
            facet_keys_set = set(
                [sub_facet['key'] for sub_facet in sub_facets])

        MOVE_EXACT_MATCHES_TO_TOP = registry.get_autocompleter_setting(
            self.name, 'MOVE_EXACT_MATCHES_TO_TOP')
        # Get the max results autocompleter setting
        MAX_RESULTS = registry.get_autocompleter_setting(
            self.name, 'MAX_RESULTS')

        pipe = REDIS.pipeline()
        for provider in providers:
            provider_name = provider.provider_name

            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(
                self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                continue

            term_result_keys = []
            for norm_term in norm_terms:
                norm_words = norm_term.split()
                keys = [
                    PREFIX_BASE_NAME % (
                        provider_name,
                        norm_word,
                    ) for norm_word in norm_words
                ]
                if len(keys) == 1:
                    term_result_keys.append(keys[0])
                else:
                    term_result_key = base_result_key + '.' + norm_term
                    term_result_keys.append(term_result_key)
                    keys_to_delete.add(term_result_key)
                    pipe.zinterstore(term_result_key, keys, aggregate='MIN')

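            # Union the per-variation candidate sets into a single result set for this provider.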
            if len(term_result_keys) == 1:
                final_result_key = term_result_keys[0]
            else:
                final_result_key = base_result_key
                pipe.zunionstore(final_result_key,
                                 term_result_keys,
                                 aggregate='MIN')

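            # Facets apply to this provider only if it declares every requested facet key.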
            use_facets = False
            if len(facet_keys_set) > 0:
                provider_keys_set = set(provider.get_facets())
                if facet_keys_set.issubset(provider_keys_set):
                    use_facets = True

            if use_facets:
                facet_result_keys = []
                for facet in facets:
                    try:
                        facet_type = facet['type']
                        if facet_type not in ['and', 'or']:
                            continue
                        facet_list = facet['facets']
                        facet_set_keys = []
                        for facet_dict in facet_list:
                            facet_set_key = \
                                FACET_SET_BASE_NAME % (provider_name, facet_dict['key'], facet_dict['value'],)
                            facet_set_keys.append(facet_set_key)

                        if len(facet_set_keys) == 1:
                            facet_result_keys.append(facet_set_keys[0])
                        else:
                            facet_result_key = RESULT_SET_BASE_NAME % str(
                                uuid.uuid4())
                            facet_result_keys.append(facet_result_key)
                            keys_to_delete.add(facet_result_key)
                            if facet_type == 'and':
                                pipe.zinterstore(facet_result_key,
                                                 facet_set_keys,
                                                 aggregate='MIN')
                            else:
                                pipe.zunionstore(facet_result_key,
                                                 facet_set_keys,
                                                 aggregate='MIN')
                    except KeyError:
                        continue

                # We want to calculate the intersection of all the intermediate facet sets created so far
                # along with the final result set. So we append the final_result_key to the list of
                # facet_result_keys and store the intersection in the faceted final result set.
                pipe.zinterstore(facet_final_result_key,
                                 facet_result_keys + [final_result_key],
                                 aggregate='MIN')

            if use_facets:
                pipe.zrange(facet_final_result_key, 0, MAX_RESULTS - 1)
            else:
                pipe.zrange(final_result_key, 0, MAX_RESULTS - 1)

            # Get exact matches
            if MOVE_EXACT_MATCHES_TO_TOP:
                keys = []
                for norm_term in norm_terms:
                    keys.append(EXACT_BASE_NAME % (
                        provider_name,
                        norm_term,
                    ))
                # Do not attempt zunionstore on empty list because redis errors out.
                if len(keys) == 0:
                    continue

                if len(keys) == 1:
                    final_exact_match_key = keys[0]
                else:
                    final_exact_match_key = base_exact_match_key
                    pipe.zunionstore(final_exact_match_key,
                                     keys,
                                     aggregate='MIN')

                # If facets are being used for this suggest call, we need to make sure that
                # exact term matches don't bypass the requirement of having matching facet values.
                # To achieve this, we intersect all faceted matches (exact-and-non-exact) with
                # all exact matches.
                if use_facets:
                    pipe.zinterstore(facet_final_exact_match_key,
                                     facet_result_keys +
                                     [final_exact_match_key],
                                     aggregate='MIN')
                    pipe.zrange(facet_final_exact_match_key, 0,
                                MAX_RESULTS - 1)
                else:
                    pipe.zrange(final_exact_match_key, 0, MAX_RESULTS - 1)

        pipe.delete(*keys_to_delete)

        results = [i for i in pipe.execute() if isinstance(i, list)]

        # Total number of results currently allocated to providers
        total_allocated_results = 0
        # Maximum number of results allowed per provider
        provider_max_results = OrderedDict()

        # Get an initial max/provider based on an equal share of MAX_RESULTS
        for provider in providers:
            provider_name = provider.provider_name
            results_per_provider = self.normalize_rounding(MAX_RESULTS /
                                                           len(providers))
            provider_max_results[provider_name] = results_per_provider
            total_allocated_results += results_per_provider

        # Because each share is rounded to the nearest result, the total number of results
        # allocated could be less/more than the max allowed. Here we adjust each provider's
        # allocation until the total equals the max allowed.
        diff = 1 if total_allocated_results < MAX_RESULTS else -1
        while total_allocated_results != MAX_RESULTS:
            for provider in providers:
                provider_name = provider.provider_name
                provider_max_results[provider_name] += diff
                total_allocated_results += diff
                if total_allocated_results == MAX_RESULTS:
                    break

        # Result IDs per provider
        provider_result_ids = OrderedDict()
        # Number of results we will be getting from each provider
        provider_num_results = OrderedDict()
        # Total pool of extra result slots
        total_surplus = 0
        # Number of extra result slots a provider could use
        provider_deficits = OrderedDict()

        # Create a dict mapping provider to number of result IDs available
        # We combine the 2 different kinds of results into 1 result ID list per provider.
        # Also keep track of number of extra result slots available when a provider does not
        # use up its allocated slots.
        for provider in providers:
            provider_name = provider.provider_name

            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(
                self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                # if provider will not be used due to min_letters, put all result slots
                # in surplus pool then continue
                total_surplus += provider_max_results[provider_name]
                continue

            ids = results.pop(0)
            # We merge exact matches with base matches by moving them to
            # the head of the results
            if MOVE_EXACT_MATCHES_TO_TOP:
                exact_ids = results.pop(0)

                # Need to reverse exact IDs so high scores end up behind low scores,
                # since we are inserting at the front of the list.
                exact_ids.reverse()

                # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
                # them from the regular ID list if necessary
                for j in exact_ids:
                    if j in ids:
                        ids.remove(j)
                    ids.insert(0, j)
            provider_result_ids[provider] = ids
            surplus = provider_max_results[provider_name] - len(ids)
            if surplus >= 0:
                provider_num_results[provider_name] = len(ids)
                total_surplus += surplus
            else:
                # provider uses its full base allocation
                provider_num_results[provider_name] = provider_max_results[
                    provider_name]
                # create dict of how many extra each provider actually needs
                provider_deficits[provider_name] = -surplus

        # If there are extra result slots available, go through each provider that
        # needs extra results, and hand them out until there are no more to give
        while total_surplus > 0:
            # Check whether any provider still needs extra results; if not, exit the loop,
            # otherwise we would get caught in an infinite loop
            provider_with_deficit_exists = False
            for provider_name in provider_deficits:
                deficit = provider_deficits[provider_name]
                if deficit > 0:
                    provider_with_deficit_exists = True
            if not provider_with_deficit_exists:
                break
            for provider_name in provider_deficits:
                deficit = provider_deficits[provider_name]
                if deficit > 0:
                    provider_num_results[provider_name] += 1
                    provider_deficits[provider_name] -= 1
                    total_surplus -= 1

                if total_surplus <= 0:
                    break

        # At this point we should have the final number of results we will be getting
        # from each provider, so we get from provider and put in final result IDs dict
        for provider in providers:
            provider_name = provider.provider_name
            try:
                num_results = provider_num_results[provider_name]
                provider_results[provider_name] = provider_result_ids[
                    provider][:num_results]
            except KeyError:
                continue

        results = self._get_results_from_ids(provider_results)

        # If told to, cache the final results for CACHE_TIMEOUT seconds
        if settings.CACHE_TIMEOUT:
            REDIS.setex(cache_key, settings.CACHE_TIMEOUT,
                        self.__class__._serialize_data(results))
        return results
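
The allocation bookkeeping above boils down to a small round-robin loop: every provider starts with an equal share of MAX_RESULTS, unused slots go into a surplus pool, and the pool is handed back one slot at a time to providers that still have IDs to show. A standalone distillation, with hypothetical provider names and counts:

    # Round-robin redistribution of surplus result slots; names and numbers
    # are hypothetical.
    from collections import OrderedDict

    def redistribute(num_results, deficits, surplus):
        while surplus > 0:
            if not any(d > 0 for d in deficits.values()):
                break  # no provider needs more; avoid an infinite loop
            for name in deficits:
                if deficits[name] > 0:
                    num_results[name] += 1
                    deficits[name] -= 1
                    surplus -= 1
                if surplus <= 0:
                    break
        return num_results

    num_results = OrderedDict([('stock', 3), ('ind', 3)])  # base allocations
    deficits = OrderedDict([('stock', 4), ('ind', 1)])     # extra IDs each could fill
    print(redistribute(num_results, deficits, 3))
    # OrderedDict([('stock', 5), ('ind', 4)])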
Example #3
    def suggest(self, term):
        """
        Suggest matching objects, given a term
        """
        providers = self._get_all_providers_by_autocompleter()
        if providers is None:
            return []

        # If we have a cached version of the search results available, return it!
        cache_key = CACHE_BASE_NAME % \
            (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS))
        if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
            return self.__class__._deserialize_data(REDIS.get(cache_key))

        # Get the normalized term variations we need to search for. A single term
        # could turn into multiple terms we need to search.
        norm_terms = utils.get_norm_term_variations(term)
        if len(norm_terms) == 0:
            return []

        provider_results = OrderedDict()

        # Get the matched result IDs
        total_results = 0
        if settings.ELASTIC_RESULTS:
            for provider in providers:
                total_results += registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        pipe = REDIS.pipeline()
        for provider in providers:
            provider_name = provider.provider_name
            # If ELASTIC_RESULTS is on, use total_results (the sum of every provider's MAX_RESULTS).
            if settings.ELASTIC_RESULTS:
                MAX_RESULTS = total_results
            else:
                MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                continue

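            # For each normalized term variation, intersect the per-word prefix sets so only
            # objects matching every word of the variation survive.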
            result_keys = []
            for norm_term in norm_terms:
                norm_words = norm_term.split()
                result_key = "djac.results.%s" % (norm_term,)
                result_keys.append(result_key)
                keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
                pipe.zinterstore(result_key, keys, aggregate='MIN')
            pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
            for result_key in result_keys:
                pipe.delete(result_key)
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

            # Get exact matches
            if settings.MOVE_EXACT_MATCHES_TO_TOP:
                keys = []
                for norm_term in norm_terms:
                    keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
                # Do not attempt zunionstore on empty list because redis errors out.
                if len(keys) == 0:
                    continue

                pipe.zunionstore("djac.results", keys, aggregate='MIN')
                pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
            pipe.delete("djac.results")

        results = [i for i in pipe.execute() if isinstance(i, list)]

        # Init mappings and surplus pool for elastic result distribution
        deficits = {}
        # Mapping required to store result_ids outside of the per-provider loop before
        # fetching items / redistributing available result slots in elastic results
        provider_result_ids = {}
        max_results_dict = {}
        # total pool of available result slots
        total_surplus = 0
        # Create a dict mapping provider to result IDs
        # We combine the 2 different kinds of results into 1 result ID list per provider.
        for provider in providers:
            provider_name = provider.provider_name

            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                # if provider will not be used due to min_letters, put all result slots
                # in surplus pool then continue
                total_surplus += MAX_RESULTS
                continue

            ids = results.pop(0)
            # We merge exact matches with base matches by moving them to
            # the head of the results
            if settings.MOVE_EXACT_MATCHES_TO_TOP:
                exact_ids = results.pop(0)

                # Need to reverse exact IDs so high scores end up behind low scores,
                # since we are inserting at the front of the list.
                exact_ids.reverse()

                # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
                # them from the regular ID list if necessary
                for j in exact_ids:
                    if j in ids:
                        ids.remove(j)
                    ids.insert(0, j)
            provider_result_ids[provider] = ids

            if settings.ELASTIC_RESULTS:
                surplus = MAX_RESULTS - len(ids)
                if surplus >= 0:
                    max_results_dict[provider] = len(ids)
                    total_surplus += surplus
                else:
                    # provider uses its full base allocation
                    max_results_dict[provider] = MAX_RESULTS
                    # create dict of how many extra each provider actually needs
                    deficits[provider] = -surplus
            else:
                max_results_dict[provider] = MAX_RESULTS

        if settings.ELASTIC_RESULTS:
            while total_surplus > 0:
                # Get a list of providers with deficits, for two reasons: first, to know
                # how to divide the surplus; second, to iterate over it rather than the
                # deficits dict, since we will be mutating that dict inside the for loop
                beneficiaries = list(deficits.keys())
                num_beneficiaries = len(beneficiaries)
                # If num_beneficiaries is greater than the surplus, surplus_payout will be 0
                # because of integer division in Python, but total_surplus will still be > 0,
                # resulting in an infinite loop.
                if num_beneficiaries == 0 or num_beneficiaries > total_surplus:
                    break
                else:
                    surplus_payout = int(total_surplus / num_beneficiaries)
                    for provider in beneficiaries:
                        deficit = deficits.pop(provider)
                        if (deficit - surplus_payout) <= 0:
                            total_surplus -= deficit
                            max_results_dict[provider] += surplus_payout
                        else:
                            total_surplus -= surplus_payout
                            max_results_dict[provider] += surplus_payout
                            deficits[provider] = deficit - surplus_payout

        for provider in providers:
            try:
                max_results = max_results_dict[provider]
                provider_results[provider.provider_name] = provider_result_ids[provider][:max_results]
            except KeyError:
                continue

        results = self._get_results_from_ids(provider_results)

        # If told to, cache the final results for CACHE_TIMEOUT seconds
        if settings.CACHE_TIMEOUT:
            REDIS.setex(cache_key, settings.CACHE_TIMEOUT, self.__class__._serialize_data(results))
        return results
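
Note that this elastic redistribution differs from the round-robin in Example #2: on each pass, the surplus is split evenly among the providers that still have a deficit. A standalone sketch of that loop, with hypothetical numbers:

    # Even-split redistribution of surplus slots, mirroring the loop above;
    # names and numbers are hypothetical.
    def elastic_redistribute(max_results, deficits, surplus):
        while surplus > 0:
            beneficiaries = list(deficits.keys())
            n = len(beneficiaries)
            # Stop when nobody has a deficit, or when integer division would
            # pay out 0 to everyone and loop forever.
            if n == 0 or n > surplus:
                break
            payout = surplus // n
            for provider in beneficiaries:
                deficit = deficits.pop(provider)
                max_results[provider] += payout
                if deficit <= payout:
                    surplus -= deficit
                else:
                    surplus -= payout
                    deficits[provider] = deficit - payout
        return max_results

    print(elastic_redistribute({'stock': 5, 'ind': 5}, {'stock': 3, 'ind': 1}, 4))
    # {'stock': 8, 'ind': 7}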
Example #4
    def suggest(self, term):
        """
        Suggest matching objects, given a term
        """
        providers = self._get_all_providers_by_autocompleter()
        if providers is None:
            return []

        # If we have a cached version of the search results available, return it!
        cache_key = CACHE_BASE_NAME % \
            (self.name, utils.get_normalized_term(term),)
        if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
            return self._deserialize_data(REDIS.get(cache_key))

        # Get the normalized term variations we need to search for. A single term
        # could turn into multiple terms we need to search.
        norm_terms = utils.get_norm_term_variations(term)

        provider_results = SortedDict()

        # Get the matched result IDs
        pipe = REDIS.pipeline()
        for provider in providers:
            provider_name = provider.provider_name

            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                continue

            result_keys = []
            for norm_term in norm_terms:
                norm_words = norm_term.split()
                result_key = "djac.results.%s" % (norm_term,)
                result_keys.append(result_key)
                keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
                pipe.zinterstore(result_key, keys, aggregate='MIN')
            pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

            # Get exact matches
            if settings.MOVE_EXACT_MATCHES_TO_TOP:
                keys = []
                for norm_term in norm_terms:
                    keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
                pipe.zunionstore("djac.results", keys, aggregate='MIN')
                pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

        results = [i for i in pipe.execute() if isinstance(i, list)]

        # Create a dict mapping provider to result IDs
        # We combine the 2 different kinds of results into 1 result ID list per provider.
        for provider in providers:
            provider_name = provider.provider_name

            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
            # If the total length of the term is less than MIN_LETTERS allowed, then don't search
            # the provider for this term
            MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
            if len(term) < MIN_LETTERS:
                continue

            ids = results.pop(0)
            # We merge exact matches with base matches by moving them to
            # the head of the results
            if settings.MOVE_EXACT_MATCHES_TO_TOP:
                exact_ids = results.pop(0)

                # Need to reverse exact IDs so high scores end up behind low scores,
                # since we are inserting at the front of the list.
                exact_ids.reverse()

                # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
                # them from the regular ID list if necessary
                for j in exact_ids:
                    if j in ids:
                        ids.remove(j)
                    ids.insert(0, j)

            provider_results[provider_name] = ids[:MAX_RESULTS]

        results = self._get_results_from_ids(provider_results)

        # If told to, cache the final results for CACHE_TIMEOUT seconds
        if settings.CACHE_TIMEOUT:
            REDIS.set(cache_key, self._serialize_data(results))
            REDIS.expire(cache_key, settings.CACHE_TIMEOUT)

        return results
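
Finally, the exact-match merge that all of these examples share is easiest to see in isolation: reversing the exact IDs before repeatedly calling insert(0, ...) preserves their relative order at the head of the list. A minimal sketch with hypothetical IDs:

    # Move exact-match IDs to the front of the result list; IDs are hypothetical.
    ids = ['b', 'c', 'a', 'd']   # prefix-match IDs, best score first
    exact_ids = ['a', 'b']       # exact-match IDs, best score first

    # Reverse so that, after repeated insert(0, ...), the best exact match
    # ends up at the very front.
    exact_ids.reverse()
    for j in exact_ids:
        if j in ids:
            ids.remove(j)        # drop the duplicate from the prefix results
        ids.insert(0, j)

    print(ids)  # ['a', 'b', 'c', 'd']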