def suggest(self, term, facets=None):
    """
    Suggest matching objects, given a term
    """
    if facets is None:
        facets = []
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    hashed_facets = self.hash_facets(facets)
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS), hashed_facets)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    provider_results = OrderedDict()

    # Generate a unique identifier to be used for storing intermediate results. This is to
    # prevent redis key collisions between competing suggest / exact_suggest calls.
    base_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
    base_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())

    # Same idea as the base_result_key, but for when we are using facets in the suggest call.
    facet_final_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
    facet_final_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())

    # As we search, we may store a number of intermediate data items. We keep track of
    # what we store and delete so there is nothing left over.
    # We initialize with the base keys, all of which could end up being used.
    keys_to_delete = {
        base_result_key,
        base_exact_match_key,
        facet_final_result_key,
        facet_final_exact_match_key
    }

    facet_keys_set = set()
    if len(facets) > 0:
        # We use from_iterable to flatten the list comprehension into a single list
        sub_facets = itertools.chain.from_iterable(
            [facet['facets'] for facet in facets])
        facet_keys_set = set(
            [sub_facet['key'] for sub_facet in sub_facets])
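    # A facets payload is a list of facet groups, each with a 'type' of 'and' or 'or'
    # and a list of key/value facets. A hypothetical example, for illustration only:
    #
    #     facets = [{'type': 'or',
    #                'facets': [{'key': 'sector', 'value': 'Technology'},
    #                           {'key': 'sector', 'value': 'Energy'}]}]
    #
    # which would yield facet_keys_set == {'sector'}.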
    MOVE_EXACT_MATCHES_TO_TOP = registry.get_autocompleter_setting(
        self.name, 'MOVE_EXACT_MATCHES_TO_TOP')
    # Get the max results autocompleter setting
    MAX_RESULTS = registry.get_autocompleter_setting(self.name, 'MAX_RESULTS')

    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        term_result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            keys = [
                PREFIX_BASE_NAME % (provider_name, norm_word,)
                for norm_word in norm_words
            ]
            if len(keys) == 1:
                term_result_keys.append(keys[0])
            else:
                term_result_key = base_result_key + '.' + norm_term
                term_result_keys.append(term_result_key)
                keys_to_delete.add(term_result_key)
                pipe.zinterstore(term_result_key, keys, aggregate='MIN')

        if len(term_result_keys) == 1:
            final_result_key = term_result_keys[0]
        else:
            final_result_key = base_result_key
            pipe.zunionstore(final_result_key, term_result_keys, aggregate='MIN')

        use_facets = False
        if len(facet_keys_set) > 0:
            provider_keys_set = set(provider.get_facets())
            if facet_keys_set.issubset(provider_keys_set):
                use_facets = True

        if use_facets:
            facet_result_keys = []
            for facet in facets:
                try:
                    facet_type = facet['type']
                    if facet_type not in ['and', 'or']:
                        continue
                    facet_list = facet['facets']
                    facet_set_keys = []
                    for facet_dict in facet_list:
                        facet_set_key = \
                            FACET_SET_BASE_NAME % (provider_name, facet_dict['key'], facet_dict['value'],)
                        facet_set_keys.append(facet_set_key)

                    if len(facet_set_keys) == 1:
                        facet_result_keys.append(facet_set_keys[0])
                    else:
                        facet_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
                        facet_result_keys.append(facet_result_key)
                        keys_to_delete.add(facet_result_key)
                        if facet_type == 'and':
                            pipe.zinterstore(facet_result_key, facet_set_keys, aggregate='MIN')
                        else:
                            pipe.zunionstore(facet_result_key, facet_set_keys, aggregate='MIN')
                except KeyError:
                    continue

            # We want to calculate the intersection of all the intermediate facet sets created so far
            # along with the final result set. So we append the final_result_key to the list of
            # facet_result_keys and store the intersection in the faceted final result set.
            pipe.zinterstore(facet_final_result_key, facet_result_keys + [final_result_key], aggregate='MIN')

        if use_facets:
            pipe.zrange(facet_final_result_key, 0, MAX_RESULTS - 1)
        else:
            pipe.zrange(final_result_key, 0, MAX_RESULTS - 1)

        # Get exact matches
        if MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))

            # Do not attempt zunionstore on empty list because redis errors out.
            if len(keys) == 0:
                continue

            if len(keys) == 1:
                final_exact_match_key = keys[0]
            else:
                final_exact_match_key = base_exact_match_key
                pipe.zunionstore(final_exact_match_key, keys, aggregate='MIN')

            # If facets are being used for this suggest call, we need to make sure that
            # exact term matches don't bypass the requirement of having matching facet values.
            # To achieve this, we intersect all faceted matches (exact and non-exact) with
            # all exact matches.
            if use_facets:
                pipe.zinterstore(facet_final_exact_match_key,
                                 facet_result_keys + [final_exact_match_key],
                                 aggregate='MIN')
                pipe.zrange(facet_final_exact_match_key, 0, MAX_RESULTS - 1)
            else:
                pipe.zrange(final_exact_match_key, 0, MAX_RESULTS - 1)

    pipe.delete(*keys_to_delete)
    results = [i for i in pipe.execute() if isinstance(i, list)]
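    # Only the zrange replies are lists; zinterstore/zunionstore/delete reply with
    # integers, which the isinstance filter above discards. The surviving lists
    # arrive in provider order: one list of matched IDs per searched provider,
    # plus one list of exact-match IDs when MOVE_EXACT_MATCHES_TO_TOP is on.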
    # Total number of results currently allocated to providers
    total_allocated_results = 0
    # Maximum number of results allowed per provider
    provider_max_results = OrderedDict()

    # Get an initial max/provider based on an equal share of MAX_RESULTS
    for provider in providers:
        provider_name = provider.provider_name
        results_per_provider = self.normalize_rounding(MAX_RESULTS / len(providers))
        provider_max_results[provider_name] = results_per_provider
        total_allocated_results += results_per_provider

    # Due to having to round to the nearest result, the total number of results
    # allocated could be less/more than the max allowed. Here we adjust provider
    # results until the total allocation equals the max allowed.
    diff = 1 if total_allocated_results < MAX_RESULTS else -1
    while total_allocated_results != MAX_RESULTS:
        for provider in providers:
            provider_name = provider.provider_name
            provider_max_results[provider_name] += diff
            total_allocated_results += diff
            if total_allocated_results == MAX_RESULTS:
                break

    # Result IDs per provider
    provider_result_ids = OrderedDict()
    # Number of results we will be getting from each provider
    provider_num_results = OrderedDict()
    # Total pool of extra result slots
    total_surplus = 0
    # Number of extra result slots a provider could use
    provider_deficits = OrderedDict()

    # Create a dict mapping provider to number of result IDs available.
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    # Also keep track of the number of extra result slots available when a provider
    # does not use up its allocated slots.
    for provider in providers:
        provider_name = provider.provider_name

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            # If the provider will not be used due to MIN_LETTERS, put all of its
            # result slots in the surplus pool, then continue
            total_surplus += provider_max_results[provider_name]
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)
            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting at the front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and
            # removing them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)

        provider_result_ids[provider] = ids
        surplus = provider_max_results[provider_name] - len(ids)
        if surplus >= 0:
            provider_num_results[provider_name] = len(ids)
            total_surplus += surplus
        else:
            # Create base usage
            provider_num_results[provider_name] = provider_max_results[provider_name]
            # Create dict of how many extra results each provider actually needs
            provider_deficits[provider_name] = -surplus

    # If there are extra result slots available, go through each provider that
    # needs extra results and hand them out until there are no more to give
    while total_surplus > 0:
        # Check if there are any providers that still need extra results; if not,
        # exit the loop, else we get caught in an infinite loop
        provider_with_deficit_exists = False
        for provider_name in provider_deficits:
            deficit = provider_deficits[provider_name]
            if deficit > 0:
                provider_with_deficit_exists = True
        if not provider_with_deficit_exists:
            break

        for provider_name in provider_deficits:
            deficit = provider_deficits[provider_name]
            if deficit > 0:
                provider_num_results[provider_name] += 1
                provider_deficits[provider_name] -= 1
                total_surplus -= 1
                if total_surplus <= 0:
                    break
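    # Worked example with illustrative numbers (assuming normalize_rounding rounds
    # to the nearest integer): with MAX_RESULTS = 10 and 3 providers, each provider
    # is first allocated round(10 / 3) = 3 slots, 9 in total, and the adjustment
    # loop above hands the leftover slot to the first provider. If one provider
    # then returns no IDs, its slots flow into total_surplus and are dealt out one
    # at a time to providers with a deficit.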
    # At this point we should have the final number of results we will be getting
    # from each provider, so we take that many from each provider's ID list and
    # put them in the final result IDs dict
    for provider in providers:
        provider_name = provider.provider_name
        try:
            num_results = provider_num_results[provider_name]
            provider_results[provider_name] = provider_result_ids[provider][:num_results]
        except KeyError:
            continue

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, settings.CACHE_TIMEOUT, self.__class__._serialize_data(results))
    return results
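# Usage sketch for the faceted variant above (assumptions: the surrounding class
# is instantiated as an Autocompleter registered under the hypothetical name
# 'stock', whose provider exposes a 'sector' facet):
#
#     ac = Autocompleter('stock')
#     ac.suggest('appl')
#     ac.suggest('appl', facets=[
#         {'type': 'or', 'facets': [{'key': 'sector', 'value': 'Technology'}]},
#     ])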
def suggest(self, term):
    """
    Suggest matching objects, given a term
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS))
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    provider_results = OrderedDict()

    # Get the matched result IDs
    total_results = 0
    if settings.ELASTIC_RESULTS:
        for provider in providers:
            total_results += registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name

        # If we have total_results from adding up all MAX_RESULTS for ELASTIC_RESULTS, use it.
        if settings.ELASTIC_RESULTS:
            MAX_RESULTS = total_results
        else:
            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            result_key = "djac.results.%s" % (norm_term,)
            result_keys.append(result_key)
            keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
            pipe.zinterstore(result_key, keys, aggregate='MIN')
        pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
        for result_key in result_keys:
            pipe.delete(result_key)
        pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

        # Get exact matches
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            # Do not attempt zunionstore on empty list because redis errors out.
            if len(keys) == 0:
                continue
            pipe.zunionstore("djac.results", keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
        pipe.delete("djac.results")

    results = [i for i in pipe.execute() if isinstance(i, list)]

    # Init mappings and surplus pool for elastic result distribution
    deficits = {}
    # Mapping required to store result_ids outside of the per-provider loop, before
    # fetching items / redistributing available result slots in elastic results
    provider_result_ids = {}
    max_results_dict = {}
    # Total pool of available result slots
    total_surplus = 0
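    # With ELASTIC_RESULTS enabled, every provider is queried with the pooled cap
    # (the sum of all providers' MAX_RESULTS) rather than its own cap, and the
    # surplus/deficit bookkeeping below redistributes unused slots. E.g., with
    # illustrative numbers: two providers with MAX_RESULTS of 5 each are both
    # asked for up to 10 IDs; if one returns only 2, its 3 unused slots let the
    # other keep up to 8.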
    # Create a dict mapping provider to result IDs.
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            # If the provider will not be used due to MIN_LETTERS, put all of its
            # result slots in the surplus pool, then continue
            total_surplus += MAX_RESULTS
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)
            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting at the front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and
            # removing them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)
        provider_result_ids[provider] = ids

        if settings.ELASTIC_RESULTS:
            surplus = MAX_RESULTS - len(ids)
            if surplus >= 0:
                max_results_dict[provider] = len(ids)
                total_surplus += surplus
            else:
                # Create base usage
                max_results_dict[provider] = MAX_RESULTS
                # Create dict of how many extra results each provider actually needs
                deficits[provider] = -surplus
        else:
            max_results_dict[provider] = MAX_RESULTS

    if settings.ELASTIC_RESULTS:
        while total_surplus > 0:
            # Get a list of providers with deficits, for two reasons: first, to know
            # how to divide the surplus; second, to iterate over it rather than the
            # deficit dict, as we will be manipulating the dict in the for loop
            beneficiaries = list(deficits.keys())
            num_beneficiaries = len(beneficiaries)
            # If num_beneficiaries is greater than the surplus, surplus_payout will
            # be 0 because of integer division, but total_surplus will still be > 0,
            # resulting in an infinite loop.
            if num_beneficiaries == 0 or num_beneficiaries > total_surplus:
                break
            surplus_payout = total_surplus // num_beneficiaries
            for provider in beneficiaries:
                deficit = deficits.pop(provider)
                if (deficit - surplus_payout) <= 0:
                    # Granting surplus_payout here can exceed the deficit, but the
                    # overshoot is harmless: the slice below caps usage at len(ids),
                    # and only the true deficit is deducted from the pool.
                    total_surplus -= deficit
                    max_results_dict[provider] += surplus_payout
                else:
                    total_surplus -= surplus_payout
                    max_results_dict[provider] += surplus_payout
                    deficits[provider] = deficit - surplus_payout

    for provider in providers:
        try:
            max_results = max_results_dict[provider]
            provider_results[provider.provider_name] = provider_result_ids[provider][:max_results]
        except KeyError:
            continue

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, settings.CACHE_TIMEOUT, self.__class__._serialize_data(results))
    return results
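# Worked example of the elastic redistribution above (illustrative numbers):
# with total_surplus = 5 and deficits = {A: 4, B: 3}, surplus_payout = 5 // 2 = 2;
# A's deficit drops to 2 and B's to 1, leaving total_surplus = 1. On the next
# pass num_beneficiaries (2) > total_surplus (1), so the loop breaks rather than
# paying out 0 forever.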
def suggest(self, term):
    """
    Suggest matching objects, given a term
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term),)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    provider_results = SortedDict()

    # Get the matched result IDs
    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            result_key = "djac.results.%s" % (norm_term,)
            result_keys.append(result_key)
            keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
            pipe.zinterstore(result_key, keys, aggregate='MIN')
        pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
        pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

        # Get exact matches
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            pipe.zunionstore("djac.results", keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

    results = [i for i in pipe.execute() if isinstance(i, list)]

    # Create a dict mapping provider to result IDs.
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)
            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting at the front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and
            # removing them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)

        provider_results[provider_name] = ids[:MAX_RESULTS]

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.set(cache_key, self._serialize_data(results))
        REDIS.expire(cache_key, settings.CACHE_TIMEOUT)
    return results
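# Worked example of the exact-match merge used in each variant above
# (illustrative IDs): ids = [5, 9, 2] and exact_ids = [9, 7] from zrange
# (best score first). After reversing to [7, 9], inserting each exact ID at
# the head yields [9, 7, 5, 2]: exact matches lead in their original order,
# and the duplicate 9 is removed from the non-exact list first.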