def get_norm_phrase_aliases(cls):
    """
    Take the dict from get_phrase_aliases() and normalize / reverse to get ready
    for actual usage.
    DO NOT override this.
    """
    if cls._phrase_aliases is None:
        norm_phrase_aliases = {}
        for key, value in cls.get_phrase_aliases().items():
            norm_keys = utils.get_norm_term_variations(key)
            norm_values = utils.get_norm_term_variations(value)
            for norm_key in norm_keys:
                for norm_value in norm_values:
                    norm_phrase_aliases[norm_key] = norm_value
                    norm_phrase_aliases[norm_value] = norm_key
        cls._phrase_aliases = norm_phrase_aliases
    return cls._phrase_aliases
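# A minimal, standalone sketch of the bidirectional alias map built above, with a
# hypothetical one-variation normalizer standing in for utils.get_norm_term_variations
# (the real helper can return several variations per phrase).
def _norm_variations(phrase):
    return [phrase.lower().strip()]

def build_norm_phrase_aliases(phrase_aliases):
    norm_phrase_aliases = {}
    for key, value in phrase_aliases.items():
        for norm_key in _norm_variations(key):
            for norm_value in _norm_variations(value):
                # Map both directions so either spelling resolves to the other.
                norm_phrase_aliases[norm_key] = norm_value
                norm_phrase_aliases[norm_value] = norm_key
    return norm_phrase_aliases

print(build_norm_phrase_aliases({'United States': 'USA'}))
# {'united states': 'usa', 'usa': 'united states'}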
def exact_suggest(self, term):
    """
    Suggest matching objects that exactly match the given term.
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = EXACT_CACHE_BASE_NAME % (self.name, term,)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    provider_results = OrderedDict()

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    # Generate a unique identifier to be used for storing intermediate results. This is to
    # prevent redis key collisions between competing suggest / exact_suggest calls.
    uuid_str = str(uuid.uuid4())
    intermediate_result_key = RESULT_SET_BASE_NAME % (uuid_str,)

    MAX_RESULTS = registry.get_autocompleter_setting(self.name, 'MAX_RESULTS')

    # Get the matched result IDs
    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name
        keys = []
        for norm_term in norm_terms:
            keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
        # Do not attempt zunionstore on empty list because redis errors out.
        if len(keys) == 0:
            continue
        pipe.zunionstore(intermediate_result_key, keys, aggregate='MIN')
        pipe.zrange(intermediate_result_key, 0, MAX_RESULTS - 1)
        pipe.delete(intermediate_result_key)
    results = [i for i in pipe.execute() if type(i) == list]

    # Create a dict mapping provider to result IDs
    for provider in providers:
        provider_name = provider.provider_name
        exact_ids = results.pop(0)
        provider_results[provider_name] = exact_ids[:MAX_RESULTS]

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, self.__class__._serialize_data(results), settings.CACHE_TIMEOUT)
    return results
def exact_suggest(self, term):
    """
    Suggest matching objects that exactly match the given term.
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = EXACT_CACHE_BASE_NAME % (self.name, term,)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    provider_results = OrderedDict()

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    # Get the matched result IDs
    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
        keys = []
        for norm_term in norm_terms:
            keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
        # Do not attempt zunionstore on empty list because redis errors out.
        if len(keys) == 0:
            continue
        pipe.zunionstore("djac.results", keys, aggregate='MIN')
        pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
    results = [i for i in pipe.execute() if type(i) == list]

    # Create a dict mapping provider to result IDs
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')
        exact_ids = results.pop(0)
        provider_results[provider_name] = exact_ids[:MAX_RESULTS]

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, self.__class__._serialize_data(results), settings.CACHE_TIMEOUT)
    return results
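# Rough sketch of the exact-match lookup pattern used by exact_suggest(), written
# against redis-py (>= 3.0 zadd signature) with made-up key names and scores; the
# real key formats come from EXACT_BASE_NAME / RESULT_SET_BASE_NAME and the data is
# populated when providers store their objects.
import redis

r = redis.Redis()
r.zadd('djac.stock.e.apple inc', {'AAPL': 1})
r.zadd('djac.stock.e.apple incorporated', {'AAPL': 1, 'APLE': 2})

# Union the per-variation exact-match sets, keeping the best (lowest) score per
# member, then read the top results back in score order and clean up.
r.zunionstore('djac.results.tmp',
              ['djac.stock.e.apple inc', 'djac.stock.e.apple incorporated'],
              aggregate='MIN')
top_ids = r.zrange('djac.results.tmp', 0, 9)
r.delete('djac.results.tmp')
print(top_ids)  # [b'AAPL', b'APLE']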
def _get_norm_terms(cls, terms):
    """
    Normalize each term in list of terms. Also, look to see if there are any aliases
    for any words in the term and use them to create alternate normalized terms.
    DO NOT override this.
    """
    norm_terms = [utils.get_norm_term_variations(term) for term in terms]
    norm_terms = itertools.chain(*norm_terms)

    norm_terms_with_variations = []
    # Now we get alternate norm terms by looking for alias phrases in any of the terms
    phrase_aliases = cls.get_norm_phrase_aliases()
    if phrase_aliases is not None:
        for norm_term in norm_terms:
            norm_terms_with_variations = norm_terms_with_variations + \
                utils.get_aliased_variations(norm_term, phrase_aliases)

    return norm_terms_with_variations
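# Simplified, hypothetical stand-in for utils.get_aliased_variations: return the
# normalized term plus one variation per alias phrase found inside it. The real
# helper may generate more combinations; this only illustrates the idea.
def aliased_variations(norm_term, norm_phrase_aliases):
    variations = [norm_term]
    for phrase, alias in norm_phrase_aliases.items():
        if phrase in norm_term:
            variations.append(norm_term.replace(phrase, alias))
    return variations

print(aliased_variations('united states steel',
                         {'united states': 'usa', 'usa': 'united states'}))
# ['united states steel', 'usa steel']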
def suggest(self, term):
    """
    Suggest matching objects, given a term
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS))
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    provider_results = OrderedDict()

    # Get the matched result IDs
    total_results = 0
    if settings.ELASTIC_RESULTS:
        for provider in providers:
            total_results += registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name
        # If we have total_results from adding up all MAX_RESULTS from ELASTIC_RESULTS use it.
        if settings.ELASTIC_RESULTS:
            MAX_RESULTS = total_results
        else:
            MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            result_key = "djac.results.%s" % (norm_term,)
            result_keys.append(result_key)
            keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
            pipe.zinterstore(result_key, keys, aggregate='MIN')
        pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
        for result_key in result_keys:
            pipe.delete(result_key)
        pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

        # Get exact matches
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            # Do not attempt zunionstore on empty list because redis errors out.
            if len(keys) == 0:
                continue
            pipe.zunionstore("djac.results", keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
        pipe.delete("djac.results")
    results = [i for i in pipe.execute() if type(i) == list]

    # init mappings and surplus for Elastic Result distribution
    deficits = {}
    # Mapping required to store result_ids outside of per provider loop before
    # fetching items / redistributing available result slots in elastic results
    provider_result_ids = {}
    max_results_dict = {}
    # total pool of available result slots
    total_surplus = 0

    # Create a dict mapping provider to result IDs
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            # if provider will not be used due to min_letters, put all result slots
            # in surplus pool then continue
            total_surplus += MAX_RESULTS
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)

            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting in front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
            # them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)
        provider_result_ids[provider] = ids

        if settings.ELASTIC_RESULTS:
            surplus = MAX_RESULTS - len(ids)
            if surplus >= 0:
                max_results_dict[provider] = len(ids)
                total_surplus += surplus
            else:
                # create base usage
                max_results_dict[provider] = MAX_RESULTS
                # create dict of how many extra each provider actually needs
                deficits[provider] = surplus * -1
        else:
            max_results_dict[provider] = MAX_RESULTS

    if settings.ELASTIC_RESULTS:
        while total_surplus > 0:
            # get a list of providers with deficits for two reasons. First, to know how
            # to divide the surplus; secondly, to iterate over rather than the deficit dict,
            # as we will be manipulating the dict in the for loop
            beneficiaries = list(deficits.keys())
            num_beneficiaries = len(beneficiaries)
            # if num_beneficiaries is greater than surplus, surplus_payout will be 0 because of int
            # division in python, but total_surplus will still be > 0, resulting in an infinite loop.
            if num_beneficiaries == 0 or num_beneficiaries > total_surplus:
                break
            else:
                surplus_payout = int(total_surplus / num_beneficiaries)
            for provider in beneficiaries:
                deficit = deficits.pop(provider)
                if (deficit - surplus_payout) <= 0:
                    total_surplus -= deficit
                    max_results_dict[provider] += surplus_payout
                else:
                    total_surplus -= surplus_payout
                    max_results_dict[provider] += surplus_payout
                    deficits[provider] = deficit - surplus_payout

    for provider in providers:
        try:
            max_results = max_results_dict[provider]
            provider_results[provider.provider_name] = provider_result_ids[provider][:max_results]
        except KeyError:
            continue

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, self.__class__._serialize_data(results), settings.CACHE_TIMEOUT)
    return results
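# Standalone sketch of the ELASTIC_RESULTS redistribution above: providers that did
# not fill their MAX_RESULTS allocation donate the unused slots to a surplus pool,
# which is then split evenly among providers that returned more IDs than they were
# allowed. Provider names and counts below are made up for illustration.
def redistribute(max_results, id_counts):
    allocations, deficits, total_surplus = {}, {}, 0
    for provider, count in id_counts.items():
        surplus = max_results[provider] - count
        if surplus >= 0:
            allocations[provider] = count
            total_surplus += surplus
        else:
            allocations[provider] = max_results[provider]
            deficits[provider] = -surplus
    while total_surplus > 0:
        beneficiaries = list(deficits.keys())
        # Stop when nobody needs more, or the per-provider payout would round down to zero.
        if len(beneficiaries) == 0 or len(beneficiaries) > total_surplus:
            break
        payout = total_surplus // len(beneficiaries)
        for provider in beneficiaries:
            deficit = deficits.pop(provider)
            if deficit - payout <= 0:
                total_surplus -= deficit
            else:
                total_surplus -= payout
                deficits[provider] = deficit - payout
            allocations[provider] += payout
    return allocations

print(redistribute({'stocks': 10, 'people': 10}, {'stocks': 6, 'people': 14}))
# {'stocks': 6, 'people': 14}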
def suggest(self, term, facets=[]):
    """
    Suggest matching objects, given a term
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    hashed_facets = self.hash_facets(facets)
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term, settings.JOIN_CHARS), hashed_facets)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self.__class__._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    if len(norm_terms) == 0:
        return []

    provider_results = OrderedDict()

    # Generate a unique identifier to be used for storing intermediate results. This is to
    # prevent redis key collisions between competing suggest / exact_suggest calls.
    base_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
    base_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())

    # Same idea as the base_result_key, but for when we are using facets in the suggest call.
    facet_final_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
    facet_final_exact_match_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())

    # As we search, we may store a number of intermediate data items. We keep track of
    # what we store and delete so there is nothing left over.
    # We initialize with the base keys, all of which could end up being used.
    keys_to_delete = {
        base_result_key,
        base_exact_match_key,
        facet_final_result_key,
        facet_final_exact_match_key
    }

    facet_keys_set = set()
    if len(facets) > 0:
        # we use from_iterable to flatten the list comprehension into a single list
        sub_facets = itertools.chain.from_iterable([facet['facets'] for facet in facets])
        facet_keys_set = set([sub_facet['key'] for sub_facet in sub_facets])

    MOVE_EXACT_MATCHES_TO_TOP = registry.get_autocompleter_setting(
        self.name, 'MOVE_EXACT_MATCHES_TO_TOP')
    # Get the max results autocompleter setting
    MAX_RESULTS = registry.get_autocompleter_setting(self.name, 'MAX_RESULTS')

    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        term_result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            keys = [
                PREFIX_BASE_NAME % (provider_name, norm_word,)
                for norm_word in norm_words
            ]
            if len(keys) == 1:
                term_result_keys.append(keys[0])
            else:
                term_result_key = base_result_key + '.' + norm_term
                term_result_keys.append(term_result_key)
                keys_to_delete.add(term_result_key)
                pipe.zinterstore(term_result_key, keys, aggregate='MIN')

        if len(term_result_keys) == 1:
            final_result_key = term_result_keys[0]
        else:
            final_result_key = base_result_key
            pipe.zunionstore(final_result_key, term_result_keys, aggregate='MIN')

        use_facets = False
        if len(facet_keys_set) > 0:
            provider_keys_set = set(provider.get_facets())
            if facet_keys_set.issubset(provider_keys_set):
                use_facets = True

        if use_facets:
            facet_result_keys = []
            for facet in facets:
                try:
                    facet_type = facet['type']
                    if facet_type not in ['and', 'or']:
                        continue
                    facet_list = facet['facets']
                    facet_set_keys = []
                    for facet_dict in facet_list:
                        facet_set_key = \
                            FACET_SET_BASE_NAME % (provider_name, facet_dict['key'], facet_dict['value'],)
                        facet_set_keys.append(facet_set_key)

                    if len(facet_set_keys) == 1:
                        facet_result_keys.append(facet_set_keys[0])
                    else:
                        facet_result_key = RESULT_SET_BASE_NAME % str(uuid.uuid4())
                        facet_result_keys.append(facet_result_key)
                        keys_to_delete.add(facet_result_key)
                        if facet_type == 'and':
                            pipe.zinterstore(facet_result_key, facet_set_keys, aggregate='MIN')
                        else:
                            pipe.zunionstore(facet_result_key, facet_set_keys, aggregate='MIN')
                except KeyError:
                    continue

            # We want to calculate the intersection of all the intermediate facet sets created so far
            # along with the final result set. So we append the final_result_key to the list of
            # facet_result_keys and store the intersection in the faceted final result set.
            pipe.zinterstore(facet_final_result_key, facet_result_keys + [final_result_key], aggregate='MIN')

        if use_facets:
            pipe.zrange(facet_final_result_key, 0, MAX_RESULTS - 1)
        else:
            pipe.zrange(final_result_key, 0, MAX_RESULTS - 1)

        # Get exact matches
        if MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            # Do not attempt zunionstore on empty list because redis errors out.
            if len(keys) == 0:
                continue

            if len(keys) == 1:
                final_exact_match_key = keys[0]
            else:
                final_exact_match_key = base_exact_match_key
                pipe.zunionstore(final_exact_match_key, keys, aggregate='MIN')

            # If facets are being used for this suggest call, we need to make sure that
            # exact term matches don't bypass the requirement of having matching facet values.
            # To achieve this, we intersect all faceted matches (exact-and-non-exact) with
            # all exact matches.
            if use_facets:
                pipe.zinterstore(facet_final_exact_match_key,
                                 facet_result_keys + [final_exact_match_key],
                                 aggregate='MIN')
                pipe.zrange(facet_final_exact_match_key, 0, MAX_RESULTS - 1)
            else:
                pipe.zrange(final_exact_match_key, 0, MAX_RESULTS - 1)

    pipe.delete(*keys_to_delete)
    results = [i for i in pipe.execute() if type(i) == list]

    # Total number of results currently allocated to providers
    total_allocated_results = 0
    # Maximum number of results allowed per provider
    provider_max_results = OrderedDict()

    # Get an initial max/provider based on an equal share of MAX_RESULTS
    for provider in providers:
        provider_name = provider.provider_name
        results_per_provider = self.normalize_rounding(MAX_RESULTS / len(providers))
        provider_max_results[provider_name] = results_per_provider
        total_allocated_results += results_per_provider

    # Due to having to round to the nearest result, the total number of results
    # allocated could be less/more than the max allowed. Here we adjust each provider's
    # share until the total allocation equals the max allowed.
    diff = 1 if total_allocated_results < MAX_RESULTS else -1
    while total_allocated_results != MAX_RESULTS:
        for provider in providers:
            provider_name = provider.provider_name
            provider_max_results[provider_name] += diff
            total_allocated_results += diff
            if total_allocated_results == MAX_RESULTS:
                break

    # Result IDs per provider
    provider_result_ids = OrderedDict()
    # Number of results we will be getting from each provider
    provider_num_results = OrderedDict()
    # Total pool of extra result slots
    total_surplus = 0
    # Number of extra result slots a provider could use
    provider_deficits = OrderedDict()

    # Create a dict mapping provider to number of result IDs available
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    # Also keep track of number of extra result slots available when a provider does not
    # use up its allocated slots.
    for provider in providers:
        provider_name = provider.provider_name

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            # if provider will not be used due to min_letters, put all result slots
            # in surplus pool then continue
            total_surplus += provider_max_results[provider_name]
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)

            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting in front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
            # them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)
        provider_result_ids[provider] = ids

        surplus = provider_max_results[provider_name] - len(ids)
        if surplus >= 0:
            provider_num_results[provider_name] = len(ids)
            total_surplus += surplus
        else:
            # create base usage
            provider_num_results[provider_name] = provider_max_results[provider_name]
            # create dict of how many extra each provider actually needs
            provider_deficits[provider_name] = -surplus

    # If there are extra result slots available, go through each provider that
    # needs extra results, and hand them out until there are no more to give
    while total_surplus > 0:
        # Check if there are any providers that still need extra results, and if not exit the loop,
        # else we get caught in an infinite loop
        provider_with_deficit_exists = False
        for provider_name in provider_deficits:
            deficit = provider_deficits[provider_name]
            if deficit > 0:
                provider_with_deficit_exists = True
        if not provider_with_deficit_exists:
            break

        for provider_name in provider_deficits:
            deficit = provider_deficits[provider_name]
            if deficit > 0:
                provider_num_results[provider_name] += 1
                provider_deficits[provider_name] -= 1
                total_surplus -= 1
            if total_surplus <= 0:
                break

    # At this point we should have the final number of results we will be getting
    # from each provider, so we get from provider and put in final result IDs dict
    for provider in providers:
        provider_name = provider.provider_name
        try:
            num_results = provider_num_results[provider_name]
            provider_results[provider_name] = provider_result_ids[provider][:num_results]
        except KeyError:
            continue

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.setex(cache_key, settings.CACHE_TIMEOUT, self.__class__._serialize_data(results))
    return results
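# Hypothetical facets argument for the faceted suggest() above: each group carries a
# 'type' of 'and' / 'or' plus a list of {'key': ..., 'value': ...} sub-facets, and a
# provider is only filtered when it declares every requested facet key. The registry
# name 'stock' and the Autocompleter import path are assumptions for illustration.
from autocompleter import Autocompleter

facets = [
    {
        'type': 'or',
        'facets': [
            {'key': 'sector', 'value': 'technology'},
            {'key': 'sector', 'value': 'healthcare'},
        ],
    },
    {
        'type': 'and',
        'facets': [
            {'key': 'exchange', 'value': 'NYSE'},
        ],
    },
]
results = Autocompleter('stock').suggest('app', facets=facets)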
def suggest(self, term):
    """
    Suggest matching objects, given a term
    """
    providers = self._get_all_providers_by_autocompleter()
    if providers is None:
        return []

    # If we have a cached version of the search results available, return it!
    cache_key = CACHE_BASE_NAME % \
        (self.name, utils.get_normalized_term(term),)
    if settings.CACHE_TIMEOUT and REDIS.exists(cache_key):
        return self._deserialize_data(REDIS.get(cache_key))

    # Get the normalized term variations we need to search for each term. A single term
    # could turn into multiple terms we need to search.
    norm_terms = utils.get_norm_term_variations(term)
    provider_results = SortedDict()

    # Get the matched result IDs
    pipe = REDIS.pipeline()
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        result_keys = []
        for norm_term in norm_terms:
            norm_words = norm_term.split()
            result_key = "djac.results.%s" % (norm_term,)
            result_keys.append(result_key)
            keys = [PREFIX_BASE_NAME % (provider_name, i,) for i in norm_words]
            pipe.zinterstore(result_key, keys, aggregate='MIN')
        pipe.zunionstore("djac.results", result_keys, aggregate='MIN')
        pipe.zrange("djac.results", 0, MAX_RESULTS - 1)

        # Get exact matches
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            keys = []
            for norm_term in norm_terms:
                keys.append(EXACT_BASE_NAME % (provider_name, norm_term,))
            pipe.zunionstore("djac.results", keys, aggregate='MIN')
            pipe.zrange("djac.results", 0, MAX_RESULTS - 1)
    results = [i for i in pipe.execute() if type(i) == list]

    # Create a dict mapping provider to result IDs
    # We combine the 2 different kinds of results into 1 result ID list per provider.
    for provider in providers:
        provider_name = provider.provider_name
        MAX_RESULTS = registry.get_ac_provider_setting(self.name, provider, 'MAX_RESULTS')

        # If the total length of the term is less than MIN_LETTERS allowed, then don't search
        # the provider for this term
        MIN_LETTERS = registry.get_ac_provider_setting(self.name, provider, 'MIN_LETTERS')
        if len(term) < MIN_LETTERS:
            continue

        ids = results.pop(0)
        # We merge exact matches with base matches by moving them to
        # the head of the results
        if settings.MOVE_EXACT_MATCHES_TO_TOP:
            exact_ids = results.pop(0)

            # Need to reverse exact IDs so high scores are behind low scores, since we
            # are inserting in front of the list.
            exact_ids.reverse()

            # Merge exact IDs with non-exact IDs, putting exact IDs in front and removing
            # them from the regular ID list if necessary
            for j in exact_ids:
                if j in ids:
                    ids.remove(j)
                ids.insert(0, j)
        provider_results[provider_name] = ids[:MAX_RESULTS]

    results = self._get_results_from_ids(provider_results)

    # If told to, cache the final results for CACHE_TIMEOUT seconds
    if settings.CACHE_TIMEOUT:
        REDIS.set(cache_key, self._serialize_data(results))
        REDIS.expire(cache_key, settings.CACHE_TIMEOUT)
    return results
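# Standalone sketch of the MOVE_EXACT_MATCHES_TO_TOP merge used in the loops above:
# exact-match IDs are reversed and pushed onto the front of the prefix-match IDs so
# the best-scoring exact match ends up first. IDs below are made up.
ids = ['AMZN', 'AAPL', 'APLE']   # prefix matches, best score first
exact_ids = ['AAPL']             # exact matches, best score first
exact_ids.reverse()
for exact_id in exact_ids:
    if exact_id in ids:
        ids.remove(exact_id)
    ids.insert(0, exact_id)
print(ids)  # ['AAPL', 'AMZN', 'APLE']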