def test_get_corrected_split_queries(self):
    """Check `utils.get_corrected_split_queries` on queries with 0 or 1 split.

    Each case pairs the raw query words with the list of corrected
    queries expected back once the split word has been re-joined.
    """
    # Keeping input and expectation side by side means a new case cannot
    # be skipped by a stale hard-coded loop count.
    cases = [
        # No splits
        (["fast"], []),
        # one split, total two words
        (["forw", "ard"], [["forward"]]),
        # one split, total three words
        (["forw", "ard", "march"], [["forward", "march"]]),
        # one split, total four words
        (["fast", "forw", "ard", "march"], [["fast", "forward", "march"]]),
    ]

    for words, expected in cases:
        query = Suggestion(words)
        self.assertEqual(
            utils.get_corrected_split_queries(query, self.lexicon),
            expected)
def generate_suggestions_and_posteriors(self, query, get_posterior_fn=None):
    """Return (suggestion, posterior) pairs for query.

    Get a list of candidate suggestions and calculate posteriors for
    each of them.

    Candidates are generated for the query itself and for each of its
    split-corrected variants (`utils.get_corrected_split_queries`).
    They are ranked by `phrase.get_likelihood`, de-duplicated, and
    truncated to `self.MAX_NUM_SUGGESTIONS` before posteriors are
    computed and normalized with `utils.get_normalized_probabilities`.

    Arguments:
    - `query`: Suggestion object.
    - `get_posterior_fn`: callable invoked as
      `get_posterior_fn(suggestion, source_query)`; defaults to
      `self.get_posterior_fn` when None.

    Returns:
    List of (suggestion, normalized posterior) tuples sorted by
    posterior in descending order; an empty list when no candidate
    suggestion was generated.
    """
    if get_posterior_fn is None:
        get_posterior_fn = self.get_posterior_fn

    # NOTE: run-on corrections (utils.get_corrected_run_on_queries) are
    # currently not included among the candidate queries.
    all_queries = [query] + utils.get_corrected_split_queries(
        query, self.lexicon)

    # Collect (source_query, suggestion) pairs for every candidate query.
    # Distinct loop names are used so the `query` argument is never
    # rebound (list-comprehension variables leak on Python 2).
    candidate_pairs = []
    for candidate_query in all_queries:
        term_candidates = [self.generate_candidate_terms(term)
                           for term in candidate_query]
        suggestions_for_query = self.generate_candidate_suggestions(
            term_candidates, candidate_query.suggestion_type)
        candidate_pairs.extend(
            (candidate_query, suggestion)
            for suggestion in suggestions_for_query)

    # Most likely pairs first.
    candidate_pairs.sort(key=lambda pair: phrase.get_likelihood(*pair),
                         reverse=True)

    # Remove duplicates (if any), keeping the first occurrence.
    # itertools.groupby only collapses *adjacent* equal items, and equal
    # pairs can end up separated by other pairs with the same likelihood,
    # so membership is checked explicitly. The check uses ==, as groupby
    # did, so the pairs do not need to be hashable.
    unique_pairs = []
    for pair in candidate_pairs:
        if pair not in unique_pairs:
            unique_pairs.append(pair)

    # Take only the top few suggestions.
    top_pairs = unique_pairs[:self.MAX_NUM_SUGGESTIONS]
    if not top_pairs:
        # Nothing to score; avoids indexing into an empty result.
        return []

    all_posteriors = [get_posterior_fn(suggestion, source_query)
                      for source_query, suggestion in top_pairs]
    all_suggestions = [suggestion for _, suggestion in top_pairs]

    # TODO: also offer the original query as a suggestion when
    # get_posterior_fn(query, query) exceeds
    # self.ORIGINAL_POSTERIOR_THRESHOLD.

    normalized_posteriors = utils.get_normalized_probabilities(
        all_posteriors)
    suggestion_posterior_list = list(
        zip(all_suggestions, normalized_posteriors))
    suggestion_posterior_list.sort(key=lambda pair: pair[1], reverse=True)
    return suggestion_posterior_list