def suggest_similar_search(word_regex_list, num=None):
    """Suggest groups of search terms similar to *word_regex_list*.

    Terms containing ``'*'`` are treated as wildcard patterns and expanded
    against the word-text index; literal terms are recombined with similar
    candidates.  Candidate groups are ranked by co-occurrence and the top
    *num* groups are returned.

    :param word_regex_list: search terms; a term containing ``'*'`` is a
        wildcard pattern, anything else is a literal word text.
    :param num: maximum number of suggestion groups; defaults to
        ``config.functions_config.similar_search_default_number``.
    :return: result of ``filters.filter_by_coocurrence`` — the *num*
        best-co-occurring candidate groups.
    """
    if num is None:
        num = config.functions_config.similar_search_default_number
    # Split the query into wildcard patterns and plain word texts.
    word_wildcard_list = [
        word_regex for word_regex in word_regex_list if '*' in word_regex
    ]
    word_texts = [
        word_regex for word_regex in word_regex_list if '*' not in word_regex
    ]
    # Target size of each suggested group.
    length = config.functions_config.similar_search_candidate_length
    if word_wildcard_list:
        # One candidate list per wildcard pattern that produced matches.
        wildcard_groups = list()
        for word_text in word_wildcard_list:
            candidates = indexes.IndexHolder().word_text_index. \
                collect(word_text, action=indexes.IndexHolder().word_text_index.CollectionAction.SIMILAR)
            candidates = filters.filter_by_avgtfidf(
                candidates, config.functions_config.similar_search_candidate_number)
            if candidates:
                wildcard_groups.append(candidates)
        if wildcard_groups:
            # Cross-product: every combination picking one candidate per
            # wildcard becomes one tentative suggestion group.
            wildcard_candidates_groups = [
                list(x) for x in itertools.product(*wildcard_groups)
            ]
            if len(word_wildcard_list) >= length or not word_texts:
                # use wildcard only: trim each group down to `length`
                # members at random.
                for i in range(len(wildcard_candidates_groups)):
                    wildcard_candidates_groups[i] = filters.filter_by_random(
                        wildcard_candidates_groups[i], length)
            else:
                # Pad each wildcard group with random literal terms up to
                # `length`.  NOTE(review): the pad count uses
                # len(word_wildcard_list), but a group may hold fewer items
                # when some wildcard produced no candidates — confirm this
                # is intended.
                for i in range(len(wildcard_candidates_groups)):
                    wildcard_candidates_groups[i] += filters.filter_by_random(
                        word_texts, length - len(word_wildcard_list))
            return filters.filter_by_coocurrence(wildcard_candidates_groups, num)
    # Literal-only path (also reached when every wildcard pattern matched
    # nothing): replace one literal term at a time with a similar candidate.
    candidates_groups = list()
    for i, word_text in enumerate(word_texts):
        other_words = word_texts[:i] + word_texts[i + 1:]
        candidates = indexes.IndexHolder().word_text_index. \
            collect(word_text, action=indexes.IndexHolder().word_text_index.CollectionAction.SIMILAR)
        candidates = filters.filter_by_avgtfidf(
            candidates, config.functions_config.similar_search_candidate_number)
        for candidate in candidates:
            # Skip the trivial suggestion of the term itself.
            if candidate == word_text:
                continue
            candidates_groups.append(
                [candidate] + filters.filter_by_random(other_words, length - 1))
    return filters.filter_by_coocurrence(candidates_groups, num)
def test_filter_by_coocurrence(self):
    """filter_by_coocurrence keeps the triples with the highest
    co-occurrence score, seeded from two-word news documents.

    NOTE(review): the pair (0, 2) appears twice in the fixture list —
    presumably intentional, to weight that pair's co-occurrence; confirm.
    """
    word_text_samples = [
        "abate", "bolster", "buttress", "champion", "defend", "espouse",
        "support"
    ]
    # Each (i, j) pair becomes one news document containing exactly those
    # two sample words, building up the co-occurrence counts.
    for i, j in [(0, 1), (0, 2), (0, 3), (0, 2), (1, 2), (1, 3), (3, 4)]:
        datasources.get_db().upsert_news_or_news_list(
            self.session,
            entities.news.NewsPlain(title='', content=' '.join([
                word_text_samples[i], word_text_samples[j]
            ])))
    # Rebuild the co-occurrence index so it reflects the rows just inserted.
    indexes.IndexHolder().word_coocurrence_index.init(force_refresh=True)
    # Of all 3-word combinations, the three triples drawn from the most
    # frequently paired words (samples 0-3) must rank on top, in order.
    self.assertListEqual(
        filters.filter_by_coocurrence([
            word_texts
            for word_texts in itertools.combinations(word_text_samples, 3)
        ], 3),
        [(word_text_samples[0], word_text_samples[1], word_text_samples[2]),
         (word_text_samples[0], word_text_samples[1], word_text_samples[3]),
         (word_text_samples[0], word_text_samples[2], word_text_samples[3])
         ])
def update(self, num=20):
    """Crawl *num* new items, preprocess them, and rebuild the indexes.

    A DB session is opened for the duration of the crawl/preprocess steps
    and is guaranteed to be closed (and ``self.sqlsession`` cleared) even
    if either step raises — the original code leaked the session on error.

    :param num: number of items to crawl (default 20).
    """
    self.sqlsession = datasources.get_db().create_session()
    try:
        self.crawl(num)
        self.prepossess()
    finally:
        # Always release the session; leave no dangling reference behind.
        datasources.get_db().close_session(self.sqlsession)
        self.sqlsession = None
    print('wait... we need to init IndexHolder...')
    # Force-refresh so the indexes pick up the freshly crawled data.
    indexes.IndexHolder().init(force_refresh=True)
def suggest_autocomplete(word_regex_list, num=None):
    """Suggest completions for the last term of *word_regex_list*.

    Collects word texts matching the final (possibly partial) term and
    keeps the *num* highest-scoring ones by average tf-idf.

    :param word_regex_list: the query terms typed so far; only the last
        one is completed.  An empty list yields an empty suggestion list
        (the original code raised ``IndexError`` here).
    :param num: maximum number of completions; defaults to
        ``config.functions_config.autocomplete_default_number``.
    :return: list of candidate word texts.
    """
    # Guard before touching config/indexes so an empty query is cheap.
    if not word_regex_list:
        return []
    if num is None:
        num = config.functions_config.autocomplete_default_number
    candidate_texts = indexes.IndexHolder().word_text_index.collect(
        word_regex_list[-1])
    return filters.filter_by_avgtfidf(candidate_texts, num)
def filter_by_avgtfidf(word_texts, keep_num):
    """Keep the *keep_num* word texts with the best tf-idf-style score.

    If the input already fits within *keep_num* it is returned untouched.
    Each word is scored as ``(cf + 2) / log(df + 2)`` using the vocab
    index; higher scores win.  # TODO: clarify what "to be absorb" meant
    in the original note about vocab_index.collect.
    """
    if keep_num >= len(word_texts):
        return word_texts
    vocab_entries = indexes.IndexHolder().vocab_index.collect(word_texts)
    scored = [
        ((entry.cf + 2) / math.log(entry.df + 2), entry)
        for entry in vocab_entries
        if entry is not None
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    ranked_texts = [entry.text for _, entry in scored]
    return ranked_texts[:keep_num]
def filter_by_coocurrence(word_texts_list, keep_num):
    """Keep the *keep_num* word-text groups with the highest
    co-occurrence score.

    If the input already fits within *keep_num* it is returned untouched.
    Each group is scored via the co-occurrence index; higher scores win.
    """
    if keep_num >= len(word_texts_list):
        return word_texts_list
    scored = sorted(
        ((indexes.IndexHolder().word_coocurrence_index.collect(group), group)
         for group in word_texts_list),
        key=lambda item: item[0],
        reverse=True,
    )
    ranked_groups = [group for _, group in scored]
    return ranked_groups[:keep_num]