Ejemplo n.º 1
0
def _collect_similar_candidates(word_text):
    """Return similar-word candidates for *word_text*.

    Queries the word-text index with the ``SIMILAR`` collection action and
    passes the result through ``filters.filter_by_avgtfidf`` together with
    ``config.functions_config.similar_search_candidate_number``.
    """
    similar = indexes.IndexHolder().word_text_index.collect(
        word_text,
        action=indexes.IndexHolder().word_text_index.CollectionAction.SIMILAR)
    return filters.filter_by_avgtfidf(
        similar, config.functions_config.similar_search_candidate_number)


def suggest_similar_search(word_regex_list, num=None):
    """Build word groups similar to the query and keep the best *num*.

    Entries of *word_regex_list* containing ``'*'`` are handled as wildcard
    patterns, every other entry as a plain word.

    * If at least one wildcard pattern yields candidates, the cartesian
      product of the per-pattern candidates forms the groups.  Each group is
      either randomly filtered on its own (wildcards already fill the
      configured group length, or there are no plain words) or extended with
      a random selection of the plain words.
    * Otherwise, for every plain word, each similar candidate that differs
      from the word itself is combined with a random selection of the
      remaining plain words.

    Args:
        word_regex_list: Query terms; wildcard terms contain ``'*'``.
        num: Value handed to ``filters.filter_by_coocurrence`` as the number
            of groups to keep.  ``None`` selects
            ``config.functions_config.similar_search_default_number``.

    Returns:
        The result of ``filters.filter_by_coocurrence`` applied to the
        generated groups.
    """
    if num is None:
        num = config.functions_config.similar_search_default_number

    wildcard_patterns = [term for term in word_regex_list if '*' in term]
    plain_words = [term for term in word_regex_list if '*' not in term]
    group_length = config.functions_config.similar_search_candidate_length

    if wildcard_patterns:
        # Only patterns that produced at least one candidate take part in
        # the product below.
        per_pattern_candidates = []
        for pattern in wildcard_patterns:
            matches = _collect_similar_candidates(pattern)
            if matches:
                per_pattern_candidates.append(matches)

        if per_pattern_candidates:
            combos = [
                list(combo)
                for combo in itertools.product(*per_pattern_candidates)
            ]
            wildcard_only = (len(wildcard_patterns) >= group_length
                             or not plain_words)
            if wildcard_only:
                combos = [
                    filters.filter_by_random(combo, group_length)
                    for combo in combos
                ]
            else:
                # A fresh random selection of plain words is drawn for every
                # combination.
                for combo in combos:
                    combo.extend(
                        filters.filter_by_random(
                            plain_words,
                            group_length - len(wildcard_patterns)))
            return filters.filter_by_coocurrence(combos, num)

    # No usable wildcard candidates: vary one plain word at a time.
    groups = []
    for position, word in enumerate(plain_words):
        remaining = plain_words[:position] + plain_words[position + 1:]
        for candidate in _collect_similar_candidates(word):
            if candidate == word:
                # Suggesting the word itself would not change the query.
                continue
            companions = filters.filter_by_random(remaining, group_length - 1)
            groups.append([candidate] + companions)
    return filters.filter_by_coocurrence(groups, num)
Ejemplo n.º 2
0
 def test_filter_by_coocurrence(self):
     """Check which three word triples ``filter_by_coocurrence`` keeps.

     One news item is stored per index pair below, its content being the two
     referenced sample words joined by a space.  After force-refreshing the
     co-occurrence index, every 3-word combination of the samples is ranked
     and the three kept triples are compared with the expected ones.
     """
     samples = [
         "abate", "bolster", "buttress", "champion", "defend", "espouse",
         "support"
     ]
     # Index pairs into ``samples``; note that (0, 2) appears twice.
     paired_indices = [
         (0, 1), (0, 2), (0, 3), (0, 2), (1, 2), (1, 3), (3, 4)
     ]
     for left, right in paired_indices:
         content = ' '.join((samples[left], samples[right]))
         datasources.get_db().upsert_news_or_news_list(
             self.session,
             entities.news.NewsPlain(title='', content=content))
     indexes.IndexHolder().word_coocurrence_index.init(force_refresh=True)

     all_triples = list(itertools.combinations(samples, 3))
     kept = filters.filter_by_coocurrence(all_triples, 3)
     expected = [
         tuple(samples[k] for k in picks)
         for picks in ((0, 1, 2), (0, 1, 3), (0, 2, 3))
     ]
     self.assertListEqual(kept, expected)
Ejemplo n.º 3
0
 def update(self, num=20):
     """Crawl, preprocess, then force-refresh the index holder.

     A database session is created and stored on ``self.sqlsession`` for the
     duration of ``crawl`` and ``prepossess``.  The session is closed and the
     attribute reset to ``None`` even when one of those steps raises, so a
     failed update no longer leaks the session or leaves a stale reference
     behind.  The index refresh only runs after both steps succeeded.

     Args:
         num: Passed through to ``self.crawl``.
     """
     self.sqlsession = datasources.get_db().create_session()
     try:
         self.crawl(num)
         self.prepossess()
     finally:
         # Always release the session, whether or not the work succeeded.
         datasources.get_db().close_session(self.sqlsession)
         self.sqlsession = None
     print('wait...  we need to init IndexHolder...')
     indexes.IndexHolder().init(force_refresh=True)
Ejemplo n.º 4
0
def suggest_autocomplete(word_regex_list, num=None):
    """Suggest completions for the last term of the query.

    Only the final entry of *word_regex_list* is looked up in the word-text
    index; the collected texts are then passed through
    ``filters.filter_by_avgtfidf`` with *num*.

    Args:
        word_regex_list: Query terms; the last one is the term to complete.
        num: Value handed to ``filters.filter_by_avgtfidf`` as the number of
            texts to keep.  ``None`` selects
            ``config.functions_config.autocomplete_default_number``.

    Returns:
        An empty list when the query has no terms, otherwise the result of
        ``filters.filter_by_avgtfidf``.
    """
    # An empty query has no last term; return no suggestions instead of
    # failing with IndexError on ``word_regex_list[-1]``.
    if not word_regex_list:
        return []
    if num is None:
        num = config.functions_config.autocomplete_default_number
    # NOTE: an earlier, disabled variant re-ranked the candidates by
    # co-occurrence with the preceding query terms; only the tf-idf based
    # filter is applied here.
    candidate_texts = indexes.IndexHolder().word_text_index.collect(
        word_regex_list[-1])
    return filters.filter_by_avgtfidf(candidate_texts, num)
Ejemplo n.º 5
0
def filter_by_avgtfidf(word_texts, keep_num):
    """Keep the *keep_num* highest-graded word texts.

    When *word_texts* already holds no more than *keep_num* entries it is
    returned unchanged.  Otherwise the texts are resolved through the vocab
    index, entries resolved to ``None`` are dropped, and the remaining words
    are ordered by descending grade ``(cf + 2) / log(df + 2)``.

    Args:
        word_texts: Word texts to filter.
        keep_num: Maximum number of texts to return.

    Returns:
        ``word_texts`` itself on the short-circuit path, otherwise a list of
        the ``text`` attributes of the best-graded words.
    """
    if len(word_texts) <= keep_num:
        return word_texts

    def grade(word):
        # presumably cf/df are collection and document frequency -- confirm
        return (word.cf + 2) / math.log(word.df + 2)

    # TODO (carried over from the original): clarify this vocab lookup and
    # whether it should be absorbed elsewhere.
    vocab_words = indexes.IndexHolder().vocab_index.collect(word_texts)
    known_words = [word for word in vocab_words if word is not None]
    ranked = sorted(known_words, key=grade, reverse=True)
    ranked_texts = [word.text for word in ranked]
    return ranked_texts[:keep_num]
Ejemplo n.º 6
0
def filter_by_coocurrence(word_texts_list, keep_num):
    """Keep the *keep_num* word groups that rank highest by co-occurrence.

    When *word_texts_list* already holds no more than *keep_num* groups it is
    returned unchanged.  Otherwise each group is ranked by the value the
    co-occurrence index returns for it (presumably a co-occurrence score),
    in descending order; groups with equal values keep their input order.

    Args:
        word_texts_list: Groups of word texts to rank.
        keep_num: Maximum number of groups to return.

    Returns:
        ``word_texts_list`` itself on the short-circuit path, otherwise a
        list with the best-ranked groups.
    """
    if len(word_texts_list) <= keep_num:
        return word_texts_list

    def cooccurrence_grade(word_texts):
        return indexes.IndexHolder().word_coocurrence_index.collect(word_texts)

    ranked_groups = sorted(
        word_texts_list, key=cooccurrence_grade, reverse=True)
    return ranked_groups[:keep_num]