Example #1
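A greedy keyword search over the words of a claim's description: the top_down variant starts from the full description while the bottom-up variant starts from an empty set, and each round evaluates every remaining candidate word, keeping the one whose keyword string has the smallest distance while still matching more than min_tweets tweets.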
    # Requires: from collections import Counter, defaultdict; import timeit;
    # import numpy as np; from ordered_set import OrderedSet
    def _greedy_search(self, generate_next_keywords, search_type):
        claims = self._db.get_claims()
        min_tweets = self._min_tweet_count
        for i, claim in enumerate(claims):
            keywords_tweet_num = defaultdict(int)
            # Skip claims that were already processed in a previous run.
            if i < self._start_from_claim:
                continue
            walked_keywords = Counter()
            print('{} Claim {}/{}'.format(search_type, i, len(claims)))
            start = timeit.default_timer()
            claim_description_words = self.get_claim_words_from_description(claim)
            ordered_words = OrderedSet(claim_description_words)
            # Top-down search starts from the full description; bottom-up starts empty.
            base_keywords = ordered_words if search_type == 'top_down' else set()
            num_of_potential_words = self._max_keywords_size
            keywords_list = []
            for size in range(1, num_of_potential_words + 1):
                same_keywords_size = Counter()
                # Sentinel: any real distance below 1000 will beat this.
                best_word_rank_tuple = ['', 1000]
                for word_index, word in enumerate(ordered_words):
                    keywords_str = ' '.join(generate_next_keywords(base_keywords, word))
                    keywords_size = len(keywords_str.split(' '))
                    type_name = '{}_iter_{}_keywords_size_{}'.format(
                        search_type, word_index, keywords_size)
                    evaluation = self.eval_keywords_for_claim(claim, keywords_str, type_name)
                    # Keep the candidate word with the smallest distance that still
                    # matches enough tweets.
                    if (evaluation['distance'] < best_word_rank_tuple[1]
                            and evaluation['tweet_num'] > min_tweets):
                        best_word_rank_tuple = [word, evaluation['distance']]
                    print('\r{} Distance: {}'.format(type_name, evaluation['distance']),
                          end='')
                    keywords_tweet_num[keywords_str] = evaluation['tweet_num']
                    if evaluation['tweet_num'] > min_tweets:
                        # Counter.most_common() returns the largest values first, so
                        # store negated distances to rank the smallest distance best.
                        walked_keywords[keywords_str] = -1.0 * evaluation['distance']
                        same_keywords_size[keywords_str] = -1.0 * evaluation['distance']
                    keywords_list.append([keywords_str, evaluation['distance'], type_name])
                if ordered_words:
                    if best_word_rank_tuple[0] == '':
                        # No word passed the filters; fall back to the last
                        # remaining word (OrderedSet.pop removes the last element).
                        best_word_rank_tuple[0] = ordered_words.pop()
                    else:
                        ordered_words.discard(best_word_rank_tuple[0])
                base_keywords = generate_next_keywords(base_keywords, best_word_rank_tuple[0])

                curr_distance = best_word_rank_tuple[1]
                if same_keywords_size:
                    # Best keyword string of this size (distances were negated above).
                    keywords, best_distance = same_keywords_size.most_common(1)[0]
                    self._add_new_keywords(
                        claim, keywords,
                        '{}_keywords_size_{}'.format(search_type, size),
                        -1.0 * best_distance, keywords_tweet_num[keywords])
                else:
                    self._add_new_keywords(
                        claim, ' '.join(base_keywords),
                        '{}_keywords_size_{}'.format(search_type, size),
                        curr_distance, keywords_tweet_num[' '.join(base_keywords)])

            for keywords, keywords_distance, type_name in keywords_list:
                self._add_new_keywords(claim, keywords, type_name,
                                       keywords_distance,
                                       keywords_tweet_num[keywords])

            if walked_keywords:
                keywords, best_distances = zip(
                    *walked_keywords.most_common(self._output_keywords_count))
            else:
                # Fall back to the lowest-distance candidates overall; negate the
                # distances so both branches use the same sign convention.
                sorted_by_distance = sorted(keywords_list, key=lambda tup: tup[1])
                keywords, distances, _ = zip(
                    *sorted_by_distance[:self._output_keywords_count])
                best_distances = [-1.0 * d for d in distances]

            self._add_new_keywords(
                claim, '||'.join(keywords), '{}_final'.format(search_type),
                -1.0 * np.mean(best_distances),
                sum(keywords_tweet_num[k] for k in keywords))
            with self._db.session.no_autoflush:
                self._db.addPosts(self._keywords_connections)
            self._keywords_connections = []
            end = timeit.default_timer()
            print('run time: {}'.format(end - start))
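
The loop above leans on two OrderedSet behaviors from the ordered-set package: pop() removes and returns the last element in insertion order, and discard() removes a value without raising when it is absent. Below is a minimal standalone sketch of the same greedy-selection pattern, where greedy_keywords and its score callable are hypothetical stand-ins for the method and eval_keywords_for_claim above, with no database or Twitter dependencies:

from ordered_set import OrderedSet

def greedy_keywords(words, score, max_size):
    # Greedily grow a keyword list, adding the best-scoring word each round.
    # score maps a keyword string to a distance (lower is better).
    candidates = OrderedSet(words)
    chosen = []
    for _ in range(max_size):
        if not candidates:
            break
        # Pick the candidate whose addition yields the smallest distance.
        best = min(candidates, key=lambda w: score(' '.join(chosen + [w])))
        candidates.discard(best)  # safe even if the value were already gone
        chosen.append(best)
    return chosen

# Toy usage: treat string length as the distance, preferring short keywords.
print(greedy_keywords(['climate', 'hoax', 'study'], score=len, max_size=2))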

Example #2

def test_discard():
    # Removing values 0..199 from a 100-element OrderedSet empties it;
    # discard() silently ignores the values that are not present.
    os = OrderedSet(range(100))
    for value in range(200):
        os.discard(value)
    assert len(os) == 0
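
For contrast, OrderedSet subclasses collections.abc.MutableSet, whose remove() raises KeyError for a missing value, while discard() stays silent; a short sketch:

from ordered_set import OrderedSet

os = OrderedSet([1, 2, 3])
os.discard(99)     # no-op: 99 is absent, nothing is raised
try:
    os.remove(99)  # remove() raises for missing values
except KeyError:
    print('remove() raised KeyError')
print(os)          # OrderedSet([1, 2, 3])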