def _greedy_search(self, generate_next_keywords, search_type):
    """Greedily build keyword queries for every claim in the DB and persist them.

    For each claim, repeatedly grows (or shrinks — depending on
    ``generate_next_keywords``) a working keyword set one word at a time,
    evaluating each candidate with ``self.eval_keywords_for_claim`` and keeping
    the word that minimizes the reported ``distance`` while still returning
    more than ``self._min_tweet_count`` tweets.

    :param generate_next_keywords: callable(base_keywords, word) -> iterable of
        keywords; defines the search direction (e.g. add vs. remove a word —
        presumably paired with ``search_type`` by the caller; confirm).
    :param search_type: label used in persisted type names and logging; the
        value ``'top_down'`` selects the full description as the starting
        keyword set, anything else starts from an empty set.
    """
    claims = self._db.get_claims()
    min_tweets = self._min_tweet_count
    # word_tf_idf_dict = self._keywords_generator.get_word_tf_idf_of_claims()
    for i, claim in enumerate(claims):
        # Per-claim cache: keywords string -> tweet count (defaults to 0).
        keywords_tweet_num = defaultdict(int)
        # Resume support: skip claims already processed in a previous run.
        if i < self._start_from_claim:
            continue
        # Counter keyed by keywords string, valued with NEGATED distance so
        # that Counter.most_common() yields the smallest distances first.
        walked_keywords = Counter()
        print('{} Claim {}/{}'.format(search_type, i, len(claims)))
        start = timeit.default_timer()
        claim_description_words = self.get_claim_words_from_description(
            claim)
        ordered_words = OrderedSet(claim_description_words)
        # 'top_down' starts from all description words; otherwise start empty
        # and let generate_next_keywords add words one at a time.
        base_keywords = ordered_words if search_type == 'top_down' else set(
        )
        # num_of_potential_words = len(ordered_words)
        num_of_potential_words = self._max_keywords_size
        keywords_list = []
        for size in range(1, num_of_potential_words + 1):
            # Candidates evaluated at this keyword-set size (negated distance).
            same_keywords_size = Counter()
            # [word, distance] of the best candidate this round; 1000 is a
            # sentinel "worse than anything real" starting distance.
            best_word_rank_tuple = ['', 1000]
            # NOTE: `iter` shadows the builtin; kept as-is (doc-only change).
            for iter, word in enumerate(ordered_words):
                keywords_str = ' '.join(
                    generate_next_keywords(base_keywords, word))
                keywords_size = len(keywords_str.split(' '))
                type_name = '{}_iter_{}_keywords_size_{}'.format(
                    search_type, iter, keywords_size)
                evaluation = self.eval_keywords_for_claim(
                    claim, keywords_str, type_name)
                # Accept the candidate only if it improves distance AND still
                # retrieves enough tweets.
                if best_word_rank_tuple[1] > evaluation[
                        'distance'] and evaluation[
                            'tweet_num'] > min_tweets:
                    best_word_rank_tuple = [word, evaluation['distance']]
                print('\r{} Distance: {}'.format(type_name,
                                                 evaluation['distance']),
                      end='')
                keywords_tweet_num[keywords_str] = evaluation['tweet_num']
                if evaluation['tweet_num'] > min_tweets:
                    # Negate so most_common() ranks by ascending distance.
                    walked_keywords[
                        keywords_str] = -1.0 * evaluation['distance']
                    same_keywords_size[
                        keywords_str] = -1.0 * evaluation['distance']
                keywords_list.append(
                    [keywords_str, evaluation['distance'], type_name])
            if ordered_words:
                if best_word_rank_tuple[0] == '':
                    # No candidate met the tweet threshold: consume an
                    # arbitrary remaining word so the search still advances.
                    best_word_rank_tuple[0] = ordered_words.pop()
                else:
                    ordered_words.discard(best_word_rank_tuple[0])
                # Commit the chosen word into the working keyword set.
                base_keywords = generate_next_keywords(
                    base_keywords, best_word_rank_tuple[0])
            curr_distance = best_word_rank_tuple[1]
            # Persist the best candidate seen at this size (or the current
            # base keywords if nothing passed the tweet threshold).
            if len(same_keywords_size) > 0:
                keywords, best_distances = same_keywords_size.most_common(
                    1)[0]
                self._add_new_keywords(
                    claim, keywords,
                    '{}_keywords_size_{}'.format(search_type, size),
                    -1.0 * best_distances, keywords_tweet_num[keywords])
            else:
                self._add_new_keywords(
                    claim, ' '.join(base_keywords),
                    '{}_keywords_size_{}'.format(search_type, size),
                    curr_distance,
                    keywords_tweet_num[' '.join(base_keywords)])
        # Persist every evaluated candidate with its per-iteration type name.
        for keywords, keywords_distance, type_name in keywords_list:
            self._add_new_keywords(claim, keywords, type_name,
                                   keywords_distance,
                                   keywords_tweet_num[keywords])
        # Choose the final top-N keyword sets for this claim.
        if len(walked_keywords) > 0:
            # most_common on negated distances -> N smallest distances.
            keywords, best_distances = list(
                zip(*(walked_keywords.most_common(
                    self._output_keywords_count))))
        else:
            # NOTE(review): this fallback sorts RAW distances with
            # reverse=True, i.e. picks the LARGEST distances, and the values
            # are not negated like walked_keywords' — the sign fed into
            # -1.0 * np.mean(...) below therefore differs between the two
            # branches. Looks inconsistent; confirm intent before changing.
            sorted_by_second = sorted(keywords_list,
                                      key=lambda tup: tup[1],
                                      reverse=True)
            keywords, best_distances, type_name = list(
                zip(*sorted_by_second[:self._output_keywords_count]))
        self._add_new_keywords(
            claim, '||'.join(keywords), '{}_final'.format(search_type),
            -1.0 * np.mean(best_distances),
            sum(list(keywords_tweet_num[k] for k in keywords)))
        # Flush accumulated keyword-connection posts for this claim;
        # no_autoflush avoids premature SQLAlchemy session flushes.
        with self._db.session.no_autoflush:
            self._db.addPosts(self._keywords_connections)
            self._keywords_connections = []
        end = timeit.default_timer()
        print('run time: {}'.format((end - start)))
def test_discard():
    """discard() removes present values and silently ignores absent ones."""
    items = OrderedSet(range(100))
    # 0..99 are members; 100..199 are not — discard must never raise.
    for candidate in range(200):
        items.discard(candidate)
    assert len(items) == 0