Example #1
0
 def arbitrate(self):
     """Select a pattern from the general ranks and the currently active timed ranks.

     Two draws are made with get_weighted_random_dict_key: the first picks
     between the PERCENTAGE and WEIGHT groups, the second picks a key from the
     dice built by __get_dice for the chosen group.

     Returns the key of the second draw, or None when the chosen group has
     neither general holders nor timed ranks.
     """
     timed_percentage_rank, timed_weight_rank = self.__get_current_timed_rank()
     timed_percentage_count = sum(
         timed_percentage_rank[pattern].value for pattern in timed_percentage_rank)
     total_percentage_count = self.__general_percentage_count + timed_percentage_count

     # The percentage total is measured against 100 unless it already exceeds
     # it; in that case the total itself becomes the ceiling (WEIGHT gets 0).
     is_over_limit = total_percentage_count > 100
     max_percentage = total_percentage_count if is_over_limit else 100
     if is_over_limit and not RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__:
         # Class-level flag: the warning is emitted only once.
         warning(
             "total percentage count value '%s' is greater than 100" %
             total_percentage_count)
         RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__ = True

     group_dice = {
         PERCENTAGE: RankHolder(total_percentage_count),
         WEIGHT: RankHolder(max_percentage - total_percentage_count),
     }
     if PERCENTAGE is get_weighted_random_dict_key(group_dice):
         general_holders = self.__general_percentage_holder
         timed_ranks = timed_percentage_rank
     else:
         general_holders = self.__general_weight_holder
         timed_ranks = timed_weight_rank

     if not (general_holders or timed_ranks):
         return None
     return get_weighted_random_dict_key(self.__get_dice(general_holders, timed_ranks))
Example #2
0
 def arbitrate(self):
     """Return the pattern selected from the general and currently timed ranks.

     First a draw between the PERCENTAGE and WEIGHT groups decides which set
     of holders is used; then a second draw over the dice built by __get_dice
     yields the pattern. Returns None if the chosen group is empty on both
     the general and the timed side.
     """
     timed_percentage_rank, timed_weight_rank = self.__get_current_timed_rank()
     timed_percentage_count = sum(
         timed_percentage_rank[pattern].value for pattern in timed_percentage_rank)
     total_percentage_count = self.__general_percentage_count + timed_percentage_count

     max_percentage = 100
     if total_percentage_count > max_percentage:
         # Over 100: use the total as the ceiling so WEIGHT ends up with 0.
         max_percentage = total_percentage_count
         already_warned = RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__
         if not already_warned:
             # Printed once; the class-level flag suppresses repeats.
             print("[warning] total percentage count value '%s' is greater than 100" % total_percentage_count)
             RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__ = True

     percentage_share = RankHolder(total_percentage_count)
     weight_share = RankHolder(max_percentage - total_percentage_count)
     picked_percentage = PERCENTAGE is get_weighted_random_dict_key(
         {PERCENTAGE: percentage_share, WEIGHT: weight_share})

     if picked_percentage:
         general_holders, timed_ranks = self.__general_percentage_holder, timed_percentage_rank
     else:
         general_holders, timed_ranks = self.__general_weight_holder, timed_weight_rank

     if not (general_holders or timed_ranks):
         return None
     pattern_dice = self.__get_dice(general_holders, timed_ranks)
     return get_weighted_random_dict_key(pattern_dice)
Example #3
0
 def choose_url(new_objs, old_objs):
     """Pick one key from the new and old object dicts.

     - If there is at least one new object and the old objects do not
       outnumber the new ones, `old_objs` is merged into `new_objs`
       (in place) and a key is drawn with get_random_dict_key.
     - If there are no old objects at this point, NA is returned.
     - Otherwise a 50%/50% draw decides between a key from `new_objs`
       (only possible when it is non-empty) and a weighted key from
       `old_objs`.
     """
     # TODO(review): the original carried a "... support setting..." note here,
     # presumably meaning the selection ratio should become configurable.
     new_count, old_count = len(new_objs), len(old_objs)
     has_new = new_count > 0

     if has_new and new_count + old_count <= 2 * new_count:
         new_objs.update(old_objs)
         return get_random_dict_key(new_objs)
     if old_count <= 0:
         return NA

     # Throw a dice with 50%/50% probability of choosing a new or an old object.
     if has_new and random.randrange(0, 2) == 1:
         return get_random_dict_key(new_objs)

     def has_no_encoding(image_slot):
         return NA == image_slot.encoding

     return get_weighted_random_dict_key(old_objs, bypass=has_no_encoding)
Example #4
0
 def choose_url(new_objs, old_objs):
     """Choose a key from `new_objs` / `old_objs`.

     When new objects exist and are at least as numerous as the old ones,
     the old objects are merged into `new_objs` (mutating it) and a key is
     drawn from the merged dict. With no old objects left to consider, NA is
     returned. Otherwise the choice between new and old is a coin flip, the
     old side being drawn with get_weighted_random_dict_key.
     """
     # TODO(review): the original carried a "... support setting..." note here,
     # presumably meaning the selection ratio should become configurable.
     new_count = len(new_objs)
     old_count = len(old_objs)
     any_new = new_count > 0

     if any_new and new_count + old_count <= new_count * 2:
         new_objs.update(old_objs)
         picked = get_random_dict_key(new_objs)
     elif old_count <= 0:
         picked = NA
     elif any_new and random.randrange(0, 2) == 1:
         # 50%/50% dice landed on the new objects.
         picked = get_random_dict_key(new_objs)
     else:
         picked = get_weighted_random_dict_key(
             old_objs, bypass=lambda image_slot: NA == image_slot.encoding)
     return picked
Example #5
0
 def crawl(self, pattern, size_list, option="", print_url=False):
     """Search for `pattern` and return the collected urls.

     output: urls, is_new_result

     - A recent cached result for the search key is returned as
       ``(urls, False)`` without searching.
     - ``(None, False)`` is returned when the network is not reachable or
       searching has been stopped.
     - Otherwise requests are issued until TARGET_SEARCH_RESULT_SIZE results
       have been tried or a request fails. The de-duplicated urls gathered
       so far are returned together with the success flag of the last
       request; the result is cached only on success.

     :param pattern: search target
     :param size_list: sizes to search with; must be non-empty
     :param option: extra search option, also part of the cache key
     :param print_url: when true, every acquired url is shown
     """
     show(get_msg(Msg.search_target), "\"" + pattern + "\"")
     key = Crawler.get_search_key(pattern, option)
     urls, size_ratio = self.get_recent_result(key)
     if urls:
         return urls, False
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         return None, False
     assert size_list and (not size_ratio or isinstance(size_ratio, dict))
     dice = Crawler.get_dice(size_list, size_ratio)
     urls = []
     next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new result (initial with 0)
     start = {size: 1 for size in size_list}  # key: size, value: next search start offset (start from 1 by google)
     tried_size = 0
     # Bind `success` up front: if the loop below never runs (non-positive
     # target size) it would otherwise be unbound when read after the loop.
     success = False
     while tried_size < TARGET_SEARCH_RESULT_SIZE:
         chosen_size = get_weighted_random_dict_key(dice)
         this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
         if not success:
             break
         urls += this_urls
         new_result = self.get_this_time_new_result_num(key, this_urls)
         # Fall back to the raw result count when the number of new results is not available.
         next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
         start[chosen_size] += G_SEARCH_PER_REQ_SIZE
         tried_size += G_SEARCH_PER_REQ_SIZE
     # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result)
     urls = list(set(urls))
     if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
         info("%s:%s, %s:%i" % (
             get_msg(Msg.target), pattern,
             get_msg(Msg.acquired_url_count), len(urls)))
     if print_url:
         for url in urls:
             show(url)
     if success:
         # Every size keeps a ratio of at least 1 in the cached result.
         next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                            for size in next_size_ratio}
         self.cache_url(key, urls, next_size_ratio)
     return urls, success
Example #6
0
 def crawl(self, pattern, size_list, option="", print_url=False):
     """Search for `pattern` and return the collected urls.

     output: urls, is_new_result

     - A recent cached result for the search key is returned as
       ``(urls, False)`` without searching.
     - ``(None, False)`` is returned when the network is not reachable or
       searching has been stopped.
     - Otherwise requests are issued until get_search_size() results have
       been tried or a request fails. The de-duplicated urls gathered so far
       are returned together with the success flag of the last request; the
       result is cached only on success.

     :param pattern: search target
     :param size_list: sizes to search with; must be non-empty
     :param option: extra search option, also part of the cache key
     :param print_url: when true, every acquired url is written to the debug log
     """
     debug("[search] search target: \"%s\"" % pattern)
     key = Crawler.get_search_key(pattern, option)
     urls, size_ratio = self.get_recent_result(key)
     if urls:
         return urls, False
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         return None, False
     assert size_list and (not size_ratio or isinstance(size_ratio, dict))
     dice = Crawler.get_dice(size_list, size_ratio)
     urls = []
     next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new result (initial with 0)
     start = {size: 1 for size in size_list}  # key: size, value: next search start offset (start from 1 by google)
     tried_size = 0
     # Bind `success` up front: if get_search_size() is not positive the loop
     # below never runs and `success` would be unbound when read after it.
     success = False
     while tried_size < get_search_size():
         chosen_size = get_weighted_random_dict_key(dice)
         this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
         if not success:
             break
         urls += this_urls
         new_result = self.get_this_time_new_result_num(key, this_urls)
         # Fall back to the raw result count when the number of new results is not available.
         next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
         start[chosen_size] += G_SEARCH_PER_REQ_SIZE
         tried_size += G_SEARCH_PER_REQ_SIZE
     # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result)
     urls = list(set(urls))
     if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
         info("target:%s, acquired url count:%i" % (pattern, len(urls)))
     if print_url:
         for url in urls:
             debug("[search] %s" % url)
     if success:
         # Every size keeps a ratio of at least 1 in the cached result.
         next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                            for size in next_size_ratio}
         self.cache_url(key, urls, next_size_ratio)
     return urls, success
Example #7
0
 def get_graph(self):
     """Return ``(full path, digest)`` of a graph file drawn from the status cache.

     The file name is drawn with get_weighted_random_dict_key and joined to
     the instance location with get_delim(). ``(NA, NA)`` is returned when
     the instance is not valid.
     """
     if self.__valid:
         chosen_file = get_weighted_random_dict_key(self.__status_cache)
         chosen_path = self.__location + get_delim() + chosen_file
         return chosen_path, self.get_graph_digest(chosen_file)
     return NA, NA
Example #8
0
 def get_graph(self):
     """Draw a graph file from the status cache.

     Returns a pair: the file's location-prefixed path and the value of
     get_graph_digest for the file name. Returns ``(NA, NA)`` when the
     instance is not valid.
     """
     if not self.__valid:
         return NA, NA
     name = get_weighted_random_dict_key(self.__status_cache)
     prefix = self.__location + get_delim()
     full_path = prefix + name
     digest = self.get_graph_digest(name)
     return full_path, digest