Example #1
 def get_recent_result(self, key):
     """output: urls, size_ratio"""
     if key not in self.__url_map:
         return None, None
     [retrieved_date, new_result, urls, size_ratio] = self.__url_map[key]
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         show(get_msg(Msg.use_previous_search_result))
         # though size_ratio may still be valid, we do not return it because the caller is not expected to use it
         return urls, None
     # spec: run a new search once enough of the previous search results were new
     #       => if the previous search yielded n new results out of m total, search again after m/n days
     #       => if every previous result was new, search again after 1 day
     #       => if no previous result was new, search again after 'TARGET_SEARCH_RESULT_SIZE' days
     # check the NA sentinel first so we never compare against or divide by it
     valid_day_size = 1 if NA is new_result else \
         len(urls) / new_result if new_result > 0 else \
         TARGET_SEARCH_RESULT_SIZE  # new_result == 0 => nothing new in the previous search
     from util.global_def import get_search_latency
     valid_day_size *= get_search_latency()
     current_date = datetime.today()
     date_diff = current_date - retrieved_date
     if date_diff > timedelta(days=valid_day_size):  # 'valid_day_size' is the valid duration of search result
         return None, size_ratio
     to_next_query = timedelta(days=valid_day_size) - date_diff
     hours, remainder = divmod(to_next_query.seconds, 3600)
     minutes, seconds = divmod(remainder, 60)
     show(get_msg(Msg.to_next_search),
          to_next_query.days, get_msg(Msg.day),
          hours, get_msg(Msg.hour),
          minutes, get_msg(Msg.minute),
          seconds, (get_msg(Msg.second) + ","), get_msg(Msg.current_url_count), len(urls))
     # though size_ratio may still be valid, we do not return it because the caller is not expected to use it
     return urls, None
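
A standalone sketch of the revisit-interval rule above, with plain stand-in values instead of the project's NA / TARGET_SEARCH_RESULT_SIZE globals (the function and constant names here are hypothetical):

from datetime import datetime, timedelta

TARGET_SEARCH_RESULT_SIZE = 64  # stand-in value for the project's constant

def days_until_next_search(total_results, new_results, retrieved_date, latency=1.0):
    # same rule as get_recent_result: n new results out of m => wait m/n days,
    # unknown new-count => 1 day, zero new results => TARGET_SEARCH_RESULT_SIZE days
    if new_results is None:  # the "NA" case
        valid_day_size = 1
    elif new_results > 0:
        valid_day_size = total_results / new_results
    else:
        valid_day_size = TARGET_SEARCH_RESULT_SIZE
    valid_day_size *= latency
    remaining = timedelta(days=valid_day_size) - (datetime.today() - retrieved_date)
    return max(remaining, timedelta(0))

# e.g. 20 of 100 previous results were new and the search is 1 day old:
# 100/20 = 5 days between searches, so roughly 4 days remain
print(days_until_next_search(100, 20, datetime.today() - timedelta(days=1)))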
Example #2
 def set_graph(self, image_obj, graph_file=NA):
     self.__cur_image_obj = image_obj
     digest = None
     if NA == graph_file:
         graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
                              GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
     if NA == graph_file:
         return False
     show(graph_file)
     with open(graph_file, 'rb') as f:
         try:
             image = GraphViewer.get_image(f)
         except IOError as e:
             f.close()  # close f explicitly because the file is deleted right below
             # some images cannot be opened (perhaps not really an image file?); the error message is 'cannot identify image file'
             info(get_msg(Msg.fail_to_open_image), str(e))
             GraphFetcher().handle_image(graph_file, DELETE)
             return False
         # we have seen "Decompressed Data Too Large" with ~/Inside Out/Image_124.jpg...
         except ValueError as e:
             info(get_msg(Msg.fail_to_open_image), str(e))
             return False
     self.__cur_graph_file = graph_file
     self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
     if digest:
         digest_str = digest + "\n"
     else:
         digest_str = "%s:%s\n" % (get_msg(Msg.path), graph_file)
     self.__cur_digest = digest_str + "%s:%sx%s" % (get_msg(Msg.size), image.size[0], image.size[1])
     self.select_phrase(image_obj.pattern)
     return self.set_graph_content(graph_file, image)
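
The defensive decode in set_graph can be reproduced standalone; a minimal sketch assuming Pillow sits underneath GraphViewer.get_image (that wrapping is an assumption, not something this snippet confirms):

from PIL import Image  # assumption: GraphViewer.get_image is a thin wrapper over Pillow

def try_open_image(path):
    """Return a fully loaded image, or None when the file is unusable."""
    try:
        with open(path, 'rb') as f:
            image = Image.open(f)
            image.load()  # force decoding while the file is still open
            return image
    except IOError as e:     # e.g. "cannot identify image file"
        print("fail to open image:", e)
    except ValueError as e:  # e.g. "Decompressed Data Too Large"
        print("fail to open image:", e)
    return None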
Example #3
 def select_pattern(self):
     if self.__arbitrator.is_active():
         import time
         choice_pattern = None
         while not choice_pattern:
             choice_pattern = self.__arbitrator.arbitrate()
             if not choice_pattern:
                 show(get_msg(Msg.no_available_image_wait_10_minutes))
                 self.__root.withdraw()  # hide the window while we wait
                 time.sleep(600)  # wait 10 minutes before asking the arbitrator again
         self.__root.deiconify()
         return self.__cur_image_obj_dict[choice_pattern]
     image_obj_size = len(self.__cur_image_obj_list)
     return self.__cur_image_obj_list[random.randrange(0, image_obj_size)]
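
The fallback branch of select_pattern is a uniform random pick; random.choice expresses the same thing, shown here on stand-in data:

import random

items = ["a", "b", "c"]  # stands in for self.__cur_image_obj_list
chosen = items[random.randrange(0, len(items))]  # the form used above
chosen = random.choice(items)                    # equivalent and a little more direct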
Example #4
def load(pickle_file):
    """output: is_exist, value"""
    try:
        pickle_fd = open(pickle_file, "rb")  # binary mode: pickle data is not text
    except IOError as err:
        if errno.ENOENT == err.errno:
            show(get_msg(Msg.cache_file_does_not_exist), pickle_file)
            return False, None
        assert False
    try:
        value = cPickle.load(pickle_fd)
        return True, value
    except (ValueError, UnpicklingError, EOFError):
        error(get_msg(Msg.cannot_read_pickle_file), pickle_file, get_msg(Msg.suggest_re_fetch_pickle_file))
        assert False
    finally:
        pickle_fd.close()
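
load() reads whatever its save counterpart pickled; a minimal round-trip sketch under that assumption (save_sketch is hypothetical, the project's real save() is not shown here):

import cPickle  # 'pickle' on Python 3

def save_sketch(pickle_file, value):
    # hypothetical counterpart to load() above
    with open(pickle_file, "wb") as fd:
        cPickle.dump(value, fd, cPickle.HIGHEST_PROTOCOL)

save_sketch("demo.pkl", {"urls": ["http://example.com"], "count": 1})
is_exist, value = load("demo.pkl")  # -> True, {'urls': ['http://example.com'], 'count': 1}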
Example #5
 def fetch(self, pattern):
     self.__has_write = False
     new_objs, old_objs = self.get_updated_url(pattern)
     show(get_msg(Msg.total_data_count), len(new_objs) + len(old_objs))
     url = self.choose_url(new_objs, old_objs)
     if NA == url:
         return NA, NA
     image_objs = old_objs
     image_objs.update(new_objs)  # merge into one url -> ImageSlot map; new entries take precedence
     image_slot = image_objs[url]
     graph_file, new_encoding = self.get_graph_file(pattern, url, image_slot.encoding)
     new_slot = ImageSlot(image_slot.timestamp, new_encoding, image_slot.rank)
     image_objs[url] = new_slot
     if self.__has_write:
         save(GraphFetcher.get_cache_file(pattern), image_objs)
     return graph_file, GraphFetcher.get_graph_digest(graph_file, image_objs[url])
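
fetch() treats ImageSlot as a small record of (timestamp, encoding, rank); a namedtuple with those fields behaves the way this code expects (the definition below is an assumption, not taken from the project):

from collections import namedtuple

ImageSlot = namedtuple("ImageSlot", ["timestamp", "encoding", "rank"])  # assumed shape only

slot = ImageSlot(timestamp=1700000000, encoding=None, rank=0)
updated = ImageSlot(slot.timestamp, "new-encoding", slot.rank)  # mirrors how fetch() rebuilds the slot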
Example #6
 def crawl(self, pattern, size_list, option="", print_url=False):
     """output: urls, is_new_result"""
     show(get_msg(Msg.search_target), "\"" + pattern + "\"")
     key = Crawler.get_search_key(pattern, option)
     urls, size_ratio = self.get_recent_result(key)
     if urls:
         return urls, False
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         return None, False
     assert size_list and (not size_ratio or isinstance(size_ratio, dict))
     dice = Crawler.get_dice(size_list, size_ratio)
     urls = []
     next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new results (initially 0)
     start = {size: 1 for size in size_list}  # key: size, value: next search start offset (Google starts from 1)
     tried_size = 0
     success = False  # guard against TARGET_SEARCH_RESULT_SIZE <= 0, where the loop body never runs
     while tried_size < TARGET_SEARCH_RESULT_SIZE:
         chosen_size = get_weighted_random_dict_key(dice)
         this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
         if not success:
             break
         urls += this_urls
         new_result = self.get_this_time_new_result_num(key, this_urls)
         next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
         start[chosen_size] += G_SEARCH_PER_REQ_SIZE
         tried_size += G_SEARCH_PER_REQ_SIZE
     # 'set' filters out duplicates (not expected, but Google search has been observed to return duplicated results)
     urls = list(set(urls))
     if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
         info("%s:%s, %s:%i" % (
           get_msg(Msg.target), pattern,
           get_msg(Msg.acquired_url_count), len(urls)))
     if print_url:
         for url in urls:
             show(url)
     if success:
         # replace zero counts with 1 so every size keeps a non-zero weight in the next dice roll
         next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                            for size in next_size_ratio}
         self.cache_url(key, urls, next_size_ratio)
     return urls, success
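
crawl() relies on get_weighted_random_dict_key to pick the next image size in proportion to how productive it was before; a minimal sketch of such a weighted pick (a hypothetical implementation, not the project's helper):

import random

def weighted_random_dict_key(dice):
    """Pick a key with probability proportional to its non-negative weight."""
    total = sum(dice.values())
    threshold = random.uniform(0, total)
    running = 0
    for key, weight in dice.items():
        running += weight
        if running >= threshold:
            return key
    return key  # floating-point edge case: fall back to the last key

sizes = {"large": 3, "medium": 5, "icon": 2}
print(weighted_random_dict_key(sizes))  # "medium" about half the time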
Example #7
 def prepare_for_next_view(self, wait_time, msg=None):
     if msg:
         show(msg)
     job = self.__root.after(int(wait_time), self.timer_action)  # after() takes the callable directly
     self.__pending_jobs.append(job)  # keep the id so the scheduled call can be cancelled later
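
The ids collected in self.__pending_jobs are what Tk's after() returns, which is exactly what after_cancel() needs; a small standalone sketch of scheduling and cancelling (widget and callback names here are illustrative only):

import Tkinter as tk  # 'tkinter' on Python 3

root = tk.Tk()
pending_jobs = []

def timer_action():
    print("timer fired")

job = root.after(1000, timer_action)  # after() takes a delay in ms and the callable
pending_jobs.append(job)

# later, e.g. when tearing the view down:
for job in pending_jobs:
    root.after_cancel(job)
del pending_jobs[:]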