Example #1
 def set_graph(self, image_obj, graph_file=NA):
     self.__cur_image_obj = image_obj
     digest = None
     if NA == graph_file:
         graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
                              GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
     if NA == graph_file:
         return False
     debug("[view] %s" % graph_file)
     with open(graph_file, 'rb') as f:
         try:
             image = GraphViewer.get_image(f)
         except IOError as e:
             f.close()  # close f here because we are going to delete the file below
             # some images cannot be opened (maybe not an image format?); err msg is 'cannot identify image file'
             info("fail to open image: %s" % str(e))
             GraphFetcher().handle_image(graph_file, DELETE)
             return False
         # we hit "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
         except ValueError as e:
             info("fail to open image: %s" % str(e))
             return False
     self.__cur_graph_file = graph_file
     self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
     if digest:
         digest_str = digest + "\n"
     else:
         digest_str = "%s:%s\n" % ("path", graph_file)
     self.__cur_digest = digest_str + "size:%sx%s" % (image.size[0], image.size[1])
     self.select_phrase(image_obj.pattern)
     return self.set_graph_content(graph_file, image)
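The two except clauses above line up with how PIL behaves: Image.open raises IOError ("cannot identify image file") for data it cannot recognize, and decoding may raise ValueError such as the "Decompressed Data Too Large" case noted in the comment. Below is a minimal sketch of what GraphViewer.get_image could look like if it wraps PIL; the staticmethod, the load() call, and the truncated-image setting are assumptions, not the project's confirmed code.

from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True  # tolerate slightly damaged files (assumed preference)


class GraphViewer(object):
    @staticmethod
    def get_image(fd):
        image = Image.open(fd)  # may raise IOError: "cannot identify image file"
        image.load()            # force decoding; may raise ValueError (e.g. oversized PNG text chunk)
        return image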
Example #2
 def get_recent_result(self, key):
     """output: urls, size_ratio"""
     if key not in self.__url_map:
         return None, None
     [retrieved_date, new_result, urls, size_ratio] = self.__url_map[key]
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         debug("[search] use previous search result (due to no network connection)")
         # though size_ratio can be valid, we do not return it for caller usage is not expected
         return urls, None
     # spec.: we will execute a new search when there were enough new results in the previous search
     #       => if the previous search yielded n new results out of m results in total, we search again after m/n days
     #       => if all previous results were new, we search again after 1 day
     #       => if no previous result was new, we search again after get_search_size() days
     valid_day_size = len(urls) / new_result if new_result > 0 else \
         1 if NA is new_result else \
         get_search_size()  # new_result == 0 => no new result before
     from util.global_def import get_latency
     valid_day_size *= get_latency()
     current_date = datetime.today()
     date_diff = current_date - retrieved_date
     if date_diff > timedelta(days=valid_day_size):  # 'valid_day_size' is the valid duration of search result
         return None, size_ratio
     to_next_query = timedelta(days=valid_day_size) - date_diff
     hours, remainder = divmod(to_next_query.seconds, 3600)
     minutes, seconds = divmod(remainder, 60)
     debug("[search] to next search: %i days %i hours %i minutes %i seconds, current url count: %i" %
           (to_next_query.days, hours, minutes, seconds, len(urls)))
     # though size_ratio can be valid, we do not return it because the caller is not expected to use it
     return urls, None
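To make the spec comment concrete: if the previous search returned 30 URLs and 10 of them were new, the cached result stays valid for 30/10 = 3 days, scaled by get_latency(). The standalone sketch below restates that rule; NA, the default search size, and the latency value are stand-ins, not the project's real definitions.

NA = None  # stand-in for the project's NA sentinel


def days_until_next_search(url_count, new_result, search_size=8, latency=1.0):
    if new_result is NA:                  # nothing known about the previous run
        days = 1
    elif new_result > 0:                  # m results, n new -> search again in m/n days
        days = url_count / float(new_result)
    else:                                 # no new result last time -> wait the longest
        days = search_size
    return days * latency


print(days_until_next_search(30, 10))     # 3.0 days
print(days_until_next_search(30, 30))     # 1.0 day
print(days_until_next_search(30, 0))      # 8.0 days (assumed search_size)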
Example #3
 def select_pattern(self):
     if self.__arbitrator.is_active():
         choice_pattern = None
         while not choice_pattern:
             choice_pattern = self.__arbitrator.arbitrate()
             if not choice_pattern:
                 debug("[view] no available image now, will wait for ten minutes...")
                 self.__root.withdraw()
                 import time
                 time.sleep(600)
         self.__root.deiconify()
         return self.__cur_image_obj_dict[choice_pattern]
     return random.choice(self.__cur_image_obj_list)
Example #4
def load(pickle_file):
    """output: is_exist, value"""
    try:
        pickle_fd = open(pickle_file, "rb")
    except IOError as err:
        if errno.ENOENT == err.errno:
            debug("cache file does not exist: %s" % pickle_file)
            return False, None
        assert False
    try:
        value = cPickle.load(pickle_fd)
        return True, value
    except (ValueError, UnpicklingError, EOFError):
        error("cannot read pickle file: %s, suggest re-fetch the pickle file" %
              pickle_file)
        assert False
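Example #5 below persists the updated cache with a save() counterpart to this load(). A symmetric sketch of what such a helper could look like, assuming the same cPickle module and binary file mode; the project's actual version may differ.

import cPickle  # 'pickle' on Python 3


def save(pickle_file, value):
    # binary mode to match load()'s "rb"; highest protocol for compact output
    with open(pickle_file, "wb") as pickle_fd:
        cPickle.dump(value, pickle_fd, cPickle.HIGHEST_PROTOCOL)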
Example #5
 def fetch(self, pattern):
     self.__has_write = False
     new_objs, old_objs = self.get_updated_url(pattern)
     debug("[fetch] total data count: %s" % str(len(new_objs) + len(old_objs)))
     url = self.choose_url(new_objs, old_objs)
     if NA == url:
         return NA, NA
     image_objs = old_objs
     image_objs.update(new_objs)
     image_slot = image_objs[url]
     graph_file, new_encoding = self.get_graph_file(pattern, url, image_slot.encoding)
     new_slot = ImageSlot(image_slot.timestamp, new_encoding, image_slot.rank)
     image_objs[url] = new_slot
     if self.__has_write:
         save(GraphFetcher.get_cache_file(pattern), image_objs)
     return graph_file, GraphFetcher.get_graph_digest(graph_file, image_objs[url])
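fetch() treats each cached entry as an immutable record with timestamp, encoding, and rank fields and rebuilds it when the encoding changes. A namedtuple along these lines would match that usage; this is an assumption about ImageSlot, not its actual definition.

from collections import namedtuple

ImageSlot = namedtuple("ImageSlot", ["timestamp", "encoding", "rank"])

# usage mirroring fetch(): keep timestamp and rank, swap the encoding
slot = ImageSlot(timestamp=1455500000, encoding="jpg", rank=3)
new_slot = ImageSlot(slot.timestamp, "png", slot.rank)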
Example #6
 def crawl(self, pattern, size_list, option="", print_url=False):
     """output: urls, is_new_result"""
     debug("[search] search target: \"%s\"" % pattern)
     key = Crawler.get_search_key(pattern, option)
     urls, size_ratio = self.get_recent_result(key)
     if urls:
         return urls, False
     if not self.__network_reachable or Crawler.__STOP_SEARCH:
         return None, False
     assert size_list and (not size_ratio or isinstance(size_ratio, dict))
     dice = Crawler.get_dice(size_list, size_ratio)
     urls = []
     next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new result (initial with 0)
     start = {size: 1 for size in size_list}  # key: size, value: next search start offset (start from 1 by google)
     tried_size = 0
     while tried_size < get_search_size():
         chosen_size = get_weighted_random_dict_key(dice)
         this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
         if not success:
             break
         urls += this_urls
         new_result = self.get_this_time_new_result_num(key, this_urls)
         next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
         start[chosen_size] += G_SEARCH_PER_REQ_SIZE
         tried_size += G_SEARCH_PER_REQ_SIZE
     # 'set' to filter out duplicated items (not expected, but we found Google search may return duplicates)
     urls = list(set(urls))
     if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
         info("target:%s, acquired url count:%i" % (pattern, len(urls)))
     if print_url:
         for url in urls:
             debug("[search] %s" % url)
     if success:
         next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                            for size in next_size_ratio}
         self.cache_url(key, urls, next_size_ratio)
     return urls, success
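get_weighted_random_dict_key() is not shown here; the way it is used, the 'dice' maps each image size to a weight and a key is drawn in proportion to its weight. One possible implementation, as an illustration only rather than the project's code:

import random


def get_weighted_random_dict_key(weight_by_key):
    """Pick a key with probability proportional to its non-negative weight."""
    total = sum(weight_by_key.values())
    threshold = random.uniform(0, total)
    running = 0
    chosen = None
    for key, weight in weight_by_key.items():
        chosen = key
        running += weight
        if running >= threshold:
            break
    return chosen


# sizes that produced more new results last time get searched more often
print(get_weighted_random_dict_key({"large": 3, "medium": 1, "icon": 1}))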
Example #7
 def prepare_for_next_view(self, wait_time, msg=None):
     if msg:
         debug("[view] %s" % msg)
     job = self.__root.after(int(wait_time), lambda: self.timer_action())
     self.__pending_jobs.append(job)
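The job id returned by after() is collected in __pending_jobs, presumably so the scheduled callbacks can be cancelled later with after_cancel(). A minimal standalone illustration of that pairing; the names here are illustrative, not the project's.

import Tkinter as tk  # 'tkinter' on Python 3

root = tk.Tk()
pending_jobs = []


def schedule(wait_ms, action):
    pending_jobs.append(root.after(int(wait_ms), action))


def cancel_all_pending():
    while pending_jobs:
        root.after_cancel(pending_jobs.pop())


schedule(1000, lambda: None)  # queue a dummy callback one second out
cancel_all_pending()          # and drop it again before it fires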