def set_graph(self, image_obj, graph_file=NA):
    """Load `graph_file` (or pick one for `image_obj`), record it as the current
    graph, and hand it to the view.

    When `graph_file` is NA, a file is resolved either from the image object's
    local directory (GraphDirHandler) or by fetching via GraphFetcher.
    Returns False when no file could be resolved or the image cannot be opened;
    otherwise returns the result of set_graph_content().
    """
    self.__cur_image_obj = image_obj
    digest = None
    if NA == graph_file:
        # prefer a local directory source when the image object has a location;
        # otherwise fetch by pattern (may hit cache or network — see GraphFetcher)
        graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
            GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
    if NA == graph_file:
        return False
    debug("[view] %s" % graph_file)
    with open(graph_file, 'rb') as f:
        try:
            image = GraphViewer.get_image(f)
        except IOError as e:
            f.close()  # close f here for we are going to delete the file below
            # some image cannot be opened (maybe it's not image format?), err msg is 'cannot identify image file'
            info("fail to open image: %s" % str(e))
            # unreadable file: delete it so it is not picked again
            GraphFetcher().handle_image(graph_file, DELETE)
            return False
        # we met "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
        except ValueError as e:
            info("fail to open image: %s" % str(e))
            return False
    # success: record current file and push onto navigation history
    self.__cur_graph_file = graph_file
    self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
    if digest:
        digest_str = digest + "\n"
    else:
        # no fetcher-provided digest: fall back to the file path
        digest_str = "%s:%s\n" % ("path", graph_file)
    self.__cur_digest = digest_str + "size:%sx%s" % (image.size[0], image.size[1])
    self.select_phrase(image_obj.pattern)
    return self.set_graph_content(graph_file, image)
def get_recent_result(self, key):
    """output: urls, size_ratio"""
    if key not in self.__url_map:
        return None, None
    retrieved_date, new_result, urls, size_ratio = self.__url_map[key]
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        debug("[search] use previous search result (due to no network connection)")
        # though size_ratio can be valid, we do not return it for caller usage is not expected
        return urls, None
    # spec.: we will execute a new search when there is enough new result on previous search
    # => if previous new result is n, all result is m, we will have a new search after m/n days
    # => if all previous result is new, then after 1 day we will have a search
    # => if no previous result is new, then we will have a search after 'get_search_unit_size()' days
    if new_result > 0:
        valid_day_size = len(urls) / new_result
    elif NA is new_result:
        valid_day_size = 1
    else:
        # new_result = 0 => no new result before
        valid_day_size = get_search_size()
    from util.global_def import get_latency
    valid_day_size *= get_latency()
    elapsed = datetime.today() - retrieved_date
    validity = timedelta(days=valid_day_size)
    # 'valid_day_size' is the valid duration of search result
    if elapsed > validity:
        return None, size_ratio
    remaining = validity - elapsed
    hours, rem = divmod(remaining.seconds, 3600)
    minutes, seconds = divmod(rem, 60)
    debug("[search] to next search: %i days %i hours %i minutes %i seconds, current url count: %i" %
          (remaining.days, hours, minutes, seconds, len(urls)))
    # though size_ratio can be valid, we do not return it for caller usage is not expected
    return urls, None
def select_pattern(self):
    """Pick the image object to show next.

    With an active arbitrator, poll it until it yields a pattern (hiding the
    window and sleeping ten minutes between attempts); otherwise pick one of
    the current image objects uniformly at random.
    """
    if not self.__arbitrator.is_active():
        return random.choice(self.__cur_image_obj_list)
    while True:
        pattern = self.__arbitrator.arbitrate()
        if pattern:
            return self.__cur_image_obj_dict[pattern]
        debug("[view] no available image now, will wait for ten minutes...")
        self.__root.withdraw()
        import time
        time.sleep(600)
        self.__root.deiconify()
def load(pickle_file):
    """output: is_exist, value

    Load a cached value from `pickle_file`.
    Returns (False, None) when the file does not exist; (True, value) on success.
    Any other I/O error or a corrupt pickle aborts via `assert False`.
    """
    try:
        # fix: pickle data is binary — open in "rb", not text mode "r"
        pickle_fd = open(pickle_file, "rb")
    except IOError as err:
        if errno.ENOENT == err.errno:
            debug("cache file does not exist: %s" % pickle_file)
            return False, None
        # unexpected I/O failure (permission, etc.) is treated as fatal
        # NOTE(review): `assert` is stripped under -O; consider `raise` instead
        assert False
    try:
        value = cPickle.load(pickle_fd)
        return True, value
    except (ValueError, UnpicklingError, EOFError):
        error("cannot read pickle file: %s, suggest re-fetch the pickle file" % pickle_file)
        assert False
    finally:
        # fix: descriptor was previously leaked on every path
        pickle_fd.close()
def fetch(self, pattern):
    """Pick a URL for `pattern`, materialize its graph file, refresh the cached
    slot, and return (graph_file, graph_digest) — or (NA, NA) when no URL fits.
    """
    self.__has_write = False
    new_objs, old_objs = self.get_updated_url(pattern)
    debug("[fetch] total data count: %s" % str(len(new_objs) + len(old_objs)))
    url = self.choose_url(new_objs, old_objs)
    if NA == url:
        return NA, NA
    # merge both result sets into one mapping (old entries first, new override)
    merged = old_objs
    merged.update(new_objs)
    slot = merged[url]
    graph_file, refreshed_encoding = self.get_graph_file(pattern, url, slot.encoding)
    refreshed_slot = ImageSlot(slot.timestamp, refreshed_encoding, slot.rank)
    merged[url] = refreshed_slot
    if self.__has_write:
        # a helper flagged a change — persist the updated cache
        save(GraphFetcher.get_cache_file(pattern), merged)
    return graph_file, GraphFetcher.get_graph_digest(graph_file, refreshed_slot)
def crawl(self, pattern, size_list, option="", print_url=False):
    """output: urls, is_new_result

    Search images for `pattern`, preferring a recent cached result. Otherwise
    query google search in G_SEARCH_PER_REQ_SIZE batches (up to
    get_search_size() results), weighting the size choice by past yield.
    """
    debug("[search] search target: \"%s\"" % pattern)
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new result (initial with 0)
    start = {size: 1 for size in size_list}  # key: size, value: next search start offset (start from 1 by google)
    tried_size = 0
    # fix: 'success' was only assigned inside the loop, raising UnboundLocalError
    # below whenever the loop body never ran (e.g. get_search_size() == 0)
    success = False
    while tried_size < get_search_size():
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("target:%s, acquired url count:%i" % (pattern, len(urls)))
    if print_url:
        for url in urls:
            debug("[search] %s" % url)
    if success:
        # avoid zero weights so every size keeps a chance on the next search
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success
def prepare_for_next_view(self, wait_time, msg=None):
    """Arm the Tk timer: run timer_action after `wait_time` ms.

    The scheduled job id is kept so pending jobs can be cancelled later.
    An optional `msg` is logged before scheduling.
    """
    if msg:
        debug("[view] %s" % msg)
    job_id = self.__root.after(int(wait_time), lambda: self.timer_action())
    self.__pending_jobs.append(job_id)