def set_graph_content(self, graph_file, image=None):
    if image is None:
        try:
            image = GraphViewer.get_image(graph_file)
        except IOError as e:
            error(str(e))
            assert False
    self.__root.geometry(self.__full_geom if self.__fullscreen_mode
                         else '%dx%d+0+0' % (image.size[0], image.size[1]))
    if self.__fullscreen_mode:
        resize_width, resize_height, x_pos, y_pos = self.get_adjusted_geom(image.size[0], image.size[1])
        try:
            resized = image.resize((resize_width, resize_height), Image.ANTIALIAS)
        except IOError as e:  # an incompletely downloaded image may end up here
            info(get_msg(Msg.fail_to_convert_image_to_fullscreen), str(e))
            GraphFetcher().handle_image(graph_file, DISCARD)
            return False
        image = resized
    self.__root.title(self.__cur_image_obj.group_name)
    tk_image_obj = ImageTk.PhotoImage(image)
    self.__tk_obj_ref = tk_image_obj
    self.__canvas.delete('all')
    self.__canvas.create_image(x_pos if self.__fullscreen_mode else 0,
                               y_pos if self.__fullscreen_mode else 0,
                               image=tk_image_obj, anchor=Tkinter.NW)
    self.show_onscreen_help()
    self.show_onscreen_info()
    self.show_onscreen_phrase()
    return True

def set_graph_content(self, graph_file, image=None):
    if image is None:
        try:
            image = GraphViewer.get_image(graph_file)
        except IOError as e:
            error("[view] %s" % str(e))
            assert False
    self.__root.geometry(self.__full_geom if self.__fullscreen_mode
                         else '%dx%d+0+0' % (image.size[0], image.size[1]))
    if self.__fullscreen_mode:
        resize_width, resize_height, x_pos, y_pos = self.get_adjusted_geom(image.size[0], image.size[1])
        try:
            resized = image.resize((resize_width, resize_height), Image.ANTIALIAS)
        except IOError as e:  # an incompletely downloaded image may end up here
            info("fail to convert image to fullscreen: %s" % str(e))
            GraphFetcher().handle_image(graph_file, DISCARD)
            return False
        image = resized
    self.__root.title(self.__cur_image_obj.group_name)
    tk_image_obj = ImageTk.PhotoImage(image)
    self.__tk_obj_ref = tk_image_obj
    self.__canvas.delete('all')
    self.__canvas.create_image(x_pos if self.__fullscreen_mode else 0,
                               y_pos if self.__fullscreen_mode else 0,
                               image=tk_image_obj, anchor=Tkinter.NW)
    self.show_onscreen_help()
    self.show_onscreen_info()
    self.show_onscreen_phrase()
    return True
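# Both set_graph_content() variants above rely on self.get_adjusted_geom() to fit the image to the
# screen in fullscreen mode, but that helper does not appear in this section. The function below is
# only a minimal sketch of such a computation, assuming the screen size is passed in explicitly;
# the name get_adjusted_geom_sketch and the screen_width/screen_height parameters are illustrative,
# not the project's actual API.
def get_adjusted_geom_sketch(image_width, image_height, screen_width, screen_height):
    """Return (resize_width, resize_height, x_pos, y_pos) that letterboxes an
    image_width x image_height picture onto a screen_width x screen_height canvas."""
    scale = min(float(screen_width) / image_width, float(screen_height) / image_height)
    resize_width = int(image_width * scale)
    resize_height = int(image_height * scale)
    # center the resized image; the leftover margin is split evenly on both sides
    x_pos = (screen_width - resize_width) // 2
    y_pos = (screen_height - resize_height) // 2
    return resize_width, resize_height, x_pos, y_pos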
def set_graph(self, image_obj, graph_file=NA):
    self.__cur_image_obj = image_obj
    digest = None
    if NA == graph_file:
        graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
            GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
    if NA == graph_file:
        return False
    show(graph_file)
    with open(graph_file, 'rb') as f:
        try:
            image = GraphViewer.get_image(f)
        except IOError as e:
            f.close()  # close f explicitly because the file is deleted right below
            # some images cannot be opened (possibly not an image file at all);
            # the error message is 'cannot identify image file'
            info(get_msg(Msg.fail_to_open_image), str(e))
            GraphFetcher().handle_image(graph_file, DELETE)
            return False
        # we have seen "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
        except ValueError as e:
            info(get_msg(Msg.fail_to_open_image), str(e))
            return False
        self.__cur_graph_file = graph_file
        self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
        if digest:
            digest_str = digest + "\n"
        else:
            digest_str = "%s:%s\n" % (get_msg(Msg.path), graph_file)
        self.__cur_digest = digest_str + "%s:%sx%s" % (get_msg(Msg.size), image.size[0], image.size[1])
        self.select_phrase(image_obj.pattern)
        return self.set_graph_content(graph_file, image)

def set_graph(self, image_obj, graph_file=NA):
    self.__cur_image_obj = image_obj
    digest = None
    if NA == graph_file:
        graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
            GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
    if NA == graph_file:
        return False
    debug("[view] %s" % graph_file)
    with open(graph_file, 'rb') as f:
        try:
            image = GraphViewer.get_image(f)
        except IOError as e:
            f.close()  # close f explicitly because the file is deleted right below
            # some images cannot be opened (possibly not an image file at all);
            # the error message is 'cannot identify image file'
            info("fail to open image: %s" % str(e))
            GraphFetcher().handle_image(graph_file, DELETE)
            return False
        # we have seen "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
        except ValueError as e:
            info("fail to open image: %s" % str(e))
            return False
        self.__cur_graph_file = graph_file
        self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
        if digest:
            digest_str = digest + "\n"
        else:
            digest_str = "%s:%s\n" % ("path", graph_file)
        self.__cur_digest = digest_str + "size:%sx%s" % (image.size[0], image.size[1])
        self.select_phrase(image_obj.pattern)
        return self.set_graph_content(graph_file, image)
def delete_image(self, *unused):
    if self.__cur_image_obj.location:
        return  # per spec: removing an image the user explicitly specified is not supported
    info(get_msg(Msg.remove_image), self.__cur_graph_file)
    self.__graph_history.remove([self.__cur_image_obj, self.__cur_graph_file])
    GraphFetcher.handle_image(self.__cur_graph_file, DELETE)
    self.cancel_pending_jobs()
    self.timer_action(True)

def decrement_rank(self, *unused):
    info(get_msg(Msg.decrease_rank), self.__cur_graph_file)
    if self.__cur_image_obj.location:
        msg = GraphDirHandler.handle_image(self.__cur_image_obj.location, self.__cur_graph_file, DEC_RANK)
    else:
        msg = GraphFetcher.handle_image(self.__cur_graph_file, DEC_RANK)
    self.__cur_digest += "\n%s" % msg
    self.show_onscreen_info()

def decrement_rank(self, *unused):
    info("decrease rank %s" % self.__cur_graph_file)
    if self.__cur_image_obj.location:
        msg = GraphDirHandler.handle_image(self.__cur_image_obj.location, self.__cur_graph_file, DEC_RANK)
    else:
        msg = GraphFetcher.handle_image(self.__cur_graph_file, DEC_RANK)
    self.__cur_digest += "\n%s" % msg
    self.show_onscreen_info()

def delete_image(self, *unused):
    if self.__cur_image_obj.location:
        return  # per spec: removing an image the user explicitly specified is not supported
    info("remove image %s" % self.__cur_graph_file)
    entry = [self.__cur_image_obj, self.__cur_graph_file]
    self.__graph_history.remove(entry)
    while self.__graph_history.count(entry) > 0:  # drop any remaining duplicates of this entry
        self.__graph_history.remove(entry)
    GraphFetcher.handle_image(self.__cur_graph_file, DELETE)
    self.cancel_pending_jobs()
    self.timer_action(True)
def view(self, image_obj_list, phrase_obj_list):
    if not phrase_obj_list:  # avoid the mutable-default-argument pitfall: the default is not []
        phrase_obj_list = []
    if not image_obj_list:
        info(get_msg(Msg.not_any_image_specified_program_exit))
        sys.exit()
    self.setup_image_stuff(image_obj_list)
    self.setup_phrase_stuff(image_obj_list, phrase_obj_list)
    while True:
        self.timer_action(True)
        self.__root.mainloop()
        self.cancel_pending_jobs()

def view(self, image_obj_list, phrase_obj_list):
    if not phrase_obj_list:  # avoid the mutable-default-argument pitfall: the default is not []
        phrase_obj_list = []
    if not image_obj_list:
        info("not any image is specified, program exits")
        sys.exit()
    self.setup_image_stuff(image_obj_list)
    self.setup_phrase_stuff(image_obj_list, phrase_obj_list)
    GraphViewer.set_front()
    while True:
        self.timer_action(True)
        self.__root.mainloop()
        self.cancel_pending_jobs()
def __load_or_create_status(self):
    status_cache = {}  # key: image_file, value: status
    cache_file = self.__location + get_delim() + GraphDirHandler.CACHE_FILE
    cache_existed = os.path.exists(cache_file)
    if cache_existed:
        success, cache_data = load(cache_file)
        assert success
        [timestamp, status_cache] = cache_data
        if not self.dir_changed(timestamp):
            return status_cache
        else:
            info("directory %s has changed, update cache file" % self.__location)
    else:
        info("create a new cache file for directory: %s" % self.__location)
    image_files = []
    for root, _, files in os.walk(self.__location):
        assert len(root) >= 1
        if root[-1] != get_delim():
            root += get_delim()
        for base_file in files:
            basename, ext = os.path.splitext(base_file)
            if ext.replace(".", "") in GraphDirHandler.RECOGNIZED_IMAGE_EXT:
                image_files.append((root + base_file).replace(self.__location + get_delim(), ""))
    if not image_files:
        if cache_existed:
            os.remove(cache_file)
        self.__valid = False
        return None
    existed_image = {}
    for image in image_files:
        existed_image[image] = 1  # 1 is just a dummy value
        if image not in status_cache:
            status_cache[image] = Status()
    to_be_deleted = []
    for image in status_cache:  # handles the case where an image has been deleted from disk
        if image not in existed_image:
            to_be_deleted.append(image)
    for image in to_be_deleted:
        status_cache.pop(image)
    # TODO: storing this timestamp makes the directory always look changed on the next run
    timestamp = time.ctime(os.path.getmtime(self.__location))
    save(cache_file, [timestamp, status_cache])
    return status_cache

def __load_or_create_status(self):
    status_cache = {}  # key: image_file, value: status
    cache_file = self.__location + get_delim() + GraphDirHandler.CACHE_FILE
    cache_existed = os.path.exists(cache_file)
    if cache_existed:
        success, cache_data = load(cache_file)
        assert success
        [timestamp, status_cache] = cache_data
        if not self.dir_changed(timestamp):
            return status_cache
        else:
            info(get_msg(Msg.directory), self.__location, get_msg(Msg.has_changed_update_cache_file))
    else:
        info("%s%s" % (get_msg(Msg.create_new_cache_file_for_directory), self.__location))
    image_files = []
    for root, _, files in os.walk(self.__location):
        assert len(root) >= 1
        if root[-1] != get_delim():
            root += get_delim()
        for base_file in files:
            basename, ext = os.path.splitext(base_file)
            if ext.replace(".", "") in GraphDirHandler.RECOGNIZED_IMAGE_EXT:
                image_files.append((root + base_file).replace(self.__location, ""))
    if not image_files:
        if cache_existed:
            os.remove(cache_file)
        self.__valid = False
        return None
    existed_image = {}
    for image in image_files:
        existed_image[image] = 1  # 1 is just a dummy value
        if image not in status_cache:
            status_cache[image] = Status()
    to_be_deleted = []
    for image in status_cache:  # handles the case where an image has been deleted from disk
        if image not in existed_image:
            to_be_deleted.append(image)
    for image in to_be_deleted:
        status_cache.pop(image)
    # TODO: storing this timestamp makes the directory always look changed on the next run
    timestamp = time.ctime(os.path.getmtime(self.__location))
    save(cache_file, [timestamp, status_cache])
    return status_cache
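# __load_or_create_status() relies on self.dir_changed(timestamp), which is not included in this
# section. Judging from the cache layout above (the stored value is time.ctime(os.path.getmtime())
# of the directory), a minimal sketch of that check could look like the standalone function below;
# the name dir_changed_sketch and its parameters are illustrative, not the project's actual method.
def dir_changed_sketch(location, cached_timestamp):
    """Return True when the directory's current mtime string differs from the cached one."""
    import os
    import time
    current_timestamp = time.ctime(os.path.getmtime(location))
    return current_timestamp != cached_timestamp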
def crawl(self, pattern, size_list, option="", print_url=False):
    """output: urls, is_new_result"""
    show(get_msg(Msg.search_target), "\"" + pattern + "\"")
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new results (initially 0)
    start = {size: 1 for size in size_list}  # key: size, value: next search start offset (Google starts at 1)
    tried_size = 0
    while tried_size < TARGET_SEARCH_RESULT_SIZE:
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # use 'set' to filter out duplicated items (not expected, but Google search has returned duplicates)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("%s:%s, %s:%i" % (get_msg(Msg.target), pattern, get_msg(Msg.acquired_url_count), len(urls)))
    if print_url:
        for url in urls:
            show(url)
    if success:
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success

def crawl(self, pattern, size_list, option="", print_url=False):
    """output: urls, is_new_result"""
    debug("[search] search target: \"%s\"" % pattern)
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new results (initially 0)
    start = {size: 1 for size in size_list}  # key: size, value: next search start offset (Google starts at 1)
    tried_size = 0
    while tried_size < get_search_size():
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # use 'set' to filter out duplicated items (not expected, but Google search has returned duplicates)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("target:%s, acquired url count:%i" % (pattern, len(urls)))
    if print_url:
        for url in urls:
            debug("[search] %s" % url)
    if success:
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success
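# crawl() above picks which image size to query next through get_weighted_random_dict_key(dice),
# where `dice` maps each size to a weight built by Crawler.get_dice() from the previous run's
# new-result counts. Neither helper is shown in this section; the function below is only a sketch
# of one plausible weighted-choice implementation (the _sketch name is illustrative, and it assumes
# a non-empty dict of non-negative weights).
def get_weighted_random_dict_key_sketch(weight_by_key):
    """Return a key chosen with probability proportional to its weight."""
    import random
    total = sum(weight_by_key.values())
    threshold = random.uniform(0, total)
    cumulative = 0
    for key, weight in weight_by_key.items():
        cumulative += weight
        if threshold <= cumulative:
            return key
    return key  # floating-point edge case: fall back to the last key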
def crawl_by_asking_google_search(pattern, start, size, option=""):
    assert type(pattern) in [str, unicode]
    from util.global_def import get_api_key, get_cx
    api_key = get_api_key()
    cx = get_cx()
    if not api_key or not cx:
        if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
            Crawler._HAS_SHOW_NO_SEARCH_MSG = True
            info(get_msg(Msg.no_search_due_to_no_api_key_and_cx))
        return [], False
    size_option = "&imgSize=" + size if size else ""
    full_option = size_option + (option if option else "")
    base_url = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&searchType=image&num=%d' \
               '&q=' + pattern + '&start=%d' + full_option
    request_str = base_url % (api_key, cx, G_SEARCH_PER_REQ_SIZE, start)
    urls = []
    success = True
    try:
        r = requests.get(request_str)
        res = json.loads(r.text)
        if "error" in res:
            Crawler.print_error(res["error"])
            if "This API requires billing to be enabled on the project" in res["error"]["message"]:
                # this is the 'out of quota' message
                Crawler.__STOP_SEARCH = True
            return urls, False
        if 'items' not in res:
            info(get_msg(Msg.cannot_fetch_image_url), "empty query")
            return urls, True  # return 'True' is okay?
        for image_info in res['items']:
            assert 'link' in image_info
            url = image_info['link']
            urls.append(url)
    except TypeError as e:  # for unhandled error...
        info(get_msg(Msg.cannot_fetch_image_url), str(e))
        success = False
    except requests.ConnectionError as e:
        info(get_msg(Msg.cannot_fetch_image_url), str(e))
        success = False
    return urls, success

def crawl_by_asking_google_search(pattern, start, size, option=""):
    assert type(pattern) in [str, unicode]
    from util.global_def import get_api_key, get_cx
    api_key = get_api_key()
    cx = get_cx()
    if not api_key or not cx:
        if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
            Crawler._HAS_SHOW_NO_SEARCH_MSG = True
            info("as api_key and cx for Google Custom Search is not available, no image search will be issued")
        return [], False
    size_option = "&imgSize=" + size if size else ""
    full_option = size_option + (option if option else "")
    base_url = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&searchType=image&num=%d' \
               '&q=' + pattern + '&start=%d' + full_option
    request_str = base_url % (api_key, cx, G_SEARCH_PER_REQ_SIZE, start)
    urls = []
    success = True
    try:
        r = requests.get(request_str)
        res = json.loads(r.text)
        if "error" in res:
            Crawler.print_error(res["error"])
            if "This API requires billing to be enabled on the project" in res["error"]["message"]:
                # this is the 'out of quota' message
                Crawler.__STOP_SEARCH = True
            return urls, False
        if 'items' not in res:
            info("cannot fetch newer image url list: empty query")
            return urls, True  # return 'True' is okay?
        for image_info in res['items']:
            assert 'link' in image_info
            url = image_info['link']
            urls.append(url)
    except TypeError as e:  # for unhandled error...
        info("cannot fetch newer image url list: %s" % str(e))
        success = False
    except requests.ConnectionError as e:
        info("cannot fetch newer image url list: %s" % str(e))
        success = False
    return urls, success
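# A hypothetical call of the function above, for illustration only: the search pattern and the
# "large" imgSize value are made-up, and no URLs come back unless api_key/cx are configured in
# util.global_def. A ([], False) result signals the caller (crawl) to stop paging.
def demo_crawl_by_asking_google_search():
    urls, success = Crawler.crawl_by_asking_google_search("aurora borealis", start=1, size="large")
    if success:
        for url in urls:
            show(url)  # 'show' is the project's output helper, as used in the first crawl() variant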
def get_graph_file(self, pattern, url, cached_encoding):
    """output: graph_file, encoding"""
    if NA == cached_encoding:  # means this url is not retrievable
        return NA, NA
    file_encoding = cached_encoding
    if not file_encoding:
        file_encoding = GraphFetcher.get_file_encoding(pattern)
    graph_dir = GraphFetcher.get_graph_dir(pattern)
    if not os.path.exists(graph_dir):
        try:
            mkdir_p(graph_dir)
        except OSError as e:
            error("[fetch] cannot create program directory, program exits:")
            error(str(e))
            sys.exit()
    abs_graph_file = graph_dir + "image_" + file_encoding + ".jpg"
    if os.path.exists(abs_graph_file):
        return abs_graph_file, file_encoding
    if not self.__network_reachable:
        info("give up fetching image (due to no network connection):")
        return NA, None
    self.__has_write = True
    try:
        info("fetch image: %s" % url)
        try:
            web_content = urllib2.urlopen(url, timeout=10)
        except httplib.BadStatusLine:
            info("give up fetching image (due to no network connection): %s" % url)
            return NA, NA
        fd = open(abs_graph_file, 'wb')
        fd.write(web_content.read())
        fd.close()
        assert os.path.exists(abs_graph_file)
        if os.stat(abs_graph_file).st_size <= 10240:
            info("give up acquired image with size: %s Bytes" % os.stat(abs_graph_file).st_size)
            info("remove image: %s" % abs_graph_file)
            os.remove(abs_graph_file)
            return NA, NA
        info("fetch succeeded")
        return abs_graph_file, file_encoding
    except (IOError, httplib.IncompleteRead, ssl.CertificateError) as e:
        info("failed url: %s" % url)
        info("error: %s" % str(e))
        if os.path.exists(abs_graph_file):
            fd.close()
            os.remove(abs_graph_file)
        return NA, NA

def get_graph_file(self, pattern, url, cached_encoding):
    """output: graph_file, encoding"""
    if NA == cached_encoding:  # means this url is not retrievable
        return NA, NA
    file_encoding = cached_encoding
    if not file_encoding:
        file_encoding = GraphFetcher.get_file_encoding(pattern)
    graph_dir = GraphFetcher.get_graph_dir(pattern)
    if not os.path.exists(graph_dir):
        try:
            os.makedirs(graph_dir)
        except OSError as e:
            error(get_msg(Msg.cannot_create_directory), str(e))
            import sys
            sys.exit()
    abs_graph_file = graph_dir + "image_" + file_encoding + ".jpg"
    if os.path.exists(abs_graph_file):
        return abs_graph_file, file_encoding
    if not self.__network_reachable:
        info(get_msg(Msg.give_up_fetch_image))
        return NA, None
    self.__has_write = True
    try:
        info(get_msg(Msg.fetch_image), url)
        try:
            web_content = urllib2.urlopen(url, timeout=10)
        except httplib.BadStatusLine:
            info(get_msg(Msg.obtain_unrecognized_status_code), url)
            return NA, NA
        fd = open(abs_graph_file, 'wb')
        fd.write(web_content.read())
        fd.close()
        assert os.path.exists(abs_graph_file)
        if os.stat(abs_graph_file).st_size <= 10240:
            info(get_msg(Msg.give_up_acquired_image_with_size), os.stat(abs_graph_file).st_size, "Bytes")
            info(get_msg(Msg.remove_image), abs_graph_file)
            os.remove(abs_graph_file)
            return NA, NA
        info(get_msg(Msg.fetch_succeed))
        return abs_graph_file, file_encoding
    except (IOError, httplib.IncompleteRead, ssl.CertificateError) as e:
        info(get_msg(Msg.failed_url), url)
        info(get_msg(Msg.error_message), str(e))
        if os.path.exists(abs_graph_file):
            fd.close()
            os.remove(abs_graph_file)
        return NA, NA
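# The first get_graph_file() variant above calls mkdir_p(graph_dir), which is not shown in this
# section. A common minimal implementation, equivalent in effect to `mkdir -p` (and to the plain
# os.makedirs call in the second variant, plus tolerance for an already-existing directory), is
# sketched below; treat it as an assumption about the helper, not the project's exact code.
def mkdir_p(path):
    """Create `path` and any missing parent directories; do nothing if it already exists."""
    import errno
    import os
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass  # the directory is already there, which is fine
        else:
            raise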