def download(illust_id, title, path, url, auth_api):
    """Download a single illustration image into *path*.

    illust_id -- pixiv illustration id, embedded in the file name
    title     -- illustration title (used only when IMAGE_USE_ORG_NAME is set)
    path      -- destination directory
    url       -- image url to fetch
    auth_api  -- authenticated api object exposing download(url, path=...)
    """
    ext = os.path.splitext(url)[1]
    # File name optionally carries the (filtered) original title.
    if IMAGE_USE_ORG_NAME:
        file_name = "/p_%s_%s%s" % (
            illust_id, CommonUtils.filter_dir_name(title), ext)
    else:
        file_name = "/p_%s%s" % (illust_id, ext)
    target = path + file_name
    print(target)
    auth_api.download(url, path=target)
def download_illustration(illu, path, auth_api):
    """Download every image belonging to one illustration entry.

    illu     -- search-result item expected to carry ``url`` and ``title`` keys
    path     -- destination directory
    auth_api -- authenticated downloader exposing download(url, path=...)
    """
    # Fix: dict.has_key() is Python-2-only (removed in Python 3) -- use ``in``.
    if "url" in illu and "title" in illu:
        illust_id = CommonUtils.get_url_param(illu.url, "illust_id")
        detail = PixivApi.illust_detail(illust_id)
        if detail:
            try:
                detail = detail.illust
                # Single-page illustration.
                if detail.page_count == 1:
                    try:
                        url = detail.meta_single_page.original_image_url
                    # Fix: bare ``except:`` narrowed -- only the expected
                    # missing-attribute/key case falls back to the large image.
                    except (AttributeError, KeyError):
                        url = detail.image_urls.large
                    download(illust_id, illu.title, path, url, auth_api)
                # Multi-page illustration.
                else:
                    if detail.page_count > P_LIMIT:
                        # Page count above the configured limit: skip the work.
                        print("Pixiv id:%s,name:%s P>limit,Skip download" % (illust_id, illu.title))
                        return
                    urls = detail.meta_pages  # enumerate the pages
                    if len(urls) > 1:
                        # Keep all pages of one work together in one folder.
                        path += "/p_%s" % illust_id
                        if not os.path.exists(path):
                            os.mkdir(path)
                        for index in range(len(urls)):
                            try:
                                page_urls = urls[index].image_urls
                                # Fix: has_key() replaced with ``in``.
                                url = page_urls.original if "original" in page_urls \
                                    else page_urls.large
                                extension = os.path.splitext(url)[1]
                                if IMAGE_USE_ORG_NAME:
                                    save_path = path + "/p_%s_%s_%d%s" % (
                                        illust_id, CommonUtils.filter_dir_name(illu.title), index, extension)
                                else:
                                    save_path = path + "/p_%s_%d%s" % (illust_id, index, extension)
                                print(save_path)
                                auth_api.download(url, path=save_path)
                            # Fix: bare ``except:`` no longer swallows
                            # KeyboardInterrupt/SystemExit; a failed page must
                            # not abort the remaining pages (best effort).
                            except Exception:
                                continue
                    else:
                        # Could not enumerate pages -- fall back to the large image.
                        url = detail.image_urls.large
                        download(illust_id, illu.title, path, url, auth_api)
            # Fix: ``except Exception, e`` is Python-2-only syntax.
            except Exception as e:
                error_log("Download fail:")
                error_log(e)
        else:
            print(illu.title + " can't get detail id :" + illust_id)
def handle_search(self):
    """Validate the search form, then run the search on a background thread."""
    query = self.keywords.get().strip()
    # Guard: a search keyword is mandatory.
    if CommonUtils.is_empty(query):
        showwarning("warning", "Please enter search keywords!")
        print("warning", "Please enter search keywords!")
        return
    # Guard: a save path is mandatory ...
    raw_path = self.path_var.get()
    if CommonUtils.is_empty(raw_path):
        showwarning("warning", "path can't be empty!")
        print("warning", "path can't be empty!")
        return
    save_dir = raw_path.strip()
    # ... and must already exist on disk.
    if not os.path.exists(save_dir):
        showerror("error", " No such file or directory!")
        print('error', 'No such file or directory')
        return
    # Results go into a per-keyword sub-directory (title filtered for FS safety).
    save_dir = save_dir + "/" + CommonUtils.filter_dir_name("search_" + query)
    showinfo("info", "Is searching:")
    worker = Thread(target=self.search, args=(query, save_dir))
    worker.start()
def handle_related(self):
    """Validate the illustration-id form, then fetch related works on a thread."""
    illust_id = CommonUtils.set_int(self.id_var.get().strip())
    # Guard: the id must parse to a positive integer.
    if illust_id <= 0:
        # Fix: the original warning was copy-pasted from handle_search and
        # asked for "search keywords" although an illustration id is checked.
        showwarning("warning", "Please enter a valid illustration id!")
        print("warning", "Please enter a valid illustration id!")
        return
    # Guard: a save path is mandatory ...
    if CommonUtils.is_empty(self.path_var.get()):
        showwarning("warning", "path can't be empty!")
        print("warning", "path can't be empty!")
        return
    path = self.path_var.get().strip()
    # ... and must already exist on disk.
    if not os.path.exists(path):
        showerror("error", " No such file or directory!")
        print('error', 'No such file or directory')
        return
    # Results go into a per-id sub-directory (name filtered for FS safety).
    path = path + "/" + CommonUtils.filter_dir_name("related_" + str(illust_id))
    showinfo("info", "Get related illus of " + str(illust_id) + " :")
    related_handler = Thread(target=self.related, args=(illust_id, path))
    related_handler.start()
def get_pixivision_topics(cls, url, path):
    """Fetch the pixivision topic list at *url* and create one folder per topic.

    Each topic gets a sub-directory named after its filtered title, containing
    a ``topic.txt`` description file; the directory is stored back on the
    topic object under ``save_path``.
    """
    topic_list = HtmlDownloader.parse_illustration_topic(
        HtmlDownloader.download(url))
    if not topic_list:
        error_log(url + " not find any illustration topic")
        return
    for topic in topic_list:
        try:
            # The title must be filtered of special characters, otherwise
            # creating the directory can fail.
            save_path = path + "/" + CommonUtils.filter_dir_name(topic.title)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            CommonUtils.write_topic(save_path + "/topic.txt", topic)
            topic['save_path'] = save_path
        # Fix 1: ``except Exception, e`` is Python-2-only syntax.
        # Fix 2: the original placed ``continue`` BEFORE the error_log calls,
        # so failures were silently dropped -- log first, then continue.
        except Exception as e:
            error_log("Create topic path fail,topic url:" + topic.Href)
            error_log(e)
            continue
def download_topics(cls, url, path, quality=1):
    """Download every illustration of one pixivision topic page.

    url     -- topic page url
    path    -- destination directory (must exist)
    quality -- 1: resolve the original image through the pixiv API;
               any other value: download the preview image shown on pixivision.
    """
    html = HtmlDownloader.download(url)
    illu_list = HtmlDownloader.parse_illustration(html)
    title_des = HtmlDownloader.get_title(html)
    # Persist the topic description alongside the images.
    if title_des and illu_list:
        title_des["size"] = len(illu_list)
        CommonUtils.write_topic_des(path + "/topic.txt", title_des)
    if not illu_list:
        return
    for illu in illu_list:
        try:
            filename = CommonUtils.filter_dir_name(illu.title)
            extension = os.path.splitext(illu.image)[1]
            # Fix: renamed from ``id`` -- don't shadow the builtin.
            illust_id = CommonUtils.get_url_param(illu.image_page, "illust_id")
            if quality == 1:
                # Resolve the original image url through the API.
                detail = PixivApi.illust_detail(illust_id)
                if detail:
                    download_url = ImageDownload.get_image_url(illu, detail)
                    if IMAGE_USE_ORG_NAME:
                        save_path = path + "/p_%s_%s%s" % (illust_id, filename, extension)
                    else:
                        save_path = path + "/p_%s%s" % (illust_id, extension)
                    print(save_path)
                    PixivApi.download(download_url, path=save_path)
                else:
                    print(illu.title + " can't get detail id :" + illust_id)
            else:
                # Download the pixivision preview image directly.
                print(path + "/p_%s_%s%s" % (illust_id, filename, extension))
                PixivApi.download(illu.image,
                                  path=path + "/p_%s_%s%s" % (illust_id, filename, extension))
        # Fix: ``except Exception, e`` is Python-2-only syntax; keep the
        # best-effort loop -- one failed illustration must not stop the rest.
        except Exception as e:
            error_log("Download Illu Fail:" + " Illustration :" + str(illu))
            error_log(e)
            continue
page = SEARCH_PAGE keyword = SEARCH_KEYWORD else: username = raw_input("Please enter your pixiv accounts eamil or pixiv ID\n") password = raw_input('Enter password:\n ') print ("Loading") data_handler = PixivDataDownloader.PixivDataHandler(username, password) auth_api = AuthPixivApi(username, password) print("Login success!!!!") path = raw_input("Please input illustration save path:\n") page = int(raw_input("Please enter the total number of pages you want to crawl:\n")) download_threshold = int(raw_input("Please enter the minimum number of illustration's bookmarks:\n")) keyword = raw_input("Please enter search keyword:\n") keyword = keyword.decode("utf-8") queue = Queue() path = path + "/" + CommonUtils.filter_dir_name("search_" + keyword) # 默认消费者下载线程数为10个,可根据下载量和机器性能适当增加 thread_num = 10 if not os.path.exists(path): os.makedirs(path) for i in range(thread_num): t = Thread(target=download_queue, name="Thread" + str(i), args=(queue, path, auth_api)) t.daemon = True t.start() # 因为搜索的结果量不大,直接使用set在内存中过滤重复元素,不需要使用redisFilter set_filter = set() for p in range(1, page + 1): result = data_handler.search(keyword, page=p, download_threshold=download_threshold) print(result) for illu in result: