def start(cls):
    # Walk the paginated manga list until an empty page (or a failed request)
    # is hit, queueing every comic link for the worker threads.
    root_url = "http://www.mangabz.com/manga-list-p%d/"
    page_num = 0
    while True:
        cls.init_thread()
        page_num += 1
        print("Now page " + str(page_num))

        url = root_url % page_num
        resp = HttpUtils.get_with_retry(url, headers=cls.headers)
        if resp is None:
            break

        links = HttpUtils.get_attrs(resp, ".mh-item-detali > .title > a", "href")
        if len(links) == 0:
            break

        for link in links:
            cls.task_pool.put(link)

    # wait for the workers to drain the queue, then close the output file
    cls.process_thread.join()
    cls.fp.close()
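# --- Hedged sketch, not part of the original project -------------------------
# The crawler above leans on HttpUtils.get_with_retry(); a minimal version of
# such a helper, assuming requests + BeautifulSoup, could look like this. The
# function name, retry count and backoff are illustrative assumptions only.
def _get_with_retry_sketch(url, headers=None, return_raw=False, retries=3, backoff=2):
    import time

    import requests
    from bs4 import BeautifulSoup

    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                # callers either want the raw response (its .text is read later)
                # or a parsed document that supports CSS-selector lookups
                return resp if return_raw else BeautifulSoup(resp.text, "html.parser")
        except requests.RequestException:
            pass
        time.sleep(backoff * (attempt + 1))
    return None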
def parse_lvl_one(cls):
    if cls.book_id is None:
        print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
        return

    resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id),
                                    headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    cls.root_folder = os.path.join("output", cls.comic_name)

    # chapter links, titles and per-chapter page counts line up index by index
    links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")
    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
    image_numbers = HttpUtils.get_contents(resp, "div.detail-list-form-con a span")
    image_numbers = list(map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

    assert len(titles) == len(image_numbers)
    assert len(titles) == len(links)

    cnt = 0
    for index in range(len(titles)):
        cls.init_thread()

        link = links[index].replace("/", "").replace("m", "")
        title = titles[index].strip()
        image_number = image_numbers[index]

        # chapter_mode 1 keeps regular chapters ("第/话/話"); mode 2 keeps volumes ("卷/第")
        if (cls.chapter_mode == 1 and "第" not in title and "话" not in title and "話" not in title) or \
                (cls.chapter_mode == 2 and "卷" not in title and "第" not in title):
            print("Skip " + title)
            continue

        # skip the chapter unless the title contains every term of the inclusion list
        is_skip = False
        if cls.inclusion_list is not None:
            for inclusion in cls.inclusion_list:
                if inclusion not in title:
                    is_skip = True
                    break

        if not is_skip and cls.parse_lvl_two((link, title, image_number)):
            cnt += 1

    if cnt > 0:
        cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
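# Illustration only: the page-count extraction in parse_lvl_one expects span
# text such as "45P"; the sample title below is made up for demonstration.
def _page_count_example():
    import re
    sample = "第3話 45P"
    assert re.search(r"(\d+)P", sample).group(1) == "45"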
def do_process(cls, link):
    resp = HttpUtils.get_with_retry(cls.root_url + link, headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    comic_author = HttpUtils.get_content(resp, ".detail-info-tip span a").strip()
    comic_status = HttpUtils.get_content(resp, ".detail-info-tip span:nth-of-type(2) span").strip()
    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

    # validation
    titles = list(map(lambda x: x.strip(), titles))
    if len(titles) == 0:
        return

    # split numeric IDs into chapters ("話") and volumes ("卷")
    chap_ids = list()
    vol_ids = list()
    for title in titles:
        id = re.search(r".+?(\d*).+?", title).group(1)
        if id == "":
            # print("Cannot parse: " + title)
            pass
        else:
            if "話" in title:
                chap_ids.append(int(id))
            elif "卷" in title:
                vol_ids.append(int(id))

    max_chap = -1
    max_vol = -1
    is_missed = False

    if len(chap_ids) > 0:
        missing_ids = list()
        chap_ids.sort()
        max_chap = chap_ids[-1]
        for i in range(1, max_chap + 1):
            if i not in chap_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing chapters: " + str(missing_ids))
            is_missed = True

    if len(vol_ids) > 0:
        missing_ids = list()
        vol_ids.sort()
        max_vol = vol_ids[-1]
        for i in range(1, max_vol + 1):
            if i not in vol_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing volumes: " + str(missing_ids))
            is_missed = True

    # only comics with no gaps in chapter/volume numbering are recorded
    if not is_missed:
        # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
        cls.output_pool.put((cls.comic_name, comic_author, comic_status,
                             max_chap, max_vol, link))
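# Hedged aside: the gap detection in do_process can also be expressed as a set
# difference. This is an equivalent illustration, not the code used above.
def _find_missing_ids(ids):
    """Return the IDs missing from 1..max(ids); an empty list means a full set."""
    if not ids:
        return []
    return sorted(set(range(1, max(ids) + 1)) - set(ids))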
def parse_lvl_two(cls, info):
    chapter_id = info[0]
    title = info[1]
    image_number = int(info[2])

    # create folder once
    folder_name = os.path.join(cls.root_folder, title)
    # folder_name = "output/" + cls.comic_name + "/" + title + "_" + chapter_id
    os.makedirs(folder_name, exist_ok=True)

    path_file_number = len(glob.glob(pathname=folder_name + '/*'))
    if path_file_number == image_number:
        # already downloaded all
        print("Downloaded: " + title)
        return False

    print("Start downloading: " + title)

    first_url = "http://www.mangabz.com/m%s/" % chapter_id
    # copy the shared headers so the per-chapter Cookie/Referer tweaks do not
    # keep accumulating on cls.headers across chapters
    headers = dict(cls.headers)
    headers["Cookie"] = headers["Cookie"] + urllib.parse.quote(first_url)
    headers["Referer"] = first_url

    index = 0
    while index < image_number:
        index += 1
        query_url = "%s/m%s/chapterimage.ashx?cid=%s&page=%d" % (
            cls.root_url, chapter_id, chapter_id, index)

        content = HttpUtils.get_with_retry(query_url, headers=headers, return_raw=True)
        if content.text.strip() == "":
            print("url: " + query_url)
            print("get wrong data: \"" + content.text.strip() + "\"")
            print("fail to parse image key, %s-%d" % (title, index))
        else:
            try:
                # the endpoint returns a JS snippet that evaluates to a list of image URLs
                image_url_list = execjs.eval(content.text)
            except Exception:
                print(">>>>>>>>>> fail to parse image: " + str(index))
                continue

            assert len(image_url_list) > 0

            image_keys = list()
            for image_url in image_url_list:
                match = re.search(r"/(\d+_\d{4})\.(\w+)\?", image_url)
                if match is not None:
                    image_key = match.group(1)
                    suffix = match.group(2)
                    image_keys.append(image_key)

                    file_path = folder_name + "/" + image_key + "." + suffix
                    cls.task_pool.put([file_path, image_url, 0])

            assert len(image_keys) > 0, query_url

            # sort & find largest image number so the next request starts after
            # the pages already queued in this batch
            image_keys.sort(key=lambda x: int(x.split("_")[0]))
            index = max(int(image_keys[-1].split("_")[0]), index)

            print("now index[%d], total[%d]" % (index, image_number))

    return True
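# --- Hedged sketch, not part of the original project -------------------------
# parse_lvl_two queues [file_path, image_url, retry_count] items; a worker
# along these lines could drain cls.task_pool and write the images to disk.
# The function name, timeout and retry policy are illustrative assumptions only.
def _image_download_worker_sketch(task_pool, headers, max_retry=3):
    import requests

    while not task_pool.empty():
        file_path, image_url, retry = task_pool.get()
        try:
            resp = requests.get(image_url, headers=headers, timeout=15)
            resp.raise_for_status()
            with open(file_path, "wb") as fh:
                fh.write(resp.content)
        except requests.RequestException:
            if retry + 1 < max_retry:
                # push the item back with an incremented retry counter
                task_pool.put([file_path, image_url, retry + 1])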