def parse_lvl_one(cls):
    if cls.book_id is None:
        print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
        return

    resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id), headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    cls.root_folder = os.path.join("output", cls.comic_name)

    links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")
    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
    image_numbers = HttpUtils.get_contents(resp, "div.detail-list-form-con a span")
    # each chapter entry carries its page count, e.g. "36P"
    image_numbers = list(map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

    assert len(titles) == len(image_numbers)
    assert len(titles) == len(links)

    cnt = 0
    for index in range(len(titles)):
        cls.init_thread()

        link = links[index].replace("/", "").replace("m", "")
        title = titles[index].strip()
        image_number = image_numbers[index]

        # chapter_mode 1 keeps regular chapters only, chapter_mode 2 keeps volumes only
        if (cls.chapter_mode == 1 and "第" not in title and "话" not in title and "話" not in title) or \
                (cls.chapter_mode == 2 and "卷" not in title and "第" not in title):
            print("Skip " + title)
            continue

        # honour the optional inclusion filter
        is_skip = False
        if cls.inclusion_list is not None:
            for inclusion in cls.inclusion_list:
                if inclusion not in title:
                    is_skip = True
                    break

        if not is_skip and cls.parse_lvl_two((link, title, image_number)):
            cnt += 1

    if cnt > 0:
        cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
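# The init_thread / process_thread / task_pool members used above are not part of this
# excerpt. Below is a minimal sketch of what such a download worker might look like,
# assuming task_pool is a queue.Queue of [file_path, url, retry_count] items (as enqueued
# by parse_lvl_two further down) and that HttpUtils offers a download helper; the class
# name and HttpUtils.download_file are assumptions, not the project's actual API.

import queue
import threading


class DownloadWorkerSketch:
    MAX_RETRY = 3

    def __init__(self):
        self.task_pool = queue.Queue()
        self.process_thread = None

    def init_thread(self):
        # (re)start the consumer thread if it is not alive
        if self.process_thread is None or not self.process_thread.is_alive():
            self.process_thread = threading.Thread(target=self._drain, daemon=True)
            self.process_thread.start()

    def _drain(self):
        # consume tasks until the queue is empty, re-queueing failed downloads
        while not self.task_pool.empty():
            file_path, url, retry = self.task_pool.get()
            try:
                HttpUtils.download_file(url, file_path)  # hypothetical helper
            except Exception:
                if retry + 1 < self.MAX_RETRY:
                    self.task_pool.put([file_path, url, retry + 1])
            finally:
                self.task_pool.task_done()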
def fetch_sub_category(cls, category):
    # go into category to find sub-category info
    category_link = category[1]
    category_text = category[0]

    sub_category_data_list = list()

    if category_text in cls.category_black_list:
        return []

    soup_obj = HttpUtils.get(cls.amazon_base_url + category_link, headers=cls.amazon_headers)
    sub_category_text_list = HttpUtils.get_contents(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a span")
    sub_category_link_list = HttpUtils.get_attrs(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a", "href")

    if len(sub_category_link_list) != len(sub_category_text_list):
        print("ERROR: Sub-category number not correct")
        return []

    # no sub-category: treat the category itself as its only sub-category
    if len(sub_category_link_list) == 0:
        sub_category_text_list = [category_text]
        sub_category_link_list = [category_link]

    print("find lvl 2 categories for %s" % category_text)
    print(sub_category_text_list)

    # find sub-category page number
    for sub_index in range(0, len(sub_category_link_list)):
        sub_category_link = sub_category_link_list[sub_index]
        sub_category_text = sub_category_text_list[sub_index]

        soup_obj = HttpUtils.get(cls.amazon_base_url + sub_category_link, headers=cls.amazon_headers)
        page_info = HttpUtils.get_contents(soup_obj, "ul.a-pagination li.a-disabled")

        if len(page_info) == 2:
            # the second disabled pagination element holds the max page number
            max_page_num = page_info[1]
        elif len(page_info) == 0:
            # no pagination at all
            max_page_num = 1
        else:
            # fewer than 5 pages: take the last normal page link
            max_page_num = HttpUtils.get_contents(soup_obj, "ul.a-pagination li.a-normal a")[-1]

        print("cat=%s, sub-cat=%s, page=%s" % (category_text, sub_category_text, max_page_num))
        sub_category_data_list.append((category_text, sub_category_text, sub_category_link, max_page_num))

    return sub_category_data_list
def check_and_notify(cls):
    url = "https://www.flyertea.com/forum.php?mod=forumdisplay&orderby=dateline&sum=226&fid=226&mobile=2"
    soup_obj = HttpUtils.get(url, return_raw=False)

    titles = list(map(lambda title: title.strip(),
                      HttpUtils.get_contents(soup_obj, "div.n5sq_htmk p.n5_htnrbt")))
    readers = list(map(lambda x: int(x),
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_hthfcs")))
    flowers = list(map(lambda x: int(x) if x else 0,
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_htdzcs")))

    print(titles)
    print(readers)
    print(flowers)
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id

    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        else:
            retry += 1
        assert retry < 5, "fail to query %s" % url

    cls.comic_name = HttpUtils.get_content(resp, "title").strip()
    links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")
    titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")

    assert len(titles) == len(links)

    cls.init_thread()

    for index in range(len(titles)):
        link = links[index]
        title = titles[index].strip()
        cls.parse_lvl_two((link, title))

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
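# The inline retry loop above (and the identical one in parse_lvl_two below) mirrors what
# HttpUtils.get_with_retry presumably does for the other crawler. A minimal sketch of such
# a helper, assuming HttpUtils.get returns None on failure; the name, signature, and the
# linear back-off are assumptions, not the project's actual API.

import time


def get_with_retry_sketch(url, headers=None, max_retry=5, backoff_seconds=1.0):
    """Call HttpUtils.get until it returns a non-None response or retries run out."""
    for attempt in range(max_retry):
        resp = HttpUtils.get(url, headers=headers)
        if resp is not None:
            return resp
        # simple linear back-off between attempts
        time.sleep(backoff_seconds * (attempt + 1))
    raise AssertionError("fail to query %s" % url)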
def crawl_sub_category_book(cls, sub_category_meta):
    ku_book_title_list = list()

    category_name = sub_category_meta[0]
    sub_category_name = sub_category_meta[1]
    sub_category_link = cls.amazon_base_url + sub_category_meta[2]
    page_num = int(sub_category_meta[3])

    for page in range(1, page_num + 1):
        print("reading cat=%s,sub-cat=%s,page=%s" % (category_name, sub_category_name, page))

        url = sub_category_link.split("%page=")[0] + "&page=" + str(page)
        soup_obj = HttpUtils.get(url, headers=cls.amazon_headers)
        if soup_obj is None:
            print("blocked?")
            break

        title_list = HttpUtils.get_contents(
            soup_obj, "div.s-result-list div.sg-col-inner h2.a-size-mini span.a-size-medium")

        current_page_title_list = list()
        for title in title_list:
            # remove meta info
            title = title.split("(")[0].split("(")[0].split("【")[0]
            ku_book_title_list.append(title)
            current_page_title_list.append(title)

        print(current_page_title_list)

        sleep(random() * 0.5 + 0.5)

    return ku_book_title_list
def parse_lvl_two(cls, info):
    chapter_url = info[0]
    title = info[1]

    # create folder once
    folder_name = "output/" + cls.comic_name + "/" + title
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    # # path_file_number = len(glob.glob(pathname=folder_name + '/*'))
    # if path_file_number == image_number:
    #     print("Already downloaded: " + title)
    #     # already downloaded all
    #     return

    print("Start downloading: " + title)

    query_url = cls.root_url + chapter_url

    retry = 0
    while True:
        content = HttpUtils.get(query_url, headers=cls.headers)
        if content is not None:
            break
        else:
            retry += 1
        assert retry < 5, "fail to query %s" % query_url

    script_content = HttpUtils.get_contents(content, "script")
    print(script_content[2][1:].replace(";;", ";").replace(";", ";\n"))

    # the image list and base path are embedded in the page's third <script> block
    image_url_list = re.search(r"chapterImages.*=.*\[(.*)\];",
                               script_content[2]).group(1).replace("\"", "").split(",")
    path = re.search(r"chapterPath.*?=.*?\"(.*?)\";", script_content[2]).group(1)

    assert len(image_url_list) > 0

    index = 1
    for image_url in image_url_list:
        full_image_url = "http://js1.zzszs.com.cn/" + path + image_url
        file_path = "%s/%03d_%s" % (folder_name, index, image_url)
        cls.task_pool.put([file_path, full_image_url, 0])
        index += 1
def fetch_meta_data(cls):
    # skip if the meta data has already been fetched
    if os.path.exists("ku_meta.txt"):
        with open("ku_meta.txt", "r", encoding="utf-8") as fp:
            if fp.readline():
                # already exist, skip
                return

    home_url = "https://www.amazon.cn/s?i=digital-text&rh=n%3A116087071%2Cn%3A116089071%2Cn%3A116176071%2Cn%3A1337022071&page=1"

    # find all category, sub-category and page number
    soup_obj = HttpUtils.get(home_url, headers=cls.amazon_headers)
    if soup_obj is None:
        print("ERROR: Cannot find category")
        return

    category_text_list = HttpUtils.get_contents(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link span")
    category_link_list = HttpUtils.get_attrs(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link", "href")

    if len(category_text_list) != len(category_link_list):
        print("ERROR: Category number not correct")
        return

    print("find lvl 1 categories:")
    print(category_text_list)

    category_list = list()
    for index in range(0, len(category_link_list)):
        category_list.append((category_text_list[index], category_link_list[index]))

    # resolve sub-categories and their page counts in parallel
    parallel_template = ParallelTemplate(5)
    sub_category_data_list = parallel_template.run(cls.fetch_sub_category, category_list)

    with open("ku_meta.txt", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(sub_category_data_list))
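# fetch_meta_data only persists the (category, sub-category, link, page count) tuples to
# ku_meta.txt. A hedged sketch of how the crawl could then be driven end to end, reusing
# crawl_sub_category_book and ParallelTemplate from this excerpt; the crawl_all name, the
# output file, and the assumption that ku_meta.txt stores a flat list of tuples are
# illustrative only, not existing code.

def crawl_all(cls):
    cls.fetch_meta_data()

    with open("ku_meta.txt", "r", encoding="utf-8") as fp:
        sub_category_data_list = json.loads(fp.read())

    # crawl each sub-category in parallel and flatten the per-category title lists
    title_lists = ParallelTemplate(5).run(cls.crawl_sub_category_book, sub_category_data_list)
    all_titles = [title for titles in title_lists for title in titles]

    with open("ku_books.txt", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(all_titles, ensure_ascii=False))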
def do_process(cls, link):
    resp = HttpUtils.get_with_retry(cls.root_url + link, headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    comic_author = HttpUtils.get_content(resp, ".detail-info-tip span a").strip()
    comic_status = HttpUtils.get_content(resp, ".detail-info-tip span:nth-of-type(2) span").strip()

    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

    # validation
    titles = list(map(lambda x: x.strip(), titles))
    if len(titles) == 0:
        return

    # split entries into chapter ids and volume ids
    chap_ids = list()
    vol_ids = list()
    for title in titles:
        id = re.search(r".+?(\d*).+?", title).group(1)
        if id == "":
            # print("Cannot parse: " + title)
            pass
        else:
            if "話" in title:
                chap_ids.append(int(id))
            elif "卷" in title:
                vol_ids.append(int(id))

    max_chap = -1
    max_vol = -1
    is_missed = False

    # look for gaps in the chapter numbering
    if len(chap_ids) > 0:
        missing_ids = list()
        chap_ids.sort()
        max_chap = chap_ids[-1]
        for i in range(1, max_chap + 1):
            if i not in chap_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing chapters: " + str(missing_ids))
            is_missed = True

    # look for gaps in the volume numbering
    if len(vol_ids) > 0:
        missing_ids = list()
        vol_ids.sort()
        max_vol = vol_ids[-1]
        for i in range(1, max_vol + 1):
            if i not in vol_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing volumes: " + str(missing_ids))
            is_missed = True

    if not is_missed:
        # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
        cls.output_pool.put((cls.comic_name, comic_author, comic_status, max_chap, max_vol, link))
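# The chapter and volume blocks above run the same "find gaps in 1..max" scan twice. The
# helper below illustrates that logic in isolation; find_missing is not an existing
# function in this code base, only a worked example.

def find_missing(ids):
    """Return (max_id, sorted ids missing from 1..max_id); ids may be unsorted."""
    if not ids:
        return -1, []
    max_id = max(ids)
    present = set(ids)
    return max_id, [i for i in range(1, max_id + 1) if i not in present]


# e.g. find_missing([1, 2, 5]) == (5, [3, 4]), an incomplete set,
# while find_missing([3, 1, 2]) == (3, []) marks a full set.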
def zz(self):
    source_url_template = "https://bh.sb/post/category/main/page/{0}/"
    post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=5&extra=&topicsubmit=yes"

    self.check_in()

    max_cnt = 10
    cnt = 0
    page_num = 1
    articles = list()
    stop_flag = False

    while not stop_flag:
        # fetch article list from bh.sb
        soup = HttpUtils.get(source_url_template.format(page_num))
        article_urls = HttpUtils.get_attrs(soup, "h2 a", "href")
        page_num += 1

        for article_index in range(len(article_urls)):
            article_url = article_urls[article_index]
            if Cache().get(article_url) is not None:
                continue

            article_soup = HttpUtils.get(article_url)
            titles = HttpUtils.get_contents(article_soup, ".article-content p")
            # entries alternate between a title paragraph and an image paragraph
            title_cnt = int(len(titles) / 2)

            for title_index in range(0, title_cnt):
                try:
                    title = titles[title_index * 2].split("】")[1]
                    image = titles[title_index * 2 + 1]
                    if type(image) != Tag:
                        continue

                    src = image.attrs["src"]
                    if src.endswith("jpg"):
                        continue

                    message = "好玩您就点个赞,不好笑请期待下一贴~\n"
                    message += "[img]{0}[/img]".format(src)

                    if Cache().get(title) is not None:
                        continue
                    Cache().set(title, message)

                    articles.append((title, message))
                    cnt += 1
                    if cnt >= max_cnt:
                        stop_flag = True
                        break
                except Exception:
                    pass

            if stop_flag:
                break

            # mark this url only after all of its articles have been included
            Cache().set(article_url, article_url)

    type_id_list = ["1629", "1631", "1633", "4481", "1641"]
    type_index = 0

    for (title, message) in articles:
        print((title, message))

        post_data = dict()
        post_data["posttime"] = str(int(time.time()))
        post_data["formhash"] = self.form_hash_mirror
        post_data["wysiwyg"] = "1"
        post_data["typeid"] = type_id_list[type_index]
        post_data["allownoticeauthor"] = "1"
        post_data["addfeed"] = "1"
        post_data["usesig"] = "1"
        post_data["save"] = ""
        post_data["uploadalbum"] = "-2"
        post_data["newalbum"] = "请输入相册名称"
        post_data["subject"] = title
        post_data["message"] = message

        post_result = HttpUtils.post(post_url, headers=self.site.login_headers,
                                     data=post_data, returnRaw=False)
        assert post_result is not None

        # rotate through the post type ids and wait 45-50 minutes between posts
        type_index = (type_index + 1) % len(type_id_list)
        time.sleep(int(random() * 300) + 2700)
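# Cache() above is constructed repeatedly but must share state across instances, since an
# article URL is marked only after all of its entries are stored. A minimal sketch of such
# a cache, assuming a process-wide dict as the backing store; the real implementation is
# likely persistent (e.g. a key-value store) and is not shown in this excerpt.

class Cache:
    _store = {}  # shared across all Cache() instances

    def get(self, key):
        return Cache._store.get(key)

    def set(self, key, value):
        Cache._store[key] = value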
def crawl_book(cls):
    tag_source_url = "https://book.douban.com/tag/"
    soup_obj = HttpUtils.get(tag_source_url)
    tags = HttpUtils.get_contents(soup_obj, "div.article tr td a")

    # override the crawled tags with a fixed list
    tags = [
        '小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波', '当代文学',
        '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻', '言情', '悬疑', '奇幻',
        '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻', '历史', '心理学', '哲学', '传记',
        '文化', '社会学', '艺术', '设计', '社会', '政治', '建筑', '宗教', '电影', '政治学', '数学',
        '中国历史', '回忆录', '思想', '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧',
        '西方哲学', '二战', '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行',
        '成长', '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性',
        '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融',
        '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普', '互联网', '编程',
        '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE', '交互', '通信', 'UCD',
        '神经网络', '程序'
    ]
    print(tags)

    book_shelf = dict()

    for tag in tags:
        for page in range(0, 10):
            url = "https://book.douban.com/tag/%s?start=%d&type=T" % (tag, page * 20)
            soup_obj = HttpUtils.get(url)
            if soup_obj is None:
                print("blocked?")
                break

            print(tag, page)

            books_obj = soup_obj.select("#subject_list ul > li")
            if len(books_obj) == 0:
                break

            for book_obj in books_obj:
                try:
                    title = HttpUtils.get_attr(book_obj, "h2 a", "title")
                    rating = float(HttpUtils.get_content(book_obj, "span.rating_nums"))
                    people = int(HttpUtils.get_content(book_obj, "span.pl").strip()
                                 .replace("人评价", "").replace("(", "").replace(")", ""))

                    if people > cls.people_threshold:
                        if title in book_shelf:
                            book_shelf[title].tag.append(tag)
                        else:
                            book_shelf[title] = Book(title, rating, people, [tag])
                except Exception as e:
                    pass

            # pause briefly to stay under the per-IP request limit within the time window
            sleep(random() * 0.5 + 0.5)

    books = list(book_shelf.values())

    with open("douban_book_raw.txt", "w") as fp:
        fp.write(json.dumps(books, default=Book.convert))
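# The Book class used above is not part of this excerpt. Below is a minimal sketch
# consistent with how it is used (positional constructor, a mutable tag list, and a
# convert hook passed to json.dumps as default=Book.convert); the field names other than
# tag are assumptions, not the project's actual definition.

class Book:
    def __init__(self, name, score, people, tag):
        self.name = name      # book title
        self.score = score    # Douban rating
        self.people = people  # number of ratings
        self.tag = tag        # list of tags the book was found under

    @staticmethod
    def convert(obj):
        # lets json.dumps(..., default=Book.convert) serialize Book instances
        return obj.__dict__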