def start(cls):
    # walk the manga list pages until an empty page is returned, queueing every book link
    root_url = "http://www.mangabz.com/manga-list-p%d/"

    page_num = 0
    while True:
        cls.init_thread()

        page_num += 1
        print("Now page " + str(page_num))
        url = root_url % page_num
        resp = HttpUtils.get_with_retry(url, headers=cls.headers)
        if resp is None:
            break

        links = HttpUtils.get_attrs(resp, ".mh-item-detali > .title > a", "href")
        if len(links) == 0:
            break

        for link in links:
            cls.task_pool.put(link)

    cls.process_thread.join()
    cls.fp.close()
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id

    # retry until the index page is fetched, give up after 5 attempts
    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        retry += 1
        assert retry < 5, "fail to query %s" % url

    cls.comic_name = HttpUtils.get_content(resp, "title").strip()
    links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")
    titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")
    assert len(titles) == len(links)

    cls.init_thread()

    for index in range(len(titles)):
        link = links[index]
        title = titles[index].strip()
        cls.parse_lvl_two((link, title))

    cls.process_thread.join()

    # fallback: drain anything left in the pool; should be a no-op if everything went well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
def parse_lvl_two(cls, info):
    url = info[0]
    index = info[1]

    # create the chapter folder once
    folder_name = "output/龙珠/" + str(index)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    # retry until the chapter page is fetched, give up after 5 attempts
    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        retry += 1
        assert retry < 5, "fail to query %s" % url

    links = HttpUtils.get_attrs(resp, ".ListContainer .ItemThumb a", "style")
    assert links is not None

    for link in links:
        # extract the full-size image URL from the thumbnail's inline style
        url = re.search(r"background:url\(.*'(.*)'", link).group(1).replace("_thumb.", "")
        file_name = url.split("/")[-1]
        cls.task_pool.put([folder_name + "/" + file_name, url, 0])
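# The two parse_lvl_* methods above repeat the same manual retry loop, while other
# crawlers in this section call HttpUtils.get_with_retry. The helper below is only a
# sketch of what such a method could look like, assuming HttpUtils.get returns None on
# failure; the real HttpUtils implementation in this repo may differ.
def get_with_retry_sketch(url, headers=None, max_retry=5):
    # try up to max_retry times and return the first successful response
    for _ in range(max_retry):
        resp = HttpUtils.get(url, headers=headers)
        if resp is not None:
            return resp
    # every attempt failed
    return None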
def fetch_sub_category(cls, category):
    # go into the category page to find sub-category info
    category_link = category[1]
    category_text = category[0]
    sub_category_data_list = list()

    if category_text in cls.category_black_list:
        return []

    soup_obj = HttpUtils.get(cls.amazon_base_url + category_link, headers=cls.amazon_headers)
    sub_category_text_list = HttpUtils.get_contents(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a span")
    sub_category_link_list = HttpUtils.get_attrs(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a", "href")

    if len(sub_category_link_list) != len(sub_category_text_list):
        print("ERROR: Sub-category number not correct")
        return []

    # no sub-category: treat the category itself as its only sub-category
    if len(sub_category_link_list) == 0:
        sub_category_text_list = [category_text]
        sub_category_link_list = [category_link]

    print("find lvl 2 categories for %s" % category_text)
    print(sub_category_text_list)

    # find the page count of each sub-category
    for sub_index in range(0, len(sub_category_link_list)):
        sub_category_link = sub_category_link_list[sub_index]
        sub_category_text = sub_category_text_list[sub_index]
        soup_obj = HttpUtils.get(cls.amazon_base_url + sub_category_link, headers=cls.amazon_headers)
        page_info = HttpUtils.get_contents(soup_obj, "ul.a-pagination li.a-disabled")
        if len(page_info) == 2:
            max_page_num = page_info[1]
        elif len(page_info) == 0:
            # no pagination
            max_page_num = 1
        else:
            # fewer than 5 pages
            max_page_num = HttpUtils.get_contents(soup_obj, "ul.a-pagination li.a-normal a")[-1]

        print("cat=%s, sub-cat=%s, page=%s" % (category_text, sub_category_text, max_page_num))
        sub_category_data_list.append((category_text, sub_category_text, sub_category_link, max_page_num))

    return sub_category_data_list
def parse_lvl_one(cls):
    if cls.book_id is None:
        print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
        return

    resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id), headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    cls.root_folder = os.path.join("output", cls.comic_name)

    links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")
    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
    image_numbers = HttpUtils.get_contents(resp, "div.detail-list-form-con a span")
    # each chapter entry shows its page count as "<N>P"
    image_numbers = list(map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

    assert len(titles) == len(image_numbers)
    assert len(titles) == len(links)

    cnt = 0
    for index in range(len(titles)):
        cls.init_thread()

        link = links[index].replace("/", "").replace("m", "")
        title = titles[index].strip()
        image_number = image_numbers[index]

        # chapter_mode 1 keeps chapter-style titles (第/话/話), chapter_mode 2 keeps volume-style titles (卷/第)
        if (cls.chapter_mode == 1 and "第" not in title and "话" not in title and "話" not in title) or \
                (cls.chapter_mode == 2 and "卷" not in title and "第" not in title):
            print("Skip " + title)
            continue

        # skip titles that do not contain every configured inclusion keyword
        is_skip = False
        if cls.inclusion_list is not None:
            for inclusion in cls.inclusion_list:
                if inclusion not in title:
                    is_skip = True
                    break

        if not is_skip and cls.parse_lvl_two((link, title, image_number)):
            cnt += 1

    if cnt > 0:
        cls.process_thread.join()

    # fallback: drain anything left in the pool; should be a no-op if everything went well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
def fetch_meta_data(cls):
    # skip fetching if the metadata file already has content
    if os.path.exists("ku_meta.txt"):
        with open("ku_meta.txt", "r", encoding="utf-8") as fp:
            if fp.readline():
                return

    home_url = "https://www.amazon.cn/s?i=digital-text&rh=n%3A116087071%2Cn%3A116089071%2Cn%3A116176071%2Cn%3A1337022071&page=1"

    # find all categories, sub-categories and their page numbers
    soup_obj = HttpUtils.get(home_url, headers=cls.amazon_headers)
    if soup_obj is None:
        print("ERROR: Cannot find category")
        return

    category_text_list = HttpUtils.get_contents(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link span")
    category_link_list = HttpUtils.get_attrs(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link", "href")

    if len(category_text_list) != len(category_link_list):
        print("ERROR: Category number not correct")
        return

    print("find lvl 1 categories:")
    print(category_text_list)

    category_list = list()
    for index in range(0, len(category_link_list)):
        category_list.append((category_text_list[index], category_link_list[index]))

    # fetch sub-category data for all categories in parallel (5 workers)
    parallel_template = ParallelTemplate(5)
    sub_category_data_list = parallel_template.run(cls.fetch_sub_category, category_list)

    with open("ku_meta.txt", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(sub_category_data_list))
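# fetch_meta_data() above serializes the (category, sub-category, link, page count)
# tuples to ku_meta.txt with json.dumps; a matching reader would look roughly like the
# hypothetical helper below, which is not part of the original class.
import json

def load_meta_data(path="ku_meta.txt"):
    # json.dumps stores the tuples as nested lists, so this returns a list of
    # [category, sub_category, link, max_page_num] entries
    with open(path, "r", encoding="utf-8") as fp:
        return json.loads(fp.read())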
def water(self):
    self.check_in()

    url_prefix = "http://www.miui.com/forum.php?mod=forumdisplay&fid=5&orderby=dateline&filter=author&orderby=dateline&page="
    page = 1
    cnt = 1
    max_cnt = 50
    chinese_char = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

    # collect thread ids from the newest-first forum listing until we have enough
    id_list = []
    while True:
        soup_obj = HttpUtils.get(url_prefix + str(page))
        print("new page: " + str(page))
        id_list.extend(HttpUtils.get_attrs(soup_obj, "tbody", "id"))
        page += 1
        if len(id_list) > max_cnt:
            break

    id_list = id_list[:max_cnt]

    for thread_id in id_list:
        if not thread_id.startswith("normalthread"):
            continue
        thread_id = thread_id[13:]  # strip the "normalthread_" prefix

        page_url = self.page_url_template.format(thread_id)
        page_soup_obj = HttpUtils.get(page_url)
        assert page_soup_obj is not None

        # spell out the reply counter, the thread id and a random number in Chinese numerals
        i = str(cnt)
        num = ""
        for index in range(len(i)):
            num += chinese_char[int(i[index])]

        id_num = ""
        for index in range(len(thread_id)):
            id_num += chinese_char[int(thread_id[index])]

        random_id = str(int(random() * 1000000000000000))
        random_id_num = ""
        for index in range(len(random_id)):
            random_id_num += chinese_char[int(random_id[index])]

        title = HttpUtils.get_content(page_soup_obj, "title").strip().replace("_灌者为王_MIUI论坛", "")
        # reply body (in Chinese): timestamp, thread id, thread title, random number, running reply count
        message = "时间{0},帖子ID{1},标题\"{2}\",随机数{3},第{4}个积分,打扰".format(
            time.strftime("%b %d %Y %H:%M:%S", time.localtime()), id_num, title, random_id_num, num)

        # form_hash = page_soup_obj.select("input[name='formhash']")[0]["value"]

        post_data = dict()
        post_data["posttime"] = str(int(time.time()))
        post_data["formhash"] = self.form_hash_mirror
        post_data["usesig"] = "1"
        post_data["subject"] = " "
        post_data["message"] = message

        form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid=5&tid={0}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(thread_id)
        # print(post_data)
        post_result = HttpUtils.post(form_submit_url, headers=self.site.login_headers, data=post_data, returnRaw=False)
        assert post_result is not None

        time.sleep(int(random() * 60) + 90)
        cnt += 1
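# The three digit-to-numeral conversions in water() above share one pattern; a small
# helper such as the hypothetical to_chinese_digits() below could replace them. This is
# only an illustrative sketch, not part of the original class.
CHINESE_DIGITS = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

def to_chinese_digits(number):
    # map each decimal digit to its Chinese uppercase numeral, e.g. 205 -> "贰零伍"
    return "".join(CHINESE_DIGITS[int(ch)] for ch in str(number))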
def vote(self):
    self.check_in()

    source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"

    page_num = 1
    max_cnt = 10
    cnt = 0
    stop_flag = False
    while not stop_flag:
        soup = HttpUtils.get(source_list_url_template.format(page_num), headers=self.site.login_headers)
        assert soup is not None
        page_num += 1

        current_score = self.get_score()
        previous_score = current_score

        article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
        for article_url in article_urls:
            try:
                article_url = "http://www.miui.com/" + article_url
                article_soup = HttpUtils.get(article_url, headers=self.site.login_headers)
                assert article_soup is not None

                title = HttpUtils.get_content(article_soup, "title")
                form = article_soup.select("#poll", limit=1)
                option = article_soup.select("#option_1", limit=1)

                # skip threads without a poll or without a first option
                if form is None or len(form) == 0:
                    continue
                if option is None or len(option) == 0:
                    continue

                print(title)

                # vote for the first option of the poll
                post_url = "http://www.miui.com/" + HttpUtils.get_attr(article_soup, "#poll", "action") + "&inajax=1"
                post_data = dict()
                post_data["pollanswers[]"] = HttpUtils.get_attr(article_soup, "#option_1", "value")
                post_data["formhash"] = self.form_hash_mirror

                post_result = HttpUtils.post(post_url, headers=self.site.login_headers, data=post_data, returnRaw=False)
                assert post_result is not None

                # stop once the score no longer grows or the vote quota is reached
                current_score = self.get_score()
                print(previous_score)
                print(current_score)
                cnt += 1
                if cnt >= max_cnt or previous_score == current_score:
                    stop_flag = True
                    break

                previous_score = current_score
                time.sleep(60)
            except Exception:
                pass
def zz(self):
    source_url_template = "https://bh.sb/post/category/main/page/{0}/"
    post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=5&extra=&topicsubmit=yes"

    self.check_in()

    max_cnt = 10
    cnt = 0
    page_num = 1
    articles = list()
    stop_flag = False
    while not stop_flag:
        # fetch the article list from bh.sb
        soup = HttpUtils.get(source_url_template.format(page_num))
        article_urls = HttpUtils.get_attrs(soup, "h2 a", "href")
        page_num += 1

        for article_index in range(len(article_urls)):
            article_url = article_urls[article_index]
            if Cache().get(article_url) is not None:
                continue

            article_soup = HttpUtils.get(article_url)
            # paragraphs alternate between a caption and its image
            titles = HttpUtils.get_contents(article_soup, ".article-content p")
            title_cnt = int(len(titles) / 2)

            for title_index in range(0, title_cnt):
                try:
                    title = titles[title_index * 2].split("】")[1]
                    image = titles[title_index * 2 + 1]
                    if type(image) != Tag:
                        continue
                    src = image.attrs["src"]
                    if src.endswith("jpg"):
                        continue

                    # post body (in Chinese): asks readers to like the post, followed by the image
                    message = "好玩您就点个赞,不好笑请期待下一贴~\n"
                    message += "[img]{0}[/img]".format(src)

                    if Cache().get(title) is not None:
                        continue
                    Cache().set(title, message)
                    articles.append((title, message))

                    cnt += 1
                    if cnt >= max_cnt:
                        stop_flag = True
                        break
                except Exception:
                    pass

            if stop_flag:
                break

            # mark this url only after all of its articles have been collected
            Cache().set(article_url, article_url)

    # post the collected articles, cycling through the available topic type ids
    type_id_list = ["1629", "1631", "1633", "4481", "1641"]
    type_index = 0
    for (title, message) in articles:
        print((title, message))

        post_data = dict()
        post_data["posttime"] = str(int(time.time()))
        post_data["formhash"] = self.form_hash_mirror
        post_data["wysiwyg"] = "1"
        post_data["typeid"] = type_id_list[type_index]
        post_data["allownoticeauthor"] = "1"
        post_data["addfeed"] = "1"
        post_data["usesig"] = "1"
        post_data["save"] = ""
        post_data["uploadalbum"] = "-2"
        post_data["newalbum"] = "请输入相册名称"  # literal form placeholder: "please enter album name"
        post_data["subject"] = title
        post_data["message"] = message

        post_result = HttpUtils.post(post_url, headers=self.site.login_headers, data=post_data, returnRaw=False)
        assert post_result is not None

        type_index = (type_index + 1) % len(type_id_list)
        time.sleep(int(random() * 300) + 2700)