def load_weather_data(cls):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "d1.weather.com.cn",
        "Referer": "http://www.weather.com.cn/weather1d/%s.shtml" % cls.city_code
    }

    # current observation: temperature, AQI, humidity
    res = HttpUtils.get("http://d1.weather.com.cn/sk_2d/%s.html?_=%d" % (cls.city_code, round(time.time() * 1000)),
                        headers=headers, return_raw=True)
    html = res.content.decode("utf-8")
    data = json.loads(html.replace("var dataSK = ", ""))

    # daily forecast; note the stripped JS variable names are hard-coded to city code 101020100
    res = HttpUtils.get(
        "http://d1.weather.com.cn/dingzhi/%s.html?_=%d" % (cls.city_code, round(time.time() * 1000)),
        headers=headers, return_raw=True)
    html = res.content.decode("utf-8")
    html2 = html.replace("var cityDZ101020100 =", "").replace(";var alarmDZ101020100 ={\"w\":[]}", "")
    data2 = json.loads(html2).get("weatherinfo")

    return "今天%s,最高气温%s,最低气温%s,%s%s, 当前气温%s,空气质量指数%s,相对湿度%s" % (
        data2.get("weather"), data2.get("temp"), data2.get("tempn"), data2.get("wd"), data2.get("ws"),
        data.get("temp"), data.get("aqi"), data.get("sd"))
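# Hedged usage sketch (illustrative, not from the original source). Assumption:
# load_weather_data is a @classmethod on a crawler class that defines city_code;
# "WeatherBot" is a placeholder name for that class, and the HttpUtils / json / time
# imports are assumed to be present in the real module. As noted above, the dingzhi
# parsing only strips variables named for city code 101020100, so that code is used here.
class WeatherBot:
    city_code = "101020100"

    load_weather_data = classmethod(load_weather_data)  # reuse the function defined above


print(WeatherBot.load_weather_data())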
def fetch_sub_category(cls, category):
    # go into category to find sub-category info
    category_link = category[1]
    category_text = category[0]

    sub_category_data_list = list()

    if category_text in cls.category_black_list:
        return []

    soup_obj = HttpUtils.get(cls.amazon_base_url + category_link, headers=cls.amazon_headers)
    sub_category_text_list = HttpUtils.get_contents(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a span")
    sub_category_link_list = HttpUtils.get_attrs(
        soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a", "href")

    if len(sub_category_link_list) != len(sub_category_text_list):
        print("ERROR: Sub-category number not correct")
        return []

    # no sub-category: treat the category itself as its only sub-category
    if len(sub_category_link_list) == 0:
        sub_category_text_list = [category_text]
        sub_category_link_list = [category_link]

    print("find lvl 2 categories for %s" % category_text)
    print(sub_category_text_list)

    # find sub-category page number
    for sub_index in range(0, len(sub_category_link_list)):
        sub_category_link = sub_category_link_list[sub_index]
        sub_category_text = sub_category_text_list[sub_index]

        soup_obj = HttpUtils.get(cls.amazon_base_url + sub_category_link, headers=cls.amazon_headers)
        page_info = HttpUtils.get_contents(soup_obj, "ul.a-pagination li.a-disabled")
        if len(page_info) == 2:
            max_page_num = page_info[1]
        elif len(page_info) == 0:
            # no pagination
            max_page_num = 1
        else:
            # 5 pages or fewer: the last normal page link holds the max page number
            max_page_num = HttpUtils.get_contents(
                soup_obj, "ul.a-pagination li.a-normal a")[-1]

        print("cat=%s, sub-cat=%s, page=%s" % (category_text, sub_category_text, max_page_num))
        sub_category_data_list.append((category_text, sub_category_text, sub_category_link, max_page_num))

    return sub_category_data_list
def sign(self):
    self.check_in()

    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    time_start = time.time()

    for i in range(100):
        HttpUtils.get(
            "http://www.miui.com/extra.php?mod=sign/index&op=sign",
            headers=self.site.login_headers, return_raw=True)

    time_end = time.time()
    print('time cost', time_end - time_start, 's')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id

    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        else:
            retry += 1
            assert retry < 5, "fail to query %s" % url

    cls.comic_name = HttpUtils.get_content(resp, "title").strip()

    links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")
    titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")

    assert len(titles) == len(links)

    cls.init_thread()

    for index in range(len(titles)):
        link = links[index]
        title = titles[index].strip()
        cls.parse_lvl_two((link, title))

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
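# The retry-until-not-None loop above recurs in several parse_lvl_* methods below.
# A hedged refactoring sketch (not in the original source, helper name is hypothetical):
# same semantics, i.e. give up via assertion after 5 consecutive failed fetches.
def get_with_retry(url, max_retry=5):
    """Fetch url via HttpUtils.get, retrying until a non-None response is returned."""
    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            return resp
        retry += 1
        assert retry < max_retry, "fail to query %s" % url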
def crawl_sub_category_book(cls, sub_category_meta):
    ku_book_title_list = list()

    category_name = sub_category_meta[0]
    sub_category_name = sub_category_meta[1]
    sub_category_link = cls.amazon_base_url + sub_category_meta[2]
    page_num = int(sub_category_meta[3])

    for page in range(1, page_num + 1):
        print("reading cat=%s,sub-cat=%s,page=%s" % (category_name, sub_category_name, page))
        url = sub_category_link.split("%page=")[0] + "&page=" + str(page)
        soup_obj = HttpUtils.get(url, headers=cls.amazon_headers)
        if soup_obj is None:
            print("blocked?")
            break

        title_list = HttpUtils.get_contents(
            soup_obj, "div.s-result-list div.sg-col-inner h2.a-size-mini span.a-size-medium")

        current_page_title_list = list()
        for title in title_list:
            # remove meta info
            title = title.split("(")[0].split("（")[0].split("【")[0]
            ku_book_title_list.append(title)
            current_page_title_list.append(title)

        print(current_page_title_list)

        sleep(random() * 0.5 + 0.5)

    return ku_book_title_list
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    resp = HttpUtils.get(
        "https://api.ishuhui.shop/ver/4e198319/anime/detail?id=%d&type=comics&.json" % cls.book_id,
        return_raw=True)
    assert resp is not None

    json_data = json.loads(resp.text)
    cartoons = json_data["data"]["comicsIndexes"]["1"]["nums"]

    cls.init_thread()

    for type in cartoons.keys():
        posts = cartoons[type]
        for index in posts.keys():
            post_id = posts[index][0]["id"]
            final_url = "https://prod-api.ishuhui.com/comics/detail?id=%s" % post_id
            cls.parse_lvl_two(final_url)

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None

    m = re.search("chapter: \$\.evalJSON\(\'(.*)\'\),", content.text)
    if not m or m.group(1) == "null":
        m = re.search("chapter: (.*),", content.text)
    assert m
    json_data = json.loads(m.group(1))

    book = json_data["comic_id"]
    number = json_data["chapter_id"]
    title = json_data["name"].strip().replace(" ", "-").replace("(", "（").replace(")", "）")

    # create folder once
    folder_name = "%s/%08d_%s" % (book, int(number), title)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    m = re.search("image_list: \$\.evalJSON\(\'(.*)\'\),", content.text)
    if not m or m.group(1) == "null":
        m = re.search("image_list: (.*),", content.text)
    assert m
    json_data = json.loads(m.group(1))

    for index in json_data.keys():
        image_data = json_data[index]
        page = image_data["page"]
        image_url = base64.decodebytes(image_data["src"].encode("utf-8")).decode("utf-8")
        format = image_url.split(".")[-1]
        image_file_name = "%03d.%s" % (int(page), format)
        file_path = "/".join([folder_name, image_file_name])
        cls.task_pool.put([file_path, image_url, 0])
def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None

    json_data = json.loads(content.text)

    book = json_data["data"]["animeName"]
    title = json_data["data"]["title"]
    number = json_data["data"]["numberStart"]
    images = json_data["data"]["contentImg"]

    # create folder once
    '''
    folder_name = "%s/%03d_%s" % (book, int(number), title)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for image in images:
        image_file_name = image["name"]
        image_url = image["url"]
        file_path = "/".join([folder_name, image_file_name])
        cls.task_pool.put([file_path, image_url, 0])
    '''

    folder_name = "%s/%03d_%s" % (book, int(number), title)
    for image in images:
        image_file_name = image["name"]
        image_url = image["url"]
        file_path = folder_name + image_file_name
        cls.task_pool.put([file_path, image_url, 0])
def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None

    location = os.path.join(os.path.dirname(__file__), "../bin/phantomjs")
    jsFile = os.path.join(os.path.dirname(__file__), "../static/tencent_comic.js")

    print(">>> parsing " + url)
    data = os.popen("%s %s %s" % (location, jsFile, url)).read()

    # retry once more; os.popen().read() returns an empty string (not None) on failure
    if not data:
        data = os.popen("%s %s %s" % (location, jsFile, url)).read()
    assert data

    print("****** data=" + data)
    json_data = json.loads(data)

    book = json_data["title"]
    number = json_data["cid"]
    title = json_data["cTitle"].strip().replace(" ", "-").replace("(", "（").replace(")", "）")

    # create folder once
    folder_name = "%s/%08d_%s" % (book, int(number), title)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for index in json_data["picture"].keys():
        image_url = json_data["picture"][index]
        format = "png"
        image_file_name = "%03d.%s" % (int(index), format)
        file_path = "/".join([folder_name, image_file_name])
        cls.task_pool.put([file_path, image_url, 0])
def parse_lvl_two(cls, info):
    url = info[0]
    index = info[1]

    # create folder once
    folder_name = "output/龙珠/" + str(index)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        else:
            retry += 1
            assert retry < 5, "fail to query %s" % url

    links = HttpUtils.get_attrs(resp, ".ListContainer .ItemThumb a", "style")
    assert links is not None

    for link in links:
        url = re.search("background:url\(.*'(.*)'", link).group(1).replace("_thumb.", "")
        file_name = url.split("/")[-1]
        cls.task_pool.put([folder_name + "/" + file_name, url, 0])
def get_score(self):
    self.check_in()

    soup = HttpUtils.get("http://www.miui.com/space-uid-2248502469.html")
    assert soup is not None

    score = HttpUtils.get_content(soup, "#statistic_content li:nth-of-type(1) a")
    return int(score)
def parse_users(cls, url):
    soup_obj = HttpUtils.get(url)
    if soup_obj is None:
        print(">>>>>> Fail to parse " + url)
        return None

    data_state = HttpUtils.get_attr(soup_obj, "#data", "data-state")
    data_map = json.loads(data_state)
    return data_map['entities']['users']
def crawl(self):
    site = self.generate_site()
    assert self.login(site)

    for i in range(107, 164):
        soup_obj = HttpUtils.get(site.home_page + "?page=" + str(i), headers=site.login_headers)
        ids = self.parse(soup_obj)
        ParallelTemplate(150).run(func=self.say_thank, inputs=ids)
        print(">>>>>> finish page " + str(i))
def download_seed_file(self, seed_id):
    res = HttpUtils.get("https://totheglory.im/rssdd.php?par=%s==&ssl=yes" % seed_id,
                        headers=self.site.login_headers, return_raw=True)

    try:
        with open("%s.torrent" % seed_id, "wb") as f:
            f.write(res.content)
    except Exception as e:
        print("Cannot download seed file: " + seed_id, e)
def check_and_notify(cls):
    url = "https://www.flyertea.com/forum.php?mod=forumdisplay&orderby=dateline&sum=226&fid=226&mobile=2"
    soup_obj = HttpUtils.get(url, return_raw=False)

    titles = list(map(lambda title: title.strip(),
                      HttpUtils.get_contents(soup_obj, "div.n5sq_htmk p.n5_htnrbt")))
    readers = list(map(lambda x: int(x),
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_hthfcs")))
    flowers = list(map(lambda x: int(x) if x else 0,
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_htdzcs")))

    print(titles)
    print(readers)
    print(flowers)
def read_msg_content(self, msg):
    soup_obj = HttpUtils.get(self.detail_url + msg.id, headers=self.site.login_headers)
    assert soup_obj is not None

    td_list = soup_obj.select("#outer table:nth-of-type(2) tr:nth-of-type(3) td:nth-of-type(1)")

    print("--------------------------------------")
    print(td_list[0].text)
    print("--------------------------------------")
def action(self, data):
    vote_url = "https://kp.m-team.cc/vote.php?tid=%s&type=1"

    success_cnt = 0
    for id in data:
        res_obj = HttpUtils.get(url=vote_url % id, headers=self.site.login_headers)
        msg = HttpUtils.get_content(res_obj, "#outer table h2")
        # "操作成功" means "operation succeeded" on the result page
        if msg == "操作成功":
            success_cnt += 1

    print("Vote success: " + str(success_cnt))
def crawl(self, print_log=True):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.site.home_page, headers=self.site.login_headers)
    seeds = self.parse(soup_obj)

    if print_log:
        for seed in seeds:
            print(seed)

    return seeds
def download_seed_file(self, seed_id):
    self.login_if_not()

    res = HttpUtils.get("https://pt.sjtu.edu.cn/download.php?id=" + str(seed_id),
                        headers=self.site.login_headers, return_raw=True)

    try:
        with open("%s.torrent" % seed_id, "wb") as f:
            f.write(res.content)
    except Exception as e:
        print("Cannot download seed file: " + seed_id, e)
def download_seed_file(self, seed_id):
    res = HttpUtils.get(
        "https://kp.m-team.cc/download.php?id=%s&passkey=%s&https=1" % (seed_id, self.passKey),
        headers=self.site.login_headers, return_raw=True)

    try:
        with open("%s.torrent" % seed_id, "wb") as f:
            f.write(res.content)
    except Exception as e:
        print("Cannot download seed file: " + seed_id, e)
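# The download_seed_file variants above differ only in the URL they build. A hedged
# refactoring sketch (not in the original source, helper name is hypothetical): one shared
# helper that saves the raw response to "<seed_id>.torrent" with the same error handling.
def save_torrent(seed_id, url, headers):
    """Download a .torrent file to the working directory; log and swallow any failure."""
    res = HttpUtils.get(url, headers=headers, return_raw=True)
    try:
        with open("%s.torrent" % seed_id, "wb") as f:
            f.write(res.content)
    except Exception as e:
        print("Cannot download seed file: " + str(seed_id), e)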
def parse_captcha(self, site):
    soup_obj = HttpUtils.get("https://pt.sjtu.edu.cn/login.php", headers=site.login_headers)
    captcha_image_list = soup_obj.select("form img")

    # if captcha image exists, parse expression and return
    if len(captcha_image_list) > 0:
        image_url = "https://pt.sjtu.edu.cn/" + captcha_image_list[0]["src"]
        HttpUtils.download_file(image_url, "/tmp/cap.png", over_write=True)
        return PuTaoCaptchaParser.analyze("/tmp/cap.png")
    else:
        return "XxXx"
def stat(self, unit="GB", update_cache=True): self.login_if_not() soup_obj = HttpUtils.get(self.site.stat_page, headers=self.site.login_headers) assert soup_obj is not None div_list = soup_obj.select( "table.mainouter tr td table tr td div[align='center']") assert len(div_list) == 1 content = div_list[0].contents[0] m = re.search(u"获取(\d+.\d+)个魔力", content) assert m mp = float(m.group(1)) span_list = soup_obj.select("#usermsglink span") up = HttpUtils.pretty_format(span_list[1].contents[2], unit) down = HttpUtils.pretty_format(span_list[1].contents[4], unit) prev_up = Cache().get(self.get_site_name() + "_up") prev_down = Cache().get(self.get_site_name() + "_down") if prev_up is None: prev_up = 0 else: prev_up = float(prev_up.decode()) if prev_down is None: prev_down = 0 else: prev_down = float(prev_down.decode()) delta_up = round(up - prev_up, 2) delta_down = round(down - prev_down, 2) if delta_down == 0: delta_ratio = "Inf" else: delta_ratio = round(delta_up / delta_down, 2) current_upload = round(up - down, 2) print( "%s, mp=%s, up=%s, down=%s, current=%s, delta_up=%s, delta_down=%s, delta_ratio=%s" % (str(time.strftime("%Y-%m-%d %H:%M:%S")), mp, up, down, current_upload, delta_up, delta_down, delta_ratio)) if update_cache: Cache().set(self.get_site_name() + "_up", up) Cache().set(self.get_site_name() + "_down", down) return mp, up, down
def parse_lvl_two(cls, info):
    link = info[0]
    title = info[1]

    # create folder once
    folder_name = "output/" + cls.comic_name + "/" + title
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    # path_file_number = len(glob.glob(pathname=folder_name + '/*'))
    # if path_file_number == image_number:
    #     print("下载完毕:" + title)
    #     # already downloaded all
    #     return

    print("开始下载: " + title)

    index = 0
    query_url = cls.root_url + link

    retry = 0
    while True:
        content = HttpUtils.get(query_url)
        if content is not None:
            break
        else:
            retry += 1
            assert retry < 5, "fail to query %s" % query_url

    if content.text.strip() == "":
        print("url: " + query_url)
        print("get wrong data: \"" + content.text.strip() + "\"")
        print("fail to parse image key, %s-%d" % (title, index))
    else:
        url_encoded = re.search("qTcms_S_m_murl_e.*=.*(\".*?\");", content.text).group(1)
        image_url_list = base64.b64decode(url_encoded).decode("utf-8").split("$qingtiandy$")

        assert len(image_url_list) > 0

        index = 1
        for image_url in image_url_list:
            file_name = image_url.split("/")[-1]
            file_path = "%s/%03d_%s" % (folder_name, index, file_name)

            if "http" not in image_url:
                image_url = "http://j.aiwenwo.net" + image_url

            cls.task_pool.put([file_path, image_url, 0])
            index += 1
def check_login(self, site):
    HttpUtils.create_session_if_absent()
    HttpUtils.load_cookie()

    soup_obj = HttpUtils.get(site.home_page, headers=site.login_headers)
    content = HttpUtils.get_content(soup_obj, site.login_verify_css_selector)
    print("Current user is " + str(content))

    result = content is not None and content == site.login_verify_str
    if result:
        HttpUtils.save_cookie()
    else:
        HttpUtils.clear_cookie()

    return result
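# Hedged sketch (illustrative): check_login is the natural guard inside the
# login_if_not() call that several crawl/stat methods above rely on. The wiring below
# is an assumption for illustration; the project's real login_if_not may differ.
def login_if_not(self):
    site = self.generate_site()
    if not self.check_login(site):
        assert self.login(site)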
def parse_lvl_two(cls, info):
    chapter_url = info[0]
    title = info[1]

    # create folder once
    folder_name = "output/" + cls.comic_name + "/" + title
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    # path_file_number = len(glob.glob(pathname=folder_name + '/*'))
    # if path_file_number == image_number:
    #     print("下载完毕:" + title)
    #     # already downloaded all
    #     return

    print("开始下载: " + title)

    query_url = cls.root_url + chapter_url

    retry = 0
    while True:
        content = HttpUtils.get(query_url, headers=cls.headers)
        if content is not None:
            break
        else:
            retry += 1
            assert retry < 5, "fail to query %s" % query_url

    script_content = HttpUtils.get_contents(content, "script")
    print(script_content[2][1:].replace(";;", ";").replace(";", ";\n"))

    image_url_list = re.search("chapterImages.*=.*\[(.*)\];",
                               script_content[2]).group(1).replace("\"", "").split(",")
    path = re.search("chapterPath.*?=.*?\"(.*?)\";", script_content[2]).group(1)

    assert len(image_url_list) > 0

    index = 1
    for image_url in image_url_list:
        full_image_url = "http://js1.zzszs.com.cn/" + path + image_url
        file_path = "%s/%03d_%s" % (folder_name, index, image_url)
        cls.task_pool.put([file_path, full_image_url, 0])
        index += 1
def get_max_chapter_num(animation_id):
    # not sure what this ver means, could be any value and still works
    verification = "3b230956"
    response = HttpUtils.get(
        "https://prod-api.ishuhui.com/ver/{0}/anime/detail?id={1}&type=comics&.json"
        .format(verification, animation_id),
        return_raw=True)

    if response.status_code != 200 and response.status_code != 301:
        return ShuHuiWatchDog.INVALID_CHAPTER_NUM
    else:
        comic_data = json.loads(response.text)
        max_chapter_num = int(comic_data["data"]["comicsIndexes"]["1"]["maxNum"])
        comic_name = comic_data["data"]["name"]
        return comic_name, max_chapter_num
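# Hedged usage sketch (illustrative): the function returns the INVALID_CHAPTER_NUM
# sentinel on HTTP failure and a (comic_name, max_chapter_num) tuple on success, so the
# caller has to distinguish the two shapes. Assumptions: the function hangs off
# ShuHuiWatchDog, and animation_id=1 is a placeholder value.
result = ShuHuiWatchDog.get_max_chapter_num(animation_id=1)
if result == ShuHuiWatchDog.INVALID_CHAPTER_NUM:
    print("fail to fetch chapter info")
else:
    comic_name, max_chapter_num = result
    print("%s has %d chapters" % (comic_name, max_chapter_num))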
def crawl(self, print_log=True):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.site.home_page, headers=self.site.login_headers)
    assert soup_obj is not None

    data = self.parse_page(soup_obj)

    if print_log:
        if type(data) is list:
            for item in data:
                print(item)
        else:
            print(data)

    return data
def read_msg(self, index):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.url + index, headers=self.site.login_headers)
    assert soup_obj is not None

    tr_list = soup_obj.select("#outer form table tr")

    messages = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue

        td_list = tr.select("td.rowfollow")
        if len(td_list) < 4:
            # skip footer
            continue

        msg = Message()
        msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
        msg.title = HttpUtils.get_content(td_list[1], "a")
        msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
        if msg.from_user is None:
            # for ad.
            msg.from_user = td_list[2].contents[0]
        msg.since = HttpUtils.get_content(td_list[3], "span")
        link = HttpUtils.get_attr(td_list[1], "a", "href")
        msg.id = link.split("id=")[1]

        messages.append(msg)

    print("--------------------------------------")
    index = 1
    for msg in messages:
        print("{:<2}|".format(index) + str(msg))
        index += 1
    print("--------------------------------------")

    return messages
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    soup_obj = HttpUtils.get("http://www.u17.com/comic/%s.html" % cls.book_id)
    assert soup_obj is not None

    chapters = soup_obj.select("ul#chapter li a")

    cls.init_thread()

    for chapter in chapters:
        final_url = chapter["href"]
        cls.parse_lvl_two(final_url)

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()
def fetch_meta_data(cls):
    with open("ku_meta.txt", "r", encoding="utf-8") as fp:
        if fp.readline():
            # already exist, skip
            return

    home_url = "https://www.amazon.cn/s?i=digital-text&rh=n%3A116087071%2Cn%3A116089071%2Cn%3A116176071%2Cn%3A1337022071&page=1"

    # find all category, sub-category and page number
    soup_obj = HttpUtils.get(home_url, headers=cls.amazon_headers)
    if soup_obj is None:
        print("ERROR: Cannot find category")
        return

    category_text_list = HttpUtils.get_contents(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link span")
    category_link_list = HttpUtils.get_attrs(
        soup_obj, "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link", "href")

    if len(category_text_list) != len(category_link_list):
        print("ERROR: Category number not correct")
        return

    print("find lvl 1 categories:")
    print(category_text_list)

    category_list = list()
    for index in range(0, len(category_link_list)):
        category_list.append((category_text_list[index], category_link_list[index]))

    parallel_template = ParallelTemplate(5)
    sub_category_data_list = parallel_template.run(cls.fetch_sub_category, category_list)

    with open("ku_meta.txt", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(sub_category_data_list))
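# Hedged pipeline sketch (illustrative, not from the original source). Assumptions:
# fetch_meta_data, fetch_sub_category and crawl_sub_category_book are classmethods on one
# Kindle-Unlimited crawler class, called KindleCrawler here as a placeholder; ku_meta.txt
# must already exist (even empty), since fetch_meta_data opens it in "r" mode first; and
# ParallelTemplate.run is assumed to return the 4-element tuples that
# crawl_sub_category_book expects, one per sub-category.
KindleCrawler.fetch_meta_data()

with open("ku_meta.txt", "r", encoding="utf-8") as fp:
    sub_category_data_list = json.loads(fp.readline())

all_titles = []
for sub_category_meta in sub_category_data_list:
    all_titles.extend(KindleCrawler.crawl_sub_category_book(sub_category_meta))

print("total titles: %d" % len(all_titles))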