def parse_arts_search(self, url, keys, response):
    """ parser, keys: ("arts_search", arts_key) """
    _, querys = spider.get_url_params(url)
    self.current_page = int(querys["page"][0]) if "page" in querys else self.current_page
    logging.debug("WeiXinPublic parse_arts_search: update current page, current_page=%d" % self.current_page)

    soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser")
    if not self.check_anti_by_captcha(soup):
        self.reset_this_class()
        return

    # current page
    for art_soup in soup.find_all("div", class_="txt-box"):
        art_url = spider.get_url_legal(art_soup.find("a").get("href"), base_url=url)
        user_openid = art_soup.find("a", id="weixin_account").get("i")
        user_name = art_soup.find("a", id="weixin_account").get("title")
        self.fetch_queue.put(item=(art_url, ("get_art", keys[1], user_openid, user_name), 0))

    # next page
    next_page = soup.find("a", id="sogou_next")
    if next_page:
        next_page_url = spider.get_url_legal(next_page.get("href"), base_url=url)
        self.fetch_queue.put(item=(next_page_url, keys, 0))
    return
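# Note: the parsers in this file delegate link normalization to spider.get_url_legal(),
# whose implementation is not shown here. The helper below is a minimal sketch
# (an assumption, not the library's actual code) of the behaviour the parsers rely on:
# resolve a possibly-relative href against the page URL so that only absolute URLs
# are queued for fetching. The name get_url_legal_sketch is illustrative only.
from urllib.parse import urljoin


def get_url_legal_sketch(url: str, base_url: str) -> str:
    """Hypothetical stand-in for spider.get_url_legal(url, base_url)."""
    # urljoin keeps absolute hrefs unchanged and resolves relative ones against
    # base_url; the real helper may additionally filter schemes or strip illegal characters.
    return urljoin(base_url, url)


# Example: get_url_legal_sketch("/page/2", "http://blog.jobbole.com/all-posts/")
# returns "http://blog.jobbole.com/page/2"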
def working(self, task, spider_url, next_page_url, cookies):
    try:
        url, keys, contents, priority = task
        # contents = urllib.parse.unquote(content)

        re_group = re.findall(spider_url, contents, flags=re.IGNORECASE)
        url_set = {spider.get_url_legal(_url, base_url=url).split("#")[0] for _url in re_group}

        # split the next-page links out of the normal url set
        next_page_set = {_url for _url in url_set if re.search(next_page_url, _url)}
        url_set -= next_page_set

        if next_page_url:
            next_url_re_group = re.findall(next_page_url, contents, flags=re.IGNORECASE)
            next_page_set.update(spider.get_url_legal(_url, base_url=url).split("#")[0] for _url in next_url_re_group)

        state, item = self.htm_parse(url, contents, cookies)
        key = {"type": "parser"}
    except Exception as excep:
        next_page_set = set()
        url_set = set()
        item = None
        state = 0
        key = {"type": "parser"}
        logging.error("parser error: %s", excep)
    return next_page_set, url_set, key, state, item
def parse_user_arts(self, url, keys, response):
    """ parser, keys: ("user_arts", user_id, user_name) """
    html = spider.get_html_content(response, charset="utf-8")

    json_data = spider.get_json_data(html, r"msgList = '(?P<item>\{[\w\W]+?\})'")
    if json_data:
        for item in json_data.get("list", []):
            item_url = spider.get_url_legal(item["app_msg_ext_info"]["content_url"][1:],
                                            self.base_url_weixinqq).replace("&amp;", "&")
            self.fetch_queue.put(item=(item_url, ("get_art", None, keys[1], keys[2]), 0))

            for subitem in item["app_msg_ext_info"]["multi_app_msg_item_list"]:
                subitem_url = spider.get_url_legal(subitem["content_url"][1:],
                                                   self.base_url_weixinqq).replace("&amp;", "&")
                self.fetch_queue.put(item=(subitem_url, ("get_art", None, keys[1], keys[2]), 0))

    logging.debug("WeiXinPublic parse_user_arts: len(fetch_queue)=%d" % self.fetch_queue.qsize())
    return
def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object):
    # test error-logging
    assert random.randint(0, 100) != 8, "error-in-parser"

    status_code, url_now, html_text = content

    url_list = []
    if (self._max_deep < 0) or (deep < self._max_deep):
        re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE)
        url_list = [(spider.get_url_legal(_url, base_url=url), keys, priority + 1) for _url in re_group]

    # save_list can be list / tuple / dict
    title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE)
    # item = (url, title.group("title").strip(), datetime.datetime.now()) if title else []
    item = {
        "url": url,
        "title": title.group("title").strip(),
        "datetime": datetime.datetime.now(),
    } if title else {}

    # test multi-processing (heavy time)
    [BeautifulSoup(html_text, "lxml") for _ in range(10)]

    return 1, url_list, item
def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object): """ 定义解析函数,解析抓取到的content,生成待抓取的url和待保存的item """ status_code, url_now, html_text = content url_list = [] if (self._max_deep < 0) or (deep < self._max_deep): re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE) url_list = [(spider.get_url_legal(_url, base_url=url), keys, priority + 1) for _url in re_group] title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE) # item = (url, title.group("title").strip(), datetime.datetime.now()) if title else [] item = { "url": url, "title": title.group("title").strip(), "datetime": datetime.datetime.now() } if title else {} # test multi-processing(heavy time) [BeautifulSoup(html_text, "lxml") for _ in range(10)] return 1, url_list, item
def htm_parse(self, task_parse: spider.TaskParse) -> spider.ResultParse: """ 定义解析函数,解析抓取到的content,生成待抓取的url列表和待保存的item """ status_code, url_now, html_text = task_parse.content task_fetch_list = [] if (self._max_deep < 0) or (task_parse.deep < self._max_deep): re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE) url_list = [ spider.get_url_legal(_url, base_url=task_parse.url) for _url in re_group ] task_fetch_list = [ spider.TaskFetch.from_task_parse(task_parse, url_new=url) for url in url_list ] title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE) item = {"url": url_now, "title": title.group("title")} if title else {} task_save = spider.TaskSave.from_task_parse(task_parse, item=item) return spider.ResultParse(state_code=1, task_fetch_list=task_fetch_list, task_save=task_save)
def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content):
    """ Override of htm_parse() """
    # parse content (cur_code, cur_url, cur_info, cur_html)
    cur_code, cur_url, cur_info, cur_html = content

    # get url_list and save_list
    url_list = []
    if (self.max_deep < 0) or (deep < self.max_deep):
        a_list = re.findall(r"<a[\w\W]+?href=\"(?P<url>[\w\W]+?)\"[\w\W]*?>[\w\W]+?</a>", cur_html, flags=re.IGNORECASE)
        url_list = [(_url, keys, critical, priority+1) for _url in [spider.get_url_legal(href, url) for href in a_list]]

    title = re.search(r"<title>(?P<title>[\w\W]+?)</title>", cur_html, flags=re.IGNORECASE)
    save_list = [(url, title.group("title"), datetime.datetime.now()), ] if title else []

    # test cpu task
    count = 0
    for i in range(1000):
        for j in range(1000):
            count += ((i*j) / 1000)

    # test parsing error
    if random.randint(0, 5) == 3:
        parse_repeat += (1 / 0)

    # return code, url_list, save_list
    return 1, url_list, save_list
def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object):
    status_code, url_now, html_text = content

    # test multi-processing
    [BeautifulSoup(html_text, "lxml") for _ in range(10)]

    url_list = []
    if (self._max_deep < 0) or (deep < self._max_deep):
        for _url in re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE):
            url_list.append((spider.get_url_legal(_url, base_url=url), keys, priority + 1))

    title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE)
    save_list = [(url, title.group("title").strip(), datetime.datetime.now()), ] if title else []

    # test error-logging
    # assert random.randint(0, 100) != 8, "error-in-parser"

    return 1, url_list, save_list
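# All of the htm_parse() variants with the (priority, url, keys, deep, content)
# signature follow the same contract: they return (state_code, url_list, save_list_or_item).
# The loop below is a hypothetical, simplified driver (not the spider package's real
# scheduler) that only illustrates how that return value is typically consumed:
# new URL tuples go back to the fetch queue, parsed results go to the saver.
# drive_parser_sketch, fetched_pages, fetch_queue and save_list_all are illustrative names.
def drive_parser_sketch(parser, fetched_pages, fetch_queue, save_list_all):
    # fetched_pages: iterable of (priority, url, keys, deep, content) produced by a fetcher
    for priority, url, keys, deep, content in fetched_pages:
        state_code, url_list, result = parser.htm_parse(priority, url, keys, deep, content)
        if state_code == 1:
            for task in url_list:       # (url, keys, priority + 1) tuples to fetch next
                fetch_queue.put(task)
            # some variants return a list of save tuples, others a single item dict
            save_list_all.extend(result if isinstance(result, list) else [result])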
def check_anti_by_captcha(self, html):
    """ check anti-spider by captcha """
    soup = bs4.BeautifulSoup(html, "html.parser")

    cid, code = None, None
    while not code:
        captcha_url = soup.find("img", attrs={"node-type": "yzm_img"}).get("src")
        response = self.opener.open(spider.get_url_legal(captcha_url, self.search_url))
        cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1004")

    verified_url = "http://s.weibo.com/ajax/pincode/verified?__rnd=%d" % int(time.time() * 1000)
    post_data = spider.make_post_data({"secode": code, "type": "sass", "pageid": "weibo", "_t": 0})

    temp = json.loads(spider.get_html_content(self.opener.open(verified_url, data=post_data)))
    if temp["code"] == "100000":
        logging.warning("WeiBoSearch anti-spider succeed")
    else:
        logging.warning("WeiBoSearch anti-spider failed")
        if cid:
            self.yundama.report(cid)
    return
def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object): status_code, url_now, html_text = content url_list = [] if (self._max_deep < 0) or (deep < self._max_deep): tmp_list = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE) url_list = [(_url, keys, priority+1) for _url in [spider.get_url_legal(href, url) for href in tmp_list]] title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE) save_list = [(url, title.group("title").strip(), datetime.datetime.now()), ] if title else [] return 1, url_list, save_list
def url_parse(baseurl, html_doc, keys, priority, deep, MAX_DEEP):
    url_list = []
    soup = BeautifulSoup(html_doc, 'lxml')

    if deep == 0:
        post_nodes = soup.select('#archive .floated-thumb .post-thumb a')
        for node in post_nodes:
            url_list.append((spider.get_url_legal(node['href'], baseurl), keys, priority + 1))
    elif deep < MAX_DEEP:
        related_nodes = soup.select('.digg-item-updated-title a')
        for node in related_nodes:
            if '/#comments' in node['href']:
                continue
            url_list.append((spider.get_url_legal(node['href'], baseurl), keys, priority + 1))

    return url_list
def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object): status_code, url_now, html_text = content url_list = [] if (self._max_deep < 0) or (deep < self._max_deep): tmp_list = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE) url_list = [(_url, keys, priority+1) for _url in [spider.get_url_legal(href, url) for href in tmp_list]] title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE) save_list = [(url, title.group("title").strip(), datetime.datetime.now()), ] if title else [] return 1, url_list, save_list
def check_anti_by_captcha(self, soup):
    """ check anti-spider by captcha
    :return 1, 0: 1 (can continue), 0 (should repeat the request)
    """
    if not soup.find("img", id="seccodeImage"):
        return 1

    while 1:
        cid, code = None, None
        while not code:
            captcha_url = soup.find("img", id="seccodeImage").get("src")
            response = self.opener.open(spider.get_url_legal(captcha_url, self.base_url_antispider))
            cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1006")

        post_data = urllib.parse.urlencode({
            "c": code,
            "r": soup.find("input", id="from").get("value"),
            "v": 5
        }).encode()
        response = self.opener.open("http://weixin.sogou.com/antispider/thank.php", data=post_data)

        json_data = json.loads(spider.get_html_content(response, charset="utf-8"))
        if json_data["msg"].find("解封成功") >= 0:
            snuid = json_data["id"]
            self.cookie_jar.set_cookie(spider.make_cookie(name="SNUID", value=snuid, domain="weixin.sogou.com"))

            post_dict = {
                "uigs_productid": "webapp",
                "type": "antispider",
                "subtype": "",
                "domain": "weixin",
                "suv": "",
                "snuid": snuid,
                "t": int(time.time() * 1000)
            }
            for cookie in self.cookie_jar:
                if cookie.name == "SUV":
                    post_dict["suv"] = cookie.value

            post_dict["subtype"] = "0_seccodeInputSuccess"
            post_dict["t"] = int(time.time() * 1000)
            self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))

            post_dict["subtype"] = "close_refresh"
            post_dict["t"] = int(time.time() * 1000)
            self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))
            break
        else:
            if cid:
                self.yundama.report(cid=cid)

    logging.warning("WeiXinPublic check_anti_by_captcha: anti-spider success!")
    return 0
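# Both check_anti_by_captcha() variants above assume a YunDaMa-style captcha client
# exposed as self.yundama. Its real implementation is not part of this file; the class
# below is a hypothetical sketch of only the interface the parsers use: get_captcha()
# returns a (cid, code) pair for a submitted image, and report() flags a captcha id
# whose answer turned out to be wrong. Names and signature details are assumptions.
class YunDaMaSketch:

    def get_captcha(self, image_bytes: bytes, file_name: str, file_type: str, codetype: str = "1004"):
        """Hypothetical: submit the captcha image, return (cid, code); code is None on failure."""
        raise NotImplementedError("stand-in only; wire up the real captcha service here")

    def report(self, cid):
        """Hypothetical: report captcha cid as solved incorrectly."""
        raise NotImplementedError("stand-in only")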