def parse_arts_search(self, url, keys, response):
    """ parser, keys: ("arts_search", arts_key) """
    _, querys = spider.get_url_params(url)
    self.current_page = int(querys["page"][0]) if "page" in querys else self.current_page
    logging.debug("WeiXinPublic parse_arts_search: update current page, current_page=%d" % self.current_page)

    soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser")
    if not self.check_anti_by_captcha(soup):
        self.reset_this_class()
        return

    # current page
    for art_soup in soup.find_all("div", class_="txt-box"):
        art_url = spider.get_url_legal(art_soup.find("a").get("href"), base_url=url)
        user_openid = art_soup.find("a", id="weixin_account").get("i")
        user_name = art_soup.find("a", id="weixin_account").get("title")
        self.fetch_queue.put(item=(art_url, ("get_art", keys[1], user_openid, user_name), 0))

    # next page
    next_page = soup.find("a", id="sogou_next")
    if next_page:
        next_page_url = spider.get_url_legal(next_page.get("href"), base_url=url)
        self.fetch_queue.put(item=(next_page_url, keys, 0))
    return
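# A minimal standalone sketch of the query parsing that spider.get_url_params is assumed
# to wrap above: urllib.parse.parse_qs returns each value as a list, which is why the
# parser reads querys["page"][0] before converting to int. _demo_get_url_params is a
# hypothetical stand-in, not the framework helper itself.
import urllib.parse

def _demo_get_url_params(url):
    """ split a url into its main part and a {param: [values]} dict """
    main, _, query = url.partition("?")
    return main, urllib.parse.parse_qs(query)

_, querys = _demo_get_url_params("http://weixin.sogou.com/weixin?type=2&query=python&page=3")
current_page = int(querys["page"][0]) if "page" in querys else 1
assert current_page == 3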
def get_json_data(self, su_value):
    """ get the value of "servertime", "nonce", "pubkey", "rsakv" and "showpin", etc """
    post_data = urllib.parse.urlencode({
        "entry": "weibo",
        "callback": "sinaSSOController.preloginCallBack",
        "rsakt": "mod",
        "checkpin": "1",
        "client": "ssologin.js(v1.4.18)",
        "su": su_value,
        "_": int(time.time() * 1000),
    })

    try:
        response = self.opener.open("http://login.sina.com.cn/sso/prelogin.php?" + post_data)
        data = spider.get_html_content(response, charset="utf-8")
        # the response is JSONP (callback({...})), so strip the wrapper before json.loads
        json_data = json.loads(re.search(r"\((?P<data>.*)\)", data).group("data"))
    except Exception as excep:
        json_data = {}
        logging.error("WeiBoLogin get_json_data error: %s", excep)

    logging.debug("WeiBoLogin get_json_data: %s", json_data)
    return json_data
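# A minimal sketch of the JSONP unwrapping done in get_json_data, run on a hand-written
# sample payload (the field values below are made up, not real server output):
import json
import re

sample = 'sinaSSOController.preloginCallBack({"retcode":0,"servertime":1466666666,' \
         '"nonce":"ABCDEF","rsakv":"1330428213","showpin":0})'
json_data = json.loads(re.search(r"\((?P<data>.*)\)", sample).group("data"))
assert json_data["nonce"] == "ABCDEF"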
def check_anti_by_captcha(self, html):
    """ check anti-spider by captcha """
    soup = bs4.BeautifulSoup(html, "html.parser")

    # keep fetching the captcha image until yundama returns a recognized code
    cid, code = None, None
    while not code:
        captcha_url = soup.find("img", attrs={"node-type": "yzm_img"}).get("src")
        response = self.opener.open(spider.get_url_legal(captcha_url, self.search_url))
        cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1004")

    # submit the recognized code to weibo's pincode verification endpoint
    verified_url = "http://s.weibo.com/ajax/pincode/verified?__rnd=%d" % int(time.time() * 1000)
    post_data = spider.make_post_data({"secode": code, "type": "sass", "pageid": "weibo", "_t": 0})
    temp = json.loads(spider.get_html_content(self.opener.open(verified_url, data=post_data)))

    if temp["code"] == "100000":
        logging.warning("WeiBoSearch anti-spider succeed")
    else:
        logging.warning("WeiBoSearch anti-spider failed")
        self.yundama.report(cid) if cid else 0
    return
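# A minimal sketch of how the verification response above is interpreted; the two
# payloads are hand-written stand-ins, and "100000" is the code the crawler treats as success:
import json

for raw in ('{"code": "100000", "msg": ""}', '{"code": "100001", "msg": "pin error"}'):
    temp = json.loads(raw)
    print("verified, continue crawling" if temp["code"] == "100000" else "report cid and retry")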
def url_fetch(self, url, keys, critical, fetch_repeat):
    """ only url_fetch needs to be overridden here; see the framework for parameter meanings and return values """
    headers = spider.make_headers(user_agent="all", accept_encoding="gzip")
    response = self.opener.open(urllib.request.Request(url, headers=headers), timeout=10)
    content = (spider.get_html_content(response, charset="utf-8"), )
    return 1, content
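# url_fetch advertises gzip in Accept-Encoding, so the raw body may come back compressed.
# A minimal offline sketch of the decoding that spider.get_html_content is assumed to
# perform (this helper is a hypothetical stand-in, exercised on locally compressed bytes):
import gzip

def _demo_decode(data, content_encoding, charset="utf-8"):
    """ gunzip the body if the server compressed it, then decode to text """
    if content_encoding == "gzip":
        data = gzip.decompress(data)
    return data.decode(charset, errors="ignore")

raw = gzip.compress("<html>你好</html>".encode("utf-8"))
assert _demo_decode(raw, "gzip") == "<html>你好</html>"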
def fetch_user_from_id(self, user_id):
    """
    fetch user data from user_id
    :return user_name, [user_page_follows, user_page_fans, user_page_weibos], [follows, fans, weibos]
    """
    self.re_login() if not self.user_uniqueid else 0

    user_url_base = "http://weibo.com/%s/profile" % user_id.strip()
    user_name, user_pages, user_counts = None, [], []

    repeat_time = 0
    while repeat_time <= self.max_repeat:
        logging.debug("WeiBoUser repeat: repeat_time=%d" % repeat_time) if repeat_time > 0 else 0
        html_all = spider.get_html_content(self.opener.open(user_url_base, timeout=5))

        # the page header variables carry uid/nick/onick; retry until the page is fully rendered
        header_dict = {key: value for key, value in self.header_re.findall(html_all)}
        if ("uid" not in header_dict) or ("nick" not in header_dict):
            repeat_time += 1
            continue
        if ("onick" not in header_dict) or (header_dict["onick"] == header_dict["nick"]):
            repeat_time += 1
            continue

        # the tri-column block (follows / fans / weibos) is embedded as JSON in a <script> tag
        for sc_string in self.html_re.findall(html_all):
            json_data = json.loads(sc_string)
            if json_data["domid"] == "Pl_Core_T8CustomTriColumn__3" and "html" in json_data:
                soup = bs4.BeautifulSoup(json_data["html"], "html.parser")
                a_soup_list = soup.find_all("a", class_="S_txt1")
                user_pages = [a_soup.get("href") for a_soup in a_soup_list]
                user_counts = [int(spider.get_string_num(a_soup.get_text())) for a_soup in a_soup_list]
                user_name = header_dict["onick"]
                break

        if user_name:
            break
        repeat_time += 1

    # return result
    logging.warning("WeiBoUser fetch_user_from_id: user_id=%s, user_name=%s" % (user_id, user_name))
    return user_name, user_pages, user_counts
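# fetch_user_from_id relies on spider.get_string_num to pull the integer out of the
# tri-column anchor text ("关注 123", "粉丝 4,567", ...). A minimal hypothetical stand-in:
import re

def _demo_get_string_num(text):
    """ keep only the digits of the first number in text, tolerating thousands separators """
    match = re.search(r"\d[\d,]*", text)
    return match.group().replace(",", "") if match else "0"

assert int(_demo_get_string_num("粉丝 4,567")) == 4567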
def fetch_search_weibo(self, fetch_keys, fetch_timescope, fetch_type="typeall=1", out_file=None):
    """ fetch search weibo """
    assert fetch_type in ["typeall=1", "xsort=hot", "scope=ori"]
    self.re_login() if not self.user_uniqueid else 0

    # base class variables
    self.fetch_queue.queue.clear()
    self.saved_set.clear()
    self.current_page = 1
    self.out_file = out_file
    self.out_list = []
    self.out_length = 0

    # this class variables
    self.fetch_keys = fetch_keys
    self.fetch_timescope = fetch_timescope
    self.fetch_type = fetch_type

    # update fetch queue
    self.update_fetch_queue()
    while self.fetch_queue.qsize() > 0:
        url, keys, repeat = self.fetch_queue.get()
        logging.debug("WeiBoSearch: keys=%s, repeat=%s, url=%s", keys, repeat, url)
        try:
            html_all = spider.get_html_content(self.opener.open(url))
            for sc in re.findall(r"<script>[\w\W]+?STK\.pageletM\.view\(([\w\W]+?)\)</script>", html_all):
                json_data = json.loads(sc)
                # anti-spider page: solve the captcha, then rebuild the queue
                if json_data.get("pid") == "pl_common_sassfilter":
                    self.check_anti_by_captcha(json_data["html"])
                    self.update_fetch_queue()
                    break
                # normal search-result page
                if json_data.get("pid") == "pl_weibo_direct":
                    self.parse_search_weibo_page(json_data["html"])
                    break
        except Exception as excep:
            if repeat < self.max_repeat:
                self.fetch_queue.put(item=(url, keys, repeat + 1))
            else:
                logging.error("WeiBoSearch error: %s, url=%s", excep, url)
    return
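# The search result page embeds each pagelet as a JSON argument to STK.pageletM.view
# inside a <script> tag. A minimal sketch of the extraction, run on a hand-written sample:
import json
import re

sample_html = '<script>STK && STK.pageletM && STK.pageletM.view(' \
              '{"pid":"pl_weibo_direct","js":[],"html":"<div>...</div>"})</script>'
for sc in re.findall(r"<script>[\w\W]+?STK\.pageletM\.view\(([\w\W]+?)\)</script>", sample_html):
    json_data = json.loads(sc)
    assert json_data.get("pid") == "pl_weibo_direct"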
def check_anti_by_captcha(self, soup):
    """
    check anti-spider by captcha
    :return 1, 0: 1(can continue), 0(can repeat)
    """
    if not soup.find("img", id="seccodeImage"):
        return 1

    while 1:
        # keep fetching the captcha image until yundama returns a recognized code
        cid, code = None, None
        while not code:
            captcha_url = soup.find("img", id="seccodeImage").get("src")
            response = self.opener.open(spider.get_url_legal(captcha_url, self.base_url_antispider))
            cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1006")

        post_data = urllib.parse.urlencode({
            "c": code,
            "r": soup.find("input", id="from").get("value"),
            "v": 5
        }).encode()
        response = self.opener.open("http://weixin.sogou.com/antispider/thank.php", data=post_data)
        json_data = json.loads(spider.get_html_content(response, charset="utf-8"))

        # "解封成功" means "unblock succeeded"
        if json_data["msg"].find("解封成功") >= 0:
            snuid = json_data["id"]
            self.cookie_jar.set_cookie(spider.make_cookie(name="SNUID", value=snuid, domain="weixin.sogou.com"))

            # send two tracking pings to pb.sogou.com so the unblock takes effect
            post_dict = {
                "uigs_productid": "webapp",
                "type": "antispider",
                "subtype": "",
                "domain": "weixin",
                "suv": "",
                "snuid": snuid,
                "t": int(time.time() * 1000)
            }
            for cookie in self.cookie_jar:
                if cookie.name == "SUV":
                    post_dict["suv"] = cookie.value

            post_dict["subtype"] = "0_seccodeInputSuccess"
            post_dict["t"] = int(time.time() * 1000)
            self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))

            post_dict["subtype"] = "close_refresh"
            post_dict["t"] = int(time.time() * 1000)
            self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))
            break
        else:
            # wrong code: report it to yundama and try again
            self.yundama.report(cid=cid) if cid else 0

    logging.warning("WeiXinPublic check_anti_by_captcha: anti-spider success!")
    return 0
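# A minimal sketch of what spider.make_cookie is assumed to do above: build a
# http.cookiejar.Cookie for a given name/value/domain so set_cookie can store it.
# _demo_make_cookie is a hypothetical stand-in, not the framework helper itself.
import http.cookiejar

def _demo_make_cookie(name, value, domain):
    return http.cookiejar.Cookie(
        version=0, name=name, value=value, port=None, port_specified=False,
        domain=domain, domain_specified=True, domain_initial_dot=False,
        path="/", path_specified=True, secure=False, expires=None,
        discard=False, comment=None, comment_url=None, rest={})

cookie_jar = http.cookiejar.CookieJar()
cookie_jar.set_cookie(_demo_make_cookie("SNUID", "0123456789ABCDEF", "weixin.sogou.com"))
assert any(cookie.name == "SNUID" for cookie in cookie_jar)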
def parse_get_art(self, url, keys, response):
    """ parser, keys: ("get_art", None or arts_key, user_id or user_openid, user_name) """
    soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser")
    _, querys = spider.get_url_params(url)

    s_title = spider.get_string_strip(soup.title.string)
    s_date = soup.find("em", id="post-date").get_text()

    self.file_out.write("\t".join([s_title, s_date, str(keys[1:])]) + "\n")
    self.saved_set.add(keys[2] + s_date + s_title)
    logging.debug("WeiXinPublic parse_get_art: len(saved_set)=%d" % len(self.saved_set))
    return
def parse_user_arts(self, url, keys, response):
    """ parser, keys: ("user_arts", user_id, user_name) """
    html = spider.get_html_content(response, charset="utf-8")
    json_data = spider.get_json_data(html, r"msgList = '(?P<item>\{[\w\W]+?\})'")
    if json_data:
        for item in json_data.get("list", []):
            # content_url starts with "/" and is html-escaped, so strip the slash and unescape "&amp;"
            item_url = spider.get_url_legal(item["app_msg_ext_info"]["content_url"][1:],
                                            self.base_url_weixinqq).replace("&amp;", "&")
            self.fetch_queue.put(item=(item_url, ("get_art", None, keys[1], keys[2]), 0))
            for subitem in item["app_msg_ext_info"]["multi_app_msg_item_list"]:
                subitem_url = spider.get_url_legal(subitem["content_url"][1:],
                                                   self.base_url_weixinqq).replace("&amp;", "&")
                self.fetch_queue.put(item=(subitem_url, ("get_art", None, keys[1], keys[2]), 0))
    logging.debug("WeiXinPublic parse_user_arts: len(fetch_queue)=%d" % self.fetch_queue.qsize())
    return
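# The content_url values come out of the msgList blob html-escaped ("&amp;" instead of
# "&"); html.unescape is a more general alternative to the literal replace used above.
# The url below is a made-up example:
import html

escaped = "/s?timestamp=1466666666&amp;src=3&amp;ver=1&amp;signature=abc"
assert html.unescape(escaped) == "/s?timestamp=1466666666&src=3&ver=1&signature=abc"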
def parse_user_search(self, url, keys, response):
    """ parser, keys: ("user_search", user_id) """
    soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser")
    if not self.check_anti_by_captcha(soup):
        self.reset_this_class()
        return

    user_name = ""
    for user_item in soup.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item"):
        if user_item.find("label", attrs={"name": "em_weixinhao"}).get_text() == self.user_id:
            user_name = user_item.find("div", class_="txt-box").find("h3").get_text()
            self.fetch_queue.put(item=(user_item.get("href"), ("user_arts", self.user_id, user_name), 0))
    logging.debug("WeiXinPublic parse_user_search: user_name=%s" % user_name)
    return
def login(self, user_name, pass_word, proxies=None):
    """ login weibo.com, return True or False """
    # initialize instance variables
    self.user_name = user_name
    self.pass_word = pass_word
    self.user_uniqueid = None
    self.user_nick = None

    # build cookie_jar and opener; no proxy is used here, and cookies are handled transparently from now on
    self.cookie_jar, self.opener = spider.make_cookiejar_opener(is_cookie=True, proxies=proxies)
    self.opener.addheaders = spider.make_headers(
        user_agent="pc",
        host="weibo.com",
        referer="http://weibo.com/",
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_encoding="gzip, deflate",
        accept_language="zh-CN,zh;q=0.8").items()

    # (1) open weibo.com/login.php to request the necessary cookies
    self.opener.open("http://weibo.com/login.php")

    # (2) get the encoded user name
    s_user_name = self.get_username()

    # (3) use the encoded user name to fetch the prelogin data (json format)
    json_data = self.get_json_data(su_value=s_user_name)
    if not json_data:
        return False

    # (4) use the json data from step (3) to encrypt the password
    s_pass_word = self.get_password(json_data["servertime"], json_data["nonce"], json_data["pubkey"])

    # (5) build the postdata used for login
    post_dict = {
        "entry": "weibo",
        "gateway": "1",
        "from": "",
        "savestate": "7",
        "userticket": "1",
        "vsnf": "1",
        "service": "miniblog",
        "encoding": "UTF-8",
        "pwencode": "rsa2",
        "sr": "1280*800",
        "prelt": "529",
        "url": "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
        "rsakv": json_data["rsakv"],
        "servertime": json_data["servertime"],
        "nonce": json_data["nonce"],
        "su": s_user_name,
        "sp": s_pass_word,
        "returntype": "TEXT",
    }

    # (6) check whether a captcha is required; if so, fetch it and get it recognized
    if json_data.get("showpin", None) == 1:
        url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), json_data["pcid"])
        with open("captcha.jpeg", "wb") as file_out:
            file_out.write(self.opener.open(url).read())
        code = input("please input the captcha: ")
        # cid, code = self.yundama.get_captcha(self.opener.open(url).read(), "captcha.jpeg", "image/jpeg", codetype="1005")
        # if not code:
        #     return False
        post_dict["pcid"] = json_data["pcid"]
        post_dict["door"] = code

    # (7) log in to weibo with the constructed postdata
    login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(time.time())
    json_data_1 = json.loads(spider.get_html_content(
        self.opener.open(login_url_1, data=spider.make_post_data(post_dict))))
    if json_data_1["retcode"] == "0":
        # there is a redirect after login; build the postdata for the redirect url
        post_dict = {
            "callback": "sinaSSOController.callbackLoginStatus",
            "ticket": json_data_1["ticket"],
            "ssosavestate": int(time.time()),
            "client": "ssologin.js(v1.4.18)",
            "_": int(time.time() * 1000),
        }
        login_url_2 = "https://passport.weibo.com/wbsso/login?" + urllib.parse.urlencode(post_dict)
        html_data = spider.get_html_content(self.opener.open(login_url_2), charset="gbk")
        json_data_2 = json.loads(re.search(r"\((?P<result>.*)\)", html_data).group("result"))

        # check whether the login succeeded, and get the user's unique id and nick name
        if json_data_2["result"] is True:
            self.user_uniqueid = json_data_2["userinfo"]["uniqueid"]
            self.user_nick = json_data_2["userinfo"]["displayname"]
            logging.warning("WeiBoLogin succeed: %s", json_data_2)
        else:
            logging.warning("WeiBoLogin failed: %s", json_data_2)
    else:
        logging.warning("WeiBoLogin failed: %s", json_data_1)
    return True if self.user_uniqueid and self.user_nick else False
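# get_password is called in step (4) but not shown here. For "pwencode": "rsa2", sina's
# widely documented scheme is: rsa-encrypt "servertime\tnonce\npassword" with the hex
# modulus from prelogin and e=0x10001, then hex-encode the ciphertext. The sketch below
# uses the third-party `rsa` package (pip install rsa) and is an assumption about
# get_password's internals, not the class's actual code:
import binascii
import rsa

def _demo_get_password(pass_word, servertime, nonce, pubkey):
    """ encrypt the password the way weibo's rsa2 login expects """
    key = rsa.PublicKey(int(pubkey, 16), 65537)         # pubkey is a hex modulus string
    message = "%s\t%s\n%s" % (servertime, nonce, pass_word)
    return binascii.b2a_hex(rsa.encrypt(message.encode("utf-8"), key)).decode()

# usage with a throwaway keypair (a real pubkey would come from get_json_data):
_pub, _ = rsa.newkeys(512)
print(_demo_get_password("secret", 1466666666, "ABCDEF", "%x" % _pub.n))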
def fetch_user_weibos(self, user_url, key_dict, file_out=sys.stdout, sleep_time=0):
    """
    fetch user weibo, user_url like: http://weibo.com/p/1005051750270991/home?parameters
    :param key_dict: {"mod": "data", "is_all": 1}
    :param key_dict: {"stat_date": "201512", "is_all": 1}
    :param key_dict: {
        "is_ori": 1, "is_forward": 1, "is_text": 1, "is_pic": 1,
        "is_video": 1, "is_music": 1, "is_article": 1, "key_word": "a b",
        "start_time": "2016-06-01", "end_time": "2016-06-04",
        "is_search": 1, "is_searchadv": 1
    }
    """
    self.re_login() if not self.user_uniqueid else 0

    self.fetch_queue.queue.clear()
    self.current_page = 1
    self.file_out = file_out

    # get the start url
    url_main, _ = spider.get_url_params(user_url, is_unique_values=True)
    self.fetch_queue.put((url_main + "?" + urllib.parse.urlencode(key_dict), "page_index", 0))

    # get data from url
    while self.fetch_queue.qsize() > 0:
        time.sleep(random.randint(0, sleep_time)) if sleep_time > 0 else 0
        url, keys, repeat = self.fetch_queue.get()
        try:
            html_all = spider.get_html_content(self.opener.open(url, timeout=5))
            main, querys = spider.get_url_params(url, is_unique_values=True)

            if keys == "page_index":
                logging.warning("WeiBoUser index: repeat=%d, page=%d, url=%s" % (repeat, self.current_page, url))
                header_dict = {key: value for key, value in self.header_re.findall(html_all)}

                for sc_string in self.html_re.findall(html_all):
                    json_data = json.loads(sc_string)
                    if json_data.get("ns") == "pl.content.homeFeed.index" and \
                            json_data["domid"].startswith("Pl_Official_MyProfileFeed"):
                        # get index data
                        weibo_count, is_loading, next_page = self.parse_user_weibo_page(json_data["html"])
                        if is_loading:
                            # pagebar 0 and 1 are lazy-loaded via an ajax endpoint
                            post_dict = {
                                "id": querys.get("id", header_dict["page_id"]),
                                "domain": querys.get("domain", header_dict["domain"]),
                                "domain_op": querys.get("domain_op", header_dict["domain"]),
                                "pre_page": querys.get("page", 1),
                                "page": querys.get("page", 1),
                                "pagebar": 0,
                                "feed_type": 0,
                                "ajwvr": 6,
                                "__rnd": int(time.time() * 1000)
                            }
                            post_dict.update(key_dict)
                            self.fetch_queue.put((self.bar_url + urllib.parse.urlencode(post_dict), "page_bar", 0))
                        break
            elif keys == "page_bar":
                logging.warning("WeiBoUser bar=%s: page=%d url=%s" % (querys["pagebar"], self.current_page, url))
                # get bar data
                weibo_count, is_loading, next_page = self.parse_user_weibo_page(json.loads(html_all)["data"])
                if is_loading:
                    querys["pagebar"] = 1
                    self.fetch_queue.put((self.bar_url + urllib.parse.urlencode(querys), "page_bar", 0))

            if next_page:
                self.current_page += 1
                _temp = next_page.get("href")
                self.fetch_queue.put((url_main + _temp[_temp.find("?"):], "page_index", 0))
        except Exception as e:
            if repeat < self.max_repeat:
                self.fetch_queue.put((url, keys, repeat + 1))
            else:
                logging.error("WeiBoUser error: error=%s, url=%s" % (str(e), url))
    return
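# A minimal standalone sketch of how the lazy-load ("pagebar") ajax url above is
# assembled. The endpoint string stands in for self.bar_url and is an assumption,
# as are the id/domain values; only the urlencode mechanics are the point here:
import time
import urllib.parse

bar_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?"    # assumed value of self.bar_url
post_dict = {"id": "1005051750270991", "domain": "100505", "pre_page": 1,
             "page": 1, "pagebar": 0, "feed_type": 0, "ajwvr": 6,
             "__rnd": int(time.time() * 1000)}
print(bar_url + urllib.parse.urlencode(post_dict))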