def __init__(self, max_repeat=5):
    """
    Set up the URLs, crawl state, HTTP opener and captcha client.

    :param max_repeat: maximum repeat count, stored on ``self.max_repeat``
    """
    # --- endpoints used by the spider ---
    self.base_url = "http://weixin.sogou.com/"
    self.base_url_gzhjs = "http://weixin.sogou.com/gzhjs?"
    self.base_url_weixin = "http://weixin.sogou.com/weixin?"
    self.base_url_antispider = "http://weixin.sogou.com/antispider/"
    self.base_url_weixinqq = "http://mp.weixin.qq.com/"

    # --- crawl bookkeeping ---
    self.fetch_queue = Queue()      # urls not fetched yet: (url, keys, repeat)
    self.saved_set = set()          # urls (or other ids) already saved
    self.current_page = 1           # page currently being fetched
    self.max_repeat = max_repeat    # maximum repeat count

    # --- what to fetch ---
    self.arts_key = None            # key words for fetching articles
    self.user_id = None             # user id (not the open_id); None if fetch_type is 2
    self.search_keys = None         # search keys: (key, others)
    self.fetch_type = 1             # 1: public user, 2: public article
    self.fetch_tsn = 0              # 0: all, 1: one day, 2: one week, 3: one month

    # --- HTTP opener with a cookie jar and default headers ---
    jar, opener = spider.make_cookiejar_opener()
    self.cookie_jar = jar
    self.opener = opener
    default_headers = spider.make_headers(
        user_agent="pc",
        host="weixin.sogou.com",
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_encoding="gzip, deflate",
        accept_language="zh-CN",
    )
    self.opener.addheaders = default_headers.items()

    # --- captcha recognition client, and the output file handle (unset for now) ---
    self.yundama = spider.YunDaMa("qixianhu", "mimaMIMA123456")
    self.file_out = None
def __init__(self, user_name, pass_word, appid=None, appkey=None, boundary=None):
    """
    Store the account credentials and the request settings for the API.

    :param user_name: account name, stored on ``self.user_name``
    :param pass_word: account password, stored on ``self.pass_word``
    :param appid: application id; a falsy value selects the built-in default
    :param appkey: application key; a falsy value selects the built-in default
    :param boundary: multipart boundary string; a falsy value selects the built-in default
    """
    # account credentials
    self.user_name = user_name
    self.pass_word = pass_word

    # application settings: any falsy argument falls back to the default value
    self.appid = appid or "1"
    self.appkey = appkey or "22cc5376925e9387a23cf797cb9ba745"
    self.boundary = boundary or "----WebKitFormBoundaryIHXcDqOlNKqucLJ7"

    # API endpoint and the headers sent along with requests to it
    self.base_url = "http://api.yundama.com/api.php"
    self.base_headers = spider.make_headers(
        user_agent="pc",
        host="api.yundama.com",
        referer="http://www.yundama.com/download/YDMHttp.html",
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_language="zh-CN,zh;q=0.8",
        origin="http://www.yundama.com",
    )
def url_fetch(self, url, keys, critical, fetch_repeat):
    """
    Only url_fetch needs to be overridden here; see the framework for the
    meaning of the parameters and of the returned value.

    Returns the tuple ``(1, (text,))`` where ``text`` is the response body
    decoded as UTF-8.
    """
    request_headers = spider.make_headers(user_agent="all", accept_encoding="gzip")
    request = urllib.request.Request(url, headers=request_headers)
    response = self.opener.urlopen(request, timeout=10)

    # unzip the response body, then decode it to text
    text = spider.get_resp_unzip(response).decode("utf-8")
    return 1, (text, )
def login(self, user_name, pass_word, proxies=None):
    """
    Log in to weibo.com and report whether the login succeeded.

    On success ``self.user_uniqueid`` and ``self.user_nick`` are filled in from
    the server response; on any failure they stay ``None``.

    :param user_name: account name, stored on ``self.user_name``
    :param pass_word: account password, stored on ``self.pass_word``
    :param proxies: passed through to ``spider.make_cookiejar_opener``
    :return: True if both the unique id and the nickname were obtained, else False
    """
    # initialise the per-login state
    self.user_name = user_name
    self.pass_word = pass_word
    self.user_uniqueid = None
    self.user_nick = None

    # build the cookie jar and opener so cookies are carried through the whole flow
    self.cookie_jar, self.opener = spider.make_cookiejar_opener(is_cookie=True, proxies=proxies)
    self.opener.addheaders = spider.make_headers(
        user_agent="pc",
        host="weibo.com",
        referer="http://weibo.com/",
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        accept_encoding="gzip, deflate",
        accept_language="zh-CN,zh;q=0.8",
    ).items()

    # (1) open weibo.com/login.php first, to pick up the cookies needed later
    self.opener.open("http://weibo.com/login.php")

    # (2) get the encrypted user name
    s_user_name = self.get_username()

    # (3) use the encrypted user name to fetch the pre-login data (json)
    json_data = self.get_json_data(su_value=s_user_name)
    if not json_data:
        return False

    # (4) get the encrypted password from the data obtained in step 3
    s_pass_word = self.get_password(json_data["servertime"], json_data["nonce"], json_data["pubkey"])

    # (5) build the post data used for the login request
    post_dict = {
        "entry": "weibo",
        "gateway": "1",
        "from": "",
        "savestate": "7",
        "userticket": "1",
        "vsnf": "1",
        "service": "miniblog",
        "encoding": "UTF-8",
        "pwencode": "rsa2",
        "sr": "1280*800",
        "prelt": "529",
        "url": "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
        "rsakv": json_data["rsakv"],
        "servertime": json_data["servertime"],
        "nonce": json_data["nonce"],
        "su": s_user_name,
        "sp": s_pass_word,
        "returntype": "TEXT",
    }

    # (6) if a captcha is required, download it and ask the user to type it in
    if json_data.get("showpin", None) == 1:
        url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(time.time()), json_data["pcid"])
        with open("captcha.jpeg", "wb") as file_out:
            file_out.write(self.opener.open(url).read())
        # NOTE: the captcha is solved by hand; an automatic solver could be plugged in here
        code = input("请输入验证码:")
        post_dict["pcid"] = json_data["pcid"]
        post_dict["door"] = code

    # (7) log in with the post data built above
    login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(time.time())
    response_1 = self.opener.open(login_url_1, data=spider.make_post_data(post_dict))
    json_data_1 = json.loads(spider.get_html_content(response_1))

    # .get() so that a payload without "retcode" takes the failure branch instead of raising
    if json_data_1.get("retcode") != "0":
        logging.warning("WeiBoLogin failed: %s", json_data_1)
        return False

    # the login is followed by a redirect: build the query string for it
    post_dict = {
        "callback": "sinaSSOController.callbackLoginStatus",
        "ticket": json_data_1["ticket"],
        "ssosavestate": int(time.time()),
        "client": "ssologin.js(v1.4.18)",
        "_": int(time.time() * 1000),
    }
    login_url_2 = "https://passport.weibo.com/wbsso/login?" + urllib.parse.urlencode(post_dict)
    html_data = spider.get_html_content(self.opener.open(login_url_2), charset="gbk")

    # the json payload is wrapped as callback(...); raw string keeps the regex escapes valid
    matched = re.search(r"\((?P<result>.*)\)", html_data)
    if not matched:
        # no callback payload in the response: report failure instead of raising AttributeError
        logging.warning("WeiBoLogin failed: %s", html_data)
        return False
    json_data_2 = json.loads(matched.group("result"))

    # check whether the login succeeded, and record the unique id and nickname
    if json_data_2.get("result") is True:
        self.user_uniqueid = json_data_2["userinfo"]["uniqueid"]
        self.user_nick = json_data_2["userinfo"]["displayname"]
        logging.warning("WeiBoLogin succeed: %s", json_data_2)
    else:
        logging.warning("WeiBoLogin failed: %s", json_data_2)

    return bool(self.user_uniqueid and self.user_nick)