def get_response_content(self, url, headers=None, data=None):
    """Fetch ``url`` using the cookies saved by the simulated login.

    The cookie file ``self.cookiefile`` is reloaded into ``self.cj``; a new
    opener (using ``self.proxy`` for http when it is not empty) is built,
    stored on ``self.opener`` and installed globally before the request is
    sent with a 10 second timeout.

    :param url: address to request.
    :param headers: optional dict of request headers passed to
        ``self.pack_request``; a fresh empty dict is used when omitted.
    :param data: optional request payload passed to ``self.pack_request``.
    :returns: the response body (passed through ``self.gzip_data`` when the
        response declares ``Content-Encoding: gzip``), ``''`` when the cookie
        file does not exist, or the integer status code when
        ``urllib2.HTTPError`` is raised.
    """
    if headers is None:
        # A literal {} default would be shared between calls.
        headers = {}
    content = ''
    try:
        # Key step: load the cookies obtained by the simulated login.
        if not os.path.exists(self.cookiefile):
            return ''
        self.cj.revert(self.cookiefile, True, True)
        self.cookie_support = urllib2.HTTPCookieProcessor(self.cj)
        if self.proxy == '':
            self.opener = urllib2.build_opener(self.cookie_support,
                                               urllib2.HTTPHandler)
        else:
            self.opener = urllib2.build_opener(
                self.cookie_support,
                urllib2.ProxyHandler({'http': self.proxy}))
        urllib2.install_opener(self.opener)

        req = self.pack_request(url=url, headers=headers, data=data)
        response = self.opener.open(req, timeout=10)
        try:
            gzipped = response.info().get('Content-Encoding') == 'gzip'
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        content = self.gzip_data(body) if gzipped else body
    except urllib2.HTTPError as e:
        logError(e)
        return e.code
    # Callers test and parse the return value, so the body must be returned.
    return content
def get_servertime(self):
    """Step one of the simulated login: fetch the prelogin parameters.

    Requests the Sina prelogin endpoint and parses the JSON found between
    the parentheses of the response. Parsing is attempted up to three times
    (one request per attempt). The ``pcid`` field is stored on ``self.pcid``.

    :returns: dict with the string values of ``servertime``, ``nonce``,
        ``rsakv`` and ``pubkey`` (used later to encrypt the credentials);
        keys are missing if no attempt could be parsed.
    """
    url = 'http://login.sina.com.cn/sso/prelogin.php?entry=account&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.2)&_=%s' % self.__get_millitime()
    result = {}
    headers = self.__get_headers()
    headers['Host'] = 'login.sina.com.cn'
    headers['Accept'] = '*/*'
    headers['Referer'] = 'http://weibo.com/'
    # NOTE(review): presumably removed so the body arrives uncompressed,
    # since it is parsed below without any gzip handling -- confirm.
    del headers['Accept-encoding']

    # Compiled once; captures everything between the outermost parentheses.
    jsonp_payload = re.compile(r'\((.*)\)')
    for _attempt in range(3):
        req = self.pack_request(url, headers)
        raw = urllib2.urlopen(req).read()
        try:
            payload = json.loads(jsonp_payload.search(raw).group(1))
            for key in ('servertime', 'nonce', 'rsakv', 'pubkey'):
                result[key] = str(payload[key])
            self.pcid = str(payload['pcid'])
            break
        except Exception as e:
            logError(e)
            logError('get severtime error!')
            continue
    # The caller reads the parameters from the returned dict.
    return result
def get_response_content(self, url, headers=None, data=None):
    """Fetch ``url`` using the cookies saved by the simulated login.

    The cookie file ``self.cookiefile`` is reloaded into ``self.cj``; a new
    opener (using ``self.proxy`` for http when it is not empty) is built,
    stored on ``self.opener`` and installed globally before the request is
    sent with a 10 second timeout.

    :param url: address to request.
    :param headers: optional dict of request headers passed to
        ``self.pack_request``; a fresh empty dict is used when omitted.
    :param data: optional request payload passed to ``self.pack_request``.
    :returns: the response body (passed through ``self.gzip_data`` when the
        response declares ``Content-Encoding: gzip``), ``''`` when the cookie
        file does not exist, or the integer status code when
        ``urllib2.HTTPError`` is raised.
    """
    if headers is None:
        # A literal {} default would be shared between calls.
        headers = {}
    content = ''
    try:
        # Key step: load the cookies obtained by the simulated login.
        if not os.path.exists(self.cookiefile):
            return ''
        self.cj.revert(self.cookiefile, True, True)
        self.cookie_support = urllib2.HTTPCookieProcessor(self.cj)
        if self.proxy == '':
            self.opener = urllib2.build_opener(self.cookie_support,
                                               urllib2.HTTPHandler)
        else:
            self.opener = urllib2.build_opener(
                self.cookie_support,
                urllib2.ProxyHandler({'http': self.proxy}))
        urllib2.install_opener(self.opener)

        req = self.pack_request(url=url, headers=headers, data=data)
        response = self.opener.open(req, timeout=10)
        try:
            gzipped = response.info().get('Content-Encoding') == 'gzip'
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        content = self.gzip_data(body) if gzipped else body
    except urllib2.HTTPError as e:
        logError(e)
        return e.code
    # Callers test and parse the return value, so the body must be returned.
    return content
def redo_login(self, login_url):
    """Step three of the login: request the redirect URL and save cookies.

    :param login_url: address extracted from the step-two login response.
    :returns: True when the request succeeded and ``self.cj`` was saved to
        ``self.cookiefile``; False when anything raised (the error is
        logged).
    """
    try:
        headers = self.__get_headers()
        headers['Referer'] = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.2)'
        req = self.pack_request(login_url, headers)
        urllib2.urlopen(req)
        # Persist the cookies so later requests can reuse them.
        self.cj.save(self.cookiefile, True, True)
        logInfo('login success')
        login_ok = True
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('redo_login %s happened on line %d' % (s[1], s[2].tb_lineno))
        login_ok = False
    # login() returns this value directly as the overall login result.
    return login_ok
def get_servertime(self):
    """Step one of the simulated login: fetch the prelogin parameters.

    Requests the Sina prelogin endpoint and parses the JSON found between
    the parentheses of the response. Parsing is attempted up to three times
    (one request per attempt). The ``pcid`` field is stored on ``self.pcid``.

    :returns: dict with the string values of ``servertime``, ``nonce``,
        ``rsakv`` and ``pubkey`` (used later to encrypt the credentials);
        keys are missing if no attempt could be parsed.
    """
    url = 'http://login.sina.com.cn/sso/prelogin.php?entry=account&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.2)&_=%s' % self.__get_millitime()
    result = {}
    headers = self.__get_headers()
    headers['Host'] = 'login.sina.com.cn'
    headers['Accept'] = '*/*'
    headers['Referer'] = 'http://weibo.com/'
    # NOTE(review): presumably removed so the body arrives uncompressed,
    # since it is parsed below without any gzip handling -- confirm.
    del headers['Accept-encoding']

    # Compiled once; captures everything between the outermost parentheses.
    jsonp_payload = re.compile(r'\((.*)\)')
    for _attempt in range(3):
        req = self.pack_request(url, headers)
        raw = urllib2.urlopen(req).read()
        try:
            payload = json.loads(jsonp_payload.search(raw).group(1))
            for key in ('servertime', 'nonce', 'rsakv', 'pubkey'):
                result[key] = str(payload[key])
            self.pcid = str(payload['pcid'])
            break
        except Exception as e:
            logError(e)
            logError('get severtime error!')
            continue
    # The caller reads the parameters from the returned dict.
    return result
def redo_login(self, login_url):
    """Step three of the login: request the redirect URL and save cookies.

    :param login_url: address extracted from the step-two login response.
    :returns: True when the request succeeded and ``self.cj`` was saved to
        ``self.cookiefile``; False when anything raised (the error is
        logged).
    """
    try:
        headers = self.__get_headers()
        headers['Referer'] = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.2)'
        req = self.pack_request(login_url, headers)
        urllib2.urlopen(req)
        # Persist the cookies so later requests can reuse them.
        self.cj.save(self.cookiefile, True, True)
        logInfo('login success')
        login_ok = True
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('redo_login %s happened on line %d' % (s[1], s[2].tb_lineno))
        login_ok = False
    # login() returns this value directly as the overall login result.
    return login_ok
def valid_cookie(self, html=''):
    """Check the locally stored cookie file against a page.

    :param html: page content to inspect; when empty, a test page
        (http://weibo.com/kaifulee) is fetched with the stored cookie.
    :returns: False when no page could be fetched or when following the SSO
        redirect fails; the cookie file is removed in both cases.
        NOTE(review): every other path falls off the end and returns None
        in the code visible here -- confirm what callers expect.
    """
    html = str(html)
    if not html:
        headers = self.__get_headers()
        # Probe Kai-Fu Lee's profile page to see whether the cookie expired.
        html = self.get_response_content(url='http://weibo.com/kaifulee',
                                         headers=headers)
        if not html:
            logError('need relogin.')
            self.clear_cookiedat(self.cookiefile)
            return False
    html = str(html)
    html = html.replace('"', "'")
    if 'sinaSSOController' in html:
        # The page is an SSO redirect; follow it and save the cookies.
        redirect = re.compile(r"location\.replace\('(.*?)'\)")
        try:
            login_url = redirect.search(html).group(1)
            headers = self.__get_headers()
            headers['Host'] = 'account.weibo.com'
            req = self.pack_request(url=login_url, headers=headers)
            result = urllib2.urlopen(req)
            self.cj.save(self.cookiefile, True, True)
            if result.info().get('Content-Encoding') == 'gzip':
                # Same helper name the other methods use; the original
                # spelling ``gzipData`` does not match them.
                html = self.gzip_data(result.read())
            else:
                html = result.read()
        except Exception as e:
            logError(e)
            logError('relogin failure.')
            self.clear_cookiedat(self.cookiefile)
            return False
def valid_cookie(self, html=''):
    """Check the locally stored cookie file against a page.

    :param html: page content to inspect; when empty, a test page
        (http://weibo.com/kaifulee) is fetched with the stored cookie.
    :returns: False when no page could be fetched or when following the SSO
        redirect fails; the cookie file is removed in both cases.
        NOTE(review): every other path falls off the end and returns None
        in the code visible here -- confirm what callers expect.
    """
    html = str(html)
    if not html:
        headers = self.__get_headers()
        # Probe Kai-Fu Lee's profile page to see whether the cookie expired.
        html = self.get_response_content(url='http://weibo.com/kaifulee',
                                         headers=headers)
        if not html:
            logError('need relogin.')
            self.clear_cookiedat(self.cookiefile)
            return False
    html = str(html)
    html = html.replace('"', "'")
    if 'sinaSSOController' in html:
        # The page is an SSO redirect; follow it and save the cookies.
        redirect = re.compile(r"location\.replace\('(.*?)'\)")
        try:
            login_url = redirect.search(html).group(1)
            headers = self.__get_headers()
            headers['Host'] = 'account.weibo.com'
            req = self.pack_request(url=login_url, headers=headers)
            result = urllib2.urlopen(req)
            self.cj.save(self.cookiefile, True, True)
            if result.info().get('Content-Encoding') == 'gzip':
                # Same helper name the other methods use; the original
                # spelling ``gzipData`` does not match them.
                html = self.gzip_data(result.read())
            else:
                html = result.read()
        except Exception as e:
            logError(e)
            logError('relogin failure.')
            self.clear_cookiedat(self.cookiefile)
            return False
def run(self):
    """Fetch one page of Weibo search results and hand it to ``self.fetch``.

    The search URL is built from ``self.key``, the ``self.startTime`` /
    ``self.endTime`` range and ``self.id`` as the page number, and is
    requested through ``self.sina.get_response_content``. Any exception is
    logged and swallowed.
    """
    try:
        # ``&timescope`` had been mangled into ``×cope`` (``&times`` decoded
        # as an HTML entity), which dropped the time-range filter.
        url = 'http://s.weibo.com/weibo/%s&xsort=time&scope=ori&timescope=custom:%s:%s&page=%d' \
            % (self.key, self.startTime, self.endTime, self.id)
        headers = {
            'Host': 's.weibo.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Referer': 'http://s.weibo.com',
        }
        content = self.sina.get_response_content(url, headers)
        if content == '':
            logError(u'%s failure:获取网页内容为空!' % self.id)
        # NOTE(review): the original still passes an empty page on to
        # fetch(); kept as is -- confirm whether it should be skipped.
        # Follow-up processing: extract the weibo entries from the page.
        self.fetch(content)
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('SearchWeiboThread run Error %s happened on line %d'
                 % (s[1], s[2].tb_lineno))
def clear_cookiedat(self, datpath):
    """Delete the cookie file at ``datpath``.

    Best effort: any error raised by the removal (for example the file not
    existing) is logged and not propagated.

    :param datpath: path of the file to remove.
    """
    try:
        os.remove(datpath)
    except Exception as e:  # ``as`` form is valid on Python 2.6+ and 3
        logError(e)
def do_login(self, login_un, login_pw, door=''):
    """Step two of the login: POST the encrypted credentials to Sina SSO.

    Reads ``self.servertime``, ``self.nonce`` and ``self.rsakv``, which
    ``login`` sets from ``get_servertime`` before calling this method.

    :param login_un: account name; encoded with ``self.get_user``.
    :param login_pw: password; encrypted with ``self.get_pwd``.
    :param door: optional verification code; when given it is sent
        lower-cased together with ``self.pcid``.
    :returns: the response body (passed through ``self.gzip_data`` when the
        response is gzip-encoded), or None when anything raised (the error
        is logged).
    """
    try:
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.2)'
        # Building the POST body correctly is the crucial part.
        postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'userticket': '1',
            'pagerefer': '',
            'ssosimplelogin': '******',
            'vsnf': '1',
            'vsnval': '',
            'service': 'miniblog',
            'pwencode': 'rsa2',
            'rsakv': self.rsakv,
            'encoding': 'UTF-8',
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'prelt': '26',
        }
        postdata['servertime'] = self.servertime
        postdata['nonce'] = self.nonce
        postdata['su'] = self.get_user(login_un)
        postdata['sp'] = self.get_pwd(login_pw, self.servertime,
                                      self.nonce).lower()
        # Only needed when the server asks for a verification code.
        if door:
            postdata['pcid'] = self.pcid
            postdata['door'] = door.lower()
        headers = {
            'Host': 'login.sina.com.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Referer': 'http://weibo.com/',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        req = self.pack_request(url, headers, postdata)
        result = urllib2.urlopen(req)
        try:
            gzipped = result.info().get('Content-Encoding') == 'gzip'
            body = result.read()
        finally:
            # Release the connection even if reading fails.
            result.close()
        if gzipped:
            return self.gzip_data(body)
        return body
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('do_login: %s happened on line %d' % (s[1], s[2].tb_lineno))
        return None
def login(self, login_un, login_pw):
    """Public login entry point; runs the three login steps in order.

    1. ``get_servertime`` supplies servertime, nonce, pubkey and rsakv.
    2. ``do_login`` posts the encrypted credentials.
    3. On ``retcode=0`` the redirect URL is followed by ``redo_login``,
       which saves the cookies.

    :param login_un: account name.
    :param login_pw: password.
    :returns: the result of ``redo_login`` when step two reports
        ``retcode=0``; False in every other case (errors are logged).
    """
    login_ok = False
    try:
        try:
            # Step 1: fetch the values used to encrypt the credentials.
            stObj = self.get_servertime()
            self.servertime = stObj.get('servertime')
            self.nonce = stObj.get('nonce')
            self.pubkey = stObj.get('pubkey')
            self.rsakv = stObj.get('rsakv')
        except Exception as e:
            logError(e)
            return False
        self.get_global_id()
        # Step 2: log in with the encrypted password.
        loginHtml = self.do_login(login_un, login_pw)
        if not loginHtml:
            # do_login has already logged why it produced no response.
            return False
        loginHtml = loginHtml.replace('"', "'")
        try:
            redirect = re.compile(r"location\.replace\('(.*?)'\)")
            login_url = redirect.search(loginHtml).group(1)
            if 'retcode=0' in loginHtml:
                # Step 3: follow the redirect from step two to obtain the
                # cookies. Only when this succeeds is the login complete.
                return self.redo_login(login_url)
            if 'retcode=5' in loginHtml:
                logError('password or account error.')
                return False
            if 'retcode=4040' in loginHtml:
                logError('do login too much times.')
                return False
            # A verification code is required; not supported yet, so fail.
            if 'retcode=4049' in login_url:
                logError('nead input verify code, return failure.')
                return False
        except Exception as e:
            logError(e)
            s = sys.exc_info()
            logError('do login %s happened on line %d'
                     % (s[1], s[2].tb_lineno))
            login_ok = False
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('login: %s happened on line %d' % (s[1], s[2].tb_lineno))
        login_ok = False
    return login_ok
def do_login(self, login_un, login_pw, door=''):
    """Step two of the login: POST the encrypted credentials to Sina SSO.

    Reads ``self.servertime``, ``self.nonce`` and ``self.rsakv``, which
    ``login`` sets from ``get_servertime`` before calling this method.

    :param login_un: account name; encoded with ``self.get_user``.
    :param login_pw: password; encrypted with ``self.get_pwd``.
    :param door: optional verification code; when given it is sent
        lower-cased together with ``self.pcid``.
    :returns: the response body (passed through ``self.gzip_data`` when the
        response is gzip-encoded), or None when anything raised (the error
        is logged).
    """
    try:
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.2)'
        # Building the POST body correctly is the crucial part.
        postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'userticket': '1',
            'pagerefer': '',
            'ssosimplelogin': '******',
            'vsnf': '1',
            'vsnval': '',
            'service': 'miniblog',
            'pwencode': 'rsa2',
            'rsakv': self.rsakv,
            'encoding': 'UTF-8',
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'prelt': '26',
        }
        postdata['servertime'] = self.servertime
        postdata['nonce'] = self.nonce
        postdata['su'] = self.get_user(login_un)
        postdata['sp'] = self.get_pwd(login_pw, self.servertime,
                                      self.nonce).lower()
        # Only needed when the server asks for a verification code.
        if door:
            postdata['pcid'] = self.pcid
            postdata['door'] = door.lower()
        headers = {
            'Host': 'login.sina.com.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Referer': 'http://weibo.com/',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        req = self.pack_request(url, headers, postdata)
        result = urllib2.urlopen(req)
        try:
            gzipped = result.info().get('Content-Encoding') == 'gzip'
            body = result.read()
        finally:
            # Release the connection even if reading fails.
            result.close()
        if gzipped:
            return self.gzip_data(body)
        return body
    except Exception as e:
        logError(e)
        s = sys.exc_info()
        logError('do_login: %s happened on line %d' % (s[1], s[2].tb_lineno))
        return None
logError('do login too much times.') return False # 需要验证码,悲剧,先报错吧!后续优化~ if 'retcode=4049' in login_url: logError('nead input verify code, return failure.') return False except Exception, e: logError(e) s = sys.exc_info() msg = ('do login %s happened on line %d' % (s[1], s[2].tb_lineno)) logError(msg) loginFalg = False except Exception, e: logError(e) s = sys.exc_info() msg = ('login: %s happened on line %d' % (s[1], s[2].tb_lineno)) logError(msg) loginFalg = False return loginFalg def do_login(self, login_un, login_pw, door = ''): ''' 第二步登录 ''' loginFlag = False try:
return False # 需要验证码,悲剧,先报错吧!后续优化~ if 'retcode=4049' in login_url: logError('nead input verify code, return failure.') return False except Exception, e: logError(e) s = sys.exc_info() msg = ('do login %s happened on line %d' % (s[1], s[2].tb_lineno)) logError(msg) loginFalg = False except Exception, e: logError(e) s = sys.exc_info() msg = ('login: %s happened on line %d' % (s[1], s[2].tb_lineno)) logError(msg) loginFalg = False return loginFalg def do_login(self, login_un, login_pw, door=''): ''' 第二步登录 ''' loginFlag = False try: username = login_un