class SinaClient(object):
    """Log in to login.sina.com.cn through a ``requests`` session.

    Setter-style methods return ``self`` so calls can be chained;
    ``login`` returns the ``requests`` session it created.
    """

    def __init__(self, username=None, password=None):
        # Account credentials supplied by the caller.
        self.username = username
        self.password = password
        # Values filled in by setPostData() from dataEncode.get_prelogin_info().
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Form data posted to the login endpoint.
        self.post_data = None
        self.headers = {}
        # Session created by the most recent login() call.
        self.session = None
        self.cookiejar = None
        self.logger = None
        # True once the server has accepted a login attempt.
        self.state = False
        self.initParams()

    def initParams(self):
        """Create the logger (one log file per day) and load default headers."""
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt."""
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form data."""
        (self.servertime, self.nonce,
         self.pubkey, self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    @staticmethod
    def _cookie_header(cookies):
        """Render a ``{name: value}`` mapping as a ``Cookie`` header value."""
        return "; ".join(key + "=" + value for key, value in cookies.items())

    def login(self, username=None, password=None):
        """Post the login form and return the ``requests`` session.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            The ``requests.Session`` used for the attempt. ``self.state`` is
            True only if the server answered with ``retcode == "0"``.

        Network and JSON decoding errors propagate to the caller.
        """
        # Keep credentials given to the constructor / setAccount when the
        # caller does not pass new ones, instead of overwriting them with None.
        if username is None:
            username = self.username
        if password is None:
            password = self.password
        self.setAccount(username, password)
        self.setPostData()

        # A previous success must not leak into this attempt's result.
        self.state = False

        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Mirror the session cookies into an explicit Cookie header.
                session.headers["Cookie"] = self._cookie_header(
                    session.cookies.get_dict())
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError and hides the real error.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session
class SinaClient(object):
    """Sina Weibo login client built on ``urllib2`` with a cookie jar.

    Headers, API credentials, the user-agent list and the log path are read
    from ``myconf``. Every public method returns ``self`` so calls chain.
    """

    def __init__(self, username=None, password=None):
        # Account credentials supplied by the caller.
        self.username = username
        self.password = password
        # Values filled in by setPostData() from dataEncode.get_prelogin_info().
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Form data posted to the login endpoint.
        self.post_data = None
        self.headers = {}
        self.session = None
        self.cookiejar = None
        self.logger = None
        # True once the server has accepted a login attempt.
        self.status = False
        # Weibo API settings, loaded from myconf in initParams().
        self.access_token = None
        self.app_key = None
        self.initParams()
        self.timeout = 3
        # Process-wide default for every socket created afterwards.
        socket.setdefaulttimeout(3)
        self.tryTimes = 8

    def initParams(self):
        """Create the logger and load headers / API settings from myconf."""
        self.logger = LogClient().createLogger('SinaWSpider',
                                               myconf.log_out_path)
        self.headers = myconf.headers
        self.access_token = myconf.access_token
        self.app_key = myconf.app_key
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt."""
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form data."""
        (self.servertime, self.nonce,
         self.pubkey, self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    def switchUserAgent(self, enableAgent=True):
        """Pick a random User-Agent from myconf.agent_list.

        ``enableAgent`` is accepted for compatibility but is not consulted.
        """
        user_agent = random.choice(myconf.agent_list)
        self.headers["User-Agent"] = user_agent
        return self

    def switchUserAccount(self, userlist):
        """Log in with randomly chosen accounts until one succeeds.

        Args:
            userlist: entries are split on ``"|"``.
                NOTE(review): assumed to be ``"username|password"`` strings;
                confirm against the caller.

        NOTE(review): part of this method was masked out in the source
        (``"logining with user: "******"Cannot login to sina!"``). The lines
        between the two log messages were reconstructed from the rest of the
        class; verify them against the original file.
        """
        is_login = False
        while not is_login:
            self.switchUserAgent()
            self.logger.info("User-Agent is: " + self.headers["User-Agent"])
            user = random.choice(userlist).split("|")
            self.logger.info("logining with user: " + user[0])
            self.login(user[0], user[1])
            if not self.status:
                self.logger.info("Cannot login to sina!")
                continue
            is_login = True
        return self

    def enableCookie(self, enableProxy=False):
        """Install a global urllib2 opener that records cookies.

        ``enableProxy`` is accepted for compatibility but is not consulted
        in this variant; no proxy handler is installed.
        """
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(self.cookiejar)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        # install_opener makes every later urllib2.urlopen call use it.
        urllib2.install_opener(opener)
        return self

    def login(self, username=None, password=None):
        """Post the login form with urllib2 and record the result.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            ``self``. ``self.status`` is True only if the server answered
            with ``retcode == "0"``; request errors are logged, not raised.
        """
        self.logger.info("Start to login...")
        # Keep stored credentials when the caller passes none.
        if username is None:
            username = self.username
        if password is None:
            password = self.password
        self.setAccount(username, password)
        self.setPostData()
        self.enableCookie(enableProxy=True)

        # A previous success must not leak into this attempt's result,
        # otherwise the retry loop in switchUserAccount accepts a failure.
        self.status = False

        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        try:
            request = urllib2.Request(login_url,
                                      urllib.urlencode(self.post_data),
                                      headers)
            resText = urllib2.urlopen(request).read()
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.status = True
                # Carry the cookies collected by the jar in the headers.
                cookies = ';'.join(cookie.name + "=" + cookie.value
                                   for cookie in self.cookiejar)
                headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self
class SinaClient(object):
    """Sina Weibo login client built on ``urllib2`` with cookies and a proxy.

    Headers, API credentials, the user-agent list, the proxy and the log
    path are read from ``myconf``. Every public method returns ``self``.
    """

    def __init__(self, username=None, password=None):
        # User name and password supplied by the caller.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login status; starts as False (not logged in).
        self.status = False
        # Settings required by the Weibo API.
        self.access_token = None
        self.app_key = None
        # Load the remaining parameters.
        self.initParams()
        self.timeout = 3
        # Process-wide default for every socket created afterwards.
        socket.setdefaulttimeout(3)
        self.tryTimes = 8

    def initParams(self):
        """Create the logger and load headers / API settings from myconf."""
        self.logger = LogClient().createLogger('SinaWSpider',
                                               myconf.log_out_path)
        self.headers = myconf.headers
        self.access_token = myconf.access_token
        self.app_key = myconf.app_key
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt."""
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form data."""
        (self.servertime, self.nonce,
         self.pubkey, self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    def switchUserAgent(self, enableAgent=True):
        """Replace the User-Agent header with a random myconf.agent_list entry.

        ``enableAgent`` is accepted for compatibility but is not consulted.
        """
        user_agent = random.choice(myconf.agent_list)
        self.headers["User-Agent"] = user_agent
        return self

    def switchUserAccount(self, userlist):
        """Switch accounts: log in with random entries until one succeeds.

        Intended to avoid an account being banned after long crawls.

        Args:
            userlist: entries are split on ``"|"``.
                NOTE(review): assumed to be ``"username|password"`` strings;
                confirm against the caller.

        NOTE(review): part of this method was masked out in the source
        (``"logining with user: "******"Cannot login to sina!"``). The lines
        between the two log messages were reconstructed from the rest of the
        class; verify them against the original file.
        """
        is_login = False
        while not is_login:
            self.switchUserAgent()
            self.logger.info("User-Agent is: " + self.headers["User-Agent"])
            user = random.choice(userlist).split("|")
            self.logger.info("logining with user: " + user[0])
            self.login(user[0], user[1])
            if not self.status:
                self.logger.info("Cannot login to sina!")
                continue
            is_login = True
        return self

    def enableCookie(self, enableProxy=False):
        """Install a global urllib2 opener that records cookies.

        Args:
            enableProxy: when true, route requests through the proxy
                returned by ``myconf.swithProxy()``.
        """
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(self.cookiejar)
        if enableProxy:
            proxy = myconf.swithProxy()
            proxy_support = urllib2.ProxyHandler(proxy)
            opener = urllib2.build_opener(proxy_support, cookie_support,
                                          urllib2.HTTPHandler)
            self.logger.info("Proxy enable, proxy is: " + str(proxy))
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        # install_opener makes every later urllib2.urlopen call use it, so
        # subsequent GET/POST requests carry the collected cookies.
        urllib2.install_opener(opener)
        return self

    def login(self, username=None, password=None):
        """Simulate the browser login with urllib2 and record the result.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            ``self``. ``self.status`` is True only if the server answered
            with ``retcode == "0"``; request errors are logged, not raised.
        """
        # Reset the status so a failed attempt is never reported as success.
        self.status = False
        self.logger.info("Start to login...")
        # Keep stored credentials when the caller passes none.
        if username is None:
            username = self.username
        if password is None:
            password = self.password
        self.setAccount(username, password)
        self.setPostData()
        self.enableCookie(enableProxy=True)

        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        try:
            request = urllib2.Request(login_url,
                                      urllib.urlencode(self.post_data),
                                      headers)
            resText = urllib2.urlopen(request).read()
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.status = True
                # Add the cookies collected by the jar to the headers.
                cookies = ';'.join(cookie.name + "=" + cookie.value
                                   for cookie in self.cookiejar)
                headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self
class SinaClient(object):
    """Log in to https://login.sina.com.cn using the ``requests`` library.

    Setter-style methods return ``self`` so calls can be chained;
    ``login`` returns the ``requests`` session it created.
    """

    def __init__(self, username=None, password=None):
        # User name and password supplied by the caller.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login state; starts as False.
        self.state = False
        # Load the remaining parameters.
        self.initParams()

    def initParams(self):
        """Create the logger (one log file per day) and load default headers."""
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt."""
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form data."""
        (self.servertime, self.nonce,
         self.pubkey, self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    @staticmethod
    def _cookie_header(cookies):
        """Render a ``{name: value}`` mapping as a ``Cookie`` header value."""
        return "; ".join(key + "=" + value for key, value in cookies.items())

    def login(self, username=None, password=None):
        """Post the login form and return the ``requests`` session.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            The ``requests.Session`` used for the attempt. ``self.state`` is
            True only if the server answered with ``retcode == "0"``.

        Network and JSON decoding errors propagate to the caller.
        """
        # Keep credentials given to the constructor / setAccount when the
        # caller does not pass new ones, instead of overwriting them with None.
        if username is None:
            username = self.username
        if password is None:
            password = self.password
        self.setAccount(username, password)
        self.setPostData()

        # A previous success must not leak into this attempt's result.
        self.state = False

        # URL requested when logging in.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the session cookies to the headers.
                session.headers["Cookie"] = self._cookie_header(
                    session.cookies.get_dict())
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError and hides the real error.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session
class SinaClient(object):
    """Sina login client (Python 3) with ``requests`` and ``urllib`` paths.

    ``login`` uses a ``requests`` session; ``login2`` uses
    ``urllib.request`` with a cookie jar. Helper methods fetch pages, save
    them to disk and decompress gzip responses.
    """

    def __init__(self, username=None, password=None):
        # User name and password supplied by the caller.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login state; starts as False.
        self.state = False
        # Load the remaining parameters.
        self.initParams()

    def initParams(self):
        """Create the logger and load default headers from dataEncode."""
        # The log file is named by date, e.g. out/log_20190420.log.
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt."""
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form data."""
        (self.servertime, self.nonce,
         self.pubkey, self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    def _resolve_account(self, username, password):
        """Return the given credentials, falling back to the stored ones."""
        if username is None:
            username = self.username
        if password is None:
            password = self.password
        return username, password

    def login(self, username=None, password=None):
        """Log in with ``requests`` and return the session.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            The ``requests.Session`` used for the attempt. ``self.state`` is
            True only if the server answered with ``retcode == "0"``.

        Network and JSON decoding errors propagate to the caller.
        """
        self.setAccount(*self._resolve_account(username, password))
        self.setPostData()

        # A previous success must not leak into this attempt's result.
        self.state = False

        # Login URL as seen in the browser's network tab; the trailing
        # timestamp parameter is omitted here.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the session cookies to the headers.
                cookies = session.cookies.get_dict()
                session.headers["Cookie"] = "; ".join(
                    key + "=" + value for key, value in cookies.items())
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError and hides the real error.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session

    def enableCookie(self, enableProxy=False):
        """Install a global urllib opener that records cookies.

        After this call every ``urllib.request.urlopen`` uses the installed
        opener, so later GET and POST requests carry the collected cookies.

        Args:
            enableProxy: when true, route HTTP requests through the
                hard-coded proxy below.

        Returns:
            ``self``, like the other configuration methods.
        """
        self.cookiejar = cookiejar.LWPCookieJar()
        cookie_support = urllib.request.HTTPCookieProcessor(self.cookiejar)
        if enableProxy:
            proxy_support = urllib.request.ProxyHandler(
                {'http': 'http://122.96.59.107:843'})
            opener = urllib.request.build_opener(
                proxy_support, cookie_support, urllib.request.HTTPHandler)
            self.logger.info("Proxy enable.")
        else:
            opener = urllib.request.build_opener(
                cookie_support, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        return self

    def login2(self, username=None, password=None):
        """Simulate the browser login with ``urllib.request``.

        Args:
            username: account name; falls back to the stored one when None.
            password: account password; falls back to the stored one when None.

        Returns:
            ``self``. On success ``self.state`` is True and ``self.headers``
            carries a ``Cookie`` entry built from the cookie jar.

        Errors raised while sending the request propagate to the caller;
        errors while interpreting the response are logged.
        """
        self.logger.info("Start to login...")
        self.setAccount(*self._resolve_account(username, password))
        self.setPostData()
        self.enableCookie()

        # A previous success must not leak into this attempt's result.
        self.state = False

        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        # POST data must be bytes in Python 3.
        body = urllib.parse.urlencode(self.post_data).encode("utf-8")
        request = urllib.request.Request(login_url, body, headers)
        resText = urllib.request.urlopen(request).read()
        try:
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the cookies collected by the jar to the headers.
                cookies = ';'.join(cookie.name + "=" + cookie.value
                                   for cookie in self.cookiejar)
                headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            # Log through the client's logger rather than printing.
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self

    def openURL(self, url, data=None):
        """Fetch ``url`` with ``self.headers`` and return the raw body.

        The headers are expected to carry the login cookies already.
        """
        req = urllib.request.Request(url, data=data, headers=self.headers)
        return urllib.request.urlopen(req).read()

    def output(self, content, out_path, save_mode="w"):
        """Write ``str(content)`` to ``out_path``, creating parent folders.

        Args:
            content: value to save; it is converted with ``str()``.
                NOTE(review): bytes (e.g. the result of openURL) are
                therefore written as their ``b'...'`` repr; decode first.
            out_path: destination file path.
            save_mode: mode passed to ``open``.

        Returns:
            ``self``.
        """
        self.logger.info("Download html page to local machine. | path: " + out_path)
        prefix = os.path.dirname(out_path)
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        if prefix and not os.path.exists(prefix):
            os.makedirs(prefix)
        # The context manager closes the file even if the write fails.
        with open(out_path, save_mode) as fw:
            fw.write(str(content))
        return self

    def unzip(self, data):
        """Decompress a gzip-encoded response body.

        Use this when a fetched HTML page comes back garbled because the
        server sent it gzip-compressed, e.g. ``unzip(openURL(url))``.

        Args:
            data: gzip-compressed bytes.

        Returns:
            The decompressed bytes.
        """
        # StringIO.StringIO does not exist in Python 3 and cannot hold
        # bytes; gzip.decompress handles the in-memory case directly.
        return gzip.decompress(data)