Exemple #1
0
 def initParams(self):
     """Create the logger and copy request settings from ``myconf``.

     Sets ``self.logger`` to a logger named 'SinaWSpider' that writes to
     ``myconf.log_out_path``, then copies ``headers``, ``access_token`` and
     ``app_key`` from ``myconf`` onto the instance.

     Returns:
         self, so calls can be chained.
     """
     make_logger = LogClient().createLogger
     self.logger = make_logger('SinaWSpider', myconf.log_out_path)
     # Mirror the configuration values one by one, in the same order.
     for setting in ("headers", "access_token", "app_key"):
         setattr(self, setting, getattr(myconf, setting))
     return self
class SinaClient(object):
    """Client that logs in to login.sina.com.cn through a ``requests`` session.

    Typical use: ``SinaClient(user, pwd).login()`` or
    ``SinaClient().login(user, pwd)``. After a successful login ``state`` is
    True and ``session`` holds the authenticated ``requests.Session``.
    """

    def __init__(self, username=None, password=None):
        # Credentials supplied by the caller.
        self.username = username
        self.password = password
        # Values filled in by setPostData() from dataEncode.get_prelogin_info().
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Form payload posted to the login endpoint.
        self.post_data = None
        self.headers = {}
        # Session produced by the most recent login() call.
        self.session = None
        self.cookiejar = None
        self.logger = None
        # True once login() has received retcode "0".
        self.state = False
        self.initParams()

    def initParams(self):
        """Create the logger and load the default headers.

        The log file is 'out/log_<YYYYMMDD>.log' for the current local date.

        Returns:
            self, so calls can be chained.
        """
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        # NOTE(review): this aliases the dict owned by dataEncode rather than
        # copying it -- confirm that sharing is intended.
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt.

        Returns:
            self, so calls can be chained.
        """
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form payload.

        Delegates to ``dataEncode.get_prelogin_info`` and
        ``dataEncode.encode_post_data``; the result is stored in
        ``self.post_data``.

        Returns:
            self, so calls can be chained.
        """
        (self.servertime, self.nonce, self.pubkey,
         self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(self.username,
                                                     self.password,
                                                     self.servertime,
                                                     self.nonce, self.pubkey,
                                                     self.rsakv)
        return self

    def login(self, username=None, password=None):
        """Post the login form and record whether the login succeeded.

        Args:
            username: account name; when None the value given to the
                constructor or to setAccount() is kept.
            password: account password; same fallback as *username*.

        Returns:
            The ``requests.Session`` used for the request. It is returned
            (and stored in ``self.session``) even when the login failed;
            check ``self.state`` for the outcome.
        """
        # Only override stored credentials with values actually supplied, so
        # SinaClient(user, pwd).login() does not wipe them with None.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        # Forget the outcome of any earlier attempt.
        self.state = False
        self.setPostData()
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Expose the session cookies as an explicit Cookie header.
                cookies = "; ".join(
                    key + "=" + value
                    for key, value in session.cookies.get_dict().items())
                session.headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # would raise TypeError inside the handler.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session
Exemple #3
0
class SinaClient(object):
    """Client that logs in to login.sina.com.cn using urllib2 and a cookie jar.

    After a successful login ``status`` is True and ``headers`` carries a
    ``Cookie`` entry built from the cookie jar.
    """

    def __init__(self, username=None, password=None):
        # Credentials supplied by the caller.
        self.username = username
        self.password = password
        # Values filled in by setPostData() from dataEncode.get_prelogin_info().
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Form payload posted to the login endpoint.
        self.post_data = None
        self.headers = {}
        self.session = None
        self.cookiejar = None
        self.logger = None
        # True once login() has received retcode "0".
        self.status = False
        # Overwritten from myconf by initParams().
        self.access_token = None
        self.app_key = None
        self.initParams()
        self.timeout = 3
        # Process-wide default timeout for newly created sockets.
        socket.setdefaulttimeout(self.timeout)
        # Not read by any method of this class.
        self.tryTimes = 8

    def initParams(self):
        """Create the logger and copy request settings from ``myconf``.

        Returns:
            self, so calls can be chained.
        """
        self.logger = LogClient().createLogger('SinaWSpider',
                                               myconf.log_out_path)
        # NOTE(review): this aliases the dict owned by myconf; the header
        # updates made in switchUserAgent() and login() therefore mutate the
        # shared object -- confirm that sharing is intended.
        self.headers = myconf.headers
        self.access_token = myconf.access_token
        self.app_key = myconf.app_key
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt.

        Returns:
            self, so calls can be chained.
        """
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form payload.

        Delegates to ``dataEncode.get_prelogin_info`` and
        ``dataEncode.encode_post_data``; the result is stored in
        ``self.post_data``.

        Returns:
            self, so calls can be chained.
        """
        (self.servertime, self.nonce, self.pubkey,
         self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(self.username,
                                                     self.password,
                                                     self.servertime,
                                                     self.nonce, self.pubkey,
                                                     self.rsakv)
        return self

    def switchUserAgent(self, enableAgent=True):
        """Replace the User-Agent header with a random ``myconf.agent_list`` entry.

        Args:
            enableAgent: accepted for interface compatibility; not consulted.

        Returns:
            self, so calls can be chained.
        """
        self.headers["User-Agent"] = random.choice(myconf.agent_list)
        return self

    def switchUserAccount(self, userlist):
        """Log in with randomly chosen accounts until one attempt succeeds.

        Args:
            userlist: sequence of strings that are split on "|".

        Returns:
            self, so calls can be chained.
        """
        is_login = False
        while not is_login:
            self.switchUserAgent()
            self.logger.info("User-Agent is: " + self.headers["User-Agent"])
            user = random.choice(userlist).split("|")
            # NOTE(review): the next lines were garbled in the source (text
            # between two string literals was masked out). The login call and
            # the status check are a reconstruction that assumes entries are
            # "username|password" -- confirm against the upstream project.
            self.logger.info("logining with user: " + user[0])
            self.login(user[0], user[1])
            if not self.status:
                self.logger.error("Cannot login to sina!")
                continue
            is_login = True
        return self

    def enableCookie(self, enableProxy=False):
        """Install a global urllib2 opener backed by a fresh cookie jar.

        Args:
            enableProxy: accepted for interface compatibility; this
                implementation never configures a proxy.

        Returns:
            self, so calls can be chained.
        """
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(self.cookiejar)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        # Every later urllib2.urlopen() call goes through this opener.
        urllib2.install_opener(opener)
        return self

    def login(self, username=None, password=None):
        """Post the login form with urllib2 and record the outcome.

        Args:
            username: account name; when None the stored value is kept.
            password: account password; same fallback as *username*.

        Returns:
            self; check ``self.status`` for the outcome. Any exception raised
            while sending the request or reading the reply is logged, not
            propagated.
        """
        # Forget the outcome of any earlier attempt so callers never see a
        # stale True after a failed re-login.
        self.status = False
        self.logger.info("Start to login...")
        # Only override stored credentials with values actually supplied.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        self.setPostData()
        self.enableCookie(enableProxy=True)
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        try:
            request = urllib2.Request(login_url,
                                      urllib.urlencode(self.post_data),
                                      headers)
            resText = urllib2.urlopen(request).read()
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.status = True
                # Carry the cookies collected by the jar in the headers.
                headers["Cookie"] = ';'.join(
                    cookie.name + "=" + cookie.value
                    for cookie in self.cookiejar)
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self
Exemple #4
0
class SinaClient(object):
    """Client that logs in to login.sina.com.cn using urllib2, cookies and a proxy.

    After a successful login ``status`` is True and ``headers`` carries a
    ``Cookie`` entry built from the cookie jar.
    """

    def __init__(self, username=None, password=None):
        # Username and password entered by the user.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login status; False means "not logged in".
        self.status = False
        # Information required by the Weibo API.
        self.access_token = None
        self.app_key = None
        # Initialise the related parameters.
        self.initParams()
        self.timeout = 3
        # Process-wide default timeout for newly created sockets.
        socket.setdefaulttimeout(self.timeout)
        # Not read by any method of this class.
        self.tryTimes = 8

    def initParams(self):
        """Create the logger and copy request settings from ``myconf``.

        Returns:
            self, so calls can be chained.
        """
        self.logger = LogClient().createLogger('SinaWSpider',
                                               myconf.log_out_path)
        # NOTE(review): this aliases the dict owned by myconf; the header
        # updates made in switchUserAgent() and login() therefore mutate the
        # shared object -- confirm that sharing is intended.
        self.headers = myconf.headers
        self.access_token = myconf.access_token
        self.app_key = myconf.app_key
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt.

        Returns:
            self, so calls can be chained.
        """
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form payload.

        Delegates to ``dataEncode.get_prelogin_info`` and
        ``dataEncode.encode_post_data``; the result is stored in
        ``self.post_data``.

        Returns:
            self, so calls can be chained.
        """
        (self.servertime, self.nonce, self.pubkey,
         self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(self.username,
                                                     self.password,
                                                     self.servertime,
                                                     self.nonce, self.pubkey,
                                                     self.rsakv)
        return self

    def switchUserAgent(self, enableAgent=True):
        """Replace the User-Agent header with a random ``myconf.agent_list`` entry.

        Args:
            enableAgent: accepted for interface compatibility; not consulted.

        Returns:
            self, so calls can be chained.
        """
        self.headers["User-Agent"] = random.choice(myconf.agent_list)
        return self

    def switchUserAccount(self, userlist):
        """Log in with randomly chosen accounts until one attempt succeeds.

        Used to rotate accounts so that a single account is not banned during
        long crawls.

        Args:
            userlist: sequence of strings that are split on "|".

        Returns:
            self, so calls can be chained.
        """
        is_login = False
        while not is_login:
            self.switchUserAgent()
            self.logger.info("User-Agent is: " + self.headers["User-Agent"])
            user = random.choice(userlist).split("|")
            # NOTE(review): the next lines were garbled in the source (text
            # between two string literals was masked out). The login call and
            # the status check are a reconstruction that assumes entries are
            # "username|password" -- confirm against the upstream project.
            self.logger.info("logining with user: " + user[0])
            self.login(user[0], user[1])
            if not self.status:
                self.logger.error("Cannot login to sina!")
                continue
            is_login = True
        return self

    def enableCookie(self, enableProxy=False):
        """Install a global urllib2 opener backed by a fresh cookie jar.

        All later urllib2 GET and POST requests carry the cookies collected
        in ``self.cookiejar``.

        Args:
            enableProxy: when true, route requests through the proxy returned
                by ``myconf.swithProxy()``.

        Returns:
            self, so calls can be chained.
        """
        self.cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(self.cookiejar)
        if enableProxy:
            proxy = myconf.swithProxy()
            proxy_support = urllib2.ProxyHandler(proxy)
            opener = urllib2.build_opener(proxy_support, cookie_support,
                                          urllib2.HTTPHandler)
            self.logger.info("Proxy enable, proxy is: " + str(proxy))
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        # Every later urllib2.urlopen() call goes through this opener.
        urllib2.install_opener(opener)
        return self

    def login(self, username=None, password=None):
        """Simulate the login flow with urllib2 and record the outcome.

        Args:
            username: account name; when None the stored value is kept.
            password: account password; same fallback as *username*.

        Returns:
            self; check ``self.status`` for the outcome. Any exception raised
            while sending the request or reading the reply is logged, not
            propagated.
        """
        # Reset the login status before trying again.
        self.status = False
        self.logger.info("Start to login...")
        # Only override stored credentials with values actually supplied, so
        # SinaClient(user, pwd).login() does not wipe them with None.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        self.setPostData()
        self.enableCookie(enableProxy=True)
        # URL requested when logging in.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        try:
            request = urllib2.Request(login_url,
                                      urllib.urlencode(self.post_data),
                                      headers)
            resText = urllib2.urlopen(request).read()
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.status = True
                # Add the cookies to the headers.
                headers["Cookie"] = ';'.join(
                    cookie.name + "=" + cookie.value
                    for cookie in self.cookiejar)
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self
 def initParams(self):
     """Create the logger and load the default headers from ``dataEncode``.

     The logger is named 'SinaClient' and its log path is
     'out/log_<YYYYMMDD>.log' for the current local date.

     Returns:
         self, so calls can be chained.
     """
     make_logger = LogClient().createLogger
     log_path = 'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log'
     self.logger = make_logger('SinaClient', log_path)
     self.headers = dataEncode.headers
     return self
class SinaClient(object):
    """Client that logs in to login.sina.com.cn through a ``requests`` session.

    Typical use: ``SinaClient(user, pwd).login()`` or
    ``SinaClient().login(user, pwd)``. After a successful login ``state`` is
    True and ``session`` holds the authenticated ``requests.Session``.
    """

    def __init__(self, username=None, password=None):
        # Username and password entered by the user.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login state; starts as False.
        self.state = False
        # Initialise the related parameters.
        self.initParams()

    def initParams(self):
        """Create the logger and load the default headers.

        The log file is 'out/log_<YYYYMMDD>.log' for the current local date.

        Returns:
            self, so calls can be chained.
        """
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        # NOTE(review): this aliases the dict owned by dataEncode rather than
        # copying it -- confirm that sharing is intended.
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt.

        Returns:
            self, so calls can be chained.
        """
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form payload.

        Delegates to ``dataEncode.get_prelogin_info`` and
        ``dataEncode.encode_post_data``; the result is stored in
        ``self.post_data``.

        Returns:
            self, so calls can be chained.
        """
        (self.servertime, self.nonce, self.pubkey,
         self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(self.username,
                                                     self.password,
                                                     self.servertime,
                                                     self.nonce, self.pubkey,
                                                     self.rsakv)
        return self

    def login(self, username=None, password=None):
        """Log in to https://login.sina.com.cn using the requests library.

        Args:
            username: account name; when None the value given to the
                constructor or to setAccount() is kept.
            password: account password; same fallback as *username*.

        Returns:
            The ``requests.Session`` used for the request. It is returned
            (and stored in ``self.session``) even when the login failed;
            check ``self.state`` for the outcome.
        """
        # Only override stored credentials with values actually supplied, so
        # SinaClient(user, pwd).login() does not wipe them with None.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        # Forget the outcome of any earlier attempt.
        self.state = False
        self.setPostData()
        # URL requested when logging in.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the session cookies to the headers.
                cookies = "; ".join(
                    key + "=" + value
                    for key, value in session.cookies.get_dict().items())
                session.headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # would raise TypeError inside the handler.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session
Exemple #7
0
class SinaClient(object):
    """Python 3 client for logging in to login.sina.com.cn.

    Two login paths are offered: ``login`` (requests session) and ``login2``
    (urllib with a cookie jar). Helper methods fetch URLs with the stored
    headers, save content to disk and decompress gzip payloads.
    """

    def __init__(self, username=None, password=None):
        # Username and password entered by the user.
        self.username = username
        self.password = password
        # Data obtained from prelogin.php.
        self.servertime = None
        self.nonce = None
        self.pubkey = None
        self.rsakv = None
        # Data submitted with the login request.
        self.post_data = None
        self.headers = {}
        # Holds the session after login.
        self.session = None
        self.cookiejar = None
        # Used to emit log messages.
        self.logger = None
        # Login state; starts as False.
        self.state = False
        # Initialise the related parameters.
        self.initParams()

    def initParams(self):
        """Create the logger and load the default headers.

        The log file is 'out/log_<YYYYMMDD>.log' for the current local date,
        e.g. out/log_20190420.log.

        Returns:
            self, so calls can be chained.
        """
        self.logger = LogClient().createLogger(
            'SinaClient',
            'out/log_' + time.strftime("%Y%m%d", time.localtime()) + '.log')
        # NOTE(review): this aliases the dict owned by dataEncode; login2()
        # adds a Cookie entry to that shared dict -- confirm this is intended.
        self.headers = dataEncode.headers
        return self

    def setAccount(self, username, password):
        """Store the credentials used by the next login attempt.

        Returns:
            self, so calls can be chained.
        """
        self.username = username
        self.password = password
        return self

    def setPostData(self):
        """Fetch the pre-login values and build the login form payload.

        Delegates to ``dataEncode.get_prelogin_info`` and
        ``dataEncode.encode_post_data``; the result is stored in
        ``self.post_data``.

        Returns:
            self, so calls can be chained.
        """
        (self.servertime, self.nonce, self.pubkey,
         self.rsakv) = dataEncode.get_prelogin_info()
        self.post_data = dataEncode.encode_post_data(
            self.username, self.password, self.servertime, self.nonce,
            self.pubkey, self.rsakv)
        return self

    def login(self, username=None, password=None):
        """Log in to https://login.sina.com.cn using the requests library.

        Args:
            username: account name; when None the value given to the
                constructor or to setAccount() is kept.
            password: account password; same fallback as *username*.

        Returns:
            The ``requests.Session`` used for the request. It is returned
            (and stored in ``self.session``) even when the login failed;
            check ``self.state`` for the outcome.
        """
        # Only override stored credentials with values actually supplied, so
        # SinaClient(user, pwd).login() does not wipe them with None.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        # Forget the outcome of any earlier attempt.
        self.state = False
        self.setPostData()
        # Login URL as seen in the browser's network panel; the trailing
        # timestamp parameter is omitted here.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        session = requests.Session()
        response = session.post(login_url, data=self.post_data)
        json_text = response.content.decode('gbk')
        res_info = json.loads(json_text)
        try:
            if res_info["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the session cookies to the headers.
                cookies = "; ".join(
                    key + "=" + value
                    for key, value in session.cookies.get_dict().items())
                session.headers["Cookie"] = cookies
            else:
                self.logger.error("Login Failed! | " + res_info["reason"])
        except Exception as e:
            # str() is required: concatenating the exception object itself
            # raises TypeError inside the handler.
            self.logger.error("Loading error --> " + str(e))
        self.session = session
        return session

    def enableCookie(self, enableProxy=False, proxies=None):
        """Install a global urllib opener backed by a fresh cookie jar.

        Once installed, every later ``urllib.request.urlopen`` call uses this
        opener, so all GET and POST requests carry the collected cookies.

        Args:
            enableProxy: when true, route requests through a proxy.
            proxies: mapping passed to ``urllib.request.ProxyHandler``; when
                None the built-in default proxy is used. Ignored unless
                *enableProxy* is true.

        Returns:
            self, so calls can be chained.
        """
        self.cookiejar = cookiejar.LWPCookieJar()
        # Cookie processor that stores and replays cookies from the jar.
        cookie_support = urllib.request.HTTPCookieProcessor(self.cookiejar)
        if enableProxy:
            if proxies is None:
                proxies = {'http': 'http://122.96.59.107:843'}
            proxy_support = urllib.request.ProxyHandler(proxies)
            opener = urllib.request.build_opener(
                proxy_support, cookie_support, urllib.request.HTTPHandler)
            self.logger.info("Proxy enable.")
        else:
            opener = urllib.request.build_opener(
                cookie_support, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        return self

    def login2(self, username=None, password=None):
        """Simulate the login flow with urllib and record the outcome.

        Args:
            username: account name; when None the stored value is kept.
            password: account password; same fallback as *username*.

        Returns:
            self; check ``self.state`` for the outcome.
        """
        self.logger.info("Start to login...")
        # Only override stored credentials with values actually supplied.
        self.setAccount(self.username if username is None else username,
                        self.password if password is None else password)
        # Forget the outcome of any earlier attempt.
        self.state = False
        self.setPostData()
        self.enableCookie()
        # URL requested when logging in.
        login_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        headers = self.headers
        # POST data must be bytes under Python 3.
        request = urllib.request.Request(
            login_url,
            urllib.parse.urlencode(self.post_data).encode("utf-8"),
            headers)
        resText = urllib.request.urlopen(request).read()
        try:
            jsonText = json.loads(resText)
            if jsonText["retcode"] == "0":
                self.logger.info("Login success!")
                self.state = True
                # Add the cookies collected by the jar to the headers.
                headers["Cookie"] = ';'.join(
                    cookie.name + "=" + cookie.value
                    for cookie in self.cookiejar)
            else:
                self.logger.error("Login Failed --> " + jsonText["reason"])
        except Exception as e:
            # Report through the logger like every other failure path.
            self.logger.error("Login Failed2! --> " + str(e))
        self.headers = headers
        return self

    def openURL(self, url, data=None):
        """Fetch *url* with the stored headers and return the raw body.

        The headers are expected to carry the login cookies.

        Args:
            url: address to request.
            data: optional request body passed to ``urllib.request.Request``.

        Returns:
            The response body as returned by ``read()``.
        """
        req = urllib.request.Request(url, data=data, headers=self.headers)
        return urllib.request.urlopen(req).read()

    def output(self, content, out_path, save_mode="w"):
        """Write *content* to *out_path*, creating parent directories.

        Args:
            content: data to save. ``bytes``/``bytearray`` are written
                unchanged in binary mode; anything else is written as
                ``str(content)``.
            out_path: destination file path.
            save_mode: mode passed to ``open`` (for bytes content a "b" is
                appended when missing).

        Returns:
            self, so calls can be chained.
        """
        self.logger.info("Download html page to local machine. | path: " + out_path)
        prefix = os.path.dirname(out_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if prefix:
            os.makedirs(prefix, exist_ok=True)
        if isinstance(content, (bytes, bytearray)):
            # Writing str(bytes) would store the "b'...'" repr, not the data.
            mode = save_mode if "b" in save_mode else save_mode + "b"
            payload = content
        else:
            mode = save_mode
            payload = str(content)
        # The context manager closes the file even if write() fails.
        with open(out_path, mode) as fw:
            fw.write(payload)
        return self

    def unzip(self, data):
        """Decompress a gzip-compressed byte string.

        Use it on response bodies that arrive gzip-encoded, to avoid garbled
        HTML, e.g.::

            req = urllib.request.Request(url, headers=headers)
            text = urllib.request.urlopen(req).read()
            unzip(text)

        Args:
            data: gzip-compressed bytes.

        Returns:
            The decompressed bytes.
        """
        import io  # local import: the Python 2 StringIO module is gone in 3.x

        with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
            return gz.read()