def needIdenCode(self):
        """Make a first login attempt and report whether a captcha is required.

        Posts ``self.postData`` to ``self.loginURL`` with ``self.loginHeaders``
        through a proxy obtained from ``MyRequests._get_proxies()``.

        :return: the GBK-decoded response body when the page asks for a
            captcha, ``False`` when it does not, ``None`` when the HTTP
            status is not 200.
        """
        # The payload is sent unchanged: the request is declared as
        # 'application/x-www-form-urlencoded', so passing it through
        # json.dumps() would wrap the whole form in quotes and corrupt it.
        response = requests.post(url=self.loginURL,
                                 headers=self.loginHeaders,
                                 data=self.postData,
                                 proxies=MyRequests._get_proxies())

        if response.status_code != 200:
            print("获取请求失败")
            return None

        # Status 200: the request itself succeeded.
        print("获取请求成功")
        content = response.content.decode('gbk')

        # The prompt "请输入验证码" ("please enter the captcha") may appear as
        # literal characters or as \uXXXX escapes inside a JSON/JS string.
        # The previous pattern had lost its backslashes and could match
        # neither form; the optional backslashes keep it a strict superset.
        captcha_prompt = re.compile(
            r'请输入验证码'
            r'|\\?u8bf7\\?u8f93\\?u5165\\?u9a8c\\?u8bc1\\?u7801')

        if captcha_prompt.search(content):
            # Prompt found: the caller needs the page to extract the captcha.
            print("此次安全验证异常,您需要输入验证码")
            return content

        print("此次安全验证通过,您这次不需要输入验证码")
        return False
Example #2
0
    def _judge_is_taobao_head_img(self, url):
        '''
        Tell whether ``url`` resolves to Taobao's default avatar image.

        :param url: avatar image address to check
        :return: True when the URL of the response equals the known
                 default-avatar address; False otherwise, including when
                 the request fails
        '''
        tmp_proxies = MyRequests._get_proxies()

        try:
            # A timeout keeps a dead proxy from blocking the crawler forever;
            # the value matches the other requests issued by this crawler.
            _res = requests.get(url=url,
                                headers=self.headers,
                                proxies=tmp_proxies,
                                timeout=14)
        except Exception:
            # Best effort: a network failure is treated as "not the default
            # avatar". `Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            self.my_lg.info('检测图片地址时网络错误! 跳过!')
            return False

        self.my_lg.info(str(_res.url))
        return _res.url == 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
    def __init__(self):
        '''
        Prepare everything a login attempt needs: the login URL, a proxy
        address, two header sets, the url-encoded login form and a
        cookie-aware urllib opener that goes through the proxy.
        '''
        # Login URL
        self.loginURL = "https://login.taobao.com/member/login.jhtml"
        # Proxy address, used so that our own IP does not get banned
        # self.proxyURL = 'http://120.193.146.97:843'
        self.proxyURL = 'http://' + MyRequests._get_proxies()['http']

        # Headers sent along with the login POST
        self.loginHeaders =  {
            'Host':'login.taobao.com',
            'User-Agent' : HEADERS[randint(0, len(HEADERS)-1)],
            'Referer' : 'https://login.taobao.com/member/login.jhtml',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Connection' : 'Keep-Alive'
        }
        # Second header set (values captured from a mobile-login browser request).
        # NOTE(review): the first key is an empty header name -- it looks like a
        # mangled ':path' pseudo-header; confirm it is intended.
        # NOTE(review): 'content-length' is hard-coded to 3015 regardless of the
        # actual request body; confirm it is intended.
        # NOTE(review): the 'cookie' value is a hard-coded captured token.
        self.headers = {
            '': 'path: /login.htm?_input_charset=utf-8&ttid=h5%40iframe',
            'content-length': '3015',
            'cache-control': 'max-age=0',
            'origin': 'https://login.m.taobao.com',
            'upgrade-insecure-requests': '1',
            'content-type': 'application/x-www-form-urlencoded',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'referer': 'https://login.m.taobao.com/login.htm?from=sm&ttid=h5@iframe&tpl_redirect_url=https%3A%2F%2Fsec.taobao.com%2Fquery.htm%3Faction%3DQueryAction%26event_submit_do_login%3Dok%26smApp%3Dmalldetailskip%26smPolicy%3Dmalldetailskip-h5-anti_Spider-h5SS-checklogin%26smCharset%3DGBK%26smTag%3DMTgzLjE1OS4xNzcuMTMwLCxiY2Y0NWZkZjVlYmI0ZGE2OTlkZjNkZmUyYTA1ODc4Mg%253D%253D%26captcha%3Dhttps%253A%252F%252Fsec.taobao.com%252Fquery.htm%26smReturn%3Dhttps%253A%252F%252Fdetail.m.tmall.com%252Fitem.htm%253Fid%253D20739535568%26smSign%3DEh51d83i2uzo2b2zGtgKRg%253D%253D',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': 'isg=BAEBfhDFP4K9JlO2YSgx_sw0EEvbhnVHYzo4S2NWsohnSiMcoH_n8l8IKL4Mug1Y',
        }
        # User name
        self.username = ''
        # "ua" string. Per the original author it is produced by Taobao's ua
        # algorithm and encodes timestamp, browser, screen resolution, random
        # numbers, mouse movement and clicks, and apparently keyboard input
        # records as well.
        self.ua = '107#ssznSzS9s2xAImllPVytXzO1XnIggOmR9LcTi88ngXX9lXFu//94sFxKXrrOg/ZvsLFLTAtXgFXxxXGQ/CxRlaEJXq6nuVE9luz8TdFDyJB+BaUtsYc0P894xppFjjidAOaEIdSveZpfnD1iDG2n7ISDflmisXa8KMKMoU/xPqBLTYtnO8KQVMpYmoBPOShgsCbDNSCXXQGLbLURQXXlDI5M9IjfOd3dPUh9PEUyljo+bL1NmKqXt3/weVQb8v9QkE/6k9DzC5vWQJAec6+1aOnflejd87g7m9jYygKX3ccb1dbwQmh6qYOcuWlw3mQmKlYV2yr7t7xqEmG01vFkqYVr3/Lpu6tg29V22jXmoxcufdUbepynwC/HwUCiCgOKKZCNjE3OCy4bem2r8dEWdr9d3rhGvmt3jpiUK7XJCy4bek5r8dEmdrtGC3iUbfub25COijn8oEkDvpRIdDy/bqy48C+DCf3AiLpng9gn82splff8c6rjantrG/Cf81V6y9FAyii/Xf53Z5Zr0xMzheiteOnGC5FsmE/46EtzI5IZQkAdv7VYrrexvYMzGvQygi+OmTjae5dbF6w0EJjyE8T4lInDXvIvOoVYPyZ7XP1MbY38+FXEEITAla=='
        # Password as encrypted by Taobao (256 hex digits).
        # NOTE(review): a credential is hard-coded in source here.
        self.password2 = '02d12254b4a503974749e4ee16f72e76081dce05e8fe449b487573ba72f14d47f9df56a8377afd48194afa7053aa1829cd9d3a55476a5510128244fea80fbed8a0c798146912122dcce4059be5ba85b39cc7d51fae6629a103d527256a3a48327c6e4cb1350806fa15e9ea07696cdce9c91658718f72b2f325b0d0784730e9fe'
        # Login form fields. Insertion order is the order urlencode() emits them in.
        # NOTE(review): the key 'from_encoding ' carries a trailing space --
        # confirm whether the server expects it.
        self.post = {
            'ua':self.ua,
            'TPL_checkcode':'',
            'CtrlVersion': '1,0,0,7',
            'TPL_password':'',
            'TPL_redirect_url':'http://i.taobao.com/my_taobao.htm?nekot=udm8087E1424147022443',
            'TPL_username':self.username,
            'loginsite':'0',
            'newlogin':'******',
            'from':'tb',
            'fc':'default',
            'style':'default',
            'css_style':'',
            'tid':'XOR_1_000000000000000000000000000000_625C4720470A0A050976770A',
            'support':'000001',
            'loginType':'4',
            'minititle':'',
            'minipara':'',
            'umto':'NaN',
            'pstrong':'3',
            'llnick':'',
            'sign':'',
            'need_sign':'',
            'isIgnore':'',
            'full_redirect':'',
            'popid':'',
            'callback':'',
            'guf':'',
            'not_duplite_str':'',
            'need_user_id':'',
            'poy':'',
            'gvfdcname':'10',
            'gvfdcre':'',
            'from_encoding ':'',
            'sub':'',
            'TPL_password_2':self.password2,
            'loginASR':'1',
            'loginASRSuc':'1',
            'allp':'',
            'oslanguage':'zh-CN',
            'sr':'1366*768',
            'osVer':'windows|6.1',
            'naviVer':'firefox|35'
        }
        # Url-encode the POST fields into a single form string
        self.postData = urlencode(self.post)

        # Proxy handler.
        # NOTE(review): only the 'http' scheme is mapped while loginURL is
        # https -- confirm the login request is really meant to be proxied.
        self.proxy = ProxyHandler({'http':self.proxyURL})

        # Cookie jar
        self.cookie = LWPCookieJar()

        # Cookie processor bound to the jar above
        self.cookieHandler = HTTPCookieProcessor(self.cookie)

        # Opener used for logging in; its open() method plays the role of urllib2.urlopen
        self.opener = build_opener(self.cookieHandler,self.proxy, HTTPHandler)
Example #4
0
    def get_goods_data(self, goods_id):
        '''
        Fetch and clean the detail payload of a Tmall item.

        Side effects: updates ``self.headers['Referer']``, stores the proxy in
        ``self.proxy`` and stores the outcome in ``self.result_data`` (reset
        to ``{}`` on every failure path so stale data is never saved).

        :param goods_id: two-item sequence ``[type, goods_id]``; an empty or
                         missing value short-circuits to ``{}``
        :return: dict -- the cleaned ``data`` section of the API answer with
                 ``type`` and ``goods_id`` added; the value returned by
                 ``self.init_pull_off_shelves_goods(type)`` when the item is
                 detected as taken off the shelves; ``{}`` on any failure
        '''
        if not goods_id:
            self.result_data = {}
            return {}

        goods_type = goods_id[0]  # Tmall item type
        goods_id = goods_id[1]  # Tmall goods id
        tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
        self.my_lg.info('------>>>| 得到的移动端地址为: %s' % tmp_url)

        params = self._set_params(goods_id=goods_id)
        self.headers.update({'Referer': tmp_url})
        _url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'

        # Pick a proxy for this item.
        tmp_proxies = MyRequests._get_proxies()
        self.proxy = tmp_proxies['http']

        try:
            # The context manager closes the session's pooled connections
            # even when one of the requests fails.
            with requests.session() as s:
                # First request only serves to let requests build the full
                # query string from `params`.
                response = s.get(_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)
                # Strip the '+' characters from the prepared URL and request
                # it again (per the original author this yields the correct
                # request address).
                last_url = re.compile(r'\+').sub('', response.url)
                response = s.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)
                body = response.content.decode('utf-8')
        except Exception:
            self.my_lg.error('requests.get()请求超时... 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.my_lg.error('data为空!')
            self.result_data = {}
            return {}

        try:
            # Explicit raise instead of `assert`, which is stripped under -O.
            if body == '':
                raise ValueError('获取到的body为空值, 此处跳过! 出错type %s: , goods_id: %s' % (str(goods_type), goods_id))
            # Greedy match: everything between the JSONP wrapper's parentheses.
            data = re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0]
        except (ValueError, IndexError) as e:
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        if data == '':
            self.my_lg.error('data为空! 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        try:
            data = json.loads(data)
        except Exception:
            self.my_lg.error('json.loads转换data时出错, 请检查! 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        # `or {}` keeps the lookups below from crashing when a section is
        # absent or null in the answer.
        payload = data.get('data') or {}
        no_evaluates = (payload.get('seller') or {}).get('evaluates') is None

        if (payload.get('trade') or {}).get('redirectUrl', '') != '' and no_evaluates:
            # The item was taken off the shelves; its address redirects to a new page.
            self.my_lg.info('@@@@@@ 该商品已经下架...')
            tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
            self.result_data = {}
            return tmp_data_s

        # Item moved or removed, so the page no longer exists.
        if no_evaluates:
            self.my_lg.error('data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        # From here on `payload` is the real data['data'] dict.
        result_data = payload
        result_data['rate'] = ''  # item reviews
        result_data['resource'] = ''  # buyers' questions to other buyers
        result_data['vertical'] = ''  # questions and answers as well
        result_data['seller']['evaluates'] = ''  # description / service / logistics scores

        api_stack = result_data.get('apiStack') or []
        if not api_stack:
            # Previously an IndexError escaped here.
            self.my_lg.error('apiStack为空! 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        # Replace apiStack[0]['value'] with its cleaned version (in place).
        api_stack[0]['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=api_stack[0].get('value', {})
        )

        # mockData arrives as a JSON string; the lookup sits inside the try so
        # that a missing key is handled like an unparsable value.
        try:
            mock_data = json.loads(result_data['mockData'])
        except Exception:
            self.my_lg.error('json.loads转化mock_data时出错, 跳出 出错type: %s, goods_id: %s' % (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}
        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # The entry may look like {'name': 'esi', 'value': ''}.
        washed_value = api_stack[0].get('value', '')
        if washed_value == '':
            self.my_lg.error("result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: %s, goods_id: %s" % (str(goods_type), goods_id))
            result_data['trade'] = {}
            self.result_data = {}
            return {}

        # `trade` is what later code uses to tell whether the item is off the shelves.
        result_data['trade'] = washed_value.get('trade', {})

        result_data['type'] = goods_type
        result_data['goods_id'] = goods_id
        self.result_data = result_data
        return result_data
Example #5
0
    def get_div_from_pc_div_url(self, url, goods_id):
        '''
        Request an item's PC description and return its cleaned-up div.

        :param url: not used by this method; kept so existing callers keep working
        :param goods_id: item id, sent as ``id`` inside the ``data`` query parameter
        :return: str -- ``self.deal_with_div`` applied to ``pcDescContent``;
                 '' when the requests fail, the body cannot be decoded or
                 parsed, or the JSONP wrapper is not found
        '''
        # Query-string fields the endpoint expects (sample values):
        #   appKey:12574478  t:1509513791232  api:mtop.taobao.detail.getdesc
        #   v:6.0  type:jsonp  dataType:jsonp  timeout:20000
        #   callback:mtopjsonp1  data:{"id":"546818961702","type":"1"}
        appKey = '12574478'
        # Whole seconds since the epoch followed by three random digits.
        t = str(round(time.time())) + str(randint(100, 999))

        # The nested dict has to be serialised to JSON before being sent.
        params = {
            'data': json.dumps({'id': goods_id, 'type': '1'}),
        }

        tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/?appKey={}&t={}&api=mtop.taobao.detail.getdesc&v=6.0&type=jsonp&dataType=jsonp&timeout=20000&callback=mtopjsonp1'.format(
            appKey, t)

        def _get(target, attempts, proxies=None, **kwargs):
            # GET `target` up to `attempts` times, switching to a fresh proxy
            # after every failure. Returns (response, proxies) on success and
            # (None, None) once all attempts are used up.
            last_error = None
            for _ in range(attempts):
                if proxies is None:
                    proxies = MyRequests._get_proxies()
                try:
                    resp = requests.get(
                        target,
                        headers=self.headers,
                        proxies=proxies,
                        timeout=14,
                        **kwargs)
                    return resp, proxies
                except Exception as e:
                    last_error = e
                    proxies = None
            self.my_lg.error('获取pc描述div时请求失败! 出错goods_id: {}, error: {}'.format(goods_id, last_error))
            return None, None

        # Up to three attempts, as before; the difference is that the last
        # failure no longer escapes as an exception but yields ''.
        response, tmp_proxies = _get(tmp_url, 3, params=params)
        if response is None:
            return ''

        # Strip the '+' characters from the prepared URL and request it again
        # (per the original author this yields the correct request address).
        last_url = re.compile(r'\+').sub('', response.url)
        # Reuse the proxy that just worked, then fall back to one fresh proxy.
        response, _ = _get(last_url, 2, proxies=tmp_proxies)
        if response is None:
            return ''

        try:
            data = response.content.decode('utf-8')
        except Exception as e:  # undecodable body
            self.my_lg.error(e)
            return ''

        # Greedy match: everything between the JSONP wrapper's parentheses.
        matches = re.compile(r'mtopjsonp1\((.*)\)').findall(data)
        if not matches:
            return ''

        try:
            data = json.loads(matches[0])
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            self.my_lg.error(e)
            return ''

        if not isinstance(data, dict):
            return ''

        # `or {}` avoids calling .get on a missing/empty 'data' section
        # (the previous default of '' raised AttributeError).
        div = (data.get('data') or {}).get('pcDescContent', '')
        return self.deal_with_div(div)
Example #6
0
    def get_goods_data(self, goods_id):
        '''
        Fetch and clean the detail payload of a Taobao item.

        Side effects: stores the log line in ``self.msg``, the proxy in
        ``self.proxy`` and the outcome in ``self.result_data`` (reset to
        ``{}`` on every failure path so stale data is never saved).

        :param goods_id: id of the item to fetch
        :return: dict -- the cleaned ``data`` section of the API answer; the
                 value returned by ``self.init_pull_off_shelves_goods()`` when
                 the item is detected as taken off the shelves; ``{}`` on any
                 failure
        '''
        self.msg = '------>>>| 对应的手机端地址为: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(
            goods_id)
        self.my_lg.info(self.msg)

        # Build the query parameters.
        params = self._set_params(goods_id=goods_id)
        tmp_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'

        # Pick a proxy for this item.
        tmp_proxies = MyRequests._get_proxies()
        self.proxy = tmp_proxies['http']

        try:
            # The context manager closes the session's pooled connections
            # even when one of the requests fails.
            with requests.session() as s:
                # First request only serves to let requests build the full
                # query string from `params`.
                response = s.get(
                    tmp_url,
                    headers=self.headers,
                    params=params,
                    proxies=tmp_proxies,
                    timeout=14)
                # Strip the '+' characters from the prepared URL and request
                # it again (per the original author this yields the correct
                # request address).
                last_url = re.compile(r'\+').sub('', response.url)
                response = s.get(
                    last_url,
                    headers=self.headers,
                    proxies=tmp_proxies,
                    timeout=14)
                body = response.content.decode('utf-8')
        except Exception:
            self.my_lg.error('requests.get()请求超时...' + ' 出错goods_id: ' +
                             str(goods_id))
            self.my_lg.error('data为空!')
            self.result_data = {}
            return {}

        # Greedy match: everything between the JSONP wrapper's parentheses.
        matches = re.compile(r'mtopjsonp1\((.*)\)').findall(body)
        if not matches:
            self.my_lg.info('data为空!')
            self.result_data = {}
            return {}

        try:
            data = json.loads(matches[0])
        except Exception:
            self.my_lg.error('json.loads转换data时出错, 请检查! 出错goods_id: ' +
                             str(goods_id))
            self.result_data = {}
            return {}

        # `or {}` keeps the lookups below from crashing when a section is
        # absent or null in the answer.
        payload = data.get('data') or {}
        no_evaluates = (payload.get('seller') or {}).get('evaluates') is None

        if (payload.get('trade') or {}).get('redirectUrl', '') != '' and no_evaluates:
            # The item was taken off the shelves; its address redirects to a new page.
            self.my_lg.info('@@@@@@ 该商品已经下架...')
            tmp_data_s = self.init_pull_off_shelves_goods()
            self.result_data = {}
            return tmp_data_s

        # Item moved or removed, so the page no longer exists.
        if no_evaluates:
            self.my_lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
            self.result_data = {}
            return {}

        # From here on `payload` is the real data['data'] dict.
        result_data = payload
        result_data['rate'] = ''  # item reviews
        result_data['resource'] = ''  # buyers' questions to other buyers
        result_data['vertical'] = ''  # questions and answers as well
        result_data['seller']['evaluates'] = ''  # description / service / logistics scores

        api_stack = result_data.get('apiStack') or []
        if not api_stack:
            # Previously an IndexError escaped here.
            self.my_lg.error('apiStack为空! 出错goods_id: ' + str(goods_id))
            self.result_data = {}
            return {}

        # Replace apiStack[0]['value'] with its cleaned version (in place).
        api_stack[0]['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=api_stack[0].get('value', {}))

        # mockData arrives as a JSON string; the lookup sits inside the try so
        # that a missing key is handled like an unparsable value.
        try:
            mock_data = json.loads(result_data['mockData'])
        except Exception:
            self.my_lg.error('json.loads转化mock_data时出错, 跳出' +
                             ' 出错goods_id: ' + str(goods_id))
            self.result_data = {}
            return {}
        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # The entry may look like {'name': 'esi', 'value': ''}.
        washed_value = api_stack[0].get('value', '')
        if washed_value == '':
            self.my_lg.info(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空...."
            )
            result_data['trade'] = {}
            self.result_data = {}
            return {}

        # `trade` is what later code uses to tell whether the item is off the shelves.
        result_data['trade'] = washed_value.get('trade', {})

        self.result_data = result_data
        return result_data