Beispiel #1
0
    def _get_this_goods_member_id(self, goods_id):
        '''
        获取member_id
        :param goods_id:
        :return: '' or str
        '''
        headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
            # 'X-DevTools-Emulate-Network-Conditions-Client-Id': '5C1ED6AF76F4F84D961F136EAA06C40F',
        }

        params = (
            ('offerId', str(goods_id)),
        )

        url = 'https://m.1688.com/page/offerRemark.htm'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            self.my_lg.error('获取到的body为空值!此处跳过!')
            return ''

        try:
            member_id = re.compile(r'"memberId":"(.*?)",').findall(body)[0]
        except IndexError:
            self.my_lg.error('获取member_id时索引异常!请检查!')
            return ''

        return member_id
    def needIdenCode(self):
        #第一次登录获取验证码尝试,构建request
        # request = Request(self.loginURL, self.postData, self.loginHeaders)
        # response = self.opener.open(request)        #得到第一次登录尝试的相应
        # content = response.read().decode('gbk')
        # status = response.getcode()           # 获取状态吗

        response = requests.post(url=self.loginURL,
                                 headers=self.loginHeaders,
                                 data=json.dumps(self.postData),
                                 proxies=MyRequests._get_proxies())
        content = response.content.decode('gbk')

        status = response.status_code

        #状态码为200,获取成功
        if status == 200:
            print("获取请求成功")
            #u8bf7u8f93u5165u9a8cu8bc1u7801这六个字是请输入验证码的utf-8编码
            pattern = re.compile(u'u8bf7u8f93u5165u9a8cu8bc1u7801', re.S)
            result = re.search(pattern, content)
            #如果找到该字符,代表需要输入验证码
            if result:
                print("此次安全验证异常,您需要输入验证码")
                return content
            #否则不需要
            else:
                print("此次安全验证通过,您这次不需要输入验证码")
                return False
        else:
            print("获取请求失败")
Beispiel #3
0
    def get_all_img_url(self, goods_id, is_hk):
        '''
        得到all_img_url
        :param goods_id:
        :param is_hk:
        :return:
        '''
        if is_hk is True:  # 全球购
            tmp_url_2 = 'https://www.miyabaobei.hk/item-' + str(
                goods_id) + '.html'
        else:
            tmp_url_2 = 'https://www.mia.com/item-' + str(goods_id) + '.html'

        tmp_body_2 = MyRequests.get_url_body(url=tmp_url_2,
                                             headers=self.headers,
                                             had_referer=True)
        # print(Selector(text=tmp_body_2).css('div.small').extract())

        if tmp_body_2 == '':
            print('请求tmp_body_2为空值, 此处先跳过!')
            return ''

        all_img_url = []
        for item in Selector(text=tmp_body_2).css('div.small img').extract():
            # print(item)
            tmp_img_url = Selector(
                text=item).css('img::attr("src")').extract_first()
            all_img_url.append({'img_url': tmp_img_url})

        return all_img_url
Beispiel #4
0
    def get_jump_to_url_and_is_hk(self, body):
        '''
        得到跳转地址和is_hk
        :param body: 待解析的url的body
        :return: (body, sign_direct_url, is_hk) | 类型: str, str, boolean
        '''
        if re.compile(r'_sign_direct_url = ').findall(
                body) != []:  # 表明是跳转,一般会出现这种情况的是拼团商品
            # 出现跳转时
            try:
                sign_direct_url = re.compile(
                    r"_sign_direct_url = '(.*?)';").findall(body)[0]
                print('*** 获取到跳转地址为: ', sign_direct_url)
            except IndexError:
                sign_direct_url = ''
                print('获取跳转的地址时出错!')

            body = MyRequests.get_url_body(url=sign_direct_url,
                                           headers=self.headers,
                                           had_referer=True)

            if re.compile(r'://m.miyabaobei.hk/').findall(
                    sign_direct_url) != []:
                # 表示为全球购商品
                print('*** 此商品为全球购商品!')
                is_hk = True
            else:
                is_hk = False

        else:
            is_hk = False
            sign_direct_url = ''

        return (body, sign_direct_url, is_hk)
Beispiel #5
0
    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        _tmp_comment_list = []
        self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)

        '''
        下面抓取的是pc端的数据地址
        '''
        # 获取评论数据
        for current_page_num in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
            tmp_url = 'https://rate.taobao.com/feedRateList.htm'
            _params = self._set_params(current_page_num=current_page_num, goods_id=goods_id)

            self.headers.update({'referer': 'https://item.taobao.com/item.htm?id='+goods_id})
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=_params, encoding='gbk')
            # self.my_lg.info(str(body))

            try:
                body = re.compile('\((.*)\)').findall(body)[0]
            except IndexError:
                self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + goods_id)
                sleep(.5)
                self.result_data = {}
                return {}

            data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
            # pprint(data)
            if data is None:
                self.my_lg.error('出错goods_id: ' + goods_id)
                self.result_data = {}
                return {}
            if data == []:  # 该页的"comments"=[], 跳出本次循环
                continue

            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        # self.my_lg.info(str(len(_tmp_comment_list)))
        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id: ' + goods_id)
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()

        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)

        return self.result_data
Beispiel #6
0
    def _get_shop_name(self, **kwargs):
        '''
        得到shop_name
        '''
        data = kwargs.get('data', {})

        seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
        tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(seller_id)
        seller_info_body = MyRequests.get_url_body(url=tmp_seller_id_url, headers=self.headers, high_conceal=True)
        if seller_info_body == '':
            print('seller_info为空!')
            return {}
        else:
            seller_info = [seller_info_body]
        seller_info_str = ''
        for item_ss in seller_info:  # 拼接字符串
            seller_info_str += item_ss

        seller_info = [seller_info_str]
        # print(seller_info)

        if seller_info != []:
            seller_info = json_2_dict(json_str=seller_info[0])
            if seller_info == {}:
                print('卖家信息在转换时出现错误, 此处跳过')
                return {}

            # pprint(seller_info)
            shop_name = seller_info.get('sellerInfo', {}).get('nickName', '')
        else:
            shop_name = ''
        # print(shop_name)

        return shop_name
    def get_p_info_list(self, goods_id):
        '''
        得到详情介绍信息
        :param goods_id:
        :return: 返回一个list
        '''
        p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
        p_info_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers, high_conceal=True)
        if p_info_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            p_info_body = '{}'

        tmp_p_info = json_2_dict(json_str=p_info_body).get('perportieslist', [])
        if tmp_p_info == []:
            self.result_data = {}  # 重置下,避免存入时影响下面爬取的赋值

        if tmp_p_info != []:
            p_info = [{
                'p_name': item.get('name', ''),
                'p_value': item.get('value'),
            } for item in tmp_p_info]
        else:
            p_info = tmp_p_info

        return p_info
    def get_div_desc_body(self, div_desc_url):
        '''
        得到div_desc的html页面
        :param div_desc_url:
        :return: str类型的data, 出错的情况下返回{}
        '''
        # 使用requests
        div_desc_body = MyRequests.get_url_body(url=div_desc_url,
                                                headers=self.headers)
        if div_desc_body == '':
            div_desc_body = '{}'

        # 使用phantomjs
        # div_desc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=div_desc_url)
        # # print(div_desc_body)
        # if div_desc_body == '':
        #     div_desc_body = '{}'
        # else:
        #     try:
        #         div_desc_body = re.compile(r'<body><pre .*?>(.*)</pre></body>').findall(div_desc_body)[0]
        #         div_desc_body = re.compile(r'&gt;').sub('>', div_desc_body)
        #         div_desc_body = re.compile(r'&lt;').sub('<', div_desc_body)
        #     except:
        #         div_desc_body = '{}'

        tmp_body = json_2_dict(json_str=div_desc_body).get('data', '')
        if tmp_body == '':
            self.result_data = {}  # 重置下,避免存入时影响下面爬取的赋值

        tmp_body = self._wash_div_desc(tmp_body=tmp_body)

        if tmp_body != '':
            tmp_body = '<div>' + tmp_body + '</div>'

        return tmp_body
    def get_p_info_list(self, p_info_url):
        '''
        得到详情介绍信息
        :param p_info_url:
        :return: 返回一个list
        '''
        # 使用requests
        p_info_body = MyRequests.get_url_body(url=p_info_url,
                                              headers=self.headers)
        if p_info_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            p_info_body = '{}'

        tmp_p_info = json_2_dict(json_str=p_info_body).get(
            'perportieslist', [])
        if tmp_p_info == []:
            self.result_data = {}  # 重置下,避免存入时影响下面爬取的赋值

        if tmp_p_info != []:
            p_info = [{
                'p_name': item.get('name', ''),
                'p_value': item.get('value'),
            } for item in tmp_p_info]
        else:
            p_info = tmp_p_info

        return p_info
Beispiel #10
0
    def _get_one_fund_info(self, fund_code):
        '''
        得到一只基金的info,并处理
        :return:
        '''
        cookies = {
            'st_pvi': '11586003301354',
            'st_si': '46806950936799',
            'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
            'EMFUND1': 'null',
            'EMFUND0': 'null',
            # 'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
            'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
            # 'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
            'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
            # 'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
            'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
            # 'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
            'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
            # 'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
            'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
            # 'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
            'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
            # 'EMFUND8': '07-10%2018%3A11%3A22@%23%24%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
            'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
            # 'EMFUND9': '07-10 18:12:26@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
        }

        cookies = unquote_cookies(cookies)
        # pprint(cookies)

        headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            # 'Referer': 'http://fund.eastmoney.com/001092.html',
            'Proxy-Connection': 'keep-alive',
        }

        v = re.compile(r'-| |:').sub('', str(get_shanghai_time()))  # 2018-07-10 18:30:46 -> 20180710183046
        # print(v)
        params = (
            # ('v', '20180710175951'),    # 时间
            ('v', v),  # 时间
        )

        fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(fund_code)
        # response = requests.get(fund_url, headers=headers, params=params, cookies=None)
        # body = response.text
        # print(body)

        body = MyRequests.get_url_body(url=fund_url, headers=headers, params=params, cookies=None)
        # print(body)
        self._get_this_fund_info(body=body)

        return True
Beispiel #11
0
    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        根据keyword获取tmall销量靠前的商品
        :param keyword:
        :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...] 不是返回goods_id
        '''
        '''方案: tmall m站的搜索'''  # 搜索: 偶尔不稳定但是还是能用
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://list.tmall.com/search_product.htm?q=%B0%A2%B5%CF%B4%EF%CB%B9&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=d',
            'authority': 'list.tmall.com',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; _med=dw:1280&dh:800&pw:2560&ph:1600&ist:0; cq=ccp%3D1; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=zIc9Cy5z0iS95tACxeX82fUsJdrekjC6%2BomP3kNKji1Z9RKwOt%2Fysyyewwf8twcytUGt2yT9AlAh5ASUlds05g%3D%3D; t=70c4fb481898a67a66d437321f7b5cdf; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=5ee03e566b165; cookie2=1cf9585e0c6d98c72c64beac41a68107; tt=tmall-main; pnm_cku822=098%23E1hvHpvUvbpvUvCkvvvvvjiPPFcvsjYnn2dvljEUPmP9sj1HPFsWtj3EP25ptj3PiQhvCvvvpZptvpvhvvCvpvhCvvOv9hCvvvmtvpvIvvCvxQvvvUgvvhVXvvvCxvvvBZZvvUhpvvChiQvv9Opvvho5vvmC3UyCvvOCvhEC0nkivpvUvvCCEppK6NOEvpCWvKXQwCzE%2BFuTRogRD76fdigqb64B9C97%2Bul1B5c6%2Bu0OVC61D70O58TJOymQD40OeutYon29V3Q7%2B3%2Busj7J%2Bu0OaokQD40OeutYLpGCvvpvvPMM; res=scroll%3A990*6982-client%3A472*680-offset%3A472*6982-screen%3A1280*800; _m_h5_tk=69794695b8eeb690d3ef037f6780d514_1529036786907; _m_h5_tk_enc=3e31314740c37d1fb14a26989cdac03c; isg=BN_f5lvy-LULYv0VwEkGMp59bjVjxpc1-mcB0nEsew7VAP6CeRTDNl2Gx5Z-nAte',
        }

        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = MyRequests.get_url_body(url=s_url,
                                       headers=headers,
                                       params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            data = json_2_dict(json_str=body, logger=self.my_lg)
            if data == {}:
                self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(
                    keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.my_lg.error(
                        '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                            keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.my_lg.exception(e)
                    self.my_lg.error(
                        '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                            keyword[1]))
                    return []

                return goods_id_list
Beispiel #12
0
    def get_true_sku_info(self, sku_info):
        '''
        获取每个规格对应价格跟规格以及其库存
        :param sku_info:
        :return: {} 空字典表示出错 | (true_sku_info, i_s)
        '''
        goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
        # print(goods_id_str)
        tmp_url = 'https://p.mia.com/item/list/' + goods_id_str
        # print(tmp_url)

        tmp_body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
        # print(tmp_body)

        try:
            tmp_data = json.loads(tmp_body).get('data', [])
            # pprint(tmp_data)
        except Exception as e:
            print('json.loads转换tmp_body时出错!')
            tmp_data = []
            self.result_data = {}
            return {}

        true_sku_info = []
        i_s = {}
        for item_1 in sku_info:
            for item_2 in tmp_data:
                if item_1.get('goods_id') == str(item_2.get('id', '')):
                    i_s = item_2.get('i_s', {})
                    # print(i_s)
                    for item_3 in i_s.keys():
                        tmp = {}
                        if item_3 == 'SINGLE':
                            spec_value = item_1.get('color_name')
                        else:
                            spec_value = item_1.get(
                                'color_name') + '|' + item_3
                        normal_price = str(item_2.get('mp'))
                        detail_price = str(item_2.get('sp'))
                        img_url = item_1.get('img_url')
                        rest_number = i_s.get(item_3)
                        if rest_number == 0:
                            pass
                        else:
                            tmp['spec_value'] = spec_value
                            tmp['normal_price'] = normal_price
                            tmp['detail_price'] = detail_price
                            tmp['img_url'] = img_url
                            tmp['rest_number'] = rest_number
                            true_sku_info.append(tmp)

        return (true_sku_info, i_s)
Beispiel #13
0
    def _get_p_info(self, goods_id):
        p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(
            goods_id)
        tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url,
                                                  headers=self.headers,
                                                  had_referer=True)
        # print(tmp_p_info_body)
        assert tmp_p_info_body != '', '获取到的tmp_p_info_body为空值, 请检查!'

        p_info = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)

        return p_info, tmp_p_info_body
Beispiel #14
0
    def _get_div_desc(self, data):
        '''
        得到div_desc
        :param data:
        :return:
        '''
        def _get_right_body(body):
            '''得到main_body'''
            # 处理data-lazy-src
            body = re.compile(r'<img src=').sub('<img data-lazy-src=', body)
            body = re.compile(r'data-lazy-src=').sub('src=', body)
            body = re.compile(r'<img data-src=').sub('<img src=', body)
            body = re.compile(r';opacity:0').sub('', body)  # 不替换否则不显示图片
            # print(body)

            try:
                main_body = re.compile(r'<main .*?>(.*)</main>').findall(body)[0]
            except IndexError:
                main_body = re.compile(r'<body>(.*?)</body>').findall(body)[0]
                main_body = re.compile(r'<script.*?>.*?</script>').sub('', main_body)
            return main_body

        try:
            intros = data.get('good', {}).get('intros', [])[0]
        except IndexError:
            raise IndexError('获取intros获取异常!')

        tabs = intros.get('tabs', [])
        # pprint(tabs)
        div_desc_url = ''
        title_list = [
            '功能详情',
            '产品介绍',
            '概述',
            '商品详情',
        ]
        for item in tabs:
            if item.get('title', '') in title_list:
                div_desc_url = item.get('url', '')
                break

        if div_desc_url == '':
            raise ValueError('获取div_desc_url为空值!')

        body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers)
        # self.my_lg.info(str(body))
        if body == '':
            raise ValueError('获取到的div_desc为空值!')

        div_desc = '<div>' + _get_right_body(body) + '</div>'
        # self.my_lg.info(str(div_desc))

        return div_desc
Beispiel #15
0
    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer':
            'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # 根据京东手机版商品评价获取
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'

            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=_url,
                                           headers=self.headers,
                                           params=params)
            # self.my_lg.info(str(body))

            _data = json_2_dict(json_str=body, logger=self.my_lg).get(
                'wareDetailComment', {}).get('commentInfoList', [])
            if _data == []:
                self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))

            _tmp_comment_list += _data

            sleep(self.comment_page_switch_sleep_time)

        # pprint(_tmp_comment_list)
        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)

        return self.result_data
Beispiel #16
0
    async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
        '''
        根据给与的淘宝分享短链接, 得到target_url, content_id, csid
        :param taobao_short_url:
        :return:
        '''
        if re.compile(r'contentId').findall(taobao_short_url) != []:
            # 先检查是否已为目标地址
            target_url = taobao_short_url

        else:
            body = MyRequests.get_url_body(url=taobao_short_url, headers=self.headers)
            # self.my_lg.info(str(body))
            if body == '':
                self.my_lg.error('获取到的body为空值, 出错短链接地址: {0}'.format(str(taobao_short_url)))
                return '', '', ''

            try:
                # 获取短连接的目标地址
                target_url = re.compile('var url = \'(.*?)\';').findall(body)[0]
                # self.my_lg.info(str(target_url))
            except IndexError:
                self.my_lg.error('获取target_url的时候IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
                target_url = ''

        try:
            # 得到contentId
            content_id = re.compile('contentId=(\d+)').findall(target_url)[0]
            # self.my_lg.info(content_id)
        except IndexError:
            self.my_lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
            content_id = ''

        try:
            # 得到csid
            csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(target_url)[0]
            # self.my_lg.info(csid)
        except IndexError:
            self.my_lg.info('此链接为无csid情况的链接...')
            # self.my_lg.error('获取csid时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
            csid = ''

        try:
            tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag_name = ''

        try:
            tag = re.compile('tag=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag = ''

        return target_url, content_id, csid, tag_name, tag
Beispiel #17
0
    def _get_pintuan_goods_info(self):
        '''
        模拟构造得到data的url, 得到近期所有的限时拼团商品信息
        :return:
        '''
        pintuan_goods_id_list = []
        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '': body = '{}'
            try:
                tmp_data = json.loads(body)
                tmp_data = tmp_data.get('data', {}).get('goods', [])
            except:
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            # print(tmp_data)
            sleep(.5)

            if tmp_data == []:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                break

            tmp_pintuan_goods_id_list = [{
                'goods_id':
                item.get('goods_id', ''),
                'begin_time':
                timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time':
                timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count':
                str(item.get('join_number_int', '')),
                'page':
                page,
            } for item in tmp_data]
            # print(tmp_pintuan_goods_id_list)

            for item in tmp_pintuan_goods_id_list:
                if item.get('goods_id', '') not in [
                        item2.get('goods_id', '')
                        for item2 in pintuan_goods_id_list
                ]:
                    pintuan_goods_id_list.append(item)

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Beispiel #18
0
def _z8_get_parent_dir(goods_id) -> str:
    '''
    折800获取parent_dir (常规, 拼团, 秒杀都可用)
    :param goods_id:
    :return: '' | 'xxx/xxx'
    '''
    headers = {
        'authority': 'shop.zhe800.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': get_random_pc_ua(),
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'referer': 'https://brand.zhe800.com/yl?brandid=353130&page_stats_w=ju_tag/taofushi/1*1&ju_flag=1&pos_type=jutag&pos_value=taofushi&x=1&y=1&n=1&listversion=&sourcetype=brand&brand_id=353130',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # 'cookie': 'has_webp=1; __utmz=148564220.1533879745.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); utm_csr=direct; session_id=1833465439.1533879745; utm_ccn=notset_c0; utm_cmd=; utm_ctr=; utm_cct=; utm_etr=tao.home; firstTime=2018-08-11; qd_user=32364805.1533966002852; f_jk=9269921533966002852qyOwt6Jn; f_jk_t=1533966002857; f_jk_e_t=1536558002; f_jk_r=https://www.zhe800.com/ju_tag/taofushi; user_type=0; downloadGuide_config=%257B%25220direct%2522%253A%257B%2522open%2522%253A1%257D%257D; user_role=1; student=0; gr_user_id=cb0301be-4ee0-4f86-80c5-7d880106ed87; user_id=; utm_csr_first=direct; ac_token=15345580381942356; frequency=1%2C0%2C0%2C0%2C1%2C0%2C0%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0; lastTime=2018-08-18; cart_mark=1%7C0%7C0%7Cnil%7C0; __utma=148564220.1330126799.1533879745.1533965999.1534583229.3; __utmc=148564220; __utmt=1; screenversion=2; ju_rv=BD001_BAYES_RCMD; unix_time=1534583232; ju_version=3; gr_session_id_655df36e87c2496389d319bd67d56fec=c953414c-5377-42f3-b449-99ed57b65b31; gr_session_id_655df36e87c2496389d319bd67d56fec_c953414c-5377-42f3-b449-99ed57b65b31=true; jk=9451481534583233650qyOwt6Jn; __utmb=148564220.5.10.1534583229; new_old_user=1; city_id=330000; source=; platform=; version=; channelId=; deviceId=; userId=; cType=; cId=; dealId=; visit=7; wris_session_id=424077871.1534583351',
    }

    params = (
        ('jump_source', '1'),
        ('qd_key', 'qyOwt6Jn'),
    )

    url = 'https://shop.zhe800.com/products/{0}'.format(goods_id)
    body = MyRequests.get_url_body(url=url,
                                   headers=headers,
                                   params=None,
                                   high_conceal=True)
    # print(body)

    parent_dir = []
    try:
        aside = Selector(text=body).css('aside.pos.area').extract_first()
        # print(aside)
        assert aside is not None, '获取到的aside为None!获取parent_dir失败!'
        _1 = Selector(text=aside).css('em::text').extract_first()
        # print(_1)
        parent_dir.append(_1)
        _2 = re.compile('</i>(.*?)<i>').findall(aside)[1].replace(' ', '')
        # print(_2)
    except Exception as e:
        print('获取parent_dir时遇到错误(默认为""):', e)
        return ''

    parent_dir.append(_2)
    # 父级路径
    parent_dir = '/'.join(parent_dir)
    # print(parent_dir)

    return parent_dir
Beispiel #19
0
def getRandomExternalLink(startingPage):
    html = MyRequests.get_url_body(url=startingPage, headers=headers)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("没有外部链接,准备遍历整个网站")
        domain = urlparse(startingPage).scheme + "://" + urlparse(
            startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(
            0,
            len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
Beispiel #20
0
    def _get_aweme_api_videos_info(self, user_id):
        self.user_id = user_id
        params = (
            ('user_id', self.user_id),
            ('max_cursor', '0'),
            ('count', '20'),
        )

        url = 'https://www.douyin.com/aweme/v1/aweme/post/'
        body = MyRequests.get_url_body(url=url,
                                       headers=self.headers,
                                       params=params)
        # print(body)

        self.deal_with_data(body=body)
Beispiel #21
0
    def get_pintuan_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时拼团商品信息
        :return: None
        '''
        goods_list = []
        for index in range(1, 1000):  # 0跟1返回一样,所有从1开始遍历
            tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                index) + '/0/'
            print('正在抓取: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
            # print(body)

            if body == '':
                print('获取到的body为空值! 此处跳过')

            else:
                try:
                    tmp_data = json.loads(body)
                except:
                    tmp_data = {}
                    print('json.loads转换body时出错, 此处跳过!')

                if tmp_data.get('data_list', []) == []:
                    print('得到的data_list为[], 此处跳过!')
                    break

                else:
                    # print(tmp_data)
                    data_list = [{
                        'goods_id': item.get('sku', ''),
                        'sub_title': item.get('intro', ''),
                        'pid': index,
                    } for item in tmp_data.get('data_list', [])]
                    # pprint(data_list)

                    for item in data_list:
                        goods_list.append(item)
                    sleep(.5)

        pprint(goods_list)
        self.deal_with_data(goods_list=goods_list)
        sleep(8)
        return None
Beispiel #22
0
    def get_stock_info_dict(self, goods_id):
        '''
        得到实时库存信息
        :param goods_id:
        :return: 返回dict类型
        '''
        stock_info_url = 'https://pina.m.zhe800.com/cns/products/' + str(goods_id) + '/realtime_info.json'
        stock_info_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers, high_conceal=True)
        if stock_info_body == '':
            print('获取到的stock_info_body为空值!')
            stock_info_body = '{}'

        tmp_stock_info = json_2_dict(json_str=stock_info_body).get('data', {})
        if tmp_stock_info == {}:
            self.result_data = {}  # 重置下,避免存入时影响下面爬取的赋值

        return tmp_stock_info
    def get_stock_info_dict(self, stock_info_url):
        '''
        得到实时库存信息
        :param stock_info_url:
        :return: 返回dict类型
        '''
        stock_info_body = MyRequests.get_url_body(url=stock_info_url,
                                                  headers=self.headers)
        if stock_info_body == '':
            print('获取到的stock_info_body为空值!')
            stock_info_body = '{}'

        tmp_stock_info = json_2_dict(json_str=stock_info_body).get('data', {})
        if tmp_stock_info == {}:
            self.result_data = {}  # 重置下,避免存入时影响下面爬取的赋值

        return tmp_stock_info
Beispiel #24
0
    def _judge_is_taobao_head_img(self, url):
        '''
        判断是否为淘宝默认头像地址
        :param url:
        :return:
        '''
        tmp_proxies = MyRequests._get_proxies()

        try:
            _res = requests.get(url=url, headers=self.headers, proxies=tmp_proxies)
            self.my_lg.info(str(_res.url))
            if _res.url == 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg':
                return True
            else:
                return False
        except:
            self.my_lg.info('检测图片地址时网络错误! 跳过!')
            return False
Beispiel #25
0
    def get_one_page_goods_info(self, *params):
        '''
        得到一个页面的html代码
        :param params: 待传入的参数
        :return: '{}' or str
        '''
        gender, page = params
        tmp_url = 'https://api.chuchujie.com/api/'

        client = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> 女 | '1' -> 男
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        }

        query = {
            "group": 4,
            "module": "99",
            "page": page,
            "tab": "all"
        }

        # 切记: Query String Parameters直接这样编码发送即可
        # 如果是要post的数据就得使用post的方法
        data = {
            'client': json.dumps(client),
            'query': json.dumps(query),
            'page': page
        }

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=data)
        if body == '':
            body = '{}'

        return body
Beispiel #26
0
    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        根据keyword获取1688销量靠前的商品信息
        :param keyword:
        :return: a list eg: ['11111', ...]
        '''
        '''方案1: 从m.1688.com搜索页面进行抓取, 只取第一页的销量排名靠前的商品'''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; ali_ab=113.215.180.118.1523857816418.4; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; _csrf_token=1528708263870; JSESSIONID=9L783sX92-8iXZBHLCgK4fJiFKG9-W66WeuQ-BRgo4; hng=CN%7Czh-CN%7CCNY%7C156; t=70c4fb481898a67a66d437321f7b5cdf; _tb_token_=5ee03e566b165; __cn_logon__=false; h_keys="aa#2018%u5973%u88c5t%u6064"; alicnweb=homeIdttS%3D38414563432175544705031886000168094537%7Ctouch_tb_at%3D1528767881872%7ChomeIdttSAction%3Dtrue; ctoken=YnzGSFi23yEECqVO988Gzealot; _m_h5_tk=1cdad4dba1f1502fb29f57b3f73f5610_1528770803659; _m_h5_tk_enc=64259ec4fe4c33bc4555166994ed7b4d; __cn_logon__.sig=i6UL1cVhdIpbPPA_02yGiEyKMeZR2hBfnaoYK1CcrF4; ali_apache_id=11.182.158.193.1528768195886.327406.1; XSRF-TOKEN=b84fcec8-8bdf-41a5-a5c1-f8d6bfc9f83e; _tmp_ck_0=IlQ2M6x9F5xTkEpGRay66FVl%2BBaIEY076xELE8UtaLcz%2BgR%2FJ2UZOfDeKILA7R2VgXEJ7VYCkEQjS1RcUCwfL%2Br8ZFi0vwyVwyNpQsD2QG0HaihwedkkF9Cp9Ww0Jr%2BZF4la9CTe0AY8d1E1lDF91tD7lMAKIGVSne3V95CfI8VzpiWJ415B1IA0cc9J6IpYzn0mT1xLYnXcBAkDq0gop74NaynWIxw%2BLqmnXr%2BYU2bkOyMxZOBVY9B%2Bb0FU82h3TC9HCM8dGLnK2kxlgR%2B5lyT%2BCCFhhIX%2FioEMtA0TvDpXvRSUKoDTQG%2FCeJiKfy3LxMXmcTs5TBuWkh31F8nDCpLf6%2FlYOGkqeV1WLJeYXVe3SBvZC2O2JcYBQaKHcesETe%2FwTJL1fyc%3D; ad_prefer="2018/06/12 10:18:21"; webp=1; isg=BJWVxP7WYsuzzEf8vnJ3nRJEpJdFFdP4_0ZTRxc4b4wzbrxg3ONSdf5sPHJY2WFc; ali-ss=eyJ1c2VySWQiOm51bGwsImxvZ2luSWQiOm51bGwsInNpZCI6bnVsbCwiZWNvZGUiOm51bGwsIm1lbWJlcklkIjpudWxsLCJzZWNyZXQiOiJ5V3I0UVJGelVSVGp4dWs4aUxPWGl4dDIiLCJfZXhwaXJlIjoxNTI4ODU3MDE5ODMzLCJfbWF4QWdlIjo4NjQwMDAwMH0=; ali-ss.sig=z0qrG8Cj9BhDL_CLwTzgBGcdjSOXtp6YLxgDdTQRcWE',
        }

        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )

        url = 'https://m.1688.com/offer_search/-6161.html'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            try:
                goods_id_list = Selector(text=body).css(
                    'div.list_group-item::attr("data-offer-id")').extract()
                # pprint(goods_id_list)
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error(
                    '获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                goods_id_list = []

        return goods_id_list
Beispiel #27
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        mia_base_number = MIA_BASE_NUMBER
        while mia_base_number < MIA_MAX_NUMBER:
            tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                mia_base_number)

            body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
            # print(body)

            if body == '' or body == '[]':
                print('mia_base_number为: ', mia_base_number)
                print('获取到的body为空值! 此处跳过')

            else:
                try:
                    tmp_data = json.loads(body)
                except:
                    tmp_data = {}
                    print('json.loads转换body时出错, 此处跳过!')
                tmp_hour = tmp_data.get('p_info', {}).get('start_time',
                                                          '')[11:13]
                if tmp_hour == '22':  # 过滤掉秒杀时间为22点的
                    print('--- 销售时间为22点,不抓取!')
                    pass
                else:
                    print(tmp_data)
                    print('mia_base_number为: ', mia_base_number)
                    pid = mia_base_number
                    begin_time = tmp_data.get('p_info',
                                              {}).get('start_time', '')
                    end_time = tmp_data.get('p_info', {}).get('end_time', '')
                    item_list = tmp_data.get('item_list', [])

                    self.deal_with_data(pid, begin_time, end_time, item_list)

            sleep(.35)
            mia_base_number += 1
Beispiel #28
0
    def traversal_hour_timestamp(self, item):
        '''
        遍历每个需求的整点时间戳
        :param item:
        :return:
        '''
        # 先遍历today的需求的整点时间戳
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(item))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        # print(body)

        if body == '':
            print('item为: ', item)
            print('获取到的body为空值! 此处跳过')

        else:
            try:
                body = re.compile('null\((.*)\)').findall(body)[0]
            except Exception:
                print('re匹配body中的数据时出错!')
                body = '{}'

            try:
                tmp_data = json.loads(body)
            except:
                print('json.loads转换body时出错, 此处跳过!')
                tmp_data = {}

            if tmp_data == {}:
                print('tmp_data为空{}!')
                pass
            else:
                # pprint(tmp_data)
                # print(tmp_data)

                event_time = item
                item_list = tmp_data.get('data', {}).get('list', [])

                self.deal_with_data(event_time, item_list)
                sleep(MOGUJIE_SLEEP_TIME)
Beispiel #29
0
    def get_item_list(self, event_time):
        '''
        得到event_time中所有的商品信息
        :param event_time:
        :return: item_list 类型 list
        '''
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(event_time))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        # print(body)

        if body == '':
            print('获取到的body为空值! 此处跳过')
            item_list = ''

        else:
            try:
                body = re.compile('null\((.*)\)').findall(body)[0]
            except Exception:
                print('re匹配body中的数据时出错!')
                body = '{}'

            try:
                tmp_data = json.loads(body)
            except:
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')

            if tmp_data == {}:
                print('tmp_data为空{}!')
                item_list = []

            else:
                # pprint(tmp_data)
                # print(tmp_data)

                item_list = tmp_data.get('data', {}).get('list', [])
        sleep(.5)

        return item_list
Beispiel #30
0
    def get_one_page_all_goods_list(self, *params):
        '''
        得到一个页面地址的所有商品list
        :return: str | list 类型
        '''
        page = params[0]
        all_goods_list = []
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        # print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # print(body)

        try:
            json_body = json.loads(body)
            # print(json_body)
        except:
            print('json.loads转换body时出错!请检查')
            json_body = {}
            return '网络错误!'

        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []

        for item in this_page_item_list:
            if item.get('item_id', '') not in [
                    item_1.get('item_id', '') for item_1 in all_goods_list
            ]:
                item['page'] = page
                all_goods_list.append(item)

        # sleep(.5)

        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]

        return all_goods_list