Ejemplo n.º 1
0
    def _get_pc_right_body(self, body):
        '''
        Extract the required data fragments from a PC-side page body.

        :param body: page text expected to contain the
            ``window.__kaolaHeadData = ...;</script>`` assignment
        :return: dict with 'goodsInfoBase' and 'goodsDetailContent' (both passed
            through json_2_dict) and 'kaolaSuperMarket' (raw matched text);
            {} when any of the expected fragments cannot be found
        '''
        def _first_match(pattern, text):
            # First capture group of ``pattern`` in ``text``; IndexError if none.
            return re.compile(pattern).findall(text)[0]

        try:
            head_data = _first_match(
                r'window.__kaolaHeadData = (.*?);</script>', body)
            goods_info_raw = _first_match(
                r'goodsInfoBase: (.*?), //基本', head_data)
            detail_content_raw = _first_match(
                r'goodsDetailContent: (.*?), //图文详情', head_data)
            super_market = _first_match(
                r'kaolaSuperMarket: (.*?), //needSelfTag', head_data)
        except IndexError:
            self.lg.error('遇到错误:', exc_info=True)
            return {}

        return {
            'goodsInfoBase': json_2_dict(json_str=goods_info_raw,
                                         logger=self.lg),
            'goodsDetailContent': json_2_dict(json_str=detail_content_raw,
                                              logger=self.lg),
            'kaolaSuperMarket': super_market,
        }
Ejemplo n.º 2
0
def turn_one_time() -> dict:
    """
    POST once to the lottery ``start.html`` endpoint and return the parsed reply.

    :return: the result of json_2_dict applied to the response body
    """
    cookies = {
        # Refreshed on every call with the current Shanghai-time timestamp.
        'Hm_lpvt_fa0ddec29ac177a2d127cebe209832e3': str(datetime_to_timestamp(get_shanghai_time())),
        'Hm_lvt_fa0ddec29ac177a2d127cebe209832e3': '1537161510,1537228200,1537353114,1537411854',  # fixed value
        'wk_': '9umq63s8g6leobk2p285frmp583nhm9t',  # fixed value
    }
    headers = {
        'Host': 'm.riyiwk.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://m.riyiwk.com',
        'referer': 'https://m.riyiwk.com/lottery.html?check_login=1',
        'accept-language': 'zh-cn',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f/RIYIWK 2.6.0/USER_ID 203793/TOKEN 3a3988e07be98db064a70fc635c0b590',
    }
    response_body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url='https://m.riyiwk.com/lottery/start.html',
        headers=headers,
        cookies=cookies)

    return json_2_dict(response_body)
Ejemplo n.º 3
0
def share_2_wx() -> bool:
    '''
    Post the "share to WeChat" callback.

    :return: True when the reply's 'message' field equals '成功', otherwise False
    '''
    url = 'https://ios.riyiwk.com//user/shareCallback'
    headers = {
        'Host': 'ios.riyiwk.com',
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded',
        'user-agent': 'ExtraIncome/2.6.0 (iPhone; iOS 11.0; Scale/3.00)',
        'accept-language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
    }
    cookies = {'wk_': '8llgqrevckd0bmllcdgrtqjv88elq3fl'}
    data = 'data=6FutSNjTIN512XBvPZXgztwPxRaLLFygqXFrzxnaSHhKJ0RMskgPCJ1veAFe71DmE/Weqi3qbl9Jp%2BWfhSSCtlPnKIheoydBjmxWvUtEh9qV4RXkSil0AWr5P5f8V4jL/OnQQxXgTeOBhhsJK7140Iuc/kdtw0qP'

    response_body = Requests.get_url_body(method='post',
                                          use_proxy=False,
                                          url=url,
                                          headers=headers,
                                          cookies=cookies,
                                          data=data)
    message = json_2_dict(response_body).get('message', '')

    res = message == '成功'
    # The label ('+' / '-') carries the outcome; the text itself is fixed.
    label = '+' if res else '-'
    print('[{}] 分享微信成功!'.format(label))

    return res
Ejemplo n.º 4
0
    def _get_shop_name(self, **kwargs):
        '''
        Look up the shop (seller) nickname for a product.

        :param kwargs: reads ``data``, a dict whose
            '/app/detail/product/base' entry carries 'sellerId'
        :return: the seller's 'nickName' ('' when the field is absent);
            {} when the seller request returns an empty body or the body
            cannot be converted by json_2_dict
        '''
        data = kwargs.get('data', {})

        seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
        tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(seller_id)
        seller_info_body = MyRequests.get_url_body(url=tmp_seller_id_url, headers=self.headers, high_conceal=True)
        if seller_info_body == '':
            print('seller_info为空!')
            return {}

        # The body is parsed directly: the previous list wrap / re-concatenation
        # was a no-op and its "empty list" fallback branch could never run.
        seller_info = json_2_dict(json_str=seller_info_body)
        if seller_info == {}:
            print('卖家信息在转换时出现错误, 此处跳过')
            return {}

        return seller_info.get('sellerInfo', {}).get('nickName', '')
Ejemplo n.º 5
0
    def get_p_info_list(self, goods_id):
        '''
        Fetch the product property (detail introduction) entries for a goods id.

        :param goods_id: product id appended to the properties-list URL
        :return: list of {'p_name': ..., 'p_value': ...} dicts; the empty
            'perportieslist' value itself when nothing was found
        '''
        p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
        p_info_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers, high_conceal=True)
        if p_info_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            p_info_body = '{}'

        raw_properties = json_2_dict(json_str=p_info_body).get('perportieslist', [])
        if raw_properties == []:
            # Reset so stale data does not affect assignments of the next crawl.
            self.result_data = {}
            return raw_properties

        return [
            {'p_name': prop.get('name', ''), 'p_value': prop.get('value')}
            for prop in raw_properties
        ]
Ejemplo n.º 6
0
 async def json_2_dict(self, json_str):
     '''
     Coroutine wrapper that delegates to the json_2_dict helper, passing this
     object's logger (self.my_lg).

     :param json_str: text handed to json_2_dict
     :return: whatever json_2_dict returns (original note: {} | {...})
     '''
     converted = json_2_dict(json_str=json_str, logger=self.my_lg)

     return converted
Ejemplo n.º 7
0
    def _get_comment_data(self, goods_id):
        '''
        Crawl comment pages 1-3 for an item from the PC-side rate endpoint and
        store the assembled result in self.result_data.

        :param goods_id: item id; '' short-circuits with an empty result
        :return: a CommentItem with 'goods_id', 'create_time', 'modify_time'
            and '_comment_list' set; {} on failure (self.result_data is reset
            to {} in that case)
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        # Normalised once so the string concatenations below also accept
        # non-str ids (the result already stores str(goods_id)).
        goods_id_text = str(goods_id)
        _tmp_comment_list = []
        self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id_text)

        # The address crawled below is the PC-side data endpoint.
        tmp_url = 'https://rate.taobao.com/feedRateList.htm'
        # Raw string: '\(' in a plain literal is an invalid escape sequence.
        # Compiled once instead of on every page.
        wrapped_payload_pattern = re.compile(r'\((.*)\)')

        # Fetch the comment data page by page.
        for current_page_num in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
            _params = self._set_params(current_page_num=current_page_num, goods_id=goods_id)

            self.headers.update({'referer': 'https://item.taobao.com/item.htm?id=' + goods_id_text})
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=_params, encoding='gbk')
            # self.my_lg.info(str(body))

            try:
                # Keep only the text between the outermost parentheses.
                body = wrapped_payload_pattern.findall(body)[0]
            except IndexError:
                self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + goods_id_text)
                sleep(.5)
                self.result_data = {}
                return {}

            data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
            if data is None:
                self.my_lg.error('出错goods_id: ' + goods_id_text)
                self.result_data = {}
                return {}
            if data == []:  # this page has "comments"=[]; move on to the next page
                continue

            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id: ' + goods_id_text)
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()

        _r = CommentItem()
        _r['goods_id'] = goods_id_text
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
Ejemplo n.º 8
0
    def _wash_target_data(self, data):
        '''
        清洗数据
        :return:
        '''
        try:
            data['comment'] = {}
            data['service'] = []  # 发货售后
        except:
            pass

        tmp_activitys = data.get('good', {}).get('activitys', {})
        activitys = {}
        try:
            for key, value in tmp_activitys.items():
                value = json_2_dict(value, logger=self.my_lg)
                activitys.update({
                    key: value,
                })
        except Exception as e:
            raise e

        data['good']['activitys'] = activitys

        return data
Ejemplo n.º 9
0
    def get_div_desc_body(self, div_desc_url):
        '''
        Fetch the div_desc page and return its washed HTML.

        :param div_desc_url: URL of the description endpoint
        :return: the washed 'data' field wrapped in ``<div>...</div>``;
            '' when the washed text is empty
        '''
        # Fetched with requests.
        response_text = MyRequests.get_url_body(url=div_desc_url,
                                                headers=self.headers)
        if response_text == '':
            response_text = '{}'

        desc = json_2_dict(json_str=response_text).get('data', '')
        if desc == '':
            # Reset so stale data does not affect assignments of the next crawl.
            self.result_data = {}

        desc = self._wash_div_desc(tmp_body=desc)
        if desc == '':
            return desc

        return '<div>' + desc + '</div>'
Ejemplo n.º 10
0
    def get_p_info_list(self, p_info_url):
        '''
        Fetch the product property (detail introduction) entries.

        :param p_info_url: URL of the properties-list endpoint
        :return: list of {'p_name': ..., 'p_value': ...} dicts; the empty
            'perportieslist' value itself when nothing was found
        '''
        # Fetched with requests.
        p_info_body = MyRequests.get_url_body(url=p_info_url,
                                              headers=self.headers)
        if p_info_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            p_info_body = '{}'

        raw_properties = json_2_dict(json_str=p_info_body).get(
            'perportieslist', [])
        if raw_properties == []:
            # Reset so stale data does not affect assignments of the next crawl.
            self.result_data = {}
            return raw_properties

        return [
            {'p_name': prop.get('name', ''), 'p_value': prop.get('value')}
            for prop in raw_properties
        ]
Ejemplo n.º 11
0
    def orc_captcha(captcha_url):
        '''
        Recognise a captcha: download the image, run it through
        baidu_ocr_captcha and return the words of the first result.

        :param captcha_url: URL of the captcha image
        :return: recognised text, or '' when 'words_result' is empty
        '''
        img_path = './images/captcha.jpg'
        baidu_orc_info_path = '/Users/afa/baidu_orc.json'
        with open(baidu_orc_info_path, 'r') as f:
            raw_credentials = f.read()
        baidu_orc_info = json_2_dict(raw_credentials)

        app_id = str(baidu_orc_info['app_id'])
        api_key = baidu_orc_info['api_key']
        secret_key = baidu_orc_info['secret_key']

        save_img_through_url(img_url=captcha_url, save_path=img_path)
        orc_res = baidu_ocr_captcha(app_id=app_id,
                                    api_key=api_key,
                                    secret_key=secret_key,
                                    img_path=img_path,
                                    orc_type=2)
        try:
            return orc_res.get('words_result', [])[0].get('words', '')
        except IndexError:
            # No recognition result at all.
            return ''
Ejemplo n.º 12
0
    def _get_right_body(self, body):
        '''
        Extract the required data fragments from a phone-side page body.

        :param body: page text expected to contain the
            ``window.__Goods__ = ...,}</script>`` assignment
        :return: dict with 'basicInfo', 'skuPropertyList' and
            'goodsDetailContent' (passed through json_2_dict),
            'kaolaSuperMarket', 'brandGoodsAmount' and 'vipGoods' (raw matched
            text) and 'sizeChartImgs' (list of raw matches, possibly empty);
            {} when a required fragment cannot be found
        '''
        try:
            body_1 = re.compile(
                r'window.__Goods__ = (.*?),}</script>').findall(body)[0]
            body_1 += '}'
            # Size chart. Search the whole page: the previous ``body[0]`` only
            # looked at the first character, so this list was always empty.
            # It stays optional (no ``[0]``), so a page without a size chart
            # is still parsed.
            sizeChartImgs = re.compile(r'sizeChartImgs: (.*?),//').findall(
                body)

            basicInfo = re.compile(r'basicInfo: (.*?),goodsNotice').findall(
                body_1)[0]
            skuPropertyList = re.compile(
                r'skuPropertyList: (.*?),specialGoodsDesc').findall(body_1)[0]
            kaolaSuperMarket = re.compile(
                r'kaolaSuperMarket: (.*?),colorSliderImgs').findall(body_1)[0]
            brandGoodsAmount = re.compile(
                r'brandGoodsAmount: (.*?),brandLogo').findall(body_1)[0]
            goodsDetailContent = re.compile(
                r'goodsDetailContent: (.*?),vipGoods').findall(body_1)[0]
            vipGoods = re.compile(r'vipGoods: (.*?),vipGoodsLogo').findall(
                body_1)[0]
        except IndexError:
            self.lg.error('遇到错误:', exc_info=True)
            return {}

        _ = {}
        _['basicInfo'] = json_2_dict(json_str=basicInfo, logger=self.lg)
        _['skuPropertyList'] = json_2_dict(json_str=skuPropertyList,
                                           logger=self.lg)
        _['kaolaSuperMarket'] = kaolaSuperMarket
        _['brandGoodsAmount'] = brandGoodsAmount
        _['goodsDetailContent'] = json_2_dict(json_str=goodsDetailContent,
                                              logger=self.lg)
        _['vipGoods'] = vipGoods
        _['sizeChartImgs'] = sizeChartImgs

        return _
Ejemplo n.º 13
0
    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Search the tmall mobile site for a keyword, sorted by sales.

        :param keyword: indexable value; ``keyword[1]`` is used as the query text
        :return: list of item URLs (not goods ids), e.g.
            ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...];
            [] on any failure
        '''
        # Approach: tmall m-site search (original note: occasionally unstable but usable).
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'list.tmall.com',
        }
        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        body = MyRequests.get_url_body(url='https://list.tmall.com/m/search_items.htm',
                                       headers=headers,
                                       params=params)
        if body == '':
            return []

        data = json_2_dict(json_str=body, logger=self.my_lg)
        if data == {}:
            self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(
                keyword[1]))
            return []

        found_items = data.get('item', [])
        if found_items is None or found_items == []:
            self.my_lg.error(
                '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                    keyword[1]))
            return []

        try:
            return [str(entry.get('url', '')) for entry in found_items]
        except Exception as e:
            self.my_lg.exception(e)
            self.my_lg.error(
                '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                    keyword[1]))
            return []
Ejemplo n.º 14
0
def get_ak() -> str:
    """
    Read the 'ak' value stored under 'fz_map_info' in the local credentials file.

    :return: the non-empty ak string
    :raises AssertionError: when no ak value could be read
    """
    with open('/Users/afa/myFiles/pwd/baidu_map_pwd.json', 'r') as f:
        # Newlines and double spaces are stripped line by line before parsing.
        bd_api_json = ''.join(
            line.replace('\n', '').replace('  ', '') for line in f)

    fz_map_info = json_2_dict(json_str=bd_api_json).get('fz_map_info', {})
    ak = fz_map_info.get('ak', '')
    assert ak != '', 'ak不为空str!'

    return ak
Ejemplo n.º 15
0
    def _get_comment_data(self, goods_id):
        '''
        Crawl comment pages 1-2 for an item from the JD mobile comment endpoint
        and store the assembled result in self.result_data.

        :param goods_id: item id; '' short-circuits with an empty result
        :return: a CommentItem with 'goods_id', 'create_time', 'modify_time'
            and '_comment_list' set; {} when goods_id is '' or when building
            the comment list fails
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        referer = 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id)
        self.headers.update({'referer': referer})

        # Collected from the JD mobile product-review endpoint.
        comments_url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        _tmp_comment_list = []
        for current_page in (1, 2):
            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=comments_url,
                                           headers=self.headers,
                                           params=params)

            parsed = json_2_dict(json_str=body, logger=self.my_lg)
            page_comments = parsed.get('wareDetailComment', {}).get(
                'commentInfoList', [])
            if page_comments == []:
                # Logged only; the remaining pages are still requested.
                self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))

            _tmp_comment_list.extend(page_comments)
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        now = datetime.datetime.now()
        comment_item = CommentItem()
        comment_item['goods_id'] = str(goods_id)
        comment_item['create_time'] = now
        comment_item['modify_time'] = now
        comment_item['_comment_list'] = _comment_list
        self.result_data = comment_item

        return self.result_data
Ejemplo n.º 16
0
def _test(self):
    """
    Fetch https://httpbin.org/get through a BaseDriver and parse the JSON that
    the page shows inside its <pre> element.

    :return: the parsed dict, or {} when no <pre> content is found
    """
    driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
    body = driver.get_url_body(url='https://httpbin.org/get')
    pre_pattern = re.compile('<pre.*?>(.*)</pre>')
    try:
        pre_text = pre_pattern.findall(body)[0]
        data = json_2_dict(pre_text, default_res={})
    except IndexError:
        return {}
    del driver

    return data
Ejemplo n.º 17
0
    def _get_one_page_articles(self, page_num) -> list:
        '''
        Fetch one page of articles from the 36kr search-column API.

        :param page_num: page number to request
        :return: list of article dicts whose 'user_info' value has been
            passed through json_2_dict; [] when the page has no items
        '''
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': get_random_pc_ua(),
            'Accept': '*/*',
            'Referer': 'https://36kr.com/',
            'Connection': 'keep-alive',
        }
        # '_' is the current timestamp followed by three random digits.
        cache_buster = str(datetime_to_timestamp(get_shanghai_time())) + \
            str(get_random_int_number(100, 999))
        params = (
            ('per_page', '20'),
            ('page', str(page_num)),
            ('_', cache_buster),
        )

        body = Requests.get_url_body(url='https://36kr.com/api/search-column/mainsite',
                                     headers=headers,
                                     params=params,
                                     cookies=None)
        data = json_2_dict(body).get('data', {}).get('items', [])
        if data == []:
            return []

        # 'user_info' is converted in place for every item.
        for article in data:
            article.update(
                {'user_info': json_2_dict(article.get('user_info', ''))})

        return data
Ejemplo n.º 18
0
    def _get_pintuan_goods_info(self):
        '''
        Walk the group-buy listing pages (0..99) and collect every distinct
        goods entry, stopping at the first empty response or empty page.

        :return: list of dicts with 'goods_id', 'begin_time', 'end_time',
            'all_sell_count' and 'page', de-duplicated by 'goods_id'
        '''
        pintuan_goods_id_list = []
        # goods_ids already collected; set membership replaces rebuilding a
        # list of all collected ids for every candidate item.
        seen_goods_ids = set()
        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         high_conceal=True,
                                         ip_pool_type=self.ip_pool_type)
            # Explicit checks instead of ``assert``: assertions are removed
            # under ``python -O``, which would disable these stop conditions.
            if body == '':
                print('body为空值!')
                sleep(.5)
                break

            tmp_data = json_2_dict(json_str=body, default_res={}).get(
                'data', {}).get('goods', [])
            if tmp_data == []:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                sleep(.5)
                break
            sleep(.5)

            tmp_pintuan_goods_id_list = [{
                'goods_id':
                item.get('goods_id', ''),
                'begin_time':
                timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time':
                timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count':
                str(item.get('join_number_int', '')),
                'page':
                page,
            } for item in tmp_data]

            for item in tmp_pintuan_goods_id_list:
                goods_id = item.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(item)

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Ejemplo n.º 19
0
    def _get_origin_comment_list(self, **kwargs) -> list:
        '''
        Collect the raw comment entries from the offerRemark endpoint for
        pages 1 .. self.max_page - 1.

        :param kwargs: reads 'csrf', 'goods_id' and 'cookies' (each defaults to '')
        :return: concatenation of every page's 'model' list
        '''
        csrf = kwargs.get('csrf', '')
        goods_id = kwargs.get('goods_id', '')
        cookies = kwargs.get('cookies', '')

        url = 'https://m.1688.com/page/offerRemark.htm'
        headers = {
            'cookie': cookies,
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
            'authority': 'm.1688.com',
            'x-requested-with': 'XMLHttpRequest',
        }

        origin_comment_list = []
        for page_no in range(1, self.max_page):
            navigate_options = {
                'data': {
                    'bizType': 'trade',
                    'itemId': int(goods_id),
                    'offerId': str(goods_id),
                    'page': page_no,
                    'pageSize': 5,
                    'starLevel': 7
                }
            }
            serialized_options = dumps(navigate_options)
            # Current timestamp followed by three random digits.
            cache_buster = str(datetime_to_timestamp(get_shanghai_time())) + str(get_random_int_number(start_num=100, end_num=999))
            params = (
                ('_csrf', csrf),
                ('__wing_navigate_type', 'view'),
                ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
                ('__wing_navigate_options', serialized_options),
                ('_', cache_buster),
            )
            body = Requests.get_url_body(url=url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
            page_comments = json_2_dict(body, encoding='ascii').get('data', {}).get('model', [])
            pprint(page_comments)
            origin_comment_list.extend(page_comments)
            sleep(.25)

        return origin_comment_list
Ejemplo n.º 20
0
    def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
        """
        Fetch a single page of comment info.

        :param page_num: page number to request
        :param goods_id: product id
        :return: (comments, has_next_page) taken from the reply's 'comments'
            and 'hasNext' fields
        :raises AssertionError: when either field is missing or None
        """
        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='', )
        headers.update({
            'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
        })
        params = (
            ('productId', str(goods_id)),
            ('tagId', ''),
            ('page', str(page_num)),
            ('perPage', self.page_size),
        )
        body = Requests.get_url_body(
            url='https://th5.m.zhe800.com/app/detail/comment/list',
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type)
        reply = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={})

        incomplete = reply.get('comments') is None \
            or reply.get('hasNext') is None
        assert not incomplete, '获取到的data为None, 出错goods_id: {}'.format(goods_id)

        # 'hasNext' tells whether a further page of comments exists.
        has_next_page = reply.get('hasNext', False)
        comments = reply.get('comments', [])
        marker = '+' if comments != [] else '-'
        self.lg.info('[{}] page_num: {}'.format(marker, page_num))

        return comments, has_next_page
Ejemplo n.º 21
0
    def get_true_sku_info(self, sku_info):
        '''
        Build, for every spec, its prices, spec label and remaining stock.

        :param sku_info: iterable of dicts carrying 'goods_id', 'color_name'
            and 'img_url'
        :return: {} when the item-list request yields no data (error) |
            (true_sku_info, i_s), where i_s is the 'i_s' mapping of the last
            matched entry
        '''
        goods_id_str = '-'.join(sku.get('goods_id') for sku in sku_info)
        tmp_url = 'https://p.mia.com/item/list/' + goods_id_str

        tmp_body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)

        tmp_data = json_2_dict(json_str=tmp_body).get('data', [])
        if tmp_data == []:
            self.result_data = {}
            return {}

        true_sku_info = []
        i_s = {}
        for sku in sku_info:
            for remote_item in tmp_data:
                if sku.get('goods_id') != str(remote_item.get('id', '')):
                    continue

                i_s = remote_item.get('i_s', {})
                for size_key, rest_number in i_s.items():
                    # 'SINGLE' means the colour name alone is the spec label.
                    if size_key == 'SINGLE':
                        spec_value = sku.get('color_name')
                    else:
                        spec_value = sku.get('color_name') + '|' + size_key

                    if rest_number == 0:
                        # Specs with no remaining stock are left out.
                        continue

                    true_sku_info.append({
                        'spec_value': spec_value,
                        'normal_price': str(remote_item.get('mp')),
                        'detail_price': str(remote_item.get('sp')),
                        'img_url': sku.get('img_url'),
                        'rest_number': rest_number,
                    })

        return (true_sku_info, i_s)
Ejemplo n.º 22
0
    def get_spike_hour_goods_info(self):
        '''
        Page through the limited-time goods listing for both genders, collect
        the distinct goods and hand them to self.deal_with_data.

        :return: None
        '''
        all_goods_list = []
        collected_ids = []  # ids of the entries already in all_goods_list
        for gender in ['0', '1']:  # male, female
            for page in range(0, 100):  # page selects which page of data is returned
                print('正在抓取的page为: ', page)

                body = self.get_one_page_goods_info(gender, page)
                json_body = json_2_dict(body, default_res={})
                group_list = json_body.get('data', {}).get('groupList', [])
                try:
                    this_page_total_count = group_list[0].get('totalCount', 0)
                except IndexError:
                    print('获取this_page_total_count时出错, 请检查!')
                    this_page_total_count = 0

                if this_page_total_count == 0:
                    print('### 该性别的全部限时商品信息获取完毕 ###')
                    break

                for goods in group_list[0].get('dataList', []):
                    goods['gender'] = gender
                    goods['page'] = page
                    goods_key = goods.get('id', 0)
                    if goods_key not in collected_ids:
                        collected_ids.append(goods_key)
                        all_goods_list.append(goods)

                sleep(.4)

        all_goods_list = [{
            'goods_id': str(goods.get('chuchuId', '')),
            'sub_title': goods.get('description', ''),
            'gender': goods.get('gender', '0'),
            'page': goods.get('page')
        } for goods in all_goods_list]
        print(all_goods_list)
        print('本次抓取共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return None
Ejemplo n.º 23
0
def check_proxy_status(self, proxy, timeout=CHECK_PROXY_TIMEOUT) -> bool:
    '''
    Check a proxy by requesting TEST_HTTP_HEADER through it.

    Original note: the info published by free proxy sites is unreliable, so
    the proxy type is checked here.
    :param proxy: proxy to check, appended to 'http://'
    :param timeout: request timeout
    :return: True only when the reply is ok, its 'origin' holds no ',' and
        its echoed headers carry no 'Proxy-Connection'; False otherwise,
        including on any exception
    '''
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_pc_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    proxies = {'http': 'http://' + proxy}

    try:
        response = requests.get(url=TEST_HTTP_HEADER,
                                headers=headers,
                                proxies=proxies,
                                timeout=timeout)
        lg.info(str(response.text))
        if not response.ok:
            return False

        content = json_2_dict(json_str=response.text)
        echoed_headers = content.get('headers', {})
        proxy_connection = echoed_headers.get('Proxy-Connection', None)
        lg.info('Proxy-Connection: {}'.format(proxy_connection))
        ip = content.get('origin', '')
        # Two ips in 'origin' means a transparent proxy; only
        # high-anonymity proxies are kept.
        if ',' not in ip and not proxy_connection:
            lg.info(str('成功捕获一只高匿ip: {}'.format(proxy)))
            return True
    except Exception:
        # Any failure simply counts as "proxy not usable".
        pass

    return False
Ejemplo n.º 24
0
def check_proxy_status(self, proxy, local_ip, timeout=CHECK_PROXY_TIMEOUT) -> bool:
    '''
    Probe a proxy and report whether it hides the local ip (high anonymity).

    The anonymity level advertised by free proxy sites is unreliable, so it is
    checked here: a GET is sent through the proxy to TEST_HTTP_HEADER and the
    'origin' field of the JSON reply is compared with local_ip.
    :param proxy: proxy to test; appended to 'http://' (presumably 'host:port' -- confirm with caller)
    :param local_ip: this machine's own ip, which must not appear in the reply
    :param timeout: timeout handed to the session's get
    :return: True only when the proxy is judged high-anonymity, otherwise False
    '''
    headers = _get_base_headers()
    # Only the http scheme is routed through the proxy.
    proxies = {
        'http': 'http://' + proxy,
    }
    try:
        with session() as s:
            response = s.get(url=TEST_HTTP_HEADER, headers=headers, proxies=proxies, timeout=timeout)
            lg.info(str(response.text))
            if not response.ok:
                return False

            content = json_2_dict(json_str=response.text)
            proxy_connection = content.get('headers', {}).get('Proxy-Connection', None)
            lg.info('Proxy-Connection: {}'.format(proxy_connection))
            ip = content.get('origin', '')
            if not ip:
                # Explicit check instead of `assert`: asserts are stripped
                # under `python -O`, and an empty origin would then pass the
                # comparison below and be reported as high-anonymity.
                return False

            # The older rule (reject when ',' is in the ip or a
            # Proxy-Connection header is present) was retired. Per the
            # original note, an un-proxied request now gets
            # '<local ip>, <local ip>' back as origin.
            local_ip_str = '{}, {}'.format(local_ip, local_ip)
            if local_ip_str != ip \
                    and local_ip not in ip:
                lg.info(str('成功捕获一只高匿ip: {}'.format(proxy)))
                return True
    except Exception:
        # Best-effort probe: any failure simply means "not usable".
        pass

    return False
Ejemplo n.º 25
0
    def get_goods_div_desc(self, tmp_p_info_body):
        '''
        Build the div_desc HTML from the product-info JSON body.

        Image urls are read from data -> detailInfos -> detailImage. The
        'list' of the first entry is used, falling back to the second entry
        when the first yields no images. Urls that do not contain 'http' are
        prefixed with 'http:'.
        :param tmp_p_info_body: JSON text handed to json_2_dict
        :return: '' when no image url is found, otherwise a '<div>...</div>' str
        '''
        tmp_p_info_data = json_2_dict(json_str=tmp_p_info_body)
        if tmp_p_info_data == {}:
            return ''

        detail_image = tmp_p_info_data.get('data', {}).get(
            'detailInfos', {}).get('detailImage', [])

        div_images_list = []
        # Slicing instead of indexing [0] / [1] keeps an empty or one-entry
        # detailImage from raising IndexError; it just yields no images.
        for entry in detail_image[:2]:
            div_images_list = [
                item if 'http' in item else 'http:' + item
                for item in entry.get('list', [])
            ]
            if div_images_list:
                break

        if not div_images_list:
            print('div_images_list为空list, 出错请检查!')
            return ''

        img_tags = ''.join(
            r'<img src="{}" style="height:auto;width:100%;"/>'.format(item)
            for item in div_images_list)

        return '<div>' + img_tags + '</div>'
Ejemplo n.º 26
0
    def get_pintuan_goods_info(self):
        '''
        Crawl the paged group-buy list and hand every collected item to
        self.deal_with_data.

        Pages 1..999 are requested in order; the crawl stops at the first page
        whose 'data_list' is empty. A page with an empty body is skipped.
        :return: None
        '''
        goods_list = []
        # Page 0 and page 1 return the same data, so iteration starts at 1.
        for index in range(1, 1000):
            tmp_url = ''.join([
                'https://m.mia.com/instant/groupon/common_list/',
                str(index),
                '/0/',
            ])
            print('正在抓取: ', tmp_url)

            body = Requests.get_url_body(
                url=tmp_url,
                headers=self.headers,
                had_referer=True,
                high_conceal=True,
                ip_pool_type=self.ip_pool_type)
            if body == '':
                print('获取到的body为空值! 此处跳过')
                continue

            tmp_data = json_2_dict(json_str=body)
            if tmp_data == {}:
                # Message only: an empty dict then fails the data_list check
                # just below, which ends the crawl.
                print('json.loads转换body时出错, 此处跳过!')

            page_items = tmp_data.get('data_list', [])
            if page_items == []:
                print('得到的data_list为[], 此处跳过!')
                break

            goods_list.extend([{
                'goods_id': item.get('sku', ''),
                'sub_title': item.get('intro', ''),
                'pid': index,
            } for item in page_items])
            sleep(.5)

        pprint(goods_list)
        self.deal_with_data(goods_list=goods_list)
        sleep(8)
        return None
Ejemplo n.º 27
0
def ocr_mt_captcha():
    '''
    Send ./mt_captcha.png to yundama_ocr_captcha and return its answer.

    The account details (username, pwd, app_key) are loaded from a JSON file
    at a fixed local path.
    :return: whatever yundama_ocr_captcha returns for the image
    '''
    with open('/Users/afa/myFiles/pwd/yundama_pwd.json', 'r') as f:
        raw_account = f.read()
    yundama_info = json_2_dict(raw_account)

    res = yundama_ocr_captcha(
        username=yundama_info['username'],
        pwd=yundama_info['pwd'],
        app_key=yundama_info['app_key'],
        code_type=1004,  # 4 characters, letters/digits (per the original note)
        img_path='./mt_captcha.png')
    print('识别结果:{}'.format(res))

    return res
Ejemplo n.º 28
0
def get_gd_key() -> str:
    '''
    Load the map key file and return the second key listed in it.

    The file is read line by line with newlines and double spaces removed,
    parsed with json_2_dict, and the entries under 'fz_map_info' are reduced
    to their 'key' values.
    :return: the 'key' value of the second 'fz_map_info' entry
    :raises AssertionError: when 'fz_map_info' is an empty list or the chosen key is ''
    :raises IndexError: when 'fz_map_info' holds fewer than two entries
    '''
    gd_map_pwd_file_path = '/Users/afa/myFiles/pwd/gaode_map_pwd.json'
    with open(gd_map_pwd_file_path, 'r') as f:
        gd_api_json = ''.join(
            line.replace('\n', '').replace('  ', '') for line in f)

    gd_key_list = json_2_dict(json_str=gd_api_json).get('fz_map_info', [])
    pprint(gd_key_list)
    # Raised explicitly rather than via `assert`, so the validation is not
    # stripped under `python -O`; exception type and message are unchanged.
    if gd_key_list == []:
        raise AssertionError('gd_key_list不为空list!')

    gd_key_list = [item.get('key', '') for item in gd_key_list]
    gd_key = gd_key_list[1]
    if gd_key == '':
        raise AssertionError('gd_key不为空str!')

    return gd_key
Ejemplo n.º 29
0
    def get_stock_info_dict(self, goods_id):
        '''
        Fetch the realtime stock info of one goods id.
        :param goods_id: id placed into the realtime_info.json url
        :return: the value under 'data' in the parsed response; {} when the
                 body is empty or carries no 'data'
        '''
        url_parts = (
            'https://pina.m.zhe800.com/cns/products/',
            str(goods_id),
            '/realtime_info.json',
        )
        body = MyRequests.get_url_body(
            url=''.join(url_parts),
            headers=self.headers,
            high_conceal=True)
        if body == '':
            print('获取到的stock_info_body为空值!')
            # Fall back to an empty JSON object so parsing still works.
            body = '{}'

        stock_info = json_2_dict(json_str=body).get('data', {})
        if stock_info == {}:
            # Reset so stale result_data cannot affect later crawls when stored.
            self.result_data = {}

        return stock_info
Ejemplo n.º 30
0
    def get_stock_info_dict(self, stock_info_url):
        '''
        Fetch the realtime stock info from the given url.
        :param stock_info_url: url requested through MyRequests.get_url_body
        :return: the value under 'data' in the parsed response; {} when the
                 body is empty or carries no 'data'
        '''
        body = MyRequests.get_url_body(
            url=stock_info_url, headers=self.headers)
        if body == '':
            print('获取到的stock_info_body为空值!')
            # Fall back to an empty JSON object so parsing still works.
            body = '{}'

        stock_info = json_2_dict(json_str=body).get('data', {})
        if stock_info == {}:
            # Reset so stale result_data cannot affect later crawls when stored.
            self.result_data = {}

        return stock_info