Exemple #1
0
    def get_all_img_url(self, goods_id, is_hk):
        '''
        Fetch the item page and collect the image urls found under div.small.
        :param goods_id: item id, interpolated into the item page url
        :param is_hk: exactly True selects the miyabaobei.hk host, anything else www.mia.com
        :return: '' when the page body is empty, otherwise a list of {'img_url': ...} dicts
        '''
        host = 'https://www.miyabaobei.hk' if is_hk is True else 'https://www.mia.com'
        page_url = host + '/item-' + str(goods_id) + '.html'

        page_body = Requests.get_url_body(
            url=page_url,
            headers=self.headers,
            had_referer=True,
            ip_pool_type=self.ip_pool_type)
        if page_body == '':
            print('请求tmp_body_2为空值, 此处先跳过!')
            return ''

        # Every matched <img> fragment is parsed again on its own, so each tag
        # contributes exactly one entry to the result.
        img_fragments = Selector(text=page_body).css('div.small img').extract()
        return [
            {'img_url': Selector(text=fragment).css('img::attr("src")').extract_first()}
            for fragment in img_fragments
        ]
Exemple #2
0
    def get_p_info_list(self, goods_id):
        '''
        Fetch the product property list (detail description info) of an item.
        :param goods_id: item id, appended as the productId query value
        :return: a list of {'p_name': ..., 'p_value': ...} dicts; the empty list
                 taken from the response when no properties are present
        '''
        api_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
        raw_body = Requests.get_url_body(
            url=api_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if raw_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            raw_body = '{}'

        properties = json_2_dict(json_str=raw_body).get('perportieslist', [])
        if properties == []:
            # Reset so that stale data is not stored by a later crawl step.
            self.result_data = {}
            return properties

        return [
            {
                'p_name': prop.get('name', ''),
                'p_value': prop.get('value'),
            }
            for prop in properties
        ]
Exemple #3
0
def share_2_wx() -> bool:
    '''
    POST the share callback (share to WeChat) and report the outcome.
    :return: True when the response 'message' field equals '成功', else False
    '''
    response_body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url='https://ios.riyiwk.com//user/shareCallback',
        headers={
            'Host': 'ios.riyiwk.com',
            'accept': '*/*',
            'content-type': 'application/x-www-form-urlencoded',
            'user-agent': 'ExtraIncome/2.6.0 (iPhone; iOS 11.0; Scale/3.00)',
            'accept-language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        },
        cookies={
            'wk_': '8llgqrevckd0bmllcdgrtqjv88elq3fl',
        },
        data='data=6FutSNjTIN512XBvPZXgztwPxRaLLFygqXFrzxnaSHhKJ0RMskgPCJ1veAFe71DmE/Weqi3qbl9Jp%2BWfhSSCtlPnKIheoydBjmxWvUtEh9qV4RXkSil0AWr5P5f8V4jL/OnQQxXgTeOBhhsJK7140Iuc/kdtw0qP')
    message = json_2_dict(response_body).get('message', '')

    res = message == '成功'
    # The same text is printed either way; only the +/- label shows the outcome.
    if res:
        label = '+'
    else:
        label = '-'
    print('[{}] 分享微信成功!'.format(label))

    return res
Exemple #4
0
    def get_jump_to_url_and_is_hk(self, body):
        '''
        Detect a "_sign_direct_url" redirect in a page body and follow it.
        :param body: body of the page to inspect
        :return: (body, sign_direct_url, is_hk); when no redirect marker is present
                 the input body is returned with '' and False
        '''
        # No redirect marker: nothing to follow.
        if re.compile(r'_sign_direct_url = ').search(body) is None:
            return (body, '', False)

        # A redirect is present (noted in the original as typical for group-buy items).
        found = re.compile(r"_sign_direct_url = '(.*?)';").search(body)
        if found is not None:
            sign_direct_url = found.group(1)
            print('*** 获取到跳转地址为: ', sign_direct_url)
        else:
            sign_direct_url = ''
            print('获取跳转的地址时出错!')

        body = Requests.get_url_body(
            url=sign_direct_url,
            headers=self.headers,
            had_referer=True,
            ip_pool_type=self.ip_pool_type)

        # A redirect to the m.miyabaobei.hk host marks a global-purchase item.
        is_hk = re.compile(r'://m.miyabaobei.hk/').search(sign_direct_url) is not None
        if is_hk:
            print('*** 此商品为全球购商品!')

        return (body, sign_direct_url, is_hk)
Exemple #5
0
def turn_one_time() -> dict:
    '''
    POST once to the lottery start endpoint and return the parsed response.
    :return: the result of json_2_dict applied to the response body
    '''
    now_timestamp = str(datetime_to_timestamp(get_shanghai_time()))
    cookies = {
        'Hm_lpvt_fa0ddec29ac177a2d127cebe209832e3': now_timestamp,
        # fixed value
        'Hm_lvt_fa0ddec29ac177a2d127cebe209832e3': '1537161510,1537228200,1537353114,1537411854',
        # fixed value
        'wk_': '9umq63s8g6leobk2p285frmp583nhm9t',
    }
    headers = {
        'Host': 'm.riyiwk.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://m.riyiwk.com',
        'referer': 'https://m.riyiwk.com/lottery.html?check_login=1',
        'accept-language': 'zh-cn',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f/RIYIWK 2.6.0/USER_ID 203793/TOKEN 3a3988e07be98db064a70fc635c0b590',
    }
    response_body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url='https://m.riyiwk.com/lottery/start.html',
        headers=headers,
        cookies=cookies)

    return json_2_dict(response_body)
Exemple #6
0
    def _get_this_goods_member_id(self, goods_id):
        '''
        Read the memberId embedded in the offerRemark page of an item.
        :param goods_id: item id, sent as the offerId query parameter
        :return: the member id string, or '' when the body is empty or no memberId is found
        '''
        body = Requests.get_url_body(
            url='https://m.1688.com/page/offerRemark.htm',
            headers={
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': get_random_pc_ua(),
            },
            params=(('offerId', str(goods_id)),),
            ip_pool_type=self.ip_pool_type)
        if body == '':
            self.lg.error('获取到的body为空值!此处跳过!')
            return ''

        # Only the first "memberId" occurrence is used.
        found = re.compile(r'"memberId":"(.*?)",').search(body)
        if found is None:
            self.lg.error('获取member_id时索引异常!请检查!')
            return ''

        return found.group(1)
Exemple #7
0
    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Query the tmall mobile search endpoint for a keyword, sorted by 'd'.
        :param keyword: indexable value; element [1] is used as the search text
        :return: list of item url strings, eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
                 (urls, not goods ids); [] on any failure
        '''
        # Approach: search through the tmall mobile site (original note:
        # occasionally unstable but still usable).
        search_text = keyword[1]
        request_headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'list.tmall.com',
        }
        query = {
            'page_size': '20',
            'page_no': '1',
            'q': str(search_text),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        body = Requests.get_url_body(
            url='https://list.tmall.com/m/search_items.htm',
            headers=request_headers,
            params=query,
            ip_pool_type=self.ip_pool_type)
        if body == '':
            return []

        data = json_2_dict(json_str=body, logger=self.lg)
        if data == {}:
            self.lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(search_text))
            return []

        items = data.get('item', [])
        if items is None or items == []:
            self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(search_text))
            return []

        try:
            return [str(entry.get('url', '')) for entry in items]
        except Exception as e:
            self.lg.exception(e)
            self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(search_text))
            return []
Exemple #8
0
def fetch(pid):
    '''
    Request the json time service and return its 'datetime' field.
    :param pid: identifier that is only used in the printed line
    :return: the value stored under 'datetime' in the decoded response
    '''
    body = Requests.get_url_body(
        url='http://json-time.appspot.com/time.json',
        use_proxy=False)
    current_time = json.loads(body)['datetime']

    print('Process %s: %s' % (pid, current_time))

    return current_time
def bg_login(bg_username, bg_pwd):
    '''
    Log in to the admin backend, solving the captcha first.
    :param bg_username: value posted as txtUserName
    :param bg_pwd: value posted as txtPwd
    :return: the response body of the login POST
    '''
    # NOTE(review): username, pwd, id and referer are not defined in this
    # function; presumably module-level names (id would otherwise be the
    # builtin) - confirm against the rest of the file.
    validate = crack_wy_point_select_captcha(
        username=username,
        pwd=pwd,
        id=id,
        referer=referer)
    if validate != '':
        validate = unquote_plus(validate)
    print('获取到的validate:{}'.format(validate))

    request_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://120.26.119.135',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': get_random_pc_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://120.26.119.135/Login.aspx',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    form = {
        '__VIEWSTATE': '/wEPDwUKLTUwOTQ0NDQ3MWRk/ffecNvOMZIyPoiGxLPop3/5ERoE5/VlszxMMNbpijg=',
        '__VIEWSTATEGENERATOR': 'C2EE9ABB',
        'txtUserName': bg_username,
        'txtPwd': bg_pwd,
        # captcha validation string
        'NECaptchaValidate': validate,
        'btnLogin': '******',
    }
    session_cookies = {
        '_9755xjdesxxd_': '32',
        'gdxidpyhxdE': 'gjKCnDWASVwyJpOSGKLIaqHXYt0Qjq7Ycs7JzzLNWoZV2S%5CTam6fybIabIljeoL4JpfrI%2Bl6Xp9wLy5bHanMUDVPQdC3%2B3ihW%2BrP1cH6ktTTEvKfaPLQSHkkL5Wn7BpLALiek4J2Bq9nan1om%2B8dA%2FYyoxxDwX7vLusi5dLf%2Bni%2Fyrot%3A1536833525662',
    }

    body = Requests.get_url_body(
        method='post',
        url='http://admin.k85u.com/index.aspx',
        headers=request_headers,
        cookies=session_cookies,
        data=form,
        use_proxy=False)
    print(body)

    return body
Exemple #10
0
    def _get_div_desc(self, data):
        '''
        得到div_desc
        :param data:
        :return:
        '''
        def _get_right_body(body):
            '''得到main_body'''
            # 处理data-lazy-src
            body = re.compile(r'<img src=').sub('<img data-lazy-src=', body)
            body = re.compile(r'data-lazy-src=').sub('src=', body)
            body = re.compile(r'<img data-src=').sub('<img src=', body)
            body = re.compile(r';opacity:0').sub('', body)  # 不替换否则不显示图片
            # print(body)

            try:
                main_body = re.compile(r'<main .*?>(.*)</main>').findall(body)[0]
            except IndexError:
                main_body = re.compile(r'<body>(.*?)</body>').findall(body)[0]
                main_body = re.compile(r'<script.*?>.*?</script>').sub('', main_body)
            return main_body

        try:
            intros = data.get('good', {}).get('intros', [])[0]
        except IndexError:
            raise IndexError('获取intros获取异常!')

        tabs = intros.get('tabs', [])
        # pprint(tabs)
        div_desc_url = ''
        title_list = [
            '功能详情',
            '产品介绍',
            '概述',
            '商品详情',
        ]
        for item in tabs:
            if item.get('title', '') in title_list:
                div_desc_url = item.get('url', '')
                break

        if div_desc_url == '':
            raise ValueError('获取div_desc_url为空值!')

        body = Requests.get_url_body(url=div_desc_url, headers=self.headers, ip_pool_type=self.ip_pool_type)
        # self.lg.info(str(body))
        if body == '':
            raise ValueError('获取到的div_desc为空值!')

        div_desc = '<div>' + _get_right_body(body) + '</div>'
        # self.lg.info(str(div_desc))

        return div_desc
Exemple #11
0
    def _get_pintuan_goods_info(self):
        '''
        Page through the pintuan goods-list endpoint and collect the goods found.

        Pages 0..99 are requested in order. Paging stops at the first page whose
        body is empty or whose goods list is empty. Each goods_id is kept once
        (the first occurrence wins).
        :return: list of dicts with the keys goods_id, begin_time, end_time,
                 all_sell_count and page
        '''
        pintuan_goods_id_list = []
        # Ids already collected; gives O(1) duplicate checks instead of
        # rebuilding a list of every id for each new item.
        seen_goods_ids = set()
        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         high_conceal=True,
                                         ip_pool_type=self.ip_pool_type)
            # Explicit checks instead of assert: assert statements are removed
            # under `python -O`, which would keep the loop from ever stopping.
            if body == '':
                print('body为空值!')
                sleep(.5)
                break

            tmp_data = json_2_dict(json_str=body, default_res={}).get(
                'data', {}).get('goods', [])
            if not tmp_data:  # also covers a null goods field
                print('该tmp_url得到的goods为空list, 此处跳过!')
                sleep(.5)
                break
            sleep(.5)

            for item in tmp_data:
                goods_id = item.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append({
                    'goods_id': goods_id,
                    'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
                    'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
                    'all_sell_count': str(item.get('join_number_int', '')),
                    'page': page,
                })

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Exemple #12
0
    def _get_p_info(self, goods_id):
        '''
        Fetch the detail-info api response of an item and parse its p_info.
        :param goods_id: item id, appended as the itemId query value
        :return: (p_info, raw response body)
        '''
        api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId={}'.format(str(goods_id))
        raw_body = Requests.get_url_body(
            url=api_url,
            headers=self.headers,
            had_referer=True,
            ip_pool_type=self.ip_pool_type)
        assert raw_body != '', '获取到的tmp_p_info_body为空值, 请检查!'

        return self.get_goods_p_info(tmp_p_info_body=raw_body), raw_body
Exemple #13
0
    def _get_origin_comment_list(self, **kwargs) -> list:
        '''
        Collect the raw comment entries of an item from the offerRemark interface.
        :param kwargs: csrf, goods_id and cookies (each defaults to '')
        :return: concatenation of the 'model' lists of pages 1 .. self.max_page - 1
        '''
        csrf = kwargs.get('csrf', '')
        goods_id = kwargs.get('goods_id', '')
        cookies = kwargs.get('cookies', '')

        request_url = 'https://m.1688.com/page/offerRemark.htm'
        request_headers = {
            'cookie': cookies,
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
            'authority': 'm.1688.com',
            'x-requested-with': 'XMLHttpRequest',
        }

        collected = []
        for page_no in range(1, self.max_page):
            navigate_options = {
                'data': {
                    'bizType': 'trade',
                    'itemId': int(goods_id),
                    'offerId': str(goods_id),
                    'page': page_no,
                    'pageSize': 5,
                    'starLevel': 7
                }
            }
            # The '_' value is the current timestamp followed by three random digits.
            stamp = str(datetime_to_timestamp(get_shanghai_time()))
            random_suffix = str(get_random_int_number(start_num=100, end_num=999))
            query = (
                ('_csrf', csrf),
                ('__wing_navigate_type', 'view'),
                ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
                ('__wing_navigate_options', dumps(navigate_options)),
                ('_', stamp + random_suffix),
            )
            body = Requests.get_url_body(
                url=request_url,
                headers=request_headers,
                params=query,
                ip_pool_type=self.ip_pool_type)
            page_entries = json_2_dict(body, encoding='ascii').get('data', {}).get('model', [])
            pprint(page_entries)
            collected.extend(page_entries)
            sleep(.25)

        return collected
    def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
        """
        Fetch one page of comment info for an item.
        :param page_num: page number to request
        :param goods_id: item id, sent as productId and used in the referer
        :return: (comments, has_next_page)
        :raises AssertionError: when 'comments' or 'hasNext' is missing from the response
        """
        request_headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='', )
        referer = 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
        request_headers.update({'referer': referer})

        query = (
            ('productId', str(goods_id)),
            ('tagId', ''),
            ('page', str(page_num)),
            ('perPage', self.page_size),
        )
        body = Requests.get_url_body(
            url='https://th5.m.zhe800.com/app/detail/comment/list',
            headers=request_headers,
            params=query,
            ip_pool_type=self.ip_pool_type)
        parsed = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={})
        assert parsed.get('comments') is not None\
            and parsed.get('hasNext') is not None, '获取到的data为None, 出错goods_id: {}'.format(goods_id)

        # hasNext tells whether a further page of comments exists.
        has_next_page = parsed.get('hasNext', False)
        comments = parsed.get('comments', [])
        marker = '+' if comments != [] else '-'
        self.lg.info('[{}] page_num: {}'.format(marker, page_num))

        return comments, has_next_page
Exemple #15
0
    def get_true_sku_info(self, sku_info):
        '''
        Resolve price, spec value and stock for every sku.
        :param sku_info: list of dicts read for goods_id, color_name and img_url
        :return: self._data_error_init() when the response has no data,
                 otherwise (true_sku_info, i_s) where i_s is the stock mapping of
                 the last matched entry
        '''
        joined_ids = '-'.join([sku.get('goods_id') for sku in sku_info])
        tmp_body = Requests.get_url_body(
            url='https://p.mia.com/item/list/' + joined_ids,
            headers=self.headers,
            had_referer=True,
            ip_pool_type=self.ip_pool_type)

        remote_items = json_2_dict(json_str=tmp_body).get('data', [])
        if remote_items == []:
            return self._data_error_init()

        true_sku_info = []
        i_s = {}
        for sku in sku_info:
            for remote in remote_items:
                if sku.get('goods_id') != str(remote.get('id', '')):
                    continue

                i_s = remote.get('i_s', {})
                for size_key, rest_number in i_s.items():
                    # 'SINGLE' means there is no extra spec besides the colour.
                    if size_key == 'SINGLE':
                        spec_value = sku.get('color_name')
                    else:
                        spec_value = sku.get('color_name') + '|' + size_key

                    # Entries with zero stock are left out.
                    if rest_number == 0:
                        continue

                    true_sku_info.append({
                        'spec_value': spec_value,
                        'normal_price': str(remote.get('mp')),
                        'detail_price': str(remote.get('sp')),
                        'img_url': sku.get('img_url'),
                        'rest_number': rest_number,
                    })

        return (true_sku_info, i_s)
    def _get_aweme_api_videos_info(self, user_id):
        '''
        Request the aweme post list of a user and pass the body to deal_with_data.
        :param user_id: stored on self.user_id and sent as the user_id parameter
        :return: None
        '''
        self.user_id = user_id
        query = (
            ('user_id', self.user_id),
            ('max_cursor', '0'),
            ('count', '20'),
        )
        response_body = Requests.get_url_body(
            url='https://www.douyin.com/aweme/v1/aweme/post/',
            headers=self.headers,
            params=query)

        self.deal_with_data(body=response_body)
Exemple #17
0
    def get_pintuan_goods_info(self):
        '''
        Walk the groupon common_list pages, collect the goods and pass them to deal_with_data.
        :return: None
        '''
        goods_list = []
        # Original note: index 0 and 1 return the same page, so start at 1.
        for index in range(1, 1000):
            tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
            print('正在抓取: ', tmp_url)

            body = Requests.get_url_body(
                url=tmp_url,
                headers=self.headers,
                had_referer=True,
                high_conceal=True,
                ip_pool_type=self.ip_pool_type)
            if body == '':
                print('获取到的body为空值! 此处跳过')
                continue

            tmp_data = json_2_dict(json_str=body)
            if tmp_data == {}:
                print('json.loads转换body时出错, 此处跳过!')

            # An empty data_list ends the paging.
            if tmp_data.get('data_list', []) == []:
                print('得到的data_list为[], 此处跳过!')
                break

            goods_list.extend([
                {
                    'goods_id': entry.get('sku', ''),
                    'sub_title': entry.get('intro', ''),
                    'pid': index,
                }
                for entry in tmp_data.get('data_list', [])
            ])
            sleep(.5)

        pprint(goods_list)
        self.deal_with_data(goods_list=goods_list)
        sleep(8)
        return None
Exemple #18
0
def _get_66_ip_list():
    '''
    Fetch proxy ips from 66ip.cn and store them in the global a_66_ip.
    :return: the list produced by delete_list_null_str from the '<br />'-separated page section
    '''
    global a_66_ip
    request_headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://www.66ip.cn/nm.html',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    query = (
        ('getnum', ''),
        ('isp', '0'),
        ('anonymoustype', '3'),
        ('start', ''),
        ('ports', ''),
        ('export', ''),
        ('ipaddress', ''),
        ('area', '0'),
        ('proxytype', '2'),
        ('api', '66ip'),
    )

    response = requests.get(
        'http://www.66ip.cn/nmtq.php',
        headers=request_headers,
        params=query,
        cookies=None)
    # The page content is decoded as gbk before washing.
    body = Requests._wash_html(response.content.decode('gbk'))

    # Section between the first </script> and the last </div>.
    section = re.compile(r'</script>(.*)</div>').search(body)
    part = section.group(1) if section is not None else ''
    part = re.compile('<script>.*?</script>|</div>.*</div>').sub('', part)

    ip_list = delete_list_null_str(part.split('<br />'))
    if ip_list != []:
        a_66_ip = ip_list
    else:
        a_66_ip = []

    return ip_list
Exemple #19
0
    async def _search(self, search_key) -> list:
        '''
        Search the tianyancha mobile site for a key and read fields of a result.

        NOTE(review): in the code visible here the success path ends without a
        return statement and search_list is never filled - the function looks
        truncated; confirm against the full file.
        :param search_key: key to search for
        :return: [] when the body is empty or a required field is missing
        '''
        headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_phone_ua(),
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://m.tianyancha.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        params = (('key', str(search_key)), )
        url = 'https://m.tianyancha.com/search'
        body = Requests.get_url_body(url=url,
                                     headers=headers,
                                     params=params,
                                     cookies=None)
        # print(body)
        if body == '':
            return []

        search_list = []
        try:
            # div.new-border-bottom
            # NOTE(review): search_res is the first ::text node of the result
            # container, and it is then parsed again as markup below - verify
            # that this is intended.
            search_res = Selector(text=body).css(
                'div.search_result_container ::text').extract_first() or ''
            company_name = Selector(
                text=search_res).css('div.new-border-bottom a span text ::text'
                                     ).extract_first() or ''
            assert company_name != '', 'company_name为空值!'
            # Note: url is reused here and now holds the company link.
            url = Selector(text=search_res).css(
                'div.new-border-bottom a ::attr("href")').extract_first() or ''
            assert url != '', 'url为空值!'
            legal_person = Selector(text=search_res).css(
                'a.legalPersonName ::text').extract_first() or ''
            legal_person_url = Selector(text=search_res).css(
                'a.legalPersonName ::attr("href")').extract_first() or ''
            # The href is prefixed with the site host unless it is empty.
            legal_person_url = 'https://m.tianyancha.com' + legal_person_url if legal_person_url != '' else ''

        except AssertionError as e:
            # A missing company name or url aborts the search with an empty result.
            print(e)
            return []
Exemple #20
0
    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # 根据京东手机版商品评价获取
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'

            params = self._set_params(goods_id=goods_id, current_page=current_page)
            body = Requests.get_url_body(url=_url, headers=self.headers, params=params, ip_pool_type=self.ip_pool_type)
            # self.lg.info(str(body))

            _data = json_2_dict(json_str=body, logger=self.lg).get('wareDetailComment', {}).get('commentInfoList', [])
            if _data == []:
                self.lg.error('出错goods_id:{0}'.format(self.goods_id))

            _tmp_comment_list += _data

            sleep(self.comment_page_switch_sleep_time)

        # pprint(_tmp_comment_list)
        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.lg.error('出错goods_id:{0}'.format(goods_id))
            self.lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)

        return self.result_data
Exemple #21
0
    def get_stock_info_dict(self, goods_id):
        '''
        Fetch the realtime stock info of an item.
        :param goods_id: item id, interpolated into the realtime_info url
        :return: the 'data' value of the response ({} when it is absent)
        '''
        info_url = 'https://pina.m.zhe800.com/cns/products/{}/realtime_info.json'.format(str(goods_id))
        raw_body = Requests.get_url_body(
            url=info_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if raw_body == '':
            print('获取到的stock_info_body为空值!')
            raw_body = '{}'

        stock_info = json_2_dict(json_str=raw_body).get('data', {})
        if stock_info == {}:
            # Reset so that stale data is not stored by a later crawl step.
            self.result_data = {}

        return stock_info
Exemple #22
0
    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        """
        Collect goods ids for a keyword from the m.1688.com search page.

        A single request is made with ``sortType=booked`` and
        ``descendOrder=true``; the ``data-offer-id`` attribute of every
        ``div.list_group-item`` element in the response is returned.

        :param keyword: indexable value; ``keyword[1]`` is the search text
        :return: a list of ids, eg: ['11111', ...]; ``[]`` when the body is
            empty or the extraction raises
        """
        # Approach 1: scrape the m.1688.com search page, first page only.
        request_headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # No 'cookie' header is sent.
        }
        query = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )

        page_html = Requests.get_url_body(
            url='https://m.1688.com/offer_search/-6161.html',
            headers=request_headers,
            params=query,
            ip_pool_type=self.ip_pool_type,
        )
        if page_html == '':
            return []

        try:
            offer_ids = Selector(text=page_html).css(
                'div.list_group-item::attr("data-offer-id")').extract()
        except Exception as e:
            self.lg.exception(e)
            self.lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(
                keyword[1]))
            offer_ids = []

        return offer_ids
    def _get_pintuan_goods_info(self):
        """
        Walk the juanpi group-buy list API and collect the listed goods.

        Pages 0..99 are requested in order; paging stops at the first page
        whose ``data.goods`` value is empty or cannot be decoded. Each goods
        id is kept once, using the first record seen for it.

        :return: list of dicts with the keys ``goods_id``, ``begin_time``,
            ``end_time``, ``all_sell_count`` and ``page``
        """
        pintuan_goods_id_list = []
        # Ids already collected; a set keeps the duplicate check O(1) per item
        # instead of rebuilding a list of every collected id each time.
        seen_goods_ids = set()

        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page)
            )
            print('正在抓取的页面地址为: ', tmp_url)

            body = Requests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True, ip_pool_type=self.ip_pool_type)
            if body == '':
                body = '{}'
            try:
                tmp_data = json.loads(body).get('data', {}).get('goods', [])
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON, or a payload that is not the expected
                # dict-of-dicts; treated like an empty page.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            sleep(.5)

            # Falsy covers [] as well as a null ``goods`` value.
            if not tmp_data:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                break

            for item in tmp_data:
                goods_id = item.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append({
                    'goods_id': goods_id,
                    'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
                    'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
                    'all_sell_count': str(item.get('join_number_int', '')),
                    'page': page,
                })

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Exemple #24
0
    async def _get_all_ame_from_office(self) -> list:
        """
        Fetch the latest nationwide area codes from http://xzqh.mca.gov.cn/map.

        The ``value`` attribute of the page's ``input#pyArr`` element is
        decoded with ``json_2_dict`` (``default_res=[]``); the result is
        stored on ``self.ame_list`` and returned.

        :return: the decoded data
        """
        request_headers = await self._get_headers()
        page_html = Requests.get_url_body(
            url='http://xzqh.mca.gov.cn/map',
            headers=request_headers,
            ip_pool_type=self.ip_pool_type,
        )
        # Related page: http://www.mca.gov.cn/article/sj/tjbz/a/2018/201803131439.html
        raw_value = Selector(text=page_html).css(
            'table.select_table td input#pyArr ::attr("value")').extract_first()
        area_codes = json_2_dict(json_str=raw_value, default_res=[])

        print('总计邮编个数: {}'.format(len(area_codes)))
        self.ame_list = area_codes

        return area_codes
    def _get_one_page_comment_info(self, goods_id, page_num) -> list:
        """
        Fetch a single page of comments for a JD product.

        :param goods_id: product sku; used in the Referer header and as the
            ``sku`` query parameter
        :param page_num: page index, sent as the ``page`` query parameter
        :return: the ``result.comments`` value of the decoded payload, or
            ``[]`` when the body contains no parenthesised payload
        :raises AssertionError: when the response body is an empty string
        """
        headers = {
            'Referer':
            'https://item.m.jd.com/product/{}.html'.format(goods_id),
            'User-Agent': get_random_phone_ua(),
        }
        params = (
            ('sorttype', '5'),
            ('pagesize', '10'),
            ('sceneval', '2'),
            ('score', '3'),  # positive reviews only
            ('sku', str(goods_id)),
            ('page', str(page_num)),
        )
        url = 'https://wq.jd.com/commodity/comment/getcommentlist'
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type,
        )
        assert body != '', 'body不为空值!'

        data = []
        try:
            # The payload is wrapped in parentheses; take what is inside them.
            # Raw string: '\(' is an invalid escape in a normal string literal.
            payload = re.compile(r'\((.*)\)').findall(body)[0]
            data = json_2_dict(
                json_str=payload,
                default_res={}).get('result', {}).get('comments', [])
        except IndexError:
            # No parenthesised payload in the body: report an empty page.
            pass

        self.lg.info('[{}] page_num: {}'.format(
            '+' if data != [] else '-',
            page_num,
        ))

        return data
Exemple #26
0
    def get_one_page_goods_info(self, *params):
        """
        Request one page of the chuchujie goods API and return the raw body.

        :param params: exactly two positional values, ``(gender, page)``;
            gender is '0' for female and '1' for male
        :return: the response body, or '{}' when the body is an empty string
        """
        gender, page = params

        # Key order is kept as-is because it determines the json.dumps output.
        client_json = json.dumps({
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        })
        query_json = json.dumps({
            "group": 4,
            "module": "99",
            "page": page,
            "tab": "all"
        })

        # Note: these are query-string parameters and are sent on a GET;
        # data meant to be posted would need a POST request instead.
        query_string = {
            'client': client_json,
            'query': query_json,
            'page': page
        }

        response_body = Requests.get_url_body(
            url='https://api.chuchujie.com/api/',
            headers=self.headers,
            params=query_string,
            ip_pool_type=self.ip_pool_type,
        )

        return '{}' if response_body == '' else response_body
Exemple #27
0
    def _get_seller_id(self, _type, goods_id):
        """
        Extract the seller id from a Tmall mobile item page.

        :param _type: site selector; ``2`` requests detail.m.tmall.hk,
            any other value requests detail.m.tmall.com
        :param goods_id: item id, sent as the ``id`` query parameter
        :return: the seller id as a string of digits
        :raises AssertionError: when no ``"userId":<digits>,`` fragment is
            found in the page, or the id found is '0'
        """
        # TODO: a TmallParse-based lookup was used here before; per the
        # original note it conflicts with the update-script interface, so the
        # page is scraped directly instead (approach 2).
        headers = self._get_phone_headers()
        headers.update({
            'authority': 'detail.m.tmall.com',
        })
        # Testing showed the required cookie fields are: _tb_token_, cookie2, t
        params = (('id', goods_id), )
        # Tmall Global items live on the .hk domain.
        url = 'https://detail.m.tmall.com/item.htm' if _type != 2 else 'https://detail.m.tmall.hk/item.htm'
        body = Requests.get_url_body(url=url,
                                     headers=headers,
                                     params=params,
                                     ip_pool_type=self.ip_pool_type,
                                     cookies=self.login_cookies_dict)

        try:
            # Raw string: '\d' is an invalid escape in a normal string literal.
            matches = re.compile(r'"userId":(\d+),').findall(body)
        except TypeError:
            # body was not a string; handled like "no match".
            matches = []
        seller_id = matches[0] if matches else '0'

        assert seller_id != '0', '获取到的seller_id为0!'

        return seller_id
    def traversal_hour_timestamp(self, item):
        """
        Fetch the mogujie flash-sale list for one event time and process it.

        The response is expected to be wrapped as ``null(<json>)``. The
        ``data.list`` value of the decoded JSON is handed to
        ``self.deal_with_data`` together with ``item``, followed by a sleep
        of ``MOGUJIE_SLEEP_TIME``. Nothing is processed when the body is
        empty, does not match the wrapper, or decodes to ``{}``.

        :param item: the event time; stringified into the ``eventTime``
            query parameter
        :return: None
        """
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(item))
        body = Requests.get_url_body(url=tmp_url,
                                     headers=self.headers,
                                     had_referer=True,
                                     ip_pool_type=self.ip_pool_type)

        if body == '':
            print('item为: ', item)
            print('获取到的body为空值! 此处跳过')
            return

        try:
            # Raw string: '\(' is an invalid escape in a normal string literal.
            body = re.compile(r'null\((.*)\)').findall(body)[0]
        except (IndexError, TypeError):
            # No ``null(...)`` wrapper found, or body is not a string.
            print('re匹配body中的数据时出错!')
            body = '{}'

        try:
            tmp_data = json.loads(body)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            print('json.loads转换body时出错, 此处跳过!')
            tmp_data = {}

        if tmp_data == {}:
            print('tmp_data为空{}!')
            return

        event_time = item
        item_list = tmp_data.get('data', {}).get('list', [])

        self.deal_with_data(event_time, item_list)
        sleep(MOGUJIE_SLEEP_TIME)
Exemple #29
0
    def _get_pintuan_goods_info(self):
        """
        Walk the zhe800 group-buy deals API and collect the product zids.

        Pages 0..99 are requested in order; paging stops at the first page
        whose ``objects`` value is empty or cannot be decoded. Entries whose
        ``product.zid`` is missing or empty are dropped.

        :return: list of unique ``(zid, page)`` tuples, in no particular order
        """
        unique_zids = set()
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            tmp_body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             high_conceal=True,
                                             ip_pool_type=self.ip_pool_type)
            if tmp_body == '':
                tmp_body = '{}'
            try:
                tmp_data = json.loads(tmp_body).get('objects', [])
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON, or a payload that is not a dict; treated like
                # an empty page.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            # Falsy covers [] as well as a null ``objects`` value.
            if not tmp_data:
                print('该tmp_url得到的object为空list, 此处跳过!')
                break

            for item in tmp_data:
                zid = item.get('product', {}).get('zid', '')
                # Test the zid itself: a (zid, page) tuple never equals '',
                # so checking the tuple would let empty zids through.
                if zid != '':
                    unique_zids.add((zid, page))

        zid_list = list(unique_zids)
        print('该zid_list的总个数为: ', len(zid_list))
        print(zid_list)

        return zid_list
Exemple #30
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        mia_base_number = MIA_BASE_NUMBER
        self.db_goods_id_list = self._get_db_goods_id_list()
        assert self.db_goods_id_list is not None, 'self.db_goods_id_list为空值!'
        while mia_base_number < MIA_MAX_NUMBER:
            tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                mia_base_number)
            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         had_referer=True,
                                         ip_pool_type=self.ip_pool_type)
            # print(body)
            if body == '' or body == '[]':
                print('mia_base_number为: ', mia_base_number)
                print('获取到的body为空值! 此处跳过')
                mia_base_number += 1
                continue

            else:
                tmp_data = json_2_dict(body, default_res={})
                tmp_hour = tmp_data.get('p_info', {}).get('start_time',
                                                          '')[11:13]
                if tmp_hour == '22':  # 过滤掉秒杀时间为22点的
                    print('--- 销售时间为22点,不抓取!')
                    pass
                else:
                    print(tmp_data)
                    print('mia_base_number为: ', mia_base_number)
                    pid = mia_base_number
                    begin_time = tmp_data.get('p_info',
                                              {}).get('start_time', '')
                    end_time = tmp_data.get('p_info', {}).get('end_time', '')
                    item_list = tmp_data.get('item_list', [])

                    self.deal_with_data(pid, begin_time, end_time, item_list)
                    sleep(5)

            mia_base_number += 1