Ejemplo n.º 1
0
    def get_one_page_all_goods_list(self, *params):
        '''
        Fetch one page of the Jumei deal-list API and return its goods.

        :param params: positional arguments; only ``params[0]`` (the page
            number) is read
        :return: list of ``{'goods_id', 'type', 'page'}`` dicts, unique by
            ``goods_id`` and in response order (``[]`` when the page has no
            items), or the str '网络错误!' when the body is not a JSON object
        '''
        page = params[0]
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            # JSONDecodeError is a ValueError; TypeError covers a non-str body.
            json_body = None

        if not isinstance(json_body, dict):
            # Anything but a JSON object has no 'item_list' to read.
            print('json.loads转换body时出错!请检查')
            return '网络错误!'

        this_page_item_list = json_body.get('item_list', [])
        if not this_page_item_list:
            return []

        # Track the ids already emitted in a set so de-duplication stays
        # linear instead of rebuilding an id list for every item.
        seen_goods_ids = set()
        all_goods_list = []
        for item in this_page_item_list:
            item_id = item.get('item_id')
            if item_id is None:
                # Items without an id cannot be addressed later; drop them.
                continue

            goods_id = str(item_id)
            if goods_id in seen_goods_ids:
                continue

            seen_goods_ids.add(goods_id)
            all_goods_list.append({
                'goods_id': goods_id,
                'type': item.get('type', ''),
                'page': page,
            })

        return all_goods_list
Ejemplo n.º 2
0
    def get_one_page_goods_info(self, *params):
        '''
        Request one page of the chuchujie goods API and return the raw body.

        :param params: exactly two positional values, ``(gender, page)``
        :return: the response body; the str '{}' when the body is empty
        '''
        gender, page = params

        client_json = json.dumps({
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> female | '1' -> male
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        })
        query_json = json.dumps({
            "group": 4,
            "module": "99",
            "page": page,
            "tab": "all"
        })

        # Note kept from the original author: query-string parameters can be
        # sent encoded like this; data meant to be POSTed needs the post
        # method instead.
        response_body = MyRequests.get_url_body(
            url='https://api.chuchujie.com/api/',
            headers=self.headers,
            params={
                'client': client_json,
                'query': query_json,
                'page': page
            })

        return '{}' if response_body == '' else response_body
Ejemplo n.º 3
0
    def _get_pintuan_goods_info(self):
        '''
        Walk the zhe800 group-buy listing API and collect the product zids.

        Pages 0..99 are requested in order; the walk stops at the first page
        whose 'objects' list is empty or cannot be decoded.

        :return: list of unique ``(zid, page)`` tuples (order not guaranteed,
            because duplicates are removed through a set)
        '''
        unique_zids = set()
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            tmp_body = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers,
                                               high_conceal=True)
            if tmp_body == '':
                tmp_body = '{}'
            try:
                tmp_data = json.loads(tmp_body).get('objects', [])
            except (TypeError, ValueError, AttributeError):
                # ValueError: invalid JSON; TypeError: non-str body;
                # AttributeError: the decoded JSON is not an object.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            if not tmp_data:
                print('该tmp_url得到的object为空list, 此处跳过!')
                break

            for item in tmp_data:
                zid = (item.get('product') or {}).get('zid', '')
                # The check has to look at the zid itself: the old code
                # compared the whole (zid, page) tuple with '', which never
                # matched, so empty zids were collected as well.
                if zid is None or zid == '':
                    continue
                unique_zids.add((zid, page))

        zid_list = list(unique_zids)
        print('该zid_list的总个数为: ', len(zid_list))
        print(zid_list)

        return zid_list
Ejemplo n.º 4
0
    def _get_wm_page_info(self):
        '''
        Collect the takeaway recommendation rows from the quanmama API
        (pages 1 to 4), then parse them and hand the result on for processing.

        :return: None
        '''
        api_url = 'https://app.quanmama.com/apios/v5/appZdmList.ashx'
        collected_rows = []

        print('开始采集券妈妈外卖券!')
        for page_no in range(1, 5):
            print('正在抓取第{0}页...'.format(page_no))
            form_data = self._set_data(page_index=page_no)
            response_body = MyRequests.get_url_body(method='post',
                                                    url=api_url,
                                                    headers=self.headers,
                                                    cookies=None,
                                                    data=form_data)
            if response_body == '':
                print('获取到的body为空值!此处跳过!')
                continue

            parsed = json_2_dict(json_str=response_body)
            page_rows = parsed.get('data', {}).get('rows', [])
            if page_rows == []:
                print('得到的rows为空值!此处跳过!')
                continue

            collected_rows.extend(page_rows)
            # The pause only happens after a page that produced rows.
            sleep(self.page_sleep_time)

        print('\n@@@@@@ 抓取完毕!')
        self._deal_with_wm_info(self._parse_wm_page(collected_rows))
Ejemplo n.º 5
0
def getAllExternalLinks(siteUrl, _out_file=None):
    """
    Recursively crawl ``siteUrl`` and record every new link it leads to.

    New external links are added to the module-level ``allExtLinks`` set and
    new internal links to ``allIntLinks``; each new link is also written as
    one line to ``result.txt``.  Every new internal link is crawled in turn.

    :param siteUrl: address of the page to start from
    :param _out_file: internal - the already opened result file shared by the
        recursive calls.  External callers leave it as None.
    :return: None
    """
    if _out_file is None:
        # Open the result file exactly once, in the outermost call.  The old
        # code reopened it in 'w' mode on every recursion level, which
        # truncated what the callers had written and never closed any of the
        # handles.
        with open('result.txt', 'w', encoding='utf-8') as out_file:
            getAllExternalLinks(siteUrl, out_file)
        return

    parsed_url = urlparse(siteUrl)
    domain = parsed_url.scheme + "://" + parsed_url.netloc
    html = MyRequests.get_url_body(url=siteUrl, headers=headers)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)

    # Collect the external links.
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            _out_file.write(link + '\n')
            print("即将获取的外部链接的URL是:" + link)

    # Collect the internal links and descend into each new one.
    for link in internalLinks:
        if link not in allIntLinks:
            print("即将获取内部链接的URL是:" + link)
            allIntLinks.add(link)
            getAllExternalLinks(link, _out_file)
            _out_file.write(link + '\n')
def run_forever():
    """
    Probe share-page addresses for consecutive video ids, resuming from the
    index stored in ./setting.txt.

    Every 50th index is written back to ./setting.txt, followed by a
    2 second pause.  Each fetched body is passed to ``deal_with_data``; the
    0.2 second pause between requests is skipped when that call returns
    exactly False.
    """
    checkpoint_path = './setting.txt'
    with open(checkpoint_path, 'r') as checkpoint:
        first_index = int(checkpoint.readline())

    # Smallest id of the probed range: '65' followed by 17 zeros.
    id_base = int('65' + 17 * '0')

    for offset in range(first_index, 99999999999999999):
        if not offset % 50:
            with open(checkpoint_path, 'w') as checkpoint:
                checkpoint.write(str(offset))
            print('*** 短暂休眠...')
            sleep(2)

        video_id = str(id_base + offset)
        body = MyRequests.get_url_body(
            url='https://www.iesdouyin.com/share/video/' + video_id + '/',
            headers=headers,
            params=params)

        if deal_with_data(video_id=video_id, body=body) is not False:
            sleep(.2)
Ejemplo n.º 7
0
    def _get_pc_goods_body(self, url, goods_id):
        '''
        Request the PC goods page and return its body.

        :param url: address of the page to request
        :param goods_id: not used by this method
        :return: the value returned by MyRequests.get_url_body for ``url``
        '''
        # No cookie header is sent; the user agent is picked per call.
        request_headers = {
            'authority': 'goods.kaola.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

        # Fixed query parameters, kept as an ordered tuple of (key, value)
        # pairs.  The optional 'srId' and anti-cheat token/validate
        # parameters are deliberately left out.
        query_pairs = tuple({
            'ri': 'navigation',
            'from': 'page1',
            'zn': 'result',
            'zp': 'page1-0',
            'position': '0',
            'istext': '0',
            'isMarketPriceShow': 'true',
            'hcAntiCheatSwitch': '0',
            'anstipamActiCheatSwitch': '1',
        }.items())

        return MyRequests.get_url_body(url=url,
                                       headers=request_headers,
                                       params=query_pairs)
Ejemplo n.º 8
0
    def _get_shop_name(self, **kwargs):
        '''
        Look up the shop name of the seller referenced by the product data.

        :param kwargs: ``data`` - the product dict; the seller id is read
            from ``data['/app/detail/product/base']['sellerId']`` (0 when
            absent)
        :return: the seller's nickName as str ('' when the response has
            none), or ``{}`` when the request body is empty or cannot be
            converted - callers tell the two cases apart by type
        '''
        data = kwargs.get('data', {})

        seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
        tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(
            seller_id)
        seller_info_body = MyRequests.get_url_body(url=tmp_seller_id_url,
                                                   headers=self.headers,
                                                   high_conceal=True)
        if seller_info_body == '':
            print('seller_info为空!')
            return {}

        # The body is parsed directly; the previous list wrapping and string
        # re-joining were no-ops that left an unreachable fallback branch.
        seller_info = json_2_dict(json_str=seller_info_body)
        if seller_info == {}:
            print('卖家信息在转换时出现错误, 此处跳过')
            return {}

        return seller_info.get('sellerInfo', {}).get('nickName', '')
Ejemplo n.º 9
0
    def get_div_desc_body(self, goods_id):
        '''
        Fetch the detail-content JSON for a goods id and return its cleaned
        'data' field wrapped in a <div>.

        :param goods_id: value appended to the ``zid`` query parameter
        :return: '<div>...</div>' str, or the cleaned value unchanged when it
            is ''.  When the response carries no 'data', self.result_data is
            reset to {} as a side effect.
        '''
        detail_url = 'https://pina.m.zhe800.com/nnc/product/detail_content.json?zid=' + str(goods_id)

        raw_body = MyRequests.get_url_body(url=detail_url, headers=self.headers, high_conceal=True)
        if raw_body == '':
            raw_body = '{}'

        content = json_2_dict(json_str=raw_body).get('data', '')
        if content == '':
            # Reset so a stale result cannot affect what is stored for the
            # next crawl.
            self.result_data = {}

        washed = self._wash_div_desc(tmp_body=content)
        if washed == '':
            return washed

        return '<div>' + washed + '</div>'
Ejemplo n.º 10
0
    def get_goods_data(self, goods_id: str) -> dict:
        '''
        Fetch and parse one Mogujie flash-sale ("rushdetail") goods page.

        The original signature carried the note "overrides the data-fetching
        method" in place of a return annotation.

        :param goods_id: despite the name, the full PC page address of the
            form ``https://shop.mogujie.com/rushdetail/<id>?...``; the real
            goods id is extracted from it
        :return: dict with the keys title, sub_title, shop_name, all_img_url,
            p_info, div_desc, detail_name_list, price_info_list, price (the
            highest sku price) and taobao_price (the lowest sku price);
            ``{}`` on any failure.  self.result_data is always set to the
            returned value.
        '''
        def _fail():
            # Reset the cached result so a failed crawl cannot affect what is
            # stored for the next one.
            self.result_data = {}
            return {}

        if goods_id == '':
            return _fail()

        if '/rushdetail/' not in goods_id:
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            return _fail()

        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)

        # The id is the path segment between '/rushdetail/' and the '?'.
        # Raw string: '\?' is an invalid escape in a normal string literal.
        id_matches = re.compile(r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(tmp_url)
        if not id_matches:
            # An address without a query string used to raise an uncaught
            # IndexError here; it is now reported like any other bad address.
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            return _fail()
        goods_id = id_matches[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        if body == '':
            print('获取到的body为空str!')
            return _fail()

        data = {}
        try:
            # The page embeds its data as a JS object literal; cut out the
            # three sub-objects that are needed.
            goods_info = re.compile(r'var detailInfo = (.*?);</script>').findall(body)[0]
            item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(goods_info)[0]
            sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(goods_info)[0]
            shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(goods_info)[0]

            item_info = json_2_dict(json_str=item_info)
            sku_info = json_2_dict(json_str=sku_info)
            shop_info = json_2_dict(json_str=shop_info)

            data['title'] = item_info.get('title', '')
            if data['title'] == '':
                print('title为空!')
                raise Exception

            data['sub_title'] = ''
            data['shop_name'] = shop_info.get('name', '')

            # All sample images.
            data['all_img_url'] = [{'img_url': item} for item in item_info.get('topImages', [])]

            # p_info and div_desc are both derived from the detail-info API.
            p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(goods_id)
            tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url, headers=self.headers, had_referer=True)
            if tmp_p_info_body == '':
                print('获取到的tmp_p_info_body为空值, 请检查!')
                raise Exception

            # An empty p_info is accepted: some goods have none.
            data['p_info'] = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)

            div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
            if div_desc == '':
                print('获取到的div_desc为空str, 请检查!')
                return _fail()
            data['div_desc'] = div_desc

            detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
            if detail_name_list == '':
                print('获取detail_name_list出错, 请检查!')
                return _fail()
            data['detail_name_list'] = detail_name_list

            # Price, spec and stock of every sku.
            price_info_list = self.get_price_info_list(sku_info=sku_info)
            if price_info_list == '':
                raise Exception
            data['price_info_list'] = price_info_list

            if price_info_list == []:
                # Sold out: flag the goods as logically deleted.
                print('该商品已售完,此处将商品状态改为1')
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                try:
                    sql_str = r'update dbo.mogujie_xianshimiaosha set is_delete=1 where goods_id = %s'
                    # (goods_id,) - the old "(goods_id)" was a plain str, not
                    # a one-element tuple.
                    my_pipeline._update_table(sql_str=sql_str, params=(goods_id,))
                except Exception as e:
                    print('将该商品逻辑删除时出错!', e)
                else:
                    # Only claim success when the update did not raise.
                    print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
                return _fail()

            # Highest sku price -> price, lowest -> taobao_price.
            try:
                tmp_price_list = sorted(
                    round(float(item.get('detail_price', '')), 2) for item in price_info_list)
                price = round(Decimal(tmp_price_list[-1]), 2)
                taobao_price = round(Decimal(tmp_price_list[0]), 2)
            except IndexError:
                print('获取price和taobao_price时出错! 请检查')
                raise Exception

            data['price'] = price
            data['taobao_price'] = taobao_price

        except Exception as e:
            print('遇到错误: ', e)
            return _fail()

        self.result_data = data
        return data
Ejemplo n.º 11
0
    def _get_div_desc(self, **kwargs):
        '''
        处理detail_data转换成能被html显示页面信息
        :param kwargs:
        :return:
        '''
        detail = kwargs.get('detail')
        goods_id = kwargs.get('goods_id')
        tmp_div_desc = ''
        if isinstance(detail, dict):
            if detail.get('detailImages') is not None:
                for item in detail.get('detailImages', []):
                    tmp_big = item.get('big', '')
                    tmp_height = item.get('height', 0)
                    tmp_width = item.get('width', 0)
                    # tmp = r'<img src="{}" style="height:{}px;width:{}px;"/>'.format(tmp_big, tmp_height, tmp_width)
                    tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                        tmp_big)
                    tmp_div_desc += tmp

            if detail.get('noticeImage') is not None:
                if isinstance(detail.get('noticeImage'), dict):
                    item = detail.get('noticeImage')
                    tmp_image = item.get('image', '')
                    tmp_height = item.get('height', 0)
                    tmp_width = item.get('width', 0)
                    # tmp = r'<img src="{}" style="height:{}px;width:{}px;"/>'.format(tmp_image, tmp_height, tmp_width)
                    tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                        tmp_image)
                    tmp_div_desc += tmp
                elif isinstance(detail.get('noticeImage'), list):
                    for item in detail.get('noticeImage', []):
                        tmp_image = item.get('image', '')
                        tmp_height = item.get('height', 0)
                        tmp_width = item.get('width', 0)
                        # tmp = r'<img src="{}" style="height:{}px;width:{}px;"/>'.format(tmp_image, tmp_height, tmp_width)
                        tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                            tmp_image)
                        tmp_div_desc += tmp
                else:
                    pass
                '''
                处理有尺码的情况(将其加入到div_desc中)
                '''
                tmp_size_url = 'https://th5.m.zhe800.com/app/detail/product/size?productId=' + str(
                    goods_id)
                size_data_body = MyRequests.get_url_body(url=tmp_size_url,
                                                         headers=self.headers,
                                                         high_conceal=True)
                if size_data_body == '':
                    print('size_data为空!')
                    return ''

                else:
                    size_data = [size_data_body]

                if size_data != []:
                    size_data = json_2_dict(json_str=size_data[0])
                    if size_data == {}:
                        print('json.loads(size_data)出错, 此处跳过')
                        return ''
                    # pprint(size_data)

                    tmp_div_desc_2 = ''
                    if size_data is not None:
                        charts = size_data.get('charts', [])
                        for item in charts:
                            # print(item)
                            tmp = ''
                            charts_data = item.get('data', [])  # table
                            title = item.get('title', '')
                            for item2 in charts_data:  # item为一个list
                                # print(item2)
                                charts_item = ''
                                for i in item2:  # i为一个dict
                                    # print(i)
                                    data_value = i.get('value', '')
                                    tmp_1 = '<td style="vertical-align:inherit;display:table-cell;font-size:12px;color:#666;border:#666 1px solid;">{}</td>'.format(
                                        data_value)
                                    charts_item += tmp_1
                                charts_item = '<tr style="border:#666 1px solid;">' + charts_item + '</tr>'
                                # print(charts_item)
                                tmp += charts_item
                            tmp = '<div>' + '<strong style="color:#666;">' + title + '</strong>' + '<table style="border-color:grey;border-collapse:collapse;text-align:center;line-height:25px;background:#fff;border-spacing:0;border:#666 1px solid;"><tbody style="border:#666 1px solid;">' + tmp + '</tbody></table></div><br>'
                            tmp_div_desc_2 += tmp
                        # print(tmp_div_desc_2)
                    else:
                        pass
                else:
                    tmp_div_desc_2 = ''

            else:
                tmp_div_desc_2 = ''
                pass
            tmp_div_desc = tmp_div_desc_2 + '<div>' + tmp_div_desc + '</div>'

        return tmp_div_desc
Ejemplo n.º 12
0
    def get_goods_data(self, goods_id):
        '''
        Collect the zhe800 product data for one goods id.

        Four endpoints are queried in order: the product detail, the graph
        detail (turned into HTML by ``_get_div_desc``), the seller info
        (through ``_get_shop_name``) and the schedule/stock status.

        :param goods_id: id appended as ``productId`` to the requests
        :return: the product dict, extended with
            '/app/detail/graph/detail', 'shop_name', 'schedule' and 'stock';
            ``{}`` as soon as any step fails.  self.result_data is always
            set to the returned value.
        '''
        def _abort():
            # Reset the cached result so a failed crawl cannot affect what is
            # stored for the next one.
            self.result_data = {}
            return {}

        if goods_id == '':
            return _abort()

        tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(
            goods_id)
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       high_conceal=True)
        if body == '':
            return _abort()

        data = json_2_dict(json_str=body)
        if data == {}:
            return _abort()

        # Each of these sections is itself a JSON string; replace it with its
        # parsed form, or with '' when it cannot be converted.
        for section in ('base', 'profiles', 'score', 'sku'):
            key = '/app/detail/product/' + section
            parsed = json_2_dict(json_str=data.get(key, ''))
            if section == 'score' and isinstance(parsed, dict):
                # 'contents' is dropped from the score section.
                parsed.pop('contents', None)
            if parsed == {}:
                print("json.loads转换出错,得到{}值可能为空,此处跳过".format(section))
                parsed = ''
            data[key] = parsed

        # The mobile address needs the dealId of a successfully parsed base.
        base = data['/app/detail/product/base']
        if not isinstance(base, dict):
            print('获取手机版地址失败,此处跳过')
            return _abort()
        phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(
            base.get('dealId', ''))
        print('------>>>| 得到商品手机版地址为: ', phone_url)

        # Graph detail (the picture/text description).
        tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(
            goods_id)
        detail_data_body = MyRequests.get_url_body(url=tmp_detail_url,
                                                   headers=self.headers,
                                                   high_conceal=True)
        if detail_data_body == '':
            print('detail_data为[]!')
            return _abort()

        detail_data = json_2_dict(json_str=detail_data_body)
        if detail_data == {}:
            print('json.loads(detail_data)时报错, 此处跳过')
            return _abort()

        detail = json_2_dict(
            json_str=detail_data.get('/app/detail/graph/detail', ''))
        if isinstance(detail, dict):
            detail.pop('small', None)
        if detail == {}:
            print("json.loads转换出错,得到detail值可能为空,此处跳过")
            detail = ''

        tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
        if tmp_div_desc == '':
            return _abort()
        data['/app/detail/graph/detail'] = tmp_div_desc

        # _get_shop_name signals failure with {} and success with a str.
        shop_name = self._get_shop_name(data=data)
        if shop_name == {}:
            return _abort()
        data['shop_name'] = shop_name

        # Flash-sale start/end time and stock.
        schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(
            goods_id)
        schedule_and_stock_info_body = MyRequests.get_url_body(
            url=schedule_and_stock_url,
            headers=self.headers,
            high_conceal=True)
        if schedule_and_stock_info_body == '':
            print('schedule_and_stock_info为空!')
            return _abort()

        schedule_and_stock_info = json_2_dict(
            json_str=schedule_and_stock_info_body)
        if schedule_and_stock_info == {}:
            print('得到秒杀开始时间和结束时间时错误, 此处跳过')
            return _abort()

        for name in ('schedule', 'stock'):
            raw = schedule_and_stock_info.get('/app/detail/status/' + name)
            data[name] = {} if raw is None else json_2_dict(json_str=raw)

        self.result_data = data
        return data
Ejemplo n.º 13
0
    def get_div_from_pc_div_url(self, url, goods_id):
        '''
        Request the taobao "getdesc" API for a goods id and return the
        processed description div.

        :param url: accepted for interface compatibility but not used; the
            request address is always rebuilt from ``goods_id``
        :param goods_id: id sent in the ``data`` payload of the request
        :return: the result of ``self.deal_with_div`` applied to the
            'pcDescContent' of the response ('' is passed in when that field
            is missing or the JSON cannot be decoded); '' when the body is
            empty or carries no ``mtopjsonp1(...)`` wrapper
        '''
        # Timestamp in seconds followed by three random digits.
        t = str(int(time.time())) + str(randint(100, 999))

        params_data_1 = {
            'id': goods_id,
            'type': '1',
        }

        tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/'
        _params = (
            ('appKey', '12574478'),
            ('t', t),
            ('api', 'mtop.taobao.detail.getdesc'),
            ('v', '6.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('timeout', '20000'),
            ('callback', 'mtopjsonp1'),
            ('data', json.dumps(params_data_1)),
        )
        # urlencode turns the spaces emitted by json.dumps into '+'; they are
        # stripped to get the address the API expects.
        last_url = (tmp_url + '?' + urlencode(_params)).replace('+', '')

        data = MyRequests.get_url_body(url=last_url,
                                       headers=self.headers,
                                       params=None,
                                       timeout=14,
                                       num_retries=3)
        if data == '':
            self.my_lg.error(
                '获取到的div_desc为空值!请检查! 出错goods_id: {0}'.format(goods_id))
            return ''

        try:
            # Greedy match: take everything inside the jsonp wrapper.  Raw
            # string, because '\(' is an invalid escape in a normal literal.
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
        except IndexError as e:
            self.my_lg.error(
                '获取data时, IndexError出错! 出错goods_id: {0}'.format(goods_id))
            self.my_lg.exception(e)
            return ''

        try:
            data = json.loads(data)
        except JSONDecodeError:
            self.my_lg.error('json转换data时出错, 请检查!')
            data = {}

        # Guard both levels: a payload that is not an object (for example
        # null) would otherwise raise AttributeError on .get().
        payload = data.get('data', {}) if isinstance(data, dict) else {}
        div = payload.get('pcDescContent', '') if isinstance(payload, dict) else ''

        return self.deal_with_div(div)
Ejemplo n.º 14
0
    def get_goods_data(self, goods_id):
        '''
        Request the main detail interface for one goods and return its cleaned data.

        On every failure path ``self.result_data`` is reset to {} so that a
        later save step does not pick up data from a previous crawl.

        :param goods_id: goods id used to build the request URL
        :return: dict. The cleaned ``data`` section of the response on success;
            the value of ``self.init_pull_off_shelves_goods()`` when the goods
            has been taken off the shelves; {} on any error.
        '''
        self.msg = '------>>>| 对应的手机端地址为: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(
            goods_id)
        self.my_lg.info(self.msg)

        # Fetch the body of the main interface.
        last_url = self._get_last_url(goods_id=goods_id)
        data = MyRequests.get_url_body(url=last_url,
                                       headers=self.headers,
                                       params=None,
                                       timeout=14)
        if data == '':
            self.my_lg.error('出错goods_id: {0}'.format((goods_id)))
            self.result_data = {}
            return {}

        try:
            # Greedy match: capture everything inside the JSONP callback.
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
        except IndexError:
            self.my_lg.error('data为空! 出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            self.my_lg.error('json.loads转换data时出错, 请检查! 出错goods_id: ' +
                             str(goods_id))
            self.result_data = {}
            return {}

        # A missing/null 'data' section is treated like an empty one instead
        # of raising AttributeError on None.
        result_data = data.get('data') or {}

        if (result_data.get('seller') or {}).get('evaluates') is None:
            if result_data.get('trade', {}).get('redirectUrl', '') != '':
                # The goods is off the shelves: the original address redirects
                # to a new page.
                self.my_lg.info('@@@@@@ 该商品已经下架...')
                tmp_data_s = self.init_pull_off_shelves_goods()
                self.result_data = {}
                return tmp_data_s

            # The page no longer exists (goods transferred or taken down).
            self.my_lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
            self.result_data = {}
            return {}

        # Blank out bulky sections that are not used downstream.
        result_data['rate'] = ''  # goods ratings
        result_data['resource'] = ''  # buyers' questions to other buyers
        result_data['vertical'] = ''  # questions and answers as well
        result_data['seller']['evaluates'] = ''  # description/service/logistics scores

        api_stack = result_data.get('apiStack') or []
        if not api_stack:
            # Previously an IndexError; follow the method's usual failure path.
            self.my_lg.error('apiStack为空! 出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

        # Clean apiStack[0]['value'] and store the cleaned value back in place.
        first_stack = api_stack[0]
        first_stack['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=first_stack.get('value', {}))

        # Decode mockData; a missing key makes json.loads raise, which is
        # handled the same way as undecodable content.
        try:
            mock_data = json.loads(result_data.get('mockData'))
        except Exception:
            self.my_lg.error('json.loads转化mock_data时出错, 跳出' + ' 出错goods_id: ' +
                             str(goods_id))
            self.result_data = {}
            return {}
        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # apiStack[0] may look like {'name': 'esi', 'value': ''}.
        if first_stack.get('value', '') == '':
            self.my_lg.info(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空....")
            self.result_data = {}
            return {}

        # 'trade' is used later to decide whether the goods is off the shelves.
        result_data['trade'] = first_stack.get('value', {}).get('trade', {})

        self.result_data = result_data
        return result_data
Ejemplo n.º 15
0
    def get_spike_hour_goods_info(self):
        '''
        Crawl the juanpi limited-time flash-sale listing and store new goods.

        For every tab id, pages 0..49 are requested in order. Paging for a tab
        stops at the first empty response body, undecodable body or empty
        ``goodslist``. Goods whose id is not yet in the database are parsed with
        ``JuanPiParse`` and inserted through the SQL Server pipeline.

        :return: None
        '''
        tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

        for tab_id in tab_id_list:
            for index in range(0, 50):
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(index))
                print('待抓取的限时秒杀地址为: ', tmp_url)

                data = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
                if data == '':
                    break

                try:
                    data = json.loads(data).get('data', {})
                except Exception:
                    # Best effort: any decode problem ends this tab. Unlike a
                    # bare except, Ctrl-C / SystemExit still propagate.
                    break

                if data.get('goodslist') == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, index))
                    break

                goods_list = data.get('goodslist', [])
                if goods_list == []:
                    # Only reachable when the 'goodslist' key is absent; the
                    # original moves on to the next page in that case.
                    print('goodslist为[], 此处跳过')
                    continue

                miaosha_goods_list = self.get_miaoshao_goods_info_list(
                    data=goods_list)
                print(miaosha_goods_list)

                juanpi = JuanPiParse()
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                if my_pipeline.is_connect_success:
                    # Query once (the original ran the same select twice) and
                    # keep the ids in a set for constant-time membership tests.
                    db_rows = my_pipeline._select_table(sql_str=jp_select_str_5)
                    db_goods_ids = set() if db_rows is None \
                        else {row[0] for row in db_rows}

                    for item in miaosha_goods_list:
                        if item.get('goods_id', '') in db_goods_ids:
                            print('该goods_id已经存在于数据库中, 此处跳过')
                            continue

                        goods_id = item.get('goods_id')
                        spider_url = 'http://shop.juanpi.com/deal/' + goods_id
                        juanpi.get_goods_data(goods_id=goods_id)
                        goods_data = juanpi.deal_with_data()
                        if goods_data == {}:  # nothing parsed: skip this goods
                            continue

                        goods_data['stock_info'] = item.get('stock_info')
                        goods_data['goods_id'] = goods_id
                        goods_data['spider_url'] = spider_url
                        goods_data['username'] = '******'
                        goods_data['price'] = item.get('price')  # special price before the flash sale
                        goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = item.get('miaosha_time')
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                            get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=item.get('miaosha_time'))
                        goods_data['tab_id'] = tab_id
                        goods_data['page'] = index

                        juanpi.insert_into_juanpi_xianshimiaosha_table(
                            data=goods_data,
                            pipeline=my_pipeline)
                        sleep(.4)  # short pause between inserts
                    sleep(.65)

                # Drop the parser created for this page before collecting.
                del juanpi
                gc.collect()
Ejemplo n.º 16
0
    def _get_comment_data(self, goods_id):
        '''
        Collect up to three pages of buyer ratings for one 1688 goods.

        Ratings come from the PC-side ``rates.json`` interface. An earlier
        implementation scraped the mobile remark page through phantomjs and was
        given up because it was slow.

        :param goods_id: goods id; '' returns {} immediately
        :return: a ``CommentItem`` holding ``goods_id``, ``create_time``,
            ``modify_time`` and ``_comment_list`` when at least one valid
            comment was collected, otherwise {}. ``self.result_data`` is set to
            the returned value in both cases.
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # Emulate the PC-side rating interface.
        member_id = self._get_this_goods_member_id(goods_id=goods_id)
        self.my_lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
        if member_id == '':
            self.my_lg.error('获取到的member_id为空值!请检查!')
            self.result_data = {}
            return {}

        # Spec values previously stored for this goods in the database.
        sku_info = self._get_sku_info_from_db(goods_id)
        if sku_info == []:
            self.result_data = {}
            return {}

        url = 'https://rate.1688.com/remark/offerDetail/rates.json'
        # Work on a copy: updating self.headers in place would leak this goods'
        # referer into every later request made with the shared headers.
        tmp_headers = dict(self.headers)
        tmp_headers.update({
            'referer': 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
        })

        _comment_list = []
        for page_num in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第{0}页...'.format(page_num))
            params = self._set_params(goods_id=goods_id, member_id=member_id, page_num=page_num)
            # Author's note: this request often got a 404 with MyRequests, and
            # switching to phantomjs did not help either.
            body = MyRequests.get_url_body(url=url, headers=tmp_headers, params=params)

            if body == '':
                self.result_data = {}
                self.my_lg.error('该地址的body为空值, 出错goods_id: {0}'.format(goods_id))
                return {}

            data = self.json_str_2_dict(json_str=body)
            if data.get('url') is not None:
                # A 'url' field means the request was redirected to the 404 page.
                self.my_lg.info('------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time))
                sleep(self._page_sleep_time)
                break

            data = data.get('data', {}).get('rates', [])
            if data == []:
                break

            try:
                for item in data:
                    buyer_name = item.get('member', '')
                    comment = []
                    for i in item.get('rateItem', []):
                        _comment_content = self._wash_comment(i.get('remarkContent', ''))
                        if not filter_invalid_comment_content(_comment_content):
                            continue

                        comment.append({
                            'comment': _comment_content,
                            'comment_date': str(i.get('remarkTime', '')),  # comment date
                            # PC-side 1688 ratings carry no spec, so one of the
                            # stored spec values is picked at random.
                            'sku_info': choice(sku_info),
                            'star_level': i.get('starLevel', 5),
                            'img_url_list': [],
                            'video': '',
                        })
                    quantify = item.get('quantity', 1)  # purchased quantity
                    if comment == []:  # nothing valid left: do not record
                        continue

                    _comment_list.append({
                        'buyer_name': buyer_name,  # buyer nickname
                        'comment': comment,  # comment entries
                        'quantify': quantify,  # purchased quantity
                        'head_img': '',  # buyer avatar
                        'append_comment': {},  # follow-up comment
                    })

            except Exception:
                self.result_data = {}
                self.my_lg.error('出错商品goods_id: {0}'.format(goods_id), exc_info=True)
                return {}

            sleep(self._page_sleep_time)

        if _comment_list == []:
            self.my_lg.error('出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()

        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
Ejemplo n.º 17
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the jumei static and dynamic detail JSON for one goods and
        assemble the fields needed downstream.

        On every failure path ``self.result_data`` is reset to {} so that a
        later save step does not pick up data from a previous crawl.

        :param goods_id: sequence whose first element is the item id and whose
            second element is the item type; an empty or too-short value
            returns {} immediately
        :return: dict with the keys ``title``, ``sub_title``, ``shop_name``,
            ``all_img_url``, ``p_info``, ``div_desc``, ``sell_time``,
            ``detail_name_list``, ``price_info_list``, ``is_delete`` and
            ``all_sell_count``; {} on any error
        '''
        if not goods_id or len(goods_id) < 2:
            self.result_data = {}
            return {}

        # Build the query once; str() on both parts so that a non-str item id
        # works for all three URLs (one of them used to concatenate it raw).
        query = 'item_id=' + str(goods_id[0]) + '&type=' + str(goods_id[1])
        goods_url = 'https://h5.jumei.com/product/detail?' + query
        print('------>>>| 对应的手机端地址为: ', goods_url)

        # ** Data from the ajaxStaticDetail request
        tmp_url = 'https://h5.jumei.com/product/ajaxStaticDetail?' + query
        self.headers['Referer'] = goods_url
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        try:
            tmp_data = json.loads(body)
        except Exception:
            print('json.loads转换body时出错!请检查!')
            self.result_data = {}
            return {}
        tmp_data = self.wash_data(data=tmp_data)

        # ** Data from the ajaxDynamicDetail request
        tmp_url_2 = 'https://h5.jumei.com/product/ajaxDynamicDetail?' + query
        body_2 = MyRequests.get_url_body(url=tmp_url_2, headers=self.headers)
        if body_2 == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        try:
            tmp_data_2 = json.loads(body_2)
        except Exception:
            print('json.loads转换body_2时出错!请检查!')
            self.result_data = {}
            return {}
        tmp_data_2 = self.wash_data_2(data=tmp_data_2)

        tmp_data['data_2'] = tmp_data_2.get('data', {}).get('result', {})
        if tmp_data['data_2'] == {}:
            print('获取到的ajaxDynamicDetail中的数据为空值!请检查!')
            self.result_data = {}
            return {}

        static_part = tmp_data.get('data', {})
        dynamic_part = tmp_data['data_2']

        data = {}
        try:
            data['title'] = static_part.get('name', '')
            data['sub_title'] = ''
            if data['title'] == '':
                print('获取到的title为空值, 请检查!')
                raise Exception

            # shop_name: 'shop_info' is an empty list when there is no shop.
            if dynamic_part.get('shop_info') == []:
                data['shop_name'] = ''
            else:
                data['shop_name'] = dynamic_part.get('shop_info', {}).get('store_title', '')

            # All sample pictures, using the entry stored under key '800'.
            all_img_url = static_part.get('image_url_set', {}).get('single_many', [])
            if all_img_url == []:
                print('获取到的all_img_url为空[], 请检查!')
                raise Exception
            data['all_img_url'] = [{
                'img_url': item.get('800', ''),
            } for item in all_img_url]

            data['p_info'] = self.get_p_info(tmp_data=tmp_data)

            # Author's note: div_desc = description + description_usage
            # + description_images.
            div_desc = self.get_goods_div_desc(tmp_data=tmp_data)
            if div_desc == '':
                print('获取到的div_desc为空值! 请检查')
                raise Exception
            data['div_desc'] = div_desc

            # On/off-shelf time. Author's note: jumei regular goods launch at
            # 10:00 and are on sale for 24 hours.
            data['sell_time'] = self.get_sell_time(
                begin_time=dynamic_part.get('start_time'),
                end_time=dynamic_part.get('end_time')
            )

            data['detail_name_list'] = self.get_detail_name_list(
                size_attr=dynamic_part.get('size_attr', []))

            # Price, spec and stock for every sku.
            true_sku_info = self.get_true_sku_info(size=dynamic_part.get('size', []))
            if true_sku_info == []:
                print('获取到的sku_info为空值, 请检查!')
                raise Exception
            data['price_info_list'] = true_sku_info

            # is_delete: 1 when the sale has ended or no stock is left.
            if int(dynamic_part.get('end_time')) < int(time.time()):
                is_delete = 1
            else:
                all_stock = sum(item.get('rest_number', 0) for item in true_sku_info)
                is_delete = 1 if all_stock == 0 else 0
            data['is_delete'] = is_delete

            data['all_sell_count'] = dynamic_part.get('buyer_number', '0')

        except Exception as e:
            print('遇到错误如下: ', e)
            self.result_data = {}
            return {}

        # data always holds at least 'title' here, so no emptiness check is needed.
        self.result_data = data
        return data
    def run_forever(self):
        '''
        Run one real-time refresh pass over the flash-sale goods stored in the db.

        Author's note (translated): only goods going on the shelves within
        roughly the coming 14 hours are refreshed; prices further in the
        future (all still at the original price) are not updated for now.

        For every stored row ``self.is_recent_time`` decides what happens:
        0 deletes the row as expired, 2 leaves it untouched, anything else
        refreshes it from the listing page it was found on.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            result = list(
                tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # One parser is shared across rows and recreated periodically below
            # to keep memory usage down.
            juanpi_miaosha = JuanPiParse()

            for item in result:
                begin_time_str = json.loads(item[1]).get('miaosha_begin_time')
                # Keep the first 10 characters of the float timestamp's text.
                miaosha_begin_time = int(
                    str(
                        time.mktime(
                            time.strptime(begin_time_str,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])

                if index % 50 == 0:
                    # Reconnect every 50 rows so a long-lived connection does
                    # not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    continue

                # Evaluate once so both comparisons see the same answer.
                recent_flag = self.is_recent_time(miaosha_begin_time)
                if recent_flag == 0:
                    # NOTE(review): params is a bare value, not a 1-tuple -
                    # confirm that _delete_table accepts a scalar here.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str,
                        params=(item[0]),
                        lock_timeout=2000)
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_str)

                elif recent_flag == 2:
                    # Skip rather than stop: the rows are not returned in
                    # chronological order.
                    pass

                else:
                    # Inside the refresh window.
                    if not self._refresh_one_miaosha_goods(
                            item=item,
                            index=index,
                            juanpi_miaosha=juanpi_miaosha,
                            pipeline=tmp_sql_server):
                        break

                if index % 10 == 0:
                    # Recreate the parser every few rows to release memory.
                    juanpi_miaosha = JuanPiParse()
                    gc.collect()

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        gc.collect()

    def _refresh_one_miaosha_goods(self, item, index, juanpi_miaosha, pipeline):
        '''
        Refresh one stored flash-sale row from the listing page it came from.

        :param item: db row, read as (goods_id, miaosha_time json, tab_id, page)
        :param index: 1-based progress counter, only used in the log line
        :param juanpi_miaosha: parser used to fetch and update the goods
        :param pipeline: db pipeline used for deletes and updates
        :return: False when the listing page could not be fetched or decoded
            (the caller stops the whole pass), True otherwise
        '''
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
              % (item[0], index))

        tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
            str(item[2]),
            str(item[3]),
        )
        data = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if data == '':
            return False

        try:
            data = json.loads(data).get('data', {})
        except Exception:
            # Best effort as before, but Ctrl-C / SystemExit now propagate.
            return False

        if data.get('goodslist') == []:
            print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                item[2], item[3]))
            return True

        goods_list = data.get('goodslist', [])
        if goods_list == []:
            print('goodslist为[], 此处跳过')
            return True

        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=goods_list)

        # Every goods_id currently listed on this tab_id/page.
        listed_goods_ids = [i.get('goods_id') for i in miaosha_goods_list]
        if item[0] not in listed_goods_ids:
            # The goods was removed from the flash-sale activity.
            # NOTE(review): params is a bare value, not a 1-tuple - confirm
            # that _delete_table accepts a scalar here.
            pipeline._delete_table(
                sql_str=self.delete_sql_str,
                params=(item[0]))
            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
            return True

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') != item[0]:
                continue

            juanpi_miaosha.get_goods_data(goods_id=item[0])
            goods_data = juanpi_miaosha.deal_with_data()
            if goods_data == {}:  # nothing parsed: skip
                continue

            goods_data['stock_info'] = item_1.get('stock_info')
            goods_data['goods_id'] = item_1.get('goods_id')
            if item_1.get('stock_info').get('activity_stock') > 0:
                # Prices are only overwritten while activity stock remains.
                goods_data['price'] = item_1.get('price')  # special price before the flash sale
                goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
            goods_data['sub_title'] = item_1.get('sub_title', '')
            goods_data['miaosha_time'] = item_1.get('miaosha_time')
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=item_1.get('miaosha_time'))

            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                data=goods_data,
                pipeline=pipeline)

            sleep(.3)  # avoid going too fast

        return True
Ejemplo n.º 19
0
    def get_goods_data(self, goods_id):
        '''
        Scrape one Mogujie product page and collect its fields into a dict.

        The signed mobile API is deliberately not used: the original author
        could not work out how its `mw-sign` MD5 parameter is produced, so the
        data is parsed out of the PC detail page plus the
        `mgj.pc.detailinfo` ajax endpoint instead.

        :param goods_id: regular product id; an empty string short-circuits to {}
        :return: dict with the keys title, sub_title, shop_name, all_img_url,
            p_info, div_desc, detail_name_list, price_info_list, price and
            taobao_price, or {} on any failure. self.result_data is always set
            to the returned value.
        '''
        if goods_id == '':
            # Reset so a stale result is never stored for the next crawl.
            self.result_data = {}
            return {}

        page_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
        print('------>>>| 原pc地址为: ', page_url)

        page_html = MyRequests.get_url_body(
            url=page_url, headers=self.headers, had_referer=True)
        if page_html == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        data = {}
        try:
            # The page embeds one JS object; cut the three sub-objects out of it.
            detail_js = re.findall(
                r'var detailInfo = (.*?);</script>', page_html)[0]
            item_raw = re.findall(
                r'itemInfo:(.*?),priceRuleImg', detail_js)[0]
            sku_raw = re.findall(r'skuInfo:(.*?),pinTuanInfo', detail_js)[0]
            shop_raw = re.findall(r'shopInfo:(.*?),skuInfo', detail_js)[0]

            item_info = json_2_dict(json_str=item_raw)
            sku_info = json_2_dict(json_str=sku_raw)
            shop_info = json_2_dict(json_str=shop_raw)

            title = item_info.get('title', '')
            data['title'] = title
            if title == '':
                print('title为空!')
                raise Exception

            data['sub_title'] = ''
            data['shop_name'] = shop_info.get('name', '')

            # All sample pictures of the product.
            data['all_img_url'] = [
                {'img_url': img} for img in item_info.get('topImages', [])
            ]

            # The detail-info endpoint feeds both p_info and div_desc.
            detail_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(
                goods_id)
            detail_body = MyRequests.get_url_body(
                url=detail_api_url, headers=self.headers, had_referer=True)
            if detail_body == '':
                print('获取到的tmp_p_info_body为空值, 请检查!')
                raise Exception

            # Some products legitimately have an empty p_info list, so it is
            # stored without validation.
            data['p_info'] = self.get_goods_p_info(
                tmp_p_info_body=detail_body)

            div_desc = self.get_goods_div_desc(tmp_p_info_body=detail_body)
            if div_desc == '':
                print('获取到的div_desc为空str, 请检查!')
                self.result_data = {}
                return {}
            data['div_desc'] = div_desc

            detail_name_list = self.get_goods_detail_name_list(
                sku_info=sku_info)
            if detail_name_list == '':
                print('获取detail_name_list出错, 请检查!')
                self.result_data = {}
                return {}
            data['detail_name_list'] = detail_name_list

            # Price, spec and stock of every SKU.
            price_info_list = self.get_price_info_list(sku_info=sku_info)
            if price_info_list == '':
                raise Exception
            data['price_info_list'] = price_info_list

            # price is the highest SKU price, taobao_price the lowest.
            try:
                sku_prices = [
                    round(float(sku.get('detail_price', '')), 2)
                    for sku in price_info_list
                ]
                sku_prices.sort()
                price = round(Decimal(sku_prices[-1]), 2)
                taobao_price = round(Decimal(sku_prices[0]), 2)
            except IndexError:
                print('获取price和taobao_price时出错! 请检查')
                raise Exception

            data['price'] = price
            data['taobao_price'] = taobao_price

        except Exception as err:
            print('遇到错误: ', err)
            self.result_data = {}
            return {}

        if data == {}:
            print('data为空!')
            self.result_data = {}
            return {}

        self.result_data = data
        return data
Ejemplo n.º 20
0
    def _get_true_sku_info(self, goods_id, tmp_data):
        '''
        得到每个规格对应的库存, 价格, 图片等详细信息
        :param tmp_data:
        :return:
        '''
        def _get_other(other_items):
            other_ = []
            for item in other_items:
                if item.get('type', 0) == 1:  # 该规格无库存
                    continue
                else:  # 该规格有库存
                    detail_price = item.get('promotion_price', '')
                    # 还是选择所有商品都拿最优惠的价格
                    # if detail_price == '' or goods_id[0] == 1:      # 为空就改为获取vipshop_price字段
                    if detail_price == '':  # 为空就改为获取vipshop_price字段
                        detail_price = item.get('vipshop_price', '')
                    else:
                        pass
                    normal_price = item.get('market_price', '')
                    if normal_price == '':
                        normal_price = detail_price
                    other_.append({
                        'spec_value': item.get('sku_name', ''),
                        'detail_price': detail_price,
                        'normal_price': normal_price,
                        'rest_number': item.get('leavings', 0),  # 该规格的剩余库存量
                        'img_url': '',  # 设置默认为空值
                    })

            return other_

        multiColor = tmp_data[5].get('result', {})
        # pprint(multiColor)
        ## ** 研究发现multiColor以及productSku中的type为1时,表示该商品规格库存为0
        productSku = tmp_data[6].get('result', {}).get('productSku', {})
        # pprint(productSku)

        true_sku_info = []
        color_ = None
        if multiColor == {} or productSku == {}:
            return []
        else:
            if multiColor.get('items') is None:
                pass
            else:
                tmp_color_items = multiColor.get('items', [])
                color_ = []
                for item in tmp_color_items:
                    if item.get('type', 0) == 1:  # 该颜色无库存
                        continue
                    else:  # 为0,表示有库存
                        # 先获取到有库存的对应规格, 是否有颜色属性后面再判断
                        color_.append({
                            'goods_id':
                            item.get('product_id', ''),
                            'name':
                            item.get('name', ''),
                            'img_url':
                            'https:' +
                            item.get('icon', {}).get('imageUrl', '')
                        })
                # pprint(color_)

            if color_ == []:  # 没有规格 也可能是 # 表示没有库存, 买完或者下架
                print('获取到的color_为空[], 请检查!')
                return []
            else:
                if productSku.get('items') is None:
                    print('获取到的others_items为None')
                    return []

                else:
                    other_items = productSku.get('items', [])
                    other_ = _get_other(other_items=other_items)

                if color_ is None:
                    for item_2 in other_:
                        spec_value = item_2.get('spec_value', '')
                        item_2['spec_value'] = spec_value
                        item_2['img_url'] = ''
                        true_sku_info.append(item_2)

                else:
                    for item in color_:
                        if item.get(
                                'goods_id') == goods_id[1]:  # 表示为原先的那个goods_id
                            if item.get('name', '') == '无':  # 表示无颜色属性
                                pass
                            else:
                                for item_2 in other_:
                                    spec_value = item.get(
                                        'name', '') + '|' + item_2.get(
                                            'spec_value', '')
                                    item_2['spec_value'] = spec_value
                                    item_2['img_url'] = item.get('img_url', '')
                                    true_sku_info.append(item_2)

                        else:  # 表示是其他颜色对应的goods_id
                            '''下面是获取该颜色对应goods_id的所有可售的规格价格信息'''
                            url = 'https://m.vip.com/server.html'
                            params = self._set_params()

                            page = 'product-0-' + str(goods_id[1]) + '.html'
                            post_data = self._set_post_data(page=page)

                            tmp_data_2 = MyRequests.get_url_body(
                                method='post',
                                url=url,
                                headers=self.headers,
                                params=params,
                                data=post_data)
                            # print(tmp_data_2)

                            # 先处理得到dict数据
                            if tmp_data_2 == '':
                                print('获取其他颜色规格的url的body时为空值')
                                return []
                            else:
                                tmp_data_2 = json_2_dict(json_str=tmp_data_2)
                                if tmp_data_2 == {}:
                                    return []

                                other_items_2 = tmp_data_2[6].get(
                                    'result', {}).get('productSku',
                                                      {}).get('items', [])
                                other_2 = _get_other(other_items=other_items_2)

                                for item_4 in other_2:
                                    spec_value = item.get(
                                        'name', '') + '|' + item_4.get(
                                            'spec_value', '')
                                    item_4['spec_value'] = spec_value
                                    item_4['img_url'] = item.get('img_url', '')
                                    true_sku_info.append(item_4)

        return true_sku_info
Ejemplo n.º 21
0
    def get_goods_data(self, goods_id):
        '''
        Request the product detail endpoint and assemble the product's data dict.

        Validation failures are raised as ValueError (not `assert`, which is
        removed when Python runs with -O) and handled locally, so callers
        never see them.

        :param goods_id: list; element [1] is the product id used to build the
            requested page name
        :return: dict with the keys title, sub_title, shop_name, all_img_url,
            p_info, div_desc, sell_time, detail_name_list and price_info_list
            on success (also stored in self.result_data); otherwise whatever
            self._error_data_init() returns
        '''
        if goods_id == []:
            return self._error_data_init()

        # Endpoint captured from the VIP.com WeChat mini-program traffic.
        url = 'https://m.vip.com/server.html'
        params = self._set_params()

        page = 'product-0-' + str(goods_id[1]) + '.html'
        post_data = self._set_post_data(page=page)

        body = MyRequests.get_url_body(method='post',
                                       url=url,
                                       headers=self.headers,
                                       params=params,
                                       data=post_data)
        if body == '':
            return self._error_data_init()

        tmp_data = json_2_dict(json_str=body)
        if tmp_data == {}:
            return self._error_data_init()

        data = {}
        try:
            # Element 2 of the response carries the product detail fields.
            detail = tmp_data[2].get('result', {})

            data['title'] = detail.get('product_name', '')
            if data['title'] == '':
                raise ValueError('获取到的title为空值, 请检查!')
            data['sub_title'] = ''
            data['shop_name'] = detail.get('brand_info',
                                           {}).get('brand_name', '')

            # All sample pictures of the product.
            img_pre = detail.get('img_pre', [])
            if img_pre == []:
                raise ValueError('获取到的all_img_url为空[], 请检查!')
            data['all_img_url'] = [{
                'img_url': 'https:' + img.get('b_img', '')
            } for img in img_pre]

            p_info = self._get_p_info(tmp_data=tmp_data)
            if p_info == []:
                raise ValueError('p_info为空list, 请检查!')
            data['p_info'] = p_info

            div_desc = self.get_goods_div_desc(
                tmp_data=detail.get('detailImages', []))
            if div_desc == '':
                raise ValueError('获取到的div_desc为空值! 请检查')
            data['div_desc'] = div_desc

            # On-shelf / off-shelf timestamps.
            data['sell_time'] = {
                'begin_time': detail.get('sell_time_from', {}),
                'end_time': detail.get('sell_time_to', {}),
            }
            if int(data['sell_time'].get('begin_time')) > int(time.time()):
                # The sale has not started yet: hand the pre-sale id format
                # [1, id] to the SKU parser.
                goods_id = [1, goods_id[1]]

            data['detail_name_list'] = self._get_detail_name_list(
                tmp_data=tmp_data)

            # Price, spec and stock of every SKU. An empty list is stored
            # as-is because it may simply mean the product is sold out.
            true_sku_info = self._get_true_sku_info(goods_id=goods_id,
                                                    tmp_data=tmp_data)
            if true_sku_info == []:
                print('获取到的sku_info为空值, 请检查!')
                print('*** 注意可能是卖完了,库存为0 导致!! ***')
            data['price_info_list'] = true_sku_info

        except Exception as e:
            print('遇到错误如下: ', e)
            return self._error_data_init()

        # data always holds at least the title at this point.
        self.result_data = data
        return data
Ejemplo n.º 22
0
    def _get_jd_goods_keywords_goods_id_list(self, keyword):
        '''
        Search JD's mobile site for a keyword, sorted by recent sales, and
        return the ids of the first result page.

        Group-buy (拼购) items are skipped: an item counts as group-buy when
        its 'pinGou' -> 'bp' field is not empty.

        :param keyword: indexable value; keyword[1] is the search text
        :return: list of 'wareid' values, or [] when the request, the JSONP
            unwrapping or the JSON parsing fails or yields nothing
        '''
        # Approach 1: the search interface of JD's mobile site.
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'so.m.jd.com',
        }

        params = (
            ('keyword', keyword[1]),
            ('datatype', '1'),
            ('callback', 'jdSearchResultBkCbA'),
            ('page', '1'),
            ('pagesize', '10'),
            ('ext_attr', 'no'),
            ('brand_col', 'no'),
            ('price_col', 'no'),
            ('color_col', 'no'),
            ('size_col', 'no'),
            ('ext_attr_sort', 'no'),
            ('merge_sku', 'yes'),
            ('multi_suppliers', 'yes'),
            ('area_ids', '1,72,2819'),
            ('sort_type', 'sort_totalsales15_desc'),
            ('qp_disable', 'no'),
            ('fdesc', '\u5317\u4EAC'),
        )

        s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
        body = MyRequests.get_url_body(url=s_url,
                                       headers=headers,
                                       params=params)
        if body == '':
            return []

        try:
            # Strip the JSONP wrapper. Raw string: '\(' in a normal string
            # literal is an invalid escape sequence.
            json_text = re.compile(r'jdSearchResultBkCbA\((.*)\)').findall(
                body)[0]
        except IndexError:
            self.my_lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format(
                (keyword[1])))
            return []

        # The payload may contain escapes such as \xa0 that break json
        # decoding, so it is cleaned before parsing.
        json_text = deal_with_JSONDecodeError_about_value_invalid_escape(
            json_str=json_text)
        data = json_2_dict(json_str=json_text, logger=self.my_lg)
        if data == {}:
            # The original message named Tmall (天猫) although this is the JD search.
            self.my_lg.error('获取到的京东搜索data为空dict! 出错关键字为{0}'.format(
                keyword[1]))
            return []

        paragraphs = data.get('data', {}).get('searchm',
                                              {}).get('Paragraph', [])
        if paragraphs is None or paragraphs == []:
            self.my_lg.error('获取到的data为空list, 请检查!')
            return []

        # `or {}` keeps an explicit null 'pinGou' from raising AttributeError.
        return [
            item.get('wareid', '') for item in paragraphs
            if (item.get('pinGou') or {}).get('bp', '') == ''
        ]
Ejemplo n.º 23
0
    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Search Taobao for a keyword, sorted by sales, and return the product
        ids listed in the response's trace data.

        :param keyword: (id, keyword) pair; keyword[1] is the search text
        :return: list of goods ids ('allNids'), or [] when the request, the
            JSONP unwrapping or the JSON parsing fails or yields nothing
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 's.taobao.com',
        }

        # Keyword search ordered by sales volume, descending.
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            ('ie', 'utf8'),
        )

        s_url = 'https://s.taobao.com/search'
        body = MyRequests.get_url_body(url=s_url,
                                       headers=headers,
                                       params=params)
        if body == '':
            return []

        try:
            # Strip the JSONP wrapper. Raw string: '\(' in a normal string
            # literal is an invalid escape sequence.
            json_text = re.compile(r'\((.*)\)').findall(body)[0]
        except IndexError:
            self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
            return []

        data = json_2_dict(json_str=json_text, logger=self.my_lg)
        if data == {}:
            self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(
                keyword[1]))
            return []

        goods_id_list = data.get('mainInfo', {}).get(
            'traceInfo', {}).get('traceData', {}).get('allNids', [])
        if goods_id_list is None or goods_id_list == []:
            self.my_lg.error(
                '获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(
                    keyword[1]))
            return []

        return goods_id_list
Ejemplo n.º 24
0
    def get_goods_data(self, goods_id):
        '''
        Fetch one Zhe800 group-buy product page and assemble its data dict.

        The product JSON is read from the `window.prod_info` assignment in the
        mobile page, then the description, the attribute list and the live
        stock info are added. self.result_data is cleared on the failure
        paths that return {}, so a failed crawl never leaves the previous
        product's data behind.

        :param goods_id: product id (the `zid` of the page); '' is rejected
        :return: the data dict on success (also stored in self.result_data);
            str(goods_id) when the page says the product no longer exists;
            {} when the description, attributes or stock info cannot be
            fetched; otherwise whatever self._data_error_init() returns
        '''
        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
        print('------>>>| 得到的商品手机版地址为: ', tmp_url)

        # Plain requests stopped working under load (sleeping did not help),
        # hence the high_conceal flag.
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
        if body == '':
            print('获取到的tmp_url的body为空值, 此处跳过!')
            return self._data_error_init()

        # "Page no longer exists" detection. The length window restricts it to
        # the short placeholder page, because the phrase alone also matched
        # normal product pages.
        try:
            page_gone = (re.compile(r'很抱歉,您查看的页面木有了~').findall(body) != []
                         and 640 < len(body) < 660)
        except TypeError:
            # body is not text; treat it as "not the placeholder page".
            page_gone = False
        if page_gone:
            print('很抱歉,您查看的页面木有了~')
            self.result_data = {}
            return str(goods_id)

        try:
            data = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
        except TypeError:
            data = []

        if data == []:
            print('data为空!')
            return self._data_error_init()

        data = json_2_dict(json_str=data[0])
        if data == {}:
            return self._data_error_init()

        # div_desc
        div_desc_body = self.get_div_desc_body(goods_id=goods_id)
        if div_desc_body == '':
            print('获取到的div_desc_body为空!')
            self.result_data = {}
            return {}

        # p_info
        p_info = self.get_p_info_list(goods_id=goods_id)
        if p_info == []:
            self.result_data = {}
            return {}

        # Live stock information of the product.
        stock_info = self.get_stock_info_dict(goods_id=goods_id)
        if stock_info == {}:
            print('获取到的库存信息为{}!')
            self.result_data = {}
            return {}

        data['div_desc'] = div_desc_body
        data['p_info'] = p_info
        data['stock_info'] = stock_info

        # pin_status 3 means the group-buy item is sold out; flag it deleted.
        if stock_info.get('pin_status', 2) == 3:
            print('##### 该拼团商品已经被抢光 ...')
            data['is_delete'] = 1
        else:
            data['is_delete'] = 0
        data['parent_dir'] = _z8_get_parent_dir(goods_id)

        self.result_data = data
        return data
Ejemplo n.º 25
0
def test():
    '''
    Request product information for one hard-coded goods id from the
    vip.com endpoint observed in the WeChat mini-program traffic.

    Nine JSON-RPC 2.0 calls are sent in a single batched POST; every call
    carries the same ``page`` / ``query`` params and a sequential id.

    :return: the response body converted by ``json_2_dict``
    '''
    # Packet capture: vip.com WeChat mini-program
    url = 'https://m.vip.com/server.html'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-cn',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'm.vip.com',
        'Referer': 'https://servicewechat.com/wxe9714e742209d35f/284/page-frame.html',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f MicroMessenger/6.6.5 NetType/WIFI Language/zh_CN',
    }

    # Whole-second timestamp followed by three random digits.
    xcx_id = '{0}{1}'.format(round(time.time()), randint(100, 999))
    params = {
        'serv': 'getGoodsActiveMsg',
        '_xcxid': xcx_id,
    }

    goods_id = '460143743'
    page = 'product-0-' + str(goods_id) + '.html'

    # The captured traffic used large ids (4884390025335 ...); small
    # sequential ids starting at 1 are sent instead.
    rpc_methods = (
        'getGoodsActiveMsg',
        'getCoupon',
        'getProductDetail',
        'getProductMeta',
        'getProductSlide',
        'getProductMultiColor',
        'getProductSize',
        'getProductCountdown',
        'ProductRpc.getProductLicense',
    )
    payload = dumps([
        {
            'method': rpc_method,
            'params': {
                'page': page,
                'query': '',
            },
            'id': rpc_id,
            'jsonrpc': '2.0',
        }
        for rpc_id, rpc_method in enumerate(rpc_methods, start=1)
    ])

    body = MyRequests.get_url_body(method='post',
                                   url=url,
                                   headers=headers,
                                   params=params,
                                   data=payload)
    return json_2_dict(json_str=body)
Ejemplo n.º 26
0
    def _get_target_data(self, **kwargs):
        '''
        Fetch one Xiaomi Youpin product and assemble its normalised data.

        :param kwargs: ``goods_id`` -- id of the product to crawl
        :return: the assembled data dict on success (also stored in
            ``self.result_data``); on any failure, whatever
            ``self._get_data_error_init()`` returns
        '''
        goods_id = kwargs.get('goods_id', '')
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # Xiaomi Youpin mobile-site endpoint
        base_url = 'https://home.mi.com/app/shop/pipe'
        post_data = self._get_post_data(goods_id=goods_id)

        m_url = 'https://home.mi.com/detail?gid={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取小米有品地址为: {0}'.format(m_url))

        # Suffix appended to every error message of this crawl.
        write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

        body = MyRequests.get_url_body(method='post',
                                       url=base_url,
                                       headers=self.headers,
                                       cookies=None,
                                       data=post_data)
        if body == '':
            self.my_lg.error('获取到的body为空值!' + write_info)
            return self._get_data_error_init()

        raw = json_2_dict(json_str=body, logger=self.my_lg) \
            .get('result', {}) \
            .get('detail', {}) \
            .get('data', {})
        if raw == {}:
            self.my_lg.error('获取到的data为空dict!' + write_info)
            return self._get_data_error_init()

        try:
            raw = self._wash_target_data(raw)
        except Exception:
            self.my_lg.error('清洗数据时出错!' + write_info, exc_info=True)
            # Stop here: without this return the unwashed payload would be
            # parsed below as if washing had succeeded.
            return self._get_data_error_init()

        data = {}
        try:
            data['title'] = self._wash_sensitive_info(
                self._get_title(data=raw))
            data['sub_title'] = self._wash_sensitive_info(
                self._get_sub_title(data=raw))
            data['shop_name'] = self._get_shop_name(data=raw)
            data['all_img_url'] = self._get_all_img_url(data=raw)
            data['p_info'] = self._get_p_info(data=raw)  # Xiaomi Youpin has no p_info
            data['div_desc'] = self._get_div_desc(data=raw)
            data['sell_time'] = {}  # empty by default
            data['detail_name_list'] = self._get_detail_name_list(
                data=raw.get('group', []))
            data['price_info_list'] = self._get_price_info_list(data=raw)
            data['price'], data['taobao_price'] = \
                self._get_price_and_taobao_price(
                    price_info_list=data['price_info_list'])
            if data['price'] == 0 or data['taobao_price'] == 0:
                # A zero price marks a sold-out product.
                data['is_delete'] = 1
            else:
                data['is_delete'] = self._get_is_delete(
                    price_info_list=data['price_info_list'],
                    data=data,
                    other=raw)
        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(write_info)
            return self._get_data_error_init()

        # data is never empty here: every key above was assigned.
        self.result_data = data
        return data
Ejemplo n.º 27
0
    def get_goods_data(self, goods_id):
        '''
        Fetch and pre-process the raw detail payload of one Tmall product.

        :param goods_id: two-item sequence ``[type, goods_id]``; an empty,
            ``None`` or too-short value is treated as nothing to crawl
        :return: dict -- the cleaned payload (also stored in
            ``self.result_data``); the result of
            ``self.init_pull_off_shelves_goods(type)`` when the product has
            been taken off the shelves; ``{}`` on any failure, in which case
            ``self.result_data`` is reset to ``{}``
        '''
        if not goods_id or len(goods_id) < 2:
            self.result_data = {}
            return {}

        goods_type = goods_id[0]  # Tmall type
        goods_id = goods_id[1]  # Tmall goods_id
        tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
        self.my_lg.info('------>>>| 得到的移动端地址为: %s' % tmp_url)

        self.headers.update({'Referer': tmp_url})
        last_url = self._get_last_url(goods_id=goods_id)
        body = MyRequests.get_url_body(url=last_url,
                                       headers=self.headers,
                                       params=None,
                                       timeout=14)
        if body == '':
            self.my_lg.error('出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

        # The response is JSONP: mtopjsonp3({...}). The greedy group keeps
        # everything up to the last closing parenthesis.
        matched = re.compile(r'mtopjsonp3\((.*)\)').findall(body)
        data = matched[0] if matched else ''
        if data == '':
            self.my_lg.error('data为空! 出错type: %s, goods_id: %s' %
                             (str(goods_type), str(goods_id)))
            self.result_data = {}  # reset so stale data is not stored later
            return {}

        data = json_2_dict(json_str=data, logger=self.my_lg)
        if data == {}:
            self.my_lg.error('出错type: %s, goods_id: %s' %
                             (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        payload = data.get('data', {})
        if payload.get('seller', {}).get('evaluates') is None:
            if payload.get('trade', {}).get('redirectUrl', '') != '':
                # The product is off the shelves: the original address is
                # redirected to a new page.
                self.my_lg.info('@@@@@@ 该商品已经下架...')
                tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
                self.result_data = {}
                return tmp_data_s

            # The page no longer exists (product moved or removed).
            self.my_lg.error(
                'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: %s, goods_id: %s' %
                (str(goods_type), str(goods_id)))
            self.result_data = {}
            return {}

        result_data = data['data']
        result_data['rate'] = ''  # item reviews
        result_data['resource'] = ''  # buyers' questions to other buyers
        result_data['vertical'] = ''  # more questions and answers
        # scores for item description, seller service and logistics
        result_data['seller']['evaluates'] = ''

        empty_value_msg = (
            "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: %s, goods_id: %s"
            % (str(goods_type), goods_id))

        api_stack = result_data.get('apiStack') or []
        if not api_stack:
            # Indexing [0] on a missing/empty apiStack would raise.
            self.my_lg.error(empty_value_msg)
            self.result_data = {}
            return {}

        # Wash apiStack[0]['value'] and store the washed value back in place.
        first_stack = api_stack[0]
        first_stack['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=first_stack.get('value', {}))

        # mockData arrives as a JSON string; a missing key is handled like
        # an unparsable one.
        mock_raw = result_data.get('mockData')
        mock_data = json_2_dict(json_str=mock_raw,
                                logger=self.my_lg) if mock_raw else {}
        if mock_data == {}:
            self.my_lg.error('出错type: {0}, goods_id: {1}'.format(
                goods_type, goods_id))
            self.result_data = {}
            return {}
        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # apiStack[0] may look like {'name': 'esi', 'value': ''}.
        if first_stack.get('value', '') == '':
            self.my_lg.error(empty_value_msg)
            self.result_data = {}
            return {}

        # 'trade' is used later to decide whether the product is delisted.
        result_data['trade'] = first_stack.get('value', {}).get('trade', {})
        result_data['type'] = goods_type
        result_data['goods_id'] = goods_id
        self.result_data = result_data
        return result_data
Ejemplo n.º 28
0
def get_random_user_id_list():
    '''
    Collect author user ids from the aweme category-list endpoint.

    One request is sent to ``/aweme/v1/category/list/``; the
    ``author_user_id`` of every entry in every category's ``aweme_list`` is
    gathered, de-duplicated and sorted, and ids already present in the
    module-level ``all_user_id_list`` are removed.

    :return: sorted list of user ids not yet in ``all_user_id_list``;
        an empty list when the response cannot be parsed
    '''
    headers = {
        'Host': 'aweme.snssdk.com',
        'Accept': '*/*',
        'User-Agent': 'Aweme/1.7.8 (iPhone; iOS 11.0; Scale/3.00)',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
    }

    # Captured request parameters; the 'mas' / 'as' values seen in the
    # capture are not sent.
    params = (
        ('iid', '29797177823'),
        ('device_id', '48592631504'),
        ('os_api', '18'),
        ('app_name', 'aweme'),
        ('channel', 'App Store'),
        ('idfa', 'DA8C3A83-C08C-4881-86A8-1E67849F5BB2'),
        ('device_platform', 'iphone'),
        ('build_number', '17805'),
        ('vid', '855FEC75-BEB7-45A5-BE6A-2699A6864BAC'),
        ('openudid', 'c33813d872541f3bfc4ca174d9fbc5e708dd9ec5'),
        ('device_type', 'iPhone7,1'),
        ('app_version', '1.7.8'),
        ('version_code', '1.7.8'),
        ('os_version', '11.0'),
        ('screen_width', '1242'),
        ('aid', '1128'),
        ('ac', 'WIFI'),
        ('count', '30'),    # varies
        ('cursor', '30'),   # cursor position
        # whole-second timestamp followed by three random digits
        ('ts', str(round(time.time())) + str(randint(100, 999))),
    )

    url = 'https://aweme.snssdk.com/aweme/v1/category/list/'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)

    try:
        category_list = json.loads(body).get('category_list') or []
    except (TypeError, ValueError, AttributeError):
        # body is not a string, not valid JSON, or not a JSON object
        category_list = []
        print('error')
    else:
        print('count数:', len(category_list))

    user_ids = set()
    for category in category_list:
        if not isinstance(category, dict):
            continue
        aweme_list = category.get('aweme_list')
        if isinstance(aweme_list, dict):
            # a single entry instead of a list of entries
            aweme_list = [aweme_list]
        elif not isinstance(aweme_list, list):
            # category without an aweme_list: nothing to collect
            continue
        for aweme in aweme_list:
            if isinstance(aweme, dict):
                user_ids.add(aweme.get('author_user_id', ''))

    return [
        user_id for user_id in sorted(user_ids)
        if user_id not in all_user_id_list
    ]
Ejemplo n.º 29
0
    def _get_goods_data(self, goods_id):
        '''
        Fetch one NetEase Yanxuan product and assemble its normalised data.

        :param goods_id: id of the product to crawl
        :return: the assembled data dict on success (also stored in
            ``self.result_data``); on any failure, whatever
            ``self._get_data_error_init()`` returns
        '''
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # NetEase Yanxuan mobile-site endpoint
        url = 'http://m.you.163.com/item/detail'
        params = self._get_params(goods_id=goods_id)

        m_url = url + '?id={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))

        # Suffix appended to every error message of this crawl.
        write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

        body = MyRequests.get_url_body(url=url,
                                       headers=self.headers,
                                       params=params)
        if body == '':
            self.my_lg.error('获取到的body为空值!' + write_info)
            return self._get_data_error_init()

        # The product JSON is embedded in the page as
        # "var jsonData=...,policyList=".
        matched = re.search(r'var jsonData=(.*?),policyList=', body)
        if matched is None:
            self.my_lg.error('获取body时索引异常!' + write_info)
            return self._get_data_error_init()

        json_str = nonstandard_json_str_handle(json_str=matched.group(1))
        raw = json_2_dict(json_str=json_str)
        if raw == {}:
            self.my_lg.error('获取到的data为空dict!' + write_info)
            return self._get_data_error_init()

        data = {}
        try:
            # Washing is inside the try so that a failure here is logged and
            # reported like every other parsing failure.
            raw = self._wash_data(raw)
            data['title'] = self._get_title(data=raw)
            data['sub_title'] = self._get_sub_title(data=raw)
            data['shop_name'] = ''
            data['all_img_url'] = self._get_all_img_url(data=raw)
            data['p_info'] = self._get_p_info(data=raw)
            data['div_desc'] = self._get_div_desc(data=raw)
            data['sell_time'] = self._get_sell_time(data=raw)
            data['detail_name_list'] = self._get_detail_name_list(
                data=raw.get('skuSpecList', []))
            data['price_info_list'] = self._get_price_info_list(
                data=raw.get('skuList', []))
            data['price'], data['taobao_price'] = \
                self._get_price_and_taobao_price(
                    price_info_list=data['price_info_list'])
            data['is_delete'] = self._get_is_delete(
                price_info_list=data['price_info_list'],
                data=data,
                other=raw)
        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(write_info)
            return self._get_data_error_init()

        # data is never empty here: every key above was assigned.
        self.result_data = data
        return data
Ejemplo n.º 30
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the detail data of one Zhe800 group-buy product.

        The product page supplies the base data (``window.prod_info``); the
        description html, the property list and the real-time stock info are
        fetched from three further endpoints and merged into it.

        :param goods_id: id of the product to crawl; ``''`` or ``None``
            means nothing to crawl
        :return: dict -- the merged data (also stored in
            ``self.result_data``), or ``{}`` on any failure, in which case
            ``self.result_data`` is reset to ``{}`` so that stale data from
            a previous crawl is never stored
        '''
        if goods_id is None or goods_id == '':
            self.result_data = {}
            return {}

        zid = str(goods_id)
        tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + zid
        print('------>>>| 得到的商品手机版地址为: ', tmp_url)

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            print('获取到的tmp_url的body为空值, 此处跳过!')
            self.result_data = {}
            return {}

        # NOTE: a dedicated "page no longer exists" text check was tried
        # here and dropped because it also affected normal products.
        try:
            # Non-greedy capture of the object assigned to window.prod_info.
            matched = re.compile(
                r'window.prod_info = (.*?);seajs.use\(.*?\);</script>'
            ).findall(body)
        except TypeError:
            # body is not a string
            matched = []

        if not matched:
            print('data为空!')
            self.result_data = {}
            return {}

        data = json_2_dict(json_str=matched[0])
        if data == {}:
            self.result_data = {}
            return {}

        # Description html
        div_desc_url = 'https://pina.m.zhe800.com/nnc/product/detail_content.json?zid=' + zid
        div_desc_body = self.get_div_desc_body(div_desc_url=div_desc_url)
        if div_desc_body == '':
            print('获取到的div_desc_body为空!')
            self.result_data = {}
            return {}

        # Property list
        p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + zid
        p_info = self.get_p_info_list(p_info_url=p_info_url)
        if p_info == []:
            self.result_data = {}
            return {}

        # Real-time stock info
        stock_info_url = 'https://pina.m.zhe800.com/cns/products/' + zid + '/realtime_info.json'
        stock_info = self.get_stock_info_dict(stock_info_url=stock_info_url)
        if stock_info == {}:
            print('获取到的库存信息为{}!')
            self.result_data = {}
            return {}

        data['div_desc'] = div_desc_body
        data['p_info'] = p_info
        data['stock_info'] = stock_info

        # pin_status == 3 marks a sold-out group-buy product.
        if stock_info.get('pin_status', 2) == 3:
            print('##### 该拼团商品已经被抢光 ...')
            data['is_delete'] = 1
        else:
            data['is_delete'] = 0

        self.result_data = data
        return data