Esempio n. 1
0
    async def get_pintuan_goods_info(self):
        '''
        Collect the current group-buy ("pintuan") goods of every category tab.

        For each key of ``self.tab_dict`` pages 1..19 are requested through
        ``self.get_one_page_goods_list`` and merged into one list. Only the
        first item seen for a given ``goods_id`` is kept, including when the
        duplicate shows up inside the same page.

        :return: list of item dicts, unique by ``goods_id``
        '''
        s_time = time.time()
        goods_list = []
        seen_goods_ids = set()  # ids already stored in goods_list, for O(1) lookups
        my_phantomjs = MyPhantomjs()
        try:
            for key in self.tab_dict:
                self.msg = '正在抓取的分类为: ' + key
                self.my_lg.info(self.msg)
                for index in range(1, 20):
                    item_list = await self.get_one_page_goods_list(
                        my_phantomjs=my_phantomjs,
                        key=key,
                        tab=self.tab_dict[key],
                        index=index)

                    for item in item_list:
                        goods_id = item.get('goods_id', '')
                        if goods_id in seen_goods_ids:
                            continue
                        seen_goods_ids.add(goods_id)
                        goods_list.append(item)
        finally:
            # Drop the browser wrapper even when a page request raises.
            del my_phantomjs

        self.my_lg.info(str(goods_list))
        self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
        e_time = time.time()
        self.my_lg.info('总用时:' + str(e_time-s_time))
        await asyncio.sleep(3)

        return goods_list
Esempio n. 2
0
 def __init__(self, logger=None):
     '''
     Prepare the crawler state: logger, PhantomJS wrapper, script text and headers.

     :param logger: logger to reuse; when it is None one is created with
         set_logger, writing to a '.txt' file under MY_SPIDER_LOGS_PATH
     '''
     super().__init__()
     self.result_data, self.msg = {}, ''
     if logger is not None:
         self.my_lg = logger
     else:
         # The file name is the first 10 characters of str(get_shanghai_time()).
         log_file = (MY_SPIDER_LOGS_PATH + '/阿里1688/comment/' +
                     str(get_shanghai_time())[0:10] + '.txt')
         self.my_lg = set_logger(
             log_file_name=log_file,
             console_log_level=INFO,
             file_log_level=ERROR)
     self.my_phantomjs = MyPhantomjs()
     # Code text intended for dynamic execution (where it is executed is not
     # visible here - presumably inside the PhantomJS wrapper; confirm).
     self._exec_code = '''
     self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click() 
     sleep(1.5)
     # 向下滚动10000像素
     js = 'document.body.scrollTop=10000'
     self.driver.execute_script(js)
     sleep(3)
     '''
     # One user agent picked at random from the whole HEADERS sequence.
     user_agent = HEADERS[randint(0, len(HEADERS) - 1)]
     self.headers = {
         'accept-encoding': 'gzip, deflate, br',
         'accept-language': 'zh-CN,zh;q=0.9',
         'user-agent': user_agent,
         'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
         'referer': 'https://detail.1688.com/offer/45579899125.html',
     }
     self.page_size = '30'
Esempio n. 3
0
 def __init__(self, logger=None):
     '''
     Initialise result holders, logger, headers and the PhantomJS wrapper.

     :param logger: passed straight to ``self._set_logger``
     '''
     self.result_data, self.msg = {}, ''
     self._set_logger(logger)
     self._set_headers()
     # Sleep time used when switching to the next comment page
     # (unit not visible here - presumably seconds).
     self.comment_page_switch_sleep_time = 1.2
     self.my_phantomjs = MyPhantomjs()
     self._add_headers_cookies()
Esempio n. 4
0
 def __init__(self, logger=None):
     '''
     Initialise result holders, logger, headers, paging settings and scratch data.

     :param logger: passed straight to ``self._set_logger``
     '''
     self.result_data, self.msg = {}, ''
     self._set_logger(logger)
     self._set_headers()
     self.page_size = '10'
     # Sleep time used when switching to the next comment page
     # (unit not visible here - presumably seconds).
     self.comment_page_switch_sleep_time = 1.5
     self.my_phantomjs = MyPhantomjs()
     # Temporary data.
     self.g_data = {}
     # Temporary data (holds all specifications of the current goods).
     self.random_sku_info_list = []
Esempio n. 5
0
 def __init__(self):
     '''
     Build the default request headers for zhe800.com and create the PhantomJS wrapper.

     The User-Agent is picked at random from the whole HEADERS sequence, so
     the choice stays valid whatever the length of HEADERS is.
     '''
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'zhe800.com',
         # Random user agent; the index range follows len(HEADERS) instead of a fixed 34.
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
     }
     self.my_phantomjs = MyPhantomjs()
Esempio n. 6
0
 def __init__(self):
     '''
     Build the default request headers for mobile.yangkeduo.com and create
     the result holder and the PhantomJS wrapper.

     The User-Agent is picked at random from the whole HEADERS sequence, so
     the choice stays valid whatever the length of HEADERS is.
     '''
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'mobile.yangkeduo.com',
         # Random user agent; the index range follows len(HEADERS) instead of a fixed 34.
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
         # 'Cookie': 'api_uid=rBQh+FoXerAjQWaAEOcpAg==;',      # analysis showed this cookie value is needed
     }
     self.result_data = {}
     # self.set_cookies_key_api_uid()  # would set the api_uid value of the cookie
     self.my_phantomjs = MyPhantomjs()
Esempio n. 7
0
 def __init__(self):
     '''
     Build the default request headers for 1688.com and create the result
     holder, the activity flag and the PhantomJS wrapper.

     The User-Agent is picked at random from the whole HEADERS sequence, so
     the choice stays valid whatever the length of HEADERS is.
     '''
     super().__init__()
     self.headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': '1688.com',
         # Random user agent; the index range follows len(HEADERS) instead of a fixed 34.
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
     }
     self.result_data = {}
     self.is_activity_goods = False
     self.my_phantomjs = MyPhantomjs()
Esempio n. 8
0
 def __init__(self, logger=None):
     '''
     Initialise result holders, headers, logger, the PhantomJS wrapper and the script text.

     :param logger: passed straight to ``self._set_logger``
     '''
     super().__init__()
     self.result_data, self.msg = {}, ''
     self._set_headers()
     self._set_logger(logger)
     self.my_phantomjs = MyPhantomjs()
     # Code text intended for dynamic execution (where it is executed is not
     # visible here - presumably inside the PhantomJS wrapper; confirm).
     # The assert inside the text is used to skip the statements after it.
     self._exec_code = '''
     self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click() 
     _text = str(self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').text)
     print(_text)
     # if _text == '四五星(0)':
     assert _text != '四五星(0)', 'my assert error!'    # 通过断言来跳过执行下面的代码
     sleep(2.5)
     # 向下滚动10000像素
     js = 'document.body.scrollTop=10000'
     self.driver.execute_script(js)
     sleep(4)
     '''
     self._page_sleep_time = 1.2
Esempio n. 9
0
    def get_spike_hour_goods_info(self):
        '''
        Collect the on-sale and pre-sale flash-sale goods from the Jumei h5 API.

        Cookies are first read from a PhantomJS session on https://h5.jumei.com/
        and merged into ``self.headers``. Both listings are then paged through,
        de-duplicated by ``item_id`` and handed to ``self.deal_with_data`` as
        dicts with the keys ``goods_id``, ``type`` and ``page``.

        :return: False when no cookies could be obtained, True otherwise
        '''
        self.my_phantomjs = MyPhantomjs()
        try:
            cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        finally:
            # Release the browser wrapper even when the cookie request raises.
            try:
                del self.my_phantomjs
            except AttributeError:
                pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        all_goods_list = []
        seen_item_ids = set()  # shared by both listings so an item is kept once

        print('开始抓取在售商品...')
        self._collect_jumei_spike_pages(
            'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720',
            all_goods_list, seen_item_ids)

        print('开始抓取预售商品...')
        self._collect_jumei_spike_pages(
            'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=pre&page_key=1521858480',
            all_goods_list, seen_item_ids)

        # Items without an item_id cannot be addressed later and are dropped.
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return True

    def _collect_jumei_spike_pages(self, url_template, all_goods_list, seen_item_ids):
        '''
        Page through one Jumei listing and append the items not seen before.

        :param url_template: listing url with a ``{0}`` placeholder for the page number
        :param all_goods_list: list extended in place with the new raw items
        :param seen_item_ids: set of item ids already collected, updated in place
        :return: None
        '''
        for page in range(1, 50):   # pages start at 1
            tmp_url = url_template.format(str(page))
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

            try:
                json_body = json.loads(body)
            except (TypeError, ValueError):
                print('json.loads转换body时出错!请检查')
                json_body = {}
            if not isinstance(json_body, dict):
                # Valid JSON of an unexpected shape is handled like an empty page.
                json_body = {}

            this_page_item_list = json_body.get('item_list', [])
            if not this_page_item_list:
                # An empty (or null) page marks the end of the listing.
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            for item in this_page_item_list:
                item_id = item.get('item_id', '')
                if item_id in seen_item_ids:
                    continue
                seen_item_ids.add(item_id)
                item['page'] = page
                all_goods_list.append(item)

            sleep(.5)
    async def run_forever(self):
        '''
        Run one refresh pass over every Jumei group-buy goods row in the database.

        Each row is used as: item[0] goods_id, item[1] JSON string holding
        'begin_time'/'end_time', item[2] tab, item[3] page index, item[4]
        group-buy url. Depending on ``self.is_recent_time`` a row is deleted
        (0), skipped (2) or updated (any other value). After the pass the
        coroutine sleeps without blocking the event loop.

        :return: None
        '''
        import asyncio  # function-scope import: only needed for the non-blocking sleeps below

        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
                logger=self.my_lg)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            return None

        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(result)

        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        for index, item in enumerate(result, start=1):
            pintuan_time = json.loads(item[1])
            # mktime returns a float of seconds; int() keeps the whole seconds.
            pintuan_end_time = int(
                time.mktime(
                    time.strptime(pintuan_time.get('end_time'),
                                  '%Y-%m-%d %H:%M:%S')))

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid errors from one long-lived connection.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                await self._refresh_one_pintuan_goods(
                    item=item,
                    index=index,
                    pintuan_time=pintuan_time,
                    pintuan_end_time=pintuan_end_time,
                    pipeline=tmp_sql_server)
            else:
                self.my_lg.error('数据库连接失败,此处跳过!')

            gc.collect()

        self.my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            await asyncio.sleep(60 * 60 * 5.5)
        else:
            await asyncio.sleep(5)
        gc.collect()

        return None

    async def _refresh_one_pintuan_goods(self, item, index, pintuan_time, pintuan_end_time, pipeline):
        '''
        Delete, skip or update a single group-buy goods row.

        :param item: database row (see ``run_forever`` for the column layout)
        :param index: 1-based position of the row, only used for logging
        :param pintuan_time: dict decoded from item[1]
        :param pintuan_end_time: group-buy end time as an integer timestamp
        :param pipeline: connected database pipeline used for deletes and updates
        :return: None
        '''
        goods_id = item[0]
        time_number = await self.is_recent_time(pintuan_end_time)

        if time_number == 0:
            await pipeline.delete_jumeiyoupin_pintuan_expired_goods_id(
                goods_id=goods_id, logger=self.my_lg)
            # The message announces the END time, so that is the value reported.
            self.msg = '过期的goods_id为(%s)' % goods_id + ', 拼团结束时间为(%s), 删除成功!' % str(
                pintuan_time.get('end_time'))
            self.my_lg.info(self.msg)
            return

        if time_number == 2:
            # Skip only this row (the caller keeps iterating): the rows
            # returned by the database are not ordered.
            return

        # Any other value (1): the row lies inside the update window.
        self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
            goods_id, str(index))
        self.my_lg.info(self.msg)
        jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)

        cache_key = item[2] + '-' + str(item[3])  # format: 'coutuan_baby-1'
        # Reuse the page when this tab/index pair was already fetched.
        item_list = self.api_all_goods_id.get(cache_key, [])

        if item_list == []:
            my_phantomjs = MyPhantomjs()
            try:
                item_list = await jumeiyoupin_2.get_one_page_goods_list(
                    my_phantomjs=my_phantomjs,
                    tab=item[2],
                    index=item[3])
            finally:
                # Drop the browser wrapper even when the request raises.
                del my_phantomjs

        if item_list == []:
            self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
            return

        if self.api_all_goods_id.get(cache_key) is None:
            self.api_all_goods_id[cache_key] = item_list

        pintuan_goods_all_goods_id = [
            item_1.get('goods_id', '') for item_1 in item_list
        ]

        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(logger=self.my_lg)
        if goods_id not in pintuan_goods_all_goods_id:
            # No longer in the listing: taken down internally (testing showed
            # the site does not take activity goods down early).
            await self.update_data_2(
                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                jumei_pintuan_url=item[4],
                goods_id=goods_id,
                pipeline=pipeline)
        else:
            # Still listed.
            await self.update_data_1(
                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                jumeiyoupin_2=jumeiyoupin_2,
                jumei_pintuan_url=item[4],
                goods_id=goods_id,
                item_list=item_list,
                pipeline=pipeline)
Esempio n. 11
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the chuchujie product page of one goods id and parse it into a dict.

        The result is also stored in ``self.result_data``; on any failure both
        the return value and ``self.result_data`` are ``{}``.

        :param goods_id: product id as a string; '' yields an empty result
        :return: dict with the keys title, sub_title, shop_name, all_img_url,
            p_info, div_desc, detail_name_list, price, taobao_price,
            price_info_list and is_delete, or {} on failure
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        print('------>>>| 对应的手机端地址为: ',
              'https://m.chuchujie.com/details/detail.html?id=' + goods_id)

        # The mobile api (api-product.chuchujie.com, method=product_detail)
        # kept answering with a "request parameter error", so the PC page is
        # parsed instead.
        tmp_url = 'http://wx.chuchujie.com/index.php?s=/WebProduct/product_detail/product_id/' + str(
            goods_id)

        # Plain requests returned data at first and nothing later on, hence PhantomJS.
        my_phantomjs = MyPhantomjs()
        try:
            body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        finally:
            # Drop the browser wrapper even when the request raises.
            del my_phantomjs

        if body == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        data = {}

        try:
            # Parse the page once and reuse the selector for every query.
            page = Selector(text=body)

            # extract_first() gives None when nothing matches, so test for
            # "missing or empty" rather than for '' only.
            data['title'] = page.css('div.zy_info_rt h3::text').extract_first()
            if not data['title']:
                print('title为空!')
                raise ValueError('empty title')

            data['sub_title'] = ''

            data['shop_name'] = page.css(
                'div.other.ft14.clearfix label b::text').extract_first()

            # All sample pictures.
            data['all_img_url'] = [{
                'img_url': img_src
            } for img_src in page.css('p.s_img label img::attr("src")').extract()]

            # The PC page carries no p_info.
            data['p_info'] = []

            div_desc = page.css('div.s_two').extract_first()
            if not div_desc:
                print('div_desc为空!请检查!')
                raise ValueError('empty div_desc')
            data['div_desc'] = div_desc

            # Specification names: the last <dt> is not a specification, and a
            # single entry means the goods has none.
            detail_name_list = page.css(
                'div.info-wd.bd-red dl.detail dt::text').extract()
            if len(detail_name_list) <= 1:
                detail_name_list = []
            else:
                detail_name_list = [{
                    'spec_name': spec_name
                } for spec_name in detail_name_list[:-1]]
            data['detail_name_list'] = detail_name_list

            # Promotion price (taobao_price) and original price (price).
            taobao_price = page.css('dl.detail p.price b::text').extract_first()
            price = page.css('dl.detail dd em.yjprice::text').extract_first()
            # The trailing '*' lets integer prices without a fraction match too.
            price_pattern = re.compile(r'(\d+\.{0,1}\d*)')
            try:
                # "or ''" turns a missing node into the same IndexError path.
                taobao_price = price_pattern.findall(taobao_price or '')[0]
                price = price_pattern.findall(price or '')[0]
            except IndexError:
                print('获取price失败,请检查!')
                raise IndexError

            taobao_price = round(Decimal(taobao_price), 2)
            price = round(Decimal(price), 2)
            data['price'] = price
            data['taobao_price'] = taobao_price

            # Price, specification and stock of every sku.
            price_info_list = self.get_price_info_list(detail_name_list, body,
                                                       price, taobao_price)
            if price_info_list == '':
                raise ValueError('empty price_info_list')
            data['price_info_list'] = price_info_list

            # Sold out goods are flagged as deleted.
            all_stock = int(
                page.css('dl.detail dd label em::text').extract_first())
            data['is_delete'] = 1 if all_stock == 0 else 0

        except Exception as e:
            print('遇到错误: ', e)
            self.result_data = {}
            return {}

        self.result_data = data
        return data
Esempio n. 12
0
    def get_pintuan_goods_info(self):
        '''
        Collect the group-buy ("pintuan") goods of every mogujie category and
        hand them to ``self.deal_with_data``.

        The mobile list api needs a signature that could not be reproduced, so
        the PC search endpoint is read through PhantomJS instead. For each key
        of ``self.fcid_dict`` pages 1..99 are requested until a page without
        goods is returned.

        :return: None
        '''
        goods_list = []
        # The JSON answer arrives wrapped in a <pre> element.
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

        self.my_phantomjs = MyPhantomjs()
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            fcid = self.fcid_dict[key]
            for index in range(1, 100):
                if index % 5 == 0:
                    # Start a fresh browser every 5 pages (presumably to keep
                    # its memory use bounded - confirm).
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()

                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                # Plain requests got filtered (worked at first), hence PhantomJS.
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)

                try:
                    body = pre_pattern.findall(body)[0]
                    tmp_data = json.loads(body)
                except (IndexError, TypeError, ValueError):
                    # Only parsing problems are skipped; KeyboardInterrupt and
                    # the like must still be able to stop the crawl.
                    print('json.loads转换body时出错, 请检查')
                    continue

                tmp_item_list = tmp_data.get('result',
                                             {}).get('wall',
                                                     {}).get('docs', [])
                if tmp_item_list == []:
                    # No group-buy goods on this page: the category is done.
                    break

                begin_time_timestamp = int(time.time())  # group-buy start timestamp
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        self.timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        self.timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                goods_list.extend(item_list)

                sleep(MOGUJIE_SLEEP_TIME)

        # Store / process everything that was collected.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)
Esempio n. 13
0
 def __init__(self):
     '''
     Run the parent initialiser, set the headers and create the result
     holder and the PhantomJS wrapper.
     '''
     # Zero-argument form; equivalent to super(JuanPiParse, self) inside the class body.
     super().__init__()
     self._set_headers()
     self.result_data = dict()
     self.my_phantomjs = MyPhantomjs()
Esempio n. 14
0
 def __init__(self):
     '''
     Set the request headers and create the PhantomJS wrapper used by this object.
     '''
     self._set_headers()
     self.my_phantomjs = MyPhantomjs()
    async def get_goods_data(self, jumei_pintuan_url):
        '''
        Fetch and assemble the raw data of one Jumei group-buy goods.

        Two pages are read through PhantomJS: the ajaxDetail JSON endpoint and
        the mobile goods page (used for p_info and div_desc). The result is
        also stored in ``self.result_data``; on any failure both the return
        value and ``self.result_data`` are ``{}``.

        :param jumei_pintuan_url: group-buy url from which the goods id pair
            is extracted with ``self.get_goods_id_from_url``
        :return: dict with the goods data, or {} on failure
        '''
        goods_id = await self.get_goods_id_from_url(jumei_pintuan_url)
        if goods_id == []:
            self.result_data = {}
            return {}

        # Mobile address of the group-buy goods.
        goods_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
            goods_id[0], goods_id[1])
        self.msg = '------>>>| 对应手机端地址为: ' + goods_url
        self.my_lg.info(self.msg)

        # Address of the ajaxDetail request that carries the JSON data.
        tmp_url = 'https://s.h5.jumei.com/yiqituan/ajaxDetail?item_id={0}&type={1}'.format(
            str(goods_id[0]), goods_id[1])

        # requests got filtered and aiohttp was too slow, hence PhantomJS.
        my_phantomjs = MyPhantomjs()
        try:
            body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            try:
                # The JSON answer arrives wrapped in a <pre> element.
                body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
            except IndexError:
                body = ''
            tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=goods_url)
        finally:
            # Drop the browser wrapper even when a request raises.
            del my_phantomjs

        if body == '' or tmp_body == '':
            self.msg = '获取到的body为空str!' + ' 出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.result_data = {}
            return {}

        data = await self.json_2_dict(json_str=body)
        if data == {}:
            self.msg = '出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.result_data = {}
            return {}
        data = await self.wash_data(data=data)
        data = data.get('data', {})

        try:
            data['title'] = data.get('share_info', [])[1].get('text', '')
            data['title'] = re.compile(r'聚美').sub('', data['title'])
            if len(data.get('buy_alone', {})) == 1:
                data['sub_title'] = ''
            else:
                data['sub_title'] = data.get('buy_alone', {}).get('name', '')
                data['sub_title'] = re.compile(r'聚美').sub(
                    '', data['sub_title'])
            if data['title'] == '':
                self.my_lg.error('获取到的title为空值, 请检查!')
                raise Exception

            # shop_name
            if data.get('shop_info') == []:
                data['shop_name'] = ''
            else:
                data['shop_name'] = data.get('shop_info',
                                             {}).get('store_title', '')

            # All sample pictures.
            all_img_url = await self.get_all_img_url(data=data)
            data['all_img_url'] = all_img_url

            # p_info
            p_info = await self.get_p_info(body=tmp_body)
            data['p_info'] = p_info

            # div_desc
            div_desc = await self.get_div_desc(body=tmp_body)
            div_desc = await MyAiohttp.wash_html(div_desc)
            data['div_desc'] = div_desc

            # Shelf times come from the group-buy list api and are not read here.

            # detail_name_list
            detail_name_list = await self.get_detail_name_list(
                size_attr=data.get('buy_alone', {}).get('size_attr', []))
            data['detail_name_list'] = detail_name_list

            # Price and stock of every specification.
            true_sku_info = await self.get_true_sku_info(
                buy_alone_size=data.get('buy_alone', {}).get('size', []),
                size=data.get('size', []),
                group_single_price=data.get('group_single_price', ''))
            data['price_info_list'] = true_sku_info

            # is_delete
            product_status = data.get('product_status', '')
            is_delete = await self.get_is_delete(product_status=product_status,
                                                 true_sku_info=true_sku_info)
            data['is_delete'] = is_delete

            # all_sell_count
            sell_count_text = data.get('buyer_number_text', '')
            if sell_count_text != '':
                all_sell_count = re.compile(r'(\d+\.?\d*)').findall(
                    sell_count_text)[0]
                # The unit must be looked for in the ORIGINAL text: once the
                # digits are extracted the '万' (x10000) marker is gone.
                if '万' in sell_count_text:
                    # round() guards against float artefacts such as 2.3 * 10000.
                    all_sell_count = str(
                        int(round(float(all_sell_count) * 10000)))
            else:
                all_sell_count = '0'
            data['all_sell_count'] = all_sell_count

            data['goods_url'] = goods_url

        except Exception as e:
            self.msg = '遇到错误如下: ' + str(e) + ' 出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.my_lg.exception(e)
            self.result_data = {}  # reset so a later store does not pick up stale data
            return {}

        self.result_data = data
        return data
Esempio n. 16
0
    def run_forever(self):
        '''
        Run one refresh pass over all Mogujie group-buy (pintuan) rows in the database.

        Every row is indexed as item[0] = goods_id, item[1] = JSON text holding
        'begin_time' / 'end_time', item[2] = fcid and item[3] = page of the
        listing request.  Depending on self.is_recent_time(end_time):
            0     -> the row is deleted as expired
            2     -> the row is left untouched
            other -> the listing page is re-crawled and the row is updated

        After the pass the method sleeps 5.5 hours when the Shanghai-time hour
        is 0, otherwise 5 seconds.  When the goods_id query raises TypeError
        the method returns at once without sleeping.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_mogujie_pintuan_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            return

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        self.my_phantomjs = MyPhantomjs()
        for index, item in enumerate(result, start=1):
            pintuan_time = json.loads(item[1])
            # int() truncates the float returned by mktime; the former
            # str(...)[0:10] slice raised ValueError for anything that was
            # not a 10-digit timestamp.
            pintuan_end_time = int(
                time.mktime(
                    time.strptime(pintuan_time.get('end_time'),
                                  '%Y-%m-%d %H:%M:%S')))

            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Replace the PhantomJS wrapper every 8 rows.
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs()

            if index % 50 == 0:
                # Reconnect every 50 rows so one long-lived connection does
                # not end up unresponsive.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,此处跳过!')
            else:
                recent_state = self.is_recent_time(pintuan_end_time)
                if recent_state == 0:
                    tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(
                        goods_id=item[0])
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!' %
                        pintuan_time.get('begin_time'))
                elif recent_state == 2:
                    # Deliberately not a break: the rows returned by the
                    # database are not ordered by time.
                    pass
                else:
                    self._recrawl_pintuan_goods(item, index, tmp_sql_server,
                                                mogujie_pintuan)

            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    @staticmethod
    def _to_pintuan_price_info_list(raw_price_info_list):
        '''Rename each parsed sku dict to the key set stored for pintuan goods.'''
        return [{
            'spec_value': sku.get('spec_value'),
            'pintuan_price': sku.get('detail_price'),
            'normal_price': sku.get('normal_price'),
            'img_url': sku.get('img_url'),
            'rest_number': sku.get('rest_number'),
        } for sku in raw_price_info_list]

    def _recrawl_pintuan_goods(self, item, index, pipeline, parser):
        '''
        Re-crawl the listing page of one row and update or delete the row.

        :param item: database row (goods_id, pintuan_time_json, fcid, page)
        :param index: 1-based position of the row, only used for logging
        :param pipeline: database pipeline used for delete / update
        :param parser: MoGuJieParse instance used to fetch the goods details
        :return: None
        '''
        goods_id = item[0]
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
              (goods_id, index))

        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
            item[3], item[2])

        # Plain requests cannot fetch this URL (certificate verification is
        # involved), so the page is loaded through PhantomJS.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
            tmp_data = json.loads(body)
        except (IndexError, TypeError, ValueError):
            # No <pre> block, a non-string body or invalid JSON.
            print('json.loads转换body时出错, 请检查')
            tmp_data = {}

        docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
        if docs == []:
            # NOTE(review): a failed JSON parse also lands here and deletes
            # the row, exactly as before - confirm that this is intended.
            print('得到的docs为[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            pipeline.delete_mogujie_pintuan_expired_goods_id(goods_id=goods_id)
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        begin_time_timestamp = int(time.time())  # taken as the pintuan start
        item_list = [{
            'goods_id': doc.get('tradeItemId', ''),
            'pintuan_time': {
                'begin_time':
                self.timestamp_to_regulartime(timestamp=begin_time_timestamp),
                'end_time':
                self.timestamp_to_regulartime(
                    self.get_pintuan_end_time(begin_time_timestamp,
                                              doc.get('leftTimeOrg', ''))),
            },
            'all_sell_count': str(doc.get('salesVolume', 0)),
        } for doc in docs]

        listed = [
            entry for entry in item_list
            if entry.get('goods_id', '') == goods_id
        ]

        if not listed:
            # Gone from the listing but in fact still on sale: refresh the
            # goods details only and leave the stored pintuan times alone.
            parser.get_goods_data(goods_id=goods_id)
            goods_data = parser.deal_with_data()
            if not goods_data:
                return

            print('+++ 内部下架,其实还在售卖的商品更新')
            goods_data['price_info_list'] = self._to_pintuan_price_info_list(
                goods_data['price_info_list'])
            goods_data['goods_id'] = goods_id
            parser.update_mogujie_pintuan_table_2(data=goods_data,
                                                  pipeline=pipeline)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle
            return

        # Still listed: refresh details, pintuan times and the sales count.
        for entry in listed:
            parser.get_goods_data(goods_id=goods_id)
            goods_data = parser.deal_with_data()
            if not goods_data:
                continue

            goods_data['price_info_list'] = self._to_pintuan_price_info_list(
                goods_data['price_info_list'])
            goods_data['goods_id'] = goods_id
            goods_data['pintuan_time'] = entry.get('pintuan_time', {})
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                    pintuan_time=goods_data['pintuan_time'])
            goods_data['all_sell_count'] = entry.get('all_sell_count', '')

            parser.update_mogujie_pintuan_table(data=goods_data,
                                                pipeline=pipeline)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle
Esempio n. 17
0
    def deal_with_data(self, *params):
        '''
        Parse and store the flash-sale (miaosha) goods that are not in the database yet.

        For every goods dict the detail data is parsed; goods already stored,
        goods whose parse result is empty and goods flagged is_delete == 1 are
        skipped.  For the rest the mobile detail page is rendered with
        PhantomJS to read the remaining-time text, and the record is inserted.

        :param params: positional arguments; params[0] is the iterable of goods
                       dicts (keys read here: goods_id, sub_title, gender, page)
        :return: None
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if not my_pipeline.is_connect_success:
            print('数据库连接失败,此处跳过!')
        else:
            # A set keeps the "already stored?" test O(1) per item.
            db_goods_ids = {
                row[0] for row in
                my_pipeline.select_chuchujie_xianshimiaosha_all_goods_id()
            }
            for item in item_list:
                self._store_one_miaosha_goods(item, db_goods_ids, chuchujie,
                                              my_pipeline)

        del chuchujie
        gc.collect()

    def _store_one_miaosha_goods(self, item, db_goods_ids, parser, pipeline):
        '''
        Crawl a single goods item and insert it into the flash-sale table.

        :param item: goods dict taken from the listing
        :param db_goods_ids: set of goods_id values already stored
        :param parser: ChuChuJie_9_9_Parse instance
        :param pipeline: database pipeline used for the insert
        :return: None
        '''
        goods_id = item.get('goods_id', '')
        if goods_id in db_goods_ids:
            print('该goods_id已经存在于数据库中, 此处跳过')
            return

        tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
            goods_id)
        parser.get_goods_data(goods_id=goods_id)
        goods_data = parser.deal_with_data()

        if not goods_data:  # nothing could be parsed
            return
        if goods_data.get('is_delete', 0) == 1:  # flagged as out of stock
            print('------>>>| 该商品库存为0,已被抢光!')
            return

        # A fresh PhantomJS wrapper is used per item and dropped right after
        # the fetch, also when the fetch raises.
        my_phantomjs = MyPhantomjs()
        try:
            my_phantomjs.init_phantomjs()
            tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                url=tmp_url, css_selector='p#activityTime span')
        finally:
            del my_phantomjs
            gc.collect()

        if tmp_body == '':  # the mobile page could not be fetched
            sleep(.4)
            return

        _t = Selector(text=tmp_body).css(
            'p#activityTime span::text').extract_first()
        # extract_first() yields None when the selector matches nothing;
        # normalise to '' so the regex does not raise TypeError and the
        # emptiness check below is actually reachable.
        _t = re.compile(r'剩余').sub('', _t or '')
        if _t == '':
            # NOTE(review): processing continues with an empty countdown, as
            # before - confirm get_miaosha_end_time copes with ''.
            print('获取到的_t为空值, 严重错误! 请检查!')

        miaosha_end_time = self.get_miaosha_end_time(_t)

        goods_data['goods_url'] = tmp_url
        goods_data['goods_id'] = str(goods_id)
        goods_data['sub_title'] = item.get('sub_title', '')
        goods_data['miaosha_time'] = {
            'miaosha_begin_time':
            self.timestamp_to_regulartime(int(time.time())),
            'miaosha_end_time':
            self.timestamp_to_regulartime(int(miaosha_end_time)),
        }
        goods_data['miaosha_begin_time'], goods_data[
            'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=goods_data['miaosha_time'])
        goods_data['gender'] = str(item.get('gender', '0'))
        goods_data['page'] = item.get('page')

        # No extra sleep here: starting PhantomJS per item is slow enough.
        parser.insert_into_chuchujie_xianshimiaosha_table(data=goods_data,
                                                          pipeline=pipeline)
Esempio n. 18
0
 def __init__(self):
     '''
     Run self._set_headers(), start with an empty result_data dict and
     create the MyPhantomjs instance kept on self.my_phantomjs.
     '''
     self._set_headers()
     # Result of the most recent parse; starts out empty.
     self.result_data = {}
     # self.set_cookies_key_api_uid()  # set the api_uid value in the cookie
     self.my_phantomjs = MyPhantomjs()
    def run_forever(self):
        '''
        Run one refresh pass over all Jumeiyoupin flash-sale (miaosha) rows in the database.

        Every row is indexed as item[0] = goods_id, item[1] = JSON text holding
        'miaosha_end_time', item[2] = argument for
        self.get_one_page_all_goods_list and item[3] = value the goods id is
        extracted from.  Depending on self.is_recent_time(end_time):
            0     -> the row is deleted as expired
            2     -> the row is left untouched
            other -> the goods is re-crawled and the row is updated

        Afterwards (also when the goods_id query raised TypeError) the method
        sleeps 5.5 hours when the Shanghai-time hour is 0, otherwise 5 seconds.

        :return: False when no cookies could be obtained (no sleep happens in
                 that case), otherwise None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.
                          select_jumeiyoupin_xianshimiaosha_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            jumeiyoupin_spike = JuMeiYouPinSpike()
            # A short-lived PhantomJS session is only needed for the cookies.
            my_phantomjs = MyPhantomjs()
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
            del my_phantomjs
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)

            # enumerate keeps the counter moving for every row; the former
            # manual "index += 1" was skipped by the network-error continue,
            # which could pin index on a multiple of 50 and force a database
            # reconnect on each following row.
            for index, item in enumerate(result, start=1):
                miaosha_time = json.loads(item[1])
                # int() truncates the float returned by mktime; the former
                # str(...)[0:10] slice raised ValueError for anything that
                # was not a 10-digit timestamp.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(miaosha_time.get('miaosha_end_time'),
                                      '%Y-%m-%d %H:%M:%S')))

                # Created per row on purpose: a parser declared outside the
                # loop was found to hold on to a lot of memory.
                jumeiyoupin_miaosha = JuMeiYouPinParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection
                    # does not end up unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀结束时间为(%s), 删除成功!' %
                            miaosha_time.get('miaosha_end_time'))
                    elif recent_state == 2:
                        # Deliberately not a break: the rows returned by the
                        # database are not ordered by time.
                        pass
                    else:
                        self._refresh_miaosha_goods(item, index,
                                                    tmp_sql_server,
                                                    jumeiyoupin_miaosha,
                                                    jumeiyoupin_spike)

                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_miaosha_goods(self, item, index, pipeline, parser, spike):
        '''
        Re-crawl one flash-sale goods and update or delete its row.

        :param item: database row (see run_forever for the layout)
        :param index: 1-based position of the row, only used for logging
        :param pipeline: database pipeline used for delete / update
        :param parser: JuMeiYouPinParse instance used to fetch the details
        :param spike: JuMeiYouPinSpike instance used to derive begin/end times
        :return: None
        '''
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
              (item[0], index))

        this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])

        if this_page_all_goods_list == '网络错误!':
            print('网络错误!先跳过')
            return

        if this_page_all_goods_list == []:
            print('#### 该page对应得到的this_page_all_goods_list为空[]!')
            print('** 该商品已被下架限时秒杀活动, 此处将其删除')
            pipeline.delete_jumeiyoupin_miaosha_expired_goods_id(
                goods_id=item[0])
            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
            return

        # Goods are not delisted early on this site, so everything inside the
        # sale window is simply refreshed.
        tmp_r = parser.get_goods_id_from_url(item[3])
        parser.get_goods_data(goods_id=tmp_r)
        goods_data = parser.deal_with_data()
        if not goods_data:  # nothing could be parsed
            return

        schedule = goods_data['schedule']
        goods_data['goods_id'] = str(item[0])
        goods_data['miaosha_time'] = {
            'miaosha_begin_time': schedule.get('begin_time', ''),
            'miaosha_end_time': schedule.get('end_time', ''),
        }
        goods_data['miaosha_begin_time'], goods_data[
            'miaosha_end_time'] = spike.get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=goods_data['miaosha_time'])

        parser.update_jumeiyoupin_xianshimiaosha_table(data=goods_data,
                                                       pipeline=pipeline)
        sleep(JUMEIYOUPIN_SLEEP_TIME)
Esempio n. 20
0
 def __init__(self):
     super().__init__()
     self._set_headers()
     self.result_data = {}
     self.is_activity_goods = False
     self.my_phantomjs = MyPhantomjs()