def get_pintuan_goods_data(self, juanpi_pintuan, goods_id, all_sell_count, page):
        '''
        得到goods_data
        :param juanpi_pintuan:
        :param goods_id: 商品id
        :param page:
        :return: a dict
        '''
        tmp_url = 'http://shop.juanpi.com/deal/' + str(goods_id)
        goods_id = juanpi_pintuan.get_goods_id_from_url(tmp_url)

        juanpi_pintuan.get_goods_data(goods_id=goods_id)
        goods_data = juanpi_pintuan.deal_with_data()

        if goods_data == {}:  # 返回的data为空则跳过
            pass

        else:
            goods_data['goods_id'] = str(goods_id)
            goods_data['spider_url'] = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
            goods_data['username'] = '******'
            goods_data['all_sell_count'] = all_sell_count
            goods_data['page'] = page
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data.get('schedule', [])[0])

        gc.collect()
        return goods_data
Beispiel #2
0
    def _deal_with_data(self):
        '''
        处理并存储抓取到的拼团商品的数据
        :return:
        '''
        zid_list = self._get_pintuan_goods_info()

        zhe_800_pintuan = Zhe800PintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline._select_table(sql_str=z8_select_str_1))
            ]
            for item in zid_list:
                if item[0] in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(
                        item[0])
                    goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)

                    zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    goods_data = zhe_800_pintuan.deal_with_data()

                    if goods_data == {}:  # 返回的data为空则跳过
                        pass
                    else:  # 否则就解析并且插入
                        goods_data['goods_id'] = str(item[0])
                        goods_data['spider_url'] = tmp_url
                        goods_data['username'] = '******'
                        goods_data['page'] = str(item[1])
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data.get('schedule', [])[0])

                        # print(goods_data)
                        _r = zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # 插入就更新
                            db_goods_id_list.append(item[0])
                            db_goods_id_list = list(set(db_goods_id_list))

                    sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                    gc.collect()

        else:
            pass
        try:
            del zhe_800_pintuan
        except:
            pass
        gc.collect()

        return None
Beispiel #3
0
    async def insert_into_table(self, tmp_item, category, current_page,
                                my_pipeline, index):
        '''
        执行插入到淘宝天天特价的操作
        :param tmp_item:
        :param category:
        :param current_page:
        :param my_pipeline:
        :param index:
        :return: index 加1
        '''
        tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
            tmp_item.get('goods_id', ''))
        taobao = TaoBaoLoginAndParse(logger=self.my_lg)
        goods_id = taobao.get_goods_id_from_url(tmp_url)
        taobao.get_goods_data(goods_id=goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)

        if goods_data != {}:
            goods_data['goods_id'] = tmp_item.get('goods_id', '')
            goods_data['goods_url'] = tmp_url
            goods_data['schedule'] = [{
                'begin_time':
                tmp_item.get('start_time', ''),
                'end_time':
                tmp_item.get('end_time', ''),
            }]
            goods_data['tejia_begin_time'], goods_data[
                'tejia_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data.get('schedule', [])[0])
            goods_data['block_id'] = str(category)
            goods_data['tag_id'] = str(current_page)
            goods_data['father_sort'] = self.main_sort[category][0]
            goods_data['child_sort'] = ''
            # pprint(goods_data)

            await taobao.insert_into_taobao_tiantiantejia_table(
                data=goods_data, pipeline=my_pipeline)
        else:
            await asyncio.sleep(4)  # 否则休息4秒
            pass
        index += 1

        return index
Beispiel #4
0
    def _get_mia_pt_one_goods_info(self,
                                   mia_pt_obj,
                                   goods_id,
                                   sub_title='') -> dict:
        """
        获取mia单个goods info
        :return:
        """
        mia_pt_obj.get_goods_data(goods_id=goods_id)
        goods_data = mia_pt_obj.deal_with_data()
        assert goods_data != {}, 'goods_data不为空dict'

        goods_data['goods_id'] = str(goods_id)
        goods_data['sub_title'] = sub_title
        if goods_data['pintuan_time'] == {}:  # 当没有拼团时间时,就表示已下架拼团
            now_time = get_shanghai_time()
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = (now_time, now_time)
        else:
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])

        return goods_data
Beispiel #5
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        try:
            del self.driver
        except:
            pass
        gc.collect()

        pinduoduo = PinduoduoParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            self.db_goods_id_list = self._get_db_goods_id_list()
            for item in all_miaosha_goods_list:
                '''
                注意: 明日8点半抓取到的是页面加载中返回的是空值
                '''
                if item.get('goods_id') != 'None':    # 跳过goods_id为'None'
                    if item.get('goods_id', '') in self.db_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        pass
                    else:
                        tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + item.get('goods_id')
                        pinduoduo.get_goods_data(goods_id=item.get('goods_id'))
                        goods_data = pinduoduo.deal_with_data()

                        # print(goods_data)
                        if goods_data == {}:  # 返回的data为空则跳过
                            print('得到的goods_data为空值,此处先跳过,下次遍历再进行处理')
                            # sleep(3)
                            pass

                        else:  # 否则就解析并插入
                            goods_data['stock_info'] = item.get('stock_info')
                            goods_data['goods_id'] = item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['username'] = '******'
                            goods_data['price'] = item.get('price')  # 秒杀前的原特价
                            goods_data['taobao_price'] = item.get('taobao_price')  # 秒杀价
                            goods_data['sub_title'] = item.get('sub_title', '')
                            goods_data['miaosha_time'] = item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))

                            if item.get('stock_info', {}).get('activity_stock', 0) <= 2:
                                # 实时秒杀库存小于等于2时就标记为 已售罄
                                print('该秒杀商品已售罄...')
                                goods_data['is_delete'] = 1

                            pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        sleep(PINDUODUO_SLEEP_TIME)

                else:
                    print('该goods_id为"None", 此处跳过')
                    pass
            sleep(5)

        else:
            pass
        try:
            del pinduoduo
        except:
            pass
        gc.collect()
Beispiel #6
0
    def deal_with_data(self, *params):
        '''
        处理并存储相关秒杀商品数据
        :param params: 相关参数
        :return:
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=jm_select_str_2))
            db_goods_id_list = [item[0] for item in _]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    jumei = JuMeiYouPinParse()
                    goods_id = item.get('goods_id', '')
                    type = item.get('type', '')
                    tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(
                        goods_id, type)
                    jumei.get_goods_data(goods_id=[goods_id, type])
                    goods_data = jumei.deal_with_data()
                    if goods_data == {}:
                        pass

                    elif goods_data.get('is_delete', 0) == 1:
                        print('------>>>| 该商品库存为0,已被抢光!')
                        pass

                    else:  # 否则就解析并且插入
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['page'] = item.get('page')

                        # pprint(goods_data)
                        res = jumei.insert_into_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        if res:
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)

                        sleep(JUMEIYOUPIN_SLEEP_TIME
                              )  # 放慢速度   由于初始化用了phantomjs时间久,于是就不睡眠

                    try:
                        del jumei
                    except:
                        pass

        else:
            print('数据库连接失败,此处跳过!')
            pass

        gc.collect()
Beispiel #7
0
    def deal_with_data(self, *params):
        '''
        处理并存储相关秒杀商品数据
        :param params: 相关参数
        :return:
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=cc_select_str_2))
            db_goods_id_list = [item[0] for item in _]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    goods_id = item.get('goods_id', '')
                    tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                        goods_id)
                    chuchujie.get_goods_data(goods_id=goods_id)
                    goods_data = chuchujie.deal_with_data()
                    if goods_data == {}:  # 返回的data为空则跳过
                        sleep(.5)

                    elif goods_data.get('is_delete',
                                        0) == 1:  # is_delete=1(即库存为0)则跳过
                        print('------>>>| 该商品库存为0,已被抢光!')
                        sleep(.5)

                    else:  # 否则就解析并且插入
                        my_phantomjs = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=self.ip_pool_type)

                        # 获取剩余时间
                        tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url, css_selector='p#activityTime span')
                        # print(tmp_body)

                        try:
                            del my_phantomjs
                        except:
                            pass
                        gc.collect()

                        if tmp_body == '':  # 获取手机版的页面完整html失败
                            sleep(.5)
                            pass
                        else:
                            # p#activityTime span
                            _t = Selector(text=tmp_body).css(
                                'p#activityTime span::text').extract_first()
                            _t = re.compile(r'剩余').sub('', _t)
                            # print(_t)
                            if _t == '' or _t is None:
                                print('获取到的_t为空值, 严重错误! 请检查!')

                            miaosha_end_time = self.get_miaosha_end_time(_t)
                            goods_data['goods_url'] = tmp_url
                            goods_data['goods_id'] = str(goods_id)
                            goods_data['sub_title'] = item.get('sub_title', '')
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                timestamp_to_regulartime(int(time.time())),
                                'miaosha_end_time':
                                timestamp_to_regulartime(
                                    int(miaosha_end_time)),
                            }
                            goods_data['miaosha_begin_time'], goods_data[
                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])
                            goods_data['gender'] = str(item.get('gender', '0'))
                            goods_data['page'] = item.get('page')

                            res = chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                                data=goods_data, pipeline=my_pipeline)
                            if res:
                                if goods_id not in db_goods_id_list:
                                    db_goods_id_list.append(goods_id)

                            # sleep(CHUCHUJIE_SLEEP_TIME)  # 放慢速度   由于初始化用了phantomjs时间久,于是就不睡眠
                        # index += 1
        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del chuchujie
        except:
            pass
        gc.collect()
Beispiel #8
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

        for tab_id in tab_id_list:
            for index in range(0, 50):
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(index))
                print('待抓取的限时秒杀地址为: ', tmp_url)

                data = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
                if data == '': break

                try:
                    data = json.loads(data)
                    data = data.get('data', {})
                    # print(data)
                except:
                    break

                if data.get('goodslist') == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, index))
                    break
                else:
                    data = data.get('goodslist', [])
                    # print(data)
                    if data == []:
                        print('goodslist为[], 此处跳过')
                        pass
                    else:
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=data)
                        print(miaosha_goods_list)

                        juanpi = JuanPiParse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            if my_pipeline._select_table(
                                    sql_str=jp_select_str_5) is None:
                                db_goods_id_list = []
                            else:
                                db_goods_id_list = [
                                    item[0] for item in list(
                                        my_pipeline._select_table(
                                            sql_str=jp_select_str_5))
                                ]

                            for item in miaosha_goods_list:
                                if item.get('goods_id',
                                            '') in db_goods_id_list:
                                    print('该goods_id已经存在于数据库中, 此处跳过')
                                    pass
                                else:
                                    tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                                        'goods_id')
                                    juanpi.get_goods_data(
                                        goods_id=item.get('goods_id'))
                                    goods_data = juanpi.deal_with_data()

                                    if goods_data == {}:  # 返回的data为空则跳过
                                        pass
                                    else:  # 否则就解析并插入
                                        goods_data['stock_info'] = item.get(
                                            'stock_info')
                                        goods_data['goods_id'] = item.get(
                                            'goods_id')
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get(
                                            'price')  # 秒杀前的原特价
                                        goods_data['taobao_price'] = item.get(
                                            'taobao_price')  # 秒杀价
                                        goods_data['sub_title'] = item.get(
                                            'sub_title', '')
                                        goods_data['miaosha_time'] = item.get(
                                            'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item.get(
                                                        'miaosha_time'))
                                        goods_data['tab_id'] = tab_id
                                        goods_data['page'] = index

                                        # print(goods_data)
                                        juanpi.insert_into_juanpi_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=my_pipeline)
                                        sleep(.4)  # 短暂sleep下避免出错跳出
                            sleep(.65)
                        else:
                            pass
                        try:
                            del juanpi
                        except:
                            pass
                        gc.collect()
Beispiel #9
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(base_session_id=base_session_id)
            sleep(.5)
            if data.get('data', {}).get('blocks', []) == []:     # session_id不存在
                base_session_id += 2
                continue

            try:
                begin_times_timestamp = self._get_begin_times_timestamp(data)
            except Exception as e:
                print('遇到严重错误: ', e)
                base_session_id += 2
                continue

            print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
            is_recent_time = self.is_recent_time(timestamp=begin_times_timestamp)
            if not is_recent_time:  # 说明秒杀日期合法
                base_session_id += 2
                continue

            try:
                data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
            except Exception as e:
                print('遇到严重错误: ', e)
                base_session_id += 2
                continue
            # pprint(data)

            if data != []:  # 否则说明里面有数据
                miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                # pprint(miaosha_goods_list)

                zhe_800 = Zhe800Parse()
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                if my_pipeline.is_connect_success:
                    db_goods_id_list = self._get_db_goods_id_list(my_pipeline)
                    for item in miaosha_goods_list:
                        if item.get('zid', '') in db_goods_id_list:
                            print('该goods_id已经存在于数据库中, 此处跳过')
                            pass
                        else:
                            tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                            goods_id = zhe_800.get_goods_id_from_url(tmp_url)

                            zhe_800.get_goods_data(goods_id=goods_id)
                            goods_data = zhe_800.deal_with_data()
                            if goods_data == {}:    # 返回的data为空则跳过
                                pass
                            else:       # 否则就解析并且插入
                                goods_data['stock_info'] = item.get('stock_info')
                                goods_data['goods_id'] = str(item.get('zid'))
                                goods_data['spider_url'] = tmp_url
                                goods_data['username'] = '******'
                                goods_data['price'] = item.get('price')
                                goods_data['taobao_price'] = item.get('taobao_price')
                                goods_data['sub_title'] = item.get('sub_title')
                                # goods_data['is_baoyou'] = item.get('is_baoyou')
                                goods_data['miaosha_time'] = item.get('miaosha_time')
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                                goods_data['session_id'] = str(base_session_id)

                                # print(goods_data)
                                res = zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                                if res:
                                    if goods_id not in db_goods_id_list:
                                        db_goods_id_list.append(goods_id)

                                sleep(ZHE_800_SPIKE_SLEEP_TIME)   # 放慢速度

                    sleep(4)
                else:
                    pass
                try:
                    del zhe_800
                except:
                    pass
                gc.collect()

            else:       # 说明这个sessionid没有数据
                print('该sessionid没有相关key为jsons的数据')
                pass

            base_session_id += 2
Beispiel #10
0
    def deal_with_data(self, goods_list):
        '''
        处理并存储相关拼团商品的数据
        :param goods_list:
        :return:
        '''
        mia = MiaPintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline._select_table(sql_str=mia_select_str_1))
            ]
            # print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://www.mia.com/item-' + str(
                        goods_id) + '.html'

                    mia.get_goods_data(goods_id=str(goods_id))
                    goods_data = mia.deal_with_data()

                    if goods_data == {}:  # 返回的data为空则跳过
                        pass

                    else:  # 否则就解析并且插入
                        goods_url = goods_data['goods_url']
                        if re.compile(r'://m.miyabaobei.hk/').findall(
                                goods_url) != '':
                            goods_url = 'https://www.miyabaobei.hk/item-' + str(
                                goods_id) + '.html'
                        else:
                            goods_url = 'https://www.mia.com/item-' + str(
                                goods_id) + '.html'
                        goods_data['goods_url'] = goods_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['pintuan_time'])
                        goods_data['pid'] = item.get('pid')

                        # pprint(goods_data)
                        _r = mia.insert_into_mia_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # 更新
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))

                    sleep(MIA_SPIKE_SLEEP_TIME)  # 放慢速度
        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del mia
        except:
            pass
        gc.collect()
Beispiel #11
0
    def run_forever(self):
        '''
        这个实时更新的想法是只更新当天未来2小时的上架商品的信息,再未来信息价格(全为原价)暂不更新
        :return:
        '''
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=pd_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            pinduoduo_miaosha = PinduoduoParse()

            all_miaosha_goods_list = self.get_all_miaosha_goods_list()

            # 其中所有goods_id的list
            miaosha_goods_all_goods_id = [
                i.get('goods_id') for i in all_miaosha_goods_list
            ]
            # print(miaosha_goods_all_goods_id)

            for item in result:  # 实时更新数据
                # 对于拼多多先拿到该商品的结束时间点
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀结束时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_end_time'))

                    elif self.is_recent_time(miaosha_end_time) == 2:
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                            '''
                            表示其中没有了该goods_id
                            '''
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                  item[0])
                            pass

                        else:  # 未下架的
                            for item_1 in all_miaosha_goods_list:
                                if item_1.get('goods_id', '') == item[0]:
                                    # # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
                                    # pinduoduo_miaosha = PinduoduoParse()
                                    pinduoduo_miaosha.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = pinduoduo_miaosha.deal_with_data(
                                    )

                                    if goods_data == {}:  # 返回的data为空则跳过
                                        # sleep(3)
                                        pass
                                    else:  # 否则就解析并且插入
                                        goods_data['stock_info'] = item_1.get(
                                            'stock_info')
                                        goods_data['goods_id'] = item_1.get(
                                            'goods_id')
                                        if item_1.get('stock_info').get(
                                                'activity_stock') > 0:
                                            goods_data['price'] = item_1.get(
                                                'price')  # 秒杀前的原特价
                                            goods_data[
                                                'taobao_price'] = item_1.get(
                                                    'taobao_price')  # 秒杀价
                                        else:
                                            pass
                                        goods_data['sub_title'] = item_1.get(
                                            'sub_title', '')
                                        goods_data[
                                            'miaosha_time'] = item_1.get(
                                                'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item_1.get(
                                                        'miaosha_time'))

                                        if item_1.get('stock_info').get(
                                                'activity_stock') <= 1:
                                            # 实时秒杀库存小于等于1时就标记为 已售罄
                                            print('该秒杀商品已售罄...')
                                            goods_data['is_delete'] = 1

                                        # print(goods_data)
                                        pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                    sleep(PINDUODUO_SLEEP_TIME)
                                else:
                                    pass

                    index += 1
                    gc.collect()

                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(3)
        # del ali_1688
        gc.collect()
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mia_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mia_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:  # 实时更新数据
                pintuan_end_time = json_2_dict(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mia_pintuan = MiaPintuanParse()

                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('begin_time'))

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # 跳出循环
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]
                        # print('------>>>| 爬取到的数据为: ', data)

                        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                            item[2]) + '/0/'
                        # print(tmp_url)

                        body = Requests.get_url_body(
                            url=tmp_url,
                            headers=self.headers,
                            had_referer=True,
                            ip_pool_type=self.ip_pool_type)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            tmp_data = json_2_dict(json_str=body)
                            if tmp_data == {}:
                                print('json.loads转换body时出错, 此处跳过!')

                            if tmp_data.get('data_list', []) == []:
                                print('得到的data_list为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:
                                data_list = [{
                                    'goods_id':
                                    item_2.get('sku', ''),
                                    'sub_title':
                                    item_2.get('intro', ''),
                                } for item_2 in tmp_data.get('data_list', [])]
                                # pprint(data_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in data_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品 (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:  # 内部已经下架的
                                    # print('该商品已被下架限时秒杀活动,此处将其删除')
                                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                    # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                    # pass

                                    # 一律更新
                                    mia_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mia_pintuan.deal_with_data()

                                    if goods_data == {}:  # 返回的data为空则跳过
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        if goods_data[
                                                'pintuan_time'] == {}:  # 当没有拼团时间时,就表示已下架拼团(未让其正常更新进数据库, 我把拼团开始结束时间都设置为当前时间)
                                            now_time = get_shanghai_time()
                                            goods_data[
                                                'pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = (
                                                        now_time, now_time)
                                        else:
                                            goods_data[
                                                'pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=goods_data[
                                                            'pintuan_time'])

                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mia_pintuan.update_mia_pintuan_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MIA_SPIKE_SLEEP_TIME)  # 放慢速度

                                else:  # 未下架的
                                    for item_2 in data_list:
                                        if item_2.get('goods_id',
                                                      '') == item[0]:
                                            mia_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mia_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}:  # 返回的data为空则跳过
                                                pass
                                            else:
                                                goods_data['goods_id'] = str(
                                                    item[0])
                                                goods_data[
                                                    'sub_title'] = item_2.get(
                                                        'sub_title', '')
                                                if goods_data[
                                                        'pintuan_time'] == {}:  # 当没有拼团时间时,就表示已下架拼团
                                                    now_time = get_shanghai_time(
                                                    )
                                                    goods_data[
                                                        'pintuan_begin_time'], goods_data[
                                                            'pintuan_end_time'] = (
                                                                now_time,
                                                                now_time)
                                                else:
                                                    goods_data[
                                                        'pintuan_begin_time'], goods_data[
                                                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                                miaosha_time=
                                                                goods_data[
                                                                    'pintuan_time']
                                                            )

                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mia_pintuan.update_mia_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(MIA_SPIKE_SLEEP_TIME
                                                      )  # 放慢速度
                                        else:
                                            pass

                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            result = list(tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:  # 实时更新数据
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time,'%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
                mia_miaosha = MiaParse()
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        print('过期的goods_id为(%s)' % item[0], ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))

                    elif self.is_recent_time(miaosha_end_time) == 2:
                        # break       # 跳出循环
                        pass          # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:   # 返回1,表示在待更新区间内
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        data['goods_id'] = item[0]
                        # print('------>>>| 爬取到的数据为: ', data)

                        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(item[2])

                        body = Requests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True, ip_pool_type=self.ip_pool_type)
                        # print(body)

                        if body == '' or body == '[]':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                tmp_data = json.loads(body)
                            except:
                                tmp_data = {}
                                print('json.loads转换body时出错, 此处跳过!')

                            begin_time = tmp_data.get('p_info', {}).get('start_time', '')
                            end_time = tmp_data.get('p_info', {}).get('end_time', '')
                            begin_time = int(time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))     # 把str字符串类型转换为时间戳的形式
                            end_time = int(time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))
                            item_list = tmp_data.get('item_list', [])

                            # 该pid中现有的所有goods_id的list
                            miaosha_goods_all_goods_id = [item_1.get('item_id', '') for item_1 in item_list]

                            if item[0] not in miaosha_goods_all_goods_id:   # 内部已经下架的
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:   # 未下架的
                                for item_2 in item_list:
                                    if item_2.get('item_id', '') == item[0]:
                                        mia_miaosha.get_goods_data(goods_id=item[0])
                                        goods_data = mia_miaosha.deal_with_data()

                                        if goods_data == {}:    # 返回的data为空则跳过
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(item[0])
                                            goods_data['price'] = item_2.get('active_price')
                                            goods_data['taobao_price'] = item_2.get('active_price')
                                            goods_data['sub_title'] = item_2.get('short_info', '')
                                            goods_data['miaosha_time'] = {
                                                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                                                'miaosha_end_time': timestamp_to_regulartime(end_time),
                                            }
                                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])

                                            # pprint(goods_data)
                                            # print(goods_data)
                                            mia_miaosha.update_mia_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MIA_SPIKE_SLEEP_TIME)  # 放慢速度
                                    else:
                                        pass


                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
    async def _deal_with_all_goods_id(self):
        '''
        获取每个详细分类的商品信息
        :return: None
        '''
        _data = await self._get_all_goods_list()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        index = 1
        if my_pipeline.is_connect_success:
            self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
            sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
            db_ = list(my_pipeline._select_table(sql_str=sql_str))
            db_all_goods_id = [item[0] for item in db_]
            self.my_lg.info('获取完毕!!!')
            # self.my_lg.info(str(db_all_goods_id))

            for item in _data:
                miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))
                # self.my_lg.info(str(miaosha_goods_list))
                # pprint(miaosha_goods_list)

                for tmp_item in miaosha_goods_list:
                    if tmp_item.get('goods_id', '') in db_all_goods_id:    # 处理如果该goods_id已经存在于数据库中的情况
                        self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                        continue

                    if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                        self.my_lg.info('正在重置,并与数据库建立新连接中...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        # my_pipeline = SqlPools()
                        self.my_lg.info('与数据库的新连接成功建立...')

                    if my_pipeline.is_connect_success:
                        tmall = TmallParse(logger=self.my_lg)
                        tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                        goods_id = tmall.get_goods_id_from_url(tmp_url)

                        tmall.get_goods_data(goods_id=goods_id)
                        goods_data = tmall.deal_with_data()

                        if goods_data != {}:
                            # self.my_lg.info(str(tmp_item))
                            goods_data['goods_id'] = tmp_item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=tmp_item.get('miaosha_time'))
                            goods_data['page'] = tmp_item.get('page')
                            goods_data['spider_time'] = tmp_item.get('spider_time')

                            tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)

                        else:
                            await asyncio.sleep(5)

                        try: del tmall
                        except: pass
                        gc.collect()
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
            result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # 获取cookies
            my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
            try: del my_phantomjs
            except: pass
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)
            for item in result:     # 实时更新数据
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
                jumeiyoupin_miaosha = JuMeiYouPinParse()
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        print('过期的goods_id为(%s)' % item[0], ', 限时秒杀结束时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_end_time'))

                    elif self.is_recent_time(miaosha_end_time) == 2:
                        # break       # 跳出循环
                        pass          # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:   # 返回1,表示在待更新区间内
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        data['goods_id'] = item[0]

                        this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])

                        if this_page_all_goods_list == '网络错误!':
                            print('网络错误!先跳过')
                            continue

                        elif this_page_all_goods_list == []:
                            print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                            print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass

                        else:
                            """
                            由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
                            """
                            # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in this_page_all_goods_list]
                            #
                            # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                            #     print('该商品已被下架限时秒杀活动,此处将其删除')
                            #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            #     print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            #     pass
                            #
                            # else:  # 未下架的
                            tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                            jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                            goods_data = jumeiyoupin_miaosha.deal_with_data()

                            if goods_data == {}:  # 返回的data为空则跳过
                                pass
                            else:
                                goods_data['goods_id'] = str(item[0])
                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                    'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                                }
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])

                                # print(goods_data)
                                jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                sleep(JUMEIYOUPIN_SLEEP_TIME)

                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
    def run_forever(self):
        '''
        这个实时更新的想法是只更新当天前天未来14小时的上架商品的信息,再未来信息价格(全为原价)暂不更新
        :return:
        '''
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            result = list(
                tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            juanpi_miaosha = JuanPiParse()

            for item in result:  # 实时更新数据
                miaosha_begin_time = json.loads(
                    item[1]).get('miaosha_begin_time')
                miaosha_begin_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_begin_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_begin_time)

                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_begin_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str,
                            params=(item[0]),
                            lock_timeout=2000)
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_begin_time'))

                    elif self.is_recent_time(miaosha_begin_time) == 2:
                        # break       # 跳出循环
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                            str(item[2]),
                            str(item[3]),
                        )
                        # print('待爬取的tab_id, page地址为: ', tmp_url)

                        data = Requests.get_url_body(
                            url=tmp_url,
                            headers=self.headers,
                            ip_pool_type=self.ip_pool_type)
                        if data == '':
                            break

                        try:
                            data = json.loads(data)
                            data = data.get('data', {})
                            # print(data)
                        except:
                            break

                        if data.get('goodslist') == []:
                            print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.
                                  format(item[2], item[3]))
                            pass

                        else:
                            data = data.get('goodslist', [])
                            # print(data)
                            if data == []:
                                print('goodslist为[], 此处跳过')
                                pass
                            else:
                                miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                    data=data)
                                # print(miaosha_goods_list)

                                # 该tab_id, page中现有的所有goods_id的list
                                miaosha_goods_all_goods_id = [
                                    i.get('goods_id')
                                    for i in miaosha_goods_list
                                ]
                                # print(miaosha_goods_all_goods_id)

                                if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                                    '''
                                    表示该tab_id,page中没有了该goods_id
                                    '''
                                    tmp_sql_server._delete_table(
                                        sql_str=self.delete_sql_str,
                                        params=(item[0]))
                                    print(
                                        '该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                        item[0])
                                    pass

                                else:  # 未下架的
                                    for item_1 in miaosha_goods_list:
                                        if item_1.get('goods_id',
                                                      '') == item[0]:
                                            # # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
                                            # juanpi_miaosha = JuanPiParse()
                                            juanpi_miaosha.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = juanpi_miaosha.deal_with_data(
                                            )

                                            if goods_data == {}:  # 返回的data为空则跳过
                                                pass
                                            else:  # 否则就解析并且插入
                                                goods_data[
                                                    'stock_info'] = item_1.get(
                                                        'stock_info')
                                                goods_data[
                                                    'goods_id'] = item_1.get(
                                                        'goods_id')
                                                # goods_data['username'] = '******'
                                                if item_1.get(
                                                        'stock_info'
                                                ).get('activity_stock') > 0:
                                                    goods_data[
                                                        'price'] = item_1.get(
                                                            'price')  # 秒杀前的原特价
                                                    goods_data[
                                                        'taobao_price'] = item_1.get(
                                                            'taobao_price'
                                                        )  # 秒杀价
                                                else:
                                                    pass
                                                goods_data[
                                                    'sub_title'] = item_1.get(
                                                        'sub_title', '')
                                                goods_data[
                                                    'miaosha_time'] = item_1.get(
                                                        'miaosha_time')
                                                goods_data[
                                                    'miaosha_begin_time'], goods_data[
                                                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=item_1
                                                            .get('miaosha_time'
                                                                 ))

                                                juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)

                                                sleep(.3)  # 避免太快
                                        else:
                                            pass
                    if index % 10 == 0:  # 每过几个初始化一次,既能加快速度,又能优化内存
                        # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
                        juanpi_miaosha = JuanPiParse()
                        gc.collect()

                    index += 1
                    gc.collect()

                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            # sleep(5)
            pass
        gc.collect()
Beispiel #17
0
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            self.my_phantomjs = MyPhantomjs(
                executable_path=PHANTOMJS_DRIVER_PATH)
            for item in result:  # 实时更新数据
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)

                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('begin_time'))

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # 跳出循环
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests请求不到数据,涉及证书认证,直接用phantomjs
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # 开始拼团的时间戳
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # pprint(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    # print('该商品已被下架限时秒杀活动,此处将其删除')
                                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                    # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                    # pass
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # 规范化
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = item[0]
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        # pprint(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度

                                else:  # 未下架的
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == item[0]:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}: pass
                                            else:
                                                # 规范化
                                                goods_data['goods_id'] = item[
                                                    0]
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # 放慢速度

                                        else:
                                            pass

                else:
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
            if get_shanghai_time().hour == 0:  # 0点以后不更新
                sleep(60 * 60 * 5.5)
            else:
                sleep(5)
            gc.collect()
    def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        更新old goods info
        :param result:
        :return:
        '''
        index = 1
        for item in result:  # 实时更新数据
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # self.my_lg.info(str(miaosha_begin_time))

            data = {}
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            zhe_800_miaosha = Zhe800Parse()
            if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_begin_time) == 0:
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    self.my_lg.info('过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))

                elif self.is_recent_time(miaosha_begin_time) == 2:
                    # 可能包括过期的
                    self.my_lg.info('未来时间暂时不更新! {}'.format(timestamp_to_regulartime(miaosha_begin_time)))
                    pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                else:  # 返回1,表示在待更新区间内
                    print('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(item[0], index))
                    data['goods_id'] = item[0]

                    try:
                        tmp_data = self.zhe_800_spike._get_one_session_id_data(base_session_id=str(item[2]))
                    except Exception:
                        self.my_lg.error(msg='', exc_info=True)
                        continue

                    if tmp_data.get('data', {}).get('blocks', []) == []:  # session_id不存在
                        self.my_lg.info('该session_id不存在,此处跳过')
                        pass

                    else:
                        tmp_data = [item_s.get('deal', {}) for item_s in tmp_data.get('data', {}).get('blocks', [])]
                        # pprint(tmp_data)
                        if tmp_data != []:  # 否则说明里面有数据
                            try:
                                miaosha_goods_list = self.get_miaoshao_goods_info_list(data=tmp_data)
                            except ValueError:
                                sleep(2)
                                continue
                            # pprint(miaosha_goods_list)

                            # 该session_id中现有的所有zid的list
                            miaosha_goods_all_goods_id = [i.get('zid') for i in miaosha_goods_list]

                            if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                                self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                                self.my_lg.info('该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(item[0]))
                                pass

                            else:  # 未下架的
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('zid', '') == item[0]:
                                        zhe_800_miaosha.get_goods_data(goods_id=item[0])
                                        goods_data = zhe_800_miaosha.deal_with_data()

                                        if goods_data == {}:  # 返回的data为空则跳过
                                            pass
                                        else:  # 否则就解析并且插入
                                            goods_data['stock_info'] = item_1.get('stock_info')
                                            goods_data['goods_id'] = str(item_1.get('zid'))
                                            # goods_data['username'] = '******'
                                            if item_1.get('stock_info').get('activity_stock') > 0:
                                                goods_data['price'] = item_1.get('price')
                                                goods_data['taobao_price'] = item_1.get('taobao_price')
                                            else:
                                                pass
                                            goods_data['sub_title'] = item_1.get('sub_title')
                                            goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item_1.get('miaosha_time'))

                                            if goods_data.get('is_delete', 0) == 1:
                                                self.my_lg.info('该商品[{0}]已售罄...'.format(item[0]))

                                            # self.my_lg.info(str(goods_data['stock_info']))
                                            # self.my_lg.info(str(goods_data['miaosha_time']))
                                            zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                    else:
                                        pass

                        else:  # 说明这个sessionid没有数据, 就删除对应这个sessionid的限时秒杀商品
                            self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                            self.my_lg.info('该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))
                            pass

            else:  # 表示返回的data值为空值
                self.my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            # try:
            #     del tmall
            # except:
            #     pass
            sleep(1.2)
            gc.collect()
        self.my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        gc.collect()

        return
Beispiel #19
0
    def get_spike_hour_goods_info(self):
        '''
        模拟构造得到data的url,得到近期所有的限时秒杀商品信息
        :return:
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(
                base_session_id=base_session_id)
            sleep(.3)

            if data.get('data', {}).get('blocks', []) == []:  # session_id不存在
                pass

            else:  # 否则session_id存在
                try:
                    _ = str(
                        data.get('data',
                                 {}).get('blocks',
                                         [])[0].get('deal',
                                                    {}).get('begin_time',
                                                            ''))[:10]
                    if _ != '':
                        pass
                    elif data.get('data', {}).get('blocks', [])[0].get(
                            'showcase', {}) != {}:  # 未来时间
                        print('*** 未来时间 ***')
                        # pprint(data.get('data', {}))
                        _ = str(
                            data.get('data', {}).get('blocks', [])[1].get(
                                'deal', {}).get('begin_time', ''))[:10]
                    else:
                        raise Exception
                    begin_times_timestamp = int(
                        _)  # 将如 "2017-09-28 10:00:00"的时间字符串转化为时间戳,然后再将时间戳取整

                except Exception as e:
                    print('遇到严重错误: ', e)
                    base_session_id += 2
                    continue

                print('秒杀时间为: ',
                      timestamp_to_regulartime(begin_times_timestamp))

                if self.is_recent_time(
                        timestamp=begin_times_timestamp):  # 说明秒杀日期合法
                    try:
                        data = [
                            item_s.get('deal', {}) for item_s in data.get(
                                'data', {}).get('blocks', [])
                        ]
                    except Exception as e:
                        print('遇到严重错误: ', e)
                        base_session_id += 2
                        continue
                    # pprint(data)

                    if data != []:  # 否则说明里面有数据
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=data)
                        # pprint(miaosha_goods_list)

                        zhe_800 = Zhe800Parse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
                            db_goods_id_list = [
                                item[0] for item in list(
                                    my_pipeline._select_table(sql_str=sql_str))
                            ]
                            for item in miaosha_goods_list:
                                if item.get('zid', '') in db_goods_id_list:
                                    print('该goods_id已经存在于数据库中, 此处跳过')
                                    pass
                                else:
                                    tmp_url = 'https://shop.zhe800.com/products/' + str(
                                        item.get('zid', ''))
                                    goods_id = zhe_800.get_goods_id_from_url(
                                        tmp_url)

                                    zhe_800.get_goods_data(goods_id=goods_id)
                                    goods_data = zhe_800.deal_with_data()

                                    if goods_data == {}:  # 返回的data为空则跳过
                                        pass
                                    else:  # 否则就解析并且插入
                                        goods_data['stock_info'] = item.get(
                                            'stock_info')
                                        goods_data['goods_id'] = str(
                                            item.get('zid'))
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get('price')
                                        goods_data['taobao_price'] = item.get(
                                            'taobao_price')
                                        goods_data['sub_title'] = item.get(
                                            'sub_title')
                                        # goods_data['is_baoyou'] = item.get('is_baoyou')
                                        goods_data['miaosha_time'] = item.get(
                                            'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item.get(
                                                        'miaosha_time'))
                                        goods_data['session_id'] = str(
                                            base_session_id)
                                        # print(goods_data['miaosha_time'])

                                        # print(goods_data)
                                        zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=my_pipeline)
                                        sleep(ZHE_800_SPIKE_SLEEP_TIME)  # 放慢速度

                            # sleep(2)
                        else:
                            pass
                        try:
                            del zhe_800
                        except:
                            pass
                        gc.collect()

                    else:  # 说明这个sessionid没有数据
                        print('该sessionid没有相关key为jsons的数据')
                        # return {}
                        pass
                else:
                    pass

            base_session_id += 2
Beispiel #20
0
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:  # 实时更新数据
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
                mogujie_miaosha = MoGuJieMiaoShaParse()
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0], ))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_begin_time'))

                    elif self.is_recent_time(miaosha_end_time) == 2:
                        # break       # 跳出循环
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        item_list = self.get_item_list(event_time=str(item[2]))
                        if item_list == '':
                            # 可能网络状况导致, 先跳过
                            pass

                        elif item_list == []:
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass

                        else:
                            # 该event_time中现有的所有goods_id的list
                            miaosha_goods_all_goods_id = [
                                item_1.get('iid', '') for item_1 in item_list
                            ]

                            if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                                print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                tmp_sql_server._update_table(
                                    sql_str=mg_update_str_1,
                                    params=(item[0], ))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:  # 未下架的
                                for item_2 in item_list:
                                    if item_2.get('iid', '') == item[0]:
                                        spider_url = item[3]
                                        mogujie_miaosha.get_goods_data(
                                            goods_id=spider_url)
                                        goods_data = mogujie_miaosha.deal_with_data(
                                        )

                                        if goods_data == {}:  # 返回的data为空则跳过
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(
                                                item[0])

                                            # price设置为原价
                                            try:
                                                tmp_price_list = sorted([
                                                    round(
                                                        float(
                                                            item_4.get(
                                                                'normal_price',
                                                                '')), 2)
                                                    for item_4 in goods_data[
                                                        'price_info_list']
                                                ])
                                                price = Decimal(
                                                    tmp_price_list[-1]
                                                ).__round__(2)  # 商品原价
                                                goods_data['price'] = price
                                            except:
                                                print('设置price为原价时出错!请检查')
                                                continue

                                            goods_data['miaosha_time'] = {
                                                'miaosha_begin_time':
                                                timestamp_to_regulartime(
                                                    int(
                                                        item_2.get(
                                                            'startTime', 0))),
                                                'miaosha_end_time':
                                                timestamp_to_regulartime(
                                                    int(
                                                        item_2.get(
                                                            'endTime', 0))),
                                            }
                                            goods_data[
                                                'miaosha_begin_time'], goods_data[
                                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=goods_data[
                                                            'miaosha_time'])
                                            # print(goods_data['title'])

                                            # pprint(goods_data)
                                            # print(goods_data)
                                            mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度
                                    else:
                                        pass

                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Beispiel #21
0
    def deal_with_data(self, *params):
        '''
        处理并存储相关拼团商品的数据
        :param params: 待传参数
        :return:
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=mg_select_str_1))
            db_goods_id_list = [item[0] for item in _]
            print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(
                        goods_id)

                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()

                    if goods_data == {}:  # 返回的data为空则跳过
                        pass

                    else:  # 否则就解析并且插入
                        # 规范化
                        goods_data[
                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                goods_data['price_info_list'])
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get(
                            'pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get(
                            'all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))

                        # pprint(goods_data)
                        # print(goods_data)
                        _r = mogujie.insert_into_mogujie_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # 更新
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)

                        sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度

        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del mogujie
        except:
            pass
        gc.collect()
Beispiel #22
0
    def deal_with_data(self, *param):
        '''
        处理并存储相关秒杀商品的数据
        :param param: 相关参数
        :return:
        '''
        pid = param[0]
        begin_time = int(time.mktime(time.strptime(param[1], '%Y/%m/%d %H:%M:%S')))     # 把str字符串类型转换为时间戳的形式
        end_time = int(time.mktime(time.strptime(param[2], '%Y/%m/%d %H:%M:%S')))
        item_list = param[3]

        mia = MiaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=mia_select_str_4))]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('item_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass

                else:
                    goods_id = str(item.get('item_id', ''))
                    tmp_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'

                    mia.get_goods_data(goods_id=str(goods_id))
                    goods_data = mia.deal_with_data()

                    if goods_data == {}:  # 返回的data为空则跳过
                        pass

                    else:  # 否则就解析并且插入
                        goods_url = goods_data['goods_url']
                        if re.compile(r'://m.miyabaobei.hk/').findall(goods_url) != '':
                            goods_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
                        else:
                            goods_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
                        goods_data['goods_url'] = goods_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['price'] = item.get('active_price')
                        goods_data['taobao_price'] = item.get('active_price')       # 秒杀最低价
                        goods_data['sub_title'] = item.get('short_info', '')
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                            'miaosha_end_time': timestamp_to_regulartime(end_time),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                        goods_data['pid'] = str(pid)

                        # pprint(goods_data)
                        # print(goods_data)
                        mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # 放慢速度
        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del mia
        except:
            pass
        gc.collect()
Beispiel #23
0
    def deal_with_data(self, *param):
        '''
        处理并存储相关秒杀商品的数据
        :param param: 相关参数
        :return:
        '''
        print(60 * '*')
        event_time = param[0]
        item_list = param[1]
        print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t',
              '对应时间戳为: ', event_time)
        print(60 * '*')

        mogujie = MoGuJieMiaoShaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=mg_select_str_4))
            db_goods_id_list = [item[0] for item in _]
            for item in item_list:
                goods_id = str(item.get('iid', ''))
                if goods_id in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    tmp_url = item.get('link', '')
                    # print(tmp_url)
                    try:
                        object_id = re.compile('objectId=(\w+)').findall(
                            tmp_url)[0]
                    except IndexError:  # 表示匹配到的地址不是秒杀商品的地址
                        print('+++++++ 这个url不是秒杀的url: ', tmp_url)
                        continue
                    tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(
                        goods_id, object_id)
                    tmp_ = mogujie.get_goods_id_from_url(tmp_url)
                    mogujie.get_goods_data(goods_id=tmp_)
                    goods_data = mogujie.deal_with_data()
                    if goods_data == {}:  # 返回的data为空则跳过
                        pass
                    else:  # 否则就解析并且插入
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)

                        # price设置为原价
                        try:
                            tmp_price_list = sorted([
                                round(float(item_4.get('normal_price', '')), 2)
                                for item_4 in goods_data['price_info_list']
                            ])
                            price = Decimal(tmp_price_list[-1]).__round__(
                                2)  # 商品原价
                            goods_data['price'] = price
                        except:
                            print('设置price为原价时出错!请检查')
                            sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度
                            continue

                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            timestamp_to_regulartime(
                                int(item.get('startTime', 0))),
                            'miaosha_end_time':
                            timestamp_to_regulartime(
                                int(item.get('endTime', 0))),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['event_time'] = str(event_time)
                        # pprint(goods_data)
                        # print(goods_data)
                        res = mogujie.insert_into_mogujie_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        if res:
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)

                    sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度

        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del mogujie
        except:
            pass
        gc.collect()