Example #1
0
    def get_miaoshao_goods_info_list(self, data):
        '''
        Pick the useful fields out of every raw flash-sale (miaosha) item.

        :param data: iterable of raw item dicts to parse
        :return: list with one dict per item holding miaosha_time,
                 goods_id, stock_info, price and taobao_price
        '''
        def _as_price(raw):
            # Prices are stored as floats rounded to two decimals.
            return round(float(raw), 2)

        parsed_goods = []
        for goods in data:
            time_window = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(goods.get('start_time'))),
                'miaosha_end_time':
                timestamp_to_regulartime(int(goods.get('end_time'))),
            }
            total_stock = goods.get('stock', 0)
            # Flash-sale stock is `rate` percent of the total stock.
            reserved_stock = int(total_stock * (goods.get('rate', 0) / 100))

            parsed_goods.append({
                'miaosha_time': time_window,
                'goods_id': goods.get('goods_id'),
                'stock_info': {
                    'activity_stock': reserved_stock,
                    'stock': total_stock,
                },
                # Original price, then the flash-sale price.
                'price': _as_price(goods.get('oprice', '0')),
                'taobao_price': _as_price(goods.get('cprice', '0')),
            })

        return parsed_goods
Example #2
0
    def get_miaoshao_goods_info_list(self, data):
        '''
        Pick the useful fields out of every raw flash-sale (miaosha) item.

        Items whose start hour is not listed in
        PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST are dropped.

        :param data: iterable of raw item dicts; the payload sits under 'data'
        :return: list with one dict per kept item holding miaosha_time,
                 goods_id, stock_info, price and taobao_price
        '''
        kept_goods = []
        for entry in data:
            detail = entry.get('data', {})
            start_ts = int(detail.get('start_time'))
            begin_str = str(timestamp_to_regulartime(start_ts))
            # Hour part of the formatted begin time (characters -8..-6).
            start_hour = begin_str[-8:-6]
            if start_hour not in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
                continue

            # Sessions opening at one of the "begin" hours last 30 minutes,
            # every other session lasts one hour.
            if start_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
                duration = 60 * 30
            else:
                duration = 60 * 60
            end_str = str(timestamp_to_regulartime(start_ts + duration))

            total_quantity = detail.get('all_quantity', 0)
            remaining = int(total_quantity - detail.get('sold_quantity', 0))

            kept_goods.append({
                'miaosha_time': {
                    'miaosha_begin_time': begin_str,
                    'miaosha_end_time': end_str,
                },
                'goods_id': str(detail.get('goods_id')),
                'stock_info': {
                    'activity_stock': remaining,
                    'stock': total_quantity,
                },
                # Raw prices are divided by 100 before rounding.
                'price': round(
                    float(detail.get('normal_price', '0')) / 100, 2),
                'taobao_price': round(
                    float(detail.get('price', '0')) / 100, 2),
            })

        return kept_goods
Example #3
0
    def _get_pintuan_goods_info(self):
        '''
        Crawl the paginated group-buy (pintuan) listing and collect every
        distinct goods entry.

        Pages 0..99 are requested in order; crawling stops at the first page
        whose goods list is empty or cannot be parsed.

        :return: list of dicts with keys goods_id, begin_time, end_time,
                 all_sell_count and page; one entry per goods_id, the first
                 occurrence wins
        '''
        pintuan_goods_id_list = []
        # goods_ids already collected, for O(1) duplicate detection.
        seen_goods_ids = set()

        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '':
                body = '{}'
            try:
                tmp_data = json.loads(body).get('data', {}).get('goods', [])
            except Exception:
                # Invalid JSON or an unexpected payload shape: treat the page
                # as empty so the crawl stops below.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            sleep(.5)

            # Also covers a null goods value, which would not be iterable.
            if not tmp_data:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                break

            page_goods = [{
                'goods_id':
                item.get('goods_id', ''),
                'begin_time':
                timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time':
                timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count':
                str(item.get('join_number_int', '')),
                'page':
                page,
            } for item in tmp_data]

            for goods in page_goods:
                goods_id = goods.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(goods)

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Example #4
0
    def _get_pintuan_goods_info(self):
        '''
        Crawl the paginated group-buy (pintuan) listing and collect every
        distinct goods entry.

        Pages 0..99 are requested in order; crawling stops at the first page
        whose body is empty or whose goods list is empty.

        :return: list of dicts with keys goods_id, begin_time, end_time,
                 all_sell_count and page; one entry per goods_id, the first
                 occurrence wins
        '''
        pintuan_goods_id_list = []
        # goods_ids already collected, for O(1) duplicate detection.
        seen_goods_ids = set()

        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            # Explicit checks rather than `assert`: assertions are stripped
            # under `python -O`, which would keep the loop from ever stopping.
            stop_reason = None
            tmp_data = []
            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         high_conceal=True,
                                         ip_pool_type=self.ip_pool_type)
            if body == '':
                stop_reason = 'body为空值!'
            else:
                tmp_data = json_2_dict(json_str=body, default_res={}).get(
                    'data', {}).get('goods', [])
                if not tmp_data:
                    stop_reason = '该tmp_url得到的goods为空list, 此处跳过!'

            if stop_reason is not None:
                print(stop_reason)
                sleep(.5)
                break
            sleep(.5)

            page_goods = [{
                'goods_id':
                item.get('goods_id', ''),
                'begin_time':
                timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time':
                timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count':
                str(item.get('join_number_int', '')),
                'page':
                page,
            } for item in tmp_data]

            for goods in page_goods:
                goods_id = goods.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(goods)

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
    def _deal_with_data_net_worth_trend(self, **kwargs):
        '''
        处理data_net_worth_trend(单位净值走势), 并成像
        :param fund_name:
        :param fund_code:
        :param data_net_worth_trend:
        :return:
        '''
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        data_net_worth_trend = kwargs.get('data_net_worth_trend', [])

        [
            item.update(
                {'x': str(timestamp_to_regulartime(str(item.get('x'))[:10]))})
            for item in data_net_worth_trend
        ]
        print('时间格式转换成功!')
        # pprint(data_net_worth_trend)

        x = [item.get('x') for item in data_net_worth_trend]
        y = [item.get('y') for item in data_net_worth_trend]
        '''绘图'''
        self.plot_pic = self._drawing(fund_name=fund_name,
                                      fund_code=fund_code,
                                      x=x,
                                      y=y)

        try:
            del self.plot_pic
        except:
            pass
        gc.collect()

        return True
Example #6
0
    def _verification(self, req_params: dict) -> dict:
        '''
        校验请求是否有效
        :param req_params: 请求的所有查询参数(公共参数和私有参数)
        :return:
        '''
        res = dict(msg=None, success=False)
        # pprint(req_params)
        try:
            req_version = req_params["v"]
            req_timestamp = req_params["t"]
            req_accesskey_id = req_params["access_key_id"]
            req_signature = req_params["sign"]
        except KeyError as e:
            res.update(msg="Invalid public params")
        except Exception as e:
            res.update(msg="Unknown server error")
        else:
            # NO.1 校验版本
            if req_version == self._version:
                # NO.2 校验时间戳
                if self._check_req_timestamp(req_timestamp):
                    # NO.3 校验accesskey_id
                    if self._check_req_accesskey_id(req_accesskey_id):
                        # NO.4 校验签名
                        if req_signature == self._sign(req_params):
                            res.update(msg="Verification pass", success=True)

                        else:
                            res.update(msg="Invalid query string")
                    else:
                        res.update(msg="Invalid access_key_id")
                else:
                    self.my_lg.error(
                        '当前系统时间戳为: {0}[{1}], 而请求的时间戳为: {2}[{3}]'.format(
                            self.now_timestamp,
                            str(timestamp_to_regulartime(self.now_timestamp)),
                            req_timestamp,
                            str(timestamp_to_regulartime(req_timestamp))))
                    res.update(msg="Invalid timestamp")
            else:
                res.update(msg="Invalid version")

        return res
Example #7
0
    def get_miaoshao_goods_info_list(self, data):
        '''
        Pick the useful fields out of every raw flash-sale (miaosha) item.

        Items whose begin/end time cannot be converted to an int are skipped.

        :param data: iterable of raw item dicts to parse
        :return: list with one dict per kept item holding miaosha_time, zid,
                 stock_info, price, taobao_price and sub_title
        '''
        parsed_goods = []
        for deal in data:
            # Begin/end of the sale; only the first 10 characters of the raw
            # values are used.
            try:
                begin = timestamp_to_regulartime(
                    int(str(deal.get('begin_time'))[:10]))
                end = timestamp_to_regulartime(
                    int(str(deal.get('end_time'))[:10]))
            except ValueError:
                continue

            parsed_goods.append({
                'miaosha_time': {
                    'miaosha_begin_time': begin,
                    'miaosha_end_time': end,
                },
                # zhe800 goods id
                'zid': deal.get('zid'),
                # activity_stock: units left in the sale; stock: total units
                'stock_info': {
                    'activity_stock': deal.get('activity_stock', 0),
                    'stock': deal.get('stock', 0),
                },
                # list price, then the flash-sale price (both floats)
                'price': float(deal.get('list_price')),
                'taobao_price': float(deal.get('price')),
                'sub_title': deal.get('description', ''),
            })

        return parsed_goods
Example #8
0
    def _get_goods_schedule(self, data):
        '''
        获取商品销售时间段
        :param data:
        :return:
        '''
        # print(data.get('skudata', {}).get('info', {}))
        # print(data.get('skudata', {}))
        begin_time = data.get('skudata', {}).get('info', {}).get(
            'start_time')  # 取这个时间段才是正确的销售时间, 之前baseInfo是虚假的
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        if begin_time is None or end_time is None:
            schedule = []
        else:
            schedule = [{
                'begin_time': timestamp_to_regulartime(begin_time),
                'end_time': timestamp_to_regulartime(end_time),
            }]

        return schedule
Example #9
0
 def json_to_dict(self, tmp_data):
     '''
     Parse a JSON document and return its 'items' list.

     :param tmp_data: JSON text whose top level is expected to be an object
     :return: the value stored under 'items' ([] when the key is absent);
              [] as well when the text is not valid JSON or the top level
              is not an object
     '''
     try:
         return json.loads(tmp_data).get('items', [])
     except (TypeError, ValueError, AttributeError):
         # TypeError: input is not str/bytes; ValueError: invalid JSON;
         # AttributeError: top-level value has no .get (e.g. a list).
         print('json.loads转换data的时候出错,data为空')
         return []
Example #10
0
    def _get_sell_time(self, data):
        '''
        得到上下架时间
        :param data:
        :return:
        '''
        try:
            left_time = data.get('gradientPrice', {}).get('leftTime', 0)
        except AttributeError:  # gradientPrice的值可能为''
            return {}

        if left_time == 0:
            return {}

        now_time_timestamp = datetime_to_timestamp(get_shanghai_time())
        sell_time = {
            'begin_time': timestamp_to_regulartime(now_time_timestamp),
            'end_time': timestamp_to_regulartime(now_time_timestamp + left_time),
        }

        return sell_time
Example #11
0
    def _get_sell_time(self, data):
        '''
        得到上下架时间点
        :param data:
        :return:
        '''
        if data.get('temaiActivityInfo') is None:
            return {}

        start_time = data.get('temaiActivityInfo', {}).get('startTime', 0)
        rest_time = data.get('temaiActivityInfo', {}).get('nowToEndMs', 0)
        if start_time == 0:
            return {}
        else:
            start_time = int(str(start_time)[:10])
            end_time = start_time + rest_time

            return {
                'begin_time': timestamp_to_regulartime(start_time),
                'end_time': timestamp_to_regulartime(end_time),
            }
Example #12
0
    def get_sell_time(self, begin_time, end_time):
        '''
        Build the on-shelf / off-shelf time window of a goods item.

        (Original author's note: jumei goods that go live at 10:00 are
        normally on sale for 24 hours.)

        :param begin_time: int timestamp
        :param end_time: timestamp, converted with int()
        :return: dict with 'begin_time' and 'end_time'
        :raises ValueError: begin_time is None
        :raises TypeError: begin_time is not an int
        '''
        if begin_time is None:
            msg = '获取到该商品的begin_time是None'
            print(msg)
            raise ValueError(msg)

        if not isinstance(begin_time, int):
            msg = '获取该商品的begin_time类型错误, 请检查!'
            print(msg)
            raise TypeError(msg)

        return {
            'begin_time': timestamp_to_regulartime(int(begin_time)),
            'end_time': timestamp_to_regulartime(int(end_time)),
        }
Example #13
0
    async def get_tiantiantejia_goods_list(self, data):
        '''
        Convert raw data into the goods list that is needed.

        The conversion is all-or-nothing: if any item fails, the exception is
        logged through self.my_lg and an empty list is returned.

        :param data: list of raw item dicts; the fields live under 'baseinfo'
        :return: list of dicts with 'goods_id', 'start_time' and 'end_time'
        '''
        if data == []:
            return []

        def _build_entry(raw):
            # Only the first 10 characters of ostime / oetime are used.
            goods_id = raw.get('baseinfo', {}).get('itemId', '')
            begin = timestamp_to_regulartime(
                int(raw.get('baseinfo', {}).get('ostime', '')[0:10]))
            end = timestamp_to_regulartime(
                int(raw.get('baseinfo', {}).get('oetime', '')[0:10]))
            return {
                'goods_id': goods_id,
                'start_time': begin,
                'end_time': end,
            }

        try:
            return [_build_entry(raw) for raw in data]
        except Exception as e:
            self.my_lg.exception(e)
            return []
Example #14
0
    def request(self):
        """Test case: send one signed request to the goods API and print the reply.

        :return: the response object returned by requests.get
        """
        from base64 import b64encode

        # Sample goods link (tmall). A link carrying '&' gets encoded into
        # the signed payload, hence the base64 step below.
        # Other samples:
        #   tb: 'https://h5.m.taobao.com/awp/core/detail.htm?id=551047454198'
        #   jd: 'https://item.m.jd.com/ware/view.action?wareId=3713001'
        #   jd: 'https://item.jd.com/5025518.html'
        goods_link = 'https://detail.tmall.hk/hk/item.htm?spm=a1z10.5-b-s.w4011-16816054130.101.3e6227dfLIwIrR&id=555709593338&rn=2563b85d76e776e4dd26a13103df62bd&abbucket=6'

        now_timestamp = get_current_timestamp() - 5
        readable_time = str(timestamp_to_regulartime(now_timestamp))
        print('请求时间戳为: {0}[{1}]'.format(now_timestamp, readable_time))

        # Pass a str, not bytes.
        encoded_link = b64encode(s=goods_link.encode('utf-8')).decode('utf-8')
        params = dict(
            access_key_id=self._access_key_id,
            version=self._version,
            timestamp=now_timestamp,
            goods_link=encoded_link,
        )
        # The signature is computed over the params collected so far.
        params['sign'] = self._sign(params)

        # Alternative endpoints:
        #   'http://127.0.0.1:5000/api/goods'
        #   'http://spider.other.k85u.com/api/goods'
        url = 'http://spider.taobao_tmall.k85u.com/api/goods'

        result = requests.get(url, params=params)
        print(result.text)

        return result
    def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Refresh the stored zhe800 flash-sale goods rows one by one.

        For every row the begin time stored in item[1] is classified with
        self.is_recent_time:
          * 0 -> the row is deleted with self.delete_sql_str
          * 2 -> only a log line is written, no update
          * anything else (the original comment says 1) -> the session data
            is fetched again and the row is updated, or logically deleted
            when the goods is no longer part of the session

        :param tmp_sql_server: database pipeline; replaced by a fresh
                               SqlServerMyPageInfoSaveItemPipeline every 50th
                               index value
        :param result: iterable of rows; item[0] is used as goods_id, item[1]
                       as a JSON string containing 'miaosha_begin_time' and
                       item[2] as the session id
        :return: None
        '''
        index = 1
        for item in result:  # refresh each row in turn
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            # 'YYYY-mm-dd HH:MM:SS' -> local timestamp, keeping the first 10 characters of its str form
            miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # self.my_lg.info(str(miaosha_begin_time))

            # NOTE(review): `data` only ever receives 'goods_id' below and is not read again in this method.
            data = {}
            # Original author's note: creating the parser inside the loop (instead of once outside)
            # is meant to keep memory usage down.
            zhe_800_miaosha = Zhe800Parse()
            if index % 50 == 0:  # reconnect every 50 iterations to avoid errors from a long-lived, unresponsive connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_begin_time) == 0:
                    # expired: remove the row
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    self.my_lg.info('过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))

                elif self.is_recent_time(miaosha_begin_time) == 2:
                    # may also include expired ones (original author's note)
                    self.my_lg.info('未来时间暂时不更新! {}'.format(timestamp_to_regulartime(miaosha_begin_time)))
                    pass  # must be pass rather than break: the goods_ids returned by the database are not always ordered

                else:  # original comment: a return value of 1 means "inside the window to update"
                    print('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(item[0], index))
                    data['goods_id'] = item[0]

                    try:
                        tmp_data = self.zhe_800_spike._get_one_session_id_data(base_session_id=str(item[2]))
                    except Exception:
                        self.my_lg.error(msg='', exc_info=True)
                        # NOTE(review): this `continue` bypasses the `index += 1`, `sleep(1.2)` and
                        # `gc.collect()` at the bottom of the loop -- confirm that is intended.
                        continue

                    if tmp_data.get('data', {}).get('blocks', []) == []:  # the session_id does not exist
                        self.my_lg.info('该session_id不存在,此处跳过')
                        pass

                    else:
                        # keep only the 'deal' payload of every block
                        tmp_data = [item_s.get('deal', {}) for item_s in tmp_data.get('data', {}).get('blocks', [])]
                        # pprint(tmp_data)
                        if tmp_data != []:  # there is data for this session
                            try:
                                miaosha_goods_list = self.get_miaoshao_goods_info_list(data=tmp_data)
                            except ValueError:
                                sleep(2)
                                # NOTE(review): also bypasses the end-of-loop `index += 1` / sleep / gc.collect().
                                continue
                            # pprint(miaosha_goods_list)

                            # every zid currently present in this session
                            miaosha_goods_all_goods_id = [i.get('zid') for i in miaosha_goods_list]

                            if item[0] not in miaosha_goods_all_goods_id:  # no longer part of the session
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                                self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                                self.my_lg.info('该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(item[0]))
                                pass

                            else:  # still part of the session
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('zid', '') == item[0]:
                                        zhe_800_miaosha.get_goods_data(goods_id=item[0])
                                        goods_data = zhe_800_miaosha.deal_with_data()

                                        if goods_data == {}:  # nothing parsed: skip
                                            pass
                                        else:  # otherwise merge the session info and update the row
                                            goods_data['stock_info'] = item_1.get('stock_info')
                                            goods_data['goods_id'] = str(item_1.get('zid'))
                                            # goods_data['username'] = '******'
                                            # prices are only overwritten while activity stock remains
                                            if item_1.get('stock_info').get('activity_stock') > 0:
                                                goods_data['price'] = item_1.get('price')
                                                goods_data['taobao_price'] = item_1.get('taobao_price')
                                            else:
                                                pass
                                            goods_data['sub_title'] = item_1.get('sub_title')
                                            goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item_1.get('miaosha_time'))

                                            if goods_data.get('is_delete', 0) == 1:
                                                self.my_lg.info('该商品[{0}]已售罄...'.format(item[0]))

                                            # self.my_lg.info(str(goods_data['stock_info']))
                                            # self.my_lg.info(str(goods_data['miaosha_time']))
                                            zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                    else:
                                        pass

                        else:  # the session has no data: logically delete the goods tied to it
                            self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                            self.my_lg.info('该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))
                            pass

            else:  # the database connection is not available
                self.my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            # try:
            #     del tmall
            # except:
            #     pass
            sleep(1.2)
            gc.collect()
        self.my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        gc.collect()

        return
Example #16
0
    def run_forever(self):
        '''
        Run one refresh pass over the stored mia flash-sale goods, then sleep.

        Every row (goods_id, miaosha_time, pid) with site_id=20 is classified
        by its miaosha_end_time via self.is_recent_time and refreshed by
        _refresh_mia_spike_goods. After the pass the method sleeps 5.5 hours
        when the Shanghai hour is 0, otherwise 5 seconds.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                # 'YYYY-mm-dd HH:MM:SS' -> local timestamp, keeping the first
                # 10 characters of its str form
                miaosha_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])

                # Original author's note: the parser is created per item
                # (not once outside the loop) to keep memory usage down.
                mia_miaosha = MiaParse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid errors from a
                    # long-lived, unresponsive connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    self._refresh_mia_spike_goods(
                        item=item,
                        index=index,
                        miaosha_end_time=miaosha_end_time,
                        tmp_sql_server=tmp_sql_server,
                        mia_miaosha=mia_miaosha)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_mia_spike_goods(self, item, index, miaosha_end_time,
                                 tmp_sql_server, mia_miaosha):
        '''
        Delete, skip or update the single flash-sale row `item`.

        :param item: row (goods_id, miaosha_time JSON string, pid)
        :param index: 1-based position of the row, used for the progress line
        :param miaosha_end_time: end timestamp parsed from item[1]
        :param tmp_sql_server: connected database pipeline
        :param mia_miaosha: MiaParse instance used to re-crawl the goods
        :return: None
        '''
        goods_id = item[0]
        recent_flag = self.is_recent_time(miaosha_end_time)

        if recent_flag == 0:
            # Expired: remove the row.
            tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                         params=(goods_id,))
            print(
                '过期的goods_id为(%s)' % goods_id,
                ', 限时秒杀开始时间为(%s), 删除成功!' %
                json.loads(item[1]).get('miaosha_begin_time'))
            return

        if recent_flag == 2:
            # Nothing to do for this row. The caller must keep iterating
            # (not stop): rows are not returned in time order.
            return

        # Any other value (the original comment says 1): inside the window
        # that has to be refreshed.
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
              (goods_id, index))

        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
            item[2])
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        if body == '' or body == '[]':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            tmp_data = json.loads(body)
        except (TypeError, ValueError):
            # Really skip the item; parsing the times of an empty payload
            # below would raise and abort the whole pass.
            print('json.loads转换body时出错, 此处跳过!')
            return

        try:
            p_info = tmp_data.get('p_info', {})
            # 'YYYY/mm/dd HH:MM:SS' strings -> timestamps
            begin_time = int(
                time.mktime(
                    time.strptime(p_info.get('start_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
            end_time = int(
                time.mktime(
                    time.strptime(p_info.get('end_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
        except (AttributeError, TypeError, ValueError):
            print('获取到的start_time或end_time格式异常, 此处跳过!')
            return

        item_list = tmp_data.get('item_list', [])
        # Entries of this pid that belong to the goods being refreshed.
        matching_items = [
            item_2 for item_2 in item_list
            if item_2.get('item_id', '') == goods_id
        ]

        if not matching_items:
            # The goods was removed from the flash sale.
            print('该商品已被下架限时秒杀活动,此处将其删除')
            tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                         params=(goods_id,))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        for item_2 in matching_items:
            mia_miaosha.get_goods_data(goods_id=goods_id)
            goods_data = mia_miaosha.deal_with_data()
            if goods_data == {}:
                # Nothing parsed: skip.
                continue

            goods_data['goods_id'] = str(goods_id)
            goods_data['price'] = item_2.get('active_price')
            goods_data['taobao_price'] = item_2.get('active_price')
            goods_data['sub_title'] = item_2.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                'miaosha_end_time': timestamp_to_regulartime(end_time),
            }
            (goods_data['miaosha_begin_time'],
             goods_data['miaosha_end_time']
             ) = self.get_miaosha_begin_time_and_miaosha_end_time(
                 miaosha_time=goods_data['miaosha_time'])

            mia_miaosha.update_mia_xianshimiaosha_table(
                data=goods_data, pipeline=tmp_sql_server)
            sleep(MIA_SPIKE_SLEEP_TIME)  # slow down between updates
Example #17
0
    async def get_one_page_goods_list(self, **kwargs):
        '''
        Fetch one page of the Jumei group-buy (pintuan) tab list and return its goods.

        :param kwargs: my_phantomjs -- driver exposing use_phantomjs_to_get_url_body(url=...)
                       key          -- value stored under 'sort' in every entry (default '')
                       tab          -- tab name used in the url and stored in every entry (default '')
                       index        -- page number used in the url and stored under 'page'
        :return: list of dicts with the keys goods_id, pintuan_time, type, sort, page, tab;
                 empty when the body is empty, cannot be converted to a dict, or holds no 'data'
        '''
        my_phantomjs = kwargs.get('my_phantomjs')
        key = kwargs.get('key', '')
        tab = kwargs.get('tab', '')
        index = kwargs.get('index')
        i_time = time.time()
        tmp_url = 'http://s.h5.jumei.com/yiqituan/tab_list?tab={0}&page={1}&per_page=20'.format(
            tab, str(index))

        # Plain requests were filtered and aiohttp turned out too slow,
        # so the page is fetched through phantomjs.
        body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        try:
            # The JSON payload is expected inside a <pre ...> element of the rendered page
            body = re.compile(r'<pre .*?>(.*)</pre>').findall(body)[0]
        except (IndexError, TypeError):
            # IndexError: no <pre> wrapper found; TypeError: body is not a str.
            # Either way the body is kept exactly as it was fetched.
            pass
        await asyncio.sleep(1)

        # NOTE: the reported duration includes the one second sleep above
        self.msg = '正在抓取第' + str(index) + '页...' + ' ☭ 用时: ' + str(
            time.time() - i_time)
        self.my_lg.info(self.msg)

        if body == '':
            self.msg = '获取到的body为空str!' + ' 出错地址: ' + tmp_url
            self.my_lg.error(self.msg)
            return []

        one_data = await self.json_2_dict(json_str=body)
        if one_data == {}:
            self.msg = '出错地址: ' + tmp_url
            self.my_lg.error(self.msg)
            return []

        # Entries whose status is 'soldout' are dropped
        return [{
            'goods_id': item.get('item_id', ''),
            'pintuan_time': {
                'begin_time':
                timestamp_to_regulartime(item.get('start_time', '0')),
                'end_time':
                timestamp_to_regulartime(item.get('end_time', '0')),
            },
            'type': item.get('type', ''),
            'sort': key,
            'page': index,
            'tab': tab,
        } for item in one_data.get('data', [])
                if item.get('status', '') != 'soldout']
Example #18
0
    def run_forever(self):
        '''
        Run one refresh pass over the stored MoGuJie flash-sale (miaosha) goods, then sleep.

        mg_delete_str_4 is executed first, then the rows returned by mg_select_str_3 are
        walked. Fields read from each row: item[0] goods_id, item[1] JSON text holding
        'miaosha_begin_time' / 'miaosha_end_time', item[2] event_time, item[3] spider url.
        The value of self.is_recent_time(end_timestamp) decides what happens to the row:
            0         -> the row is deleted as expired
            2         -> the row is left untouched
            otherwise -> the goods is re-crawled and updated, or logically deleted when it
                         no longer appears in the item list of its event
        Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0, else 5 seconds.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_3))
        except TypeError:
            # presumably _select_table returns None when the connection failed,
            # which makes list() raise TypeError -- confirm against the pipeline
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                miaosha_time = json.loads(item[1])
                # End of the flash sale as a whole-second local timestamp
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(
                            miaosha_time.get('miaosha_end_time'),
                            '%Y-%m-%d %H:%M:%S')))

                # The parser is created per row on purpose: the original author found
                # that keeping one instance outside the loop used far more memory.
                mogujie_miaosha = MoGuJieMiaoShaParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection cannot stall
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # Evaluated once so both comparisons see the same answer
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0], ))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            miaosha_time.get('miaosha_begin_time'))
                    elif recent_state != 2:
                        # State 2 rows are skipped rather than ending the loop,
                        # because the rows do not come back in time order.
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        self._refresh_miaosha_goods(
                            item=item,
                            mogujie_miaosha=mogujie_miaosha,
                            pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_miaosha_goods(self, item, mogujie_miaosha, pipeline):
        '''
        Update one goods that lies inside the update window, or logically delete it.

        :param item: database row (item[0] goods_id, item[2] event_time, item[3] spider url)
        :param mogujie_miaosha: parser used to crawl and store the goods
        :param pipeline: connected database pipeline
        :return: None
        '''
        goods_id = item[0]
        item_list = self.get_item_list(event_time=str(item[2]))
        if item_list == '':
            # Possibly caused by the network; skip for now
            return

        # Covers both an empty event list and a list that no longer holds this goods
        if goods_id not in [entry.get('iid', '') for entry in item_list]:
            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
            pipeline._update_table(
                sql_str=mg_update_str_1, params=(goods_id, ))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        for entry in item_list:
            if entry.get('iid', '') != goods_id:
                continue

            mogujie_miaosha.get_goods_data(goods_id=item[3])
            goods_data = mogujie_miaosha.deal_with_data()
            if goods_data == {}:  # nothing could be parsed, skip
                continue

            goods_data['goods_id'] = str(goods_id)

            # 'price' is set to the original price: the highest normal_price of all specs
            try:
                highest_price = max(
                    round(float(price_info.get('normal_price', '')), 2)
                    for price_info in goods_data['price_info_list'])
                goods_data['price'] = round(Decimal(highest_price), 2)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
                print('设置price为原价时出错!请检查')
                continue

            goods_data['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(entry.get('startTime', 0))),
                'miaosha_end_time':
                timestamp_to_regulartime(int(entry.get('endTime', 0))),
            }
            goods_data['miaosha_begin_time'], goods_data[
                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])

            mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                data=goods_data, pipeline=pipeline)
            sleep(MOGUJIE_SLEEP_TIME)  # slow the crawl down
Example #19
0
    def get_pintuan_goods_info(self):
        '''
        Crawl the PC-side MoGuJie group-buy (pintuan) listing of every category in
        self.fcid_dict and hand the collected goods to self.deal_with_data.

        The mobile (h5) endpoint https://api.mogujie.com/h5/mwp.darwin.get/3/ is not used:
        according to the original author its request signature (mw-sign) could not be
        reproduced, so only the PC search endpoint is crawled here.

        Up to 99 pages are requested per category; paging stops at the first page whose
        result.wall.docs list is empty. A page that cannot be parsed is skipped.
        :return: None
        '''
        goods_list = []
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

        self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                       ip_pool_type=self.ip_pool_type)
        for key, fcid in self.fcid_dict.items():
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # Replace the driver with a fresh one every 5 pages
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                # Plain requests got filtered after a while, hence phantomjs
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)

                try:
                    body = pre_pattern.findall(body)[0]
                    tmp_data = json.loads(body)
                except (IndexError, TypeError, ValueError):
                    # IndexError: no <pre> element; TypeError: body is not a str;
                    # ValueError: invalid JSON (json.JSONDecodeError is a subclass)
                    print('json.loads转换body时出错, 请检查')
                    continue

                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get(
                    'docs', [])
                if tmp_item_list == []:
                    # No more group-buy data for this category
                    break

                # The crawl time is recorded as the begin time of the group-buy
                begin_time_timestamp = int(time.time())
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                goods_list.extend(item_list)

                sleep(MOGUJIE_SLEEP_TIME)

        # Process everything that was collected
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)
Example #20
0
    def get_spike_hour_goods_info(self):
        '''
        Probe every session id from BASE_SESSION_ID up to (not including) MAX_SESSION_ID
        in steps of 2 and store the flash-sale goods of each usable session.
        :return: None
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            self._crawl_one_session_id(base_session_id)
            # Advancing in exactly one place keeps every exit path of the helper safe
            base_session_id += 2

    def _crawl_one_session_id(self, base_session_id):
        '''
        Fetch one session id and insert its flash-sale goods that are not stored yet.

        The session is skipped when it has no blocks, when no begin time can be read,
        when self.is_recent_time rejects the begin time, or when the database is down.
        :param base_session_id: session id to fetch
        :return: None
        '''
        print('待抓取的session_id为: ', base_session_id)
        data = self._get_one_session_id_data(base_session_id=base_session_id)
        sleep(.3)

        blocks = data.get('data', {}).get('blocks', [])
        if blocks == []:  # the session id does not exist
            return

        try:
            # Only the first 10 characters of begin_time are kept and parsed as an int
            # (presumably a millisecond timestamp cut down to seconds -- confirm)
            begin_time = str(blocks[0].get('deal', {}).get('begin_time',
                                                           ''))[:10]
            if begin_time == '':
                if blocks[0].get('showcase', {}) == {}:
                    raise ValueError(
                        'first block has neither a begin_time nor a showcase')
                # A leading showcase block marks a future session; the deal is in block 1
                print('*** 未来时间 ***')
                begin_time = str(blocks[1].get('deal', {}).get(
                    'begin_time', ''))[:10]
            begin_times_timestamp = int(begin_time)
        except Exception as e:
            print('遇到严重错误: ', e)
            return

        print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))

        if not self.is_recent_time(timestamp=begin_times_timestamp):
            return

        try:
            deals = [block.get('deal', {}) for block in blocks]
        except Exception as e:
            print('遇到严重错误: ', e)
            return

        if deals == []:  # this session id carries no data
            print('该sessionid没有相关key为jsons的数据')
            return

        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

        zhe_800 = Zhe800Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
            db_goods_id_list = [
                row[0] for row in my_pipeline._select_table(sql_str=sql_str)
            ]
            for item in miaosha_goods_list:
                if item.get('zid', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://shop.zhe800.com/products/' + str(
                    item.get('zid', ''))
                goods_id = zhe_800.get_goods_id_from_url(tmp_url)

                zhe_800.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800.deal_with_data()
                if goods_data == {}:  # nothing could be parsed, skip
                    continue

                goods_data['stock_info'] = item.get('stock_info')
                goods_data['goods_id'] = str(item.get('zid'))
                goods_data['spider_url'] = tmp_url
                goods_data['username'] = '******'
                goods_data['price'] = item.get('price')
                goods_data['taobao_price'] = item.get('taobao_price')
                goods_data['sub_title'] = item.get('sub_title')
                goods_data['miaosha_time'] = item.get('miaosha_time')
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item.get('miaosha_time'))
                goods_data['session_id'] = str(base_session_id)

                zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                sleep(ZHE_800_SPIKE_SLEEP_TIME)  # slow the crawl down

        # Drop the parser before collecting so its memory can be reclaimed
        del zhe_800
        gc.collect()
Example #21
0
    def deal_with_data(self, *params):
        '''
        Crawl and store the ChuChuJie flash-sale goods that are not in the database yet.

        A goods is skipped when its detail data is empty, when it is marked is_delete == 1,
        when the mobile page cannot be fetched, or when the remaining-time text of the
        activity is missing or empty.
        :param params: params[0] is the list of goods dicts
                       (keys read: goods_id, sub_title, gender, page)
        :return: None
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                row[0]
                for row in my_pipeline._select_table(sql_str=cc_select_str_2)
            ]

            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                    goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()
                if goods_data == {}:  # nothing could be parsed, skip
                    sleep(.5)
                    continue

                if goods_data.get('is_delete', 0) == 1:
                    # is_delete == 1 means the stock is 0
                    print('------>>>| 该商品库存为0,已被抢光!')
                    sleep(.5)
                    continue

                # A short-lived driver is used only to read the remaining time
                my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
                tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url, css_selector='p#activityTime span')
                del my_phantomjs
                gc.collect()

                if tmp_body == '':  # the mobile page could not be fetched
                    sleep(.5)
                    continue

                # extract_first() yields None when the selector matches nothing, so the
                # None check has to come before any string operation on the text
                _t = Selector(text=tmp_body).css(
                    'p#activityTime span::text').extract_first()
                if _t is not None:
                    _t = _t.replace('剩余', '')
                if not _t:
                    # Without a remaining time no end time can be computed: skip the goods
                    print('获取到的_t为空值, 严重错误! 请检查!')
                    continue

                miaosha_end_time = self.get_miaosha_end_time(_t)
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['sub_title'] = item.get('sub_title', '')
                goods_data['miaosha_time'] = {
                    # The crawl time is recorded as the begin time
                    'miaosha_begin_time':
                    timestamp_to_regulartime(int(time.time())),
                    'miaosha_end_time':
                    timestamp_to_regulartime(int(miaosha_end_time)),
                }
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['gender'] = str(item.get('gender', '0'))
                goods_data['page'] = item.get('page')

                res = chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                if res:
                    # Remember the id so a duplicate later in item_list is skipped
                    db_goods_id_list.append(goods_id)

                # No extra sleep here: starting phantomjs already takes long enough
        else:
            print('数据库连接失败,此处跳过!')

        del chuchujie
        gc.collect()
Example #22
0
    def deal_with_data(self, *param):
        '''
        Crawl and store the MoGuJie flash-sale goods of one event that are not stored yet.

        A goods is skipped when its iid is already in the database, when its link carries
        no 'objectId=...&' part, when its detail data is empty, or when the original
        price cannot be derived from price_info_list.
        :param param: param[0] is the event_time timestamp,
                      param[1] is the list of goods dicts
                      (keys read: iid, link, startTime, endTime)
        :return: None
        '''
        print(60 * '*')
        event_time = param[0]
        print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t',
              '对应时间戳为: ', event_time)
        print(60 * '*')

        item_list = param[1]

        mogujie = MoGuJieMiaoShaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                row[0]
                for row in my_pipeline._select_table(sql_str=mg_select_str_4)
            ]
            object_id_pattern = re.compile(r'objectId=(.*?)&')

            for item in item_list:
                if item.get('iid', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                goods_id = str(item.get('iid', ''))
                tmp_url = item.get('link', '')

                object_ids = object_id_pattern.findall(tmp_url)
                if not object_ids:
                    # A link without objectId is not a flash-sale address
                    print('+++++++ 这个url不是秒杀的url: ', tmp_url)
                    continue

                tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(
                    goods_id, object_ids[0])

                tmp_ = mogujie.get_goods_id_from_url(tmp_url)
                mogujie.get_goods_data(goods_id=tmp_)
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:  # nothing could be parsed, skip
                    continue

                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)

                # 'price' is set to the original price: the highest normal_price of all specs
                try:
                    highest_price = max(
                        round(float(price_info.get('normal_price', '')), 2)
                        for price_info in goods_data['price_info_list'])
                    goods_data['price'] = round(Decimal(highest_price), 2)
                except Exception:
                    # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
                    print('设置price为原价时出错!请检查')
                    continue

                goods_data['miaosha_time'] = {
                    'miaosha_begin_time':
                    timestamp_to_regulartime(int(item.get('startTime', 0))),
                    'miaosha_end_time':
                    timestamp_to_regulartime(int(item.get('endTime', 0))),
                }
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['event_time'] = str(event_time)

                mogujie.insert_into_mogujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                sleep(MOGUJIE_SLEEP_TIME)  # slow the crawl down
        else:
            print('数据库连接失败,此处跳过!')

        del mogujie
        gc.collect()
Example #23
0
    def run_forever(self):
        '''
        Run one refresh pass over the MoGuJie pintuan (group-buy) goods rows.

        The pass first executes ``mg_delete_str_2`` and then reads the rows to
        refresh with ``mg_select_str_2``. For every returned row it either
        deletes the row, skips it, or re-crawls the goods and writes the fresh
        data back (see ``_refresh_one_pintuan_goods``).

        Row layout, as used by this method: ``item[0]`` is the goods_id,
        ``item[1]`` is a JSON string holding ``begin_time`` / ``end_time``
        (``'%Y-%m-%d %H:%M:%S'``), ``item[2]`` is the ``fcid`` and ``item[3]``
        the ``page`` query values of the listing URL.

        After the pass the method sleeps: 5.5 hours when the Shanghai hour is
        0, otherwise 5 seconds.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # Presumably the pipeline returned a non-iterable because the
            # connection failed; the message below says as much.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            return

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        self.my_phantomjs = MyPhantomjs(
            executable_path=PHANTOMJS_DRIVER_PATH)
        for index, item in enumerate(result, start=1):
            # int() truncates the float directly; slicing the string form to
            # 10 characters breaks for timestamps with fewer than 10 digits.
            pintuan_end_time = int(
                time.mktime(
                    time.strptime(
                        json.loads(item[1]).get('end_time'),
                        '%Y-%m-%d %H:%M:%S')))

            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Recreate the phantomjs driver every 8 rows.
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(
                    executable_path=PHANTOMJS_DRIVER_PATH)

            if index % 50 == 0:
                # Reconnect every 50 rows so a single long-lived connection
                # does not stop responding.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                self._refresh_one_pintuan_goods(
                    item=item,
                    index=index,
                    pipeline=tmp_sql_server,
                    mogujie_pintuan=mogujie_pintuan,
                    pintuan_end_time=pintuan_end_time)
            else:
                print('数据库连接失败,此处跳过!')

            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_one_pintuan_goods(self, item, index, pipeline,
                                   mogujie_pintuan, pintuan_end_time):
        '''
        Delete, skip or re-crawl one pintuan row depending on its end time.

        :param item: database row (goods_id, time JSON, fcid, page)
        :param index: 1-based position of the row, used only for logging
        :param pipeline: connected SQL pipeline used for deletes and updates
        :param mogujie_pintuan: parser used to fetch and normalise goods data
        :param pintuan_end_time: end of the pintuan as a unix timestamp
        :return: None
        '''
        goods_id = item[0]

        # Evaluated once so both comparisons below see the same answer.
        recent_state = self.is_recent_time(pintuan_end_time)
        if recent_state == 0:
            # Expired: remove the row. params must be a real tuple;
            # "(x)" without a trailing comma is just x.
            pipeline._delete_table(
                sql_str=self.delete_sql_str, params=(goods_id,))
            print(
                '过期的goods_id为(%s)' % goods_id,
                ', 拼团开始时间为(%s), 删除成功!' %
                json.loads(item[1]).get('begin_time'))
            return
        if recent_state == 2:
            # Skip only this row (no early exit from the caller's loop):
            # the rows returned by the database are not ordered.
            return

        # Any other state: the row is inside the window to be refreshed.
        print(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
            % (goods_id, index))

        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
            item[3], item[2])

        # Plain requests cannot fetch this page (certificate checks), so the
        # listing is loaded through phantomjs.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
            tmp_data = json.loads(body)
        except (IndexError, TypeError, ValueError):
            # No <pre> payload, a non-string body, or invalid JSON.
            print('json.loads转换body时出错, 请检查')
            tmp_data = {}

        tmp_item_list = tmp_data.get('result', {}).get('wall',
                                                       {}).get('docs', [])
        if tmp_item_list == []:
            print('得到的docs为[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            pipeline._delete_table(
                sql_str=self.delete_sql_str, params=(goods_id,))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        begin_time_timestamp = int(time.time())  # pintuan start timestamp
        item_list = [{
            'goods_id': doc.get('tradeItemId', ''),
            'pintuan_time': {
                'begin_time':
                timestamp_to_regulartime(timestamp=begin_time_timestamp),
                'end_time':
                timestamp_to_regulartime(
                    self.get_pintuan_end_time(begin_time_timestamp,
                                              doc.get('leftTimeOrg', ''))),
            },
            'all_sell_count': str(doc.get('salesVolume', 0)),
        } for doc in tmp_item_list]

        pintuan_goods_all_goods_id = [
            item_1.get('goods_id', '') for item_1 in item_list
        ]

        if goods_id not in pintuan_goods_all_goods_id:
            # Dropped from the listing but still on sale: refresh the goods
            # data only and leave the stored on/off-shelf times untouched.
            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if goods_data != {}:
                print('+++ 内部下架,其实还在售卖的商品更新')
                goods_data['goods_id'] = goods_id
                goods_data[
                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                        goods_data['price_info_list'])
                mogujie_pintuan.update_mogujie_pintuan_table_2(
                    data=goods_data, pipeline=pipeline)
                sleep(MOGUJIE_SLEEP_TIME)  # throttle
            return

        # Still listed: refresh goods data together with the pintuan times.
        for item_2 in item_list:
            if item_2.get('goods_id', '') != goods_id:
                continue

            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if goods_data == {}:
                continue

            goods_data['goods_id'] = goods_id
            goods_data[
                'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                    goods_data['price_info_list'])
            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')

            mogujie_pintuan.update_mogujie_pintuan_table(
                data=goods_data, pipeline=pipeline)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle
Example #24
0
    def deal_with_data(self):
        '''
        Turn ``self.result_data`` into the normalised goods dict.

        When ``self.result_data`` is empty, a message is printed and the value
        of ``self._error_data_init()`` is returned. Otherwise the result dict
        is built and ``self.result_data`` is reset to ``{}``.

        ``price`` is the highest and ``taobao_price`` the lowest
        ``detail_price`` found in ``price_info_list``. If no usable price
        exists (empty list, or a ``detail_price`` that is an empty string or
        otherwise not convertible to float) both prices become ``0.`` and
        ``is_delete`` is forced to ``1`` (logical delete).

        :return: result dict
        :raises KeyError: when a required key is missing from result_data
        '''
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return self._error_data_init()

        shop_name = data['shop_name']
        account = ''
        title = data['title']
        sub_title = data['sub_title']
        detail_name_list = data['detail_name_list']
        price_info_list = data['price_info_list']
        all_img_url = data['all_img_url']
        p_info = data['p_info']
        div_desc = data['div_desc']
        is_delete = self._get_is_delete(data=data,
                                        price_info_list=price_info_list)

        # On/off-shelf times.
        # NOTE(review): a missing begin_time/end_time falls back to '' and
        # int('') raises ValueError - confirm sell_time is always populated.
        sell_time = data.get('sell_time', {})
        schedule = [{
            'begin_time':
            timestamp_to_regulartime(int(sell_time.get('begin_time', ''))),
            'end_time':
            timestamp_to_regulartime(int(sell_time.get('end_time', ''))),
        }]

        # Total sales volume is not provided here.
        all_sell_count = ''

        # Highest price -> price, lowest price -> taobao_price.
        try:
            tmp_price_list = sorted(
                round(float(item.get('detail_price', '')), 2)
                for item in price_info_list)
            price = tmp_price_list[-1]
            taobao_price = tmp_price_list[0]
        except (IndexError, TypeError, ValueError):
            # Off-shelf goods have no prices: an empty list fails the index
            # lookup (IndexError), an empty-string detail_price fails float()
            # (ValueError), a None detail_price raises TypeError.
            print('获取price和taobao_price时出错, 请检查!')
            print('@@@@@@ 此处对该商品进行逻辑删除! @@@@@@')
            price = 0.
            taobao_price = 0.
            is_delete = 1

        result = {
            'shop_name': shop_name,  # shop name
            'account': account,  # shop owner
            'title': title,  # goods title
            'sub_title': sub_title,  # sub title
            'price': price,  # highest price
            'taobao_price': taobao_price,  # lowest price
            'detail_name_list': detail_name_list,  # spec attribute names
            'price_info_list': price_info_list,  # price/stock per spec
            'all_img_url': all_img_url,  # sample image urls
            'p_info': p_info,  # detail attribute name/value pairs
            'div_desc': div_desc,  # description html
            'schedule': schedule,  # sale time window
            'all_sell_count': all_sell_count,  # total sales volume
            'is_delete': is_delete  # whether the goods is off the shelf
        }
        self.result_data = {}
        return result
Example #25
0
    def deal_with_data(self):
        '''
        Build the goods info dict from ``self.result_data``.

        Returns ``{}`` (after printing a message) when ``self.result_data`` is
        empty or when ``self._get_price_info_list`` yields an empty list.
        ``price`` is the highest and ``taobao_price`` the lowest
        ``detail_price`` of the price entries.

        :return: dict
        '''
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return {}

        mall = data.get('mall')
        shop_name = '' if mall is None else mall.get('mallName', '')
        title = data.get('goods', {}).get('goodsName', '')

        detail_name_list = self._get_detail_name_list(data=data)
        price_info_list = self._get_price_info_list(data=data)
        if price_info_list == []:
            print('price_info_list为空值')
            return {}

        # Sort ascending: last entry is the highest, first the lowest price.
        ordered_prices = [
            round(float(entry.get('detail_price', '')), 2)
            for entry in price_info_list
        ]
        ordered_prices.sort()
        highest_price = ordered_prices[-1]
        lowest_price = ordered_prices[0]

        if detail_name_list == []:
            # Without spec names the per-spec price list is discarded.
            print('## detail_name_list为空值 ##')
            price_info_list = []

        all_img_url = self._get_all_img_url(data=data)
        p_info = self._get_p_info(data=data)

        sales = data.get('goods', {}).get('sales', 0)
        div_desc = data.get('div_desc', '')

        # Sale window is taken from the first groupTypes entry.
        begin_time = timestamp_to_regulartime(
            data.get('goods', {}).get('groupTypes', [])[0].get('startTime'))
        end_time = timestamp_to_regulartime(
            data.get('goods', {}).get('groupTypes', [])[0].get('endTime'))

        result = {}
        result['shop_name'] = shop_name  # shop name
        result['account'] = ''  # shop owner
        result['title'] = title  # goods title
        result['sub_title'] = ''  # sub title
        result['price'] = highest_price  # highest price
        result['taobao_price'] = lowest_price  # lowest price
        result['detail_name_list'] = detail_name_list  # spec attribute names
        result['price_info_list'] = price_info_list  # price/stock per spec
        result['all_img_url'] = all_img_url  # sample image urls
        result['p_info'] = p_info  # detail attribute name/value pairs
        result['div_desc'] = div_desc  # description html
        result['schedule'] = [{
            'begin_time': begin_time,
            'end_time': end_time,
        }]  # sale start and end time
        result['all_sell_count'] = sales  # total sales volume
        result['is_delete'] = 0  # whether the goods is off the shelf
        return result
Example #26
0
    def deal_with_data(self, *param):
        '''
        Fetch and store the flash-sale (miaosha) goods of one Mia session.

        Goods whose id is already returned by ``mia_select_str_4`` are skipped;
        every other goods is crawled with ``MiaParse`` and, when data comes
        back, inserted through
        ``MiaParse.insert_into_mia_xianshimiaosha_table``.

        :param param: positional values, in order:
            ``param[0]`` pid of the session,
            ``param[1]`` begin time string, ``'%Y/%m/%d %H:%M:%S'``,
            ``param[2]`` end time string, same format,
            ``param[3]`` list of goods dicts (``item_id``, ``active_price``,
            ``short_info`` are read).
        :return: None
        '''
        pid = param[0]
        # Convert the time strings to unix timestamps.
        begin_time = int(time.mktime(time.strptime(param[1], '%Y/%m/%d %H:%M:%S')))
        end_time = int(time.mktime(time.strptime(param[2], '%Y/%m/%d %H:%M:%S')))
        item_list = param[3]

        mia = MiaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            # Ids are compared as strings so an int item_id still matches a
            # stored id; the set makes each lookup O(1).
            db_goods_id_set = {
                str(row[0])
                for row in my_pipeline._select_table(sql_str=mia_select_str_4)
            }

            for item in item_list:
                goods_id = str(item.get('item_id', ''))
                if goods_id in db_goods_id_set:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                mia.get_goods_data(goods_id=goods_id)
                goods_data = mia.deal_with_data()
                if goods_data == {}:  # nothing parsed, skip this goods
                    continue

                # findall() returns a list, so comparing it with '' was always
                # true and every goods got the .hk url; test for a match.
                if re.search(r'://m\.miyabaobei\.hk/', goods_data['goods_url']):
                    goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
                else:
                    goods_url = 'https://www.mia.com/item-' + goods_id + '.html'
                goods_data['goods_url'] = goods_url
                goods_data['goods_id'] = goods_id
                goods_data['price'] = item.get('active_price')
                goods_data['taobao_price'] = item.get('active_price')  # lowest flash-sale price
                goods_data['sub_title'] = item.get('short_info', '')
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                    'miaosha_end_time': timestamp_to_regulartime(end_time),
                }
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                goods_data['pid'] = str(pid)

                mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
        else:
            print('数据库连接失败,此处跳过!')

        # mia is always bound here, so the del cannot fail.
        del mia
        gc.collect()
    def get_true_sku_info(self, sku_info):
        '''
        Collect price, pintuan price and stock for every spec of the goods.

        The sku goods_ids are joined with ``-`` and requested from
        ``https://p.mia.com/item/list/``. Each sku is then matched against the
        returned entries by id, and one record is produced per stock key of a
        matched entry whose stock is not 0. Iteration over a matched entry
        stops as soon as its ``g_l`` (pintuan info) list is empty.

        :param sku_info: list of dicts; ``goods_id``, ``color_name`` and
            ``img_url`` are read from each
        :return: ``self._data_error_init()`` when the response has no data or
            a pintuan field cannot be read, otherwise the tuple
            ``(true_sku_info, i_s, pintuan_time, all_sell_count)`` where
            ``i_s`` is the stock mapping of the last matched entry
        '''
        goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
        tmp_url = 'https://p.mia.com/item/list/' + goods_id_str

        tmp_body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         had_referer=True,
                                         ip_pool_type=self.ip_pool_type)

        tmp_data = json_2_dict(json_str=tmp_body).get('data', [])
        if tmp_data == []:
            return self._data_error_init()

        true_sku_info = []
        i_s = {}
        pintuan_time = {}
        all_sell_count = '0'
        time_format = '%m %d %Y %H:%M:%S'
        for item_1 in sku_info:
            for item_2 in tmp_data:
                if item_1.get('goods_id') != str(item_2.get('id', '')):
                    continue

                i_s = item_2.get('i_s', {})
                for spec_name, rest_number in i_s.items():
                    if spec_name == 'SINGLE':
                        spec_value = item_1.get('color_name')
                    else:
                        spec_value = item_1.get('color_name') + '|' + spec_name
                    normal_price = str(item_2.get('mp'))
                    detail_price = str(item_2.get('sp'))

                    # "except Exception" keeps the best-effort error return
                    # without also swallowing KeyboardInterrupt/SystemExit.
                    try:
                        group_list = item_2.get('g_l', [])
                        if group_list == []:
                            # No pintuan price for this entry: stop walking
                            # its stock keys.
                            break

                        group_info = group_list[0]
                        pintuan_price = str(group_info.get('gp', ''))
                    except Exception:
                        print('获取该规格拼团价pintuan_price时出错!')
                        return self._data_error_init()

                    try:
                        s = str(group_info.get('s', ''))  # pintuan start time
                        e = str(group_info.get('e', ''))  # pintuan end time
                        s = self.change_to_number_str_time(s)
                        e = self.change_to_number_str_time(e)
                        pintuan_time = {
                            'begin_time':
                            timestamp_to_regulartime(
                                int(time.mktime(time.strptime(s,
                                                              time_format)))),
                            'end_time':
                            timestamp_to_regulartime(
                                int(time.mktime(time.strptime(e,
                                                              time_format)))),
                        }
                    except Exception:
                        print('获取拼团pintuan_time时出错!')
                        return self._data_error_init()

                    try:
                        all_sell_count = str(group_info.get('rsn', ''))
                    except Exception:
                        print('获取拼团all_sell_count时出错!')
                        return self._data_error_init()

                    if rest_number == 0:
                        # Specs without stock are left out.
                        continue

                    true_sku_info.append({
                        'spec_value': spec_value,
                        'pintuan_price': pintuan_price,
                        'detail_price': detail_price,
                        'normal_price': normal_price,
                        'img_url': item_1.get('img_url'),
                        'rest_number': rest_number,
                    })

        return (true_sku_info, i_s, pintuan_time, all_sell_count)