def run_forever():
    """Endlessly refresh the Zhe800 group-buy (pintuan) goods held in the DB.

    Each pass of the outer loop:

    1. opens a new ``SqlServerMyPageInfoSaveItemPipeline``, runs the
       ``z8_delete_str_1`` statement and loads the rows selected by
       ``z8_select_str_2``;
    2. walks the rows (``item[0]`` is the goods id, ``item[1]`` the stored
       delete flag), re-crawls each goods and either marks it as off the
       shelf or writes the fresh data back;
    3. sleeps 5.5 hours when the Shanghai-time hour is 0, otherwise
       10 minutes, and starts over.

    The function never returns.
    """
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            # Presumably the select returned a non-iterable because the
            # connection is down -- verify against the pipeline class.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # ``enumerate`` guarantees the counter advances for every row,
            # including rows that end up on the off-shelf path.
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                db_is_delete = item[1]
                # Built per item and deleted below to keep memory usage low.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # Pause for 3 minutes after every 300 goods; announce the
                    # pause before it starts rather than after it ends.
                    sleep_time = 3 * 60
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # A string result starting with 'ze' is handled as
                    # "goods page no longer exists".
                    is_off_shelf = False
                    try:
                        if isinstance(tmp_tmp, str) \
                                and re.compile(r'^ze').findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                            is_off_shelf = True
                    except Exception as e:
                        # Best effort: report the failure and fall through to
                        # the regular update path, as before.  Unlike a bare
                        # ``except`` this lets KeyboardInterrupt propagate.
                        print('处理下架商品(goods_id: {})时遇到错误: {}'.format(
                            goods_id, e))

                    if not is_off_shelf:
                        data = zhe_800_pintuan.deal_with_data()
                        if data != {}:
                            print(
                                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                % (goods_id, index))
                            data['goods_id'] = goods_id

                            if db_is_delete == 1:
                                print('该goods_id[{0}]已过期!'.format(goods_id))
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=z8_update_str_4,
                                    sql_cli=sql_cli,
                                )
                            else:
                                zhe_800_pintuan.to_right_and_update_data(
                                    data=data, pipeline=sql_cli)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                # Shared per-item cleanup and throttle for every path.
                del zhe_800_pintuan
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: wait 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
    async def _update_one_goods_info(self, item, index) -> tuple:
        """Refresh a single Zhe800 flash-sale (miaosha) goods row.

        :param item: DB row; ``item[0]`` is the goods id, ``item[1]`` the
            stored miaosha time and ``item[2]`` the session id.
        :param index: running position of this row; ``index + 1`` is stored
            in ``self.goods_index`` before returning.
        :return: ``(goods_id, res)`` where ``res`` is the result of the
            shelving helper or of ``self._one_update`` and stays ``False``
            when nothing was written.

        The branch taken depends on ``self._is_recent_time``:
        ``0`` shelves the goods, ``2`` shelves it only when the sale end time
        has already passed, and any other value (1 per the original note)
        means the goods is inside the update window and is re-crawled.
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_z8_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg,
                                              remainder=30)

        if self.sql_cli.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=z8_update_str_6,
                    sql_cli=self.sql_cli,
                )
                self.lg.info(
                    '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                        goods_id,
                        timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index
                await async_sleep(.3)

                return goods_id, res

            elif is_recent_time == 2:
                # This bucket may still contain sales that already ended.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    # Already over: logically delete it.
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    self.lg.info(
                        '未来时间暂时不更新! miaosha_begin_time: {}, miaosha_end_time: {}'
                        .format(
                            timestamp_to_regulartime(miaosha_begin_time),
                            timestamp_to_regulartime(miaosha_end_time),
                        ))

                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # Inside the update window.
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    if tmp_data == []:
                        # Raised explicitly instead of via ``assert`` so the
                        # check is not stripped when running with ``-O``.
                        raise AssertionError('该session_id不存在,此处跳过')
                except AssertionError:
                    # The session has no data: shelve the flash-sale goods
                    # that belongs to it.
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    # Begin time is formatted like in the other log lines of
                    # this method.
                    self.lg.info(
                        msg=
                        '该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'
                        .format(goods_id,
                                timestamp_to_regulartime(miaosha_begin_time)))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)

                    return goods_id, res

                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                        data=tmp_data)
                except ValueError:
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                # All zids currently listed in this session.
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # Removed from the flash sale by the site itself.
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(
                            goods_id))
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    # Still on sale: refresh it.
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)

        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)

        return goods_id, res
Example #3
0
    def _get_goods_data(self, goods_id):
        """Crawl one Kaola goods page and assemble the needed fields.

        :param goods_id: Kaola goods id; an empty string is rejected.
        :return: the assembled ``dict`` (also stored in ``self.result_data``),
            or the value of ``self._get_data_error_init()`` when the goods is
            off the shelf or any error occurs.
        """
        data = {}

        url = 'https://goods.kaola.com/product/{0}.html'.format(goods_id)
        self.lg.info('------>>>| 正在抓取考拉地址为: {0}'.format(url))
        try:
            # Explicit raises instead of ``assert`` so the validation still
            # runs under ``python -O``; the exception type is unchanged.
            if goods_id == '':
                raise AssertionError('获取到的goods_id为空值!此处跳过!')

            body = self.get_kl_pc_body(goods_id=goods_id)

            # Parse the PC page (the phone-side parser is no longer used).
            goods_info = self._get_pc_right_body(body)
            if goods_info == {}:
                raise AssertionError('获取body时索引异常!')

            goods_info['sku_info'] = self.get_kl_pc_sku_info(goods_id=goods_id)
            goods_info = self._wash_data(goods_info)
            sku_info = goods_info.get('sku_info', {})
            sku_detail_list = sku_info.get('skuDetailList', [])

            data['title'] = self._get_title(data=goods_info)
            data['sub_title'] = ''
            data['shop_name'] = goods_info.get('goodsInfoBase', {}).get(
                'brandName', '')
            data['all_img_url'] = self._get_all_img_url(data=goods_info)
            data['p_info'] = self._get_p_info(data=goods_info)
            data['div_desc'] = self._get_div_desc(data=goods_info)
            data['sell_time'] = self._get_sell_time(data=sku_info)
            data['detail_name_list'] = self._get_detail_name_list(
                data=sku_detail_list)
            # TODO: Kaola keeps selling specs that are actually out of stock
            # (probably ordering after purchase); stock 0 is treated as off
            # the shelf here.
            # Prices come from the PC side and already include tax.
            data['price_info_list'] = self._get_pc_sku_info(
                data=sku_detail_list)

            data['price'], data[
                'taobao_price'] = self._get_price_and_taobao_price(
                    data=sku_info.get('skuPrice', {}),
                    price_info_list=data['price_info_list'])
            data['is_delete'] = self._get_is_delete(
                price_info_list=data['price_info_list'],
                data=data,
                other=goods_info)
            data['parent_dir'] = self._get_parent_dir(body=body)
            self.lg.info('parent_dir: {}'.format(data['parent_dir']))

        except GoodsShelvesException:
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                      logger=self.lg)
            return self._get_data_error_init()

        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            self.lg.error('出错goods_id: {0}, 地址: {1}'.format(goods_id, url))
            return self._get_data_error_init()

        self.result_data = data

        return data
Example #4
0
    def get_goods_data(self, goods_id):
        """Crawl one Juanpi group-buy goods page plus its sku data.

        :param goods_id: Juanpi goods id; an empty string short-circuits.
        :return: the washed ``detail`` dict extended with ``skudata``,
            ``goods_id`` and ``parent_dir`` (also stored in
            ``self.result_data``), or the value of ``self._data_error_init()``
            when the goods is off the shelf or any error occurs.
        """
        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://web.juanpi.com/pintuan/shop/{}'.format(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)
        try:
            # The page is rendered through the driver (phantomjs); do not use
            # a VPN/proxy tunnel while running it.
            body = self.driver.get_url_body(
                url=tmp_url,
                timeout=28,)
            # Explicit raises instead of ``assert`` so the checks below are
            # not stripped under ``python -O``; the exception type is kept.
            if body == '':
                raise AssertionError
            if re.compile(r'<span id="t-index">页面丢失ing</span>').findall(body) != []:
                # "Page lost" placeholder: the goods no longer exists.
                raise GoodsShelvesException

            # Greedy match of the preloaded state JSON.
            data = re.compile(r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)
            if data == []:
                raise AssertionError('data为空list!')

            # Current sku-data endpoint (the older getOtherInfo endpoint was
            # dropped by the site).
            skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(goods_id)
            headers = get_random_headers(upgrade_insecure_requests=False)
            headers.update({
                'Host': 'webservice.juanpi.com'
            })
            skudata_body = Requests.get_url_body(
                url=skudata_url,
                headers=headers,
                ip_pool_type=self.ip_pool_type,
                proxy_type=self.proxy_type,
                num_retries=self.req_num_retries,)
            if skudata_body == '':
                raise AssertionError('获取到的skudata_body为空str!请检查!')

            # NOTE(review): this pattern just captures the first line of the
            # body; kept unchanged for behavioural parity.
            skudata = re.compile(r'(.*)').findall(skudata_body)
            if skudata == []:
                raise AssertionError('skudata为空!')
            skudata = json_2_dict(json_str=skudata[0]).get('skudata', {})
            if skudata == {}:
                raise AssertionError
            if skudata.get('info') is None:
                raise AssertionError('skudata中info的key为None, 返回空dict')

            main_data = json_2_dict(json_str=data[0])
            if main_data == {}:
                raise AssertionError
            # Validate ``detail`` before dereferencing it for the status, so a
            # null detail reports the intended message instead of an
            # AttributeError.
            if main_data.get('detail') is None:
                raise AssertionError('data中detail的key为None, 返回空dict')

            goods_status = int(main_data.get('detail', {}).get('baseInfo', {}).get('status', '1'))
            if goods_status == 0:
                # Status 0: off the shelf, cannot be bought.
                raise GoodsShelvesException

            main_data = self._wash_main_data(main_data.get('detail', {}))
            main_data['skudata'] = skudata
            main_data['goods_id'] = goods_id
            main_data['parent_dir'] = _jp_get_parent_dir(
                phantomjs=self.driver,
                goods_id=goods_id)
            self.result_data = main_data

            return main_data

        except GoodsShelvesException:
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id, )
            return self._data_error_init()

        except Exception as e:
            print(e)
            return self._data_error_init()
def run_forever():
    """Endlessly refresh the Kaola goods stored in the database.

    Each pass of the outer loop creates a logger whose file name contains the
    current Shanghai date, loads the rows selected by ``kl_select_str_1``
    (``item[1]`` is the goods id), re-crawls every goods and writes the
    result back, then sleeps 5.5 hours when the Shanghai-time hour is 0 and
    60 seconds otherwise.

    The function never returns.
    """
    while True:
        # The logger is built inside the loop on purpose: a single global
        # logger would keep writing to the same (first day's) file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            # The parser is rebuilt every 5 goods (below) to release memory.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            # ``enumerate`` advances the counter on every path, including the
            # off-shelf paths.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    del kaola
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()

                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Off-shelf goods are written back on their own path.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )

                        try:
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                        except Exception:
                            # ``Logger.error`` needs a message; calling it
                            # with only ``exc_info`` raises TypeError.
                            my_lg.error('遇到错误:', exc_info=True)
                    else:
                        data = kaola._deal_with_data()
                        if data == {}:
                            # Nothing parsed: back off briefly.
                            my_lg.info('------>>>| 休眠3s中...')
                            sleep(3.)
                        elif data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                # Shared per-item cleanup and throttle for every path.
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: wait 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
async def run_forever():
    """Run one refresh pass over the Taobao "tiantiantejia" goods in the DB.

    Runs the ``tb_delete_str_2`` statement, loads the rows selected by
    ``tb_select_str_7`` (``item[0]`` is the goods id, ``item[2]`` the end
    time of the special offer) and, per row, either shelves an expired goods
    or re-crawls it and writes the result back.

    :return: ``None`` when the initial DB access raises ``TypeError``,
        otherwise ``True`` after the pass and the closing sleep.
    """
    # A uniquely named logger per call: a shared global logger would keep
    # writing to one file and break the one-log-file-per-day scheme.
    lg = set_logger(
        logger_name=get_uuid1(),
        log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
        str(get_shanghai_time())[0:10] + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)

    db_conn = SqlServerMyPageInfoSaveItemPipeline()
    try:
        # Expired rows have to be cleaned here; this is the only place that
        # clears them.
        db_conn._delete_table(sql_str=tb_delete_str_2, params=None)
        result = list(db_conn._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    await _print_db_old_data(result=result, logger=lg)

    # The counter only advances for rows handled with a live connection.
    index = 1
    for item in result:
        goods_id, tejia_end_time = item[0], item[2]

        db_conn = await _get_new_db_conn(
            db_obj=db_conn,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if not db_conn.is_connect_success:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            collect()
            continue

        if tejia_end_time < get_shanghai_time():
            # Expired offers are shelved right away (they used to be demoted
            # to regular promotion goods instead).
            lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=lg,
                update_sql_str=tb_update_str_5,
            )
            index += 1
            collect()
            continue

        # Still-running offer. The site does not pull a listed goods early,
        # so neither the category listing nor the offer time window is
        # re-checked; only the goods info itself is refreshed.
        lg.info(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
            (goods_id, str(index)))
        parser = TaoBaoLoginAndParse(
            logger=lg,
            is_real_times_update_call=is_real_times_update_call)
        parser.get_goods_data(goods_id)
        goods_data = parser.deal_with_data(goods_id=goods_id)
        if goods_data != {}:
            goods_data['goods_id'] = goods_id
            if goods_data.get('is_delete', 0) == 1:
                lg.info('@该商品已下架...')

            await parser.update_taobao_tiantiantejia_table(
                data=goods_data, pipeline=db_conn)
        else:
            # Nothing parsed: back off before the regular throttle.
            await async_sleep(4)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        index += 1
        # Two collections per updated row, as in the original flow.
        collect()
        collect()

    lg.info('全部数据更新完毕'.center(100, '#'))
    # Longer pause when the Shanghai-time hour is 0 (no updates after
    # midnight), short pause otherwise.
    pause = 5 * 60 if get_shanghai_time().hour == 0 else 60 * 1
    await async_sleep(pause)
    collect()

    return True
Example #7
0
    async def _update_one_goods_info(self, item, index):
        """Refresh a single Jumeiyoupin flash-sale (miaosha) goods row.

        :param item: DB row; ``item[0]`` is the goods id, ``item[1]`` the
            stored miaosha time, ``item[2]`` the listing page and ``item[3]``
            the goods url.
        :param index: running position of this row; ``index + 1`` is stored
            in ``self.goods_index`` before returning.
        :return: ``[goods_id, res]`` on every path, where ``res`` is the
            result of the shelving helper or of the table update and stays
            ``False`` when nothing was written.

        The branch taken depends on ``self._is_recent_time``:
        ``0`` shelves the goods, ``2`` shelves it only when the sale end time
        has already passed, and any other value (1 per the original note)
        means the goods is inside the update window and is re-crawled.
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        _, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jumei_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )

        if self.sql_cli.is_connect_success:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)
            if is_recent_time_res == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jm_update_str_4,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)

            elif is_recent_time_res == 2:
                # Only shelve when the sale has really ended; future sales
                # are left untouched.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))

            else:
                # Inside the update window.
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)
                if isinstance(this_page_all_goods_list, str):
                    # A string result signals a network error: skip this row,
                    # but keep the bookkeeping and the return shape identical
                    # to every other path.
                    self.lg.error('网络错误!先跳过')
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.5)
                    return [goods_id, res]

                elif this_page_all_goods_list == []:
                    # Empty page: the goods left the flash sale, shelve it.
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)

                else:
                    # Goods are not pulled early by the site, so everything
                    # inside the sale window is simply refreshed.
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data != {}:
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.sql_cli)

        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

        return [goods_id, res]
Example #8
0
    def run_forever(self):
        '''
        Refresh the stored mogujie pintuan (group-buy) goods in one pass.

        Runs the ``mg_delete_str_2`` statement, loads the rows returned for
        ``mg_select_str_2`` and handles every row according to
        ``self.is_recent_time(pintuan_end_time)``:
            0 -> expired: the goods is logically deleted;
            2 -> nothing is done for this row;
            otherwise -> the goods is inside the update window and re-crawled.
        Afterwards sleeps 5.5 hours when the Shanghai hour is 0, else 10 minutes.

        Each row is read as ``(goods_id, pintuan_time_json, fcid, page)``.
        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:
                goods_id = item[0]
                pintuan_time = json.loads(item[1])
                # int() truncates the float returned by mktime; slicing its
                # string form to 10 chars only works for 10-digit timestamps.
                pintuan_end_time = int(
                    time.mktime(
                        time.strptime(pintuan_time.get('end_time'),
                                      '%Y-%m-%d %H:%M:%S')))

                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # Replace the phantomjs driver every 8 rows.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    # Evaluate the time window once so both comparisons see
                    # the same answer.
                    recent_state = self.is_recent_time(pintuan_end_time)
                    if recent_state == 0:
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli,
                        )
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 拼团开始时间为(%s), 逻辑删除成功!' %
                            pintuan_time.get('begin_time'))
                        sleep(.3)

                    elif recent_state == 2:
                        # Deliberately not a break: the rows coming back from
                        # the database are not ordered by time.
                        pass

                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        self._crawl_and_update_pintuan_goods(
                            goods_id=goods_id,
                            page=item[3],
                            fcid=item[2],
                            mogujie_pintuan=mogujie_pintuan,
                            sql_cli=sql_cli,
                        )

                else:
                    print('数据库连接失败,此处跳过!')

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates are wanted right after midnight (Shanghai time).
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()

    def _crawl_and_update_pintuan_goods(self, goods_id, page, fcid,
                                        mogujie_pintuan, sql_cli):
        '''
        Re-crawl the listing page a goods was found on and update its row.

        :param goods_id: id of the goods being refreshed
        :param page: listing page number used in the search url
        :param fcid: listing category id used in the search url
        :param mogujie_pintuan: parser object used to fetch/update the goods
        :param sql_cli: database pipeline handed to the update calls
        :return: None
        '''
        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
            page, fcid)
        # Plain requests cannot fetch this url (certificate verification),
        # so the page is loaded through phantomjs.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
            tmp_data = json.loads(body)
        except Exception:
            # Best effort: any extraction/parse failure is treated as "no data".
            print('json.loads转换body时出错, 请检查')
            tmp_data = {}

        docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
        if docs == []:
            # NOTE(review): a parse failure above also lands here and gets the
            # goods logically deleted - confirm this is intended.
            print('得到的docs为[]!')
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                update_sql_str=mg_update_str_5,
                sql_cli=sql_cli,
            )
            sleep(.3)
            return

        # "Now" is used as the begin time of every goods on the page.
        begin_time_timestamp = int(time.time())
        item_list = [{
            'goods_id': doc.get('tradeItemId', ''),
            'pintuan_time': {
                'begin_time':
                timestamp_to_regulartime(timestamp=begin_time_timestamp),
                'end_time':
                timestamp_to_regulartime(
                    self.get_pintuan_end_time(begin_time_timestamp,
                                              doc.get('leftTimeOrg', ''))),
            },
            'all_sell_count': str(doc.get('salesVolume', 0)),
        } for doc in docs]
        matched_items = [
            page_item for page_item in item_list
            if page_item.get('goods_id', '') == goods_id
        ]

        if not matched_items:
            # No longer listed on the page but actually still on sale: refresh
            # the goods info only, leave the on/off-shelf times untouched.
            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if goods_data != {}:
                print('+++ 内部下架,其实还在售卖的商品更新')
                goods_data['goods_id'] = goods_id
                goods_data[
                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                        goods_data['price_info_list'])
                mogujie_pintuan.update_mogujie_pintuan_table_2(
                    data=goods_data, pipeline=sql_cli)
                sleep(MOGUJIE_SLEEP_TIME)  # slow down
            return

        # Still listed: refresh goods info together with the pintuan times.
        for page_item in matched_items:
            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if goods_data == {}:
                continue

            goods_data['goods_id'] = goods_id
            goods_data[
                'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                    goods_data['price_info_list'])
            goods_data['pintuan_time'] = page_item.get('pintuan_time', {})
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
            goods_data['all_sell_count'] = page_item.get('all_sell_count', '')

            mogujie_pintuan.update_mogujie_pintuan_table(data=goods_data,
                                                         pipeline=sql_cli)
            sleep(MOGUJIE_SLEEP_TIME)  # slow down
Example #9
0
    def deal_with_data(self):
        '''
        Turn ``self.result_data`` into the flat goods dict that gets stored.

        :return: dict of goods fields; ``{}`` when ``self.result_data`` is
                 empty; the value of ``self._data_error()`` when extraction
                 fails or the goods turns out to be off the shelves
        '''
        raw = self.result_data
        goods_id = raw.get('goods_id', '')
        if raw == {}:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            self.result_data = {}
            return {}

        try:
            shop_name = raw.get('shop_name', '')
            title = raw.get('/app/detail/product/base', {}).get('title', '')

            # Raw per-spec entries from the sku section; price and stock per
            # spec are derived from them.
            sku_items = raw.get('/app/detail/product/sku', {}).get('items')
            parsed = self._get_detail_name_list_and_price_info_list_and_price_and_taobao_price(
                data=raw, tmp_price_info_list=sku_items)
            base_images = raw.get('/app/detail/product/base',
                                  {}).get('images', [])
            all_img_url = self._get_all_img_url(tmp_all_img_url=base_images)

            detail_name_list = parsed[0]
            price_info_list = parsed[1]
            price = parsed[2]
            taobao_price = parsed[3]

            p_info = self._get_p_info(data=raw)
            div_desc = raw.get('/app/detail/graph/detail', '')
            stock_is_delete = self._get_is_delete(
                price_info_list=price_info_list)
            schedule, is_delete = self._get_schedule(
                data=raw, is_delete=stock_is_delete)
            parent_dir = str(raw.get('parent_dir', ''))
            # Goods whose title hits the contraband keyword list are forced
            # off the shelves.
            if target_str_contain_some_char_check(
                    target_str=title,
                    check_char_obj=CONTRABAND_GOODS_KEY_TUPLE):
                print('违禁物品下架...')
                is_delete = 1

        except GoodsShelvesException:
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id)
            return self._data_error()

        except Exception as e:
            print('遇到错误:', e)
            return self._data_error()

        return {
            'shop_name': shop_name,  # shop name
            'account': '',  # shop keeper (always empty here)
            'title': title,  # goods title
            'sub_title': '',  # sub title (always empty here)
            'price': price,  # goods price
            'taobao_price': taobao_price,  # "taobao" price
            'detail_name_list': detail_name_list,  # spec attribute names
            'price_info_list': price_info_list,  # price/stock per spec
            'all_img_url': all_img_url,  # sample image urls
            'p_info': p_info,  # detail attribute name/value pairs
            'div_desc': div_desc,  # description html
            'schedule': schedule,  # sale begin/end times
            'is_delete': is_delete,  # whether the goods is off the shelves
            'parent_dir': parent_dir,
            'all_sell_count': '',  # always empty here
        }
Example #10
0
    def get_goods_data(self, goods_id):
        '''
        Fetch and assemble the raw zhe800 data for one goods.

        Requests the product, graph (description) and status gateway
        endpoints, decodes their JSON sections and merges them into one dict,
        which is also stored on ``self.result_data``.
        :param goods_id: zhe800 product id; ``''`` and ``None`` are rejected
        :return: the assembled data dict, or the value of
                 ``self._data_error()`` when any step fails
        '''
        if goods_id is None or goods_id == '':
            return self._data_error()

        tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(
            goods_id)
        body = Requests.get_url_body(url=tmp_url,
                                     headers=self.headers,
                                     ip_pool_type=self.ip_pool_type)
        data = json_2_dict(
            json_str=body,
            default_res={},
        )
        if body == '' \
                or data == {}:
            return self._data_error()

        # Every section arrives as a JSON string nested inside the outer
        # JSON, so each one is decoded separately.
        base = json_2_dict(json_str=data.get('/app/detail/product/base', ''),
                           default_res={})

        profiles = json_2_dict(
            json_str=data.get('/app/detail/product/profiles', ''))
        if profiles == {}:
            print("json.loads转换出错,得到profiles值可能为空,此处跳过")
            profiles = ''

        score = json_2_dict(json_str=data.get('/app/detail/product/score', ''),
                            default_res={})
        # Drop the 'contents' entry when present; anything that is not a dict
        # is left untouched.
        if isinstance(score, dict):
            score.pop('contents', None)

        sku = json_2_dict(json_str=data.get('/app/detail/product/sku', ''),
                          default_res={})

        data['/app/detail/product/base'] = base
        data['/app/detail/product/profiles'] = profiles
        data['/app/detail/product/score'] = score
        data['/app/detail/product/sku'] = sku

        try:
            # Mobile page url of the goods.
            phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(
                base.get('dealId', ''))
        except AttributeError:
            # canJoinCart: None -> lookup failed, False -> off the shelves,
            # True -> normal.
            if sku.get('canJoinCart') is False:
                _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id)
                return self._data_error()

            print('获取手机版地址失败,此处跳过')
            return self._data_error()

        print('------>>>| 得到商品手机版地址为: ', phone_url)
        # Fetch and decode the graph detail (the rich-text description).
        tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(
            goods_id)
        detail_data_body = Requests.get_url_body(
            url=tmp_detail_url,
            headers=self.headers,
            ip_pool_type=self.ip_pool_type)
        if detail_data_body == '':
            print('detail_data为[]!')
            return self._data_error()

        detail_data = json_2_dict(json_str=detail_data_body, default_res={})
        if detail_data == {}:
            print('json.loads(detail_data)时报错, 此处跳过')
            return self._data_error()

        detail = json_2_dict(json_str=detail_data.get(
            '/app/detail/graph/detail', ''),
                             default_res={})
        # Drop the 'small' entry when present.
        if isinstance(detail, dict):
            detail.pop('small', None)

        tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
        if tmp_div_desc == '':
            return self._data_error()
        data['/app/detail/graph/detail'] = tmp_div_desc

        shop_name = self._get_shop_name(data=data)
        # An empty dict from _get_shop_name is treated as a failure.
        if isinstance(shop_name, dict) and shop_name == {}:
            return self._data_error()
        data['shop_name'] = shop_name

        # Sale schedule (begin/end time) and stock.
        schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(
            goods_id)
        schedule_and_stock_info_body = Requests.get_url_body(
            url=schedule_and_stock_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if schedule_and_stock_info_body == '':
            print('schedule_and_stock_info为空!')
            return self._data_error()

        schedule_and_stock_info = json_2_dict(
            json_str=schedule_and_stock_info_body)
        if schedule_and_stock_info == {}:
            print('得到秒杀开始时间和结束时间时错误, 此处跳过')
            return self._data_error()

        schedule = json_2_dict(json_str=schedule_and_stock_info.get(
            '/app/detail/status/schedule', None),
                               default_res={})
        stock = json_2_dict(json_str=schedule_and_stock_info.get(
            '/app/detail/status/stock', None),
                            default_res={})

        data['schedule'] = schedule
        data['stock'] = stock
        data['parent_dir'] = _z8_get_parent_dir(goods_id)
        data['goods_id'] = goods_id

        self.result_data = data

        return data
Example #11
0
    def get_goods_data(self, goods_id):
        '''
        Fetch and normalize the raw tmall data for one goods.

        :param goods_id: two-item sequence ``[type, goods_id]`` (tmall type and
                         tmall goods id); an empty value is rejected
        :return: the cleaned ``data`` section of the response (also stored on
                 ``self.result_data``); the value of
                 ``self.init_pull_off_shelves_goods(type)`` when the goods is
                 off the shelves; the value of ``self._data_error_init()``
                 on any other failure
        '''
        if not goods_id:
            return self._data_error_init()

        goods_type = goods_id[0]  # tmall type
        goods_id = goods_id[1]  # tmall goods id
        tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)

        self.headers.update({'Referer': tmp_url})
        last_url = self._get_last_url(goods_id=goods_id)
        body = Requests.get_url_body(
            url=last_url,
            headers=self.headers,
            timeout=14,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )

        try:
            # Raised explicitly rather than via `assert`, so the checks also
            # run when Python is started with -O.
            if body == '':
                raise AssertionError(
                    '获取到的body为空值, 此处跳过! 出错type %s: , goods_id: %s' %
                    (str(goods_type), goods_id))
            data = json_2_dict(
                json_str=re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0],
                default_res={},
                logger=self.lg)
            if data == {}:
                raise AssertionError(
                    'data为空dict, 出错type: {}, goods_id: {}'.format(
                        str(goods_type), str(goods_id)))
            if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                    and data.get('data', {}).get('seller', {}).get('evaluates') is None:
                raise GoodsShelvesException

        except GoodsShelvesException:
            # The goods is off the shelves: its url redirects to another page.
            self.lg.info('@@@@@@ 该商品已经下架...')
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                      logger=self.lg)
            tmp_data_s = self.init_pull_off_shelves_goods(goods_type)
            self.result_data = {}
            return tmp_data_s

        except (AssertionError, IndexError):
            self.lg.error('遇到错误:', exc_info=True)
            return self._data_error_init()

        # Goods that were moved or taken down, so the page no longer exists.
        if data.get('data', {}).get('seller', {}).get('evaluates') is None:
            self.lg.error(
                'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: {}, goods_id: {}'.
                format(goods_type, goods_id))
            return self._data_error_init()

        # Blank out sections that are not needed downstream.
        data['data']['rate'] = ''  # goods reviews
        data['data']['resource'] = ''  # buyer questions
        data['data']['vertical'] = ''  # questions and answers
        data['data']['seller']['evaluates'] = ''  # description/service/logistics scores
        result_data = data['data']

        # A missing or empty apiStack takes the regular error path instead of
        # raising IndexError.
        api_stack = result_data.get('apiStack', [])
        if not api_stack:
            self.lg.error('apiStack为空, 出错type: {}, goods_id: {}'.format(
                goods_type, goods_id))
            return self._data_error_init()

        # Clean apiStack[0]['value'] and store the cleaned value back.
        api_stack[0]['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=api_stack[0].get('value', {}))

        # mockData: a missing key takes the same error path as an
        # undecodable value.
        mock_data = json_2_dict(json_str=result_data.get('mockData', ''),
                                logger=self.lg)
        if mock_data == {}:
            self.lg.error('出错type: {0}, goods_id: {1}'.format(
                goods_type, goods_id))
            return self._data_error_init()

        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # apiStack[0] can look like {'name': 'esi', 'value': ''}.
        washed_value = api_stack[0].get('value', '')
        if washed_value == '':
            self.lg.error(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: {}, goods_id: {}"
                .format(str(goods_type), goods_id))
            result_data['trade'] = {}
            return self._data_error_init()

        # 'trade' is what later decides whether the goods is off the shelves.
        result_data['trade'] = washed_value.get('trade', {})

        result_data['type'] = goods_type
        result_data['goods_id'] = goods_id
        self.result_data = result_data

        return result_data
    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single juanpi flash-sale goods row.

        Depending on ``await self._is_recent_time(miaosha_begin_time)``:
            0 -> expired: the goods is logically deleted;
            2 -> logically deleted if the sale end time has already passed,
                 otherwise skipped as a future sale;
            otherwise -> the tab/page listing is re-crawled and the goods is
                 updated, or logically deleted when the listing is empty.
        ``self.goods_index`` is always set to ``index + 1`` before returning.
        :param item: row read as ``(goods_id, miaosha_time, tab_id, page)``
        :param index: running index of this row
        :return: ``(goods_id, res)``; ``res`` stays ``False`` unless a delete or
                 update call returned something else
        '''
        res = False
        goods_id, miaosha_time, tab_id, page = item[0], item[1], item[2], item[3]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)

        if not self.tmp_sql_server.is_connect_success:
            # The database connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            self.goods_index = index + 1
            await async_sleep(1.2)
            return goods_id, res

        is_recent_time = await self._is_recent_time(miaosha_begin_time)
        if is_recent_time == 0:
            res = _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
                update_sql_str=jp_update_str_6,
                sql_cli=self.tmp_sql_server,
            )
            self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(
                goods_id, timestamp_to_regulartime(miaosha_begin_time)))
            await async_sleep(.3)
            self.goods_index = index + 1
            return goods_id, res

        if is_recent_time == 2:
            if datetime_to_timestamp(get_shanghai_time()) > miaosha_end_time:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jp_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info(
                    '过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(
                        goods_id,
                        timestamp_to_regulartime(miaosha_begin_time)))
            else:
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
            self.goods_index = index + 1
            return goods_id, res

        # Inside the update window.
        self.lg.info(
            '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
            format(goods_id, index))
        tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
            str(tab_id),
            str(page),
        )
        # NOTE(review): Requests.get_url_body looks like a blocking call
        # inside a coroutine - confirm this is acceptable.
        body = Requests.get_url_body(url=tmp_url,
                                     headers=await self._get_pc_headers(),
                                     ip_pool_type=self.ip_pool_type)
        try:
            # Raised explicitly rather than via `assert`: under -O the asserts
            # vanish, an empty listing falls through and the goods would be
            # logically deleted because of a failed request.
            data = json_2_dict(body, default_res={}).get('data', {})
            if data == {}:
                raise AssertionError('data为空dict!')
            data = data.get('goodslist', [])
            if data == []:
                raise AssertionError(
                    'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, page))
        except AssertionError:
            self.lg.error(msg='遇到错误:', exc_info=True)
            self.goods_index = index + 1
            await async_sleep(.3)
            return goods_id, res

        miaosha_goods_list = await self._get_miaoshao_goods_info_list(
            data=data)
        # Every goods_id currently listed under this tab_id/page.
        miaosha_goods_all_goods_id = [
            i.get('goods_id') for i in miaosha_goods_list
        ]
        self.lg.info(str(miaosha_goods_all_goods_id))
        if goods_id not in miaosha_goods_all_goods_id:
            if miaosha_goods_all_goods_id != []:
                # A non-empty listing was observed to mean the goods is still
                # on sale, so it is left alone.
                self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
            else:
                # The listing is empty: the goods left the flash sale.
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jp_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info(
                    '该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))

            self.goods_index = index + 1
            await async_sleep(.3)
            return goods_id, res

        # Still listed: run the regular update.
        res = await self._one_update(miaosha_goods_list=miaosha_goods_list,
                                     goods_id=goods_id)
        self.goods_index = index + 1
        await async_sleep(1.2)

        return goods_id, res
Example #13
0
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        更新old goods 数据
        :param result:
        :return:
        '''
        index = 1
        for item in result:  # 实时更新数据
            _goods_id = item[0]
            miaosha_time = item[1]
            miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
                miaosha_time=miaosha_time,
                logger=self.lg,
            )

            tmall = TmallParse(logger=self.lg)
            tmp_sql_server = await _get_new_db_conn(
                db_obj=tmp_sql_server,
                index=index,
                logger=self.lg,
                remainder=20,
            )

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=_goods_id,
                        logger=self.lg,
                        update_sql_str=tb_update_str_4,
                        sql_cli=tmp_sql_server,
                    )
                    self.lg.info('过期的goods_id为(%s)' % _goods_id +
                                 ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                    await async_sleep(.3)

                else:  # 返回1, 表示在待更新的区间内
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (_goods_id, str(index)))
                    '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        # self.lg.info(str(item))
                        goods_data['goods_id'] = _goods_id

                        await tmall._update_taoqianggou_xianshimiaosha_table(
                            data=goods_data, pipeline=tmp_sql_server)
                        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await async_sleep(5)

                index += 1

            try:
                del tmall
            except:
                pass

        collect()

        return
    def run_forever(self):
        '''
        Make one pass over the stored Mia group-buy (pintuan) rows and refresh them.

        Rows come from ``self._get_db_old_data()`` and are read positionally:
        ``item[0]`` is the goods_id, ``item[1]`` a JSON string carrying
        ``begin_time`` / ``end_time`` and ``item[2]`` the pid that is used as
        the page number of the pintuan list API. Once the pass is finished
        the method sleeps 5.5 hours when the Shanghai hour is 0, otherwise
        10 minutes.

        :return: None
        '''
        index = 1
        for item in self._get_db_old_data():
            goods_id, pid = item[0], item[2]
            # end_time is a string such as '2020-04-12 00:00:00'
            end_time_str = json_2_dict(item[1]).get('end_time')
            pintuan_end_time = datetime_to_timestamp(
                string_to_datetime(end_time_str))

            # Only ever filled with the goods_id below; not read again here.
            data = {}
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=index,
                remainder=50,
            )
            if not self.sql_cli.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
            else:
                recent_flag = self.is_recent_time(pintuan_end_time)
                if recent_flag == 0:
                    # Price already restored: hand the goods to the shelves handler.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mia_update_str_7,
                        sql_cli=self.sql_cli)
                    begin_time = json.loads(item[1]).get('begin_time')
                    print('该goods拼团开始时间为({})'.format(begin_time))
                    sleep(.4)

                elif recent_flag != 2:
                    # Flag 2 (expired but still waiting) is deliberately left
                    # untouched; every other value means "inside the update window".
                    print(
                        '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                        .format(goods_id, index))
                    data['goods_id'] = goods_id
                    try:
                        data_list = get_mia_pintuan_one_page_api_goods_info(
                            page_num=pid)
                    except ResponseBodyIsNullStrException:
                        index += 1
                        sleep(.4)
                        continue

                    # NOTE: an empty data_list is NOT treated as "off the
                    # shelves" -- doing so wrongly delisted goods that were
                    # still on sale, so every unexpired goods is updated.
                    listed_goods_ids = [
                        entry.get('goods_id', '') for entry in data_list
                    ]
                    mia_pt = MiaPintuanParse()
                    if goods_id in listed_goods_ids:
                        # Still listed under this pid: refresh every matching entry.
                        for entry in data_list:
                            if entry.get('goods_id', '') != goods_id:
                                continue

                            sub_title = entry.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,
                                )
                            except AssertionError:
                                # Empty data came back: skip this entry.
                                continue

                            mia_pt.update_mia_pintuan_table(
                                data=goods_data, pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                    else:
                        # No longer listed under this pid: updated all the same.
                        try:
                            goods_data = self._get_mia_pt_one_goods_info(
                                mia_pt_obj=mia_pt,
                                goods_id=goods_id,
                            )
                        except AssertionError:
                            # Empty data came back: move on to the next row.
                            index += 1
                            continue

                        mia_pt.update_mia_pintuan_table(data=goods_data,
                                                        pipeline=self.sql_cli)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # throttle

                    # Drop the parser so it can be reclaimed before the next row.
                    del mia_pt

            index += 1
            collect()

        print('全部数据更新完毕'.center(100, '#'))
        # No updating after midnight: long pause when the Shanghai hour is 0.
        pause_seconds = 60 * 60 * 5.5 if get_shanghai_time().hour == 0 else 10 * 60
        sleep(pause_seconds)
        collect()
Example #15
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the raw detail data of one Tmall goods and clean it up.

        :param goods_id: sequence ``[tm_type, goods_id]``; an empty list
            short-circuits to ``self._data_error_init()``
        :return: the cleaned ``data['data']`` dict (also stored on
            ``self.result_data``); the result of ``self._data_error_init()``
            when the data cannot be obtained or is incomplete; or the result
            of ``self.init_pull_off_shelves_goods`` when a
            ``GoodsShelvesException`` is raised for the goods
        '''
        if goods_id == []:
            return self._data_error_init()

        tm_type = goods_id[0]  # Tmall type
        goods_id = goods_id[1]  # the Tmall goods_id itself
        phone_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)

        # Which source produced `data`: 0 = main API, 1 = fallback fetch.
        get_base_data_method = 0
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',
        )
        headers.update({
            'Referer': phone_url,
        })
        last_url = self._get_last_url(goods_id=goods_id)
        body = Requests.get_url_body(
            url=last_url,
            headers=headers,
            timeout=self.req_timeout,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )

        try:
            assert body != '', '获取到的body为空值, 此处跳过!'
            # Raw string: the body is wrapped in parentheses and only the
            # payload between them is parsed.
            data = json_2_dict(
                json_str=re.compile(r'\((.*)\)').findall(body)[0],
                default_res={},
                logger=self.lg)

            try:
                if 'login.m.taobao.com' in data.get('data', {}).get('url', ''):
                    # The main API redirected to the login page.
                    raise AssertionError('被重定向到login_url...')
                assert data != {}, 'data为空dict!'
            except AssertionError:
                # Fall back to the second way of fetching the data.
                self.lg.info(
                    'trying second method to get data[where goods_id: {}] ...'.
                    format(goods_id))
                get_base_data_method = 1
                data = get_tm_m_body_data(goods_id=goods_id,
                                          proxy_type=self.proxy_type,
                                          num_retries=self.req_num_retries,
                                          logger=self.lg)

            if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                    and data.get('data', {}).get('seller', {}).get('evaluates') is None:
                raise GoodsShelvesException

        except GoodsShelvesException:
            # The goods is off the shelves; its address redirects elsewhere.
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                      logger=self.lg)
            tmp_data_s = self.init_pull_off_shelves_goods(tm_type=tm_type)
            self.result_data = {}
            return tmp_data_s

        except (AssertionError, IndexError):
            self.lg.error(msg='遇到错误[出错tm_type: {}, goods_id: {}]:'.format(
                tm_type,
                goods_id,
            ),
                          exc_info=True)
            return self._data_error_init()

        # Goods that were moved or delisted no longer carry seller evaluates.
        if data.get('data', {}).get('seller', {}).get('evaluates') is None:
            self.lg.error(
                'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错tm_type: {}, goods_id: {}'.
                format(tm_type, goods_id))
            return self._data_error_init()

        # Blank out bulky sections that are not needed downstream.
        data['data']['rate'] = ''  # goods reviews
        data['data']['resource'] = ''  # buyers' questions
        data['data']['vertical'] = ''  # questions and answers
        data['data']['seller']['evaluates'] = ''  # description/service/logistics scores
        result_data = data['data']

        # A missing or empty apiStack used to raise an uncaught IndexError
        # here; report it like every other data error instead.
        api_stack = result_data.get('apiStack', [])
        if not api_stack:
            self.lg.error('apiStack为空! 出错tm_type: {}, goods_id: {}'.format(
                tm_type, goods_id))
            return self._data_error_init()
        first_api = api_stack[0]

        if get_base_data_method == 0:
            # Write the washed value back into apiStack[0]['value'].
            first_api['value'] = self._wash_result_data_apiStack_value(
                goods_id=goods_id,
                result_data_apiStack_value=first_api.get('value', {}))

        mock_data = result_data['mockData']
        if get_base_data_method == 0:
            # The main API delivers mockData as a JSON string; the fallback
            # value is used as it is.
            mock_data = json_2_dict(json_str=mock_data, logger=self.lg)

        if mock_data == {}:
            self.lg.error('出错tm_type: {0}, goods_id: {1}'.format(
                tm_type, goods_id))
            return self._data_error_init()

        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # apiStack[0] can look like {'name': 'esi', 'value': ''}.
        if first_api.get('value', '') == '':
            self.lg.error(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错tm_type: {}, goods_id: {}"
                .format(tm_type, goods_id))
            result_data['trade'] = {}
            return self._data_error_init()

        # 'trade' is what later decides whether the goods is off the shelves.
        result_data['trade'] = first_api.get('value', {}).get('trade', {})

        result_data['type'] = tm_type
        result_data['goods_id'] = goods_id
        self.result_data = result_data

        return result_data
Example #16
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the raw detail data of one Taobao goods and clean it up.

        :param goods_id: the Taobao goods id
        :return: the cleaned ``data['data']`` dict (also stored on
            ``self.result_data``); the result of ``self._data_error_init()``
            when the data cannot be obtained or is incomplete; or the result
            of ``self.init_pull_off_shelves_goods()`` when a
            ``GoodsShelvesException`` is raised for the goods
        '''
        phone_url = 'https://h5.m.taobao.com/awp/core/detail.htm?id={}'.format(goods_id)
        # Fill the url into the message; the bare template used to be stored.
        self.msg = '------>>>| phone_url: {}'.format(phone_url)

        # Body of the main API.
        last_url = self._get_last_url(goods_id=goods_id)
        body = Requests.get_url_body(
            url=last_url,
            headers=self.headers,
            params=None,
            timeout=14,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,)

        try:
            # The body is wrapped in parentheses; only the payload is parsed.
            data = json_2_dict(
                json_str=re.compile(r'\((.*)\)').findall(body)[0],
                default_res={},
                logger=self.lg)
            assert data != {}, '获取到的data为空dict!'
            if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                    and data.get('data', {}).get('seller', {}).get('evaluates') is None:
                raise GoodsShelvesException

        except GoodsShelvesException:
            # The goods is off the shelves; its address redirects elsewhere.
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id, logger=self.lg)
            tmp_data_s = self.init_pull_off_shelves_goods()
            self.result_data = {}
            return tmp_data_s

        except (IndexError, AssertionError):
            self.lg.error('data为空! 出错goods_id: {0}'.format(goods_id), exc_info=True)
            return self._data_error_init()

        # Goods that were moved or delisted no longer carry seller evaluates.
        # `.get('data', {})` keeps a missing 'data' key from raising
        # AttributeError on None.
        if data.get('data', {}).get('seller', {}).get('evaluates') is None:
            self.lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
            return self._data_error_init()

        data = self._wash_tb_origin_data(data=data)
        result_data = data['data']

        # A missing or empty apiStack used to raise an uncaught IndexError
        # here; report it like every other data error instead.
        api_stack = result_data.get('apiStack', [])
        if not api_stack:
            self.lg.error('apiStack为空! 出错goods_id: {0}'.format(goods_id))
            return self._data_error_init()
        first_api = api_stack[0]

        # Write the washed value back into apiStack[0]['value'].
        first_api['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=first_api.get('value', {}))

        # mockData arrives as a JSON string.
        mock_data = json_2_dict(
            json_str=result_data['mockData'],
            logger=self.lg,)
        if mock_data == {}:
            self.lg.error('出错goods_id: {0}'.format(goods_id))

            return self._data_error_init()

        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # apiStack[0] can look like {'name': 'esi', 'value': ''}.
        if first_api.get('value', '') == '':
            self.lg.info("result_data.get('apiStack', [])[0].get('value', '')的值为空....")
            result_data['trade'] = {}

            return self._data_error_init()

        # 'trade' is what later decides whether the goods is off the shelves.
        result_data['trade'] = first_api.get('value', {}).get('trade', {})

        self.result_data = result_data

        return result_data
    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single flash-sale (miaosha) goods row.

        :param item: DB row read positionally: ``item[0]`` goods_id,
            ``item[1]`` miaosha_time, ``item[2]`` pid
        :param index: position of the row in the current run; ``index + 1``
            is stored in ``self.goods_index`` before every return
        :return: ``(goods_id, res)``; ``res`` stays ``False`` unless one of
            the handlers called below assigns it
        '''
        res = False
        goods_id, miaosha_time, pid = item[0], item[1], item[2]
        next_index = index + 1

        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30,
        )

        if not self.tmp_sql_server.is_connect_success:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
        else:
            recent_flag = await self._is_recent_time(miaosha_end_time)

            if recent_flag == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                begin_time_str = timestamp_to_regulartime(miaosha_begin_time)
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id, begin_time_str))
                await async_sleep(.5)
                self.goods_index = next_index

                return goods_id, res

            if recent_flag == 2:
                # Only handled once the end time really lies in the past.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))

                self.goods_index = next_index

                return goods_id, res

            # Any other flag: the goods is inside the update window.
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(goods_id, index))
            tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                pid)
            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         had_referer=True,
                                         ip_pool_type=self.ip_pool_type)
            # A bare '[]' body is treated like an empty one.
            if body == '[]':
                body = ''
            try:
                tmp_data = json_2_dict(
                    json_str=body,
                    default_res={},
                    logger=self.lg,
                )
                assert tmp_data != {}, 'tmp_data为空dict!'
            except AssertionError:
                self.lg.error('遇到错误:', exc_info=True)
                self.goods_index = next_index
                await async_sleep(.3)

                return goods_id, res

            item_list = tmp_data.get('item_list', [])
            # Every goods id currently listed under this pid.
            listed_goods_ids = [
                entry.get('item_id', '') for entry in item_list
            ]
            if goods_id in listed_goods_ids:
                # Still part of the flash sale: do the real update.
                res = await self._one_update(
                    item_list=item_list,
                    goods_id=goods_id,
                    tmp_data=tmp_data,
                )
            else:
                # Dropped from the flash sale.
                self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                self.goods_index = next_index
                await async_sleep(.3)

                return goods_id, res

        await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
        self.goods_index = next_index
        collect()

        return goods_id, res
Example #18
0
def run_forever():
    '''
    Endlessly refresh every Xiaomi YouPin goods row stored in the database.

    Each pass selects the rows with ``yp_select_str_1``, re-crawls every
    goods (``item[1]`` is the goods_id) and writes the result back. Between
    passes the function sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 minutes. It never returns.
    '''
    while True:
        # The logger is created inside the loop on purpose: the file name
        # embeds today's date, so a module-level logger would keep writing
        # into the same file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/小米有品/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yp_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            yp = YouPinParse(logger=my_lg)
            # enumerate keeps the index in step with the row even on the
            # `continue` below, which used to skip the manual `index += 1`.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # Swap in a fresh parser every 5 rows to release memory.
                    del yp
                    yp = YouPinParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    yp._get_target_data(goods_id=goods_id)

                    data = yp._handle_target_data()
                    db_goods_info_obj = YPDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Delisted goods are handled separately.
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='yp',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        yp._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Nothing came back for this goods: back off briefly.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        # No updating after midnight: long pause when the Shanghai hour is 0.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        collect()
Example #19
0
    async def _update_one_goods_info(self, item, index):
        '''
        Update a single flash-sale (miaosha) goods row.

        :param item: DB row read positionally: ``item[0]`` goods_id,
            ``item[1]`` miaosha_time, ``item[2]`` gender, ``item[3]`` page
        :param index: position of the row in the current run; ``index + 1``
            is stored in ``self.goods_index`` before every return
        :return: ``(goods_id, res)``; ``res`` stays ``False`` unless one of
            the handlers called below assigns it
        '''
        res = False
        goods_id, miaosha_time = item[0], item[1]
        gender, page = item[2], item[3]
        next_index = index + 1

        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_cc_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=25,
        )

        if not self.sql_cli.is_connect_success:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
        else:
            recent_flag = await self._is_recent_time(miaosha_end_time)

            if recent_flag == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=cc_update_str_2,
                    sql_cli=self.sql_cli,
                )
                end_time_str = timestamp_to_regulartime(miaosha_end_time)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, end_time_str))
                await async_sleep(.3)
                self.goods_index = next_index

                return goods_id, res

            if recent_flag == 2:
                # Only handled once the end time really lies in the past.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=cc_update_str_2,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))

                self.goods_index = next_index

                return goods_id, res

            # Any other flag: the goods is inside the update window.
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(goods_id, index))
            body = await self._get_one_page_goods_info(gender, page)
            if body == '':
                self.goods_index = next_index
                await async_sleep(.3)

                return goods_id, res

            json_body = json_2_dict(body, default_res={})
            try:
                first_group = json_body.get('data', {}).get('groupList', [])[0]
                this_page_total_count = first_group.get('totalCount', 0)
            except IndexError:
                self.lg.error('获取this_page_total_count时出错, 请检查!')
                this_page_total_count = 0

            item_list = await self._get_item_list(
                this_page_total_count=this_page_total_count,
                json_body=json_body)
            if item_list == []:
                # Nothing listed for this gender/page any more.
                self.lg.info(
                    '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除'
                )
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=cc_update_str_2,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                await async_sleep(.3)
                self.goods_index = next_index

                return goods_id, res

            res = await self._one_update(goods_id=goods_id,
                                         item_list=item_list)

        self.goods_index = next_index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)

        return goods_id, res
Example #20
0
    def get_goods_data(self, goods_id: str) -> dict:
        '''
        Crawl the Mia mobile page of one goods and assemble its data dict.

        :param goods_id: Mia goods id; an empty string returns
            ``self._data_error_init()`` straight away
        :return: the assembled data dict (also stored on
            ``self.result_data``), or the result of
            ``self._data_error_init()`` on any failure
        '''
        if goods_id == '':
            # The result must be returned: without it the method carried on
            # and requested a url built from an empty id.
            return self._data_error_init()

        data = {}
        # Mobile address of a regular goods.
        goods_url = 'https://m.mia.com/item-{}.html'.format(goods_id)
        print('------>>>| 待抓取的地址为: ', goods_url)

        body = Requests.get_url_body(
            url=goods_url,
            headers=self._get_phone_headers(),
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )
        if body == '':
            print('获取到的body为空值!跳过!')
            return self._data_error_init()

        is_mia_mian_page = Selector(
            text=body).css('div.item-center ::text').extract_first() or ''
        # Whether the m-site goods is waiting for replenishment.
        is_replenishment_status = self._get_replenishment_status(
            goods_id=goods_id, body=body)
        if (isinstance(is_mia_mian_page, str) and is_mia_mian_page == '进口母婴正品特卖')\
                or is_replenishment_status:
            # A delisted pintuan goods is redirected to the mobile home page.
            print('++++++ 该拼团商品已下架,被定向到蜜芽主页 or 处在缺货状态中!')
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id, update_sql_str=mia_update_str_7)
            collect()
            return self._data_error_init()

        # Resolve a possible jump: returns the body to parse, the jump url
        # ('' when there is none) and is_hk.
        body, sign_direct_url, is_hk = self.get_jump_to_url_and_is_hk(
            body=body)
        try:
            self.main_info_dict = self._get_goods_main_info_dict(
                goods_id=goods_id)
            data['title'], data['sub_title'] = self.get_title_and_sub_title(
                body=body)
            all_img_url = self.get_all_img_url()

            p_info = self._get_p_info(body=body)
            data['p_info'] = p_info

            div_desc = self.get_goods_div_desc()
            assert div_desc != '', '获取到的div_desc为空值! 请检查'
            data['div_desc'] = div_desc

            # goods_id, spec name and img_url of every spec, for later steps.
            sku_info = self.get_tmp_sku_info(body, goods_id, sign_direct_url,
                                             is_hk)
            assert sku_info != {}, 'sku_info为空dict'

            # Price, spec and stock of every spec.
            true_sku_info, i_s, pintuan_time, all_sell_count = self.get_true_sku_info(
                sku_info=sku_info, goods_id=goods_id)
            data['price_info_list'] = true_sku_info
            data['pintuan_time'] = pintuan_time
            data['all_sell_count'] = all_sell_count

            data['detail_name_list'] = self.get_detail_name_list(
                true_sku_info=true_sku_info)

            # No gallery images: fall back to the first spec's image.
            if all_img_url == []:
                all_img_url = [{
                    'img_url': true_sku_info[0].get('img_url', '')
                }]

            data['all_img_url'] = all_img_url

            # A jump url, when present, replaces the goods url.
            if sign_direct_url != '':
                goods_url = sign_direct_url

            data['goods_url'] = goods_url
            data['parent_dir'] = _mia_get_parent_dir(p_info=p_info)

        except MiaSkusIsNullListException:
            print('该商品已不参与拼团!! 无拼团属性')
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id, update_sql_str=mia_update_str_7)
            collect()
            return self._data_error_init()

        except Exception as e:
            # Best-effort crawl: any other failure is reported and turned
            # into the error-init result.
            print('遇到错误如下: ', e)
            return self._data_error_init()

        self.result_data = data

        return data
Example #21
0
    def deal_with_data(self):
        '''
        Turn ``self.result_data`` into the flat dict of goods fields.

        :return: the result dict; ``{}`` when ``self.result_data`` is empty;
            the result of ``self._data_error_init()`` when the detail name
            list comes back as ``{}``
        '''
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict')
            return {}

        shop_name = self._get_shop_name(data=data)
        title = self._get_title(data=data)
        detail_name_list = self._get_detail_name_list(data=data)

        # The string marker 'is_delete=1' is handled separately.
        if isinstance(detail_name_list, str) and detail_name_list == 'is_delete=1':
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=self.result_data.get('goods_id', ''),
            )

        if detail_name_list == {}:
            return self._data_error_init()

        price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(data=data)
        all_img_url = self._get_all_img_url(data=data)
        p_info = self._get_p_info(data=data)
        div_desc = self._get_div_desc(data=data)
        # Sale time window of the goods.
        schedule = self._get_goods_schedule(data=data)

        is_delete = self._get_is_delete(data=data, schedule=schedule)
        # No price obtained means the goods is already off the shelves.
        if price == 0 or taobao_price == 0:
            is_delete = 1

        parent_dir = data.get('parent_dir', '')
        title_is_contraband = target_str_contain_some_char_check(
            target_str=title,
            check_char_obj=CONTRABAND_GOODS_KEY_TUPLE)
        if title_is_contraband:
            print('违禁物品下架...')
            is_delete = 1

        result = {
            'shop_name': shop_name,                 # shop name
            'account': '',                          # shopkeeper
            'title': title,                         # goods title
            'sub_title': '',                        # sub title
            'price': price,                         # goods price
            'taobao_price': taobao_price,           # Taobao price
            'detail_name_list': detail_name_list,   # spec attribute names
            'price_info_list': price_info_list,     # price and stock per spec
            'all_img_url': all_img_url,             # all sample image urls
            'p_info': p_info,                       # detail attribute pairs
            'div_desc': div_desc,                   # div_desc
            'is_delete': is_delete,                 # off-the-shelves flag
            'schedule': schedule,                   # sale time window
            'parent_dir': parent_dir,
            'all_sell_count': '',
        }
        collect()
        return result
Example #22
0
    def _get_goods_data(self, goods_id):
        """
        Fetch one Kaola goods page plus its SKU info and build the goods dict.

        The PC goods page is downloaded and parsed, the PC SKU endpoint is
        queried, and the individual fields are then extracted with the
        ``self._get_*`` helpers. On success the result is also stored in
        ``self.result_data``.

        :param goods_id: goods id; an empty string is rejected
        :return: dict of goods fields on success, otherwise the value of
            ``self._get_data_error_init()``
        """
        if goods_id == '':
            self.lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # The PC site is scraped: the mobile site's p_info is incomplete.
        url = 'https://goods.kaola.com/product/{0}.html'.format(goods_id)
        self.lg.info('------>>>| 正在抓取考拉地址为: {0}'.format(url))

        pc_goods_body = self._get_pc_goods_body(goods_id=goods_id)
        if pc_goods_body == '':
            return self._get_data_error_init()

        # Kaola's "page does not exist" marker: flag the goods via the
        # shelves handler and stop.
        if '你很神,找到了不存在的页面' in pc_goods_body:
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
            )
            return self._get_data_error_init()

        raw = self._get_pc_right_body(pc_goods_body)
        if raw == {}:
            self.lg.error('获取body时索引异常!出错goods_id为:{0}, 出错地址: {1}'.format(
                goods_id, url))
            return self._get_data_error_init()

        # SKU info comes from the PC endpoint.
        # TODO: the m-site endpoint also exposes sku_info, but without taxes.
        sku_info_url = 'https://goods.kaola.com/product/getPcGoodsDetailDynamic.json'
        params = self._get_pc_sku_info_params(goods_id=goods_id)
        sku_body = Requests.get_url_body(
            url=sku_info_url,
            headers=self.headers,
            params=params,
            ip_pool_type=self.ip_pool_type,
        )
        sku_info = json_2_dict(json_str=sku_body, logger=self.lg).get('data')
        if sku_info is None:
            # NOTE(review): only logged; the None is still stored below and the
            # later `.get('sku_info', {}).get(...)` calls would raise on it
            # unless `_wash_data` replaces the value -- confirm.
            self.lg.error(
                '获取到we的sku_info为None!出错goods_id: {0}, 出错地址: {1}'.format(
                    goods_id, url))
        raw['sku_info'] = sku_info

        raw = self._wash_data(raw)

        data = {}
        try:
            data['title'] = self._get_title(data=raw)
            data['sub_title'] = ''
            data['shop_name'] = raw.get('goodsInfoBase', {}).get('brandName', '')
            data['all_img_url'] = self._get_all_img_url(data=raw)
            data['p_info'] = self._get_p_info(data=raw)
            data['div_desc'] = self._get_div_desc(data=raw)
            data['sell_time'] = self._get_sell_time(
                data=raw.get('sku_info', {}))
            data['detail_name_list'] = self._get_detail_name_list(
                data=raw.get('sku_info', {}).get('skuDetailList', []))
            # TODO: Kaola itself has a bug -- specs that are actually out of
            # stock are still sold on the front end (presumably ordered after
            # purchase); stock 0 is treated as off the shelves here.
            # The PC variant is used because its prices include taxes.
            data['price_info_list'] = self._get_pc_sku_info(
                data=raw.get('sku_info', {}).get('skuDetailList', []))

            price, taobao_price = self._get_price_and_taobao_price(
                data=raw.get('sku_info', {}).get('skuPrice', {}),
                price_info_list=data['price_info_list'])
            data['price'] = price
            data['taobao_price'] = taobao_price

            data['is_delete'] = self._get_is_delete(
                price_info_list=data['price_info_list'],
                data=data,
                other=raw)
            data['parent_dir'] = self._get_parent_dir(body=pc_goods_body)
            self.lg.info('parent_dir: {}'.format(data['parent_dir']))

        except GoodsShelvesException:
            # A helper signalled that the goods are off the shelves.
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=self.lg,
            )
            return self._get_data_error_init()

        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            self.lg.error('出错goods_id: {0}, 地址: {1}'.format(goods_id, url))
            return self._get_data_error_init()

        self.result_data = data
        return data
Example #23
0
    def get_goods_data(self, goods_id):
        """
        Fetch the raw Tmall goods payload and pre-process it.

        The base API is tried first; if ``tb_api_redirect_detect`` rejects its
        response with an ``AssertionError``, the data is fetched again through
        ``get_tm_m_body_data``. On success the processed dict is also stored
        in ``self.result_data``.

        :param goods_id: sequence whose first item is the tm type and whose
            second item is the goods id; ``[]`` is rejected
        :return: the processed ``data`` section (dict) on success; the value of
            ``self._data_error_init()`` on failure; the value of
            ``self.init_pull_off_shelves_goods(...)`` when the goods are
            detected as off the shelves
        """
        if goods_id == []:
            return self._data_error_init()

        tm_type, tm_goods_id = goods_id[0], goods_id[1]

        # Becomes True once the base API response was rejected and the second
        # fetch method supplied the data instead.
        used_fallback = False
        try:
            data = self.get_tm_base_data(goods_id=tm_goods_id)
            try:
                tb_api_redirect_detect(data=data)
            except AssertionError:
                self.lg.info(
                    'trying second method to get data[where goods_id: {}] ...'.
                    format(tm_goods_id))
                used_fallback = True
                data = get_tm_m_body_data(
                    goods_id=tm_goods_id,
                    proxy_type=self.proxy_type,
                    num_retries=self.req_num_retries,
                    logger=self.lg,
                )

            payload = data.get('data', {})
            redirect_url = payload.get('trade', {}).get('redirectUrl', '')
            seller_evaluates = payload.get('seller', {}).get('evaluates')
            if seller_evaluates is None:
                if redirect_url != '':
                    # Redirected and no seller evaluation: off the shelves.
                    raise GoodsShelvesException
                # The goods were probably moved or removed: page is gone.
                self.lg.error(
                    'data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错tm_type: {}, goods_id: {}'
                    .format(tm_type, tm_goods_id))
                return self._data_error_init()

            data = self._wash_tm_ori_data(data=data)
            result_data = data['data']
            # An empty apiStack raises IndexError, handled below.
            api_stack_value = result_data.get('apiStack', [])[0].get('value', {})

        except GoodsShelvesException:
            # The original address was redirected to a new page.
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=tm_goods_id,
                logger=self.lg,
            )
            off_shelves_data = self.init_pull_off_shelves_goods(tm_type=tm_type)
            self.result_data = {}
            return off_shelves_data

        except (AssertionError, IndexError):
            self.lg.error(
                msg='遇到错误[出错tm_type: {}, goods_id: {}]:'.format(
                    tm_type,
                    tm_goods_id,
                ),
                exc_info=True)
            return self._data_error_init()

        if used_fallback:
            # The second method's mockData is used as-is.
            mock_data = result_data['mockData']
        else:
            # Write the washed apiStack value back, then parse mockData
            # through json_2_dict.
            result_data['apiStack'][0]['value'] = \
                self._wash_result_data_apiStack_value(
                    goods_id=tm_goods_id,
                    result_data_apiStack_value=api_stack_value)
            mock_data = json_2_dict(
                json_str=result_data['mockData'],
                logger=self.lg)

        if mock_data == {}:
            self.lg.error('出错tm_type: {0}, goods_id: {1}'.format(
                tm_type, tm_goods_id))
            return self._data_error_init()

        mock_data['feature'] = ''
        result_data['mockData'] = mock_data

        # The first apiStack entry may look like {'name': 'esi', 'value': ''}.
        first_api_value = result_data.get('apiStack', [])[0].get('value', '')
        if first_api_value == '':
            self.lg.error(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空....出错tm_type: {}, goods_id: {}"
                .format(tm_type, tm_goods_id))
            result_data['trade'] = {}
            return self._data_error_init()

        # 'trade' is used later to decide whether the goods are off the shelves.
        result_data['trade'] = first_api_value.get('trade', {})

        # Coupons are collected by a separate spider, not here.
        result_data['type'] = tm_type
        result_data['goods_id'] = tm_goods_id
        self.result_data = result_data

        return result_data