Example #1
    async def _handle_al_goods_is_delete(self, goods_id) -> dict:
        '''
        Handle goods that can no longer be viewed or that have been pulled off the shelves.
        :return:
        '''
        try:
            sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            is_in_db = sql_cli._select_table(sql_str=al_select_str_1,
                                             params=(str(goods_id), ))
            # self.lg.info(str(is_in_db))
        except Exception:
            self.lg.error('数据库连接失败!' + self.error_base_record, exc_info=True)
            return await self._data_error_init()

        self.result_data = {}
        # Initialise the attributes of a pulled-off-the-shelves goods item
        tmp_data_s = await self._init_al_pull_off_shelves_goods()
        if is_in_db != []:
            # This goods_id was already inserted into the db, so just flip its is_delete flag
            sql_cli._update_table_2(sql_str=al_update_str_1,
                                    params=(goods_id,),
                                    logger=self.lg)
            self.lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
            # Records whether this goods was already present in the db
            tmp_data_s['before'] = True

        else:
            # This goods_id was not in the db yet
            self.lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
            tmp_data_s['before'] = False

        return tmp_data_s
Example #2
class JDUpdater(AsyncCrawler):
    """jd常规商品更新"""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/jd/实时更新/'
        )
        self.sql_cli = None
        self.goods_index = 1
        # concurrency
        self.concurrency = 10

    async def _get_db_old_data(self):
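        # Fetch the previously stored goods rows from the db; `result` stays None
        # when the select fails (e.g. the db connection is down or under maintenance).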
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jd_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jd_obj(self, index):
        if index % 10 == 0:         # do not keep sharing one parser object, otherwise driver access throws
            try:
                del self.jd
            except:
                pass
            collect()
            self.jd = JdParse(logger=self.lg)

    async def _get_tmp_item(self, site_id, goods_id):
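        # Map the db site_id to the parser's type code (7/8 -> 0, 9 -> 1, 10 -> 2;
        # per the site_id legend used in these examples: JD / JD supermarket /
        # JD global / JD pharmacy) and pair it with the goods_id.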
        tmp_item = []
        if site_id == 7 or site_id == 8:  # convert to the corresponding type code when reading from the db
            tmp_item.append(0)
        elif site_id == 9:
            tmp_item.append(1)
        elif site_id == 10:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single JD goods record.
        :param db_goods_info_obj:
        :param index:
        :return:
        '''
        res = False
        await self._get_new_jd_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
                db_goods_info_obj.goods_id,
                index))
            tmp_item = await self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id,)
            data = self.jd.get_goods_data(goods_id=tmp_item)
            if data.get('is_delete', 1) == 1:
                self.lg.info('该商品已下架...')
                self.sql_cli._update_table_2(
                    sql_str=jd_update_str_2,
                    params=(str(get_shanghai_time()), tmp_item[1],),
                    logger=self.lg)
                await async_sleep(1.2)
                index += 1
                self.goods_index = index

                return db_goods_info_obj.goods_id, index

            data = self.jd.deal_with_data(goods_id=tmp_item)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='jd',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                self.jd.to_right_and_update_data(data, pipeline=self.sql_cli)

            else:  # the returned data was empty
                pass
        else:  # db connection failed
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.2)       # avoid our proxy usage being detected

        return db_goods_info_obj.goods_id, index

    async def _update_db(self):
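        # Main loop: pull the old rows from the db, split them into slices of
        # self.concurrency, run one update task per goods, then sleep
        # (a long sleep after midnight, a short one otherwise).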
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.jd = JdParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all slices consumed, exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JDDbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.jd
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except: pass
        try:
            del self.loop
        except: pass
        collect()
Example #3
    def get_goods_data(self, goods_id):
        '''
        Build the url that yields the data (simulated request).
        :param goods_id:
        :return: data (dict)
        '''
        self.msg = '------>>>| 对应的手机端地址为: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(
            goods_id)
        self.lg.info(self.msg)

        # Fetch the body of the main api
        last_url = self._get_last_url(goods_id=goods_id)
        data = Requests.get_url_body(url=last_url,
                                     headers=self.headers,
                                     params=None,
                                     timeout=14,
                                     high_conceal=True,
                                     ip_pool_type=self.ip_pool_type)
        if data == '':
            self.lg.error('出错goods_id: {0}'.format((goods_id)))
            return self._data_error_init()

        try:
            # greedy match, capture everything
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
            # self.lg.info(str(data))
        except IndexError:
            self.lg.error('data为空! 出错goods_id: {0}'.format(goods_id))
            return self._data_error_init()

        data = json_2_dict(json_str=data, logger=self.lg)
        if data == {}:
            self.lg.error('出错goods_id: {0}'.format(str(goods_id)))
            return self._data_error_init()
        # pprint(data)

        if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                and data.get('data', {}).get('seller', {}).get('evaluates') is None:
            '''
            ## The goods has been pulled off the shelves; the original url is redirected to a new page
            '''
            self.lg.info('@@@@@@ 该商品已经下架...')
            _ = SqlServerMyPageInfoSaveItemPipeline()
            if _.is_connect_success:
                _._update_table_2(sql_str=tb_update_str_3,
                                  params=(goods_id, ),
                                  logger=self.lg)
                try:
                    del _
                except:
                    pass
            tmp_data_s = self.init_pull_off_shelves_goods()
            self.result_data = {}
            return tmp_data_s

        # Handle goods whose page no longer exists because it was moved or delisted
        if data.get('data').get('seller', {}).get('evaluates') is None:
            self.lg.info('data为空, 地址被重定向, 该商品可能已经被转移或下架')
            return self._data_error_init()

        data['data']['rate'] = ''  # item reviews
        data['data']['resource'] = ''  # buyers asking other buyers
        data['data']['vertical'] = ''  # also questions and answers
        data['data']['seller']['evaluates'] = ''  # rating values for item description, seller service, logistics...
        result_data = data['data']

        # Process result_data['apiStack'][0]['value']
        # self.lg.info(result_data.get('apiStack', [])[0].get('value', ''))
        result_data_apiStack_value = result_data.get('apiStack', [])[0].get('value', {})

        # Assign the washed value back to result_data['apiStack'][0]['value']
        result_data['apiStack'][0]['value'] = self._wash_result_data_apiStack_value(
            goods_id=goods_id,
            result_data_apiStack_value=result_data_apiStack_value)

        # Process mockData
        mock_data = result_data['mockData']
        mock_data = json_2_dict(json_str=mock_data, logger=self.lg)
        if mock_data == {}:
            self.lg.error('出错goods_id: {0}'.format(goods_id))
            return self._data_error_init()
        mock_data['feature'] = ''
        # pprint(mock_data)
        result_data['mockData'] = mock_data

        # self.lg.info(str(result_data.get('apiStack', [])[0]))   # may be {'name': 'esi', 'value': ''}
        if result_data.get('apiStack', [])[0].get('value', '') == '':
            self.lg.info(
                "result_data.get('apiStack', [])[0].get('value', '')的值为空....")
            result_data['trade'] = {}
            return self._data_error_init()
        else:
            result_data['trade'] = result_data.get('apiStack', [])[0].get(
                'value', {}).get('trade', {})  # used to decide whether the goods has been delisted
            # pprint(result_data['trade'])

        self.result_data = result_data
        # pprint(self.result_data)

        return result_data
Example #4
class GX8899Spider(Crawler):
    def __init__(self, logger=None):
        super(GX8899Spider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/gx8899/_/',
            is_use_driver=True,
            driver_executable_path=PHANTOMJS_DRIVER_PATH
        )
        self._set_sort_type_name()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        self.id_list = []
        self.update_index = 0

    def _set_sort_type_name(self):
        '''
        Set the category names to crawl.
        :return:
        '''
        self.sort_type_name_list = [
            # 'weixin',
            # 'nansheng',
            # 'nvsheng',
            'fengjing',
            'jingxuan',
            'wupin',
            'oumei',
            'weimei',
            'heibai',
            'baqi',
            'xiaoqingxin',
            'yijing',
            'beiying',
            'chouyan',
            'sumiao',
            'gexing',
            'xiaohai',
            'qiche',
            'zhiwu',
            'shouhui',
            'weshen',
            'mingxing',
            'jianzhu',
            'renwu',
        ]

    def _get_gx8899_all_img_url(self):
        self.lg.info('即将开始采集gx8899...')
        fz = []
        for sort_type_name in self.sort_type_name_list:
            tmp = self._get_one_sort_type_name_page_info(sort_type_name)
            if tmp != []:
                fz += tmp

        self.lg.info('@@@ 全部头像抓取完毕!')
        self.fz = fz

        return fz

    def _get_new_wait_2_handle_id_list(self):
        '''
        Fetch a new batch of pending ids.
        :return:
        '''
        sql_str = '''
        select top 1000 id 
        from dbo.sina_weibo
        where sina_type = 'bilibili' and modify_time is null
        '''
        if self.id_list == []:
            self.lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                wait = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [i[0] for i in wait]
            except (TypeError, IndexError):
                sleep(8)
                return []
        else:
            pass

        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
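        # Update one dbo.sina_weibo row's head_img_url and modify_time; the
        # fz_set_timeout(6) decorator presumably bounds the call to 6 seconds.
        # Returns True on success, False otherwise.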
        try:
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=(img_url, get_shanghai_time(), id),
                logger=self.lg
            )
        except Exception:
            return False
        return True

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        '''
        Get the info of one page for a category.
        :return:
        '''
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Referer': 'http://m.gx8899.com/weixin/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

        index = 0
        res = []
        while True:
            if index == 0:
                url = base_url
                index += 1  # the second page starts at index_2
            else:
                url = base_url + 'index_{0}.html'.format(index)

            self.lg.info('正在抓取{0}'.format(url))
            # too slow, switched to phantomjs instead
            # body = self._get_loop_run_result(url=url, headers=headers)

            if index % 15 == 0:
                try:
                    del self.driver
                except:
                    pass
                gc.collect()
                self.driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.lg, ip_pool_type=self.ip_pool_type)
                self.lg.info('[+] phantomjs已重置!')

            body = self.driver.get_url_body(url=url)
            # self.lg.info(str(body))
            if re.compile(r'<title>404 - 找不到文件或目录。</title>').findall(body) != []:
                break

            need = Selector(text=body).css('div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                index += 1
                continue

            for article_url in need:
                _ = self._get_one_article_page_info(article_url)
                if _ != []:
                    res += _

                self.lg.info('#### 已更新{0}个id !'.format(self.update_index))

            index += 1

        return res

    def _get_one_article_page_info(self, url):
        '''
        Get the list of all image urls inside one article page.
        :param url:
        :return:
        '''
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

        # body = self._get_loop_run_result(url=url, headers=headers)
        body = self.driver.get_url_body(url=url)
        if body == '':
            self.lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []

        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        # pprint(need)
        # self.lg.info(str(need))
        if need != []:
            self.lg.info('[+] crawl子地址success')
        else:
            self.lg.info('[-] crawl子地址fail')

        # Update the db rows
        for img_url in need:
            try:
                random_id_index = randint(0, len(self._get_new_wait_2_handle_id_list())-1)
            except:
                sleep(5)
                continue
            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url,
            )
            if res:
                self.id_list.pop(random_id_index)
                self.update_index += 1

        return need

    async def _get_one_page_body(self, url, headers):
        '''
        Fetch the body asynchronously.
        :param url:
        :param headers:
        :return:
        '''
        body = await AioHttp.aio_get_url_body(url=url, headers=headers, ip_pool_type=self.ip_pool_type)

        return body

    def _get_loop_run_result(self, **kwargs):
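        # Run the async body fetch synchronously on the event loop and return its result.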
        loop = get_event_loop()
        result = loop.run_until_complete(self._get_one_page_body(
            url=kwargs.get('url', ''),
            headers=kwargs.get('headers', {})
        ))

        return result

    def __del__(self):
        try:
            del self.driver
            del self.lg
        except:
            pass
        gc.collect()
Example #5
class YXGoodsInfoMonitorSpider(AsyncCrawler):
    """cp goods info monitor"""
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/cp/yx_goods_monitor/',
        )
        self.req_num_retries = 5
        self.concurrency = 100
        self.concurrent_type = 1
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.init_sql_str()

    async def _fck_run(self):
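        # Main loop: load the target rows, check the price of every goods_id,
        # then sleep 10s before the next round.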
        while True:
            self.db_res = await self.get_db_res()
            await self.get_all_goods_info_and_handle_by_goods_id_list(
                goods_id_list=[item[0] for item in self.db_res], )
            await async_sleep(10.)

    async def get_db_res(self) -> list:
        """
        获取目标goods_id_list
        :return:
        """
        db_res = []
        try:
            db_res = list(self.sql_cli._select_table(sql_str=self.sql_tr0, ))
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        assert db_res != []

        return db_res

    async def get_all_goods_info_and_handle_by_goods_id_list(
            self, goods_id_list: list):
        """
        根据goods_id_list获取所有goods info并处理
        :return:
        """
        async def get_tasks_params_list() -> list:
            tasks_params_list = []
            for goods_id in goods_id_list:
                tasks_params_list.append({'goods_id': goods_id})

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where goods_id: {}] ...'.format(
                k['goods_id'], )

        def get_now_args(k) -> list:
            return [
                k['goods_id'],
            ]

        assert goods_id_list != []
        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_goods_info_by_goods_id,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=self.handle_one_res,
            func_name_where_add_one_res_2_all_res=
            default_add_one_res_2_all_res2,
            one_default_res={},
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
        )

        return all_res

    def handle_one_res(self, one_res) -> None:
        """
        处理单个结果
        :return:
        """
        # 每次重连
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if not self.sql_cli.is_connect_success:
            return

        # pprint(one_res)
        for item in one_res:
            try:
                goods_id = item.get('goods_id', '')
                assert goods_id != ''
                # member price
                tb_price0 = item.get('tb_price0', 0.)
                assert tb_price0 != 0.
                # 优点 (points) price
                tb_price1 = item.get('tb_price1', 0.)
            except AssertionError:
                continue

            for db_item in self.db_res:
                db_goods_id = db_item[0]
                db_tb_price = float(db_item[1]).__round__(2)
                site_id = db_item[3]
                modify_time = db_item[4]
                goods_url = db_item[5]
                if goods_id == db_goods_id:
                    if tb_price1 != db_tb_price:
                        # compare against the member price first
                        if tb_price0 != db_tb_price:
                            # then compare the 优点 price; only correct when both differ,
                            # otherwise pass (some cp member prices are displayed incorrectly)
                            now_time = get_shanghai_time()
                            cp_url = 'https://m.yiuxiu.com/Product/Info/{}'.format(
                                goods_id)
                            self.lg.info(
                                'goods_id: {}, 优点价格: {}, 会员价格: {}, db_tb_price: {}, site_id: {}, modify_time: {}, cp_url: {}, goods_url: {}'
                                .format(
                                    goods_id,
                                    tb_price1,
                                    tb_price0,
                                    db_tb_price,
                                    site_id,
                                    modify_time,
                                    cp_url,
                                    goods_url,
                                ))
                            self.sql_cli._update_table_2(
                                sql_str=self.sql_tr1,
                                params=(
                                    now_time,
                                    now_time,
                                    now_time,
                                    goods_id,
                                ),
                                logger=self.lg,
                            )
                        else:
                            pass
                        break
                    else:
                        continue

        return None

    @catch_exceptions_with_class_logger(default_res={})
    def get_goods_info_by_goods_id(self, goods_id: str) -> dict:
        """
        根据goods_id获取商品信息
        :param goods_id:
        :return:
        """
        def parse_body() -> dict:
            """
            解析
            :return:
            """
            nonlocal body

            # lowest price among multiple specs
            # some yx member prices are displayed wrongly, so also use the big price plus 优点 points; use both together
            tb_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPriceTips span:nth-child(2) ::text',
            }
            big_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPrice big ::text',
            }
            yd_sel = {
                'method': 're',
                'selector': r'<span class="yiudianPrice">\+(\d+)优点</span>'
            }
            tb_price0 = parse_field(
                parser=tb_price_sel,
                target_obj=body,
            )
            assert tb_price0 != ''
            big_price = parse_field(
                parser=big_price_sel,
                target_obj=body,
            )
            assert big_price != ''
            yd = parse_field(
                parser=yd_sel,
                target_obj=body,
            )
            assert yd != ''
            # member price
            tb_price0 = float(tb_price0).__round__(2)
            # 优点 (points) price: big price plus points, at what appears to be 100 points per yuan
            tb_price1 = (float(big_price) + float(yd) / 100).__round__(2)

            return {
                'goods_id': goods_id,
                'tb_price0': tb_price0,
                'tb_price1': tb_price1,
            }

        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
        )
        headers.update({
            'authority': 'm.yiuxiu.com',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '******',
            'sec-fetch-site': 'none',
        })
        url = 'https://m.yiuxiu.com/Product/Info/{}'.format(goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
        )
        assert body != ''
        # self.lg.info(body)

        res = parse_body()
        self.lg.info('[{}] goods_id: {}'.format(
            '+' if res != {} else '-',
            goods_id,
        ))

        return res

    def init_sql_str(self):
        self.sql_tr0 = '''
        select MainGoodsID, TaoBaoPrice, Price, SiteID, ModfiyTime, GoodsUrl
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        '''
        self.sql_tr1 = '''
        update dbo.GoodsInfoAutoGet 
        set is_spec_change=1, 
        spec_trans_time=%s, 
        ModfiyTime=%s, 
        IsPriceChange=1, 
        sku_info_trans_time=%s, 
        PriceChangeInfo=SKUInfo
        where MainGoodsID=%s
        '''

    def __del__(self):
        try:
            del self.lg
            del self.loop
        except:
            pass
        collect()
Example #6
    def get_ali_1688_data(self, goods_id):
        if goods_id == '':
            return self._data_error_init()

        wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(
            goods_id) + '.html'
        self.my_lg.info(
            '------>>>| 待处理的阿里1688地址为: {0}'.format(wait_to_deal_with_url))

        self.error_base_record = '出错goods_id:{0}'.format(goods_id)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=wait_to_deal_with_url, css_selector='div.d-content')
        # self.my_lg.info(str(body))
        if body == '':
            self.my_lg.error('获取到的body为空str!请检查!' + self.error_base_record)
            return self._data_error_init()

        tmp_body = body
        try:
            pull_off_shelves = Selector(
                text=body).css('div.d-content p.info::text').extract_first()
        except:
            pull_off_shelves = ''
        if pull_off_shelves == '该商品无法查看或已下架':  # the goods is delisted; still run the data-insert logic
            try:
                tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                is_in_db = tmp_my_pipeline._select_table(
                    sql_str=al_select_str_1, params=(str(goods_id), ))
                # self.my_lg.info(str(is_in_db))
            except Exception:
                self.my_lg.error('数据库连接失败!' + self.error_base_record,
                                 exc_info=True)
                return self._data_error_init()

            if is_in_db != []:  # this goods_id was already inserted into the db, so just flip its is_delete flag
                tmp_my_pipeline._update_table_2(sql_str=al_update_str_1,
                                                params=(goods_id,),
                                                logger=self.my_lg)
                self.my_lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
                tmp_data_s = self.init_pull_off_shelves_goods()  # initialise the attributes of the delisted goods
                tmp_data_s['before'] = True  # records whether the goods was already in the db
                self.result_data = {}

                return tmp_data_s

            else:  # the goods_id was not in the db yet
                self.my_lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
                tmp_data_s = self.init_pull_off_shelves_goods()  # initialise the attributes of the delisted goods
                tmp_data_s['before'] = False
                self.result_data = {}

                return tmp_data_s

        body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(
            body)
        if body != []:
            body = body[0]
            body = r'{"beginAmount"' + body
            # self.my_lg.info(str(body))
            body = json_2_dict(json_str=body)
            # pprint(body)

            if body.get('discountPriceRanges') is not None:
                self.result_data = self._wash_discountPriceRanges(body=body)
                return self.result_data
            else:
                self.my_lg.error('data为空!' + self.error_base_record)
                return self._data_error_init()

        else:
            self.my_lg.info('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!')
            body = re.compile(
                r'{"activityId"(.*?)</script></div></div>').findall(tmp_body)
            if body != []:
                body = body[0]
                body = r'{"activityId"' + body
                # self.my_lg.info(str(body))
                body = json_2_dict(json_str=body)
                # pprint(body)

                if body.get('discountPriceRanges') is not None:
                    self.result_data = self._wash_discountPriceRanges(
                        body=body)
                    self.is_activity_goods = True
                    return self.result_data
                else:
                    self.my_lg.error('data为空!' + self.error_base_record)
                    return self._data_error_init()
            else:
                self.my_lg.error('这个商品对应活动属性未知, 此处不解析, 设置为跳过!' +
                                 self.error_base_record)
                return self._data_error_init()
Example #7
def run_forever():
    while True:
        # ** must not be declared as a global outside the loop, otherwise everything keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        #### Real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Free memory: declaring this outside the loop keeps a large footprint,
            # so declare it here and delete/re-create it periodically to release memory.
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # real-time update
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # handle delisted goods separately
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )

                        yanxuan.to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:  # the returned data was empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # db connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
Example #8
    def get_goods_data(self, goods_id):
        '''
        Get the data.
        :param goods_id:
        :return: data (dict)
        '''
        if goods_id == []:
            self.result_data = {}
            return {}

        type = goods_id[0]  # tmall type
        # self.my_lg.info(str(type))
        goods_id = goods_id[1]  # tmall goods_id
        tmp_url = 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
        self.my_lg.info('------>>>| 得到的移动端地址为: %s' % tmp_url)

        self.headers.update({'Referer': tmp_url})
        last_url = self._get_last_url(goods_id=goods_id)
        body = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14)
        if body == '':
            self.my_lg.error('出错goods_id: {0}'.format((goods_id)))
            self.result_data = {}
            return {}

        try:
            assert body != '', '获取到的body为空值, 此处跳过! 出错type %s: , goods_id: %s' % (str(type), goods_id)
            data = re.compile(r'mtopjsonp3\((.*)\)').findall(body)[0]  # greedy match, capture everything
        except (AssertionError, IndexError) as e:
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        if data != '':
            data = json_2_dict(json_str=data, logger=self.my_lg)
            if data == {}:
                self.my_lg.error('出错type: %s, goods_id: %s' % (str(type), str(goods_id)))
                self.result_data = {}  # reset to avoid affecting later crawls' assignments when storing
                return {}
            # pprint(data)

            if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                    and data.get('data', {}).get('seller', {}).get('evaluates') is None:
                '''
                ## The goods has been pulled off the shelves; the original url is redirected to a new page
                '''
                self.my_lg.info('@@@@@@ 该商品已经下架...')
                _ = SqlServerMyPageInfoSaveItemPipeline()
                if _.is_connect_success:
                    _._update_table_2(sql_str=tm_update_str_3, params=(goods_id,), logger=self.my_lg)
                    try:
                        del _
                    except:
                        pass
                tmp_data_s = self.init_pull_off_shelves_goods(type)
                self.result_data = {}
                return tmp_data_s

            # Handle goods whose page no longer exists because it was moved or delisted
            if data.get('data', {}).get('seller', {}).get('evaluates') is None:
                self.my_lg.error('data为空, 地址被重定向, 该商品可能已经被转移或下架, 出错type: %s, goods_id: %s' % (str(type), str(goods_id)))
                self.result_data = {}  # reset to avoid affecting later crawls' assignments when storing
                return {}

            data['data']['rate'] = ''  # item reviews
            data['data']['resource'] = ''  # buyers asking other buyers
            data['data']['vertical'] = ''  # also questions and answers
            data['data']['seller']['evaluates'] = ''  # rating values for item description, seller service, logistics...
            result_data = data['data']

            # Process result_data['apiStack'][0]['value']
            # self.my_lg.info(result_data.get('apiStack', [])[0].get('value', ''))
            result_data_apiStack_value = result_data.get('apiStack', [])[0].get('value', {})

            # Assign the washed value back to result_data['apiStack'][0]['value']
            result_data['apiStack'][0]['value'] = self._wash_result_data_apiStack_value(
                goods_id=goods_id,
                result_data_apiStack_value=result_data_apiStack_value
            )

            # Process mockData
            mock_data = result_data['mockData']
            mock_data = json_2_dict(json_str=mock_data, logger=self.my_lg)
            if mock_data == {}:
                self.my_lg.error('出错type: {0}, goods_id: {1}'.format(type, goods_id))
                self.result_data = {}  # reset to avoid affecting later crawls' assignments when storing
                return {}
            mock_data['feature'] = ''
            # pprint(mock_data)
            result_data['mockData'] = mock_data

            # self.my_lg.info(str(result_data.get('apiStack', [])[0]))   # may be {'name': 'esi', 'value': ''}
            if result_data.get('apiStack', [])[0].get('value', '') == '':
                self.my_lg.error("result_data.get('apiStack', [])[0].get('value', '')的值为空....出错type: %s, goods_id: %s" % (str(type), goods_id))
                result_data['trade'] = {}
                self.result_data = {}  # reset to avoid affecting later crawls' assignments when storing
                return {}
            else:
                result_data['trade'] = result_data.get('apiStack', [])[0].get('value', {}).get('trade', {})     # used to decide whether the goods has been delisted
                # pprint(result_data['trade'])

            result_data['type'] = type
            result_data['goods_id'] = goods_id
            self.result_data = result_data
            # pprint(self.result_data)
            return result_data

        else:
            self.my_lg.error('data为空! 出错type: %s, goods_id: %s' % (str(type), str(goods_id)))
            self.result_data = {}  # reset to avoid affecting later crawls' assignments when storing
            return {}
Example #9
class CommentRealTimeUpdateSpider(object):
    def __init__(self):
        self._set_logger()
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.sql_str = cm_update_str_1

        if self._init_debugging_api().get(2):
            self.my_lg.info('初始化 1688 phantomjs中...')
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(3) is True \
                or self._init_debugging_api().get(4) is True\
                or self._init_debugging_api().get(6) is True:
            self.my_lg.info('初始化 天猫 phantomjs中...')
            self.tmall = TmallCommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(7) is True \
                or self._init_debugging_api().get(8) is True\
                or self._init_debugging_api().get(9) is True\
                or self._init_debugging_api().get(10) is True:
            self.my_lg.info('初始化 京东 phantomjs中...')
            self.jd = JdCommentParse(logger=self.my_lg)

    def _set_logger(self):
        self.my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/all_comment/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

    def _init_debugging_api(self):
        '''
        Set the site_id values of the goods to crawl.
        :return: dict
        '''
        return {
            1: True,
            2: True,
            3: True,
            4: True,
            6: True,
            7: True,
            8: True,
            9: True,
            10: True,
            11: False,
            12: False,
            13: False,
            25: False,
        }

    def _set_func_name_dict(self):
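        # Templates used by _just_run's dynamic dispatch: each entry is formatted
        # with (index, goods_id, site_id) and then executed via compile()/exec().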
        self.func_name_dict = {
            'taobao': 'self._update_taobao_comment({0}, {1}, {2})',
            'ali': 'self._update_ali_1688_comment({0}, {1}, {2})',
            'tmall': 'self._update_tmall_comment({0}, {1}, {2})',
            'jd': 'self._update_jd_comment({0}, {1}, {2})',
            'zhe_800': 'self._update_zhe_800_comment({0}, {1}, {2})',
            'juanpi': 'self._update_juanpi_comment({0}, {1}, {2})',
            'pinduoduo': 'self._update_pinduoduo_comment({0}, {1}, {2})',
            'vip': 'self._update_vip_comment({0}, {1}, {2})',
        }

    def _just_run(self):
        while True:
            #### Update the data
            self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            #  and GETDATE()-a.modify_time>1
            try:
                result = list(self._comment_pipeline._select_table(sql_str=cm_select_str_1, logger=self.my_lg))
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                continue

            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info('--------------------------------------------------------')
            self.my_lg.info('待更新个数: {0}'.format(len(result)))

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            # 1. Taobao 2. Ali 1688 3. Tmall 4. Tmall supermarket 5. Juhuasuan 6. Tmall global 7. JD 8. JD supermarket 9. JD global 10. JD pharmacy  11. Zhe800 12. Juanpi 13. Pinduoduo 14. Zhe800 flash sale 15. Juanpi flash sale 16. Pinduoduo flash sale 25. Vip.com
            for index, item in enumerate(result):     # item: (goods_id, site_id)
                if not self.debugging_api.get(item[1]):
                    self.my_lg.info('api为False, 跳过! 索引值[%s]' % str(index))
                    continue

                if index % 20 == 0:
                    try: del self._comment_pipeline
                    except: pass
                    self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()

                switch = {
                    1: self.func_name_dict.get('taobao'),       # Taobao
                    2: self.func_name_dict.get('ali'),          # Ali 1688
                    3: self.func_name_dict.get('tmall'),        # Tmall
                    4: self.func_name_dict.get('tmall'),        # Tmall supermarket
                    6: self.func_name_dict.get('tmall'),        # Tmall global
                    7: self.func_name_dict.get('jd'),           # JD
                    8: self.func_name_dict.get('jd'),           # JD supermarket
                    9: self.func_name_dict.get('jd'),           # JD global
                    10: self.func_name_dict.get('jd'),          # JD pharmacy
                    11: self.func_name_dict.get('zhe_800'),     # Zhe800
                    12: self.func_name_dict.get('juanpi'),      # Juanpi
                    13: self.func_name_dict.get('pinduoduo'),   # Pinduoduo
                    25: self.func_name_dict.get('vip'),         # Vip.com
                }

                # dynamically execute the matching update method
                exec_code = compile(switch[item[1]].format(index, item[0], item[1]), '', 'exec')
                exec(exec_code)
                sleep(1.1)

    def _update_taobao_comment(self, index, goods_id, site_id):
        '''
        Handle Taobao goods comments.
        :param index: index
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 淘宝\t\t索引值(%s)' % str(index))

            taobao = TaoBaoCommentParse(logger=self.my_lg)
            _r = taobao._get_comment_data(goods_id=str(goods_id))

            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

            try:
                del taobao
            except:
                self.my_lg.info('del taobao失败!')
            gc.collect()
        else:
            pass

    def _update_ali_1688_comment(self, index, goods_id, site_id):
        '''
        Handle Ali 1688 goods comments.
        :param index: index
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 阿里1688\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try: del self.ali_1688
                except: self.my_lg.error('del ali_1688失败!')
                gc.collect()
                self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

            _r = self.ali_1688._get_comment_data(goods_id=goods_id)
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)

            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

        else:
            pass

    def _update_tmall_comment(self, index, goods_id, site_id):
        '''
        Handle Tmall goods comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 天猫\t\t索引值(%s)' % str(index))

            if site_id == 3:
                _type = 0
            elif site_id == 4:
                _type = 1
            elif site_id == 6:
                _type = 2
            else:
                return None

            if index % 5 == 0:
                try:
                    del self.tmall
                except:
                    self.my_lg.info('del tmall失败!')
                gc.collect()
                self.tmall = TmallCommentParse(logger=self.my_lg)

            _r = self.tmall._get_comment_data(type=_type, goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')
            gc.collect()
        else:
            pass

    def _update_jd_comment(self, index, goods_id, site_id):
        '''
        Handle JD goods comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 京东\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try: del self.jd
                except: self.my_lg.info('del jd失败!')
                gc.collect()
                self.jd = JdCommentParse(logger=self.my_lg)

            _r = self.jd._get_comment_data(goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')
        else:
            pass

    def _update_zhe_800_comment(self, index, goods_id, site_id):
        '''
        Handle Zhe800 goods comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_juanpi_comment(self, index, goods_id, site_id):
        '''
        Handle Juanpi goods comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_pinduoduo_comment(self, index, goods_id, site_id):
        '''
        Handle Pinduoduo comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_vip_comment(self, index, goods_id, site_id):
        '''
        Handle Vip.com comments.
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _get_db_update_params(self, item):
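        # Parameter tuple passed with the comment update sql (cm_update_str_1):
        # (modify_time, json-dumped comment list, goods_id).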
        return (
            item['modify_time'],
            dumps(item['_comment_list'], ensure_ascii=False),

            item['goods_id'],
        )

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.debugging_api
        except:
            pass
        try: del self._comment_pipeline
        except: pass
        try:
            del self.tmall
        except:
            pass
        gc.collect()
Example #10
class MIUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = mia_delete_str_3
        self.concurrency = 8  # concurrency
        self.tmp_sql_server = None
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        return {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.mia.com',
            'User-Agent': get_random_pc_ua(),  # random pc user agent
        }

    async def _get_db_old_data(self):
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time):
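        # Parse the stored miaosha_time json and convert its 'miaosha_end_time'
        # ('%Y-%m-%d %H:%M:%S') into a 10-digit unix timestamp.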
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        if index % 10 == 0:  # do not keep sharing one parser object, otherwise driver access throws
            try:
                del self.mia_miaosha
            except:
                pass
            collect()
            self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods item.
        :param item:
        :param index:
        :return:
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                res = self.tmp_sql_server._update_table_2(
                    sql_str=mia_update_str_6,
                    params=(goods_id, ),
                    logger=self.lg)
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id,
                    json_2_dict(miaosha_time).get('miaosha_begin_time')))
                await async_sleep(.5)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                self.goods_index = index + 1

                return goods_id, res

            else:  # returned 1, i.e. within the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                    pid)
                body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             had_referer=True,
                                             ip_pool_type=self.ip_pool_type)
                # print(body)
                body = '' if body == '' or body == '[]' else body
                try:
                    tmp_data = json_2_dict(body, default_res={})
                    assert tmp_data != {}, 'tmp_data为空dict!'
                except AssertionError:
                    self.lg.error('遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                item_list = tmp_data.get('item_list', [])
                # list of all goods_id currently under this pid
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                # self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:  # already delisted internally
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = self.tmp_sql_server._update_table_2(
                        sql_str=mia_update_str_6,
                        params=(goods_id, ),
                        logger=self.lg)
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still on the shelves
                    res = await self._one_update(item_list=item_list,
                                                 goods_id=goods_id,
                                                 tmp_data=tmp_data)

        else:  # db connection failed
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        await async_sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
        self.goods_index = index + 1
        collect()

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time flash-sale (miaosha) update.
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all slices consumed, exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.mia_miaosha
            except:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update goods that are still on the shelves.
        :param kwargs:
        :return:
        '''
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data)
        for item_2 in item_list:
            if item_2.get('item_id', '') == goods_id:
                self.mia_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.mia_miaosha.deal_with_data()
                if goods_data == {}:  # 返回的data为空则跳过
                    pass
                else:
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item_2.get('active_price')
                    goods_data['taobao_price'] = item_2.get('active_price')
                    goods_data['sub_title'] = item_2.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time':
                        timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])

                    res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break
            else:
                pass

        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        # 把str字符串类型转换为时间戳的形式
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        '''
        判断是否在指定的日期差内
        :param timestamp: 时间戳
        :return: 0: 已过期超过24小时, 需恢复原价的 1: 活动未结束, 待更新的 2: 已过期但不足24小时, 暂不删除的
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # 当前的时间戳

        diff_time = time_1 - time_2
        if diff_time < -86400:  # (为了后台能同步下架)所以设置为 24个小时
            # if diff_time < 0:     # (原先的时间)结束时间 与当前时间差 <= 0
            return 0  # 已过期恢复原价的
        elif diff_time > 0:
            return 1  # 活动尚未结束, 属于待更新的
        else:  # 已过期但不足24小时, 暂不删除(超过24小时后再删除)
            return 2
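
    # Illustration (added, values hypothetical): with an activity end timestamp t and
    # the current Shanghai timestamp now, _is_recent_time() returns 1 while t > now
    # (the flash sale is still running and keeps being updated), 2 while
    # now - 86400 <= t <= now (ended less than 24h ago, kept so the backend can still
    # sync the takedown), and 0 once t < now - 86400 (expired for over a day, the
    # original price gets restored).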

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.mia_miaosha
        except:
            pass
        collect()
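
# --- usage sketch (added for illustration, not part of the original example) ---
# The flash-sale updater above is shown without its class statement. Assuming the
# enclosing class follows the same AsyncCrawler pattern as the other examples here
# (it owns a .loop and the _update_db coroutine defined above), a minimal launcher
# would simply drive that coroutine on the crawler's own event loop:
def _run_miaosha_updater(updater_cls):
    # updater_cls is the (unnamed here) updater class; purely illustrative
    updater = updater_cls()
    try:
        updater.loop.run_until_complete(updater._update_db())
    finally:
        del updater
        collect()
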
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 优化内存: 若在循环外声明会长期占用内存, 故改为循环内声明, 用完即del释放
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # 每10次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        if data.get('is_delete') == 1:  # 单独处理下架商品
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=yx_update_str_2, params=(item[1],), logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price']
                            )
                            try:
                                old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=30)
                            except AttributeError:  # 处理已被格式化过的
                                old_sku_info = item[7]
                            data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(data['price_info_list'], site_id=30),
                                is_price_change=item[8] if item[8] is not None else 0
                            )

                        yanxuan.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # 数据库连接失败
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
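
# Entry-point sketch (added for illustration): the original file presumably starts the
# Yanxuan loop from a __main__ guard, roughly:
#
#     if __name__ == '__main__':
#         run_forever()
#
# That guard is an assumption; it is not shown in the source above.
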
    async def get_coupon_url_list_by_goods_id_list(self,
                                                   slice_params_list) -> list:
        """
        根据给与的goods_id_list来获取对应的coupon_url_list
        :return:
        """
        def get_create_task_msg(k) -> str:
            return 'create task[where goods_id: {}, site_id: {}] ...'.format(
                k['goods_id'],
                k['site_id'],
            )

        def get_now_args(k) -> list:
            return [
                k['goods_id'],
            ]

        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=slice_params_list,
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_tm_coupon_url_from_lq5u,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res='',
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
            func_timeout=25,
        )

        res = []
        for item in all_res:
            if item != '':
                res.append(item)

        # 修改对应的goods_id的coupon_check_time
        sql_str = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s'
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        for item in slice_params_list:
            goods_id = item['goods_id']
            coupon_check_time_change_res = False
            try:
                coupon_check_time_change_res = sql_cli._update_table_2(
                    sql_str=sql_str,
                    params=(
                        get_shanghai_time(),
                        goods_id,
                    ),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('[{}] update goods_id: {} coupon_check_time'.format(
                '+' if coupon_check_time_change_res else '-',
                goods_id,
            ))
        try:
            del sql_cli
        except:
            pass

        try:
            del all_res
        except:
            pass
        collect()

        return res
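
# --- standalone analogue (added for illustration) ---
# get_coupon_url_list_by_goods_id_list above fans work out through the project's
# get_or_handle_target_data_by_task_params_list helper, whose internals are not
# shown here. A rough plain-asyncio sketch of the same slice-and-gather pattern,
# with fetch_one standing in for get_tm_coupon_url_from_lq5u, might look like this:
import asyncio

async def fetch_one(goods_id: str) -> str:
    # stand-in fetcher: returns '' when no coupon url is found
    await asyncio.sleep(0)
    return 'https://example.com/coupon?goods_id={}'.format(goods_id)

async def fetch_in_slices(goods_id_list: list, step: int = 10) -> list:
    all_res = []
    for i in range(0, len(goods_id_list), step):
        tasks = [asyncio.ensure_future(fetch_one(gid))
                 for gid in goods_id_list[i:i + step]]
        all_res.extend(await asyncio.gather(*tasks))
    # drop the '' placeholders, mirroring the filtering done in the method above
    return [item for item in all_res if item != '']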
Exemple #13
0
class ZWMSpider(AsyncCrawler):
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/',
        )
        self.init_zwm_pwd()
        self.concurrency = 20
        self.num_retries = 6
        self.max_transaction_details_page_num = 20              # 交易截止抓取页
        self.max_business_settlement_records_page_num = 20      # 商户结算记录截止抓取页
        self.max_business_manage_page_num = 80                  # 商户及门店管理截止抓取页(若数据页数超过此数量则需调大)
        self.login_cookies_dict = {}
        self.sleep_time = 5

    def init_zwm_pwd(self):
        ori_data = ''
        with open(ZWM_PWD_PATH, 'r') as f:
            for line in f:
                ori_data += line.replace('\n', '').replace('  ', '')
        data = json_2_dict(
            json_str=ori_data,
            logger=self.lg,
            default_res={},)
        self.zwm_username, self.zwm_pwd = data['username'], data['pwd']
        assert self.zwm_username != '' and self.zwm_pwd != ''

    async def _fck_run(self) -> None:
        while True:
            try:
                login_res = await self._login()
                assert login_res is True, '登录失败, 退出后续同步操作!'

                # 获取所有交易明细(自己有接口, 不需要了)
                # all_transaction_details = await self._get_all_transaction_details()
                # pprint(all_transaction_details)
                # self.lg.info('len_all_transaction_details: {}'.format(len(all_transaction_details)))
                # await self._wash_and_save_all_transaction_details(target_list=all_transaction_details)

                # 获取所有商户结算记录
                self.lg.info('获取所有商户结算记录...')
                all_business_settlement_records = await self._get_all_business_settlement_records_by_something()
                # pprint(all_business_settlement_records)
                self.lg.info('len_now_business_settlement_records: {}'.format(len(all_business_settlement_records)))
                await self._wash_save_all_business_settlement_records(target_list=all_business_settlement_records)
                self.lg.info('\n')

                # 获取所有商户及门店管理记录
                self.lg.info('获取所有商户及门店管理记录 ...')
                all_business_manage_records = await self._get_all_business_manage_records_by_something()
                # pprint(all_business_manage_records)
                self.lg.info('len_all_business_manage_records: {}'.format(len(all_business_manage_records)))
                await self._wash_save_all_business_manage_records(target_list=all_business_manage_records)
                self.lg.info('\n')

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('## 同步完成 ##')
            self.lg.info('休眠 {} minutes ...'.format(self.sleep_time))

            # 定时
            await async_sleep(60 * self.sleep_time)

    async def _login(self) -> bool:
        """
        登录
        :return:
        """
        headers = await self._get_random_pc_headers()
        headers.update({
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp',
        })
        file_load = {
            'loginName': self.zwm_username,
            'userPassword': self.zwm_pwd,
        }
        m = MultipartEncoder(fields=file_load)
        # self.lg.info(m)
        headers.update({
            'Content-Type': m.content_type
        })
        login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do'

        with session() as _session:
            try:
                response = _session.post(
                    url=login_url,
                    headers=headers,
                    data=m,
                    proxies=self._get_proxies(),)
                login_res = json_2_dict(
                    json_str=response.text,
                    default_res={},
                    logger=self.lg, ).get('message', '')
                assert login_res == '登录成功', '登录失败!'
                self.lg.info(login_res)
                self.login_cookies_dict = response.cookies.get_dict()
                assert self.login_cookies_dict != {}, 'self.login_cookies_dict != 空dict!'
                # pprint(self.login_cookies_dict)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                return False

        return True
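
    # Note (added): the login above posts the credentials as multipart/form-data via
    # MultipartEncoder, treats a returned 'message' of '登录成功' as success, and keeps
    # the session cookies in self.login_cookies_dict; every later *.do request reuses
    # those cookies instead of re-authenticating.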

    async def _wash_save_all_business_manage_records(self, target_list: list):
        """
        清洗并存储所有未存储的 or 更新所有已存储的business manage records
        :param target_list:
        :return:
        """
        all_res = []
        for item in target_list:
            try:
                now_time = get_shanghai_time()
                create_time, modify_time, approval_status_change_time = now_time, now_time, now_time
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                shop_type = item['merType']

                is_high_quality_shop = item['isHighQualityMer']
                if is_high_quality_shop == '否':
                    is_high_quality_shop = 0
                elif is_high_quality_shop == '是':
                    is_high_quality_shop = 1
                else:
                    raise ValueError('is_high_quality_shop value: {} 异常!'.format(is_high_quality_shop))

                shop_id = item.get('jhmid', '')
                assert shop_id != ''
                shop_chat_name = item.get('merchantName', '')
                assert shop_chat_name != ''
                phone_num = item.get('phone', '')
                assert phone_num != ''
                shop_chant_num = int(item['merchantNum'])
                sale = item['sale']
                is_real_time = 0 if item['isRealTime'] == '未开通' else 1
                approve_date = date_parse(item['approveDate'])
                rate = Decimal(item['rate']).__round__(4)
                account_type = item['accType']
                apply_time = date_parse(item['applyTime'])
                # 可为空值
                process_context = item.get('processContext', '')
                is_non_contact = 0 if item['isNonContact'] == '未开通' else 1

                approval_status = item['approvalStatus']
                if approval_status == '待审核':
                    approval_status = 1
                elif approval_status == '审核通过':
                    approval_status = 0
                elif approval_status == '退回':
                    approval_status = 2
                else:
                    raise ValueError('approval_status value: {} 异常'.format(approval_status))

                # 其原值恒定且唯一, 直接用作唯一标识
                unique_id = item['id']

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            zwm_item = ZWMBusinessManageRecordItem()
            zwm_item['unique_id'] = unique_id
            zwm_item['create_time'] = create_time
            zwm_item['modify_time'] = modify_time
            zwm_item['agent_name'] = agent_name
            zwm_item['top_agent_name'] = top_agent_name
            zwm_item['shop_type'] = shop_type
            zwm_item['is_high_quality_shop'] = is_high_quality_shop
            zwm_item['shop_id'] = shop_id
            zwm_item['shop_chat_name'] = shop_chat_name
            zwm_item['phone_num'] = phone_num
            zwm_item['shop_chant_num'] = shop_chant_num
            zwm_item['sale'] = sale
            zwm_item['is_real_time'] = is_real_time
            zwm_item['approve_date'] = approve_date
            zwm_item['rate'] = rate
            zwm_item['account_type'] = account_type
            zwm_item['apply_time'] = apply_time
            zwm_item['process_context'] = process_context
            zwm_item['is_non_contact'] = is_non_contact
            zwm_item['approval_status'] = approval_status
            zwm_item['approval_status_change_time'] = approval_status_change_time
            all_res.append(dict(zwm_item))

            # 查看
            # if shop_id == 'YRMPAY100038574':
            # if phone_num == '18192242001':
            # if shop_chat_name == '哇哇叫':
            #     pprint(dict(zwm_item))

        # pprint(all_res)
        await self._insert_or_update_shop_manage_records_table(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _insert_or_update_shop_manage_records_table(self, all_res: list):
        """
        插入or update原数据
        :param all_res:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_2,
                params=None,
                logger=self.lg, )
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            if unique_id not in db_unique_id_list:
                # 插入
                self.lg.info('inserting unique_id: {} ...'.format(unique_id))
                params = await self._get_insert_item_params2(item=item)
                try:
                    res = self.sql_cli._insert_into_table_2(
                        sql_str=zwm_insert_str_2,
                        params=params,
                        logger=self.lg)
                    if res:
                        new_add_count += 1
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
            else:
                db_old_approval_status, db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time(
                    db_data=db_data,
                    unique_id=unique_id,)
                item['approval_status_change_time'] = await self._get_new_approval_status_change_time(
                    db_old_approval_status=db_old_approval_status,
                    db_old_approval_status_change_time=db_old_approval_status_change_time,
                    new_approval_status=item['approval_status'],
                    new_approval_status_change_time=item['approval_status_change_time'])
                # 更新
                self.lg.info('updating unique_id: {} ...'.format(unique_id))
                params = await self._get_update_item_params(item=item)
                try:
                    res = self.sql_cli._update_table_2(
                        sql_str=zwm_update_str_1,
                        params=params,
                        logger=self.lg)
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue

        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            pass

        try:
            del db_data
            del db_unique_id_list
        except:
            pass

        self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))

    async def _get_new_approval_status_change_time(self,
                                                   db_old_approval_status,
                                                   db_old_approval_status_change_time,
                                                   new_approval_status,
                                                   new_approval_status_change_time):
        """
        获取新的approval_status_change_time
        :return:
        """
        if db_old_approval_status_change_time is not None:
            new_approval_status_change_time = db_old_approval_status_change_time \
                if db_old_approval_status == new_approval_status \
                else get_shanghai_time()
        else:
            pass

        return new_approval_status_change_time

    async def _get_dd_old_approval_status_and_approval_status_change_time(self, db_data: list, unique_id: str) -> tuple:
        """
        获取db 原先的approval_status
        :param db_data:
        :param unique_id:
        :return:
        """
        for item in db_data:
            if unique_id == item[0]:
                return item[1], item[2]
            else:
                continue

    async def _get_all_business_manage_records_by_something(self,):
        """
        获取所有商户及门店管理记录
        :return:
        """
        async def get_tasks_params_list(max_business_manage_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_manage_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_manage_page_num=self.max_business_manage_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_manage_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_manage_records_by_something(self,
                                                           page_num: int,
                                                           start_date: str = None,
                                                           end_date: str = None,):
        """
        获取单页商户及门店管理记录
        :param page_num:
        :param start_date:      默认设置前一个月27号, eg: '2019-01-27 00:00'
        :param end_date:        eg: '2019-07-20 09:39'
        :return:
        """
        # todo 获取最开始->至今的, 即采集所有, 避免老店铺的审核状态变动, 而后台无法同步状态, 审核时间
        # start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] + ' 00:00'
        start_date = '2018-01-01 00:00'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date)[0:16]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'merchantCode': '',
            'accType': '',
            'phone': '',
            'approveDate': '',
            'merchantName': '',
            'processStatus': '',
            'startTime': start_date,
            'endTime': end_date,
            'agentName': '',
            'page': str(page_num),
            'start': str((page_num - 1) * 100),     # 开始位置0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('materialList', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _wash_save_all_business_settlement_records(self, target_list):
        """
        清洗并存储 未被存储的所有商户结算记录
        :param target_list:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_1,
                params=None,
                logger=self.lg,)
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        all_res = []
        for item in target_list:
            # pprint(item)
            try:
                create_time = get_shanghai_time()
                shop_name = item.get('merName', '')
                assert shop_name != ''
                shop_id = item.get('mid', '')
                assert shop_id != ''
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                date_settle_type = item['settleType']
                trans_amount = item.get('transAmt', '')
                assert trans_amount != ''
                trans_amount = Decimal(trans_amount).__round__(2)
                service_charge = Decimal(item['mda']).__round__(2)
                accounting_amount = Decimal(item['mnamt']).__round__(2)
                # 正常情况为: '20190704', 异常为'20190824-20190824'
                txn_day = item['txnDay']
                if re.compile('-').findall(txn_day) != []:
                    txn_day = txn_day.split('-')[0]
                else:
                    pass
                trans_date = date_parse(txn_day)
                trans_status = item['status']
                if trans_status == '已结算':
                    trans_status = 0
                else:
                    raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status))
                settle_type = item['type']
                settle_date = date_parse(item['minDay'])
                # 生成唯一标识码
                unique_id = get_uuid3(
                    target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                               str(service_charge) + str(trans_date) + \
                               str(settle_type) + str(settle_date),)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if unique_id in db_unique_id_list:
                # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id))
                continue

            settle_record_item = ZWMBusinessSettlementRecordItem()
            settle_record_item['unique_id'] = unique_id
            settle_record_item['create_time'] = create_time
            settle_record_item['shop_name'] = shop_name
            settle_record_item['shop_id'] = shop_id
            settle_record_item['agent_name'] = agent_name
            settle_record_item['top_agent_name'] = top_agent_name
            settle_record_item['date_settle_type'] = date_settle_type
            settle_record_item['trans_amount'] = trans_amount
            settle_record_item['service_charge'] = service_charge
            settle_record_item['accounting_amount'] = accounting_amount
            settle_record_item['trans_date'] = trans_date
            settle_record_item['trans_status'] = trans_status
            settle_record_item['settle_type'] = settle_type
            settle_record_item['settle_date'] = settle_date
            all_res.append(dict(settle_record_item))

        # pprint(all_res)
        self.lg.info('未存储个数: {}'.format(len(all_res)))
        await self._save_all_business_settlement_records(all_res=all_res)

        try:
            del all_res
        except:
            pass

        return None

    async def _save_all_business_settlement_records(self, all_res) -> None:
        """
        存储新增的商家提现记录
        :param all_res:
        :return:
        """
        new_add_count = 0
        for item in all_res:
            # 处理未存储的新数据
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1,
                    params=params,
                    logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            pass

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None

    async def _get_insert_item_params(self, item) -> tuple:
        """
        待插入对象
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['shop_name'],
            item['shop_id'],
            item['agent_name'],
            item['top_agent_name'],
            item['date_settle_type'],
            item['trans_amount'],
            item['service_charge'],
            item['accounting_amount'],
            item['trans_date'],
            item['trans_status'],
            item['settle_type'],
            item['settle_date'],
        ])

    async def _get_insert_item_params2(self, item) -> tuple:
        """
        待插入对象, zwm_buss_manage_records table
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
        ])

    async def _get_update_item_params(self, item: dict) -> tuple:
        """
        更新对象, zwm_buss_manage_records table
        :param item:
        :return:
        """
        return tuple([
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],

            item['unique_id'],
        ])
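
    # Note (added): the update tuple above deliberately ends with unique_id, which
    # presumably fills the WHERE clause of zwm_update_str_1, while the leading fields
    # fill the SET clause in the same order as the insert tuple (minus unique_id and
    # create_time, which never change after insertion).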

    async def _wash_and_save_all_transaction_details(self, target_list: list):
        """
        清洗并存储所有交易明细
        :param target_list:
        :return:
        """
        pass

    async def _get_all_business_settlement_records_by_something(self):
        """
        获取所有商户结算记录
        :return:
        """
        async def get_tasks_params_list(max_business_settlement_records_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_settlement_records_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_settlement_records_page_num=self.max_business_settlement_records_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_settlement_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_settlement_records_by_something(self,
                                                               page_num: int,
                                                               start_date: str = None,
                                                               end_date: str = None,
                                                               mid: str = '',
                                                               agent_name: str = '') -> list:
        """
        得到单页商户结算记录
        :param page_num:
        :param start_date:              默认设置前一个月27号, eg: '2019-07-01'
        :param end_date:                eg: '2019-07-16'
        :param mid:                     商户编号
        :param agent_name:              顶级机构名称
        :return:
        """
        start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0]
        # start_date = '2018-01-01'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date).split(' ')[0]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'mid': mid,
            'agentName': agent_name,
            'loginAgentId': self.zwm_username[0:8],                   # 前8位
            'page': str(page_num),
            'start': str((page_num - 1) * 100),                       # 开始位置, 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do'
        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        # self.lg.info(body)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _get_all_transaction_details(self) -> list:
        """
        获取所有交易流水
        :return:
        """
        async def _get_tasks_params_list() -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, self.max_transaction_details_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        tasks_params_list = await _get_tasks_params_list()
        tasks_params_list_obj = TasksParamsListObj(
            tasks_params_list=tasks_params_list,
            step=self.concurrency,)

        all_res = []
        while True:
            try:
                slice_params_list = tasks_params_list_obj.__next__()
            except AssertionError:
                break

            tasks = []
            for k in slice_params_list:
                page_num = k['page_num']
                self.lg.info('create task[where page_num: {}]...'.format(page_num))
                func_args = [
                    page_num,
                ]
                tasks.append(self.loop.create_task(
                    unblock_func(
                        func_name=self._get_one_page_transaction_details_by_something,
                        func_args=func_args,
                        logger=self.lg,)))

            one_res = await async_wait_tasks_finished(tasks=tasks)
            try:
                del tasks
            except:
                pass
            for i in one_res:
                for j in i:
                    all_res.append(j)

        return all_res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_transaction_details_by_something(self,
                                                       page_num: int,
                                                       start_date: str = None,
                                                       end_date: str = None,
                                                       transaction_status: str = '',
                                                       mer_name: str = '',
                                                       order_no: str = '',
                                                       mid: str = '',
                                                       agent_name: str = '',
                                                       pay_channel: str = '',
                                                       sale_name: str = '') -> list:
        """
        获取单页交易流水
        :param page_num:                开始页面, eg: 1, 2, 3
        :param start_date:              eg: '2019-07-16 00:00'
        :param end_date:                eg: '2019-07-16 10:02'
        :param transaction_status:      交易状态 | 选择全部: '' or 交易成功: '1' or 退款成功: '3'
        :param mer_name:                待查询的商户名称
        :param order_no:                订单号
        :param mid:                     商户编号
        :param agent_name:              顶级机构名称
        :param pay_channel:             支付渠道 | 请选择: '' or 微信: '50' or 支付宝: '51' or 微信条码: '55' or 支付宝条码: '56' or 微信小程序: '67'
        :param sale_name:               销售名称
        :return:
        """
        res = []
        start_date = self.get_0_00_on_the_day() if start_date is None else start_date
        end_date = str(get_shanghai_time()) if end_date is None else end_date

        headers = self.get_random_pc_headers()
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'type': '2',
            'status': transaction_status,
            'payChannel': pay_channel,
            'orderNo': order_no,
            'merName': mer_name,
            'mid': mid,
            'agentName': agent_name,
            'saleName': sale_name,
            'page': str(page_num),
            'start': str((page_num - 1) * 20),          # 开始位置, 0, 20, 40
            'limit': '20',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'
        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])

        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    def get_0_00_on_the_day(self) -> str:
        """
        获取当天的0点
        :return:
        """
        now_time = get_shanghai_time()

        return str(datetime(
                year=now_time.year,
                month=now_time.month,
                day=now_time.day))

    def get_1_on_the_month(self) -> str:
        """
        获取取数起始日(上个月的5号)
        :return:
        """
        now_time = get_shanghai_time()
        # 避免月底流水无法获取, 不取1号而取5号
        day = 5

        now_year = now_time.year
        now_month = now_time.month
        if now_month > 1:
            now_month -= 1
        else:
            # now_month为1月份, 上个月应为去年12月
            now_month = 12
            now_year -= 1

        return str(datetime(
            year=now_year,
            month=now_month,
            day=day,))
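
    # Illustration (added, dates hypothetical): called on 2019-07-16 the method above
    # returns '2019-06-05 00:00:00'; called on 2019-01-16 it returns
    # '2018-12-05 00:00:00', i.e. the settlement query window always opens on the 5th
    # of the previous month.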

    def _get_proxies(self) -> dict:
        """
        获取代理
        :return:
        """
        proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type, )
        assert proxies != {}, 'proxies不为空dict!'

        return proxies

    async def _get_random_pc_headers(self) -> dict:
        """
        :return:
        """
        return self.get_random_pc_headers()

    @staticmethod
    def get_random_pc_headers() -> dict:
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',)
        headers.update({
            'Origin': 'https://agent.yrmpay.com',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarytSJCAoaErjNY4IbM',
            'accept': 'text/plain, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
        })
        return headers

    def __del__(self):
        try:
            del self.lg
            del self.login_cookies_dict
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
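
# --- launch sketch (added for illustration, not part of the original example) ---
# How ZWMSpider is actually started is not shown above; assuming it follows the usual
# AsyncCrawler pattern (its own .loop plus the _fck_run coroutine), a minimal entry
# point could be:
if __name__ == '__main__':
    zwm_spider = ZWMSpider()
    zwm_spider.loop.run_until_complete(zwm_spider._fck_run())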