Ejemplo n.º 1
0
class MyAllCommentSpider(object):
    def __init__(self):
        self._set_logger()
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.sql_str = r'insert into dbo.all_goods_comment(goods_id, create_time, modify_time, comment_info) values(%s, %s, %s, %s)'

        if self._init_debugging_api().get(2):
            self.my_lg.info('初始化 1688 phantomjs中...')
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(3) is True \
                or self._init_debugging_api().get(4) is True\
                or self._init_debugging_api().get(6) is True:
            self.my_lg.info('初始化 天猫 phantomjs中...')
            self.tmall = TmallCommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(7) is True \
                or self._init_debugging_api().get(8) is True\
                or self._init_debugging_api().get(9) is True\
                or self._init_debugging_api().get(10) is True:
            self.my_lg.info('初始化 京东 phantomjs中...')
            self.jd = JdCommentParse(logger=self.my_lg)

        self.my_lg.info('初始化完毕!!!')

    def _set_logger(self):
        self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                '/all_comment/_/' +
                                str(get_shanghai_time())[0:10] + '.txt',
                                console_log_level=INFO,
                                file_log_level=ERROR)

    def _init_debugging_api(self):
        '''
        用于设置待抓取的商品的site_id
        :return: dict
        '''
        return {
            1: True,
            2: True,
            3: True,
            4: True,
            6: True,
            7: True,
            8: True,
            9: True,
            10: True,
            11: True,
            12: False,
            13: False,
            25: False,
        }

    def _set_func_name_dict(self):
        self.func_name_dict = {
            'taobao': 'self._taobao_comment({0}, {1}, {2})',
            'ali': 'self._ali_1688_comment({0}, {1}, {2})',
            'tmall': 'self._tmall_comment({0}, {1}, {2})',
            'jd': 'self._jd_comment({0}, {1}, {2})',
            'zhe_800': 'self._zhe_800_comment({0}, {1}, {2})',
            'juanpi': 'self._juanpi_comment({0}, {1}, {2})',
            'pinduoduo': 'self._pinduoduo_comment({0}, {1}, {2})',
            'vip': 'self._vip_comment({0}, {1}, {2})',
        }

    def _just_run(self):
        while True:
            #### 实时更新数据
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            sql_str = '''
            select GoodsID, SiteID 
            from dbo.GoodsInfoAutoGet 
            where MainGoodsID is not null and IsDelete=0 and GoodsID not in (select goods_id from dbo.all_goods_comment)
            ORDER BY ID DESC'''
            try:
                result = list(tmp_sql_server._select_table(sql_str=sql_str))
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
            if result is None:
                pass
            else:
                self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.my_lg.info(str(result))
                self.my_lg.info(
                    '--------------------------------------------------------')

                self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                self._comment_pipeline = CommentInfoSaveItemPipeline(
                    logger=self.my_lg)
                if self._comment_pipeline.is_connect_success:
                    sql_str = r'select goods_id from dbo.all_goods_comment'
                    _db_goods_id = self._comment_pipeline._select_table(
                        sql_str=sql_str)
                    try:
                        _db_goods_id = [item[0] for item in _db_goods_id]
                    except IndexError:
                        continue
                    self.my_lg.info(str(_db_goods_id))

                else:
                    continue

                # 1.淘宝 2.阿里 3.天猫 4.天猫超市 5.聚划算 6.天猫国际 7.京东 8.京东超市 9.京东全球购 10.京东大药房  11.折800 12.卷皮 13.拼多多 14.折800秒杀 15.卷皮秒杀 16.拼多多秒杀 25.唯品会
                for index, item in enumerate(
                        result):  # item: ('xxxx':goods_id, 'y':site_id)
                    if not self.debugging_api.get(item[1]):
                        self.my_lg.info('api为False, 跳过! 索引值[%s]' % str(index))
                        continue

                    try:
                        if item[0] in _db_goods_id:
                            self.my_lg.info('该goods_id[%s]已存在于db中, 此处跳过!' %
                                            item[0])
                            continue
                    except IndexError:
                        print('IndexError')

                    if index % 20 == 0:
                        self.my_lg.info('_comment_pipeline客户端重连中...')
                        try:
                            del self._comment_pipeline
                        except:
                            pass
                        self._comment_pipeline = CommentInfoSaveItemPipeline(
                            logger=self.my_lg)
                        self.my_lg.info('_comment_pipeline客户端重连完毕!')

                    switch = {
                        1: self.func_name_dict.get('taobao'),  # 淘宝
                        2: self.func_name_dict.get('ali'),  # 阿里1688
                        3: self.func_name_dict.get('tmall'),  # 天猫
                        4: self.func_name_dict.get('tmall'),  # 天猫超市
                        6: self.func_name_dict.get('tmall'),  # 天猫国际
                        7: self.func_name_dict.get('jd'),  # 京东
                        8: self.func_name_dict.get('jd'),  # 京东超市
                        9: self.func_name_dict.get('jd'),  # 京东全球购
                        10: self.func_name_dict.get('jd'),  # 京东大药房
                        11: self.func_name_dict.get('zhe_800'),  # 折800
                        12: self.func_name_dict.get('juanpi'),  # 卷皮
                        13: self.func_name_dict.get('pinduoduo'),  # 拼多多
                        25: self.func_name_dict.get('vip'),  # 唯品会
                    }

                    # 动态执行
                    _code = switch[item[1]].format(index, item[0], item[1])
                    if item[1] != 11:
                        exec_code = compile(_code, '', 'exec')
                        exec(exec_code)
                    else:  # 特殊单独执行
                        self._zhe_800_comment(index=index,
                                              goods_id=item[0],
                                              site_id=item[1])

                    sleep(1.2)

    def _taobao_comment(self, index, goods_id, site_id):
        '''
        处理淘宝的商品comment
        :param index: 索引
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 淘宝\t\t索引值(%s)' % str(index))

            taobao = TaoBaoCommentParse(logger=self.my_lg)
            _r = taobao._get_comment_data(goods_id=str(goods_id))

            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._insert_into_table(
                        sql_str=self.sql_str,
                        params=self._get_db_insert_params(item=_r))
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

            try:
                del taobao
            except:
                self.my_lg.info('del taobao失败!')
            gc.collect()
        else:
            pass

    def _ali_1688_comment(self, index, goods_id, site_id):
        '''
        处理阿里1688的商品comment
        :param index: 索引
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 阿里1688\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try:
                    del self.ali_1688
                except:
                    self.my_lg.info('del ali_1688失败!')
                gc.collect()
                self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

            _r = self.ali_1688._get_comment_data(goods_id=goods_id)
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._insert_into_table(
                        sql_str=self.sql_str,
                        params=self._get_db_insert_params(item=_r))
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')
        else:
            pass

    def _tmall_comment(self, index, goods_id, site_id):
        '''
        处理tmall商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 天猫\t\t索引值(%s)' % str(index))

            if site_id == 3:
                _type = 0
            elif site_id == 4:
                _type = 1
            elif site_id == 6:
                _type = 2
            else:
                return None

            if index % 5 == 0:
                try:
                    del self.tmall
                except:
                    self.my_lg.info('del tmall失败!')
                gc.collect()
                self.tmall = TmallCommentParse(logger=self.my_lg)

            _r = self.tmall._get_comment_data(type=_type,
                                              goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._insert_into_table(
                        sql_str=self.sql_str,
                        params=self._get_db_insert_params(item=_r))
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

        else:
            pass

    def _jd_comment(self, index, goods_id, site_id):
        '''
        处理京东商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 京东\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try:
                    del self.jd
                except:
                    self.my_lg.info('del jd失败!')
                gc.collect()
                self.jd = JdCommentParse(logger=self.my_lg)

            _r = self.jd._get_comment_data(goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                # self.my_lg.info('获取评论success!')
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._insert_into_table(
                        sql_str=self.sql_str,
                        params=self._get_db_insert_params(item=_r))
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

        else:
            pass

    def _zhe_800_comment(self, index, goods_id, site_id):
        '''
        处理折800商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 折800\t\t索引值(%s)' % str(index))

            zhe_800 = Zhe800CommentParse(logger=self.my_lg)
            _r = zhe_800._get_comment_data(goods_id=str(goods_id))
            # pprint(_r)

            if _r.get('_comment_list', []) != []:
                # self.my_lg.info('获取评论success!')
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._insert_into_table(
                        sql_str=self.sql_str,
                        params=self._get_db_insert_params(item=_r))
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

            try:
                del zhe_800
            except:
                self.my_lg.info('del zhe_800失败!')
            gc.collect()
        else:
            pass

    def _juanpi_comment(self, index, goods_id, site_id):
        '''
        处理卷皮商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _pinduoduo_comment(self, index, goods_id, site_id):
        '''
        处理拼多多的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _vip_comment(self, index, goods_id, site_id):
        '''
        处理唯品会的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _get_db_insert_params(self, item):
        '''
        得到待插入的数据
        :param item:
        :return:
        '''
        return (
            item['goods_id'],
            item['create_time'],
            item['modify_time'],
            dumps(item['_comment_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
        )

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.debugging_api
        except:
            pass
        try:
            del self._comment_pipeline
        except:
            pass
        try:
            del self.ali_1688
        except:
            pass
        try:
            del self.tmall
        except:
            pass
        try:
            del self.jd
        except:
            pass
        gc.collect()
class CommentRealTimeUpdateSpider(object):
    def __init__(self):
        self._set_logger()
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.sql_str = cm_update_str_1

        if self._init_debugging_api().get(2):
            self.my_lg.info('初始化 1688 phantomjs中...')
            self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(3) is True \
                or self._init_debugging_api().get(4) is True\
                or self._init_debugging_api().get(6) is True:
            self.my_lg.info('初始化 天猫 phantomjs中...')
            self.tmall = TmallCommentParse(logger=self.my_lg)

        if self._init_debugging_api().get(7) is True \
                or self._init_debugging_api().get(8) is True\
                or self._init_debugging_api().get(9) is True\
                or self._init_debugging_api().get(10) is True:
            self.my_lg.info('初始化 京东 phantomjs中...')
            self.jd = JdCommentParse(logger=self.my_lg)

    def _set_logger(self):
        self.my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/all_comment/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

    def _init_debugging_api(self):
        '''
        用于设置待抓取的商品的site_id
        :return: dict
        '''
        return {
            1: True,
            2: True,
            3: True,
            4: True,
            6: True,
            7: True,
            8: True,
            9: True,
            10: True,
            11: False,
            12: False,
            13: False,
            25: False,
        }

    def _set_func_name_dict(self):
        self.func_name_dict = {
            'taobao': 'self._update_taobao_comment({0}, {1}, {2})',
            'ali': 'self._update_ali_1688_comment({0}, {1}, {2})',
            'tmall': 'self._update_tmall_comment({0}, {1}, {2})',
            'jd': 'self._update_jd_comment({0}, {1}, {2})',
            'zhe_800': 'self._update_zhe_800_comment({0}, {1}, {2})',
            'juanpi': 'self._update_juanpi_comment({0}, {1}, {2})',
            'pinduoduo': 'self._update_pinduoduo_comment({0}, {1}, {2})',
            'vip': 'self._update_vip_comment({0}, {1}, {2})',
        }

    def _just_run(self):
        while True:
            #### 更新数据
            self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            #  and GETDATE()-a.modify_time>1
            try:
                result = list(self._comment_pipeline._select_table(sql_str=cm_select_str_1, logger=self.my_lg))
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                continue

            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info('--------------------------------------------------------')
            self.my_lg.info('待更新个数: {0}'.format(len(result)))

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            # 1.淘宝 2.阿里 3.天猫 4.天猫超市 5.聚划算 6.天猫国际 7.京东 8.京东超市 9.京东全球购 10.京东大药房  11.折800 12.卷皮 13.拼多多 14.折800秒杀 15.卷皮秒杀 16.拼多多秒杀 25.唯品会
            for index, item in enumerate(result):     # item: ('xxxx':goods_id, 'y':site_id)
                if not self.debugging_api.get(item[1]):
                    self.my_lg.info('api为False, 跳过! 索引值[%s]' % str(index))
                    continue

                if index % 20 == 0:
                    try: del self._comment_pipeline
                    except: pass
                    self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()

                switch = {
                    1: self.func_name_dict.get('taobao'),       # 淘宝
                    2: self.func_name_dict.get('ali'),          # 阿里1688
                    3: self.func_name_dict.get('tmall'),        # 天猫
                    4: self.func_name_dict.get('tmall'),        # 天猫超市
                    6: self.func_name_dict.get('tmall'),        # 天猫国际
                    7: self.func_name_dict.get('jd'),           # 京东
                    8: self.func_name_dict.get('jd'),           # 京东超市
                    9: self.func_name_dict.get('jd'),           # 京东全球购
                    10: self.func_name_dict.get('jd'),          # 京东大药房
                    11: self.func_name_dict.get('zhe_800'),     # 折800
                    12: self.func_name_dict.get('juanpi'),      # 卷皮
                    13: self.func_name_dict.get('pinduoduo'),   # 拼多多
                    25: self.func_name_dict.get('vip'),         # 唯品会
                }

                # 动态执行
                exec_code = compile(switch[item[1]].format(index, item[0], item[1]), '', 'exec')
                exec(exec_code)
                sleep(1.1)

    def _update_taobao_comment(self, index, goods_id, site_id):
        '''
        处理淘宝的商品comment
        :param index: 索引
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 淘宝\t\t索引值(%s)' % str(index))

            taobao = TaoBaoCommentParse(logger=self.my_lg)
            _r = taobao._get_comment_data(goods_id=str(goods_id))

            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

            try:
                del taobao
            except:
                self.my_lg.info('del taobao失败!')
            gc.collect()
        else:
            pass

    def _update_ali_1688_comment(self, index, goods_id, site_id):
        '''
        处理阿里1688的商品comment
        :param index: 索引
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 阿里1688\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try: del self.ali_1688
                except: self.my_lg.error('del ali_1688失败!')
                gc.collect()
                self.ali_1688 = ALi1688CommentParse(logger=self.my_lg)

            _r = self.ali_1688._get_comment_data(goods_id=goods_id)
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)

            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')

        else:
            pass

    def _update_tmall_comment(self, index, goods_id, site_id):
        '''
        处理tmall商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 天猫\t\t索引值(%s)' % str(index))

            if site_id == 3:
                _type = 0
            elif site_id == 4:
                _type = 1
            elif site_id == 6:
                _type = 2
            else:
                return None

            if index % 5 == 0:
                try:
                    del self.tmall
                except:
                    self.my_lg.info('del tmall失败!')
                gc.collect()
                self.tmall = TmallCommentParse(logger=self.my_lg)

            _r = self.tmall._get_comment_data(type=_type, goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')
            gc.collect()
        else:
            pass

    def _update_jd_comment(self, index, goods_id, site_id):
        '''
        处理京东商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            self.my_lg.info('------>>>| 京东\t\t索引值(%s)' % str(index))

            if index % 5 == 0:
                try: del self.jd
                except: self.my_lg.info('del jd失败!')
                gc.collect()
                self.jd = JdCommentParse(logger=self.my_lg)

            _r = self.jd._get_comment_data(goods_id=str(goods_id))
            if _r.get('_comment_list', []) != []:
                if self._comment_pipeline.is_connect_success:
                    self._comment_pipeline._update_table_2(
                        sql_str=self.sql_str,
                        params=self._get_db_update_params(item=_r),
                        logger=self.my_lg)
            else:
                self.my_lg.info('该商品_comment_list为空list! 此处跳过!')
        else:
            pass

    def _update_zhe_800_comment(self, index, goods_id, site_id):
        '''
        处理折800商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_juanpi_comment(self, index, goods_id, site_id):
        '''
        处理卷皮商品的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_pinduoduo_comment(self, index, goods_id, site_id):
        '''
        处理拼多多的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _update_vip_comment(self, index, goods_id, site_id):
        '''
        处理唯品会的comment
        :param index:
        :param goods_id:
        :param site_id:
        :return:
        '''
        if self.debugging_api.get(site_id):
            pass
        else:
            pass

    def _get_db_update_params(self, item):
        return (
            item['modify_time'],
            dumps(item['_comment_list'], ensure_ascii=False),

            item['goods_id'],
        )

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.debugging_api
        except:
            pass
        try: del self._comment_pipeline
        except: pass
        try:
            del self.tmall
        except:
            pass
        gc.collect()