Python MysqlDB Examples

Programming Language: Python

Namespace/Package Name: spider.db.mysqldb

Class/Type: MysqlDB

Examples at hotexamples.com: 11

Python MysqlDB - 11 examples found. These are the top rated real world Python examples of spider.db.mysqldb.MysqlDB extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

MysqlDB(6)

execute(2)

find(2)

update(2)

add(1)

add_batch(1)

Example #1

Show file

File: base_parser.py Project: zhuyoucai168/boris-spider

    def __init__(self,
                 task_table,
                 batch_record_table,
                 task_state,
                 date_format,
                 task_condition=""):
        self._mysqldb = MysqlDB()  # mysqldb

        self._task_table = task_table  # mysql中的任务表
        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
        self._task_state = task_state  # mysql中任务表的state字段名
        self._date_format = date_format  # 批次日期格式

        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)

Example #2

Show file

File: base_parser.py Project: zhuyoucai168/boris-spider

    def batch_date(self):
        """
        @summary: 获取批次时间
        ---------
        ---------
        @result:
        """

        batch_date = os.environ.get("batch_date")
        if not batch_date:
            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
                date_format=self._date_format.replace(":%M", ":%i"),
                batch_record_table=self._batch_record_table,
            )
            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
            if batch_info:
                os.environ["batch_date"] = batch_date = batch_info[0][0]
            else:
                log.error("需先运行 start_monitor_task()")
                os._exit(137)  # 使退出码为35072 方便爬虫管理器重启

        return batch_date

Example #3

Show file

File: create_builder.py Project: xingx001/boris-spider

 def __init__(self):
     self._db = MysqlDB()
     self._create_init = CreateInit()

Example #4

Show file

File: create_builder.py Project: xingx001/boris-spider

class CreateItem:
    def __init__(self):
        self._db = MysqlDB()
        self._create_init = CreateInit()

    def select_columns(self, table_name):
        # sql = 'SHOW COLUMNS FROM ' + table_name
        sql = "SELECT COLUMN_NAME, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, COLUMN_KEY, COLUMN_COMMENT FROM INFORMATION_SCHEMA.Columns WHERE table_name = '{}'".format(
            table_name
        )
        columns = self._db.find(sql)

        return columns

    def select_tables_name(self, tables_name):
        """
        @summary:
        ---------
        @param tables_name: 一类tables 如 qidian*
        ---------
        @result:
        """
        sql = (
            "select table_name from information_schema.tables where table_name like '%s'"
            % tables_name
        )
        tables_name = self._db.find(sql)

        return tables_name

    def convert_table_name_to_hump(self, table_name):
        """
        @summary: 格式化表明为驼峰格式
        ---------
        @param table:
        ---------
        @result:
        """
        table_hump_format = ""

        words = table_name.split("_")
        for word in words:
            table_hump_format += word.capitalize()  # 首字母大写

        return table_hump_format

    def get_item_template(self):
        templete_path = os.path.abspath(
            os.path.join(__file__, "../../templates/item_template.tmpl")
        )
        with open(templete_path, "r") as file:
            item_template = file.read()

        return item_template

    def create_item(self, item_template, columns, table_name_hump_format, support_dict):
        # 组装 类名
        item_template = item_template.replace("${table_name}", table_name_hump_format)

        # 组装 属性
        propertys = ""
        for column in columns:
            column_name = column[0]
            column_type = column[1]
            is_nullable = column[2]
            column_default = column[3]
            column_extra = column[4]
            column_key = column[5]
            column_comment = column[6]

            try:
                value = (
                    "kwargs.get('{column_name}')".format(column_name=column_name)
                    if support_dict
                    else (
                        column_default != "CURRENT_TIMESTAMP" and column_default or None
                    )
                    and eval(column_default)
                )
            except:
                value = (
                    "kwargs.get('{column_name}')".format(column_name=column_name)
                    if support_dict
                    else (
                        column_default != "CURRENT_TIMESTAMP" and column_default or None
                    )
                    and column_default
                )

            if column_extra == "auto_increment" or column_default is not None:
                propertys += (
                    "# self.{column_name} = {value}  # type : {column_type} | allow_null : {is_nullable} | key : {column_key} | default_value : {column_default} | extra : {column_extra} | column_comment : {column_comment}".format(
                        column_name=column_name,
                        value=value,
                        column_type=column_type,
                        is_nullable=is_nullable,
                        column_key=column_key,
                        column_default=column_default,
                        column_extra=column_extra,
                        column_comment=column_comment,
                    )
                    + "\n"
                    + " " * 8
                )

            else:
                if value is None or isinstance(value, (float, int)) or support_dict:
                    propertys += (
                        "self.{column_name} = {value}  # type : {column_type} | allow_null : {is_nullable} | key : {column_key} | default_value : {column_default} | extra : {column_extra}| column_comment : {column_comment}".format(
                            column_name=column_name,
                            value=value,
                            column_type=column_type,
                            is_nullable=is_nullable,
                            column_key=column_key,
                            column_default=column_default,
                            column_extra=column_extra,
                            column_comment=column_comment,
                        )
                        + "\n"
                        + " " * 8
                    )
                else:
                    propertys += (
                        "self.{column_name} = '{value}'  # type : {column_type} | allow_null : {is_nullable} | key : {column_key} | default_value : {column_default} | extra : {column_extra}| column_comment : {column_comment}".format(
                            column_name=column_name,
                            value=value,
                            column_type=column_type,
                            is_nullable=is_nullable,
                            column_key=column_key,
                            column_default=column_default,
                            column_extra=column_extra,
                            column_comment=column_comment,
                        )
                        + "\n"
                        + " " * 8
                    )

        item_template = item_template.replace("${propertys}", propertys.strip())
        item_template = deal_file_info(item_template)

        return item_template

    def save_template_to_file(self, item_template, table_name, table_name_hump_format):
        item_file = table_name + "_item.py"
        if os.path.exists(item_file):
            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % item_file)
            if confirm != "y":
                print("取消覆盖  退出")
                return

        with open(item_file, "w", encoding="utf-8") as file:
            file.write(item_template)
            print("\n%s 生成成功" % table_name_hump_format)

        self._create_init.create()

    def create(self, tables_name, support_dict):
        input_tables_name = tables_name

        tables_name = self.select_tables_name(tables_name)
        if not tables_name:
            print(tables_name)
            tip = "mysql数据库中无 %s 表 " % input_tables_name
            raise KeyError(tip)

        for table_name in tables_name:
            table_name = table_name[0]
            table_name_hump_format = self.convert_table_name_to_hump(table_name)

            columns = self.select_columns(table_name)
            item_template = self.get_item_template()
            item_template = self.create_item(
                item_template, columns, table_name_hump_format, support_dict
            )
            self.save_template_to_file(
                item_template, table_name, table_name_hump_format
            )

Example #5

Show file

File: create_builder.py Project: xingx001/boris-spider

 def __init__(self):
     self._db = MysqlDB()

Example #6

Show file

File: create_builder.py Project: xingx001/boris-spider

class CreateTable:
    def __init__(self):
        self._db = MysqlDB()

    def is_vaild_date(self, date):
        try:
            if ":" in date:
                time.strptime(date, "%Y-%m-%d %H:%M:%S")
            else:
                time.strptime(date, "%Y-%m-%d")
            return True
        except:
            return False

    def get_key_type(self, value):
        try:
            value = eval(value)
        except:
            value = value

        key_type = "varchar(255)"
        if isinstance(value, int):
            key_type = "int"
        elif isinstance(value, float):
            key_type = "double"
        elif isinstance(value, str):
            if self.is_vaild_date(value):
                if ":" in value:
                    key_type = "datetime"
                else:
                    key_type = "date"
            elif len(value) > 255:
                key_type = "text"
            else:
                key_type = "varchar(255)"

        return key_type

    def get_data(self):
        """
        @summary: 从控制台读取多行
        ---------
        ---------
        @result:
        """
        data = ""
        while True:
            line = sys.stdin.readline().strip()
            data += line
            if line == "}":
                break

        return tools.get_json(data)

    def create(self, table_name):
        # 输入表字段
        print('请输入表数据 json格式 如 {"name":"张三"}\n等待输入：\n')
        data = self.get_data()

        if not isinstance(data, dict):
            raise Exception("表数据格式不正确")

        # 拼接表结构
        sql = """
            CREATE TABLE `{db}`.`{table_name}` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id 自动递增',
                {other_key}
                `gtime` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间',
                PRIMARY KEY (`id`),
                {unique}
            ) COMMENT='';
        """

        print("请设置注释 回车跳过")
        other_key = ""
        for key, value in data.items():
            key = key2underline(key)
            key_type = self.get_key_type(value)

            comment = input("%s : %s  -> comment：" % (key, key_type))

            other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
                key=key, key_type=key_type, comment=comment
            )

        print("\n")

        while True:
            is_need_batch_date = input("是否添加batch_date 字段 （y/n）:")
            if is_need_batch_date == "y":
                other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
                    key="batch_date", key_type="date", comment="批次时间"
                )
                break
            elif is_need_batch_date == "n":
                break

        print("\n")

        while True:
            unique = input("请设置唯一索引, 多个逗号间隔\n等待输入：\n").replace("，", ",")
            if unique:
                break
        unique = "UNIQUE `idx` USING BTREE (`%s`) comment ''" % "`,`".join(
            unique.split(",")
        )

        sql = sql.format(
            db=setting.MYSQL_DB,
            table_name=table_name,
            other_key=other_key,
            unique=unique,
        )
        print(sql)
        self._db.execute(sql)
        print("\n%s 创建成功" % table_name)

Example #7

Show file

    def __init__(self,
                 task_table,
                 batch_record_table,
                 batch_name,
                 batch_interval,
                 task_keys,
                 task_state="state",
                 min_task_count=10000,
                 check_task_interval=5,
                 task_limit=10000,
                 related_table_folder=None,
                 related_batch_record=None,
                 task_condition="",
                 task_order_by="",
                 table_folder=None,
                 parser_count=None,
                 begin_callback=None,
                 end_callback=None,
                 delete_tabs=(),
                 process_num=None,
                 auto_stop_when_spider_done=None,
                 send_run_time=False,
                 *parser_args,
                 **parser_kwargs):
        """
        @summary: 批次爬虫
        必要条件
        1、需有任务表
            任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段，则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充

            参考建表语句如下：
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数',
              `state` int(11) DEFAULT NULL COMMENT '任务状态',
              `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

        2、需有批次记录表 不存在自动创建

            此表节结构固定，参考建表语句如下：
            CREATE TABLE `xxx_batch_record` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `batch_date` date DEFAULT NULL,
              `done_count` int(11) DEFAULT NULL,
              `total_count` int(11) DEFAULT NULL,
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        ---------
        @param task_table: mysql中的任务表
        @param batch_record_table: mysql 中的批次记录表
        @param batch_name: 批次采集程序名称
        @param batch_interval: 批次间隔 天为单位。 如想一小时一批次，可写成1/24
        @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser，则需将parser_name字段取出来。
        @param task_state: mysql中任务表的state字段名
        @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务
        @param check_task_interval: 检查是否还有任务的时间间隔；
        @param task_limit: 数据库中取任务的数量
        @param table_folder: 爬虫request及item存放reis中的文件夹
        @param parser_count: 线程数，默认为配置文件中的线程数
        @param begin_callback: 爬虫开始回调函数
        @param end_callback: 爬虫结束回调函数
        @param delete_tabs: 爬虫启动时删除的表，元组类型。 支持正则
        @param process_num: 进程数
        @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务，默认自动结束
        @param send_run_time: 发送运行时间
        @param related_table_folder: 有关联的其他爬虫任务表（redis）
        @param related_batch_record: 有关联的其他爬虫批次表（mysql）注意：要避免环路 如 A -> B & B -> A 。 环路可用related_table_folder指定
            related_table_folder 与 related_batch_record 选其一配置即可。
            若相关连的爬虫为批次爬虫，推荐以related_batch_record配置，
            若相关连的爬虫为普通爬虫，无批次表，可以以related_table_folder配置
        @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务，及where后的条件语句
        @param task_order_by: 取任务时的排序条件 如 id desc


        @param *parser_args: 传给parser下start_requests的参数, tuple()
        @param **parser_kwargs: 传给parser下start_requests的参数, dict()
        ---------
        @result:
        """
        Scheduler.__init__(
            self,
            table_folder=table_folder,
            parser_count=parser_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_tabs=delete_tabs,
            process_num=process_num,
            auto_stop_when_spider_done=auto_stop_when_spider_done,
            auto_start_requests=False,
            send_run_time=send_run_time,
            batch_interval=batch_interval,
            *parser_args,
            **parser_kwargs)

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._request_buffer = RequestBuffer(self._table_folder)

        self._task_table = task_table  # mysql中的任务表
        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
        self._batch_name = batch_name  # 批次采集程序名称
        self._task_keys = task_keys  # 需要获取的任务字段

        self._task_state = task_state  # mysql中任务表的state字段名
        self._min_task_count = min_task_count  # redis 中最少任务数
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit  # mysql中一次取的任务数量
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(table_folder=table_folder)
        ]  # 自己的task表也需要检查是否有任务
        if related_table_folder:
            self._related_task_tables.append(
                setting.TAB_REQUSETS.format(table_folder=related_table_folder))

        self._related_batch_record = related_batch_record
        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)
        self._task_order_by = task_order_by and " order by {}".format(
            task_order_by)

        self._batch_date_cache = None
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # 报警相关
        self._send_msg_interval = datetime.timedelta(hours=1)  # 每隔1小时发送一次报警
        self._last_send_msg_time = None

        self._spider_last_done_time = None  # 爬虫最近已做任务数量时间
        self._spider_last_done_count = 0  # 爬虫最近已做任务数量
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # 多模版类爬虫

Example #8

Show file

class BatchSpider(BatchParser, Scheduler):
    def __init__(self,
                 task_table,
                 batch_record_table,
                 batch_name,
                 batch_interval,
                 task_keys,
                 task_state="state",
                 min_task_count=10000,
                 check_task_interval=5,
                 task_limit=10000,
                 related_table_folder=None,
                 related_batch_record=None,
                 task_condition="",
                 task_order_by="",
                 table_folder=None,
                 parser_count=None,
                 begin_callback=None,
                 end_callback=None,
                 delete_tabs=(),
                 process_num=None,
                 auto_stop_when_spider_done=None,
                 send_run_time=False,
                 *parser_args,
                 **parser_kwargs):
        """
        @summary: 批次爬虫
        必要条件
        1、需有任务表
            任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段，则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充

            参考建表语句如下：
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数',
              `state` int(11) DEFAULT NULL COMMENT '任务状态',
              `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

        2、需有批次记录表 不存在自动创建

            此表节结构固定，参考建表语句如下：
            CREATE TABLE `xxx_batch_record` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `batch_date` date DEFAULT NULL,
              `done_count` int(11) DEFAULT NULL,
              `total_count` int(11) DEFAULT NULL,
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        ---------
        @param task_table: mysql中的任务表
        @param batch_record_table: mysql 中的批次记录表
        @param batch_name: 批次采集程序名称
        @param batch_interval: 批次间隔 天为单位。 如想一小时一批次，可写成1/24
        @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser，则需将parser_name字段取出来。
        @param task_state: mysql中任务表的state字段名
        @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务
        @param check_task_interval: 检查是否还有任务的时间间隔；
        @param task_limit: 数据库中取任务的数量
        @param table_folder: 爬虫request及item存放reis中的文件夹
        @param parser_count: 线程数，默认为配置文件中的线程数
        @param begin_callback: 爬虫开始回调函数
        @param end_callback: 爬虫结束回调函数
        @param delete_tabs: 爬虫启动时删除的表，元组类型。 支持正则
        @param process_num: 进程数
        @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务，默认自动结束
        @param send_run_time: 发送运行时间
        @param related_table_folder: 有关联的其他爬虫任务表（redis）
        @param related_batch_record: 有关联的其他爬虫批次表（mysql）注意：要避免环路 如 A -> B & B -> A 。 环路可用related_table_folder指定
            related_table_folder 与 related_batch_record 选其一配置即可。
            若相关连的爬虫为批次爬虫，推荐以related_batch_record配置，
            若相关连的爬虫为普通爬虫，无批次表，可以以related_table_folder配置
        @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务，及where后的条件语句
        @param task_order_by: 取任务时的排序条件 如 id desc


        @param *parser_args: 传给parser下start_requests的参数, tuple()
        @param **parser_kwargs: 传给parser下start_requests的参数, dict()
        ---------
        @result:
        """
        Scheduler.__init__(
            self,
            table_folder=table_folder,
            parser_count=parser_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_tabs=delete_tabs,
            process_num=process_num,
            auto_stop_when_spider_done=auto_stop_when_spider_done,
            auto_start_requests=False,
            send_run_time=send_run_time,
            batch_interval=batch_interval,
            *parser_args,
            **parser_kwargs)

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._request_buffer = RequestBuffer(self._table_folder)

        self._task_table = task_table  # mysql中的任务表
        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
        self._batch_name = batch_name  # 批次采集程序名称
        self._task_keys = task_keys  # 需要获取的任务字段

        self._task_state = task_state  # mysql中任务表的state字段名
        self._min_task_count = min_task_count  # redis 中最少任务数
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit  # mysql中一次取的任务数量
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(table_folder=table_folder)
        ]  # 自己的task表也需要检查是否有任务
        if related_table_folder:
            self._related_task_tables.append(
                setting.TAB_REQUSETS.format(table_folder=related_table_folder))

        self._related_batch_record = related_batch_record
        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)
        self._task_order_by = task_order_by and " order by {}".format(
            task_order_by)

        self._batch_date_cache = None
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # 报警相关
        self._send_msg_interval = datetime.timedelta(hours=1)  # 每隔1小时发送一次报警
        self._last_send_msg_time = None

        self._spider_last_done_time = None  # 爬虫最近已做任务数量时间
        self._spider_last_done_count = 0  # 爬虫最近已做任务数量
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # 多模版类爬虫

    def init_property(self):
        """
        每个批次开始时需要重置的属性
        @return:
        """
        self._last_send_msg_time = None

        self._spider_last_done_time = None
        self._spider_last_done_count = 0  # 爬虫刚开始启动时已做任务数量

    def add_parser(self, parser):
        parser = parser(
            self._task_table,
            self._batch_record_table,
            self._task_state,
            self._date_format,
        )  # parser 实例化
        self._parsers.append(parser)

    def start_monitor_task(self):
        """
        @summary: 监控任务状态
        ---------
        ---------
        @result:
        """
        if not self._parsers:  # 不是多模版模式， 将自己注入到parsers，自己为模版
            self._is_more_parsers = False
            self._parsers.append(self)

        elif len(self._parsers) <= 1:
            self._is_more_parsers = False

        self.create_batch_record_table()

        # 添加任务
        for parser in self._parsers:
            parser.add_task()

        is_first_check = True
        while True:
            try:
                if self.check_batch(is_first_check):  # 该批次已经做完
                    if not self._auto_stop_when_spider_done:
                        is_first_check = True
                        log.info("爬虫所有任务已做完，不自动结束，等待新任务...")
                        time.sleep(self._check_task_interval)
                        continue
                    else:
                        break

                is_first_check = False

                # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
                tab_requests = setting.TAB_REQUSETS.format(
                    table_folder=self._table_folder)
                todo_task_count = self._redisdb.zget_count(tab_requests)

                tasks = []
                if todo_task_count < self._min_task_count:  # 从mysql中取任务
                    # 更新batch表的任务状态数量
                    self.update_task_done_count()

                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" %
                             todo_task_count)
                    tasks = self.get_todo_task_from_mysql()
                    if not tasks:  # 状态为0的任务已经做完，需要检查状态为2的任务是否丢失

                        if (todo_task_count == 0
                            ):  # redis 中无待做任务，此时mysql中状态为2的任务为丢失任务。需重新做
                            lose_task_count = self.get_lose_task_count()

                            if not lose_task_count:
                                time.sleep(self._check_task_interval)
                                continue

                            elif (
                                    lose_task_count > self._task_limit * 5
                            ):  # 丢失任务太多，直接重置，否则每次等redis任务消耗完再取下一批丢失任务，速度过慢
                                log.info("正在重置丢失任务为待做 共 {} 条".format(
                                    lose_task_count))
                                # 重置正在做的任务为待做
                                if self.reset_lose_task_from_mysql():
                                    log.info("重置丢失任务成功")
                                else:
                                    log.info("重置丢失任务失败")

                                continue

                            else:  # 丢失任务少，直接取
                                log.info("正在取丢失任务 共 {} 条, 取 {} 条".format(
                                    lose_task_count,
                                    self._task_limit
                                    if self._task_limit <= lose_task_count else
                                    lose_task_count,
                                ))
                                tasks = self.get_doing_task_from_mysql()

                    else:
                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))

                else:
                    log.info("redis 中尚有%s条积压任务，暂时不派发新任务" % todo_task_count)

                if not tasks:
                    if todo_task_count >= self._min_task_count:
                        # log.info('任务正在进行 redis中剩余任务 %s' % todo_task_count)
                        pass
                    else:
                        log.info("mysql 中无待做任务 redis中剩余任务 %s" %
                                 todo_task_count)
                else:
                    # make start requests
                    self.distribute_task(tasks)
                    log.info("添加任务到redis成功")

            except Exception as e:
                log.exception(e)

            time.sleep(self._check_task_interval)

    def create_batch_record_table(self):
        sql = (
            "select table_name from information_schema.tables where table_name like '%s'"
            % self._batch_record_table)
        tables_name = self._mysqldb.find(sql)
        if not tables_name:
            sql = """
                CREATE TABLE `{table_name}` (
                      `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                      `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间',
                      `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)',
                      `total_count` int(11) DEFAULT NULL COMMENT '任务总数',
                      `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)',
                      `interval` float(11) DEFAULT NULL COMMENT '批次间隔',
                      `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour',
                      `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间',
                      `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间',
                      `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成  1 完成',
                      PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
            """.format(
                table_name=self._batch_record_table,
                batch_date="date"
                if self._date_format == "%Y-%m-%d" else "datetime",
            )

            self._mysqldb.execute(sql)

    def distribute_task(self, tasks):
        """
        @summary: 分发任务
        ---------
        @param tasks:
        ---------
        @result:
        """
        if self._is_more_parsers:  # 为多模版类爬虫，需要下发指定的parser
            for task in tasks:
                for parser in self._parsers:  # 寻找task对应的parser
                    if parser.name in task:
                        requests = parser.start_requests(task)
                        if requests and not isinstance(requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "start_requests"))

                        result_type = 1
                        for request in requests or []:
                            if isinstance(request, Request):
                                request.parser_name = request.parser_name or parser.name
                                self._request_buffer.put_request(request)
                                result_type = 1

                            elif isinstance(request, Item):
                                self._item_buffer.put_item(request)
                                result_type = 2

                                if (self._item_buffer.get_items_count() >=
                                        MAX_ITEM_COUNT):
                                    self._item_buffer.flush()

                            elif callable(
                                    request):  # callbale的request可能是更新数据库操作的函数
                                if result_type == 1:
                                    self._request_buffer.put_request(request)
                                else:
                                    self._item_buffer.put_item(request)

                                    if (self._item_buffer.get_items_count() >=
                                            MAX_ITEM_COUNT):
                                        self._item_buffer.flush()

                            else:
                                raise TypeError(
                                    "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}"
                                    .format(type(requests)))

                        break

        else:  # task没对应的parser 则将task下发到所有的parser
            for task in tasks:
                for parser in self._parsers:
                    requests = parser.start_requests(task)
                    if requests and not isinstance(requests, Iterable):
                        raise Exception("%s.%s返回值必须可迭代" %
                                        (parser.name, "start_requests"))

                    result_type = 1
                    for request in requests or []:
                        if isinstance(request, Request):
                            request.parser_name = request.parser_name or parser.name
                            self._request_buffer.put_request(request)
                            result_type = 1

                        elif isinstance(request, Item):
                            self._item_buffer.put_item(request)
                            result_type = 2

                            if self._item_buffer.get_items_count(
                            ) >= MAX_ITEM_COUNT:
                                self._item_buffer.flush()

                        elif callable(
                                request):  # callbale的request可能是更新数据库操作的函数
                            if result_type == 1:
                                self._request_buffer.put_request(request)
                            else:
                                self._item_buffer.put_item(request)

                                if (self._item_buffer.get_items_count() >=
                                        MAX_ITEM_COUNT):
                                    self._item_buffer.flush()

        self._request_buffer.flush()
        self._item_buffer.flush()

    def __get_task_state_count(self):
        sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format(
            state=self._task_state,
            task_table=self._task_table,
            task_condition=self._task_condition_prefix_where,
        )
        task_state_count = self._mysqldb.find(sql)

        task_state = {
            "total_count":
            sum(count for state, count in task_state_count),
            "done_count":
            sum(count for state, count in task_state_count
                if state in (1, -1)),
            "failed_count":
            sum(count for state, count in task_state_count if state == -1),
        }

        return task_state

    def update_task_done_count(self):
        """
        @summary: 更新批次表中的任务状态
        ---------
        ---------
        @result:
        """
        task_count = self.__get_task_state_count()

        # log.info('《%s》 批次进度 %s/%s' % (self._batch_name, done_task_count, total_task_count))

        # 更新批次表
        sql = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format(
            self._batch_record_table,
            task_count.get("done_count"),
            task_count.get("total_count"),
            task_count.get("failed_count"),
            self._batch_interval
            if self._batch_interval >= 1 else self._batch_interval * 24,
            "day" if self._batch_interval >= 1 else "hour",
            self.batch_date,
        )
        self._mysqldb.update(sql)

    def update_is_done(self):
        sql = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0".format(
            self._batch_record_table, self.batch_date)
        self._mysqldb.update(sql)

    def get_todo_task_from_mysql(self):
        """
        @summary: 取待做的任务
        ---------
        ---------
        @result:
        """
        # TODO 分批取数据 每批最大取 1000000个，防止内存占用过大
        # 查询任务
        sql = "select %s from %s where %s = 0%s%s limit %s" % (
            ", ".join(self._task_keys),
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        tasks = self._mysqldb.find(sql)

        if tasks:
            # 更新任务状态
            for i in range(0, len(tasks), 10000):  # 10000 一批量更新
                task_ids = str(tuple([task[0] for task in tasks[i:i + 10000]
                                      ])).replace(",)", ")")
                sql = "update %s set %s = 2 where id in %s" % (
                    self._task_table,
                    self._task_state,
                    task_ids,
                )
                self._mysqldb.update(sql)

        return tasks

    def get_doing_task_from_mysql(self):
        """
        @summary: 取正在做的任务
        ---------
        ---------
        @result:
        """

        # 查询任务
        sql = "select %s from %s where %s = 2%s%s limit %s" % (
            ", ".join(self._task_keys),
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        tasks = self._mysqldb.find(sql)

        return tasks

    def get_lose_task_count(self):
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
        batch_date, total_count, done_count = batch_info[0]
        return total_count - done_count

    def reset_lose_task_from_mysql(self):
        """
        @summary: 重置丢失任务为待做
        ---------
        ---------
        @result:
        """

        sql = "update {table} set {state} = 0 where {state} = 2{task_condition}".format(
            table=self._task_table,
            state=self._task_state,
            task_condition=self._task_condition_prefix_and,
        )
        return self._mysqldb.update(sql)

    def get_deal_speed(self, total_count, done_count, last_batch_date):
        """
        获取处理速度
        @param total_count: 总数量
        @param done_count: 做完数量
        @param last_batch_date: 批次时间 datetime
        @return:
            deal_speed （条/小时）, need_time （秒）, overflow_time（秒） （ overflow_time < 0 时表示提前多少秒完成 )
            或
            None
        """
        if not self._spider_last_done_count:
            now_date = datetime.datetime.now()
            self._spider_last_done_count = done_count
            self._spider_last_done_time = now_date

        if done_count > self._spider_last_done_count:
            now_date = datetime.datetime.now()

            time_interval = (now_date -
                             self._spider_last_done_time).total_seconds()
            deal_speed = (done_count -
                          self._spider_last_done_count) / time_interval  # 条/秒
            need_time = (total_count - done_count) / deal_speed  # 单位秒
            overflow_time = (
                (now_date - last_batch_date).total_seconds() + need_time -
                datetime.timedelta(days=self._batch_interval).total_seconds()
            )  # 溢出时间 秒
            calculate_speed_time = now_date.strftime(
                "%Y-%m-%d %H:%M:%S")  # 统计速度时间

            deal_speed = int(deal_speed * 3600)  # 条/小时

            # 更新最近已做任务数及时间
            self._spider_last_done_count = done_count
            self._spider_last_done_time = now_date

            self._spider_deal_speed_cached = (
                deal_speed,
                need_time,
                overflow_time,
                calculate_speed_time,
            )

        return self._spider_deal_speed_cached

    def check_batch(self, is_first_check=False):
        """
        @summary: 检查批次是否完成
        ---------
        @param: is_first_check 是否为首次检查，若首次检查，且检查结果为批次已完成，则不发送批次完成消息。因为之前发送过了
        ---------
        @result: 完成返回True 否则False
        """

        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)

        if batch_info:
            batch_date, total_count, done_count = batch_info[0]

            now_date = datetime.datetime.now()
            last_batch_date = datetime.datetime.strptime(
                batch_date, self._date_format)
            time_difference = now_date - last_batch_date

            if total_count == done_count and time_difference < datetime.timedelta(
                    days=self._batch_interval):  # 若在本批次内，再次检查任务表是否有新增任务
                # # 改成查询任务表 看是否真的没任务了，因为batch_record表里边的数量可能没来得及更新
                task_count = self.__get_task_state_count()

                total_count = task_count.get("total_count")
                done_count = task_count.get("done_count")

            if total_count == done_count:
                # 检查相关联的爬虫是否完成
                releated_spider_is_done = self.related_spider_is_done()
                if releated_spider_is_done == False:
                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
                        self._batch_name,
                        self._related_batch_record
                        or self._related_task_tables,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    log.info(msg)
                    # 检查是否超时 超时发出报警
                    if time_difference >= datetime.timedelta(
                            days=self._batch_interval):  # 已经超时
                        if (not self._last_send_msg_time
                                or now_date - self._last_send_msg_time >=
                                self._send_msg_interval):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")

                    return False

                elif releated_spider_is_done == True:
                    # 更新is_done 状态
                    self.update_is_done()

                else:
                    self.update_is_done()

                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
                    self._batch_name, batch_date, done_count)
                log.info(msg)
                if not is_first_check:
                    self.send_msg(msg)

                # 判断下一批次是否到
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):
                    msg = "《{}》下一批次开始".format(self._batch_name)
                    log.info(msg)
                    self.send_msg(msg)

                    # 初始化任务表状态
                    if self.init_task() != False:  # 更新失败返回False 其他返回True/None
                        # 初始化属性
                        self.init_property()

                        is_success = (
                            self.record_batch()
                        )  # 有可能插入不成功，但是任务表已经重置了，不过由于当前时间为下一批次的时间，检查批次是否结束时不会检查任务表，所以下次执行时仍然会重置
                        if is_success:
                            log.info(
                                "插入新批次记录成功 1分钟后开始下发任务")  # 防止work批次时间没来得及更新
                            tools.delay_time(60)

                            return False  # 下一批次开始

                        else:
                            return True  # 下一批次不开始。先不派发任务，因为批次表新批次插入失败了，需要插入成功后再派发任务

                else:
                    log.info("《{}》下次批次时间未到".format(self._batch_name))
                    if not is_first_check:
                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
                    return True

            else:
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):  # 已经超时
                    time_out = time_difference - datetime.timedelta(
                        days=self._batch_interval)
                    time_out_pretty = tools.format_seconds(
                        time_out.total_seconds())

                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
                        self._batch_name,
                        time_out_pretty,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    if self._batch_interval >= 1:
                        msg += ", 期望时间{}天".format(self._batch_interval)
                    else:
                        msg += ", 期望时间{}小时".format(self._batch_interval * 24)

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))

                    log.info(msg)

                    if (not self._last_send_msg_time
                            or now_date - self._last_send_msg_time >=
                            self._send_msg_interval):
                        self._last_send_msg_time = now_date
                        self.send_msg(msg, level="error")

                else:  # 未超时
                    remaining_time = (
                        datetime.timedelta(days=self._batch_interval) -
                        time_difference)
                    remaining_time_pretty = tools.format_seconds(
                        remaining_time.total_seconds())

                    if self._batch_interval >= 1:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval,
                            remaining_time_pretty,
                        )
                    else:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval * 24,
                            remaining_time_pretty,
                        )

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))
                            # 发送警报
                            if (not self._last_send_msg_time
                                    or now_date - self._last_send_msg_time >=
                                    self._send_msg_interval):
                                self._last_send_msg_time = now_date
                                self.send_msg(msg, level="error")

                        elif overflow_time < 0:
                            msg += ", 该批次预计提前 {} 完成".format(
                                tools.format_seconds(-overflow_time))

                    log.info(msg)

        else:
            # 插入batch_date
            self.record_batch()

            # 初始化任务表状态 可能有产生任务的代码
            self.init_task()

            return False

    def related_spider_is_done(self):
        """
        相关连的爬虫是否跑完
        @return: True / False / None 表示无相关的爬虫 可由自身的total_count 和 done_count 来判断
        """

        for related_redis_task_table in self._related_task_tables:
            if self._redisdb.exists_key(related_redis_task_table):
                return False

        if self._related_batch_record:
            sql = "select is_done from {} order by id desc limit 1".format(
                self._related_batch_record)
            is_done = self._mysqldb.find(sql)
            is_done = is_done[0][0] if is_done else None

            if is_done is None:
                log.warning("相关联的批次表不存在或无批次信息")
                return None

            if not is_done:
                return False

        return True

    def record_batch(self):
        """
        @summary: 记录批次信息（初始化）
        ---------
        ---------
        @result:
        """

        # 查询总任务数
        sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                self._batch_interval
                if self._batch_interval >= 1 else self._batch_interval * 24,
                "day" if self._batch_interval >= 1 else "hour",
            ))

        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 为成功)
        if affect_count:
            # 重置批次日期
            self._batch_date_cache = batch_date
            # 重新刷下self.batch_date 中的 os.environ.get('batch_date') 否则日期还停留在上一个批次
            os.environ["batch_date"] = self._batch_date_cache

            # 爬虫开始
            self.spider_begin()
            self.record_spider_state(
                spider_type=2,
                state=0,
                batch_date=batch_date,
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )
        else:
            log.error("插入新批次失败")

        return affect_count

    # -------- 批次结束逻辑 ------------

    def task_is_done(self):
        """
        @summary: 检查任务状态 是否做完 同时更新批次时间 (不能挂 挂了批次时间就不更新了)
        ---------
        ---------
        @result: True / False （做完 / 未做完）
        """

        is_done = False

        # 查看批次记录表任务状态
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )

        batch_info = self._mysqldb.find(sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        if batch_info:
            self._batch_date_cache, total_count, done_count, is_done = batch_info[
                0]  # 更新self._batch_date_cache, 防止新批次已经开始了，但self._batch_date_cache还是原来的批次时间

            log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            ))
            os.environ[
                "batch_date"] = self._batch_date_cache  # 更新BatchParser里边的批次时间

        if is_done:  # 检查任务表中是否有没做的任务 若有则is_done 为 False
            # 比较耗时 加锁防止多进程同时查询
            with RedisLock(
                    key=self._spider_name,
                    timeout=3600,
                    wait_timeout=0,
                    redis_uri="redis://:{password}@{host_post}/{db}".format(
                        password=setting.REDISDB_USER_PASS,
                        host_post=setting.REDISDB_IP_PORTS,
                        db=setting.REDISDB_DB,
                    ),
            ) as lock:
                if lock.locked:
                    log.info("批次表标记已完成，正在检查任务表是否有未完成的任务")

                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                        self._task_table,
                        self._task_state,
                        self._task_state,
                        self._task_condition_prefix_and,
                    )
                    tasks = self._mysqldb.find(sql)  # [(1,)]  / []
                    if tasks:
                        log.info("检测到任务表中有未完成任务，等待任务下发")
                        is_done = False

                        # 更新batch_record 表的is_done 状态，减少查询任务表的次数
                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                            batch_record_table=self._batch_record_table,
                            batch_date=self._batch_date_cache,
                        )
                        self._mysqldb.update(sql)

                    else:
                        log.info("任务表中任务均已完成，爬虫结束")
                else:
                    log.info("批次表标记已完成，其他爬虫进程正在检查任务表是否有未完成的任务，本进程跳过检查，继续等待")

                    is_done = False

        return is_done

    def run(self):
        """
        @summary: 重写run方法 检查mysql中的任务是否做完， 做完停止
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # 不是add_parser 模式
                self._parsers.append(self)

            self._start()

            while True:
                if (
                        self.task_is_done() and self.all_thread_is_done()
                ):  # redis全部的任务已经做完 并且mysql中的任务已经做完（检查各个线程all_thread_is_done，防止任务没做完，就更新任务状态，导致程序结束的情况）
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if self._auto_stop_when_spider_done:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()
                tools.delay_time(10)  # 10秒钟检查一次爬虫状态

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg)

            os._exit(137)  # 使退出码为35072 方便爬虫管理器重启

    @classmethod
    def to_DebugBatchSpider(cls, *args, **kwargs):
        # DebugBatchSpider 继承 cls
        DebugBatchSpider.__bases__ = (cls, )
        DebugBatchSpider.__name__ = cls.__name__
        return DebugBatchSpider(*args, **kwargs)

Example #9

Show file

File: base_parser.py Project: zhuyoucai168/boris-spider

class BatchParser(BaseParse):
    """
    @summary: 批次爬虫模版
    ---------
    """
    def __init__(self,
                 task_table,
                 batch_record_table,
                 task_state,
                 date_format,
                 task_condition=""):
        self._mysqldb = MysqlDB()  # mysqldb

        self._task_table = task_table  # mysql中的任务表
        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
        self._task_state = task_state  # mysql中任务表的state字段名
        self._date_format = date_format  # 批次日期格式

        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)

    def add_task(self):
        """
        @summary: 添加任务, 每次启动start_monitor 都会调用，且在init_task之前调用
        ---------
        ---------
        @result:
        """

    def start_requests(self, task):
        """
        @summary:
        ---------
        @param task: 任务信息 list
        ---------
        @result:
        """

    def init_task(self):
        """
        @summary: 初始化任务表中的任务， 新一个批次开始时调用。 可能会重写
        ---------
        ---------
        @result:
        """

        sql = "update {task_table} set {state} = 0 where {state} != -1{task_condition}".format(
            task_table=self._task_table,
            state=self._task_state,
            task_condition=self._task_condition_prefix_and,
        )
        return self._mysqldb.update(sql)

    def update_task_state(self, task_id, state=1, **kwargs):
        """
        @summary: 更新任务表中任务状态，做完每个任务时代码逻辑中要主动调用。可能会重写
        调用方法为 yield lambda : self.update_task_state(task_id, state)
        ---------
        @param task_id:
        @param state:
        ---------
        @result:
        """

        kwargs["id"] = task_id
        kwargs[self._task_state] = state

        sql = tools.make_update_sql(
            self._task_table,
            kwargs,
            condition="id = {task_id}".format(task_id=task_id))

        if self._mysqldb.update(sql):
            log.debug("置任务%s状态成功" % task_id)
        else:
            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))

    def update_task_batch(self, task_id, state=1, **kwargs):
        """
        批量更新任务 多处调用，更新的字段必须一致
        注意：需要 写成 yield update_task_batch(...) 否则不会更新
        @param task_id:
        @param state:
        @param kwargs:
        @return:
        """
        kwargs["id"] = task_id
        kwargs[self._task_state] = state

        update_item = UpdateItem(**kwargs)
        update_item.table_name = self._task_table
        update_item.name_underline = self._task_table + "_item"

        return update_item

    @property
    def batch_date(self):
        """
        @summary: 获取批次时间
        ---------
        ---------
        @result:
        """

        batch_date = os.environ.get("batch_date")
        if not batch_date:
            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
                date_format=self._date_format.replace(":%M", ":%i"),
                batch_record_table=self._batch_record_table,
            )
            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
            if batch_info:
                os.environ["batch_date"] = batch_date = batch_info[0][0]
            else:
                log.error("需先运行 start_monitor_task()")
                os._exit(137)  # 使退出码为35072 方便爬虫管理器重启

        return batch_date

Example #10

Show file

File: export_data.py Project: weilan157/spider

 def __init__(self):
     self._redisdb = RedisDB()
     self._to_db = MysqlDB()

Example #11

Show file

File: export_data.py Project: weilan157/spider

class ExportData(object):
    def __init__(self):
        self._redisdb = RedisDB()
        self._to_db = MysqlDB()

    def export(self, from_table, to_table, auto_update=False, batch_count=100):
        """
        @summary:
        用于从redis的item中导出数据到关系型数据库，如mysql/oracle
        from_table与to_table表结构必须一致
        ---------
        @param from_table:
        @param to_table:
        @param auto_update: 当数据存在时是否自动更新 默认否
        ---------
        @result:
        """
        total_count = 0

        while True:
            datas = []
            try:
                datas = self._redisdb.sget(from_table,
                                           count=batch_count,
                                           is_pop=False)
                if not datas:
                    log.info("""
                        \r%s -> %s 共导出 %s 条数据""" %
                             (from_table, to_table, total_count))
                    break

                json_datas = [eval(data) for data in datas]
                sql, json_datas = tools.make_batch_sql(to_table, json_datas,
                                                       auto_update)
                if self._to_db.add_batch(sql, json_datas):
                    total_count += len(json_datas)
                    self._redisdb.srem(from_table, datas)

            except Exception as e:
                log.exception(e)
                log.error(datas)

    def export_all(
        self,
        tables,
        auto_update=False,
        batch_count=100,
        every_table_per_export_callback=None,
    ):
        """
        @summary: 导出所有item
        ---------
        @param tables: 如qidian  则导出起点下面所有的items
        数据库中的表格式必须有规律 如导出 qidian:comment:s_qidian_book_comment_dynamic_item 对应导入 qidian_book_comment_dynamic
        @param auto_update: 是否自动更新
        @param batch_count: 每批次导出的数量
        @every_table_per_export_callback: 导出前的回调函数, 用来修改特定表的参数 to_table, auto_update, batch_count
        如：
            def every_table_per_export_callback(to_table, auto_update, batch_count):
                if to_table == 'xxx':
                    auto_update = True
                return to_table, auto_update, batch_count
        ---------
        @result:
        """
        tables = (self._redisdb.getkeys(tables + "*_item")
                  if not isinstance(tables, list) else tables)
        if not tables:
            log.info("无表数据")
        for table in tables:
            from_table = table
            to_table = tools.get_info(str(from_table),
                                      ":s_(.*?)_item",
                                      fetch_one=True)
            if callable(every_table_per_export_callback):
                to_table, auto_update, batch_count = every_table_per_export_callback(
                    to_table, auto_update, batch_count)

            log.info("""
                \r正在导出 %s -> %s""" % (from_table, to_table))
            self.export(from_table, to_table, auto_update, batch_count)

    def export_items(self, tab_item, items_data):
        """
        @summary:
        ---------
        @param tab_item: redis中items的表名
        @param items_data: [item.to_dict] 数据
        ---------
        @result:
        """

        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(to_table, items_data)
        add_count = self._to_db.add_batch(sql, datas)
        datas_size = len(datas)
        if add_count is None:
            log.error("导出数据到表 %s 失败" % (to_table))
        else:
            log.info("共导出 %s 条数据 到 %s, 重复 %s 条" %
                     (datas_size, to_table, datas_size - add_count))

        return add_count != None

    def update_items(self, tab_item, items_data, update_keys=()):
        """
        @summary:
        ---------
        @param tab_item: redis中items的表名
        @param items_data: [item.to_dict] 数据
        @param update_keys: 更新的字段
        ---------
        @result:
        """
        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(
            to_table,
            items_data,
            update_columns=update_keys or list(items_data[0].keys()),
        )
        update_count = self._to_db.add_batch(sql, datas)
        if update_count is None:
            log.error("更新表 %s 数据失败" % (to_table))
        else:
            msg = "共更新 %s 条数据 到 %s" % (update_count // 2, to_table)
            if update_keys:
                msg += " 更新字段为 {}".format(update_keys)
            log.info(msg)

        return update_count != None