Python MysqlDB Examples

Programming Language: Python

Namespace/Package Name: db.mysqldb

Class/Type: MysqlDB

Examples at hotexamples.com: 10

Python MysqlDB - 10 examples found. These are the top rated real world Python examples of db.mysqldb.MysqlDB extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

MysqlDB(5)

add(3)

find(2)

set_unique_key(2)

update(2)

close(1)

Example #1

Show file

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition={'read_status': 0},
                        datas=[],
                        callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()

Example #2

Show file

File: task_manager.py Project: xjqx05/wechat-spider-1

    def __init__(self):

        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get(
            'redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get(
            'spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get(
            'spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get(
            'spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get(
            'spider_interval').get('max_sleep_time')
        self._spider_interval_max = config.get('spider').get(
            'spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range")
                                  or "~").split('~')

Example #3

Show file

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

Example #4

Show file

File: task_manager.py Project: 18141922515/wechat-spider

class TaskManager():
    IS_IN_TIME_RANGE = 1  # 在时间范围
    NOT_REACH_TIME_RANGE = 2  # 没到达时间范围
    OVER_MIN_TIME_RANGE = 3  # 超过时间范围

    def __init__(self):

        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get('redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')

    def __get_task_from_redis(self, key):
        task = self._redis.zget(key, is_pop=True)
        if task:
            task = eval(task[0])
            return task

    def __random_int(self, min, max):
        pass

    def get_account_task(self):
        """
        获取公众号任务
        :return:
            {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None}
            或
            None
        """
        task = self.__get_task_from_redis(self._account_task_key)
        if not task:
            publish_time_condition = "AND last_publish_time < '{today}'".format(today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00')) if self._ignore_haved_crawl_today_article_account else ''
            sql = '''
                SELECT
                    __biz,
                    last_publish_time
                FROM
                    wechat_account_task
                WHERE
                    `is_zombie` != 1
                AND (
                    (
                        (
                            UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time)
                        ) > {monitor_interval}
                        {publish_time_condition}
                    )
                    OR (last_spider_time IS NULL)
                )
                '''.format(monitor_interval=self._monitor_interval, publish_time_condition=publish_time_condition)

            tasks = self._mysqldb.find(sql, to_json=True)
            if tasks:
                self._redis.zadd(self._account_task_key, tasks)
                task = self.__get_task_from_redis(self._account_task_key)

        return task

    def get_article_task(self):
        """
        获取文章任务
        :return:
            {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'}
            或
            None
        """
        task = self.__get_task_from_redis(self._article_task_key)
        if not task:
            sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000'
            tasks = self._mysqldb.find(sql)
            if tasks:
                # 更新任务状态
                task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')')
                sql = 'update wechat_article_task set state = 2 where id in %s' % (task_ids)
                self._mysqldb.update(sql)

            else:
                sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000'
                tasks = self._mysqldb.find(sql)

            if tasks:
                task_json = [
                    {
                        'article_url': article_url
                    }
                    for id, article_url in tasks
                ]
                self._redis.zadd(self._article_task_key, task_json)
                task = self.__get_task_from_redis(self._article_task_key)

        return task

    def update_article_task_state(self, sn, state=1):
        sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn)
        self._mysqldb.update(sql)

    def record_last_article_publish_time(self, __biz, last_publish_time):
        self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '')

    def is_reach_last_article_publish_time(self, __biz, publish_time):
        last_publish_time = self._redis.hget(self._last_article_publish_time, __biz)
        if not last_publish_time:
            # 查询mysql里是否有该任务
            sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz
            data = self._mysqldb.find(sql)
            if data:  # [(None,)] / []
                last_publish_time = str(data[0][0] or '')
                self.record_last_article_publish_time(__biz, last_publish_time)

        if last_publish_time is None:
            return

        if publish_time < last_publish_time:
            return True

        return False

    def is_in_crawl_time_range(self, publish_time):
        """
        是否在时间范围
        :param publish_time:
        :return: 是否达时间范围
        """
        if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]):
            return TaskManager.IS_IN_TIME_RANGE

        if self._crawl_time_range[0]:  # 时间范围 上限
            if publish_time > self._crawl_time_range[0]:
                return TaskManager.NOT_REACH_TIME_RANGE

            if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]:
                return TaskManager.IS_IN_TIME_RANGE

        if publish_time < self._crawl_time_range[1]:  # 下限
            return TaskManager.OVER_MIN_TIME_RANGE

        return TaskManager.IS_IN_TIME_RANGE

    def record_new_last_article_publish_time(self, __biz, new_last_publish_time):
        self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time)

    def get_new_last_article_publish_time(self, __biz):
        return self._redis.hget(self._new_last_article_publish_time, __biz)

    def update_account_last_publish_time(self, __biz, last_publish_time):
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz
        )
        self._mysqldb.update(sql)

    def is_zombie_account(self, last_publish_timestamp):
        if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400:
            return True
        return False

    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz
            )
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz
            )

        self._mysqldb.update(sql)

    def get_task(self, url=None, tip=''):
        """
        获取任务
        :param url: 指定url时，返回该url包装后的任务。否则先取公众号任务，无则取文章任务。若均无任务，则休眠一段时间之后再取
        :return:
        """

        sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = '正在抓取列表'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = '正在抓取详情'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('暂无任务 休眠 {}s'.format(sleep_time))
                    tip = '暂无任务 '

        if url:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), url=url, sleep_time_msec=sleep_time * 1000
            )
        else:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), sleep_time_msec=sleep_time * 1000
            )

        return next_page

    def reset_task(self):
        # 清除redis缓存
        keys = self._task_root_key + "*"
        keys = self._redis.getkeys(keys)
        if keys:
            for key in keys:
                self._redis.clear(key)

            # 重设任务
            sql = "update wechat_article_task set state = 0 where state = 2"
            self._mysqldb.update(sql)

Example #5

Show file

def create_table():
    wechat_article_list_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_list` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `title` varchar(255) UNIQUE COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `digest` varchar(2000) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `url` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `source_url` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT NULL  ,
      `cover` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `subtype` int(11) DEFAULT NULL,
      `is_multi` int(11) DEFAULT NULL,
      `author` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `copyright_stat` int(11) DEFAULT NULL,
      `duration` int(11) DEFAULT NULL,
      `del_flag` int(11) DEFAULT NULL,
      `type` int(11) DEFAULT NULL,
      `publish_time` datetime DEFAULT NULL,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    wechat_article_task_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_task` (
      `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
      `sn` varchar(50) DEFAULT NULL,
      `article_url` varchar(255) DEFAULT NULL UNIQUE ,
      `state` int(11) DEFAULT '0' COMMENT '文章抓取状态，0 待抓取 2 抓取中 1 抓取完毕 -1 抓取失败',
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`) USING BTREE,
      KEY `state` (`state`) USING BTREE
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
    '''

    wechat_article_dynamic_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_dynamic` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `read_num` int(11) DEFAULT NULL,
      `like_num` int(11) DEFAULT NULL,
      `comment_count` int(11) DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    wechat_article_comment_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_article_comment` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `comment_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '与文章关联',
      `nick_name` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `logo_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `content` varchar(2000) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `create_time` datetime DEFAULT NULL,
      `content_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '本条评论内容的id',
      `like_num` int(11) DEFAULT NULL,
      `is_top` int(11) DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `content_id` (`content_id`),
      KEY `comment_id` (`comment_id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    wechat_article_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_article` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `account` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `title` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `author` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `publish_time` datetime DEFAULT NULL,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `digest` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `cover` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `pics_url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
      `content_html` text COLLATE utf8mb4_unicode_ci,
      `source_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `comment_id` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `sn` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `sn` (`sn`),
      KEY `__biz` (`__biz`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    wechat_account_task_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_account_task` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `__biz` varchar(50) DEFAULT NULL,
      `last_publish_time` datetime DEFAULT NULL COMMENT '上次抓取到的文章发布时间，做文章增量采集用',
      `last_spider_time` datetime DEFAULT NULL COMMENT '上次抓取时间，用于同一个公众号每隔一段时间扫描一次',
      `is_zombie` int(11) DEFAULT '0' COMMENT '僵尸号 默认3个月未发布内容为僵尸号，不再检测',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
    '''

    wechat_account_table = '''
    CREATE TABLE IF NOT EXISTS `wechat_account` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `__biz` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `account` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `head_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `summary` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `qr_code` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `verify` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      `spider_time` datetime DEFAULT NULL,
      PRIMARY KEY (`id`),
      UNIQUE KEY `__biz` (`__biz`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=DYNAMIC
    '''

    if config.get('mysqldb').get('auto_create_tables'):
        mysqldb = MysqlDB(**config.get('mysqldb'))
        _create_table(mysqldb, wechat_article_list_table)
        _create_table(mysqldb, wechat_article_task_table)
        _create_table(mysqldb, wechat_article_dynamic_table)
        _create_table(mysqldb, wechat_article_comment_table)
        _create_table(mysqldb, wechat_article_table)
        _create_table(mysqldb, wechat_account_task_table)
        _create_table(mysqldb, wechat_account_table)

Example #6

Show file

File: export_data.py Project: cash2one/internet-content-detection

 def export_to_mysql(self):
     self._aim_db = MysqlDB()
     self.__export()

Example #7

Show file

File: export_data.py Project: cash2one/internet-content-detection

 def export_to_oracle(self):
     self._aim_db = OracleDB()
     self._is_oracle = True
     self.__export()

Example #8

Show file

File: export_data.py Project: cash2one/internet-content-detection

class ExportData():
    def __init__(self, source_table, aim_table, key_map, unique_key=None):
        '''
        @summary: 初始化
        ---------
        @param source_table: 源table
        @param aim_table:    目标table
        @param key_map:      目标table 和 源table 的键的映射
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # 目标键 = 源键对应的值         类型为str
            'aim_key2' : 'int_source_key3',          # 目标键 = 源键对应的值         类型为int
            'aim_key3' : 'date_source_key4',         # 目标键 = 源键对应的值         类型为date
            'aim_key4' : 'vint_id',                  # 目标键 = 值                   类型为int
            'aim_key5' : 'vstr_name',                # 目标键 = 值                   类型为str
            'aim_key6' : 'sint_select id from xxx'   # 目标键 = 值为sql 查询出的结果 类型为int
            'aim_key7' : 'sstr_select name from xxx' # 目标键 = 值为sql 查询出的结果 类型为str
        }

        @param unique_key:    唯一的key 目标数据库根据该key去重
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key

        self._mongodb = MongoDB()

        self._is_oracle = False
        self._export_count = 0

    def export_to_oracle(self):
        self._aim_db = OracleDB()
        self._is_oracle = True
        self.__export()

    def export_to_mysql(self):
        self._aim_db = MysqlDB()
        self.__export()

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)

        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # 取源key值 对应的type 和 key （源key包含type 和 key 信息）
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        datas = self._mongodb.find(self._source_table, {'read_status': 0})
        for data in datas:
            sql = 'insert into ' + self._aim_table + " (" + ', '.join(
                aim_keys) + ") values ("
            values = []
            for i in range(len(keys)):
                if value_types[i] == 'str':
                    values.append(data[keys[i]].replace(
                        "'", "''"))  # 将单引号替换成两个单引号 否者sql语句语法出错
                    sql += "'%s', "

                elif value_types[i] == 'int':
                    values.append(data[keys[i]])
                    if isinstance(data[keys[i]], int):
                        sql += '%d, '
                    else:
                        sql += '%s, '

                elif value_types[i] == 'date':
                    values.append(data[keys[i]])
                    if self._is_oracle:
                        sql += "to_date('%s','yyyy-mm-dd hh24:mi:ss'), "
                    else:
                        sql += "'%s', "

                elif value_types[i] == 'vint':
                    values.append(keys[i])
                    sql += '%s, '

                elif value_types[i] == 'vstr':
                    values.append(keys[i])
                    sql += "'%s', "

                elif value_types[i] == 'sint':
                    value = self._oracledb.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += '%d, '

                elif value_types[i] == 'sstr':
                    value = self._oracledb.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += "'%s', "

                else:
                    log.error('%s不符合key_map规定格式' % value_types[i])
                    return

            sql = sql[:-2] + ")"
            sql = sql % tuple(values)

            log.debug(sql)
            if self._aim_db.add(sql):
                self._export_count += 1
                self._mongodb.update(self._source_table, data,
                                     {'read_status': 1})

        # self._aim_db.close()
        log.debug('共导出%d条数据' % self._export_count)

Example #9

Show file

class ExportData():
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition={'read_status': 0},
                 datas=[],
                 callback='',
                 sync_to_es=False):
        '''
        @summary: 初始化
        ---------
        @param source_table: 源table mongo数据库
        @param aim_table:    目标table
        @param key_map:      目标table 和 源table 的键的映射
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # 目标键 = 源键对应的值         类型为str
            'aim_key2' : 'int_source_key3',          # 目标键 = 源键对应的值         类型为int
            'aim_key3' : 'date_source_key4',         # 目标键 = 源键对应的值         类型为date
            'aim_key4' : 'vint_id',                  # 目标键 = 值                   类型为int
            'aim_key5' : 'vstr_name',                # 目标键 = 值                   类型为str
            'aim_key6' : 'vdate_name',               # 目标键 = 值                   类型为date
            'aim_key7' : 'sint_select id from xxx'   # 目标键 = 值为sql 查询出的结果 类型为int
            'aim_key8' : 'sstr_select name from xxx' # 目标键 = 值为sql 查询出的结果 类型为str
            'aim_key9' : 'clob_key8'                 # 目标键 = 源键对应的值         类型为clob
            'aim_key10' : 'clob_key8'                # 目标键 = 源键对应的值         类型为str
        }

        @param unique_key:    唯一的key 目标数据库根据该key去重
        @param unique_key_mapping_source_key: 目标表中唯一的key所对应的源表中的key 该值不为空时 更新目标表中已有的数据
         eg: unique_key_mapping_source_key = {
            'url':'str_url'                         # 目标键 = 源键对应的值         类型为str
         }
        @param condition:    导出满足什么样条件的数据 默认是read_status = 0 的
        @param datas:   要导出的数据，格式为[{...},{...}] 或者 {}用于直接将json数组导入到目标表，为空时默认导出mongodb的数据
        @param callback 导出数据的回调，导出一组，执行一次，callback(execute_type, sql) execute_type为执行类型（ExportData.INSERT、ExportData.UPDATE、ExportData.EXCEPTION）
        sql 为执行的语句
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition

        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback

        self._is_oracle = False
        self._is_set_unique_key = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition={'read_status': 0},
                        datas=[],
                        callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()

    def make_sql(self, data):
        '''
        @summary:
        ---------
        @param data: 数据字典
        ---------
        @result: 当unique_key_mapping_source_key不为空时返回insert_sql, update_sql 否则返回insert_sql
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # 取源key值 对应的type 和 key （源key包含type 和 key 信息）
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "
        data_json = {}  # 导入到es中用
        values = []
        for i in range(len(keys)):
            if (value_types[i] != 'vint' and value_types[i] != 'vstr'
                    and value_types[i] != 'vdate' and value_types[i] != 'sint'
                    and value_types[i] != 'sstr') and (not data[keys[i]]
                                                       and data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                values.append(
                    str(data[keys[i]]).replace("'", "''")
                )  # if isinstance(data[keys[i]], str) else data[keys[i]])  # 将单引号替换成两个单引号 否者insert_sql语句语法出错
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    values_ = tools.cut_string(text, 1000)

                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value

                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(
                            data[keys[i]], str):
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:  # _id
                    values.append(int(str(data[keys[i]])[-6:], 16))

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sint':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % value
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sstr':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % value
                data_json[aim_keys[i].upper()] = values[-1]

            else:
                error_msg = '%s不符合key_map规定格式' % value_types[i]
                raise (Exception(error_msg))

        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)
        # tools.print(data_json)

        # log.debug(insert_sql)
        if self._unique_key_mapping_source_key:
            # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

            # value = tuple(self._unique_key_mapping_source_key.values())[0]
            # temp_var = value.split('_', 1)

            # source_key_types = temp_var[0]
            # source_key = temp_var[1]

            # if source_key_types == 'str':
            #     update_sql = update_sql[:-2] + " where %s = '%s'" %(aim_key, data[source_key])
            # elif source_key_types == 'int':
            #     update_sql = update_sql[:-2] + " where %s = %s" %(aim_key, data[source_key])

            # # log.debug(update_sql)

            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        datas = self._mongodb.find(
            self._source_table,
            condition=self._condition) if self._mongodb else (
                self._datas
                if isinstance(self._datas, list) else [self._datas])
        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+')
            def exception_callfunc(e):
                if 'ORA-00001' in str(e):
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''
            # log.debug(insert_sql)
            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT

                if self._update_read_status:
                    self._mongodb.update(self._source_table, data,
                                         {'read_status': 1})

            elif self._unique_key_mapping_source_key:
                # 取id字段
                aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

                value = tuple(self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)

                source_key_types = temp_var[0]
                source_key = temp_var[1]

                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, data[source_key])
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                #拼接update语句
                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # 删除 update 里面 id= xxx 的条件，保证更新后的数据 ID不变
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # 修改data_json 里的ID
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                # 更新
                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE

                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # 同步到ES
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            共导出%s条数据
            共更新%s条数据
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()

Example #10

Show file

File: data_pipeline.py Project: TIM952597205/wechat-spider

# -*- coding: utf-8 -*-
'''
Created on 2019/5/13 12:44 AM
---------
@summary:
---------
@author:
'''
from db.mysqldb import MysqlDB
import utils.tools as tools
from utils.log import log
from config import config

db = MysqlDB(**config.get('mysqldb'))


def save_account(data):
    log.debug(tools.dumps_json(data))

    sql = tools.make_insert_sql('wechat_account', data, insert_ignore=True)
    db.add(sql)


def save_article_list(datas: list):
    log.debug(tools.dumps_json(datas))

    sql, articles = tools.make_batch_sql('wechat_article_list', datas)
    db.add_batch(sql, articles)

    # 存文章任务
    article_task = [