Example #1
0
    def __init__(self):
        super(Posts, self).__init__()
        self.data = {
            'table':
            'posts',
            'column_types': [
                Id,
                PostTypeId,
                AcceptedAnswerId,
                ParentId,
                Score,
                ViewCount,
                Body,
                OwnerUserId,
                LastEditorUserId,
                LastEditDate,
                LastActivityDate,
                Title,
                Tags,
                AnswerCount,
                CommentCount,
                FavoriteCount,
                CreationDate,
            ],
        }

        logger.info(
            '%s: table %s, column "%s"' %
            (sys._getframe().f_code.co_name, self.data['table'], ','.join(
                map(lambda x: x.__name__, self.data['column_types']))))
Example #2
0
    def __init__(self):
        super(Users, self).__init__()
        self.data = {
            'table':
            'users',
            'column_types': [
                Id,
                Reputation,
                CreationDate,
                DisplayName,
                LastAccessDate,
                Views,
                WebsiteUrl,
                Location,
                AboutMe,
                Age,
                UpVotes,
                DownVotes,
                EmailHash,
            ],
        }

        logger.info(
            '%s: table %s, column "%s"' %
            (sys._getframe().f_code.co_name, self.data['table'], ','.join(
                map(lambda x: x.__name__, self.data['column_types']))))
Example #3
0
    def conf_db(self, user, password, host, db, port=3306):
        self.user = user
        self.password = password
        self.host = host
        self.db = db
        self.port = port

        self.conn = Mysql(host=self.host,
                          user=self.user,
                          password=self.password,
                          db=self.db,
                          port=self.port)
        logger.info('%s: %s@%s:%s %s' %
                    (sys._getframe().f_code.co_name, self.user, self.host,
                     self.port, self.db))
Example #4
0
    def __init__(self):
        super(Votes, self).__init__()
        self.data = {
            'table': 'votes',
            'column_types': [
                Id,
                PostId,
                VoteTypeId,
                CreationDate,
            ],
        }

        logger.info(
            '%s: table %s, column "%s"' %
            (sys._getframe().f_code.co_name, self.data['table'], ','.join(
                map(lambda x: x.__name__, self.data['column_types']))))
Example #5
0
    def __init__(self):
        super(Badges, self).__init__()
        self.data = {
                'table': 'badges',
                'column_types': [
                    Id,
                    UserId,
                    Name,
                    Date,
                ],
                }

        logger.info('%s: table %s, column "%s"' % (
            sys._getframe().f_code.co_name, 
            self.data['table'], 
            ','.join(map(lambda x: x.__name__, self.data['column_types']))
            ))
Example #6
0
    def __init__(self):
        super(Comments, self).__init__()
        self.data = {
            'table': 'comments',
            'column_types': [
                Id,
                PostId,
                Score,
                Text,
                CreationDate,
                UserId,
            ],
        }

        logger.info(
            '%s: table %s, column "%s"' %
            (sys._getframe().f_code.co_name, self.data['table'], ','.join(
                map(lambda x: x.__name__, self.data['column_types']))))
    def __init__(self):
        super(PostHistory, self).__init__()
        self.data = {
            'table':
            'post_history',
            'column_types': [
                Id,
                PostHistoryTypeId,
                PostId,
                RevisionGUID,
                CreationDate,
                UserId,
                Text,
            ],
        }

        logger.info(
            '%s: table %s, column "%s"' %
            (sys._getframe().f_code.co_name, self.data['table'], ','.join(
                map(lambda x: x.__name__, self.data['column_types']))))
Example #8
0
 def conf_batch(self, batch_size):
     self.batch_size = batch_size
     logger.info('%s: size %s' %
                 (sys._getframe().f_code.co_name, self.batch_size))
Example #9
0
class Store(object):
    def __init__(self):
        self.total = 0
        self.data = {'table': '', 'column_types': []}
        self.batch_size = 5000

    def __str__(self):
        pass

    def conf_batch(self, batch_size):
        self.batch_size = batch_size
        logger.info('%s: size %s' %
                    (sys._getframe().f_code.co_name, self.batch_size))

    def conf_db(self, user, password, host, db, port=3306):
        self.user = user
        self.password = password
        self.host = host
        self.db = db
        self.port = port

        self.conn = Mysql(host=self.host,
                          user=self.user,
                          password=self.password,
                          db=self.db,
                          port=self.port)
        logger.info('%s: %s@%s:%s %s' %
                    (sys._getframe().f_code.co_name, self.user, self.host,
                     self.port, self.db))

    def exec_insert(self, table, cols, insert_value_batch):
        # 判断column与value数目是否相等
        if len(insert_value_batch) == 0:
            logger.warn('%s: insert value batch is empty' %
                        (sys._getframe().f_code.co_name))
            return False
        if len(insert_value_batch[0]) != len(cols):
            logger.warn(
                '%s: column size is not equal to value, col %s val %s' %
                (sys._getframe().f_code.co_name, len(cols),
                 len(insert_value_batch[0])))
            return False

        vals = []
        for insert_value in insert_value_batch:
            val = '(%s)' % (','.join(insert_value))
            vals.append(val)
        vals = ','.join(vals)

        sql = 'INSERT IGNORE INTO %(table)s (%(cols)s) VALUES %(vals)s;'
        para = {
            'table': table,
            'cols': ','.join(map(lambda x: x.__name__, cols)),
            'vals': vals,
        }
        sql %= para
        while True:
            try:
                self.conn.execute(sql)
                break
            except Exception, e:
                logger.error(str(e))
                # 处理连接丢失的情况
                # (2013, 'Lost connection to MySQL server during query')
                # (2006, 'MySQL server has gone away')
                if e.errno == 2013 or e.errno == 2006:
                    self.conn.close()
                    self.conn = Mysql(host=self.host,
                                      user=self.user,
                                      password=self.password,
                                      db=self.db,
                                      port=self.port)
                    logger.warn('%s: reconnect %s@%s:%s %s' %
                                (sys._getframe().f_code.co_name, self.user,
                                 self.host, self.port, self.db))
                else:
                    return False

        self.total += len(insert_value_batch)
        logger.info('%s: insert %s/%s' % (sys._getframe().f_code.co_name,
                                          len(insert_value_batch), self.total))
        return True
Example #10
0
    def load(self, file_path, start=0, end=sys.maxint):
        insert_value_batch = []
        # load file incrementally
        logger.info('%s: file "%s" is loading, row range [%s, %s]' %
                    (sys._getframe().f_code.co_name, file_path, start, end))
        cur_iter = 0
        context = ET.iterparse(file_path,
                               events=('end', ))  # ignore event 'start'
        for event, elem in context:
            if elem.tag == 'row':  # skip tag <posts>, </posts>
                cur_iter += 1
                if cur_iter > end:  # 到达插入范围终点
                    break

                do_continue = False
                if cur_iter < start:
                    do_continue = True  # continue

                # wrap column definition to column.py
                #rec['Id'] = elem.attrib.get('Id', 'n/a')
                #rec['PostTypeId'] = elem.attrib.get('PostTypeId', 'n/a')
                #rec['AcceptedAnswerId'] = elem.attrib.get('AcceptedAnswerId', '')
                #print event, elem, elem.attrib.get('Id', 'n/a'), elem.attrib.get('Title', 'answer')

                # 过滤空数据行(预防异常事件)
                if not elem.attrib.get('Id', ''):
                    do_continue = True  # continue

                if not do_continue:
                    insert_value = []
                    rec_list = list()
                    for column_type in self.data['column_types']:
                        column = column_type(
                            elem.attrib.get(column_type.__name__, ''))
                        insert_value.append(column.sql())
                    insert_value_batch.append(insert_value)

                    # 批量执行插入操作
                    if len(insert_value_batch) % self.batch_size == 0 and len(
                            insert_value_batch) > 0:
                        logger.info('%s: batch insert row %s %s' %
                                    (sys._getframe().f_code.co_name, cur_iter -
                                     len(insert_value_batch) + 1, cur_iter))
                        #time.sleep(1)
                        self.exec_insert(self.data['table'],
                                         self.data['column_types'],
                                         insert_value_batch)
                        insert_value_batch = []

            # 每次循环保证会执行到此处,进行资源释放
            # It's safe to call clear() here because no descendants will be accessed
            elem.clear()
            # Also eliminate now-empty references from the root node to row (USED ONLY in lxml)
            #while elem.getprevious() is not None:
            #    del elem.getparent()[0]

            # end for-loop
        del context
        # 插入剩余数据
        if len(insert_value_batch) > 0:
            logger.info('%s: batch insert row %s %s' %
                        (sys._getframe().f_code.co_name,
                         cur_iter - len(insert_value_batch) + 1, cur_iter))
            self.exec_insert(self.data['table'], self.data['column_types'],
                             insert_value_batch)
            insert_value_batch = []
        logger.info('%s: file load done' % (sys._getframe().f_code.co_name))
        return