コード例 #1
0
    def _read_files(self):
        """
            如果没有找到可以解析的文件, 程序退出
            否则
            生成器迭代文件
        """
        try:
            _files = self._get_local_file(self.time_args)
            if len(_files) == 0:
                etllog.warning(u'没有可以解析的文件,程序退出!')
                from dwetl.petl.p_decorator import sys_exit1
                sys_exit1()

            for onefile in _files:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: Read File %s ==========' %
                    onefile)
                etllog.info(
                    '========== ETL-LOG-FILE: Read File %s ==========' %
                    onefile)
                with open(onefile, 'rb') as onef:
                    for line in onef:
                        yield line
        except Exception, e:
            etllog.error('[consume_log.py] ReadFile [_read_files]' + str(e))
コード例 #2
0
ファイル: sftp_get_file.py プロジェクト: marvinxw/DTETL
    def _get(self):
        try:
            if len(self._get_local_file(self.time_args)) != 0:
                # 如果有当前的文件, 属于执行过的文件
                etllog.error(' 文件已经执行过, 请在本地删除在执行! [* 不删除DB中相应的数据会有主键错误!]')
                from dwetl.petl.p_decorator import sys_exit1
                sys_exit1()

            for _server_info in self.server:  # 多台服务器
                self._conn_server(_server_info)
                etllog.info(
                    '========== ETL-LOG-FILE:Conn Server %s ==========' %
                    (json.dumps(_server_info.get('host'))))
                etllog.lfinfo('========== ETL-LOG-FILE:Server %s ==========' %
                              (json.dumps(_server_info.get('host'))))
                for f in self._get_remote_file(self.time_args):  # 多个目录文件
                    # 放文件
                    # 一对一的
                    fname = f.split('/')[-1]
                    fpath = '/'.join(f.split('/')[:-1]) + '/'
                    index = self._remote_path.index(fpath)
                    self.sftp.get(f, self._local_path[index] + fname)
                    etllog.lfinfo(
                        '========== ETL-LOG-FILE: Get File remote %s local %s=========='
                        % (f, self._local_path[index] + fname))
                    etllog.info(
                        '========== ETL-LOG-FILE: Get File remote %s local %s=========='
                        % (f, self._local_path[index] + fname))
            self._close()
        except Exception, e:
            etllog.error('[sftp_get_file.py] SftpGFile [_get]' + str(e))
コード例 #3
0
 def _start_table(self, table):
     """
         @:table 表
     """
     etllog.info('========== CURRENT TABLE %s ==========' % table)
     self._log_message['TABLE_OR_VIEW'] = table
     task = EtlTask(table)
     task.start()
コード例 #4
0
    def _update_max_delta_values(self, table, _is_data):
        """
            @:table 要update的表
            @:_is_data 用来确定是否更新表;
                        若_is_data没有值,不更新直接返回
        """

        # todo 制表
        if not _is_data:
            etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' %
                        table)
            return

        try:
            _delta_field = clean_etl_conf._get_delta_field(table)
            # where conditions
            _SQL = '''SELECT '''
            _SQL += ','.join(["MAX(%s)" % _ for _ in _delta_field])
            _SQL += " FROM %s WHERE " % table
            _where = '''1=1 '''
            for key, value in self._where_values.items():
                if isinstance(value, int) or isinstance(value, long):
                    _where += "and {0} >= {1} ".format(key, value)
                else:
                    _where += " and {0} >= to_date('{1}', 'yyyy-mm-dd HH:MI:SS')".format(
                        key, value)

            # max sql
            _SQL = _SQL + _where

            # new dict
            self._conn_cursor()
            self.query(_SQL)
            _info = dict(zip(_delta_field, self.fetchallrows()[0]))

            self._close_cursor()

            # modify
            for key, value in _info.items(
            ):  #{_: _info[_] for _ in self._where_values.keys()}.items():
                if isinstance(value, int) or isinstance(value, long):
                    # number格式加1
                    value += 1
                else:
                    # 时间格式加一秒
                    value = clean_datetime._time_plus_second(value)
                _info[key] = value

            # update
            self._DETAL_CACHES.update(
                {table: {_: _info[_]
                         for _ in self._where_values.keys()}})
        except Exception, e:
            etllog.error(
                '[delta_control.py] TDeltaConf [_update_max_delta_values]' +
                str(e))
コード例 #5
0
def main_file():
    start_time = time.time()
    etllog.info('========== [ETL] START TIME: ==========')

    etllog.info('========== [ETL] START TASK ==========')
    etl_work = EtlWork('access_log')
    etl_work.start()
    etllog.info('========== [ETL] FINISH TASK ==========')

    etllog.info('========== [ETL] END TIME: ==========')
    etllog.info('========== [ETL] TIME ELAPSED %s s==========' % (time.time()-start_time))
コード例 #6
0
ファイル: load_datas.py プロジェクト: marvinxw/DTETL
 def _insert(sql):
     """
     调用插入
     :param sql:
     :return:
     """
     try:
         self._roll_back()
         self.insert(sql)
     except Exception, e:
         etllog.info(u'_insert异常数据===>' + sql)
         etllog.error(
             '[load_datas.py] LoadDatas [_exception_clean] _insert ' +
             str(e))
コード例 #7
0
    def location_log(self):
        # 取location相应的数据

        def _match_line(line):
            p = re.compile(r'\[.*\]')
            # p.search(line).group()
            new_line = json.loads(p.search(line).group())
            return new_line

        def _rank_replace(line):
            clean_user_id = _user_id(line['userId'])
            if clean_user_id == 0:
                return
            else:
                return (
                    clean_user_id,
                    line['time'],
                    line['latitude'],
                    line['longitude'],
                )

        lines = self._get_rows()
        try:
            match_datas = map(_match_line, lines)
            # match_datas = [_match_line(line) for line in lines]
            _dict_list_datas = [
                _ for detail_datas in match_datas for _ in detail_datas
            ]
            # _datas_clean_none = [_rank_replace(_) for _ in _dict_list_datas]
            _datas_clean_none = map(_rank_replace, _dict_list_datas)
            self.lineno += len(match_datas)
            if len(match_datas) == 0:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s, TOTAL Data =========='
                    % (self.lineno, ))
                etllog.info(
                    '========== ETL-LOG-FILE: Read File END ==========')
                etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                              clean_datetime._today())
                return 0
            else:
                DATAS.put(_datas_clean_none)
                return DATAS.qsize()
        except Exception, e:
            etllog.error(
                '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
                str(e))
コード例 #8
0
ファイル: delta_control.py プロジェクト: marvinxw/DTETL
    def _end_update_sql(self, table, json_values):
        """
            @:table, 需要update的表; 将增量字段更新到表t_delta_conf中
        """
        try:
            self._update_sql += '''
                UPDATE t_delta_conf SET primarykey = '%s', update_date = '%s' WHERE tables = '%s';
            ''' % (json.dumps(json_values), clean_datetime._today(), table)

            self._conn_cursor()
            self.insert(self._update_sql)
            self._close_cursor()

            # 只实例化了一次, 每次保存后, 需要改为0
            self.limit = 0
        except Exception, e:
            etllog.info('[delta_control.py] TDeltaConf [t_delta_conf]======>' + self._update_sql)
            etllog.error('[delta_control.py] TDeltaConf [_end_update_sql]' + str(e))
コード例 #9
0
ファイル: load_datas.py プロジェクト: marvinxw/DTETL
 def _delete(err_str):
     """
     调用删除
     :param err_str:
     :return:
     """
     _sql = None
     try:
         # 匹配duplicate data
         p = re.compile(r'(\(.*\))=(\(.*\))')
         m = re.search(p, err_str)
         _field, _values = m.group(1), m.group(2)
         _sql = self.delete_sql(_split_str(_field), _split_str(_values))
         self._roll_back()
         self.delete(_sql)
     except Exception, e:
         etllog.info(u'_delete异常数据===>' + _sql)
         etllog.error(
             '[load_datas.py] LoadDatas [_exception_clean] _delete ' +
             str(e))
コード例 #10
0
    def access_log(self):
        # 取相应的数据
        def _rank_replace(line):
            new_line = line.replace('\r\n', '').split(',')
            clean_user_id = _user_id(new_line[3])
            if clean_user_id == 0:
                return
            else:
                return (
                    clean_user_id,
                    new_line[0],
                    5 if new_line[2] == 2 else _interaction_type(new_line[5]),
                    _object_id(new_line[5]),
                    _object_id_2(),
                    new_line[2] if new_line[2] else 0,
                    new_line[5],
                )

        lines = self._get_rows()
        try:
            # _datas = [_rank_replace(line) for line in lines]
            _datas = map(_rank_replace, lines)
            _datas_clean_none = [_ for _ in _datas if _ != None]
            self.lineno += len(_datas)
            if len(_datas) == 0:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s. ignore UUID: %s =========='
                    % (self.lineno, _datas.count(None)))
                etllog.info(
                    '========== ETL-LOG-FILE: Read File END ==========')
                etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                              clean_datetime._today())
                return 0
            else:
                DATAS.put(_datas_clean_none)
                return DATAS.qsize()
        except Exception, e:
            etllog.error(
                '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
                str(e))
コード例 #11
0
ファイル: delta_control.py プロジェクト: marvinxw/DTETL
    def _update_max_delta_values(self, table, _is_data):
        """
            @:table 要update的表
            @:_is_data 用来确定是否更新表;
                        若_is_data没有值,不更新直接返回
        """
        if not _is_data:
            self.limit = 0
            etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table)
            return

        if not self._where_values:
            self.limit = 0
            return

        try:
            # {'a': 1, 'max_a': 2, 'b': 2, 'max_b': 3, 'c': 3, 'max_c': 4}
            # ===>
            # {'a': 2, 'b': 3, 'c': 4}

            _info = {_: self._where_dict.get('max_'+_) for _ in self._where_dict.keys() if 'max_' not in _}

            for key, value in _info.items():
                if isinstance(value, int) or isinstance(value, long):
                    # number格式加1
                    value += 1
                else:
                    # 时间格式加一秒
                    value = str(clean_datetime._time_plus_second(value))
                _info[key] = value

            # 当前表任务完成, 增量字段
            etllog.info('==========[ETL] UPDATE TABLES=%s DELTA VALUE ==========' % table)

            self._end_update_sql(table, _info)

        except Exception, e:
            etllog.error('[delta_control.py] TDeltaConf [_update_max_delta_values]' + str(e))
コード例 #12
0
def _refresh_view():
    start_time = time.time()
    etllog.info('========== [ETL] REFRESH MATERIALIZED VIEW START TIME: %s ==========' % clean_datetime._today())

    rmv = RefreshMaterializedView('user_age_group')
    rmv._refresh_start()

    etllog.info('========== [ETL] REFRESH MATERIALIZED VIEW END TIME: %s ==========' % clean_datetime._today())
    etllog.info('========== [ETL] TIME ELAPSED %s s ==========' % (time.time()-start_time))
コード例 #13
0
def main_db(table):
    start_time = time.time()
    etllog.info('========== [ETL] START TIME: %s ==========' % clean_datetime._today())

    etllog.info('========== [ETL] READ DETAL TABLE ==========')
    t_delta_conf._detal_caches()

    etllog.info('========== [ETL] START TASK ==========')
    etl_work = EtlWork(table)
    etl_work.start()
    etllog.info('========== [ETL] FINISH TASK ==========')

    etllog.info('==========[ETL] SAVE DETAL TABLE==========')

    etllog.info('========== [ETL] END TIME: %s ==========' % clean_datetime._today())
    etllog.info('========== [ETL] TIME ELAPSED %s s==========' % (time.time()-start_time))
コード例 #14
0
 def _start_file(self, file):
     etllog.info('========== CURRENT FILE %s ==========' % self.options)
     etllog.lfinfo('========== CURRENT FILE %s ==========' % self.options)
     self._log_message['TABLE_OR_VIEW'] = '[FILE]' + file
     task = EtlTask(file)
     task.start()
コード例 #15
0
ファイル: load_datas.py プロジェクト: marvinxw/DTETL
                    str(e))

        try:
            while 1:
                if len(self.back_datas) != 0:
                    self._datas = [self.back_datas.pop()]
                else:
                    break
                try:
                    _put()
                except Exception, e:
                    # 删除数据,重新插入
                    err_str = str(e)  # 需要匹配的异常数据
                    _ins_sql = self._sql  # 需要再次插入的数据

                    if 'duplicate key' in err_str:
                        _delete(err_str)
                        _insert(_ins_sql)
                    else:
                        etllog.info(u'异常数据===>' + _ins_sql)

                    # 重新put后面的数据
                    self._roll_back()
                    self._datas = self.back_datas[:]
                    self._roll_back()
                    self.bulk_put(exflag=1)
                    break
        except Exception, e:
            etllog.error('[load_datas.py] LoadDatas [_exception_clean]' +
                         str(e))