Beispiel #1
0
    def _read_files(self):
        """
            如果没有找到可以解析的文件, 程序退出
            否则
            生成器迭代文件
        """
        try:
            _files = self._get_local_file(self.time_args)
            if len(_files) == 0:
                etllog.warning(u'没有可以解析的文件,程序退出!')
                from dwetl.petl.p_decorator import sys_exit1
                sys_exit1()

            for onefile in _files:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: Read File %s ==========' %
                    onefile)
                etllog.info(
                    '========== ETL-LOG-FILE: Read File %s ==========' %
                    onefile)
                with open(onefile, 'rb') as onef:
                    for line in onef:
                        yield line
        except Exception, e:
            etllog.error('[consume_log.py] ReadFile [_read_files]' + str(e))
Beispiel #2
0
    def _get(self):
        try:
            if len(self._get_local_file(self.time_args)) != 0:
                # 如果有当前的文件, 属于执行过的文件
                etllog.error(' 文件已经执行过, 请在本地删除在执行! [* 不删除DB中相应的数据会有主键错误!]')
                from dwetl.petl.p_decorator import sys_exit1
                sys_exit1()

            for _server_info in self.server:  # 多台服务器
                self._conn_server(_server_info)
                etllog.info(
                    '========== ETL-LOG-FILE:Conn Server %s ==========' %
                    (json.dumps(_server_info.get('host'))))
                etllog.lfinfo('========== ETL-LOG-FILE:Server %s ==========' %
                              (json.dumps(_server_info.get('host'))))
                for f in self._get_remote_file(self.time_args):  # 多个目录文件
                    # 放文件
                    # 一对一的
                    fname = f.split('/')[-1]
                    fpath = '/'.join(f.split('/')[:-1]) + '/'
                    index = self._remote_path.index(fpath)
                    self.sftp.get(f, self._local_path[index] + fname)
                    etllog.lfinfo(
                        '========== ETL-LOG-FILE: Get File remote %s local %s=========='
                        % (f, self._local_path[index] + fname))
                    etllog.info(
                        '========== ETL-LOG-FILE: Get File remote %s local %s=========='
                        % (f, self._local_path[index] + fname))
            self._close()
        except Exception, e:
            etllog.error('[sftp_get_file.py] SftpGFile [_get]' + str(e))
Beispiel #3
0
    def _mail_send(self,
                   FROM_ADDR=None,
                   PASSWORD=None,
                   TO_ADDR=None,
                   SMTP_SERVER=None,
                   PORT=None,
                   ATTACHMENT=None):
        """
            @FROM_ADDR 邮件发送人
            @PASSWORD 邮件发送人密码
            @TO_ADDR 发送给
            @SMTP_SERVER SMTP服务
            @PORT 端口号
            @ATTACHMENT 附件
        """
        self._mail_header(FROM_ADDR, TO_ADDR)
        self._mail_contents()

        if ATTACHMENT:
            self._mail_attachment(ATTACHMENT)

        try:
            #发送邮件
            smtp = smtplib.SMTP()
            smtp.set_debuglevel(1)
            smtp.connect(SMTP_SERVER, PORT)
            smtp.starttls()
            smtp.login(FROM_ADDR, PASSWORD)
            smtp.sendmail(FROM_ADDR, TO_ADDR, self._mail.as_string())
            smtp.quit()
        except Exception, e:
            etllog.error('smtpMail [_mail_send]' + str(e))
Beispiel #4
0
    def _mail_send(self,
                   FROM_ADDR=None,
                   PASSWORD=None,
                   TO_ADDR=None,
                   SUBJECT=None,
                   SMTP_SERVER=None,
                   PORT=None,
                   ATTACHMENT=None):
        self._mail_header(FROM_ADDR, TO_ADDR, SUBJECT)
        self._mail_contents()

        if ATTACHMENT:
            self._read_log_lastn()
            self._mail_attachment(ATTACHMENT)

        try:
            #发送邮件
            smtp = smtplib.SMTP()
            smtp.set_debuglevel(1)
            smtp.connect(SMTP_SERVER, PORT)
            smtp.starttls()
            smtp.login(FROM_ADDR, PASSWORD)
            smtp.sendmail(FROM_ADDR, TO_ADDR, self._mail.as_string())
            smtp.quit()
        except Exception, e:
            etllog.error('smtpMail [_mail_send]' + str(e))
Beispiel #5
0
 def _put(self):
     try:
         self.load_datas.bulk_put()
     except Exception, e:
         etllog.error('[etl_task.py] EtlTask [_put]' + str(e))
         from dwetl.petl.p_decorator import sys_exit1
         sys_exit1()
Beispiel #6
0
 def read_config_file(self):
     try:
         self.config = ConfigParser()
         self.config.read(self.CONFIGFILE)
     except Exception, e:
         etllog.error('[delta_control.py] RwConfigFile read_config_file' +
                      str(e))
Beispiel #7
0
 def set_config_context(self, section, option, value):
     try:
         self.config.set(section, option, value)
         self.config.write(open(self.CONFIGFILE, 'w'))
     except Exception, e:
         etllog.error('[delta_control.py] RwConfigFile set_config_context' +
                      str(e))
Beispiel #8
0
 def _get(self):
     try:
         res = self.extract_datas._bulk_get()
         return res
     except Exception, e:
         etllog.error('[etl_task.py] EtlTask [_get]' + str(e))
         from dwetl.petl.p_decorator import sys_exit1
         sys_exit1()
Beispiel #9
0
 def _query_sql_limit(self):
     """
         组成select语句
     """
     try:
         self._sql = self._local_sql + t_delta_conf._set_tables_limit()
     except Exception, e:
         etllog.error('[extract_datas.py] DbDatas [_query_sql]' + str(e))
Beispiel #10
0
    def ppserver(self):

        try:
            for g in self.groups:
                task_groups = EtlWork(self.groups, )
            print self.job_server.print_stats()
        except Exception, e:
            etllog.error(str(e))
            pass
Beispiel #11
0
    def _update_max_delta_values(self, table, _is_data):
        """
            @:table 要update的表
            @:_is_data 用来确定是否更新表;
                        若_is_data没有值,不更新直接返回
        """

        # todo 制表
        if not _is_data:
            etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' %
                        table)
            return

        try:
            _delta_field = clean_etl_conf._get_delta_field(table)
            # where conditions
            _SQL = '''SELECT '''
            _SQL += ','.join(["MAX(%s)" % _ for _ in _delta_field])
            _SQL += " FROM %s WHERE " % table
            _where = '''1=1 '''
            for key, value in self._where_values.items():
                if isinstance(value, int) or isinstance(value, long):
                    _where += "and {0} >= {1} ".format(key, value)
                else:
                    _where += " and {0} >= to_date('{1}', 'yyyy-mm-dd HH:MI:SS')".format(
                        key, value)

            # max sql
            _SQL = _SQL + _where

            # new dict
            self._conn_cursor()
            self.query(_SQL)
            _info = dict(zip(_delta_field, self.fetchallrows()[0]))

            self._close_cursor()

            # modify
            for key, value in _info.items(
            ):  #{_: _info[_] for _ in self._where_values.keys()}.items():
                if isinstance(value, int) or isinstance(value, long):
                    # number格式加1
                    value += 1
                else:
                    # 时间格式加一秒
                    value = clean_datetime._time_plus_second(value)
                _info[key] = value

            # update
            self._DETAL_CACHES.update(
                {table: {_: _info[_]
                         for _ in self._where_values.keys()}})
        except Exception, e:
            etllog.error(
                '[delta_control.py] TDeltaConf [_update_max_delta_values]' +
                str(e))
Beispiel #12
0
 def _datas_str(self):
     """
         拼成(),().....();
     """
     _mogrify = self.mogrify_post
     try:
         self._str_datas = ','.join(
             _mogrify(self._formatns(), d) for d in self._datas)
     except Exception, e:
         etllog.error('[load_datas.py] LoadDatas [_datas_str]' + str(e))
Beispiel #13
0
 def _get_datas(self):
     """
         执行sql语句
     """
     try:
         self.query(self._sql)
         res = self.fetchallrows()
     except Exception, e:
         etllog.error('[extract_datas.py] DbDatas [_get_datas]' + str(e))
         res = None
Beispiel #14
0
 def _detal_caches(self):
     """
         取出所有表的增量字段, 缓存在_DETAL_CACHES属性中
     """
     try:
         self.query(self._query_sql)
         self._DETAL_CACHES = {_[1]: _[2] for _ in self.fetchallrows()}
         self._close_cursor()
     except Exception, e:
         etllog.error('[delta_control.py] TDeltaConf [_detal_caches] ' + str(e))
Beispiel #15
0
 def _conn_server(self, _server_info):
     """
         sftp, 下载服务器文件到本地
     """
     try:
         self.pt = paramiko.Transport((_server_info['host'], _server_info['port']))
         self.pt.connect(username=_server_info['username'], password=_server_info['password'])
         self.sftp = paramiko.SFTPClient.from_transport(self.pt)
     except Exception, e:
         etllog.error('[common_file.py] FileUBase [_get]' + str(e))
Beispiel #16
0
 def __init__(self, options):
     """
         @:options 表 或 文件
     """
     self.table = options
     try:
         self.extract_datas = ExtractDatas(options)._create_instance()
         # 判断是文件还是表, 是文件取得表
         if options in clean_file_server._set_file_or_table():
             options = clean_file_server._set_file_or_table(options)
         self.load_datas = LoadDatas(options)
     except Exception, e:
         etllog.error('[etl_task.py] EtlTask [__init__]' + str(e))
Beispiel #17
0
 def _insert(sql):
     """
     调用插入
     :param sql:
     :return:
     """
     try:
         self._roll_back()
         self.insert(sql)
     except Exception, e:
         etllog.info(u'_insert异常数据===>' + sql)
         etllog.error(
             '[load_datas.py] LoadDatas [_exception_clean] _insert ' +
             str(e))
Beispiel #18
0
    def location_log(self):
        # 取location相应的数据

        def _match_line(line):
            p = re.compile(r'\[.*\]')
            # p.search(line).group()
            new_line = json.loads(p.search(line).group())
            return new_line

        def _rank_replace(line):
            clean_user_id = _user_id(line['userId'])
            if clean_user_id == 0:
                return
            else:
                return (
                    clean_user_id,
                    line['time'],
                    line['latitude'],
                    line['longitude'],
                )

        lines = self._get_rows()
        try:
            match_datas = map(_match_line, lines)
            # match_datas = [_match_line(line) for line in lines]
            _dict_list_datas = [
                _ for detail_datas in match_datas for _ in detail_datas
            ]
            # _datas_clean_none = [_rank_replace(_) for _ in _dict_list_datas]
            _datas_clean_none = map(_rank_replace, _dict_list_datas)
            self.lineno += len(match_datas)
            if len(match_datas) == 0:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s, TOTAL Data =========='
                    % (self.lineno, ))
                etllog.info(
                    '========== ETL-LOG-FILE: Read File END ==========')
                etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                              clean_datetime._today())
                return 0
            else:
                DATAS.put(_datas_clean_none)
                return DATAS.qsize()
        except Exception, e:
            etllog.error(
                '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
                str(e))
Beispiel #19
0
    def _end_update_sql(self, table, json_values):
        """
            @:table, 需要update的表; 将增量字段更新到表t_delta_conf中
        """
        try:
            self._update_sql += '''
                UPDATE t_delta_conf SET primarykey = '%s', update_date = '%s' WHERE tables = '%s';
            ''' % (json.dumps(json_values), clean_datetime._today(), table)

            self._conn_cursor()
            self.insert(self._update_sql)
            self._close_cursor()

            # 只实例化了一次, 每次保存后, 需要改为0
            self.limit = 0
        except Exception, e:
            etllog.info('[delta_control.py] TDeltaConf [t_delta_conf]======>' + self._update_sql)
            etllog.error('[delta_control.py] TDeltaConf [_end_update_sql]' + str(e))
Beispiel #20
0
    def _set_tables_where_conditions(self, table):
        """
            获得表的where范围
            field & max_field
        """
        self._where_values = self._DETAL_CACHES.get(table)
        try:
            if self._where_values:

                max_value = MaxValue(self._where_values, table)

                self._where_dict = max_value._get_max_delta_field()

                return self._where_dict
            else:
                return {}
        except Exception, e:
            etllog.error('[delta_control.py] TDeltaConf [_set_tables_where_conditions] ' + str(e))
Beispiel #21
0
 def _delete(err_str):
     """
     调用删除
     :param err_str:
     :return:
     """
     _sql = None
     try:
         # 匹配duplicate data
         p = re.compile(r'(\(.*\))=(\(.*\))')
         m = re.search(p, err_str)
         _field, _values = m.group(1), m.group(2)
         _sql = self.delete_sql(_split_str(_field), _split_str(_values))
         self._roll_back()
         self.delete(_sql)
     except Exception, e:
         etllog.info(u'_delete异常数据===>' + _sql)
         etllog.error(
             '[load_datas.py] LoadDatas [_exception_clean] _delete ' +
             str(e))
Beispiel #22
0
    def access_log(self):
        # 取相应的数据
        def _rank_replace(line):
            new_line = line.replace('\r\n', '').split(',')
            clean_user_id = _user_id(new_line[3])
            if clean_user_id == 0:
                return
            else:
                return (
                    clean_user_id,
                    new_line[0],
                    5 if new_line[2] == 2 else _interaction_type(new_line[5]),
                    _object_id(new_line[5]),
                    _object_id_2(),
                    new_line[2] if new_line[2] else 0,
                    new_line[5],
                )

        lines = self._get_rows()
        try:
            # _datas = [_rank_replace(line) for line in lines]
            _datas = map(_rank_replace, lines)
            _datas_clean_none = [_ for _ in _datas if _ != None]
            self.lineno += len(_datas)
            if len(_datas) == 0:
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s. ignore UUID: %s =========='
                    % (self.lineno, _datas.count(None)))
                etllog.info(
                    '========== ETL-LOG-FILE: Read File END ==========')
                etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                              clean_datetime._today())
                return 0
            else:
                DATAS.put(_datas_clean_none)
                return DATAS.qsize()
        except Exception, e:
            etllog.error(
                '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
                str(e))
Beispiel #23
0
    def _update_max_delta_values(self, table, _is_data):
        """
            @:table 要update的表
            @:_is_data 用来确定是否更新表;
                        若_is_data没有值,不更新直接返回
        """
        if not _is_data:
            self.limit = 0
            etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table)
            return

        if not self._where_values:
            self.limit = 0
            return

        try:
            # {'a': 1, 'max_a': 2, 'b': 2, 'max_b': 3, 'c': 3, 'max_c': 4}
            # ===>
            # {'a': 2, 'b': 3, 'c': 4}

            _info = {_: self._where_dict.get('max_'+_) for _ in self._where_dict.keys() if 'max_' not in _}

            for key, value in _info.items():
                if isinstance(value, int) or isinstance(value, long):
                    # number格式加1
                    value += 1
                else:
                    # 时间格式加一秒
                    value = str(clean_datetime._time_plus_second(value))
                _info[key] = value

            # 当前表任务完成, 增量字段
            etllog.info('==========[ETL] UPDATE TABLES=%s DELTA VALUE ==========' % table)

            self._end_update_sql(table, _info)

        except Exception, e:
            etllog.error('[delta_control.py] TDeltaConf [_update_max_delta_values]' + str(e))
Beispiel #24
0
                    str(e))

        try:
            while 1:
                if len(self.back_datas) != 0:
                    self._datas = [self.back_datas.pop()]
                else:
                    break
                try:
                    _put()
                except Exception, e:
                    # 删除数据,重新插入
                    err_str = str(e)  # 需要匹配的异常数据
                    _ins_sql = self._sql  # 需要再次插入的数据

                    if 'duplicate key' in err_str:
                        _delete(err_str)
                        _insert(_ins_sql)
                    else:
                        etllog.info(u'异常数据===>' + _ins_sql)

                    # 重新put后面的数据
                    self._roll_back()
                    self._datas = self.back_datas[:]
                    self._roll_back()
                    self.bulk_put(exflag=1)
                    break
        except Exception, e:
            etllog.error('[load_datas.py] LoadDatas [_exception_clean]' +
                         str(e))
Beispiel #25
0
 def get_config_info(self, section, option):
     try:
         return self.config.get(section, option)
     except Exception, e:
         etllog.error('[delta_control.py] RwConfigFile get_config_info' +
                      str(e))
Beispiel #26
0
 def get_info(self):
     _msg = "(\"%s\" \"%s\")\n" % (self.type, self.message)
     etllog.error(_msg)
     return _msg