def _read_files(self):
    """Yield raw lines from every parseable local file (generator).

    If no files match ``self.time_args`` the process is terminated via
    ``sys_exit1``; otherwise each file is opened in binary mode and its
    lines are yielded one by one.

    Fixes: idiomatic emptiness test instead of ``len(...) == 0``,
    ``except ... as`` syntax (valid on Py2.6+/Py3), and the duplicated
    banner string is built once.
    """
    try:
        _files = self._get_local_file(self.time_args)
        if not _files:
            etllog.warning(u'没有可以解析的文件,程序退出!')
            from dwetl.petl.p_decorator import sys_exit1
            sys_exit1()
        for onefile in _files:
            # Log the banner to both the file logger and the std logger.
            banner = '========== ETL-LOG-FILE: Read File %s ==========' % onefile
            etllog.lfinfo(banner)
            etllog.info(banner)
            with open(onefile, 'rb') as onef:
                for line in onef:
                    yield line
    except Exception as e:
        etllog.error('[consume_log.py] ReadFile [_read_files]' + str(e))
def _get(self):
    """Download remote log files over SFTP into the matching local dirs.

    Aborts (``sys_exit1``) when files for the current period already
    exist locally — re-running would cause primary-key errors in the DB.

    Fixes: the SFTP transport is now closed once per server (the
    original left earlier connections open when ``self.server`` had
    multiple entries, since ``_conn_server`` reassigns ``self.pt`` /
    ``self.sftp``); the local target path is computed once per file.
    """
    try:
        if len(self._get_local_file(self.time_args)) != 0:
            # Files for this period already present locally: batch was
            # executed before, bail out to avoid duplicate loads.
            etllog.error(' 文件已经执行过, 请在本地删除在执行! [* 不删除DB中相应的数据会有主键错误!]')
            from dwetl.petl.p_decorator import sys_exit1
            sys_exit1()
        for _server_info in self.server:  # one connection per server
            self._conn_server(_server_info)
            host = json.dumps(_server_info.get('host'))
            etllog.info('========== ETL-LOG-FILE:Conn Server %s ==========' % (host))
            etllog.lfinfo('========== ETL-LOG-FILE:Server %s ==========' % (host))
            for f in self._get_remote_file(self.time_args):
                # Map each remote directory to its configured local
                # directory (one-to-one by index).
                fname = f.split('/')[-1]
                fpath = '/'.join(f.split('/')[:-1]) + '/'
                index = self._remote_path.index(fpath)
                local_file = self._local_path[index] + fname
                self.sftp.get(f, local_file)
                banner = ('========== ETL-LOG-FILE: Get File remote %s local %s=========='
                          % (f, local_file))
                etllog.lfinfo(banner)
                etllog.info(banner)
            # Close this server's transport before connecting the next.
            self._close()
    except Exception as e:
        etllog.error('[sftp_get_file.py] SftpGFile [_get]' + str(e))
def _mail_send(self, FROM_ADDR=None, PASSWORD=None, TO_ADDR=None,
               SMTP_SERVER=None, PORT=None, ATTACHMENT=None):
    """Build and send the mail over SMTP with STARTTLS.

    :param FROM_ADDR: sender address (also the SMTP login user)
    :param PASSWORD: sender password
    :param TO_ADDR: recipient address(es)
    :param SMTP_SERVER: SMTP host
    :param PORT: SMTP port
    :param ATTACHMENT: optional attachment passed to ``_mail_attachment``

    Fix: ``smtp.quit()`` now runs in a ``finally`` block, so the
    connection is closed even when ``starttls``/``login``/``sendmail``
    raises (the original leaked the connection on failure).
    """
    self._mail_header(FROM_ADDR, TO_ADDR)
    self._mail_contents()
    if ATTACHMENT:
        self._mail_attachment(ATTACHMENT)
    try:
        # Send the mail
        smtp = smtplib.SMTP()
        smtp.set_debuglevel(1)
        smtp.connect(SMTP_SERVER, PORT)
        try:
            smtp.starttls()
            smtp.login(FROM_ADDR, PASSWORD)
            smtp.sendmail(FROM_ADDR, TO_ADDR, self._mail.as_string())
        finally:
            smtp.quit()
    except Exception as e:
        etllog.error('smtpMail [_mail_send]' + str(e))
def _mail_send(self, FROM_ADDR=None, PASSWORD=None, TO_ADDR=None,
               SUBJECT=None, SMTP_SERVER=None, PORT=None, ATTACHMENT=None):
    """Build and send the mail (with subject and optional log attachment).

    :param FROM_ADDR: sender address (also the SMTP login user)
    :param PASSWORD: sender password
    :param TO_ADDR: recipient address(es)
    :param SUBJECT: mail subject line
    :param SMTP_SERVER: SMTP host
    :param PORT: SMTP port
    :param ATTACHMENT: optional attachment; when set, the last N log
        lines are read first and then attached

    Fix: ``smtp.quit()`` now runs in a ``finally`` block, so the
    connection is closed even when ``starttls``/``login``/``sendmail``
    raises (the original leaked the connection on failure).
    """
    self._mail_header(FROM_ADDR, TO_ADDR, SUBJECT)
    self._mail_contents()
    if ATTACHMENT:
        self._read_log_lastn()
        self._mail_attachment(ATTACHMENT)
    try:
        # Send the mail
        smtp = smtplib.SMTP()
        smtp.set_debuglevel(1)
        smtp.connect(SMTP_SERVER, PORT)
        try:
            smtp.starttls()
            smtp.login(FROM_ADDR, PASSWORD)
            smtp.sendmail(FROM_ADDR, TO_ADDR, self._mail.as_string())
        finally:
            smtp.quit()
    except Exception as e:
        etllog.error('smtpMail [_mail_send]' + str(e))
def _put(self):
    """Bulk-load the prepared data; terminate the process on failure."""
    try:
        self.load_datas.bulk_put()
    except Exception as exc:
        etllog.error('[etl_task.py] EtlTask [_put]' + str(exc))
        from dwetl.petl.p_decorator import sys_exit1
        sys_exit1()
def read_config_file(self):
    """Load the ini file at ``self.CONFIGFILE`` into ``self.config``."""
    try:
        self.config = ConfigParser()
        # Populate the fresh parser from disk.
        self.config.read(self.CONFIGFILE)
    except Exception as exc:
        etllog.error('[delta_control.py] RwConfigFile read_config_file' + str(exc))
def set_config_context(self, section, option, value):
    """Set ``option = value`` under ``section`` and persist to disk.

    :param section: ini section name
    :param option: option key within the section
    :param value: value to store

    Fix: the config file is written through a ``with`` block so the
    handle is flushed and closed even if ``write`` fails — the original
    ``self.config.write(open(self.CONFIGFILE, 'w'))`` leaked the handle.
    """
    try:
        self.config.set(section, option, value)
        with open(self.CONFIGFILE, 'w') as cfg_file:
            self.config.write(cfg_file)
    except Exception as e:
        etllog.error('[delta_control.py] RwConfigFile set_config_context' + str(e))
def _get(self):
    """Run the bulk extract and return its rows; exit the process on error."""
    try:
        return self.extract_datas._bulk_get()
    except Exception as exc:
        etllog.error('[etl_task.py] EtlTask [_get]' + str(exc))
        from dwetl.petl.p_decorator import sys_exit1
        sys_exit1()
def _query_sql_limit(self):
    """Compose the SELECT statement: base SQL plus the table LIMIT clause."""
    try:
        limit_clause = t_delta_conf._set_tables_limit()
        self._sql = self._local_sql + limit_clause
    except Exception as exc:
        etllog.error('[extract_datas.py] DbDatas [_query_sql]' + str(exc))
def ppserver(self):
    # Kick off one EtlWork per group, then print the pp job-server stats.
    # NOTE(review): each iteration creates EtlWork with the WHOLE group
    # list (not the current group ``g``) and discards the result --
    # looks suspicious; confirm intent with the author before changing.
    try:
        for g in self.groups: task_groups = EtlWork(self.groups, )
        print self.job_server.print_stats()
    except Exception, e:
        etllog.error(str(e))
        pass
def _update_max_delta_values(self, table, _is_data): """ @:table 要update的表 @:_is_data 用来确定是否更新表; 若_is_data没有值,不更新直接返回 """ # todo 制表 if not _is_data: etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table) return try: _delta_field = clean_etl_conf._get_delta_field(table) # where conditions _SQL = '''SELECT ''' _SQL += ','.join(["MAX(%s)" % _ for _ in _delta_field]) _SQL += " FROM %s WHERE " % table _where = '''1=1 ''' for key, value in self._where_values.items(): if isinstance(value, int) or isinstance(value, long): _where += "and {0} >= {1} ".format(key, value) else: _where += " and {0} >= to_date('{1}', 'yyyy-mm-dd HH:MI:SS')".format( key, value) # max sql _SQL = _SQL + _where # new dict self._conn_cursor() self.query(_SQL) _info = dict(zip(_delta_field, self.fetchallrows()[0])) self._close_cursor() # modify for key, value in _info.items( ): #{_: _info[_] for _ in self._where_values.keys()}.items(): if isinstance(value, int) or isinstance(value, long): # number格式加1 value += 1 else: # 时间格式加一秒 value = clean_datetime._time_plus_second(value) _info[key] = value # update self._DETAL_CACHES.update( {table: {_: _info[_] for _ in self._where_values.keys()}}) except Exception, e: etllog.error( '[delta_control.py] TDeltaConf [_update_max_delta_values]' + str(e))
def _datas_str(self):
    """Render the row buffer as "(...),(...),..." for a multi-row INSERT."""
    mogrify = self.mogrify_post
    try:
        # _formatns() is invoked per row, exactly as before.
        pieces = [mogrify(self._formatns(), row) for row in self._datas]
        self._str_datas = ','.join(pieces)
    except Exception as exc:
        etllog.error('[load_datas.py] LoadDatas [_datas_str]' + str(exc))
def _get_datas(self):
    """Execute the prepared SQL and return all fetched rows.

    :return: the fetched row list, or ``None`` when the query failed

    Fix: the original computed ``res`` but never returned it, so the
    method always returned ``None`` (compare the sibling extract method
    that does return its result); the rows are now returned.
    """
    try:
        self.query(self._sql)
        res = self.fetchallrows()
    except Exception as e:
        etllog.error('[extract_datas.py] DbDatas [_get_datas]' + str(e))
        res = None
    return res
def _detal_caches(self):
    """Cache every table's delta fields in the ``_DETAL_CACHES`` attribute."""
    try:
        self.query(self._query_sql)
        caches = {}
        # Row layout: (..., table_name, delta_fields, ...)
        for row in self.fetchallrows():
            caches[row[1]] = row[2]
        self._DETAL_CACHES = caches
        self._close_cursor()
    except Exception as exc:
        etllog.error('[delta_control.py] TDeltaConf [_detal_caches] ' + str(exc))
def _conn_server(self, _server_info):
    """Open an SFTP session to the given server.

    :param _server_info: dict with ``host``, ``port``, ``username``,
        ``password`` keys
    """
    try:
        endpoint = (_server_info['host'], _server_info['port'])
        self.pt = paramiko.Transport(endpoint)
        self.pt.connect(username=_server_info['username'],
                        password=_server_info['password'])
        self.sftp = paramiko.SFTPClient.from_transport(self.pt)
    except Exception as exc:
        etllog.error('[common_file.py] FileUBase [_get]' + str(exc))
def __init__(self, options):
    """Wire up the extract and load sides for one table or file.

    :param options: a table name or a file identifier; when it names a
        file, it is resolved to its target table before building the
        loader
    """
    self.table = options
    try:
        self.extract_datas = ExtractDatas(options)._create_instance()
        # Decide whether *options* is a file or a table; when it is a
        # file, fetch the table it loads into.
        if options in clean_file_server._set_file_or_table():
            options = clean_file_server._set_file_or_table(options)
        self.load_datas = LoadDatas(options)
    except Exception, e:
        etllog.error('[etl_task.py] EtlTask [__init__]' + str(e))
def _insert(sql):
    """Roll back the current transaction, then execute the INSERT.

    :param sql: full INSERT statement to run
    :return: None
    """
    # NOTE(review): nested closure -- relies on ``self`` from the
    # enclosing method scope.
    try:
        self._roll_back()
        self.insert(sql)
    except Exception, e:
        etllog.info(u'_insert异常数据===>' + sql)
        etllog.error(
            '[load_datas.py] LoadDatas [_exception_clean] _insert ' +
            str(e))
def location_log(self):
    # Parse location-log lines and queue the cleaned tuples; returns the
    # queue size, or 0 when there was nothing to read.

    def _match_line(line):
        # Extract the JSON array embedded between "[...]" on the line.
        p = re.compile(r'\[.*\]')
        # p.search(line).group()
        new_line = json.loads(p.search(line).group())
        return new_line

    def _rank_replace(line):
        # Normalize the user id; drop the record (return None) when it
        # cleans to 0.
        clean_user_id = _user_id(line['userId'])
        if clean_user_id == 0:
            return
        else:
            return (
                clean_user_id,
                line['time'],
                line['latitude'],
                line['longitude'], )

    lines = self._get_rows()
    try:
        match_datas = map(_match_line, lines)
        # match_datas = [_match_line(line) for line in lines]
        # Flatten: each matched line is itself a list of detail dicts.
        _dict_list_datas = [
            _ for detail_datas in match_datas for _ in detail_datas
        ]
        # _datas_clean_none = [_rank_replace(_) for _ in _dict_list_datas]
        _datas_clean_none = map(_rank_replace, _dict_list_datas)
        # NOTE(review): unlike access_log, None entries produced by
        # _rank_replace are NOT filtered before queuing -- confirm that
        # downstream tolerates None rows.
        self.lineno += len(match_datas)
        if len(match_datas) == 0:
            # Nothing read: emit end-of-file markers and stop.
            etllog.lfinfo(
                '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s, TOTAL Data ==========' % (self.lineno, ))
            etllog.info(
                '========== ETL-LOG-FILE: Read File END ==========')
            etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                          clean_datetime._today())
            return 0
        else:
            DATAS.put(_datas_clean_none)
            return DATAS.qsize()
    except Exception, e:
        etllog.error(
            '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
            str(e))
def _end_update_sql(self, table, json_values):
    """Persist the new delta watermark for *table* into t_delta_conf.

    :param table: table whose t_delta_conf row is updated
    :param json_values: dict of delta field -> next start value, stored
        JSON-encoded in the ``primarykey`` column
    """
    try:
        self._update_sql += ''' UPDATE t_delta_conf SET primarykey = '%s', update_date = '%s' WHERE tables = '%s'; ''' % (json.dumps(json_values), clean_datetime._today(), table)
        self._conn_cursor()
        self.insert(self._update_sql)
        self._close_cursor()
        # The instance is created only once; reset the batch counter
        # after every save.
        self.limit = 0
    except Exception, e:
        etllog.info('[delta_control.py] TDeltaConf [t_delta_conf]======>' +
                    self._update_sql)
        etllog.error('[delta_control.py] TDeltaConf [_end_update_sql]' +
                     str(e))
def _set_tables_where_conditions(self, table):
    """Return the where-range dict (field & max_field) for *table*.

    :param table: table to look up in the delta cache
    :return: dict of current/max delta values, or {} when the table has
        no cached delta fields
    """
    self._where_values = self._DETAL_CACHES.get(table)
    try:
        # Guard clause: no cached delta fields -> empty range.
        if not self._where_values:
            return {}
        max_value = MaxValue(self._where_values, table)
        self._where_dict = max_value._get_max_delta_field()
        return self._where_dict
    except Exception as exc:
        etllog.error('[delta_control.py] TDeltaConf [_set_tables_where_conditions] ' + str(exc))
def _delete(err_str):
    """Parse the duplicate-key error text and delete the offending row.

    :param err_str: DB error message containing "(fields)=(values)"
    :return: None
    """
    # NOTE(review): nested closure -- relies on ``self`` and
    # ``_split_str`` from the enclosing method scope. If the regex does
    # not match, ``_sql`` stays None and the info log below would raise
    # a TypeError; confirm whether that path can occur.
    _sql = None
    try:
        # Match the duplicate data: "(col, ...)=(val, ...)"
        p = re.compile(r'(\(.*\))=(\(.*\))')
        m = re.search(p, err_str)
        _field, _values = m.group(1), m.group(2)
        _sql = self.delete_sql(_split_str(_field), _split_str(_values))
        self._roll_back()
        self.delete(_sql)
    except Exception, e:
        etllog.info(u'_delete异常数据===>' + _sql)
        etllog.error(
            '[load_datas.py] LoadDatas [_exception_clean] _delete ' +
            str(e))
def access_log(self):
    """Clean access-log lines and queue the resulting tuples.

    :return: the queue size after putting the cleaned batch, or 0 when
        there was nothing to read

    Fix: ``new_line[2] == 2`` compared a field produced by
    ``str.split`` (always a string) against the int ``2``, which is
    always False in Python 2, so the ``5`` branch was unreachable; it
    now compares against ``'2'``.
    """
    def _rank_replace(line):
        # Turn one CSV line into the load tuple; None when the user id
        # cleans to 0.
        new_line = line.replace('\r\n', '').split(',')
        clean_user_id = _user_id(new_line[3])
        if clean_user_id == 0:
            return
        return (
            clean_user_id,
            new_line[0],
            5 if new_line[2] == '2' else _interaction_type(new_line[5]),
            _object_id(new_line[5]),
            _object_id_2(),
            new_line[2] if new_line[2] else 0,
            new_line[5], )

    lines = self._get_rows()
    try:
        # _datas = [_rank_replace(line) for line in lines]
        _datas = map(_rank_replace, lines)
        _datas_clean_none = [_ for _ in _datas if _ is not None]
        self.lineno += len(_datas)
        if len(_datas) == 0:
            # Nothing read: emit end-of-file markers and stop.
            etllog.lfinfo(
                '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s. ignore UUID: %s ==========' % (self.lineno, _datas.count(None)))
            etllog.info(
                '========== ETL-LOG-FILE: Read File END ==========')
            etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                          clean_datetime._today())
            return 0
        else:
            DATAS.put(_datas_clean_none)
            return DATAS.qsize()
    except Exception as e:
        etllog.error(
            '[consume_log.py] ReadFile [_get_file_datas_to_queue]' +
            str(e))
def _update_max_delta_values(self, table, _is_data):
    """Advance and persist the delta watermark for *table*.

    :param table: table to update
    :param _is_data: truthy when new data was loaded; when falsy the
        batch counter is reset and nothing is persisted
    """
    if not _is_data:
        self.limit = 0
        etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table)
        return
    if not self._where_values:
        self.limit = 0
        return
    try:
        # {'a': 1, 'max_a': 2, 'b': 2, 'max_b': 3, 'c': 3, 'max_c': 4}
        # ===>
        # {'a': 2, 'b': 3, 'c': 4}
        _info = {_: self._where_dict.get('max_'+_) for _ in self._where_dict.keys() if 'max_' not in _}
        for key, value in _info.items():
            if isinstance(value, int) or isinstance(value, long):
                # numeric watermark: advance by 1
                value += 1
            else:
                # datetime watermark: advance by one second
                value = str(clean_datetime._time_plus_second(value))
            _info[key] = value
        # This table's run is done; persist the new delta values.
        etllog.info('==========[ETL] UPDATE TABLES=%s DELTA VALUE ==========' % table)
        self._end_update_sql(table, _info)
    except Exception, e:
        etllog.error('[delta_control.py] TDeltaConf [_update_max_delta_values]' + str(e))
str(e)) try: while 1: if len(self.back_datas) != 0: self._datas = [self.back_datas.pop()] else: break try: _put() except Exception, e: # 删除数据,重新插入 err_str = str(e) # 需要匹配的异常数据 _ins_sql = self._sql # 需要再次插入的数据 if 'duplicate key' in err_str: _delete(err_str) _insert(_ins_sql) else: etllog.info(u'异常数据===>' + _ins_sql) # 重新put后面的数据 self._roll_back() self._datas = self.back_datas[:] self._roll_back() self.bulk_put(exflag=1) break except Exception, e: etllog.error('[load_datas.py] LoadDatas [_exception_clean]' + str(e))
def get_config_info(self, section, option):
    """Read and return one option value from the loaded config."""
    try:
        value = self.config.get(section, option)
        return value
    except Exception as exc:
        etllog.error('[delta_control.py] RwConfigFile get_config_info' + str(exc))
def get_info(self):
    """Format the stored (type, message) pair, log it as an error, return it."""
    msg = "(\"%s\" \"%s\")\n" % (self.type, self.message)
    etllog.error(msg)
    return msg