def _read_files(self):
    """Yield raw lines from every parseable local file.

    Exits the process (via sys_exit1) when no file matches
    ``self.time_args``; otherwise streams each matched file
    line by line as a generator.
    """
    try:
        _files = self._get_local_file(self.time_args)
        if not _files:  # idiomatic emptiness test instead of len() == 0
            etllog.warning(u'没有可以解析的文件,程序退出!')
            # Lazy import: only needed on the abort path.
            from dwetl.petl.p_decorator import sys_exit1
            sys_exit1()
        for onefile in _files:
            etllog.lfinfo(
                '========== ETL-LOG-FILE: Read File %s ==========' % onefile)
            etllog.info(
                '========== ETL-LOG-FILE: Read File %s ==========' % onefile)
            # Binary mode: downstream parsers do their own decoding.
            with open(onefile, 'rb') as onef:
                for line in onef:
                    yield line
    except Exception as e:
        etllog.error('[consume_log.py] ReadFile [_read_files]' + str(e))
def _get(self):
    """Download remote log files over SFTP into the local paths.

    Aborts (sys_exit1) when matching local files already exist,
    because re-running would insert duplicate rows and violate the
    DB primary key.
    """
    try:
        if len(self._get_local_file(self.time_args)) != 0:
            # A matching local file means this run was already executed.
            etllog.error(' 文件已经执行过, 请在本地删除在执行! [* 不删除DB中相应的数据会有主键错误!]')
            from dwetl.petl.p_decorator import sys_exit1
            sys_exit1()
        for _server_info in self.server:
            # Multiple servers: connect to each in turn.
            self._conn_server(_server_info)
            etllog.info(
                '========== ETL-LOG-FILE:Conn Server %s ==========' %
                (json.dumps(_server_info.get('host'))))
            etllog.lfinfo('========== ETL-LOG-FILE:Server %s ==========' %
                          (json.dumps(_server_info.get('host'))))
            for f in self._get_remote_file(self.time_args):
                # Multiple directory/file pairs: each remote directory
                # maps one-to-one onto a local directory via the shared
                # index of self._remote_path / self._local_path.
                fname = f.split('/')[-1]
                fpath = '/'.join(f.split('/')[:-1]) + '/'
                index = self._remote_path.index(fpath)
                self.sftp.get(f, self._local_path[index] + fname)
                etllog.lfinfo(
                    '========== ETL-LOG-FILE: Get File remote %s local %s==========' % (f, self._local_path[index] + fname))
                etllog.info(
                    '========== ETL-LOG-FILE: Get File remote %s local %s==========' % (f, self._local_path[index] + fname))
            # Close the connection for this server before the next one.
            self._close()
    except Exception, e:
        etllog.error('[sftp_get_file.py] SftpGFile [_get]' + str(e))
def _start_table(self, table):
    """Record *table* in the log context and launch its ETL task.

    @:table  name of the table being processed
    """
    etllog.info('========== CURRENT TABLE %s ==========' % table)
    self._log_message['TABLE_OR_VIEW'] = table
    EtlTask(table).start()
def _update_max_delta_values(self, table, _is_data):
    """Refresh the cached delta (incremental) bounds for *table*.

    @:table    table to probe for new MAX() delta values
    @:_is_data falsy means no new rows were loaded this run, so the
               cache is left untouched and we return immediately
    """
    # todo 制表
    if not _is_data:
        etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table)
        return
    try:
        _delta_field = clean_etl_conf._get_delta_field(table)
        # NOTE(security): the SQL is built by string interpolation.
        # Table and delta-field names come from internal config here,
        # but parameterized queries would be safer if that changes.
        _SQL = '''SELECT '''
        _SQL += ','.join(["MAX(%s)" % _ for _ in _delta_field])
        _SQL += " FROM %s WHERE " % table
        _where = '''1=1 '''
        for key, value in self._where_values.items():
            if isinstance(value, (int, long)):
                _where += "and {0} >= {1} ".format(key, value)
            else:
                _where += " and {0} >= to_date('{1}', 'yyyy-mm-dd HH:MI:SS')".format(
                    key, value)
        _SQL += _where
        # Run the MAX() probe and map result columns back to field names.
        self._conn_cursor()
        self.query(_SQL)
        _info = dict(zip(_delta_field, self.fetchallrows()[0]))
        self._close_cursor()
        # Advance each bound past the observed maximum so the next run
        # only picks up strictly newer rows.
        for key, value in _info.items():
            if isinstance(value, (int, long)):
                value += 1  # numeric delta: step by one
            else:
                value = clean_datetime._time_plus_second(value)  # +1 second
            _info[key] = value
        # Cache only the fields that actually appear in the WHERE bounds.
        self._DETAL_CACHES.update(
            {table: {_: _info[_] for _ in self._where_values.keys()}})
    except Exception as e:
        etllog.error(
            '[delta_control.py] TDeltaConf [_update_max_delta_values]' +
            str(e))
def main_file():
    """Run the file-based ETL job end to end, logging elapsed time."""
    started = time.time()
    etllog.info('========== [ETL] START TIME: ==========')
    etllog.info('========== [ETL] START TASK ==========')
    worker = EtlWork('access_log')
    worker.start()
    etllog.info('========== [ETL] FINISH TASK ==========')
    etllog.info('========== [ETL] END TIME: ==========')
    elapsed = time.time() - started
    etllog.info('========== [ETL] TIME ELAPSED %s s==========' % elapsed)
def _insert(sql):
    """
    Retry a single INSERT after rolling back the failed transaction.
    :param sql: the INSERT statement to re-execute
    :return: None
    """
    try:
        self._roll_back()
        self.insert(sql)
    except Exception, e:
        # Log the offending statement before the error itself.
        etllog.info(u'_insert异常数据===>' + sql)
        etllog.error(
            '[load_datas.py] LoadDatas [_exception_clean] _insert ' +
            str(e))
def location_log(self):
    """Parse location-log rows into tuples and enqueue them on DATAS.

    Returns the queue size after enqueueing, or 0 when the current
    batch of rows is empty (end of input).
    """
    def _match_line(line):
        # Each physical line embeds a JSON array of location dicts
        # in [...]; extract and decode it.
        p = re.compile(r'\[.*\]')
        new_line = json.loads(p.search(line).group())
        return new_line

    def _rank_replace(line):
        # Rows whose user id cleans to 0 are dropped (returns None).
        clean_user_id = _user_id(line['userId'])
        if clean_user_id == 0:
            return
        else:
            return (
                clean_user_id,
                line['time'],
                line['latitude'],
                line['longitude'],
            )

    lines = self._get_rows()
    try:
        match_datas = map(_match_line, lines)
        # Flatten: one physical line may carry several location dicts.
        _dict_list_datas = [
            _ for detail_datas in match_datas for _ in detail_datas
        ]
        # BUGFIX: drop the None entries produced by _rank_replace before
        # enqueueing, matching the sibling access_log() cleaner;
        # previously Nones were pushed into DATAS despite the
        # variable's "clean_none" name.
        _datas_clean_none = [
            _ for _ in map(_rank_replace, _dict_list_datas)
            if _ is not None
        ]
        self.lineno += len(match_datas)
        if len(match_datas) == 0:
            etllog.lfinfo(
                '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s, TOTAL Data ==========' % (self.lineno, ))
            etllog.info(
                '========== ETL-LOG-FILE: Read File END ==========')
            etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                          clean_datetime._today())
            return 0
        else:
            DATAS.put(_datas_clean_none)
            return DATAS.qsize()
    except Exception as e:
        etllog.error(
            '[consume_log.py] ReadFile [_get_file_datas_to_queue]' + str(e))
def _end_update_sql(self, table, json_values):
    """
    @:table  table whose delta bookkeeping row is updated.
    Persists the incremental (delta) fields for *table* into
    t_delta_conf, then resets the accumulated batch counter.
    """
    try:
        # NOTE(review): json_values is interpolated straight into the
        # SQL string; a single quote inside the JSON would break the
        # statement — confirm inputs are controlled.
        self._update_sql += '''
        UPDATE t_delta_conf SET primarykey = '%s', update_date = '%s' WHERE tables = '%s';
        ''' % (json.dumps(json_values), clean_datetime._today(), table)
        self._conn_cursor()
        self.insert(self._update_sql)
        self._close_cursor()
        # Only one instance exists; reset the counter after each save.
        self.limit = 0
    except Exception, e:
        etllog.info('[delta_control.py] TDeltaConf [t_delta_conf]======>' +
                    self._update_sql)
        etllog.error('[delta_control.py] TDeltaConf [_end_update_sql]' +
                     str(e))
def _delete(err_str):
    """
    Delete the duplicate row described by a 'duplicate key' error,
    after rolling back the failed transaction.
    :param err_str: DB error text containing "(cols)=(vals)"
    :return: None
    """
    _sql = None
    try:
        # Pull "(field, ...)=(value, ...)" out of the duplicate-key
        # error message.
        p = re.compile(r'(\(.*\))=(\(.*\))')
        m = re.search(p, err_str)
        _field, _values = m.group(1), m.group(2)
        _sql = self.delete_sql(_split_str(_field), _split_str(_values))
        self._roll_back()
        self.delete(_sql)
    except Exception as e:
        # BUGFIX: _sql is still None when the regex/match step fails;
        # the old `u'...' + _sql` raised TypeError inside the handler
        # and masked the original error. %-formatting handles None.
        etllog.info(u'_delete异常数据===>%s' % _sql)
        etllog.error(
            '[load_datas.py] LoadDatas [_exception_clean] _delete ' +
            str(e))
def access_log(self):
    # Parse access-log rows into tuples and enqueue them on DATAS;
    # returns the queue size, or 0 when the current batch is empty.
    def _rank_replace(line):
        # CSV-like line: strip CRLF, split on commas.
        new_line = line.replace('\r\n', '').split(',')
        clean_user_id = _user_id(new_line[3])
        if clean_user_id == 0:
            # Invalid/anonymous user: drop the row (returns None).
            return
        else:
            # NOTE(review): split() yields strings, so new_line[2] == 2
            # (str vs int) can never be true; presumably '2' was meant —
            # TODO confirm against the log format before changing.
            return (
                clean_user_id,
                new_line[0],
                5 if new_line[2] == 2 else _interaction_type(new_line[5]),
                _object_id(new_line[5]),
                _object_id_2(),
                new_line[2] if new_line[2] else 0,
                new_line[5],
            )

    lines = self._get_rows()
    try:
        # _datas = [_rank_replace(line) for line in lines]
        _datas = map(_rank_replace, lines)
        # Drop the None placeholders left by rejected rows.
        _datas_clean_none = [_ for _ in _datas if _ != None]
        self.lineno += len(_datas)
        if len(_datas) == 0:
            # NOTE(review): this branch only runs when _datas is empty,
            # so _datas.count(None) is always 0 here.
            etllog.lfinfo(
                '========== ETL-LOG-FILE: TOTAL LINENUMBER: %s. ignore UUID: %s ==========' % (self.lineno, _datas.count(None)))
            etllog.info(
                '========== ETL-LOG-FILE: Read File END ==========')
            etllog.lfinfo('========== ETL-LOG-FILE END %s ==========' %
                          clean_datetime._today())
            return 0
        else:
            DATAS.put(_datas_clean_none)
            return DATAS.qsize()
    except Exception, e:
        etllog.error(
            '[consume_log.py] ReadFile [_get_file_datas_to_queue]' + str(e))
def _update_max_delta_values(self, table, _is_data):
    """Persist the new delta bounds for *table* to t_delta_conf.

    @:table    table whose delta fields are being advanced
    @:_is_data falsy means no new data this run: reset the batch
               counter and skip the update entirely
    """
    if not _is_data:
        self.limit = 0
        etllog.info('[delta_control.py] [delta_control] %s 没有最新的数据' % table)
        return
    if not self._where_values:
        self.limit = 0
        return
    try:
        # self._where_dict mixes current and max values, e.g.
        # {'a': 1, 'max_a': 2, 'b': 2, 'max_b': 3, 'c': 3, 'max_c': 4}
        # ===> {'a': 2, 'b': 3, 'c': 4}
        # (substring test 'max_' not in _ kept as-is to preserve behavior)
        _info = {_: self._where_dict.get('max_' + _)
                 for _ in self._where_dict.keys() if 'max_' not in _}
        # Step each bound past its maximum so the next run reads
        # strictly newer rows only.
        for key, value in _info.items():
            if isinstance(value, (int, long)):
                value += 1  # numeric delta: +1
            else:
                # datetime-like delta: +1 second, stored as string
                value = str(clean_datetime._time_plus_second(value))
            _info[key] = value
        # This table's task is done; persist its delta fields.
        etllog.info('==========[ETL] UPDATE TABLES=%s DELTA VALUE ==========' % table)
        self._end_update_sql(table, _info)
    except Exception as e:
        etllog.error('[delta_control.py] TDeltaConf [_update_max_delta_values]' + str(e))
def _refresh_view():
    """Refresh the user_age_group materialized view, logging timings."""
    t0 = time.time()
    etllog.info('========== [ETL] REFRESH MATERIALIZED VIEW START TIME: %s ==========' % clean_datetime._today())
    view_refresher = RefreshMaterializedView('user_age_group')
    view_refresher._refresh_start()
    etllog.info('========== [ETL] REFRESH MATERIALIZED VIEW END TIME: %s ==========' % clean_datetime._today())
    etllog.info('========== [ETL] TIME ELAPSED %s s ==========' % (time.time() - t0))
def main_db(table):
    """Run the DB-based ETL job for *table*, logging each phase."""
    t0 = time.time()
    etllog.info('========== [ETL] START TIME: %s ==========' % clean_datetime._today())
    etllog.info('========== [ETL] READ DETAL TABLE ==========')
    # Load the cached delta bounds before any table work starts.
    t_delta_conf._detal_caches()
    etllog.info('========== [ETL] START TASK ==========')
    EtlWork(table).start()
    etllog.info('========== [ETL] FINISH TASK ==========')
    etllog.info('==========[ETL] SAVE DETAL TABLE==========')
    etllog.info('========== [ETL] END TIME: %s ==========' % clean_datetime._today())
    etllog.info('========== [ETL] TIME ELAPSED %s s==========' % (time.time() - t0))
def _start_file(self, file):
    """Record *file* in the log context and launch its ETL task."""
    # NOTE(review): these two log lines report self.options rather than
    # the file argument (unlike _start_table, which logs its argument) —
    # confirm this is intentional.
    etllog.info('========== CURRENT FILE %s ==========' % self.options)
    etllog.lfinfo('========== CURRENT FILE %s ==========' % self.options)
    self._log_message['TABLE_OR_VIEW'] = '[FILE]' + file
    EtlTask(file).start()
str(e)) try: while 1: if len(self.back_datas) != 0: self._datas = [self.back_datas.pop()] else: break try: _put() except Exception, e: # 删除数据,重新插入 err_str = str(e) # 需要匹配的异常数据 _ins_sql = self._sql # 需要再次插入的数据 if 'duplicate key' in err_str: _delete(err_str) _insert(_ins_sql) else: etllog.info(u'异常数据===>' + _ins_sql) # 重新put后面的数据 self._roll_back() self._datas = self.back_datas[:] self._roll_back() self.bulk_put(exflag=1) break except Exception, e: etllog.error('[load_datas.py] LoadDatas [_exception_clean]' + str(e))