class NginxLogParser:
    """Nginx log parser driven by a regular expression with named groups.

    The constructor compiles ``regex_str`` and immediately scans the file
    ``nginx_log``, echoing every line together with a ``Bingo!`` / ``None!``
    marker that tells whether the pattern matched it.
    """

    def __init__(self, regex_str, nginx_log, schema_parser=None):
        """Compile the pattern and echo the match status of every log line.

        Args:
            regex_str: regular expression source; its named groups supply
                the column values used by ``do_parse``.
            nginx_log: path of the log file to scan.
            schema_parser: optional object exposing ``schema()``, which
                must return an iterable of dicts with a ``'name'`` key.
                ``do_parse`` needs it; it may also be assigned later via
                the ``schema_parser`` attribute.
        """
        self.__regex_str = regex_str
        self.__pattern = re.compile(regex_str)
        self.logger = Log()
        self.schema_parser = schema_parser
        with open(nginx_log) as f:
            # Iterate the file lazily instead of loading every line first.
            for logline in f:
                print(logline)
                print('\n')
                if self.__pattern.search(logline):
                    print('Bingo!')
                else:
                    print('None!')
                print('\n')

    def parse(self, line):
        """Search ``line`` with the compiled pattern.

        Returns:
            The match object when the pattern is found, otherwise ``None``.
        """
        result = self.__pattern.search(line)
        if result:
            print('Bingo!')
            return result
        print('None!')
        return None

    def do_parse(self, regex_search_result):
        """Join the matched groups in schema order into one output record.

        Args:
            regex_search_result: match object as returned by ``parse``.

        Returns:
            The column values joined with ``'\\001'``, or ``None`` when no
            schema parser is configured, a schema entry has no ``'name'``
            key, or a schema column has no corresponding named group.
        """
        if self.schema_parser is None:
            self.logger.fatal('schema_parser is not set')
            return None
        regex_dict = regex_search_result.groupdict()
        columns = []
        for column in self.schema_parser.schema():
            try:
                key = column['name']
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e,))
                return None
            # source_host is obtained from mapred, not from the log line.
            if key == 'source_host':
                continue
            # Drop the whole record as soon as one Hive column is missing
            # from the log line.
            if key not in regex_dict:
                self.logger.fatal('regex_dict has no key[%s]' % (key,))
                return None
            columns.append(regex_dict[key])
        return '\001'.join(columns)
class NginxLogParser:
    """Regex-based nginx log parser.

    On construction the pattern is compiled and the file ``nginx_log`` is
    scanned once; each line is echoed followed by ``Bingo!`` (pattern
    matched) or ``None!`` (no match).
    """

    def __init__(self, regex_str, nginx_log, schema_parser=None):
        """Compile ``regex_str`` and report match status for each log line.

        Args:
            regex_str: regular expression source whose named groups carry
                the column values consumed by ``do_parse``.
            nginx_log: path of the log file to scan.
            schema_parser: optional object with a ``schema()`` method that
                returns an iterable of dicts holding a ``'name'`` key. It
                is required by ``do_parse`` and can also be set afterwards
                through the ``schema_parser`` attribute.
        """
        self.__regex_str = regex_str
        self.__pattern = re.compile(regex_str)
        self.logger = Log()
        self.schema_parser = schema_parser
        with open(nginx_log) as f:
            # Stream the file line by line rather than reading it whole.
            for logline in f:
                print(logline)
                print('\n')
                if self.__pattern.search(logline):
                    print('Bingo!')
                else:
                    print('None!')
                print('\n')

    def parse(self, line):
        """Run the compiled pattern against ``line``.

        Returns:
            The match object on success, ``None`` when nothing matches.
        """
        result = self.__pattern.search(line)
        if result:
            print('Bingo!')
            return result
        print('None!')
        return None

    def do_parse(self, regex_search_result):
        """Build one ``'\\001'``-joined record from a regex match.

        Args:
            regex_search_result: match object produced by ``parse``.

        Returns:
            The joined column string, or ``None`` when no schema parser is
            configured, a schema entry lacks ``'name'``, or a schema column
            is not among the match's named groups.
        """
        if self.schema_parser is None:
            self.logger.fatal('schema_parser is not set')
            return None
        regex_dict = regex_search_result.groupdict()
        columns = []
        for column in self.schema_parser.schema():
            try:
                key = column['name']
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e,))
                return None
            # source_host comes from mapred rather than from the log.
            if key == 'source_host':
                continue
            # A single Hive column missing from the log discards the line.
            if key not in regex_dict:
                self.logger.fatal('regex_dict has no key[%s]' % (key,))
                return None
            columns.append(regex_dict[key])
        return '\001'.join(columns)
class Parser:
    """Basic log parser: extracts fields and joins them in schema order."""

    def __init__(self, log_parser, schema_parser):
        """Store the collaborators.

        Args:
            log_parser: callable (typically a class) invoked with no
                arguments; the resulting object must provide
                ``extract_fields(qs, result_dict)``.
            schema_parser: object providing ``schema()``, an iterable of
                dicts with a ``'name'`` key.
        """
        self.log_parser = log_parser()
        self.schema_parser = schema_parser
        self.logger = Log()

    def parse(self, result_dict, line):
        """Generic parse entry point.

        Args:
            result_dict: already-extracted fields; must contain the key
                ``'qs_dict'``, which is handed to the log parser.
            line: the raw log line, used only in log messages.

        Returns:
            The joined column string, or ``None`` when the log parser's
            ``extract_fields`` yields a falsy result.

        Raises:
            KeyError: if ``result_dict`` has no ``'qs_dict'`` entry.
            SystemExit: if the columns cannot be joined (the process is
                terminated via ``sys.exit()`` after logging).
        """
        qs = result_dict['qs_dict']
        # Fields that are private to each log type.
        result_dict = self.log_parser.extract_fields(qs, result_dict)
        if not result_dict:
            self.logger.fatal('get common fields failed, line[%s].' % (line,))
            return None
        ret = self.join_hive_columns(result_dict)
        if not ret:
            self.logger.fatal('parse line [%s] failed.' % (line,))
            sys.exit()
        return ret

    def join_hive_columns(self, result_dict):
        """Join the values of ``result_dict`` in schema column order.

        Returns:
            The values joined with ``'\\001'``, with line feeds and
            carriage returns replaced by ``%0A`` / ``%0D``; ``None`` when
            a schema entry lacks ``'name'``, a schema column is missing
            from ``result_dict``, or the values cannot be joined.
        """
        columns = []
        for column in self.schema_parser.schema():
            try:
                key = column['name']
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e,))
                return None
            # source_host is obtained from mapred, not from the log line.
            if key == 'source_host':
                continue
            # Drop the record as soon as one Hive column is missing from
            # result_dict.
            if key not in result_dict:
                self.logger.fatal('regex_dict has no key[%s]' % (key,))
                return None
            columns.append(result_dict[key])
        try:
            # \001 is Hive's default column delimiter.
            output_str = '\001'.join(columns)
            # Encode CR/LF in URL-encoded form so a record stays on one line.
            return output_str.replace('\n', '%0A').replace('\r', '%0D')
        except Exception as e:
            # Previously a bare ``except`` that also trapped SystemExit and
            # KeyboardInterrupt and hid the reason for the failure.
            self.logger.fatal('join hive columns failed:%s' % (e,))
            return None
class SchemaParser:
    """Hive schema parser.

    Extracts the table name and the ``(name, type)`` pairs of every column
    from the text of a ``CREATE TABLE IF NOT EXISTS ... COMMENT`` statement.
    """

    def __init__(self, schema_file):
        """Remember the schema file path; nothing is read until ``parse``.

        Args:
            schema_file: path of the file holding the HQL table definition.
        """
        # Raw string: the pattern value is unchanged, but the regex escapes
        # are no longer (invalid) string escapes.
        self.regex = r'.*CREATE TABLE IF NOT EXISTS (?P<table_name>\w+)\s*\(\s+(?P<columns>.*)\s*\)\s*COMMENT.*'
        self.schema_file = schema_file
        self.logger = Log()

    def parse(self):
        """Read ``schema_file`` and parse its contents.

        Returns:
            The list produced by ``do_parse``, or ``None`` when the file
            cannot be opened or read, or its contents cannot be parsed.
        """
        try:
            # open() raises on failure rather than returning a falsy value,
            # so the failure path has to be an exception handler.
            with open(self.schema_file, 'r') as fp:
                schema_data = fp.read()
        except IOError:
            self.logger.fatal("open schema file[%s] failed." % self.schema_file)
            return None
        return self.do_parse(schema_data)

    def do_parse(self, hive_schema):
        """Parse the HQL text ``hive_schema``.

        On a successful regex match this sets ``self.table_name`` and
        ``self.schemas``.

        Returns:
            A list of ``{'name': ..., 'type': ...}`` dicts, one per column,
            or ``None`` when the statement does not match the pattern or a
            column entry has fewer than two whitespace-separated tokens.
        """
        content = hive_schema.replace('\n', ' ')
        pattern = re.compile(self.regex)
        # Pull the table name plus every column name and type out of the
        # HQL table definition.
        result = pattern.search(content)
        if not result:
            self.logger.fatal("schema regex search is not match, please check the schema")
            return None
        schema_dict = result.groupdict()
        self.table_name = schema_dict['table_name']
        self.schemas = []
        # Mind the space: splitting on ', ' (comma + space) keeps types
        # such as map<string,string> in one piece.
        for column in schema_dict['columns'].split(', '):
            column_pairs = column.split()
            if len(column_pairs) < 2:
                self.logger.fatal('wrong format line:%s' % (column_pairs))
                return None
            self.schemas.append({'name': column_pairs[0], 'type': column_pairs[1]})
        return self.schemas

    def print_schema(self):
        """Print the table name followed by one line per parsed column."""
        print('table_name: %s' % self.table_name)
        for i, column in enumerate(self.schemas):
            print('column[%d]: %s, %s' % (i, column['name'], column['type']))

    def schema(self):
        """Return the column list built by the last successful ``do_parse``."""
        return self.schemas

    def table_name(self):
        """Return the parsed table name.

        NOTE(review): ``do_parse`` assigns the instance attribute
        ``table_name``, which shadows this method on the instance. After a
        successful parse the name must be read as ``parser.table_name``
        (attribute access); calling it would fail because it is a string.
        """
        return self.table_name