# --- Example 1 ---
class NginxLogParser:
    """Parse nginx access-log lines against a caller-supplied regex.

    On construction the entire log file is read and every line is matched
    against ``regex_str``, printing a diagnostic per line.  ``parse`` and
    ``do_parse`` expose per-line matching and Hive-row joining to callers.
    """

    def __init__(self, regex_str, nginx_log):
        """regex_str: pattern with named groups; nginx_log: path to the log file."""
        self.__regex_str = regex_str
        self.__pattern = re.compile(regex_str)
        self.logger = Log()
        with open(nginx_log) as f:
            loglines = f.readlines()
        # Diagnostic pass only: report whether each line matches the pattern.
        for logline in loglines:
            print(logline)
            print('\n')
            result = self.__pattern.search(logline)
            if result:
                regex_dict = result.groupdict()
                print('Bingo!')
                print('\n')
            else:
                print('None!')
                print('\n')

    def parse(self, line):
        """Return the regex match object for *line*, or None when it does not match.

        Bug fix: the original body began with ``line = logline``, which raised
        NameError because ``logline`` is undefined in this scope, making the
        method unusable.
        """
        result = self.__pattern.search(line)
        if result:
            print('Bingo!')
            return result
        print('None!')
        return None

    def do_parse(self, regex_search_result):
        """Join the match's named-group values into one Hive row string.

        regex_search_result: a match object as returned by ``parse``.
        Returns the column values joined by the ``\\001`` (Ctrl-A) separator,
        or None when any schema column is absent from the match.

        NOTE(review): relies on ``self.schema_parser``, which ``__init__``
        never assigns — the caller must set it; verify against call sites.
        """
        regex_dict = regex_search_result.groupdict()
        schema = self.schema_parser.schema()
        columns = []
        for column in schema:
            try:
                key = column['name']
                # source_host comes from mapred, not from the log line.
                if key == 'source_host':
                    continue

                if key in regex_dict:
                    columns.append(regex_dict[key])
                # Drop the whole row if any Hive column is missing from the log.
                else:
                    self.logger.fatal('regex_dict has no key[%s]' % (key))
                    return None
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e))
                return None

        # Hive's default column separator is \001.
        output_str = '\001'.join(columns)
        return output_str
# --- Example 2 (duplicate of example 1) ---
class NginxLogParser:
    """Parse nginx access-log lines against a caller-supplied regex.

    Construction reads the whole log file and prints a match diagnostic
    for every line; ``parse`` and ``do_parse`` expose per-line matching
    and Hive-row joining to callers.
    """

    def __init__(self, regex_str, nginx_log):
        """regex_str: pattern with named groups; nginx_log: path to the log file."""
        self.__regex_str = regex_str
        self.__pattern = re.compile(regex_str)
        self.logger = Log()
        with open(nginx_log) as f:
            loglines = f.readlines()
        # Diagnostic pass only: report whether each line matches the pattern.
        for logline in loglines:
            print(logline)
            print('\n')
            result = self.__pattern.search(logline)
            if result:
                regex_dict = result.groupdict()
                print('Bingo!')
                print('\n')
            else:
                print('None!')
                print('\n')

    def parse(self, line):
        """Return the regex match object for *line*, or None when it does not match.

        Bug fix: the original body began with ``line = logline``, which raised
        NameError because ``logline`` is undefined in this scope.
        """
        result = self.__pattern.search(line)
        if result:
            print('Bingo!')
            return result
        print('None!')
        return None

    def do_parse(self, regex_search_result):
        """Join the match's named-group values into one Hive row string.

        regex_search_result: a match object as returned by ``parse``.
        Returns the column values joined by the ``\\001`` (Ctrl-A) separator,
        or None when any schema column is absent from the match.

        NOTE(review): relies on ``self.schema_parser``, which ``__init__``
        never assigns — the caller must set it; verify against call sites.
        """
        regex_dict = regex_search_result.groupdict()
        schema = self.schema_parser.schema()
        columns = []
        for column in schema:
            try:
                key = column['name']
                # source_host comes from mapred, not from the log line.
                if key == 'source_host':
                    continue

                if key in regex_dict:
                    columns.append(regex_dict[key])
                # Drop the whole row if any Hive column is missing from the log.
                else:
                    self.logger.fatal('regex_dict has no key[%s]' % (key))
                    return None
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e))
                return None

        # Hive's default column separator is \001.
        output_str = '\001'.join(columns)
        return output_str
# --- Example 3 ---
class Parser:
    """Generic log-parsing pipeline.

    Delegates field extraction to a per-log parser and joins the extracted
    fields into one Hive row string according to the schema parser.
    """

    def __init__(self, log_parser, schema_parser):
        """log_parser: a class, instantiated here; schema_parser: an instance."""
        self.log_parser = log_parser()
        self.schema_parser = schema_parser
        self.logger = Log()

    def parse(self, result_dict, line):
        """Common parse entry point.

        result_dict: dict that must contain 'qs_dict'; line: the raw log
        line, used only in error messages.
        Returns the joined Hive row string, None when field extraction
        fails, and exits the process when column joining fails.
        """
        qs = result_dict['qs_dict']
        # Log-type-specific private fields.
        result_dict = self.log_parser.extract_fields(qs, result_dict)

        if not result_dict:
            self.logger.fatal('get common fields failed, line[%s].' % (line))
            return None

        ret = self.join_hive_columns(result_dict)
        if not ret:
            self.logger.fatal('parse line [%s] failed.' % (line))
            sys.exit()
        return ret

    def join_hive_columns(self, result_dict):
        """Join schema columns from *result_dict* into one Hive row.

        Returns values joined by the ``\\001`` (Ctrl-A) separator with
        CR/LF urlencoded, or None when any schema column is missing or
        the values cannot be joined.
        """
        schema = self.schema_parser.schema()
        columns = []
        for column in schema:
            try:
                key = column['name']
                # source_host comes from mapred, not from the log line.
                if key == 'source_host':
                    continue

                if key in result_dict:
                    columns.append(result_dict[key])
                # Drop the whole row if any Hive column is missing.
                else:
                    self.logger.fatal('regex_dict has no key[%s]' % (key))
                    return None
            except KeyError as e:
                self.logger.fatal('Exception in log parser:%s' % (e))
                return None
        # Hive's default column separator is \001.
        try:
            output_str = '\001'.join(columns)
            # Urlencode CR/LF so a row stays on a single line.
            return output_str.replace('\n', '%0A').replace('\r', '%0D')
        except Exception:  # e.g. non-string column values; was a bare except
            return None
# --- Example 4 ---
class SchemaParser:
    """Parse a Hive ``CREATE TABLE`` statement into a table name and a
    list of ``{'name': ..., 'type': ...}`` column dicts."""

    def __init__(self, schema_file):
        """schema_file: path to the HQL file holding the table definition."""
        # Named groups: table_name, columns (raw column-definition text).
        self.regex = r'.*CREATE TABLE IF NOT EXISTS (?P<table_name>\w+)\s*\(\s+(?P<columns>.*)\s*\)\s*COMMENT.*'
        self.schema_file = schema_file
        self.logger = Log()

    def parse(self):
        """Read the schema file and parse it; return the column list or None.

        Bug fix: the original leaked the file handle and tested the file
        object for truthiness — ``open()`` raises on failure rather than
        returning a falsy value, so its error branch was dead code.
        """
        try:
            with open(self.schema_file, 'r') as fp:
                schema_data = fp.read()
        except IOError as e:
            self.logger.fatal("open schema file[%s] failed: %s" % (self.schema_file, e))
            return None
        return self.do_parse(schema_data)

    def do_parse(self, hive_schema):
        """Extract the table name and (name, type) column pairs from the HQL.

        Returns the list of ``{'name', 'type'}`` dicts, or None when the
        input does not match the expected CREATE TABLE shape.
        """
        content = hive_schema.replace('\n', ' ')
        pattern = re.compile(self.regex)
        # Pull table name, column names and column types out of the HQL.
        result = pattern.search(content)
        if not result:
            self.logger.fatal("schema regex search is not match, please check the schema")
            return None

        schema_dict = result.groupdict()
        # Bug fix: the original assigned ``self.table_name = ...``, which
        # shadowed the table_name() method on the instance and made it
        # uncallable; store the value privately instead.
        self._table_name = schema_dict['table_name']
        columns = schema_dict['columns']

        self.schemas = []
        # Mind the space: 'map<string,string>' must not be split apart.
        for column in columns.split(', '):
            column_pairs = column.split()
            if len(column_pairs) < 2:
                self.logger.fatal('wrong format line:%s' % (column_pairs))
                return None
            self.schemas.append({'name': column_pairs[0], 'type': column_pairs[1]})

        return self.schemas

    def print_schema(self):
        """Print the table name and each column's name and type to stdout."""
        print('table_name:', self.table_name())
        for i, column in enumerate(self.schemas):
            print('column[%d]: %s, %s' % (i, column['name'], column['type']))

    def schema(self):
        """Return the parsed column list (valid only after do_parse)."""
        return self.schemas

    def table_name(self):
        """Return the parsed table name (valid only after do_parse)."""
        return self._table_name