#!/usr/bin/env python
"""Demo script: run the LogCluster parser on the HDFS_2k.log sample.

All settings are module-level variables below; edit them to point the
parser at a different log file or log format.
"""
import sys
# Extend the import path before importing logparser (presumably the package
# lives in the parent directory -- confirm against the repository layout).
sys.path.append('../')
from logparser.LogCluster import *
from logparser.LogCluster import LogCluster

input_dir = '../logs/HDFS/'  # The input directory of log file
output_dir = 'LogCluster_result/'  # The output directory of parsing results
log_file = 'HDFS_2k.log'  # The input log file name
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
rsupport = 10  # The minimum threshold of relative support, 10 denotes 10%
regex = []  # Regular expression list for optional preprocessing (default: [])

# Pass `regex` explicitly so that editing the list above actually takes
# effect; previously the variable was defined but never handed to the parser.
parser = LogCluster.LogParser(input_dir, log_format, output_dir,
                              rex=regex, rsupport=rsupport)
parser.parse(log_file)
'log_format': '<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>', 'regex': [r'([\w-]+\.){2,}[\w-]+'], 'rsupport': 0.2, }, } bechmark_result = [] for dataset, setting in benchmark_settings.iteritems(): print('\n=== Evaluation on %s ===' % dataset) indir = os.path.join(input_dir, os.path.dirname(setting['log_file'])) log_file = os.path.basename(setting['log_file']) parser = LogCluster.LogParser(indir, setting['log_format'], output_dir, rex=setting['regex'], rsupport=setting['rsupport']) parser.parse(log_file) F1_measure, accuracy = evaluator.evaluate( groundtruth=os.path.join(indir, log_file + '_structured.csv'), parsedresult=os.path.join(output_dir, log_file + '_structured.csv')) bechmark_result.append([dataset, F1_measure, accuracy]) print('\n=== Overall evaluation results ===') df_result = pd.DataFrame(bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy']) df_result.set_index('Dataset', inplace=True) print(df_result) df_result.T.to_csv('LogCluster_bechmark_result.csv')