# Tuning script: sweep IPLoM's cluster-goodness threshold (CT) over a small
# range and parse the same log file once per value; each run writes its
# structured output under `output_dir`.
folder_name = 'Tuning_CT'  # Mention the parameter that you are Tuning
# The input directory of log file
input_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/logs/Sample_logs/'
# The output directory of parsing results
output_dir = 'IPLoM_result/financial_transaction_results/Tuning_results/' + folder_name
log_file = 'fin-transaction_log_anonimized.log'  # The input log file name
# log_file = 'HDFS_2k.log'  # The input log file name
# log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
# log_format = '<Date> <Time> <Level> <Router> <Pid>: <Month> <Day> <UTCTime>: <Component>: <Content>'  # cisco_router log format
# BUG FIX: raw string — '\[' / '\]' are invalid escape sequences in a plain
# string literal (SyntaxWarning since Python 3.12); the backslashes must reach
# the regex engine intact to match the literal brackets.
log_format = r'<Date> <Time> <Level> <Module> \[<StatusAndPayThread>\] - <Content>'

# np.arange over floats: roughly CT = 0.30, 0.31, ..., 0.44; round(CT, 2)
# below cleans up float accumulation error before passing it on.
for CT in np.arange(0.3, .45, .01):
    maxEventLen = 200   # The maximal token number of log messages (default: 200)
    # CT = 0.35         # The cluster goodness threshold (default: 0.35)
    lowerBound = 0.25   # The lower bound distance (default: 0.25)
    upperBound = 0.9    # The upper bound distance (default: 0.9)
    regex = []          # Regular expression list for optional preprocessing (default: [])
    step2Support = 0
    parser = IPLoM.LogParser(log_format=log_format,
                             indir=input_dir,
                             outdir=output_dir,
                             maxEventLen=maxEventLen,
                             step2Support=step2Support,
                             CT=round(CT, 2),
                             lowerBound=lowerBound,
                             upperBound=upperBound,
                             rex=regex)
    parser.parse(log_file)
# Parse a single log file with IPLoM, picking the per-logtype configuration
# (format, regexes, thresholds) from the module-level `settings` table keyed
# by the -logtype argument.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-dir',
    default='/Users/haraldott/Development/thesis/detector/data/openstack/utah/raw/sorted_per_request/',
    type=str)
parser.add_argument('-file', default='18k_spr', type=str)
parser.add_argument('-logtype', type=str, default="OpenStack")
args = parser.parse_args()

input_dir = args.dir   # The input directory of log file
log_file = args.file   # The input log file name
output_dir = 'IPLoM_result/'  # The output directory of parsing results

try:
    log_format = settings[args.logtype]["log_format"]
    regex = settings[args.logtype]["regex"]
    lower_bound = settings[args.logtype]["lowerBound"]
    CT = settings[args.logtype]["CT"]
except KeyError:
    # BUG FIX: a missing dict key raises KeyError, not ValueError — the old
    # `except ValueError` could never fire, so an unknown -logtype crashed
    # without ever printing the hint below.
    print("log format does not exist")
    raise

parser = IPLoM.LogParser(log_format=log_format,
                         indir=input_dir,
                         outdir=output_dir,
                         rex=regex,
                         CT=CT,
                         lowerBound=lower_bound)
parser.parse(log_file)
'<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>', 'CT': 0.3, 'lowerBound': 0.25, 'regex': [r'([\w-]+\.){2,}[\w-]+'] } } bechmark_result = [] for dataset, setting in benchmark_settings.items(): print('\n=== Evaluation on %s ===' % dataset) indir = os.path.join(input_dir, os.path.dirname(setting['log_file'])) log_file = os.path.basename(setting['log_file']) parser = IPLoM.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, CT=setting['CT'], lowerBound=setting['lowerBound'], rex=setting['regex']) parser.parse(log_file) F1_measure, accuracy = evaluator.evaluate( groundtruth=os.path.join(indir, log_file + '_structured.csv'), parsedresult=os.path.join(output_dir, log_file + '_structured.csv')) bechmark_result.append([dataset, F1_measure, accuracy]) print('\n=== Overall evaluation results ===') df_result = pd.DataFrame(bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy']) df_result.set_index('Dataset', inplace=True) print(df_result) df_result.T.to_csv('IPLoM_bechmark_result.csv')
import collections
import os

import pandas as pd

from benchmark.IPLoM_benchmark import benchmark_settings
from logparser import IPLoM
from File_Info import get_file_size

# Time IPLoM parsing of the BGL dataset at each available input size and
# record one (size, seconds) pair per run, dumped to ./IPLoM_BGL.csv.
dataset = 'BGL'
output_dir = 'IPLoM_result/'

cfg = benchmark_settings[dataset]
log_file = os.path.basename(cfg['log_file'])  # NOTE(review): unused — parse() receives the per-size file below
input_dir = os.path.join('../logs/', os.path.dirname(cfg['log_file']))

timings = collections.defaultdict(list)
for size, sample_file in get_file_size(dataset).items():
    runner = IPLoM.LogParser(log_format=cfg['log_format'],
                             indir=input_dir,
                             outdir=output_dir,
                             CT=cfg['CT'],
                             lowerBound=cfg['lowerBound'],
                             rex=cfg['regex'],
                             keep_para=False)
    # parse() returns an elapsed duration — presumably a datetime.timedelta,
    # given the .total_seconds() call; TODO confirm against LogParser.
    elapsed = runner.parse(sample_file)
    timings['size'].append(size)
    timings['time'].append(elapsed.total_seconds())
    print(timings['time'])  # running list of elapsed seconds so far

pd.DataFrame(timings).to_csv('./IPLoM_%s.csv' % dataset)