def processing(self):
    if self.abafile and self.syncfile and self.segfile and self.poifile:
        self.ppdata = pre_processing(self.abafile, self.syncfile, self.segfile,
                                     self.poifile, None)
        self.savefileButton.setVisible(True)
        self.pprocessButton.setVisible(False)
        self.abaEdit.setText("ABA file")
        self.syncEdit.setText("SYNC file")
        self.segEdit.setText("SEG file")
        self.poiEdit.setText("POI file")
    else:
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Critical)
        msg.setText("File Error")
        msg.setInformativeText('Please load all the required files...')
        msg.setWindowTitle("File missing!")
        msg.exec_()
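# --- Sketch, not from the original file: processing() is written as a Qt slot,
# so the widget's __init__ presumably connects it to the pre-process button
# along the lines below (the widget names come from the method above; the
# actual wiring is an assumption):
#
# self.pprocessButton.clicked.connect(self.processing)
# self.savefileButton.setVisible(False)  # revealed once pre-processing succeeds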
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as nets

import data_processing

CHECK_POINT_PATH = 'check_point/train_model.ckpt-2'
NUM_KAGGLE_TEST = 12500
BATCH_SIZE = 50

batch_test_set = data_processing.pre_processing(data_set='test', batch_size=BATCH_SIZE)

# Kaggle submission file: one (id, label) row per test image.
prediction_file = open('kaggle_result_file.csv', 'wb')
prediction_file_object = csv.writer(prediction_file)
prediction_file_object.writerow(['id', 'label'])

with tf.Graph().as_default():
    images = tf.placeholder(tf.float32, [BATCH_SIZE, 224, 224, 3])
    keep_prob = tf.placeholder(tf.float32)
    logits, _ = nets.vgg.vgg_19(inputs=images, num_classes=2,
                                dropout_keep_prob=keep_prob, is_training=False)
    variables_to_restore = slim.get_variables_to_restore()
    restorer = tf.train.Saver(variables_to_restore)
    # Class probabilities for each image in the batch.
    pros = tf.nn.softmax(logits)
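    # --- Sketch, not from the original file: the snippet stops right after the
    # softmax op, so the restore-and-predict loop below is an assumed
    # continuation. It presumes batch_test_set yields image batches in
    # submission order (an assumed batching interface) and that softmax
    # column 1 is the positive class.
    with tf.Session() as sess:
        restorer.restore(sess, CHECK_POINT_PATH)
        for i in range(NUM_KAGGLE_TEST // BATCH_SIZE):
            batch_images = batch_test_set[i]  # assumed batching interface
            probs = sess.run(pros, feed_dict={images: batch_images,
                                              keep_prob: 1.0})
            for j, prob in enumerate(probs):
                # Kaggle ids for this data set are 1-based.
                prediction_file_object.writerow([i * BATCH_SIZE + j + 1, prob[1]])

prediction_file.close()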
from data_processing import pre_processing, _window_logs, get_session_attributes
from parsing import parse_log_file
from detection_rules import (
    has_robots_txt_request,
    has_bot_name_in_user_agent,
    has_high_number_requests,
    has_low_request_interrarival_time,
)
import numpy as np
import datetime as dt

# Processing the input
print("Starting app...")
parsed_logs = parse_log_file("./access.log", from_date=dt.date(2014, 2, 20))
requests = pre_processing(parsed_logs)

# Generating features
hourly_windowed_logs = _window_logs(requests, window_time_frame="hour")
daily_windowed_logs = _window_logs(requests, window_time_frame="day")
hour_sessions = get_session_attributes(requests, aggregation_level="hour")
day_sessions = get_session_attributes(requests, aggregation_level="day")

# Examine generated features to flag potential bots
robots_txt = has_robots_txt_request(requests)
bot_names = has_bot_name_in_user_agent(requests)
low_inter_request_time_hour = has_low_request_interrarival_time(
    hour_sessions, threshold_number_requests_no_referrer=100)
low_inter_request_time_day = has_low_request_interrarival_time(
    day_sessions, threshold_number_requests_no_referrer=1000)
# The source is truncated inside the next call; the keyword name and threshold
# below are assumed for illustration, not taken from the original.
high_number_requests_hour = has_high_number_requests(
    hour_sessions, threshold_number_requests=100)
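# --- Sketch, not from the original script: summarize how many entries each
# rule flagged. This assumes every rule returns a boolean array or Series (an
# assumption about detection_rules' return types); the outputs aggregate at
# different levels, so they are reported per rule rather than OR-ed together.
for rule_name, flags in [
        ("robots.txt requested", robots_txt),
        ("bot name in user agent", bot_names),
        ("low inter-arrival time (hour)", low_inter_request_time_hour),
        ("low inter-arrival time (day)", low_inter_request_time_day),
        ("high request count (hour)", high_number_requests_hour),
]:
    print(rule_name, "->", np.count_nonzero(flags), "flagged")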
import os
import os.path
import time

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets as nets

import data_processing

TRAIN_LOG_DIR = os.path.join('Log/train/',
                             time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
TRAIN_CHECK_POINT = 'check_point/train_model.ckpt'
VALIDATION_LOG_DIR = 'Log/validation/'
VGG_19_MODEL_DIR = 'check_point/vgg_19.ckpt'
BATCH_SIZE = 32
EPOCH = 3

if not tf.gfile.Exists(TRAIN_LOG_DIR):
    tf.gfile.MakeDirs(TRAIN_LOG_DIR)
if not tf.gfile.Exists(VALIDATION_LOG_DIR):
    tf.gfile.MakeDirs(VALIDATION_LOG_DIR)

batch_train_set, batch_validation_set, images_num = data_processing.pre_processing(
    data_set='train', batch_size=BATCH_SIZE)


def get_accuracy(logits, labels):
    # Fraction of samples whose arg-max prediction matches the one-hot label.
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy


with tf.Graph().as_default():
    images = tf.placeholder(tf.float32, [BATCH_SIZE, 224, 224, 3])
    labels = tf.placeholder(tf.float32, [BATCH_SIZE, len(data_processing.IMG_CLASSES)])
    keep_prob = tf.placeholder(tf.float32)
    with slim.arg_scope(nets.vgg.vgg_arg_scope()):
        logits, _ = nets.vgg.vgg_19(inputs=images, num_classes=2,
                                    dropout_keep_prob=keep_prob, is_training=True)
    # Restore all VGG-19 weights except the final classifier (fc8), which is
    # re-sized for the 2-class task.
    variables_to_restore = slim.get_variables_to_restore(exclude=['vgg_19/fc8'])
    restorer = tf.train.Saver(variables_to_restore)
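    # --- Sketch, not from the original file: the snippet stops at the Saver,
    # so the training setup below is an assumed continuation in the usual
    # TF1/slim style; the loss choice, optimizer and learning rate are guesses.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
    accuracy = get_accuracy(logits, labels)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Warm-start from the ImageNet VGG-19 weights (fc8 excluded above).
        restorer.restore(sess, VGG_19_MODEL_DIR)
        # ... a per-epoch loop over batch_train_set / batch_validation_set with
        # periodic saving to TRAIN_CHECK_POINT would follow; those details are
        # not recoverable from the snippet.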
def anomaly_detection(self, pprocessed_file, seg_file, features='RMS',
                      sliding_window=1000, sub_sampling=128, impurity=0.05,
                      num_trees=100):
    self.processed_file = pprocessed_file
    # Generate the pre-processed file first in case it is not available yet.
    if not os.path.isfile(self.processed_file):
        pre_processing(self.data_file, self.sync_file, self.seg_file,
                       self.poi_file, self.processed_file)

    # Read the geo-coordinates (POI) file.
    geo_list = []
    with open(self.poi_file) as csv_file:
        csv_reader = csv.reader(csv_file)
        line_count = 0
        for row in csv_reader:
            tempStr = ''.join(row)
            if tempStr.startswith('#') or len(tempStr) == 0:
                continue
            elif tempStr.startswith('CNT'):
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                line_count += 1
                tlist = tempStr.split(";")
                ttlist = [float(x) for x in tlist if len(x) > 0]
                geo_list.append(ttlist)
    print(f'Processed {line_count} lines in POI file.')

    geo_list = np.array(geo_list)
    lat = geo_list[:, 1]
    lon = geo_list[:, 2]
    # Interpolate between external counters and geo-coordinates.
    get_lat = interp1d(geo_list[:, 0], lat, fill_value='extrapolate')
    get_long = interp1d(geo_list[:, 0], lon, fill_value='extrapolate')

    # Read the pre-processed data into a dataframe.
    processed_data = pd.read_hdf(self.processed_file, 'processed', mode='r')
    # CHC1 = np.array(processed_data.CHC1)
    # CHC3 = np.array(processed_data.CHC3)
    # CHD1 = np.array(processed_data.CHD1)
    # CHD3 = np.array(processed_data.CHD3)
    EDIR = np.array(processed_data.ERS_DIR)
    CHA1 = np.array(processed_data.CHA1)
    CHA3 = np.array(processed_data.CHA3)
    CHB1 = np.array(processed_data.CHB1)
    CHB3 = np.array(processed_data.CHB3)
    # Read internal and external counters.
    int_count = np.array(processed_data.INTCNT)
    ext_count = np.array(processed_data.EXTCNT)
    # date_time = syncdat.DateTime

    # Split the ABA data of one rail side into pushing and pulling mode.
    pull_data_cha1 = CHA1[EDIR == 1]
    push_data_cha1 = CHA1[EDIR == -1]
    pull_data_cha3 = CHA3[EDIR == 1]
    push_data_cha3 = CHA3[EDIR == -1]
    pull_int_count = int_count[EDIR == 1]
    push_int_count = int_count[EDIR == -1]
    pull_ext_count = ext_count[EDIR == 1]
    push_ext_count = ext_count[EDIR == -1]
    if len(push_ext_count) == 0:
        push_ext_count = pull_ext_count

    # Combined magnitude of the X and Z axes.
    pull_data = np.sqrt(np.power(pull_data_cha1, 2) + np.power(pull_data_cha3, 2))
    push_data = np.sqrt(np.power(push_data_cha1, 2) + np.power(push_data_cha3, 2))
    cha_data = np.sqrt(np.power(CHA1, 2) + np.power(CHA3, 2))

    # Pulling and pushing data for the other side of the rail.
    pull_data_chb1 = CHB1[EDIR == 1]
    push_data_chb1 = CHB1[EDIR == -1]
    pull_data_chb3 = CHB3[EDIR == 1]
    push_data_chb3 = CHB3[EDIR == -1]
    # Combined magnitude of the X and Z axes.
    pull_data2 = np.sqrt(np.power(pull_data_chb1, 2) + np.power(pull_data_chb3, 2))
    push_data2 = np.sqrt(np.power(push_data_chb1, 2) + np.power(push_data_chb3, 2))

    ########################################
    rail_data = []
    rail_counters = []
    rail_xcounters = []
    data_list = []
    counters_list = []
    xcounters_list = []
    data_list.append(pull_data)
    # Check if data exist for push mode.
    if len(push_data) != 0:
        data_list.append(push_data)
    counters_list.append(pull_int_count)
    counters_list.append(push_int_count)
    xcounters_list.append(pull_ext_count)
    if len(push_ext_count) != 0:
        xcounters_list.append(push_ext_count)

    ########################################
    data_list2 = []
    data_list2.append(pull_data2)
    # Check if data exist for push mode.
    if len(push_data2) != 0:
        data_list2.append(push_data2)

    rail_data.append(data_list)
    rail_data.append(data_list2)
    rail_counters.append(counters_list)
    # The same counter lists apply to both channels (CHA and CHB).
    rail_counters.append(counters_list)
    rail_xcounters.append(xcounters_list)
    rail_xcounters.append(xcounters_list)

    # Interpolate between internal and external counters.
    get_xcount = interp1d(int_count, ext_count, fill_value='extrapolate')
    get_icount = interp1d(ext_count, int_count, fill_value='extrapolate')

    # ///////////// Feature Extraction //////////////
    aba_data_side = []
    all_xcount_mode = []
    anom_xcount_mode = []
    anom_score_mode = []
    for i in range(2):
        aba_data_mode = []
        int_count_mode = []
        anom_xcount_list = []
        anom_score_list = []
        input_data = rail_data[i]
        for j in range(len(data_list)):
            in_data = input_data[j]
            if len(in_data) == 0:
                continue
            counters = counters_list[j]
            # Inputs are ABA data, counters and the sliding window size.
            list_of_features = extract_features(in_data, counters, sliding_window)
            rms = np.array(list_of_features[:, 0])
            kurtosis = np.array(list_of_features[:, 2])
            skewness = np.array(list_of_features[:, 3])
            peak_to_peak = np.array(list_of_features[:, 4])
            crest_factor = np.array(list_of_features[:, 5])
            impulse_factor = np.array(list_of_features[:, 6])
            rmsf = np.array(list_of_features[:, 12])
            int_count = np.array(list_of_features[:, 13])

            # Features comparison plots (kept for reference):
            # plt.figure(2)
            # plt.subplot(211)
            # plt.ylabel('ABA')
            # plt.plot(list(range(0, 401000)), in_data[:401000])
            # plt.subplot(212)
            # plt.ylabel('RMS')
            # plt.plot(list(range(1000, 401000, 2000)), rms[:200], '*')
            # plt.subplot(413)
            # plt.ylabel('Kurtosis')
            # plt.plot(list(range(1000, 401000, 2000)), kurtosis[:200], '*')
            # plt.subplot(212)
            # plt.ylabel('Peak to peak')
            # plt.xlabel('Data Samples')
            # plt.plot(list(range(1000, 401000, 2000)), peak_to_peak[:200], '*')
            # plt.show()

            # plt.figure(2)
            # plt.subplot(211)
            # plt.ylabel('ABA')
            # plt.plot(list(range(390000, 400000)), in_data[390000:400000])
            # plt.subplot(212)
            # plt.ylabel('RMS')
            # plt.xlabel('Samples')
            # plt.plot(list(range(391000, 401000, 2000)), rms[195:200], 'r*')
            # plt.xlim(390000, 400000)
            # plt.show()

            # Select the feature columns for the isolation forest ('All' stacks
            # every feature; unknown names fall back to kurtosis + peak-to-peak).
            if features == 'RMS':
                mylist = np.stack((rms, rms), axis=-1)
            elif features == 'Kurtosis':
                mylist = np.stack((kurtosis, kurtosis), axis=-1)
            elif features == 'Crest factor':
                mylist = np.stack((crest_factor, crest_factor), axis=-1)
            elif features == 'Impulse factor':
                mylist = np.stack((impulse_factor, impulse_factor), axis=-1)
            elif features == 'Skewness':
                mylist = np.stack((skewness, skewness), axis=-1)
            elif features == 'Peak-to-peak':
                mylist = np.stack((peak_to_peak, peak_to_peak), axis=-1)
            elif features == 'All':
                mylist = np.stack((rms, kurtosis, peak_to_peak, crest_factor,
                                   impulse_factor, skewness), axis=-1)
            else:
                mylist = np.stack((kurtosis, peak_to_peak), axis=-1)

            (norm_train, anom_train, norm_test, anom_test, anom_icount,
             anom_icount_train, anom_score) = isolation_forest(
                mylist, int_count, sub_sampling, impurity, num_trees)
            print("Return from Anomaly detection")
            all_xcount_mode.append(get_xcount(int_count))
            anom_xcount_test = get_xcount(anom_icount)
            anom_xcount_train = get_xcount(anom_icount_train)
            anom_xcount = np.concatenate((anom_xcount_train, anom_xcount_test), axis=0)
            anom_xcount_list.append(anom_xcount)
            anom_score_list.append(anom_score)

            # Validation of anomalies: map counters to geo-coordinates.
            latitude = get_lat(anom_xcount)
            longitude = get_long(anom_xcount)
            dist = [0]
            for z in range(len(latitude) - 1):
                point_one = (latitude[z], longitude[z])
                point_two = (latitude[z + 1], longitude[z + 1])
                # distance = geodesic(point_one, point_two).km
                # dist.append(1000 * distance)
                dist.append(longitude[z])
            # csv.writer stringifies values itself, so the counters stay numeric
            # and can be passed back to the interpolators below.
            write_data = zip(anom_xcount, latitude, longitude, dist)
            track_side = 'chb' if i else 'cha'
            train_mode = 'pushing' if j else 'pulling'
            out_file = os.path.join(self.counters_path,
                                    'Prorail17112805si12_' + track_side + '_' +
                                    train_mode + '.csv')
            with open(out_file, 'w', newline='') as file:
                writer = csv.writer(file, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(['counters', 'latitude', 'longitude', 'distance'])
                for cnt, lat_val, lon_val, dist_val in write_data:
                    writer.writerow([cnt, lat_val, lon_val, dist_val])

            #######################################################
            lat_list = get_lat(anom_xcount).tolist()
            long_list = get_long(anom_xcount).tolist()
            lat_list_train = get_lat(anom_xcount_train).tolist()
            long_list_train = get_long(anom_xcount_train).tolist()
            # gmap_plot(lat_list_train + lat_list, long_list_train + long_list)

        aba_data_side.append(aba_data_mode)
        anom_xcount_mode.append(anom_xcount_list)
        anom_score_mode.append(anom_score_list)

    # Compare anomalies in ABA on both channels (CHA and CHB) and return the
    # matched anomaly positions.
    anomaly_positions = match_anomaly(rail_data, rail_xcounters, anom_xcount_mode, seg_file)
    # ///////////////////////////////////////////
    if len(anomaly_positions) > 2:
        anom_pos_cha = np.round(anomaly_positions[0] + anomaly_positions[2], 2)
        anom_xcount_cha = np.round(np.concatenate((anom_xcount_mode[0][0],
                                                   anom_xcount_mode[0][1]), axis=0), 2)
        anom_score_cha = np.round(np.concatenate((anom_score_mode[0][0],
                                                  anom_score_mode[0][1]), axis=0), 3)
    else:
        anom_pos_cha = np.round(anomaly_positions[0], 2)
        anom_xcount_cha = np.round(anom_xcount_mode[0][0], 2)
        anom_score_cha = np.round(anom_score_mode[0][0], 3)

    # Sort anomalies by track position.
    anom_pos_xcount = np.stack((anom_pos_cha, anom_xcount_cha, anom_score_cha), axis=-1)
    anom_pos_xcount_sorted = anom_pos_xcount[anom_pos_xcount[:, 0].argsort()]
    anom_pos_cha = list(anom_pos_xcount_sorted[:, 0])
    anom_xcount_cha = list(anom_pos_xcount_sorted[:, 1])
    anom_score_cha = list(anom_pos_xcount_sorted[:, 2])

    # Severity analysis (head-checks vs ABA anomaly severity), kept for reference:
    # dict = {'position': anom_pos_cha, 'counters': anom_xcount_cha, 'score': anom_score_cha}
    # df_anom_pos_score = pd.DataFrame(data=dict)
    # ectpath = r'D:\strukton_project\WP_180306\ECT\EC_data_2018_FC_FO_LR.csv'
    # headchecks = DefectSeverity(df_anom_pos_score, ectpath).get_trend()
    #
    # plotlist = []
    # depth = normalize(headchecks['depth'].tolist())
    # score = headchecks['score'].tolist()
    # plotlist.append(depth)
    # plotlist.append(score)
    # pltlist = [[plotlist[j][i] for j in range(len(plotlist))] for i in range(len(plotlist[0]))]
    # pltarr = np.array(pltlist)
    # sorted = pltarr[pltarr[:, 0].argsort()]
    # cracksize = sorted[:, 0]
    # anomscore = sorted[:, 1]
    #
    # write_data = zip(cracksize, anomscore)
    # track_side = 'cha_crack_anom'
    # with open(os.path.join(self.counters_path, 'Prorail18030614si12_' + track_side + '.csv'),
    #           'w', newline='') as file:
    #     writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #     writer.writerow(['crack_depth', 'anom_severity'])
    #     for crack, sev in write_data:
    #         writer.writerow([crack, sev])
    ##################################

    write_data = zip(anom_pos_cha, anom_xcount_cha, anom_score_cha)
    return anom_pos_xcount_sorted
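# --- Sketch, not from the original file: a hypothetical call of the method
# above. The class name and constructor signature are invented for
# illustration; only the anomaly_detection() parameters come from its
# definition.
#
# detector = AbaAnomalyDetector(aba_file, sync_file, seg_file, poi_file)
# anomalies = detector.anomaly_detection('processed.h5', seg_file,
#                                        features='All', sliding_window=1000,
#                                        sub_sampling=128, impurity=0.05,
#                                        num_trees=100)
# # Each returned row: [track position, external counter, anomaly score],
# # sorted by position.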