def main():
    args = get_args()
    centroids = {}
    df_breakpoints = pd.read_csv(args.src_breakpoint)
    for column in args.columns:
        log('parsing column %s ...' % (column, ))
        # gather the column's values from every source file
        data_points = np.array([])
        for src in args.srcs:
            log('parsing %s ...' % (src, ))
            df = pd.read_csv(src, usecols=[column])
            data_points = np.concatenate((data_points, df[column]), axis=0)
        # seed k-means with centroids derived from the precomputed breakpoints
        init_centroids = breakpoints_to_centroids(df_breakpoints[column].values)
        kmeans = KMeans(
            init=np.array(init_centroids).reshape((-1, 1)),
            n_clusters=args.symbol_size,
            random_state=0,
            n_jobs=1,
            verbose=0,
            max_iter=500
        ).fit(np.array(data_points).reshape((-1, 1)))
        centroids[column] = np.array(kmeans.cluster_centers_).reshape(-1)
    df_centroids = pd.DataFrame(centroids)
    dest_dir = prepare_directory(args.dest_dir)
    df_centroids.to_csv(
        os.path.join(dest_dir, 'centroid-{0}.csv'.format(args.symbol_size)),
        header=True,
        index=False
    )
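# `breakpoints_to_centroids` is not defined in this excerpt. One reading
# consistent with its use above (N - 1 breakpoints must yield N = symbol_size
# initial centroids) is: take the midpoint of each bounded region and
# extrapolate the two open-ended regions by their neighbors' widths. The name
# and exact rule here are assumptions about the repo's actual helper, and the
# sketch assumes at least two breakpoints.
def breakpoints_to_centroids(breakpoints):
    inner = (breakpoints[:-1] + breakpoints[1:]) / 2.0  # midpoints of bounded regions
    first = breakpoints[0] - (breakpoints[1] - breakpoints[0]) / 2.0
    last = breakpoints[-1] + (breakpoints[-1] - breakpoints[-2]) / 2.0
    return np.concatenate(([first], inner, [last]))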
def main():
    dest_dir = prepare_directory(
        os.path.join('../../build/data', SCOPE, 'initialized'))
    for dataset in DATASETS_TO_PARSE:
        filenames = glob.glob(os.path.join(SRC_DIR, dataset, '*.csv'))
        for filename in filenames:
            log('parsing ' + filename + '...')
            dfChunks = pd.read_csv(filename, header=None,
                                   names=INPUT_CSV_COLUMNS,
                                   chunksize=CHUNK_SIZE)
            for i, dfChunk in enumerate(dfChunks, start=1):
                header = i == 1               # only the first chunk writes the header
                mode = 'a' if i > 1 else 'w'  # first chunk (over)writes, the rest append
                log('parsing chunk %d' % i)
                dfChunk['datetime'] = pd.to_datetime(dfChunk['timestamp'], unit='ms')
                # apply per-channel scaling factors to the raw readings
                dfChunk['u'] = dfChunk['u'] * 0.1
                dfChunk['v'] = dfChunk['v'] * 0.1
                dfChunk['w'] = dfChunk['w'] * 0.1
                dfChunk['x'] = dfChunk['x'] * 0.004
                dfChunk['y'] = dfChunk['y'] * 0.004
                dfChunk['z'] = dfChunk['z'] * 0.004
                dfChunk.to_csv(
                    os.path.join(dest_dir,
                                 '%s-%s' % (dataset, os.path.basename(filename))),
                    mode=mode, header=header, index=False,
                    columns=OUTPUT_CSV_COLUMNS)
def visualize(model, sess, epoch, train_mse):
    dest_dir = prepare_directory(os.path.join(
        '../build/plots', args.scope, args.name
    ))
    if args.sample_size:
        x_axis = np.linspace(
            0, len(dataset_in_order['y']) - 1,
            num=args.sample_size, dtype=int
        )
        ground_truth = np.reshape(
            np.array(dataset_in_order['y'])[x_axis], (args.sample_size)
        )
        ps = model.prediction.eval(
            session=sess,
            feed_dict={
                model.xs: np.array(dataset_in_order['x'])[x_axis],
            }
        )
        predicted = np.reshape(ps, (args.sample_size))
    else:
        x_axis = np.arange(data_size)
        ground_truth = np.reshape(
            dataset_in_order['y'][0:data_size], (data_size)
        )
        ps = np.empty(shape=[0, 1])
        for batch_idx in range(0, batch_count):
            begin_idx = batch_idx * args.batch_size
            end_idx = min(begin_idx + args.batch_size, data_size)
            p = model.prediction.eval(
                session=sess,
                feed_dict={
                    model.xs: dataset_in_order['x'][begin_idx:end_idx],
                }
            )
            ps = np.concatenate((ps, p), axis=0)
            if (batch_idx + 1) % 5000 == 0:
                log('drawing %d' % (batch_idx + 1))
        predicted = np.reshape(ps, (data_size))
    plt.ylim(Y_LIMIT)
    plt.plot(x_axis, ground_truth, 'g.')
    plt.plot(x_axis, predicted, color='red', linestyle='--', linewidth=1)
    title = 'epoch-{0}\nmse = {1}'.format(epoch, train_mse)
    plt.title(title)
    plt.savefig(
        os.path.join(dest_dir, 'epoch-{0}.png'.format(epoch)),
        dpi=400, format='png'
    )
    plt.clf()
def visualize(mses):
    dest_dir = prepare_directory(
        os.path.join('../build/plots', args.scope, args.name,
                     os.path.basename(args.test_src).rsplit('.', 1)[0]))
    f, axarr = plt.subplots(2, sharex=True, figsize=(7, 5))
    axarr[0].set_title(args.title)
    axarr[0].set_ylabel('Vibration Signal (g)')
    axarr[1].set_ylabel('Reconstruction Error (MSE)')
    plt.xlabel('Bearing Life (390ms)')
    threshold = args.threshold
    # color each signal segment by whether its reconstruction error crosses the threshold
    anomaly_flags = mses >= threshold
    colors = ['red' if a else 'green' for a in anomaly_flags]
    linestyles = ['dotted' if a else 'solid' for a in anomaly_flags]
    lines = [((x0, y0), (x1, y1))
             for x0, y0, x1, y1 in zip(xs[:-1], ys[:-1], xs[1:], ys[1:])]
    colored_lines = LineCollection(lines, colors=colors,
                                   linestyles=linestyles, linewidths=(1, ))
    axarr[0].add_collection(colored_lines)
    axarr[0].autoscale_view()
    # empty plots exist only to provide legend entries for the two segment styles
    axarr[0].plot([], [], c='green', label='predicted normal')
    axarr[0].plot([], [], c='red', linestyle='dotted',
                  label='predicted anomalous')
    bound = xs[int(len(xs) * 0.9)]
    axarr[0].plot([bound, bound], [np.amin(ys), np.amax(ys)], color='blue',
                  linestyle='--', linewidth=1, label='actual anomalous')
    axarr[0].legend()
    axarr[1].plot(xs, mses, color='blue', label='reconstruction error')
    axarr[1].plot([xs[0], xs[-1]], [threshold, threshold], color='blue',
                  linestyle='--', linewidth=1, label='anomaly threshold')
    axarr[1].legend()
    plt.savefig(os.path.join(
        dest_dir,
        '{0}(seed={1}, smooth={2}).eps'.format(args.name, args.seed,
                                               args.smooth)),
        dpi=800, format='eps')
    plt.clf()
def main():
    args = get_args()
    filename = args.src
    first_datetime, last_datetime, _ = get_datetime(filename)
    # alarms only count if they fire at least `alarm_minutes` before the end of the run
    bound_datetime = last_datetime - pd.Timedelta(minutes=args.alarm_minutes)
    for column in args.columns:
        log('\n=====================')
        log('Feature "%s":' % column)
        thresholds = np.arange(args.thresholds[0], args.thresholds[1],
                               args.thresholds[2])
        rates = []
        for threshold in thresholds:
            df_chunks = pd.read_csv(filename, chunksize=args.chunk_size)
            n_total = 0
            n_above_threshold = 0
            for chunk_idx, df_chunk in enumerate(df_chunks):
                df_chunk['datetime'] = pd.to_datetime(
                    df_chunk['datetime'], infer_datetime_format=True)
                if args.abs:
                    df_chunk[column] = abs(df_chunk[column])
                df_before_time_bound = df_chunk[
                    df_chunk.datetime < bound_datetime]
                df_above_threshold = df_before_time_bound[
                    df_before_time_bound[column] > threshold]
                n_above_threshold += len(df_above_threshold)
                n_total += len(df_before_time_bound)
            rate = float(n_above_threshold) / n_total * 100
            rates.append(rate)
            log('threshold %f = %f%% (%d / %d)' %
                (threshold, rate, n_above_threshold, n_total))

        # visualization
        basename = os.path.basename(filename)
        dest_dir = prepare_directory(os.path.join(args.dest_dir, basename))
        plt.title('%s\n%d minutes alarm of feature "%s"' %
                  (basename, args.alarm_minutes, column))
        plt.xlabel('feature thresholds')
        plt.ylabel('rates above threshold (%)')
        plt.ylim([0, 100])
        plt.plot(thresholds, rates, 'bx-')
        plt.savefig(os.path.join(dest_dir, '%dmin-%s.png' %
                                 (args.alarm_minutes, column)),
                    dpi=400, format='png')
        plt.clf()
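# `get_datetime` is used by several scripts here but not included in this
# excerpt. A minimal sketch consistent with its call sites (it returns the
# first datetime, the last datetime, and the number of chunks in the file);
# the chunk size default and single-pass scan are assumptions, and the repo's
# actual implementation may differ:
def get_datetime(filename, chunk_size=100000):
    first_datetime = last_datetime = None
    chunk_count = 0
    for df_chunk in pd.read_csv(filename, chunksize=chunk_size):
        datetimes = pd.to_datetime(df_chunk['datetime'],
                                   infer_datetime_format=True)
        if first_datetime is None:
            first_datetime = datetimes.iloc[0]
        last_datetime = datetimes.iloc[-1]
        chunk_count += 1
    return first_datetime, last_datetime, chunk_count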
def main():
    datasets = os.listdir(SRC_DIR)
    for dataset in datasets:
        workingTypes = os.listdir(os.path.join(SRC_DIR, dataset))
        for workingType in workingTypes:
            log('parsing ' + dataset + '/' + workingType + '...')
            destDir = prepare_directory(os.path.join(DEST_DIR, dataset))
            # 6-way merge of the per-channel files, ordered by timestamp
            fdInputs = []
            fdOutput = open(
                os.path.join(destDir, '{0}.csv'.format(workingType)), 'w')
            lines = []
            filenames = ['x', 'y', 'z', 'u', 'v', 'w']
            for i in range(0, 6):
                fdInputs.append(None)
                lines.append(None)
                fdInputs[i] = open(
                    os.path.join(SRC_DIR, dataset, workingType,
                                 '{0}.csv'.format(filenames[i])))
                lines[i] = readline(fdInputs[i])
            count = 0
            while all(lines):
                count = count + 1
                if count % 100000 == 0:
                    log(count)
                updateIndices = None
                if all(line[0] == lines[0][0] for line in lines):
                    # all six channels share a timestamp: emit one merged row
                    updateIndices = list(range(0, 6))
                    fdOutput.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                        lines[0][0],
                        lines[0][1],
                        lines[1][1],
                        lines[2][1],
                        lines[3][1],
                        lines[4][1],
                        lines[5][1],
                    ))
                else:
                    # otherwise advance only the channels holding the smallest timestamp
                    timestamps = np.array(lines)[:, 0].astype(np.int64)
                    updateIndices = np.where(timestamps == timestamps.min())[0]
                for updateIndex in updateIndices:
                    lines[updateIndex] = readline(fdInputs[updateIndex])
            for i in range(0, 6):
                fdInputs[i].close()
            fdOutput.close()
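# `readline` is a repo helper not included in this excerpt. Its use above
# implies it returns one parsed CSV row (a [timestamp, value] list) per call
# and something falsy at end-of-file, so `while all(lines)` stops once any
# input is exhausted. A minimal sketch under those assumptions:
def readline(fd):
    line = fd.readline()
    if not line:
        return None  # falsy at EOF ends the merge loop
    return line.strip().split(',')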
def main():
    args = get_args()
    src_dir = os.path.join('../build/data', args.scope, 'initialized')
    dest_dir = prepare_directory(
        os.path.join('../build/data', args.scope, 'labeled'))
    filenames = glob.glob(os.path.join(src_dir, '*.csv'))
    threshold_step = args.thresholds[2]
    thresholds = np.arange(args.thresholds[0], args.thresholds[1],
                           threshold_step)
    df_breakpoints = pd.read_csv(args.src_breakpoint)
    for filename in filenames:
        log('parsing %s in scope %s' %
            (os.path.basename(filename), args.scope))
        first_datetime, last_datetime, chunk_count = get_datetime(filename)
        total_seconds = (last_datetime - first_datetime).total_seconds()
        df_chunks = pd.read_csv(filename, chunksize=args.chunk_size)
        for chunk_idx, df_chunk in enumerate(df_chunks):
            header = chunk_idx == 0
            mode = 'a' if chunk_idx > 0 else 'w'
            df_chunk['datetime'] = pd.to_datetime(df_chunk['datetime'],
                                                  infer_datetime_format=True)
            log('parsing chunk %d/%d' % (chunk_idx, chunk_count))
            # remaining useful life in seconds, and as a proportion of total life
            df_chunk['rul'] = (last_datetime - df_chunk['datetime']
                               ).astype('timedelta64[us]') / 1000000
            df_chunk['rulp'] = df_chunk['rul'] / total_seconds
            df_chunk['level_x'] = [
                bisect_left(thresholds, element)
                for element in df_chunk['x']
            ]
            df_chunk['level_y'] = [
                bisect_left(thresholds, element)
                for element in df_chunk['y']
            ]
            df_chunk['symbol_x'] = [
                bisect_left(df_breakpoints['x'], element)
                for element in df_chunk['x']
            ]
            df_chunk['symbol_y'] = [
                bisect_left(df_breakpoints['y'], element)
                for element in df_chunk['y']
            ]
            df_chunk.to_csv(os.path.join(dest_dir,
                                         os.path.basename(filename)),
                            mode=mode, header=header, index=False)
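# For reference, `bisect_left` maps a raw value to its level/symbol index by
# counting the breakpoints strictly below it; values equal to a breakpoint
# fall into the lower region. A quick illustration with three breakpoints
# (i.e. symbol_size = 4):
from bisect import bisect_left

print(bisect_left([-0.5, 0.0, 0.5], -0.7))  # 0
print(bisect_left([-0.5, 0.0, 0.5], -0.2))  # 1
print(bisect_left([-0.5, 0.0, 0.5], 0.3))   # 2
print(bisect_left([-0.5, 0.0, 0.5], 0.9))   # 3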
def visualize_dataset(model, sess, epoch, dataset_name):
    dest_dir = prepare_directory(os.path.join(
        '../build/plots', args.scope, args.name
    ))
    if args.sample_size:
        # round the sample size down to a multiple of the batch size
        args.sample_size = args.sample_size - (args.sample_size % args.batch_size)
        x_axis = np.linspace(
            0, len(dataset[dataset_name]) - 1,
            num=args.sample_size, dtype=int
        )
        ground_truth = dataset[dataset_name][x_axis, 0]
        assignments, ps = sess.run(
            [model.expanded_assignments, model.prediction],
            feed_dict={
                model.xs: dataset[dataset_name][x_axis],
                model.ys: dataset[dataset_name][x_axis],
                model.feed_previous: True,
            })
        # ps = model.prediction.eval(
        #     session=sess,
        #     feed_dict={
        #         model.xs: dataset[dataset_name][x_axis],
        #         model.ys: dataset[dataset_name][x_axis],
        #         model.feed_previous: True,
        #     }
        # )
        predicted = np.array(ps)[:, 0]
        assigned = np.array(assignments)[:, 0]
    plt.ylim(Y_LIMIT)
    plt.scatter(x_axis, assigned, color='green', marker='x', s=12)
    plt.scatter(x_axis, predicted, color='blue', s=10, linewidth=0)
    plt.plot(x_axis, abs(predicted - assigned), color='red',
             linestyle='--', linewidth=1)
    mse = eval_mse(model, sess, dataset_name)
    title = 'epoch-{0}\n{1} mse = {2}'.format(epoch, dataset_name, mse)
    plt.title(title)
    plt.savefig(
        os.path.join(dest_dir,
                     'epoch-{0}-{1}.png'.format(epoch, dataset_name)),
        dpi=400, format='png'
    )
    plt.clf()
    return mse
def main():
    args = get_args()
    df_breakpoints = pd.read_csv(args.src_breakpoint)
    columns = df_breakpoints.columns.values
    for src, dest in zip(args.srcs, args.dests):
        log('parsing %s ...' % (src, ))
        prepare_directory(os.path.dirname(dest))
        df_chunks = pd.read_csv(src, chunksize=args.chunk_size)
        for chunk_idx, df_chunk in enumerate(df_chunks):
            print(chunk_idx)  # progress
            for column in columns:
                df_chunk['level_' + column] = [
                    bisect_left(df_breakpoints[column], element)
                    for element in df_chunk[column]
                ]
            header = chunk_idx == 0
            mode = 'a' if chunk_idx > 0 else 'w'
            df_chunk.to_csv(dest, mode=mode, header=header, index=False)
def visualize(model, sess):
    dest_dir = prepare_directory(
        os.path.join('../build/plots', args.scope, args.name,
                     os.path.basename(args.test_src).rsplit('.', 1)[0]))
    plt.figure(figsize=(6, 4))
    plt.ylim(args.ylim)
    plt.ylabel('Health Indicator (%)')
    plt.xlabel('Data Entry')
    title = 'HI Prediction Result'
    x_axis = np.linspace(0, len(dataset_in_order['y']) - 1,
                         num=args.sample_size, dtype=int)
    ground_truth = np.reshape(
        np.array(dataset_in_order['y'])[x_axis], (args.sample_size))
    ps = model.prediction.eval(session=sess, feed_dict={
        model.xs: np.array(dataset_in_order['x'])[x_axis],
    })
    predicted = np.reshape(ps, (args.sample_size))
    if args.smooth:
        predicted = smooth(predicted, args.smooth)
    plt.plot(x_axis, ground_truth * 100, color='green', linewidth=2,
             label='real HI')
    plt.plot(x_axis, predicted * 100, color='blue', linestyle='--',
             linewidth=2, label='predicted HI')
    plt.legend()
    plt.title(title)
    plt.savefig(os.path.join(
        dest_dir,
        'test-health-index-batch_step-{0}.eps'.format(args.batch_step)),
        dpi=800, format='eps')
    plt.clf()
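# `smooth` is not included in this excerpt. A plausible reading of
# `smooth(predicted, args.smooth)` is a moving average with window size
# `args.smooth`; treat this sketch as an assumption about the repo's helper:
def smooth(values, window):
    kernel = np.ones(window) / window
    # mode='same' keeps the output aligned with x_axis (edges are damped)
    return np.convolve(values, kernel, mode='same')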
def visualize_dataset(model, sess, epoch, dataset_name):
    dest_dir = prepare_directory(os.path.join(
        '../build/plots', args.scope, args.name
    ))
    if args.sample_size:
        args.sample_size = args.sample_size - (args.sample_size % args.batch_size)
        x_axis = np.linspace(
            0, len(dataset[dataset_name + '_feature']) - 1,
            num=args.sample_size, dtype=int
        )
        ground_truth = dataset[dataset_name + '_label'][x_axis, 0]
        ps = model.prediction.eval(
            session=sess,
            feed_dict={
                model.xs: dataset[dataset_name + '_feature'][x_axis],
                model.ys: dataset[dataset_name + '_label'][x_axis],
            }
        )
        predicted = np.array(ps)[:, 0]
    plt.ylim(Y_LIMIT)
    plt.scatter(x_axis, ground_truth, color='green', marker='x', s=12)
    plt.scatter(x_axis, predicted, color='blue', s=10, linewidth=0)
    plt.plot(x_axis, abs(predicted - ground_truth), color='red',
             linestyle='--', linewidth=1)
    acc, entropy = eval_metric(model, sess, dataset_name)
    title = 'epoch-{0}\n{1} accuracy = {2}'.format(epoch, dataset_name, acc)
    plt.title(title)
    plt.savefig(
        os.path.join(dest_dir,
                     'epoch-{0}-{1}.png'.format(epoch, dataset_name)),
        dpi=400, format='png'
    )
    plt.clf()
    return acc, entropy
def visualize(xs, ys):
    dest_dir = prepare_directory(
        os.path.join('../build/plots', args.scope, args.name,
                     os.path.basename(args.test_src).rsplit('.', 1)[0]))
    plt.ylim(args.ylim)
    plt.ylabel('Accuracy')
    plt.xlabel('Index')
    title = 'Test Accuracy'
    if args.batch_step < 200:
        plt.scatter(xs, ys, color='purple', s=0.1)
    else:
        plt.plot(xs, ys, color='purple', linestyle='--', linewidth=1)
    plt.title(title)
    plt.savefig(os.path.join(
        dest_dir,
        'test-accuracy-batch_step-{0}.png'.format(args.batch_step)),
        dpi=400, format='png')
    plt.clf()
def main():
    dest_dir = prepare_directory(
        os.path.join('../../build/data', SCOPE, 'initialized'))
    for dataset in DATASETS_TO_PARSE:
        instances = os.listdir(os.path.join(SRC_DIR, dataset))
        for prefix in PREFIXES_TO_PARSE:
            for instance in instances:
                filenames = sorted(
                    glob.glob(
                        os.path.join(SRC_DIR, dataset, instance,
                                     prefix + '_*.csv')))
                length = len(filenames)
                for i, filename in enumerate(filenames, start=1):
                    header = i == 1
                    mode = 'a' if i > 1 else 'w'
                    log('parsing ' + dataset + '/' + instance + '...' +
                        str(i) + '/' + str(length))
                    df = pd.read_csv(filename, sep=detectSep(filename),
                                     header=None, names=INPUT_CSV_COLUMNS)
                    # hard-code the recording date before assembling full datetimes
                    df['year'] = 2017
                    df['month'] = 5
                    df['day'] = 26
                    df['datetime'] = pd.to_datetime(df[DATETIME_FIELDS])
                    df.to_csv(os.path.join(
                        dest_dir,
                        '%s-%s-%s.csv' % (dataset, instance, prefix)),
                        mode=mode, header=header, index=False,
                        columns=OUTPUT_CSV_COLUMNS)
def visualize_dataset(model, sess, epoch, dataset_name):
    dest_dir = prepare_directory(
        os.path.join('../build/plots', args.scope, args.name))
    if args.sample_size:
        x_axis = np.linspace(0, len(dataset[dataset_name]) - 1,
                             num=args.sample_size, dtype=int)
        ground_truth = dataset[dataset_name][x_axis, 0, 0]
        ps = model.prediction.eval(session=sess, feed_dict={
            model.xs: dataset[dataset_name][x_axis],
            model.ys: dataset[dataset_name][x_axis],
        })
        predicted = np.array(ps)[:, 0, 0]
    plt.ylim(Y_LIMIT)
    plt.scatter(x_axis, ground_truth, color='green', marker='x', s=12)
    plt.scatter(x_axis, predicted, color='blue', s=10, linewidth=0)
    plt.plot(x_axis, np.absolute(predicted - ground_truth), color='red',
             linestyle='--', linewidth=1)
    mse = eval_mse(model, sess, dataset_name)
    title = '{0}\nepoch-{1}\nmse = {2}'.format(dataset_name, epoch, mse)
    plt.title(title)
    plt.savefig(os.path.join(dest_dir,
                             'epoch-{0}-{1}.png'.format(epoch, dataset_name)),
                dpi=400, format='png')
    plt.clf()
    return mse
# log y-axis
if args.log_y_axis:
    # plot an invisible curve purely to switch the y-axis to log scale
    dy = 0.00001
    t = np.arange(dy, 1.0, dy)
    plt.semilogy(t, np.exp(-t / 5.0), alpha=0.0)
if args.grid:
    plt.grid(True)
if args.legend_outside:
    lgd = plt.legend(loc='center right',
                     bbox_to_anchor=(args.legend_outside, 0.5),
                     fontsize=10)
elif args.legend_location:
    lgd = plt.legend(fontsize=10, loc=args.legend_location)
else:
    lgd = plt.legend(fontsize=10)
dest_dir = prepare_directory(os.path.dirname(args.dest))  # ensure the output directory exists
if args.legend_outside:
    # keep the legend placed outside the axes within the saved bounding box
    plt.savefig(args.dest, dpi=1200, format='eps',
                bbox_extra_artists=(lgd, ), bbox_inches='tight')
else:
    plt.savefig(args.dest, dpi=1200, format='eps')
plt.clf()
def get_anomaly_flags(df):
    # label the first 90% of the run as normal (0) and the final 10% as anomalous (1)
    length = len(df)
    normal_length = int(length * 0.9)
    anomalous_length = length - normal_length
    normal_flags = np.repeat(0, normal_length)
    anomalous_flags = np.repeat(1, anomalous_length)
    anomaly_flags = np.concatenate((normal_flags, anomalous_flags))
    return anomaly_flags


if __name__ == '__main__':
    args = get_args()
    for src, dest in zip(args.srcs, args.dests):
        log('parsing %s ...' % (src, ))
        prepare_directory(os.path.dirname(dest))
        df_chunks = pd.read_csv(src, chunksize=args.chunk_size)
        df_result = pd.DataFrame({
            'avg': [],
            'max': [],
            'min': [],
            'fft1': [],
            'fft2': [],
            'paa': [],
        })
        for batch_idx, df_batch in get_batch(df_chunks, args.batch_size):
            if batch_idx % 1000 == 0:
                print(batch_idx)
            values = np.array(df_batch['x'])
            fft1, fft2 = get_fft(values, args.batch_size)
            paa_value = get_paa_value(values)
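# Neither `get_fft` nor `get_paa_value` appears in this excerpt. Hedged
# sketches consistent with their use above: `get_fft` returning the magnitudes
# of the two lowest non-DC FFT bins of a batch, and `get_paa_value` returning
# the Piecewise Aggregate Approximation of a batch (its mean over the window).
# Both are assumptions about the repo's helpers, not their actual definitions.
def get_fft(values, batch_size):
    spectrum = np.abs(np.fft.rfft(values, n=batch_size))  # needs batch_size >= 4
    return spectrum[1], spectrum[2]


def get_paa_value(values):
    # PAA collapses one batch-sized window to its average value
    return np.mean(values)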
# start session
sess = tf.InteractiveSession(
    # config=tf.ConfigProto(intra_op_parallelism_threads=N_THREADS)
)

# prepare model import or export
if args.src:
    importSaver = tf.train.Saver()
    importSaver.restore(sess, args.src)
else:
    # initialize variables
    sess.run(tf.global_variables_initializer())
if args.dest:
    exportSaver = tf.train.Saver()
    prepare_directory(os.path.dirname(args.dest))

filename = args.log or os.path.join(
    prepare_directory(os.path.join('../build/plots', args.scope, args.name)),
    'log.csv')
min_validate_mse = 999999
batch_count, data_size = get_batch_count(dataset['train'], args.batch_size)
with open(filename, 'w') as fd_log:
    start_time = time.time()

    # before training
    validate_mse = visualize_dataset(model, sess, 0, 'validate')
    anomalous_mse = visualize_dataset(model, sess, 0, 'anomalous')
    print(
        'Epoch\t%d, Batch\t%d, Elapsed time\t%.1fs, '
        'Validate MSE\t%s, Anomalous MSE\t%s, Min Validate MSE\t%s' %
        (0, 0, 0, validate_mse, anomalous_mse, min_validate_mse))
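# `get_batch_count` is not part of this excerpt; from its call site it returns
# the number of batches and the total sample count. A minimal sketch, assuming
# a final partial batch is counted:
def get_batch_count(data, batch_size):
    data_size = len(data)
    batch_count = (data_size + batch_size - 1) // batch_size  # ceiling division
    return batch_count, data_size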
while True:
    dataset = read_dataset()
    for batch_idx in range(0, 1):  # score only the most recent batch
        begin_idx = batch_idx * args.batch_size
        end_idx = begin_idx + args.batch_size
        xs = dataset[begin_idx:end_idx]
        # run the restored graph by tensor name: reconstruction vs. target
        restored_predictions = sess.run(
            'compute_cost/Reshape_1:0',
            feed_dict={
                'input_layer/xs:0': xs,
                'input_layer/ys:0': xs,
                'input_layer/feed_previous:0': True,
            })
        restored_ys = sess.run(
            'compute_cost/Reshape_3:0',
            feed_dict={
                'input_layer/xs:0': xs,
                'input_layer/ys:0': xs,
                'input_layer/feed_previous:0': True,
            })
        mse = np.mean((restored_ys - restored_predictions) ** 2, axis=1)
        is_anomaly = mse[0] > args.threshold
        dest_dir = prepare_directory(
            os.path.join(args.src, '../../inference-result'))
        with open(os.path.join(dest_dir, 'last.txt'), 'w') as fd_result:
            print('anomaly' if is_anomaly else 'normal')
            fd_result.write('anomaly' if is_anomaly else 'normal')
    time.sleep(5)  # poll for new data every five seconds
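# `read_dataset` is not shown in this excerpt. The loop above implies it
# re-reads the newest sensor readings on every poll and returns an array of
# model-ready windows. A hedged sketch; the path, column name, and window
# length below are illustrative assumptions, not the repo's actual values:
def read_dataset(src='../build/data/latest.csv', step_size=32):
    df = pd.read_csv(src, usecols=['x'])
    values = df['x'].values
    usable = len(values) - (len(values) % step_size)
    # shape into (n_windows, step_size) sequences for the seq2seq graph
    return values[:usable].reshape(-1, step_size)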
def main():
    datasets = os.listdir(SRC_DIR)
    for dataset in datasets:
        dataDirs = os.listdir(os.path.join(SRC_DIR, dataset))
        for dataDir in dataDirs:
            if not os.path.isdir(os.path.join(SRC_DIR, dataset, dataDir)):
                continue
            log('parsing ' + dataset + '/' + dataDir + '...')
            workingType = dataDir.split('_')[0]
            dataType = dataDir.split('_')[1]
            readDir = os.path.join(
                SRC_DIR, dataset, workingType + '_' + dataType
            )
            # default wiring: channels 1-3 carry x/y/z, channels 4-6 carry u/v/w
            channel_map = [None, 'x', 'y', 'z', 'u', 'v', 'w']
            channels = [1, 2, 3] if dataType == 'acc' else [4, 5, 6]
            # these runs use the opposite channel-to-axis assignment and a
            # different timestamp format
            swapped_datasets = [
                '2017-07-18-168000rpm',
                '2017-08-17-0.35mm',
                '2017-08-21-0.5mm',
                '2017-08-21-0.8mm',
                '2017-08-21-1.0mm',
                '2017-08-21-1.55mm',
                '2017-08-21-2.0mm',
                '2017-08-21-3.175mm',
            ]
            if dataset in swapped_datasets:
                channels = [4, 5, 6] if dataType == 'acc' else [1, 2, 3]
                channel_map = [None, 'u', 'v', 'w', 'x', 'y', 'z']
            for channel in channels:
                filenames = glob.glob(os.path.join(
                    readDir, 'Channel{0}_*.csv'.format(channel)
                ))
                for filename in filenames:
                    df = pd.read_csv(
                        filename, names=INPUT_CSV_COLUMNS, header=None
                    )
                    # normalize timestamps to epoch milliseconds
                    if dataset in swapped_datasets:
                        df['timestamp'] = pd.to_datetime(
                            df['timestamp'], format='%m/%d/%Y %H:%M:%S.%f'
                        ).astype(np.int64) // int(1e6)
                    else:
                        df['timestamp'] = pd.to_datetime(
                            df['timestamp'], format='%Y%m%d%H%M%S%f'
                        ).astype(np.int64) // int(1e6)
                    destDir = prepare_directory(os.path.join(
                        '../../build/data/', SCOPE, 'merged', dataset,
                        workingType
                    ))
                    df.to_csv(
                        os.path.join(destDir, channel_map[channel] + '.csv'),
                        mode='a', header=False, index=False
                    )
def main():
    thresholds = np.arange(args.thresholds[0], args.thresholds[1],
                           args.thresholds[2])
    table_true_alarm = np.empty([
        len(args.columns),
        len(args.srcs),
        len(thresholds),
    ])
    table_false_alarm = np.empty([
        len(args.columns),
        len(args.srcs),
        len(thresholds),
    ])
    for src_idx, src in enumerate(args.srcs):
        log('\n=====================')
        log('File %d "%s":' % (src_idx, src))
        _, last_datetime, _ = get_datetime(src)
        bound_datetime = last_datetime - pd.Timedelta(
            minutes=args.alarm_minutes)
        df_chunks = pd.read_csv(src, chunksize=args.chunk_size)
        df_featured = add_df_feature(df_chunks)
        for column_idx, column in enumerate(args.columns):
            log('\n\tFeature "%s":' % column)
            for threshold_idx, threshold in enumerate(thresholds):
                df_before_time_bound = df_featured[
                    df_featured.datetime < bound_datetime]
                df_total_alarm = df_before_time_bound[
                    df_before_time_bound[column] > threshold]
                n_total_alarm = len(df_total_alarm)
                # at most one alarm per run counts as true; the rest are false
                n_true_alarm = 1 if n_total_alarm > 0 else 0
                n_false_alarm = n_total_alarm - n_true_alarm
                table_true_alarm[column_idx][src_idx][
                    threshold_idx] = n_true_alarm
                table_false_alarm[column_idx][src_idx][
                    threshold_idx] = n_false_alarm
                log('\tthreshold = %f, n_true_alarm = %d, n_total_alarm = %d' %
                    (threshold, n_true_alarm, n_true_alarm + n_false_alarm))
    log('\ntrue alarm table')
    log('================\n')
    log(table_true_alarm)
    log('\nfalse alarm table')
    log('=================\n')
    log(table_false_alarm)
    table_total_alarm = table_true_alarm + table_false_alarm
    prevent_zero_division = np.vectorize(
        lambda total_alarm: 1 if total_alarm == 0 else total_alarm)
    table_true_alarm_indicator = table_true_alarm.astype(bool).astype(float)
    log('\ntrue alarm indicator table')
    log('==========================\n')
    log(table_true_alarm_indicator)
    for column_idx, column in enumerate(args.columns):
        true_alarms = np.average(table_true_alarm[column_idx], axis=0)
        total_alarms = np.average(table_total_alarm[column_idx], axis=0)
        true_alarm_indicators = np.average(
            table_true_alarm_indicator[column_idx], axis=0)
        log('\ntrue_alarm_indicators')
        log('=====================\n')
        log(true_alarm_indicators)

        # visualization
        dest_dir = prepare_directory(args.dest_dir)
        fig, ax_true_alarm_indicator = plt.subplots()
        ax_true_alarm_indicator.set_xlabel('Feature Thresholds')
        ax_true_alarm_indicator.set_ylabel('True Alarm Indicator (%)',
                                           color='blue')
        ax_true_alarm_indicator.plot(thresholds, true_alarm_indicators * 100,
                                     color='blue', marker='.')
        ax_true_alarm_indicator.set_ylim([0, 100])
        ax_true_alarm_indicator.tick_params('y', colors='blue')
        ax_n_total_alarm = ax_true_alarm_indicator.twinx()
        # draw the indicator axis above the alarm-count axis
        ax_true_alarm_indicator.set_zorder(ax_n_total_alarm.get_zorder() + 1)
        ax_true_alarm_indicator.patch.set_visible(False)
        ax_n_total_alarm.yaxis.tick_right()
        ax_n_total_alarm.set_ylabel('Total Alarm Count', color='red')
        ax_n_total_alarm.tick_params('y', colors='red')
        ax_n_total_alarm.plot(thresholds, total_alarms, color='red',
                              marker='.')
        plt.title('%d minutes alarm of feature "%s"' %
                  (args.alarm_minutes, column))
        plt.savefig(os.path.join(
            dest_dir,
            '%dmin-%f,%f,%fth-%s.png' % (args.alarm_minutes,
                                         args.thresholds[0],
                                         args.thresholds[1],
                                         args.thresholds[2], column)),
            dpi=400, format='png')
        plt.clf()
        breakpoints[column].append(n.ppf(probability))
    equal_breakpoints[column] = np.linspace(minValue, maxValue,
                                            args.symbol_size + 1)[1:-1]
    print('==== Report ====')
    print('lens\t', lens)
    print('sums\t', sums)
    print('mean\t', mean)
    print('std\t', std)
    print('minValue\t', minValue)
    print('maxValue\t', maxValue)
    print('step\t', step)
    print('len(breakpoints)\t', len(breakpoints[column]))
    print('breakpoints\t', breakpoints[column])

df_breakpoints = pd.DataFrame(breakpoints)
df_equal_breakpoints = pd.DataFrame(equal_breakpoints)
dest_dir = prepare_directory(args.dest_dir)
df_breakpoints.to_csv(
    os.path.join(dest_dir,
                 'breakpoint-{0}.csv'.format(args.symbol_size)),
    header=True, index=False
)
df_equal_breakpoints.to_csv(
    os.path.join(dest_dir,
                 'equal-breakpoint-{0}.csv'.format(args.symbol_size)),
    header=True, index=False
)
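# For context on the fragment above: `n` is evidently a fitted Gaussian
# (e.g. scipy.stats.norm(mean, std)) and `n.ppf` its quantile function, so the
# breakpoints cut the fitted distribution into equiprobable regions, the
# standard SAX recipe. A worked example with symbol_size = 4 on standardized
# data:
from scipy.stats import norm

n = norm(0.0, 1.0)
probabilities = [i / 4.0 for i in range(1, 4)]      # 0.25, 0.50, 0.75
print([round(n.ppf(p), 4) for p in probabilities])  # [-0.6745, 0.0, 0.6745]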