def predict_proba(X_test, class_names):
    # load trained anomaly models
    anomaly_model_group_by_label = {}
    anomaly_data_path = training_config.anomaly_data_path
    folders = os.listdir(anomaly_data_path)
    for fo in folders:
        anomaly_model_path = os.path.join(
            training_config.anomaly_model_save_path,
            fo,
            training_config.config_by_user['data_type_chosen'],
            training_config.config_by_user['model_type_chosen'],
            training_config.model_id)
        try:
            anomaly_model_group_by_label[fo] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1,))
        except IOError:
            print 'anomaly model of %s not found' % (fo,)
            raw_input("sorry! can't load the anomaly model")
            continue

    predict_score = []
    for i in range(len(X_test)):
        temp_loglik = []
        for model_label in class_names:
            one_log_curve_of_this_model = util.fast_log_curve_calculation(
                X_test[i], anomaly_model_group_by_label[model_label])
            temp_loglik.append(one_log_curve_of_this_model[-1])
        # normalize the cumulative log-likelihoods so each row sums to 1
        # (see the caveat sketch following this function)
        temp_score = temp_loglik / np.sum(temp_loglik)
        predict_score.append(temp_score)
    return np.array(predict_score)
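# Caveat sketch (added, not part of the original module): because the
# cumulative log-likelihoods are typically all negative, dividing by their sum
# inverts the ranking -- the most likely class gets the smallest normalized
# score. A quick numeric illustration, assuming numpy is imported as np:
#
#   temp_loglik = [-100.0, -10.0]              # the second class fits far better
#   np.array(temp_loglik) / np.sum(temp_loglik)  # -> array([0.909, 0.091])
#
# A softmax over the final log-likelihoods would preserve the ranking instead.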
def run(model_save_path, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    threshold_group_by_state = {}
    mean_curve_group_by_state = {}
    for state_no in model_group_by_state:
        all_log_curves_of_this_state = []
        curve_owner = []
        for trial_name in trials_group_by_folder_name:
            curve_owner.append(trial_name)
            one_log_curve_of_this_state = util.fast_log_curve_calculation(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            all_log_curves_of_this_state.append(one_log_curve_of_this_state)

        # use np.matrix to facilitate the computation of the mean curve and std
        np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
        mean_of_log_curve = np_matrix_traj_by_time.mean(0)
        diff_traj_by_time = np_matrix_traj_by_time - mean_of_log_curve
        deri_of_diff_traj_by_time = diff_traj_by_time[:, 1:] - diff_traj_by_time[:, :-1]

        mean_curve_group_by_state[state_no] = mean_of_log_curve
        threshold_group_by_state[state_no] = assess_threshold_and_decide(
            deri_of_diff_traj_by_time,
            curve_owner,
            state_no,
            figure_save_path,
        )

    if not os.path.isdir(model_save_path):
        os.makedirs(model_save_path)
    if len(threshold_group_by_state) != 0:
        joblib.dump(threshold_group_by_state,
                    model_save_path + "/threshold_for_deri_of_diff.pkl")
        joblib.dump(mean_curve_group_by_state,
                    model_save_path + "/mean_curve_group_by_state.pkl")
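# Shape note (added): with n trials truncated to a common length T,
# np_matrix_traj_by_time is (n, T); diff_traj_by_time subtracts the mean curve
# from every trial, and deri_of_diff_traj_by_time is the per-step change of
# that deviation, shape (n, T-1). The threshold here is therefore decided over
# derivatives of deviations, not over the raw log-likelihood curves.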
def run(model_save_path,
        figure_save_path,
        threshold_c_value,
        trials_group_by_folder_name,
        data_class):

    output_dir = os.path.join(
        figure_save_path,
        "gradient_of_log_likelihood_plot",
        data_class,
    )
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    for state_no in model_group_by_state:
        all_log_curves_of_this_state = []
        curve_owner = []
        for trial_name in trials_group_by_folder_name:
            curve_owner.append(trial_name)
            one_log_curve_of_this_state = util.fast_log_curve_calculation(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            all_log_curves_of_this_state.append(one_log_curve_of_this_state)

        # use np.matrix to facilitate the computation of the mean curve and std
        np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
        assess_threshold_and_decide(
            np_matrix_traj_by_time,
            curve_owner,
            state_no,
            output_dir,
            data_class,
        )
def run(model_save_path, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    for state_no in model_group_by_state:
        all_log_curves_of_this_state = []
        list_of_log_prob_mat = []
        log_prob_owner = []
        for trial_name in trials_group_by_folder_name:
            log_prob_owner.append(trial_name)
            emission_log_prob_mat = util.get_emission_log_prob_matrix(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            list_of_log_prob_mat.append(emission_log_prob_mat)
            one_log_curve_of_this_state = util.fast_log_curve_calculation(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            all_log_curves_of_this_state.append(one_log_curve_of_this_state)

        # use np.matrix to facilitate the computation of the mean curve and std
        np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
        gradient_traj_by_time = np_matrix_traj_by_time[:, 1:] - np_matrix_traj_by_time[:, :-1]

        plot_log_prob_of_all_trials(gradient_traj_by_time, list_of_log_prob_mat,
                                    log_prob_owner, state_no, figure_save_path)
def run(model_save_path, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    for state_no in model_group_by_state:
        compute_score_time_cost = 0
        total_step_times = 0

        all_log_curves_of_this_state = []
        curve_owner = []
        for trial_name in trials_group_by_folder_name:
            curve_owner.append(trial_name)
            start_time = time.time()
            one_log_curve_of_this_state = util.fast_log_curve_calculation(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            compute_score_time_cost += time.time() - start_time
            total_step_times += len(trials_group_by_folder_name[trial_name][state_no])
            all_log_curves_of_this_state.append(one_log_curve_of_this_state)

        # use np.matrix to facilitate the computation of the mean curve and std
        np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
        # average scoring time per time step, reported in the figure
        score_time_cost_per_point = float(compute_score_time_cost) / total_step_times

        assess_threshold_and_decide(np_matrix_traj_by_time, curve_owner, state_no,
                                    figure_save_path, score_time_cost_per_point)
def get_anomaly_detection_msg(self, arrived_data, arrived_state, data_header):
    hmm_log = Hmm_Log()

    arrived_length = len(arrived_data)
    if arrived_length < 10:
        return hmm_log
    if arrived_state <= 0:
        return hmm_log

    try:
        # raises IndexError if the arrived data outgrows the stored curves
        self.expected_log_group_by_state[arrived_state][arrived_length - 1]
        self.threshold_group_by_state[arrived_state][arrived_length - 1]

        log_curve = util.fast_log_curve_calculation(
            arrived_data, self.model_group_by_state[arrived_state])

        idx = arrived_length - 2
        prev_threshold = self.threshold_group_by_state[arrived_state][idx]
        prev_log_lik = log_curve[idx]
        prev_diff = prev_log_lik - prev_threshold

        idx = arrived_length - 1
        now_threshold = self.threshold_group_by_state[arrived_state][idx]
        now_log_lik = log_curve[idx]
        now_diff = now_log_lik - now_threshold

        # flag an event when the log-likelihood-vs-threshold gap changes abruptly
        deri_of_diff = now_diff - prev_diff
        if abs(deri_of_diff) < self.deri_threshold:
            hmm_log.event_flag = 1
        else:
            hmm_log.event_flag = 0

        hmm_log.current_log.data = now_log_lik
        hmm_log.expected_log.data = self.expected_log_group_by_state[arrived_state][idx]
        hmm_log.threshold.data = now_threshold
        hmm_log.diff_btw_curlog_n_thresh.data = now_diff
        hmm_log.deri_of_diff_btw_curlog_n_thresh.data = deri_of_diff
        hmm_log.header = data_header
        hmm_log.header.stamp = rospy.Time.now()
    except IndexError:
        rospy.loginfo('received data is longer than the stored threshold curve; DTW needed.')
    return hmm_log
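# Minimal sketch (added) of the flag rule above, isolated from ROS for unit
# testing; log_curve and threshold_curve are plain sequences of floats and t
# is the index of the newest sample:
def deri_of_diff_flag(log_curve, threshold_curve, t, deri_threshold):
    prev_diff = log_curve[t - 1] - threshold_curve[t - 1]
    now_diff = log_curve[t] - threshold_curve[t]
    # 1 = normal (the gap to the threshold changes slowly), 0 = anomaly
    return 1 if abs(now_diff - prev_diff) < deri_threshold else 0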
def run(model_save_path, figure_save_path, trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    fig = plt.figure(1)
    ax = fig.add_subplot(111)
    from matplotlib.pyplot import cm

    for trial_name in trials_group_by_folder_name:
        color = iter(cm.rainbow(np.linspace(0, 1, state_amount)))
        # index 0 is a placeholder so the list can be indexed by model_no (1-based)
        all_log_curves_of_this_model = [[]]
        for model_no in model_group_by_state:
            all_log_curves_of_this_model.append([])
            # run this state's model over every state segment of the trial
            for state_no in range(1, state_amount + 1):
                one_log_curve_of_this_model = util.fast_log_curve_calculation(
                    trials_group_by_folder_name[trial_name][state_no],
                    model_group_by_state[model_no])
                all_log_curves_of_this_model[model_no] = np.hstack([
                    all_log_curves_of_this_model[model_no],
                    one_log_curve_of_this_model
                ])
            ax.plot(all_log_curves_of_this_model[model_no],
                    linestyle="solid",
                    label='state_' + str(model_no),
                    color=next(color))
        title = 'skill_identification_' + trial_name
        ax.set_title(title)
        if not os.path.isdir(figure_save_path + '/skill_identification_plot'):
            os.makedirs(figure_save_path + '/skill_identification_plot')
        fig.savefig(os.path.join(figure_save_path, 'skill_identification_plot',
                                 title + ".jpg"),
                    format="jpg")
        fig.show()
def predict(self, x_test, class_names):
    # load trained anomaly models
    anomaly_model_group_by_label = {}
    for fo in class_names:
        anomaly_model_path = os.path.join(training_config.anomaly_model_save_path, fo)
        try:
            anomaly_model_group_by_label[fo] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1,))
        except IOError:
            print 'anomaly model of %s not found' % (fo,)
            raw_input("sorry! can't load the anomaly model")
            continue

    y_pred = []
    for i in range(len(x_test)):
        # plot
        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # from matplotlib.pyplot import cm
        # color = iter(cm.rainbow(np.linspace(0, 1, len(anomaly_model_group_by_label))))
        calc_confidence_resource = []
        for idx, model_label in enumerate(class_names):
            one_log_curve_of_this_model = util.fast_log_curve_calculation(
                x_test[i], anomaly_model_group_by_label[model_label])
            calc_confidence_resource.append({
                'model_idx': idx,
                'model_label': model_label,
                'cumulative_loglik': one_log_curve_of_this_model[-1],
            })
            # c = next(color)
            # plot_line, = ax.plot(one_log_curve_of_this_model, linestyle="solid", color=c)
            # plot_line.set_label(model_label)
            # title = 'Anomaly_identification for ' + fo
            # ax.set_title(title)
        # plt.savefig('images/' + str(i) + '.png', dpi=120)
        # pick the model with the highest cumulative log-likelihood
        sorted_list = sorted(calc_confidence_resource,
                             key=lambda x: x['cumulative_loglik'])
        optimal_result = sorted_list[-1]
        classified_idx = optimal_result['model_idx']
        y_pred.append(classified_idx)
    return y_pred
def predict(X_test):
    # load trained anomaly models
    anomaly_model_group_by_label = {}
    anomaly_data_path = os.path.join(training_config.config_by_user['base_path'],
                                     'all_anomalies')
    folders = os.listdir(anomaly_data_path)
    for fo in folders:
        anomaly_model_path = os.path.join(
            training_config.anomaly_model_save_path,
            fo,
            training_config.config_by_user['data_type_chosen'],
            training_config.config_by_user['model_type_chosen'],
            training_config.model_id)
        try:
            anomaly_model_group_by_label[fo] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1,))
        except IOError:
            print 'anomaly model of %s not found' % (fo,)
            raw_input("sorry! can't load the anomaly model")
            continue

    predict_class = []
    for i in range(len(X_test)):
        # reset per test sample so earlier samples' scores cannot leak in
        calc_confidence_resource = []
        for model_label in anomaly_model_group_by_label:
            one_log_curve_of_this_model = util.fast_log_curve_calculation(
                X_test[i], anomaly_model_group_by_label[model_label])
            calc_confidence_resource.append({
                'model_label': model_label,
                'cumulative_loglik': one_log_curve_of_this_model[-1],
            })
        sorted_list = sorted(calc_confidence_resource,
                             key=lambda x: x['cumulative_loglik'])
        optimal_result = sorted_list[-1]
        classified_model = optimal_result['model_label']
        predict_class.append(classified_model)
    return predict_class
def predict_proba(self, x_test, class_names):
    # load trained anomaly models
    anomaly_model_group_by_label = {}
    for fo in class_names:
        anomaly_model_path = os.path.join(training_config.anomaly_model_save_path, fo)
        try:
            anomaly_model_group_by_label[fo] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1,))
        except IOError:
            print 'anomaly model of %s not found' % (fo,)
            raw_input("sorry! can't load the anomaly model")
            continue

    predict_score = []
    for i in range(len(x_test)):
        temp_loglik = []
        for model_label in class_names:
            one_log_curve_of_this_model = util.fast_log_curve_calculation(
                x_test[i], anomaly_model_group_by_label[model_label])
            temp_loglik.append(one_log_curve_of_this_model[-1])
        # normalize so each row sums to 1 (same caveat as the module-level
        # predict_proba above: negative log-likelihoods invert the ranking)
        temp_score = temp_loglik / np.sum(temp_loglik)
        predict_score.append(temp_score)
    return np.array(predict_score)
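# Usage sketch (added; names are hypothetical): clf is whatever object carries
# these methods and x_test is a list of (n_samples, n_features) arrays:
#
#   y_pred = clf.predict(x_test, class_names)          # list of class indices
#   scores = clf.predict_proba(x_test, class_names)    # (n_trials, n_classes)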
def generate_performance_logging_report_with_varible_model_parameters():
    import model_generation

    # load the train/test/labels files
    TRAIN_TEST_DATASET_PATH = training_config.anomaly_data_path
    x_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_train.npy")
    y_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_train.npy")
    x_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_test.npy")
    y_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_test.npy")
    labels_path = os.path.join(TRAIN_TEST_DATASET_PATH, "labels_list.npy")
    try:
        x_train = np.load(x_train_path)
        y_train = np.load(y_train_path)
        x_test = np.load(x_test_path)
        y_test = np.load(y_test_path)
        labels = np.load(labels_path)
    except IOError:
        print ('Error occurred trying to read the file, please check the path: '
               + TRAIN_TEST_DATASET_PATH)
        sys.exit()

    x_train = x_train.transpose((0, 2, 1))
    x_test = x_test.transpose((0, 2, 1))
    y_train = y_train.reshape(-1,).tolist()
    y_test = y_test.reshape(-1,).tolist()
    class_names = labels.tolist()

    train_data_by_class = {}
    train_lengths_by_class = {}
    for idx, class_name in enumerate(class_names):
        indices = [i for i, label in enumerate(y_train) if label == idx]
        train_data = x_train[indices]
        lengths = []
        for i in range(len(train_data)):
            lengths.append(train_data[i].shape[0])
            if i == 0:
                data_temp = train_data[i]
            else:
                data_temp = np.concatenate((data_temp, train_data[i]), axis=0)
        train_data = data_temp
        train_data_by_class[class_name] = train_data
        lengths[-1] -= 1
        train_lengths_by_class[class_name] = lengths

    model_generator = model_generation.get_model_generator(
        training_config.model_type_chosen, training_config.model_config)
    for model, now_model_config in model_generator:
        logger.info(now_model_config)
        model_collection_for_all_classes = {}
        for idx, _name in enumerate(class_names):
            # fit on (n_samples, n_features) data
            fitted_model = model.fit(train_data_by_class[_name],
                                     lengths=train_lengths_by_class[_name])
            # dump the model and reload it; unclear why this round-trip is
            # needed, but it works
            anomaly_model_path = os.path.join(
                training_config.anomaly_model_save_path,
                'temp_classification_report_model', _name)
            if not os.path.isdir(anomaly_model_path):
                os.makedirs(anomaly_model_path)
            joblib.dump(fitted_model,
                        os.path.join(anomaly_model_path, "model_s%s.pkl" % (1,)))
            model_collection_for_all_classes[_name] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1,))

        y_pred = []
        for i in range(len(x_test)):
            calc_confidence_resource = []
            for idx, model_label in enumerate(class_names):
                one_log_curve_of_this_model = util.fast_log_curve_calculation(
                    x_test[i], model_collection_for_all_classes[model_label])
                calc_confidence_resource.append({
                    'model_idx': idx,
                    'model_label': model_label,
                    'cumulative_loglik': one_log_curve_of_this_model[-1],
                })
            sorted_list = sorted(calc_confidence_resource,
                                 key=lambda x: x['cumulative_loglik'])
            optimal_result = sorted_list[-1]
            classified_idx = optimal_result['model_idx']
            y_pred.append(classified_idx)

        # for the confusion matrix / classification report
        _clf_report = classification_report(y_test, y_pred,
                                            target_names=[l for l in class_names])
        logger.info(_clf_report)
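# Helper sketch (added): the concatenate-and-record-lengths pattern above
# recurs in several run() variants below; a minimal factored-out version,
# ignoring the lengths[-1] -= 1 tweak applied above:
import numpy as np

def concat_trials(trials):
    # trials: list of (n_samples, n_features) arrays belonging to one class
    lengths = [t.shape[0] for t in trials]
    return np.concatenate(trials, axis=0), lengths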
def score(score_metric, model, X, lengths):
    if score_metric == '_score_metric_worst_stdmeanratio_in_10_slice_':
        # score the model on 10 growing prefixes of each trial
        slice_10_time_step_log_lik = [[
            model.score(X[i:i + k * (j - i) / 10]) for k in range(1, 11, 1)
        ] for i, j in util.iter_from_X_lengths(X, lengths)]
        matrix = np.matrix(slice_10_time_step_log_lik)
        slice_10_means = abs(matrix.mean(0))
        slice_10_std = matrix.std(0)
        slice_10_stme_ratio = slice_10_std / slice_10_means
        score = slice_10_stme_ratio.max()
    elif score_metric == '_score_metric_last_time_stdmeanratio_':
        final_time_step_log_lik = [
            model.score(X[i:j]) for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        matrix = np.matrix(final_time_step_log_lik)
        mean = abs(matrix.mean())
        std = matrix.std()
        score = std / mean
    elif score_metric == '_score_metric_sum_stdmeanratio_using_fast_log_cal_':
        final_time_step_log_lik = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        curve_mat = np.matrix(final_time_step_log_lik)
        mean_of_log_curve = curve_mat.mean(0)
        std_of_log_curve = curve_mat.std(0)
        score = abs(std_of_log_curve / mean_of_log_curve).mean()
    elif score_metric == '_score_metric_mean_of_std_using_fast_log_cal_':
        log_curves_of_all_trials = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        curve_mat = np.matrix(log_curves_of_all_trials)
        std_of_log_curve = curve_mat.std(0)
        score = std_of_log_curve.mean()
    elif score_metric == '_score_metric_hamming_distance_using_fast_log_cal_':
        import scipy.spatial.distance as sp_dist
        log_lik = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        log_mat = np.matrix(log_lik)
        std_of_log_mat = log_mat.std(0)
        mean_of_log_mat = log_mat.mean(0)
        lower_bound = mean_of_log_mat - 20 * std_of_log_mat
        hamming_score = sp_dist.hamming(mean_of_log_mat, lower_bound)
        score = hamming_score
    elif score_metric == '_score_metric_std_of_std_using_fast_log_cal_':
        log_curves_of_all_trials = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        curve_mat = np.matrix(log_curves_of_all_trials)
        std_of_log_curve = curve_mat.std(0)
        score = std_of_log_curve.std()
    elif score_metric == '_score_metric_mean_of_std_divied_by_final_log_mean_':
        log_curves_of_all_trials = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        curve_mat = np.matrix(log_curves_of_all_trials)
        std_of_log_curve = curve_mat.std(0)
        mean_of_std = std_of_log_curve.mean()
        final_log_mean = curve_mat.mean(0)[0, -1]
        score = abs(mean_of_std / final_log_mean)
    elif score_metric == '_score_metric_mean_of_std_of_gradient_divied_by_final_log_mean_':
        log_curves_of_all_trials = [
            util.fast_log_curve_calculation(X[i:j], model)
            for i, j in util.iter_from_X_lengths(X, lengths)
        ]
        curve_mat = np.matrix(log_curves_of_all_trials)
        gradient_mat = curve_mat[:, 1:] - curve_mat[:, :-1]
        std_of_log_curve = gradient_mat.std(0)
        mean_of_std = std_of_log_curve.mean()
        final_log_mean = gradient_mat.mean(0)[0, -1]
        score = abs(mean_of_std / final_log_mean)
    elif score_metric == '_score_metric_minus_diff_btw_1st_2ed_emissionprob_':
        score_of_trials = []
        for i, j in util.iter_from_X_lengths(X, lengths):
            framelogprob = util.get_emission_log_prob_matrix(X[i:j], model)
            if framelogprob.shape[1] == 1:
                print ('hidden state amount = 1, but %s wants hidden state '
                       'amount > 1, so no score for this turn' % (score_metric,))
                return None
            framelogprob.sort(1)
            diff_btw_1st_2ed_eprob = framelogprob[:, -1] - framelogprob[:, -2]
            score_of_trials.append(np.sum(diff_btw_1st_2ed_eprob) / (j - i))
        score = -np.array(score_of_trials).mean()
    elif score_metric == '_score_metric_minus_diff_btw_1st_2ed(>=0)_divide_maxeprob_emissionprob_':
        score_of_trials = []
        for i, j in util.iter_from_X_lengths(X, lengths):
            framelogprob = util.get_emission_log_prob_matrix(X[i:j], model)
            if framelogprob.shape[1] == 1:
                print ('hidden state amount = 1, but %s wants hidden state '
                       'amount > 1, so no score for this turn' % (score_metric,))
                return None
            framelogprob.sort(1)
            eprob_2ed = framelogprob[:, -2].clip(min=0)
            eprob_1st = framelogprob[:, -1].clip(min=0)
            max_eprob = np.max(eprob_1st)
            if max_eprob == 0:
                print 'max_eprob = 0, so no score for this turn'
                return None
            diff_btw_1st_2ed_eprob = eprob_1st - eprob_2ed
            score_of_trials.append(
                np.sum(diff_btw_1st_2ed_eprob) / (max_eprob * (j - i)))
        score = -np.array(score_of_trials).mean()
    elif score_metric == '_score_metric_minus_diff_btw_1st_2ed(delete<0)_divide_maxeprob_emissionprob_':
        score_of_trials = []
        for i, j in util.iter_from_X_lengths(X, lengths):
            framelogprob = util.get_emission_log_prob_matrix(X[i:j], model)
            if framelogprob.shape[1] == 1:
                print ('hidden state amount = 1, but %s wants hidden state '
                       'amount > 1, so no score for this turn' % (score_metric,))
                return None
            framelogprob.sort(1)
            eprob_2ed = framelogprob[:, -2]
            eprob_1st = framelogprob[:, -1]
            # keep only time steps whose second-best emission prob is positive
            entry_filter = eprob_2ed > 0
            eprob_2ed = eprob_2ed[entry_filter]
            eprob_1st = eprob_1st[entry_filter]
            entry_length = len(eprob_2ed)
            if entry_length == 0:
                print 'entry_length = 0, so no score for this turn'
                return None
            max_eprob = np.max(eprob_1st)
            if max_eprob == 0:
                print 'max_eprob = 0, so no score for this turn'
                return None
            diff_btw_1st_2ed_eprob = eprob_1st - eprob_2ed
            score_of_trials.append(
                np.sum(diff_btw_1st_2ed_eprob) / (max_eprob * entry_length))
        score = -np.array(score_of_trials).mean()
    elif score_metric == '_score_metric_mean_of_(std_of_(max_emissionprob_of_trials))_':
        mat = []
        for i, j in util.iter_from_X_lengths(X, lengths):
            framelogprob = util.get_emission_log_prob_matrix(X[i:j], model)
            if framelogprob.shape[1] == 1:
                print ('hidden state amount = 1, but %s wants hidden state '
                       'amount > 1, so no score for this turn' % (score_metric,))
                return None
            max_emissionprob = framelogprob.max(1)
            mat.append(max_emissionprob)
        mat = np.matrix(mat)
        std_list = mat.std(0)
        score = std_list.mean()
    elif score_metric == '_score_metric_duration_of_(diff_btw_1st_2ed_emissionprob_<_10)_':
        score_of_trials = []
        for i, j in util.iter_from_X_lengths(X, lengths):
            framelogprob = util.get_emission_log_prob_matrix(X[i:j], model)
            if framelogprob.shape[1] == 1:
                print ('hidden state amount = 1, but %s wants hidden state '
                       'amount > 1, so no score for this turn' % (score_metric,))
                return None
            framelogprob.sort(1)
            diff_btw_1st_2ed_eprob = framelogprob[:, -1] - framelogprob[:, -2]
            duration = (diff_btw_1st_2ed_eprob < 10).sum()
            score_of_trials.append(float(duration) / (j - i))
        score = np.array(score_of_trials).mean()
    else:
        raise Exception('unknown score metric \'%s\'' % (score_metric,))
    return score
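# Usage sketch (added; hypothetical setup): X stacks all trials of one state
# along axis 0 and lengths records each trial's sample count, matching
# util.iter_from_X_lengths. None is returned when a metric is not applicable
# (e.g. a single hidden state), so callers should skip that candidate:
#
#   s = score('_score_metric_mean_of_std_using_fast_log_cal_', model, X, lengths)
#   if s is not None:
#       candidates.append((s, model))  # lower std-based scores mean tighter curves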
def run(model_save_path, model_type, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)
    list_of_trials = trials_group_by_folder_name.values()

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    training_report_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            training_report_by_state[state_no] = json.load(
                open(model_save_path + "/model_s%s_training_report.json" % (state_no,), 'r'))
        except IOError:
            print 'training report of state %s not found' % (state_no,)
            continue

    model_config_by_state = {}
    for state_no in training_report_by_state:
        best_model_record = training_report_by_state[state_no][0]
        best_model_id = best_model_record.keys()[0]
        model_config_by_state[state_no] = joblib.load(
            model_save_path + "/model_s%s_config_%s.pkl" % (state_no, best_model_id))

    for state_no in training_report_by_state:
        length_array = []
        for trial_no in range(len(list_of_trials)):
            length_array.append(list_of_trials[trial_no][state_no].shape[0])
            if trial_no == 0:
                data_temp = list_of_trials[trial_no][state_no]
            else:
                data_temp = np.concatenate(
                    (data_temp, list_of_trials[trial_no][state_no]), axis=0)
        X = data_temp
        lengths = length_array

        list_of_scored_models = training_report_by_state[state_no]
        model_config_template = model_config_by_state[state_no]

        for idx in range(len(list_of_scored_models)):
            model_id = list_of_scored_models[idx].keys()[0]
            model_score = list_of_scored_models[idx].values()[0]
            model_config = util.bring_model_id_back_to_model_config(
                model_id, model_config_template)
            model_config = _translate_into_new_config_paradigm(model_config)
            model = model_generation.model_factory(model_type, model_config)
            model = model.fit(X, lengths=lengths)

            all_log_curves_of_this_state = []
            curve_owner = []
            for trial_name in trials_group_by_folder_name:
                curve_owner.append(trial_name)
                one_log_curve_of_this_state = util.fast_log_curve_calculation(
                    trials_group_by_folder_name[trial_name][state_no], model)
                all_log_curves_of_this_state.append(one_log_curve_of_this_state)

            np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
            plot_trials_loglik_curves_of_one_state(
                np_matrix_traj_by_time,
                curve_owner,
                state_no,
                os.path.join(figure_save_path,
                             'check_if_score_metric_converge_loglik_curves',
                             'state_%s' % (state_no,)),
                title='state_%s_training_rank_%s_id_%s_score_%s' %
                      (state_no, idx, model_id, model_score))
def run(model_save_path, trials_group_by_folder_name, parsed_options):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    base_dir = os.path.dirname(os.path.realpath(__file__))
    exp_dir = os.path.join(base_dir, 'experiment_output',
                           'test_if_parallelity_can_be_restored')

    output_id = '(tamper_input)'
    tampered = False
    if parsed_options.tamper_transmat:
        output_id += '_(tamper_transmat)'
        tampered = True
    if parsed_options.tamper_startprob:
        output_id += '_(tamper_startprob)'
        tampered = True

    output_dir = os.path.join(exp_dir, output_id)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for state_no in model_group_by_state:
        X = one_trial_data_group_by_state[state_no]

        list_of_growing_viterbi_paths, n_samples, n_components = \
            util.fast_growing_viterbi_paths_cal(X, model_group_by_state[state_no])
        list_of_lock_t, n_samples, n_components = \
            util.fast_viterbi_lock_t_cal(X, model_group_by_state[state_no])
        util.output_growing_viterbi_path_img(
            list_of_growing_viterbi_paths,
            n_components,
            os.path.join(
                output_dir,
                'check_if_viterbi_path_grow_incrementally_state_%s.png' % state_no),
            list_of_lock_t,
        )
        util.visualize_viterbi_alog(
            X, model_group_by_state[state_no],
            os.path.join(output_dir,
                         'state %s visualized viterbi alog.png' % (state_no,)))

        all_Xs = [trials_group_by_folder_name[trial_name][state_no]
                  for trial_name in trials_group_by_folder_name]
        tampered_X, list_of_tampered_range = tamper_input_mat(X.copy(), all_Xs)

        model = model_group_by_state[state_no]
        profile_model(model, output_dir, 'state %s raw' % (state_no,))
        if parsed_options.tamper_transmat:
            tamper_transmat(model)
        if parsed_options.tamper_startprob:
            tamper_startprob(model)
        if tampered:
            profile_model(model, output_dir, 'state %s tampered' % (state_no,))

        log_transmat = util.get_log_transmat(model)

        log_lik_of_X = np.array(util.fast_log_curve_calculation(X, model))
        framelogprob_of_X = np.array(util.get_emission_log_prob_matrix(X, model))
        fwdlattice_of_X = util.get_hidden_state_log_prob_matrix(X, model)
        max_hstate_of_X = fwdlattice_of_X.argmax(1)
        # emission prob of the dominant hidden state plus the transition into it
        the_term_of_X = [framelogprob_of_X[0][max_hstate_of_X[0]]]
        for t in range(1, len(max_hstate_of_X)):
            hs1 = max_hstate_of_X[t - 1]
            hs2 = max_hstate_of_X[t]
            the_term_of_X.append(framelogprob_of_X[t][hs2] + log_transmat[hs1][hs2])
        profile_log_curve_cal(X, model, output_dir, 'state %s X' % (state_no,),
                              list_of_tampered_range)

        log_lik_of_tampered_X = np.array(
            util.fast_log_curve_calculation(tampered_X, model))
        framelogprob_of_tampered_X = np.array(
            util.get_emission_log_prob_matrix(tampered_X, model))
        fwdlattice_of_tampered_X = util.get_hidden_state_log_prob_matrix(
            tampered_X, model)
        max_hstate_of_tampered_X = fwdlattice_of_tampered_X.argmax(1)
        the_term_of_tampered_X = [
            framelogprob_of_tampered_X[0][max_hstate_of_tampered_X[0]]
        ]
        for t in range(1, len(max_hstate_of_tampered_X)):
            hs1 = max_hstate_of_tampered_X[t - 1]
            hs2 = max_hstate_of_tampered_X[t]
            the_term_of_tampered_X.append(
                framelogprob_of_tampered_X[t][hs2] + log_transmat[hs1][hs2])
        profile_log_curve_cal(tampered_X, model, output_dir,
                              'state %s tampered_X' % (state_no,),
                              list_of_tampered_range)

        # first derivative of both log-likelihood curves
        deri_of_X = log_lik_of_X.copy()
        deri_of_X[1:] = log_lik_of_X[1:] - log_lik_of_X[:-1]
        deri_of_X[0] = 0
        deri_of_tampered_X = log_lik_of_tampered_X.copy()
        deri_of_tampered_X[1:] = log_lik_of_tampered_X[1:] - log_lik_of_tampered_X[:-1]
        deri_of_tampered_X[0] = 0

        fig = plt.figure()
        bbox_extra_artists = []

        ax = fig.add_subplot(411)
        ax.set_title("log lik")
        ax.plot(log_lik_of_X, color='black', marker='None', linestyle='solid',
                label='Normal')
        ax.plot(log_lik_of_tampered_X, color='blue', marker='None',
                linestyle='solid', label='Tampered')
        for r in list_of_tampered_range:
            ax.axvspan(r[0], r[1], facecolor='red', alpha=0.5)
        lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        bbox_extra_artists.append(lgd)

        ax = fig.add_subplot(412)
        ax.set_title("1st deri")
        ax.plot(deri_of_X, color='black', marker='None', linestyle='solid',
                label='Normal')
        ax.plot(deri_of_tampered_X, color='blue', marker='None',
                linestyle='solid', label='Tampered')
        for r in list_of_tampered_range:
            ax.axvspan(r[0], r[1], facecolor='red', alpha=0.5)
        lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        bbox_extra_artists.append(lgd)

        ax = fig.add_subplot(413)
        ax.set_title("1st deri and max emission prob of Normal")
        ax.plot(deri_of_X, color='black', marker='None', linestyle='solid',
                label='Normal 1st deri')
        ax.plot(the_term_of_X, color='red', marker='None', linestyle='solid',
                label='Normal the term')
        for r in list_of_tampered_range:
            ax.axvspan(r[0], r[1], facecolor='red', alpha=0.5)
        lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        bbox_extra_artists.append(lgd)

        ax = fig.add_subplot(414)
        ax.set_title("1st deri and max emission prob of Tampered")
        ax.plot(deri_of_tampered_X, color='blue', marker='None',
                linestyle='solid', label='Tampered 1st deri')
        ax.plot(the_term_of_tampered_X, color='red', marker='None',
                linestyle='solid', label='Tampered the term')
        for r in list_of_tampered_range:
            ax.axvspan(r[0], r[1], facecolor='red', alpha=0.5)
        lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        bbox_extra_artists.append(lgd)

        title = "output_id %s state %s" % (output_id, state_no)
        fig.suptitle(title)
        plt.tight_layout()
        fig.savefig(os.path.join(output_dir, title + ".eps"), format="eps",
                    bbox_extra_artists=bbox_extra_artists, bbox_inches='tight')
        fig.savefig(os.path.join(output_dir, title + ".png"), format="png",
                    bbox_extra_artists=bbox_extra_artists, bbox_inches='tight')
def run():
    # load the dataset
    output_dir = os.path.join(training_config.anomaly_data_path, 'synthetic_data')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    plt.subplot(111)
    testing_ratio_list = np.arange(0.9, 0.4, -0.1)
    num_syn_data_list = range(2, 22, 2)
    for testing_ratio in testing_ratio_list:
        x_train, x_test = load_dataset(testing_ratio=testing_ratio)
        n_real = len(x_train)

        # calculate the threshold for identification based on real training trials
        print 'calculate the threshold for identification based on real training trials'
        lengths = []
        for idx in range(len(x_train)):
            lengths.append(x_train[idx].shape[0])
            if idx == 0:
                train_data = x_train[idx]
            else:
                train_data = np.concatenate((train_data, x_train[idx]), axis=0)
        best_model, model_id = hmm_model_training.train_hmm_model(train_data, lengths)

        all_log_curves = []
        for itrial in range(len(x_train)):
            one_log_curve = util.fast_log_curve_calculation(
                x_train[itrial], best_model['model'])
            all_log_curves.append(one_log_curve)
        np_matrix_of_all_log_curves = np.matrix(all_log_curves)
        '''
        plt.figure()
        plt.subplot(111)
        plt.title('All log likelihood curves and calculated threshold')
        for no in range(np_matrix_of_all_log_curves.shape[0]):
            plt.plot(np_matrix_of_all_log_curves[no].tolist()[0], linestyle='--',
                     color='gray', label='testing_trial')
        colors = iter(cm.rainbow(np.linspace(0, 1, 5)))
        for c in np.arange(0, 10, 2):
            plt.plot((np_matrix_of_all_log_curves.mean(0)
                      - c * np_matrix_of_all_log_curves.std(0)).tolist()[0],
                     label='mean-%s*std' % (c,), linestyle='solid',
                     color=next(colors))
        plt.legend(loc='best')
        '''
        # threshold curve: mean - c * std of the training log-likelihood curves
        threshold_c = 3
        threshold_for_log_likelihood = (
            np_matrix_of_all_log_curves.mean(0)
            - threshold_c * np_matrix_of_all_log_curves.std(0)).tolist()[0]

        # train the model with data augmentation and test it
        print "train the model with data augmentation and test it"
        acc_list = []
        for num_data in num_syn_data_list:
            old_files = glob.glob(os.path.join(output_dir, '*'))
            for old_file in old_files:
                os.remove(old_file)
            for i in range(len(x_train)):
                print ('Generating synthetic data from real_{0}'.format(i))
                df = pd.DataFrame(x_train[i],
                                  columns=training_config.interested_data_fields)
                df.to_csv(os.path.join(output_dir, 'real_' + str(i) + '.csv'))
                generate_synthetic_data.run_finite_differece_matrix(
                    df=df,
                    num_data=num_data,
                    csv_save_path=output_dir,
                    trial_name='real_' + str(i))
                # generate_synthetic_data.run_bootstrap(
                #     df=df,
                #     num_data=num_data,
                #     csv_save_path=output_dir,
                #     trial_name='real_' + str(i))
                # generate_synthetic_data.run_sampling_from_trained_hmm_model(
                #     df=df,
                #     model=best_model['model'],
                #     num_data=num_data,
                #     csv_save_path=output_dir,
                #     trial_name='sampling_from_trained_model')
                # generate_synthetic_data.cross_signals_difference_with_weights(
                #     x_train=x_train,
                #     total_num_data=num_data * len(train_data),
                #     csv_save_path=output_dir,
                #     trial_name='cross_diff')

            # train model with synthetic data
            anomaly_data_group_by_folder_name = util.get_anomaly_data_for_labelled_case(
                training_config, output_dir)
            list_of_trials = anomaly_data_group_by_folder_name.values()
            lengths = []
            for idx in range(len(list_of_trials)):
                lengths.append(list_of_trials[idx][1].shape[0])
                if idx == 0:
                    train_data = list_of_trials[idx][1]
                else:
                    train_data = np.concatenate(
                        (train_data, list_of_trials[idx][1]), axis=0)
            best_model, model_id = hmm_model_training.train_hmm_model(train_data, lengths)

            # count test trials whose final log-likelihood clears the threshold
            identified = 0.0
            for itest in range(len(x_test)):
                one_log_curve = util.fast_log_curve_calculation(
                    x_test[itest], best_model['model'])
                if one_log_curve[-1] > threshold_for_log_likelihood[-1]:
                    identified += 1
            idfy_rate = identified / len(x_test)
            acc_list.append(idfy_rate)
            print idfy_rate
        plt.plot(num_syn_data_list, acc_list, 'o-', label='n_real=' + str(n_real))

    plt.title('Anomaly identification accuracy vs num synthetic data')
    plt.xlabel('Synthetic trials for each real trial')
    plt.ylabel('Identification accuracy')
    plt.legend()
    plt.savefig('./images/Anomaly_identification_accuracy_vs_num_synthetic_data.eps',
                format='eps', dpi=300)
    plt.show()
def run(model_save_path, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    expected_log = {}
    std_of_log = {}
    threshold = {}
    for state_no in model_group_by_state:
        compute_score_time_cost = 0
        total_step_times = 0

        all_log_curves_of_this_state = []
        curve_owner = []
        for trial_name in trials_group_by_folder_name:
            curve_owner.append(trial_name)
            start_time = time.time()
            one_log_curve_of_this_state = util.fast_log_curve_calculation(
                trials_group_by_folder_name[trial_name][state_no],
                model_group_by_state[state_no])
            compute_score_time_cost += time.time() - start_time
            total_step_times += len(trials_group_by_folder_name[trial_name][state_no])
            all_log_curves_of_this_state.append(one_log_curve_of_this_state)

        # use np.matrix to facilitate the computation of the mean curve and std
        np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
        mean_of_log_curve = np_matrix_traj_by_time.mean(0)
        std_of_log_curve = np_matrix_traj_by_time.std(0)
        score_time_cost_per_point = float(compute_score_time_cost) / total_step_times

        decided_threshold_log_curve = assess_threshold_and_decide(
            threshold_c_value, mean_of_log_curve, std_of_log_curve,
            np_matrix_traj_by_time, curve_owner, state_no, figure_save_path,
            score_time_cost_per_point)
        expected_log[state_no] = mean_of_log_curve.tolist()[0]
        threshold[state_no] = decided_threshold_log_curve.tolist()[0]
        std_of_log[state_no] = std_of_log_curve.tolist()[0]

    if not os.path.isdir(model_save_path):
        os.makedirs(model_save_path)
    joblib.dump(expected_log, model_save_path + "/mean_of_log_likelihood.pkl")
    joblib.dump(threshold, model_save_path + "/threshold_for_log_likelihood.pkl")
    joblib.dump(std_of_log, model_save_path + "/std_of_log_likelihood.pkl")
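# Note (added): assess_threshold_and_decide is assumed here to return a
# per-time-step threshold curve of the form mean - c * std, the same rule the
# synthetic-data experiment above applies explicitly; a minimal stand-in under
# that assumption:
def simple_threshold_curve(np_matrix_traj_by_time, threshold_c_value):
    # (trials x time) log-likelihood matrix -> (1 x time) threshold curve
    return (np_matrix_traj_by_time.mean(0)
            - threshold_c_value * np_matrix_traj_by_time.std(0))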
def run(model_save_path, figure_save_path, trials_group_by_folder_name,
        state_order_group_by_folder_name, parsed_options):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            model_group_by_state[state_no] = joblib.load(
                model_save_path + "/model_s%s.pkl" % (state_no,))
        except IOError:
            print 'model of state %s not found' % (state_no,)
            continue

    state_color = {}
    color = iter(cm.rainbow(np.linspace(0, 1, state_amount)))
    for state_no in model_group_by_state:
        state_color[state_no] = color.next()

    output_dir = os.path.join(figure_save_path,
                              'test_if_gradient_can_detect_state_switch')
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    trial_amount = len(trials_group_by_folder_name)
    subplot_amount_for_each_trial = 2
    subplot_per_row = 1
    subplot_amount = trial_amount * subplot_amount_for_each_trial
    row_amount = int(math.ceil(float(subplot_amount) / subplot_per_row))
    fig, ax_mat = plt.subplots(nrows=row_amount, ncols=subplot_per_row)
    if row_amount == 1:
        ax_mat = ax_mat.reshape(1, -1)
    if subplot_per_row == 1:
        ax_mat = ax_mat.reshape(-1, 1)

    ax_list = []
    for i in range(trial_amount):
        for k in range(subplot_amount_for_each_trial):
            j = subplot_amount_for_each_trial * i + k
            row_no = j / subplot_per_row
            col_no = j % subplot_per_row
            ax_list.append(ax_mat[row_no, col_no])

    trial_count = -1
    for trial_name in trials_group_by_folder_name:
        trial_count += 1

        # concatenate the trial's state segments in their executed order
        X = None
        state_start_idx = [0]
        state_order = state_order_group_by_folder_name[trial_name]
        for state_no in state_order:
            if X is None:
                X = trials_group_by_folder_name[trial_name][state_no]
            else:
                X = np.concatenate(
                    (X, trials_group_by_folder_name[trial_name][state_no]), axis=0)
            state_start_idx.append(len(X))

        plot_idx = trial_count * 2
        ax_loglik = ax_list[plot_idx]
        ax_loglik_gradient = ax_list[plot_idx + 1]
        color_bg_by_state(state_order, state_color, state_start_idx, ax_loglik)
        color_bg_by_state(state_order, state_color, state_start_idx, ax_loglik_gradient)

        log_lik_mat = []
        log_lik_gradient_mat = []
        mat_row_color = []
        mat_row_name = []
        for state_no in model_group_by_state:
            log_lik_curve = np.array(
                util.fast_log_curve_calculation(X, model_group_by_state[state_no]))
            log_lik_gradient_curve = log_lik_curve[1:] - log_lik_curve[:-1]
            log_lik_mat.append(log_lik_curve)
            log_lik_gradient_mat.append(log_lik_gradient_curve)
            mat_row_color.append(state_color[state_no])
            mat_row_name.append('state %s' % (state_no,))
        log_lik_mat = np.matrix(log_lik_mat)
        log_lik_gradient_mat = np.matrix(log_lik_gradient_mat)
        # clip negative gradients so only likelihood increases are visible
        log_lik_gradient_mat[log_lik_gradient_mat < 0] = 0

        for row_no in range(log_lik_mat.shape[0]):
            ax_loglik.plot(log_lik_mat[row_no].tolist()[0],
                           label=mat_row_name[row_no],
                           color=mat_row_color[row_no])
            ax_loglik_gradient.plot(log_lik_gradient_mat[row_no].tolist()[0],
                                    label=mat_row_name[row_no],
                                    color=mat_row_color[row_no])

        title = "log-likelihood of %s HMM models" % state_amount
        ax_loglik.set_title(title)
        ax_loglik.set_ylabel('log probability')
        ax_loglik.set_xlabel('time step')

        title = "gradient of log-likelihood of %s HMM models" % state_amount
        ax_loglik_gradient.set_title(title)
        ax_loglik_gradient.set_ylabel('log probability')
        ax_loglik_gradient.set_xlabel('time step')

    fig.set_size_inches(8 * subplot_per_row, 2 * row_amount)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir,
                             "test_if_gradient_can_detect_state_switch.png"),
                format="png")
    fig.savefig(os.path.join(output_dir,
                             "test_if_gradient_can_detect_state_switch.eps"),
                format="eps")