def run(list_of_train_mat,
        list_of_test_mat,
        model_type,
        model_config,
        score_metric,
        logger=None):
    if logger is None:
        logger = logging.getLogger('birl_hmm_train_model')

    list_of_train_mat = np.array(list_of_train_mat)
    list_of_test_mat = np.array(list_of_test_mat)

    tried_models = []
    model_generator = model_generation.get_model_generator(model_type, model_config)
    for raw_model, model_config in model_generator:
        logger.debug('-' * 20)
        logger.debug(' working on config: %s' % model_config)

        try:
            kf = KFold(n_splits=3, shuffle=True)
            scores = []
            for cv_train_index, cv_test_index in kf.split(list_of_train_mat):
                list_of_cv_train_mat = (list_of_train_mat.copy())[cv_train_index]
                list_of_cv_test_mat = (list_of_train_mat.copy())[cv_test_index]

                cv_train_lengths = [i.shape[0] for i in list_of_cv_train_mat]
                cv_train_lengths[-1] -= 1  # for autoregressive observation
                cv_train_X = np.concatenate(list_of_cv_train_mat, axis=0)

                cv_test_lengths = [i.shape[0] for i in list_of_cv_test_mat]
                cv_test_X = np.concatenate(list_of_cv_test_mat, axis=0)

                model = model_generation.model_factory(model_type, model_config)
                model = model.fit(cv_train_X, lengths=cv_train_lengths)
                score = model_score.score(score_metric, model, cv_test_X, cv_test_lengths)
                if score is None:
                    raise Exception("scorer says to skip this model")
                else:
                    scores.append(score)
        except Exception as e:
            logger.error("Failed to run CV on this model: %s" % e)
            logger.error("traceback: %s" % traceback.format_exc())
            continue

        tried_models.append({
            "model": model,
            "model_config": model_config,
            "cv_score_mean": np.mean(scores),
            "cv_score_std": np.std(scores),
        })
        logger.debug('score: %s' % score)
        logger.debug('=' * 20)

    if len(tried_models) == 0:
        raise Exception("All models tried failed to train.")

    tried_models = sorted(tried_models, key=lambda x: x['cv_score_mean'])
    best_model = tried_models[0]['model']
    test_score = tried_models[0]['cv_score_mean']
    return best_model, test_score, tried_models
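# --- Hedged usage sketch for the run() defined directly above (not part of the original module). ---
# It only illustrates the expected shapes: each trial is an (n_samples, n_features) matrix and
# run() concatenates the k-fold subsets internally. The model_type string, the model_config grid
# and the score_metric value are hypothetical placeholders; the real values must be whatever
# model_generation.get_model_generator() and model_score.score() accept in this repository.
def _run_cv_usage_sketch():
    import logging
    import numpy as np
    list_of_train_mat = [np.random.randn(120, 8) for _ in range(6)]  # 6 trials, 8 features each
    list_of_test_mat = [np.random.randn(120, 8) for _ in range(2)]
    best_model, test_score, tried_models = run(
        list_of_train_mat,
        list_of_test_mat,
        model_type='hmmlearn',                      # hypothetical placeholder
        model_config={'n_components': [3, 5, 7]},   # hypothetical placeholder
        score_metric='log_likelihood',              # hypothetical placeholder
        logger=logging.getLogger('birl_hmm_train_model'),
    )
    return best_model, test_score, tried_models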
def fit(X, y, class_names):
    '''
    function: train all the anomaly models, one per anomaly class
    '''
    for model_name in class_names:
        indices = [i for i, label in enumerate(y) if label == model_name]
        train_data = [X[i] for i in indices]
        model_list, lengths = [], []
        for i in range(len(train_data)):
            lengths.append(len(train_data[i]))
        try:
            train_data = np.concatenate(train_data)
        except ValueError:
            print('Oops! Something went wrong while concatenating the training data...')
            ipdb.set_trace()
        lengths[-1] -= 1

        model_generator = model_generation.get_model_generator(
            training_config.model_type_chosen, training_config.model_config)
        for model, now_model_config in model_generator:
            model = model.fit(train_data, lengths=lengths)  # n_samples, n_features
            score = model_score.score(training_config.score_metric, model,
                                      train_data, lengths)
            if score is None:
                print "scorer says to skip this model, will do"
                continue
            model_list.append({
                "model": model,
                "now_model_config": now_model_config,
                "score": score
            })
            print 'score:', score
            model_generation.update_now_score(score)

        sorted_model_list = sorted(model_list, key=lambda x: x['score'])
        best = sorted_model_list[0]
        model_id = util.get_model_config_id(best['now_model_config'])

        anomaly_model_path = os.path.join(
            training_config.anomaly_model_save_path,
            model_name,
            training_config.config_by_user['data_type_chosen'],
            training_config.config_by_user['model_type_chosen'],
            training_config.model_id)
        if not os.path.isdir(anomaly_model_path):
            os.makedirs(anomaly_model_path)
        joblib.dump(
            best['model'],
            os.path.join(anomaly_model_path, "model_s%s.pkl" % (1, )))
def train_hmm_model(train_data, lengths):
    model_list = []
    lengths[-1] -= 1
    model_generator = model_generation.get_model_generator(
        training_config.model_type_chosen, training_config.model_config)
    for model, now_model_config in model_generator:
        model = model.fit(train_data, lengths=lengths)  # n_samples, n_features
        score = model_score.score(training_config.score_metric, model,
                                  train_data, lengths)
        if score is None:
            print "scorer says to skip this model, will do"
            continue
        model_list.append({
            "model": model,
            "now_model_config": now_model_config,
            "score": score
        })
        print 'score:', score
        model_generation.update_now_score(score)
    sorted_model_list = sorted(model_list, key=lambda x: x['score'])
    best_model = sorted_model_list[0]
    model_id = util.get_model_config_id(best_model['now_model_config'])
    return best_model, model_id
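# --- Hedged usage sketch for train_hmm_model() (not part of the original module). ---
# The function expects all training trials concatenated along the time axis plus a per-trial
# lengths list, i.e. the hmmlearn-style (n_samples, n_features) convention used throughout this
# module. The toy data below is purely illustrative.
def _train_hmm_model_usage_sketch():
    import numpy as np
    trials = [np.random.randn(100, 7) for _ in range(5)]  # 5 trials, 7 features each
    lengths = [t.shape[0] for t in trials]
    train_data = np.concatenate(trials, axis=0)
    best_model, model_id = train_hmm_model(train_data, lengths)
    return best_model, model_id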
def generate_performance_logging_report_with_varible_model_parameters():
    import model_generation

    # load the train/test/labels files
    TRAIN_TEST_DATASET_PATH = training_config.anomaly_data_path
    x_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_train.npy")
    y_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_train.npy")
    x_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_test.npy")
    y_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_test.npy")
    labels_path = os.path.join(TRAIN_TEST_DATASET_PATH, "labels_list.npy")
    try:
        x_train = np.load(x_train_path)
        y_train = np.load(y_train_path)
        x_test = np.load(x_test_path)
        y_test = np.load(y_test_path)
        labels = np.load(labels_path)
    except IOError:
        print('Error occurred while reading the dataset, please check the path: '
              + TRAIN_TEST_DATASET_PATH)
        sys.exit()

    x_train = x_train.transpose((0, 2, 1))
    x_test = x_test.transpose((0, 2, 1))
    y_train = y_train.reshape(-1, ).tolist()
    y_test = y_test.reshape(-1, ).tolist()
    class_names = labels.tolist()

    # group the training sequences by class and record their per-trial lengths
    train_data_by_class = {}
    train_lengths_by_class = {}
    for idx, class_name in enumerate(class_names):
        indices = [i for i, label in enumerate(y_train) if label == idx]
        train_data = x_train[indices]
        lengths = []
        for i in range(len(train_data)):
            lengths.append(train_data[i].shape[0])
            if i == 0:
                data_tempt = train_data[i]
            else:
                data_tempt = np.concatenate((data_tempt, train_data[i]), axis=0)
        train_data = data_tempt
        train_data_by_class[class_name] = train_data
        lengths[-1] -= 1
        train_lengths_by_class[class_name] = lengths

    model_generator = model_generation.get_model_generator(
        training_config.model_type_chosen, training_config.model_config)
    for model, now_model_config in model_generator:
        logger.info(now_model_config)
        model_collection_for_all_classes = {}
        for idx, _name in enumerate(class_names):
            fitted_model = model.fit(
                train_data_by_class[_name],
                lengths=train_lengths_by_class[_name])  # n_samples, n_features
            # dump the fitted model and load it back before scoring;
            # unclear why this round-trip is needed, but it works
            anomaly_model_path = os.path.join(
                training_config.anomaly_model_save_path,
                'temp_classification_report_model', _name)
            if not os.path.isdir(anomaly_model_path):
                os.makedirs(anomaly_model_path)
            joblib.dump(
                fitted_model,
                os.path.join(anomaly_model_path, "model_s%s.pkl" % (1, )))
            model_collection_for_all_classes[_name] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1, ))

        # classify each test sequence by the model with the highest cumulative log-likelihood
        y_pred = []
        for i in range(len(x_test)):
            calc_confidence_resource = []
            for idx, model_label in enumerate(class_names):
                one_log_curve_of_this_model = util.fast_log_curve_calculation(
                    x_test[i], model_collection_for_all_classes[model_label])
                calc_confidence_resource.append({
                    'model_idx': idx,
                    'model_label': model_label,
                    'cumulative_loglik': one_log_curve_of_this_model[-1],
                })
            sorted_list = sorted(calc_confidence_resource,
                                 key=lambda x: x['cumulative_loglik'])
            optimal_result = sorted_list[-1]
            classified_idx = optimal_result['model_idx']
            y_pred.append(classified_idx)

        # per-class precision/recall/F1 report
        _clf_report = classification_report(
            y_test, y_pred, target_names=[l for l in class_names])
        logger.info(_clf_report)
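# --- Hedged sketch of the classification rule used above (not part of the original module). ---
# generate_performance_logging_report_with_varible_model_parameters() labels each test sequence
# with the class whose HMM yields the highest cumulative log-likelihood (the last element of the
# log-likelihood curve returned by util.fast_log_curve_calculation()). The toy version below
# assumes models_by_class maps each class label to a fitted hmmlearn-style model exposing
# .score(X); it is a stand-in for the repository's own model objects, not a drop-in replacement.
def _classify_by_max_loglik_sketch(models_by_class, test_sequence):
    """Return the class label whose model scores the (n_samples, n_features) sequence highest."""
    best_label, best_loglik = None, float('-inf')
    for label, model in models_by_class.items():
        loglik = model.score(test_sequence)  # cumulative log-likelihood of the whole sequence
        if loglik > best_loglik:
            best_label, best_loglik = label, loglik
    return best_label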
def run(model_save_path, model_type, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)
    list_of_trials = trials_group_by_folder_name.values()

    one_trial_data_group_by_state = trials_group_by_folder_name.itervalues().next()
    state_amount = len(one_trial_data_group_by_state)

    training_report_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            training_report_by_state[state_no] = json.load(
                open(
                    model_save_path + "/model_s%s_training_report.json" % (state_no, ),
                    'r'))
        except IOError:
            print 'training report of state %s not found' % (state_no, )
            continue

    model_config_by_state = {}
    for state_no in training_report_by_state:
        best_model_record = training_report_by_state[state_no][0]
        best_model_id = best_model_record.keys()[0]
        model_config_by_state[state_no] = joblib.load(
            model_save_path + "/model_s%s_config_%s.pkl" % (state_no, best_model_id))

    training_data_group_by_state = {}
    training_length_array_group_by_state = {}
    for state_no in training_report_by_state:
        length_array = []
        for trial_no in range(len(list_of_trials)):
            length_array.append(list_of_trials[trial_no][state_no].shape[0])
            if trial_no == 0:
                data_tempt = list_of_trials[trial_no][state_no]
            else:
                data_tempt = np.concatenate(
                    (data_tempt, list_of_trials[trial_no][state_no]), axis=0)
        X = data_tempt
        lengths = length_array

        list_of_scored_models = training_report_by_state[state_no]
        model_config_template = model_config_by_state[state_no]

        for idx in range(len(list_of_scored_models)):
            model_id = list_of_scored_models[idx].keys()[0]
            model_score = list_of_scored_models[idx].values()[0]
            model_config = util.bring_model_id_back_to_model_config(
                model_id, model_config_template)
            model_generator = model_generation.get_model_generator(
                model_type, model_config)
            model, trash = next(model_generator)
            model = model.fit(X, lengths=lengths)

            all_log_curves_of_this_state = []
            curve_owner = []
            for trial_name in trials_group_by_folder_name:
                curve_owner.append(trial_name)
                one_log_curve_of_this_state = util.fast_log_curve_calculation(
                    trials_group_by_folder_name[trial_name][state_no],
                    model,
                )
                all_log_curves_of_this_state.append(one_log_curve_of_this_state)

            np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)
            plot_trials_loglik_curves_of_one_state(
                np_matrix_traj_by_time,
                curve_owner,
                state_no,
                os.path.join(figure_save_path,
                             'check_if_score_metric_converge_loglik_curves',
                             'state_%s' % (state_no, )),
                title='state_%s_training_rank_%s_id_%s_score_%s' %
                (state_no, idx, model_id, model_score))
def run(model_save_path, model_type, model_config, score_metric,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)
    list_of_trials = trials_group_by_folder_name.values()
    trials_amount = len(trials_group_by_folder_name)

    if not os.path.isdir(model_save_path):
        os.makedirs(model_save_path)

    one_trial_data_group_by_state = list_of_trials[0]
    state_amount = len(one_trial_data_group_by_state)

    training_data_group_by_state = {}
    training_length_array_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        length_array = []
        for trial_no in range(len(list_of_trials)):
            length_array.append(list_of_trials[trial_no][state_no].shape[0])
            if trial_no == 0:
                data_tempt = list_of_trials[trial_no][state_no]
            else:
                data_tempt = np.concatenate(
                    (data_tempt, list_of_trials[trial_no][state_no]), axis=0)
        training_data_group_by_state[state_no] = data_tempt
        training_length_array_group_by_state[state_no] = length_array

    for state_no in range(1, state_amount + 1):
        model_list = []
        model_generator = model_generation.get_model_generator(model_type, model_config)
        X = training_data_group_by_state[state_no]
        lengths = training_length_array_group_by_state[state_no]
        lengths[-1] -= 1  # adapt lengths for bnpy's first-order autoregressive Gaussian observation model

        for model, now_model_config in model_generator:
            print
            print '-' * 20
            print 'in state', state_no, ' working on config:', now_model_config

            model = model.fit(X, lengths=lengths)  # n_samples, n_features
            score = model_score.score(score_metric, model, X, lengths)

            if score is None:
                print "scorer says to skip this model, will do"
                continue

            model_list.append({
                "model": model,
                "now_model_config": now_model_config,
                "score": score
            })
            print 'score:', score
            print '=' * 20
            print
            model_generation.update_now_score(score)

        sorted_model_list = sorted(model_list, key=lambda x: x['score'])
        best = sorted_model_list[0]
        model_id = util.get_model_config_id(best['now_model_config'])

        joblib.dump(
            best['model'],
            os.path.join(model_save_path, "model_s%s.pkl" % (state_no, )))
        joblib.dump(
            best['now_model_config'],
            os.path.join(model_save_path,
                         "model_s%s_config_%s.pkl" % (state_no, model_id)))
        joblib.dump(
            None,
            os.path.join(model_save_path,
                         "model_s%s_score_%s.pkl" % (state_no, best['score'])))

        train_report = [{
            util.get_model_config_id(i['now_model_config']): i['score']
        } for i in sorted_model_list]
        import json
        json.dump(
            train_report,
            open(
                os.path.join(model_save_path,
                             "model_s%s_training_report.json" % (state_no)),
                'w'),
            separators=(',\n', ': '))
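# --- Hedged usage sketch (not part of the original module): reloading the per-state models. ---
# The run() above persists the best model of each state as model_s<state_no>.pkl via
# joblib.dump(); a consumer can rebuild the dict of per-state models as sketched below.
# The directory layout follows the dumps above; state_amount must match the value used at
# training time, and the plain `joblib` import is an assumption (the original modules may
# import it differently, e.g. via sklearn.externals).
def _load_models_by_state_sketch(model_save_path, state_amount):
    import os
    import joblib
    model_group_by_state = {}
    for state_no in range(1, state_amount + 1):
        model_group_by_state[state_no] = joblib.load(
            os.path.join(model_save_path, "model_s%s.pkl" % (state_no, )))
    return model_group_by_state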