Example #1
import logging
import traceback

import numpy as np
from sklearn.model_selection import KFold

import model_generation  # repo-local module
import model_score       # repo-local module


def run(
    list_of_train_mat,
    list_of_test_mat,
    model_type,
    model_config,
    score_metric,
    logger=None
):
    if logger is None:
        logger = logging.getLogger('birl_hmm_train_model')

    list_of_train_mat = np.array(list_of_train_mat)
    list_of_test_mat = np.array(list_of_test_mat)

    tried_models = []
    model_generator = model_generation.get_model_generator(model_type, model_config)
    for raw_model, model_config in model_generator:
        logger.debug('-'*20)
        logger.debug(' working on config: %s'%model_config)

        try:
            kf = KFold(n_splits=3, shuffle=True)
            scores = []
            for cv_train_index, cv_test_index in kf.split(list_of_train_mat):
                list_of_cv_train_mat = (list_of_train_mat.copy())[cv_train_index]
                list_of_cv_test_mat = (list_of_train_mat.copy())[cv_test_index]
                cv_train_lengths = [i.shape[0] for i in list_of_cv_train_mat]
                cv_train_lengths[-1] -= 1 #for autoregressive observation
                cv_train_X = np.concatenate(list_of_cv_train_mat, axis=0)
                cv_test_lengths = [i.shape[0] for i in list_of_cv_test_mat]
                cv_test_X = np.concatenate(list_of_cv_test_mat, axis=0)

                model = model_generation.model_factory(model_type, model_config)
                model = model.fit(cv_train_X, lengths=cv_train_lengths)
                score = model_score.score(score_metric, model, cv_test_X, cv_test_lengths)
                    
                if score is None:
                    raise Exception("scorer says to skip this model")
                scores.append(score)
        except Exception as e:
            logger.error("Failed to run CV on this model: %s"%e)
            logger.error("traceback: %s"%traceback.format_exc())
            continue

        tried_models.append({
            "model": model,
            "model_config": model_config,
            "cv_score_mean": np.mean(scores),
            "cv_score_std": np.std(scores),
        })
        logger.debug('cv score: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))
        logger.debug('='*20)

    if len(tried_models) == 0:
        raise Exception("All models tried failed to train.")
    # NOTE: the lowest cv_score_mean is treated as the best model here, and the
    # returned "test_score" is that model's CV mean (list_of_test_mat is unused).
    tried_models = sorted(tried_models, key=lambda x: x['cv_score_mean'])
    best_model = tried_models[0]['model']
    test_score = tried_models[0]['cv_score_mean']
    return best_model, test_score, tried_models
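
The cross-validation loop above follows the concatenated-sequences convention used by hmmlearn-style models: variable-length observation sequences are stacked into one array and their per-sequence lengths are passed to fit() and to the scorer. model_generation and model_score are repo-local modules, so the following is only a minimal, self-contained sketch of that convention, using sklearn's KFold and hmmlearn's GaussianHMM as stand-ins and made-up data.

import numpy as np
from sklearn.model_selection import KFold
from hmmlearn.hmm import GaussianHMM

# Hypothetical data: 9 sequences of varying length, 4 features each.
rng = np.random.RandomState(0)
sequences = [rng.randn(rng.randint(30, 60), 4) for _ in range(9)]

kf = KFold(n_splits=3, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(np.arange(len(sequences))):
    train_seqs = [sequences[i] for i in train_idx]
    test_seqs = [sequences[i] for i in test_idx]

    # Concatenate sequences and record their lengths, as hmmlearn expects.
    train_X = np.concatenate(train_seqs, axis=0)
    train_lengths = [s.shape[0] for s in train_seqs]
    test_X = np.concatenate(test_seqs, axis=0)
    test_lengths = [s.shape[0] for s in test_seqs]

    model = GaussianHMM(n_components=3, covariance_type='diag', n_iter=20)
    model.fit(train_X, lengths=train_lengths)
    print('held-out log-likelihood:', model.score(test_X, lengths=test_lengths))
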
Example #2
def fit(X, y, class_names):
    '''
    Train an anomaly model for each class in class_names and save the
    best-scoring model for each class to disk.
    '''
    for model_name in class_names:
        indices = [i for i, label in enumerate(y) if label == model_name]
        train_data = [X[i] for i in indices]
        lengths = [len(seq) for seq in train_data]
        model_list = []
        try:
            train_data = np.concatenate(train_data)
        except ValueError:
            print('Could not concatenate training data for class %s; '
                  'check that every sequence has the same feature dimension.' % model_name)
            raise
        lengths[-1] -= 1  # for the autoregressive observation model
        model_generator = model_generation.get_model_generator(
            training_config.model_type_chosen, training_config.model_config)
        for model, now_model_config in model_generator:
            model = model.fit(train_data,
                              lengths=lengths)  # n_samples, n_features
            score = model_score.score(training_config.score_metric, model,
                                      train_data, lengths)
            if score is None:
                print("scorer says to skip this model; skipping")
                continue
            model_list.append({
                "model": model,
                "now_model_config": now_model_config,
                "score": score
            })
            print('score:', score)
            model_generation.update_now_score(score)
        sorted_model_list = sorted(model_list, key=lambda x: x['score'])

        best = sorted_model_list[0]
        model_id = util.get_model_config_id(best['now_model_config'])

        anomaly_model_path = os.path.join(
            training_config.anomaly_model_save_path, model_name,
            training_config.config_by_user['data_type_chosen'],
            training_config.config_by_user['model_type_chosen'],
            model_id)

        if not os.path.isdir(anomaly_model_path):
            os.makedirs(anomaly_model_path)

        joblib.dump(best['model'],
                    os.path.join(anomaly_model_path, "model_s%s.pkl" % (1, )))
Example #3
def train_hmm_model(train_data, lengths):
    model_list = []
    lengths[-1] -= 1  # for the autoregressive observation model
    model_generator = model_generation.get_model_generator(
        training_config.model_type_chosen, training_config.model_config)
    for model, now_model_config in model_generator:
        model = model.fit(train_data, lengths=lengths)  # n_samples, n_features
        score = model_score.score(training_config.score_metric, model,
                                  train_data, lengths)
        if score is None:
            print("scorer says to skip this model; skipping")
            continue
        model_list.append({
            "model": model,
            "now_model_config": now_model_config,
            "score": score
        })
        print('score:', score)
        model_generation.update_now_score(score)
    sorted_model_list = sorted(model_list, key=lambda x: x['score'])
    best_model = sorted_model_list[0]
    model_id = util.get_model_config_id(best_model['now_model_config'])
    return best_model, model_id
Example #4
def generate_performance_logging_report_with_varible_model_parameters():
    import model_generation
    # load the train/test/labels file
    TRAIN_TEST_DATASET_PATH = training_config.anomaly_data_path
    x_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_train.npy")
    y_train_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_train.npy")
    x_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "X_test.npy")
    y_test_path = os.path.join(TRAIN_TEST_DATASET_PATH, "y_test.npy")
    labels_path = os.path.join(TRAIN_TEST_DATASET_PATH, "labels_list.npy")
    try:
        x_train = np.load(x_train_path)
        y_train = np.load(y_train_path)
        x_test = np.load(x_test_path)
        y_test = np.load(y_test_path)
        labels = np.load(labels_path)
    except IOError:
        print(
            'Error occurred while trying to read the dataset files, please check the path: ' +
            TRAIN_TEST_DATASET_PATH)
        sys.exit()
    x_train = x_train.transpose((0, 2, 1))
    x_test = x_test.transpose((0, 2, 1))
    y_train = y_train.reshape(-1, ).tolist()
    y_test = y_test.reshape(-1, ).tolist()
    class_names = labels.tolist()

    train_data_by_class = {}
    train_lengths_by_class = {}
    for idx, class_name in enumerate(class_names):
        indices = [i for i, label in enumerate(y_train) if label == idx]
        train_data = x_train[indices]
        lengths = [seq.shape[0] for seq in train_data]
        train_data = np.concatenate(train_data, axis=0)
        train_data_by_class[class_name] = train_data
        lengths[-1] -= 1
        train_lengths_by_class[class_name] = lengths

    model_generator = model_generation.get_model_generator(
        training_config.model_type_chosen, training_config.model_config)
    for model, now_model_config in model_generator:
        logger.info(now_model_config)
        model_collection_for_all_classes = {}
        for idx, _name in enumerate(class_names):
            fitted_model = model.fit(
                train_data_by_class[_name],
                lengths=train_lengths_by_class[_name])  # n_samples, n_features
            # dump the fitted model and reload it via joblib before use
            anomaly_model_path = os.path.join(
                training_config.anomaly_model_save_path,
                'temp_classification_report_model', _name)
            if not os.path.isdir(anomaly_model_path):
                os.makedirs(anomaly_model_path)
            joblib.dump(
                fitted_model,
                os.path.join(anomaly_model_path, "model_s%s.pkl" % (1, )))
            model_collection_for_all_classes[_name] = joblib.load(
                anomaly_model_path + "/model_s%s.pkl" % (1, ))

        y_pred = []
        for i in range(len(x_test)):
            candidate_scores = []
            for idx, model_label in enumerate(class_names):
                one_log_curve_of_this_model = util.fast_log_curve_calculation(
                    x_test[i], model_collection_for_all_classes[model_label])
                candidate_scores.append({
                    'model_idx': idx,
                    'model_label': model_label,
                    'cumulative_loglik': one_log_curve_of_this_model[-1],
                })
            # classify as the class whose model gives the highest cumulative log-likelihood
            best_candidate = max(candidate_scores, key=lambda x: x['cumulative_loglik'])
            y_pred.append(best_candidate['model_idx'])
        # for confusion matrix
        _clf_report = classification_report(
            y_test, y_pred, target_names=class_names)
        logger.info(_clf_report)
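
The classification rule above picks the class whose HMM assigns the test sequence the highest cumulative log-likelihood (util.fast_log_curve_calculation is repo-local; its last value plays the role of the total log-likelihood). Below is a minimal stand-alone sketch of the same rule, using hmmlearn's score() as a stand-in and made-up data.

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(2)
class_names = ['normal', 'anomalous']

# Hypothetical per-class training sequences and one test sequence.
train = {name: rng.randn(200, 3) + shift
         for name, shift in zip(class_names, (0.0, 2.0))}
test_seq = rng.randn(50, 3) + 2.0  # drawn near the "anomalous" class

models = {}
for name in class_names:
    models[name] = GaussianHMM(n_components=2, n_iter=20).fit(train[name])

# score() returns the total log-likelihood of the sequence under each model;
# the predicted class is the argmax over classes.
loglik = {name: models[name].score(test_seq) for name in class_names}
predicted = max(loglik, key=loglik.get)
print(loglik, '->', predicted)
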
Example #5
def run(model_save_path, model_type, figure_save_path, threshold_c_value,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)
    list_of_trials = list(trials_group_by_folder_name.values())

    one_trial_data_group_by_state = next(iter(trials_group_by_folder_name.values()))
    state_amount = len(one_trial_data_group_by_state)

    training_report_by_state = {}
    for state_no in range(1, state_amount + 1):
        try:
            report_path = model_save_path + "/model_s%s_training_report.json" % (state_no,)
            with open(report_path, 'r') as report_file:
                training_report_by_state[state_no] = json.load(report_file)
        except IOError:
            print('training report of state %s not found' % (state_no,))
            continue

    model_config_by_state = {}
    for state_no in training_report_by_state:
        best_model_record = training_report_by_state[state_no][0]
        best_model_id = list(best_model_record.keys())[0]
        model_config_by_state[state_no] = joblib.load(
            model_save_path + "/model_s%s_config_%s.pkl" %
            (state_no, best_model_id))


    for state_no in training_report_by_state:

        length_array = [trial[state_no].shape[0] for trial in list_of_trials]
        X = np.concatenate([trial[state_no] for trial in list_of_trials], axis=0)
        lengths = length_array

        list_of_scored_models = training_report_by_state[state_no]
        model_config_template = model_config_by_state[state_no]

        for idx in range(len(list_of_scored_models)):
            model_id, model_score = list(list_of_scored_models[idx].items())[0]
            model_config = util.bring_model_id_back_to_model_config(
                model_id, model_config_template)
            model_generator = model_generation.get_model_generator(
                model_type, model_config)
            model, _ = next(model_generator)

            model = model.fit(X, lengths=lengths)

            all_log_curves_of_this_state = []
            curve_owner = []

            for trial_name in trials_group_by_folder_name:
                curve_owner.append(trial_name)
                one_log_curve_of_this_state = util.fast_log_curve_calculation(
                    trials_group_by_folder_name[trial_name][state_no],
                    model,
                )
                all_log_curves_of_this_state.append(one_log_curve_of_this_state)

            np_matrix_traj_by_time = np.matrix(all_log_curves_of_this_state)

            plot_trials_loglik_curves_of_one_state(
                np_matrix_traj_by_time,
                curve_owner,
                state_no,
                os.path.join(figure_save_path,
                             'check_if_score_metric_converge_loglik_curves',
                             'state_%s' % (state_no, )),
                title='state_%s_training_rank_%s_id_%s_score_%s' %
                (state_no, idx, model_id, model_score))
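
util.fast_log_curve_calculation is repo-local; conceptually it returns the cumulative log-likelihood of an observation sequence after each time step, which is what gets plotted per trial above. A simple (and slower) way to obtain that kind of curve with hmmlearn, shown here only to illustrate what the curves represent:

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(3)
seq = rng.randn(80, 3)                       # a made-up observation sequence
model = GaussianHMM(n_components=2, n_iter=10).fit(seq)

# Cumulative log-likelihood after each time step (O(T^2) here; the repo's
# fast_log_curve_calculation presumably computes it incrementally).
log_curve = [model.score(seq[:t]) for t in range(1, len(seq) + 1)]
print(log_curve[-1])                         # total log-likelihood of the sequence
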
Example #6
def run(model_save_path, model_type, model_config, score_metric,
        trials_group_by_folder_name):

    trials_group_by_folder_name = util.make_trials_of_each_state_the_same_length(
        trials_group_by_folder_name)
    list_of_trials = list(trials_group_by_folder_name.values())

    trials_amount = len(trials_group_by_folder_name)

    if not os.path.isdir(model_save_path):
        os.makedirs(model_save_path)

    one_trial_data_group_by_state = list_of_trials[0]
    state_amount = len(one_trial_data_group_by_state)

    training_data_group_by_state = {}
    training_length_array_group_by_state = {}

    for state_no in range(1, state_amount + 1):
        length_array = [trial[state_no].shape[0] for trial in list_of_trials]
        training_data_group_by_state[state_no] = np.concatenate(
            [trial[state_no] for trial in list_of_trials], axis=0)
        training_length_array_group_by_state[state_no] = length_array

    for state_no in range(1, state_amount + 1):
        model_list = []
        model_generator = model_generation.get_model_generator(
            model_type, model_config)

        X = training_data_group_by_state[state_no]
        lengths = training_length_array_group_by_state[state_no]
        lengths[-1] -= 1  # adapt for bnpy's first-order autoregressive Gaussian observation
        for model, now_model_config in model_generator:
            print()
            print('-' * 20)
            print('in state', state_no, 'working on config:', now_model_config)
            model = model.fit(X, lengths=lengths)  # n_samples, n_features
            score = model_score.score(score_metric, model, X, lengths)

            if score is None:
                print("scorer says to skip this model; skipping")
                continue

            model_list.append({
                "model": model,
                "now_model_config": now_model_config,
                "score": score
            })
            print('score:', score)
            print('=' * 20)
            print()

            model_generation.update_now_score(score)

        sorted_model_list = sorted(model_list, key=lambda x: x['score'])

        best = sorted_model_list[0]
        model_id = util.get_model_config_id(best['now_model_config'])

        joblib.dump(
            best['model'],
            os.path.join(model_save_path, "model_s%s.pkl" % (state_no, )))

        joblib.dump(
            best['now_model_config'],
            os.path.join(model_save_path,
                         "model_s%s_config_%s.pkl" % (state_no, model_id)))

        # the best score is recorded only in the file name; the pickled payload is a placeholder
        joblib.dump(
            None,
            os.path.join(model_save_path,
                         "model_s%s_score_%s.pkl" % (state_no, best['score'])))

        train_report = [{
            util.get_model_config_id(i['now_model_config']): i['score']
        } for i in sorted_model_list]
        import json  # local import kept from the original
        report_path = os.path.join(model_save_path,
                                   "model_s%s_training_report.json" % (state_no,))
        with open(report_path, 'w') as report_file:
            json.dump(train_report, report_file, separators=(',\n', ': '))
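
For reference, a short sketch of how the artifacts written above could be read back. model_save_path and state_no are placeholders; it assumes the file layout produced by the code above and that joblib is importable as in the snippets.

import os
import json
import joblib

model_save_path = '/tmp/hmm_models'   # hypothetical path; use the real save path
state_no = 1

# best model for this state
best_model = joblib.load(os.path.join(model_save_path, "model_s%s.pkl" % (state_no,)))

# ranked training report: a list of {model_config_id: score} entries, best first
with open(os.path.join(model_save_path,
                       "model_s%s_training_report.json" % (state_no,))) as report_file:
    train_report = json.load(report_file)
print(best_model, train_report[0])
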