Example #1
def main():
    """
        主函数
    """
    # 加载数据集
    all_daily_df = utils.load_data()

    # Difference the time series to make it stationary
    stationary_df = utils.make_stationary_seq(all_daily_df, vis=False)

    # Build model-ready train/test data
    train_data, test_data = utils.make_data_for_model(stationary_df)

    # Scale the data
    y_scaler, X_train, y_train, X_test, y_test = utils.scale_data(
        train_data, test_data)

    if not IS_LOAD_MODEL:
        # Train the LSTM model
        lstm_model = utils.fit_lstm(X_train, y_train)
    else:
        # Load the LSTM model
        if os.path.exists(config.model_file):
            lstm_model = load_model(config.model_file)
        else:
            print('Model file {} does not exist'.format(config.model_file))
            return

    # Print the model summary
    print(lstm_model.summary())
    # Evaluate the model on the test set
    test_dates = test_data.index.tolist()
    pred_daily_df = pd.DataFrame(columns=['True Value', 'Pred Value'],
                                 index=test_dates)
    pred_daily_df['True Value'] = all_daily_df[config.raw_label_col]

    for i, test_date in enumerate(test_dates):
        X = X_test[i].reshape(1, -1)  # reshape one day's features into a row vector
        y_pred = utils.forecast_lstm(lstm_model, X)
        # Invert the scaling to restore the original value range
        rescaled_y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[0, 0]

        # Invert the differencing: add the previous day's true label
        previous_date = test_date - pd.DateOffset(days=1)
        recovered_y_pred = rescaled_y_pred + all_daily_df.loc[previous_date][
            config.raw_label_col]

        # Store the prediction
        pred_daily_df.loc[test_date, 'Pred Value'] = recovered_y_pred
        print('Date={}, true value={}, predicted value={}'.format(
            test_date, all_daily_df.loc[test_date][config.raw_label_col],
            recovered_y_pred))

    # Save the results
    pred_daily_df.to_csv(os.path.join(config.output_path, 'pred_daily_df.csv'))
    pred_daily_df.plot()
    plt.savefig(os.path.join(config.output_path, 'pred_daily_df.png'))
    plt.show()
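
The loop above undoes the two preprocessing steps in reverse order: first the scaling via y_scaler.inverse_transform, then the differencing by adding back the previous day's raw value. A minimal self-contained sketch of that recovery step on toy data, assuming the scaler is a scikit-learn MinMaxScaler (the project's utils.scale_data is not shown here):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Toy daily series standing in for all_daily_df[config.raw_label_col].
raw = pd.Series([10.0, 12.0, 11.5, 13.0],
                index=pd.date_range('2020-01-01', periods=4))
diffed = raw.diff().dropna()                    # the differencing step

y_scaler = MinMaxScaler(feature_range=(-1, 1))  # assumed scaler type
scaled = y_scaler.fit_transform(diffed.values.reshape(-1, 1))

# Pretend the model predicted the scaled difference for the last day.
y_pred = scaled[-1]
rescaled = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[0, 0]  # undo scaling
recovered = rescaled + raw.iloc[-2]                                 # undo differencing
print(recovered, raw.iloc[-1])                  # both print 13.0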
Example #2
def train_logistic(train_features, train_labels, test_features,
                   scikit_balancing, train_size, skip_feature_selection,
                   skip_grid_search, penalty, cost, dual, tol, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search requested: find the parameters that achieve the highest average recall
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now train on the full train data and evaluate on the test data
    model = LogisticRegression(penalty=penalty,
                               dual=dual,
                               C=cost,
                               tol=tol,
                               max_iter=5000,
                               class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
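
Note that Imputer with an axis argument comes from scikit-learn 0.19 and earlier; it was deprecated in 0.20 and removed in 0.22. On current scikit-learn the same column-wise mean imputation looks roughly like this (a sketch, not part of the original code):

import numpy as np
from sklearn.impute import SimpleImputer

train_features = np.array([[1.0, np.nan], [3.0, 4.0]])
test_features = np.array([[np.nan, 2.0]])

# Column-wise mean imputation, fitted on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_features = imputer.fit_transform(train_features)
test_features = imputer.transform(test_features)
print(test_features)  # [[2. 2.]]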
Example #3
def L1SVMFeatureSelection(train_features, train_labels, test_features,
                          feature_names, feature_selection_cost, threshold,
                          num_jobs):
    """
  Performs feature selection, using lasso svm, and returns the transformed train/test
  data after feature selection
  """
    # features have to be scaled for svm
    (scaled_train_features,
     scaled_test_features) = utils.scale_data(train_features, test_features,
                                              'minmax')

    if feature_selection_cost:
        model = LinearSVC(C=feature_selection_cost,
                          dual=False,
                          penalty='l1',
                          tol=0.005,
                          multi_class='ovr',
                          max_iter=50000)
    else:
        tuned_parameters = [{
            'C': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,
                50, 100, 500, 1000
            ]
        }]
        feature_selector_model = LinearSVC(penalty='l1',
                                           dual=False,
                                           tol=0.005,
                                           multi_class='ovr',
                                           max_iter=50000)
        scorer = make_scorer(precision_score, pos_label=None, average='macro')
        # Don't pre-dispatch all jobs at once; only dispatch the ones currently running
        # so memory usage does not blow up
        skf = StratifiedKFold(n_splits=3, shuffle=True)
        model = GridSearchCV(estimator=feature_selector_model,
                             param_grid=tuned_parameters,
                             n_jobs=num_jobs,
                             pre_dispatch="n_jobs",
                             cv=skf,
                             scoring=scorer)

    model.fit(scaled_train_features, train_labels)
    # With a fixed cost the LinearSVC itself is the fitted estimator; otherwise the
    # tuned model lives in GridSearchCV's best_estimator_.
    best_model = model if feature_selection_cost else model.best_estimator_
    feature_selector = SelectFromModel(best_model,
                                       threshold=threshold,
                                       prefit=True)
    if not feature_selection_cost:
        print("Optimal L1-SVM feature selection params {}".format(
            model.best_params_))
    train_features = feature_selector.transform(train_features)
    test_features = feature_selector.transform(test_features)
    feature_names = feature_names[feature_selector.get_support(indices=True)]
    print('Selected {} features'.format(train_features.shape[1]))

    return (train_features, test_features, feature_names)
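
For reference, the core of the technique above (an L1-penalized linear SVM used as a feature selector through SelectFromModel) can be reproduced on synthetic data in a few lines. This sketch uses only scikit-learn and hypothetical parameter values:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)
X_scaled = MinMaxScaler().fit_transform(X)    # SVM expects scaled features

svc = LinearSVC(C=0.1, penalty='l1', dual=False, max_iter=50000)
svc.fit(X_scaled, y)
selector = SelectFromModel(svc, prefit=True)  # keeps features with non-zero weights
print(selector.transform(X_scaled).shape)     # (200, number of selected features)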
Example #4
def main():
    """
        主函数
    """
    # 加载数据集
    all_daily_df = utils.load_data()

    # Difference the time series to make it stationary
    stationary_df = utils.make_stationary_seq(all_daily_df, vis=False)

    # Build model-ready train/test data
    train_data, test_data = utils.make_data_for_model(stationary_df)

    # Scale the data
    y_scaler, X_train, y_train, X_test, y_test = utils.scale_data(train_data, test_data)

    if not IS_LOAD_MODEL:
        # Train the LSTM model
        lstm_model = utils.fit_lstm(X_train, y_train)
    else:
        # Load the LSTM model
        if os.path.exists(config.model_file):
            lstm_model = load_model(config.model_file)
        else:
            print('Model file {} does not exist'.format(config.model_file))
            return

    # Print the model summary
    print(lstm_model.summary())
    # Evaluate the model on the test set
    test_dates = test_data.index.tolist()
    pred_daily_df = pd.DataFrame(columns=['True Value', 'Pred Value'], index=test_dates)
    pred_daily_df['True Value'] = all_daily_df[config.raw_label_col]

    for i, test_date in enumerate(test_dates):
        X = X_test[i].reshape(1, -1)    # reshape one day's features into a row vector
        y_pred = utils.forecast_lstm(lstm_model, X)
        # Invert the scaling to restore the original value range
        rescaled_y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[0, 0]

        # Invert the differencing: add the previous day's true label
        previous_date = test_date - pd.DateOffset(days=1)
        recovered_y_pred = rescaled_y_pred + all_daily_df.loc[previous_date][config.raw_label_col]

        # Store the prediction
        pred_daily_df.loc[test_date, 'Pred Value'] = recovered_y_pred
        print('Date={}, true value={}, predicted value={}'.format(
            test_date, all_daily_df.loc[test_date][config.raw_label_col],
            recovered_y_pred))

    # Save the results
    pred_daily_df.to_csv(os.path.join(config.output_path, 'pred_daily_df.csv'))
    pred_daily_df.plot()
    plt.savefig(os.path.join(config.output_path, 'pred_daily_df.png'))
    plt.show()
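
The IS_LOAD_MODEL branch assumes a model file written by a previous training run. A minimal sketch of that save/load round trip with the Keras API that load_model belongs to (the architecture and file name here are placeholders, not the project's actual utils.fit_lstm):

from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential, load_model

# Placeholder single-layer LSTM regressor (1 time step, 4 features).
model = Sequential([LSTM(8, input_shape=(1, 4)), Dense(1)])
model.compile(optimizer='adam', loss='mse')

model.save('lstm_model.h5')             # what a training run would persist
restored = load_model('lstm_model.h5')  # what the IS_LOAD_MODEL branch does
restored.summary()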
Example #5
    def _scale_data(self,
                    train_file_num: int = 10000,
                    test_file_num: int = 1000):
        # reduce data
        (self.train_images,
         self.train_labels) = reduce_date(self.train_images, self.train_labels,
                                          train_file_num)
        (self.test_images,
         self.test_labels) = reduce_date(self.test_images, self.test_labels,
                                         test_file_num)

        # normalize data
        self.train_images = scale_data(self.train_images)
        self.test_images = scale_data(self.test_images)

        # reshape data
        self.train_images = np.reshape(self.train_images,
                                       (self.train_images.shape[0], -1))
        self.test_images = np.reshape(self.test_images,
                                      (self.test_images.shape[0], -1))
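
reduce_date and scale_data are project helpers that are not shown. For image inputs, scale_data presumably just maps pixel intensities into [0, 1]; a minimal version under that assumption:

import numpy as np

def scale_data(images: np.ndarray) -> np.ndarray:
    # Map uint8 pixel values in [0, 255] to floats in [0, 1].
    return images.astype(np.float32) / 255.0

images = np.random.randint(0, 256, size=(5, 28, 28), dtype=np.uint8)
scaled = scale_data(images)
flat = np.reshape(scaled, (scaled.shape[0], -1))  # same reshape as above
print(flat.shape)  # (5, 784)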
Example #6
    def __init__(self, evaluation, all_features, all_labels,
                 feature_label_names, num_samples, penalty_weights, algorithm,
                 num_jobs):
        '''
    Initializes a feature selector object by cross validating the selector model and
    storing the best transformer.

    all_features: must contain the features, all scaled to the same range, if l1-svm
    feature selection is requested.
    all_labels: labels corresponding to the features.
    num_samples: how many random data points to sample and use from scaled_features for
    training the feature selector models, to speed up the whole process.
    penalty_weights: determines how to account for imbalance. Either None, which means
    the data is balanced, or 'balanced', which means use scikit balancing.
    algorithm: which feature selection method to use: anova, logistic, tree.
    '''
        # copy the features and labels, because we don't want to be modifying (scaling or
        # sampling) in place
        features = numpy.copy(all_features)
        labels = numpy.copy(all_labels)
        self.features_label_names_ = feature_label_names

        # Optionally scale features (currently disabled)
        if False:
            (features, dummy) = utils.scale_data(features, None, 'standard')

        if num_samples > 0:
            # select a smaller sample for feature selection
            indices = numpy.random.choice(features.shape[0],
                                          num_samples,
                                          replace=False)
            features = features[indices, :]
            labels = labels[indices]

        # Set the parameters for grid search and model based on algorithm choice
        if algorithm == 'anova' or algorithm == 'best':
            self.perform_feature_selection(evaluation, features, labels,
                                           penalty_weights, algorithm,
                                           num_jobs)
        else:
            sys.exit('bad algorithm for feature selection: ' + algorithm)

        self.best_params_ = self.clf_.best_params_
        print "Best Feature Selection Parameters are: " + str(
            self.best_params_)
        print "Best Feature Selection CV Score: " + str(self.clf_.best_score_)

        best_score_func = self.best_params_['feature_selection__score_func']
        best_percentile = self.best_params_['feature_selection__percentile']
        self.best_feature_selector_ = self.clf_.best_estimator_.named_steps[
            'feature_selection']
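
The parameter keys feature_selection__score_func and feature_selection__percentile indicate that self.clf_ grid-searches a Pipeline whose 'feature_selection' step is a SelectPercentile transformer. A hedged reconstruction of that setup, with synthetic data and a placeholder downstream classifier:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=300, n_features=30, random_state=0)

pipe = Pipeline([('feature_selection', SelectPercentile()),
                 ('model', LogisticRegression(max_iter=1000))])
param_grid = {'feature_selection__score_func': [f_classif, mutual_info_classif],
              'feature_selection__percentile': [10, 25, 50, 75]}

clf = GridSearchCV(pipe, param_grid, cv=3).fit(X, y)
best_selector = clf.best_estimator_.named_steps['feature_selection']
print(clf.best_params_, best_selector.get_support().sum())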
Example #7
    def __init__(self, alg_type, data, all_metrics, use_metrics,
                 output_location, save_plots):
        self.alg_type = alg_type
        self.data = data
        self.all_metrics = all_metrics
        self.use_metrics = use_metrics
        self.output_location = output_location
        self.output_plots_location = None

        if output_location is not None:
            if not os.path.exists(output_location):
                os.makedirs(output_location)
            if save_plots:
                self.output_plots_location = output_location / "plots/"
                if not os.path.exists(self.output_plots_location):
                    os.makedirs(self.output_plots_location)

        self.scaled_data = scale_data(data, all_metrics)

        self.output_data = None
        self.output_scaled_data = None
        self.clustering_column_name = None
Example #8
    def __init__(self,
                 data,
                 all_metrics,
                 use_metrics,
                 output_location=None,
                 save_plots=False):
        self.data = data
        self.all_metrics = all_metrics
        self.use_metrics = use_metrics
        self.output_location = output_location
        self.output_plots_location = None

        if output_location is not None:
            if not os.path.exists(output_location):
                os.makedirs(output_location)

            if save_plots:
                self.output_plots_location = output_location / "plots/"
                if not os.path.exists(self.output_plots_location):
                    os.makedirs(self.output_plots_location)

        self.scaled_data = scale_data(data, all_metrics)
        self.projected_res = None
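
The output_location / "plots/" expression implies output_location is a pathlib.Path. The exists/makedirs pair can also be collapsed with exist_ok=True; a small sketch with a hypothetical directory name:

import os
from pathlib import Path

output_location = Path('results')           # hypothetical output directory
plots_location = output_location / 'plots'
os.makedirs(plots_location, exist_ok=True)  # no separate exists() check needed
print(plots_location)                       # results/plots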
Example #9
def train_knn(train_features, train_labels, test_features, imbalanced_data,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, n_neighbors, weights,
              algorithm, metric, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # Balance the train data set and create the requested train size. Here, instead of
    # scikit balancing, we use the imbalanced_data flag and discard the last output since
    # it is irrelevant to knn. In order not to balance the data, the third argument should
    # be true (simulating scikit balancing), so we pass the imbalanced_data flag in place
    # of scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search requested: find the parameters that achieve the highest average recall
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, imbalanced_data, algorithm,
                                      num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now train on the full train data and evaluate on the test data
    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 weights=weights,
                                 algorithm=algorithm,
                                 metric=metric)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
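
As the code comments note, k-NN is sensitive to feature scales, which is why the data is min-max scaled before fitting. A tiny self-contained illustration of that pattern on a public dataset (not the project's data):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = MinMaxScaler().fit(X_train)  # fit on the training split only
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean')
knn.fit(scaler.transform(X_train), y_train)
print(knn.score(scaler.transform(X_test), y_test))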
Example #10
def train_svm(train_features, train_labels, test_features, scikit_balancing,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, kernel, gamma, cost,
              degree, num_jobs):
    """ Balances, extracts the requested train size, imputes, scales and finally performs
  features selection on the train data. Then it performs grid search, train a model using
  the best parameters.

  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search requested: find the parameters that achieve the highest average recall
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now train on the full train data and evaluate on the test data
    # We enable probability estimates, so that we can identify the top samples.
    model = svm.SVC(tol=0.05,
                    cache_size=6000,
                    class_weight=penalty_weights,
                    kernel=kernel,
                    gamma=gamma,
                    C=cost,
                    degree=degree,
                    probability=True)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
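
probability=True makes svm.SVC fit an additional calibration step so predict_proba becomes available, which is what the comment about identifying the top samples refers to. A short sketch of ranking samples by positive-class probability (synthetic data, hypothetical parameters):

import numpy as np
from sklearn import svm
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
model = svm.SVC(kernel='rbf', C=1.0, gamma='scale', probability=True).fit(X, y)

proba = model.predict_proba(X)[:, 1]  # P(class 1) for every sample
top = np.argsort(proba)[::-1][:5]     # indices of the 5 most confident positives
print(top, proba[top])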
Example #11
def train_random_forest(train_features, train_labels, test_features,
                        scikit_balancing, train_size, skip_feature_selection,
                        skip_grid_search, max_features, n_estimators,
                        criterion, min_samples_split, min_samples_leaf,
                        num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)
    # Grid search requested: find the parameters that achieve the highest average recall
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now train on the full train data and evaluate on the test data
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   n_jobs=num_jobs,
                                   criterion=criterion,
                                   max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
Example #12
# Always normalize inputs
Xn_tr, norm_param = utl.normalize_data(X_tr)
Xn_te = utl.normalize_data(X_te, norm_param)

# Build boundaries using heuristic rules
LB, UB = utl.bounds_pso(Xn_tr,
                        n_mf,
                        n_outputs,
                        mu_delta=mu_delta,
                        s_par=s_par,
                        c_par=c_par,
                        A_par=A_par)

# Scale output(s) in continuous problems to reduce the range in <A_par>
if (problem != 'C'):
    Y_tr, scal_param = utl.scale_data(Y_tr)
    Y_te = utl.scale_data(Y_te, scal_param)

# Optimize using PSO
# theta = best solution (min)
# info[0] = function value in theta
# info[1] = index of the learner with the best solution
# info[2] = number of learners close to the learner with the best solution
func = interface_PSO
args = (Xn_tr, Y_tr, learners)
theta, info = pso.PSO(func,
                      LB,
                      UB,
                      nPop=nPop,
                      epochs=epochs,
                      K=K,
Example #13
def main():
    read_config(config.config_filename)
    all_data, all_data_ids_map = utils.read_data(config.input_filename, config.input_separator, config.columns_num)

    ############ BEGIN: build learn dataset
    if not config.ids_rows_learn:
        config.ids_rows_learn = all_data_ids_map.keys()
    if config.read_learn_data_from_file:
        data_learn, data_learn_id_map = utils.read_data(config.learn_data_filename, config.input_separator, config.columns_num)
        config.ids_rows_learn = data_learn_id_map.keys()
    else:
        data_learn = all_data[[all_data_ids_map[row_id] for row_id in config.ids_rows_learn], :]
    ############ END: build learn dataset

    ############  BEGIN: Compute normalizers & normalize data
    if config.debug:
        print('Start normalizing data...')

    column_vars = compute_column_stds(data_learn)
    if config.normalize_by_mean_distance:
        column_mean_distances = compute_column_mean_distances(data_learn)
        column_normalizers = column_mean_distances.copy()
    else:
        column_normalizers = column_vars.copy()

    for i in range(column_normalizers.shape[0]):
        if column_normalizers[i] == 0.0:
            column_normalizers[i] = 1.0

    if config.normalizing:
        utils.normalize_data(all_data, column_normalizers)
        utils.normalize_data(data_learn, column_normalizers)
    ############  END: Compute normalizers & normalize data

    ############  BEGIN: Filter learn dataset
    data_learn_reduced = data_learn
    if not config.read_learn_data_from_file and config.use_reduction:
        if config.debug:
            print('Started filtering data...')
        ids_rows_learn_reduced = utils.reduce_data_set(config.ids_rows_learn, config.use_reduction_size)
        data_learn_reduced = all_data[[all_data_ids_map[row_id] for row_id in ids_rows_learn_reduced], :]
        config.ids_rows_learn = ids_rows_learn_reduced

    if config.debug:
        print('Saving filtered learning data to %s' % config.reduced_rows_filename)
    utils.print_data_rows(config.ids_rows_learn, data_learn, separator=config.input_separator, file=config.reduced_rows_filename,
                    useCommaInNumbers=config.output_numbers_use_comma)
    ############  END: Filter learn dataset

    ys_to_try = np.arange(config.y_min, config.y_max, config.y_step)

    for y_index in config.columns_ys:
        # For each of ys specified in 'column_ys' option...
        # (usually it's just one y)

        if config.debug:
            print('Preparing data for y index %d' % (y_index + 1))

        ############ BEGIN: Prepare data array for y_index
        columns_xs = [i for i in range(config.columns_num) if i not in config.columns_ys]
        all_data_for_index = np.column_stack((all_data[:, y_index], all_data[:, columns_xs]))
        data_learn_for_index = np.column_stack((data_learn[:, y_index], data_learn[:, columns_xs]))
        data_learn_for_index_reduced = np.column_stack((data_learn_reduced[:, y_index], data_learn_reduced[:, columns_xs]))
        prescalers = [config.prescalers[y_index]] + [config.prescalers[i] for i in columns_xs]
        ############ END: Prepare data array for y_index

        ############ BEGIN: prescale data
        all_data_for_index_prescaled = all_data_for_index.copy()
        data_learn_for_index_prescaled_reduced = data_learn_for_index_reduced.copy()
        utils.scale_data(all_data_for_index_prescaled, prescalers)
        utils.scale_data(data_learn_for_index_prescaled_reduced, prescalers)
        ############ END: prescale data

        # Now 'data_learn_for_index' contains data rows, with each row in the following format:
        # y, x_1, ..., x_n

        if config.debug:
            print('Done')

        of = sys.stdout
        if config.output_filename:
            try:
                of = open(utils.generate_output_filename(config.output_filename, y_index + 1), 'w')
            except IOError:
                utils.die('Can\'t open output file \'', utils.generate_output_filename(config.output_filename,
                                                                                       y_index + 1), '\'')

        if config.debug:
            print('Predicting for y with index %d' % (y_index + 1))

        utils.print_header(ys_to_try, len(columns_xs), file=of, separator=config.output_separator, \
                                 useCommaInNumbers=config.output_numbers_use_comma,
                                 printWeightedSumFormulaResults=config.output_weighted_sum_formula_results,
                                 printFuncValues=config.output_func_values)

        ys_actual = []
        ys_predicted = []

        for row_id in config.rows_predict:
            # For each row specified in 'predict' option...

            row_to_predict = all_data_for_index_prescaled[all_data_ids_map[row_id]]

            t1 = time.time()
            ############  BEGIN: Compute scalers
            data_learn_for_index_prescaled_reduced_scaled = data_learn_for_index_prescaled_reduced
            if config.compute_scalers:
                if config.debug:
                    print('Computing scalers for row #' + str(row_id))
                scalers, weightSumResults, best_shifts, best_func_deltas, best_row_directions, ys_to_try_sorted, \
                func_values, directions = \
                    compute_scalers(
                    row_to_predict, data_learn_for_index, ys_to_try)

                if config.test_output:
                    utils.print_test_output(row_id, row_to_predict, ys_to_try_sorted, best_row_directions, best_shifts,
                                            best_func_deltas, func_values, directions,
                                            useCommaInNumbers=config.output_numbers_use_comma,
                                            separator=config.output_separator)

                scalers[scalers == 0.0] = 1.0
                data_learn_for_index_prescaled_reduced_scaled = data_learn_for_index_prescaled_reduced.copy()
                utils.scale_data(data_learn_for_index_prescaled_reduced_scaled, scalers)
                utils.scale_data(row_to_predict, scalers)
            else:
                scalers = np.array([1.0 for _ in range(len(columns_xs) + 1)])
                weightSumResults = np.array([None for _ in range(len(columns_xs) + 1)])
            ############  END: Compute scalers

            if config.debug:
                print('Predicting row #' + str(row_id))
            try:
                # Make prediction for the current row
                y_actual, y_best, func_best, func_values, y_local_minims = \
                    make_prediction(row_to_predict, data_learn_for_index_prescaled_reduced_scaled, config.y_d, config.y_pow, ys_to_try)
                t2 = time.time()
                if config.debug:
                    print('Elapsed %.6f ms' % (((t2 - t1)) * 1000))

                ys_actual.append(y_actual)
                ys_predicted.append(y_best)

                #### BEGIN: Denormalizing results
                y_actual_denormalized = y_actual / prescalers[0]
                y_actual_denormalized = y_actual_denormalized / scalers[0]
                if config.normalizing:
                    y_actual_denormalized *= column_normalizers[0]
                y_best_denormalized = y_best / prescalers[0]  if y_best is not None else None
                y_best_denormalized = y_best_denormalized / scalers[0] if y_best is not None else None
                if config.normalizing and y_best_denormalized is not None:
                    y_best_denormalized *= column_normalizers[0]
                y_local_minims_denormalized = list(map(lambda y: y / prescalers[0],
                                                       y_local_minims))
                y_local_minims_denormalized = list(map(lambda y: y / scalers[0],
                                                       y_local_minims_denormalized))
                if config.normalizing:
                    y_local_minims_denormalized = list(map(lambda x: x * column_normalizers[0],
                                                       y_local_minims_denormalized))
                #### END: Denormalizing results

                # Print prediction for the current row
                utils.print_prediction(row_id, y_actual, y_actual_denormalized, y_best, y_best_denormalized,
                                       scalers, weightSumResults, func_values, \
                                 y_local_minims, \
                                 y_local_minims_denormalized, \
                                 file=of, \
                                 printWeightedSumFormulaResults=config.output_weighted_sum_formula_results,
                                 printFuncValues=config.output_func_values,
                                 separator=config.output_separator, \
                                 useCommaInNumbers=config.output_numbers_use_comma)

            except NoSuchDataLine:
                print('Can\'t predict line #' + str(row_id)
                      + ': no such line in data!')
                exit(1)

        utils.print_common_statistics(column_mean_distances, column_vars, config.columns_num, columns_xs,
                                 config.normalize_by_mean_distance,
                                of, y_index, ys_actual, ys_predicted)

        if config.output_filename:
            of.close()
Example #14
    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(data)
    data = imputer.transform(data)
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)

    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels, test_size=0.20))

    # scale data only if the model is linear (svm, logistic regression) or feature
    # scales are relevant (knn)
    if args.algorithm in ['linear-svm', 'kernel-svm', 'logistic', 'knn']:
        (train_features,
         test_features) = utils.scale_data(train_features, test_features,
                                           'minmax')

    # parse scoring methods
    scores = list()
    for scoring_function in args.scoring_functions.split(','):
        if not scoring_function in [
                "accuracy", "weighted-precision", "macro-precision",
                "weighted-recall", "macro-recall", "weighted-f1", "macro-f1"
        ]:
            sys.exit('Invalid scoring function: ' + scoring_function +
                     ' provided')
        scores.append(scoring_function)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print("")
Example #15
feature_importances = args.feature_importances

data = pd.read_csv(data_path)
data = utils.construct_features(data, is_lag)

if is_lag:
    features = np.r_[1, 2, 3, 4, 7, 8, 10:20, 24:32]
else:
    features = np.r_[1, 2, 3, 4, 7, 8, 10, 11]

x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, features],
                                                    data.iloc[:, 9],
                                                    test_size=0.33,
                                                    random_state=151)

x_train, x_test, standard_scaler = utils.scale_data(x_train, x_test)
if save_models:
    utils.save_transform(standard_scaler, is_lag)

x_train_ = np.copy(x_train)
y_train_ = np.copy(y_train)

models = [('Isolation Forest (Unsupervised)',
           IsolationForest(n_estimators=1000,
                           contamination=.5,
                           max_features=1,
                           max_samples=1000,
                           random_state=0)),
          ('One-Class SVM (Semi-Supervised)',
           OneClassSVM(kernel='linear', nu=0.2)),
          ('Random Forest (Supervised)',
Example #16
def get_bold_for_condition(dir_input, num_runs, option_zscore=0):
    """" This function extracts the bold signal for three conditions. 
    option_zscore = 0 => no z-scoring
    option_zscore = 1 =>z-score the data
    
    Returns: bold values for all conditions for each run.
    A mean value for the entire run.
    """
    from utils import shift_timing, mask_data, scale_data
    #Initialize arrays
    stim_label = []
    bold_A = []
    bold_B = []
    bold_C = []
    bold_fix = []
    bold_mean_all = []
    TR_shift_size = 2  # Number of TRs to shift the extraction of the BOLD signal.

    maskdir = dir_input
    masks = ['ROI_Cool']

    ### Extract the BOLD Signal for the conditions A, B, C
    ###

    print("Processing Start ...")
    maskfile = (maskdir + "%s.nii.gz" % (masks[0]))
    mask = nib.load(maskfile)
    print("Loaded Mask")
    print(mask.shape)

    for run in range(1, num_runs + 1):
        epi_in = (dir_input + "lab1_r0%s.nii.gz" % (run))
        stim_label = np.load(dir_input + 'labels_r0%s.npy' % (run))

        # Haemodynamic shift
        label_TR_shifted = shift_timing(stim_label, TR_shift_size)

        # Get labels for conditions for A, B, C, and baseline fixation.
        A = np.squeeze(np.argwhere(label_TR_shifted == 1))
        B = np.squeeze(np.argwhere(label_TR_shifted == 2))
        C = np.squeeze(np.argwhere(label_TR_shifted == 3))

        fixation = np.squeeze(np.argwhere(label_TR_shifted == 0))
        epi_data = nib.load(epi_in)
        epi_mask_data = mask_data(epi_data, mask)

        if option_zscore == 1:
            epi_maskdata_zscore = scale_data(epi_mask_data)
            epi_mask_data = epi_maskdata_zscore

        if run == 1:
            bold_A = epi_mask_data[A]
            bold_B = epi_mask_data[B]
            bold_C = epi_mask_data[C]
            bold_fix = epi_mask_data[fixation]
            bold_data_all = epi_mask_data
        else:
            bold_A = np.vstack([bold_A, epi_mask_data[A]])
            bold_B = np.vstack([bold_B, epi_mask_data[B]])
            bold_C = np.vstack([bold_C, epi_mask_data[C]])
            bold_fix = np.vstack([bold_fix, epi_mask_data[fixation]])
            bold_data_all = np.vstack([bold_data_all, epi_mask_data])
        bold_mean_all.append(np.mean(epi_mask_data))
    print("Processing Completed")
    return bold_data_all, bold_mean_all, bold_A, bold_B, bold_C, bold_fix, label_TR_shifted
Example #17
import numpy as np
from sklearn.utils import shuffle
from mlp import MyMlp
from utils import read_csv, oneHotEncoder, scale_data

if __name__ == "__main__":
    dataset = read_csv('datasets/iris.csv')
    label = dataset[0]
    data = dataset[1:]

    data = shuffle(data)
    target_values = data[0:100, -1:].flatten()

    data_feature = data[0:100, :-1].astype(float)
    data_feature = scale_data(data_feature, -1, 1)
    data_target = oneHotEncoder(target_values)

    input_layer = len(data_feature[0])
    output_layer = len(set(target_values))
    hidden_layer = [4, 3]

    mlp = MyMlp(input_layer, hidden_layer, output_layer)
    mlp.fit(data_feature,
            data_target,
            0.1,
            mini_batch_size=10,
            epochs=10000,
            learning_rate=0.01)

    mlp.print()
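
scale_data(data_feature, -1, 1) suggests a helper that rescales each column into a caller-chosen range. A minimal column-wise min-max implementation under that assumption:

import numpy as np

def scale_data(features: np.ndarray, new_min: float, new_max: float) -> np.ndarray:
    # Column-wise min-max rescaling into [new_min, new_max].
    col_min = features.min(axis=0)
    col_max = features.max(axis=0)
    span = np.where(col_max > col_min, col_max - col_min, 1.0)  # guard against constant columns
    return (features - col_min) / span * (new_max - new_min) + new_min

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
print(scale_data(X, -1, 1))  # each column mapped to [-1, 1]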
Example #18
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Impute the data and replace missing values
    imputer = preprocessing.Imputer(missing_values="NaN",
                                    strategy='mean',
                                    axis=0,
                                    copy=False)
    imputer.fit(data)
    data = imputer.transform(data)

    # Extract features/labels and their names from raw data
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]

    # scale data no matter what, since the feature selector is L1-SVM
    (scaled_features, dummy) = utils.scale_data(features, None, 'minmax')

    # open output file and write header with max_num_features selected features
    output_file = open(args.output_filename, 'w')
    output_file_writer = csv.writer(output_file)
    header = [
        "num_features_selected", "test_size", "avg_true_positive",
        "avg_false_positive", "avg_true_negative", "avg_false_negative",
        "avg_accuracy", "avg_pos_f1", "avg_neg_f1", "avg_average_f1",
        "avg_pos_precision", "avg_neg_precision", "avg_average_precision",
        "avg_pos_recall", "avg_neg_recall", "avg_average_recall"
    ]

    for i in range(1, args.max_num_features + 1):
        header.extend(["feature" + str(i), "feature" + str(i) + "_weight"])
    output_file_writer.writerow(header)

    feature_selector_obj = feature_selection.feature_selector(
        scaled_features, labels, args.num_samples, args.scikit_balancing)

    for num_features in range(args.min_num_features,
                              args.max_num_features + 1):
        # Before anything else, set the feature selector object to num_features
        feature_selector_obj.select_top_features(num_features)
        selected_features = feature_selector_obj.get_selected_features(
            feature_names)

        # Print selected and unselected features.
        print('\nSelected Feature,Weight')
        for feature, feature_coef in selected_features:
            print(feature + "," + str(feature_coef))

        # Now transform and restrict the features to those only selected by the L1-svm
        transformed_scaled_features = feature_selector_obj.transform(
            scaled_features)
        transformed_features = feature_selector_obj.transform(features)

        print('\n' + str(len(selected_features)) + ' out of ' +
              str(features.shape[1]) + ' features are selected.\n')

        # Now perform the learning task using the top features and report results. Make
        # sure to pass scaled features to svm
        num_test_trials = 10
        test_size = args.test_size if args.test_size <= 1.0 else int(
            args.test_size)
        if args.learning_algorithm == 'random-forest':
            rf_max_features = utils.extract_max_features(args.rf_max_features)
            metrics = perform_random_forest(
                transformed_features, labels, args.rf_num_trees,
                args.rf_criterion, rf_max_features, args.rf_min_samples_split,
                args.rf_min_samples_leaf, args.scikit_balancing, test_size,
                num_test_trials)

        elif args.learning_algorithm == 'svm':
            metrics = perform_svm(transformed_scaled_features, labels,
                                  args.svm_kernel, args.svm_gamma,
                                  args.svm_cost, args.svm_degree,
                                  args.scikit_balancing, test_size,
                                  num_test_trials)
        elif args.learning_algorithm == 'logistic':
            metrics = perform_logistic(transformed_features, labels,
                                       args.logistic_penalty,
                                       args.logistic_cost,
                                       args.scikit_balancing, test_size,
                                       num_test_trials)
        elif args.learning_algorithm == 'knn':
            metrics = perform_knn(transformed_scaled_features, labels,
                                  args.knn_num_neighbors, args.knn_weights,
                                  args.knn_algorithm, args.knn_metric,
                                  args.knn_imbalanced_data, test_size,
                                  num_test_trials)

        # write a row for num_features selected to output file
        output_row = [len(selected_features)]
        output_row.extend(metrics)
        for feature, feature_coef in selected_features:
            output_row.extend([feature, feature_coef])
        output_row.extend([''] * (len(header) - len(output_row)))
        output_file_writer.writerow(output_row)

        print('******************************\n')

    output_file.close()
Example #19
                    dev_tuple = train_test_split(dev_idxs,
                                                 test_size=0.25,
                                                 shuffle=True,
                                                 stratify=dev_gold_labels)

                    train_idxs = dev_tuple[0]
                    train_instances = instances.iloc[train_idxs].values
                    train_gold_labels = gold_labels.iloc[
                        train_idxs].values.ravel()
                    val_idxs = dev_tuple[1]
                    validation_instances = instances.iloc[val_idxs].values
                    validation_gold_labels = gold_labels.iloc[
                        val_idxs].values.ravel()

                    scaled_instances = scale_data(train_instances,
                                                  validation_instances,
                                                  test_instances)
                    train_instances = scaled_instances[0]
                    validation_instances = scaled_instances[1]
                    test_instances = scaled_instances[2]

                    # TODO: split the data by instance hardness (kDN)
                    for hardness_type, filter_func in config[
                            "validation_hardnesses"]:
                        print('Hardness type: ', hardness_type)

                        validation_instances, validation_gold_labels = select_validation_set(
                            train_instances, train_gold_labels, filter_func,
                            config["kdn"])

                        predictions[dataset_name][fold][hardness_type] = {}
Example #20
num_layers = 1
output_step = 30 # predict step
output_size = 6 # dimension of output
learning_rate = 0.001

model_path = 'pred30/model121.tar'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
scale_data_squeezed = util.load_data_squeeze(scale_data_set, input_size)
test_data, test_data_col = util.load_data(test_data_set)

# Construct scaler
scaler = MinMaxScaler(feature_range=(-5, 5))
scaler.fit(scale_data_squeezed)
test_data_normalized = util.scale_data(test_data, test_data_col, scaler)

# Load model
if(model_path[-1] == 'r'):
    model = LSTM.RNN(input_size, sequence_length, hidden_size, num_layers, output_step, output_size, device).to(device)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    model = LSTM.RNN(input_size, sequence_length, hidden_size, num_layers, output_step, output_size, device).to(device)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint.state_dict())


model.eval()
print('Start adapting!')
param = model.state_dict()['linear.weight']
Example #21
def main():
    # First read header columns
    input_file = open(args.input_filename, 'r')
    input_file_reader = csv.reader(input_file)
    headers = input_file_reader.next()
    input_file.close()

    # Let numpy know that NA corresponds to our missing value
    data = numpy.genfromtxt(args.input_filename,
                            delimiter=",",
                            skip_header=1,
                            missing_values="NA",
                            filling_values="NaN")
    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(data)
    data = imputer.transform(data)

    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column:]

    # scale data
    (features, dummy) = utils.scale_data(features, None, args.scaling_method)

    num_components = args.num_components
    if num_components and num_components != 'mle':
        num_components = float(num_components)
        if num_components >= 1:
            num_components = int(num_components)

    pca = PCA(n_components=num_components, copy=True)
    transformed_features = pca.fit_transform(features)

    # write transformed features
    if args.transformed_output_filename:
        fields = []
        formats = []
        for i in range(1, num_components + 1):
            fields.append('component' + str(i))
            formats.append('%.20f')

        for i in range(1, data.shape[1] - args.label_column + 1):
            fields.append('label' + str(i))
            formats.append('%i')
        header = ','.join(fields)
        output_data = numpy.column_stack((transformed_features, labels))
        numpy.savetxt(args.transformed_output_filename,
                      output_data,
                      comments='',
                      fmt=formats,
                      delimiter=',',
                      header=header)

    # write component loading to output file
    loadings_output_file = open(args.loadings_output_filename, 'w')
    loadings_output_file_writer = csv.writer(loadings_output_file)
    loadings_output_file_writer.writerow(headers[0:args.label_column])
    for i in range(0, len(pca.components_)):
        component = pca.components_[i]
        loadings_output_file_writer.writerow(component)
    loadings_output_file.close()

    # Now write the individual and cumulative variance explained by each successive component
    explained_variance_output_file = open(
        args.explained_variance_output_filename, 'w')
    explained_variance_output_file_writer = csv.writer(
        explained_variance_output_file)
    explained_variance_output_file_writer.writerow(
        ['Component_Number', 'Explained_Variance', 'Total_Explained_Variance'])
    total_explained_variance = 0
    for i in range(0, len(pca.components_)):
        explained_variance = pca.explained_variance_ratio_[i] * 100.
        total_explained_variance += explained_variance
        explained_variance_output_file_writer.writerow(
            [i + 1, explained_variance, total_explained_variance])
    explained_variance_output_file.close()

    # print top loadings per component
    for i in range(0, len(pca.components_)):
        print 'Top ' + str(
            args.num_top_loadings) + ' loadings for component ' + str(i)
        component = pca.components_[i]
        abs_component = map(abs, component)
        # Get the indices of components sorted by the features loading
        sorted_indices = [
            i[0] for i in sorted(
                enumerate(abs_component), key=lambda x: x[1], reverse=True)
        ]
        for l in range(0, args.num_top_loadings):
            index = sorted_indices[l]
            print headers[index] + ' : ' + str(component[index])
    print '\n\n'
    print 'Explained variance ratio\n ' + str(pca.explained_variance_ratio_)
    print 'Total explained variance ' + str(
        numpy.sum(pca.explained_variance_ratio_))
Example #22
    return model


# load MNIST data directly from keras
mnist_data = tf.keras.datasets.mnist
(train_images, train_labels), (test_images,
                               test_labels) = mnist_data.load_data()

# reduce training data
train_images, train_labels = utils.reduce_date(train_images, train_labels,
                                               10240)
# reduce testing data
test_images, test_labels = utils.reduce_date(test_images, test_labels, 500)

# scale input data
scaled_train_images, scaled_test_images = utils.scale_data(
    train_images, test_images)

# add a dummy channel dimension
scaled_train_images = scaled_train_images[..., np.newaxis]
scaled_test_images = scaled_test_images[..., np.newaxis]

# create validation set
scaled_train_images, scaled_val_images, train_labels, val_labels = \
    train_test_split(scaled_train_images, train_labels, test_size=0.15)

# initialize the plot
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5))
plt.subplots_adjust(bottom=.2, wspace=.3)

# initiate the metric record for plot
metric_plot = {'layers': [], 'test_acc': [], 'train_t': [], 'test_t': []}
Example #23
def main():
  (train_features, train_labels, test_features, test_labels, class_values, class_names,
   feature_label_names) = utils.prepare_data(args.input_filename,
                                             args.label_column,
                                             args.train_size,
                                             args.test_size,
                                             args.imbalanced_data)
  # now that we have limited the data to requested train size, scale data since svm needs
  # to be scaled
  (train_features, test_features) = utils.scale_data(train_features,
                                                     test_features,
                                                     args.scaling_method)
  
  # We let scikit use its balancing scheme if it is explicitly requested
  penalty_weights = 'balanced' if args.imbalanced_data else None
 
  # feature selection if requested
  if args.feature_selection_algo:
    feature_selector_obj =  feature_selection.feature_selector(args.evaluation,
                                                               train_features,
                                                               train_labels,
                                                               feature_label_names,
                                                               -1,
                                                               penalty_weights,
                                                               args.feature_selection_algo,
                                                               args.num_jobs)
    train_features = feature_selector_obj.transform(train_features)
    test_features = feature_selector_obj.transform(test_features)
    print "Selected " + str(len(feature_selector_obj.get_selected_features())) + " features"
    print "Top 10 features: " + str(feature_selector_obj.get_top_features(10))


  # ovr only works for linear svm
  multi_class = 'ovr' if args.kernel == 'linear' else args.multi_class
  model = models.train_svm(train_features,
                           train_labels,
                           penalty_weights,
                           args.skip_grid_search,
                           args.evaluation,
                           args.num_jobs,
                           args.kernel,
                           args.cost,
                           args.gamma,
                           args.degree,
                           multi_class)

  # Predict test and report full stats
  y_true, y_pred = test_labels, model.predict(test_features)
  print("\n*****************************\n")
  print('MAE: ' +
        str(metrics.mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))
  print('MSE: ' +
        str(metrics.mean_squared_error(y_true, y_pred, multioutput='uniform_average')))
  
  print('Classification report:')
  print(metrics.classification_report(y_true, y_pred, class_values, class_names))
  print('Precision Recall')
  print(metrics.precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                                pos_label=None,
                                                average='weighted'))

  # print and plot confusion matrix
  print('Confusion Matrix Without Normalization')
  numpy.set_printoptions(precision=2)
  cm = metrics.confusion_matrix(y_true, y_pred, class_values)
  print(cm)
  print('Confusion Matrix With Normalization')
  cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
  print(cm_normalized)
  
  plt.figure()
  plt.subplot(2, 1, 1)
  utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')

  # Normalize the confusion matrix by row (i.e by the number of samples
  # in each class)
  plt.subplot(2, 1, 2)
  utils.plot_confusion_matrix(cm_normalized, class_names, 'Normalized confusion matrix')

  #plt.savefig(args.output_figure + '.pdf')
  pdf = PdfPages(args.output_figure + '.pdf')
  plt.savefig(pdf, format='pdf')
  pdf.close()
Example #24
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from utils import scale_data
from pprint import pprint

# load MNIST data directly from keras
mnist_data = tf.keras.datasets.mnist
_, (test_images, test_labels) = mnist_data.load_data()

# scale input data
_, scaled_test_images = scale_data(test_images, test_images)

# add a dummy channel dimension
scaled_test_images = scaled_test_images[..., np.newaxis]

# randomly choose an image from the test set and plot it
random_inx = np.random.choice(scaled_test_images.shape[0])
test_image = scaled_test_images[random_inx]
plt.imshow(test_image, cmap='Greys')

# load the model
n = 2  # model with this number of layers
ep = 5  # epoch number
model = load_model(f'02_IBS-Saved Model/{n}-Layers/IBS_Ep{ep:02d}')

# use the model to predict the label of the chosen image
prediction = model.predict(test_image[np.newaxis, ...])

print(f"Random index is:\t{random_inx}")  # print random index