def main():
    """ Main function """
    # Load the dataset
    all_daily_df = utils.load_data()
    # Difference the time series
    stationary_df = utils.make_stationary_seq(all_daily_df, vis=False)
    # Build data usable by the model
    train_data, test_data = utils.make_data_for_model(stationary_df)
    # Normalize the data
    y_scaler, X_train, y_train, X_test, y_test = utils.scale_data(train_data, test_data)

    if not IS_LOAD_MODEL:
        # Train the LSTM model
        lstm_model = utils.fit_lstm(X_train, y_train)
    else:
        # Load the LSTM model
        if os.path.exists(config.model_file):
            lstm_model = load_model(config.model_file)
        else:
            print('Model file {} does not exist'.format(config.model_file))
            return

    # Model summary
    print(lstm_model.summary())

    # Validate the model
    test_dates = test_data.index.tolist()
    pred_daily_df = pd.DataFrame(columns=['True Value', 'Pred Value'], index=test_dates)
    pred_daily_df['True Value'] = all_daily_df[config.raw_label_col]

    for i, test_date in enumerate(test_dates):
        X = X_test[i].reshape(1, -1)  # Reshape one day's features into a row vector
        y_pred = utils.forecast_lstm(lstm_model, X)
        # Invert the scaling to recover the original data range
        rescaled_y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[0, 0]
        # Invert the differencing: add the previous day's true label
        previous_date = test_date - pd.DateOffset(days=1)
        recovered_y_pred = rescaled_y_pred + all_daily_df.loc[previous_date][config.raw_label_col]
        # Save the prediction
        pred_daily_df.loc[test_date, 'Pred Value'] = recovered_y_pred
        print('Date={}, true value={}, predicted value={}'.format(
            test_date, all_daily_df.loc[test_date][config.raw_label_col], recovered_y_pred))

    # Save the results
    pred_daily_df.to_csv(os.path.join(config.output_path, 'pred_daily_df.csv'))
    pred_daily_df.plot()
    plt.savefig(os.path.join(config.output_path, 'pred_daily_df.png'))
    plt.show()
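# Hedged sketch of the differencing step referenced above. The project's actual
# utils.make_stationary_seq is not shown here; this assumes it applies first-order
# differencing to the label column, which is what the inverse step in the loop above
# (adding the previous day's true value) undoes. All names below are illustrative only.
import pandas as pd

def make_stationary_seq_sketch(daily_df, label_col):
    stationary_df = daily_df.copy()
    stationary_df[label_col] = stationary_df[label_col].diff()  # y'_t = y_t - y_{t-1}
    return stationary_df.dropna(subset=[label_col])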
def train_logistic(train_features, train_labels, test_features, scikit_balancing,
                   train_size, skip_feature_selection, skip_grid_search, penalty, cost,
                   dual, tol, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model and
    the transformed test data
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now perform the training on full train data. check on test data
    model = LogisticRegression(penalty=penalty, dual=dual, C=cost, tol=tol,
                               max_iter=5000, class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
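# Hedged usage sketch for train_logistic above. The argument values are illustrative
# assumptions, not defaults taken from the repository.
model, train_X, train_y, test_X = train_logistic(
    train_features, train_labels, test_features,
    scikit_balancing=False, train_size=-1,
    skip_feature_selection=True, skip_grid_search=False,
    penalty='l2', cost=1.0, dual=False, tol=1e-4, num_jobs=4)
test_predictions = model.predict(test_X)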
def L1SVMFeatureSelection(train_features, train_labels, test_features, feature_names,
                          feature_selection_cost, threshold, num_jobs):
    """
    Performs feature selection, using lasso svm, and returns the transformed train/test
    data after feature selection
    """
    # features have to be scaled for svm
    (scaled_train_features, scaled_test_features) = utils.scale_data(train_features,
                                                                     test_features,
                                                                     'minmax')
    if feature_selection_cost:
        model = LinearSVC(C=feature_selection_cost, dual=False, penalty='l1', tol=0.005,
                          multi_class='ovr', max_iter=50000)
    else:
        tuned_parameters = [{
            'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100,
                  500, 1000]
        }]
        feature_selector_model = LinearSVC(penalty='l1', dual=False, tol=0.005,
                                           multi_class='ovr', max_iter=50000)
        scorer = make_scorer(precision_score, pos_label=None, average='macro')
        # Don't pre-dispatch all jobs at once, only dispatch the ones you are running,
        # so memory usage does not blow up
        skf = StratifiedKFold(n_splits=3, shuffle=True)
        model = GridSearchCV(estimator=feature_selector_model,
                             param_grid=tuned_parameters, n_jobs=num_jobs,
                             pre_dispatch="n_jobs", cv=skf, scoring=scorer)

    model.fit(scaled_train_features, train_labels)
    # GridSearchCV exposes the best fitted estimator; a fixed-cost LinearSVC is itself
    # the fitted estimator.
    best_model = model.best_estimator_ if not feature_selection_cost else model
    feature_selector = SelectFromModel(best_model, threshold=threshold, prefit=True)
    if not feature_selection_cost:
        print("Optimal L1-SVM feature selection params {}".format(model.best_params_))

    train_features = feature_selector.transform(train_features)
    test_features = feature_selector.transform(test_features)
    feature_names = feature_names[feature_selector.get_support(indices=True)]
    print('Selected {} features'.format(train_features.shape[1]))
    return (train_features, test_features, feature_names)
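# Compact, hedged illustration of the idea behind L1SVMFeatureSelection: an L1-penalized
# LinearSVC drives SelectFromModel so that only features with non-zero coefficients are
# kept. This is a standalone example on synthetic data, not the project's pipeline.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
l1_svc = LinearSVC(C=0.1, penalty='l1', dual=False, max_iter=50000).fit(X, y)
selector = SelectFromModel(l1_svc, prefit=True)
X_selected = selector.transform(X)
print(X_selected.shape)  # fewer columns than X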
def _scale_data(self, train_file_num: int = 10000, test_file_num: int = 1000):
    # reduce data
    (self.train_images, self.train_labels) = reduce_date(self.train_images,
                                                         self.train_labels,
                                                         train_file_num)
    (self.test_images, self.test_labels) = reduce_date(self.test_images,
                                                       self.test_labels,
                                                       test_file_num)
    # normalize data
    self.train_images = scale_data(self.train_images)
    self.test_images = scale_data(self.test_images)
    # reshape data
    self.train_images = np.reshape(self.train_images, (self.train_images.shape[0], -1))
    self.test_images = np.reshape(self.test_images, (self.test_images.shape[0], -1))
def __init__(self, evaluation, all_features, all_labels, feature_label_names,
             num_samples, penalty_weights, algorithm, num_jobs):
    '''
    Initializes a feature selector object by cross validating the selector model and
    storing the best transformer.

    all_features: must contain the features, all scaled to the same range if l1-svm
                  feature selection is requested.
    all_labels: labels corresponding to the features.
    num_samples: how many random data points to sample and use from scaled_features for
                 training the feature selector models, to speed up the whole process.
    penalty_weights: determines how to account for imbalance. Either None, which means
                     the data is balanced, or 'balanced', which means use scikit balancing.
    algorithm: which feature selection method to use: anova, logistic, tree.
    '''
    # copy the features and labels, because we don't want to be modifying (scaling or
    # sampling) in place
    features = numpy.copy(all_features)
    labels = numpy.copy(all_labels)
    self.features_label_names_ = feature_label_names

    # scale features? (currently disabled)
    if False:
        (features, dummy) = utils.scale_data(features, None, 'standard')

    if num_samples > 0:
        # select a smaller sample for feature selection
        indices = numpy.random.choice(features.shape[0], num_samples, replace=False)
        features = features[indices, :]
        labels = labels[indices]

    # Set the parameters for grid search and model based on algorithm choice
    if algorithm == 'anova' or algorithm == 'best':
        self.perform_feature_selection(evaluation, features, labels, penalty_weights,
                                       algorithm, num_jobs)
    else:
        sys.exit('bad algorithm for feature selection: ' + algorithm)

    self.best_params_ = self.clf_.best_params_
    print("Best Feature Selection Parameters are: " + str(self.best_params_))
    print("Best Feature Selection CV Score: " + str(self.clf_.best_score_))
    best_score_func = self.best_params_['feature_selection__score_func']
    best_percentile = self.best_params_['feature_selection__percentile']
    self.best_feature_selector_ = self.clf_.best_estimator_.named_steps['feature_selection']
def __init__(self, alg_type, data, all_metrics, use_metrics, output_location, save_plots):
    self.alg_type = alg_type
    self.data = data
    self.all_metrics = all_metrics
    self.use_metrics = use_metrics
    self.output_location = output_location
    self.output_plots_location = None
    if output_location is not None:
        if not os.path.exists(output_location):
            os.makedirs(output_location)
        if save_plots:
            self.output_plots_location = output_location / "plots/"
            if not os.path.exists(self.output_plots_location):
                os.makedirs(self.output_plots_location)
    self.scaled_data = scale_data(data, all_metrics)
    self.output_data = None
    self.output_scaled_data = None
    self.clustering_column_name = None
def __init__(self, data, all_metrics, use_metrics, output_location=None, save_plots=False):
    self.data = data
    self.all_metrics = all_metrics
    self.use_metrics = use_metrics
    self.output_location = output_location
    self.output_plots_location = None
    if output_location is not None:
        if not os.path.exists(output_location):
            os.makedirs(output_location)
        if save_plots:
            self.output_plots_location = output_location / "plots/"
            if not os.path.exists(self.output_plots_location):
                os.makedirs(self.output_plots_location)
    self.scaled_data = scale_data(data, all_metrics)
    self.projected_res = None
def train_knn(train_features, train_labels, test_features, imbalanced_data, train_size,
              scaling_method, minmax_min, minmax_max, skip_feature_selection,
              skip_grid_search, n_neighbors, weights, algorithm, metric, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model and
    the transformed test data
    """
    # Balance the train data set and create the requested train size. Here, instead of
    # scikit balancing, we use the imbalanced_data flag and discard the last output,
    # since it is irrelevant to knn. In order not to balance the data, the third argument
    # should be true (simulating scikit balancing), so we pass the imbalanced_data flag
    # in place of scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      imbalanced_data, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now perform the training on full train data. check on test data
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                 algorithm=algorithm, metric=metric)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
def train_svm(train_features, train_labels, test_features, scikit_balancing, train_size,
              scaling_method, minmax_min, minmax_max, skip_feature_selection,
              skip_grid_search, kernel, gamma, cost, degree, num_jobs):
    """
    Balances, extracts the requested train size, imputes, scales and finally performs
    feature selection on the train data. Then it performs grid search and trains a model
    using the best parameters. Performs all the data transformations on the test data
    and returns the trained model and the transformed test data.
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now perform the training on full train data. check on test data
    # We enable probability estimates, so that we can identify the top samples.
    model = svm.SVC(tol=0.05, cache_size=6000, class_weight=penalty_weights,
                    kernel=kernel, gamma=gamma, C=cost, degree=degree, probability=True)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
def train_random_forest(train_features, train_labels, test_features, scikit_balancing,
                        train_size, skip_feature_selection, skip_grid_search,
                        max_features, n_estimators, criterion, min_samples_split,
                        min_samples_leaf, num_jobs):
    """
    Performs all the data transformations on test data and returns the trained model and
    the transformed test data
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now perform the training on full train data. check on test data
    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=num_jobs,
                                   criterion=criterion, max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
# Always normalize inputs
Xn_tr, norm_param = utl.normalize_data(X_tr)
Xn_te = utl.normalize_data(X_te, norm_param)

# Build boundaries using heuristic rules
LB, UB = utl.bounds_pso(Xn_tr, n_mf, n_outputs, mu_delta=mu_delta, s_par=s_par,
                        c_par=c_par, A_par=A_par)

# Scale output(s) in continuous problems to reduce the range in <A_par>
if (problem != 'C'):
    Y_tr, scal_param = utl.scale_data(Y_tr)
    Y_te = utl.scale_data(Y_te, scal_param)

# Optimize using PSO
# theta = best solution (min)
# info[0] = function value in theta
# info[1] = index of the learner with the best solution
# info[2] = number of learners close to the learner with the best solution
func = interface_PSO
args = (Xn_tr, Y_tr, learners)
theta, info = pso.PSO(func, LB, UB, nPop=nPop, epochs=epochs, K=K,
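# Hedged sketch of the fit-or-apply pattern used by utl.scale_data above (an assumption
# about its behaviour, not the actual implementation): the first call learns scaling
# parameters from the training outputs and returns them, the second call re-uses them.
import numpy as np

def scale_data_sketch(Y, scal_param=None):
    if scal_param is None:
        mu = Y.mean(axis=0)
        sigma = np.where(Y.std(axis=0) == 0, 1.0, Y.std(axis=0))  # guard constant columns
        return (Y - mu) / sigma, (mu, sigma)
    mu, sigma = scal_param
    return (Y - mu) / sigma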
def main():
    read_config(config.config_filename)
    all_data, all_data_ids_map = utils.read_data(config.input_filename,
                                                 config.input_separator,
                                                 config.columns_num)

    ############ BEGIN: build learn dataset
    if not config.ids_rows_learn:
        config.ids_rows_learn = all_data_ids_map.keys()
    if config.read_learn_data_from_file:
        data_learn, data_learn_id_map = utils.read_data(config.learn_data_filename,
                                                        config.input_separator,
                                                        config.columns_num)
        config.ids_rows_learn = data_learn_id_map.keys()
    else:
        data_learn = all_data[[all_data_ids_map[row_id]
                               for row_id in config.ids_rows_learn], :]
    ############ END: build learn dataset

    ############ BEGIN: Compute normalizers & normalize data
    if config.debug:
        print('Start normalizing data...')
    column_vars = compute_column_stds(data_learn)
    if config.normalize_by_mean_distance:
        column_mean_distances = compute_column_mean_distances(data_learn)
        column_normalizers = column_mean_distances.copy()
    else:
        column_normalizers = column_vars.copy()
    for i in range(column_normalizers.shape[0]):
        if column_normalizers[i] == 0.0:
            column_normalizers[i] = 1.0
    if config.normalizing:
        utils.normalize_data(all_data, column_normalizers)
        utils.normalize_data(data_learn, column_normalizers)
    ############ END: Compute normalizers & normalize data

    ############ BEGIN: Filter learn dataset
    data_learn_reduced = data_learn
    if not config.read_learn_data_from_file and config.use_reduction:
        if config.debug:
            print('Started filtering data...')
        ids_rows_learn_reduced = utils.reduce_data_set(config.ids_rows_learn,
                                                       config.use_reduction_size)
        data_learn_reduced = all_data[[all_data_ids_map[row_id]
                                       for row_id in ids_rows_learn_reduced], :]
        config.ids_rows_learn = ids_rows_learn_reduced
        if config.debug:
            print('Saving filtered learning data to %s' % config.reduced_rows_filename)
        utils.print_data_rows(config.ids_rows_learn, data_learn,
                              separator=config.input_separator,
                              file=config.reduced_rows_filename,
                              useCommaInNumbers=config.output_numbers_use_comma)
    ############ END: Filter learn dataset

    ys_to_try = np.arange(config.y_min, config.y_max, config.y_step)

    for y_index in config.columns_ys:
        # For each of the ys specified in the 'columns_ys' option (usually it's just one y)
        if config.debug:
            print('Preparing data for y index %d' % (y_index + 1))

        ############ BEGIN: Prepare data array for y_index
        columns_xs = [i for i in range(config.columns_num) if i not in config.columns_ys]
        all_data_for_index = np.column_stack((all_data[:, y_index],
                                              all_data[:, columns_xs]))
        data_learn_for_index = np.column_stack((data_learn[:, y_index],
                                                data_learn[:, columns_xs]))
        data_learn_for_index_reduced = np.column_stack((data_learn_reduced[:, y_index],
                                                        data_learn_reduced[:, columns_xs]))
        prescalers = [config.prescalers[y_index]] + [config.prescalers[i]
                                                     for i in columns_xs]
        ############ END: Prepare data array for y_index

        ############ BEGIN: prescale data
        all_data_for_index_prescaled = all_data_for_index.copy()
        data_learn_for_index_prescaled_reduced = data_learn_for_index_reduced.copy()
        utils.scale_data(all_data_for_index_prescaled, prescalers)
        utils.scale_data(data_learn_for_index_prescaled_reduced, prescalers)
        ############ END: prescale data

        # Now 'data_learn_for_index' contains data rows, with each row in the following
        # format: y, x_1, ..., x_n
        if config.debug:
            print('Done')

        of = sys.stdout
        if config.output_filename:
            try:
                of = open(utils.generate_output_filename(config.output_filename,
                                                         y_index + 1), 'w')
            except IOError:
                utils.die('Can\'t open output file \'',
                          utils.generate_output_filename(config.output_filename,
                                                         y_index + 1), '\'')
        if config.debug:
            print('Predicting for y with index %d' % (y_index + 1))
        utils.print_header(ys_to_try, len(columns_xs), file=of,
                           separator=config.output_separator,
                           useCommaInNumbers=config.output_numbers_use_comma,
                           printWeightedSumFormulaResults=config.output_weighted_sum_formula_results,
                           printFuncValues=config.output_func_values)

        ys_actual = []
        ys_predicted = []
        for row_id in config.rows_predict:
            # For each row specified in the 'predict' option...
            row_to_predict = all_data_for_index_prescaled[all_data_ids_map[row_id]]
            t1 = time.time()

            ############ BEGIN: Compute scalers
            data_learn_for_index_prescaled_reduced_scaled = data_learn_for_index_prescaled_reduced
            if config.compute_scalers:
                if config.debug:
                    print('Computing scalers for row #' + str(row_id))
                scalers, weightSumResults, best_shifts, best_func_deltas, \
                    best_row_directions, ys_to_try_sorted, func_values, directions = \
                    compute_scalers(row_to_predict, data_learn_for_index, ys_to_try)
                if config.test_output:
                    utils.print_test_output(row_id, row_to_predict, ys_to_try_sorted,
                                            best_row_directions, best_shifts,
                                            best_func_deltas, func_values, directions,
                                            useCommaInNumbers=config.output_numbers_use_comma,
                                            separator=config.output_separator)
                scalers[scalers == 0.0] = 1.0
                data_learn_for_index_prescaled_reduced_scaled = \
                    data_learn_for_index_prescaled_reduced.copy()
                utils.scale_data(data_learn_for_index_prescaled_reduced_scaled, scalers)
                utils.scale_data(row_to_predict, scalers)
            else:
                scalers = np.array([1.0 for _ in range(len(columns_xs) + 1)])
                weightSumResults = np.array([None for _ in range(len(columns_xs) + 1)])
            ############ END: Compute scalers

            if config.debug:
                print('Predicting row #' + str(row_id))
            try:
                # Make prediction for the current row
                y_actual, y_best, func_best, func_values, y_local_minims = \
                    make_prediction(row_to_predict,
                                    data_learn_for_index_prescaled_reduced_scaled,
                                    config.y_d, config.y_pow, ys_to_try)
                t2 = time.time()
                if config.debug:
                    print('Elapsed %.6f ms' % ((t2 - t1) * 1000))
                ys_actual.append(y_actual)
                ys_predicted.append(y_best)

                #### BEGIN: Denormalizing results
                y_actual_denormalized = y_actual / prescalers[0]
                y_actual_denormalized = y_actual_denormalized / scalers[0]
                if config.normalizing:
                    y_actual_denormalized *= column_normalizers[0]
                y_best_denormalized = y_best / prescalers[0] if y_best is not None else None
                y_best_denormalized = y_best_denormalized / scalers[0] if y_best is not None else None
                if config.normalizing and y_best_denormalized is not None:
                    y_best_denormalized *= column_normalizers[0]
                y_local_minims_denormalized = list(map(lambda y: y / prescalers[0],
                                                       y_local_minims))
                y_local_minims_denormalized = list(map(lambda y: y / scalers[0],
                                                       y_local_minims_denormalized))
                if config.normalizing:
                    y_local_minims_denormalized = list(map(
                        lambda x: x * column_normalizers[0], y_local_minims_denormalized))
                #### END: Denormalizing results

                # Print prediction for the current row
                utils.print_prediction(row_id, y_actual, y_actual_denormalized, y_best,
                                       y_best_denormalized, scalers, weightSumResults,
                                       func_values, y_local_minims,
                                       y_local_minims_denormalized, file=of,
                                       printWeightedSumFormulaResults=config.output_weighted_sum_formula_results,
                                       printFuncValues=config.output_func_values,
                                       separator=config.output_separator,
                                       useCommaInNumbers=config.output_numbers_use_comma)
            except NoSuchDataLine:
                print('Can\'t predict line #' + str(row_id) + ': no such line in data!')
                exit(1)

        utils.print_common_statistics(column_mean_distances, column_vars,
                                      config.columns_num, columns_xs,
                                      config.normalize_by_mean_distance, of, y_index,
                                      ys_actual, ys_predicted)
        if config.output_filename:
            of.close()
strategy='mean', axis=0, copy=False)
imputer.fit(data)
data = imputer.transform(data)

features = data[:, 0:args.label_column]
labels = data[:, args.label_column].astype(int)

train_features, test_features, train_labels, test_labels = (
    model_selection.train_test_split(features, labels, test_size=0.20))

# scale data only if the model is linear (svm, logistic regression) or the scales of
# the features are relevant (knn)
if args.algorithm in ['linear-svm', 'kernel-svm', 'logistic', 'knn']:
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       'minmax')

# parse scoring methods
scores = list()
for scoring_function in args.scoring_functions.split(','):
    if not scoring_function in [
            "accuracy", "weighted-precision", "macro-precision", "weighted-recall",
            "macro-recall", "weighted-f1", "macro-f1"
    ]:
        sys.exit('Invalid scoring function: ' + scoring_function + ' provided')
    scores.append(scoring_function)

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print("")
feature_importances = args.feature_importances

data = pd.read_csv(data_path)
data = utils.construct_features(data, is_lag)

if is_lag:
    features = np.r_[1, 2, 3, 4, 7, 8, 10:20, 24:32]
else:
    features = np.r_[1, 2, 3, 4, 7, 8, 10, 11]

x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, features],
                                                    data.iloc[:, 9],
                                                    test_size=0.33,
                                                    random_state=151)
x_train, x_test, standard_scaler = utils.scale_data(x_train, x_test)
if save_models:
    utils.save_transform(standard_scaler, is_lag)

x_train_ = np.copy(x_train)
y_train_ = np.copy(y_train)

models = [('Isolation Forest (Unsupervised)',
           IsolationForest(n_estimators=1000, contamination=.5, max_features=1,
                           max_samples=1000, random_state=0)),
          ('One-Class SVM (Semi-Supervised)', OneClassSVM(kernel='linear', nu=0.2)),
          ('Random Forest (Supervised)',
def get_bold_for_condition(dir_input, num_runs, option_zscore=0):
    """
    This function extracts the BOLD signal for three conditions.
    option_zscore = 0 => no z-scoring
    option_zscore = 1 => z-score the data
    Returns: BOLD values for all conditions for each run, and a mean value for the
    entire run.
    """
    from utils import shift_timing, mask_data, scale_data

    # Initialize arrays
    stim_label = []
    bold_A = []
    bold_B = []
    bold_C = []
    bold_fix = []
    bold_mean_all = []
    TR_shift_size = 2  # Number of TRs to shift the extraction of the BOLD signal.
    maskdir = dir_input
    masks = ['ROI_Cool']

    ### Extract the BOLD Signal for the conditions A, B, C ###
    print("Processing Start ...")
    maskfile = (maskdir + "%s.nii.gz" % (masks[0]))
    mask = nib.load(maskfile)
    print("Loaded Mask")
    print(mask.shape)

    for run in range(1, num_runs + 1):
        epi_in = (dir_input + "lab1_r0%s.nii.gz" % (run))
        stim_label = np.load(dir_input + 'labels_r0%s.npy' % (run))

        # Haemodynamic shift
        label_TR_shifted = shift_timing(stim_label, TR_shift_size)

        # Get labels for conditions A, B, C, and baseline fixation.
        A = np.squeeze(np.argwhere(label_TR_shifted == 1))
        B = np.squeeze(np.argwhere(label_TR_shifted == 2))
        C = np.squeeze(np.argwhere(label_TR_shifted == 3))
        fixation = np.squeeze(np.argwhere(label_TR_shifted == 0))

        epi_data = nib.load(epi_in)
        epi_mask_data = mask_data(epi_data, mask)
        if option_zscore == 1:
            epi_maskdata_zscore = scale_data(epi_mask_data)
            epi_mask_data = epi_maskdata_zscore

        if run == 1:
            bold_A = epi_mask_data[A]
            bold_B = epi_mask_data[B]
            bold_C = epi_mask_data[C]
            bold_fix = epi_mask_data[fixation]
            bold_data_all = epi_mask_data
        else:
            bold_A = np.vstack([bold_A, epi_mask_data[A]])
            bold_B = np.vstack([bold_B, epi_mask_data[B]])
            bold_C = np.vstack([bold_C, epi_mask_data[C]])
            bold_fix = np.vstack([bold_fix, epi_mask_data[fixation]])
            bold_data_all = np.vstack([bold_data_all, epi_mask_data])
        bold_mean_all.append(np.mean(epi_mask_data))

    print("Processing Completed")
    return bold_data_all, bold_mean_all, bold_A, bold_B, bold_C, bold_fix, label_TR_shifted
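# Hedged sketch of a z-scoring helper in the role of the imported scale_data (an
# assumption, not the actual implementation): each voxel's time course is standardized
# to zero mean and unit variance across TRs, matching the option_zscore=1 path above.
import numpy as np

def scale_data_sketch(epi_mask_data):
    mean = epi_mask_data.mean(axis=0, keepdims=True)
    std = epi_mask_data.std(axis=0, keepdims=True)
    std[std == 0] = 1.0  # guard against constant voxels
    return (epi_mask_data - mean) / std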
import numpy as np
from sklearn.utils import shuffle

from mlp import MyMlp
from utils import read_csv, oneHotEncoder, scale_data

if __name__ == "__main__":
    dataset = read_csv('datasets/iris.csv')
    label = dataset[0]
    data = dataset[1:]
    data = shuffle(data)

    target_values = data[0:100, -1:].flatten()
    data_feature = data[0:100, :-1].astype(float)
    data_feature = scale_data(data_feature, -1, 1)
    data_target = oneHotEncoder(target_values)

    input_layer = len(data_feature[0])
    output_layer = len(set(target_values))
    hidden_layer = [4, 3]

    mlp = MyMlp(input_layer, hidden_layer, output_layer)
    mlp.fit(data_feature, data_target, 0.1, mini_batch_size=10, epochs=10000,
            learning_rate=0.01)
    mlp.print()
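# Hedged sketch of a min-max rescaler matching the scale_data(data_feature, -1, 1) call
# above (an assumption about its behaviour, not the actual utils implementation): each
# column is mapped linearly onto the [new_min, new_max] range.
import numpy as np

def scale_data_sketch(data, new_min, new_max):
    col_min = data.min(axis=0)
    col_max = data.max(axis=0)
    span = np.where(col_max > col_min, col_max - col_min, 1.0)  # avoid divide-by-zero
    return (data - col_min) / span * (new_max - new_min) + new_min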
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Impute the data and replace missing values
    imputer = preprocessing.Imputer(missing_values="NaN", strategy='mean', axis=0,
                                    copy=False)
    imputer.fit(data)
    data = imputer.transform(data)

    # Extract features/labels and their names from raw data
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]

    # scale data no matter what, since the feature selector is L1-SVM
    (scaled_features, dummy) = utils.scale_data(features, None, 'minmax')

    # open output file and write header with max_num_features selected features
    output_file = open(args.output_filename, 'w')
    output_file_writer = csv.writer(output_file)
    header = [
        "num_features_selected", "test_size", "avg_true_positive", "avg_false_positive",
        "avg_true_negative", "avg_false_negative", "avg_accuracy", "avg_pos_f1",
        "avg_neg_f1", "avg_average_f1", "avg_pos_precision", "avg_neg_precision",
        "avg_average_precision", "avg_pos_recall", "avg_neg_recall", "avg_average_recall"
    ]
    for i in range(1, args.max_num_features + 1):
        header.extend(["feature" + str(i), "feature" + str(i) + "_weight"])
    output_file_writer.writerow(header)

    feature_selector_obj = feature_selection.feature_selector(
        scaled_features, labels, args.num_samples, args.scikit_balancing)

    for num_features in range(args.min_num_features, args.max_num_features + 1):
        # Before anything, the feature selector object must be set to num_features
        feature_selector_obj.select_top_features(num_features)
        selected_features = feature_selector_obj.get_selected_features(feature_names)

        # Print selected and unselected features.
        print('\nSelected Feature,Weight')
        for feature, feature_coef in selected_features:
            print(feature + "," + str(feature_coef))

        # Now transform and restrict the features to those only selected by the L1-svm
        transformed_scaled_features = feature_selector_obj.transform(scaled_features)
        transformed_features = feature_selector_obj.transform(features)
        print('\n' + str(len(selected_features)) + ' out of ' + str(features.shape[1]) +
              ' features are selected.\n')

        # Now perform the learning task using the top features and report results. Make
        # sure to pass scaled features to svm.
        num_test_trials = 10
        test_size = args.test_size if args.test_size <= 1.0 else int(args.test_size)
        if args.learning_algorithm == 'random-forest':
            rf_max_features = utils.extract_max_features(args.rf_max_features)
            metrics = perform_random_forest(
                transformed_features, labels, args.rf_num_trees, args.rf_criterion,
                rf_max_features, args.rf_min_samples_split, args.rf_min_samples_leaf,
                args.scikit_balancing, test_size, num_test_trials)
        elif args.learning_algorithm == 'svm':
            metrics = perform_svm(transformed_scaled_features, labels, args.svm_kernel,
                                  args.svm_gamma, args.svm_cost, args.svm_degree,
                                  args.scikit_balancing, test_size, num_test_trials)
        elif args.learning_algorithm == 'logistic':
            metrics = perform_logistic(transformed_features, labels,
                                       args.logistic_penalty, args.logistic_cost,
                                       args.scikit_balancing, test_size, num_test_trials)
        elif args.learning_algorithm == 'knn':
            metrics = perform_knn(transformed_scaled_features, labels,
                                  args.knn_num_neighbors, args.knn_weights,
                                  args.knn_algorithm, args.knn_metric,
                                  args.knn_imbalanced_data, test_size, num_test_trials)

        # write a row for num_features selected to output file
        output_row = [len(selected_features)]
        output_row.extend(metrics)
        for feature, feature_coef in selected_features:
            output_row.extend([feature, feature_coef])
        output_row.extend([''] * (len(header) - len(output_row)))
        output_file_writer.writerow(output_row)
        print('******************************\n')

    output_file.close()
dev_tuple = train_test_split(dev_idxs, test_size=0.25, shuffle=True,
                             stratify=dev_gold_labels)
train_idxs = dev_tuple[0]
train_instances = instances.iloc[train_idxs].values
train_gold_labels = gold_labels.iloc[train_idxs].values.ravel()
val_idxs = dev_tuple[1]
validation_instances = instances.iloc[val_idxs].values
validation_gold_labels = gold_labels.iloc[val_idxs].values.ravel()

scaled_instances = scale_data(train_instances, validation_instances, test_instances)
train_instances = scaled_instances[0]
validation_instances = scaled_instances[1]
test_instances = scaled_instances[2]

# TODO: split the data by instance hardness using kDN
for hardness_type, filter_func in config["validation_hardnesses"]:
    print('Hardness type: ', hardness_type)
    validation_instances, validation_gold_labels = select_validation_set(
        train_instances, train_gold_labels, filter_func, config["kdn"])
    predictions[dataset_name][fold][hardness_type] = {}
num_layers = 1
output_step = 30  # predict step
output_size = 6   # dimension of output
learning_rate = 0.001
model_path = 'pred30/model121.tar'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
scale_data_squeezed = util.load_data_squeeze(scale_data_set, input_size)
test_data, test_data_col = util.load_data(test_data_set)

# Construct scaler
scaler = MinMaxScaler(feature_range=(-5, 5))
scaler.fit(scale_data_squeezed)
test_data_normalized = util.scale_data(test_data, test_data_col, scaler)

# Load model
if (model_path[-1] == 'r'):
    model = LSTM.RNN(input_size, sequence_length, hidden_size, num_layers, output_step,
                     output_size, device).to(device)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
else:
    model = LSTM.RNN(input_size, sequence_length, hidden_size, num_layers, output_step,
                     output_size, device).to(device)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint.state_dict())
model.eval()

print('Start adapting!')
param = model.state_dict()['linear.weight']
def main():
    # First read header columns
    input_file = open(args.input_filename, 'r')
    input_file_reader = csv.reader(input_file)
    headers = next(input_file_reader)
    input_file.close()

    # Let numpy know that NA corresponds to our missing value
    data = numpy.genfromtxt(args.input_filename, delimiter=",", skip_header=1,
                            missing_values="NA", filling_values="NaN")

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(data)
    data = imputer.transform(data)
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column:]

    # scale data
    (features, dummy) = utils.scale_data(features, None, args.scaling_method)

    num_components = args.num_components
    if num_components and num_components != 'mle':
        num_components = float(num_components)
        if num_components >= 1:
            num_components = int(num_components)
    pca = PCA(n_components=num_components, copy=True)
    transformed_features = pca.fit_transform(features)

    # write transformed features
    if args.transformed_output_filename:
        fields = []
        formats = []
        for i in range(1, num_components + 1):
            fields.append('component' + str(i))
            formats.append('%.20f')
        for i in range(1, data.shape[1] - args.label_column + 1):
            fields.append('label' + str(i))
            formats.append('%i')
        header = ','.join(fields)
        output_data = numpy.column_stack((transformed_features, labels))
        numpy.savetxt(args.transformed_output_filename, output_data, comments='',
                      fmt=formats, delimiter=',', header=header)

    # write component loadings to output file
    loadings_output_file = open(args.loadings_output_filename, 'w')
    loadings_output_file_writer = csv.writer(loadings_output_file)
    loadings_output_file_writer.writerow(headers[0:args.label_column])
    for i in range(0, len(pca.components_)):
        component = pca.components_[i]
        loadings_output_file_writer.writerow(component)
    loadings_output_file.close()

    # Now write the individual and cumulative variance explained by each successive component
    explained_variance_output_file = open(args.explained_variance_output_filename, 'w')
    explained_variance_output_file_writer = csv.writer(explained_variance_output_file)
    explained_variance_output_file_writer.writerow(
        ['Component_Number', 'Explained_Variance', 'Total_Explained_Variance'])
    total_explained_variance = 0
    for i in range(0, len(pca.components_)):
        explained_variance = pca.explained_variance_ratio_[i] * 100.
        total_explained_variance += explained_variance
        explained_variance_output_file_writer.writerow(
            [i + 1, explained_variance, total_explained_variance])
    explained_variance_output_file.close()

    # print top loadings per component
    for i in range(0, len(pca.components_)):
        print('Top ' + str(args.num_top_loadings) + ' loadings for component ' + str(i))
        component = pca.components_[i]
        abs_component = map(abs, component)
        # Get the indices of components sorted by the features loading
        sorted_indices = [
            i[0] for i in sorted(enumerate(abs_component), key=lambda x: x[1],
                                 reverse=True)
        ]
        for l in range(0, args.num_top_loadings):
            index = sorted_indices[l]
            print(headers[index] + ' : ' + str(component[index]))
        print('\n\n')

    print('Explained variance ratio\n ' + str(pca.explained_variance_ratio_))
    print('Total explained variance ' + str(numpy.sum(pca.explained_variance_ratio_)))
    return model


# load MNIST data directly from keras
mnist_data = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist_data.load_data()

# reduce training data
train_images, train_labels = utils.reduce_date(train_images, train_labels, 10240)
# reduce testing data
test_images, test_labels = utils.reduce_date(test_images, test_labels, 500)

# scale input data
scaled_train_images, scaled_test_images = utils.scale_data(train_images, test_images)

# add a dummy channel dimension
scaled_train_images = scaled_train_images[..., np.newaxis]
scaled_test_images = scaled_test_images[..., np.newaxis]

# create validation set
scaled_train_images, scaled_val_images, train_labels, val_labels = \
    train_test_split(scaled_train_images, train_labels, test_size=0.15)

# initialize the plot
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5))
plt.subplots_adjust(bottom=.2, wspace=.3)

# initiate the metric record for the plot
metric_plot = {'layers': [], 'test_acc': [], 'train_t': [], 'test_t': []}
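# Hedged sketch of the image scaler called above (an assumption, not the repository's
# utils.scale_data): MNIST pixel intensities are mapped from [0, 255] to [0, 1].
def scale_data_sketch(train_images, test_images):
    return train_images / 255.0, test_images / 255.0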
def main():
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(args.input_filename,
                                                            args.label_column,
                                                            args.train_size,
                                                            args.test_size,
                                                            args.imbalanced_data)

    # now that we have limited the data to requested train size, scale data since svm
    # needs to be scaled
    (train_features, test_features) = utils.scale_data(train_features, test_features,
                                                       args.scaling_method)

    # We let scikit use its balancing scheme if it is explicitly requested
    penalty_weights = 'balanced' if args.imbalanced_data else None

    # feature selection if requested
    if args.feature_selection_algo:
        feature_selector_obj = feature_selection.feature_selector(
            args.evaluation, train_features, train_labels, feature_label_names, -1,
            penalty_weights, args.feature_selection_algo, args.num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected " + str(len(feature_selector_obj.get_selected_features())) +
              " features")
        print("Top 10 features: " + str(feature_selector_obj.get_top_features(10)))

    # ovr only works for linear svm
    multi_class = 'ovr' if args.kernel == 'linear' else args.multi_class
    model = models.train_svm(train_features, train_labels, penalty_weights,
                             args.skip_grid_search, args.evaluation, args.num_jobs,
                             args.kernel, args.cost, args.gamma, args.degree,
                             args.multi_class)

    # Predict test and report full stats
    y_true, y_pred = test_labels, model.predict(test_features)
    print("\n*****************************\n")
    print('MAE: ' + str(metrics.mean_absolute_error(y_true, y_pred,
                                                    multioutput='uniform_average')))
    print('MSE: ' + str(metrics.mean_squared_error(y_true, y_pred,
                                                   multioutput='uniform_average')))
    print('Classification report:')
    print(metrics.classification_report(y_true, y_pred, class_values, class_names))
    print('Precision Recall')
    print(metrics.precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                                  pos_label=None, average='weighted'))

    # print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    numpy.set_printoptions(precision=2)
    cm = metrics.confusion_matrix(y_true, y_pred, class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_names,
                                'Normalized confusion matrix')
    # plt.savefig(args.output_figure + '.pdf')
    pdf = PdfPages(args.output_figure + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from utils import scale_data
from pprint import pprint

# load MNIST data directly from keras
mnist_data = tf.keras.datasets.mnist
_, (test_images, test_labels) = mnist_data.load_data()

# scale input data
_, scaled_test_images = scale_data(test_images, test_images)

# add a dummy channel dimension
scaled_test_images = scaled_test_images[..., np.newaxis]

# randomly choose an image from the test set and plot it
random_inx = np.random.choice(scaled_test_images.shape[0])
test_image = scaled_test_images[random_inx]
plt.imshow(test_image, cmap='Greys')

# load the model
n = 2   # model with this number of layers
ep = 5  # epoch number
model = load_model(f'02_IBS-Saved Model/{n}-Layers/IBS_Ep{ep:02d}')

# use the model to predict the label of the chosen image
prediction = model.predict(test_image[np.newaxis, ...])
print(f"Random index is:\t{random_inx}")  # print random index