def train_logistic(train_features, train_labels, test_features, scikit_balancing,
                   train_size, skip_feature_selection, skip_grid_search, penalty,
                   cost, dual, tol, num_jobs):
    """Performs all the data transformations on test data and returns the trained
    model and the transformed test data.
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {}".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now perform the training on full train data. check on test data
    model = LogisticRegression(penalty=penalty, dual=dual, C=cost, tol=tol,
                               max_iter=5000, class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
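
# A minimal usage sketch for train_logistic (illustrative only). It assumes
# utils.prepare_data behaves as in the driver scripts below; the file name and
# argument values here are hypothetical.
def _example_train_logistic():
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(
         'data.csv', -1, 0.8, 0.2, False)  # hypothetical arguments
    model, train_features, train_labels, test_features = train_logistic(
        train_features, train_labels, test_features, scikit_balancing=True,
        train_size=-1, skip_feature_selection=True, skip_grid_search=True,
        penalty='l2', cost=1.0, dual=False, tol=1e-4, num_jobs=1)
    print("Test accuracy: %.3f" % model.score(test_features, test_labels))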
def main():
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(
         args.input_filename, args.label_column, args.train_size, args.test_size,
         args.imbalanced_data)

    # now that we have limited the data to requested train size, scale data since
    # svm needs scaled features
    (train_features, test_features) = utils.scale_data(train_features,
                                                       test_features,
                                                       args.scaling_method)

    # We let scikit use its balancing scheme if it is explicitly requested
    penalty_weights = 'balanced' if args.imbalanced_data else None

    # feature selection if requested
    if args.feature_selection_algo:
        feature_selector_obj = feature_selection.feature_selector(
            args.evaluation, train_features, train_labels, feature_label_names, -1,
            penalty_weights, args.feature_selection_algo, args.num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected " +
              str(len(feature_selector_obj.get_selected_features())) + " features")
        print("Top 10 features: " + str(feature_selector_obj.get_top_features(10)))

    # ovr only works for linear svm
    multi_class = 'ovr' if args.kernel == 'linear' else args.multi_class
    model = models.train_svm(train_features, train_labels, penalty_weights,
                             args.skip_grid_search, args.evaluation, args.num_jobs,
                             args.kernel, args.cost, args.gamma, args.degree,
                             multi_class)

    # Predict test and report full stats
    y_true, y_pred = test_labels, model.predict(test_features)
    print("\n*****************************\n")
    print('MAE: ' + str(metrics.mean_absolute_error(
        y_true, y_pred, multioutput='uniform_average')))
    print('MSE: ' + str(metrics.mean_squared_error(
        y_true, y_pred, multioutput='uniform_average')))
    print('Classification report:')
    print(metrics.classification_report(y_true, y_pred, class_values, class_names))
    print('Precision Recall')
    print(metrics.precision_recall_fscore_support(y_true, y_pred,
                                                  labels=class_values,
                                                  pos_label=None,
                                                  average='weighted'))

    # print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    numpy.set_printoptions(precision=2)
    cm = metrics.confusion_matrix(y_true, y_pred, class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_names,
                                'Normalized confusion matrix')
    pdf = PdfPages(args.output_figure + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()
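
# A tiny self-contained illustration of the row normalization used in main() above:
# dividing each row of the confusion matrix by its row sum turns raw counts into
# per-class recall fractions. The numbers are made up.
def _example_confusion_matrix_normalization():
    cm = numpy.array([[8, 2],
                      [1, 4]])  # row sums are 10 and 5
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)  # [[0.8 0.2]
                          #  [0.2 0.8]]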
def main():
    add_log_vars = True
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(
         args.input_filename, args.label_column, args.train_size, args.test_size,
         args.imbalanced_data, add_log_vars)
    print("Label is {}".format(feature_label_names[-1]))

    # We let scikit use its balancing scheme if it is explicitly requested
    penalty_weights = 'balanced' if args.imbalanced_data else None

    # feature selection if requested
    if args.feature_selection_algo:
        feature_selector_obj = feature_selection.feature_selector(
            args.evaluation, train_features, train_labels, feature_label_names, -1,
            penalty_weights, args.feature_selection_algo, args.num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected " +
              str(len(feature_selector_obj.get_selected_features())) + " features")
        print("Top 10 features: " + str(feature_selector_obj.get_top_features(10)))

    # multinomial only works with the lbfgs solver
    solver = 'liblinear' if args.multi_class == 'ovr' else 'lbfgs'
    model = models.train_logistic(train_features, train_labels, penalty_weights,
                                  args.skip_grid_search, args.evaluation,
                                  args.num_jobs, args.penalty, args.cost,
                                  args.multi_class, solver)

    # Predict test and report full stats
    y_true = test_labels
    y_pred_prob = model.predict_proba(test_features)
    df = pd.DataFrame(data=y_pred_prob, columns=model.classes_)
    df['max_prob'] = df.max(axis=1)
    df['max_prob_class'] = df.idxmax(axis=1)
    df['true'] = y_true
    y_pred = df['max_prob_class']

    print("\n*****************************\n")
    print('MAE on test: {}'.format(
        mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))
    print('Test Accuracy: {}'.format(accuracy_score(y_true, y_pred) * 100.))
    print('Classification report:')
    print(classification_report(y_true, y_pred, class_values))
    print('Weighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='weighted'))
    print('Unweighted Precision Recall:')
    print(precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                          pos_label=None, average='macro'))

    # print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    numpy.set_printoptions(precision=2)
    cm = metrics.confusion_matrix(y_true, y_pred, class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')
    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_names,
                                'Normalized confusion matrix')
    pdf = PdfPages(args.output_figure + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()

    # Now print stats on subsets based on confidence of max_prob_class. Sort
    # predictions by confidence in descending order and take subsets from the top
    # of the sorted df.
    df = df.sort_values(by='max_prob', ascending=False)
    print(','.join(['Probability Threshold', 'Percentage Predicted', 'Accuracy',
                    'AverageRecall', 'AveragePrecision', 'AverageFscore']))
    for percent_to_predict in range(1, 100):
        lowest_idx = int(percent_to_predict * len(df.index) / 100.0)
        df_subset = df.iloc[0:lowest_idx]
        prob_threshold = df_subset['max_prob'].min()
        accuracy = accuracy_score(df_subset['true'], df_subset['max_prob_class'])
        # Compute the averaged stats on the same subset as the accuracy above.
        (precision, recall, fscore, support) = precision_recall_fscore_support(
            df_subset['true'], df_subset['max_prob_class'], labels=class_values,
            pos_label=None, average='macro')
        print(','.join(map(str, [prob_threshold, percent_to_predict, accuracy,
                                 recall, precision, fscore])))
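
# A standalone sketch of the confidence-thresholding idea used in the loop above:
# rank test samples by their maximum predicted probability and evaluate only the
# most confident fraction. The helper name is hypothetical; it assumes a fitted
# classifier with predict_proba and numpy-array labels.
def _evaluate_top_percent(model, test_features, test_labels, percent_to_predict):
    probs = model.predict_proba(test_features)
    max_prob = probs.max(axis=1)
    preds = model.classes_[probs.argmax(axis=1)]
    order = numpy.argsort(-max_prob)  # most confident first
    keep = order[:int(len(order) * percent_to_predict / 100.0)]
    return accuracy_score(test_labels[keep], preds[keep]), max_prob[keep].min()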
def train_knn(train_features, train_labels, test_features, imbalanced_data,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, n_neighbors, weights,
              algorithm, metric, num_jobs):
    """Performs all the data transformations on test data and returns the trained
    model and the transformed test data.
    """
    # Balance the train data set and create the requested train size. Here, instead
    # of scikit balancing, we use the imbalanced_data flag and discard the last
    # output since it is irrelevant to knn. In order not to balance the data, the
    # third argument should be true (simulate scikit balancing); so we use the
    # imbalanced_data flag in place of scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features, test_features) = utils.scale_data(train_features,
                                                       test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      imbalanced_data, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {}".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now perform the training on full train data. check on test data
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                                 algorithm=algorithm, metric=metric)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
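
# A toy, self-contained illustration of the `weights` parameter tuned above:
# 'uniform' gives every neighbor one vote, while 'distance' weights votes by
# inverse distance, so one very close neighbor can outvote two distant ones.
# The data below is made up.
def _example_knn_weights():
    X = [[0.0], [0.1], [0.2], [1.0]]
    y = [0, 0, 0, 1]
    for weights in ('uniform', 'distance'):
        model = KNeighborsClassifier(n_neighbors=3, weights=weights)
        model.fit(X, y)
        # The query 0.9 is closest to the single class-1 point, so the two
        # weighting schemes disagree on it.
        print(weights, model.predict([[0.9]]))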
def train_svm(train_features, train_labels, test_features, scikit_balancing,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, kernel, gamma, cost,
              degree, num_jobs):
    """Balances the data, extracts the requested train size, imputes, scales and
    finally performs feature selection on the train data. Then it performs grid
    search and trains a model using the best parameters. Performs all the data
    transformations on test data and returns the trained model and the transformed
    test data.
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features, test_features) = utils.scale_data(train_features,
                                                       test_features,
                                                       scaling_method, minmax_min,
                                                       minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {}".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now perform the training on full train data. check on test data.
    # We enable probability estimates, so that we can identify the top samples.
    model = svm.SVC(tol=0.05, cache_size=6000, class_weight=penalty_weights,
                    kernel=kernel, gamma=gamma, C=cost, degree=degree,
                    probability=True)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
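
# Because train_svm fits SVC with probability=True, the returned model supports
# predict_proba. A minimal sketch (names are illustrative, and numpy is assumed to
# be imported) of using it to rank test samples by prediction confidence, as the
# probability comment above suggests.
def _example_rank_by_confidence(model, test_features):
    probs = model.predict_proba(test_features)  # shape (n_samples, n_classes)
    confidence = probs.max(axis=1)
    return numpy.argsort(-confidence)  # sample indices, most confident first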
def train_random_forest(train_features, train_labels, test_features,
                        scikit_balancing, train_size, skip_feature_selection,
                        skip_grid_search, max_features, n_estimators, criterion,
                        min_samples_split, min_samples_leaf, num_jobs):
    """Performs all the data transformations on test data and returns the trained
    model and the transformed test data.
    """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features, scaled_test_features) = utils.scale_data(
            train_features, test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)

    # requested grid search. find best parameters, to achieve highest average recall
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features, train_labels,
                                      scikit_balancing, algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {}".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now perform the training on full train data. check on test data
    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=num_jobs,
                                   criterion=criterion, max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)
    return (model, train_features, train_labels, test_features)
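
# utils.extract_max_features is not shown in this file. RandomForestClassifier
# accepts max_features as an int, a float fraction, 'sqrt', 'log2', or None, so a
# plausible sketch of the helper (purely an assumption, not the actual utils code)
# just maps a command-line string onto one of those types:
def _extract_max_features_sketch(value):
    if value in ('sqrt', 'log2', 'auto'):
        return value
    if value in ('None', ''):
        return None
    return float(value) if '.' in value else int(value)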
def optimise_step(self, df_train, df_target, npoints=1, nrandom=1, n_iter=50,
                  set_callbacks=True):
    """Encodes and scales the data, builds the pipeline and tunes its
    hyperparameters with Bayesian search. If no parameters are set, the default
    configuration for each step is used.

    Parameters
    ----------
    df_train : pandas dataframe of shape = (n_train, n_features)
        The train dataset with numerical features.
    df_target : pandas series of shape = (n_train,)
        The numerically encoded target for classification tasks.
    npoints : int, default = 1
        Passed through to BayesSearchCV as npoints.
    nrandom : int, default = 1
        Passed through to BayesSearchCV as nrandom.
    n_iter : int, default = 50
        Maximum number of search evaluations.
    set_callbacks : bool, default = True
        If True, report_perf is called with the default callbacks (on_step and a
        one-hour DeadlineStopper).

    Returns
    -------
    best_param : dict
        The best parameters found across all surrogate models.
    tuning_result : dict
        One result dict per surrogate model, with keys:
        - 'best_score' : best score after tuning
        - 'best_score_std' : standard deviation of the best score
        - 'best_parmas' : best parameters
        - 'params' : all parameters (one entry per checked candidate)
        - 'Time_cost' : total time spent finding the best parameters
        - 'all_cv_results' : all cv results
        - 'mean_score_time' : time for each cv result
    """
    # Encode categorical features and, if there are float columns, scale them.
    ce = Categorical_encoder()
    X = ce.fit_transform(df_train, df_target)

    if len(df_train.dtypes[df_train.dtypes == 'float'].index) != 0:
        scal = Scaler()
        X = scal.fit_transform(X, df_target)
        self.perform_scaling = True

    mid_result = {}
    tuning_result = {}

    # Wide datasets also get a feature-selection step in their search spaces.
    need_feature_selection = len(pd.DataFrame(X).columns) > 20
    search_space_LGB = Classifier(strategy="LightGBM").get_search_spaces(
        need_feature_selection=need_feature_selection)
    search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
        need_feature_selection=need_feature_selection)
    search_spaces = [search_space_SVC, search_space_LGB]

    # Initialize a pipeline: attach a feature selector if any search space tunes
    # fs__ parameters.
    fs = None
    for i in range(len(search_spaces)):
        # A search space may be a plain dict or a (space, n_iter) tuple.
        space = (search_spaces[i][0] if isinstance(search_spaces[i], tuple)
                 else search_spaces[i])
        for p in space.keys():
            if p.startswith("fs__"):
                fs = feature_selector()
    if fs is None:
        print(">> Number of Features < 20, ignore feature selection")

    # Do we need to cache transformers?
    cache = False
    if fs is not None:
        if "fs__strategy" in search_spaces:
            if search_spaces["fs__strategy"] != "variance":
                cache = True

    mprint('Start tuning hyperparameters ....')
    print("")
    print(">>> Categorical features have been encoded with: " +
          str({'strategy': ce.strategy}))
    print("")
    if self.perform_scaling is True:
        print(">>> Numerical features have been scaled with: " +
              scal.__class__.__name__)
        print("")

    for baseestimator in self.baseEstimator:
        # Pipeline creation
        lgb = Classifier(strategy="LightGBM").get_estimator()
        # rf = Classifier(strategy="RandomForest").get_estimator()
        # svc = Classifier(strategy="SVC").get_estimator()
        if fs is not None:
            if cache:
                pipe = Pipeline([('fs', fs), ('model', lgb)], memory=self.to_path)
            else:
                pipe = Pipeline([('fs', fs), ('model', lgb)])
        else:
            if cache:
                pipe = Pipeline([('model', lgb)], memory=self.to_path)
            else:
                pipe = Pipeline([('model', lgb)])

        # Use all cores only when a parallel strategy is requested.
        n_jobs = -1 if self.parallel_strategy is True else 1
        opt = BayesSearchCV(pipe,
                            search_spaces=search_spaces,
                            scoring=self.scoring,
                            cv=self.cv,
                            npoints=npoints,
                            n_jobs=n_jobs,
                            n_iter=n_iter,
                            nrandom=nrandom,
                            return_train_score=False,
                            optimizer_kwargs={'base_estimator': baseestimator,
                                              "acq_func": "EI"},
                            random_state=self.random_state,
                            verbose=self.verbose,
                            refit=self.refit)

        # The surrogate model may be given as a name (e.g. "RF") or as an estimator
        # instance such as GaussianProcessRegressor.
        if not isinstance(baseestimator, GaussianProcessRegressor):
            surrogate_name = baseestimator
        else:
            surrogate_name = baseestimator.__class__.__name__

        if set_callbacks is True:
            mid_result = self.report_perf(
                opt, X, df_target,
                ' with Surrogate Model: ' + surrogate_name,
                callbacks=[self.on_step, DeadlineStopper(60 * 60)])
        else:
            mid_result = self.report_perf(
                opt, X, df_target,
                ' with Surrogate Model: ' + surrogate_name)
        tuning_result[surrogate_name] = mid_result

    # Among the surrogate models that reached the best score, pick the fastest one.
    bests = pd.DataFrame()
    for key in tuning_result.keys():
        if tuning_result[key]['best_score'] == max(
                d['best_score'] for d in tuning_result.values()):
            bests = bests.append({'best_score': tuning_result[key]['best_score'],
                                  'best_SM': key,
                                  'time': tuning_result[key]['Time_cost']},
                                 ignore_index=True)
    bests = bests.sort_values(by=['time'], ascending=True).reset_index(drop=True)
    best_base_estimator = bests['best_SM'][0]
    best_param = tuning_result[best_base_estimator]['best_parmas']

    print("")
    print('######## Congratulations! Here are the Best Parameters: #######')
    print('Best Score is:', tuning_result[best_base_estimator]['best_score'])
    try:
        print('with Surrogate Model ' + best_base_estimator)
    except TypeError:
        print('with Surrogate Model ' + best_base_estimator.__class__.__name__)
    pprint.pprint(best_param)

    self.best_param_ = best_param
    return best_param, tuning_result
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Impute the data and replace missing values
    imputer = preprocessing.Imputer(missing_values="NaN", strategy='mean', axis=0,
                                    copy=False)
    imputer.fit(data)
    data = imputer.transform(data)

    # Extract features/labels and their names from raw data
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]

    # scale data no matter what, since the feature selector is L1-SVM
    (scaled_features, dummy) = utils.scale_data(features, None, 'minmax')

    # open output file and write header with max_num_features selected features
    output_file = open(args.output_filename, 'w')
    output_file_writer = csv.writer(output_file)
    header = ["num_features_selected", "test_size",
              "avg_true_positive", "avg_false_positive",
              "avg_true_negative", "avg_false_negative",
              "avg_accuracy",
              "avg_pos_f1", "avg_neg_f1", "avg_average_f1",
              "avg_pos_precision", "avg_neg_precision", "avg_average_precision",
              "avg_pos_recall", "avg_neg_recall", "avg_average_recall"]
    for i in range(1, args.max_num_features + 1):
        header.extend(["feature" + str(i), "feature" + str(i) + "_weight"])
    output_file_writer.writerow(header)

    feature_selector_obj = feature_selection.feature_selector(
        scaled_features, labels, args.num_samples, args.scikit_balancing)
    for num_features in range(args.min_num_features, args.max_num_features + 1):
        # Before anything, must set the feature selector object to num_features
        feature_selector_obj.select_top_features(num_features)
        selected_features = feature_selector_obj.get_selected_features(
            feature_names)

        # Print selected and unselected features.
        print('\nSelected Feature,Weight')
        for feature, feature_coef in selected_features:
            print(feature + "," + str(feature_coef))

        # Now transform and restrict the features to those only selected by the
        # L1-svm
        transformed_scaled_features = feature_selector_obj.transform(
            scaled_features)
        transformed_features = feature_selector_obj.transform(features)
        print('\n' + str(len(selected_features)) + ' out of ' +
              str(features.shape[1]) + ' features are selected.\n')

        # Now perform the learning task using the top features and report results.
        # Make sure to pass scaled features to svm.
        num_test_trials = 10
        test_size = (args.test_size if args.test_size <= 1.0
                     else int(args.test_size))
        if args.learning_algorithm == 'random-forest':
            rf_max_features = utils.extract_max_features(args.rf_max_features)
            metrics = perform_random_forest(transformed_features, labels,
                                            args.rf_num_trees, args.rf_criterion,
                                            rf_max_features,
                                            args.rf_min_samples_split,
                                            args.rf_min_samples_leaf,
                                            args.scikit_balancing, test_size,
                                            num_test_trials)
        elif args.learning_algorithm == 'svm':
            metrics = perform_svm(transformed_scaled_features, labels,
                                  args.svm_kernel, args.svm_gamma, args.svm_cost,
                                  args.svm_degree, args.scikit_balancing,
                                  test_size, num_test_trials)
        elif args.learning_algorithm == 'logistic':
            metrics = perform_logistic(transformed_features, labels,
                                       args.logistic_penalty, args.logistic_cost,
                                       args.scikit_balancing, test_size,
                                       num_test_trials)
        elif args.learning_algorithm == 'knn':
            metrics = perform_knn(transformed_scaled_features, labels,
                                  args.knn_num_neighbors, args.knn_weights,
                                  args.knn_algorithm, args.knn_metric,
                                  args.knn_imbalanced_data, test_size,
                                  num_test_trials)

        # write a row for num_features selected to output file
        output_row = [len(selected_features)]
        output_row.extend(metrics)
        for feature, feature_coef in selected_features:
            output_row.extend([feature, feature_coef])
        output_row.extend([''] * (len(header) - len(output_row)))
        output_file_writer.writerow(output_row)
        print('******************************\n')

    output_file.close()
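
# A small follow-up sketch showing how the CSV written by main() could be consumed
# to pick the best feature count; the column names match the header above, but the
# helper itself is illustrative.
def _example_pick_best_num_features(output_filename):
    results = pandas.read_csv(output_filename)
    best = results.loc[results['avg_average_f1'].idxmax()]
    print("Best num_features: %s (avg F1 %s)" %
          (best['num_features_selected'], best['avg_average_f1']))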