def fit(self, X, y, savepath=None, refit=True):
    rst = dict()
    param_dict = self._get_bayesian_param_dict()

    if savepath is None:
        savepath = os.getcwd()

    estimator_name = self._estimator_name

    if self.cv is None:
        self.cv = ms.RepeatedKFold()

    model = BayesSearchCV(estimator=self.estimator,
                          search_spaces=param_dict,
                          n_iter=self.n_iter,
                          scoring=self.scoring,
                          cv=self.cv,
                          refit=refit)

    try:
        rst[estimator_name] = model.fit(X, y)
    except Exception:
        log.error('Hyperparameter optimization failed, likely due to an inappropriate domain of values to'
                  ' optimize one or more parameters over. Please check your input file and the sklearn docs'
                  ' for the model you are optimizing for the domain of correct values')
        exit()

    best_estimator = rst[estimator_name].best_estimator_
    self._save_output(savepath, rst)
    return best_estimator
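# _get_bayesian_param_dict() is project-specific, but BayesSearchCV expects a
# mapping of parameter names to skopt dimensions. A minimal sketch (the
# parameter names below are illustrative assumptions, not the project's actual
# search space):
from skopt.space import Integer, Real

param_dict = {
    'n_estimators': Integer(10, 500),                       # integer range
    'learning_rate': Real(1e-3, 1e0, prior='log-uniform'),  # log-uniform float
}
# BayesSearchCV then draws n_iter candidate points from these dimensions.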
def set_custom_scorer_cv(n_splits=5, n_repeats=2):
    """Define a custom scorer and cross-validation strategy.

    Args:
        n_splits (int, optional): Number of splits in the cross-validation strategy. Defaults to 5.
        n_repeats (int, optional): Number of repeats for repeated CV. Defaults to 2.

    Returns:
        objects: custom scorer, model_selection.RepeatedKFold
    """
    print("\nCreate custom scorer...")
    scorer = make_scorer(
        score_func=gscreen.utils.accuracy,
        greater_is_better=True,  # Whether score_func is a score function (default),
        # meaning high is good, or a loss function, meaning low is good.
    )

    #%% Define cross-validation parameters
    print("Define cross-validation strategy...")
    cv = model_selection.RepeatedKFold(
        n_splits=n_splits,
        # Repeated K-Fold: n times with different randomization in each repetition.
        n_repeats=n_repeats,
        random_state=rnd_state,
    )
    return scorer, cv
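# A minimal usage sketch (the estimator `model` and data `X`, `y` are
# placeholders, not defined in the original): the returned scorer and splitter
# plug straight into cross_val_score.
scorer, cv = set_custom_scorer_cv(n_splits=5, n_repeats=2)
cv_scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)
print(f"Mean accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")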
def KfoldValidation(model, train_set, train_label):
    cv = skl_ms.RepeatedKFold(n_splits=7, random_state=4, n_repeats=10)
    model.fit(train_set, train_label)
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision': make_scorer(precision_score),
               'recall': make_scorer(recall_score),
               'f1_score': make_scorer(f1_score)}
    score = skl_ms.cross_validate(model, train_set, train_label,
                                  scoring=scoring, cv=cv, n_jobs=-1)
    print('Accuracy : %.3f' % np.mean(score['test_accuracy']))
    print('Precision : %.3f' % np.mean(score['test_precision']))
    print('Recall : %.3f' % np.mean(score['test_recall']))
    print('F1 Score : %.3f' % np.mean(score['test_f1_score']))
    return model
def fit(self, X, y, model, cv=None, savepath=None):
    rst = dict()
    param_dict = self._get_grid_param_dict()

    if savepath is None:
        savepath = os.getcwd()

    estimator_name = model.model.__class__.__name__
    param_dict = self._search_space_generator(param_dict)

    if cv is None:
        cv = ms.RepeatedKFold()

    metrics = Metrics(metrics_list=None)._metric_zoo()
    # Note: if greater_is_better is False, sklearn multiplies the score by -1
    if self.scoring is None:
        scoring = make_scorer(metrics['mean_absolute_error'][1],
                              greater_is_better=metrics['mean_absolute_error'][0])
    else:
        scoring = make_scorer(metrics[self.scoring][1],
                              greater_is_better=metrics[self.scoring][0])

    model = GridSearchCV(model.model, param_dict, scoring=scoring, cv=cv,
                         refit=True, n_jobs=self.n_jobs, verbose=0)

    try:
        rst[estimator_name] = model.fit(X, y)
    except Exception:
        print('Hyperparameter optimization failed, likely due to an inappropriate domain of values to'
              ' optimize one or more parameters over. Please check your input file and the sklearn docs'
              ' for the model you are optimizing for the domain of correct values')
        exit()

    best_estimator = rst[estimator_name].best_estimator_
    self._save_output(savepath, rst)

    # Rebuild the estimator as an SklearnModel
    best_estimator = SklearnModel(model=best_estimator.__class__.__name__,
                                  **best_estimator.get_params())
    return best_estimator
def cv(df, cols_to_drop, xgboost_params=config.xgboost_params):
    X = df.drop(columns=['target'] + cols_to_drop)
    y = df.target
    started = dt.datetime.now()
    kf = model_selection.RepeatedKFold(n_repeats=1, n_splits=10)
    cv_perf = {'kf train': [], 'kf test': [], 'evals_result': []}

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        kf_X_train, kf_X_test = X.iloc[train_index], X.iloc[test_index]
        kf_y_train, kf_y_test = y.iloc[train_index], y.iloc[test_index]

        model = xgboost.XGBClassifier(**xgboost_params)
        _ = model.fit(kf_X_train, kf_y_train,
                      verbose=False,
                      eval_metric=["error"],
                      eval_set=[(kf_X_train, kf_y_train),
                                (kf_X_test, kf_y_test)])  # early_stopping_rounds=100

        kf_train_pred = model.predict(kf_X_train)
        kf_test_pred = model.predict(kf_X_test)
        evals_result = {
            k: v['error']
            for k, v in zip(['train', 'test'], model.evals_result().values())
        }

        cv_perf['kf train'].append(
            metrics.accuracy_score(kf_y_train, kf_train_pred, normalize=True))
        cv_perf['kf test'].append(
            metrics.accuracy_score(kf_y_test, kf_test_pred, normalize=True))
        cv_perf['evals_result'].append(evals_result)

        tr_cummean = np.mean(cv_perf['kf train'])
        te_cummean = np.mean(cv_perf['kf test'])
        print(f'Iteration #{i+1:02}. Elapsed: {dt.datetime.now()-started}. '
              f'Cum. accuracy: test: {te_cummean:.2%}, train: {tr_cummean:.2%}')
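# Enabling the commented-out early stopping above would look like this
# (a sketch; recent xgboost versions prefer setting early_stopping_rounds on
# the XGBClassifier constructor rather than on fit()):
model = xgboost.XGBClassifier(**xgboost_params)
model.fit(kf_X_train, kf_y_train,
          verbose=False,
          eval_metric=["error"],
          eval_set=[(kf_X_train, kf_y_train), (kf_X_test, kf_y_test)],
          early_stopping_rounds=100)  # stop after 100 rounds without improvement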
def plot_cross_validated_coefs(
    pipe,
    numerical_columns,
    nominal_columns,
    X_train,
    X_test,
    y_train,
    y_test,
    scorer,
    n_repeats=5,
    n_splits=5,
    axis_tick_label_fontsize=12,
    fig_size=(8, 12),
):
    feature_names = (pipe.named_steps["preprocessor"]
                     .named_transformers_["onehot"]
                     .get_feature_names(input_features=nominal_columns))
    feature_names = np.concatenate([numerical_columns, feature_names])
    cv_model = ms.cross_validate(
        pipe,
        X=pd.concat([X_train, X_test]),
        y=pd.concat([y_train, y_test]),
        cv=ms.RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42),
        scoring=scorer,
        return_train_score=True,
        return_estimator=True,
        n_jobs=-1,
    )
    coefs = pd.DataFrame(
        [est.named_steps["clf"].coef_.flatten() for est in cv_model["estimator"]],
        columns=feature_names,
    )
    coefs = coefs[coefs.mean(axis=0).sort_values(ascending=False).index]
    plot_coefs(coefs, "Coefficient variability", axis_tick_label_fontsize, fig_size)
for dataset_ele in dataset_list:
    DATA_URL1 = PRJ_FOLDER + dataset_ele[1]
    print(DATA_URL1)
    GIST_utility.set_data_params(PRJ_FOLDER, dataset_ele[0], DATA_URL1)
    print('\n\nProcessing the Dataset ############# ' + des_i + GIST_utility.DATA_NAME + ' #############')
    X, y = GIST_utility.summary_data(DATA_URL1, des_i)
    print('Dataset: {0} - {1}'.format(X.shape, y.shape))

    results = []
    algNames = []
    classifierNum = 0
    seed = 5
    testing_porc = 0.2
    scoring = 'accuracy'
    k_fold = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=seed)
    # print(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testing_porc, random_state=seed)

    print('\n\nComparing the algorithms')
    print('Algorithm - Accuracy - recall_micro - precision_micro - f1 - Time (HH:MM:SS.mmm)')
    csv_data_header = ["Algorithm", "Accuracy", "recall_micro", "precision_micro", "f1_micro", "Time_(HH:MM:SS.mmm)"]
    csv_data = []

    for name, clf in zip(names, classifiers):
        time1 = dtm.datetime.now()
        clf.fit(X_train, y_train)
        cv_results = model_selection.cross_val_score(clf, X_train, y_train, cv=k_fold)
        print(name, cv_results.mean())
        cv_f1 = model_selection.cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
import scipy.io as sio
from sklearn.svm import LinearSVC
# sklearn.cross_validation was removed; its contents now live in sklearn.model_selection
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection

data = sio.loadmat('ex3data1.mat')
A = data['X']
B = data['y']

rkf = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)
for train_index, test_index in rkf.split(A):
    X_train, X_test = A[train_index], A[test_index]
    y_train, y_test = B[train_index], B[test_index]
    # X_train, X_test, y_train, y_test = train_test_split(A, B, test_size=0.3, random_state=42)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes=(25,), max_iter=500)
    clf.fit(X_train, y_train)
    p = clf.predict(X_test)
    print('This is the confusion matrix')
    return history, test_acc


# In[21]:

epochs = 150  # maximum number of training epochs
batch_size = 10
folds = 5  # the number of folds for k-fold cross-validation
n_repeats = 1  # the number of repeats for repeated k-fold cross-validation

# In[22]:

test_accs = []

# Note: RepeatedKFold does not stratify; the labels passed as the second
# argument to split() are ignored when forming the folds.
stratified_folds = model_selection.RepeatedKFold(
    n_splits=folds, n_repeats=n_repeats).split(graph_labels, graph_labels)

checkpointer = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                  verbose=1,
                                                  monitor='loss',
                                                  save_best_only=True,
                                                  save_weights_only=True)

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(f"Training and evaluating on fold {i+1} out of {folds * n_repeats}...")
    train_gen, test_gen = get_generators(train_index, test_index,
                                         graph_labels, batch_size=batch_size)
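# If stratified folds were intended (as the variable name above suggests), a
# minimal alternative (an illustrative sketch, not the original notebook's
# code) is RepeatedStratifiedKFold, which does preserve class proportions:
stratified_folds = model_selection.RepeatedStratifiedKFold(
    n_splits=folds, n_repeats=n_repeats).split(graph_labels, graph_labels)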
def _train_and_score(file_path, folds, iterations=None):
    """Use the given log entries to score the classifier in a <folds>-fold cross-validation.

    :param iterations: Optionally specify to repeat <iterations> times.
    """
    if iterations is not None and iterations <= 1:
        iterations = None

    log_entries = _read_file_flow(file_path)
    if len(log_entries) < 10000:
        raise IOError("Insufficient number of entries found in the file. Need >= 10,000.")

    scores = {}
    for app_id in ids_data.get_app_ids():
        scores[app_id] = []

    printer = util.prtr.Printer()
    printer.prt("Using {}-fold cross-validation".format(folds) +
                ("" if iterations is None else " with {} iteration{}.".format(
                    iterations, "s" if iterations > 1 else "")))

    n_rounds = folds * (iterations if iterations else 1)
    if iterations:
        cv = sk_mod.RepeatedKFold(n_splits=folds, n_repeats=iterations)
    else:
        cv = sk_mod.KFold(n_splits=folds)

    current_round = 1
    for train_indices, score_indices in cv.split(log_entries):
        printer.prt("Round {} of {}.".format(current_round, n_rounds))
        current_round += 1

        # Selecting items based on the given indices
        printer.prt("Splitting... ", newline=False)
        training_entries = [log_entries[i] for i in train_indices]
        scoring_entries = [log_entries[i] for i in score_indices]

        preconditions_msg = "Please make sure that all preconditions are met and rerun."

        # Train
        printer.prt("Training... ", newline=False)
        training_succeeded = _train_entries(training_entries, squelch_output=True)
        if not training_succeeded:
            printer.prt("")
            printer.prt("Training failed. " + preconditions_msg)
            continue

        # Score
        printer.prt("Scoring... ", newline=False)
        scoring_result = _score_entries(scoring_entries, squelch_output=True)
        if not scoring_result:
            printer.prt("")
            printer.prt("Scoring failed. " + preconditions_msg)
            # Don't continue; the reset needs to happen in order to allow for the next iteration

        for app_id in (scoring_result or {}):
            scores[app_id].append(scoring_result[app_id])

        # Reset
        printer.prt("Resetting... ", newline=False)
        IntrusionClassifier.reset_models(purge=True)
        printer.prt("Done.")

    _print_scores(scores, printer)
)

X_train_org, X_test_org, y_train_org, y_test_org = train_test_split(
    x, data['Method'], test_size=0.2, shuffle=True)
X_train_org1, X_test_org1, y_train_org1, y_test_org1 = train_test_split(
    x, data['Fighter1Result'], test_size=0.2, shuffle=True)

# In[ ]:

################################ LOGISTIC REGRESSION #####################################
print("######################Logistic Regression#####################################")

# Model selection: logistic regression
method_model = lm.LogisticRegression(warm_start=True, verbose=1)
# Applying repeated k-fold cross-validation
kf = model_selection.RepeatedKFold(n_splits=5, n_repeats=4, random_state=None)
y = y_train_org  # choosing the output data column
lgreg_method_acc = []  # list for accumulating accuracies from the validation phase

for train_index, test_index in kf.split(X_train_org):
    # Split the input data rows into training and testing
    X_train, X_test = X_train_org.iloc[train_index], X_train_org.iloc[test_index]
    # Split the output data rows into training and testing
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    method_model.fit(X_train, y_train)  # fit the model on the training data
    pred = method_model.predict(X_test)  # calculate predictions on the testing data
    acc = accuracy_score(y_test, pred)  # compute accuracy
    lgreg_method_acc.append(acc)
    Y, test_size=valid_size, random_state=1)
if valid_size == 0:
    del X_valid, Y_valid, valid_size

#%% Cross Validation
n_lambdas = 31  # 32
n_shuffles = 100  # 100
n_folds = 100  # 20
lambdas = np.linspace(.2, .5, num=n_lambdas)
score_lambdas = np.zeros(n_lambdas)

k = 0
for alpha in lambdas:  # go over all candidate lambdas
    score_cv = pd.DataFrame(np.zeros(shape=(n_shuffles, n_folds)))
    kfold = model_selection.RepeatedKFold(n_splits=n_folds,
                                          n_repeats=n_shuffles,
                                          random_state=42)
    i = 0
    j = 0
    # Cross-validation to evaluate this lambda
    for train_index, test_index in kfold.split(X, Y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        model = linear_model.Lasso(alpha=alpha,
                                   fit_intercept=False,
                                   normalize=True,
                                   precompute=False,
                                   copy_X=True,
                                   max_iter=1000,
                                   tol=0.0001,
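# A compact alternative to the manual lambda sweep above (an illustrative
# sketch, not part of the original script): LassoCV evaluates the same alpha
# grid against the same repeated CV splitter and exposes the winner as alpha_.
# Note that the `normalize` option used above was removed from Lasso in
# scikit-learn 1.2; on newer versions, standardize the features beforehand.
lasso_cv = linear_model.LassoCV(alphas=lambdas, fit_intercept=False, cv=kfold)
lasso_cv.fit(X, Y)
print('Best lambda:', lasso_cv.alpha_)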
def solveRegressor(rfConfig, X, y, saveLoc=None, initModel=None, CV=False):
    '''Generate and run an RF model

    This function is used for generating an RF model, and then running it.
    If an initial model is provided, then this will load the model provided,
    and then use that model as an initializer. The model generated will then
    be given a hot start from the initial model.

    Arguments:
        rfConfig {dict} -- the dictionary of hyperparameters
        X {numpy 2d array} -- The array of values that will be used for
            generating a prediction.
        y {numpy 1d array} -- The expected result that we want the model to
            train to.

    Keyword Arguments:
        saveLoc {str} -- Location where the model should be saved. This
            assumes that the location where the model is to be saved is
            writable and exists. Remember, at this time the function does not
            do any error checking. (default: {None}, in which case the model
            is not saved.)
        initModel {RandomForestRegressor() model} -- This is the result of an
            earlier fitted model. In case one is provided, the current model
            will be restarted from this model (default: {None}, in which case
            a new model will be generated.)

    Returns:
        RandomForestRegressor() -- This is the result of a fitted model,
            given the data and the rest of the parameters.
    '''

    if initModel is None:
        rfModel = RandomForestRegressor(**rfConfig)

    if CV:
        # We want to make sure that the information is meaningful
        # for all splits. Otherwise, it's pretty meaningless ...
        # Obtain hyperparameters from the JSON file. This obviously
        # takes a long time. So, we shall use this for testing only.
        # -----------------------------------------------------------
        rkfFact = json.load(open('../config/RepeatedKFold.json'))
        rkf = MS.RepeatedKFold(**rkfFact)

        scores = []
        for train_index, test_index in tqdm(rkf.split(X),
                                            total=rkfFact['n_splits'] * rkfFact['n_repeats']):

            # We want to make sure that we start with the provided model in
            # every split. Otherwise we will be training on top of the other
            # models, as warm_start is True.
            if initModel is not None:
                rfModel = initModel
                rfModel.set_params(warm_start=True)

            rfModel.fit(X[train_index, :], y[train_index])
            yHat = rfModel.predict(X[test_index])

            score = np.sqrt(((yHat - y[test_index])**2).mean())
            scores.append(score)
            tqdm.write('Score = ({}) {}'.format(np.mean(scores), score))

    # Refitting the model with the whole data
    if initModel is not None:
        rfModel = initModel
        rfModel.set_params(warm_start=True)

    if CV:
        print('Score summary: {} +-({})'.format(np.mean(scores), np.std(scores)))
        print('Percentage difference: {}'.format(100 * np.mean(scores) / np.mean(y)))

    rfModel.fit(X, y)
    return rfModel
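# The shape of ../config/RepeatedKFold.json is implied by the ** expansion
# above; a plausible example (an assumption, the real file is not shown):
#
#     {"n_splits": 5, "n_repeats": 10, "random_state": 42}
#
# which makes MS.RepeatedKFold(**rkfFact) equivalent to
# MS.RepeatedKFold(n_splits=5, n_repeats=10, random_state=42).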
from sklearn.model_selection import train_test_split
from helper import prepare_data

df = prepare_data()
y = df["Berri1"]
X = df[[
    "day", "month", "day_of_week", "Mean Temp (°C)", "Total Precip (mm)",
    "Snow on Grnd (cm)", "Min Temp (°C)", "Max Temp (°C)"
]]

regr = RandomForestRegressor(n_estimators=100)
rkf = model_selection.RepeatedKFold()

score, mse, r2 = [], [], []
for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    regr.fit(X_train, y_train)
    score.append(regr.score(X_test, y_test))
    y_pred = regr.predict(X_test)
    mse.append(metrics.mean_squared_error(y_test, y_pred))
    r2.append(metrics.r2_score(y_test, y_pred))  # R^2 on the held-out fold
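# The manual loop above can also be expressed with cross_validate (an
# illustrative sketch using the same estimator and splitter, not part of the
# original script):
from sklearn import model_selection

cv_results = model_selection.cross_validate(
    regr, X, y,
    cv=model_selection.RepeatedKFold(),  # defaults: 5 splits x 10 repeats
    scoring=("r2", "neg_mean_squared_error"),
    n_jobs=-1,
)
# Per-fold metrics: cv_results["test_r2"] holds R^2, and
# -cv_results["test_neg_mean_squared_error"] recovers the MSE values.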
def learn():
    # np.warnings.simplefilter(action='ignore', category=UserWarning)
    overlaped = 5
    # windows_size = 10
    # clusters = 5
    data_set = process_from_files('test')
    print('get data set')
    classes_names_as_is_in_data = create_classes_names_list(data_set)
    print(f'get {len(classes_names_as_is_in_data)} classes')
    files_as_nested_list = get_files_as_list_of_lists(data_set)
    print(f"extract data for {len(files_as_nested_list)} files "
          f"with {len(files_as_nested_list[0])} columns")

    for clusters in [5, 10, 20]:
        windows_sizes = [5, 10, 20]
        for windows_size in windows_sizes:
            if windows_size == 5:
                overlaps = [1, 4]
            elif windows_size == 10:
                overlaps = [1, 5, 9]
            elif windows_size == 15:
                overlaps = [1, 7, 14]
            elif windows_size == 20:
                overlaps = [1, 10, 19]
            elif windows_size == 25:
                overlaps = [1, 13, 24]
            elif windows_size == 30:
                overlaps = [1, 15, 29]
            elif windows_size == 35:
                overlaps = [1, 19, 34]
            elif windows_size == 40:
                overlaps = [1, 20, 39]

            for overlaped in overlaps:
                X_train, X_test, _, y_test = train_test_split(
                    files_as_nested_list,
                    classes_names_as_is_in_data,
                    test_size=0.9,
                    random_state=4564567,
                    shuffle=True)
                files_as_windows_test = get_overlapped_chunks_separated_for_files(
                    X_test, windows_size, overlaped)
                all_sliding_windows = get_all_overlapped_chunks(
                    X_train, windows_size, overlaped)
                print(f'Generate {len(all_sliding_windows)} windows to create codebook')
                kmeans_models = prepare_codebook(all_sliding_windows, clusters)
                print(f'create {len(kmeans_models)} models')
                histograms_test = get_histogram_basic_on_kmean(
                    clusters, kmeans_models, files_as_windows_test)
                # find_the_best(X_train, X_test, y_train1, y_test1)

                models = get_models()
                for name, model in models:
                    kfold = model_selection.RepeatedKFold(n_splits=5,
                                                          random_state=7,
                                                          n_repeats=10)
                    # selection = svc_param_selection(histograms_test, y_test, kfold, model, name)
                    # print(selection)
                    cv_results = model_selection.cross_val_score(
                        model, histograms_test, y_test, cv=kfold,
                        scoring='accuracy')
                    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
                    print(msg)