def stacking_proba(clf, X_train, y, X_test, nfolds=5, random_seed=2017, return_score=False,
                   shuffle=True, metric='acc', clf_name='UnKnown'):
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle, random_state=random_seed)
    folds.get_n_splits(X_train, y)
    # return stacking_proba for the train set
    train_stacking_proba = np.zeros((X_train.shape[0], np.unique(y).shape[0]))
    score = 0
    for i, (train_index, validate_index) in enumerate(folds.split(X_train, y)):
        # print(str(clf_name) + " folds:" + str(i + 1) + "/" + str(nfolds))
        X_train_fold = X_train[train_index, :]
        y_train_fold = y[train_index]
        X_validate_fold = X_train[validate_index, :]
        y_validate_fold = y[validate_index]
        clf.fit(X_train_fold, y_train_fold)
        fold_preds = clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index, :] = fold_preds
        # validation
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score = len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        # print('validate ' + metric + ":" + str(fold_score))
        score += fold_score
    score /= nfolds
    # return stacking_proba for the test set
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)

    if np.unique(y).shape[0] == 2:
        # for binary classification, only return the positive-class proba
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]

    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    else:
        return train_stacking_proba, test_stacking_proba
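# A minimal usage sketch for stacking_proba, assuming numpy, StratifiedKFold, and
# scikit-learn are available; the synthetic data and the LogisticRegression base
# learner are illustrative choices, not part of the original snippet.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_all, y_all = make_classification(n_samples=300, random_state=0)
X_tr, X_te, y_tr = X_all[:200], X_all[200:], y_all[:200]
train_meta, test_meta, cv_acc = stacking_proba(
    LogisticRegression(max_iter=1000), X_tr, y_tr, X_te,
    nfolds=5, return_score=True)
# train_meta and test_meta can now serve as meta-features for a second-level model
print(train_meta.shape, test_meta.shape, cv_acc)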
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])
    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3)

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)

    # When n_folds is not an integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_folds=4, shuffle=None)
def __init__(self, fm_decoder, n_iter=5, n_folds=3, random_state=None):
    self.fm_decoder = fm_decoder
    StratifiedKFold.__init__(
        self, n_folds=n_folds, random_state=random_state)
def test_datasets(dataset_names):
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)

    def separate_sets(x, y, test_fold_id, test_folds):
        x_test = x[test_folds == test_fold_id, :]
        y_test = y[test_folds == test_fold_id]
        x_train = x[test_folds != test_fold_id, :]
        y_train = y[test_folds != test_fold_id]
        return [x_train, y_train, x_test, y_test]

    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        test_fold = 0
        for train_idx, test_idx in skf.split(X=dataset.data, y=dataset.target):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]
            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100 * np.mean(prediction == y_test)
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
            test_fold += 1
    return accuracies
def stratified_cross_validate(self, k):
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])
    # print("all data : %s" % all_data)

    np.random.shuffle(all_data)
    # print("shuffled data : %s" % all_data)

    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)

    skf = StratifiedKFold(n_splits=k)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield (X_train, y_train, X_test, y_test)

    # Legacy manual splitter, superseded by the StratifiedKFold loop above;
    # disabled because it would yield a second, non-stratified set of folds.
    # for i in range(k):
    #     split = len(all_data) // k
    #     test_data = all_data[i * split:(i + 1) * split, :]
    #     train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)
    #     train_input, train_output = train_data[:, :-1], train_data[:, -1]
    #     test_input, test_output = test_data[:, :-1], test_data[:, -1]
    #     yield (train_input, train_output, test_input, test_output)
def cv_score(X, y, n_epochs=10, n_folds=10, random_state=1999):
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i))
            for i in range(n_folds)]
    folds = kf.split(X, y)

    # iteratively train epochs
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        i_fold = 0
        for itrain, itest in kfsplit:
            print('Fold ', i_fold)
            train = X[itrain, :]
            test = X[itest, :]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain,
                                                     nb_epoch=1,
                                                     validation_split=None,
                                                     batch_size=64,
                                                     patience=1)
            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch

            # predict on the out-of-fold samples
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score

            i_fold += 1

    return scores, val_scores, best_epochs
def get_cv_results(design, data, cv_splits=10):
    test_df, unit_onehot, unit_x = data
    cv_results = []
    for i in range(design.shape[0]):
        lambda_int, lambda_x = design[i, :]

        val_losses = []
        for rep in range(3):  # almost like a bootstrap: reshuffle on each repetition
            cv_val_losses = []
            skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
            for train_index, test_index in skf.split(unit_x, test_df['unit']):
                re_model = create_model(unit_onehot.shape[1], lambda_int,
                                        lambda_x, .01, .0001, .92)
                X_train = [test_df["x"][train_index], unit_onehot[train_index],
                           unit_x[train_index]]
                X_test = [test_df["x"][test_index], unit_onehot[test_index],
                          unit_x[test_index]]
                y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
                h = re_model.fit(X_train, y_train,
                                 epochs=15000, batch_size=450,
                                 validation_data=(X_test, y_test),
                                 callbacks=callbacks, verbose=0)
                cv_val_losses.append(np.min(h.history['val_loss']))
            val_losses.append(np.mean(cv_val_losses))
        cv_results.append(np.mean(val_losses))
    return cv_results
def split_data(self, X, y, stratified=True, bad_chess=False):
    if bad_chess:
        n_points = int(X.shape[0] / self.nodes)
        for node in range(self.nodes):
            start_slice = node * n_points
            final_slice = start_slice + n_points
            dx = X[start_slice:final_slice]
            dy = y[start_slice:final_slice]
            frame_dx = pd.DataFrame(dx)
            frame_dy = pd.DataFrame(dy)
            file_data = datas_path.joinpath('data_' + str(node) + '.csv')
            file_class = datas_path.joinpath('class_' + str(node) + '.csv')
            frame_dx.to_csv(file_data, index=False)
            frame_dy.to_csv(file_class, index=False)
    else:
        node = 0
        if stratified:
            skf = StratifiedKFold(n_splits=self.nodes)
        else:
            skf = KFold(n_splits=self.nodes, shuffle=True, random_state=17)
        for splited_index in skf.split(X, y):
            new_X = pd.DataFrame(X[splited_index[1]])
            new_y = pd.DataFrame(y[splited_index[1]])
            X_path = datas_path.joinpath("data_" + str(node) + ".csv")
            y_path = datas_path.joinpath("class_" + str(node) + ".csv")
            new_X.to_csv(X_path, index=False)
            new_y.to_csv(y_path, index=False)
            node += 1
def cv(X_train, y_train):
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    scores_f = []
    scores_p = []
    scores_r = []
    for train, test in kfold.split(X_train, y_train):
        model = TargetEnsembler(features)
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["PCL_Strict3"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["PCL_Strict3"])
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)
        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
def classify(X, y, clf, **para):
    # y = profile["Loss"].as_matrix()
    # X = profile[features].as_matrix()
    kf = KFold(n_splits=10)
    skf = StratifiedKFold(n_splits=6)
    classifier = clf(**para)
    name = str(classifier).split("(")[0]
    # dt = tree.DecisionTreeClassifier(min_samples_split=min_split, max_depth=max_dep)
    print("{0} has been established with {1}".format(name, para))
    # lr = LogisticRegression(penalty='l1')
    for train_index, test_index in skf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print("Fold score is: {0}".format(score))
    return classifier, y_test, y_pred
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score' % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
def split(dependent, independent, n_folds):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    for train_indices, test_indices in skf.split(dependent, independent):
        train_x = dependent[train_indices]
        train_y = independent[train_indices]
        test_x = dependent[test_indices]
        test_y = independent[test_indices]
        yield train_x, train_y, test_x, test_y
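# A short sketch of driving the split() generator above; the toy arrays and the
# RANDOM_STATE value are assumptions for illustration (note the snippet names its
# X "dependent" and its y "independent").
import numpy as np
from sklearn.model_selection import StratifiedKFold

RANDOM_STATE = 0  # assumed; the original module is expected to define this
X_demo = np.arange(40).reshape(20, 2)
y_demo = np.array([0, 1] * 10)
for train_x, train_y, test_x, test_y in split(X_demo, y_demo, n_folds=4):
    print(train_x.shape, test_x.shape)  # each iteration is one stratified partition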
def test_ovr_multinomial_iris():
    # Test that OvR and multinomial are correct using the iris dataset.
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # The cv indices from stratified kfold (where stratification is done based
    # on the fine-grained iris classes, i.e, before the classes 0 and 1 are
    # conflated) is used for both clf and clf1
    n_cv = 2
    cv = StratifiedKFold(n_cv)
    precomputed_folds = list(cv.split(train, target))

    # Train clf on the original dataset where classes 0 and 1 are separated
    clf = LogisticRegressionCV(cv=precomputed_folds)
    clf.fit(train, target)

    # Conflate classes 0 and 1 and train clf1 on this modified dataset
    clf1 = LogisticRegressionCV(cv=precomputed_folds)
    target_copy = target.copy()
    target_copy[target_copy == 0] = 1
    clf1.fit(train, target_copy)

    # Ensure that what OvR learns for class2 is same regardless of whether
    # classes 0 and 1 are separated or not
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Test the shape of various attributes.
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1))
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, n_cv, 10))

    # Test that for the iris data multinomial gives a better accuracy than OvR
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        max_iter = 2000 if solver in ['sag', 'saga'] else 15
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=1e-5 if solver in ['sag', 'saga'] else 1e-2,
            cv=2)
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # Test attributes of LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape,
                                  (3, n_cv, 10, n_features + 1))
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, n_cv, 10))
def gen_folds(X, y, n_folds=5, random_state=0):
    from sklearn.model_selection import StratifiedKFold
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    folds = kf.split(X, y)
    # materialize the folds so they can be iterated multiple times
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    return kfsplit
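# A sketch showing how the materialized fold list from gen_folds can be reused so
# that several models see identical partitions; the toy data is illustrative.
import numpy as np

X_demo = np.random.RandomState(0).rand(30, 3)
y_demo = np.array([0, 1, 2] * 10)
kfsplit = gen_folds(X_demo, y_demo, n_folds=5)
for itrain, itest in kfsplit:
    # train and test indices of a fold never overlap
    assert len(set(itrain) & set(itest)) == 0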
def categorical_average(variable, y, pred_0, feature_name):
    def calculate_average(sub1, sub2):
        s = pd.DataFrame(data={
            variable: sub1.groupby(variable, as_index=False).count()[variable],
            'sumy': sub1.groupby(variable, as_index=False).sum()['y'],
            'avgY': sub1.groupby(variable, as_index=False).mean()['y'],
            'cnt': sub1.groupby(variable, as_index=False).count()['y']
        })

        tmp = sub2.merge(s.reset_index(), how='left',
                         left_on=variable, right_on=variable)
        del tmp['index']
        tmp.loc[pd.isnull(tmp['cnt']), 'cnt'] = 0.0
        tmp.loc[pd.isnull(tmp['cnt']), 'sumy'] = 0.0

        def compute_beta(row):
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + exp((cnt - k) / f))

        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis=1)

        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] +
                                   row['beta'] * row['pred_0'],
                                   axis=1)

        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']

        tmp['random'] = np.random.uniform(size=len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] * (1 + (row['random'] - 0.5) * r_k),
                                   axis=1)

        return tmp['adj_avg'].ravel()

    # cv for the training set
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].ravel()):
        sub = pd.DataFrame(data={variable: X_train[variable],
                                 'y': X_train[y],
                                 'pred_0': X_train[pred_0]})
        sub1 = sub.iloc[train_index]
        sub2 = sub.iloc[cv_index]
        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)

    # for the test set
    sub1 = pd.DataFrame(data={variable: X_train[variable],
                              'y': X_train[y],
                              'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data={variable: X_test[variable],
                              'y': X_test[y],
                              'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)
def stratifiedCV(X, y, n_splits=6):
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, test_index in skf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield X_train, y_train, X_test, y_test
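# A minimal usage sketch for the stratifiedCV generator above, on synthetic data.
import numpy as np

X_demo = np.random.RandomState(1).rand(24, 4)
y_demo = np.array([0, 1] * 12)
for X_tr, y_tr, X_te, y_te in stratifiedCV(X_demo, y_demo, n_splits=6):
    # class proportions in each fold mirror those of y_demo
    print(y_tr.mean(), y_te.mean())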
def cv_stats(self):
    """Perform cross-validation for model evaluation.

    Returns
    -------
    (list[int], list[int], list[float], list)
        Tuple containing four lists of the same size:
            true labels
            predicted labels
            prediction probabilities
            significant features per prediction
    """
    if 'y_true' in self._cache:
        return (self._cache['y_true'], self._cache['y_pred'],
                self._cache['y_prob'], self._cache['sigfeatures'])
    X = self._fe.X
    y = self._fe.y
    kf = StratifiedKFold(n_splits=10, shuffle=True)
    y_true, y_pred, y_prob = [], [], []
    sigfeatures = []
    order_indices = []
    for train_index, test_index in kf.split(X, y):
        order_indices.extend(test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = self.get_new_classifier()
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        prob = clf.predict_proba(X_test)
        prob = np.choose(pred, prob.T)
        for predy in pred:
            sigfeatures.append(get_sig_features(predy, clf.coef_, 20))
        y_true.extend(y_test)
        y_pred.extend(pred)
        y_prob.extend(prob)
    # reorder the results so they match the order of the original data
    y_true = [v for i, v in sorted(zip(order_indices, y_true))]
    y_pred = [v for i, v in sorted(zip(order_indices, y_pred))]
    y_prob = [v for i, v in sorted(zip(order_indices, y_prob))]
    assert list(y_true) == list(y)
    # cache the results
    self._cache['y_true'] = y_true
    self._cache['y_pred'] = y_pred
    self._cache['y_prob'] = y_prob
    self._cache['sigfeatures'] = sigfeatures
    return (y_true, y_pred, y_prob, sigfeatures)
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, labels=None, expected_n_iter=5)
def get_cross_validated_confusion_matrix(data, label, estimator, index, nfolds=10):
    # nfolds = get_least_class(label)
    skf = StratifiedKFold(n_splits=nfolds)
    con_matrix = np.zeros((len(np.unique(label)), len(np.unique(label))))
    for train_index, test_index in skf.split(data, label):
        train_data, test_data = data[train_index], data[test_index]
        train_label, test_label = np.array(label)[train_index], np.array(label)[test_index]
        estimator.train_matrix(train_data, train_label)
        pred_label = estimator.predict(test_data)
        con_matrix = con_matrix + confusion_matrix(test_label, pred_label, labels=index)
    return con_matrix
def run_cv_evaluation(data, n_folds, nlu_config):
    # type: (List[rasa_nlu.training_data.Message], int, RasaNLUConfig) -> Dict[Text, List[float]]
    """Stratified cross validation on data

    :param data: list of rasa_nlu.training_data.Message objects
    :param n_folds: integer, number of cv folds
    :param nlu_config: nlu config file
    :return: dictionary with key, list structure, where each entry in list
             corresponds to the relevant result for one fold
    """
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from collections import defaultdict

    trainer = Trainer(nlu_config)
    results = defaultdict(list)

    y_true = [e.get("intent") for e in data]

    skf = StratifiedKFold(n_splits=n_folds, random_state=11, shuffle=True)
    counter = 1
    logger.info("Evaluation started")
    for train_index, test_index in skf.split(data, y_true):
        train = [data[i] for i in train_index]
        test = [data[i] for i in test_index]

        logger.debug("Fold: {}".format(counter))
        logger.debug("Training ...")
        trainer.train(TrainingData(training_examples=train))
        model_directory = trainer.persist("projects/")  # directory the model is stored in

        logger.debug("Evaluation ...")
        interpreter = Interpreter.load(model_directory, nlu_config)

        test_y = [e.get("intent") for e in test]

        preds = []
        for e in test:
            res = interpreter.parse(e.text)
            if res.get('intent'):
                preds.append(res['intent'].get('name'))
            else:
                preds.append(None)

        # compute fold metrics
        results["Accuracy"].append(metrics.accuracy_score(test_y, preds))
        results["F1-score"].append(metrics.f1_score(test_y, preds, average='weighted'))
        results["Precision"].append(metrics.precision_score(test_y, preds, average='weighted'))

        # increase fold counter
        counter += 1

    return dict(results)
def runCrossValidation(train, RFfile):
    train_tracks = []
    for feature in train:
        if feature[0] != 0.:
            train_tracks.append(feature)
    train_tracks = np.array(train_tracks)

    # Gets parameter values for training data
    trainArr = train_tracks[:, 1:]
    # Gets class label of all training data
    trainRes = train_tracks[:, 0]

    # Convert all NaNs to 0 for RF to work properly
    trainArr = np.nan_to_num(trainArr)
    trainRes = np.nan_to_num(trainRes)

    # Load the classifier
    rf = joblib.load(RFfile)

    # Stratified KFolds cross validation
    cv = StratifiedKFold(n_splits=5)

    precision = []
    accuracy = []
    sensitivity = []
    matthews = []
    r2 = []
    f1 = []
    auroc = []
    cm = [[0, 0], [0, 0]]

    for train_index, test_index in cv.split(trainArr, trainRes):
        probas_ = rf.fit(trainArr[train_index], trainRes[train_index]).predict_proba(trainArr[test_index])
        classes = rf.fit(trainArr[train_index], trainRes[train_index]).predict(trainArr[test_index])
        # r2 = np.append(r2, (r2_score(trainRes[test_index], probas_[:, 1])))
        precision = np.append(precision, (precision_score(trainRes[test_index], classes)))
        # auroc = np.append(auroc, (roc_auc_score(trainRes[test_index], classes)))
        accuracy = np.append(accuracy, (accuracy_score(trainRes[test_index], classes)))
        sensitivity = np.append(sensitivity, (recall_score(trainRes[test_index], classes)))
        f1 = np.append(f1, (f1_score(trainRes[test_index], classes)))
        # matthews = np.append(matthews, (matthews_corrcoef(trainRes[test_index], classes)))
        # cma = np.add(cma, (confusion_matrix(trainRes[test_index], classes)))

    # cma = np.array(cma)
    # r2 = np.array(r2)
    precision = np.array(precision)
    accuracy = np.array(accuracy)
    sensitivity = np.array(sensitivity)
    f1 = np.array(f1)
    # auroc = np.array(auroc)
    # matthews = np.array(matthews)

    return accuracy, precision, sensitivity, f1
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in
    <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e. my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """
    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset,
                             os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index,
        open(os.path.join(output_folder, dataset_name + '.json'), 'w'),
        indent=2
    )

    jvm.stop()

    warnings.filterwarnings('default')
def Kfold(dataset, k, shuffle=False, stratify=False):
    """
    Envelope function for the folding operation
    """
    # separate the data from the class labels
    data = dataset[0]
    if stratify:
        kf = StratifiedKFold(n_splits=k, shuffle=shuffle)
        return kf.split(dataset[0], dataset[1])
    kf = KFold(n_splits=k, shuffle=shuffle)
    return kf.split(data)
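# A sketch of calling the Kfold envelope above; `dataset` is assumed to be a
# (features, labels) pair, which is what the stratify branch indexes into.
import numpy as np

features = np.random.RandomState(2).rand(20, 3)
labels = np.array([0, 1] * 10)
for train_idx, test_idx in Kfold((features, labels), k=5, stratify=True):
    print(len(train_idx), len(test_idx))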
def rmseCvMean(model, X, y, cv=5, random_state=41):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    scr = 0
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scr += rmse(y_test, pred)
        print('\t', rmse(y_test, pred))
    return scr / cv
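# rmse() is not defined in the snippet above; a plausible helper, stated as an
# assumption rather than the original implementation, would be:
import numpy as np

def rmse(y_true, y_pred):
    # root mean squared error between two equal-length vectors
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))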
def cross_validation(sgd_clf, x_train, y_train):
    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in skfolds.split(x_train, y_train):  # 40000, 20000
        clone_clf = clone(sgd_clf)
        x_train_folds = x_train[train_index]
        y_train_folds = y_train[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)

    # Check that errors are raised if all n_labels for individual
    # classes are less than n_splits.
    y = np.array([3, 3, -1, -1, 2])
    assert_raises(ValueError, next, skf_3.split(X2, y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    error_string = ("k-fold cross-validation requires at least one"
                    " train/test split")
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 0)
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 1)

    # When n_splits is not an integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
def run_cv_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001,
                 power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9,
                 beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
    # use k-fold cross validation
    # we need to standardize the data for the KNN learner
    pipe_clf = Pipeline([('scl', StandardScaler()),
                         ('clf', MLPClassifier(alpha=alpha,
                                               batch_size=batch_size,
                                               learning_rate_init=learning_rate_init,
                                               power_t=power_t,
                                               max_iter=max_iter,
                                               momentum=momentum,
                                               beta_1=beta_1,
                                               beta_2=beta_2,
                                               hidden_layer_sizes=hidden_layer_sizes))])

    # resample the test data without replacement. This means that each data point is
    # part of a test and training set only once. (paraphrased from Raschka p.176).
    # In stratified k-fold, the classes are evenly distributed so that each test and
    # training set is an accurate representation of the whole.
    # this is the 0.17 version
    # kfold = StratifiedKFold(y=self.y_train, n_folds=self.cv, random_state=0)
    # this is the 0.18dev version
    skf = StratifiedKFold(n_splits=self.cv, random_state=0)

    # do the cross validation
    train_scores = []
    test_scores = []
    # for k, (train, test) in enumerate(kfold):
    for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)):
        # run the learning algorithm
        pipe_clf.fit(self.x_train[train], self.y_train[train])
        train_score = pipe_clf.score(self.x_train[test], self.y_train[test])
        train_scores.append(train_score)
        test_score = pipe_clf.score(self.x_test, self.y_test)
        test_scores.append(test_score)
        print('Fold:', k + 1, ', Training score:', train_score, ', Test score:', test_score)

    train_score = np.mean(train_scores)
    print('Training score is', train_score)

    test_score = np.mean(test_scores)
    print('Test score is', test_score)

    if do_plot:
        self.__plot_learning_curve(pipe_clf)

    return train_score, test_score
def evaluate_classifier(clf, features, labels):
    """
    Evaluates the classifier using StratifiedKFold cross validation. The
    precision and recall scores are used to evaluate the algorithm's
    performance.

    clf = classifier
    features = features list as returned by the targetFeatureSplit script
    labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import StratifiedKFold

    ### Use StratifiedKFold cross validation with 10 folds
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    precision = []
    recall = []
    count = 0

    ### Split the features and labels into training and testing sets.
    for train_index, test_index in skf.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for i in train_index:
            features_train.append(features[i])
            labels_train.append(labels[i])
        for j in test_index:
            features_test.append(features[j])
            labels_test.append(labels[j])

        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        precision.append(precision_score(labels_test, pred))
        recall.append(recall_score(labels_test, pred))
        count += 1

    print(clf)
    print("Folds:", count)
    print("Average Precision:", sum(precision) / count)
    print("Average Recall:", sum(recall) / count)
    print("")
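# A hedged usage sketch for evaluate_classifier; the DecisionTreeClassifier and the
# toy lists stand in for the real clf/features/labels from targetFeatureSplit.
from sklearn.tree import DecisionTreeClassifier

toy_features = [[float(i)] for i in range(20)]
toy_labels = [0, 1] * 10
evaluate_classifier(DecisionTreeClassifier(random_state=42), toy_features, toy_labels)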
def kfold_sklearn_model_train(train_X, train_y, parameters, n_fold, sklearn_model, logger):
    best_auc = 0
    best_param = None
    for params in tqdm(list(ParameterGrid(parameters))):
        logger.info('params: {}'.format(params))
        auc_lst = []
        skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
        for trn_i, val_i in skf.split(train_X, train_y):
            trn_x = train_X[trn_i]
            val_x = train_X[val_i]
            trn_y = train_y[trn_i]
            val_y = train_y[val_i]
            model = sklearn_model(**params)
            model.fit(trn_x, trn_y)
            pred = model.predict_proba(val_x)
            pred = np.array([p[1] for p in pred])
            fpr, tpr, thresholds = metrics.roc_curve(val_y, pred)
            auc = metrics.auc(fpr, tpr)
            logger.info('AUC: {}'.format(auc))
            auc_lst.append(auc)
        auc_avg = sum(auc_lst) / len(auc_lst)
        logger.info('AUC AVG: {}'.format(auc_avg))
        if best_auc < auc_avg:
            best_auc = auc_avg
            best_param = params
    logger.info('best params: {}'.format(best_param))
    logger.info('AUC: {}'.format(best_auc))
    logger.info('train by best parameters')
    model = sklearn_model(**best_param)
    model.fit(train_X, train_y)
    return model
def CrossVal(estimator, X, y, procsessor=None, cv=3, times=10, random_state=0, imb=False):
    """
    Cross-validation.

    estimator: the model
    X: the feature part of the dataset
    y: the labels of the dataset
    procsessor: preprocessor; in practice it performs feature selection
    cv: number of folds for the cross-validation
    times: how many times the cross-validation is repeated
    random_state: random seed
    imb: whether to resample so that positive and negative sample counts are
         balanced (a RandomUnderSampler is used here)
    """
    res = []
    for t in range(times):
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state + t)
        indices = list(skf.split(X=X, y=y))
        for k in indices:
            x_train, y_train, x_test, y_test = X[k[0]], y[k[0]], X[k[1]], y[k[1]]
            if imb:
                n, p = __lableCount(y_train)
                rus = RandomUnderSampler(random_state=random_state + t)
                x_train, y_train = rus.fit_sample(x_train, y_train)
            if procsessor is not None:
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)
            estimator.fit(x_train, y_train)
            res.append(Metrics.Score(estimator, x_test, y_test))
    res = np.array(res)
    return res
plt.subplot(2, 2, 2)
plt.imshow(tf.squeeze(resizing[311]), cmap='jet')
plt.title(f'{tf.squeeze(resizing[311]).shape}')
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------
'''

# Define the image augmentation; idg2 does no augmentation and only matches the input format
idg = ImageDataGenerator(height_shift_range=(-1, 1), width_shift_range=(-1, 1))
idg2 = ImageDataGenerator()

# Define the k-fold splitter
skf = StratifiedKFold(n_splits=40, random_state=42, shuffle=True)

# Define the callbacks
redu_lr = ReduceLROnPlateau(patience=80, verbose=1, factor=0.5)
stop = EarlyStopping(monitor='val_loss', patience=160, verbose=1, mode='min')
mc = ModelCheckpoint(filepath='../data/modelcheckpoint/dacon3/1st_01.h5',
                     save_best_only=True, verbose=1)

result = 0
nth = 0

# For each fold: build, compile, train, and evaluate the model
for train_index, valid_index in skf.split(train2, train['digit']):
    x_train = train2[train_index]
    x_val = train2[valid_index]
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 10000,
        'learning_rate': 0.05,
        'num_leaves': 34,  # we should let it be smaller than 2^(max_depth)
        'max_depth': 8,
        'subsample': 0.8715623,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequency of subsample, <=0 means no enable
        'colsample_bytree': 0.9497036,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 60,
        'min_split_gain': 0.0222415,  # lambda_l1, lambda_l2 and min_gain_to_split for regularization
        'reg_alpha': 0.041545473,  # L1 regularization term on weights
        'reg_lambda': 0.0735294,  # L2 regularization term on weights
        'nthread': 8,
        'seed': 42,
        'verbose': -1,
    }

    # lgb_params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'binary',
    #     'metric': 'auc',
    #     'n_estimators': 10000,
    #     'learning_rate': 0.05,
    #     'num_leaves': 15,  # we should let it be smaller than 2^(max_depth)
    #     # 'max_depth': 8,
    #     'subsample': 0.7225,  # Subsample ratio of the training instance.
    #     # 'subsample_freq': 1,  # frequency of subsample, <=0 means no enable
    #     'colsample_bytree': 0.8443,  # Subsample ratio of columns when constructing each tree.
    #     'min_child_weight': np.power(10, -1.7449),
    #     'min_split_gain': np.power(10, 0.1397),  # lambda_l1, lambda_l2 and min_gain_to_split for regularization
    #     'reg_alpha': np.power(10, -3.1527),  # L1 regularization term on weights
    #     'reg_lambda': np.power(10, 1.4779),  # L2 regularization term on weights
    #     'nthread': 8,
    #     'random_state': 42,
    #     'verbose': -1,
    # }

    CV_score = pd.DataFrame()
    FOLDS = []
    SCORE = []
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        xgtrain = lgb.Dataset(train_x, label=train_y, feature_name=feats)
        xgvalid = lgb.Dataset(valid_x, label=valid_y, feature_name=feats)

        evals_results = {}
        clf = lgb.train(lgb_params,
                        xgtrain,
                        valid_sets=[xgvalid],
                        valid_names=['valid'],
                        evals_result=evals_results,
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=200)

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        FOLDS.append(str(n_fold + 1))
        SCORE.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    FOLDS.append('Full AUC score')
    SCORE.append(roc_auc_score(train_df['TARGET'], oof_preds))
    CV_score['folds'] = FOLDS
    CV_score['score'] = SCORE
    CV_score.to_csv('CV_SCORE.csv', index=False)

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds / num_folds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    pd.DataFrame(data=oof_preds, columns=['TARGET']).to_csv('lgb_baseline_val_oof.csv', index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

sum_l = 0
n = 5
kf = StratifiedKFold(n_splits=n, random_state=None)
auc_score = 0
auc_score_2 = 0
for train_index, test_index in kf.split(X, y):
    # print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    gnb = LogisticRegression()
    classifier = gnb.fit(X_train, y_train)
    y_pred = gnb.predict_proba(X_test)[:, 1]
    y_class = gnb.predict(X_test)

    sum_l += metrics.accuracy_score(y_class, y_test)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_score += metrics.auc(fpr, tpr)
y = y.astype(np.uint8)
xtrain, xtest, ytrain, ytest = X[:60000], X[60000:], y[:60000], y[60000:]

ytrain_5 = (ytrain == 5)
ytest_5 = (ytest == 5)

sgd = SGDClassifier(random_state=42)
sgd.fit(xtrain, ytrain_5)

## Measure the accuracy using cross-validation
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfold.split(xtrain, ytrain_5):
    clone_sgd = clone(sgd)
    xtrain_folds = xtrain[train_index]
    ytrain_folds = ytrain_5[train_index]
    xtest_folds = xtrain[test_index]
    ytest_folds = ytrain_5[test_index]
    clone_sgd.fit(xtrain_folds, ytrain_folds)
    y_pred = clone_sgd.predict(xtest_folds)
    n_correct = sum(y_pred == ytest_folds)
    print(n_correct / len(y_pred))

print(cross_val_score(sgd, xtrain, ytrain_5, cv=3, scoring="accuracy"))
pipelines = {}
pipelines['fgMDM-Coh'] = make_pipeline(
    FeatConn("Coh", s, "test"),
    FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))
pipelines['fgMDM-PLV'] = make_pipeline(
    FeatConn("PLV", s, "test"),
    FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))
pipelines['fgMDM-Cov'] = make_pipeline(
    FeatConn("Cov", s, "test"),
    FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))

estimators = [('cov', pipelines['fgMDM-Cov']),
              ('coh', pipelines['fgMDM-Coh']),
              ('plv', pipelines['fgMDM-PLV'])]
final_estimator = RidgeClassifier(class_weight="balanced")
cvkf = StratifiedKFold(n_splits=5, shuffle=True)
scl = StackingClassifier(estimators=estimators, cv=cvkf, n_jobs=n_jobs,
                         final_estimator=final_estimator,
                         stack_method='predict_proba')
pipelines['Ensemble'] = scl

pipelines['Ensemble'].fit(X_train, y_train)
y_pred = pipelines['Ensemble'].predict(X_test)
y_pred = le.inverse_transform(y_pred)
for i, yp in enumerate(y_pred):
    res = {"subject name": "P{:02d}".format(s + 1),
           "trial index": i + 1,
           "prediction": yp}
    all_pred.append(res)
df_pred = pd.DataFrame(all_pred)
def train(infile):
    with gzip.open(infile, 'r') as file:
        data = np.genfromtxt(file, delimiter='\t', dtype=str)

    ## Split the data up into features and answers
    answers = []
    features = []
    for row in data[1:, ]:
        answers.append(row[1])
        features.append(row[2:])
    features = np.array(features)
    answers = np.array(answers)

    y_test_final = np.array([])
    predictions_final = np.array([])
    y_prob_final = np.ndarray(shape=(0, 2), dtype=int)

    ## First way to cross validate, not very easy to scale
    # scores = cross_val_score(mlp, features, answers, cv=10)
    # print(scores)

    ## Second way that is applicable; the problem is that it just takes the first
    ## certain number, and we don't know if there is a correlation
    # kf = KFold(n_splits=10)
    # for train, test in kf.split(features):

    ## Third way is the shuffle split.
    # ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    # for train, test in ss.split(features):

    ## Fourth way is the stratified fold.
    skf = StratifiedKFold(n_splits=10)
    for train, test in skf.split(features, answers):
        print("Training: %s \n Test: %s" % (train, test))
        scaler = StandardScaler()
        X_train, X_test, y_train, y_test = features[train], features[test], answers[train], answers[test]
        ## This sets the size of the scaler object
        scaler.fit(X_train)
        ## The MLP is super sensitive to feature scaling, so it is highly recommended to scale your data.
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        ####### This is where we want to implement the Ensemble method ######
        predictions, y_prob = mlp(X_train, X_test, y_train)
        # predictions, y_prob = rf(X_train, X_test, y_train)
        # predictions, y_prob = naiveBayes(X_train, X_test, y_train)
        # predictions, y_prob = kNearestNeighbor(X_train, X_test, y_train)
        # predictions, y_prob = supportVM(X_train, X_test, y_train)  ## Attention: returns a y_prob of 0 because it doesn't work with the SVM
        # predictions, y_prob = logisticRegression(X_train, X_test, y_train)

        ## This will show the confusion in a matrix that will tell how often we were correct
        y_test_final = np.concatenate([y_test_final, y_test])
        predictions_final = np.concatenate([predictions_final, predictions])
        y_prob_final = np.concatenate([y_prob_final, y_prob])

    print(confusion_matrix(y_test_final, predictions_final))
    print(classification_report(y_test_final, predictions_final))
    for i in range(len(y_prob_final)):
        print("Predicted value for item " + str(i + 1) + " : " +
              str(predictions_final[i]) + ", actual: " + str(y_test_final[i]))
        print("Probability : " + str(y_prob_final[i]))
def train_kfold(log_dir, hparams, model_name, k, state, X, Y,
                X_test=None, Y_test=None, X_train_add=None, Y_train_add=None,
                batch_size=20, epochs=200):
    '''
    X: The training set
    X_test: The testing set
    X_train_add: The additional set for model training, used for the 2nd experiment in paper.
    '''

    ''' Training '''
    # Log
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    model_dir = os.path.join(log_dir, timestamp, 'models')
    os.makedirs(model_dir)
    hist_dir = os.path.join(log_dir, timestamp, 'history')
    os.makedirs(hist_dir)
    eval_dir = os.path.join(log_dir, timestamp, 'evaluate')
    os.makedirs(eval_dir)
    params_savename = os.path.join(log_dir, timestamp, 'params.json')
    summary_savename = os.path.join(log_dir, timestamp, 'summary.json')
    test_hist_savename = os.path.join(log_dir, timestamp, 'test.json')

    # Start training
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=state)
    fold = 1
    accs = []
    for train_index, val_index in kfold.split(X, Y):
        model, params = compile_model(model_name, hparams)
        if fold == 1:
            print(model.summary())
            print(params)
        print('\n' + '=' * 60 + ' Fold: ' + str(fold) + ' ' + '=' * 60 + '\n')

        # Callback functions
        model_savename = os.path.join(model_dir, 'model{0}.h5'.format(str(fold)))
        hist_savename = os.path.join(hist_dir, 'history{0}.json'.format(str(fold)))
        val_savename = os.path.join(eval_dir, 'evaluate{0}.json'.format(str(fold)))
        cb_list = [
            callbacks.ModelCheckpoint(filepath=model_savename,
                                      monitor='val_acc',
                                      save_best_only=True),
            callbacks.EarlyStopping(monitor='acc', patience=6)
        ]

        # Add new training sets
        try:
            if X_train_add.any() and Y_train_add.any():
                x_train = np.concatenate([X[train_index], X_train_add], axis=0)
                y_train = np.concatenate([Y[train_index], Y_train_add], axis=0)
                index = list(range(len(y_train)))
                random.seed(state + 1)
                random.shuffle(index)
                x_train = x_train[index]
                y_train = y_train[index]
        except AttributeError:
            x_train = X[train_index]
            y_train = Y[train_index]

        history = model.fit(x_train, y_train,
                            validation_data=(X[val_index], Y[val_index]),
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=cb_list,
                            verbose=2)

        # Log
        hist_dict = history.history
        m = models.load_model(model_savename, custom_objects={'tf': tf})
        val_dict = evaluate_model(m, X[val_index], Y[val_index])
        accs.append(val_dict['accuracy'])
        log_to_json(hist_dict, hist_savename)
        log_to_json(val_dict, val_savename)

        fold += 1
        K.clear_session()
        print('Session cleared.')

    # Summary
    try:
        if X_test.any() and Y_test.any():
            model_path = os.path.join(
                model_dir, 'model{0}.h5'.format(accs.index(max(accs)) + 1))
            m = models.load_model(model_path, custom_objects={'tf': tf})
            test_dict = evaluate_model(m, X_test, Y_test)
            log_to_json(test_dict, test_hist_savename)
    except AttributeError:
        pass

    log_to_json(hparams, params_savename)
    summary = summary_kfold(eval_dir)
    print(summary)
    log_to_json(summary, summary_savename)
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.13, 0.2, .15, .25, .275, .33, 0.5, .66,
          0.75, 1.0, 2.5, 4.0, 4.5, 5.0, 5.1, 5.5, 6.0, 10.0, 100.0, 1000.0]
penalties = ['l1', 'l2']

param = {'penalty': penalties,
         'C': C_vals}

grid = GridSearchCV(logreg,
                    param,
                    verbose=False,
                    cv=StratifiedKFold(n_splits=5, random_state=10, shuffle=True),
                    n_jobs=1,
                    scoring='accuracy')

# In[ ]:

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

# In[ ]:

# grid.best_estimator_.fit(X_train, y_train)
# predict = grid.best_estimator_.predict(X_test)
imputer = ColumnTransformer([('imputer_media', imputer_media, num_cols),
                             ('imputer_moda', imputer_moda, cat_cols)])

# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, num_cols),
                            ('scaler_moda', scaler_moda, cat_cols)])

# Create the Pipeline incorporating the ColumnTransformers
pipeline = Pipeline([('imputer', imputer),
                     ('trans', trans),
                     ('scaler', scaler),
                     ('trans2', trans)])

# WORKAROUND: problems with the pipeline. RFE and RFECV run a 'check_X_y()'
# before calling the pipeline (which contains the imputer)
X = pipeline.fit_transform(X)

# 5 stratified folds for the RFECV
skf = StratifiedKFold(n_splits=5)

# Dictionary mapping RFE accuracy to an index
dict_1 = {}
# Dictionary mapping an index to the RFECV object
dict_2 = {}

time_prebucle = time.time()

# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c,
                   kernel=kernel,
                   class_weight=class_weight,
                   random_state=random_state)
# def build_fn():
#     model = MobileNet(
#         include_top=True,
#         input_shape=(64, 690, 1),
#         classes=2,
#         classifier_activation='softmax',
#         pooling=None,
#         weights=None,
#     )
#     return model

model = build_fn()
model.summary()

split = StratifiedKFold(n_splits=3, shuffle=True, random_state=10)
pred = []
pred_ = []

for train_idx, val_idx in split.split(train_x, train_y):
    x_train, y_train = train_x[train_idx], train_y[train_idx]
    x_val, y_val = train_x[val_idx], train_y[val_idx]

    model = build_fn()
    model.compile(optimizer=keras.optimizers.Adam(0.002),
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['acc'])
    history = model.fit(x=x_train, y=y_train,
                        validation_data=(x_val, y_val),
                        epochs=8)
    print("*******************************************************************")
# print nX.shape
# nX = SelectKBest(f_classif, k=10000).fit_transform(nX, ny)
# print nX.shape

# PCA
# print nX.shape
# nX = PCA(n_components=0.99, svd_solver="full").fit_transform(nX)
# print nX.shape

# Filter out NaN values
indices = np.array([not np.any(np.isnan(vec)) for vec in nX])
nX = nX[indices]
ny = ny[indices]

# Begin k-fold cross validation
kf = StratifiedKFold(n_splits=10)

# Optional - parameter evaluation using grid search
"""
param_grid = [
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
     'kernel': ['linear']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['sigmoid']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'degree': [2, 3], 'kernel': ['poly']}
]
grid = GridSearchCV(svm.SVC(class_weight="balanced"), param_grid=param_grid,
                    cv=kf, scoring="f1_macro", verbose=10)
grid.fit(nX, ny)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
"""
numpy.random.seed(seed)
tf.random.set_seed(3)

df = pd.read_csv('./data/dataset/sonar.csv', header=None)

dataset = df.values
X = dataset[:, 0:60].astype(float)
Y_obj = dataset[:, 60]

e = LabelEncoder()
e.fit(Y_obj)
Y = e.transform(Y_obj)

n_fold = 10
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

accuracy = []

for train, test in skf.split(X, Y):
    model = Sequential()
    model.add(Dense(24, input_dim=60, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X[train], Y[train], epochs=100, batch_size=5)
    k_accuracy = "%.4f" % (model.evaluate(X[test], Y[test])[1])
    accuracy.append(k_accuracy)

print("\n %.f fold_accuracy : " % n_fold, accuracy)
def smape_objective(preds, train_data):
    labels = train_data.get_label()
    grad = fgrad(preds, labels)
    hess = fhess(preds, labels)
    return grad, hess

def smape_error(preds, train_data):
    labels = train_data.get_label()
    return 'error', 100 * np.mean(np.fabs(preds - labels) / (preds + labels) * 2), False

from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
y_pred_lgb = np.zeros(len(X_test))

params = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'n_estimators': 720,
    'max_bin': 55,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11
}
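# fgrad and fhess are not defined in the snippet above; a plausible sketch, assuming
# the usual SMAPE definition 100 * mean(2*|p - l| / (p + l)), would be:
import numpy as np

def fgrad(preds, labels):
    # d/dp [2|p - l| / (p + l)] = 2*(sign(p - l)*(p + l) - |p - l|) / (p + l)**2
    denom = preds + labels
    return 2.0 * (np.sign(preds - labels) * denom - np.fabs(preds - labels)) / denom ** 2

def fhess(preds, labels):
    # SMAPE has no stable second derivative; a constant hessian is a common stand-in
    return np.ones_like(preds)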
from comet_ml import Experiment
from pytorch_lightning.callbacks import ModelCheckpoint
from src.transforms import ImageTransform
from src.utils import summarize_submit

import warnings
warnings.filterwarnings('ignore')

# Config ###########################
# Input Data
data_dir = './input'
# TTA
test_num = 20
# CV
cv = StratifiedKFold(n_splits=4, shuffle=True)


@hydra.main('config.yml')
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Random Seed
    seed_everything(cfg.train.seed)

    # Model ####################################################################
    net = ENet(model_name=cfg.train.model_name)
    transform = ImageTransform(img_size=cfg.data.img_size)

    # Comet.ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_group, _, pvalue_group = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        groups=np.ones(y.size), random_state=0)
    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_group, _, pvalue_group = permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", groups=np.ones(y.size), random_state=0)
    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() -
                 (y_true != y_pred).sum()) / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def RandomizedSearchCV_load_or_make(model, data, labels, random_grid, cv=5,
                                    scoring="accuracy", n_iter=20, random_state=47):
    import xgboost as xgb
    from xgboost import XGBClassifier
    from sklearn.model_selection import RandomizedSearchCV
    import pickle
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

    # we use RandomizedSearchCV to find the best parameters for our XGB model
    load_or_make = input("Load or make RandomizedSearchCV?")

    if load_or_make == "load":
        # pick a RandomizedSearchCV to load
        print('Choose a RandomizedSearchCV:')
        print('1. RandomizedSearchCV10_basic_all_features_neg_log_loss ')
        option = input()
        if option == '1':
            filename = '.../dataset/basic_modeli/RandomizedSearchCV10_basic_all_features_neg_log_loss.sav'
        else:
            print('The requested option does not exist!')
            raise ValueError('The requested option does not exist - while loading RandomizedSearchCV!')

        rand_XGB = pickle.load(open(filename, 'rb'))
        rand_XGB.get_params()

        # show results
        rand_XGB_results_df = pd.DataFrame(rand_XGB.cv_results_)[
            ['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]

        # plot of randomized search results
        rand_XGB_mean_scores = rand_XGB.cv_results_['mean_test_score']
        plt.plot(list(range(1, 21)), rand_XGB_mean_scores)
        plt.xlabel('k-th model of the RandomizedSearchCV training (XGB)')
        plt.ylabel('Cross-validation accuracy')

        return [rand_XGB, rand_XGB_results_df]

    elif load_or_make == "make":
        # getting ready for saving later
        name = input('Enter a name for the new RandomizedSearchCV run: ')
        filename = 'data/RandomizedSearchCV_' + name + '.sav'

        model_XGB = model

        # RandomizedSearchCV
        rand_XGB = RandomizedSearchCV(model_XGB, param_distributions=random_grid,
                                      cv=StratifiedKFold(n_splits=cv), scoring=scoring,
                                      n_iter=n_iter, random_state=random_state,
                                      return_train_score=False, verbose=True, n_jobs=-1)
        # fit
        rand_XGB.fit(data, labels)

        # save
        pickle.dump(rand_XGB, open(filename, 'wb'))
        print('RandomizedSearchCV saved to: ' + filename)

        # show results
        rand_XGB_results_df = pd.DataFrame(rand_XGB.cv_results_)[
            ['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]

        # plot of randomized search results
        rand_XGB_mean_scores = rand_XGB.cv_results_['mean_test_score']
        plt.plot(list(range(1, 21)), rand_XGB_mean_scores)
        plt.xlabel('k-th model of the RandomizedSearchCV training (XGB)')
        plt.ylabel('Cross-validation accuracy')

        return [rand_XGB, rand_XGB_results_df]

    else:
        print("Wrong input! Type 'load' or 'make'!")
        raise ValueError("Neither 'load' nor 'make' was entered - while loading/creating RandomizedSearchCV!")
def fit_predict(X, y, X_pred):
    predictors = [i for i in X.columns]

    stacking_num = 5
    bagging_num = 3
    bagging_test_size = 0.33
    num_boost_round = 500
    early_stopping_rounds = 100

    stacking_model = []
    bagging_model = []
    l2_error = []

    X = X.values
    y = y.values

    layer_train = np.zeros((X.shape[0], 2))
    SK = StratifiedKFold(n_splits=stacking_num, shuffle=True, random_state=1)
    for k, (train_index, test_index) in enumerate(SK.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=early_stopping_rounds)

        # keep each fold's out-of-fold predictions as the stacking feature used below
        layer_train[test_index, 1] = gbm.predict(X_test, num_iteration=gbm.best_iteration)

        stacking_model.append(gbm)

    X = np.hstack((X, layer_train[:, 1].reshape((-1, 1))))
    predictors.append('lgb_result')

    for bn in range(bagging_num):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=bagging_test_size, random_state=bn)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200)

        bagging_model.append(gbm)

        l2_error.append(
            mean_squared_error(
                gbm.predict(X_test, num_iteration=gbm.best_iteration), y_test))

        feat_imp = pd.Series(gbm.feature_importance(),
                             predictors).sort_values(ascending=False)

    test_pred = np.zeros((X_pred.shape[0], stacking_num))
    for sn, gbm in enumerate(stacking_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        test_pred[:, sn] = pred
    X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1, 1))))

    for bn, gbm in enumerate(bagging_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        if bn == 0:
            pred_out = pred
        else:
            pred_out += pred

    return pred_out / bagging_num, feat_imp
        verbose=False)
    score = model.best_score_["valid_0"]["multi_logloss"]
    return {'loss': score, 'status': STATUS_OK, 'model': model}

trials = Trials()
best = fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest,
            max_evals=10, verbose=1)
hyperparams = space_eval(space, best)
n_best = trials.best_trial['result']['model'].best_iteration_
params.update(hyperparams)
print(params)
# End of hyperparameter tuning.

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(**params)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            verbose=3,
            early_stopping_rounds=20)
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold
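# The fragment above starts mid-objective; a minimal self-contained sketch of
# such a hyperopt + LightGBM loop, on synthetic data (`make_classification`),
# with an assumed two-parameter search space. A sketch, not the original script.
import lightgbm as lgb
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_classes=3, n_informative=5, random_state=0)
X_trn, X_val, y_trn, y_val = train_test_split(X, y, stratify=y, random_state=0)

space = {
    'num_leaves': hp.choice('num_leaves', [15, 31, 63]),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
}

def objective(hyperparams):
    model = lgb.LGBMClassifier(objective='multiclass', n_estimators=100, **hyperparams)
    model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], eval_metric='multi_logloss')
    score = model.best_score_['valid_0']['multi_logloss']
    return {'loss': score, 'status': STATUS_OK, 'model': model}

trials = Trials()
best = fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=10)
print(space_eval(space, best))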
def discriminate(X, y, nDmax):
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize variables and clean up the data.
    classes, classesCount = np.unique(y, return_counts=True)
    # Classes to be discriminated should be the same as ldaMod.classes_.
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    classes, classesCount = np.unique(yGood, return_counts=True)
    nClasses = classes.size  # number of classes or groups
    cvFolds = min(min(classesCount), CVFOLDS)
    if cvFolds < CVFOLDS:
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)'
              % (cvFolds, CVFOLDS))

    # Data size.
    nD = XGood.shape[1]  # number of features in X
    nX = XGood.shape[0]  # number of data points in X

    # Use a uniform prior.
    myPrior = np.ones(nClasses) * (1.0 / nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # nDmax = int(np.fix(np.sqrt(nX // 5)))
    if nDmax < nD:
        print('Warning: Insufficient data for', nD, 'parameters. PCA projection to',
              nDmax, 'dimensions.')
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_) * 100.0))

    # Initialise the classifiers.
    ldaMod = LDA(n_components=min(nDmax, nClasses - 1), priors=myPrior,
                 shrinkage=None, solver='svd')
    qdaMod = QDA(priors=myPrior)
    rfMod = RF()  # by default assumes equal weights

    # Perform cvFolds-fold cross-validation to get the performance of the classifiers.
    ldaYes = 0
    qdaYes = 0
    rfYes = 0
    cvCount = 0
    skf = StratifiedKFold(n_splits=cvFolds)
    skfList = skf.split(Xr, yGood)

    for train, test in skfList:
        # Enforce MINCOUNTTRAINING in each class for training.
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and the priors.
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]
        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data.
        if ntrainClasses < 2:
            continue
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if goodInd.size == 0:
            continue

        # Fit the data.
        trainPriors = np.ones(ntrainClasses) * (1.0 / ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        ldaYes += np.around(ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]]) * goodInd.size)
        qdaYes += np.around(qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]]) * goodInd.size)
        rfYes += np.around(rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]]) * goodInd.size)
        cvCount += goodInd.size

    ldaYes = int(ldaYes)
    qdaYes = int(qdaYes)
    rfYes = int(rfYes)

    # Binomial tail probabilities of the observed hit counts against chance (p = 1/nClasses).
    p = 1.0 / nClasses
    ldaP = 0
    qdaP = 0
    rfP = 0
    for k in range(ldaYes, cvCount + 1):
        ldaP += binom.pmf(k, cvCount, p)
    for k in range(qdaYes, cvCount + 1):
        qdaP += binom.pmf(k, cvCount, p)
    for k in range(rfYes, cvCount + 1):
        rfP += binom.pmf(k, cvCount, p)

    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0 / nClasses))
    print("LDA: %.2f %% (%d/%d p=%.4f)" % (100.0 * ldaYes / cvCount, ldaYes, cvCount, ldaP))
    print("QDA: %.2f %% (%d/%d p=%.4f)" % (100.0 * qdaYes / cvCount, qdaYes, cvCount, qdaP))
    print("RF: %.2f %% (%d/%d p=%.4f)" % (100.0 * rfYes / cvCount, rfYes, cvCount, rfP))
    # return ldaYes, qdaYes, rfYes, cvCount, ldaP, qdaP, rfP, nClasses, weights
    return 100.0 * ldaYes / cvCount, 100.0 * qdaYes / cvCount, 100.0 * rfYes / cvCount
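# The three tail-probability loops above compute P(K >= k) for K ~ Binomial(cvCount, p);
# scipy's survival function does this in one call. A small equivalence check on
# hypothetical counts (not taken from the original):
from scipy.stats import binom

cvCount, yes, p = 100, 42, 0.25
loop_tail = sum(binom.pmf(k, cvCount, p) for k in range(yes, cvCount + 1))
sf_tail = binom.sf(yes - 1, cvCount, p)  # P(K > yes - 1) == P(K >= yes)
assert abs(loop_tail - sf_tail) < 1e-12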
def kfold_lightgbm(df, num_folds, stratified=False):
    # Divide into training/validation and test data.
    train_df = df[df['FLAG'] != -1]
    test_df = df[df['FLAG'] == -1]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross-validation model. random_state only has an effect when shuffle=True,
    # so it is omitted in the unshuffled stratified case.
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['FLAG', 'USRID']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['FLAG'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['FLAG'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['FLAG'].iloc[valid_idx]
        clf = lgb.LGBMClassifier(nthread=4,
                                 n_estimators=3000,
                                 learning_rate=0.02,
                                 num_leaves=31,
                                 colsample_bytree=0.997212866002197,
                                 bagging_fraction=0.7733927534732657,
                                 min_data_in_leaf=37,
                                 min_child_weight=13.05659547343758,
                                 min_split_gain=0.027258234021548238,
                                 reg_lambda=0.12367585365238067,
                                 verbose=0)
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=150)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats],
                                       num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['FLAG'], oof_preds))
    # Write the submission file and plot feature importance.
    train_df['RST'] = oof_preds
    test_df['RST'] = sub_preds
    test_df[['USRID', 'RST']].to_csv('submission6.csv', index=False, sep='\t')
    display_importances(feature_importance_df)
    return train_df[['USRID', 'RST']], test_df[['USRID', 'RST']], feature_importance_df
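# A hypothetical driver for kfold_lightgbm above, with synthetic data and a
# no-op stand-in for the `display_importances` helper the function assumes;
# it also presumes the LightGBM 3.x fit API used above. A sketch only.
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

def display_importances(feature_importance_df):
    pass  # stand-in; the original presumably plots the mean importances

rng = np.random.RandomState(0)
full_df = pd.DataFrame(rng.rand(1000, 5), columns=['f%d' % i for i in range(5)])
full_df['USRID'] = np.arange(1000)
full_df['FLAG'] = np.where(np.arange(1000) < 800, rng.randint(0, 2, 1000), -1)

oof, sub, imp = kfold_lightgbm(full_df, num_folds=5, stratified=True)
print(imp.groupby('feature')['importance'].mean().sort_values(ascending=False))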
MAX_FEATURES = 195000
MAX_LEN = 150
MODEL_IDENTIFIER = "fastext_minimum_preproc_reg"

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)
print(train.shape, test.shape)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Get validation folds, stratified on the concatenated label string.
# shuffle=True is required for random_state to take effect.
train['target_str'] = reduce(lambda x, y: x + y,
                             [train[col].astype(str) for col in list_classes])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110', '000000')
cvlist1 = list(StratifiedKFold(n_splits=10, shuffle=True, random_state=786).split(
    train, train['target_str'].astype('category')))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(
    train, train['target_str'].astype('category')))

# Normalize text.
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

# stemmer = PorterStemmer()
# def custom_tokenize(text):
#     tokens = wordpunct_tokenize(text)
#     tokens = [stemmer.stem(token) for token in tokens]
#     return tokens

# Tokenize comments.
tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=wordpunct_tokenize)
X = tok.fit_transform(pd.concat([train["comment_text"].astype(str).fillna("na"),
                                 test["comment_text"].astype(str).fillna("na")]))
X_train = X[:len(train), :]
    path, dataset), dtype=np.dtype(int))  # truncated call; np.int was removed in NumPy 1.24, use int
nb_rows = len(clust)
Data = np.zeros((nb_rows, nb_columns), dtype=np.float32)
for i in range(nb_rows):
    row = prepare_activity_score_feature_vector(features, labels, clust[i], clusters)
    Data[i, :] = row
X = np.transpose(Data)

# Activity-score features are sorted as label 0 then label 1, so rearrange the
# labels accordingly (0s first, then 1s).
labels.sort()
y = np.asarray(labels, dtype=int)

# Run the classifier with cross-validation and plot ROC curves.
cv = StratifiedKFold(n_splits=5, shuffle=True)
classifier = LogisticRegression(solver='lbfgs', max_iter=500)
max_iter = 100

if False:  # disabled ROC-plotting branch
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    i = 0
    for train, test in cv.split(X, y):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute the ROC curve and the area under the curve.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing (25%)
# sets: with 4 stratified folds, one fold (25%) serves as the test set.
skf = StratifiedKFold(n_splits=4)
# Only take the first fold.
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]
print(X_train.shape)
print(X_test.shape)

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
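# The first-fold trick above is essentially a 75/25 stratified split; sklearn's
# train_test_split expresses the same intent directly (a sketch, not the
# original code):
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, stratify=iris.target, random_state=0)
print(X_train.shape, X_test.shape)  # (112, 4) (38, 4)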
def fit(self, data):
    # Split the training data for phase 1 and phase 2.
    if self.task_type in CLS_TASKS:
        kf = StratifiedKFold(n_splits=self.kfold)
    else:
        kf = KFold(n_splits=self.kfold)

    # Train the base models using one part of the training data.
    model_cnt = 0
    suc_cnt = 0
    feature_p2 = None
    for algo_id in self.stats["include_algorithms"]:
        model_to_eval = self.stats[algo_id]['model_to_eval']
        for idx, (node, config) in enumerate(model_to_eval):
            X, y = node.data
            if self.base_model_mask[model_cnt] == 1:
                for j, (train, test) in enumerate(kf.split(X, y)):
                    x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[test]
                    estimator = fetch_predict_estimator(
                        self.task_type,
                        config,
                        x_p1,
                        y_p1,
                        weight_balance=data.enable_balance,
                        data_balance=data.data_balance)
                    with open(os.path.join(self.output_dir,
                                           '%s-model%d_part%d' % (self.timestamp, model_cnt, j)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: keep only the positive-class column.
                            n_dim = 1
                        # Initialize the training matrix for phase 2.
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1
                        # Initialize the training matrix for phase 2.
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                suc_cnt += 1
            model_cnt += 1
    # Train the stacking meta-learner on the out-of-fold predictions.
    self.meta_learner.fit(feature_p2, y)
    return self
def plot_learning_curve(model, X_train, y_train, X_test, y_test, cv, seed):
    import warnings
    warnings.filterwarnings("ignore")

    # Load libraries.
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score, make_scorer, log_loss
    from sklearn.model_selection import learning_curve, StratifiedKFold
    # plt.style.use('ggplot')

    malware_dict = {
        1: 'Ramnit', 2: 'Lollipop', 3: 'Kelihos_ver3', 4: 'Vundo', 5: 'Simda',
        6: 'Tracur', 7: 'Kelihos_ver1', 8: 'Obfuscator.ACY', 9: 'Gatak'}

    # Create CV training and test scores for various training set sizes.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X_train, y_train,
        cv=StratifiedKFold(n_splits=cv),
        scoring="accuracy",
        # scoring=make_scorer(log_loss, needs_proba=True, labels=list(malware_dict.keys())),
        n_jobs=-1,
        random_state=seed)

    # Means and standard deviations of the training set scores.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Means and standard deviations of the test set scores.
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines.
    plt.subplots(1, figsize=(12, 12))
    plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score")
    plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

    # Draw bands.
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create the plot.
    plt.title("Learning curve")
    plt.xlabel("Training set size")
    plt.ylabel("Accuracy")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

    # Make predictions for the test data.
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # Evaluate the predictions.
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # Retrieve the performance metrics.
    results = model.evals_result()
    epochs = len(results['validation_0']['merror'])
    x_axis = range(0, epochs)

    # Plot log loss.
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
    ax.legend()
    plt.ylabel('Log Loss')
    plt.title('XGBoost Log Loss')
    plt.show()

    # Plot classification error.
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.plot(x_axis, results['validation_0']['merror'], label='Train')
    ax.plot(x_axis, results['validation_1']['merror'], label='Test')
    ax.legend()
    plt.ylabel('Classification error')
    plt.title('XGBoost classification error')
    plt.show()
# Learning and prediction -------------------------------------------------------------
print("BayesSearch")
# Note: the search space must use valid ExtraTreesClassifier parameters:
# 'splitter' belongs to single decision trees (removed here), and the fraction
# parameter is named 'min_weight_fraction_leaf' with valid range [0, 0.5].
bayes_cv_tuner = BayesSearchCV(
    estimator=ExtraTreesClassifier(n_estimators=300,
                                   random_state=0,
                                   class_weight="balanced"),
    search_spaces={
        'criterion': ["gini", "entropy"],
        'min_samples_split': (2, 100),
        'min_samples_leaf': (1, 100),
        'min_weight_fraction_leaf': (0.0, 0.5),
        'max_depth': (1, 50),
    },
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    n_jobs=-3,
    n_iter=10,
    verbose=0,
    refit=True,
    random_state=42)

result = bayes_cv_tuner.fit(train_mod[selected_features].values,
                            target_mod.values,
                            callback=status_print)
# Model
# Best ROC-AUC:
# Best params:
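# `status_print` above is an assumed helper; a minimal sketch of a callback that
# skopt's BayesSearchCV invokes after each iteration (hypothetical, define it
# before the fit call above):
import pandas as pd

def status_print(optim_result):
    """Print the best score and parameters found so far."""
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        round(bayes_cv_tuner.best_score_, 4),
        best_params.to_dict()))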
def mean_encode(train_data, test_data, columns, target_col, reg_method=None,
                alpha=0, add_random=False, rmean=0, rstd=0.1, folds=1):
    '''Returns a DataFrame with encoded columns'''
    length_train = len(train_data)
    encoded_cols = []
    target_mean_global = train_data[target_col].mean()
    for col in columns:
        # Getting means for the test data.
        nrows_cat = train_data.groupby(col)[target_col].count()
        target_means_cats = train_data.groupby(col)[target_col].mean()
        target_means_cats_adj = (target_means_cats * nrows_cat +
                                 target_mean_global * alpha) / (nrows_cat + alpha)
        # Mapping means to the test data.
        encoded_col_test = test_data[col].map(target_means_cats_adj)
        # Getting the train encodings.
        if reg_method == 'expanding_mean':
            train_data_shuffled = train_data.sample(frac=1, random_state=1)
            cumsum = (train_data_shuffled.groupby(col)[target_col].cumsum() -
                      train_data_shuffled[target_col])
            cumcnt = train_data_shuffled.groupby(col).cumcount()
            encoded_col_train = cumsum / cumcnt
            encoded_col_train.fillna(target_mean_global, inplace=True)
            if add_random:
                encoded_col_train = encoded_col_train + normal(
                    loc=rmean, scale=rstd, size=(encoded_col_train.shape[0]))
        elif (reg_method == 'k_fold') and (folds > 1):
            kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1).split(
                train_data[target_col].values, train_data[target_col])
            parts = []
            for tr_in, val_ind in kfold:
                # Divide the data.
                df_for_estimation, df_estimated = train_data.iloc[tr_in], train_data.iloc[val_ind]
                # Getting means on the estimation data (all folds except the estimated one).
                nrows_cat = df_for_estimation.groupby(col)[target_col].count()
                target_means_cats = df_for_estimation.groupby(col)[target_col].mean()
                target_means_cats_adj = (target_means_cats * nrows_cat +
                                         target_mean_global * alpha) / (nrows_cat + alpha)
                # Mapping means to the estimated fold.
                encoded_col_train_part = df_estimated[col].map(target_means_cats_adj)
                if add_random:
                    encoded_col_train_part = encoded_col_train_part + normal(
                        loc=rmean, scale=rstd, size=(encoded_col_train_part.shape[0]))
                # Saving the estimated encodings for this fold.
                parts.append(encoded_col_train_part)
            encoded_col_train = pd.concat(parts, axis=0)
            encoded_col_train.fillna(target_mean_global, inplace=True)
        else:
            encoded_col_train = train_data[col].map(target_means_cats_adj)
            if add_random:
                encoded_col_train = encoded_col_train + normal(
                    loc=rmean, scale=rstd, size=(encoded_col_train.shape[0]))
        # Saving the column with the means.
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = target_mean_global
        encoded_cols.append(pd.DataFrame({'mean_' + target_col + '_' + col: encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    # Modified to reindex.
    all_encoded = all_encoded.reset_index()
    return (all_encoded.iloc[:length_train].reset_index(drop=True),
            all_encoded.iloc[length_train:].reset_index(drop=True))
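# A small end-to-end sketch of the encoder above on synthetic data; `normal`
# refers to numpy.random.normal, which the function assumes is imported along
# with pd and StratifiedKFold. A sketch, not the original usage.
import numpy as np
import pandas as pd
from numpy.random import normal
from sklearn.model_selection import StratifiedKFold

rng = np.random.RandomState(1)
train = pd.DataFrame({'city': rng.choice(['a', 'b', 'c'], 100),
                      'target': rng.randint(0, 2, 100)})
test = pd.DataFrame({'city': rng.choice(['a', 'b', 'c'], 40)})

enc_train, enc_test = mean_encode(train, test, columns=['city'],
                                  target_col='target', reg_method='k_fold',
                                  alpha=5, folds=5)
print(enc_train.head())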
def kfold_xgb(df, num_folds, stratified=False):
    # Divide into training/validation and test data.
    train_df = df[df['FLAG'] != -1]
    test_df = df[df['FLAG'] == -1]
    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross-validation model; as above, random_state is dropped when shuffle=False.
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in ['FLAG', 'USRID']]
    feature_importance_df = pd.DataFrame()

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['FLAG'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['FLAG'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['FLAG'].iloc[valid_idx]
        train_x = xgb.DMatrix(train_x, label=train_y)
        valid_x = xgb.DMatrix(valid_x, label=valid_y)
        params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eval_metric': 'auc',
            'max_depth': 4,
            'subsample': 0.85,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.8,
            'tree_method': 'exact',
            'seed': 0,
            'nthread': 4,
            'gamma': 0.5,
            'min_child_weight': 50,
        }
        watchlist = [(train_x, 'train'), (valid_x, 'val')]
        clf = xgb.train(params,
                        train_x,
                        num_boost_round=3000,
                        evals=watchlist,
                        early_stopping_rounds=90)

        test = xgb.DMatrix(test_df[feats])
        oof_preds[valid_idx] = clf.predict(valid_x, ntree_limit=clf.best_ntree_limit)
        sub_preds += clf.predict(test, ntree_limit=clf.best_ntree_limit) / folds.n_splits
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        xgb.plot_importance(clf)
        fscore = clf.get_fscore()
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = list(fscore.keys())
        fold_importance_df['importance'] = list(fscore.values())
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['FLAG'], oof_preds))
    # Write the submission file and plot feature importance.
    train_df['RST'] = oof_preds
    test_df['RST'] = sub_preds
    # test_df[['USRID', 'RST']].to_csv('submission6.csv', index=False, sep='\t')
    return train_df[['USRID', 'RST']], test_df[['USRID', 'RST']], feature_importance_df
        model = RandomForestClassifier(n_estimators=300,
                                       bootstrap=True,
                                       max_features='sqrt',
                                       n_jobs=2,
                                       random_state=1)
    elif args.m == 'dt':
        model = DecisionTreeClassifier(max_depth=10, random_state=1)
    elif args.m == 'svm':
        model = SVC(kernel='linear', C=1.0, random_state=1)
    elif args.m == 'nb':
        model = GaussianNB()
    elif args.m == 'knn':
        model = KNeighborsClassifier(n_neighbors=1)
    elif args.m == 'all':
        for mm in ['rf', 'dt', 'svm', 'nb', 'knn']:
            cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
            acc, recall, prec, f1, TN, TP, FP, FN = 0, 0, 0, 0, 0, 0, 0, 0
            for (train, test), i in zip(cv.split(x, y), range(5)):
                if mm == 'rf':
                    model = RandomForestClassifier(n_estimators=300,
                                                   bootstrap=True,
                                                   max_features='sqrt',
                                                   n_jobs=2,
                                                   random_state=1)
                elif mm == 'dt':
                    model = DecisionTreeClassifier(max_depth=10, random_state=1)
                elif mm == 'svm':
                    model = SVC(kernel='linear', C=1.0, random_state=1)
                elif mm == 'nb':
                    model = GaussianNB()
                elif mm == 'knn':
"Ubicación"]] = cases_covid[[ "Ciudad.de.residencia", "Sexo", "Tipo.de.caso", "Ubicación" ]].astype(str) cases_covid.replace(dict, inplace=True) # print(cases_covid.columns) print(cases_covid.info()) # print(cases_covid) x = cases_covid.loc[:, cases_covid.columns != 'Estado'] y = cases_covid.loc[:, 'Estado'] n_samples, n_features = x.shape random_state = np.random.RandomState(0) x = np.c_[x, random_state.randn(n_samples, 200 * n_features)] cv = StratifiedKFold(n_splits=10) classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state) tprs = [] aucs = [] mean_fpr = np.linspace(0, 1, 100) fig, ax = plt.subplots() for i, (train, test) in enumerate(cv.split(x, y)): classifier.fit(x[train], y[train]) viz = plot_roc_curve(classifier, x[test], y[test], name='ROC fold {}'.format(i),