def _get_fold_generator(target_values):
    """Return a generator of (train, test) index arrays over ``target_values``.

    The splitter is selected by ``params.stratified_cv``: a shuffled
    ``StratifiedKFold`` (stratified on ``target_values`` itself) when it is
    truthy, otherwise a shuffled ``KFold``. Both use ``params.n_cv_splits``
    folds and ``cfg.RANDOM_SEED`` as the seed.
    """
    if not params.stratified_cv:
        plain = KFold(n_splits=params.n_cv_splits, shuffle=True,
                      random_state=cfg.RANDOM_SEED)
        return plain.split(target_values)

    stratified = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True,
                                 random_state=cfg.RANDOM_SEED)
    stratified.get_n_splits(target_values)
    # The targets serve both as the data to index and as the strata.
    return stratified.split(target_values, target_values)
def test_shuffle_stratifiedkfold():
    """Shuffled StratifiedKFold must vary with the seed and cover all samples."""
    samples = np.ones(40)
    targets = [0] * 20 + [1] * 20
    seeded_0 = StratifiedKFold(5, shuffle=True, random_state=0)
    seeded_1 = StratifiedKFold(5, shuffle=True, random_state=1)

    paired_folds = zip(seeded_0.split(samples, targets),
                       seeded_1.split(samples, targets))
    for fold_0, fold_1 in paired_folds:
        # Only the test indices (second element of each fold) are compared.
        assert_not_equal(set(fold_0[1]), set(fold_1[1]))

    check_cv_coverage(seeded_0, samples, targets, labels=None, expected_n_iter=5)
def Kfold(dataset, k, shuffle=False, stratify=False):
    """Envelope function for the folding operation.

    Parameters
    ----------
    dataset : sequence
        ``dataset[0]`` is the sample data to split. ``dataset[1]`` is read
        only when ``stratify`` is true and is used as the stratification
        labels.
    k : int
        Number of folds.
    shuffle : bool
        Forwarded to the splitter's ``shuffle`` option.
    stratify : bool
        Use ``StratifiedKFold`` instead of ``KFold``.

    Returns
    -------
    generator
        Yields ``(train_indices, test_indices)`` pairs.
    """
    # Class labels are not part of the data handed to the splitter.
    data = dataset[0]
    # ``shuffle`` is keyword-only in current scikit-learn; passing it
    # positionally raises TypeError there.
    if stratify:
        return StratifiedKFold(n_splits=k, shuffle=shuffle).split(data, dataset[1])
    return KFold(n_splits=k, shuffle=shuffle).split(data)
def test_kfold_valueerrors():
    """Check the errors and warnings raised by KFold and StratifiedKFold."""
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)

    # Check that errors are raised if all n_labels for individual
    # classes are less than n_splits. (This check used to appear twice,
    # verbatim; it is run once here.)
    y = np.array([3, 3, -1, -1, 2])
    assert_raises(ValueError, next, skf_3.split(X2, y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    error_string = ("k-fold cross-validation requires at least one"
                    " train/test split")
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 0)
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 1)

    # When n_splits is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
def cv_score(X, y, n_epochs = 10, n_folds=10, random_state=1999):
    """Train one ``KerasWrapper`` per fold, one epoch at a time, and score it.

    The stratified folds are computed once and reused for every epoch, so
    each fold's classifier keeps training on the same split.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; indexed as ``X[rows, :]``.
    y : array
        Targets; indexed as ``y[rows]`` and used for stratification.
    n_epochs : int
        Number of outer training rounds.
    n_folds : int
        Number of stratified folds (and of classifiers).
    random_state : int
        Seed for the shuffled ``StratifiedKFold``.

    Returns
    -------
    (scores, val_scores, best_epochs)
        ``scores`` and ``val_scores`` have shape ``(n_folds, n_epochs)``;
        ``best_epochs`` has shape ``(n_folds,)`` and holds the value
        returned by the most recent ``fit`` call of each fold.
    """
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i))
            for i in range(n_folds)]

    # Stratify on the ``y`` argument (the original referenced an undefined
    # ``y_train``). Materialise the folds so they can be replayed per epoch.
    kfsplit = list(kf.split(X, y))

    # iteratively train epochs
    for epoch in range(n_epochs):
        print('=============Epoch {}================'.format(epoch))
        for i_fold, (itrain, itest) in enumerate(kfsplit):
            print('Fold ', i_fold)
            train = X[itrain, :]
            test = X[itest, :]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain,
                                                     nb_epoch=1,
                                                     validation_split=None,
                                                     batch_size=64,
                                                     patience=1)
            print('score: {}'.format(score))
            scores[i_fold, epoch] = score
            best_epochs[i_fold] = num_epoch
            # predict on the out-of-fold samples
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, epoch] = val_score
    return scores, val_scores, best_epochs
def test_kfold_valueerrors():
    """Check errors and warnings of KFold / StratifiedKFold (n_folds-era API)."""
    three_rows = np.array([[1, 2], [3, 4], [5, 6]])
    five_rows = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])

    # Too few samples for the requested number of folds.
    assert_raises(ValueError, next, KFold(4).split(three_rows))

    # A warning is expected when the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])
    stratified_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, stratified_3.split(five_rows, y))

    # Despite the warning the folds are still computed, even though not
    # every class is necessarily present on each side of each split.
    with warnings.catch_warnings():
        check_cv_coverage(stratified_3, five_rows, y, labels=None,
                          expected_n_iter=3)

    splitters = (KFold, StratifiedKFold)

    # Error when number of folds is <= 1
    for splitter in splitters:
        for too_few in (0, 1):
            assert_raises(ValueError, splitter, too_few)

    # When n_folds is not integer:
    for splitter in splitters:
        for non_integer in (1.5, 2.0):
            assert_raises(ValueError, splitter, non_integer)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_folds=4, shuffle=None)
def stratified_cross_validate(self, k):
    """Yield ``(train_X, train_y, test_X, test_y)`` tuples for cross-validation.

    Training and testing attributes/labels stored on ``self`` are pooled,
    joined row-wise (label as last column) and shuffled in place. The
    generator then yields:

    1. the folds of a 2-split ``StratifiedKFold`` over the pooled data, and
    2. ``k`` contiguous, equally sized folds cut from the shuffled data.

    NOTE(review): the stratified part always uses 2 splits regardless of
    ``k``, and both fold families are yielded one after the other. This
    mirrors the existing behaviour; confirm whether both are intended.

    Parameters
    ----------
    k : int
        Number of contiguous folds in the second part.
    """
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    all_data = np.array([np.append(row, label)
                         for row, label in zip(attributes, labels)])
    np.random.shuffle(all_data)

    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)

    skf = StratifiedKFold(n_splits=2)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield (X_train, y_train, X_test, y_test)

    # Integer division: slice bounds and np.arange limits must be ints.
    # Any remainder rows at the end are never used as test data.
    split = len(all_data) // k
    for i in range(k):
        start, stop = i * split, (i + 1) * split
        test_data = all_data[start:stop, :]
        train_data = np.delete(all_data, np.arange(start, stop), axis=0)
        train_input, train_output = train_data[:, :-1], train_data[:, -1]
        test_input, test_output = test_data[:, :-1], test_data[:, -1]
        yield (train_input, train_output, test_input, test_output)
def test_datasets(dataset_names):
    """Run a 2-fold stratified SVC evaluation on each named dataset.

    Parameters
    ----------
    dataset_names
        Forwarded to ``Data(dataset_names=...)``.

    Returns
    -------
    dict
        Maps each dataset name to an array of per-fold accuracies
        (in percent).
    """
    from sklearn.svm import SVC

    data = Data(dataset_names=dataset_names)
    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        # The splitter is configured with ``n_splits``; the labels are
        # supplied to ``split`` below. (The previous constructor call mixed
        # the removed ``StratifiedKFold(y, n_folds=...)`` form with the
        # ``split`` API and could not run under either.)
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        folds = skf.split(X=dataset.data, y=dataset.target)
        for test_fold, (train_idx, test_idx) in enumerate(folds):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]
            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100*np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
    return accuracies
def cv(X_train, y_train):
    """Print per-fold and mean F1 / precision / recall of ``TargetEnsembler``.

    Runs a shuffled 5-fold stratified split over the given DataFrames,
    fits a fresh ``TargetEnsembler(features)`` on each training part and
    scores its predictions on the held-out part. Nothing is returned.
    """
    # (per-fold label, summary label, metric function), in print order
    metrics_table = (
        ("\tscores f1", "mean scores f1", f1_score),
        ("\tscores p", "mean scores p", precision_score),
        ("\tscores r", "mean scores r", recall_score),
    )
    collected = [[] for _ in metrics_table]

    def as_features(rows):
        return pd.DataFrame(X_train.values[rows], columns=X_train.columns)

    def as_target(rows):
        return pd.DataFrame(y_train.values[rows], columns=["PCL_Strict3"])

    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    for fit_rows, held_rows in splitter.split(X_train, y_train):
        model = TargetEnsembler(features)
        X_fit, y_fit = as_features(fit_rows), as_target(fit_rows)
        X_held, y_held = as_features(held_rows), as_target(held_rows)

        model.fit(X_fit, y_fit)
        predicted = model.predict(X_held)

        # Compute all three metrics first, then report them.
        fold_values = [metric(y_held, predicted) for _, _, metric in metrics_table]
        for (fold_label, _, _), value in zip(metrics_table, fold_values):
            print(fold_label, (value))
        for bucket, value in zip(collected, fold_values):
            bucket.append(value)

    for (_, mean_label, _), bucket in zip(metrics_table, collected):
        print(mean_label, np.mean(bucket))
def get_cv_results(design, data, cv_splits=10):
    """Cross-validate one model per row of ``design`` and return mean losses.

    Parameters
    ----------
    design : 2-D array
        Each row is unpacked as ``(lambda_int, lambda_x)`` and passed to
        ``create_model``.
    data : tuple
        ``(test_df, unit_onehot, unit_x)``. ``test_df`` is indexed by the
        keys ``"x"``, ``"y"`` and ``'unit'``; folds are stratified on
        ``test_df['unit']``.
    cv_splits : int
        Number of stratified folds per repetition. (Previously accepted but
        ignored in favour of a hard-coded 10, which is still the default.)

    Returns
    -------
    list of float
        For each design row, the mean over 3 reshuffled repetitions of the
        mean per-fold minimum validation loss.
    """
    test_df, unit_onehot, unit_x = data
    cv_results = []
    for i in range(design.shape[0]):
        lambda_int, lambda_x = design[i, :]
        val_losses = []
        for rep in range(3):  # Almost like bootstrap. Reshuffling
            cv_val_losses = []
            skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
            for train_index, test_index in skf.split(unit_x, test_df['unit']):
                re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                        .01, .0001, .92)
                X_train = [test_df["x"][train_index], unit_onehot[train_index],
                           unit_x[train_index]]
                X_test = [test_df["x"][test_index], unit_onehot[test_index],
                          unit_x[test_index]]
                y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
                # ``callbacks`` is expected to be defined at module level.
                h = re_model.fit(X_train, y_train,
                                 epochs=15000, batch_size=450,
                                 validation_data=(X_test, y_test),
                                 callbacks=callbacks, verbose=0)
                cv_val_losses.append(np.min(h.history['val_loss']))
            val_losses.append(np.mean(cv_val_losses))
        cv_results.append(np.mean(val_losses))
    return cv_results
def classify(X,y, clf,**para):
    """Fit ``clf(**para)`` on each stratified fold and print its accuracy.

    Parameters
    ----------
    X, y : arrays
        Samples and labels; both are indexed with integer index arrays.
    clf : callable
        Classifier class/factory; instantiated once as ``clf(**para)`` and
        re-fitted on every fold.
    **para
        Keyword arguments for ``clf``.

    Returns
    -------
    (classifier, y_test, y_pred)
        The classifier as fitted on the last fold, together with the true
        and predicted labels of that last fold's test part.
    """
    n_splits = 6
    skf = StratifiedKFold(n_splits=n_splits)
    classifier = clf(**para)
    name = str(classifier).split("(")[0]
    print("{0} has been established with {1}".format(name, para))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        # Report the fold count actually used (the message used to claim
        # 10 folds while 6 were run).
        print("{0}-fold Score is: {1}".format(n_splits, score))
    return classifier,y_test, y_pred
def test_grid_search_correct_score_results():
    """GridSearchCV must report the same split scores as a manual CV loop."""
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        present_keys = list(results.keys())
        split_keys = tuple("split%d_test_score" % cv_i
                           for cv_i in range(n_splits))
        wanted_keys = ("mean_test_score", "rank_test_score") + split_keys
        assert_true(all(in1d(wanted_keys, present_keys)))

        manual_cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            reported = np.array([
                grid_search.cv_results_['split%d_test_score' % s][candidate_i]
                for s in range(n_splits)
            ])
            for fold_i, (train, test) in enumerate(manual_cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    decision = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], decision)
                assert_almost_equal(correct_score, reported[fold_i])
def split_data(self, X, y, stratified = True, bad_chess = False):
    """Write one ``data_<node>.csv`` / ``class_<node>.csv`` pair per node.

    With ``bad_chess`` the samples are cut into ``self.nodes`` consecutive
    slices of ``int(X.shape[0] / self.nodes)`` rows. Otherwise the test
    part of each fold of a ``self.nodes``-way split is written:
    ``StratifiedKFold`` when ``stratified`` is true, else a shuffled
    ``KFold`` seeded with 17. Files go under the module-level
    ``datas_path``.
    """
    def write_node(node, features, targets, data_prefix, class_prefix):
        # Build both frames first, then resolve paths, then write.
        feature_frame = pd.DataFrame(features)
        target_frame = pd.DataFrame(targets)
        data_file = datas_path.joinpath(data_prefix + str(node) + '.csv')
        class_file = datas_path.joinpath(class_prefix + str(node) + '.csv')
        feature_frame.to_csv(data_file, index = False)
        target_frame.to_csv(class_file, index = False)

    if bad_chess:
        rows_per_node = int(X.shape[0] / self.nodes)
        for node in range(self.nodes):
            lo = node * rows_per_node
            hi = lo + rows_per_node
            write_node(node, X[lo:hi], y[lo:hi], 'data_', 'class_')
        return

    if stratified:
        splitter = StratifiedKFold(n_splits = self.nodes)
    else:
        splitter = KFold(n_splits = self.nodes, shuffle = True, random_state = 17)

    for node, fold in enumerate(splitter.split(X, y)):
        held_out = fold[1]  # only the test indices of each fold are used
        write_node(node, X[held_out], y[held_out], "data_", "class_")
def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    """Produce out-of-fold and test-set class probabilities for stacking.

    Parameters
    ----------
    clf
        Estimator exposing ``fit`` and ``predict_proba``. It is re-fitted on
        every fold and finally on all of ``X_train``.
    X_train : 2-D array
        Training features, indexed as ``X_train[rows, :]``.
    y : array
        Training labels, indexed as ``y[rows]``.
    X_test : 2-D array
        Features to predict after the final fit.
    nfolds : int
        Number of stratified folds.
    random_seed : int
        Seed for fold shuffling; used only when ``shuffle`` is true.
    return_score : bool
        Also return the mean per-fold accuracy.
    shuffle : bool
        Shuffle samples before splitting.
    metric, clf_name
        Not used by the computation (kept for interface compatibility).

    Returns
    -------
    (train_proba, test_proba) or (train_proba, test_proba, score)
        For two classes only the second probability column is returned.

    Notes
    -----
    The fold accuracy compares ``y`` with ``argmax`` column positions, so it
    is meaningful only if labels are the integers ``0..n_classes-1``
    -- TODO confirm against callers.
    """
    # scikit-learn rejects a random_state when shuffle is False, so only
    # forward the seed when it actually has an effect.
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed if shuffle else None)

    n_classes = np.unique(y).shape[0]
    # out-of-fold probabilities for the training set
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0
    for i, (train_index, validate_index) in enumerate(folds.split(X_train, y)):
        X_train_fold = X_train[train_index, :]
        y_train_fold = y[train_index]
        X_validate_fold = X_train[validate_index, :]
        y_validate_fold = y[validate_index]
        clf.fit(X_train_fold, y_train_fold)
        fold_preds = clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index, :] = fold_preds
        # validation accuracy of this fold
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score = len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        score += fold_score
    score /= nfolds

    # probabilities for the test set, from a fit on all training data
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)

    if n_classes == 2:
        # binary classification: only return the positive-class probability
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]
    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
def split(dependent, independent, n_folds):
    """Yield ``(train_x, train_y, test_x, test_y)`` for each stratified fold.

    Parameters
    ----------
    dependent : array
        Passed to the splitter as the sample data (first argument of
        ``split``) and sliced into ``train_x`` / ``test_x``.
    independent : array
        Passed as the stratification labels and sliced into
        ``train_y`` / ``test_y``.
    n_folds : int
        Number of folds.

    Notes
    -----
    Folds are shuffled with the module-level ``RANDOM_STATE`` seed. A seed
    without ``shuffle=True`` has no effect and is rejected with a
    ValueError by current scikit-learn, so shuffling is enabled explicitly.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    for train_indices, test_indices in skf.split(dependent, independent):
        train_x = dependent[train_indices]
        train_y = independent[train_indices]
        test_x = dependent[test_indices]
        test_y = independent[test_indices]
        yield train_x, train_y, test_x, test_y
def test_ovr_multinomial_iris():
    """OvR and multinomial LogisticRegressionCV must be correct on iris."""
    X, y = iris.data, iris.target
    n_samples, n_features = X.shape

    # Both OvR models share the same precomputed stratified folds; the
    # stratification uses the fine-grained iris classes, i.e. before
    # classes 0 and 1 are merged.
    n_cv = 2
    shared_folds = list(StratifiedKFold(n_cv).split(X, y))

    # Model on the original targets (classes 0 and 1 kept apart).
    clf = LogisticRegressionCV(cv=shared_folds)
    clf.fit(X, y)

    # Model on targets where class 0 has been merged into class 1.
    clf1 = LogisticRegressionCV(cv=shared_folds)
    merged_y = y.copy()
    merged_y[merged_y == 0] = 1
    clf1.fit(X, merged_y)

    # What OvR learns for class 2 must not depend on whether classes 0 and
    # 1 are separated.
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Shapes of the fitted attributes.
    path_shape = (3, n_cv, 10, n_features + 1)
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, path_shape)
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, n_cv, 10))

    # On iris the multinomial model must beat OvR in training accuracy.
    slow_solvers = ['sag', 'saga']
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        max_iter = 2000 if solver in slow_solvers else 15
        tol = 1e-5 if solver in slow_solvers else 1e-2
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=tol, cv=2)
        clf_multi.fit(X, y)
        multi_score = clf_multi.score(X, y)
        ovr_score = clf.score(X, y)
        assert_greater(multi_score, ovr_score)

        # Attributes of the multinomial LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape, path_shape)
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, n_cv, 10))
def create_validation_split(self, n_folds=5, stratified=False):
    """Write the id column of each CV fold to ``cv_splits/`` (once).

    Sets ``self.folds``. If ``cv_splits/train_cv_fold_0`` does not exist,
    splits ``self.df_train`` into ``n_folds`` shuffled folds (seed 42) and
    writes ``cv_splits/train_cv_fold_<i>`` and ``cv_splits/test_cv_fold_<i>``
    containing the ``self.id_colname`` column of each part.

    Parameters
    ----------
    n_folds : int
        Number of folds.
    stratified : bool
        Stratify on ``self.target_colname`` with ``StratifiedKFold``;
        otherwise use plain ``KFold``.
    """
    self.folds = n_folds
    if not Path("cv_splits/train_cv_fold_0").is_file():
        ids = self.df_train[[self.id_colname]]
        if stratified:
            splitter = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
            fold_iter = splitter.split(ids, self.df_train[[self.target_colname]])
        else:
            splitter = KFold(n_splits=n_folds, random_state=42, shuffle=True)
            fold_iter = splitter.split(ids)

        for idx, (train_index, test_index) in enumerate(fold_iter):
            # The splitter returns row *positions*, so select with .iloc;
            # label-based .loc is only equivalent for a default RangeIndex.
            ids.iloc[train_index, :].to_csv(
                'cv_splits/train_cv_fold_{}'.format(idx), index=False)
            ids.iloc[test_index, :].to_csv(
                'cv_splits/test_cv_fold_{}'.format(idx), index=False)
    gc.collect()
def gen_folds(X, y, n_folds=5, random_state=0):
    """Return the shuffled stratified folds of ``(X, y)`` as a list.

    Each list element is a ``(train_indices, test_indices)`` tuple, so the
    same folds can be iterated more than once.
    """
    from sklearn.model_selection import StratifiedKFold

    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    # Materialise the generator; it already yields 2-tuples.
    return list(splitter.split(X, y))
def categorical_average(variable, y, pred_0, feature_name):
    """Add a noisy, smoothed per-category target average as a new feature.

    Operates on the module-level ``X_train`` / ``X_test`` DataFrames and
    writes the column ``feature_name`` into both:

    * for ``X_train`` the value is computed out-of-fold, using a 5-fold
      ``StratifiedKFold`` stratified on ``X_train['interest_level']``;
    * for ``X_test`` it is computed from all of ``X_train``.

    Also reads the module-level names ``g``, ``k``, ``f``, ``lambda_val``,
    ``r_k`` and ``exp``.

    Parameters
    ----------
    variable : str
        Name of the categorical column to encode.
    y : str
        Name of the target column (read from both frames).
    pred_0 : str
        Name of the column holding the fallback / prior value.
    feature_name : str
        Name of the column to create.

    NOTE(review): ``X_train.loc[cv_index, ...]`` uses positional fold
    indices as labels; this assumes ``X_train`` has a default integer
    index -- confirm.
    """
    def calculate_average(sub1, sub2):
        # Per-category statistics of the target on the "fit" part.
        grouped = sub1.groupby(variable, as_index = False)
        s = pd.DataFrame(data = {
                                 variable: grouped.count()[variable],
                                 'sumy': grouped.sum()['y'],
                                 'avgY': grouped.mean()['y'],
                                 'cnt': grouped.count()['y']
                                 })

        tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable)
        del tmp['index']
        # Categories unseen in sub1 have no statistics after the left merge.
        # Compute the mask once: it was previously re-evaluated after 'cnt'
        # had been filled, so 'sumy' was never zeroed.
        unseen = pd.isnull(tmp['cnt'])
        tmp.loc[unseen, 'cnt'] = 0.0
        tmp.loc[unseen, 'sumy'] = 0.0

        def compute_beta(row):
            # Counts of 200 or more are treated as infinite, giving beta 0.
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + exp((cnt - k) / f))

        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis = 1)

        # Blend the category average with the prior according to beta.
        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                   axis = 1)

        # Fall back to the prior where no category average exists.
        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
        # Multiplicative uniform noise of relative width r_k.
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                   axis = 1)

        return tmp['adj_avg'].values

    # cv for training set
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)), X_train['interest_level'].values):
        sub = pd.DataFrame(data = {variable: X_train[variable],
                                   'y': X_train[y],
                                   'pred_0': X_train[pred_0]})

        sub1 = sub.iloc[train_index]
        sub2 = sub.iloc[cv_index]

        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)

    # for test set
    sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                'y': X_train[y],
                                'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                'y': X_test[y],
                                'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)
def stratifiedCV(X, y, n_splits = 6):
    """Yield ``(X_train, y_train, X_test, y_test)`` for each stratified fold.

    ``X`` and ``y`` must support indexing with integer index arrays.
    """
    splitter = StratifiedKFold(n_splits=n_splits)
    for fit_rows, held_rows in splitter.split(X, y):
        yield X[fit_rows], y[fit_rows], X[held_rows], y[held_rows]
def cv_stats(self):
    """Perform cross-validation for model evaluation.

    Runs a shuffled 10-fold stratified cross-validation over
    ``self._fe.X`` / ``self._fe.y`` with a fresh classifier per fold, and
    caches the result in ``self._cache`` so later calls return immediately.

    Returns
    -------
    (list, list, list, list)
        Tuple containing four lists of the same size, all in the order of
        the original samples:
            true labels
            predicted labels
            probability of the predicted label
            result of ``get_sig_features`` for each prediction
    """
    cache = self._cache
    if 'y_true' in cache:
        return cache['y_true'], cache['y_pred'], cache['y_prob'], cache['sigfeatures']

    X = self._fe.X
    y = self._fe.y
    kf = StratifiedKFold(n_splits=10, shuffle=True)
    y_true, y_pred, y_prob = [], [], []
    sigfeatures = []
    order_indices = []
    for train_index, test_index in kf.split(X, y):
        order_indices.extend(test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = self.get_new_classifier()
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        prob = clf.predict_proba(X_test)
        # Keep, for each sample, only the probability of its predicted class.
        prob = np.choose(pred, prob.T)
        for predy in pred:
            sigfeatures.append(get_sig_features(predy, clf.coef_, 20))
        y_true.extend(y_test)
        y_pred.extend(pred)
        y_prob.extend(prob)

    # Reorder the results so they match the order of the original data.
    # Each sample index occurs in exactly one test fold, so argsort yields
    # the permutation that restores it. ``sigfeatures`` is reordered with
    # the others (it used to be left in fold order).
    restore = np.argsort(order_indices)
    y_true = [y_true[i] for i in restore]
    y_pred = [y_pred[i] for i in restore]
    y_prob = [y_prob[i] for i in restore]
    sigfeatures = [sigfeatures[i] for i in restore]
    assert list(y_true) == list(y)

    # cache the results
    cache['y_true'] = y_true
    cache['y_pred'] = y_pred
    cache['y_prob'] = y_prob
    cache['sigfeatures'] = sigfeatures
    return (y_true, y_pred, y_prob, sigfeatures)
def get_cross_validated_confusion_matrix(data, label, estimator, index, nfolds=10):
    """Sum the per-fold confusion matrices of ``estimator`` over stratified CV.

    For each of ``nfolds`` stratified folds the estimator is trained with
    ``estimator.train_matrix`` on the training part, and the confusion
    matrix of its predictions on the test part (with ``labels=index``) is
    added to the running total, which starts as a zero matrix sized by the
    number of distinct values in ``label``.
    """
    n_classes = len(np.unique(label))
    splitter = StratifiedKFold(n_splits=nfolds)
    total = np.zeros((n_classes, n_classes))
    label_array = np.array(label)
    for fit_rows, held_rows in splitter.split(data, label):
        fit_data, held_data = data[fit_rows], data[held_rows]
        fit_label, held_label = label_array[fit_rows], label_array[held_rows]
        estimator.train_matrix(fit_data, fit_label)
        predicted = estimator.predict(held_data)
        total = total + confusion_matrix(held_label, predicted, labels = index)
    return total
def runCrossValidation(train, RFfile):
    """Run 5-fold stratified cross-validation of a stored classifier.

    Parameters
    ----------
    train : iterable of rows
        Each row holds the class label in column 0 followed by the
        parameter values. Rows whose first element equals 0 are dropped.
    RFfile
        Path passed to ``joblib.load`` to obtain the classifier.

    Returns
    -------
    (accuracy, precision, sensitivity, f1)
        Four arrays with one score per fold.
    """
    train_tracks = np.array([feature for feature in train if feature[0] != 0.])

    # Parameter values (all columns but the first) and class labels
    # (first column); NaNs become 0 so the classifier can handle them.
    trainArr = np.nan_to_num(train_tracks[:, 1:])
    trainRes = np.nan_to_num(train_tracks[:, 0])

    # Load the classifier.
    # NOTE: joblib.load unpickles its input; only load trusted files.
    rf = joblib.load(RFfile)

    # Stratified KFolds cross validation
    cv = StratifiedKFold(n_splits = 5)
    precision = []
    accuracy = []
    sensitivity = []
    f1 = []

    for train_index, test_index in cv.split(trainArr, trainRes):
        # Fit once per fold (the model used to be fitted twice, the first
        # time only to compute probabilities that were never used).
        rf.fit(trainArr[train_index], trainRes[train_index])
        classes = rf.predict(trainArr[test_index])
        expected = trainRes[test_index]

        precision.append(precision_score(expected, classes))
        accuracy.append(accuracy_score(expected, classes))
        sensitivity.append(recall_score(expected, classes))
        f1.append(f1_score(expected, classes))

    return np.array(accuracy), np.array(precision), np.array(sensitivity), np.array(f1)
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    For every fold the test part is written as
    ``<dataset_name>_fold_<i>.arff``; the row positions of each fold are
    also stored in ``<dataset_name>.json``. While running, warnings are
    turned into errors.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """

    import warnings
    warnings.filterwarnings('error')
    try:
        dataset_name = dataset_path.split('/')[-1].split('.')[0]

        af = load_arff(dataset_path)
        df = load_dataframe(af)

        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
        # Last column is the class; all others are features.
        fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

        fold_index = dict()

        jvm.start()
        try:
            csv_loader = Loader(classname="weka.core.converters.CSVLoader")
            arff_saver = Saver(classname='weka.core.converters.ArffSaver')

            for i, (arg_rest, arg_test) in enumerate(fold_iter):
                # tolist() yields plain Python ints; list() would keep NumPy
                # integers, which json cannot serialise.
                fold_index[i] = arg_test.tolist()

                _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

                # The splitter returns row positions, hence .iloc.
                fold_data = df.iloc[arg_test]  # type: pd.DataFrame
                fold_data.to_csv(_temp_path, sep=',', index=False)

                # Round-trip through CSV so Weka builds the ARFF structure.
                java_arff_dataset = csv_loader.load_file(_temp_path)
                java_arff_dataset.relationname = af['relation']
                java_arff_dataset.class_is_last()
                arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

                os.remove(_temp_path)

            index_path = os.path.join(output_folder, dataset_name + '.json')
            with open(index_path, 'w') as index_file:
                json.dump(fold_index, index_file, indent=2)
        finally:
            # Always shut the JVM down, even if a fold failed.
            jvm.stop()
    finally:
        # Same reset as before, but now also performed on failure so the
        # 'error' filter does not leak into the caller.
        warnings.filterwarnings('default')
def rmseCvMean(model, X, y, cv=5, random_state=41):
    """Return the mean RMSE of ``model`` over ``cv`` stratified folds.

    The model is re-fitted on each training part; each fold's RMSE is
    printed and the average over the folds is returned.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``.
    X, y : arrays
        Samples and targets; indexed with integer index arrays.
    cv : int
        Number of folds.
    random_state : int
        Seed for the fold shuffling.

    Notes
    -----
    A seed without ``shuffle=True`` has no effect and is rejected with a
    ValueError by current scikit-learn, so shuffling is enabled explicitly.

    NOTE(review): ``StratifiedKFold`` stratifies on ``y`` and therefore
    expects class-like targets; confirm that ``y`` is discrete here.
    """
    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    scr = 0
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        fold_rmse = rmse(y_test, pred)  # computed once, used twice
        scr += fold_rmse
        print('\t', fold_rmse)
    return scr/cv
def run_cv_evaluation(data, n_folds, nlu_config):
    # type: (List[rasa_nlu.training_data.Message], int, RasaNLUConfig) -> Dict[Text, List[float]]
    """Stratified cross validation on data

    :param data: list of rasa_nlu.training_data.Message objects
    :param n_folds: integer, number of cv folds
    :param nlu_config: nlu config file
    :return: dictionary with key, list structure, where each entry in list
              corresponds to the relevant result for one fold
              (keys: "Accuracy", "F1-score", "Precision")
    """
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from collections import defaultdict

    trainer = Trainer(nlu_config)
    results = defaultdict(list)

    # Folds are stratified on the intent of each example.
    y_true = [e.get("intent") for e in data]

    skf = StratifiedKFold(n_splits=n_folds, random_state=11, shuffle=True)
    logger.info("Evaluation started")
    for counter, (train_index, test_index) in enumerate(skf.split(data, y_true), start=1):

        train = [data[i] for i in train_index]
        test = [data[i] for i in test_index]

        logger.debug("Fold: {}".format(counter))
        logger.debug("Training ...")
        trainer.train(TrainingData(training_examples=train))
        model_directory = trainer.persist("projects/")  # Returns the directory the model is stored in

        logger.debug("Evaluation ...")
        interpreter = Interpreter.load(model_directory, nlu_config)
        test_y = [e.get("intent") for e in test]

        preds = []
        for e in test:
            res = interpreter.parse(e.text)
            if res.get('intent'):
                preds.append(res['intent'].get('name'))
            else:
                preds.append(None)

        # compute fold metrics; every metric is appended so that each key
        # holds one value per fold (precision used to be overwritten).
        results["Accuracy"].append(metrics.accuracy_score(test_y, preds))
        results["F1-score"].append(metrics.f1_score(test_y, preds, average='weighted'))
        results["Precision"].append(metrics.precision_score(test_y, preds, average='weighted'))

    return dict(results)
def cross_validation(sgd_clf, x_train, y_train):
    """Print the accuracy of a clone of ``sgd_clf`` on each of 5 stratified folds.

    Parameters
    ----------
    sgd_clf
        Estimator to evaluate; a fresh ``clone`` is fitted per fold, so the
        passed object itself is not fitted.
    x_train, y_train : arrays
        Samples and labels; indexed with integer index arrays.

    Notes
    -----
    Folds are shuffled with seed 42. A seed without ``shuffle=True`` has no
    effect and is rejected with a ValueError by current scikit-learn, so
    shuffling is enabled explicitly.
    """
    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in skfolds.split(x_train, y_train):
        clone_clf = clone(sgd_clf)
        x_train_folds = x_train[train_index]
        y_train_folds = y_train[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        # Fraction of held-out samples predicted correctly.
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
def run_cv_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001, power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9, beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
    """Cross-validate a StandardScaler + MLPClassifier pipeline.

    The pipeline is fitted on the training part of each of ``self.cv``
    stratified folds of ``self.x_train`` / ``self.y_train``. For each fold
    two scores are recorded: the score on that fold's held-out part
    (reported as "training score") and the score on
    ``self.x_test`` / ``self.y_test`` (reported as "test score").

    All hyper-parameter arguments are forwarded to ``MLPClassifier``.

    Parameters
    ----------
    do_plot : bool
        Call the private learning-curve plot helper afterwards.

    Returns
    -------
    (train_score, test_score)
        Means of the per-fold scores described above.
    """
    # use k-fold cross validation

    # standardize the data before it reaches the learner
    pipe_clf = Pipeline([ ('scl', StandardScaler() ),
                          ('clf', MLPClassifier(alpha=alpha,
                                                batch_size=batch_size,
                                                learning_rate_init=learning_rate_init,
                                                power_t=power_t,
                                                max_iter=max_iter,
                                                momentum=momentum,
                                                beta_1=beta_1,
                                                beta_2=beta_2,
                                                hidden_layer_sizes=hidden_layer_sizes))])

    # resample the test data without replacement. This means that each data point is part of a test a
    # training set only once. (paraphrased from Raschka p.176). In Stratified KFold, the classes are
    # evenly distributed such that each test and training set is an accurate representation of the whole
    #
    # The splitter that provides ``split(X, y)`` takes ``n_splits`` (not
    # ``n_folds``), and only accepts a seed together with shuffle=True.
    skf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=0)

    # do the cross validation
    train_scores = []
    test_scores = []
    for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)):

        # run the learning algorithm
        pipe_clf.fit(self.x_train[train], self.y_train[train])
        train_score = pipe_clf.score(self.x_train[test], self.y_train[test])
        train_scores.append(train_score)
        test_score = pipe_clf.score(self.x_test, self.y_test)
        test_scores.append(test_score)
        print('Fold:', k+1, ', Training score:', train_score, ', Test score:', test_score)

    train_score = np.mean(train_scores)
    print('Training score is', train_score)

    test_score = np.mean(test_scores)
    print('Test score is', test_score)

    if do_plot:
        self.__plot_learning_curve(pipe_clf)

    return train_score, test_score
def evaluate_classifier(clf, features, labels):
    """
    Evaluates the classifier using StratifiedKFold cross validation.
    The precision and recall scores are used to evaluate the algorithm's performance.
    The classifier, the number of folds and the two average scores are printed.

    clf = classifier
    features = features list as returned by the targetFeatureSplit script
    labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import StratifiedKFold

    ### Use StratifiedKFold cross validation with 10 folds.
    ### A seed is only accepted (and only has an effect) with shuffle=True.
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

    precision = []
    recall = []
    count = 0

    ### Split the features and labels into training and testing sets.
    ### The inputs are plain lists, so they are indexed element by element.
    for train_index, test_index in skf.split(features, labels):
        features_train = [features[i] for i in train_index]
        labels_train = [labels[i] for i in train_index]
        features_test = [features[j] for j in test_index]
        labels_test = [labels[j] for j in test_index]

        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        precision.append(precision_score(labels_test, pred))
        recall.append(recall_score(labels_test, pred))
        count += 1

    # print() calls: the Python 2 print statements used here before are
    # syntax errors under Python 3.
    print(clf)
    print("Folds:", count)
    print("Average Precision:", sum(precision) / count)
    print("Average Recall:", sum(recall) / count)
    print("")
def CrossVal(estimator, X, y,procsessor=None,cv=3,times=10,random_state=0,imb=False):
    """
    Repeated stratified cross-validation.

    estimator: the model
    X: feature part of the data set
    y: labels of the data set
    procsessor: optional preprocessor (in effect a feature selector); it is
        fitted on each training part and applied to both parts
    cv: number of folds per repetition
    times: number of times the cross-validation is repeated
    random_state: base random seed; repetition ``t`` uses ``random_state + t``
    imb: when ``True``, the training part is resampled with
        ``RandomUnderSampler`` to balance the classes

    Returns an array with one ``Metrics.Score`` result per fold, over all
    repetitions.
    """
    collected = []
    for repeat in range(times):
        seed = random_state + repeat
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        fold_pairs = list(splitter.split(X=X, y=y))
        for train_rows, test_rows in fold_pairs:
            x_train, y_train = X[train_rows], y[train_rows]
            x_test, y_test = X[test_rows], y[test_rows]
            if imb == True:
                n, p = __lableCount(y_train)
                sampler = RandomUnderSampler(random_state=seed)
                x_train, y_train = sampler.fit_sample(x_train, y_train)
            if procsessor is not None:
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)
            estimator.fit(x_train, y_train)
            collected.append(Metrics.Score(estimator, x_test, y_test))
    return np.array(collected)
import csv

# Dump the samples and their labels, each as one CSV row. The context
# managers close the files even if writing fails.
with open('data.csv', 'w') as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(data)
with open('label.csv', 'w') as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(data_label)

# Generate an SVM classifier.
classifier = svm.SVC()

# Cross-validate on the training set.
valid_score = []
# Unshuffled folds are deterministic; ``random_state`` is only valid together
# with ``shuffle=True`` (recent scikit-learn raises a ValueError otherwise).
kfold = StratifiedKFold(n_splits=5, shuffle=False)
count = 0
# Stratify on the real training labels. The original split a hard-coded
# all-zero array of 3000 entries, which disabled stratification and silently
# assumed a training set of exactly 3000 samples.
for train_index, valid_index in kfold.split(train, train_label):
    print('<<<<<COUNT>>>>> ' + str(count))
    classifier.fit(train[train_index], train_label[train_index])
    predicted = classifier.predict(train[valid_index])
    confus = metrics.confusion_matrix(train_label[valid_index], predicted)
    # Accuracy from the diagonal of the 2x2 (binary) confusion matrix.
    acc = float(confus[0][0] + confus[1][1]) / confus.sum()
    valid_score.append(acc)
    count = count + 1
# The scores are fractions; scale them so the '%' in the message is correct.
print("valid: %.2f%% (+/- %.2f%%)" %
      (np.mean(valid_score) * 100, np.std(valid_score) * 100))
# train model, classifier.fit(data: n_samples x data size,
#                             target: n_samples x label size)
# NOTE(review): the classifier evaluated below is the one fitted on the last
# cross-validation fold; it is not refitted on the full training set.

# Test the model.
expected = test_label
predicted = classifier.predict(test)
confus = metrics.confusion_matrix(expected, predicted)
def main():
    """Run the graph-classification pipeline from the command line.

    Steps: parse the arguments, load the graphs of ``--dataset``, compute
    continuous and/or discrete WL embeddings (``--type``), turn the
    Wasserstein distance matrices into kernels ``exp(-gamma * M)`` and
    evaluate an SVM with precomputed kernels over 10 repetitions of a
    stratified 10-fold split. Without ``--crossvalidation`` only the first
    fold of every repetition is evaluated. With ``--gridsearch`` the kernel
    (iteration and gamma) and ``C`` are selected by an inner 5-fold search.

    All results are printed; nothing is returned. An invalid ``--type``
    prints ``Type error!`` and exits with status -1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str,
                        help='Provide the dataset name')
    parser.add_argument('--crossvalidation', default=False,
                        action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--gridsearch', default=False, action='store_true',
                        help='Enable grid search')
    parser.add_argument('--sinkhorn', default=False, action='store_true',
                        help='Use sinkhorn approximation')
    parser.add_argument('--h', type=int, required=False, default=2,
                        help="(Max) number of WL iterations")
    parser.add_argument('--type', type=str, default='continuous')
    args = parser.parse_args()

    dataset = args.dataset
    h = args.h
    sinkhorn = args.sinkhorn
    typ = args.type
    if typ not in ('discrete', 'continuous', 'both'):
        print('Type error!')
        # Same effect as exit(-1) without relying on the site-provided builtin.
        raise SystemExit(-1)
    print(f'Generating results for {dataset}...')

    #---------------------------------
    # Setup
    #---------------------------------
    # Make the directories for intermediate and final files.
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)
    for path in [output_path, results_path]:
        os.makedirs(path, exist_ok=True)

    #---------------------------------
    # Embeddings
    #---------------------------------
    # Load the data and generate the embeddings requested by --type.
    node_labels, node_features, adj_mat, n_nodes, edge_features, y = \
        load_continuous_graphs(dataset)
    if typ != 'discrete':
        label_sequences_continuous = compute_wl_embeddings_continuous(
            node_features, adj_mat, edge_features, n_nodes, h)
    if typ != 'continuous':
        label_sequences_discrete = compute_wl_embeddings_discrete(
            adj_mat, node_labels, h)
    print()

    #---------------------------------
    # Wasserstein & Kernel computations
    #---------------------------------
    print('Computing the Wasserstein distances...')
    if typ != 'discrete':
        wasserstein_distances_continuous = compute_wasserstein_distance(
            label_sequences_continuous, h, sinkhorn=sinkhorn, discrete=False)
    if typ != 'continuous':
        wasserstein_distances_discrete = compute_wasserstein_distance(
            label_sequences_discrete, h, sinkhorn=sinkhorn, discrete=True)

    if typ == 'discrete':
        wasserstein_distances = wasserstein_distances_discrete
    elif typ == 'continuous':
        wasserstein_distances = wasserstein_distances_continuous
    else:
        # typ == 'both': element-wise product of the two distance matrices of
        # each iteration. The loop variable must not be called ``h``: that
        # would overwrite the --h value that is still needed below.
        wasserstein_distances = [
            wasserstein_distances_continuous[it] *
            wasserstein_distances_discrete[it]
            for it in range(len(wasserstein_distances_discrete))
        ]
    print('Wasserstein distances computation done')
    print()

    # Transform to kernels; here the flags come into play.
    if args.gridsearch:
        # Gammas in exp(-gamma*M):
        gammas = np.logspace(-4, 1, num=6)
        # Iterate over the WL iterations too.
        hs = range(h)
        param_grid = [{'C': np.logspace(-3, 3, num=7)}]
    else:
        gammas = [0.001]
        hs = [h]

    # Generate the full list of kernel matrices from which to select.
    kernel_matrices = []
    kernel_params = []
    for current_h in hs:
        M = wasserstein_distances[current_h]
        for g in gammas:
            kernel_matrices.append(np.exp(-g * M))
            kernel_params.append((current_h, g))

    # Without hyperparameter search there must be exactly one kernel.
    if not args.gridsearch:
        assert len(kernel_matrices) == 1
    print('Kernel matrices computed.')
    print()

    #---------------------------------
    # Classification
    #---------------------------------
    print(
        f'Running SVMs, crossvalidation: {args.crossvalidation}, gridsearch: {args.gridsearch}.'
    )

    cv_scores = []
    for cv_time in range(10):
        # Accuracy of each cross-validation step of this repetition; the mean
        # of this list is used below.
        accuracy_scores = []
        # The splitter is not seeded, so every run draws different folds.
        cv = StratifiedKFold(n_splits=10, shuffle=True)
        # Hyperparameter logging (collected per fold, currently not saved).
        best_C = []
        best_h = []
        best_gamma = []

        for train_index, test_index in cv.split(kernel_matrices[0], y):
            # Rows select the samples to predict, columns the training
            # samples, as required for precomputed kernels.
            K_train = [K[train_index][:, train_index] for K in kernel_matrices]
            K_test = [K[test_index][:, train_index] for K in kernel_matrices]
            y_train, y_test = y[train_index], y[test_index]

            if args.gridsearch:
                gs, best_params = custom_grid_search_cv(
                    SVC(kernel='precomputed'), param_grid, K_train, y_train,
                    cv=5)
                # Store the best parameters.
                C_ = best_params['params']['C']
                h_, gamma_ = kernel_params[best_params['K_idx']]
                y_pred = gs.predict(K_test[best_params['K_idx']])
            else:
                gs = SVC(C=100, kernel='precomputed').fit(K_train[0], y_train)
                y_pred = gs.predict(K_test[0])
                h_, gamma_, C_ = h, gammas[0], 100
            best_C.append(C_)
            best_h.append(h_)
            best_gamma.append(gamma_)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            if not args.crossvalidation:
                # Single train/test split: stop after the first fold.
                break

        #---------------------------------
        # Printing and logging
        #---------------------------------
        if args.crossvalidation:
            print('Mean 10-fold accuracy {}: {:2.2f} +- {:2.2f} %'.format(
                cv_time,
                np.mean(accuracy_scores) * 100,
                np.std(accuracy_scores) * 100))
        else:
            # Scaled by 100 so the value matches the '%' unit, like the
            # other accuracy messages.
            print('Final accuracy: {:2.3f} %'.format(
                np.mean(accuracy_scores) * 100))
        cv_scores.append(np.mean(accuracy_scores))

    print('Mean 10-times 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
        np.mean(cv_scores) * 100,
        np.std(cv_scores) * 100))
def load_model_data(dataset_name,
                    k_fold=5,
                    dataset_autobalance=False,
                    print_dataset_info=True):
    '''
    Load a pickled graph dataset and split it into stratified k folds.

    :param dataset_name: name of the dataset to use
    :param k_fold: the number of folds to split the dataset
    :param dataset_autobalance: whether to balance the dataset by class
        distribution (under-sampling the majority class) if it is too skewed;
        only applied to two-class datasets in which both classes are present
    :param print_dataset_info: whether to build and print information on the
        dataset (also stored under ``dataset_features["dataset_info"]``)
    :return: ``(train_graphs, test_graphs, dataset_features)``; the first two
        are lists with one list of graphs per fold, the last one is a
        dictionary of dataset properties
    '''
    print(
        'load_data.py load_model_data(): Unserialising pickled dataset into Graph objects'
    )

    # Perform unserialisation
    graph_list, graph_labels_mapping_dict, node_labels_mapping_dict, node_label_flag, node_feature_flag =\
        unserialize_pickle(dataset_name)

    # Count the number of labels, and form a graph label list for the k-fold
    # split later.
    graph_labels = [graph.label for graph in graph_list]
    label_count_list = [0] * len(graph_labels_mapping_dict)
    for label in graph_labels:
        label_count_list[label] += 1

    # If the dataset is too imbalanced, balance it using under-sampling.
    # An empty class cannot be balanced (and would divide by zero), so the
    # step is skipped in that case.
    if (dataset_autobalance and len(label_count_list) == 2
            and min(label_count_list) > 0):
        balance_ratio = min(label_count_list[0] / label_count_list[1],
                            label_count_list[1] / label_count_list[0])
        ideal_balance_ratio = 0.5
        if balance_ratio < ideal_balance_ratio:
            print(
                "load_data.py: Dataset is too imbalanced at %s, restoring to atleast %s now."
                % (str(round(balance_ratio, 3)), str(ideal_balance_ratio)))

            # Group the graphs by class; graph_split[c] holds class c.
            graph_split = [[], []]
            for graph in graph_list:
                graph_split[graph.label].append(graph)

            if label_count_list[0] > label_count_list[1]:
                majority, minority = graph_split[0], graph_split[1]
            else:
                majority, minority = graph_split[1], graph_split[0]
            # Number of majority graphs that are kept.
            endslice = round(
                len(minority) / ideal_balance_ratio - len(minority))
            graph_list = majority[:endslice] + minority
            # Take the labels from the graphs themselves so that they always
            # stay aligned with graph_list.
            graph_labels = [graph.label for graph in graph_list]

            # Recalculate label_count_list again:
            label_count_list = [0] * len(label_count_list)
            for label in graph_labels:
                label_count_list[label] += 1

    # Set useful dataset features into a dictionary to be passed to main later
    graph_sizes_list = [graph.number_of_nodes for graph in graph_list]
    dataset_features = {}
    dataset_features['name'] = dataset_name
    dataset_features['num_class'] = len(graph_labels_mapping_dict)
    dataset_features['label_dict'] = graph_labels_mapping_dict
    dataset_features['have_node_labels'] = node_label_flag
    dataset_features['have_node_attributions'] = node_feature_flag
    dataset_features['node_dict'] = node_labels_mapping_dict
    dataset_features['feat_dim'] = len(node_labels_mapping_dict)
    dataset_features['edge_feat_dim'] = 0
    dataset_features['max_num_nodes'] = max(graph_sizes_list)
    dataset_features['avg_num_nodes'] = round(
        sum(graph_sizes_list) / len(graph_sizes_list))
    dataset_features['graph_sizes_list'] = graph_sizes_list
    if node_feature_flag == True:
        dataset_features['attr_dim'] = graph_list[0].node_features.shape[1]
    else:
        dataset_features['attr_dim'] = 0

    # If verbose on dataset features
    if print_dataset_info:
        # The last graph decides whether node labels / node features are
        # reported for the whole dataset.
        sample_graph = graph_list[-1]
        has_node_labels = sample_graph.node_labels is not None
        has_node_features = sample_graph.node_features is not None

        # Get class distribution of graphs
        inverse_graph_label_dict = {
            v: k
            for k, v in graph_labels_mapping_dict.items()
        }
        inverse_node_label_dict = {
            v: k
            for k, v in node_labels_mapping_dict.items()
        }
        class_distribution_dict = {
            inverse_graph_label_dict[i]: label_count
            for i, label_count in enumerate(label_count_list)
        }

        # Get node statistics
        unique_node_labels_count_list = []
        unique_node_features_per_graph_count_list = []
        unique_node_features_per_node_count_list = []
        node_labels_count_dict = {}

        if has_node_labels:
            for graph in graph_list:
                unique_node_labels_count_list.append(
                    len(graph.unique_node_labels))
                for node_label in graph.node_labels:
                    original_node_label = inverse_node_label_dict[node_label]
                    node_labels_count_dict[original_node_label] = \
                        node_labels_count_dict.get(original_node_label, 0) + 1

        if has_node_features:
            for graph in graph_list:
                # Column sums: a feature counts as present in the graph when
                # its sum over all nodes is positive.
                sum_node_features_in_graph = [
                    sum(x) for x in zip(*graph.node_features)
                ]
                unique_node_features_per_graph_count_list.append(
                    sum(value > 0 for value in sum_node_features_in_graph))
                for node_feature in graph.node_features:
                    unique_node_features_per_node_count_list.append(
                        sum(value > 0 for value in node_feature))

        # Get Edge statistics; each undirected edge is stored as two pairs.
        edge_count_list = [len(graph.edge_pairs) / 2 for graph in graph_list]

        # Build verbose message
        parts = [
            "==== Dataset Information ====\n",
            "== General Information == \n",
            "Number of graphs: " + str(len(graph_list)) + "\n",
            "Number of classes: " + str(dataset_features['num_class']) + "\n",
            "Class distribution: \n",
        ]
        for key in sorted(class_distribution_dict.keys()):
            parts.append('{}:{} '.format(key, class_distribution_dict[key]))
        parts.append("\n\n")
        parts.append("== Node information== \n")
        parts.append("Average number of nodes: " +
                     str(dataset_features['avg_num_nodes']) + "\n")
        parts.append("Average number of edges (undirected): " +
                     str(round(sum(edge_count_list) / len(graph_list))) + "\n")
        parts.append("Max number of nodes: " +
                     str(dataset_features['max_num_nodes']) + "\n")

        if has_node_labels:
            parts.append("Number of distinct node labels: " +
                         str(len(node_labels_count_dict)) + "\n")
            parts.append(
                "Average number of distinct node labels: " +
                str(round(sum(unique_node_labels_count_list) /
                          len(graph_list))) + "\n")
            parts.append("Node labels distribution: " + "\n")
            for node_label in sorted(node_labels_count_dict.keys()):
                parts.append('{}:{} '.format(
                    node_label, node_labels_count_dict[node_label]))

        if has_node_features:
            parts.append(
                "Average number of distinct node features per graph: " +
                str(round(sum(unique_node_features_per_graph_count_list) /
                          len(graph_list))) + "\n")
            parts.append(
                "Average number of distinct node features per node: " +
                str(round(sum(unique_node_features_per_node_count_list) /
                          len(unique_node_features_per_node_count_list))) +
                "\n")

        parts.append("\n")
        dataset_features_string = "".join(parts)
        dataset_features["dataset_info"] = dataset_features_string
        print(dataset_features_string)

    # Use stratified KFold sampling for the train/test split. The splitter is
    # not seeded, so the folds differ between calls.
    stratified_KFold = StratifiedKFold(n_splits=k_fold,
                                       shuffle=True,
                                       random_state=None)
    train_graphs = []
    test_graphs = []
    for train_index, test_index in stratified_KFold.split(
            graph_list, graph_labels):
        train_graphs.append([graph_list[idx] for idx in train_index])
        test_graphs.append([graph_list[idx] for idx in test_index])

    return train_graphs, test_graphs, dataset_features
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """
    Train LightGBM with KFold or Stratified KFold cross validation.

    Rows with a ``TARGET`` value are used for training and validation, rows
    without one are the test set. Every fold model contributes out-of-fold
    predictions for its validation rows and an equal share of the averaged
    test prediction.

    Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

    :param df: data frame holding the features, ``TARGET`` and the id columns
    :param num_folds: number of cross-validation folds
    :param stratified: use ``StratifiedKFold`` instead of ``KFold``
    :param debug: when true, no submission file is written
    :return: data frame with the feature importances of every fold
    """
    # Divide in training / validation and testing data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    # Drop the local reference to the full frame before training.
    del df
    gc.collect()

    # Cross validation model
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=1001)

    # Arrays and data frame that collect the results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    excluded = ['TARGET', 'SK_ID_CURR', 'SK_ID_PREV', 'index']
    feats = [col for col in train_df.columns if col not in excluded]

    fold_indices = folds.split(train_df[feats], train_df['TARGET'])
    for fold_no, (train_idx, valid_idx) in enumerate(fold_indices, start=1):
        train_x = train_df[feats].iloc[train_idx]
        train_y = train_df['TARGET'].iloc[train_idx]
        valid_x = train_df[feats].iloc[valid_idx]
        valid_y = train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)

        # Probability of the last class, taken at the best iteration.
        best_iter = clf.best_iteration_
        valid_proba = clf.predict_proba(valid_x, num_iteration=best_iter)
        oof_preds[valid_idx] = valid_proba[:, -1]
        test_proba = clf.predict_proba(test_df[feats], num_iteration=best_iter)
        sub_preds += test_proba[:, -1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = clf.feature_importances_
        fold_importance_df['fold'] = fold_no
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df])

        fold_auc = roc_auc_score(valid_y, oof_preds[valid_idx])
        print('Fold %2d AUC : %.6f' % (fold_no, fold_auc))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print("Full AUC score %.6f" % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        submission_path = os.path.join('./submission/', submission_file_name)
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_path, index=False)

    # Display feature importance
    display_importances(feature_importance_df)
    return feature_importance_df
def _train_on_split(model_cls, model_name, model_config, tokenizer,
                    training_args, train_df, val_df, device):
    """Tokenize one train/validation split, build a model and run the Trainer."""
    tokenized_train = tokenized_dataset(train_df, tokenizer)
    tokenized_val = tokenized_dataset(val_df, tokenizer)

    RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
    RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

    # A fresh model is created for every split.
    model = model_cls.from_pretrained(model_name, config=model_config)
    model.to(device)

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )
    # train model
    trainer.train()


def train(cfg):
    """Fine-tune an ELECTRA sequence classifier as described by ``cfg``.

    The settings are read from ``cfg.values``: ``seed``, ``model_name``,
    ``tsvfile``, ``train_args`` (forwarded to ``TrainingArguments``) and
    ``val_args``. With ``val_args.use_kfold`` a stratified k-fold split with
    ``val_args.num_k`` folds is trained fold by fold (only the first fold when
    ``val_args.fold_break`` is set); otherwise a single
    ``train_test_split`` with ``val_args.test_size`` is used.

    Nothing is returned.
    """
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TSVFILE = cfg.values.tsvfile

    seed_everything(SEED)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model_config = ElectraConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/" + TSVFILE)
    whole_label = whole_df['label'].values

    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

    train_args = cfg.values.train_args
    training_args = TrainingArguments(
        output_dir=train_args.output_dir,  # output directory
        save_total_limit=train_args.save_total_limit,  # number of total save model.
        save_steps=train_args.save_steps,  # model saving step.
        num_train_epochs=train_args.num_epochs,  # total number of training epochs
        learning_rate=train_args.lr,  # learning_rate
        fp16=True,
        per_device_train_batch_size=train_args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=train_args.eval_batch_size,  # batch size for evaluation
        warmup_steps=train_args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=train_args.weight_decay,  # strength of weight decay
        logging_dir=train_args.logging_dir,  # directory for storing logs
        logging_steps=train_args.logging_steps,  # log saving step.
        # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        evaluation_strategy=train_args.evaluation_strategy,
        dataloader_num_workers=4,
        label_smoothing_factor=train_args.label_smoothing_factor,
        greater_is_better=True,
        metric_for_best_model=train_args.metric_for_best_model,
        eval_steps=train_args.eval_steps,  # evaluation step.
        load_best_model_at_end=train_args.load_best_model_at_end)

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        for k, (train_idx, val_idx) in enumerate(
                kfold.split(whole_df, whole_label), start=1):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)

            # Every fold writes into its own sub-directory.
            # NOTE(review): logging_dir is derived from output_dir (not from
            # train_args.logging_dir), exactly as before - confirm that this
            # is intended.
            training_args.output_dir = train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = train_args.output_dir + f'/{k}fold'

            _train_on_split(ElectraForSequenceClassification, MODEL_NAME,
                            model_config, tokenizer, training_args,
                            whole_df.iloc[train_idx], whole_df.iloc[val_idx],
                            device)

            if cfg.values.val_args.fold_break:
                break
    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)

        train_df, val_df = train_test_split(
            whole_df,
            test_size=cfg.values.val_args.test_size,
            random_state=SEED)

        # This branch keeps using the Auto* model class, as before.
        _train_on_split(AutoModelForSequenceClassification, MODEL_NAME,
                        model_config, tokenizer, training_args, train_df,
                        val_df, device)
def save_validationlist(root='.'):
    """Assign every input file to a cross-validation fold and save the lists.

    The files of the data directory below ``root`` are collected, files
    matching ``*_x_x`` are excluded, and the remaining ones are split into 10
    stratified folds on the ``left_label`` column of the table returned by
    ``prepare_metatable``. The metadata table, the file name lists (total and
    per fold) and the label frequency tables are written as CSV files to
    ``<root>/inputlist``; the statistics are also logged.

    :param root: directory that contains the data directory and receives the
        ``inputlist`` output directory
    :return: None
    """
    # list up filenames of valid data
    data_dir = ["final_dcm", "final_crop"][0]
    extension = ".dcm" if data_dir == 'final_dcm' else '.jpg'

    logger.info('[' * 10 + ' ' * 20 + 'START ANALYSIS' + ' ' * 20 + ']' * 10)

    filenames = glob.glob(os.path.join(root, data_dir, "*" + extension))
    logger.info(f'No. of total datasets : {len(filenames)} patients')  # 6516

    rmfn = glob.glob(os.path.join(root, data_dir, "*_x_x" + extension))
    if rmfn:
        logger.info(' x_x.dcm :')
        logger.info(rmfn)
        # Exclude every matching file. (list.remove() with the whole list as
        # argument raised ValueError, and a single match was not handled.)
        excluded = set(rmfn)
        filenames = [fn for fn in filenames if fn not in excluded]
    logger.info(
        f'No. of valid datasets : {len(filenames)} patients (excluded x_x.dcm )'
    )  #2980 (20.10.7 ver)

    cvdf = prepare_metatable(filenames)

    n_folds = 10
    plen = len(filenames)
    logger.info(f'----- Split patients for {n_folds} Cross-validation')

    skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
    for ii, (train_pindex, test_pindex) in enumerate(
            skf.split(range(plen), cvdf['left_label'])):
        # The splitter yields positions; translate them to index labels and
        # use .loc, because .at only supports scalar access.
        test_rows = cvdf.index[test_pindex]
        # record fold index
        cvdf.loc[test_rows, 'FOLD'] = ii
        cvdf[f'FOLD{ii}_testset'] = 0
        cvdf.loc[test_rows, f'FOLD{ii}_testset'] = 1

    # save metadata
    filelist_dir = os.path.join(root, 'inputlist')
    os.makedirs(filelist_dir, exist_ok=True)

    cvdf.to_csv(os.path.join(filelist_dir, "input_metadata_table.csv"),
                index=False)
    cvdf[['index', 'filename']].to_csv(os.path.join(
        filelist_dir, "input_filenames_total.csv"),
                                       index=False)
    for i in range(n_folds):
        cvdf.loc[cvdf[f'FOLD{i}_testset'] == 1, 'filename'].to_csv(
            os.path.join(filelist_dir, f"input_filenames_fold{i}.csv"),
            index=False)

    # statistics
    # The counts are logged by the next call; passing them as an extra
    # positional argument without a placeholder made logging fail to format
    # the record.
    logger.info('----- Data statistics by fold')
    logger.info(cvdf['FOLD'].value_counts())

    labelfreq_left = pd.crosstab(cvdf['FOLD'],
                                 cvdf['left_label'],
                                 margins=True)
    labelfreq_left_ratio = pd.crosstab(cvdf['FOLD'],
                                       cvdf['left_label'],
                                       margins=True,
                                       normalize='index')
    labelfreq_right = pd.crosstab(cvdf['FOLD'],
                                  cvdf['right_label'],
                                  margins=True)
    labelfreq_right_ratio = pd.crosstab(cvdf['FOLD'],
                                        cvdf['right_label'],
                                        margins=True,
                                        normalize='index')

    labelfreq = pd.concat([labelfreq_left, labelfreq_right],
                          axis=1,
                          keys=['left_sinus', 'right_sinus'],
                          names=[' ', 'label'])
    labelfreq_ratio = pd.concat([labelfreq_left_ratio, labelfreq_right_ratio],
                                axis=1,
                                keys=['left_sinus', 'right_sinus'],
                                names=[' ', 'label (ratio)'])

    labelfreq.to_csv(os.path.join(filelist_dir, "label_freq_byfold.csv"))
    labelfreq_ratio.to_csv(os.path.join(filelist_dir,
                                        "label_freq_ratio_byfold.csv"),
                           float_format='%.2f')

    logger.info('----- Label frequency by fold')
    logger.info(labelfreq)
    logger.info('----- Label frequency (ratio) by fold')
    logger.info(labelfreq_ratio)
# Put the reduced features and the labels into one frame and show a pair
# plot coloured by label.
df = pd.DataFrame(data=X_reduced, columns=features_name)
df["label"] = y
g = sns.PairGrid(df, hue='label')
g.map(sns.scatterplot)
plt.show()

# Every column except the last one ("label") is a feature.
x_data = df.iloc[:, :-1]
y_data = df["label"]

C = 1.0  # SVM regularization parameter
kf = StratifiedKFold(n_splits=20, shuffle=True)

# One linear SVM is fitted per fold; all of them are kept together with
# their held-out accuracy.
clfs, scores = [], []
for i, (train_index, test_index) in enumerate(kf.split(x_data, y_data)):
    # train_index rows are used for fitting, test_index rows for scoring.
    X_train = x_data.iloc[train_index]
    X_test = x_data.iloc[test_index]
    Y_train = y_data.iloc[train_index]
    Y_test = y_data.iloc[test_index]

    clf = svm.SVC(kernel='linear', C=C, probability=True)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    print(score)

    clfs.append(clf)
    scores.append(score)

# Index of the highest score: the last entry of the ascending argsort.
best_accuracy = np.argsort(scores)[-1]
clf = clfs[best_accuracy]
def train(train_path, tokenizer_path):
    """Train an LSTM text classifier with stratified 10-fold cross validation.

    The documents are converted to sequences by ``text2sequence``; one model
    is built and then fitted on every fold in turn. The mean validation loss
    and accuracy are printed and the model is saved below ``model/`` (best
    weights by validation loss, final weights, full model and JSON
    architecture).

    NOTE(review): the same model instance is fitted on every fold without
    being re-initialised, so later folds validate on samples the model has
    already been trained on. This is kept unchanged because the saved model
    depends on it.

    Args:
        train_path: path of the training data, forwarded to ``text2sequence``.
        tokenizer_path: path of the pickled tokenizer.

    Returns:
        None.
    """
    print('import data...')
    maxlen = 1024
    X, label, Y = text2sequence(train_path, tokenizer_path, maxlen)
    num_class = len(set(label))
    print('data import finished!')

    # NOTE(review): pickle.load can execute arbitrary code; only load
    # tokenizer files from a trusted source.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    num_words = len(tokenizer.word_index) + 1

    print('prepare training data and validation data using k_fold')
    seed = 0
    k = 10
    # Data split for the 10-fold cross validation.
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    # Class-weight experiments; currently unused (fit is called with
    # class_weight=None below).
    # cw_1 ignores the class imbalance.
    cw_1 = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}
    # cw_2: the weight of a class is (1 / 8 / frequency of the class).
    cw_2 = {
        0: 0.348709,
        1: 3.457910,
        2: 1.451396,
        3: 2.116922,
        4: 17.358700,
        5: 0.404727,
        6: 3.370635,
        7: 1.167362
    }
    # Give both weightings the same importance.
    class_weight = [cw_1, cw_2]
    # Tests on a 100-document data set showed that using no class_weight
    # worked better than using class_weight, and that using class_weight
    # worked better than using cw_2 alone.

    print('create lstm model...')
    model = Sequential()
    model.add(Embedding(num_words, 128, input_length=maxlen))
    model.add(Dropout(0.5))
    model.add(LSTM(64, recurrent_dropout=0.5))
    model.add(Dropout(0.5))
    model.add(Dense(num_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # summary() prints the table itself; wrapping it in print() added "None".
    model.summary()

    k_fold_cv_loss = []
    k_fold_cv_acc = []

    dt = datetime.now()
    d = dt.date()
    h = dt.time().hour
    m = dt.time().minute
    time_str = '{}_{}{}'.format(d, h, m)

    # Keep the weights with the best validation loss seen so far.
    mckpt = ModelCheckpoint('model/best-lstm_weights_{}.h5'.format(time_str),
                            monitor='val_loss',
                            mode='auto',
                            verbose=1,
                            save_best_only=True,
                            save_weights_only=True,
                            period=1)

    turn = 1
    # The index names must not shadow this function's own name.
    for train_idx, valid_idx in k_fold.split(X, label):
        print('the {} turn training...'.format(turn))
        turn += 1
        model.fit(X[train_idx],
                  Y[train_idx],
                  validation_data=(X[valid_idx], Y[valid_idx]),
                  class_weight=None,
                  callbacks=[mckpt],
                  verbose=2,
                  epochs=11,
                  batch_size=32)

        # Evaluate model
        loss, acc = model.evaluate(X[valid_idx],
                                   Y[valid_idx],
                                   verbose=0,
                                   batch_size=32)
        k_fold_cv_loss.append(loss)
        k_fold_cv_acc.append(acc)

    print("Model loss: {:0.6f}".format(np.mean(k_fold_cv_loss)))
    print("Model Accuracy: {:0.6f}%".format(np.mean(k_fold_cv_acc) * 100))

    # Save model
    model.save_weights('model/lstm_weights_{}.h5'.format(time_str))
    model.save('model/lstm_model_{}.h5'.format(time_str))
    with open('model/lstm_model_{}.json'.format(time_str), 'w') as outfile:
        outfile.write(model.to_json())
def main():
    """Command-line entry point: train (or reload) a CellCnn model and report.

    Parses the command line, reads the FCS samples, holds out one
    (stratified) fold for validation, fits ``CellCnn`` unless
    ``--load_results`` is given, and then optionally exports the network
    weights, the plots and the per-sample selected-cell CSV files below
    ``--outdir``.
    """
    parser = argparse.ArgumentParser()

    # IO-specific
    parser.add_argument(
        '-f', '--fcs', required=True,
        help='file specifying the FCS file names and corresponding labels')
    parser.add_argument(
        '-m', '--markers', required=True,
        help='file specifying the names of markers to be used for analysis')
    parser.add_argument('-i', '--indir', default='./',
                        help='directory where input FCS files are located')
    parser.add_argument('-o', '--outdir', default='output',
                        help='directory where output will be generated')
    parser.add_argument('-p', '--plot', action='store_true', default=True,
                        help='whether to plot results ')
    # '--plot' defaults to True with store_true, so without this switch
    # plotting could never be turned off.
    parser.add_argument('--no_plot', dest='plot', action='store_false',
                        help='do not plot results')
    parser.add_argument('--export_selected_cells', action='store_true',
                        default=False,
                        help='whether to export selected cell populations')
    parser.add_argument('--export_csv', action='store_true', default=False,
                        help='whether to export network weights as csv files')
    parser.add_argument('-l', '--load_results', action='store_true',
                        default=False,
                        help='whether to load precomputed results')

    # data preprocessing
    parser.add_argument('--train_perc', type=float, default=0.75,
                        help='percentage of samples to be used for training')
    parser.add_argument('--arcsinh', dest='arcsinh', action='store_true',
                        help='preprocess the data with arcsinh')
    parser.add_argument('--no_arcsinh', dest='arcsinh', action='store_false',
                        help='do not preprocess the data with arcsinh')
    parser.set_defaults(arcsinh=True)
    parser.add_argument('--cofactor', type=int, default=5,
                        help='cofactor for the arcsinh transform')
    parser.add_argument(
        '--scale', dest='scale', action='store_true',
        help='z-transform features (mean=0, std=1) prior to training')
    parser.add_argument(
        '--no_scale', dest='scale', action='store_false',
        help='do not z-transform features (mean=0, std=1) prior to training')
    parser.set_defaults(scale=True)
    parser.add_argument(
        '--quant_normed', action='store_true', default=False,
        help='only use this option if the input data already lies in the [0, 1] interval, e.g. after quantile normalization')

    # multi-cell input specific
    parser.add_argument('--ncell', type=int,
                        help='number of cells per multi-cell input',
                        default=200)
    parser.add_argument('--nsubset', type=int,
                        help='number of multi-cell inputs', default=1000)
    parser.add_argument(
        '--per_sample', action='store_true', default=False,
        help='whether nsubset refers to each class or each sample')
    parser.add_argument(
        '--subset_selection', choices=['random', 'outlier'], default='random',
        help='generate random or outlier-enriched multi-cell inputs')

    # neural network specific
    parser.add_argument(
        '--maxpool_percentages', nargs='+', type=float,
        help='list of choices (percentage of multi-cell input) for top-k max pooling',
        default=[0.01, 1, 5, 20, 100])
    parser.add_argument('--nfilter_choice', nargs='+', type=int,
                        help='list of choices for number of filters',
                        default=range(3, 10))
    parser.add_argument(
        '--learning_rate', type=float, default=0.005,
        help='learning rate for the Adam optimization algorithm')
    parser.add_argument('--coeff_l1', type=float, default=0,
                        help='coefficient for L1 weight regularization')
    parser.add_argument('--coeff_l2', type=float, default=0.0001,
                        help='coefficient for L2 weight regularization')
    parser.add_argument('--max_epochs', type=int, default=20,
                        help='maximum number of iterations through the data')
    parser.add_argument('--patience', type=int, default=5,
                        help='number of epochs before early stopping')

    # analysis specific
    parser.add_argument('--seed', type=int, default=1234, help='random seed')
    parser.add_argument(
        '--nrun', type=int, default=15,
        help='number of neural network configurations to try (should be >= 3)')
    parser.add_argument(
        '--regression', action='store_true', default=False,
        help='whether it is a regression problem (default is classification)')
    parser.add_argument(
        '--dendrogram_cutoff', type=float, default=.4,
        help='cutoff for hierarchical clustering of filter weights')
    parser.add_argument('--accur_thres', type=float, default=.9,
                        help='keep filters from models achieving at least this accuracy ' \
                             ' (or at least from the best 3 models)')
    parser.add_argument('-v', '--verbose', type=int, choices=[0, 1],
                        default=1, help='output verbosity')

    # plot specific
    parser.add_argument(
        '--filter_diff_thres', type=float, default=0.2,
        help='threshold that defines which filters are discriminative')
    parser.add_argument(
        '--filter_response_thres', type=float, default=0,
        help='threshold that defines the selected cell population per filter')
    parser.add_argument('--stat_test',
                        choices=[None, 'ttest', 'mannwhitneyu'],
                        help='statistical test for comparing cell population frequencies of two ' \
                             'groups of samples')
    parser.add_argument('--group_a', default='group A',
                        help='name of the first class')
    parser.add_argument('--group_b', default='group B',
                        help='name of the second class')
    parser.add_argument('--group_names', nargs='+', default=None,
                        help='list of class names')
    parser.add_argument('--tsne_ncell', type=int,
                        help='number of cells to include in t-SNE maps',
                        default=10000)
    args = parser.parse_args()

    # A validation share is derived from train_perc below; values outside
    # (0, 1) would divide by zero or give a negative fold count.
    if not 0. < args.train_perc < 1.:
        parser.error('--train_perc must lie strictly between 0 and 1')

    # read in the data
    fcs_info = np.array(pd.read_csv(args.fcs, sep=','))
    marker_names = list(pd.read_csv(args.markers, sep=',').columns)

    # if the samples have already been pre-processed via quantile normalization
    # we should not perform arcsinh transformation
    if args.quant_normed:
        args.arcsinh = False
    samples, phenotypes = get_data(args.indir, fcs_info, marker_names,
                                   args.arcsinh, args.cofactor)

    # generate training/validation sets
    np.random.seed(args.seed)
    val_perc = 1 - args.train_perc
    # round() instead of truncation: 1 / (1 - 0.95) evaluates to 19.99999...,
    # which int() alone would turn into 19 folds.  At least two folds are
    # needed by (Stratified)KFold.
    n_splits = max(2, int(round(1. / val_perc)))

    # stratified CV for classification problems
    if not args.regression:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    # simple CV for regression problems
    else:
        skf = KFold(n_splits=n_splits, shuffle=True)
    # only the first split is used: one fold becomes the validation set
    train, val = next(skf.split(np.zeros((len(phenotypes), 1)), phenotypes))
    train_samples = [samples[i] for i in train]
    valid_samples = [samples[i] for i in val]
    train_phenotypes = [phenotypes[i] for i in train]
    valid_phenotypes = [phenotypes[i] for i in val]

    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('\nSamples used for model training:')
    for i in train:
        print(fcs_info[i])
    print('\nSamples used for validation:')
    for i in val:
        print(fcs_info[i])
    print('')

    # always generate multi-cell inputs on a per-sample basis for regression
    if args.regression:
        args.per_sample = True

    results_path = os.path.join(args.outdir, 'results.pkl')
    if not args.load_results:
        # run CellCnn
        model = CellCnn(ncell=args.ncell,
                        nsubset=args.nsubset,
                        per_sample=args.per_sample,
                        subset_selection=args.subset_selection,
                        scale=args.scale,
                        quant_normed=args.quant_normed,
                        maxpool_percentages=args.maxpool_percentages,
                        nfilter_choice=args.nfilter_choice,
                        nrun=args.nrun,
                        regression=args.regression,
                        learning_rate=args.learning_rate,
                        coeff_l1=args.coeff_l1,
                        coeff_l2=args.coeff_l2,
                        max_epochs=args.max_epochs,
                        patience=args.patience,
                        dendrogram_cutoff=args.dendrogram_cutoff,
                        accur_thres=args.accur_thres,
                        verbose=args.verbose)
        model.fit(train_samples=train_samples,
                  train_phenotypes=train_phenotypes,
                  valid_samples=valid_samples,
                  valid_phenotypes=valid_phenotypes,
                  outdir=args.outdir)

        # save results for subsequent analysis; pickle data is binary, so the
        # file is opened in binary mode and closed deterministically
        results = model.results
        with open(results_path, 'wb') as results_file:
            pickle.dump(results, results_file)
    else:
        with open(results_path, 'rb') as results_file:
            results = pickle.load(results_file)

    if args.export_csv:
        save_results(results, args.outdir, marker_names)

    # plot results
    if args.plot or args.export_selected_cells:
        plotdir = os.path.join(args.outdir, 'plots')
        plot_filters(results, marker_names,
                     os.path.join(plotdir, 'filter_plots'))
        _v = discriminative_filters(results,
                                    os.path.join(plotdir, 'filter_plots'),
                                    filter_diff_thres=args.filter_diff_thres,
                                    show_filters=True)
        # filter_info is consumed by the selected-cell export further down
        filter_info = plot_results(
            results, train_samples, train_phenotypes, marker_names,
            os.path.join(plotdir, 'training_plots'),
            filter_diff_thres=args.filter_diff_thres,
            filter_response_thres=args.filter_response_thres,
            stat_test=args.stat_test,
            group_a=args.group_a,
            group_b=args.group_b,
            group_names=args.group_names,
            tsne_ncell=args.tsne_ncell,
            regression=args.regression,
            show_filters=False)
        _v = plot_results(results, valid_samples, valid_phenotypes,
                          marker_names,
                          os.path.join(plotdir, 'validation_plots'),
                          filter_diff_thres=args.filter_diff_thres,
                          filter_response_thres=args.filter_response_thres,
                          stat_test=args.stat_test,
                          group_a=args.group_a,
                          group_b=args.group_b,
                          group_names=args.group_names,
                          tsne_ncell=args.tsne_ncell,
                          regression=args.regression,
                          show_filters=False)

        if args.export_selected_cells:
            csv_dir = os.path.join(args.outdir, 'selected_cells')
            mkdir_p(csv_dir)
            nfilter = len(filter_info)
            sample_names = [
                name.split('.fcs')[0] for name in list(fcs_info[:, 0])
            ]
            # for each sample
            for x, x_name in zip(samples, sample_names):
                # two columns (continuous, binary) per selected filter
                flags = np.zeros((x.shape[0], 2 * nfilter))
                columns = []
                # for each filter
                for i, (filter_idx, thres) in enumerate(filter_info):
                    flags[:, 2 * i:2 * (i + 1)] = get_selected_cells(
                        results['selected_filters'][filter_idx], x,
                        results['scaler'], thres, True)
                    columns += [
                        'filter_%d_continuous' % filter_idx,
                        'filter_%d_binary' % filter_idx
                    ]
                df = pd.DataFrame(flags, columns=columns)
                df.to_csv(os.path.join(csv_dir,
                                       x_name + '_selected_cells.csv'),
                          index=False)
if (args.debug):
    print("len(X_train) : ", len(X_train))
    print("len(y_train) : ", len(y_train))
    print("len(y_pred_val) : ", len(y_pred_val))

#===========================================
# k-fold cross-validation
#===========================================
# Evaluate with k-fold cross-validation: the training data set is split into
# a training part and a validation part for every fold.
kf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=args.seed)
y_preds = []
for fold_id, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    #--------------------
    # Split the data set (positional row selection via .iloc)
    #--------------------
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[
        valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[
        valid_index]

    #--------------------
    # Model definition (a new classifier instance per fold)
    #--------------------
    model = KerasResNetClassifier(n_channles=len(X_train.columns))

    #--------------------
    # Model training
# Features are the first nine columns; the target is the "Class" column.
X = breast_cancer.iloc[:, :9]
Y = breast_cancer["Class"]
labels = pd.unique(Y)
# Encode the two class strings as 1 / 0 (the quotes are part of the values).
Y = Y.replace("'recurrence-events'", 1)
Y = Y.replace("'no-recurrence-events'", 0)
X1 = preprocessData(X)
clf = DecisionTreeClassifier(min_samples_leaf=15)
rclf = RandomForestClassifier()
skf = StratifiedKFold(n_splits=10)
acc = []  # per-fold CART accuracy, in percent
for train_index, test_index in skf.split(X1, Y):
    #print("TRAIN:", train_index, "\nTEST:", test_index)
    X_train = X1.iloc[train_index, :]
    X_test = X1.iloc[test_index, :]
    y_train = Y.iloc[train_index]
    y_test = Y.iloc[test_index]
    clf = clf.fit(X_train, y_train)
    # NOTE(review): the random forest is fitted but never scored below.
    rclf = rclf.fit(X_train, y_train)
    acc1 = clf.score(X_test, y_test) * 100.0
    acc.append(acc1)
    #acc2 = rclf.score(X_test,y_test)
    print('The accuracy of CART was: {}'.format(acc1))
    #print('The accuracy of Random Forest was: {}'.format(acc2))
#print(clf.decision_path)
def extract_feature_siamese_lstm_manDist_char():
    """Produce out-of-fold train predictions and test predictions as a feature.

    For each of the five stratified folds the positive pairs are appended a
    second time with the two sentences swapped (for the training part and
    for the validation part alike), a model from ``create_abcnn_model`` is
    trained with early stopping and checkpointing, and the best checkpoint
    predicts the untouched validation rows.  Test predictions come from the
    fold-0 checkpoint only.  Both prediction arrays are handed to
    ``after_extract_feature_save_data``.

    Reads the module-level arrays ``X_train_s1``, ``X_train_s2``,
    ``X_test_s1``, ``X_test_s2``, ``y_train`` and ``embedding_matrix``.
    """
    feature_name = 'dl_siamese_lstm_manDist_char'
    # NOTE(review): this path is never loaded here; the model is built from
    # the module-level ``embedding_matrix`` -- confirm that is intended.
    embedding_char_matrix_file_path = 'train_all_char_embedding_matrix.pickle'
    nb_filter = 300
    filter_width = [4, 3]
    num_folds = 5

    def _with_swapped_positives(indices):
        """Return (s1, s2, y) for ``indices`` plus the swapped positive pairs."""
        first = X_train_s1[indices]
        second = X_train_s2[indices]
        targets = y_train[indices]
        is_positive = targets == 1
        return (np.vstack([first, second[is_positive]]),
                np.vstack([second, first[is_positive]]),
                np.concatenate([targets, targets[is_positive]]))

    y_train_oofp = np.zeros((len(y_train), 1), dtype='float64')

    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True,
                               random_state=44)
    for fold_num, (ix_train, ix_val) in enumerate(
            splitter.split(X_train_s1, y_train)):
        # augment the training and the validation part of this fold
        fit_s1, fit_s2, fit_y = _with_swapped_positives(ix_train)
        val_s1, val_s2, val_y = _with_swapped_positives(ix_val)

        print('start train fold {} of {} ......'.format(fold_num + 1,
                                                        num_folds))
        # build the model
        model = create_abcnn_model(embedding_matrix, nb_filter, filter_width)

        # train the model
        model_checkpoint_path = (project.trained_model_dir +
                                 'dl_abcnn_model{}.h5'.format(fold_num))
        stopper = EarlyStopping(monitor='val_loss',
                                min_delta=0.005,
                                patience=5,
                                verbose=1,
                                mode='auto')
        checkpointer = ModelCheckpoint(model_checkpoint_path,
                                       monitor='val_loss',
                                       save_best_only=True,
                                       save_weights_only=False,
                                       verbose=1)
        model.fit(x=[fit_s1, fit_s2],
                  y=fit_y,
                  validation_data=([val_s1, val_s2], val_y),
                  batch_size=512,
                  epochs=30,
                  verbose=1,
                  class_weight={
                      0: 1,
                      1: 2
                  },
                  callbacks=[stopper, checkpointer])

        # predict the original (non-augmented) validation rows with the best
        # checkpoint of this fold
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model, X_train_s1[ix_val],
                                       X_train_s2[ix_val])

        # release the session and the per-fold arrays before the next fold
        K.clear_session()
        del fit_s1, fit_s2, val_s1, val_s2, fit_y, val_y
        gc.collect()

    # test predictions use the fold-0 model only
    model0 = load_model(project.trained_model_dir + 'dl_abcnn_model0.h5',
                        custom_objects={
                            'fbeta_score': fbeta_score,
                            'precision': precision,
                            'recall': recall
                        })
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)

    col_names = ['{}_{}'.format(feature_name, index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names,
                                    feature_name)
class KNN(object):
    """Cross-validate, fit, evaluate and persist a classifier on sampled data."""

    def __init__(self, clf, train, test, lr_features, targets, cv, model_name):
        """
        the construction method
        :param clf: classifier to train (needs fit / predict / score)
        :param train: train data - dataframe
        :param test: test data - dataframe
        :param lr_features: names of the feature columns
        :param targets: name of the target column
        :param cv: number of cv folds - int
        :param model_name: name of the model, used for output file names - string
        """
        self.clf = clf
        self.train = train
        self.test = test
        self.lr_features = lr_features
        self.targets = targets
        self.cv = StratifiedKFold(n_splits=cv)
        self.model_name = model_name
        self.train_X = train.loc[:, lr_features].values
        self.train_y = train.loc[:, targets].values

    # split feature and target
    def split_X_y(self, df, X_cols, y_cols):
        """Return ``(X, y)`` as the ``.values`` of the selected columns."""
        return df.loc[:, X_cols].values, df.loc[:, y_cols].values

    # function for sampling and spliting features and target
    def get_data(self):
        """Build the sampled training arrays and the test arrays.

        All rows with target 1 are kept; from the rows with target 0 a random
        sample of ``n_pos - int(n_pos * 0.9)`` rows is added, where ``n_pos``
        is the number of target-1 rows.

        :return: train_sample_X, train_sample_y, test_X, test_y
        """
        import pandas as pd

        positives = self.train.loc[lambda df: df[self.targets] == 1, :]
        negatives = self.train.loc[lambda df: df[self.targets] == 0, :]
        sample_size = len(positives)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        train_sample = pd.concat([
            positives,
            negatives.sample(sample_size - int(sample_size * 0.9))
        ])
        # split features and target
        train_sample_X, train_sample_y = self.split_X_y(
            train_sample, self.lr_features, self.targets)
        test_X, test_y = self.split_X_y(self.test, self.lr_features,
                                        self.targets)
        return train_sample_X, train_sample_y, test_X, test_y

    def train_model(self):
        """
        train and estimate model
        draw ROC curves of the cv folds and of the test data, save the figure
        to res_fig_knn/ and the fitted model to model_knn/
        :return: train_y, train_pred_y, test_y, test_pred_y
        """
        import os

        train_sample_X, train_sample_y, test_X, test_y = self.get_data()
        # split the sampled train data into train and validation folds
        cv_data = self.cv.split(train_sample_X, train_sample_y)
        n_splits = self.cv.get_n_splits()

        tprs = []  # interpolated TP rates of each fold
        aucs = []  # AUC of each fold
        mean_fpr = np.linspace(0, 1, 100)  # common FP-rate grid
        fig, ax = plt.subplots()

        # NOTE(review): metrics.plot_roc_curve was removed in scikit-learn
        # 1.2 (successor: RocCurveDisplay.from_estimator); kept because the
        # scikit-learn version this file targets is not visible here.
        for i, (train, valid) in enumerate(cv_data):
            self.clf.fit(train_sample_X[train], train_sample_y[train])
            # plot ROC of the held-out fold
            viz = metrics.plot_roc_curve(self.clf,
                                         train_sample_X[valid],
                                         train_sample_y[valid],
                                         name='ROC fold {}'.format(i),
                                         alpha=0.3,
                                         lw=1,
                                         ax=ax)
            # np.interp replaces the removed scipy `interp` alias
            interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            aucs.append(viz.roc_auc)

        # plot ROC of test data
        metrics.plot_roc_curve(self.clf,
                               test_X,
                               test_y,
                               name='ROC test',
                               alpha=0.8,
                               lw=1,
                               color='green',
                               ax=ax)
        ax.plot([0, 1], [0, 1],
                linestyle='--',
                lw=2,
                color='r',
                label='Chance',
                alpha=.8)

        # draw mean ROC over the cv folds
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr,
                mean_tpr,
                color='b',
                label=r'Mean ROC of Train (AUC = %0.2f $\pm$ %0.2f)' %
                (mean_auc, std_auc),
                lw=2,
                alpha=.8)

        # draw the +/- 1 std band around the mean curve
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)  # upper bound
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)  # lower bound
        ax.fill_between(mean_fpr,
                        tprs_lower,
                        tprs_upper,
                        color='grey',
                        alpha=.2,
                        label=r'$\pm$ 1 std. dev.')
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="ROC Curve")
        ax.legend(loc="lower right")

        os.makedirs('res_fig_knn', exist_ok=True)
        plt.savefig('res_fig_knn/' + self.model_name + '.png')
        # release the figure; otherwise every call keeps one more open
        plt.close(fig)
        # report the real fold count instead of a hard-coded 5
        print(
            r'%d Cross Validation Mean AUC: %0.2f, Standard Deviation is %0.2f'
            % (n_splits, mean_auc, std_auc))

        # train with all sampled train data and time fit + test prediction
        start = datetime.datetime.now()
        self.clf.fit(train_sample_X, train_sample_y)
        test_pred_y = self.clf.predict(test_X)
        end = datetime.datetime.now()
        print('Fit Time:')
        print(end - start)

        os.makedirs('model_knn', exist_ok=True)
        dump(self.clf, 'model_knn/' + self.model_name + '.joblib')

        # accuracy on the full (unsampled) train data and on the test data
        train_acc = self.clf.score(self.train_X, self.train_y)
        test_acc = self.clf.score(test_X, test_y)
        print('Train Accuracy is %0.2f, Test Accuracy is %0.2f' %
              (train_acc, test_acc))
        train_pred_y = self.clf.predict(self.train_X)
        return self.train_y, train_pred_y, test_y, test_pred_y
if (kfold==1): X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, train_size=0.7, random_state=42) # save dirname2 = '%s/result_%02d' % (dirname, 0) if not os.path.exists(dirname2): os.mkdir(dirname2) df1 = pd.concat([X_train, Y_train], axis = 1) df2 = pd.concat([X_test , Y_test ], axis = 1) trfile = '%s/%s_train_%02d.csv' % (dirname2, prefix, 0) tefile = '%s/%s_test_%02d.csv' % (dirname2, prefix, 0) df1.to_csv(trfile, header=None, index=False) df2.to_csv(tefile, header=None, index=False) else: k_fold = StratifiedKFold(n_splits=kfold, random_state=42,shuffle=True) cv = k_fold.split(X_data,Y_data) t = 0 for train_index, test_index in cv: print('KFold = ', t) X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index] Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index] # save dirname2 = '%s/result_%02d' % (dirname, t) if not os.path.exists(dirname2): os.mkdir(dirname2) df1 = pd.concat([X_train, Y_train], axis = 1) df2 = pd.concat([X_test , Y_test ], axis = 1) trfile = '%s/%s_train_%02d.csv' % (dirname2, prefix, t) tefile = '%s/%s_test_%02d.csv' % (dirname2, prefix, t)
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    """Train LightGBM with K-fold CV and return per-fold feature importances.

    Rows with a non-null ``TARGET`` are the training set, rows with a null
    ``TARGET`` the test set.  Out-of-fold predictions are collected for the
    training rows; test predictions are averaged over the folds.  Unless
    ``debug`` is true, the training frame (with a ``Prediction`` column) is
    written to ``kernel02_train.csv`` and the submission to the module-level
    ``submission_file_name``.

    :param df: frame holding train and test rows, including ``TARGET``
    :param num_folds: number of CV folds
    :param stratified: use ``StratifiedKFold`` instead of ``KFold``
    :param debug: when true, skip writing the output files
    :return: frame with columns ``feature``, ``importance``, ``fold``
    """
    # Divide in training/validation and test data.  Explicit copies, because
    # columns are assigned to both frames below and assigning to a filtered
    # slice raises SettingWithCopyWarning.
    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)

    # Create arrays to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    # Collected per fold and concatenated once after the loop, instead of
    # repeatedly concatenating onto an initially empty DataFrame.
    fold_importances = []
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=8,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
        )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric= 'auc', verbose= 1000, early_stopping_rounds= 200)

        # predict with the best iteration found by early stopping
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write the out-of-fold train predictions and the submission file
    if not debug:
        train_df['Prediction'] = oof_preds
        train_df.to_csv("kernel02_train.csv", index=False)
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    return feature_importance_df
# Labels in the order of `documents`.
# NOTE(review): `featuresets` is shuffled after `y` is taken from
# `documents`; if `featuresets` was built in the same order, fold indices
# stratified on `y` no longer match the rows picked from `featuresets` --
# confirm against the code that builds `x` and `featuresets`.
y = [(category) for (rev, category) in documents]
random.shuffle(featuresets)
skf = StratifiedKFold(n_splits=10)

# blank lists to store predicted values and actual values
predicted_y = []
expected_y = []

# Create/truncate the output file.  The previous `file.close` lacked the
# call parentheses, so the handle was never closed.
open("output_kfold.txt", "w").close()

for train_index, test_index in skf.split(x, y):
    # partition data: rebuilt for every fold.  Previously the two lists were
    # created once and kept growing, so the test rows of earlier folds ended
    # up in the training data of later folds.
    training_set = [featuresets[i] for i in train_index]
    testing_set = [featuresets[i] for i in test_index]

    # Re-opened in append mode per fold (as before), but closed again
    # afterwards.  Closing twice is harmless should KF close the handle.
    with open("output_kfold.txt", "a") as file:
        file.write(str(train_index) + str(test_index))
        k_folds_f.KF(training_set, testing_set, file)
#y_train = lil_matrix(y_train).toarray() x_test = lil_matrix(x_test).toarray()''' #CountVectorizer #Take the sum of all accuracies of the 10 folds sumsvc=0 sumdtc=0 sumrfc=0 sumlr=0 t=0 skf = StratifiedKFold(n_splits=10) skf.get_n_splits(x, y) StratifiedKFold(n_splits=10, random_state=None, shuffle=True) print("SKF on count vectorizer...") for train_index, test_index in skf.split(x, y): x_train, x_test = x[train_index], x[test_index] y_train, y_test = y[train_index], y[test_index] classifiers=[ (SVC(kernel = 'rbf', random_state = 0),"SVC"), (DecisionTreeClassifier(random_state = 0),"DTC"), (LogisticRegression(),"LR"), (RandomForestClassifier(n_estimators=80, max_depth=100,random_state=0),"RFC"), ] #Accuracy scores of different models score_ , names = [] , [] for model,name in classifiers: model.fit(x_train, y_train)
y_train = np.asarray(labels)
logger.info('Number of Training Examples: {}'.format(X_train.shape))
logger.info('Number of Labels: {}'.format(y_train.shape))

# Train model on the full training data
logger.info('Training model...')
clf = MultinomialNB()
model = clf.fit(X_train, y_train)
logger.info('Training Accuracy: {}'.format(model.score(X_train, y_train)))

# K-fold Cross Validation (stratified)
# NOTE: `clf` and `model` are rebound to the per-fold classifier inside the
# loop, so after the loop they refer to the last fold's model (as before).
logger.info('Cross validating...')
skf = StratifiedKFold(n_splits=5)
test_prec_scores = []
test_rec_scores = []
for train_index, test_index in skf.split(X_train, y_train):
    # train
    X_train_val, X_test_val = X_train[train_index], X_train[test_index]
    y_train_val, y_test_val = y_train[train_index], y_train[test_index]
    clf = MultinomialNB()
    model = clf.fit(X_train_val, y_train_val)

    # test: predict once and reuse the result for both metrics
    y_pred_val = model.predict(X_test_val)
    test_prec_score = precision_score(y_test_val, y_pred_val)
    test_rec_score = recall_score(y_test_val, y_pred_val)

    # update scores
    test_prec_scores.append(test_prec_score)
    test_rec_scores.append(test_rec_score)

# Mean precision and recall
logger.info('Average Test Precision: {}'.format(np.mean(test_prec_scores)))
def KNN(distances,
        labels,
        k=1,
        metrics=('Recall', 'Precision', 'F1_Score', 'AUC')):
    """Score a k-NN classifier on a precomputed distance matrix with 5-fold CV.

    Predictions of all test folds are pooled, then every requested metric is
    computed one-vs-rest for every class.

    :param distances: square pairwise distance matrix (indexed on both axes)
    :param labels: class label of every sample
    :param k: number of neighbours
    :param metrics: metric names to compute; any of 'Recall', 'Precision',
        'F1_Score', 'AUC'
    :return: four parallel lists ``classes, metric, value, k_list`` with one
        entry per (class, metric) pair
    """
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)

    skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
    neigh = KNeighborsClassifier(n_neighbors=k,
                                 metric='precomputed',
                                 weights='distance')

    pred_list = []
    pred_prob_list = []
    labels_list = []
    for train_idx, test_idx in skf.split(distances, labels):
        # precomputed metric: rows are the query samples, columns are always
        # the training samples
        distances_train = distances[train_idx, :][:, train_idx]
        distances_test = distances[test_idx, :][:, train_idx]

        neigh.fit(distances_train, labels[train_idx])
        labels_list.extend(labels[test_idx])
        pred_list.extend(neigh.predict(distances_test))
        pred_prob_list.extend(neigh.predict_proba(distances_test))

    pred = np.asarray(pred_list)
    pred_prob = np.asarray(pred_prob_list)
    labels = np.asarray(labels_list)

    # One-hot encode with NumPy.  LabelEncoder yields the ids
    # 0..n_classes-1, so this equals the former OneHotEncoder(sparse=False)
    # output without relying on the `sparse` keyword, which current
    # scikit-learn no longer accepts.
    class_ids = np.arange(len(lb.classes_))
    labels = (labels[:, np.newaxis] == class_ids).astype(float)
    pred = (pred[:, np.newaxis] == class_ids).astype(float)

    # evaluated in this fixed order for every class
    scorers = (
        ('Recall',
         lambda ii: recall_score(y_true=labels[:, ii], y_pred=pred[:, ii])),
        ('Precision',
         lambda ii: precision_score(y_true=labels[:, ii], y_pred=pred[:, ii])),
        ('F1_Score',
         lambda ii: f1_score(y_true=labels[:, ii], y_pred=pred[:, ii])),
        ('AUC',
         lambda ii: roc_auc_score(labels[:, ii], pred_prob[:, ii])),
    )

    metric = []
    value = []
    classes = []
    k_list = []
    for ii, c in enumerate(lb.classes_):
        for metric_name, score in scorers:
            if metric_name in metrics:
                value.append(score(ii))
                metric.append(metric_name)
                classes.append(c)
                k_list.append(k)

    return classes, metric, value, k_list
elif first_scan >= threshold: if second_scan == 0: threshhold_data.append(i) starting_labels.append(malware) ending_labels.append(benign) elif second_scan >= threshold: threshhold_data.append(i) starting_labels.append(malware) ending_labels.append(malware) skf = StratifiedKFold(n_splits=splits) skf.get_n_splits(threshhold_data, ending_labels) print("stratified") for train_indexs, test_indexs in skf.split(threshhold_data, ending_labels): test_set = [] train_set = [] for i in train_indexs: train_set.append(threshhold_data[i]) for i in test_indexs: test_set.append(threshhold_data[i]) xTrain, yTrain, yExpect, day_one = build_matrix(train_set, test_set) print("built matrix") if method == 'var': for u_a in alpha_values: for l_a in alpha_values: #checking upload variant(xTrain, yTrain, l_a, l_a, u_a, u_a, kern, kNN, g=gamma)
# **Validation Strategy: Stratified KFold** # In[ ]: folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=59) # In[ ]: predicted = np.zeros((test.shape[0], 9)) measured = np.zeros((data.shape[0])) score = 0 # In[ ]: for times, (trn_idx, val_idx) in enumerate( folds.split(data.values, target['surface'].values)): model = RandomForestClassifier(n_estimators=500, n_jobs=-1) #model = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, n_jobs=-1) model.fit(data.iloc[trn_idx], target['surface'][trn_idx]) measured[val_idx] = model.predict(data.iloc[val_idx]) predicted += model.predict_proba(test) / folds.n_splits score += model.score(data.iloc[val_idx], target['surface'][val_idx]) print("Fold: {} score: {}".format( times, model.score(data.iloc[val_idx], target['surface'][val_idx]))) importances = model.feature_importances_ indices = np.argsort(importances) features = data.columns if model.score(data.iloc[val_idx], target['surface'][val_idx]) > 0.92000: hm = 30
Dense(16, activation='relu', kernel_initializer='he_uniform', use_bias=True)) model.add(Dense(1, input_shape=(16, ), activation='linear', use_bias=True)) print(model.summary()) if learn: model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae']) maes = [] for train, val in kfold.split(X_train, y_train): # Fit the model history = model.fit(X_train[train], y_train[train], epochs=150, batch_size=8, verbose=0) # evaluate the model scores = model.evaluate(X_train[val], y_train[val], verbose=1) print("%s: %.2f" % (model.metrics_names[1], scores[1])) maes.append(scores[1]) # ========== PART 3 ============ # # Code to be used only to print nice plots if plot:
lr_end = 1e-5
result = []
# learning-rate schedule
schedule = lambda epoch: LR_schedule(epoch, 5, lr_start=lr_start, lr_end=lr_end, c=c)
lr_schedule_obj = LearningRateScheduler(schedule=schedule)
# SWA callback object (presumably stochastic weight averaging; confirm
# against the SWA class)
swa_obj = SWA(swa_start, update)
# loss-history recorder
history = LossHistory()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
i = 1
for train_index, test_index in skf.split(X_all, Y_all):
    # a new model is built and compiled for every fold
    print("开始第{0}轮验证".format(i))
    model = LSTM_model()
    model.compile(optimizer=RMSprop(rho=0.9,
                                    epsilon=1e-06,
                                    clipnorm=0,
                                    clipvalue=1),
                  loss=weight_categorical_crossentropy,
                  metrics=['accuracy'])
    X_train = X_all[train_index]
    X_test = X_all[test_index]
    m = Y_all[train_index]  # integer labels of the training part
    # one-hot encode the labels into 4 classes
    Y_train = np_utils.to_categorical(Y_all[train_index], 4)
    Y_test = np_utils.to_categorical(Y_all[test_index], 4)
    # print the share of each class in the training part of this fold
    print("校验类比比例")
    print("类别1: ", len(m[m == 0]) / len(m))
    print("类别2: ", len(m[m == 1]) / len(m))
# Plain KFold: `kf` and `f1_scores` are defined before this excerpt.
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list (F1 of the '2A' class, in percent)
    f1_scores.append(f1_score(y_true = y_test, y_pred = model.predict(x_test), pos_label = '2A')*100)
# bare expression: only displays the value when run as a notebook cell
f1_scores

#StratifiedKFold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    # here the frame is converted to an array and indexed by position
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores.append(f1_score(y_true = y_test, y_pred = model.predict(x_test), pos_label = '2A')*100)
f1_scores

#LeaveOneOut
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
# mean macro-F1 over all leave-one-out splits, in percent
average_score = scores.mean() * 100
average_score
def train_cross_validation_model(model,
                                 X,
                                 y,
                                 output_folder,
                                 splits,
                                 resolution,
                                 batch_size=20,
                                 epochs=50):
    """Train and evaluate ``model`` on every fold of a stratified K-fold split.

    For each split the training history and a classification report are
    written as ``history.csv`` / ``report.csv`` into
    ``<output_folder>/split_NNN``.

    :param model: compiled Keras model; the same instance is trained in
        every split
    :param X: array of image file paths (read with ``cv2.imread``)
    :param y: array of integer class labels
    :param output_folder: root directory for the per-split output
    :param splits: number of CV folds
    :param resolution: unused; kept for interface compatibility
    :param batch_size: batch size of the training data flow
    :param epochs: number of epochs per split
    """
    skf = StratifiedKFold(n_splits=splits, random_state=42, shuffle=True)
    print(y)

    # NOTE(review): `model` is not re-initialised between splits, so later
    # splits start from weights that were already trained on their test
    # images -- confirm whether that is intended.
    for split, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        print('Starting Split : %02d' % split)
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = to_categorical(y[train_idx]), to_categorical(
            y[test_idx])

        output_log = os.path.join(output_folder, 'split_%03d' % split)
        weights_best, logs_dir = logging_configuration(output_log)
        save_split(X_train, X_test, y_train, y_test, output_log)

        # multi-process, batched training data flow
        ds = Dataflow(X_train, y_train, size=(229, 229))
        dsm = MultiProcessRunner(ds, num_prefetch=batch_size, num_proc=5)
        ds1 = BatchData(dsm, batch_size)
        train_gen = gen(ds1)

        callbacks_list = get_callbacks(weights_best, logs_dir)
        # NOTE(review): steps_per_epoch is the number of samples although
        # the generator yields batches -- confirm.
        history = model.fit_generator(train_gen,
                                      callbacks=callbacks_list,
                                      epochs=epochs,
                                      steps_per_epoch=len(y_train))

        # NOTE(review): test images are resized to 224x224 while the
        # training flow uses 229x229 -- confirm which size the model expects.
        X_test_img = np.array(
            [cv2.resize(cv2.imread(im), (224, 224)) for im in X_test],
            dtype=np.float16)
        # Sequential.predict_classes was removed from Keras; for the
        # multi-class output used here (labels are one-hot encoded) it is
        # the argmax of the predicted probabilities.
        y_pred = np.argmax(model.predict(X_test_img), axis=1)
        test = np.argmax(y_test, axis=1)

        report = classification_report(test, y_pred, output_dict=True)
        df = pd.DataFrame(report).transpose()
        print(report)

        pd.DataFrame(history.history).to_csv(
            os.path.join(output_log, 'history.csv'))
        df.to_csv(os.path.join(output_log, 'report.csv'))
def _append_swapped_positives(s1, s2, targets, labels):
    """Append every positive pair (label == 1) a second time with s1/s2 swapped.

    Returns the augmented ``(s1, s2, targets)`` triple; the original rows come
    first, followed by the swapped positive rows.
    """
    positive_mask = labels == 1
    augmented_s1 = np.vstack([s1, s2[positive_mask]])
    augmented_s2 = np.vstack([s2, s1[positive_mask]])
    augmented_targets = np.concatenate([targets, targets[positive_mask]])
    return augmented_s1, augmented_s2, augmented_targets


def extract_feature_siamese_lstm_manDist():
    """Train one model per stratified fold and save its predictions as features.

    For each fold a model is built with ``create_siamese_lstm_ManDistance_model``,
    trained on the fold's training part (augmented with swapped positive
    pairs), and the best checkpoint is used to predict the fold's validation
    part, filling the out-of-fold prediction matrix. Test-set predictions are
    produced with the checkpoint of fold 0, and both prediction matrices are
    handed to ``after_extract_feature_save_data``.
    """
    # Basic settings
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_manDist'
    random_seed = 42
    np.random.seed(random_seed)
    nepoch = 40
    num_folds = 5
    batch_size = 512
    # One checkpoint file per fold; the fold number fills the placeholder.
    checkpoint_template = project.trained_model_dir + 'dl_siamese_lstm_manDist_model{}.h5'

    # Load the embedding matrix
    embedding_matrix = project.load(project.aux_dir + embedding_matrix_file_path)

    # Load the input data
    X_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_ids_pad.pickle')
    X_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_ids_pad.pickle')
    # y_0.6_train.pickle stores a list, hence the conversion to an array
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))

    # Model hyper-parameters
    model_param = {
        'lstm_units': 50,
        'lstm_dropout_rate': 0.,
        'lstm_re_dropout_rate': 0.,
        'desen_dropout_rate': 0.75,
        'num_dense': 128
    }

    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_seed)

    # Out-of-fold predictions for the training set (two columns per sample)
    y_train_oofp = np.zeros((len(y_train), 2), dtype='float64')
    train_y = to_categorical(y_train, 2)

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1, y_train)):
        # Augment both parts of the fold with the swapped positive pairs
        X_add_train_fold_s1, X_add_train_fold_s2, y_add_train_fold = _append_swapped_positives(
            X_train_s1[ix_train], X_train_s2[ix_train], train_y[ix_train], y_train[ix_train])
        X_add_val_fold_s1, X_add_val_fold_s2, y_add_val_fold = _append_swapped_positives(
            X_train_s1[ix_val], X_train_s2[ix_val], train_y[ix_val], y_train[ix_val])

        # Report progress against the configured number of folds
        print('start train fold {} of {} ......'.format((fold_num + 1), num_folds))

        # Build the model
        model = create_siamese_lstm_ManDistance_model(embedding_matrix, model_param)

        # Train the model, keeping only the checkpoint with the best val_loss
        model_checkpoint_path = checkpoint_template.format(fold_num)
        model.fit(x=[X_add_train_fold_s1, X_add_train_fold_s2],
                  y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1, X_add_val_fold_s2], y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  verbose=1,
                  class_weight={0: 1, 1: 2},
                  callbacks=[
                      EarlyStopping(monitor='val_loss',
                                    min_delta=0.005,
                                    patience=5,
                                    verbose=1,
                                    mode='auto'),
                      ModelCheckpoint(model_checkpoint_path,
                                      monitor='val_loss',
                                      save_best_only=True,
                                      save_weights_only=False,
                                      verbose=1)
                  ])

        # Predict the un-augmented validation part with the best checkpoint
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model, X_train_s1[ix_val], X_train_s2[ix_val])

        # Free the session and the large per-fold arrays before the next fold
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()

    # save feature
    # NOTE(review): test predictions come from the fold-0 checkpoint only, not
    # from an average over all folds - confirm this is intended.
    model_path = checkpoint_template.format(0)
    model0 = load_model(model_path,
                        custom_objects={
                            'ManDist': ManDist,
                            'fbeta_score': fbeta_score,
                            'precision': precision,
                            'recall': recall
                        })
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)
    col_names = ['{}_{}'.format(feature_name, index) for index in range(2)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names, feature_name)
# Candidate hyper-parameter values for the gradient boosting regressor.
n_estimators = [100]
max_leaf_nodes = [2, 3, 4, 6, 8, 10]
learning_rate = [0.1]
min_samples_leaf = [1]
subsample = [0.1, 0.2, 0.25, 0.5, 0.75, 1.0]

# Standardise the features, then fit the regressor (pipeline step 'gbr').
gbtree = GradientBoostingRegressor()
pipeline = Pipeline([('standardize', StandardScaler()), ('gbr', gbtree)])

# Grid keys are prefixed with the pipeline step name.
param_grid = {
    'gbr__n_estimators': n_estimators,
    'gbr__max_leaf_nodes': max_leaf_nodes,
    'gbr__learning_rate': learning_rate,
    'gbr__subsample': subsample,
    'gbr__min_samples_leaf': min_samples_leaf,
}
metrics = ['neg_mean_squared_error']

# 10 shuffled folds, stratified on the values of the third column of `df`.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
stratification_values = df.iloc[:, 2].values
grid_gbr = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=metrics,
    cv=kfold.split(X, stratification_values),
    return_train_score=True,
    refit='neg_mean_squared_error',
)
results_gbr = grid_gbr.fit(X, y)

# Save model and results
df_results = pd.DataFrame(results_gbr.cv_results_)
results_csv_path = root_path / 'results' / 'results_gbr_metrics.csv'
df_results.to_csv(results_csv_path, index=False)
best_model_path = root_path / 'models' / 'grid_gbr.pkl'
joblib.dump(grid_gbr.best_estimator_, best_model_path)
def main():
    """Prepare the data splits, build model/optimizer/scheduler and start training.

    Steps:
      1. Seed everything and pick the device.
      2. Load the train/test CSV files and derive the structured features.
      3. Hold out 5% of the samples (stratified by patient + label) as the
         validation set; samples whose patient/label combination occurs only
         once cannot be stratified and are kept aside.
      4. Partition the remaining training samples into ``args.n_folds``
         stratified subsets, stored as constructor-argument lists in a dict.
      5. Build model, optimizer, warm-up + cosine scheduler and loss, then
         call ``train``.

    Reads the module-level ``args`` namespace for all configuration.
    """
    # fix seed for train reproduction
    seed_everything(args.SEED)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("\n device", device)

    # TODO dataset loading
    train_df = pd.read_csv('/DATA/trainset-for_user.csv', header=None)
    train_df = train_df.dropna().reset_index(drop=True)
    test_df = pd.read_csv('/DATA/testset-for_user.csv', header=None)
    print('train_df shape : ', train_df.shape)

    train_df = create_str_feature(train_df)
    test_df = create_str_feature(test_df)

    # Stratification key and how often each key occurs.
    train_df['patient_label'] = train_df['patient'] + '_' + train_df['label']
    train_df['count'] = train_df['patient_label'].map(train_df['patient_label'].value_counts())
    print(train_df.head())
    print(train_df.isnull().sum())

    from sklearn.model_selection import train_test_split

    # Pair the columns positionally instead of looking rows up by index label.
    train_df['image_path'] = [
        os.path.join('/DATA', patient, image)
        for patient, image in zip(train_df['patient'], train_df['image'])
    ]
    labels = train_df['label'].map({'Wake':0, 'N1':1, 'N2':2, 'N3':3, 'REM':4}).values

    str_train_df = train_df[['time', 'user_count', 'user_max', 'user_min']].values
    str_test_df = test_df[['time', 'user_count', 'user_max', 'user_min']].values
    print('meta max value: ', str_train_df.max(), str_test_df.max(), 'meta shape: ', str_train_df.shape, str_test_df.shape)

    skf_labels = train_df['patient_label']
    # Keys that occur once cannot be split in a stratified way; they are
    # excluded from the hold-out split and re-added to selected subsets below.
    unique_idx = train_df[train_df['count']==1].index
    non_unique_idx = train_df[train_df['count']>1].index
    # NOTE(review): index labels are used below as positions into NumPy
    # arrays, which assumes `train_df` still has a 0..n-1 index here - confirm
    # `create_str_feature` does not change it.
    trn_idx, val_idx, trn_labels, val_labels = train_test_split(
        non_unique_idx,
        labels[non_unique_idx],
        test_size=0.05,
        random_state=0,
        shuffle=True,
        stratify=skf_labels[non_unique_idx])

    # valid set define
    trn_image_paths = train_df.loc[trn_idx, 'image_path'].values
    val_image_paths = train_df.loc[val_idx, 'image_path'].values

    # structured data define
    trn_str_data = str_train_df[trn_idx, :]
    val_str_data = str_train_df[val_idx, :]

    print('\n')
    # NOTE(review): the message says 8:2 but the split above uses test_size=0.05.
    print('8:2 train, valid split : ', len(trn_image_paths), len(trn_labels), len(val_image_paths), len(val_labels), trn_str_data.shape, val_str_data.shape)
    print('\n')
    print(trn_image_paths[:5], trn_labels[:5])
    print(val_image_paths[:5], val_labels[:5])

    valid_transforms = create_val_transforms(args, args.input_size)
    if args.DEBUG:
        # Truncate paths, metadata and labels together so they stay aligned.
        valid_dataset = SleepDataset(args, val_image_paths[:100], val_str_data[:100], val_labels[:100], valid_transforms, is_test=False)
    else:
        valid_dataset = SleepDataset(args, val_image_paths, val_str_data, val_labels, valid_transforms, is_test=False)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=False,
                              pin_memory=True)

    trn_skf_labels = (train_df.loc[trn_idx, 'patient'] + train_df.loc[trn_idx, 'label']).values
    print('skf labels head : ', trn_skf_labels[:5])

    if args.DEBUG:
        print('\n#################################### DEBUG MODE')
    else:
        print('\n################################### MAIN MODE')
    print(trn_image_paths.shape, trn_labels.shape, trn_skf_labels.shape)

    # train set define
    train_dataset_dict = {}
    skf = StratifiedKFold(n_splits=args.n_folds, shuffle=True, random_state=args.SEED)
    # Only the held-out part of every split is kept, which partitions the
    # training samples into n_folds disjoint stratified subsets.
    nsplits = [fold_idx for _, fold_idx in skf.split(trn_image_paths, trn_skf_labels)]
    print(nsplits)
    # (The subsets could be cached with np.save('nsplits.npy', nsplits) and
    # restored with np.load('nsplits.npy', allow_pickle=True).)

    for idx, fold_idx in enumerate(nsplits):
        sub_img_paths = np.array(trn_image_paths)[fold_idx]
        sub_labels = np.array(trn_labels)[fold_idx]
        sub_meta = np.array(trn_str_data)[fold_idx]
        if args.DEBUG:
            sub_img_paths = sub_img_paths[:200]
            sub_labels = sub_labels[:200]
            sub_meta = sub_meta[:200]
        # NOTE(review): the singleton samples are appended to subsets 1 and 6
        # only (hard-coded); subset 6 exists only when n_folds > 6 - confirm.
        if idx in (1, 6):
            sub_img_paths = np.concatenate([sub_img_paths, train_df.loc[unique_idx, 'image_path'].values])
            sub_labels = np.concatenate([sub_labels, labels[unique_idx]])
            sub_meta = np.concatenate([sub_meta, str_train_df[unique_idx]])

        train_transforms = create_train_transforms(args, args.input_size)
        # Store the constructor arguments rather than a dataset instance.
        train_dataset_dict[idx] = [args, sub_img_paths, sub_meta, sub_labels, train_transforms]
        print(f'train dataset complete {idx}/{args.n_folds}, ')

    print("numberr of train datasets: ", len(train_dataset_dict))

    # define model
    model = build_model(args, device)

    # optimizer definition
    optimizer = build_optimizer(args, model)
    # One warm-up epoch followed by cosine annealing with T_max=9.
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 9)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=1, after_scheduler=scheduler_cosine)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(classes=args.num_classes, smoothing=args.label_smoothing_ratio)
    else:
        criterion = nn.CrossEntropyLoss()

    trn_cfg = {'train_datasets':train_dataset_dict,
               'valid_loader':valid_loader,
               'model':model,
               'criterion':criterion,
               'optimizer':optimizer,
               'scheduler':scheduler,
               'device':device,
               'fold_num':0,
               }

    train(args, trn_cfg)
# In[180]:

# Fit the random forest on the full training predictors/response.
randomForestModel.fit(train_predictor,train_response)

# In[102]:

#K-Fold cross validation
# For each of the 10 stratified folds: refit the model on the training part,
# then collect the train/test AUC and the test ROC curve points.
cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs = []
tprs = []
scores = []

for i, (train, test) in enumerate(cv.split(train_dataset, train_response)):
    fold_predictors = train_dataset.iloc[train]
    fold_response = train_response.iloc[train]
    randomForestModel.fit(fold_predictors, fold_response)

    # Only the AUC is needed for the training part of the fold.
    auc_score_train = compute_roc_auc(train)[2]
    fpr, tpr, auc_score = compute_roc_auc(test)

    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

# In[ ]:

#fit chaid analysis
model = Model([input_layer, input2], output_coverage) return model preds = [] ids = [] preds_test = [] ids_test = [] skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) model_count = 0 data = np.array(train_df.images.map(upsample).tolist()).reshape( -1, img_size_target, img_size_target, 1) labels = train_df.coverage is_train = True for train_idx, val_idx in skf.split(data, train_df.coverage_class): if is_train and model_count != int(sys.argv[1]): model_count += 1 continue model = build_model(10) model.compile(loss='mse', optimizer="adam", metrics=["accuracy", "mse"]) ids_train, ids_valid,x_train, x_valid, y_train, y_valid, depth_train, depth_test = \ train_df.index.values[train_idx],train_df.index.values[val_idx], \ data[train_idx], data[val_idx], \ labels[train_idx], labels[val_idx], \ train_df.z.values[train_idx],train_df.z.values[val_idx] depth_train = np.array(map(lambda x: math.log(x + 1, 10), depth_train)) depth_test = np.array(map(lambda x: math.log(x + 1, 10), depth_test)) sample_weight = [1.0] * len(x_train) x_train_org = x_train.copy() y_train_org = y_train.copy()