def test_ridge_gcv_sample_weights( gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise): alphas = [1e-3, .1, 1., 10., 1e3] rng = np.random.RandomState(0) n_targets = y_shape[-1] if len(y_shape) == 2 else 1 X, y = _make_sparse_offset_regression( n_samples=11, n_features=n_features, n_targets=n_targets, random_state=0, shuffle=False, noise=noise) y = y.reshape(y_shape) sample_weight = 3 * rng.randn(len(X)) sample_weight = (sample_weight - sample_weight.min() + 1).astype(int) indices = np.repeat(np.arange(X.shape[0]), sample_weight) sample_weight = sample_weight.astype(float) X_tiled, y_tiled = X[indices], y[indices] cv = GroupKFold(n_splits=X.shape[0]) splits = cv.split(X_tiled, y_tiled, groups=indices) kfold = RidgeCV( alphas=alphas, cv=splits, scoring='neg_mean_squared_error', fit_intercept=fit_intercept) # ignore warning from GridSearchCV: DeprecationWarning: The default of the # `iid` parameter will change from True to False in version 0.22 and will # be removed in 0.24 with ignore_warnings(category=DeprecationWarning): kfold.fit(X_tiled, y_tiled) ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept) splits = cv.split(X_tiled, y_tiled, groups=indices) predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits) kfold_errors = (y_tiled - predictions)**2 kfold_errors = [ np.sum(kfold_errors[indices == i], axis=0) for i in np.arange(X.shape[0])] kfold_errors = np.asarray(kfold_errors) X_gcv = X_constructor(X) gcv_ridge = RidgeCV( alphas=alphas, store_cv_values=True, gcv_mode=gcv_mode, fit_intercept=fit_intercept) gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight) if len(y_shape) == 2: gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)] else: gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)] assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_) assert_allclose(gcv_errors, kfold_errors, rtol=1e-3) assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3) assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
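# The test above leans on the fact that, for integer weights, fitting with
# sample_weight is equivalent to fitting on data where each row is repeated
# `weight` times. A minimal, self-contained sketch of that equivalence (toy
# data and plain Ridge rather than RidgeCV; all names below are illustrative):
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(20)
weights = rng.randint(1, 4, size=20)

# Fit once with integer sample weights ...
ridge_w = Ridge(alpha=1.0).fit(X, y, sample_weight=weights)

# ... and once on data where each row is repeated `weight` times.
idx = np.repeat(np.arange(len(X)), weights)
ridge_tiled = Ridge(alpha=1.0).fit(X[idx], y[idx])

np.testing.assert_allclose(ridge_w.coef_, ridge_tiled.coef_, rtol=1e-6, atol=1e-8)
np.testing.assert_allclose(ridge_w.intercept_, ridge_tiled.intercept_, rtol=1e-6, atol=1e-8)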
def _get_mse_profiling(self, x, y, alphas=None):
    """Calculate prediction RMSE.

    Use GroupKFold where a group is a combination of input size and number
    of workers. The prediction of a group is done when it is out of the
    training set.
    """
    # Training set is 2/3 of the data
    groups = self._groups.loc[x.index]
    cv = GroupKFold(n_splits=3)
    preds = None
    for train_ix, test_ix in cv.split(x, groups=groups):
        # train_ix and test_ix start from 0, so we use iloc
        x_train, y_train = x.iloc[train_ix], y.iloc[train_ix]
        x_test = x.iloc[test_ix]
        # Choose best alpha value for regularization based on training set
        lm = self._choose_alpha(x_train, y_train, alphas)
        lm.fit(x_train, y_train)
        pred = pd.DataFrame(lm.predict(x_test), index=test_ix)
        # DataFrame.append was removed in recent pandas; concatenate instead
        preds = pred if preds is None else pd.concat(
            [preds, pred], verify_integrity=True)
    return self._calc_mse(y, preds.sort_index())
def fit_predict(self,x_train,y_train, x_predict): """Use local regression to predict values for unknown data. Arguments: x_train = The training data spectra. y_train = The values of the quantity being predicted for the training data x_predict = The unknown spectra for which y needs to be predicted. """ self.neighbors.fit(x_train) predictions = [] coeffs = [] intercepts = [] for i in range(x_predict.shape[0]): print('Predicting spectrum ' + str(i + 1)) x_temp = np.array(x_predict[i]) foo, ind = self.neighbors.kneighbors([x_temp]) x_train_local = np.squeeze(x_train[ind]) y_train_local = np.squeeze(y_train[ind]) cv = GroupKFold(n_splits=3) cv = cv.split(x_train_local, y_train_local, groups=y_train_local) self.model.fit(x_train_local, y_train_local) predictions.append(self.model.predict([x_temp])[0]) coeffs.append(self.model.coef_) intercepts.append(self.model.intercept_) return predictions, coeffs, intercepts
def _split(self, x, y):
    cv = GroupKFold(n_splits=3)
    groups = self._groups.loc[x.index]
    for train_ix, test_ix in cv.split(x, groups=groups):
        # train_ix and test_ix start from 0, so we use iloc
        x_train = x.iloc[train_ix]
        y_train = y.iloc[train_ix]
        x_test = x.iloc[test_ix]
        yield x_train, y_train, x_test, test_ix
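# A self-contained sketch of the same generator pattern, with toy stand-ins
# for the instance attributes (self._groups becomes a plain Series here):
import pandas as pd
from sklearn.model_selection import GroupKFold

def split_by_group(x, y, groups, n_splits=3):
    cv = GroupKFold(n_splits=n_splits)
    for train_ix, test_ix in cv.split(x, groups=groups):
        # split() yields positional indices, hence .iloc
        yield x.iloc[train_ix], y.iloc[train_ix], x.iloc[test_ix], test_ix

x = pd.DataFrame({"size": range(9), "workers": [1, 1, 1, 2, 2, 2, 4, 4, 4]})
y = pd.Series(range(9))
groups = pd.Series([0, 0, 0, 1, 1, 1, 2, 2, 2])

for x_train, y_train, x_test, test_ix in split_by_group(x, y, groups):
    print(len(x_train), len(x_test))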
def plot_group_kfold(): from sklearn.model_selection import GroupKFold groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3] plt.figure(figsize=(10, 2)) plt.title("GroupKFold") axes = plt.gca() axes.set_frame_on(False) n_folds = 12 n_samples = 12 n_iter = 3 n_samples_per_fold = 1 cv = GroupKFold(n_splits=3) mask = np.zeros((n_iter, n_samples)) for i, (train, test) in enumerate(cv.split(range(12), groups=groups)): mask[i, train] = 1 mask[i, test] = 2 for i in range(n_folds): # test is grey colors = ["grey" if x == 2 else "white" for x in mask[:, i]] # not selected has no hatch boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//", edgecolor="k", align='edge') for j in np.where(mask[:, i] == 0)[0]: boxes[j].set_hatch("") axes.barh(bottom=[n_iter] * n_folds, width=[1 - 0.1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, color="w", edgecolor='k', align="edge") for i in range(12): axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" % groups[i], horizontalalignment="center") axes.invert_yaxis() axes.set_xlim(0, n_samples + 1) axes.set_ylabel("CV iterations") axes.set_xlabel("Data points") axes.set_xticks(np.arange(n_samples) + .5) axes.set_xticklabels(np.arange(1, n_samples + 1)) axes.set_yticks(np.arange(n_iter + 1) + .3) axes.set_yticklabels( ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"]) plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3)) plt.tight_layout()
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [True if item == 0 else False for item in iris['target']]
    bool_02 = [True if (item == 1 or item == 2) else False
               for item in iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    # compare to the integer literal with ``==``, not ``is``
    y_new_bool = [True if item == 1 else False for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0)
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
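# The test above assumes a `nan_roc_auc_score` helper that is not shown here.
# One plausible implementation (a sketch, not necessarily the original) returns
# NaN whenever a fold's y_true contains a single class, so folds where AUC is
# undefined show up as the nan entries in the expected cv_scores:
import numpy as np
from sklearn.metrics import roc_auc_score

def nan_roc_auc_score(y_true, y_score, **kwargs):
    if len(np.unique(y_true)) < 2:
        return np.nan  # AUC is undefined for a single-class fold
    return roc_auc_score(y_true, y_score, **kwargs)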
NUM_FOLDS = 4 EPOCHS = 30 BATCH_SIZE = 256 BAGS = 16 kf = GroupKFold(n_splits=NUM_FOLDS) shape = None for bag in range(BAGS): fold = 0 val_loss = np.ones((EPOCHS, NUM_FOLDS), np.float32) for train, val in kf.split(x_train, y_train, G): model.set_weights(weights) model.reset_states() tensorboard = TensorBoard( log_dir='./logs/gru_fold_{}_bag_{}'.format(fold, bag)) history = model.fit(x_train[train], y_train[train], batch_size=BATCH_SIZE, validation_data=(x_train[val], y_train[val]), epochs=EPOCHS, shuffle=True, verbose=1, callbacks=[tensorboard]) val_loss[:, fold] = history.history['val_loss'] fold += 1
# In[14]: ch_arr = [0, 1, 2] # In[15]: path = '6ts_4space_imgs_time_aug_resnet50_models' os.makedirs(path) # In[21]: models_w_arr = [] fold = 0 for train_index, val_index in group_kfold.split(train_df, train_df['wind_speed'], train_df['storm_id']): print(fold) fold += 1 os.makedirs(path + '/fold_%d' % (fold - 1)) image_datasets = { 'train': WindDataset(train_imgs, train_df['wind_speed'].values, train_df['storm_id'].values, 'train', train_index, data_transforms['train']), 'val': WindDataset(train_imgs, train_df['wind_speed'].values, train_df['storm_id'].values, 'val', val_index, data_transforms['val']) }
# In[11]: oof = np.zeros(df_train.shape[0]) sub = np.zeros(df_test.shape[0]) feat_imp_df = pd.DataFrame({'feat': feats, 'imp': 0.0}) gkf = GroupKFold(n_splits=5) # In[12]: print('train shape {} test shape {}'.format(df_train.shape, df_test.shape)) # In[13]: for i, (trn_idx, val_idx) in enumerate( gkf.split(df_train, groups=(df_train.date.map(str) + '_' + df_train.link_id.map(str)))): print( '------------------------------{} fold------------------------------'. format(i)) X_trn, Y_trn, W_trn = df_train.iloc[trn_idx][feats], df_train.iloc[ trn_idx].label, df_train.iloc[trn_idx].weight X_val, Y_val, W_val = df_train.iloc[val_idx][feats], df_train.iloc[ val_idx].label, df_train.iloc[val_idx].weight X_sub = df_test[feats] clf = LGBMRegressor( num_leaves=63, learning_rate=0.02, n_estimators=100000, subsample=0.6,
embedding_numeric = Dense(512, activation='relu')(input_numeric) inputs.append(input_numeric) embeddings.append(embedding_numeric) x = Concatenate()(embeddings) x = Dense(256, activation='relu')(x) x = Dense(128, activation='relu')(x) x = Dropout(0.5)(x) output = Dense(199, activation='softmax')(x) model = Model(inputs, output) return model n_splits = 5 kf = GroupKFold(n_splits=n_splits) score = [] for i_, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])): print(f'Fold : {i_+1}') X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx] X_train = [np.absolute(X_train[i]) for i in cat ] + [X_train[num]] # + [X_train[env1]] + [X_train[env2]] X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]] # + [X_val[env1]] + [X_val[env2]] model = model_NN() model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[]) es = EarlyStopping(monitor='val_CRPS', mode='min', restore_best_weights=True,
feature_importance = {} num_dic = {} trans_x = np.transpose(X) max_val = 0 for i in range(num_vars): feature_importance[names[i]] = ms_array[i] num_dic[i] = ms_array[i] if max_val < num_dic[i]: max_val = num_dic[i] best_feature = i for a in range(10): x1 = X[:, best_feature].reshape(-1, 1) group_kfold = GroupKFold(n_splits=10) group_kfold.get_n_splits(x1, y, groups) acc_arr = [] for train_index, test_index in group_kfold.split(X, y, groups): X_train = [] X_test = [] y_train = [] y_test = [] for id in train_index: X_train.append(x1[id]) for id in test_index: X_test.append(x1[id]) for id in train_index: y_train.append(y[id]) for id in test_index: y_test.append(y[id]) clf = RandomForestClassifier().fit(X_train, y_train) tmp_score = clf.score(X_test, y_test) acc_arr.append(tmp_score)
def cv(): data_x, data_y, body_ids = build_data() holdout_ids = set([int(x.rstrip()) for x in file('hold_out_ids.txt')]) print 'len(holdout_ids): ', len(holdout_ids) holdout_idx = [t for (t, x) in enumerate(body_ids) if x in holdout_ids] test_x = data_x[holdout_idx] # features of test set print 'holdout_x.shape: ' print test_x.shape test_y = data_y[holdout_idx] print Counter(test_y) #return 1 # to obtain test dataframe for model averaging body = pd.read_csv("train_bodies.csv") stances = pd.read_csv("train_stances.csv") data = pd.merge(stances, body, how='left', on='Body ID') targets = ['agree', 'disagree', 'discuss', 'unrelated'] targets_dict = dict(zip(targets, range(len(targets)))) data['target'] = map(lambda x: targets_dict[x], data['Stance']) test_df = data.ix[holdout_idx] cv_ids = set([int(x.rstrip()) for x in file('training_ids.txt')]) print 'len(cv_ids): ', len(cv_ids) cv_idx = [t for (t, x) in enumerate(body_ids) if x in cv_ids] cv_x = data_x[cv_idx] print 'cv_x.shape: ' print cv_x.shape cv_y = data_y[cv_idx] groups = body_ids[cv_idx] # GroupKFold will make sure all samples # having the same "Body ID" will appear in the same fold w = np.array([1 if y == 3 else 4 for y in cv_y]) print 'w:' print w print np.mean(w) scores = [] wscores = [] pscores = [] n_folds = 10 best_iters = [0] * n_folds kf = GroupKFold(n_splits=n_folds) # need to create disjoint sets for training and validation for fold, (trainInd, validInd) in enumerate(kf.split(cv_x, cv_y, groups)): continue print 'fold %s' % fold x_train = cv_x[trainInd] y_train = cv_y[trainInd] x_valid = cv_x[validInd] y_valid = cv_y[validInd] idx_valid = np.array(cv_idx)[validInd] print 'perfect_score: ', perfect_score(y_valid) print Counter(y_valid) #break dtrain = xgb.DMatrix(x_train, label=y_train, weight=w[trainInd]) dvalid = xgb.DMatrix(x_valid, label=y_valid, weight=w[validInd]) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] bst = xgb.train( params_xgb, dtrain, num_round, watchlist, verbose_eval=10, #feval = eval_metric, #maximize = True, early_stopping_rounds=80) #pred_y = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit) pred_y = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit).reshape( y_valid.shape[0], 4) print 'predicted probabilities: ' print pred_y pred_y = np.argmax(pred_y, axis=1) print 'predicted label indices: ' print pred_y print 'best iterations: ', bst.best_ntree_limit best_iters[fold] = bst.best_ntree_limit #pred_y = bst.predict(dvalid) print pred_y #print Counter(pred_y) #pred_y = np.argmax(bst.predict(dvalid, ntree_limit=bst.best_ntree_limit), axis=1) print 'pred_y.shape' print pred_y.shape print 'y_valid.shape' print y_valid.shape #s = fscore(pred_y, y_valid) #s_perf = perfect_score(y_valid) predicted = [LABELS[int(a)] for a in pred_y] actual = [LABELS[int(a)] for a in y_valid] # print out the headline & body text for incorrect predictions #show_incorrect_pred(actual, predicted, idx_valid) s, _ = score_submission(actual, predicted) s_perf, _ = score_submission(actual, actual) wscore = float(s) / s_perf print 'fold %s, score = %f, perfect_score %f, weighted percentage %f' % ( fold, s, s_perf, wscore) scores.append(s) pscores.append(s_perf) wscores.append(wscore) #break print 'scores:' print scores print 'mean score:' print np.mean(scores) print 'perfect scores:' print pscores print 'mean perfect score:' print np.mean(pscores) print 'w scores:' print wscores print 'mean w score:' print np.mean(wscores) print 'best iters:' print best_iters print 'mean best_iter:' m_best = np.mean(best_iters) print m_best 
#m_best = best_iters[0] m_best = 500 #m_best = 500 #m_best = 600 #return 1 # use the same parameters to train with full cv data, test on hold-out data print 'test on holdout set' dtrain = xgb.DMatrix(cv_x, label=cv_y, weight=w) dtest = xgb.DMatrix(test_x, label=test_y) watchlist = [(dtrain, 'train')] clf = xgb.train( params_xgb, dtrain, #num_round, int(m_best), watchlist, feval=eval_metric, verbose_eval=10) pred_prob_holdout_y = clf.predict(dtest).reshape(test_y.shape[0], 4) # probabilities pred_holdout_y = np.argmax(pred_prob_holdout_y, axis=1) print 'pred_holdout_y.shape:' print pred_holdout_y.shape print 'test_y.shape:' print test_y.shape #s_test = fscore(pred_holdout_y, test_y) #s_test_perf = perfect_score(test_y) predicted = [LABELS[int(a)] for a in pred_holdout_y] actual = [LABELS[int(a)] for a in test_y] report_score(actual, predicted) print Counter(predicted) test_df['actual'] = actual test_df['predicted'] = predicted test_df['prob_0'] = pred_prob_holdout_y[:, 0] test_df['prob_1'] = pred_prob_holdout_y[:, 1] test_df['prob_2'] = pred_prob_holdout_y[:, 2] test_df['prob_3'] = pred_prob_holdout_y[:, 3] #test_df[['Headline','Body ID', 'Stance', 'actual', 'predicted']].to_csv('predtest.csv', index=False) test_df[[ 'Headline', 'Body ID', 'Stance', 'actual', 'predicted', 'prob_0', 'prob_1', 'prob_2', 'prob_3' ]].to_csv('predtest_cor2.csv', index=False)
def train_classifier_GS(self, x,y, groups): if self.classifier == "SVM": clf = svm.SVC(kernel='rbf', probability=True) if self.OneVsRest: param_grid = { 'estimator__C': [1, 10, 100, 1000], 'estimator__gamma': [0.001, 0.0001] } else: param_grid = { 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001] } elif self.classifier == "RF": clf = RandomForestClassifier() if self.OneVsRest: param_grid = { 'estimator__n_estimators': [100, 250, 500, 750, 1000], 'estimator__max_features': ['auto', 'log2'], 'estimator__max_depth': [4, 6, 8], 'estimator__criterion': ['gini', 'entropy'] } else: param_grid = { 'n_estimators': [100, 250, 500, 750, 1000], 'max_features': ['auto', 'log2'], 'max_depth': [4, 6, 8], 'criterion': ['gini', 'entropy'] } elif self.classifier == "XGB": clf = xgb.XGBClassifier() if self.OneVsRest: param_grid = { 'estimator__max_depth': [4, 6, 8], 'estimator__learning_rate': [0.01, 0.1, 0.3] } else: param_grid = { 'max_depth': [4, 6, 8], 'learning_rate': [0.01, 0.1, 0.3] } #Apply PCA transformation to data if self.n > 0: self.pca.fit_transform(x) x_transformed =self.pca.transform(x) else: x_transformed = x # OneVsRest Classifier if self.OneVsRest: clf = OneVsRestClassifier(clf) # Define cross validation method #kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=50) group_kfold = GroupKFold(n_splits=5) kfolds = group_kfold.split(x_transformed, y, groups) #Get scoring metrics scoring = self.get_scoring() #Perform Grid Search grid = GridSearchCV(clf, param_grid=param_grid, cv=kfolds, n_jobs=1, scoring=scoring, refit='f1_weighted', verbose=1, return_train_score=True) grid.fit(x_transformed, y) #Print and log results self.save_results(grid) self.save_best_result(grid) #Save best estimator self.clf = grid.best_estimator_ #disp = plot_precision_recall_curve(best_clf, x_transformed, y) return
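# Pre-computing the splits with group_kfold.split(...) as above is one option;
# another is to pass the GroupKFold object itself as `cv` and forward `groups`
# through fit. A small self-contained sketch of that variant (the estimator and
# grid below are placeholders, not the settings used above):
import numpy as np
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.randn(60, 4)
y = rng.randint(0, 2, 60)
groups = rng.randint(0, 10, 60)

grid = GridSearchCV(SVC(kernel="rbf"),
                    param_grid={"C": [1, 10], "gamma": [1e-3, 1e-4]},
                    cv=GroupKFold(n_splits=5),
                    scoring="accuracy")
grid.fit(X, y, groups=groups)   # groups are routed to the splitter
print(grid.best_params_)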
def main(args): # Load the parameters from json file params_dir = args.params_dir json_path = os.path.join(params_dir, 'params.json') assert os.path.isfile( json_path), "No json configuration file found at {}".format(json_path) params = utils.Params(json_path) # use GPU if available params.cuda = torch.cuda.is_available() # Set the random seed for reproducible experiments torch.manual_seed(params.seed) if params.cuda: torch.cuda.manual_seed(params.seed) # Set the logger model_dir = args.output_dir if not os.path.exists(model_dir): os.makedirs(model_dir) utils.set_logger(os.path.join(model_dir, 'train.log')) logging.info("************ Validation fold: {} ************".format( args.fold)) # Create the input data pipeline logging.info("Loading the datasets...") config_dict = { 'image_dir': os.path.join(args.input_dir, 'train'), 'csv_path': os.path.join(args.input_dir, 'train.csv') } train_data = DataPreprocess(config_dict) df, target_cols, num_targets = train_data.df, train_data.target_cols, train_data.num_targets # check for debug mode if args.debug: params.num_epochs = 1 df = df.sample(n=100, random_state=params.seed).reset_index(drop=True) # update params params.mode = args.mode params.num_targets = num_targets params.target_cols = target_cols # split data into folds and pass to the model Fold = GroupKFold(n_splits=params.num_folds) groups = df['PatientID'].values for n, (train_index, valid_index) in enumerate( Fold.split(df, df[params.target_cols], groups)): df.loc[valid_index, 'fold'] = int(n) df['fold'] = df['fold'].astype(int) # get training and validation data using folds train_df = df[df.fold != args.fold].reset_index(drop=True) valid_df = df[df.fold == args.fold].reset_index(drop=True) # get dataloaders train_dataloader = dataloader.fetch_dataloader(train_df, params, data='train') valid_dataloader = dataloader.fetch_dataloader(valid_df, params, data='valid') logging.info("- done.") # Define the model and optimizer model = RANZCRModel(params, pretrained=True).model if params.cuda: model = model.to(torch.device('cuda')) optimizer = optim.Adam(model.parameters(), lr=params.learning_rate, amsgrad=False) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True) # fetch loss function and metrics loss_fn = nn.BCEWithLogitsLoss() metrics = models.metrics # Train the model logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) train_and_evaluate(model, train_dataloader, valid_dataloader, valid_df[params.target_cols].values, optimizer, scheduler, loss_fn, metrics, params, model_dir)
logger.debug(f"Number of rows in train: {n_train}") logger.debug(f"Number of rows in test: {n_test}") logger.debug(f"Using features:{train_use.columns.values}") categorical_cols = ["district", "layout", "direction", "structure"] #################### ## Train model #################### folds = GroupKFold(n_splits=5) oof = np.zeros(len(train_use)) predictions = np.zeros(len(test_use)) feature_importance_df = pd.DataFrame() for fold, (train_idx, val_idx) in enumerate(folds.split(train_use, groups=train_group)): print(f"Fold {fold+1}") train_data = lgb.Dataset(train_use.iloc[train_idx], label=target_log[train_idx], categorical_feature=categorical_cols) val_data = lgb.Dataset(train_use.iloc[val_idx], label=target_log[val_idx], categorical_feature=categorical_cols) num_round = N_ROUNDS callbacks = [log_evaluation(logger, period=100)] clf = lgb.train(params, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=False, early_stopping_rounds=100,
print(f'Processing fold {fold}') model = get_model() model.to(DEVICE) train_idx, valid_idx = fold_info[fold] f'Proportions valid / train: {len(valid_idx) / len(train_idx)}' train_dl, valid_dl = generate_train_valid_dls(ds, train_idx, valid_idx) optimizer, scheduler = create_optimizer_scheduler(model, train_dl, EPOCHS) train_losses, valid_losses, accumulated_lrs, accumulated_dice_metrics = train(fold, EPOCHS, train_dl, valid_dl, optimizer, scheduler, patience = PATIENCE) return train_losses, valid_losses, accumulated_lrs, accumulated_dice_metrics # In[50]: fold_info = [(train_idx, valid_idx) for fold, (train_idx, valid_idx) in tqdm(enumerate(group_kfold.split(ds.slices, groups = groups)), total=FOLDS)] # In[51]: # from fastai.data.core import DataLoaders # train_idx, valid_idx = fold_info[0] # train_ds, valid_ds = create_subset(ds, train_idx, valid_idx) # dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=BATCH_SIZE, num_workers=2) # assert(dls.bs == BATCH_SIZE) # In[52]:
# Weight train samples by the inverse of how frequently the installation_id appears train_features["sample_weight"] = 1 / train_features.groupby( "installation_id")["accuracy_group"].transform("count") # Estimate accuracy_group proportions in the test set by using sample weights in train instead of sampling accuracy_groups = train_features.groupby( "accuracy_group")["sample_weight"].agg("sum") accuracy_group_proportions = list(accuracy_groups / accuracy_groups.sum()) assessment_encoder = LabelEncoder() train_features["assessment"] = assessment_encoder.fit_transform( train_features["assessment"]) group_kfold = GroupKFold(n_splits=5) models = [] qwk_scores = [] for train_index, val_index in group_kfold.split( train_features, groups=train_features["installation_id"]): model, qwk_score = train_and_evaluate( train_features, list(train_index), list(val_index), accuracy_group_proportions, ) models.append(model) qwk_scores.append(qwk_score) print(f"QWK score: {np.mean(qwk_scores)}") # Predict on test set X_test, installation_ids = ( test_features.drop(["installation_id"], axis=1), test_features["installation_id"],
param_dist = { 'n_estimators': stats.randint(100, 1000), 'learning_rate': stats.uniform(0.01, 0.1), 'subsample': stats.uniform(0.3, 0.7), 'max_depth': [3, 4, 5, 6, 7, 8, 9], 'colsample_bytree': stats.uniform(0.5, 0.45), 'min_child_weight': [1, 2, 3] } n_splits = 50 groups = data_train['Scenario'] group_kfold = GroupKFold(n_splits=n_splits) rmse = list() for number, (train_idx, test_idx) in enumerate( group_kfold.split(data_train, groups=groups)): print(f'Fold {number + 1} of {n_splits}') X_train, Y_train = x_.iloc[train_idx, :], y_.iloc[train_idx] X_test, Y_test = x_.iloc[test_idx, :], y_.iloc[test_idx] groups_train = groups[train_idx] group_kfold_inner = GroupKFold(n_splits=3) xgb_regressor_model = xgb.XGBRegressor(objective='reg:squarederror') xgb_regressor_search = RandomizedSearchCV( xgb_regressor_model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=group_kfold_inner.split(X_train, groups=groups_train), refit=1,
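# The loop above is cut off mid-call; rather than guessing at the remaining
# arguments, here is a compact, self-contained sketch of the same nested
# pattern (outer GroupKFold for scoring, inner GroupKFold for the randomized
# search). The regressor and the tiny search space are placeholders, not the
# configuration used above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, RandomizedSearchCV

rng = np.random.RandomState(0)
X = rng.randn(80, 5)
y = X[:, 0] + 0.1 * rng.randn(80)
groups = rng.randint(0, 8, 80)

outer = GroupKFold(n_splits=4)
rmse = []
for train_idx, test_idx in outer.split(X, groups=groups):
    inner = GroupKFold(n_splits=3)
    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        param_distributions={"n_estimators": [50, 100], "max_depth": [3, 5, None]},
        n_iter=4,
        scoring="neg_mean_squared_error",
        cv=inner.split(X[train_idx], groups=groups[train_idx]),
        random_state=0)
    search.fit(X[train_idx], y[train_idx])
    pred = search.predict(X[test_idx])
    rmse.append(mean_squared_error(y[test_idx], pred) ** 0.5)
print(np.mean(rmse))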
data = data.merge(user_ais_data, on=['user_id', 'aisle_id'], how='left')
del user_ais_data
train_set = data[data['eval_set'] == 'train']
del data
"""
Task 1: fit a single decision tree and tune its parameters.
"""
from sklearn.model_selection import GroupKFold

kf = GroupKFold(n_splits=5)
train_indexes = []
test_indexes = []
for i, (train_index, test_index) in enumerate(
        kf.split(train_set, groups=train_set['user_id'].values)):
    train_indexes.append(train_index)
    test_indexes.append(test_index)
train_index = train_indexes[0]
test_index = test_indexes[0]
training = train_set.iloc[train_index, :]
testing = train_set.iloc[test_index, :]
del train_set
"""
Decision Tree
"""
col = list(training.columns)
col.remove('reordered')
group_kfold = GroupKFold(n_splits=NFOLD) sub_train['g'] = sub_train.index % NFOLD CAT = list(set(X.columns) & set(utils_cat.ALL)) # ============================================================================= # cv # ============================================================================= dtrain = lgb.Dataset(X, y, categorical_feature=CAT) gc.collect() ret = lgb.cv(param, dtrain, 9999, folds=group_kfold.split(X, sub_train['y'], sub_train['g']), early_stopping_rounds=100, verbose_eval=50, seed=SEED) result = f"CV auc-mean: {ret['auc-mean'][-1]}" print(result) utils.send_line(result) # ============================================================================= # imp # ============================================================================= dtrain = lgb.Dataset(X, y, categorical_feature=CAT) model = lgb.train(param, dtrain, len(ret['auc-mean'])) imp = ex.getImp(model).sort_values(['gain', 'feature'],
def multiclass_one_vs_rest(x, y, model_type='svm', plot=False, verbose=False, run_cv=False):
    # First split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=RANDOM_SEED)
    # pick the base classifier based on the model_type parameter
    # (compare strings with ``==``, not ``is``)
    if model_type == 'logistic':
        base_model = LogisticRegression(random_state=RANDOM_SEED, class_weight='balanced')
    elif model_type == 'tree':
        base_model = DecisionTreeClassifier(random_state=RANDOM_SEED, class_weight='balanced')
    elif model_type == 'adaboost':
        base_model = AdaBoostClassifier(random_state=RANDOM_SEED)
    elif model_type == 'forest':
        # base case
        base_model = RandomForestClassifier(random_state=RANDOM_SEED, class_weight='balanced')
        # this gives no improvement over base case
        #base_model = RandomForestClassifier(random_state=RANDOM_SEED, class_weight='balanced', n_estimators = 50, max_depth=20)
        # no improvement here either
        #base_model = RandomForestClassifier(random_state=RANDOM_SEED, class_weight='balanced', n_estimators = 100, max_depth=40)
    elif model_type == 'nnet':
        base_model = MLPClassifier(random_state=RANDOM_SEED)
    elif model_type == 'extra':
        base_model = ExtraTreesClassifier(random_state=RANDOM_SEED)
    else:
        #base case
        base_model = SVC(kernel='linear', random_state=RANDOM_SEED, class_weight='balanced', probability=True)
        # no improvement here
        #base_model = SVC(kernel='linear', random_state=RANDOM_SEED, class_weight='balanced', activation="logistic", max_iter=500)
    # create the OvR model using the base classifier
    # model = OneVsRestClassifier(base_model, n_jobs=10)
    # create OvR model with base classifier and feature selection
    model = CustomBRClassifier(base_model)
    # train the model using the training data
    fit_start = time.time()
    model.fit(x_train, y_train)
    fit_end = time.time()
    if verbose:
        print('------ model info ----------')
        print('one vs all ' + model_type + ' is a multi-label classifier: ' + str(model.multilabel_))
        print('one vs all ' + model_type + ' number of classes: ' + str(model.classes_))
        print('one vs all ' + model_type + ' elapsed training time: ' + str(fit_end - fit_start))
    # check the accuracy on the training data
    if verbose:
        print('------ training data ----------')
    fpr_train, tpr_train, auc_train = check_predictions(
        model, (model_type + " - train"), x_train, y_train, plot, verbose)
    # check the accuracy on the test data
    if verbose:
        print('------ test data ----------')
    fpr_test, tpr_test, auc_test = check_predictions(
        model, (model_type + " - test"), x_test, y_test, plot, verbose)
    # generate_roc_hist(y_test, model.predict_proba(x_test))
    # get the cross-validation score
    if run_cv:
        accuracy_scorer = make_scorer(calculate_overall_accuracy)
        kf = GroupKFold(5)
        cv_accuracy_scores = []
        cv_auc_scores = []
        for train_index, test_index in kf.split(x, y=y, groups=np.arange(x.shape[0])):
            X_train, X_test = x.iloc[train_index, np.arange(0, x.shape[1])], x.iloc[
                test_index, np.arange(0, x.shape[1])]
            Y_train, Y_test = y[train_index], y[test_index]
            model.fit(X_train, Y_train)
            cv_auc_scores.append(
                roc_auc_score(Y_test, model.predict_proba(X_test), average='micro'))
            cv_accuracy_scores.append(
                calculate_overall_accuracy(Y_test, model.predict_proba(X_test)))
        cv_accuracy_scores = np.mean(np.array(cv_accuracy_scores))
        cv_auc_scores = np.mean(np.array(cv_auc_scores))
        if verbose:
            print('------ CV scores ----------')
            print('one vs all ' + model_type + ' CV accuracy scores ' + str(cv_accuracy_scores))
            print('one vs all ' + model_type + ' CV AUC scores ' + str(cv_auc_scores))
    return fpr_train, tpr_train, auc_train, fpr_test, tpr_test, auc_test
logger.debug(f"Number of rows in train: {n_train}") logger.debug(f"Number of rows in test: {n_test}") logger.debug(f"Using features:{train_use.columns.values}") categorical_cols = ["district", "layout", "direction", "structure"] #################### ## Train model #################### folds = GroupKFold(n_splits=5) oof = np.zeros(len(train_use)) predictions = np.zeros(len(test_use)) feature_importance_df = pd.DataFrame() for fold, (train_idx, val_idx) in enumerate(folds.split(train_use, groups=group_for_kfold)): print(f"Fold {fold+1}") train_data = lgb.Dataset(train_use.iloc[train_idx], label=target_log[train_idx], categorical_feature=categorical_cols) val_data = lgb.Dataset(train_use.iloc[val_idx], label=target_log[val_idx], categorical_feature=categorical_cols) num_round = N_ROUNDS callbacks = [log_evaluation(logger, period=100)] clf = lgb.train(params, train_data, num_round, valid_sets=[train_data, val_data], verbose_eval=False, early_stopping_rounds=100,
def RandomForest(X, Y, groups, n_trees):
    lpgo = GroupKFold(n_splits=14)
    MAE = []
    ECM = []
    MAPE = []
    R2_SCORE = []
    relevant_features = []
    N = np.size(Y[0])
    for train_index, test_index in lpgo.split(X, Y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        # Normalize the data
        sc_X = StandardScaler()
        X_train = sc_X.fit_transform(X_train)
        X_test = sc_X.transform(X_test)
        # Fit the regression model
        regressor = RandomForestRegressor(n_estimators=n_trees, random_state=0)
        regressor.fit(X_train, y_train)
        relevant_features.append(regressor.feature_importances_)
        # Predict on the test set
        y_pred = regressor.predict(X_test)
        ECM.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))
        MAE.append(
            mean_absolute_error(y_test, y_pred, multioutput='raw_values'))
        R2_SCORE.append(r2_score(y_test, y_pred, multioutput='raw_values'))
        m = []
        m.append(
            np.mean(np.abs(
                (y_test[:, 0] - y_pred[:, 0]) / y_test[:, 0])) * 100)
        m.append(
            np.mean(np.abs(
                (y_test[:, 1] - y_pred[:, 1]) / y_test[:, 1])) * 100)
        MAPE.append(m)
    ECM_matrix = np.asmatrix(ECM)
    MAE_matrix = np.asmatrix(MAE)
    MAPE_matrix = np.asmatrix(MAPE)
    R2_matrix = np.asmatrix(R2_SCORE)
    for i in range(0, N):
        print("Validation mean squared error for output", i, "(ECM):",
              np.around(np.mean(ECM_matrix[:, i]), decimals=3), "+-",
              np.around(np.std(ECM_matrix[:, i]), decimals=3))
        print("Validation mean absolute error for output", i, "(MAE):",
              np.around(np.mean(MAE_matrix[:, i]), decimals=3), "+-",
              np.around(np.std(MAE_matrix[:, i]), decimals=3))
        print("Validation mean absolute percentage error for output", (i + 1),
              "(MAPE):", np.around(np.mean(MAPE_matrix[:, i]), decimals=3),
              "%", "+-", np.around(np.std(MAPE_matrix[:, i]), decimals=3), "%")
        print("Coefficient of determination for output", (i + 1), "(R2):",
              np.around(np.mean(R2_matrix[:, i])), "%", "+-",
              np.around(np.std(R2_matrix[:, i]), decimals=3))
    relevant_features = np.asmatrix(relevant_features)
    print(np.mean(relevant_features, axis=0))
def __init__(self, seed, val_split=0.2, shuffle=True, cell_features=['expression'], drug_features=['descriptors'], use_landmark_genes=False, use_combo_score=False, feature_subsample=None, scaling='std', scramble=False, cv_partition='overlapping', cv=0): """Initialize data merging drug response, drug descriptors and cell line essay. Shuffle and split training and validation set Parameters ---------- seed: integer seed for random generation val_split : float, optional (default 0.2) fraction of data to use in validation cell_features: list of strings from 'expression', 'expression_5platform', 'mirna', 'proteome', 'all', 'categorical' (default ['expression']) use one or more cell line feature sets: gene expression, microRNA, proteome use 'all' for ['expression', 'mirna', 'proteome'] use 'categorical' for one-hot encoded cell lines drug_features: list of strings from 'descriptors', 'latent', 'all', 'categorical', 'noise' (default ['descriptors']) use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder trained on NSC drugs, or both; use random features if set to noise use 'categorical' for one-hot encoded drugs shuffle : True or False, optional (default True) if True shuffles the merged data before splitting training and validation sets scramble: True or False, optional (default False) if True randomly shuffle dose response data as a control feature_subsample: None or integer (default None) number of feature columns to use from cellline expressions and drug descriptors use_landmark_genes: True or False only use LINCS1000 landmark genes use_combo_score: bool (default False) use combination score in place of percent growth (stored in 'GROWTH' column) scaling: None, 'std', 'minmax' or 'maxabs' (default 'std') type of feature scaling: 'maxabs' to [-1,1], 'maxabs' to [-1, 1], 'std' for standard normalization """ self.cv_partition = cv_partition np.random.seed(seed) df = NCI60.load_combo_response(use_combo_score=use_combo_score, fraction=True) logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format( df.shape[0])) if 'all' in cell_features: self.cell_features = ['expression', 'mirna', 'proteome'] else: self.cell_features = cell_features if 'all' in drug_features: self.drug_features = ['descriptors', 'latent'] else: self.drug_features = drug_features for fea in self.cell_features: if fea == 'expression' or fea == 'rnaseq': self.df_cell_expr = NCI60.load_cell_expression_rnaseq( ncols=feature_subsample, scaling=scaling, use_landmark_genes=use_landmark_genes) df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME') elif fea == 'expression_u133p2': self.df_cell_expr = NCI60.load_cell_expression_u133p2( ncols=feature_subsample, scaling=scaling, use_landmark_genes=use_landmark_genes) df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME') elif fea == 'expression_5platform': self.df_cell_expr = NCI60.load_cell_expression_5platform( ncols=feature_subsample, scaling=scaling, use_landmark_genes=use_landmark_genes) df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME') elif fea == 'mirna': self.df_cell_mirna = NCI60.load_cell_mirna( ncols=feature_subsample, scaling=scaling) df = df.merge(self.df_cell_mirna[['CELLNAME']], on='CELLNAME') elif fea == 'proteome': self.df_cell_prot = NCI60.load_cell_proteome( ncols=feature_subsample, scaling=scaling) df = df.merge(self.df_cell_prot[['CELLNAME']], on='CELLNAME') elif fea == 'categorical': df_cell_ids = df[['CELLNAME']].drop_duplicates() cell_ids = df_cell_ids['CELLNAME'].map( lambda x: x.replace(':', '.')) 
df_cell_cat = pd.get_dummies(cell_ids) df_cell_cat.index = df_cell_ids['CELLNAME'] self.df_cell_cat = df_cell_cat.reset_index() for fea in self.drug_features: if fea == 'descriptors': self.df_drug_desc = NCI60.load_drug_descriptors( ncols=feature_subsample, scaling=scaling) df = df[df['NSC1'].isin(self.df_drug_desc['NSC']) & df['NSC2'].isin(self.df_drug_desc['NSC'])] elif fea == 'latent': self.df_drug_auen = NCI60.load_drug_autoencoded_AG( ncols=feature_subsample, scaling=scaling) df = df[df['NSC1'].isin(self.df_drug_auen['NSC']) & df['NSC2'].isin(self.df_drug_auen['NSC'])] elif fea == 'categorical': df_drug_ids = df[['NSC1']].drop_duplicates() df_drug_ids.columns = ['NSC'] drug_ids = df_drug_ids['NSC'] df_drug_cat = pd.get_dummies(drug_ids) df_drug_cat.index = df_drug_ids['NSC'] self.df_drug_cat = df_drug_cat.reset_index() elif fea == 'noise': ids1 = df[['NSC1' ]].drop_duplicates().rename(columns={'NSC1': 'NSC'}) ids2 = df[['NSC2' ]].drop_duplicates().rename(columns={'NSC2': 'NSC'}) df_drug_ids = pd.concat([ids1, ids2]).drop_duplicates() noise = np.random.normal(size=(df_drug_ids.shape[0], 500)) df_rand = pd.DataFrame( noise, index=df_drug_ids['NSC'], columns=['RAND-{:03d}'.format(x) for x in range(500)]) self.df_drug_rand = df_rand.reset_index() logger.info( 'Filtered down to {} rows with matching information.'.format( df.shape[0])) ids1 = df[['NSC1']].drop_duplicates().rename(columns={'NSC1': 'NSC'}) ids2 = df[['NSC2']].drop_duplicates().rename(columns={'NSC2': 'NSC'}) df_drug_ids = pd.concat([ids1, ids2 ]).drop_duplicates().reset_index(drop=True) n_drugs = df_drug_ids.shape[0] n_val_drugs = int(n_drugs * val_split) n_train_drugs = n_drugs - n_val_drugs logger.info('Unique cell lines: {}'.format(df['CELLNAME'].nunique())) logger.info('Unique drugs: {}'.format(n_drugs)) # df.to_csv('filtered.growth.min.tsv', sep='\t', index=False, float_format='%.4g') # df.to_csv('filtered.score.max.tsv', sep='\t', index=False, float_format='%.4g') if shuffle: df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True) df_drug_ids = df_drug_ids.sample( frac=1.0, random_state=seed).reset_index(drop=True) self.df_response = df self.df_drug_ids = df_drug_ids self.train_drug_ids = df_drug_ids['NSC'][:n_train_drugs] self.val_drug_ids = df_drug_ids['NSC'][-n_val_drugs:] if scramble: growth = df[['GROWTH']] random_growth = growth.iloc[np.random.permutation( np.arange(growth.shape[0]))].reset_index() self.df_response[['GROWTH']] = random_growth['GROWTH'] logger.warn('Randomly shuffled dose response growth values.') logger.info('Distribution of dose response:') logger.info(self.df_response[['GROWTH']].describe()) self.total = df.shape[0] self.n_val = int(self.total * val_split) self.n_train = self.total - self.n_val logger.info('Rows in train: {}, val: {}'.format( self.n_train, self.n_val)) self.cell_df_dict = { 'expression': 'df_cell_expr', 'expression_5platform': 'df_cell_expr', 'expression_u133p2': 'df_cell_expr', 'rnaseq': 'df_cell_expr', 'mirna': 'df_cell_mirna', 'proteome': 'df_cell_prot', 'categorical': 'df_cell_cat' } self.drug_df_dict = { 'descriptors': 'df_drug_desc', 'latent': 'df_drug_auen', 'categorical': 'df_drug_cat', 'noise': 'df_drug_rand' } self.input_features = collections.OrderedDict() self.feature_shapes = {} for fea in self.cell_features: feature_type = 'cell.' + fea feature_name = 'cell.' 
+ fea df_cell = getattr(self, self.cell_df_dict[fea]) self.input_features[feature_name] = feature_type self.feature_shapes[feature_type] = (df_cell.shape[1] - 1, ) for drug in ['drug1', 'drug2']: for fea in self.drug_features: feature_type = 'drug.' + fea feature_name = drug + '.' + fea df_drug = getattr(self, self.drug_df_dict[fea]) self.input_features[feature_name] = feature_type self.feature_shapes[feature_type] = (df_drug.shape[1] - 1, ) logger.info('Input features shapes:') for k, v in self.input_features.items(): logger.info(' {}: {}'.format(k, self.feature_shapes[v])) self.input_dim = sum([ np.prod(self.feature_shapes[x]) for x in self.input_features.values() ]) logger.info('Total input dimensions: {}'.format(self.input_dim)) if cv > 1: if cv_partition == 'disjoint': pass elif cv_partition == 'disjoint_cells': y = self.df_response['GROWTH'].values groups = self.df_response['CELLNAME'].values gkf = GroupKFold(n_splits=cv) splits = gkf.split(y, groups=groups) self.cv_train_indexes = [] self.cv_val_indexes = [] for index, (train_index, val_index) in enumerate(splits): print(index, train_index) self.cv_train_indexes.append(train_index) self.cv_val_indexes.append(val_index) else: y = self.df_response['GROWTH'].values # kf = KFold(n_splits=cv) # splits = kf.split(y) skf = StratifiedKFold(n_splits=cv, random_state=seed) splits = skf.split(y, discretize(y, bins=cv)) self.cv_train_indexes = [] self.cv_val_indexes = [] for index, (train_index, val_index) in enumerate(splits): print(index, train_index) self.cv_train_indexes.append(train_index) self.cv_val_indexes.append(val_index)
def split_data(df, ycol='0', classify=False, cv=5, bins=0, cutoffs=None, groupcols=None, ignore_categoricals=False, verbose=True): if groupcols is not None: groups = make_group_from_columns(df, groupcols) cat_cols = df.select_dtypes(['object']).columns if ignore_categoricals: df[cat_cols] = 0 else: df[cat_cols] = df[cat_cols].apply( lambda x: x.astype('category').cat.codes) if ycol.isdigit(): ycol = df.columns[int(ycol)] y = df.loc[:, ycol].as_matrix() x = df.drop(ycol, axis=1).as_matrix() features = df.drop(ycol, axis=1).columns.tolist() if verbose: print('Target column: {}'.format(ycol)) print(' count = {}, uniq = {}, mean = {:.3g}, std = {:.3g}'.format( len(y), len(np.unique(y)), np.mean(y), np.std(y))) print( ' min = {:.3g}, q1 = {:.3g}, median = {:.3g}, q3 = {:.3g}, max = {:.3g}' .format(np.min(y), np.percentile(y, 25), np.median(y), np.percentile(y, 75), np.max(y))) if not classify: y_even = discretize(y, bins=5, verbose=False) elif bins >= 2: y = discretize(y, bins=bins, min_count=cv, verbose=verbose) elif cutoffs: y = discretize(y, cutoffs=cutoffs, min_count=cv, verbose=verbose) elif df[ycol].dtype in [np.dtype('float64'), np.dtype('float32')]: warnings.warn( 'Warning: classification target is float; consider using --bins or --cutoffs' ) y = y.astype(int) if classify: mask = np.ones(len(y), dtype=bool) unique, counts = np.unique(y, return_counts=True) for v, c in zip(unique, counts): if c < cv: mask[y == v] = False x = x[mask] y = y[mask] removed = len(mask) - np.sum(mask) if removed and verbose: print('Removed {} rows in small classes: count < {}'.format( removed, cv)) if groupcols is None: if classify: y_even = y skf = StratifiedKFold(n_splits=cv, shuffle=True) splits = skf.split(x, y_even) else: if classify: groups = groups[mask] gkf = GroupKFold(n_splits=cv) splits = gkf.split(x, y, groups) if verbose: print() return x, y, list(splits), features
col = list(train.columns) col.remove('reordered') col.remove('eval_set') col.remove('user_id') col.remove('product_id') col.remove('order_id') col.remove('department_id') col.remove('aisle_id') from sklearn.model_selection import GroupKFold kf = GroupKFold(n_splits=5) train_indexes = [] test_indexes = [] for i, (train_index, test_index) in enumerate( kf.split(train, groups=train['user_id'].values)): train_indexes.append(train_index) test_indexes.append(test_index) train_index = train_indexes[0] test_index = test_indexes[0] training = train.iloc[train_index, :] testing = train.iloc[test_index, :] del train from sklearn.metrics import roc_auc_score from sklearn.metrics import log_loss del train_index, train_indexes, test_index, test_indexes, i del test import lightgbm as lgb
# Plot and save bar chart plt.rcParams['xtick.labelsize'] = 8 ax = scaled_var_imp_df_sorted.plot.bar(y='scaled_importance', x='variable', rot=90, figsize=(16, 12)) plt.tight_layout() plt.savefig('FS_result_h2o.pdf', format='pdf', dpi=1200) # Perform k-fold cv # split on train - test dataset by group - according to no_folds gkf = GroupKFold(n_splits=no_folds) cv_fold = 0 for train_index, test_index in gkf.split(X, y, groups=groups): cv_fold += 1 print("CV fold: ", cv_fold) # print("Train Index: ", train_index) # print("Test Index: ", test_index, "\n") #print('Groups: ', groups,'\n') trainX_data = X.loc[train_index] trainy_data = y.loc[train_index] testX_data = X.loc[test_index] testy_data = y.loc[test_index] # Save original 10cv folds with all features train_set = pd.concat([trainX_data, trainy_data, groups], axis=1,
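# One caveat worth flagging for the loop above: GroupKFold.split returns
# positional indices, so X.loc[train_index] only selects the intended rows when
# X carries a default RangeIndex. A small sketch of the distinction (toy frame,
# hypothetical names):
import pandas as pd
from sklearn.model_selection import GroupKFold

X_demo = pd.DataFrame({"f": range(6)}, index=[10, 11, 12, 13, 14, 15])
y_demo = pd.Series([0, 1, 0, 1, 0, 1], index=X_demo.index)
groups_demo = [0, 0, 1, 1, 2, 2]

train_index, test_index = next(GroupKFold(n_splits=3).split(X_demo, y_demo, groups_demo))
print(X_demo.iloc[train_index])   # positional selection: always correct
# X_demo.loc[train_index] would raise a KeyError here because the labels differ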
def group_test_2(pre_x, kmeans_labels, names, num_dic, groups, num_vars, meta_i): # print('meta-i='+str(meta_i)) chosen_vars = np.zeros(meta_i) chosen_values = np.zeros(meta_i) # print('===') for i in range(num_vars): #clean this routine up? check for errors? old_val = 0 new_val = num_dic[i] clust = kmeans_labels[i] old_val = chosen_values[clust] if old_val < new_val: # print(names[i],old_val,new_val) chosen_vars[clust] = int(i) chosen_values[clust] = new_val # print(chosen_vars) # print(type(chosen_vars)) chosen_works = [] chosen_names = [] for qq in list(chosen_vars): chosen_names.append(names[int(qq)]) chosen_works.append(int(qq)) X = pre_x[:, chosen_works] # print(chosen_names) # x_train,x_test,y_train,y_test=train_test_split(new_x,y,test_size=.3) group_kfold = GroupKFold(n_splits=3) group_kfold.get_n_splits(X, y, groups) acc_arr = [] for train_index, test_index in group_kfold.split(X, y, groups): X_train = [] X_test = [] y_train = [] y_test = [] # print(train_index) # print(test_index) for id in train_index: X_train.append(X[id]) for id in test_index: X_test.append(X[id]) for id in train_index: y_train.append(y[id]) for id in test_index: y_test.append(y[id]) # if model=='svm': # clf=SVC().fit(X_train,y_train) # elif model=='rf_extra': # clf=ExtraTreesClassifier().fit(X_train,y_train) # elif model=='rf': # clf=RandomForestClassifier().fit(X_train,y_train) # elif model=='nb': # clf=GaussianNB().fit(X_train,y_train) # elif model=='lr': # clf=LogisticRegression().fit(X_train,y_train) clf = RandomForestClassifier().fit(X_train, y_train) # score=np.mean(cross_val_score(svm,X,y,cv=10)) # print(groupdic[groupid],modeldic[modelid],np.shape(X_test),np.shape(X_train)) tmp_score = clf.score(X_test, y_test) acc_arr.append(tmp_score) # print('accuracy='+','+str(qwer)+'\n') return np.mean(acc_arr), chosen_names
if cross_test:
    this_scores = cross_val_score(pipe, Sample, target, cv=10)
    print('10-fold cross-validation:\n')
    print('10-fold cross-validation accuracy', this_scores.view())
    print('10-fold cross-validation mean accuracy', this_scores.mean())
    print('10-fold cross-validation accuracy variance', this_scores.std())
    print('-----------------------------------------------')
    group = sio.loadmat(
        'D:\\1-embed\\4-Serial_GUI\\分类模型训练\\tmp\\group.mat')
    group = group['group_individual']
    group = group.reshape(-1)
    print(group)
    # gkf.split(X, y, groups=group)
    gkf = GroupKFold(n_splits=11)
    for train, test in gkf.split(Sample, target, groups=group):
        print("train-%s test-%s" % (group[train], group[test]))
    this_scores = cross_val_score(pipe, Sample, target, groups=group, cv=gkf)
    print('5 random training-set splits:\n')
    print('Random cross-validation accuracy', this_scores.view())
    print('Random cross-validation mean accuracy', this_scores.mean())
    print('Random cross-validation accuracy variance', this_scores.std())
if save:
    # use dump() to serialize the data to a file
    fw = open('D:\\1-embed\\4-Serial_GUI\\分类模型训练\\tmp\\ModelFile.txt', 'wb')
def validate_train(model, X, y, groups, oversample=True, n_splits=5, dump=DUMP_DEFAULT, model_folder=MODEL_FOLDER_DEFAULT, metric=f1_score, verbose=False, num_importance=20): kf = GroupKFold(n_splits=n_splits) all_y = [] all_predicted = [] for train_index, test_index in kf.split(X, y, groups): X_train, y_train = X[train_index], y[train_index] X_test, y_test = X[test_index], y[test_index] if oversample: X_tmp, y_tmp = oversample_data(X_train, y_train, verbose=verbose) model.train_model(X_tmp, y_tmp) else: model.train_model(X_train, y_train) pred = model.predict_batch(X_test) all_predicted.extend(pred) all_y.extend(y_test) print(">>> MODEL: ", model.model_name) print("Params:", model.get_description()) all_y = model.encode2idx(all_y) if metric is f1_score: result = metric(all_y, all_predicted, average=None) else: result = metric(all_y, all_predicted) if dump: if oversample: X_tmp, y_tmp = oversample_data(X, y, verbose=verbose) model.train_model(X_tmp, y_tmp) else: model.train_model(X, y) print("FEATURE_IMPORTANCE") importances = model.get_feature_importance() labels = model.get_labels() print("=== labels {} ===".format(labels)) if importances is not None: for imp_line, label in zip(importances, labels): print("\nLABEL: ", label) print("*" * 20) print("\n --- TOP {} most important --- \n".format( num_importance)) for n, val in imp_line[:num_importance]: print("{}\t{}".format(n, np.round(val, 3))) print("\n --- TOP {} anti features --- \n".format( num_importance)) for n, val in imp_line[::-1][:num_importance]: print("{}\t{}".format(n, np.round(val, 3))) model.dump_model(os.path.join(model_folder, model.model_name)) print("== MODEL DUMPED ==") print("classif_report:\n", classification_report(all_y, all_predicted)) log_results(all_y, all_predicted, model.model_name or model.model_name, model) return result
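# The routine above assumes an `oversample_data` helper; a plausible minimal
# version (a sketch, not necessarily the original) randomly duplicates
# minority-class rows until every class matches the majority count. Note that
# above it is applied only to the training folds, so duplicated samples never
# leak into the held-out fold.
import numpy as np

def oversample_data(X, y, verbose=False, random_state=0):
    rng = np.random.RandomState(random_state)
    X, y = np.asarray(X), np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)
    n_max = counts.max()
    idx = []
    for cls, count in zip(classes, counts):
        cls_idx = np.where(y == cls)[0]
        idx.extend(cls_idx)
        if count < n_max:
            # resample this class with replacement up to the majority count
            idx.extend(rng.choice(cls_idx, size=n_max - count, replace=True))
    idx = rng.permutation(np.array(idx, dtype=int))
    if verbose:
        print(f"Oversampled {len(y)} -> {len(idx)} samples")
    return X[idx], y[idx]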
def run_cv_model_by_batch(train, test, splits, batch_col, feats, sample_submission, nn_epochs, nn_batch_size): seed_everything(SEED) K.clear_session() config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1) sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config) tf.compat.v1.keras.backend.set_session(sess) oof_ = np.zeros((len(train), 11)) # build out of folds matrix with 11 columns, they represent our target variables classes (from 0 to 10) preds_ = np.zeros((len(test), 11)) target = ['open_channels'] group = train['group'] kf = GroupKFold(n_splits=5) splits = [x for x in kf.split(train, train[target], group)] oof_pd = pd.DataFrame() oof_pd['open_channels'] = train['open_channels'] new_splits = [] for sp in splits: new_split = [] new_split.append(np.unique(group[sp[0]])) new_split.append(np.unique(group[sp[1]])) new_split.append(sp[1]) new_splits.append(new_split) # pivot target columns to transform the net to a multiclass classification estructure (you can also leave it in 1 vector with sparsecategoricalcrossentropy loss function) tr = pd.concat([pd.get_dummies(train.open_channels), train[['group']]], axis=1) tr.columns = ['target_'+str(i) for i in range(11)] + ['group'] target_cols = ['target_'+str(i) for i in range(11)] train_tr = np.array(list(tr.groupby('group').apply(lambda x: x[target_cols].values))).astype(np.float32) train = np.array(list(train.groupby('group').apply(lambda x: x[feats].values))) test = np.array(list(test.groupby('group').apply(lambda x: x[feats].values))) for n_fold, (tr_idx, val_idx, val_orig_idx) in enumerate(new_splits[0:], start=0): train_x, train_y = train[tr_idx], train_tr[tr_idx] valid_x, valid_y = train[val_idx], train_tr[val_idx] print(f'Our training dataset shape is {train_x.shape}') print(f'Our validation dataset shape is {valid_x.shape}') train_x_new = train gc.collect() shape_ = (None, train_x.shape[2]) # input is going to be the number of feature we are using (dimension 2 of 0, 1, 2) model = Classifier(shape_) # using our lr_schedule function cb_lr_schedule = LearningRateScheduler(lr_schedule) # Use Early-Stopping callback_early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='auto') model.fit(train_x,train_y, epochs = nn_epochs, callbacks = [callback_early_stopping,cb_lr_schedule, MacroF1(model, valid_x, valid_y)], # adding custom evaluation metric for each epoch batch_size = nn_batch_size,verbose = 2, validation_data = (valid_x,valid_y)) preds_f = model.predict(valid_x) f1_score_ = f1_score(np.argmax(valid_y, axis=2).reshape(-1), np.argmax(preds_f, axis=2).reshape(-1), average = 'macro') # need to get the class with the biggest probability print(f'Training fold {n_fold + 1} completed. macro f1 score : {f1_score_ :1.5f}') preds_f = preds_f.reshape(-1, preds_f.shape[-1]) oof_[val_orig_idx,:] += preds_f te_preds = model.predict(test) te_preds = te_preds.reshape(-1, te_preds.shape[-1]) preds_ += te_preds / SPLITS del train_x, train_y, valid_x, valid_y # calculate the oof macro f1_score f1_score_ = f1_score(np.argmax(train_tr, axis = 2).reshape(-1), np.argmax(oof_, axis = 1), average = 'macro') # axis 2 for the 3 Dimension array and axis 1 for the 2 Domension Array (extracting the best class) print(f'Training completed. 
oof macro f1 score : {f1_score_:1.5f}') sample_submission['open_channels'] = np.argmax(preds_, axis = 1).astype(int) oof_pd['open_channels_pred'] = np.argmax(oof_, axis = 1).astype(int) sample_submission.to_csv('submission_wavenet.csv', index=False, float_format='%.4f')
# ## Make Folds

# In[13]:

df_train = pd.read_csv(f'{datadir}train.csv')  # load train.csv as a dataframe
df_train['file_path'] = df_train.image.apply(
    lambda x: os.path.join(f"{datadir}train_images", x))  # add the image file paths

# In[14]:

# split the data into 5 folds and record each row's fold in df_train
gkf = GroupKFold(n_splits=5)
df_train['fold'] = -1
for fold, (train_idx, valid_idx) in enumerate(
        gkf.split(df_train, None, df_train.label_group)):
    df_train.loc[valid_idx, 'fold'] = fold

# In[15]:

df_train.head()

# In[16]:

# label-encode label_group
# further reading: https://blog.csdn.net/weixin_43172660/article/details/84886470
le = LabelEncoder()
df_train.label_group = le.fit_transform(df_train.label_group)

# ## Transforms
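# A sanity check worth running after the fold assignment above (shown here on
# toy data standing in for df_train): every label_group should map to exactly
# one fold.
import pandas as pd
from sklearn.model_selection import GroupKFold

toy = pd.DataFrame({"label_group": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]})
toy["fold"] = -1
for fold, (_, valid_idx) in enumerate(
        GroupKFold(n_splits=5).split(toy, None, toy.label_group)):
    toy.loc[valid_idx, "fold"] = fold
assert (toy.groupby("label_group")["fold"].nunique() == 1).all()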
ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold') ax.set_xlabel(name_param_1, fontsize=16) ax.set_ylabel('CV Average Score', fontsize=16) ax.legend(loc="best", fontsize=15) ax.grid(True) # load the data data = sgl.load_watch() X = data['X'] y = data['y'] g = data['subject'] # use subject id to group folds splitter = GroupKFold(n_splits=3) cv = splitter.split(X, y, groups=g) # create a feature representation pipeline pipe = sgl.Pype([('seg', sgl.SegmentX()), ('features', sgl.FeatureRep()), ('scaler', StandardScaler()), ('rf', RandomForestClassifier())]) # create a parameter dictionary using the sklearn API # note that if you want to set a parameter to a single value, it will still need to be as a list par_grid = { 'seg__width': [50, 100, 200], 'seg__overlap': [0., 0.5], 'rf__n_estimators': [20] }
logger.info('{} Features after dropping null columns'.format( len(X_type.columns))) # Start training for type bond_start = timer() fold_count = 1 # Train the model # X_type = X.loc[X['type'] == bond_type] # y_type = y.iloc[X_type.index] # X_test_type = X_test.loc[X_test['type'] == bond_type] mol_group_type = mol_group.loc[mol_group['type'] == bond_type]['molecule_name'] oof = np.zeros(len(X_type)) prediction_type = np.zeros(len(X_test_type)) bond_scores = [] for fold_n, (train_idx, valid_idx) in enumerate( folds.split(X_type, groups=mol_group_type)): if MODEL_TYPE == 'lgbm': fold_start = timer() logger.info('Running Type {} - Fold {} of {}'.format( bond_type, fold_count, folds.n_splits)) X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx] y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx] model = lgb.LGBMRegressor(**lgb_params, n_estimators=N_ESTIMATORS, n_jobs=N_THREADS) model.fit( X_train.drop('type', axis=1), y_train, eval_set=[ #(X_train.drop('type', axis=1), y_train), (X_valid.drop('type', axis=1), y_valid) ],
# Step 2: Collect data for running CRF classifier true_iob_dir = join(LOCAL_DIR, 'train', 'iob') data = collect_crf_data(true_iob_dir, base_feats_dir, word_feats_dir) # Step 3: Create folds # create folds from complete texts only (i.e. instances of the same text # are never in different folds) # TODO How to set seed for random generator? group_k_fold = GroupKFold(n_splits=5) # use same split for all three entities splits = list( group_k_fold.split(data['feats'], data['Material'], data['filenames'])) # Step 4: Run CRF classifier crf = PruneCRF(c1=0.1, c2=0.1, all_possible_transitions=True) pred = {} for ent in ENTITIES: pred[ent] = cross_val_predict(crf, data['feats'], data[ent], cv=splits) # Report scores directly on I and B tags, # disregard 'O' because it is by far the most frequent class print('\n' + ent + ':\n') print( flat_classification_report(data[ent], pred[ent], digits=3, labels=('B', 'I')))
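# A minimal, self-contained sketch of the "materialize one grouped split and
# reuse it for several targets" pattern used above; DummyClassifier and the toy
# arrays stand in for the CRF and the real features/labels, and the entity
# names are illustrative.
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GroupKFold, cross_val_predict

rng = np.random.RandomState(0)
X = np.arange(20).reshape(-1, 1)
groups = np.repeat(np.arange(5), 4)  # 5 "documents", 4 tokens each
targets = {"Material": rng.randint(0, 2, 20), "Process": rng.randint(0, 2, 20)}

# Materializing the generator as a list lets the same folds be reused per entity.
splits = list(GroupKFold(n_splits=5).split(X, targets["Material"], groups))
pred = {name: cross_val_predict(DummyClassifier(strategy="most_frequent"),
                                X, y, cv=splits)
        for name, y in targets.items()}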
def test_group_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_groups = 15 n_samples = 1000 n_splits = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed groups = rng.randint(0, n_groups, n_samples) ideal_n_groups_per_fold = n_samples // n_splits len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = GroupKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # Construct the test data groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']) n_groups = len(np.unique(groups)) n_samples = len(groups) n_splits = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_groups_per_fold = n_samples // n_splits X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # groups can also be a list cv_iter = list(lkf.split(X, y, groups.tolist())) for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups), cv_iter): assert_array_equal(train1, train2) assert_array_equal(test1, test2) # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", next, GroupKFold(n_splits=3).split(X, y, groups))