def _trn_val_split(self, split_type, split_num, cell_type):
    """Return the ``id_code`` values of the first train/validation split.

    The training table is read from ``./mnt/inputs/origin/train.csv.zip``
    and split with the requested strategy; only the first fold is used.

    Args:
        split_type: ``'gkf'`` (``gkf`` split on ``sirna`` with ``well``
            passed as the third ``split`` argument), ``'skf'`` (``skf`` on
            ``sirna``, shuffled with seed 71) or ``'cskf'`` (delegates to
            ``cskf`` with shuffle and seed 71).
        split_num: Number of folds handed to the splitter.
        cell_type: ``'ALL'`` to keep every row, or one of ``'HEPG2'``,
            ``'U2OS'``, ``'HUVEC'``, ``'RPE'`` to keep only the rows whose
            ``experiment`` value contains that string.

    Returns:
        tuple: ``(trn_ids, val_ids)``, the ``id_code`` columns of the train
        and validation parts of the first fold.

    Raises:
        ValueError: If ``split_type`` or ``cell_type`` is not supported, or
            if the splitter yields no fold at all.
    """
    # Validate both arguments up front so a typo fails immediately instead
    # of after the CSV has been loaded and the splitter built.
    if split_type not in ('gkf', 'skf', 'cskf'):
        raise ValueError(f'invalid split type: {split_type}')
    if cell_type not in ('ALL', 'HEPG2', 'U2OS', 'HUVEC', 'RPE'):
        raise ValueError(f'invalid cell type {cell_type}')

    trn_df = pd.read_csv('./mnt/inputs/origin/train.csv.zip')

    if split_type == 'gkf':
        fold = gkf(split_num).split(
            trn_df['id_code'], trn_df['sirna'], trn_df['well'])
    elif split_type == 'skf':
        fold = skf(split_num, shuffle=True, random_state=71).split(
            trn_df['id_code'], trn_df['sirna'])
    else:  # 'cskf'
        fold = cskf(trn_df, trn_df['sirna'], split_num,
                    shuffle=True, random_state=71)

    # Only the first fold is needed; make the "no folds" case explicit
    # rather than failing later on an unbound local.
    first_split = next(iter(fold), None)
    if first_split is None:
        raise ValueError('splitter produced no folds')
    trn_idx, val_idx = first_split

    trn_part = trn_df.iloc[trn_idx]
    val_part = trn_df.iloc[val_idx]
    if cell_type != 'ALL':
        trn_part = trn_part[trn_part.experiment.str.contains(cell_type)]
        val_part = val_part[val_part.experiment.str.contains(cell_type)]
    return trn_part.id_code, val_part.id_code
def perform_grid_search(train_csv_path, headers, num_heroes):
    """Run a small hyper-parameter grid search and print the results.

    The first 10000 rows of ``train_csv_path`` are loaded with ``headers``
    as column names, extended through ``enhance_features`` and used to fit
    a ``gscv`` search over a ``gdc()`` estimator with 5-fold shuffled
    ``skf`` splitting (seed 0). The target column is ``'score'``.

    Args:
        train_csv_path: Path of the header-less training CSV.
        headers: Column names for the CSV; ``headers[1:4]`` are used as
            features in addition to the enhanced ones.
        num_heroes: Forwarded to ``enhance_features``.

    Returns:
        None. Per-setting scores, the best score and the best parameters
        are printed to stdout.
    """
    df = pd.read_csv(train_csv_path, names=headers, nrows=10000)
    print('Number of observations in the training data:', len(df))

    enhanced_features = enhance_features(headers, df, None, num_heroes)
    combined_features = enhanced_features + headers[1:4]

    tuned_parameters = {
        'n_estimators': [50, 100],
        'max_depth': [6, 8],
        'subsample': [0.5],
        'learning_rate': [0.01, 0.05],
    }
    splitter = skf(5, shuffle=True, random_state=0)
    clf = gscv(gdc(), tuned_parameters, cv=splitter, n_jobs=-1)
    clf.fit(df[combined_features], df['score'])

    print("Grid scores on development set:")
    print()
    results = clf.cv_results_
    scored = zip(results['mean_test_score'],
                 results['std_test_score'],
                 results['params'])
    # Sort on the numeric scores only. Without a key, two settings that tie
    # on (mean, std) make sorted() compare the params dicts, which raises
    # TypeError on Python 3. The sort is stable, so ties keep grid order.
    for mean, std, params in sorted(scored, key=lambda row: (row[0], row[1])):
        print("%0.4f (+/-%0.04f) for %r" % (mean, std * 2, params))
    print()

    # ANSI escape codes highlight the best score on the terminal.
    print('Best score: ' + '\x1b[1;33;40m', clf.best_score_, '\x1b[0m')
    print('Best parameters set found on development set:')
    print()
    print(clf.best_params_)
def split(self, x, y, group=None):
    """Build the train/validation folds selected by ``self.split_type``.

    Supported values of ``self.split_type``:

    * ``'skf'``: ``skf`` with ``self.split_num``, ``self.shuffle`` and
      ``self.random_state``; ``group`` is ignored (a warning is logged when
      one is given).
    * ``'gkf'``: ``gkf`` with ``self.split_num``, split on ``group``.
    * ``'abhishek5'`` / ``'abhishek8'``: 5 / 8 predefined folds read from
      the CSV at ``self.abhishek5`` / ``self.abhishek8``, which must have a
      ``kfold`` column; ``x`` and ``y`` are not used.

    Args:
        x: Samples, forwarded to the scikit-learn style splitters.
        y: Targets, forwarded to the scikit-learn style splitters.
        group: Group labels, only used by ``'gkf'``.

    Returns:
        For ``'skf'``/``'gkf'`` whatever the splitter's ``split`` returns;
        for the predefined variants a list of
        ``(train_index_list, valid_index_list)`` tuples.

    Raises:
        NotImplementedError: If ``self.split_type`` is not supported.
    """

    def predefined_folds(csv_path, n_folds):
        # Fold ``i`` validates on the rows labelled ``kfold == i`` and
        # trains on every other row.
        fold_df = pd.read_csv(csv_path)
        kfold = fold_df['kfold']
        return [
            (fold_df.index[kfold != i].tolist(),
             fold_df.index[kfold == i].tolist())
            for i in range(n_folds)
        ]

    if self.split_type == 'skf':
        if group is not None:
            # Logger.warn is a deprecated alias; warning() is the
            # supported spelling.
            self.logger.warning('the group is set for skf, '
                                'which is not used.')
        fold = skf(self.split_num,
                   shuffle=self.shuffle,
                   random_state=self.random_state).split(x, y)
    elif self.split_type == 'gkf':
        fold = gkf(self.split_num).split(x, y, group)
    elif self.split_type == 'abhishek5':
        fold = predefined_folds(self.abhishek5, 5)
    elif self.split_type == 'abhishek8':
        fold = predefined_folds(self.abhishek8, 8)
    else:
        raise NotImplementedError(f'split_type: {self.split_type}')
    return fold
def CellwiseStratifiedKFold(X_df, y, n_splits=5, shuffle=False,
                            random_state=71, max_folds=3):
    """Stratified K-fold computed separately per cell line, then merged.

    The cell line of a row is the part of ``X_df.experiment`` before the
    first ``'-'``. Each cell line is split on its own with ``skf`` and the
    resulting per-cell indices are mapped back to positions in ``X_df`` and
    concatenated fold by fold.

    Args:
        X_df: DataFrame with an ``experiment`` string column.
        y: pandas Series of labels, indexable with the same boolean mask as
            ``X_df`` (``value_counts`` is called on it).
        n_splits: Requested number of folds. A cell line whose rarest class
            has fewer samples than this is split into that many folds
            instead, so it contributes nothing to the later folds.
        shuffle: Whether each per-cell splitter shuffles.
        random_state: Seed for shuffling; ignored when ``shuffle`` is False.
        max_folds: Only the first ``max_folds`` folds are populated; the
            default of 3 reproduces the previously hard-coded cap. Pass
            ``None`` to populate every fold.

    Returns:
        list: ``n_splits`` entries ``[train_positions, valid_positions]``.
        Populated entries hold numpy arrays of positional indices into
        ``X_df``; unpopulated entries stay ``[[], []]``.
    """
    # Recent scikit-learn raises ValueError when a seed is given while
    # shuffle=False, which is exactly this function's default combination.
    seed = random_state if shuffle else None

    cells = X_df.experiment.apply(lambda x: x.split('-')[0])
    whole_index = np.arange(len(X_df))

    cell_whole_indexes = []
    cell_folds = []
    for cell in np.unique(cells):
        in_cell = cells == cell
        cell_df = X_df[in_cell]
        cell_y = y[in_cell]
        cell_whole_indexes.append(whole_index[in_cell.values])

        # Stratification needs at least one sample of every class per fold.
        cell_n_splits = min(n_splits, int(cell_y.value_counts().min()))
        splitter = skf(n_splits=cell_n_splits,
                       shuffle=shuffle,
                       random_state=seed)
        cell_folds.append(splitter.split(cell_df, cell_y))

    fold = [[[], []] for _ in range(n_splits)]
    for cell_whole_index, cell_fold in zip(cell_whole_indexes, cell_folds):
        for i, (trn_idx, val_idx) in enumerate(cell_fold):
            if max_folds is not None and i >= max_folds:
                break
            # Translate per-cell positions back to positions in X_df.
            fold[i][0].append(cell_whole_index[trn_idx])
            fold[i][1].append(cell_whole_index[val_idx])

    for parts in fold:
        if len(parts[0]) > 0:
            parts[0] = np.concatenate(parts[0])
            parts[1] = np.concatenate(parts[1])
    return fold
def test(emb, label_mat, emb_IDmap, label_IDmap, n_splits, random_state,
         shuffle):
    """Evaluate an embedding with per-class cross-validated classification.

    For every column of ``label_mat`` an L2-regularised logistic regression
    is fitted on each of the ``n_splits`` stratified folds, and the
    held-out labels and decision scores are collected.

    Args:
        emb: Embedding array, indexed by row.
        label_mat: Label matrix with one column per class.
        emb_IDmap: Mapping from ID to row of ``emb``.
        label_IDmap: Iterable of IDs; its iteration order selects the rows
            of ``emb`` (presumably matching the row order of ``label_mat``;
            verify against the caller).
        n_splits: Number of folds.
        random_state: Seed handed to the splitter.
        shuffle: Whether the splitter shuffles.

    Returns:
        tuple: ``(y_true_all, y_pred_all)``, two lists with one array per
        class holding the concatenated held-out labels and the matching
        decision-function scores.
    """
    n_classes = label_mat.shape[1]
    rows = [emb_IDmap[node_id] for node_id in list(label_IDmap)]
    x = emb[rows]

    splitter = skf(n_splits=n_splits, random_state=random_state,
                   shuffle=shuffle)
    mdl = LogReg(penalty='l2', solver='lbfgs', warm_start=False,
                 max_iter=1000)

    progress = "Testing class #{:>4d},\tfold {:>2d} / {:<2d}"
    y_true_all, y_pred_all = [], []
    for class_idx in range(n_classes):
        y = label_mat[:, class_idx]
        # Seed with the same empty arrays the accumulation starts from so
        # the resulting dtypes are unchanged.
        true_chunks = [np.array([], dtype=bool)]
        score_chunks = [np.array([])]
        folds = enumerate(splitter.split(y, y), start=1)
        for fold_no, (train_idx, test_idx) in folds:
            print(progress.format(class_idx + 1, fold_no, n_splits),
                  flush=True, end='\r')
            mdl.fit(x[train_idx], y[train_idx])
            true_chunks.append(np.ravel(y[test_idx]))
            score_chunks.append(
                np.ravel(mdl.decision_function(x[test_idx])))
        y_true_all.append(np.concatenate(true_chunks))
        y_pred_all.append(np.concatenate(score_chunks))
    # Finish the carriage-return progress line.
    print('')
    return y_true_all, y_pred_all
def _trn_val_split(self, split_type, split_num):
    """Return the ``id_code`` values of the first train/validation split.

    The training table is read from ``./mnt/inputs/origin/train.csv.zip``
    and split with the requested strategy; only the first fold is used.

    Args:
        split_type: ``'gkf'`` (``gkf`` split on ``sirna`` with ``well``
            passed as the third ``split`` argument), ``'skf'`` (``skf`` on
            ``sirna``, shuffled with seed 71) or ``'cskf'`` (delegates to
            ``cskf`` with shuffle and seed 71).
        split_num: Number of folds handed to the splitter.

    Returns:
        tuple: ``(trn_ids, val_ids)``, the ``id_code`` columns of the train
        and validation parts of the first fold.

    Raises:
        ValueError: If ``split_type`` is not supported, or if the splitter
            yields no fold at all.
    """
    # Reject an unknown strategy before paying for the CSV load.
    if split_type not in ('gkf', 'skf', 'cskf'):
        raise ValueError(f'invalid split type: {split_type}')

    trn_df = pd.read_csv('./mnt/inputs/origin/train.csv.zip')

    if split_type == 'gkf':
        fold = gkf(split_num).split(
            trn_df['id_code'], trn_df['sirna'], trn_df['well'])
    elif split_type == 'skf':
        fold = skf(split_num, shuffle=True, random_state=71).split(
            trn_df['id_code'], trn_df['sirna'])
    else:  # 'cskf'
        fold = cskf(trn_df, trn_df['sirna'], split_num,
                    shuffle=True, random_state=71)

    # Only the first fold is needed; make the "no folds" case explicit
    # rather than failing later on an unbound local.
    first_split = next(iter(fold), None)
    if first_split is None:
        raise ValueError('splitter produced no folds')
    trn_idx, val_idx = first_split

    return trn_df.iloc[trn_idx].id_code, trn_df.iloc[val_idx].id_code
def __init__(self, Cs=500, cv=10, sampler='skf', solver='liblinear',
             **kwargs):
    """Configure an L1-penalised model and its cross-validation sampler.

    Args:
        Cs: Stored as ``self.Cs``.
        cv: Number of splits; stored as ``self.cv_folds`` and used to build
            the splitter stored in ``self.cv``.
        sampler: Which splitter to build: ``'skf'``, ``'sss'``, ``'kf'`` or
            ``'ss'`` (per the error message: stratified K-fold, stratified
            shuffle split, and their non-stratified counterparts).
        solver: Stored as ``self.solver``.
        **kwargs: Extra attributes set on the instance after the defaults,
            so they may override any of them.

    Raises:
        ValueError: If ``sampler`` is not one of the supported names.
    """
    # Zero-argument super(): ``super(self.__class__, self)`` resolves to the
    # same class again when called on a subclass instance and recurses
    # without end.
    super().__init__()
    self.penalty = 'l1'
    self.solver = solver
    self.Cs = Cs
    self.sampler = sampler
    self.cv_folds = cv

    if self.sampler == 'skf':
        self.cv = skf(n_splits=self.cv_folds)
    elif self.sampler == 'sss':
        self.cv = sss(n_splits=self.cv_folds)
    elif self.sampler == 'kf':
        self.cv = kf(n_splits=self.cv_folds)
    elif self.sampler == 'ss':
        self.cv = ss(n_splits=self.cv_folds)
    else:
        # The message lists the keys that are actually accepted above.
        raise ValueError(
            'Selected sampler is not valid. Please choose '
            '"skf" for stratified K-fold or "sss" for '
            'stratified shuffle split. Also "kf" and "ss" for '
            'the respective non-stratified methods.')

    for k, v in kwargs.items():
        setattr(self, k, v)

    # Data slots are reset last, so kwargs cannot pre-populate them.
    self.x = None
    self.y = None
criterion='entropy', max_depth=20, max_leaf_nodes=50, n_jobs=-1, random_state=1) final_model.fit(x_train, y_train) y_pred = final_model.predict(x_test) # k-fold cross validation from sklearn.model_selection import cross_val_score from sklearn.model_selection import StratifiedKFold as skf score = cross_val_score(estimator=final_model, scoring='f1', X=x_train, y=y_train, cv=skf(n_splits=10)) f1 = score.mean() ############################################################################# # grid search from sklearn.model_selection import GridSearchCV parameters = [{ 'n_estimators': [50, 100, 150, 250], 'max_depth': [10, 20, 40], 'max_features': (5, 10, 20, 30), 'max_leaf_nodes': [20, 40, 60] }] grid_search = GridSearchCV(iid=False, estimator=final_model, param_grid=parameters,
def use_splitted_data_set(train, combined_features):
    """Return the training part of the first of five stratified folds.

    The folds are built with ``skf`` (5 splits, shuffled, seed 0) from the
    ``combined_features`` columns, stratified on ``train['score']``.

    Args:
        train: DataFrame holding the feature columns and a ``score`` column.
        combined_features: Column names used as features for the split.

    Returns:
        The rows of ``train`` selected by the first fold's training indices.
    """
    features = train[combined_features]
    target = train['score']
    folds = skf(5, shuffle=True, random_state=0).split(features, target)
    first_train_idx = next(folds)[0]
    return train.iloc[first_train_idx]
max_leaf_nodes=50, n_jobs=-1, random_state=1) model.fit(X1, y1) y1_pred = model.predict(X1_test) from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score cm = confusion_matrix(y1, model.predict(X1)) accuracy_0 = accuracy_score(y1_test, y1_pred) f1_0 = f1_score(y1_test, y1_pred) from sklearn.model_selection import cross_val_score from sklearn.model_selection import StratifiedKFold as skf score_base = cross_val_score(estimator=model, X=X1, y=y1, cv=skf(n_splits=20)) accuracy_base = score_base.mean() pred_0 = model.predict(x_pred) output_0 = pd.DataFrame({ 'PassengerId': da_test.PassengerId, 'Survived': pred_0 }) output_0.to_csv('my_submission_base.csv', index=False) ##### (SVC) ##### from sklearn.svm import SVC classifier_svc = SVC(kernel='rbf', gamma='auto') classifier_svc.fit(x_train, y_train) y_pred_svc = classifier_svc.predict(x_test)