def fit_predict(self, x_train, y_train, x_predict):
    """Use local regression to predict values for unknown data.

    For every row of ``x_predict`` the neighbours of that row are looked up
    with ``self.neighbors``, ``self.model`` is refit on only those training
    rows, and the refit model predicts that single row.

    Arguments:
    x_train = The training data spectra.
    y_train = The values of the quantity being predicted for the training data
    x_predict = The unknown spectra for which y needs to be predicted.

    Returns:
    A ``(predictions, coeffs, intercepts)`` tuple of lists. Entry ``i`` of each
    list is the prediction for row ``i`` of ``x_predict`` and the ``coef_`` /
    ``intercept_`` of the local model that produced it.
    """
    self.neighbors.fit(x_train)
    predictions = []
    coeffs = []
    intercepts = []
    for i in range(x_predict.shape[0]):
        print('Predicting spectrum ' + str(i + 1))
        x_temp = np.array(x_predict[i])
        # Only the second return value (the neighbour indices) is needed.
        _, ind = self.neighbors.kneighbors([x_temp])
        # A single query is passed, so squeeze drops the length-1 query axis
        # from the fancy-indexed result.
        x_train_local = np.squeeze(x_train[ind])
        y_train_local = np.squeeze(y_train[ind])
        self.model.fit(x_train_local, y_train_local)
        predictions.append(self.model.predict([x_temp])[0])
        coeffs.append(self.model.coef_)
        intercepts.append(self.model.intercept_)
    return predictions, coeffs, intercepts
def _get_mse_profiling(self, x, y, alphas=None):
    """Calculate the out-of-fold prediction error via ``self._calc_mse``.

    Use GroupKFold where a group is a combination of input size and number
    of workers. The prediction of a group is done when it is out of the
    training set.

    Args:
        x: feature frame; its index is used to look up rows of
            ``self._groups``.
        y: targets, positionally aligned with ``x``.
        alphas: candidate regularization strengths, forwarded unchanged to
            ``self._choose_alpha``.

    Returns:
        Whatever ``self._calc_mse`` returns for ``y`` and the out-of-fold
        predictions sorted by row position.
    """
    # Training set is 2/3 of the data
    groups = self._groups.loc[x.index]
    cv = GroupKFold(n_splits=3)
    fold_preds = []
    for train_ix, test_ix in cv.split(x, groups=groups):
        # train_ix and test_ix starts from 0, so we use iloc
        x_train, y_train = x.iloc[train_ix], y.iloc[train_ix]
        x_test = x.iloc[test_ix]
        # Choose best alpha value for regularization based on training set
        lm = self._choose_alpha(x_train, y_train, alphas)
        lm.fit(x_train, y_train)
        # Predictions are indexed by row position (test_ix), not by x.index.
        fold_preds.append(pd.DataFrame(lm.predict(x_test), index=test_ix))
    # DataFrame.append was removed in pandas 2.0; one concat keeps the same
    # duplicate-index check the original requested per append.
    preds = pd.concat(fold_preds, verify_integrity=True)
    return self._calc_mse(y, preds.sort_index())
def _split(self, x, y):
    """Yield 3-fold grouped train/test partitions of ``x`` and ``y``.

    Groups are read from ``self._groups`` using the index labels of ``x``.
    Each item yielded is ``(x_train, y_train, x_test, test_ix)``, where
    ``test_ix`` holds the row positions of the held-out fold.
    """
    splitter = GroupKFold(n_splits=3)
    fold_groups = self._groups.loc[x.index]
    # The splitter reports row positions (starting at 0), hence iloc.
    for fit_pos, held_out_pos in splitter.split(x, groups=fold_groups):
        yield (x.iloc[fit_pos], y.iloc[fit_pos], x.iloc[held_out_pos],
               held_out_pos)
def plot_group_kfold():
    """Draw a schematic of 3-split ``GroupKFold`` on 12 grouped samples.

    One row of boxes is drawn per CV split (grey = test, white = training)
    and a final row shows each sample's group id. The figure is created on
    the current pyplot state; nothing is returned.
    """
    from sklearn.model_selection import GroupKFold
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    plt.figure(figsize=(10, 2))
    plt.title("GroupKFold")

    axes = plt.gca()
    axes.set_frame_on(False)

    # n_folds is used below as the number of box columns (one per sample).
    n_folds = 12
    n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    cv = GroupKFold(n_splits=3)
    # mask[i, s] is 1 if sample s trains in split i, 2 if it is tested.
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv.split(range(12), groups=groups)):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch
        # `y=` replaces the legacy `bottom=` keyword, which current
        # matplotlib no longer accepts as the bar position for barh.
        boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6,
                          color=colors, hatch="//", edgecolor="k",
                          align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    # Extra row (below the splits once the axis is inverted) for group ids.
    axes.barh(y=[n_iter] * n_folds, width=[1 - 0.1] * n_folds,
              left=np.arange(n_folds) * n_samples_per_fold, height=.6,
              color="w", edgecolor='k', align="edge")

    for i in range(12):
        axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" % groups[i],
                  horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
    # Legend handles come from the last column of boxes drawn above.
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"],
               loc=(1, .3))
    plt.tight_layout()
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    """Compare weighted ``RidgeCV`` (no ``cv``) with explicit grouped k-fold.

    Integer sample weights are emulated by repeating each row ``weight``
    times and running ``GroupKFold`` with one group per original row, so a
    row and all its copies are always held out together. The selected alpha,
    per-sample errors, coefficients and intercept of both fits must agree.

    The arguments are supplied by the test's parametrization (not visible in
    this block).
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    # Shift the random weights so the smallest is 1, then truncate to ints so
    # they can be used as repeat counts.
    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    # indices[k] is the original row that tiled row k is a copy of.
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    # One fold per original row: every copy of a row shares its group.
    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    # cv.split returns a one-shot generator, so it is created again here.
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    # Sum the squared errors of all copies of each original row.
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    # cv_values_ has the alpha axis last; pick the slice for the alpha the
    # k-fold search selected.
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def test_knn_rbf_groupkfold():
    """Run forward feature selection with pre-computed ``GroupKFold`` splits.

    Iris class 0 becomes group ``attribute_A`` with label 0; classes 1 and 2
    are randomly assigned to ``attribute_B`` / ``attribute_C`` and given a
    random 0/1 label. The expected scores contain ``nan`` for one fold, so
    the scorer under test must tolerate that fold.
    """
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    forest = RandomForestClassifier(n_estimators=100, random_state=123)

    groups = []
    y_new = []
    for target in iris['target']:
        if target == 0:
            groups.append('attribute_A')
            y_new.append(0)
        if target == 1 or target == 2:
            # Two draws per sample, in this order, so the random stream
            # (and therefore the expected scores below) is unchanged.
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)

    # Value comparison: `is 1` only worked through CPython's small-int cache.
    y_new_bool = [item == 1 for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]),
            'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]),
            'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}
    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
def fit_and_predict(self, X_train, X_test, y_train, groups):
    """Train one LightGBM model per CV fold and average the test predictions.

    The splitter is chosen by ``self.cv`` (``"mcs"``, ``"group"`` or
    ``"stratified"``). Targets are treated as log1p-transformed: out-of-fold
    predictions are clipped at 0 in that space, while fold scores and test
    predictions are computed after ``np.expm1``.

    Args:
        X_train: training features (DataFrame; rows selected with ``iloc``).
        X_test: test features passed to ``clf.predict``.
        y_train: training targets; the ``"Global_Sales_log1p"`` column is
            binned to stratify when ``self.cv == "stratified"``.
        groups: group labels handed to the splitter for ``"mcs"``/``"group"``.

    Returns:
        ``(oof, predictions, cv_score_fold_mean)``: out-of-fold predictions in
        log1p space, fold-averaged test predictions in the original scale,
        and the mean per-fold RMSLE.

    Raises:
        ValueError: if ``self.cv`` is not one of the supported strategies.
    """
    y_to_stratify = None
    if self.cv == "mcs":
        folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
    elif self.cv == "group":
        folds = GroupKFold(n_splits=10)
    elif self.cv == "stratified":
        folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        # The target is continuous, so stratify on 7 equal-width bins of it.
        y_to_stratify = pd.cut(y_train["Global_Sales_log1p"], bins=7,
                               labels=False)
    else:
        raise ValueError(f"unsupported cv strategy: {self.cv!r}")

    if y_to_stratify is None:
        fold_iter = folds.split(X_train, groups=groups)
    else:
        # StratifiedKFold.split requires y; it was previously called without
        # it, so the stratified strategy could not run.
        fold_iter = folds.split(X_train, y_to_stratify)

    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame()
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(fold_iter):
        print("-" * 100)
        print(f"Fold {fold+1}")
        train_data = lgb.Dataset(X_train.iloc[train_idx],
                                 label=y_train.iloc[train_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx],
                               label=y_train.iloc[val_idx])
        # callbacks = [log_evaluation(self.logger, period=100)]
        clf = lgb.train(self.params,
                        train_data,
                        valid_sets=[train_data, val_data],
                        verbose_eval=100,
                        early_stopping_rounds=100)  #, feval=eval_func)
        oof_pred = clf.predict(X_train.iloc[val_idx].values,
                               num_iteration=clf.best_iteration)
        # Clip negatives (in log1p space) before storing and scoring.
        oof_pred[oof_pred < 0] = 0
        oof[val_idx] = oof_pred

        # RMSLE on the back-transformed scale.
        fold_score = mean_squared_log_error(
            np.expm1(y_train.iloc[val_idx].values),
            np.expm1(oof[val_idx]))**.5
        fold_scores.append(fold_score)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = X_train.columns.values
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type="gain")
        fold_importance_df["fold"] = fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        # Each fold contributes an equal share of the final test prediction.
        predictions += np.expm1(
            clf.predict(X_test,
                        num_iteration=clf.best_iteration)) / folds.n_splits

    # Mean gain per feature across folds, top 50.
    feature_importance_df = feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False).head(50)
    print("##### feature importance #####")
    print(feature_importance_df)

    cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
    print(f"cv_score_fold_mean: {cv_score_fold_mean}")
    return oof, predictions, cv_score_fold_mean
# Abort early if two feature files produced the same column name.
if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

# Key frame for the training rows, indexed by SK_ID_CURR and restricted to
# the row positions in tr_ind.
sub_train = utils.read_pickles(
    '../data/prev_train',
    ['SK_ID_CURR', 'SK_ID_PREV']).set_index('SK_ID_CURR').iloc[tr_ind]
sub_train['y'] = y_train.values
# cnt = number of rows sharing the same SK_ID_CURR; w = 1 / cnt.
sub_train['cnt'] = sub_train.index.value_counts()
sub_train['w'] = 1 / sub_train.cnt.values

group_kfold = GroupKFold(n_splits=NFOLD)
# Fold-group id derived from the SK_ID_CURR index value.
sub_train['g'] = sub_train.index % NFOLD

# Feature columns that also appear in utils_cat.ALL.
CAT = list(set(X_train.columns) & set(utils_cat.ALL))

# =============================================================================
# load test
# =============================================================================
# Test feature files for the first HEAD features listed in imp.
files = ('../feature_prev/test_' + imp.head(HEAD).feature + '.f').tolist()

X_test = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
                   axis=1).iloc[te_ind]

sub_test = utils.read_pickles(
    '../data/prev_test',
    ['SK_ID_CURR', 'SK_ID_PREV']).set_index('SK_ID_CURR').iloc[te_ind]
ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold') ax.set_xlabel(name_param_1, fontsize=16) ax.set_ylabel('CV Average Score', fontsize=16) ax.legend(loc="best", fontsize=15) ax.grid('on') # load the data data = sgl.load_watch() X = data['X'] y = data['y'] g = data['subject'] # use subject id to group folds splitter = GroupKFold(n_splits=3) cv = splitter.split(X, y, groups=g) # create a feature representation pipeline est = Pipeline([('features', sgl.FeatureRep()), ('scaler', StandardScaler()), ('rf', RandomForestClassifier())]) pipe = sgl.SegPipe(est) # create a parameter dictionary using the SegPipe API - which is similar to the sklearn API # # parameters passed to an estimator in the ``feed`` pipeline are keyed ``f$estimator__parameter`` # parameters passed to an estimator in the ``est`` pipeline are keyed ``e$estimator__parameter`` # # when the ``feed`` or ``est`` pipeline is not a pipeline, but just a single estimator # the parameter would be keyed f$parameter or e$parameter respectively
embeddings.append(embedding) input_numeric = Input(shape=(len(num), )) embedding_numeric = Dense(512, activation='relu')(input_numeric) inputs.append(input_numeric) embeddings.append(embedding_numeric) x = Concatenate()(embeddings) x = Dense(256, activation='relu')(x) x = Dense(128, activation='relu')(x) x = Dropout(0.5)(x) output = Dense(199, activation='softmax')(x) model = Model(inputs, output) return model n_splits = 5 kf = GroupKFold(n_splits=n_splits) score = [] for i_369, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])): print(f'Fold : {i_369+1}') X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx] X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]] X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]] model = model_396_1() model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[]) es = EarlyStopping(monitor='val_CRPS', mode='min', restore_best_weights=True, verbose=2, patience=5)
return score # affix = 'loocv' # nsplits = n_subjects # affix = '2out' # nsplits = n_subjects // 2 affix = '3out' nsplits = n_subjects // 3 # affix = 'split-half' # nsplits = n_subjects // 6 gkf = GroupKFold(n_splits=nsplits) clf = RidgeCV() # clf = DummyClassifier(strategy="most_frequent") # clf = GradientBoostingRegressor() # clf = RandomForestRegressor() scores = Parallel(n_jobs=2)(delayed(get_cv_score)(i, ward, paths, clf, gkf) for i in range(1, 1 + n_parcels)) score_imgs = [] for task in task_list: score_map = math_img('0. * img ', img=ward.labels_img_).get_data() for i in range(1, 1 + n_parcels): score_map[ward.labels_img_.get_data() == i] = scores[i - 1][task] score_img = nib.Nifti1Image(score_map, ward.labels_img_.affine) filename = os.path.join(write_dir, 'score_' + affix + '_%s.nii.gz' % task)
if restore_best_state: print( "Restoring model state from the end of the best epoch") model.load_state_dict(best_model_state) optimizer.load_state_dict(best_optimizer_state) break else: best_rho = valid_rho wait = 0 if restore_best_state: best_model_state = model.state_dict() best_optimizer_state = optimizer.state_dict() return model, best_rho kf_split = GroupKFold(n_splits=NUM_FOLDS).split(X=train, groups=train.question_body) kfold_rhos = list() all_models = list() for fold, (train_idx, valid_idx) in enumerate(kf_split): print(f" fold: {fold} ".center(100, "#")) train_inputs = train_tqa_bert_encoded.loc[train_idx, bert_columns].values _train_targets = train_targets.loc[train_idx, :].values valid_inputs = train_tqa_bert_encoded.loc[valid_idx, bert_columns].values _valid_targets = train_targets.loc[valid_idx, :].values model, best_rho = train_mlp(OutputMLP, train_inputs, _train_targets, valid_inputs, _valid_targets,
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Candidate classifiers, each evaluated inside the same text pipeline below.
models = [LogisticRegression(solver='liblinear', max_iter=300),
          SVC(C=1.0, kernel='linear', degree=3, gamma='auto'),
          MultinomialNB(),
          KNeighborsClassifier(),
          RidgeClassifier(),
          SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3)]

# Per-model summary statistics, in the same order as `models`.
accuracy_mean, accuracy_std, precision_mean, precision_std = [], [], [], []

for model in models:
    pipe = Pipeline([('cleaner', clean_transformer()),
                     ('vectorizer', bow_vector),
                     ('classifier', model)])

    # Grouped CV keyed on `titles`; GroupKFold() uses its default n_splits.
    # Each metric runs its own, independent cross-validation.
    accuracy = cross_val_score(estimator=pipe, X=X, y=y, groups=titles, cv=GroupKFold(), scoring='accuracy')
    precision = cross_val_score(estimator=pipe, X=X, y=y, groups=titles, cv=GroupKFold(), scoring='precision')

    accuracy_mean.append(np.mean(accuracy))
    accuracy_std.append(np.std(accuracy))
    precision_mean.append(np.mean(precision))
    precision_std.append(np.std(precision))

# hyperparameter tuning
# This pipeline uses tfidf_vector instead of bow_vector.
classifier = KNeighborsClassifier()
pipe = Pipeline([('cleaner', clean_transformer()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])
def main(args, logger):
    """Train one BERT multi-label classifier per ``GroupKFold`` fold.

    Rows are grouped by ``question_body`` (5 folds). A checkpoint is saved
    after every epoch, and training can resume from ``args.checkpoint``.

    Args:
        args: parsed options; this function reads ``args.checkpoint`` and
            ``args.debug``.
        logger: logger passed through to ``sel_log``.
    """
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')

    # One-shot generator of (train positions, validation positions).
    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body)

    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    # -1 means "nothing loaded": no fold or epoch is skipped below.
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # Skip folds before the one the checkpoint was saved in.
        if fold < loaded_fold:
            continue
        fold_trn_df = trn_df.iloc[trn_idx]
        fold_val_df = trn_df.iloc[val_idx]
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        # Space-separated token frequencies over title + body + answer of the
        # training fold.
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # NOTE(review): the frequent-token list computed above is discarded
        # by this reassignment — looks like a deliberate switch; confirm.
        tokens = []

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = soft_binary_cross_entropy
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN).to(DEVICE)
        # Match the embedding table size to the dataset's tokenizer.
        model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            model = model.to(DEVICE)
            # Skip epochs that were already completed in the resumed fold.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, val_loader)

            scheduler.step()
            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'epoch : {epoch} -- fold : {fold} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}',
                logger)
            # Move to CPU before saving; moved back at the top of the loop.
            model = model.to('cpu')
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                            model, optimizer, scheduler, histories,
                            val_y_preds, val_y_trues, val_qa_ids,
                            fold, epoch, val_loss, val_metric)
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model
    sel_log('now saving best checkpoints...', logger)
from sklearn.model_selection import GroupKFold, GroupShuffleSplit from photonai.base import Hyperpipe, PipelineElement, OutputSettings from photonai.optimization import FloatRange, Categorical # WE USE THE BREAST CANCER SET FROM SKLEARN X, y = load_breast_cancer(return_X_y=True) groups = np.random.random_integers(0, 3, (len(y), )) # DESIGN YOUR PIPELINE my_pipe = Hyperpipe('group_split_pipe', optimizer='grid_search', metrics=['accuracy', 'precision', 'recall'], best_config_metric='accuracy', outer_cv=GroupKFold(n_splits=4), inner_cv=GroupShuffleSplit(n_splits=10), verbosity=1, output_settings=OutputSettings(project_folder='./tmp/')) # ADD ELEMENTS TO YOUR PIPELINE # first normalize all features my_pipe += PipelineElement('StandardScaler') # then do feature selection using a PCA, specify which values to try in the hyperparameter search my_pipe += PipelineElement('PCA', hyperparameters={'n_components': [5, 10, None]}, test_disabled=True) # engage and optimize the good old SVM for Classification my_pipe += PipelineElement('SVC', hyperparameters={ 'kernel': Categorical(['rbf', 'linear']),
def main():
    """Command-line entry point: cross-validate, train and/or evaluate NER.

    Behaviour is selected by the parsed arguments:

    * ``do_cross_validation``: for every epoch count from 1 to
      ``num_train_epochs`` and every ``GroupKFold`` split (grouped by
      ``nsentence``), a model is trained for that many epochs and evaluated;
      the collected results are pickled to
      ``<output_dir>/cross_validation_results.pkl``.
    * otherwise, training runs when ``train_sets`` is non-empty and
      evaluation runs when ``dev_sets`` is non-empty.

    Raises:
        ValueError: if nothing is selected to run, or ``task_name`` is not a
            known processor.
    """
    parser = get_arg_parser()
    args = parser.parse_args()

    do_eval = len(args.dev_sets) > 0 and not args.do_cross_validation
    do_train = len(args.train_sets) > 0 and not args.do_cross_validation

    device, n_gpu = get_device(args.local_rank, args.no_cuda)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not do_train and not do_eval and not args.do_cross_validation:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    processors = {"ner": NerProcessor, "wikipedia-ner": WikipediaNerProcessor}

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    if args.do_cross_validation:
        cross_val_result_file = "cross_validation_results.pkl"
        cross_val_result_file = os.path.join(args.output_dir,
                                             cross_val_result_file)

        sets = set(args.train_sets.split(
            '|')) if args.train_sets is not None else set()

        # NOTE(review): read_pickle executes pickle data; gt_file must come
        # from a trusted source.
        gt = pd.read_pickle(args.gt_file)
        gt = gt.loc[gt.dataset.isin(sets)]

        k_fold = GroupKFold(n_splits=args.n_splits)
        eval_results = list()

        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

        for ep in range(1, int(args.num_train_epochs) + 1):
            for sp, (train, test) in enumerate(
                    k_fold.split(X=gt, groups=gt.nsentence)):
                # Relabel the split so the processor can select it by name.
                tr = gt.iloc[train].copy()
                te = gt.iloc[test].copy()
                tr['dataset'] = 'TRAIN'
                te['dataset'] = 'TEST'
                gt_tmp = pd.concat([tr, te])

                processor = processors[task_name](
                    train_sets='TRAIN',
                    dev_sets='TEST',
                    test_sets='TEST',
                    gt=gt_tmp,
                    max_seq_length=args.max_seq_length,
                    tokenizer=tokenizer,
                    data_epochs=args.num_data_epochs,
                    epoch_size=args.epoch_size)

                # A model is trained for `ep` epochs on this split.
                model, model_config = model_train(
                    bert_model=args.bert_model,
                    max_seq_length=args.max_seq_length,
                    do_lower_case=args.do_lower_case,
                    num_train_epochs=ep,
                    train_batch_size=args.train_batch_size,
                    gradient_accumulation_steps=args.gradient_accumulation_steps,
                    learning_rate=args.learning_rate,
                    weight_decay=args.weight_decay,
                    loss_scale=args.loss_scale,
                    warmup_proportion=args.warmup_proportion,
                    processor=processor,
                    device=device,
                    n_gpu=n_gpu,
                    fp16=args.fp16,
                    cache_dir=args.cache_dir,
                    local_rank=args.local_rank,
                    dry_run=args.dry_run,
                    no_cuda=args.no_cuda)

                # Invert label -> id into id -> label for evaluation.
                label_map = {
                    v: k
                    for k, v in model_config['label_map'].items()
                }

                eval_result = model_eval(
                    model=model,
                    label_map=label_map,
                    processor=processor,
                    device=device,
                    batch_size=args.eval_batch_size,
                    local_rank=args.local_rank,
                    no_cuda=args.no_cuda,
                    dry_run=args.dry_run).reset_index()

                eval_result['split'] = sp
                eval_result['epoch'] = ep
                eval_results.append(eval_result)

                del model  # release CUDA memory

            pd.concat(eval_results).to_pickle(cross_val_result_file)

    if do_train:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

        processor = processors[task_name](
            train_sets=args.train_sets,
            dev_sets=args.dev_sets,
            test_sets=args.test_sets,
            gt_file=args.gt_file,
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer,
            data_epochs=args.num_data_epochs,
            epoch_size=args.epoch_size)

        model_train(
            bert_model=args.bert_model,
            output_dir=args.output_dir,
            max_seq_length=args.max_seq_length,
            do_lower_case=args.do_lower_case,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            learning_rate=args.learning_rate,
            weight_decay=args.weight_decay,
            loss_scale=args.loss_scale,
            warmup_proportion=args.warmup_proportion,
            processor=processor,
            device=device,
            n_gpu=n_gpu,
            fp16=args.fp16,
            cache_dir=args.cache_dir,
            local_rank=args.local_rank,
            dry_run=args.dry_run,
            no_cuda=args.no_cuda)

    # Evaluate only in non-distributed mode or on rank 0.
    if do_eval and (args.local_rank == -1
                    or torch.distributed.get_rank() == 0):
        # Use a context manager so the config file handle is closed.
        with open(os.path.join(args.output_dir, "model_config.json"),
                  "r") as config_file:
            model_config = json.load(config_file)

        label_to_id = model_config['label_map']
        label_map = {v: k for k, v in model_config['label_map'].items()}

        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'],
            do_lower_case=model_config['do_lower'])

        processor = processors[task_name](
            train_sets=None,
            dev_sets=args.dev_sets,
            test_sets=args.test_sets,
            gt_file=args.gt_file,
            max_seq_length=model_config['max_seq_length'],
            tokenizer=tokenizer,
            data_epochs=args.num_data_epochs,
            epoch_size=args.epoch_size,
            label_map=label_to_id)

        model_eval(label_map=label_map,
                   processor=processor,
                   device=device,
                   num_train_epochs=args.num_train_epochs,
                   output_dir=args.output_dir,
                   batch_size=args.eval_batch_size,
                   local_rank=args.local_rank,
                   no_cuda=args.no_cuda,
                   dry_run=args.dry_run)
def Cross_validation(is_group=False):
    """Run 5-fold cross-validation of a CNN on one-hot encoded sequences.

    The training CSV is read from a fixed path; column 0 holds the
    sequence, column 1 the label (-1 is mapped to 0) and column 3 the
    cluster id.

    Args:
        is_group: when False, folds are stratified on the label and
            ``CNN_lstm_model`` is trained for 10 epochs; otherwise folds are
            grouped by cluster (``GroupKFold``) and ``CNN_model_multi_conv``
            is trained for 20 epochs.

    Prints per-fold and average accuracy, recall, precision and F1; returns
    nothing.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    training_data = pd.read_csv(
        '/home1/pansj/Small_protein_prediction/training_upstream_cluster.csv',
        index_col=0).values
    print(training_data.shape)
    data = training_data[:, 0]
    label = training_data[:, 1]
    cluster = training_data[:, 3]
    print('cluster:', len(set(cluster)))

    label = label.reshape(len(label), -1)
    print(np.sum(label > 0), len(label))
    # Map the negative class from -1 to 0.
    label[label == -1] = 0
    label = np.array([y[0] for y in label]).reshape(len(label), 1)

    print('start')
    data_one_hot = [sequence_to_onehot(seq) for seq in data]
    train_data = np.array(data_one_hot)

    accuracy = []
    recall = []
    precision = []
    f1 = []

    # Kept as an equality test so non-bool arguments select the same branch
    # as before.
    if is_group == False:
        print('CV')
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        count = 0
        for train_index, test_index in skf.split(train_data, label):
            count += 1
            X_train, Y_train = train_data[train_index], label[train_index]
            X_test, Y_test = train_data[test_index], label[test_index]
            enc = OneHotEncoder()
            Y_train = enc.fit_transform(Y_train).toarray()
            # Reuse the encoder fitted on the training labels so validation
            # targets get the same column layout (it was previously
            # re-fitted on the validation fold).
            Y_test_ = enc.transform(Y_test).toarray()
            tbCallBack = TensorBoard(
                log_dir='/home1/pansj/Small_protein_prediction/logs_{}'.format(
                    count),  # log directory
                histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
                # batch_size=32,  # how much data to use when computing histograms
                write_graph=True,  # whether to store the network graph
                write_grads=True,  # whether to visualize gradient histograms
                write_images=True,  # whether to visualize parameters
                embeddings_freq=0,
                embeddings_layer_names=None,
                embeddings_metadata=None)
            cnn_model = CNN_lstm_model([3, 4, 5], 32)
            history = cnn_model.fit(X_train,
                                    Y_train,
                                    batch_size=128,
                                    epochs=10,
                                    workers=8,
                                    validation_data=(X_test, Y_test_),
                                    callbacks=[tbCallBack])
            print(history.history)
            # Predicted class: 0 if the first output is larger, else 1.
            pre_y = []
            pre = cnn_model.predict(X_test)
            for i in range(len(pre)):
                if pre[i][0] > pre[i][1]:
                    pre_y.append(0)
                else:
                    pre_y.append(1)
            pre_y = np.array(pre_y).reshape(len(pre_y), 1)
            accuracy.append(accuracy_score(Y_test, pre_y))
            recall.append(recall_score(Y_test, pre_y))
            precision.append(precision_score(Y_test, pre_y))
            f1.append(f1_score(Y_test, pre_y))
    else:
        print('group_cv')
        skf = GroupKFold(n_splits=5)
        count = 0
        for train_index, test_index in skf.split(train_data, label, cluster):
            count += 1
            print(len(train_index), len(test_index))
            X_train, Y_train = train_data[train_index], label[train_index]
            X_test, Y_test = train_data[test_index], label[test_index]
            enc = OneHotEncoder()
            Y_train = enc.fit_transform(Y_train).toarray()
            # Same encoder as the training labels (see the branch above).
            Y_test_ = enc.transform(Y_test).toarray()
            cnn_model = CNN_model_multi_conv([3, 4, 5], 32)
            tbCallBack = TensorBoard(
                log_dir=
                '/home1/pansj/Small_protein_prediction/CNN_group/cnn_results_upstream_logs_{}'
                .format(count),  # log directory
                histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
                # batch_size=32,  # how much data to use when computing histograms
                write_graph=True,  # whether to store the network graph
                write_grads=True,  # whether to visualize gradient histograms
                write_images=True,  # whether to visualize parameters
                embeddings_freq=0,
                embeddings_layer_names=None,
                embeddings_metadata=None)
            history = cnn_model.fit(X_train,
                                    Y_train,
                                    batch_size=128,
                                    epochs=20,
                                    workers=8,
                                    validation_data=(X_test, Y_test_),
                                    callbacks=[tbCallBack])
            print(history.history)
            # Predicted class: 0 if the first output is larger, else 1.
            pre_y = []
            pre = cnn_model.predict(X_test)
            for i in range(len(pre)):
                if pre[i][0] > pre[i][1]:
                    pre_y.append(0)
                else:
                    pre_y.append(1)
            pre_y = np.array(pre_y).reshape(len(pre_y), 1)
            accuracy.append(accuracy_score(Y_test, pre_y))
            recall.append(recall_score(Y_test, pre_y))
            precision.append(precision_score(Y_test, pre_y))
            f1.append(f1_score(Y_test, pre_y))

    print(
        'CNN CV_group_new: kernel size:[3,4,5] ; num_kernel = 32 epoches = 20;'
    )
    print(accuracy)
    print('accuracy: ', np.average(accuracy))
    print(recall)
    print('recall:', np.average(recall))
    print(precision)
    print('precision:', np.average(precision))
    print(f1)
    print('f1:', np.average(f1))
# In[ ]: #train_imgs = np.array(train_imgs).astype(np.uint8) # In[ ]: #train_imgs.shape # In[ ]: #train_imgs.mean(), train_imgs.std() # In[14]: #group_kfold = GroupShuffleSplit(n_splits=5, random_state = 4321) group_kfold = GroupKFold(n_splits=5) # In[12]: data_transforms = { 'train': transforms.Compose([ transforms.Resize(224), #transforms.Grayscale(3), transforms.RandomAffine(degrees=45, scale=(0.9, 1.1)), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomVerticalFlip(p=0.5), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), 'val':
del data print('Data Done') import lightgbm as lgb col = list(train_set.columns) col.remove('reordered') col.remove('eval_set') col.remove('user_id') col.remove('product_id') col.remove('order_id') col.remove('department_id') col.remove('aisle_id') from sklearn.model_selection import GroupKFold kf = GroupKFold(n_splits=5) train_indexes = [] test_indexes = [] for i, (train_index, test_index) in enumerate( kf.split(train_set, groups=train_set['user_id'].values)): train_indexes.append(train_index) test_indexes.append(test_index) index_ = train_indexes[0] train_set = train_set.iloc[index_, :] train_set = train_set.frac(0.9, seed=42) dtrain = lgb.Dataset(train_set[col], label=train_set['reordered']) #del train_set lgb_params = { 'task': 'train',
def fit_meta_feature(
        X_train,
        X_valid,
        X_test,
        Meta_train,
        train_idx,
        bond_type,
        base_fold,
        feature="fc",
        N_META_FOLDS=N_META_FOLDS,
        N_META_ESTIMATORS=N_META_ESTIMATORS,
        model_type="catboost",
):
    """
    Adds meta features to train, test and val

    A regressor for ``Meta_train[feature]`` is fitted on each of
    ``N_META_FOLDS`` grouped folds of ``X_train``. The new column
    ``"meta_" + feature`` holds out-of-fold predictions for ``X_train`` and
    the fold-averaged prediction for ``X_valid`` / ``X_test``.

    Side effects: a ``"meta_" + feature`` column is written into the passed
    ``X_train``, ``X_valid`` and ``X_test`` objects; results are logged via
    ``update_tracking`` and several parquet files are written under
    ``type_results/<bond_type>/meta/``.

    Args:
        X_train, X_valid, X_test: feature frames.
        Meta_train: frame holding the target column ``feature``; rows are
            selected by position, so it is presumably row-aligned with
            ``X_train`` — verify against the caller.
        train_idx: positions into the module-level ``mol_group_type`` that
            give the group label of each ``X_train`` row.
        bond_type: label used in log messages, tracking keys and file names.
        base_fold: outer fold identifier used in tracking keys / file names.
        feature: name of the ``Meta_train`` column to predict.
        N_META_FOLDS: number of GroupKFold splits.
        N_META_ESTIMATORS: iteration count for the catboost model.
        model_type: ``"catboost"`` or ``"xgboost"``; any other value trains
            nothing inside the fold loop.

    Returns:
        ``(X_train, X_valid, X_test)`` with the meta column added.
    """
    logger.info(f"{bond_type}: Creating meta feature {feature}")
    logger.info("{}: X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1

    # Init predictions
    X_valid["meta_" + feature] = 0
    X_test["meta_" + feature] = 0
    X_train["meta_" + feature] = 0
    # Separate one-column frame that collects the out-of-fold predictions;
    # the placeholder column is then removed from the training features.
    X_train_oof = X_train[["meta_" + feature]].copy()
    X_train = X_train.drop("meta_" + feature, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train, groups=mol_group_type.iloc[train_idx].values)):
        logger.info("{}: Running Meta Feature Type {} - Fold {} of {}".format(
            bond_type, feature, fold_count, folds.n_splits))

        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type, feature),
                        N_META_FOLDS)
        # Load fold IDs from files for consistency
        # NOTE(review): no file is read here; rows are selected by position
        # (reset_index() yields 0..n-1, matched against the fold positions).
        X_train2 = X_train.loc[X_train.reset_index().index.isin(train_idx2)]
        X_valid2 = X_train.loc[X_train.reset_index().index.isin(valid_idx2)]
        X_train2 = X_train2.copy()
        X_valid2 = X_valid2.copy()
        y_train2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            train_idx2)][feature]
        y_valid2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            valid_idx2)][feature]
        # NOTE(review): fold_count is incremented before it is used in the
        # best_iter tracking key below, so the first fold is keyed as 2 —
        # confirm this is intended.
        fold_count += 1

        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            # NOTE(review): X_valid / X_test still carry the "meta_" column
            # here, whereas the xgboost branch drops it before predicting —
            # confirm the catboost pools are meant to include it.
            valid_dataset = Pool(data=X_valid)
            test_dataset = Pool(data=X_test)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU
            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)

            # Out-of-fold rows get this fold's prediction; valid/test
            # predictions are summed here and averaged after the loop.
            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            update_tracking(run_id,
                            '{}_f{}-{}_meta{}_best_iter'.format(
                                bond_type, base_fold, fold_count, feature),
                            model.best_iteration_,
                            integer=True)
        elif model_type == "xgboost":
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )

            y_pred_meta_valid = model.predict(X_valid2)
            # Drop the accumulating meta column (and 'id' for test) so the
            # columns match what the model was fitted on.
            y_pred_valid = model.predict(
                X_valid.drop("meta_" + feature, axis=1))
            y_pred = model.predict(
                X_test.drop(["meta_" + feature, 'id'], axis=1))

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            update_tracking(run_id,
                            '{}_f{}-{}_meta{}_best_iter'.format(
                                bond_type, base_fold, fold_count, feature),
                            model.get_booster().best_iteration,
                            integer=True)

    # Out-of-fold MAE of the meta model and its natural log.
    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof["meta_" + feature])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"{bond_type} Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )
    # Turn the per-fold sums into the mean over folds.
    X_valid["meta_" + feature] = X_valid["meta_" + feature] / N_META_FOLDS
    X_test["meta_" + feature] = X_test["meta_" + feature] / N_META_FOLDS
    X_train["meta_" + feature] = X_train_oof["meta_" + feature]
    feature_importance.to_parquet(
        "type_results/{}/meta/{}_{}_{}_fi_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_train_oof.to_parquet(
        "type_results/{}/meta/{}_{}_{}_oof_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_train.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_train_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_valid.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_valid_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_test.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_test_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    logger.info(f"{bond_type} Done creating meta features")
    logger.info("{} X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))

    return X_train, X_valid, X_test
def Split_group_kfolds(self):
    """Return the GroupKFold split iterator for the training frame.

    The ``fraud_ind`` column of ``self.Train_df`` is used as the target,
    every other column as features, and the ``Month`` feature column as
    the grouping key, so rows sharing a ``Month`` value are never placed
    on both sides of a split.  ``self.N_folds`` sets the number of folds.

    Returns:
        The object returned by ``GroupKFold.split`` (yields
        ``(train_indices, test_indices)`` pairs).
    """
    features = self.Train_df.drop(['fraud_ind'], axis=1)
    target = self.Train_df['fraud_ind']
    splitter = GroupKFold(n_splits=self.N_folds)
    return splitter.split(features, target, groups=features['Month'])
def main(args, logger):
    """Train and validate the QUEST BERT classifier over five GroupKFold folds.

    For every fold a fresh model, Adam optimizer and cosine LR scheduler
    are built and the model is trained for ``MAX_EPOCH`` epochs.
    ``freeze_unfreeze_bert`` is called with ``freeze=True`` for epoch 0
    and ``freeze=False`` afterwards.  A checkpoint is written after every
    epoch.  When ``args.checkpoint`` is set, training resumes from it:
    folds before the loaded one only contribute their stored best metric,
    and epochs up to the loaded epoch of the loaded fold are skipped.

    Args:
        args: Parsed options; ``checkpoint`` (path or falsy) and ``debug``
            (truthy -> 100-row samples of each fold) are read.
        logger: Logger handed to ``sel_log`` and to the model's
            freeze/unfreeze calls.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # Rows sharing a `question_body_le` value always stay on the same side
    # of a split.
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []

    def record_best_epoch(fold):
        """Store the best validation metric of `fold` and its raw metrics."""
        fold_metrics = histories["val_metric"][fold]
        fold_best_metrics.append(np.max(fold_metrics))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(fold_metrics)])

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            # Fold already finished in the resumed run: reuse its history.
            record_best_epoch(fold)
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        helper_cols = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx].drop(helper_cols, axis=1)
        # Validate on original rows only.
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_cols, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        # Only the fixed category markers are registered as extra tokens.
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        # Settings shared by the train and validation datasets.
        dataset_kwargs = dict(
            tokens=tokens,
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )

        trn_dataset = QUESTDataset(
            df=fold_trn_df, mode='train', augment=[], **dataset_kwargs)
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                # re-seed NumPy in every worker process
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)

        val_dataset = QUESTDataset(
            df=fold_val_df, mode='valid', augment=[], **dataset_kwargs)
        # NOTE(review): validation batches are drawn in random order too;
        # presumably harmless because `test` returns the qa ids alongside
        # the predictions -- confirm.
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.5,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5, weight_decay=0.001)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # Restore model / optimizer / scheduler state for the resumed fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        checkpoint_dir = f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}'
        for epoch in tqdm(range(MAX_EPOCH)):
            # Epochs already covered by the checkpoint are skipped.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)

            # Wrapped only for the duration of the epoch; unwrapped again
            # below so the checkpoint receives the bare model.
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            for key, value in (
                    ('trn_loss', trn_loss),
                    ('val_loss', val_loss),
                    ('val_metric', val_metric),
                    ('val_metric_raws', val_metric_raws),
            ):
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                checkpoint_dir,
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )

        record_best_epoch(fold)
        save_and_clean_for_prediction(
            checkpoint_dir, trn_dataset.tokenizer, clean=False)
        del model

    # Cross-fold statistics of the best validation metric.
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
from sklearn.model_selection import (GroupKFold, LeaveOneGroupOut,
                                     LeavePGroupsOut, StratifiedKFold)

# StratifiedKFold: print the train/test indices of a 3-fold split over
# ten samples with two classes.
X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print(train, test)

# GroupKFold: samples that share a group id stay in the same fold.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print(train, test)

# LeaveOneGroupOut: one split per distinct group id.
X = [1, 5, 10, 50, 60, 70, 80]
y = [0, 1, 1, 2, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3, 3]
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print(train, test)

# Input for the LeavePGroupsOut example.
X = np.arange(6)
'num_leaves': 63, 'max_bin': 255, 'min_child_weight': 10, 'min_data_in_leaf': 150, 'reg_lambda': 0.5, # L2 regularization term on weights. 'reg_alpha': 0.5, # L1 regularization term on weights. 'colsample_bytree': 0.9, 'subsample': 0.9, 'nthread': 32, # 'nthread': cpu_count(), 'bagging_freq': 1, 'verbose': -1, 'seed': SEED } group_kfold = GroupKFold(n_splits=NFOLD) np.random.seed(SEED) os.system(f'rm ../feature/t*_{PREF}*') # ============================================================================= # load # ============================================================================= train = pd.read_csv('../input/application_train.csv.zip') test = pd.read_csv('../input/application_test.csv.zip') prev = pd.read_csv('../input/previous_application.csv.zip') def mk_feature(df): df['AMT_CREDIT-d-AMT_ANNUITY'] = df['AMT_CREDIT'] / df[ 'AMT_ANNUITY'] # how long should user pay?(month)
def main(args, logger):
    """Train and validate the QUEST classifier over five GroupKFold folds.

    For every fold a fresh model, Adam optimizer and cosine LR scheduler
    are built and the model is trained for ``MAX_EPOCH`` epochs.
    ``freeze_unfreeze_bert`` is called with ``freeze=True`` for epoch 0
    and ``freeze=False`` afterwards.  A checkpoint is written after every
    epoch.  When ``args.checkpoint`` is set, training resumes from it:
    folds before the loaded one only contribute their stored best metric,
    and epochs up to the loaded epoch of the loaded fold are skipped.

    Args:
        args: Parsed options; ``checkpoint`` (path or falsy) and ``debug``
            (truthy -> 100-row samples of each fold) are read.
        logger: Logger handed to ``sel_log`` and to the model's
            freeze/unfreeze calls.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # Rows sharing a `question_body_le` value always stay on the same side
    # of a split.
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # Fixed sequence length handed to the datasets and the model.
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []

    def record_best_epoch(fold):
        """Store the best validation metric of `fold` and its raw metrics."""
        fold_metrics = histories["val_metric"][fold]
        fold_best_metrics.append(np.max(fold_metrics))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(fold_metrics)])

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            # Fold already finished in the resumed run: reuse its history.
            record_best_epoch(fold)
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        helper_cols = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx].drop(helper_cols, axis=1)
        # Validate on original rows only.
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_cols, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        # Only the fixed category markers are registered as extra tokens.
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(
            trn_dataset,
            batch_size=BATCH_SIZE,
            sampler=trn_sampler,
            num_workers=os.cpu_count(),
            # re-seed NumPy in every worker process
            worker_init_fn=lambda x: np.random.seed(),
            drop_last=True,
            pin_memory=True)

        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # NOTE(review): validation batches are drawn in random order too;
        # presumably harmless because `test` returns the qa ids alongside
        # the predictions -- confirm.
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # Restore model / optimizer / scheduler state for the resumed fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        checkpoint_dir = f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}'
        for epoch in tqdm(range(MAX_EPOCH)):
            # Epochs already covered by the checkpoint are skipped.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)

            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            for key, value in (
                    ('trn_loss', trn_loss),
                    ('val_loss', val_loss),
                    ('val_metric', val_metric),
                    ('val_metric_raws', val_metric_raws),
            ):
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            # Move back to CPU before the checkpoint is written.
            model = model.to('cpu')
            save_checkpoint(checkpoint_dir, model, optimizer, scheduler,
                            histories, val_y_preds, val_y_trues, val_qa_ids,
                            fold, epoch, val_loss, val_metric)

        record_best_epoch(fold)
        save_and_clean_for_prediction(checkpoint_dir,
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # Cross-fold statistics of the best validation metric.
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def split(self, X, y=None, group=None, **kwargs):
    """Compute, cache and return the fold indices for the configured scheme.

    ``self.validation_scheme`` selects the strategy.  It may be ``None``
    (plain K-fold), a scikit-learn splitter instance, a ``FoldScheme``
    member or its name, a callable, or a ready-made list of index pairs.
    For splitter instances only the type is inspected: a new splitter is
    built from ``self.num_folds`` / ``self.shuffle`` /
    ``self.random_state``.

    Args:
        X: Samples to split; ``X.shape[0]`` is read by some schemes.
        y: Targets. Required (same length as ``X``) for StratifiedKFold.
        group: Group label per sample. Required for GroupKFold; supplied
            by the class calling this method.
        **kwargs: Forwarded to a callable ``validation_scheme``.

    Returns:
        The value stored in ``self.indices``: a list of
        ``(train_indices, test_indices)`` pairs for the built-in schemes,
        or whatever the callable / user-supplied list provides.

    Raises:
        ValueError: If ``y`` is missing or mis-sized for StratifiedKFold,
            if ``group`` is missing for GroupKFold, or if the scheme is
            none of the supported kinds.
    """
    scheme = self.validation_scheme

    def selects(member, splitter_cls=None):
        """Return True when the configured scheme designates `member`."""
        if splitter_cls is not None and isinstance(scheme, splitter_cls):
            return True
        return scheme == member.name or scheme == member

    # scikit-learn raises ValueError when a random_state is given while
    # shuffle is off (the seed would have no effect), so only forward it
    # when shuffling is requested.
    seed = self.random_state if self.shuffle else None

    if scheme is None or selects(FoldScheme.KFold, KFold):
        folds = KFold(n_splits=self.num_folds,
                      random_state=seed,
                      shuffle=self.shuffle)
        self.indices = list(folds.split(X))
    elif selects(FoldScheme.StratifiedKFold, StratifiedKFold):
        if y is None or X.shape[0] != y.shape[0]:
            raise ValueError(
                "Y should be passed and X and Y should be of same length for StratifiedKFold"
            )
        folds = StratifiedKFold(n_splits=self.num_folds,
                                random_state=seed,
                                shuffle=self.shuffle)
        self.indices = list(folds.split(X, y))
    elif selects(FoldScheme.GroupKFold, GroupKFold):
        if group is None:
            raise ValueError("group should be passed for GroupKFold")
        folds = GroupKFold(n_splits=self.num_folds)
        self.indices = list(folds.split(X, y, groups=group))
    elif selects(FoldScheme.TimeSeriesSplit, TimeSeriesSplit):
        folds = TimeSeriesSplit(n_splits=self.num_folds)
        self.indices = list(folds.split(X))
    elif selects(FoldScheme.train_test_split):
        # Single hold-out split; `self.test_size` sets the test share.
        # The seed is forwarded so this scheme is reproducible like the
        # K-fold ones (it is ignored when shuffle is off).
        self.indices = [
            train_test_split(list(range(X.shape[0])),
                             test_size=self.test_size,
                             shuffle=self.shuffle,
                             random_state=seed)
        ]
    elif callable(scheme):
        # User-supplied function taking X and y (plus any extra kwargs).
        self.indices = scheme(X, y, **kwargs)
    else:
        if not isinstance(scheme, list):
            raise ValueError(
                "Validation Schema should be a list of (train_indexes, test_indexes)"
            )
        self.indices = scheme
    return self.indices
result = {}
if args.cross_validation:
    # Cross validation: flatten the per-category examples into one table
    # whose first column is the category, and use the category as the
    # GroupKFold group so a category never appears in both train and test.
    modified_data_set = []
    groups = []
    for category, examples in data_set.items():
        for example in examples:
            modified_data_set.append([category] + example)
            groups.append(category)
    modified_data_set = np.array(modified_data_set)

    gkf = GroupKFold(n_splits=args.folds)
    all_folds = []
    fold_splits = gkf.split(modified_data_set, groups=groups)
    for train_indices, test_indices in fold_splits:
        train = modified_data_set[train_indices]
        test = modified_data_set[test_indices]
        # Fold sizes and the share of the data held out by this fold.
        test_share = len(test_indices) / len(modified_data_set)
        print(len(train_indices), len(test_indices), test_share)
        print(train[0], test[0], '\n')
        # Only the held-out rows of each fold are kept in the result.
        all_folds.append(test)

    result['n_folds'] = args.folds
    result['folds'] = all_folds
'objective': 'regression', 'max_depth': 6, 'learning_rate': LEARNING_RATE, "boosting_type": "gbdt", "subsample_freq": 1, "subsample": 0.9, "bagging_seed": 11, "metric": 'mae', "verbosity": -1, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'colsample_bytree': 1.0, 'random_state': RANDOM_STATE } folds = GroupKFold(n_splits=N_FOLDS) # Setup arrays for storing results train_df = pd.read_parquet( 'data/FE008_train.parquet') # only loading for skeleton not features oof_df = train_df[['id', 'type', 'scalar_coupling_constant']].copy() mol_group = train_df[['molecule_name', 'type']].copy() del train_df gc.collect() oof_df['oof_preds'] = 0 test_df = pd.read_parquet( 'data/FE008_test.parquet') # only loading for skeleton not features prediction = np.zeros(len(test_df)) feature_importance = pd.DataFrame() test_pred_df = test_df[['id', 'type', 'molecule_name']].copy()
from sklearn.metrics import auc import matplotlib.pyplot as plt import pars import roc_curve_target as rct xml_path = 'path.xml' # settings parsing global_params, external_params, param_list = pars.parsconf(xml_path) train, test = pars.parsdata(global_params) # find best model by GridSearchCV nfolds = external_params['kfold'] group_kfold = GroupKFold(n_splits=nfolds) ind = list( group_kfold.split(train['features'], train['labels'], train['target_id'])) ######################################################################################################################## import check_kfolds as ck status = ck.test_kfold(ind, train['target_id']) ######################################################################################################################## dtrain = xgb.DMatrix(train['features'], label=train['labels']) dtest = xgb.DMatrix(test['features'], label=test['labels']) results = pd.DataFrame( columns=['n', 'metric_train', 'metric_val', 'num_trees']) for n, param in enumerate(param_list):
return df if __name__ == "__main__": train = pd.read_csv(config.CLEAN_TRAIN_DATA) test = pd.read_csv(config.CLEAN_TEST_DATA) # Get numerical target train['target'] = train.Tag.map(config.TAG_DICT) y = train["target"].values # Replicate train/test split strategy for cross validation train["target_str"] = train["Domain"].astype(str) + train["Tag"].astype( str) train["target_str"] = train["target_str"].astype("category") cvlist = list(GroupKFold(5).split(train, groups=train["target_str"])) # Word and character TFIDF on URLs vec1 = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), min_df=500, sublinear_tf=True) vec2 = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=400, sublinear_tf=True) vec = FeatureUnion([("char", vec1), ("word", vec2)]) train = tokenize_url(train) test = tokenize_url(test) all_url = pd.concat([train["Url"], test["Url"]])
def _check_group_kfold_folds(cv, groups, n_splits):
    """Assert that `cv` splits `groups` into balanced, group-disjoint folds."""
    n_samples = len(groups)
    X = y = np.ones(n_samples)
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_samples_per_fold = n_samples // n_splits

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for fold_id, (_, test) in enumerate(cv.split(X, y, groups)):
        folds[test] = fold_id

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for fold_id in np.unique(folds):
        assert_greater_equal(
            tolerance,
            abs(np.sum(folds == fold_id) - ideal_n_samples_per_fold))

    # Check that each group appears only in 1 fold.
    # NOTE(review): the DeprecationWarning filter was originally applied
    # only to the string-label case, presumably for elementwise string
    # comparison on some NumPy versions -- confirm it is still needed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for group in np.unique(groups):
            assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in cv.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)


def test_group_kfold():
    """Check GroupKFold on random integer groups and on string groups.

    Both data sets must yield folds of roughly equal size in which every
    group is confined to a single fold.  The test also checks that groups
    given as a list produce the same splits as an array, and that asking
    for more splits than there are groups raises ``ValueError``.
    """
    rng = np.random.RandomState(0)
    n_splits = 5
    lkf = GroupKFold(n_splits=n_splits)

    # Random integer groups: 1000 samples spread over 15 groups
    groups = rng.randint(0, 15, 1000)
    _check_group_kfold_folds(lkf, groups, n_splits)

    # String groups of uneven size
    groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'])
    _check_group_kfold_folds(lkf, groups, n_splits)

    # groups can also be a list
    groups = np.asarray(groups, dtype=object)
    X = y = np.ones(len(groups))
    cv_iter = list(lkf.split(X, y, groups.tolist()))
    for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups),
                                                cv_iter):
        assert_array_equal(train1, train2)
        assert_array_equal(test1, test2)

    # Should fail if there are more folds than groups
    groups = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(groups))
    assert_raises_regexp(ValueError, "Cannot have number of splits.*greater",
                         next, GroupKFold(n_splits=3).split(X, y, groups))
# Grid-search the HMM classifier over cepstral-coefficient and state
# counts, using the `fold` column of the metadata as GroupKFold groups.
fulldatasetpath = '../downsampled/'
metadata = pd.read_csv('../UrbanSound8K.csv')

# Map class names to the integer labels used as targets.
le = LabelEncoder()
le.fit(metadata['class'])
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

parameters = {
    'num_cep_coef': [25, 30, 35, 40, 45, 50],
    'num_states': [2, 3, 4, 5, 6],
}

gKFold = GroupKFold(n_splits=10)
urban_hmm = UrbanHMMClassifier(class_map=class_mapping)
grid_search = GridSearchCV(urban_hmm, parameters, cv=gKFold, n_jobs=-1,
                           verbose=1)
# The estimator receives file paths rather than feature arrays.
grid_search.fit(
    X=list(fulldatasetpath + metadata['slice_file_name'].astype(str)),
    y=le.transform(metadata['class']),
    groups=metadata['fold'])

# Characters 2..9 of the printed best score tag both file names.
# NOTE(review): this presumably assumes a score printed as "0.xxxx" -- confirm.
score_tag = str(grid_search.best_score_)[2:10]

# Context managers make sure each pickle file is flushed and closed.
best_filename = "./models/hmm_cvbest_f1_{}.pkl".format(score_tag)
with open(best_filename, "wb") as best_file:
    pickle.dump(grid_search.best_estimator_, best_file)

cv_filename = "./models/hmm_cv_f1_{}.pkl".format(score_tag)
with open(cv_filename, "wb") as cv_file:
    pickle.dump(grid_search, cv_file)

print("\n-----------GRID SEARCH RANKING----------\n")
X_all = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1) y = utils.read_pickles('../data/prev_label').TARGET val = utils.read_pickles('../data/prev_train', ['DAYS_DECISION']) ind = val[val['DAYS_DECISION'].between(day_start, day_end)].index y = y[ind] sub_train = utils.read_pickles( '../data/prev_train', ['SK_ID_CURR']).set_index('SK_ID_CURR').iloc[ind] sub_train['y'] = y.values sub_train['cnt'] = sub_train.index.value_counts() sub_train['w'] = 1 / sub_train.cnt.values group_kfold = GroupKFold(n_splits=NFOLD) sub_train['g'] = sub_train.index % NFOLD for HEAD in HEADS: X = X_all.iloc[ind, :HEAD] if X.columns.duplicated().sum() > 0: raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }') print('no dup :) ') print(f'X.shape {X.shape}') gc.collect() CAT = list(set(X.columns) & set(utils_cat.ALL))
X_test: pd.DataFrame = test[use_cols_revised].copy()
print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}")

# Export the names of the training columns next to the other logs.
used_columns = pd.DataFrame({"columns": X.columns.tolist()})
used_columns.to_csv(log_path / f"use_cols.csv")

####################################################################################################
# Model Fitting
print("start fitting")
n_fold = 5
# Group-aware folds when requested, otherwise shuffled K-fold with a fixed seed.
folds = (GroupKFold(n_splits=n_fold) if GROUP_K_FOLD else KFold(
    n_splits=n_fold, shuffle=True, random_state=11))

#########################################################################################################
# 1st layer model
# Seeds actually used are the base seeds shifted by a constant offset.
seed_base = [2019]
seed_list = np.array(seed_base) + 50

current_seed = -1
if i == 0: tsdict[dict_dataset[dataset][:-1]]['test'].append(test_idx) tsdict[dict_dataset[dataset][:-1]]['train'].append(train_idx) if i == 0 and j == 0: cvdict[dict_dataset[dataset][:-1]]['test'] = [] cvdict[dict_dataset[dataset][:-1]]['train'] = [] #replace all X_in[train_idx] with scaled X_train and X_in[test_idx] with X_test scaler.fit(X_in[train_idx]) X_train = scaler.transform(X_in[train_idx]) X_test = scaler.transform(X_in[test_idx]) #set random state for gkf np.random.set_state(state) gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx], cluster_in[train_idx]) for tr, te in gkf: cvdict[dict_dataset[dataset][:-1]]['train'].append(tr) cvdict[dict_dataset[dataset][:-1]]['test'].append(te) #set random state for gkf np.random.set_state(state) gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx], cluster_in[train_idx]) best_params[dict_dataset[dataset] + 'RF.' + str(i + 1) + '.' + str(j + 1)] = rf_param_selection( X_train, Y[train_idx], gkf) np.random.set_state(state) gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx],
train, test, 'category') # Set training parameters device = 'cuda' num_workers = 10 n_folds = 10 lr = 0.001 n_epochs = 10 bs = 2 grad_accum = 4 weight_decay = 0.01 loss_fn = nn.BCEWithLogitsLoss() # Start training init_seed() folds = GroupKFold(n_splits=n_folds).split(X=train['question_body'], groups=train['question_body']) oofs = np.zeros((len(train), N_TARGETS)) main_logger.info(f'Start training model {model_name}...') for fold_id, (train_index, valid_index) in enumerate(folds): main_logger.info(f'Fold {fold_id + 1} started at {time.ctime()}') fold_logger = init_logger(log_dir, f'train_fold_{fold_id+1}_{model_name}.log') train_loader = DataLoader( TextDataset(cat_features_train, ids_train['question'], ids_train['answer'], seg_ids_train['question'], seg_ids_train['answer'], train_index, y),