## INITIALISE LOGGING --------------------------------------------------------------------------------------

# The start time of the run doubles as a unique job identifier.
now = datetime.now()
job_id = now.strftime("%d%m%y_%H%M%S")

# Route every log record (DEBUG and above) to a per-job text file.
log_file = 'logs/logreg_' + problem + '_' + job_id + '.txt'
logging.basicConfig(filename=log_file, level=logging.DEBUG)

## LOAD DATA --------------------------------------------------------------------------------------------

X = pd.read_hdf('data/FINAL/X_' + problem + '.h5')
y = pd.read_hdf('data/FINAL/y_' + problem + '.h5')
subject = pd.read_hdf('data/FINAL/subject_' + problem + '.h5')

# Cross-validation folds, materialised once so they can be reused:
# one fold per distinct value in `subject`, grouped by subject.
gkf = list(GroupKFold(n_splits=len(subject.unique())).split(X, y, subject))

# Metrics to report: display name -> scikit-learn scorer name.
scoring = {'Accuracy': 'accuracy', 'F1-score': 'f1_weighted'}

# Pipeline components.
# DO FEATURE SELECTION ON ALL TRAINING DATA FOR NOW
# NOTE(review): the selector is fitted on the full X/y before any fold is
# split off; if X_filtered is later scored with `gkf`, the held-out subjects
# will have influenced the selected features -- confirm this is acceptable.
fs = FeatureSelector(n_jobs=20)
fs.fit(X, y)
X_filtered = fs.transform(X)
dump(fs, 'models/FINAL/feature_selector_' + problem + '.joblib')

qt = QuantileTransformer()
test_feats = np.array(test_feats) test_storm_ids = np.array(test_storm_ids) test_org_pred = np.array(test_org_pred) # In[20]: #train_img_features = img_features[:train_img_features.shape[0]] #test_img_features = img_features[train_img_features.shape[0]:] # In[21]: test_final_pred = np.zeros_like(test_org_pred) # In[22]: group_kfold = GroupKFold(n_splits=5) # In[23]: models_arr = [] val_pred = np.zeros_like(train_targets) sc_arr = [] fold = 0 for train_index, val_index in group_kfold.split(train_feats, train_targets, train_storm_ids): print(fold) fold += 1 image_datasets = { 'train': WindDataset(train_feats[train_index], train_img_features[train_index], train_targets[train_index], 'train'),
def main(args, logger):
    """Train one model per fold with 5-fold grouped cross-validation.

    Rows are grouped by ``question_body_le`` so that rows sharing a group
    value never appear on both sides of a split.  For every fold the function
    builds the datasets/loaders, trains for ``MAX_EPOCH`` epochs, writes a
    checkpoint after each epoch, and finally logs and notifies the mean/std
    of the best validation metric across folds.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.checkpoint`` (path to resume
        from, falsy to start fresh) and ``args.debug`` (sub-sample each fold
        to 100 rows) are read here.
    logger
        Logger handed to ``sel_log`` and ``model.freeze_unfreeze_bert``.
    """
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    splitter = GroupKFold(n_splits=5)
    gkf = splitter.split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    # Per-fold training curves: histories[<name>][fold] -> list, one entry
    # per finished epoch.
    histories = {
        key: {}
        for key in ('trn_loss', 'val_loss', 'val_metric', 'val_metric_raws')
    }
    loaded_fold, loaded_epoch = -1, -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []

    def _record_fold_best(fold_no):
        # Keep the best validation metric of a fold together with the raw
        # (per-label) metrics recorded at that same epoch.
        fold_best_metrics.append(np.max(histories["val_metric"][fold_no]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold_no][
                np.argmax(histories["val_metric"][fold_no])])

    def _make_dataset(frame, mode, tokens):
        # Train and validation datasets differ only in data and mode.
        return QUESTDataset2(
            df=frame,
            mode=mode,
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            Q_LABEL_COL=Q_LABEL_COL,
            A_LABEL_COL=A_LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            rm_zero=RM_ZERO,
        )

    def _make_loader(dataset, drop_last):
        # Both loaders draw batches through a RandomSampler.
        sampler = RandomSampler(data_source=dataset)
        return DataLoader(dataset,
                          batch_size=BATCH_SIZE,
                          sampler=sampler,
                          num_workers=os.cpu_count(),
                          worker_init_fn=lambda x: np.random.seed(),
                          drop_last=drop_last,
                          pin_memory=True)

    def _split_on_space(text):
        return text.split(' ')

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # Folds that finished before the checkpoint only contribute their
        # stored statistics.
        if fold < loaded_fold:
            _record_fold_best(fold)
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)

        helper_columns = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(helper_columns, axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_columns, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        # NOTE(review): this frequency-based vocabulary is overwritten by the
        # fixed category-token list just below, so its result is never used.
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(_split_on_space)
            + fold_trn_df.question_body.apply(_split_on_space)
            + fold_trn_df.answer.apply(_split_on_space)
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = _make_dataset(fold_trn_df, 'train', tokens)
        # update token
        trn_loader = _make_loader(trn_dataset, drop_last=True)
        val_dataset = _make_dataset(fold_val_df, 'valid', tokens)
        val_loader = _make_loader(val_dataset, drop_last=False)

        fobj = BCEWithLogitsLoss()
        # The same pretrained weights are handed over as both q_state_dict
        # and a_state_dict.
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier2(
            num_labels=len(Q_LABEL_COL) + len(A_LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            q_state_dict=state_dict,
            a_state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            # Epochs already covered by the checkpoint are skipped.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # freeze=True for epoch 0 only, freeze=False afterwards.
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)

            # Wrap for multi-GPU use during the epoch; unwrapped again below
            # before the checkpoint is written.
            model = DataParallel(model).to(DEVICE)
            trn_loss = train_one_epoch2(
                model, fobj, optimizer, trn_loader, DEVICE)
            (val_loss, val_metric, val_metric_raws,
             val_y_preds, val_y_trues, val_qa_ids) = test2(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()

            epoch_records = (
                ('trn_loss', trn_loss),
                ('val_loss', val_loss),
                ('val_metric', val_metric),
                ('val_metric_raws', val_metric_raws),
            )
            for key, value in epoch_records:
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)

            model = model.to('cpu').module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )

        _record_fold_best(fold)
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def evaluate_han(batchsize):
    """Run 10-fold grouped cross-validation of the attention networks.

    Reads ``dbank.json``, splits it with ``GroupKFold`` using the ``ids``
    column as groups, trains a word-level and a sentence-level network per
    fold with early stopping, reloads the saved best weights, evaluates on
    the fold's test split and prints per-fold and average accuracy/F-measure.

    Parameters
    ----------
    batchsize
        Batch size forwarded to the network constructors, the training
        routine and the evaluation routine.
    """
    dbank = pd.read_json('dbank.json')
    # The three columns are shuffled with the same frac/random_state.
    # NOTE(review): this presumably keeps them row-aligned -- confirm.
    X, y, ids = (
        dbank[column].sample(frac=1, random_state=20)
        for column in ('tokens', 'labels', 'ids')
    )

    data = [
        {
            "X_train": X.values[train_index],
            "y_train": y.values[train_index],
            "X_test": X.values[test_index],
            "y_test": y.values[test_index],
            "train_ids": np.array(ids)[train_index],
        }
        for train_index, test_index
        in GroupKFold(n_splits=10).split(X, y, groups=ids)
    ]

    learning_rate = 1e-1
    momentum = 0.9
    criterion = nn.NLLLoss()

    def _build_networks():
        # Word-level network first, then sentence-level (construction order
        # is kept so any random initialisation is drawn in the same order).
        word_net = AttentionWordRNN(batch_size=batchsize, num_tokens=1829,
                                    embed_size=300, word_gru_hidden=100,
                                    bidirectional=True).cuda()
        sent_net = AttentionSentRNN(batch_size=batchsize, sent_gru_hidden=100,
                                    word_gru_hidden=100, n_classes=2,
                                    bidirectional=True).cuda()
        return word_net, sent_net

    accuracy = 0
    f_measure = 0
    for idx in range(10):
        fold = data[idx]
        # ravel() flattens an (n, 1) array into (n,)
        X_train, y_train = fold["X_train"], fold["y_train"].ravel()
        X_test, y_test = fold["X_test"], fold["y_test"].ravel()

        # The first tenth of the training split is held out for validation.
        split = len(X_train) // 10
        X_validate, X_train = X_train[:split], X_train[split:]
        y_validate, y_train = y_train[:split], y_train[split:]

        # NOTE(review): the fold is retried with freshly built networks until
        # train_early_stopping returns something truthy; there is no retry
        # limit, so a fold that never succeeds loops forever.
        best_model = None
        while not best_model:
            word_attn, sent_attn = _build_networks()
            word_optimizer = torch.optim.SGD(word_attn.parameters(),
                                             lr=learning_rate,
                                             momentum=momentum)
            sent_optimizer = torch.optim.SGD(sent_attn.parameters(),
                                             lr=learning_rate,
                                             momentum=momentum)
            print("---------------- fold {} ----------------".format(idx))
            sys.stdout.flush()
            best_model = train_early_stopping(
                idx, batchsize, X_train, y_train, X_validate, y_validate,
                word_attn, sent_attn, word_optimizer, sent_optimizer,
                criterion, 160, 5)

        # Evaluate with fresh networks loaded from the weights saved on disk
        # for this fold.
        trained_word_attn, trained_sent_attn = _build_networks()
        trained_word_attn.load_state_dict(
            torch.load('saved_models/noage/fold{}_word_attn.pth'.format(idx)))
        trained_sent_attn.load_state_dict(
            torch.load('saved_models/noage/fold{}_sent_attn.pth'.format(idx)))
        trained_word_attn.eval()
        trained_sent_attn.eval()

        acc, f1 = test_accuracy_full_batch(
            X_test, y_test, batchsize, trained_word_attn, trained_sent_attn)
        print("Best model is {}".format(best_model))
        print("---------------- accuracy, f-measure of fold {} is {}, {} ----------------".format(idx, acc, f1))
        accuracy += acc
        f_measure += f1
        sys.stdout.flush()

    print("average acc, f score = {}, {}".format(accuracy/10, f_measure/10))
def _build_cv_generator(self, y=None):
    """Build the cross-validation splitter selected by ``self.CV_STRATEGY``.

    Parameters
    ----------
    y : array-like or None
        Target labels.  When given, the number of splits is capped at the
        size of the least populated class (and at 10); otherwise 10 splits
        are used.

    Returns
    -------
    StratifiedKFold, GroupKFold or None
        The configured splitter, or ``None`` when ``self.CV_STRATEGY`` is
        neither ``'StratifiedKFold'`` nor ``'GroupKFold'``.
    """
    # Use information about X to build a cross-validation split generator.
    # http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
    # There are a few general strategies for cross-validation:
    # k-fold split: Split training set into k partitions, train using k-1,
    #   and validate using the kth. Loop through k validation partitions.
    #   Basis for all other cross-validation strategies.
    #   Assumes each sample is independent and identically distributed.
    # shuffle split: Instead of mixing k partitions into training and
    #   validation sets, just generate a random train/validation split
    #   and do that k times. Typically efficient, but also can fail
    #   to use each sample equally, e.g. if a sample is only ever
    #   in the validation set or the training set.
    # leave one out (LOO): k-fold CV where k = n, so each partition only
    #   has one sample in the test data. Test error across partitions
    #   typically has high variance because error is either 0 or 1.
    #   Empirically, LOO is typically worse than 5- or 10-fold CV.
    #   Can tweak to leave P out (LPO). Because (n choose p) is much
    #   greater than (n choose 1), this process tends to be
    #   computationally intensive, and perhaps not worth the effort.
    # repetition: k-fold CV run multiple times, with different partitions
    #   for each run. Good way to get more learning from limited data.
    # stratification: for unbalanced classification problems, enforce
    #   a constraint on either k-fold or shuffle splits to require
    #   an equal split of each class between the training and
    #   validation sets.
    # grouping: for some classification problems, the samples are not iid.
    #   In particular, for medical applications, if there are multiple
    #   samples taken from a single patient, those samples are by
    #   definition not independent. Moreover, we typically care about
    #   whether we can accurately classify new unseen patients based
    #   on patients we saw in the past. Therefore, grouping guarantees
    #   that any particular group (e.g. a single patient_id) is only
    #   in the training set or in the validation set.
    # time series split: data representing a time series also break the
    #   i.i.d. assumption, because samples that are near each other
    #   in time are by definition correlated (autocorrelation). To get
    #   around this, split the data into k partitions, and in each
    #   of k loops, use partitions [0, i] to train and partition [i+1]
    #   for validation. Ensure that you are always training on the past
    #   and validating on the "future."
    #
    # TODO(sbala): Ideally, we'd find a way to do stratified group k-fold.
    # sklearn only provides StratifiedKFold and GroupKFold out of the box.
    # Given how unbalanced many of our classification problems are, use
    # StratifiedKFold for now, but we need something better.

    # Use information about y to determine n_splits.
    # In certain pathological cases (esp. with bifurcated classifiers)
    # there might be fewer than n examples of a given class in y.
    # If that's the case, n_splits can't be greater than the size of the
    # smallest class.
    if y is not None:
        class_counts = Series(y).value_counts()
        log.debug('y.value_counts(): %s', class_counts)
        max_possible_splits = np.min(class_counts)
        log.debug('max_possible_splits: %s', max_possible_splits)
        n_splits = int(np.min([10, max_possible_splits]))
    else:
        n_splits = 10
    log.debug('n_splits: %s', n_splits)

    if self.CV_STRATEGY == 'StratifiedKFold':
        # random_state is deliberately not forwarded: it has no effect while
        # shuffle=False, and current scikit-learn raises ValueError when a
        # seed is combined with shuffle=False.
        return StratifiedKFold(n_splits=n_splits, shuffle=False)
    if self.CV_STRATEGY == 'GroupKFold':
        # GroupKFold is not randomized at all, hence no random_state.
        return GroupKFold(n_splits=n_splits)
    # Any other strategy name yields no splitter, as before.
    return None
score = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print('Cross-validation score:\n{}'.format(score))
# Cross-validation score:
# [0.98666667 0.94666667 0.97333333 0.96 0.97333333 0.94666667
#  0.97333333 0.96 0.93333333 0.94666667]

# Grouped cross-validation
# Useful when samples within a group are highly correlated.
# For example, in facial-emotion recognition the same person's different
# expressions must not appear in both the training set and the test set,
# otherwise the measured score is optimistically biased.
# We want all expressions of one person to stay together in one group
# instead of being scattered across the training and test sets.
# GroupKFold does exactly that.
# The example below has 12 data points divided into 4 groups.
X, y = make_blobs(n_samples=12, random_state=0)
# Assume the first 3 samples belong to one group, the next 4 to another,
# and so on.
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
# `groups` must be passed by keyword: it is a keyword-only parameter of
# cross_val_score in current scikit-learn, so the positional form fails.
scores = cross_val_score(logreg, X, y, groups=groups,
                         cv=GroupKFold(n_splits=3))
print('Cross-validation scores:\n{}'.format(scores))
# Cross-validation scores:
# [0.75 0.6 0.66666667]

# Visualise the grouping
mglearn.plots.plot_group_kfold()
plt.show()

# scikit-learn offers many more cross-validation splitting strategies;
# see the user guide for details.

# Grid search
# A simple hand-written grid search
# NOTE(review): the chained assignment also binds `train` to the full list
# returned by train_test_split; this looks like a typo but is kept so the
# names defined by this script do not change.
X_train, X_test, y_train, y_test = train = train_test_split(
    iris.data, iris.target, random_state=0
)
# # 2. Training # Using commentBody as X and editiorsSelection as Y. Use the article ID to track data and split: # In[67]: commentBody = new.commentBody nytpicks = new.editorsSelection articleID = new.articleID # In[69]: for train_index, test_index in GroupKFold(n_splits=5).split(commentBody, nytpicks, groups=articleID): train_text, test_text = commentBody[train_index], commentBody[test_index] train_target, test_target = nytpicks[train_index], nytpicks[test_index] train_groups, test_groups = articleID[train_index], articleID[test_index] train_text.shape[0], test_text.shape[0] # Using TFIDF for words and character n-grams and combine them using FeatureUnion # In[70]: vectorizer = FeatureUnion([ ('word_tfidf', TfidfVectorizer( analyzer='word',
def main(args, logger):
    """Train one model per fold with 5-fold grouped cross-validation.

    Host names matching an entry of ``HOSTs`` are first replaced by a
    ``host_<name>`` token.  Rows are grouped by ``question_body_le`` so that
    rows sharing a group value never appear on both sides of a split.  For
    every fold the function builds the datasets/loaders, trains for
    ``MAX_EPOCH`` epochs, writes a checkpoint after each epoch, and finally
    logs and notifies the mean/std of the best validation metric across
    folds.

    Parameters
    ----------
    args
        Parsed command-line arguments; ``args.checkpoint`` (path to resume
        from, falsy to start fresh) and ``args.debug`` (sub-sample each fold
        to 100 rows) are read here.
    logger
        Logger handed to ``sel_log`` and ``model.freeze_unfreeze_bert``.
    """
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    for HOST in HOSTs:
        host_rows = trn_df.host.str.contains(HOST).values
        trn_df.loc[host_rows, 'host'] = f'HOST_{HOST}'.casefold()

    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0
    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    splitter = GroupKFold(n_splits=5)
    gkf = splitter.split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    # Per-fold training curves: histories[<name>][fold] -> list, one entry
    # per finished epoch.
    histories = {
        key: {}
        for key in ('trn_loss', 'val_loss', 'val_metric', 'val_metric_raws')
    }
    loaded_fold, loaded_epoch = -1, -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []

    def _record_fold_best(fold_no):
        # Keep the best validation metric of a fold together with the raw
        # (per-label) metrics recorded at that same epoch.
        fold_best_metrics.append(np.max(histories["val_metric"][fold_no]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold_no][
                np.argmax(histories["val_metric"][fold_no])])

    def _make_dataset(frame, mode, tokens):
        # Train and validation datasets differ only in data and mode.
        return QUESTDataset(
            df=frame,
            mode=mode,
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )

    def _make_loader(dataset, drop_last):
        # Both loaders draw batches through a RandomSampler.
        sampler = RandomSampler(data_source=dataset)
        return DataLoader(dataset,
                          batch_size=BATCH_SIZE,
                          sampler=sampler,
                          num_workers=os.cpu_count(),
                          worker_init_fn=lambda x: np.random.seed(),
                          drop_last=drop_last,
                          pin_memory=True)

    def _split_on_space(text):
        return text.split(' ')

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        # Folds that finished before the checkpoint only contribute their
        # stored statistics.
        if fold < loaded_fold:
            _record_fold_best(fold)
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)

        helper_columns = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(helper_columns, axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_columns, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        # NOTE(review): this frequency-based vocabulary is overwritten by the
        # fixed category/host token list just below, so its result is never
        # used.
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(_split_on_space)
            + fold_trn_df.question_body.apply(_split_on_space)
            + fold_trn_df.answer.apply(_split_on_space)
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
            'host_stackexchange',
            'host_askubuntu',
            'host_mathoverflow',
            'host_serverfault',
            'host_stackoverflow',
            'host_superuser',
        ]

        trn_dataset = _make_dataset(fold_trn_df, 'train', tokens)
        # update token
        trn_loader = _make_loader(trn_dataset, drop_last=True)
        val_dataset = _make_dataset(fold_val_df, 'valid', tokens)
        val_loader = _make_loader(val_dataset, drop_last=False)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            # Epochs already covered by the checkpoint are skipped.
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # freeze=True for epoch 0 only, freeze=False afterwards.
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)

            # Wrap for multi-GPU use during the epoch; unwrapped again below
            # before the checkpoint is written.
            model = DataParallel(model).to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            (val_loss, val_metric, val_metric_raws,
             val_y_preds, val_y_trues, val_qa_ids) = test(
                model, fobj, val_loader)

            scheduler.step()

            epoch_records = (
                ('trn_loss', trn_loss),
                ('val_loss', val_loss),
                ('val_metric', val_metric),
                ('val_metric_raws', val_metric_raws),
            )
            for key, value in epoch_records:
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)

            model = model.to('cpu').module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)

        _record_fold_best(fold)
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
"eval_df", ]) experiment_df = experiment_df.append( { "run_name": "holdout", "run_dirp": str(holdout_dirp), "train_devset_idc": "full_devset", "eval_devset_idc": "full_holdout", "train_df": dev_df, "eval_df": holdout_df, }, ignore_index=True, ) # Make KFolds and collect fold splits group_kfold = GroupKFold(n_splits=settings.N_FOLDS) groups = dev_df["document_id"].to_numpy() X = dev_df["text"].to_numpy() y = dev_df["labels"].to_numpy() for i, (train_idc, eval_idc) in enumerate(group_kfold.split(X, y, groups)): print( f"Fold {i}: {train_idc.shape[0]} train inst. and {eval_idc.shape[0]} eval inst." ) train_df = dev_df.iloc[train_idc] eval_df = dev_df.iloc[eval_idc] fold_dirp = experiment_dirp / f"fold_{i}" # collect run metadata experiment_df = experiment_df.append(
def decode_window(X, y, clf=None, cv=None, sample_weight='auto', n_jobs='auto',
                  random_state=None, labels=None):
    """Decode entire window

    Parameters
    ----------
    X : np.ndarray of float, shape(n_samples, n_sensors, n_times)
        The data. All axes after the first are flattened into one feature
        axis before fitting.
    y : np.ndarray of int, shape(n_samples,)
        The response vector.
    clf : instance of BaseEstimator | None
        The classifier. If None, defaults to a Pipeline
        (StandardScaler -> SelectPercentile(f_classif, 10%) -> linear SVC).
    cv : cross validation object | int | None
        The cross validation. If None or an int, a splitter is built here
        with 10 (or ``cv``) folds: stratified K-folds when ``labels`` is
        None, group K-folds otherwise.
    sample_weight : np.ndarray of float, shape(n_samples,) | 'auto'
        The sample weights to deal with class imbalance.
        If 'auto', each sample is weighted by the inverse of its class count.
    n_jobs : int | 'auto'
        Number of parallel jobs. If 'auto', the CPU count is used, falling
        back to 1 when it cannot be determined.
    random_state : int | RandomState | None
        Seed for the shuffling of the default stratified K-folds.
    labels : array-like, shape(n_samples,) | None
        Group labels, passed as the ``groups`` argument of ``cv.split``.

    Returns
    -------
    probas : np.ndarray of float, shape(n_samples,)
        The predicted probabilities for each sample (second column of the
        per-fold probability output).
    predictions : np.ndarray of int, shape(n_samples,)
        The class predictions.
    scores : np.ndarray of float, shape(n_resamples,)
        The score at each resampling iteration.
    """
    if n_jobs == 'auto':
        try:
            import multiprocessing as mp
            n_jobs = mp.cpu_count()
            logger.info(
                'Autodetected number of jobs {}'.format(n_jobs))
        except (ImportError, NotImplementedError):
            # cpu_count() raises NotImplementedError when the count cannot
            # be determined; fall back to a single job.
            logger.info('Cannot autodetect number of jobs')
            n_jobs = 1

    if clf is None:
        scaler = StandardScaler()
        # percentile is keyword-only in current scikit-learn.
        transform = SelectPercentile(f_classif, percentile=10)
        svc = SVC(C=1, kernel='linear', probability=True)
        clf = Pipeline([('scaler', scaler),
                        ('anova', transform),
                        ('svc', svc)])

    if cv is None or isinstance(cv, int):
        n_splits = cv if isinstance(cv, int) else 10
        if labels is None:
            cv = StratifiedKFold(n_splits=int(min(n_splits, len(y) / 2)),
                                 shuffle=True, random_state=random_state)
        else:
            cv = GroupKFold(n_splits=n_splits)

    if isinstance(sample_weight, str) and sample_weight == 'auto':
        # Weight every sample by 1 / (size of its class).
        sample_weight = np.zeros(len(y), dtype=float)
        for this_y in np.unique(y):
            this_mask = (y == this_y)
            sample_weight[this_mask] = 1.0 / np.sum(this_mask)

    y = LabelEncoder().fit_transform(y)
    X = X.reshape(len(X), np.prod(X.shape[1:]))
    probas = np.zeros(y.shape, dtype=float)
    predictions = np.zeros(y.shape, dtype=int)
    scores = list()

    # Materialise the folds once.  Calling cv.split() a second time on a
    # shuffling splitter with random_state=None produces different folds,
    # which would scatter the fold outputs onto the wrong samples.
    splits = list(cv.split(X, y, labels))

    parallel, pfunc, _ = parallel_func(_decode_window_one_fold, n_jobs)
    out = parallel(pfunc(clone(clf), X, y, train, test, sample_weight)
                   for train, test in splits)

    for (_, test), (probas_, predicts_, score_) in zip(splits, out):
        probas[test] = probas_[:, 1]  # second column
        predictions[test] = predicts_
        scores.append(score_)

    return probas, predictions, np.array(scores)
def main():
    """Run grouped K-fold cross-validation and write scores and predictions.

    Reads the feature names and the train/test CSV files from
    ``args.data_path``, trains one classifier per fold (folds are grouped by
    ``subject_id``), prints the accuracy of every fold and their mean, and
    writes ``score.csv`` and ``baseline.txt`` into a fresh timestamped
    directory under ``logs/``.  A copy of the running script is stored there
    as well.
    """
    args = parse_arguments()

    # create save path
    data_dir = args.data_path
    num_folds = args.fold

    # log directory (named after the start time; `timestamp` rather than
    # `time` so the standard-library module name is not shadowed)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    out_dir_path = path.normpath(
        path.join(getcwd(), 'logs/{}'.format(timestamp)))
    makedirs(out_dir_path, exist_ok=True)

    # copy this file to log dir
    shutil.copy(path.abspath(sys.argv[0]), out_dir_path)

    # setup data
    with open(data_dir + '/features.txt') as f:
        features_txt = f.readlines()
    # Replace every non-alphanumeric character of a feature name by "_".
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x.strip()))
        for x in features_txt
    ]

    X_train = pd.read_csv(data_dir + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(data_dir + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(data_dir + '/y_train.csv',
                          names=['activity_label'])
    subject_train = pd.read_csv(data_dir + '/subject_train.csv',
                                names=['subject_id'])

    # Make the labels zero-based.
    y_train['activity_label'] = y_train['activity_label'] - 1

    # CV
    # Prediction buffers are sized for 6 output columns per sample.
    valid_preds = np.zeros((len(X_train), 6))
    test_preds = np.zeros((num_folds, len(X_test), 6))
    kf = GroupKFold(n_splits=num_folds)
    score_df = pd.DataFrame()
    all_score = []

    # Pass the groups as a 1-D column rather than the one-column DataFrame.
    # NOTE(review): scikit-learn documents `groups` as shape (n_samples,) --
    # confirm against the installed version.
    groups = subject_train['subject_id']
    for fold, (train_index, valid_index) in enumerate(
            kf.split(X=subject_train, groups=groups)):
        str_fold = 'fold_{}'.format(fold + 1)
        print(str_fold)

        # set data
        x_trn, x_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_trn, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        # Alternatives that were tried: LGBClassifier(lgb_params) with
        # lgb_params = {'learning_rate': 0.1, 'objective': 'multiclass',
        # 'num_class': 6, 'n_jobs': -1, 'seed': 1}, KNNClassifier(),
        # LRClassifier().
        classifier = SVCClassifier()

        # train and predict (the validation set is predicted once; the
        # earlier extra predict call discarded its result)
        classifier.train(x_trn, y_trn, x_val, y_val)
        valid_preds[valid_index] = classifier.predict(x_val)
        test_preds[fold] = classifier.predict(X_test)

        # scoring
        score = accuracy_score(
            y_val, np.argmax(valid_preds[valid_index], axis=1))
        print('Fold {} Score : {}'.format(fold + 1, score))
        score_df[str_fold] = [score]
        all_score.append(score)

    # final score
    print('CV (mean) : {}'.format(np.mean(all_score)))
    score_df['mean'] = [np.mean(all_score)]

    # make submission: average the per-fold test outputs, take the arg-max
    # column and shift back to one-based labels
    score_df.to_csv(out_dir_path + '/score.csv')
    submit = np.argmax(np.mean(test_preds, axis=0), axis=1) + 1
    np.savetxt(out_dir_path + '/baseline.txt', submit)
def __init__(self,
             seed,
             val_split=0.2,
             shuffle=True,
             cell_features=['expression'],
             drug_features=['descriptors'],
             response_url=None,
             use_landmark_genes=False,
             use_combo_score=False,
             preprocess_rnaseq=None,
             exclude_cells=[],
             exclude_drugs=[],
             feature_subsample=None,
             scaling='std',
             scramble=False,
             cv_partition='overlapping',
             cv=0):
    """Initialize data merging drug response, drug descriptors and cell line assay.
    Shuffle and split training and validation set

    Parameters
    ----------
    seed: integer
        seed for random generation
    val_split : float, optional (default 0.2)
        fraction of data to use in validation
    cell_features: list of strings from 'expression', 'expression_5platform', 'mirna', 'proteome', 'all', 'categorical' (default ['expression'])
        use one or more cell line feature sets: gene expression, microRNA, proteome
        use 'all' for ['expression', 'mirna', 'proteome']
        use 'categorical' for one-hot encoded cell lines
    drug_features: list of strings from 'descriptors', 'latent', 'all', 'categorical', 'noise' (default ['descriptors'])
        use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder
        trained on NSC drugs, or both; use random features if set to noise
        use 'categorical' for one-hot encoded drugs
    shuffle : True or False, optional (default True)
        if True shuffles the merged data before splitting training and validation sets
    scramble: True or False, optional (default False)
        if True randomly shuffle dose response data as a control
    feature_subsample: None or integer (default None)
        number of feature columns to use from cellline expressions and drug descriptors
    use_landmark_genes: True or False
        only use LINCS1000 landmark genes
    use_combo_score: bool (default False)
        use combination score in place of percent growth (stored in 'GROWTH' column)
    scaling: None, 'std', 'minmax' or 'maxabs' (default 'std')
        type of feature scaling: 'maxabs' to [-1,1], 'minmax' to [0,1], 'std' for standard normalization
    response_url, exclude_cells, exclude_drugs:
        forwarded unchanged to NCI60.load_combo_dose_response
    preprocess_rnaseq:
        forwarded unchanged to NCI60.load_cell_expression_rnaseq
    cv_partition: string (default 'overlapping')
        'disjoint' builds no fold indexes here; 'disjoint_cells' builds
        GroupKFold folds grouped by cell line; any other value builds
        StratifiedKFold folds on the discretized growth values
    cv: integer (default 0)
        number of cross-validation folds; fold indexes are only built when cv > 1
    """
    self.cv_partition = cv_partition
    np.random.seed(seed)

    df = NCI60.load_combo_dose_response(response_url=response_url,
                                        use_combo_score=use_combo_score,
                                        fraction=True,
                                        exclude_cells=exclude_cells,
                                        exclude_drugs=exclude_drugs)
    logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format(
        df.shape[0]))

    if 'all' in cell_features:
        self.cell_features = ['expression', 'mirna', 'proteome']
    else:
        self.cell_features = cell_features

    if 'all' in drug_features:
        self.drug_features = ['descriptors', 'latent']
    else:
        self.drug_features = drug_features

    # Load each requested cell line feature table and keep only the
    # response rows whose cell line is present in it.
    for fea in self.cell_features:
        if fea == 'expression' or fea == 'rnaseq':
            self.df_cell_expr = NCI60.load_cell_expression_rnaseq(
                ncols=feature_subsample,
                scaling=scaling,
                use_landmark_genes=use_landmark_genes,
                preprocess_rnaseq=preprocess_rnaseq)
            df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
        elif fea == 'expression_u133p2':
            self.df_cell_expr = NCI60.load_cell_expression_u133p2(
                ncols=feature_subsample,
                scaling=scaling,
                use_landmark_genes=use_landmark_genes)
            df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
        elif fea == 'expression_5platform':
            self.df_cell_expr = NCI60.load_cell_expression_5platform(
                ncols=feature_subsample,
                scaling=scaling,
                use_landmark_genes=use_landmark_genes)
            df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
        elif fea == 'mirna':
            self.df_cell_mirna = NCI60.load_cell_mirna(
                ncols=feature_subsample, scaling=scaling)
            df = df.merge(self.df_cell_mirna[['CELLNAME']], on='CELLNAME')
        elif fea == 'proteome':
            self.df_cell_prot = NCI60.load_cell_proteome(
                ncols=feature_subsample, scaling=scaling)
            df = df.merge(self.df_cell_prot[['CELLNAME']], on='CELLNAME')
        elif fea == 'categorical':
            # one-hot encode the cell line names (':' replaced by '.')
            df_cell_ids = df[['CELLNAME']].drop_duplicates()
            cell_ids = df_cell_ids['CELLNAME'].map(
                lambda x: x.replace(':', '.'))
            df_cell_cat = pd.get_dummies(cell_ids)
            df_cell_cat.index = df_cell_ids['CELLNAME']
            self.df_cell_cat = df_cell_cat.reset_index()

    # Load each requested drug feature table; for descriptor/latent
    # features keep only rows where both drugs are present in the table.
    for fea in self.drug_features:
        if fea == 'descriptors':
            self.df_drug_desc = NCI60.load_drug_descriptors(
                ncols=feature_subsample, scaling=scaling)
            df = df[df['NSC1'].isin(self.df_drug_desc['NSC'])
                    & df['NSC2'].isin(self.df_drug_desc['NSC'])]
        elif fea == 'latent':
            self.df_drug_auen = NCI60.load_drug_autoencoded_AG(
                ncols=feature_subsample, scaling=scaling)
            df = df[df['NSC1'].isin(self.df_drug_auen['NSC'])
                    & df['NSC2'].isin(self.df_drug_auen['NSC'])]
        elif fea == 'categorical':
            df_drug_ids = df[['NSC1']].drop_duplicates()
            df_drug_ids.columns = ['NSC']
            drug_ids = df_drug_ids['NSC']
            df_drug_cat = pd.get_dummies(drug_ids)
            df_drug_cat.index = df_drug_ids['NSC']
            self.df_drug_cat = df_drug_cat.reset_index()
        elif fea == 'noise':
            # 500 standard-normal random features per unique drug
            ids1 = df[['NSC1']].drop_duplicates().rename(
                columns={'NSC1': 'NSC'})
            ids2 = df[['NSC2']].drop_duplicates().rename(
                columns={'NSC2': 'NSC'})
            df_drug_ids = pd.concat([ids1, ids2]).drop_duplicates()
            noise = np.random.normal(size=(df_drug_ids.shape[0], 500))
            df_rand = pd.DataFrame(
                noise,
                index=df_drug_ids['NSC'],
                columns=['RAND-{:03d}'.format(x) for x in range(500)])
            self.df_drug_rand = df_rand.reset_index()

    logger.info(
        'Filtered down to {} rows with matching information.'.format(
            df.shape[0]))

    # Unique drugs appearing in either position of the remaining rows.
    ids1 = df[['NSC1']].drop_duplicates().rename(columns={'NSC1': 'NSC'})
    ids2 = df[['NSC2']].drop_duplicates().rename(columns={'NSC2': 'NSC'})
    df_drug_ids = pd.concat([ids1, ids2
                             ]).drop_duplicates().reset_index(drop=True)

    n_drugs = df_drug_ids.shape[0]
    n_val_drugs = int(n_drugs * val_split)
    n_train_drugs = n_drugs - n_val_drugs

    logger.info('Unique cell lines: {}'.format(df['CELLNAME'].nunique()))
    logger.info('Unique drugs: {}'.format(n_drugs))
    # df.to_csv('filtered.growth.min.tsv', sep='\t', index=False, float_format='%.4g')
    # df.to_csv('filtered.score.max.tsv', sep='\t', index=False, float_format='%.4g')

    if shuffle:
        df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
        df_drug_ids = df_drug_ids.sample(
            frac=1.0, random_state=seed).reset_index(drop=True)

    self.df_response = df
    self.df_drug_ids = df_drug_ids

    self.train_drug_ids = df_drug_ids['NSC'][:n_train_drugs]
    # Slice from the end of the training part: the original
    # `[-n_val_drugs:]` returned ALL drugs when n_val_drugs was 0.
    self.val_drug_ids = df_drug_ids['NSC'][n_train_drugs:]

    if scramble:
        # Permute the growth values by position. Assigning the raw array
        # avoids the index alignment of the original Series assignment,
        # which yields NaN whenever the frame index is not 0..n-1
        # (e.g. shuffle=False after row filtering).
        order = np.random.permutation(np.arange(df.shape[0]))
        self.df_response['GROWTH'] = df['GROWTH'].values[order]
        logger.warning('Randomly shuffled dose response growth values.')

    logger.info('Distribution of dose response:')
    logger.info(self.df_response[['GROWTH']].describe())

    self.total = df.shape[0]
    self.n_val = int(self.total * val_split)
    self.n_train = self.total - self.n_val
    logger.info('Rows in train: {}, val: {}'.format(
        self.n_train, self.n_val))

    # Map each feature name to the attribute holding its table.
    self.cell_df_dict = {
        'expression': 'df_cell_expr',
        'expression_5platform': 'df_cell_expr',
        'expression_u133p2': 'df_cell_expr',
        'rnaseq': 'df_cell_expr',
        'mirna': 'df_cell_mirna',
        'proteome': 'df_cell_prot',
        'categorical': 'df_cell_cat'
    }

    self.drug_df_dict = {
        'descriptors': 'df_drug_desc',
        'latent': 'df_drug_auen',
        'categorical': 'df_drug_cat',
        'noise': 'df_drug_rand'
    }

    # input name -> feature type, and feature type -> shape; the "- 1"
    # discounts the id column each feature table carries.
    self.input_features = collections.OrderedDict()
    self.feature_shapes = {}
    for fea in self.cell_features:
        feature_type = 'cell.' + fea
        feature_name = 'cell.' + fea
        df_cell = getattr(self, self.cell_df_dict[fea])
        self.input_features[feature_name] = feature_type
        self.feature_shapes[feature_type] = (df_cell.shape[1] - 1, )

    for drug in ['drug1', 'drug2']:
        for fea in self.drug_features:
            feature_type = 'drug.' + fea
            feature_name = drug + '.' + fea
            df_drug = getattr(self, self.drug_df_dict[fea])
            self.input_features[feature_name] = feature_type
            self.feature_shapes[feature_type] = (df_drug.shape[1] - 1, )

    self.feature_shapes['dose'] = (1, )
    for dose in ['dose1', 'dose2']:
        self.input_features[dose] = 'dose'

    logger.info('Input features shapes:')
    for k, v in self.input_features.items():
        logger.info('  {}: {}'.format(k, self.feature_shapes[v]))

    self.input_dim = sum([
        np.prod(self.feature_shapes[x])
        for x in self.input_features.values()
    ])
    logger.info('Total input dimensions: {}'.format(self.input_dim))

    if cv > 1:
        splits = None
        if cv_partition == 'disjoint':
            # No fold indexes are built for this mode (as in the original).
            pass
        elif cv_partition == 'disjoint_cells':
            y = self.df_response['GROWTH'].values
            groups = self.df_response['CELLNAME'].values
            gkf = GroupKFold(n_splits=cv)
            splits = gkf.split(y, groups=groups)
        else:
            y = self.df_response['GROWTH'].values
            # kf = KFold(n_splits=cv)
            # splits = kf.split(y)
            # random_state is deliberately not passed: without
            # shuffle=True it has no effect, and scikit-learn >= 0.24
            # raises ValueError for that combination.
            skf = StratifiedKFold(n_splits=cv)
            splits = skf.split(y, discretize(y, bins=cv))

        if splits is not None:
            self.cv_train_indexes = []
            self.cv_val_indexes = []
            for index, (train_index, val_index) in enumerate(splits):
                print(index, train_index)
                self.cv_train_indexes.append(train_index)
                self.cv_val_indexes.append(val_index)
# %%
writer_ids = np.unique(groups)

plt.plot(groups)
plt.yticks(writer_ids)
plt.xticks(writer_boundaries, rotation=90)
plt.xlabel("Target index")
plt.ylabel("Writer index")
_ = plt.title("Underlying writer groups existing in the target")

# %% [markdown]
# Once the digits are grouped by writer, cross-validation can take this
# information into account: use a splitter class whose name contains `Group`.

# %%
from sklearn.model_selection import GroupKFold

cv = GroupKFold()
test_score = cross_val_score(
    model, data, target, groups=groups, cv=cv, n_jobs=-1
)
mean_accuracy, std_accuracy = test_score.mean(), test_score.std()
print(f"The average accuracy is {mean_accuracy:.3f} +/- {std_accuracy:.3f}")

# %% [markdown]
# This strategy is less optimistic about the model's statistical
# performance. However, it is the most reliable one if the goal is to make
# handwritten digit recognition independent of the writer. In addition, the
# standard deviation was reduced.
def fit_meta_feature(
        X_train,
        X_valid,
        X_test,
        Meta_train,
        train_idx,
        bond_type,
        base_fold,
        feature="fc",
        N_META_FOLDS=N_META_FOLDS,
        N_META_ESTIMATORS=N_META_ESTIMATORS,
        model_type="catboost",
):
    """Add an out-of-fold meta feature column to train, valid and test.

    A regressor predicting ``Meta_train[feature]`` is trained on each
    ``GroupKFold`` split of ``X_train`` (groups come from
    ``mol_group_type.iloc[train_idx]``). The new ``"meta_<feature>"`` column
    holds the out-of-fold predictions for the training rows and the mean of
    the per-fold predictions for ``X_valid`` and ``X_test``.

    Parameters
    ----------
    X_train, X_valid, X_test : pandas.DataFrame
        Feature frames. All three passed-in frames get the meta column added
        in place; ``X_valid`` and ``X_test`` are returned as the same objects.
    Meta_train : pandas.DataFrame
        Frame holding the target column ``feature``; its rows are selected
        with the same positional fold indexes as ``X_train``.
    train_idx : positional indexer
        Rows of ``mol_group_type`` that correspond to ``X_train``.
    bond_type, base_fold :
        Only used in log messages, tracking keys and output file names.
    feature : str
        Name of the target column in ``Meta_train``.
    N_META_FOLDS : int
        Number of ``GroupKFold`` splits.
    N_META_ESTIMATORS : int
        Number of CatBoost iterations.
    model_type : {"catboost", "xgboost"}
        Which regressor to fit.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test)`` with the meta column added.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    if model_type not in ("catboost", "xgboost"):
        # The original fell through both branches and failed later with a
        # NameError on `model`; fail early with a clear message instead.
        raise ValueError(
            "model_type must be 'catboost' or 'xgboost', got {!r}".format(
                model_type))

    meta_col = "meta_" + feature

    logger.info(f"{bond_type}: Creating meta feature {feature}")
    logger.info("{}: X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1

    # Init predictions
    X_valid[meta_col] = 0
    X_test[meta_col] = 0
    X_train[meta_col] = 0
    # Float dtype up front: the column is filled piecewise with float
    # predictions, which an int column cannot hold without upcasting.
    X_train_oof = X_train[[meta_col]].astype(float)
    X_train = X_train.drop(meta_col, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train,
                        groups=mol_group_type.iloc[train_idx].values)):
        logger.info("{}: Running Meta Feature Type {} - Fold {} of {}".format(
            bond_type, feature, fold_count, folds.n_splits))
        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type, feature),
                        N_META_FOLDS)

        # Positional row selection for this fold (the split indexes are
        # sorted row positions, so this matches a boolean position mask).
        X_train2 = X_train.iloc[train_idx2].copy()
        X_valid2 = X_train.iloc[valid_idx2].copy()
        y_train2 = Meta_train.iloc[train_idx2][feature]
        y_valid2 = Meta_train.iloc[valid_idx2][feature]
        fold_count += 1

        # The models are trained without the meta column, so predict on
        # valid/test without it as well. The original catboost branch passed
        # the frames with the accumulating meta column still attached.
        X_valid_features = X_valid.drop(meta_col, axis=1)
        X_test_features = X_test.drop(meta_col, axis=1)

        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            valid_dataset = Pool(data=X_valid_features)
            test_dataset = Pool(data=X_test_features)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU
            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)
        else:  # model_type == "xgboost"
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(X_valid2)
            y_pred_valid = model.predict(X_valid_features)
            y_pred = model.predict(X_test_features)

        # Out-of-fold predictions for the held-out training rows; valid and
        # test predictions are summed here and averaged after the loop.
        X_train_oof.iloc[valid_idx2, 0] = y_pred_meta_valid
        X_valid[meta_col] += y_pred_valid
        X_test[meta_col] += y_pred

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = X_train.columns
        fold_importance["importance"] = model.feature_importances_
        fold_importance["type"] = bond_type
        fold_importance["fold"] = fold_n + 1
        feature_importance = pd.concat(
            [feature_importance, fold_importance], axis=0)

        # NOTE(review): `best_iteration_` is read for both model types;
        # confirm the xgboost wrapper in use exposes this attribute name.
        update_tracking(run_id,
                        '{}_f{}-{}_meta{}_best_iter'.format(
                            bond_type, base_fold, fold_count, feature),
                        model.best_iteration_,
                        integer=True)

    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof[meta_col])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"{bond_type} Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )

    X_valid[meta_col] = X_valid[meta_col] / N_META_FOLDS
    X_test[meta_col] = X_test[meta_col] / N_META_FOLDS
    X_train[meta_col] = X_train_oof[meta_col]

    # Persist every artefact under the same naming scheme; only the tag in
    # the file name differs between them.
    outputs = (
        ("fi", feature_importance),
        ("oof", X_train_oof),
        ("X_train", X_train),
        ("X_valid", X_valid),
        ("X_test", X_test),
    )
    for tag, frame in outputs:
        frame.to_parquet(
            "type_results/{}/meta/{}_{}_{}_{}_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
            .format(
                bond_type,
                MODEL_NUMBER,
                run_id,
                bond_type,
                tag,
                feature,
                base_fold,
                oof_score,
                log_oof_score,
            ))

    logger.info(f"{bond_type} Done creating meta features")
    logger.info("{} X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))

    return X_train, X_valid, X_test
X_test: pd.DataFrame = test[use_cols_revised].copy() print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}") # X.to_csv("../info/X_sampled.csv") # export colnames pd.DataFrame({ "columns": X.columns.tolist() }).to_csv(log_path / f"use_cols.csv") #################################################################################################### # Model Fitting print("start fitting") n_fold = 5 # folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) folds = GroupKFold(n_splits=n_fold) ######################################################################################################### # 1st layer model seed_base = [0, 2019, 71, 1228, 1988, 1879, 92, 3018, 1234, 185289] #seed_list = np.array(seed_base) + 40 #seed_list = np.array(seed_base) + 41 #seed_list = np.array(seed_base) + 42 #seed_list = np.array(seed_base) + 43 #seed_list = np.array(seed_base) + 44 current_seed = -1 num_leaves_dict = { 0: 8, 1: 8,
def regression_stage1(dmi,
                      divs,
                      dtreatments,
                      gm_wbc,
                      dinfections,
                      pidmeta,
                      dw,
                      wbc_type='neutrophils'):
    """Stage 1 Elastic Net regression

    dmi: microbiota composition for intervals
    divs: microbiota diversity for intervals, contains sample additional sample info (pid)
    dtreatments: immunomodulatory medications and treatments administered during interval
    gm_wbc: geometric mean of absolute white blood cell counts during interval
    dinfections: positive blood cultures detected during interval
    pidmeta: patient and HCT meta data
    dw: daily change in WBC count, "y"
    wbc_type: WBC type considered; the regression target is the column
        "log_<wbc_type>"

    Returns a pandas Series with the non-zero coefficients (sorted) plus the
    entries "gr" (intercept), "N" (number of patients), "n" (number of rows),
    "r2" and "alpha".
    """
    from sklearn.linear_model import ElasticNetCV
    from sklearn.model_selection import GroupKFold
    from sklearn.preprocessing import MinMaxScaler

    target_col = 'log_%s' % wbc_type
    wbc_cols = [
        'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils', 'platelets'
    ]

    # identify patients with microbiota data (stage 2) to exclude in stage 1
    # by inner-joining with microbiota diversity table
    Xmi = pd.merge(divs, dw, on=["pid", "anon-date"], how="inner")
    mi_pids = Xmi.reset_index().pid.unique()

    # stage 1 feature selection regression (and stage 2 as regularized ML
    # version instead of full Bayesian for internal checks)
    # NOTE(review): the block structure was reconstructed from a
    # whitespace-collapsed source with the return inside the loop, so only
    # the "stage1" iteration runs. Confirm against the original layout.
    for MIINDICATOR in ["stage1", "stage2"]:
        X = pd.merge(
            dw,
            gm_wbc,
            on=["pid", "anon-date"],
            how="inner",
            suffixes=["", "_REMOVEgmwbc"])
        if MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2, include microbiome
            X = pd.merge(
                X,
                dmi,
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_dmi"])
            X = pd.merge(
                X,
                divs.reset_index()[["pid", "anon-date", "inverseSimpson"]],
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_ivs"])
        X = pd.merge(
            X,
            dtreatments,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdreatments"])
        X = pd.merge(
            X,
            dinfections,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdinfections"])
        X = pd.merge(
            X,
            pidmeta,
            on=["pid", "n_bmt"],
            how="left",
            suffixes=["", "_REMOVEpidmeta"])
        # Keep rows with an observed target. This uses the `wbc_type`
        # argument; the original read an out-of-scope `WBCTYPE` name here
        # while every other use of the target column used `wbc_type`.
        X = X.loc[~X[target_col].isna()]
        # duplicate columns from the merges were suffixed with "REMOVE"
        X = X[[x for x in X.columns if "REMOVE" not in x]]
        X = X.drop(columns=["n_bmt"])
        if MIINDICATOR == "stage1":
            # mi_pids: patients with microbiota data
            X = X.loc[X.pid.apply(lambda v: v not in mi_pids)]
        elif MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2
            X = X.loc[X.pid.apply(lambda v: v in mi_pids)]

        # intercepts per transplant type
        X = X.join(pd.get_dummies(X["hct_source"]))
        # drop original HCT type column
        X = X.drop(columns=['hct_source', "PBSC"])  # PBSC as reference
        # intercepts per intensity
        X = X.join(pd.get_dummies(X["Intensity"].fillna("unknown")))
        X = X.drop(columns=["ABLATIVE", "unknown",
                            "Intensity"])  # ABLATIVE as reference
        # intercepts female
        X = X.join(pd.get_dummies(X.sex)['F'])
        X = X.drop(columns='sex')

        # only after engraftment (day 6 is the smallest observed engraftment
        # day); NOTE(review): the original comment also says "before day
        # 100", but no upper bound is applied here.
        X = X.query('day>6 ').copy()
        # only analyze daily changes
        X = X.query('dt == 1').copy()

        # from join, fill gaps
        X.loc[:, dinfections.columns] = X[dinfections.columns].fillna(0)
        X.loc[:, dtreatments.columns] = X[dtreatments.columns].fillna(0)
        # missing patient ages, fill with mean for ML feature selection
        X["age"] = X["age"].fillna(X["age"].mean())

        ### drop columns
        # delta time columns
        dtcols = [x for x in X.columns if 'dt' in x]
        X = X.drop(columns=dtcols)
        # time point columns
        anoncols = [x for x in X.columns if 'anon' in x]
        X = X.drop(columns=anoncols)
        # patient id columns
        pidcols = [x for x in X.columns if ('pid' in x) and (x != 'pid')]
        remaining_pids = X.reset_index().pid.unique()
        print("pid count in regression", len(remaining_pids))
        X = X.drop(columns=pidcols)
        print("shape before dropna", X.shape)
        X = X.dropna()
        print("shape after dropna (should not change)", X.shape)
        # drop all zero columns
        drop_zero_columns = (X.sum() == 0) & (X.max() == 0)
        drop_zero_columns = drop_zero_columns.loc[
            drop_zero_columns.values].index
        X = X.drop(columns=drop_zero_columns)
        # drop HCT day
        daycolumns = [
            x for x in X.columns
            if ("day" in x) and (x not in ["day", "eday"])
        ]
        X.drop(columns=daycolumns, inplace=True)

        #### data transformations and standardizations
        # log10 with a small offset (0.1 / 2) so zero counts stay finite
        X.loc[:, wbc_cols] = np.log10(X[wbc_cols] + 0.1 / 2)

        # CV on patient sub samples, X contains 'pid' column
        Xpid = X.copy()
        X = X.drop(columns=['pid'])
        groups = Xpid.pid

        features = X.drop(columns=target_col)
        target = X[target_col]

        group_kfold = GroupKFold(n_splits=10)
        cv = list(group_kfold.split(features, target, groups))
        # `normalize=False` was the default and the argument was removed in
        # scikit-learn 1.2, so it is no longer passed.
        mod = ElasticNetCV(cv=cv, positive=False, fit_intercept=True)
        # Scale once and reuse for fit and score (the original fitted two
        # identical scalers on the same data).
        scaled_features = MinMaxScaler().fit_transform(features)
        res = mod.fit(scaled_features, target)
        r2 = mod.score(scaled_features, target)
        print('chosen alpha: %f' % mod.alpha_)
        print("R2 = %f" % r2)
        coefs = pd.Series(
            res.coef_, index=features.columns).replace(
                0, np.nan).dropna().sort_values()
        coefs["gr"] = mod.intercept_
        coefs["N"] = len(np.unique(Xpid.pid))
        coefs["n"] = X.shape[0]
        coefs["r2"] = r2
        coefs["alpha"] = mod.alpha_
        return (coefs)
def get_data_loader(config, group_id=None):
    """Build one train/validation/test DataLoader triple per GroupKFold split.

    Args:
        config: settings object; this function reads ``GROUP_BATCH_SIZE``,
            ``data_fe``, ``SPLITS`` and ``NNBATCHSIZE`` from it and passes it
            on to ``IronDataset``.
        group_id: optional selector; when given, ``get_group_index`` is used
            to restrict the train, validation and test groups of every fold.

    Returns:
        Three lists of equal length (one entry per split):
        ``train_dataloaders``, ``valid_dataloaders``, ``test_dataloaders``.
    """
    # read data and batching
    train, test, sub = get_data(config)
    features = ['signal']
    train = batching(train, batch_size=config.GROUP_BATCH_SIZE)
    test = batching(test, batch_size=config.GROUP_BATCH_SIZE)

    # data feature engineering
    # NOTE(review): the source was whitespace-collapsed, so the extent of this
    # `if` body is a reconstruction; it assumes all four statements are
    # conditional (which is why `features` has a default above) — confirm.
    if config.data_fe is not None and 'shifted' in config.data_fe:
        train, test = normalize(train, test)
        train = run_feat_engineering(train)
        test = run_feat_engineering(test)
        train, test, features = feature_selection(train, test)

    # cross valid
    target = ['open_channels']
    group = train['group']
    kf = GroupKFold(n_splits=config.SPLITS)
    splits = [x for x in kf.split(train, train[target], group)]

    # Convert row-level splits into group-level splits: each entry becomes
    # [unique train group ids, unique validation group ids, validation rows].
    new_splits = []
    for sp in splits:
        new_split = []
        new_split.append(np.unique(group[sp[0]]))
        new_split.append(np.unique(group[sp[1]]))
        new_split.append(sp[1])
        new_splits.append(new_split)

    # Regroup the frames into arrays with one entry per 'group' value, so the
    # group ids above can be used to index them.
    target_cols = ['open_channels']
    train_tr = np.array(
        list(train.groupby('group').apply(
            lambda x: x[target_cols].values))).astype(np.float32)
    train = np.array(
        list(train.groupby('group').apply(lambda x: x[features].values)))
    test = np.array(
        list(test.groupby('group').apply(lambda x: x[features].values)))

    train_dataloaders = []
    valid_dataloaders = []
    test_dataloaders = []
    for index, (train_index, val_index, _) in enumerate(new_splits[0:],
                                                         start=0):
        # build dataloader
        # Placeholder targets for the test set; the first dimension is
        # hard-coded from 2,000,000 rows.
        # NOTE(review): presumably the number of test rows — confirm.
        test_y = np.zeros([
            int(2000000 / config.GROUP_BATCH_SIZE), config.GROUP_BATCH_SIZE, 1
        ])
        test_dataset = IronDataset(test, test_y, config, training=False)

        if group_id is not None:
            # keep only the groups selected by `group_id` in all three sets
            train_group_indexs, test_group_indexs = get_group_index(
                group_id, len(train), len(test))
            train_index = np.intersect1d(train_index, train_group_indexs)
            val_index = np.intersect1d(val_index, train_group_indexs)
            test_dataset = IronDataset(test[test_group_indexs],
                                       test_y[test_group_indexs],
                                       config,
                                       training=False)

        test_dataloader = DataLoader(test_dataset,
                                     config.NNBATCHSIZE,
                                     shuffle=False)
        train_dataset = IronDataset(train[train_index],
                                    train_tr[train_index],
                                    config,
                                    training=True)
        train_dataloader = DataLoader(train_dataset,
                                      config.NNBATCHSIZE,
                                      shuffle=True,
                                      num_workers=16)
        valid_dataset = IronDataset(train[val_index],
                                    train_tr[val_index],
                                    config,
                                    training=False)
        valid_dataloader = DataLoader(valid_dataset,
                                      config.NNBATCHSIZE,
                                      shuffle=False)
        train_dataloaders.append(train_dataloader)
        valid_dataloaders.append(valid_dataloader)
        test_dataloaders.append(test_dataloader)

    return train_dataloaders, valid_dataloaders, test_dataloaders
# Tag the test rows and stack them under the training rows.
test['filter'] = 2
ts1 = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)

# Coarse time bins: 14 equal-width bins over [0, 700]; 'time2' then becomes
# the rank of 'time' inside its bin, divided by 500000.
coarse_edges = np.linspace(0.0000, 700., num=14 + 1)
coarse_bin = pd.cut(ts1['time'],
                    bins=coarse_edges,
                    labels=list(range(14)),
                    include_lowest=True)
ts1['time2'] = coarse_bin.astype(int)
ts1['time2'] = ts1.groupby('time2')['time'].rank( )/500000.

np.random.seed(321)
# Fine time bins used as cross-validation groups: 14*125 equal-width bins.
fine_edges = np.linspace(0.0000, 700., num=14*125 + 1)
fine_bin = pd.cut(ts1['time'],
                  bins=fine_edges,
                  labels=list(range(14*125)),
                  include_lowest=True)
ts1['group'] = fine_bin.astype(int)

np.random.seed(321)
# Rows with filter == 0 are the ones used for fitting and validation.
is_fit_row = ts1['filter'] == 0
y = ts1.loc[is_fit_row, 'open_channels']
group = ts1.loc[is_fit_row, 'group']
X = ts1.loc[is_fit_row, 'signal']

np.random.seed(321)
skf = GroupKFold(n_splits=5)
splits = list(skf.split(X, y, group))

excluded_cols = ['index','filter','group', 'open_channels', 'time', 'time2']
use_cols = [col for col in ts1.columns if col not in excluded_cols]

# Fill missing values of every input column with that column's mean.
for col in use_cols:
    col_mean = ts1[col].mean()
    ts1[col] = ts1[col].fillna(col_mean)

# Prediction buffers with 11 columns: one row per fit row / per test row.
n_fit_rows = ts1[is_fit_row].shape[0]
n_test_rows = ts1[ts1['filter']==2].shape[0]
val_preds_all = np.zeros((n_fit_rows, 11))
test_preds_all = np.zeros((n_test_rows, 11))

groups = ts1.loc[is_fit_row, 'group']
times = ts1.loc[is_fit_row, 'time']
import numpy as np import pandas as pd from sklearn.model_selection import GroupKFold from sklearn import svm import matplotlib.pyplot as plt from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import shuffle from features import getFeatures from sklearn.metrics import f1_score, accuracy_score # File containing values of Username, Comment, 11 Features, Label of user new = pd.read_csv('Joined_v2.csv') new = new.drop(new.columns[new.columns.str.contains('unnamed', case=False)], axis=1) kfold = GroupKFold(n_splits=3) # Calculating the PCA to reduce number of features def getPca(dat_array): pca = PCA(n_components=0.999) DATA_PCA = pca.fit_transform(dat_array) return DATA_PCA # Getting label value of exact number of users as exact number of comments def getLabel(sortedData): usercnt = sortedData['Label'].value_counts().to_dict() numOfComm = list(usercnt.values())[0]
random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True seed_torch(seed=CFG.seed) # ## CV splits # In[8]: folds = train.copy() Fold = GroupKFold(n_splits=CFG.n_fold) groups = folds['PatientID'].values for n, (train_index, val_index) in enumerate( Fold.split(folds, folds[CFG.target_cols], groups)): folds.loc[val_index, 'fold'] = int(n) folds['fold'] = folds['fold'].astype(int) print(folds.groupby('fold').size()) # ## Dataset # In[9]: # ==================================================== # Dataset # ==================================================== COLOR_MAP = {
embeddings.append(embedding) input_numeric = Input(shape=(len(num), )) embedding_numeric = Dense(512, activation='relu')(input_numeric) inputs.append(input_numeric) embeddings.append(embedding_numeric) x = Concatenate()(embeddings) x = Dense(256, activation='relu')(x) x = Dense(128, activation='relu')(x) x = Dropout(0.5)(x) output = Dense(199, activation='softmax')(x) model = Model(inputs, output) return model n_splits = 5 kf = GroupKFold(n_splits=n_splits) score = [] for i_, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])): print(f'Fold : {i_+1}') X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx] X_train = [np.absolute(X_train[i]) for i in cat ] + [X_train[num]] # + [X_train[env1]] + [X_train[env2]] X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]] # + [X_val[env1]] + [X_val[env2]] model = model_NN() model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=[]) es = EarlyStopping(monitor='val_CRPS',
def _version_tuple(version):
    """Return the leading numeric components of a version string as ints.

    ``"0.22.2.post1"`` becomes ``(0, 22, 2)``; parsing stops at the first
    component that does not start with a digit.  Tuples compare numerically,
    unlike the raw strings (``"0.9" < "0.20"`` is False as text).
    """
    import re

    parts = []
    for piece in version.split("."):
        match = re.match(r"\d+", piece)
        if match is None:
            break
        parts.append(int(match.group()))
    return tuple(parts)


def _require_output_dir(path):
    """Abort via ``gs.fatal`` when the directory that should hold *path* is missing.

    A bare file name has an empty dirname; it is treated as the current
    directory instead of being rejected.
    """
    directory = os.path.dirname(path) or os.curdir
    if not os.path.exists(directory):
        gs.fatal("Directory for output file {} does not exist".format(path))


def _dense_one_hot_encoder():
    """Build a dense-output ``OneHotEncoder`` that ignores unknown categories.

    Newer scikit-learn spells the dense switch ``sparse_output``; older
    releases only know ``sparse``.  Try the new keyword first and fall back.
    """
    from sklearn.preprocessing import OneHotEncoder

    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


def main():
    """Train an estimator on extracted or loaded training data and save it.

    Reads its configuration from the module-level ``options`` and ``flags``
    mappings, then:

    1. builds the estimator (optionally wrapped in a preprocessing
       ``Pipeline`` and a ``GridSearchCV``),
    2. obtains ``X``, ``y``, ``cat`` (and optionally group ids) either from
       ``load_training`` or by sampling the ``RasterStack``,
    3. fits the estimator,
    4. optionally reports cross-validated scores, a classification report
       and permutation feature importances, writing them to the requested
       files,
    5. dumps ``(estimator, y, class_labels)`` to ``save_model`` with joblib.

    Problems with the inputs are reported through ``gs.fatal``.
    """
    # Lazy import libraries
    from rlearnlib.utils import (
        predefined_estimators,
        load_training_data,
        save_training_data,
        option_to_list,
        scoring_metrics,
        check_class_weights,
    )
    from rlearnlib.raster import RasterStack

    try:
        import sklearn

        # compare numerically: a plain string comparison mis-orders versions
        if _version_tuple(sklearn.__version__) < (0, 20):
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")
    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd
    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")
        _require_output_dir(classif_file)

    # permutation importances need a sufficiently recent scikit-learn
    if importances:
        if _version_tuple(sklearn.__version__) < (0, 22):
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    # feature importance file selected but the "f" flag is not set
    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')
        _require_output_dir(fimp_file)

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")
        _require_output_dir(preds_file)

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            # collapse the per-sample labels into a {response: label} lookup
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        # the group raster is sampled as an extra (last) column of X
        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:
                if mode == "classification":
                    # start from identity labels, then overlay the entries
                    # taken from the training map's category table
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            # np.object was removed from NumPy; np.object_ is the dtype to
            # test for object-typed responses
            if y.dtype == np.object_:
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X,
                                          y,
                                          cat,
                                          group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings -----------------------------------------------
    # inner resampling method (3 splits), only needed for a parameter search
    from sklearn.model_selection import (
        GridSearchCV,
        StratifiedKFold,
        GroupKFold,
        KFold,
    )

    if any(param_grid):
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        # y itself is passed as ``classes``; the result is used below as the
        # "sample_weight" fit parameter, i.e. one weight per sample
        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=(y),
                                             y=y)
        fit_params = {"sample_weight": class_weights}
    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = _dense_one_hot_encoder()
        trans = ColumnTransformer(remainder="passthrough",
                                  transformers=[("onehot", enc,
                                                 stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = _dense_one_hot_encoder()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    # scale every column that is not categorical
                    np.setxor1d(range(stack.count),
                                stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid):
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # Keyword arguments for every call to ``fit``.  ``groups`` is understood
    # only by the GridSearchCV wrapper (it feeds the grouped inner splitter),
    # so it is added exactly when a search is used together with group ids,
    # regardless of whether class balancing is active.
    fit_kwargs = dict(fit_params)
    if any(param_grid) and group_id is not None:
        fit_kwargs["groups"] = group_id

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    estimator.fit(X, y, **fit_kwargs)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid):
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " +
             str(v)) for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics
        from sklearn.model_selection import cross_val_predict

        # count the samples of each class directly; using the class values
        # as histogram bin edges yields one bin too few and merges classes
        if (mode == "classification"
                and cv > np.unique(y, return_counts=True)[1].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples in "
                     "some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        # add ROC AUC for binary problems whose classes are exactly 0 and 1
        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        preds = cross_val_predict(estimator,
                                  X,
                                  y,
                                  groups=group_id,
                                  cv=outer,
                                  n_jobs=n_jobs,
                                  fit_params=fit_kwargs)

        # Label every sample with the fold in which it was a test sample.
        # The predictions are in sample order, so the label is written at the
        # test indices instead of concatenating the fold sizes.
        n_fold = np.zeros(y.shape[0])
        for fold, (_, test) in enumerate(outer.split(X, y, group_id)):
            n_fold[test] = fold

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}
        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])

        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            # score each fold once, then summarise across folds
            fold_scores = preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"]))
            score_mean = fold_scores.mean()
            score_std = fold_scores.std()

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            # companion file (preds_file + "t") holding the column types
            with open(preds_file + "t", "w") as text_file:
                text_file.write('"Real", "Real", "integer", "integer"')

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        # keep only the part of each name before the first "@"
        feature_names = [i.split("@")[0] for i in stack.names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) + "\t" +
                       str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
X_exgal_d0 = X_exgal[~is_ddf] X_exgal_d1 = X_exgal[is_ddf] li = [X_exgal_d0.copy() for i in range(37)] X_exgal = pd.concat([X_exgal_d1] + li, ignore_index=True) group_exgal = X_exgal.g y_exgal_d0 = y_exgal[~is_ddf] y_exgal_d1 = y_exgal[is_ddf] li = [y_exgal_d0.copy() for i in range(37)] y_exgal = pd.concat([y_exgal_d1] + li, ignore_index=True) del li, X_exgal_d0, X_exgal_d1, X_exgal['g'], y_exgal_d0, y_exgal_d1 group_kfold = GroupKFold(n_splits=NFOLD) print(f'X_gal.shape: {X_gal.shape}') print(f'X_exgal.shape: {X_exgal.shape}') gc.collect() # ============================================================================= # cv(gal) # ============================================================================= print('==== GAL ====') param['num_class'] = 5 dtrain = lgb.Dataset( X_gal, y_gal.values, #categorical_feature=CAT,
def split(self, X, y=None, group=None, **kwargs):
    """Compute, store and return the train/test indices for ``X``.

    The strategy is selected by ``self.validation_scheme``, which may be
    ``None`` (treated as K-fold), a splitter instance, a ``FoldScheme``
    member or its name, a callable, or a ready-made list of index pairs.

    Args:
        X: Data to split; ``X.shape[0]`` must give the number of samples.
        y: Targets. Required, with the same length as ``X``, for the
            stratified scheme; also forwarded to the group scheme and to a
            callable scheme.
        group: Group labels, passed on from the class where this is being
            called. Required for the group K-fold scheme.
        **kwargs: Forwarded only to a callable ``validation_scheme``.

    Returns:
        The value stored in ``self.indices``: a list of
        ``(train_index, test_index)`` pairs for the splitter-based schemes,
        a one-element list holding the pair returned by
        ``train_test_split`` for the hold-out scheme, the callable's return
        value for a callable scheme, or the user-supplied list unchanged.

    Raises:
        ValueError: If the stratified scheme is requested without a
            matching ``y``, the group scheme without ``group``, or the
            scheme is none of the supported kinds.
    """
    scheme = self.validation_scheme

    def _selects(splitter_cls, member):
        # True when the scheme is an instance of the splitter class, or the
        # matching FoldScheme member given either by name or by value.
        return (isinstance(scheme, splitter_cls) or scheme == member.name
                or scheme == member)

    # scikit-learn rejects a seed on a non-shuffling splitter, so the seed
    # is only handed over when shuffling is actually enabled.
    seed = self.random_state if self.shuffle else None

    if scheme is None or _selects(KFold, FoldScheme.KFold):
        folds = KFold(n_splits=self.num_folds,
                      random_state=seed,
                      shuffle=self.shuffle)
        self.indices = list(folds.split(X))

    elif _selects(StratifiedKFold, FoldScheme.StratifiedKFold):
        if y is None or X.shape[0] != y.shape[0]:
            raise ValueError(
                "Y should be passed and X and Y should be of same length for StratifiedKFold"
            )
        folds = StratifiedKFold(n_splits=self.num_folds,
                                random_state=seed,
                                shuffle=self.shuffle)
        self.indices = list(folds.split(X, y))

    elif _selects(GroupKFold, FoldScheme.GroupKFold):
        # fail early with a clear message, mirroring the stratified branch
        if group is None:
            raise ValueError("group should be passed for GroupKFold")
        folds = GroupKFold(n_splits=self.num_folds)
        self.indices = list(folds.split(X, y, groups=group))

    elif _selects(TimeSeriesSplit, FoldScheme.TimeSeriesSplit):
        folds = TimeSeriesSplit(n_splits=self.num_folds)
        self.indices = list(folds.split(X))

    elif (scheme == FoldScheme.train_test_split.name
          or scheme == FoldScheme.train_test_split):
        # validation_scheme is a simple train test split. testsize is used
        # to determine the size of test samples. The seed is forwarded so a
        # shuffled hold-out split is reproducible like the K-fold schemes.
        self.indices = [
            train_test_split(range(X.shape[0]),
                             test_size=self.test_size,
                             shuffle=self.shuffle,
                             random_state=seed)
        ]

    elif callable(scheme):
        # validation_scheme is a callable function which will take X and y
        # as params.
        self.indices = scheme(X, y, **kwargs)

    else:
        if not isinstance(scheme, list):
            raise ValueError(
                "Validation Schema should be a list of (train_indexes, test_indexes)"
            )
        self.indices = scheme

    return self.indices
sep_token="[unused0]") tokenizer = BertTokenizer(BERT_PATH + 'vocab.txt', True) train_inputs2 = compute_input_arrays_2s(train, "question_body", "answer", tokenizer, MAX_SEQUENCE_LENGTH, s1_max_length=254, s2_max_length=255, sep_token="[SEP]") tokenizer = BertTokenizer(BERT_PATH + 'vocab.txt', True) train_inputs3 = compute_input_arrays(train, "answer", tokenizer, MAX_SEQUENCE_LENGTH) train_inputs = train_inputs1 + train_inputs2 + train_inputs3 kf_split = GroupKFold(n_splits=NUM_FOLDS).split(X=train, groups=train.question_body) kfold_rho = list() kfold_rhos = list() for fold, (train_idx, valid_idx) in enumerate(kf_split): print(f" fold: {fold} ".center(100, "#")) _train_inputs = [train_inputs[i][train_idx] for i in range(9)] _train_targets = train_targets.loc[train_idx, :].values _valid_inputs = [train_inputs[i][valid_idx] for i in range(9)] _valid_targets = train_targets.loc[valid_idx, :].values model = BERTRegressor(bert_path=BERT_PATH, dropout=DROPOUT, hidden_size=768, output_size1=21, output_size2=9)
data_concat = pd.concat(data) y = data_concat['original']['power'] X = data_concat.drop('power', axis=1, level=1) X.fillna(X.mean(), inplace=True) groups = [] for group_idx, activity in enumerate(data): groups += [group_idx] * activity.shape[0] groups = np.array(groups) scores = cross_validate(GradientBoostingRegressor(random_state=42, n_jobs=-1), X, y, groups=groups, scoring=['r2', 'neg_median_absolute_error'], cv=GroupKFold(n_splits=3), n_jobs=1, return_train_score=True, verbose=0) print('The obtained scores on training and testing in terms of ' 'R2 and MAE are: \n') print(scores) # Store the prediction for visualization y_pred = cross_val_predict(GradientBoostingRegressor(random_state=42, n_jobs=-1), X, y, groups=groups, cv=GroupKFold(n_splits=3),
print('output to file.') sX = pickle.dumps(X) fx.write(sX) sy = pickle.dumps(y) fy.write(sy) if MODE == 2: crf = CRF() with codecs.open('model/contract_train_crffeatures.pkl', 'rb') as fx: with codecs.open('model/contract_train_crfstates.pkl', 'rb') as fy: with codecs.open('model/contract_train_crfmodel.pkl', 'wb') as fm: with codecs.open('plain/contract_train_group.utf8', 'r') as fg: with codecs.open('plain/contract_train_group_log.utf8', 'w') as fl: groups = fg.readlines() groupKfold = GroupKFold(n_splits=10) bx = fx.read() by = fy.read() X = pickle.loads(bx) y = pickle.loads(by) for i in range(len(X)): assert len(X[i]) == len(y[i]) index = 0 for train, test in groupKfold.split(X, y, groups=groups): print(index) index += 1 gX = [X[i] for i in train] gy = [y[i] for i in train] tX = [X[i] for i in test]
'objective': 'regression', 'max_depth': 6, 'learning_rate': LEARNING_RATE, "boosting_type": "gbdt", "subsample_freq": 1, "subsample": 0.9, "bagging_seed": 11, "metric": 'mae', "verbosity": -1, 'reg_alpha': 0.1, 'reg_lambda': 0.4, 'colsample_bytree': 1.0, 'random_state': RANDOM_STATE } folds = GroupKFold(n_splits=N_FOLDS) # Setup arrays for storing results train_df = pd.read_parquet('data/FE008_train.parquet') # only loading for skeleton not features oof_df = train_df[['id', 'type','scalar_coupling_constant']].copy() mol_group = train_df[['molecule_name','type']].copy() del train_df gc.collect() oof_df['oof_preds'] = 0 test_df = pd.read_parquet('data/FE008_test.parquet') # only loading for skeleton not features prediction = np.zeros(len(test_df)) feature_importance = pd.DataFrame() test_pred_df = test_df[['id','type','molecule_name']].copy() del test_df gc.collect()
bag_times = config['bagging_times'] logging.debug(bag_times) logging.debug('\n\n=== random_seed_average times =========') random_seed_average_times = config['random_seed_average_times'] logging.debug(random_seed_average_times) logging.debug('\n\n=== N Folds =========') n_fold = config['n_fold'] logging.debug(n_fold) logging.debug('\n\n=== Folds Type =========') folds_type = { 'time_series': TimeSeriesSplit(n_fold), 'k_fold': KFold(n_fold), 'group_k_fold': GroupKFold(n_fold), 'train_test_split_time_series': 'train_test_split_time_series' } folds = folds_type[config['folds_type']] logging.debug(config['folds_type']) if config['folds_type'] == 'group_k_fold': split_groups = train['DT_M'] else: split_groups = None logging.debug('\n\n=== train shape =========') logging.debug(train.shape) print('train shape', train.shape) if model_type == 'cat': for col in train:
iris = load_iris()
X = iris.data
y = iris.target

# Scatter the first two feature columns, coloured by class.
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit, StratifiedKFold, GroupKFold, GroupShuffleSplit

#cv = KFold(5, random_state = 0)
#cv = LeaveOneOut()
# cv = ShuffleSplit(4, test_size = 0.2)
#cv = StratifiedKFold(4)
# Keep the splitter object itself: get_n_splits() only returns the number of
# folds, which would make cross_val_score fall back to a plain 5-fold scheme
# and ignore the groups. The groups are supplied to cross_val_score instead.
cv = GroupKFold(5)
print(cross_val_score(KNeighborsClassifier(), X, y, groups = X[:, 0], cv = cv))

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import *

# Small hand-made example for the regression metrics; the last prediction is
# a deliberate outlier.
y = np.array([1, 2, 2, 3, 5, 2])
y_pred = np.array([5, 2, 2, 5, 7, 1000])
print('MAE :', mean_absolute_error(y, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y, y_pred)))
print('median absolute error :', median_absolute_error(y, y_pred))

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression