Example #1
## INITIALISE LOGGING --------------------------------------------------------------------------------------
# use current time as a unique identifier
now = datetime.now()
job_id = now.strftime("%d%m%y_%H%M%S")

# create log file
logging.basicConfig(filename='logs/logreg_'+problem+'_'+job_id+'.txt',\
     level=logging.DEBUG)

## LOAD DATA --------------------------------------------------------------------------------------------
X = pd.read_hdf('data/FINAL/X_' + problem + '.h5')
y = pd.read_hdf('data/FINAL/y_' + problem + '.h5')
subject = pd.read_hdf('data/FINAL/subject_' + problem + '.h5')

# cross-validation iterator: leave-one-subject-out (one fold per unique subject)
gkf = GroupKFold(n_splits=len(subject.unique()))
gkf = list(gkf.split(X, y, groups=subject))

# scoring
scoring = {'Accuracy': 'accuracy', 'F1-score': 'f1_weighted'}

# define the pipeline

# DO FEATURE SELECTION ON ALL TRAINING DATA FOR NOW
# (caveat: fitting the selector on the full data leaks information into
#  the grouped CV evaluation below)
fs = FeatureSelector(n_jobs=20)
fs.fit(X, y)
X_filtered = fs.transform(X)

dump(fs, 'models/FINAL/feature_selector_' + problem + '.joblib')

qt = QuantileTransformer()
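# The example stops before the pipeline is assembled. A minimal sketch of
# how the pieces above could be wired together; the estimator and its
# parameters are assumptions, not the original author's choice:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

pipe = Pipeline([('qt', qt), ('logreg', LogisticRegression(max_iter=1000))])

# gkf is the precomputed list of (train, test) index pairs built above
results = cross_validate(pipe, X_filtered, y, cv=gkf, scoring=scoring)
logging.info('mean accuracy: %s', results['test_Accuracy'].mean())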
Example #2
test_feats = np.array(test_feats)
test_storm_ids = np.array(test_storm_ids)
test_org_pred = np.array(test_org_pred)


#train_img_features = img_features[:train_img_features.shape[0]]
#test_img_features = img_features[train_img_features.shape[0]:]


test_final_pred = np.zeros_like(test_org_pred)


group_kfold = GroupKFold(n_splits=5)


models_arr = []
val_pred = np.zeros_like(train_targets)
sc_arr = []
fold = 0
for train_index, val_index in group_kfold.split(train_feats, train_targets,
                                                train_storm_ids):
    print(fold)
    fold += 1
    image_datasets = {
        'train':
        WindDataset(train_feats[train_index], train_img_features[train_index],
                    train_targets[train_index], 'train'),
        # ... (remaining entries truncated in the source)
Example #3
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        # note: the frequency-based token list built above is immediately
        # overridden by this hand-picked list of category tokens
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset2(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            Q_LABEL_COL=Q_LABEL_COL,
            A_LABEL_COL=A_LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            rm_zero=RM_ZERO,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset2(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            Q_LABEL_COL=Q_LABEL_COL,
            A_LABEL_COL=A_LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            rm_zero=RM_ZERO,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier2(
            num_labels=len(Q_LABEL_COL) + len(A_LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            q_state_dict=state_dict,
            a_state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch2(
                model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test2(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
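# The four if/else blocks that grow `histories` above could be collapsed
# with defaultdicts; a sketch assuming the same keys (the checkpoint loader
# would then also have to return defaultdicts):
from collections import defaultdict

histories = {k: defaultdict(list)
             for k in ('trn_loss', 'val_loss', 'val_metric', 'val_metric_raws')}
# each per-epoch update then becomes a single line, e.g.:
# histories['trn_loss'][fold].append(trn_loss)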
Example #4
def evaluate_han(batchsize):
    dbank = pd.read_json('dbank.json')
    X = dbank['tokens'].sample(frac=1, random_state=20)
    y = dbank['labels'].sample(frac=1, random_state=20)
    ids = dbank['ids'].sample(frac=1, random_state=20)

    group_kfold = GroupKFold(n_splits=10).split(X, y, groups=ids)
    data = []

    for train_index, test_index in group_kfold:
        fold = {}
        fold["X_train"] = X.values[train_index]
        fold["y_train"] = y.values[train_index]
        fold["X_test"]  = X.values[test_index]
        fold["y_test"]  = y.values[test_index]
        fold["train_ids"]  = np.array(ids)[train_index]

        data.append(fold)

    learning_rate = 1e-1
    momentum = 0.9
    criterion = nn.NLLLoss()
    idx = 0
    accuracy = 0
    f_measure = 0

    while idx < 10:
        fold = data[idx]
        X_train, y_train = fold["X_train"], fold["y_train"].ravel()  # Ravel flattens a (n,1) array into (n, )
        X_test, y_test   = fold["X_test"], fold["y_test"].ravel()
        split = len(X_train)//10
        X_validate = X_train[:split]
        y_validate = y_train[:split]
        X_train = X_train[split:]
        y_train = y_train[split:]

        word_attn = AttentionWordRNN(batch_size=batchsize, num_tokens=1829, embed_size=300,
                             word_gru_hidden=100, bidirectional=True).cuda()
        sent_attn = AttentionSentRNN(batch_size=batchsize, sent_gru_hidden=100, word_gru_hidden=100,
                             n_classes=2, bidirectional=True).cuda()
        word_optimizer = torch.optim.SGD(word_attn.parameters(), lr=learning_rate, momentum=momentum)
        sent_optimizer = torch.optim.SGD(sent_attn.parameters(), lr=learning_rate, momentum=momentum)
        print("---------------- fold {} ----------------".format(idx))
        sys.stdout.flush()
        best_model = train_early_stopping(idx, batchsize, X_train, y_train, X_validate, y_validate, word_attn, sent_attn, word_optimizer, sent_optimizer, criterion, 160, 5)
        if not best_model:
            # early stopping produced no model; idx is not incremented,
            # so the same fold is retried
            continue
        trained_word_attn = AttentionWordRNN(batch_size=batchsize, num_tokens=1829, embed_size=300, word_gru_hidden=100, bidirectional=True).cuda()
        trained_sent_attn = AttentionSentRNN(batch_size=batchsize, sent_gru_hidden=100, word_gru_hidden=100, n_classes=2, bidirectional=True).cuda()

        trained_word_attn.load_state_dict(torch.load('saved_models/noage/fold{}_word_attn.pth'.format(idx)))
        trained_sent_attn.load_state_dict(torch.load('saved_models/noage/fold{}_sent_attn.pth'.format(idx)))
        trained_word_attn.eval()
        trained_sent_attn.eval()
        
        acc, f1 = test_accuracy_full_batch(X_test, y_test, batchsize, trained_word_attn, trained_sent_attn)
        print("Best model is {}".format(best_model))
        print("---------------- accuracy, f-measure of fold {} is {}, {} ----------------".format(idx, acc, f1))
        accuracy += acc
        f_measure += f1
        sys.stdout.flush()
        idx += 1
    print("average acc, f score = {}, {}".format(accuracy/10, f_measure/10))
Example #5
    def _build_cv_generator(self, y=None):
        # Use information about X to build a cross-validation split generator.
        # http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
        # There are a few general strategies for cross-validation:
        #   k-fold split: Split training set into k partitions, train using k-1,
        #       and validate using the kth. Loop through k validation partitions.
        #       Basis for all other cross-validation strategies.
        #       Assumes each sample is independent and identically distributed.
        #   shuffle split: Instead of mixing k partitions into training and
        #       validation sets, just generate a random train/validation split
        #       and do that k times. Typically efficient, but also can fail
        #       to use each sample equally, e.g. if a sample is only ever
        #       in the validation set or the training set.
        #   leave one out (LOO): k-fold CV where k = n, so each partition only
        #       has one sample in the test data. Test error across partitions
        #       typically has high variance, because each error is either 0 or 1.
        #       Empirically, LOO is typically worse than 5- or 10-fold CV.
        #       Can tweak to leave P out (LPO). Because (n choose p) is much
        #       greater than (n choose 1), this process tends to be
        #       computationally intensive, and perhaps not worth the effort.
        #   repetition: k-fold CV run multiple times, with different partitions
        #       for each run. Good way to get more learning from limited data.
        #   stratification: for unbalanced classification problems, enforce
        #       a constraint on either k-fold or shuffle splits to require
        #       an equal split of each class between the training and
        #       validation sets.
        #   grouping: for some classification problems, the samples are not iid.
        #       In particular, for medical applications, if there are multiple
        #       samples taken from a single patient, those samples are by
        #       definition not independent. Moreover, we typically care about
        #       whether we can accurately classify new unseen patients based
        #       on patients we saw in the past. Therefore, grouping guarantees
        #       that any particular group (e.g. a single patient_id) is only
        #       in the training set or in the validation set.
        #   time series split: data representing a time series also break the
        #       i.i.d. assumption, because samples that are near each other
        #       in time are by definition correlated (autocorrelation). To get
        #       around this, split the data into k partitions, and in each
        #       of k loops, use partitions [0, i] to train and partition [i+1]
        #       for validation. Ensure that you are always training on the past
        #       and validating on the "future."
        #
        # TODO(sbala): Ideally, we'd find a way to do stratified group k-fold.
        # sklearn only provides StratifiedKFold and GroupKFold out of the box.
        # Given how unbalanced many of our classification problems are, use
        # StratifiedKFold for now, but we need something better.

        # Use information about y to determine n_splits.
        # In certain pathological cases (esp. with bifurcated classifiers)
        # there might be fewer than n examples of a given class in y.
        # If that's the case, n_splits can't be greater than n_samples.
        if y is not None:
            log.debug('y.value_counts(): %s' % Series(y).value_counts())
            max_possible_splits = np.min(Series(y).value_counts())
            log.debug('max_possible_splits: %s' % max_possible_splits)
            n_splits = np.min([10, max_possible_splits])
        else:
            n_splits = 10
        log.debug('n_splits: %s' % n_splits)

        if self.CV_STRATEGY == 'StratifiedKFold':
            # random_state only matters when shuffle=True; recent sklearn
            # raises a ValueError if it is set while shuffle=False
            return StratifiedKFold(n_splits=n_splits, shuffle=False)
        elif self.CV_STRATEGY == 'GroupKFold':
            # GroupKFold is not randomized at all, hence no random_state
            return GroupKFold(n_splits=n_splits)
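# The TODO above has since been addressed upstream: scikit-learn >= 1.0
# ships StratifiedGroupKFold, which stratifies on y while keeping each
# group in a single fold. A self-contained sketch on synthetic data:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

X = np.random.randn(20, 3)
y = np.array([0, 1] * 10)                # class labels to stratify on
patient_id = np.repeat(np.arange(5), 4)  # e.g. 4 samples per patient
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in cv.split(X, y, groups=patient_id):
    # no patient appears on both sides of any split
    assert set(patient_id[train_idx]).isdisjoint(patient_id[test_idx])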
Example #6
score = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print('Cross-validation score:\n{}'.format(score))
# Cross-validation score:
# [0.98666667 0.94666667 0.97333333 0.96       0.97333333 0.94666667
#  0.97333333 0.96       0.93333333 0.94666667]

# Grouped cross-validation
# Useful when the dataset contains highly correlated groups.
# For example, in facial-emotion recognition we must avoid the same person's
# images appearing in both the training and test sets, which would bias the
# results. We want each person's emotions kept together in one group rather
# than scattered across the training and test sets.
# GroupKFold implements exactly this.
# The example below has 12 data points split into 4 groups.
X, y = make_blobs(n_samples=12, random_state=0)
# Assume the first 3 samples belong to the same group, the next 4 to another, and so on
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups=groups, cv=GroupKFold(n_splits=3))
print('Cross-validation scores:\n{}'.format(scores))
# Cross-validation scores:
# [0.75       0.6        0.66666667]

# Visualize the group assignment
mglearn.plots.plot_group_kfold()
plt.show()

# sklearn offers many more cross-validation splitting strategies; see the user guide for details

# Grid search
# Implementing a simple grid search by hand (sketched below)
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)
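# A minimal hand-rolled grid search over SVC hyperparameters, in the spirit
# of the comment above; the grid values are illustrative assumptions:
from sklearn.svm import SVC

best_score = 0
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print('Best score: {:.2f}'.format(best_score))
print('Best parameters: {}'.format(best_parameters))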
Example #7
# # 2. Training

# Using commentBody as X and editorsSelection as y. Use the article ID to group the split:



commentBody = new.commentBody
nytpicks = new.editorsSelection
articleID = new.articleID




# note: this loop overwrites the split on every iteration, so only the last
# fold's split survives (see the GroupShuffleSplit sketch below)
for train_index, test_index in GroupKFold(n_splits=5).split(commentBody, nytpicks, groups=articleID):
    train_text, test_text = commentBody[train_index], commentBody[test_index] 
    train_target, test_target = nytpicks[train_index], nytpicks[test_index]
    train_groups, test_groups = articleID[train_index], articleID[test_index]
    
train_text.shape[0], test_text.shape[0]


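# If a single grouped train/test split is the actual intent here,
# GroupShuffleSplit expresses that directly; a sketch:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(
    gss.split(commentBody, nytpicks, groups=articleID))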
# Using TF-IDF over word and character n-grams, combining them with FeatureUnion



# the source is truncated mid-call here; completed with plausible settings
# (the n-gram ranges are assumptions) per the comment above
vectorizer = FeatureUnion([
    ('word_tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
    ('char_tfidf', TfidfVectorizer(analyzer='char', ngram_range=(2, 5))),
])
Example #8
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    for HOST in HOSTs:
        trn_df.loc[trn_df.host.str.contains(HOST).values,
                   'host'] = f'HOST_{HOST}'.casefold()
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        # note: the frequency-based token list built above is immediately
        # overridden by this hand-picked list of category and host tokens
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
            'host_stackexchange',
            'host_askubuntu',
            'host_mathoverflow',
            'host_serverfault',
            'host_stackoverflow',
            'host_superuser',
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
    "eval_df",
])
experiment_df = experiment_df.append(
    {
        "run_name": "holdout",
        "run_dirp": str(holdout_dirp),
        "train_devset_idc": "full_devset",
        "eval_devset_idc": "full_holdout",
        "train_df": dev_df,
        "eval_df": holdout_df,
    },
    ignore_index=True,
)

# Make KFolds and collect fold splits
group_kfold = GroupKFold(n_splits=settings.N_FOLDS)
groups = dev_df["document_id"].to_numpy()
X = dev_df["text"].to_numpy()
y = dev_df["labels"].to_numpy()

for i, (train_idc, eval_idc) in enumerate(group_kfold.split(X, y, groups)):
    print(
        f"Fold {i}: {train_idc.shape[0]} train inst. and {eval_idc.shape[0]} eval inst."
    )
    train_df = dev_df.iloc[train_idc]
    eval_df = dev_df.iloc[eval_idc]

    fold_dirp = experiment_dirp / f"fold_{i}"

    # collect run metadata (the append call is truncated in the source)
    experiment_df = experiment_df.append(
Example #10
def decode_window(X, y, clf=None, cv=None, sample_weight='auto', n_jobs='auto',
                  random_state=None, labels=None):
    """Decode entire window

    Parameters
    ----------
    X : np.ndarray of float, shape(n_samples, n_sensors, n_times)
        The data.
    y : np.ndarray of int, shape(n_samples,)
        The response vector.
    clf : instance of BaseEstimator | None
        The classifier. If None, defaults to a Pipeline.
    cv : cross validation object | None
        The cross validation. If None, defaults to stratified K-folds
        with 10 folds.
    sample_weight : np.ndarray of float, shape(n_samples,) | 'auto'
        The sample weights to deal with class imbalance.
        If 'auto', computes sample weights that balance the classes.
    n_jobs : int | 'auto'
        The number of parallel jobs. If 'auto', uses all available cores.
    random_state : int | None
        Seed used when the stratified folds are shuffled.
    labels : np.ndarray | None
        Group labels. If given, GroupKFold is used instead of
        stratified K-folds.

    Returns
    -------
    probas : np.ndarray of float, shape(n_samples,)
        The predicted probabilities for each sample.
    predictions : np.ndarray of int, shape(n_samples,)
        The class predictions.
    scores : np.ndarray of float, shape(n_resamples,)
        The score at each resampling iteration.
    """
    if n_jobs == 'auto':
        try:
            import multiprocessing as mp
            n_jobs = mp.cpu_count()
            logger.info(
                'Autodetected number of jobs {}'.format(n_jobs))
        except Exception:
            logger.info('Cannot autodetect number of jobs')
            n_jobs = 1
    if clf is None:
        scaler = StandardScaler()
        transform = SelectPercentile(f_classif, percentile=10)
        svc = SVC(C=1, kernel='linear', probability=True)
        clf = Pipeline([('scaler', scaler),
                        ('anova', transform),
                        ('svc', svc)])

    if cv is None or isinstance(cv, int):
        if isinstance(cv, int):
            n_splits = cv
        else:
            n_splits = 10

        if labels is None:
            cv = StratifiedKFold(n_splits=int(min(n_splits, len(y) / 2)),
                                 shuffle=True, random_state=random_state)
        else:
            cv = GroupKFold(n_splits=n_splits)

    if isinstance(sample_weight, str) and sample_weight == 'auto':
        sample_weight = np.zeros(len(y), dtype=float)
        for this_y in np.unique(y):
            this_mask = (y == this_y)
            sample_weight[this_mask] = 1.0 / np.sum(this_mask)

    y = LabelEncoder().fit_transform(y)
    X = X.reshape(len(X), np.prod(X.shape[1:]))
    probas = np.zeros(y.shape, dtype=float)
    predictions = np.zeros(y.shape, dtype=int)
    scores = list()
    parallel, pfunc, _ = parallel_func(_decode_window_one_fold, n_jobs)

    out = parallel(pfunc(clone(clf), X, y, train, test, sample_weight)
                   for train, test in cv.split(X, y, labels))

    for (_, test), (probas_, predicts_, score_) in zip(
            cv.split(X, y, labels), out):
        probas[test] = probas_[:, 1]  # second column
        predictions[test] = predicts_
        scores.append(score_)

    return probas, predictions, np.array(scores)
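# A minimal usage sketch for decode_window on synthetic data (shapes as in
# the docstring). It assumes the module helpers referenced above
# (parallel_func, _decode_window_one_fold) are available. Passing `labels`
# switches the CV to GroupKFold:
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(40, 8, 25)            # (n_samples, n_sensors, n_times)
y_demo = rng.randint(0, 2, size=40)      # binary response vector
subjects = np.repeat(np.arange(8), 5)    # 8 subjects, 5 samples each

probas, predictions, scores = decode_window(
    X_demo, y_demo, cv=4, labels=subjects, random_state=42)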
Example #11
def main():
    args = parse_arguments()
    # create save path
    DATA_DIR = args.data_path
    num_folds = args.fold

    # log directory
    time = datetime.now().strftime("%Y%m%d%H%M%S")
    out_dir_path = path.normpath(path.join(getcwd(), 'logs/{}'.format(time)))
    makedirs(out_dir_path, exist_ok=True)
    # copy this file to log dir
    shutil.copy(path.abspath(sys.argv[0]), out_dir_path)

    # setup data
    with open(DATA_DIR + '/features.txt') as f:
        features_txt = f.readlines()
    features_name = [x.strip() for x in features_txt]
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in features_name
    ]
    X_train = pd.read_csv(DATA_DIR + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(DATA_DIR + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(DATA_DIR + '/y_train.csv', names=['activity_label'])
    subject_train = pd.read_csv(DATA_DIR + '/subject_train.csv',
                                names=['subject_id'])

    # make the activity labels zero-based
    y_train['activity_label'] = y_train['activity_label'] - 1

    # CV
    valid_preds = np.zeros((len(X_train), 6))
    test_preds = np.zeros((num_folds, len(X_test), 6))
    kf = GroupKFold(n_splits=num_folds)
    score_df = pd.DataFrame()
    all_score = []
    for fold, (train_index, valid_index) in enumerate(
            kf.split(X=subject_train, groups=subject_train)):
        str_fold = 'fold_{}'.format(fold + 1)
        print(str_fold)

        # set data
        x_trn, x_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_trn, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        # lgb_params = {
        #     'learning_rate': 0.1,
        #     'objective': 'multiclass',
        #     'num_class': 6,
        #     'n_jobs': -1,
        #     'seed': 1,
        # }

        # classifier = LGBClassifier(lgb_params)
        # classifier = KNNClassifier()
        classifier = SVCClassifier()
        # classifier = LRClassifier()

        # train and predict
        classifier.train(x_trn, y_trn, x_val, y_val)
        valid_preds[valid_index] = classifier.predict(x_val)
        test_preds[fold] = classifier.predict(X_test)

        # scoring
        score = accuracy_score(y_val,
                               np.argmax(valid_preds[valid_index], axis=1))
        print('Fold {} Score : {}'.format(fold + 1, score))
        score_df[str_fold] = [score]
        all_score.append(score)

    # final score
    print('CV (mean) : {}'.format(np.mean(all_score)))
    score_df['mean'] = [np.mean(all_score)]

    # make submission
    score_df.to_csv(out_dir_path + '/score.csv')
    submit = np.argmax(np.mean(test_preds, axis=0), axis=1) + 1
    np.savetxt(out_dir_path + '/baseline.txt', submit)
Example #12
    def __init__(self,
                 seed,
                 val_split=0.2,
                 shuffle=True,
                 cell_features=['expression'],
                 drug_features=['descriptors'],
                 response_url=None,
                 use_landmark_genes=False,
                 use_combo_score=False,
                 preprocess_rnaseq=None,
                 exclude_cells=[],
                 exclude_drugs=[],
                 feature_subsample=None,
                 scaling='std',
                 scramble=False,
                 cv_partition='overlapping',
                 cv=0):
        """Initialize data merging drug response, drug descriptors and cell line essay.
           Shuffle and split training and validation set

        Parameters
        ----------
        seed: integer
            seed for random generation
        val_split : float, optional (default 0.2)
            fraction of data to use in validation
        cell_features: list of strings from 'expression', 'expression_5platform', 'mirna', 'proteome', 'all', 'categorical' (default ['expression'])
            use one or more cell line feature sets: gene expression, microRNA, proteome
            use 'all' for ['expression', 'mirna', 'proteome']
            use 'categorical' for one-hot encoded cell lines
        drug_features: list of strings from 'descriptors', 'latent', 'all', 'categorical', 'noise' (default ['descriptors'])
            use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder
            trained on NSC drugs, or both; use random features if set to noise
            use 'categorical' for one-hot encoded drugs
        shuffle : True or False, optional (default True)
            if True shuffles the merged data before splitting training and validation sets
        scramble: True or False, optional (default False)
            if True randomly shuffle dose response data as a control
        feature_subsample: None or integer (default None)
            number of feature columns to use from cellline expressions and drug descriptors
        use_landmark_genes: True or False
            only use LINCS1000 landmark genes
        use_combo_score: bool (default False)
            use combination score in place of percent growth (stored in 'GROWTH' column)
        scaling: None, 'std', 'minmax' or 'maxabs' (default 'std')
            type of feature scaling: 'minmax' to [0, 1], 'maxabs' to [-1, 1], 'std' for standard normalization
        """

        self.cv_partition = cv_partition

        np.random.seed(seed)

        df = NCI60.load_combo_dose_response(response_url=response_url,
                                            use_combo_score=use_combo_score,
                                            fraction=True,
                                            exclude_cells=exclude_cells,
                                            exclude_drugs=exclude_drugs)
        logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format(
            df.shape[0]))

        if 'all' in cell_features:
            self.cell_features = ['expression', 'mirna', 'proteome']
        else:
            self.cell_features = cell_features

        if 'all' in drug_features:
            self.drug_features = ['descriptors', 'latent']
        else:
            self.drug_features = drug_features

        for fea in self.cell_features:
            if fea == 'expression' or fea == 'rnaseq':
                self.df_cell_expr = NCI60.load_cell_expression_rnaseq(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes,
                    preprocess_rnaseq=preprocess_rnaseq)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'expression_u133p2':
                self.df_cell_expr = NCI60.load_cell_expression_u133p2(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'expression_5platform':
                self.df_cell_expr = NCI60.load_cell_expression_5platform(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'mirna':
                self.df_cell_mirna = NCI60.load_cell_mirna(
                    ncols=feature_subsample, scaling=scaling)
                df = df.merge(self.df_cell_mirna[['CELLNAME']], on='CELLNAME')
            elif fea == 'proteome':
                self.df_cell_prot = NCI60.load_cell_proteome(
                    ncols=feature_subsample, scaling=scaling)
                df = df.merge(self.df_cell_prot[['CELLNAME']], on='CELLNAME')
            elif fea == 'categorical':
                df_cell_ids = df[['CELLNAME']].drop_duplicates()
                cell_ids = df_cell_ids['CELLNAME'].map(
                    lambda x: x.replace(':', '.'))
                df_cell_cat = pd.get_dummies(cell_ids)
                df_cell_cat.index = df_cell_ids['CELLNAME']
                self.df_cell_cat = df_cell_cat.reset_index()

        for fea in self.drug_features:
            if fea == 'descriptors':
                self.df_drug_desc = NCI60.load_drug_descriptors(
                    ncols=feature_subsample, scaling=scaling)
                df = df[df['NSC1'].isin(self.df_drug_desc['NSC'])
                        & df['NSC2'].isin(self.df_drug_desc['NSC'])]
            elif fea == 'latent':
                self.df_drug_auen = NCI60.load_drug_autoencoded_AG(
                    ncols=feature_subsample, scaling=scaling)
                df = df[df['NSC1'].isin(self.df_drug_auen['NSC'])
                        & df['NSC2'].isin(self.df_drug_auen['NSC'])]
            elif fea == 'categorical':
                df_drug_ids = df[['NSC1']].drop_duplicates()
                df_drug_ids.columns = ['NSC']
                drug_ids = df_drug_ids['NSC']
                df_drug_cat = pd.get_dummies(drug_ids)
                df_drug_cat.index = df_drug_ids['NSC']
                self.df_drug_cat = df_drug_cat.reset_index()
            elif fea == 'noise':
                ids1 = df[['NSC1']].drop_duplicates().rename(
                    columns={'NSC1': 'NSC'})
                ids2 = df[['NSC2']].drop_duplicates().rename(
                    columns={'NSC2': 'NSC'})
                df_drug_ids = pd.concat([ids1, ids2]).drop_duplicates()
                noise = np.random.normal(size=(df_drug_ids.shape[0], 500))
                df_rand = pd.DataFrame(
                    noise,
                    index=df_drug_ids['NSC'],
                    columns=['RAND-{:03d}'.format(x) for x in range(500)])
                self.df_drug_rand = df_rand.reset_index()

        logger.info(
            'Filtered down to {} rows with matching information.'.format(
                df.shape[0]))

        ids1 = df[['NSC1']].drop_duplicates().rename(columns={'NSC1': 'NSC'})
        ids2 = df[['NSC2']].drop_duplicates().rename(columns={'NSC2': 'NSC'})
        df_drug_ids = pd.concat(
            [ids1, ids2]).drop_duplicates().reset_index(drop=True)

        n_drugs = df_drug_ids.shape[0]
        n_val_drugs = int(n_drugs * val_split)
        n_train_drugs = n_drugs - n_val_drugs

        logger.info('Unique cell lines: {}'.format(df['CELLNAME'].nunique()))
        logger.info('Unique drugs: {}'.format(n_drugs))
        # df.to_csv('filtered.growth.min.tsv', sep='\t', index=False, float_format='%.4g')
        # df.to_csv('filtered.score.max.tsv', sep='\t', index=False, float_format='%.4g')

        if shuffle:
            df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
            df_drug_ids = df_drug_ids.sample(
                frac=1.0, random_state=seed).reset_index(drop=True)

        self.df_response = df
        self.df_drug_ids = df_drug_ids

        self.train_drug_ids = df_drug_ids['NSC'][:n_train_drugs]
        self.val_drug_ids = df_drug_ids['NSC'][-n_val_drugs:]

        if scramble:
            growth = df[['GROWTH']]
            random_growth = growth.iloc[np.random.permutation(
                np.arange(growth.shape[0]))].reset_index()
            self.df_response[['GROWTH']] = random_growth['GROWTH']
            logger.warning('Randomly shuffled dose response growth values.')

        logger.info('Distribution of dose response:')
        logger.info(self.df_response[['GROWTH']].describe())

        self.total = df.shape[0]
        self.n_val = int(self.total * val_split)
        self.n_train = self.total - self.n_val
        logger.info('Rows in train: {}, val: {}'.format(
            self.n_train, self.n_val))

        self.cell_df_dict = {
            'expression': 'df_cell_expr',
            'expression_5platform': 'df_cell_expr',
            'expression_u133p2': 'df_cell_expr',
            'rnaseq': 'df_cell_expr',
            'mirna': 'df_cell_mirna',
            'proteome': 'df_cell_prot',
            'categorical': 'df_cell_cat'
        }

        self.drug_df_dict = {
            'descriptors': 'df_drug_desc',
            'latent': 'df_drug_auen',
            'categorical': 'df_drug_cat',
            'noise': 'df_drug_rand'
        }

        self.input_features = collections.OrderedDict()
        self.feature_shapes = {}
        for fea in self.cell_features:
            feature_type = 'cell.' + fea
            feature_name = 'cell.' + fea
            df_cell = getattr(self, self.cell_df_dict[fea])
            self.input_features[feature_name] = feature_type
            self.feature_shapes[feature_type] = (df_cell.shape[1] - 1, )

        for drug in ['drug1', 'drug2']:
            for fea in self.drug_features:
                feature_type = 'drug.' + fea
                feature_name = drug + '.' + fea
                df_drug = getattr(self, self.drug_df_dict[fea])
                self.input_features[feature_name] = feature_type
                self.feature_shapes[feature_type] = (df_drug.shape[1] - 1, )

        self.feature_shapes['dose'] = (1, )
        for dose in ['dose1', 'dose2']:
            self.input_features[dose] = 'dose'

        logger.info('Input features shapes:')
        for k, v in self.input_features.items():
            logger.info('  {}: {}'.format(k, self.feature_shapes[v]))

        self.input_dim = sum([
            np.prod(self.feature_shapes[x])
            for x in self.input_features.values()
        ])
        logger.info('Total input dimensions: {}'.format(self.input_dim))

        if cv > 1:
            if cv_partition == 'disjoint':
                pass
            elif cv_partition == 'disjoint_cells':
                y = self.df_response['GROWTH'].values
                groups = self.df_response['CELLNAME'].values
                gkf = GroupKFold(n_splits=cv)
                splits = gkf.split(y, groups=groups)
                self.cv_train_indexes = []
                self.cv_val_indexes = []
                for index, (train_index, val_index) in enumerate(splits):
                    print(index, train_index)
                    self.cv_train_indexes.append(train_index)
                    self.cv_val_indexes.append(val_index)
            else:
                y = self.df_response['GROWTH'].values
                # kf = KFold(n_splits=cv)
                # splits = kf.split(y)
                # random_state requires shuffle=True in recent sklearn
                skf = StratifiedKFold(n_splits=cv, shuffle=True,
                                      random_state=seed)
                splits = skf.split(y, discretize(y, bins=cv))
                self.cv_train_indexes = []
                self.cv_val_indexes = []
                for index, (train_index, val_index) in enumerate(splits):
                    print(index, train_index)
                    self.cv_train_indexes.append(train_index)
                    self.cv_val_indexes.append(val_index)
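# The 'disjoint_cells' branch above is the grouped-CV core of this loader;
# the same pattern in isolation (synthetic stand-ins for GROWTH/CELLNAME).
# Note GroupKFold only needs the length of X, so passing the 1-D y works:
import numpy as np
from sklearn.model_selection import GroupKFold

growth = np.arange(12, dtype=float)             # stand-in for GROWTH values
cellnames = np.repeat(['A', 'B', 'C', 'D'], 3)  # stand-in for CELLNAME groups
cv_train_indexes, cv_val_indexes = [], []
for train_index, val_index in GroupKFold(n_splits=4).split(growth, groups=cellnames):
    cv_train_indexes.append(train_index)
    cv_val_indexes.append(val_index)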
Example #13
# %%
plt.plot(groups)
plt.yticks(np.unique(groups))
plt.xticks(writer_boundaries, rotation=90)
plt.xlabel("Target index")
plt.ylabel("Writer index")
_ = plt.title("Underlying writer groups existing in the target")

# %% [markdown]
# Once we group the digits by writer, we can use cross-validation to take this
# information into account: the class containing `Group` should be used.

# %%
from sklearn.model_selection import GroupKFold

cv = GroupKFold()
test_score = cross_val_score(model,
                             data,
                             target,
                             groups=groups,
                             cv=cv,
                             n_jobs=-1)
print(f"The average accuracy is "
      f"{test_score.mean():.3f} +/- "
      f"{test_score.std():.3f}")

# %% [markdown]
# We see that this strategy is less optimistic about the model's performance.
# However, it is the most reliable one if our goal is to make handwritten-digit
# recognition independent of the writer. Note also that the standard deviation
# of the scores was reduced.
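# %% [markdown]
# For contrast, evaluating without groups mixes writers across folds and
# typically yields the optimistic score discussed above. A sketch reusing
# `model`, `data` and `target` from this example:

# %%
from sklearn.model_selection import KFold, cross_val_score

naive_score = cross_val_score(model, data, target,
                              cv=KFold(n_splits=5), n_jobs=-1)
print(f"Without grouping: {naive_score.mean():.3f} +/- {naive_score.std():.3f}")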
Example #14
def fit_meta_feature(
    X_train,
    X_valid,
    X_test,
    Meta_train,
    train_idx,
    bond_type,
    base_fold,
    feature="fc",
    N_META_FOLDS=N_META_FOLDS,
    N_META_ESTIMATORS=N_META_ESTIMATORS,
    model_type="catboost",
):
    """
    Adds meta features to train, test and val
    """
    logger.info(f"{bond_type}: Creating meta feature {feature}")
    logger.info("{}: X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1

    # Init predictions
    X_valid["meta_" + feature] = 0
    X_test["meta_" + feature] = 0
    X_train["meta_" + feature] = 0
    X_train_oof = X_train[["meta_" + feature]].copy()
    X_train = X_train.drop("meta_" + feature, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train,
                        groups=mol_group_type.iloc[train_idx].values)):
        logger.info("{}: Running Meta Feature Type {} - Fold {} of {}".format(
            bond_type, feature, fold_count, folds.n_splits))
        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type,
                                                      feature), N_META_FOLDS)
        # Load fold IDs from files for consistency
        X_train2 = X_train.loc[X_train.reset_index().index.isin(train_idx2)]
        X_valid2 = X_train.loc[X_train.reset_index().index.isin(valid_idx2)]
        X_train2 = X_train2.copy()
        X_valid2 = X_valid2.copy()
        y_train2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            train_idx2)][feature]
        y_valid2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            valid_idx2)][feature]
        fold_count += 1

        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            valid_dataset = Pool(data=X_valid)
            test_dataset = Pool(data=X_test)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU

            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        elif model_type == "xgboost":
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )

            y_pred_meta_valid = model.predict(X_valid2)
            y_pred_valid = model.predict(
                X_valid.drop("meta_" + feature, axis=1))
            y_pred = model.predict(X_test.drop("meta_" + feature, axis=1))

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            update_tracking(run_id,
                            '{}_f{}-{}_meta{}_best_iter'.format(
                                bond_type, base_fold, fold_count, feature),
                            model.best_iteration_,
                            integer=True)

    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof["meta_" + feature])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"{bond_type} Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )
    X_valid["meta_" + feature] = X_valid["meta_" + feature] / N_META_FOLDS
    X_test["meta_" + feature] = X_test["meta_" + feature] / N_META_FOLDS
    X_train["meta_" + feature] = X_train_oof["meta_" + feature]
    feature_importance.to_parquet(
        "type_results/{}/meta/{}_{}_{}_fi_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train_oof.to_parquet(
        "type_results/{}/meta/{}_{}_{}_oof_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_train_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_valid.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_valid_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_test.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_test_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    logger.info(f"{bond_type} Done creating meta features")
    logger.info("{} X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    return X_train, X_valid, X_test
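# The routine above is standard out-of-fold (OOF) meta-feature generation;
# stripped to its core, with illustrative names and a stand-in estimator:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold

def oof_meta_feature(X, y, groups, n_splits=3):
    """Leak-free predictions of a meta target, usable as a new feature."""
    oof = np.zeros(len(y))
    for trn_idx, val_idx in GroupKFold(n_splits=n_splits).split(X, y, groups):
        model = Ridge().fit(X[trn_idx], y[trn_idx])
        oof[val_idx] = model.predict(X[val_idx])
    return oof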
Example #15
X_test: pd.DataFrame = test[use_cols_revised].copy()
print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}")

# X.to_csv("../info/X_sampled.csv")

# export colnames
pd.DataFrame({
    "columns": X.columns.tolist()
}).to_csv(log_path / "use_cols.csv")

####################################################################################################
# Model Fitting
print("start fitting")
n_fold = 5
# folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
folds = GroupKFold(n_splits=n_fold)

#########################################################################################################
# 1st layer model
seed_base = [0, 2019, 71, 1228, 1988, 1879, 92, 3018, 1234, 185289]
#seed_list = np.array(seed_base) + 40
#seed_list = np.array(seed_base) + 41
#seed_list = np.array(seed_base) + 42
#seed_list = np.array(seed_base) + 43
#seed_list = np.array(seed_base) + 44

current_seed = -1

num_leaves_dict = {
    0: 8,
    1: 8,
Exemple #16
0
def regression_stage1(dmi,
                      divs,
                      dtreatments,
                      gm_wbc,
                      dinfections,
                      pidmeta,
                      dw,
                      wbc_type='neutrophils'):
    """Stage 1 Elastic Net regression
    dmi: microbiota composition for intervals
    divs: microbiota diversity for intervals, contains sample additional sample info (pid)
    dtreatments: immunomodulatory medications and treatments administered during interval
    gm_wbc: geometric mean of absolute white blood cell counts during interval
    dinfections: positive blood cultures detected during interval
    pidmeta: patient and HCT meta data
    dw: daily change in WBC count, "y"
    wbc_type: WBC type considered
    """
    # identify patients with microbiota data (stage 2) to exclude in stage 1
    # by inner-joining with microbiota diversity table
    Xmi = pd.merge(divs, dw, on=["pid", "anon-date"], how="inner")
    mi_pids = Xmi.reset_index().pid.unique()
    # stage 1 feature selection regression (and stage 2 as regularized ML version instead of full Bayesian for internal checks)
    for MIINDICATOR in ["stage1", "stage2"]:
        X = pd.merge(
            dw,
            gm_wbc,
            on=["pid", "anon-date"],
            how="inner",
            suffixes=["", "_REMOVEgmwbc"])
        if MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2, include microbiome
            X = pd.merge(
                X,
                dmi,
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_dmi"])
            X = pd.merge(
                X,
                divs.reset_index()[["pid", "anon-date", "inverseSimpson"]],
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_ivs"])
        X = pd.merge(
            X,
            dtreatments,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdreatments"])
        X = pd.merge(
            X,
            dinfections,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdinfections"])
        X = pd.merge(
            X,
            pidmeta,
            on=["pid", "n_bmt"],
            how="left",
            suffixes=["", "_REMOVEpidmeta"])
        X = X.loc[~X["log_%s" % WBCTYPE].isna()]
        X = X[[x for x in X.columns if "REMOVE" not in x]]
        X = X.drop(columns=["n_bmt"])
        X_columns = X.columns

        if MIINDICATOR == "stage1":
            # mi_pids: patients with microbiota data
            X = X.loc[X.pid.apply(lambda v: v not in mi_pids)]
        elif MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2
            X = X.loc[X.pid.apply(lambda v: v in mi_pids)]

        # intercepts per transplant type
        X = X.join(pd.get_dummies(X["hct_source"]))
        # drop original HCT type column
        X = X.drop(columns=['hct_source', "PBSC"])  #PBSC as reference
        # intercepts per intensity
        X = X.join(pd.get_dummies(X["Intensity"].fillna("unknown")))
        X = X.drop(columns=["ABLATIVE", "unknown",
                            "Intensity"])  #ABLATIVE as reference
        # intercepts female
        X = X.join(pd.get_dummies(X.sex)['F'])
        X = X.drop(columns='sex')

        # only after engraftment and before day 100
        X = X.query('day > 6').copy()  # smallest observed engraftment day
        # only analyze daily changes
        X = X.query('dt == 1').copy()
        #
        # from join, fill gaps
        X.loc[:, dinfections.columns] = X[dinfections.columns].fillna(0)
        X.loc[:, dtreatments.columns] = X[dtreatments.columns].fillna(0)
        # missing patient ages, fill with mean for ML feature selection
        X["age"] = X["age"].fillna(X["age"].mean())

        ### drop columns
        # delta time columns
        dtcols = [x for x in X.columns if 'dt' in x]
        X = X.drop(columns=dtcols)
        # time point columns
        anoncols = [x for x in X.columns if 'anon' in x]
        X = X.drop(columns=anoncols)
        # patient id columns
        pidcols = [x for x in X.columns if ('pid' in x) and (x != 'pid')]
        remaining_pids = X.reset_index().pid.unique()
        print("pid count in regression", len(remaining_pids))
        X = X.drop(columns=pidcols)
        print("shape before dropna", X.shape)
        X = X.dropna()
        print("shape after dropna (should not change)", X.shape)

        # drop all zero columns
        drop_zero_columns = (X.sum() == 0) & (X.max() == 0)
        drop_zero_columns = drop_zero_columns.loc[
            drop_zero_columns.values].index
        X = X.drop(columns=drop_zero_columns)
        # drop HCT day
        daycolumns = [
            x for x in X.columns if ("day" in x) and (x not in ["day", "eday"])
        ]
        X.drop(columns=daycolumns, inplace=True)

        #### data transformations and standardizations
        from sklearn.preprocessing import MinMaxScaler
        cell_cols = [
            'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils',
            'platelets'
        ]
        # log10 transform with a small offset (0.1 / 2) to handle zero counts
        X.loc[:, cell_cols] = np.log10(X[cell_cols] + (0.1 / 2))
        from sklearn.linear_model import ElasticNetCV
        # fit an elastic net (not OLS) with cross-validated regularization;
        # CV folds are grouped by patient, and X still contains the 'pid' column here
        Xpid = X.copy()
        X = X.drop(columns=['pid'])
        _drug_cols = [x for x in X.columns if x in dtreatments.columns]
        _other_cols = [x for x in X.columns if x not in dtreatments.columns]
        groups = Xpid.pid
        from sklearn.model_selection import GroupKFold
        group_kfold = GroupKFold(n_splits=10)
        cv = list(
            group_kfold.split(
                X.drop(columns='log_%s' % wbc_type),
                X['log_%s' % wbc_type],
                groups))
        mod = ElasticNetCV(
            cv=cv, positive=False, normalize=False, fit_intercept=True)
        res = mod.fit(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        r2 = mod.score(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        print('chosen alpha: %f' % mod.alpha_)
        print("R2 = %f" % r2)
        coefs = pd.Series(
            res.coef_,
            index=X.drop(columns='log_%s' % wbc_type).columns).replace(
                0, np.nan).dropna().sort_values()
        coefs["gr"] = mod.intercept_
        coefs["N"] = len(np.unique(Xpid.pid))
        coefs["n"] = X.shape[0]
        coefs["r2"] = r2
        coefs["alpha"] = mod.alpha_
    # note: only the coefficients from the final loop iteration ("stage2") are returned
    return coefs
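
# --- hedged sketch (not from the original pipeline): the patient-grouped
# ElasticNetCV pattern used above, on synthetic data; all names are illustrative
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 5)),
                      columns=[f"feat{i}" for i in range(5)])
y_demo = X_demo.values @ rng.normal(size=5) + rng.normal(scale=0.1, size=200)
pids = np.repeat(np.arange(20), 10)  # 20 synthetic "patients", 10 rows each

# folds never split one patient across train and validation
cv_demo = list(GroupKFold(n_splits=10).split(X_demo, y_demo, groups=pids))
enet = ElasticNetCV(cv=cv_demo, fit_intercept=True)
enet.fit(MinMaxScaler().fit_transform(X_demo), y_demo)
print("chosen alpha:", enet.alpha_)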
Exemple #17
0
def get_data_loader(config, group_id=None):
    # read data and batching
    train, test, sub = get_data(config)
    features = ['signal']
    train = batching(train, batch_size=config.GROUP_BATCH_SIZE)
    test = batching(test, batch_size=config.GROUP_BATCH_SIZE)

    # data feature engineering
    if config.data_fe is not None and 'shifted' in config.data_fe:
        train, test = normalize(train, test)
        train = run_feat_engineering(train)
        test = run_feat_engineering(test)
        train, test, features = feature_selection(train, test)

    # cross valid
    target = ['open_channels']
    group = train['group']
    kf = GroupKFold(n_splits=config.SPLITS)
    splits = [x for x in kf.split(train, train[target], group)]
    new_splits = []
    for sp in splits:
        new_split = []
        new_split.append(np.unique(group[sp[0]]))
        new_split.append(np.unique(group[sp[1]]))
        new_split.append(sp[1])
        new_splits.append(new_split)
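    # each entry of new_splits holds [unique train group ids, unique valid group
    # ids, row-level valid indices]; the group ids index the group-batched arrays below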

    target_cols = ['open_channels']
    train_tr = np.array(
        list(train.groupby('group').apply(
            lambda x: x[target_cols].values))).astype(np.float32)
    train = np.array(
        list(train.groupby('group').apply(lambda x: x[features].values)))
    test = np.array(
        list(test.groupby('group').apply(lambda x: x[features].values)))
    train_dataloaders = []
    valid_dataloaders = []
    test_dataloaders = []
    for index, (train_index, val_index, _) in enumerate(new_splits):
        # build dataloader
        test_y = np.zeros([
            int(2000000 / config.GROUP_BATCH_SIZE), config.GROUP_BATCH_SIZE, 1
        ])
        test_dataset = IronDataset(test, test_y, config, training=False)
        if group_id is not None:
            train_group_indexs, test_group_indexs = get_group_index(
                group_id, len(train), len(test))
            train_index = np.intersect1d(train_index, train_group_indexs)
            val_index = np.intersect1d(val_index, train_group_indexs)
            test_dataset = IronDataset(test[test_group_indexs],
                                       test_y[test_group_indexs],
                                       config,
                                       training=False)

        test_dataloader = DataLoader(test_dataset,
                                     config.NNBATCHSIZE,
                                     shuffle=False)
        train_dataset = IronDataset(train[train_index],
                                    train_tr[train_index],
                                    config,
                                    training=True)
        train_dataloader = DataLoader(train_dataset,
                                      config.NNBATCHSIZE,
                                      shuffle=True,
                                      num_workers=16)

        valid_dataset = IronDataset(train[val_index],
                                    train_tr[val_index],
                                    config,
                                    training=False)
        valid_dataloader = DataLoader(valid_dataset,
                                      config.NNBATCHSIZE,
                                      shuffle=False)

        train_dataloaders.append(train_dataloader)
        valid_dataloaders.append(valid_dataloader)
        test_dataloaders.append(test_dataloader)
    return train_dataloaders, valid_dataloaders, test_dataloaders
Exemple #18
0
test['filter'] = 2
ts1 = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)

# 14 blocks of 50 time-units (500k rows each); rank time within each block, scaled to (0, 1]
ts1['time2'] = pd.cut(ts1['time'], bins=np.linspace(0.0000, 700., num=14 + 1), labels=list(range(14)), include_lowest=True).astype(int)
ts1['time2'] = ts1.groupby('time2')['time'].rank() / 500000.

np.random.seed(321)
ts1['group'] = pd.cut(ts1['time'], bins=np.linspace(0.0000, 700., num=14*125 + 1), labels=list(range(14*125)), include_lowest=True).astype(int)
np.random.seed(321)

y = ts1.loc[ts1['filter']==0, 'open_channels']
group = ts1.loc[ts1['filter']==0, 'group']
X = ts1.loc[ts1['filter']==0, 'signal']

np.random.seed(321)
gkf = GroupKFold(n_splits=5)
splits = list(gkf.split(X, y, group))

use_cols = [col for col in ts1.columns if col not in ['index','filter','group', 'open_channels', 'time', 'time2']]  

# Create numpy array of inputs
for col in use_cols:
    col_mean = ts1[col].mean()
    ts1[col] = ts1[col].fillna(col_mean)
 
val_preds_all = np.zeros((ts1[ts1['filter']==0].shape[0], 11))
test_preds_all = np.zeros((ts1[ts1['filter']==2].shape[0], 11))

groups = ts1.loc[ts1['filter']==0, 'group']
times = ts1.loc[ts1['filter']==0, 'time']
Exemple #19
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn import svm
from sklearn.decomposition import PCA  # used by getPca below
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score
from features import getFeatures

# File containing values of Username, Comment, 11 Features, Label of user
new = pd.read_csv('Joined_v2.csv')
new = new.drop(new.columns[new.columns.str.contains('unnamed', case=False)],
               axis=1)

kfold = GroupKFold(n_splits=3)


# Calculating the PCA to reduce number of features
def getPca(dat_array):
    pca = PCA(n_components=0.999)
    DATA_PCA = pca.fit_transform(dat_array)

    return DATA_PCA
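
# illustrative usage (hypothetical variable name): reduced = getPca(feature_matrix)
# n_components=0.999 keeps just enough components to explain 99.9% of the variance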


# Getting label value of exact number of users as exact number of comments
def getLabel(sortedData):
    usercnt = sortedData['Label'].value_counts().to_dict()
    numOfComm = list(usercnt.values())[0]
Exemple #20
0
import os
import random

import numpy as np
import torch


# note: the function header below is reconstructed; the original snippet was
# truncated before `def seed_torch(...)` (see the call further down)
def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=CFG.seed)

# ## CV splits

# In[8]:

folds = train.copy()
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = folds['PatientID'].values
for n, (train_index, val_index) in enumerate(
        Fold.split(folds, folds[CFG.target_cols], groups)):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
print(folds.groupby('fold').size())
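
# sanity check: GroupKFold puts each PatientID in exactly one fold
assert folds.groupby('PatientID')['fold'].nunique().max() == 1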

# ## Dataset

# In[9]:

# ====================================================
# Dataset
# ====================================================
COLOR_MAP = {
Exemple #21
0
        embeddings.append(embedding)
    input_numeric = Input(shape=(len(num), ))
    embedding_numeric = Dense(512, activation='relu')(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model


n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
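# grouping on GameId keeps every play of a game inside one fold, avoiding leakage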
for i_, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_+1}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]

    X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]]  # + [X_train[env1]] + [X_train[env2]]
    X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]]  # + [X_val[env1]] + [X_val[env2]]

    model = model_NN()
    model.compile(optimizer='Adam',
                  loss='categorical_crossentropy',
                  metrics=[])
    es = EarlyStopping(monitor='val_CRPS',
Exemple #22
0
def main():

    # Lazy import libraries
    from rlearnlib.utils import (
        predefined_estimators,
        load_training_data,
        save_training_data,
        option_to_list,
        scoring_metrics,
        check_class_weights,
    )
    from rlearnlib.raster import RasterStack

    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn is installed but version 0.20 "
                "or newer is required")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd

    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                classif_file))

    # permutation-based feature importances require scikit-learn >= 0.22
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')

        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                preds_file))

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:

                if mode == "classification":
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            if y.dtype == np.object_:  # np.object is a deprecated alias
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X,
                                          y,
                                          cat,
                                          group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings -----------------------------------------------
    # inner resampling method for the hyperparameter search (n_splits=3)
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=np.unique(y),
                                             y=y)
        fit_params = {"sample_weight": class_weights}

    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(remainder="passthrough",
                                  transformers=[("onehot", enc,
                                                 stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    np.setxor1d(range(stack.count),
                                stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " +
             str(v)) for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (mode == "classification"
                and cv > np.histogram(y, bins=np.unique(y))[0].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples in "
                     "some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(estimator,
                                  X,
                                  y,
                                  group_id,
                                  cv=outer,
                                  n_jobs=n_jobs,
                                  fit_params=fit_params)

        # groups must be forwarded so a GroupKFold outer splitter can split
        test_idx = [test for train, test in outer.split(X, y, groups=group_id)]
        n_fold = np.zeros((0, ))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold, np.repeat(fold,
                                                  test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])
        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).mean())

            score_std = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).std())

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) + "\t" +
                       str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
Exemple #23
0
X_exgal_d0 = X_exgal[~is_ddf]
X_exgal_d1 = X_exgal[is_ddf]
li = [X_exgal_d0.copy() for i in range(37)]

X_exgal = pd.concat([X_exgal_d1] + li, ignore_index=True)

group_exgal = X_exgal.g

y_exgal_d0 = y_exgal[~is_ddf]
y_exgal_d1 = y_exgal[is_ddf]
li = [y_exgal_d0.copy() for i in range(37)]
y_exgal = pd.concat([y_exgal_d1] + li, ignore_index=True)
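# oversampling: non-DDF rows are stacked 37 times next to a single copy of the
# DDF rows before the group-aware CV below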

del li, X_exgal_d0, X_exgal_d1, X_exgal['g'], y_exgal_d0, y_exgal_d1

group_kfold = GroupKFold(n_splits=NFOLD)

print(f'X_gal.shape: {X_gal.shape}')
print(f'X_exgal.shape: {X_exgal.shape}')

gc.collect()

# =============================================================================
# cv(gal)
# =============================================================================
print('==== GAL ====')
param['num_class'] = 5

dtrain = lgb.Dataset(
    X_gal,
    y_gal.values,  #categorical_feature=CAT, 
Exemple #24
0
    def split(
        self,
        X,
        y=None,
        group=None,
        **kwargs
    ):  # `group` is supplied by the caller; used only by group-aware schemes

        if self.validation_scheme is None or isinstance(
                self.validation_scheme, KFold
        ) or self.validation_scheme == FoldScheme.KFold.name or self.validation_scheme == FoldScheme.KFold:
            folds = KFold(n_splits=self.num_folds,
                          random_state=self.random_state,
                          shuffle=self.shuffle)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X)]

        elif isinstance(
                self.validation_scheme, StratifiedKFold
        ) or self.validation_scheme == FoldScheme.StratifiedKFold.name or self.validation_scheme == FoldScheme.StratifiedKFold:
            if y is None or X.shape[0] != y.shape[0]:
                raise ValueError(
                    "Y should be passed and X and Y should be of same length for StratifiedKFold"
                )
            folds = StratifiedKFold(n_splits=self.num_folds,
                                    random_state=self.random_state,
                                    shuffle=self.shuffle)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X, y)]

        elif isinstance(
                self.validation_scheme, GroupKFold
        ) or self.validation_scheme == FoldScheme.GroupKFold.name or self.validation_scheme == FoldScheme.GroupKFold:
            folds = GroupKFold(n_splits=self.num_folds)
            self.indices = [(train_index, test_index)
                            for (train_index,
                                 test_index) in folds.split(X, y, groups=group)
                            ]

        elif isinstance(
                self.validation_scheme, TimeSeriesSplit
        ) or self.validation_scheme == FoldScheme.TimeSeriesSplit.name or self.validation_scheme == FoldScheme.TimeSeriesSplit:
            folds = TimeSeriesSplit(n_splits=self.num_folds)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X)]

        elif self.validation_scheme == FoldScheme.train_test_split.name or self.validation_scheme == FoldScheme.train_test_split:
            # validation_scheme is a simple train test split. testsize is used to determine the size of test samples
            self.indices = [
                train_test_split(range(X.shape[0]),
                                 test_size=self.test_size,
                                 shuffle=self.shuffle)
            ]

        elif callable(self.validation_scheme):
            # validation_scheme is a callable funtion which will take X and y as params.
            self.indices = self.validation_scheme(X, y, **kwargs)

        else:
            if not isinstance(self.validation_scheme, list):
                raise ValueError(
                    "Validation Schema should be a list of (train_indexes, test_indexes)"
                )
            self.indices = self.validation_scheme
        return self.indices
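
    # hedged usage sketch -- the enclosing class name is not shown in this
    # snippet; "CVScheme" below is hypothetical, assuming it stores num_folds,
    # shuffle, random_state and validation_scheme:
    #   splitter = CVScheme(num_folds=5, validation_scheme=FoldScheme.GroupKFold)
    #   for train_idx, test_idx in splitter.split(X, y, group=groups):
    #       ...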
                                        sep_token="[unused0]")
tokenizer = BertTokenizer(BERT_PATH + 'vocab.txt', True)
train_inputs2 = compute_input_arrays_2s(train,
                                        "question_body",
                                        "answer",
                                        tokenizer,
                                        MAX_SEQUENCE_LENGTH,
                                        s1_max_length=254,
                                        s2_max_length=255,
                                        sep_token="[SEP]")
tokenizer = BertTokenizer(BERT_PATH + 'vocab.txt', True)
train_inputs3 = compute_input_arrays(train, "answer", tokenizer,
                                     MAX_SEQUENCE_LENGTH)
train_inputs = train_inputs1 + train_inputs2 + train_inputs3

kf_split = GroupKFold(n_splits=NUM_FOLDS).split(X=train,
                                                groups=train.question_body)
kfold_rho = list()
kfold_rhos = list()
for fold, (train_idx, valid_idx) in enumerate(kf_split):
    print(f" fold: {fold} ".center(100, "#"))
    _train_inputs = [train_inputs[i][train_idx] for i in range(9)]
    _train_targets = train_targets.loc[train_idx, :].values

    _valid_inputs = [train_inputs[i][valid_idx] for i in range(9)]
    _valid_targets = train_targets.loc[valid_idx, :].values

    model = BERTRegressor(bert_path=BERT_PATH,
                          dropout=DROPOUT,
                          hidden_size=768,
                          output_size1=21,
                          output_size2=9)
Exemple #26
0
data_concat = pd.concat(data)
y = data_concat['original']['power']
X = data_concat.drop('power', axis=1, level=1)
X.fillna(X.mean(), inplace=True)
groups = []
for group_idx, activity in enumerate(data):
    groups += [group_idx] * activity.shape[0]
groups = np.array(groups)

# note: GradientBoostingRegressor takes no n_jobs parameter; parallelism is
# controlled through cross_validate's n_jobs instead
scores = cross_validate(GradientBoostingRegressor(random_state=42),
                        X,
                        y,
                        groups=groups,
                        scoring=['r2', 'neg_median_absolute_error'],
                        cv=GroupKFold(n_splits=3),
                        n_jobs=1,
                        return_train_score=True,
                        verbose=0)

print('The obtained scores on training and testing in terms of '
      'R2 and median absolute error are: \n')
print(scores)
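# cross_validate returns a dict with one entry per fold under keys such as
# 'train_r2', 'test_r2', 'test_neg_median_absolute_error', 'fit_time', 'score_time'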

# Store the prediction for visualization
y_pred = cross_val_predict(GradientBoostingRegressor(random_state=42),
                           X,
                           y,
                           groups=groups,
                           cv=GroupKFold(n_splits=3),
Exemple #27
0
                    print('output to file.')
                    sX = pickle.dumps(X)
                    fx.write(sX)
                    sy = pickle.dumps(y)
                    fy.write(sy)

if MODE == 2:
    crf = CRF()
    with codecs.open('model/contract_train_crffeatures.pkl', 'rb') as fx:
        with codecs.open('model/contract_train_crfstates.pkl', 'rb') as fy:
            with codecs.open('model/contract_train_crfmodel.pkl', 'wb') as fm:
                with codecs.open('plain/contract_train_group.utf8', 'r') as fg:
                    with codecs.open('plain/contract_train_group_log.utf8',
                                     'w') as fl:
                        groups = fg.readlines()
                        groupKfold = GroupKFold(n_splits=10)
                        bx = fx.read()
                        by = fy.read()
                        X = pickle.loads(bx)
                        y = pickle.loads(by)
                        for i in range(len(X)):
                            assert len(X[i]) == len(y[i])
                        index = 0
                        for train, test in groupKfold.split(X,
                                                            y,
                                                            groups=groups):
                            print(index)
                            index += 1
                            gX = [X[i] for i in train]
                            gy = [y[i] for i in train]
                            tX = [X[i] for i in test]
Exemple #28
0
              'objective': 'regression',
              'max_depth': 6,
              'learning_rate': LEARNING_RATE,
              "boosting_type": "gbdt",
              "subsample_freq": 1,
              "subsample": 0.9,
              "bagging_seed": 11,
              "metric": 'mae',
              "verbosity": -1,
              'reg_alpha': 0.1,
              'reg_lambda': 0.4,
              'colsample_bytree': 1.0,
              'random_state': RANDOM_STATE
              }

folds = GroupKFold(n_splits=N_FOLDS)

# Setup arrays for storing results
train_df = pd.read_parquet('data/FE008_train.parquet') # only loading for skeleton not features
oof_df = train_df[['id', 'type','scalar_coupling_constant']].copy()
mol_group = train_df[['molecule_name','type']].copy()
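# molecule_name is presumably the GroupKFold grouping key here, so all coupling
# pairs of one molecule stay in the same fold (the split call is outside this excerpt)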
del train_df
gc.collect()

oof_df['oof_preds'] = 0
test_df = pd.read_parquet('data/FE008_test.parquet') # only loading for skeleton not features
prediction = np.zeros(len(test_df))
feature_importance = pd.DataFrame()
test_pred_df = test_df[['id','type','molecule_name']].copy()
del test_df
gc.collect()
Exemple #29
0
bag_times = config['bagging_times']
logging.debug(bag_times)

logging.debug('\n\n=== random_seed_average times =========')
random_seed_average_times = config['random_seed_average_times']
logging.debug(random_seed_average_times)

logging.debug('\n\n=== N Folds =========')
n_fold = config['n_fold']
logging.debug(n_fold)

logging.debug('\n\n=== Folds Type =========')
folds_type = {
    'time_series': TimeSeriesSplit(n_fold),
    'k_fold': KFold(n_fold),
    'group_k_fold': GroupKFold(n_fold),
    'train_test_split_time_series': 'train_test_split_time_series'
}
folds = folds_type[config['folds_type']]
logging.debug(config['folds_type'])
if config['folds_type'] == 'group_k_fold':
    split_groups = train['DT_M']
else:
    split_groups = None

logging.debug('\n\n=== train shape =========')
logging.debug(train.shape)
print('train shape', train.shape)

if model_type == 'cat':
    for col in train:
Exemple #30
0
iris = load_iris()
X = iris.data
y = iris.target

plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit, StratifiedKFold, GroupKFold, GroupShuffleSplit

#cv = KFold(5, random_state = 0)
#cv = LeaveOneOut()
# cv = ShuffleSplit(4, test_size = 0.2)
#cv = StratifiedKFold(4)
# get_n_splits only returns the number of folds; pass the splitter itself as cv
# and forward the groups so the split is actually group-aware
cv = GroupKFold(5)
print(cross_val_score(KNeighborsClassifier(), X, y, groups=X[:, 0], cv=cv))

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import *

y = np.array([1, 2, 2, 3, 5, 2])
y_pred = np.array([5, 2, 2, 5, 7, 1000])

print('MAE :', mean_absolute_error(y, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y, y_pred)))
print('median absolute error :', median_absolute_error(y, y_pred))
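# the single outlier (1000 vs 2) inflates MAE and RMSE, while the median
# absolute error stays at 2.0, illustrating its robustness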

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression