def make_folds(targets, scored, seed, K):
    """Assign a drug-aware CV fold to every row of `scored` (MoA scheme).

    Drugs appearing <= 18 times are stratified at the drug level so all rows
    of one drug land in the same fold; more frequent drugs are stratified at
    the row (sig_id) level.  Adds an int8 'fold' column in place.

    Args:
        targets: list of target column names used for stratification.
        scored: dataframe with 'sig_id', 'drug_id' and the target columns.
        seed: random_state for both splitters.
        K: number of folds.

    Returns:
        `scored` with the new 'fold' column (same object, mutated).
    """
    # LOCATE DRUGS
    vc = scored["drug_id"].value_counts()
    vc1 = vc.loc[vc <= 18].index.sort_values()
    vc2 = vc.loc[vc > 18].index.sort_values()
    # STRATIFY DRUGS 18X OR LESS
    dct1 = {}
    dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.index[idxV].values}  # record which fold each drug_id belongs to
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18X (row level, keyed by sig_id)
    skf = MultilabelStratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
    tmp = scored.loc[scored["drug_id"].isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp["sig_id"][idxV].values}
        dct2.update(dd)
    # ASSIGN K: drug-level mapping first, then fill remaining rows by sig_id
    scored['fold'] = scored.drug_id.map(dct1)
    scored.loc[scored["fold"].isna(), 'fold'] = scored.loc[scored["fold"].isna(), 'sig_id'].map(dct2)
    scored["fold"] = scored["fold"].astype('int8')
    return scored
def create_random_split(train_meta, external_meta=None, n_splits=4, alias='random'):
    """Write multilabel-stratified train/valid split CSVs to disk.

    Splits `train_meta` (and optionally `external_meta`, split independently
    with the same fixed seed) into `n_splits` folds and writes one
    `random_train_cv<i>.csv` / `random_valid_cv<i>.csv` pair per fold under
    DATA_DIR/split/<alias>_folds<n_splits>.

    Args:
        train_meta: competition metadata dataframe with LABEL_NAME_LIST columns.
        external_meta: optional external metadata split the same way and
            concatenated fold-wise onto the competition split.
        n_splits: number of CV folds.
        alias: prefix used for the split directory name.
    """
    split_dir = opj(DATA_DIR, 'split', '%s_folds%d' % (alias, n_splits))
    os.makedirs(split_dir, exist_ok=True)
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)
    train_indices_list, valid_indices_list = [], []
    for train_indices, valid_indices in kf.split(
            train_meta, train_meta[LABEL_NAME_LIST].values):
        train_indices_list.append(train_indices)
        valid_indices_list.append(valid_indices)
    # The external data gets its own independent stratified split.
    ext_train_indices_list, ext_valid_indices_list = [], []
    if external_meta is not None:
        ext_kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)
        for ext_train_indices, ext_valid_indices in ext_kf.split(
                external_meta, external_meta[LABEL_NAME_LIST].values):
            ext_train_indices_list.append(ext_train_indices)
            ext_valid_indices_list.append(ext_valid_indices)
    for idx in range(n_splits):
        train_split_df = train_meta.loc[train_indices_list[idx]]
        valid_split_df = train_meta.loc[valid_indices_list[idx]]
        if external_meta is not None:
            # Append the matching external fold to both halves of the split.
            train_split_df = pd.concat(
                (train_split_df, external_meta.loc[ext_train_indices_list[idx]]),
                ignore_index=True)
            valid_split_df = pd.concat(
                (valid_split_df, external_meta.loc[ext_valid_indices_list[idx]]),
                ignore_index=True)
        # Keep only the columns the downstream loaders expect.
        train_split_df = train_split_df[
            [ID, TARGET, EXTERNAL, ANTIBODY, ANTIBODY_CODE] + LABEL_NAME_LIST]
        valid_split_df = valid_split_df[
            [ID, TARGET, EXTERNAL, ANTIBODY, ANTIBODY_CODE] + LABEL_NAME_LIST]
        if idx == 0:
            # Print per-class positive counts for a quick balance sanity check.
            for name in LABEL_NAMES.values():
                print(name, (train_split_df[name] == 1).sum(),
                      (valid_split_df[name] == 1).sum())
        fname = opj(split_dir, 'random_train_cv%d.csv' % (idx))
        print("create split file: %s, shape: %s" % (fname, str(train_split_df.shape)))
        train_split_df.to_csv(fname, index=False)
        fname = opj(split_dir, 'random_valid_cv%d.csv' % (idx))
        print("create split file: %s, shape: %s" % (fname, str(valid_split_df.shape)))
        valid_split_df.to_csv(fname, index=False)
def get_best_epoch(cfg):
    """Train on each (train-fold, external-fold) combination named in cfg['fold'].

    Builds 2 folds over the competition data and 3 folds over the external
    HPAv18 data, yielding 6 combinations indexed `fold_i * 3 + fold_j`; only
    the combinations whose index appears in cfg['fold'] are trained.

    Args:
        cfg: config dict; uses 'gray' (external image dir choice) and
            'fold' (iterable of combination indices to train).
    """
    print(cfg)
    dataset = pd.read_csv(data_dir + 'train.csv')
    dataset['Id'] = data_dir + 'img_data/' + dataset['Id']
    dataset['suffix'] = '.png'
    ex_data = pd.read_csv(data_dir + 'HPAv18RBGY_wodpl.csv')
    if cfg['gray']:
        ex_data['Id'] = data_dir + 'HPAv18_gray/' + ex_data['Id']
    else:
        ex_data['Id'] = data_dir + 'HPAv18/' + ex_data['Id']
    ex_data['suffix'] = '.jpg'
    target = get_label(dataset)
    extarget = get_label(ex_data)
    folds = MultilabelStratifiedKFold(2, shuffle=True, random_state=66666)
    exfolds = MultilabelStratifiedKFold(3, shuffle=True, random_state=66666)
    for fold_i, (tr_idx, val_idx) in enumerate(folds.split(dataset['Id'], target)):
        for fold_j, (ex_tr_idx, _) in enumerate(exfolds.split(ex_data['Id'], extarget)):
            if fold_i * 3 + fold_j not in cfg['fold']:
                continue
            print(fold_i, fold_j, '-----------------')
            # FIX: pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0.  Validation uses competition data only.
            tr_data = pd.concat(
                [ex_data.iloc[ex_tr_idx], dataset.iloc[tr_idx]]
            ).reset_index(drop=True)
            val_data = dataset.iloc[val_idx]
            train(fold_i * 3 + fold_j, tr_data, val_data, cfg)
def add_fold(train, targets):
    """Attach drug-aware CV fold labels to `train` (MoA competition scheme).

    Based on https://www.kaggle.com/c/lish-moa/discussion/195195: drugs seen
    <= 18 times are stratified at the drug level, the rest at the row level.

    Args:
        train: dataframe with 'sig_id'; gains an int8 'fold' column.
        targets: scored-targets dataframe whose first column is 'sig_id'.

    Returns:
        (train, targets) with 'fold' added to train and 'drug_id' removed
        from targets again.
    """
    # LOAD LIBRARIES
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    # LOAD FILES
    drug = pd.read_csv(os.path.join(DATA_DIR, 'train_drug.csv'))
    target_cols = targets.columns[1:]
    targets = targets.merge(drug, on='sig_id', how='left')
    # LOCATE DRUGS. 18 or 1000
    vc = targets.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index
    vc2 = vc.loc[vc > 18].index
    # STRATIFY DRUGS 18X OR LESS
    # NOTE(review): no random_state is passed, so the folds differ run to
    # run — confirm this non-reproducibility is intentional.
    dct1 = {}
    dct2 = {}
    cv = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True)
    tmp = targets.groupby('drug_id')[target_cols].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(cv.split(tmp, tmp[target_cols])):
        dd = {k: fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18X (row level, keyed by sig_id)
    cv = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True)
    tmp = targets.loc[targets.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(cv.split(tmp, tmp[target_cols])):
        dd = {k: fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    # ASSIGN FOLDS
    # NOTE(review): assumes `train` and `targets` are row-aligned here —
    # verify the caller passes them in the same order.
    train['fold'] = targets.drug_id.map(dct1)
    train.loc[train.fold.isna(), 'fold'] = train.loc[train.fold.isna(), 'sig_id'].map(dct2)
    train.fold = train.fold.astype('int8')
    targets.drop(['drug_id'], axis=1, inplace=True)
    return train, targets
def process_score(scored, targets, seed=42, folds=7):
    """Add a drug-aware 'kfold' column to `scored` (MoA scheme).

    Drugs seen <= 18 times are stratified at the drug level (all of a drug's
    rows share a fold); more frequent drugs are stratified per row.

    Args:
        scored: dataframe with 'sig_id', 'drug_id' and target columns.
        targets: list of target column names for stratification.
        seed: random_state for both splitters.
        folds: number of folds.

    Returns:
        `scored` with an int8 'kfold' column (same object, mutated).
    """
    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index.sort_values()
    vc2 = vc.loc[vc > 18].index.sort_values()
    # STRATIFY DRUGS 18X OR LESS
    dct1 = {}
    dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.index[idxV].values}  # drug_id -> fold
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18X (row level, keyed by sig_id)
    skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    # ASSIGN FOLDS: drug-level first, then fill remaining rows by sig_id
    scored['kfold'] = scored.drug_id.map(dct1)
    scored.loc[scored.kfold.isna(),'kfold'] =\
        scored.loc[scored.kfold.isna(),'sig_id'].map(dct2)
    scored.kfold = scored.kfold.astype('int8')
    return scored
def make_folds(train, num_starts, num_splits):
    """Add `num_starts` independent drug-aware fold assignments to `train`.

    Loads the MoA competition files, drops `ctl_vehicle` rows, and for each
    seed in range(num_starts) computes a `num_splits`-fold drug-stratified
    assignment, stored as columns 'seed0'..'seed<num_starts-1>'.

    Returns:
        A copy of `train` with one fold column per seed.
    """
    train_ = train.copy()
    folds = []
    # LOAD FILES
    train_feats = pd.read_csv('../input/lish-moa/train_features.csv')
    scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')
    # Keep only treated compounds (drop the control vehicle rows).
    scored = scored.loc[train_feats['cp_type'] == 'trt_cp', :]
    drug = drug.loc[train_feats['cp_type'] == 'trt_cp', :]
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')
    # LOCATE DRUGS: rare/exact-replicate counts go to the drug-level split,
    # the rest to the row-level split.
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[(vc <= 6) | (vc == 12) | (vc == 18)].index.sort_values()
    vc2 = vc.loc[(vc > 6) & (vc != 12) & (vc != 18)].index.sort_values()
    for seed in range(num_starts):
        # STRATIFY DRUGS 18X OR LESS
        dct1 = {}
        dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.index[idxV].values}
            dct1.update(dd)
        # STRATIFY DRUGS MORE THAN 18X (row level, keyed by sig_id)
        skf = MultilabelStratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)
        # ASSIGN FOLDS for this seed, then drop the scratch column.
        scored['fold'] = scored.drug_id.map(dct1)
        scored.loc[scored.fold.isna(), 'fold'] = scored.loc[scored.fold.isna(), 'sig_id'].map(dct2)
        scored.fold = scored.fold.astype('int8')
        folds.append(scored.fold.values)
        del scored['fold']
    for i in range(len(folds)):
        train_[f'seed{i}'] = folds[i]
    return train_
def create_cv(X, y, drugs, sig_ids, threshold=1000, folds=5, seed=42):
    """Build drug-aware CV (train_idx, val_idx) pairs.

    Drugs with <= `threshold` rows are stratified at the drug level; the
    rest per row.  Inputs are copied, so callers' frames are not mutated.

    Args:
        X: feature dataframe.
        y: multilabel target dataframe (same row order as X).
        drugs: per-row drug ids.
        sig_ids: per-row signature ids.
        threshold: drug-frequency cutoff between the two split strategies.
        folds: number of folds.
        seed: random_state for both splitters (also passed to seed_everything).

    Returns:
        List of `folds` (train_positions, val_positions) numpy index pairs.
    """
    seed_everything(seed)
    y_cols = y.columns.tolist()
    X = X.copy()
    y = y.copy()
    # Work on one combined frame carrying features, targets and ids.
    X = pd.concat([X, y], axis=1)
    X['drug_id'] = drugs
    X['sig_id'] = sig_ids
    # Locate drugs on either side of the frequency threshold.
    drugs_count = X['drug_id'].value_counts()
    drugs_below_thresh = drugs_count.loc[
        drugs_count <= threshold].index.sort_values()
    drugs_above_thresh = drugs_count.loc[
        drugs_count > threshold].index.sort_values()
    dct_below_thresh = {}
    dct_above_thresh = {}
    # Stratify below threshold (drug level: one fold per drug).
    skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    tmp = X.groupby('drug_id')[y_cols].mean().loc[drugs_below_thresh]
    for f, (idxT, idxV) in enumerate(skf.split(tmp, tmp[y_cols])):
        dd = {k: f for k in tmp.index[idxV].values}
        dct_below_thresh.update(dd)
    # Stratify above threshold (row level, keyed by sig_id).
    skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    tmp = X.loc[X['drug_id'].isin(drugs_above_thresh)].reset_index(drop=True)
    for f, (idxT, idxV) in enumerate(skf.split(tmp, tmp[y_cols])):
        dd = {k: f for k in tmp.sig_id[idxV].values}
        dct_above_thresh.update(dd)
    # ASSIGN FOLDS: drug-level mapping first, then fill the rest by sig_id.
    X['fold'] = X['drug_id'].map(dct_below_thresh)
    X.loc[X['fold'].isna(), 'fold'] = X.loc[X['fold'].isna(), 'sig_id'].map(dct_above_thresh)
    X['fold'] = X['fold'].astype('int8')
    # Convert the fold labels into positional (train, val) index pairs.
    oof_assignment = X['fold'].values
    oof_idx = []
    for x in np.arange(folds):
        train = np.where(oof_assignment != x)[0]
        val = np.where(oof_assignment == x)[0]
        oof_idx.append((train, val))
    return oof_idx
def make_folds(drug, scored, folds, random_state, stratify=True, drug_thresh=18):
    """Compute drug-aware fold assignments and return them keyed by sig_id.

    Args:
        drug: dataframe mapping 'sig_id' -> 'drug_id'.
        scored: targets dataframe whose first column is 'sig_id'.
        folds: number of folds.
        random_state: seed for the splitters.
        stratify: use MultilabelStratifiedKFold when True, plain KFold otherwise.
        drug_thresh: drug-frequency cutoff between drug-level and row-level splits.

    Returns:
        New dataframe with just 'sig_id' and the int8 'fold' column.
    """
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')
    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= drug_thresh].index.sort_values()
    vc2 = vc.loc[vc > drug_thresh].index.sort_values()
    # STRATIFY DRUGS 18 OR LESS (drug level: all rows of a drug share a fold)
    dct1 = {}
    dct2 = {}
    if stratify:
        skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    else:
        skf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18 (row level, keyed by sig_id)
    if stratify:
        skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    else:
        skf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    # ASSIGN FOLDS: drug-level mapping first, then fill the rest by sig_id
    scored['fold'] = np.nan
    scored['fold'] = scored.drug_id.map(dct1)
    scored.loc[scored.fold.isna(), 'fold'] = scored.loc[scored.fold.isna(), 'sig_id'].map(dct2)
    scored.fold = scored.fold.astype('int8')
    return scored[['sig_id', 'fold']].copy()
def split_data(x, y, n_splits):
    """Yield ((x_train, y_train), (x_valid, y_valid)) dataframe pairs from a
    multilabel-stratified K-fold split seeded with the module-level SEED."""
    splitter = MultilabelStratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    for tr_idx, va_idx in splitter.split(x, y):
        train_pair = (x.iloc[tr_idx], y.iloc[tr_idx])
        valid_pair = (x.iloc[va_idx], y.iloc[va_idx])
        yield train_pair, valid_pair
def split_data(config, df):
    """split data into training and validation data

    Folds are assigned at the patient level (all rows of one patient share a
    fold) to avoid leaking a patient across train and validation.

    Args:
        config: CFG; uses `n_folds` and `seed`.
        df: dataframe with 'patient_id', 'target' and 'sex' columns.

    Returns:
        df: dataframe object with fold (mutated in place as well).
    """
    df['fold'] = -1
    # MultilabelStratifiedKFold
    mskf = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=config.seed, shuffle=True)
    # Build one row per patient: any-positive target, first-seen sex, and the
    # patient's row count bucketed into deciles.
    patient = df.groupby("patient_id")['target'].apply(lambda v: (v == 1).any())
    patient = pd.concat(
        [patient, df.groupby('patient_id')['sex'].apply(lambda v: v.iloc[0])],
        axis=1)
    patient = pd.concat([patient, df.groupby("patient_id").size()],
                        axis=1).rename({0: "size"}, axis=1)
    patient['size'] = pd.qcut(patient['size'], 10, labels=range(10))
    # NOTE(review): `y=patient.values` mixes bool, string and categorical
    # columns; confirm the splitter handles these as stratification labels.
    for fold, (tr_idx, vl_idx) in enumerate(mskf.split(X=patient, y=patient.values)):
        # Expand the per-patient validation indices back to row indices.
        vl_idx = df[df['patient_id'].isin(patient.iloc[vl_idx].index)].index
        df.loc[vl_idx, "fold"] = fold
    return df
def main(cfg):
    """Train the folds listed in cfg['fold'] on the HPA training data,
    optionally extended with the external HPAv18 dataset.

    Args:
        cfg: config dict; uses 'use_external_data', 'nfold' and 'fold'
            (iterable of fold indices to actually train).
    """
    data_dir = './input/'
    dataset = pd.read_csv(data_dir + 'train.csv')
    dataset['Id'] = data_dir + 'train/' + dataset['Id']
    dataset['suffix'] = '.png'
    if cfg['use_external_data']:
        ex_data = pd.read_csv(data_dir + 'HPAv18RBGY_wodpl.csv')
        ex_data['Id'] = data_dir + 'HPAv18_512/' + ex_data['Id']
        ex_data['suffix'] = '.jpg'
        # FIX: pd.concat replaces DataFrame.append (removed in pandas 2.0).
        dataset = pd.concat([dataset, ex_data]).reset_index(drop=True)
    print(cfg)
    # Build the 28-class multi-hot target matrix from the space-separated labels.
    target = np.zeros((len(dataset), 28))
    for i, labels in enumerate(dataset['Target']):
        labels = [int(t) for t in labels.split() if t != '']
        for l in labels:
            target[i, l] = 1
    folds = MultilabelStratifiedKFold(cfg['nfold'], shuffle=True, random_state=66666)
    for n_fold, (tr_idx, val_idx) in enumerate(folds.split(dataset['Id'], target)):
        print(val_idx)
        if n_fold not in cfg['fold']:
            continue
        print(n_fold, '-----------------')
        tr_data = dataset.iloc[tr_idx]
        val_data = dataset.iloc[val_idx]
        train(n_fold, tr_data, val_data, cfg)
def train(model, device, X, Y, n_splits=10, batch_size=4096, epochs=50):
    """Train `model` across a multilabel-stratified K-fold split of (X, Y).

    Each fold trains for `epochs` epochs via `batch_gd`; the per-epoch losses
    of all folds are concatenated and returned.

    Args:
        model: torch model exposing `.parameters()` and `.save(name)`.
        device: device passed through to `batch_gd`.
        X, Y: index-able arrays of features and multilabel targets.
        n_splits: number of CV folds.
        batch_size: DataLoader batch size.
        epochs: epochs per fold.

    Returns:
        (train_losses, val_losses): concatenated per-epoch loss arrays.
    """
    kfold = MultilabelStratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters())
    train_losses = np.array([])
    val_losses = np.array([])
    for n, (tr, te) in enumerate(kfold.split(X, Y)):
        X_train, X_val = X[tr], X[te]
        y_train, y_val = Y[tr], Y[te]
        train_dataset = MoaDataset(X_train, y_train)
        val_dataset = MoaDataset(X_val, y_val)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False)
        split_train_losses, split_val_losses = batch_gd(
            model, device, criterion, optimizer, train_loader, val_loader, epochs)
        # FIX: the second number is the validation loss; the message used to
        # say "final train loss" for both values.
        print(
            f"Fold {n+1}, final train loss: {split_train_losses[epochs-1]:5.5f}, final val loss: {split_val_losses[epochs-1]:5.5f}"
        )
        train_losses = np.concatenate((train_losses, split_train_losses))
        val_losses = np.concatenate((val_losses, split_val_losses))
    model.save("latest_model")
    return train_losses, val_losses
def main(cfg):
    """Train the folds listed in cfg['fold'] on the HPA data, optionally
    extended with the external HPAv18 dataset (gray or RGB variant).

    Args:
        cfg: config dict; uses 'use_external_data', 'gray', 'nfold' and
            'fold' (iterable of fold indices to actually train).
    """
    dataset = pd.read_csv(data_dir + 'train.csv')
    dataset['Id'] = data_dir + 'img_data/' + dataset['Id']
    dataset['suffix'] = '.png'
    if cfg['use_external_data']:
        ex_data = pd.read_csv(data_dir + 'HPAv18RBGY_wodpl.csv')
        if cfg['gray']:
            ex_data['Id'] = data_dir + 'HPAv18_gray/' + ex_data['Id']
        else:
            ex_data['Id'] = data_dir + 'HPAv18/' + ex_data['Id']
        ex_data['suffix'] = '.jpg'
        # FIX: pd.concat replaces DataFrame.append (removed in pandas 2.0).
        dataset = pd.concat([dataset, ex_data]).reset_index(drop=True)
    print(cfg)
    target = get_label(dataset)
    folds = MultilabelStratifiedKFold(cfg['nfold'], shuffle=True, random_state=66666)
    for n_fold, (tr_idx, val_idx) in enumerate(folds.split(dataset['Id'], target)):
        print(val_idx)
        if n_fold not in cfg['fold']:
            continue
        print(n_fold, '-----------------')
        tr_data = dataset.iloc[tr_idx]
        val_data = dataset.iloc[val_idx]
        train(n_fold, tr_data, val_data, cfg)
def run_train(features, targets, seed):
    """Run K-fold training for one seed and export the out-of-fold predictions.

    Loads the cached preprocessed training frame, trains a MoaModel per fold,
    collects out-of-fold predictions, logs the OOF metric, and writes the OOF
    dataframe to '<VERSION>_oof_seed<seed>.csv'.

    Args:
        features: list of feature column names.
        targets: list of target column names.
        seed: random_state for the fold split (and passed to model.fit).
    """
    data = pd.read_pickle('./cached/preprocessing_train.pkl')
    oof = np.zeros(data[targets].shape)
    mskf = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=seed)
    for n_fold, (train_idx, valid_idx) in enumerate(
            mskf.split(data[features], data[targets])):
        logger.info('>' * 5 + f" Fold {n_fold+1}")
        logger.info(
            f"train_size: {len(train_idx)} valid_size: {len(valid_idx)}")
        # Split train and valid.
        X_train, y_train = data.iloc[train_idx][features], data.iloc[
            train_idx][targets]
        X_valid, y_valid = data.iloc[valid_idx][features], data.iloc[
            valid_idx][targets]
        # Fit model and cv pred.
        with timer(f'fold {n_fold+1} train time'):
            estimater = MoaModel(len(features), len(targets)).to(device)
            model = MoaModelTrainer(estimater)
            oof[valid_idx] = model.fit(X_train, y_train, X_valid, y_valid,
                                       n_fold, seed)
    # Evaluation.
    oof_metric = moa_loss(oof, data[targets].to_numpy())
    logger.info(f'\nMetric of oof: {oof_metric}\n')
    # Export oof dataframe.
    oof_df = data[['sig_id']].reset_index(drop=True)
    oof_df = oof_df.join(
        pd.DataFrame(oof, columns=targets).add_prefix('pred_'))
    oof_df.to_csv(f'{VERSION}_oof_seed{seed}.csv', index=False)
def make_folds(train_features, train_targets_scored, n_folds, seed):
    """Attach a 'fold' column to the training features (MoA data).

    Treated rows (cp_type != 'ctl_vehicle') get a multilabel-stratified fold
    in [0, n_folds); control-vehicle rows get the sentinel fold 100 so they
    never appear in any validation split.

    Args:
        train_features: features dataframe with 'sig_id' and 'cp_type'.
        train_targets_scored: targets dataframe whose first column is 'sig_id'.
        n_folds: number of folds for the treated rows.
        seed: random_state for the splitter.

    Returns:
        `train_features` merged with the per-sig_id 'fold' column.
    """
    train = train_features.merge(train_targets_scored, on='sig_id')
    target_cols = [
        c for c in train_targets_scored.columns if c not in ['sig_id']
    ]
    # (removed unused local `cols = target_cols + ['cp_type']`)
    train_cp = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    train_cp.loc[:, 'fold'] = 0
    for n, (train_index, val_index) in enumerate(mskf.split(train_cp, train_cp[target_cols])):
        train_cp.loc[val_index, 'fold'] = int(n)
    train_cp['fold'] = train_cp['fold'].astype(int)
    # Control rows are parked in fold 100 (never used for validation).
    train_ctl = train[train['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
    train_ctl.loc[:, 'fold'] = 100
    train = pd.concat([train_cp, train_ctl])
    train_features_with_fold = train_features.merge(train[['sig_id', 'fold']],
                                                    on='sig_id')
    return train_features_with_fold
def main():
    """Build a multilabel-stratified k-fold assignment for the cloud-pattern
    dataset and write it next to the input CSV as 'train_<k>kfold.csv'.

    Folds are numbered from 1; column 'kfold' marks each image's fold.
    """
    args = make_parse()
    df = pd.read_csv(args.train_df)
    # Add columns
    df['Label'] = df.Image_Label.map(lambda x: x.split('_')[1])
    df['ImageId'] = df.Image_Label.map(lambda x: x.split('_')[0])
    # Extract unique image ids
    labels = ['Fish', 'Flower', 'Gravel', 'Sugar']
    df_new = pd.DataFrame({'ImageId': df.ImageId.unique()})
    df_new['kfold'] = -1
    for l in labels:
        df_tmp = (df[df.Label == l].drop(
            columns=['Image_Label', 'Label']).rename(columns={
                'EncodedPixels': l
            }).reset_index(drop=True))
        df_new = df_new.merge(df_tmp, on='ImageId')
    # Make one-hot vector (a class is present iff its mask is non-null).
    df_new2 = df_new.copy()
    for l in labels:
        df_new2[l] = (~pd.isna(df_new2[l])).astype('int')
    y = df_new2.iloc[:, 2:].values
    # Make kfolds.  FIX: shuffle=True is required — sklearn-style splitters
    # raise ValueError when a random_state is given while shuffle is False.
    indxs = list(range(len(df_new2)))
    mskf = MultilabelStratifiedKFold(n_splits=args.kfold, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(mskf.split(indxs, y)):
        df_new.loc[test_index, 'kfold'] = i + 1
    new_path = Path(args.train_df).parent / f'train_{args.kfold}kfold.csv'
    df_new.to_csv(new_path, index=False)
def _k_fold(df: pd.DataFrame, n_splits: int, random_state: Any = 42):
    """Return a multilabel-stratified (train, val) split generator over df.

    FIX: shuffle=True is required — sklearn-style splitters raise ValueError
    when a random_state is supplied while shuffle is False.

    Args:
        df: dataframe with 'Id' and 'Target' columns.
        n_splits: number of folds.
        random_state: seed for the (now shuffled) split.

    Returns:
        Generator of (train_indices, val_indices) pairs.
    """
    X = np.array(df.Id)
    y = np.array(
        [HumanProteinDataset.parse_target(target) for target in df.Target])
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True,
                                     random_state=random_state)
    return mskf.split(X, y)
def get_pids(self):
    """Return the patient/image ids for this dataset's split.

    Builds a 15-class multi-hot label matrix from each entry's bbox class
    column, runs a 5-fold multilabel-stratified split, advances to the fold
    indexed by cfgs['fold'], and returns the train or val ids per self.mode.
    """
    # FIXME: This is based on label distribution
    # MultilabelStratified KFold
    xs = np.array(list(self.meta_dict.keys()))
    ys = []
    for v in self.meta_dict.values():
        # Column 2 of each bbox row holds the class index (0..14).
        temp_lbl = np.array(v["bbox"])[:, 2]
        temp = np.zeros((15))
        for i in temp_lbl:
            temp[int(i)] = 1
        ys.append(temp)
    ys = np.array(ys)
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=30)
    kfold_generator = mskf.split(xs, ys)
    # Advance the generator until the configured fold is reached.
    for _ in range(self.cfgs["fold"] + 1):
        train_index, val_index = next(kfold_generator)
    if self.mode == "train":
        pids = xs[train_index]
    elif self.mode == "val":
        pids = xs[val_index]
    # NOTE(review): if self.mode is neither "train" nor "val", `pids` is
    # unbound and this raises NameError — confirm mode is always validated.
    return pids
def split_stratified(all_examples_dict):
    """Split examples into 11 disjoint multilabel-stratified folds.

    FIX: shuffle=True is required — sklearn-style splitters raise ValueError
    when a random_state is supplied while shuffle is False.

    Args:
        all_examples_dict: mapping of example key -> iterable of label ids
            in [0, 28).

    Returns:
        (examples, folds): examples is a list of (key, labels) pairs in
        iteration order; folds is a list of 11 pairwise-disjoint index arrays
        into that list.
    """
    examples = []
    y_list = []
    for key, labels in all_examples_dict.items():
        labels = list(labels)
        # Multi-hot encode the label set over the 28 classes.
        np_labels = np.zeros((28,), dtype=int)
        np_labels[np.array(labels)] = 1
        examples.append((key, labels))
        y_list.append(np_labels)
    X = np.arange(len(y_list))
    y = np.array(y_list)
    mskf = MultilabelStratifiedKFold(n_splits=11, shuffle=True, random_state=1234)
    folds = []
    for train_index, test_index in mskf.split(X, y):
        folds.append(test_index)
    # Sanity check: fold index sets are pairwise disjoint.
    for a, b in combinations(folds, 2):
        assert len(set(a) & set(b)) == 0
    return examples, folds
def split_df(self, df):
    """Add one-hot fold-membership columns fold_0..fold_{k-1} to `df`.

    For each fold k, column `fold_k` is 1 on that fold's validation rows and
    0 elsewhere.
    """
    kf = MultilabelStratifiedKFold(n_splits=self.config.fold_num, shuffle=True, random_state=self.config.seed)
    # NOTE(review): the stratification matrix is taken from columns
    # 1..n_classes of `df`, but the class count is read from the *global*
    # `train_df`, not from `df` itself — verify this is intentional (it only
    # works when `df` is `train_df` or shares its column layout).
    for fold, (train_idx, val_idx) in enumerate(kf.split(df, df.iloc[:, 1:train_df['class_id'].nunique() + 1])):
        df[f'fold_{fold}'] = 0
        df.loc[val_idx, f'fold_{fold}'] = 1
    return df
def ensemble_training(k_folds=5):
    """Train one model per multilabel-stratified fold of the HPA training set.

    Args:
        k_folds: number of folds (one model trained per fold).
    """
    # Set up file logging.
    log = Logger()
    log.open(os.path.join(config.logs_dir, "%s_log_train.txt" % config.model_name), mode="a")
    # load dataset
    all_files = pd.read_csv(config.train_csv)
    image_names = all_files['Id']
    labels_strs = all_files['Target']
    image_labels = []
    for cur_label_str in labels_strs:
        # Multi-hot encode the space-separated class list.
        # FIX: `float` replaces `np.float`, which was removed in NumPy 1.24.
        cur_label = np.eye(config.num_classes, dtype=float)[
            np.array(list(map(int, cur_label_str.split(' '))))].sum(axis=0)
        image_labels.append(cur_label)
    image_labels = np.stack(image_labels, axis=0)
    msss = MultilabelStratifiedKFold(n_splits=k_folds)
    # enumerate replaces the manual fold counter.
    for i, (train_index, val_index) in enumerate(msss.split(image_names, image_labels)):
        model = get_net()
        model.cuda()
        train_image_names = image_names[train_index]
        train_image_labels = image_labels[train_index]
        val_image_names = image_names[val_index]
        val_image_labels = image_labels[val_index]
        training(model, i, log, train_image_names, train_image_labels,
                 val_image_names, val_image_labels)
def main():
    """Create per-fold train/val split files for mosaic and tile annotations.

    Mosaics are stratified jointly on source, median bbox size, and bbox
    count; tile ids are recovered from the mosaic ids (which join tile ids
    with '_').  One sub-directory per fold is written under args.output_root.
    """
    args = parse_args()
    os.makedirs(args.output_root, exist_ok=True)
    tile_annotations = pd.read_csv(args.annotation_path)
    # 'bbox' cells are stringified lists; parse them back to Python lists.
    mosaic_annotations = pd.read_csv(args.mosaic_path, converters={"bbox": ast.literal_eval})
    mosaic_annotations["num_of_bboxes"] = mosaic_annotations["image_id"].map(
        mosaic_annotations["image_id"].value_counts()
    )
    # Geometric-mean side length of each bbox (sqrt of width * height).
    mosaic_annotations["median_area"] = mosaic_annotations["bbox"].apply(lambda x: np.sqrt(x[-1] * x[-2]))
    mosaic_annotations["source_index"] = mosaic_annotations["source"].apply(lambda x: SOURCES.index(x))
    # One row per mosaic image, excluding the held-out validation sources.
    images = (
        mosaic_annotations[["image_id", "source_index", "median_area", "num_of_bboxes", "source"]]
        .copy()
        .drop_duplicates("image_id")
    )
    images = images[~images["source"].isin(VAL_SOURCES)]
    splitter = MultilabelStratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=3)
    for i, (train_index, test_index) in enumerate(
        splitter.split(images, images[["source_index", "median_area", "num_of_bboxes"]])
    ):
        mosaic_val_ids = images.iloc[test_index, images.columns.get_loc("image_id")]
        # Mosaic ids are '_'-joined tile ids; flatten them into one list.
        tile_val_ids = sum([x.split("_") for x in mosaic_val_ids], [])
        fold_root = osp.join(args.output_root, str(i))
        save_split(mosaic_annotations, mosaic_val_ids, fold_root, prefix="mosaic")
        save_split(tile_annotations, tile_val_ids, fold_root, prefix="tile")
def make_folds(folds=5, random_state=0, stratify=True, scored=None):
    """Build drug-aware MoA fold assignments keyed by sig_id.

    Drugs appearing <= 18 times are stratified at the drug level; more
    frequent drugs are stratified per row.

    Args:
        folds: number of folds.
        random_state: seed for the splitters.
        stratify: use MultilabelStratifiedKFold when True, plain KFold otherwise.
        scored: optional pre-loaded targets dataframe; read from disk if None.

    Returns:
        New dataframe with just 'sig_id' and the int8 'fold' column.
    """
    drug = pd.read_csv("../input/lish-moa/train_drug.csv")
    if scored is None:
        scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    targets = scored.columns[1:]
    scored = scored.merge(drug, on="sig_id", how="left")
    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index.sort_values()
    vc2 = vc.loc[vc > 18].index.sort_values()
    # STRATIFY DRUGS 18 OR LESS (drug level: all rows of a drug share a fold)
    dct1 = {}
    dct2 = {}
    if stratify:
        skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    else:
        skf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    tmp = scored.groupby("drug_id")[targets].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18 (row level, keyed by sig_id)
    if stratify:
        skf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
    else:
        skf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dd = {k: fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    # ASSIGN FOLDS: drug-level mapping first, then fill the rest by sig_id
    scored["fold"] = np.nan
    scored["fold"] = scored.drug_id.map(dct1)
    scored.loc[scored.fold.isna(), "fold"] = scored.loc[scored.fold.isna(), "sig_id"].map(dct2)
    scored.fold = scored.fold.astype("int8")
    return scored[["sig_id", "fold"]].copy()
def split_data():
    """Create and cache the drug-aware MoA fold file (idempotent).

    If 'input/folds/train_folds.csv' already exists this is a no-op;
    otherwise the drug-aware stratification (drug level for drugs seen <= 18
    times, row level otherwise) is computed with fixed SEED/FOLDS and saved.
    """
    SEED, FOLDS = 42, 5
    print("Split data")
    path_fold = "input/folds/train_folds.csv"
    if not exists(path_fold):
        scored = pd.read_csv("input/lish-moa/train_targets_scored.csv")
        drug = pd.read_csv("input/lish-moa/train_drug.csv")
        targets = scored.columns[1:]
        scored = scored.merge(drug, on="sig_id", how="left")
        # LOCATE DRUGS
        vc = scored.drug_id.value_counts()
        vc1 = vc.loc[vc <= 18].index.sort_values()
        vc2 = vc.loc[vc > 18].index.sort_values()
        # STRATIFY DRUGS 18X OR LESS
        dct1 = {}
        dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
        tmp = scored.groupby("drug_id")[targets].mean().loc[vc1]
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.index[idxV].values}
            dct1.update(dd)
        # STRATIFY DRUGS MORE THAN 18X (row level, keyed by sig_id)
        skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
        tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)
        # ASSIGN FOLDS: drug-level first, then fill remaining rows by sig_id
        scored["kfold"] = scored.drug_id.map(dct1)
        scored.loc[scored.kfold.isna(), "kfold"] = scored.loc[scored.kfold.isna(), "sig_id"].map(
            dct2
        )
        scored.kfold = scored.kfold.astype("int8")
        # SAVE FOLDS
        scored.drop("drug_id", axis=1, inplace=True)
        scored.to_csv(path_fold, index=False)
        print(f"Created: {path_fold}")
    else:
        print("Skipped: already exists")
def generate_k_folds(k_folds, labels):
    """Split `labels` into k multilabel-stratified folds and pickle each
    (train_index, val_index) pair to '<fold>.pkl' in the working directory."""
    dummy_features = np.zeros(labels.shape[0])
    splitter = MultilabelStratifiedKFold(n_splits=k_folds)
    for fold_idx, split_pair in enumerate(splitter.split(dummy_features, labels)):
        with open('%d.pkl' % fold_idx, 'wb') as fh:
            pickle.dump(split_pair, fh)
def multilabel_stratified_K_fold(dataset, n_folds=5, shuffle=False, random_state=42):
    """Return a list of (train_idx, val_idx) multilabel-stratified splits.

    FIX: `random_state` is only forwarded when `shuffle` is True, because
    sklearn-style splitters raise ValueError when a random_state is given
    while shuffle is False — the previous code raised on the default call.

    Args:
        dataset: dataframe with a 'Target' column convertible by target_to_numpy.
        n_folds: number of folds.
        shuffle: whether to shuffle before splitting.
        random_state: seed, effective only when shuffle=True.

    Returns:
        List of (train_indices, val_indices) pairs.
    """
    y = target_to_numpy(dataset['Target'])
    mskf = MultilabelStratifiedKFold(
        n_splits=n_folds,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    return list(mskf.split(X=dataset, y=y))
def _mls_enhanced_full_kfold_dfs():
    """Yield (train_df, val_df) pairs for a seeded 5-fold multilabel-stratified
    split of the enhanced full training dataframe."""
    full_df = get_enhanced_full_train_df()
    labels = multilabel_binary_representation(full_df, sparse=False)
    splitter = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for tr_idx, va_idx in splitter.split(full_df.index.values, labels):
        yield full_df.iloc[tr_idx], full_df.iloc[va_idx]
def train_test_split(self, train_portion, batch_size=32):
    """Split the labels into train and validation generators.

    Uses the first fold of a multilabel-stratified K-fold split whose fold
    count approximates the requested train portion
    (n_splits = int(1 / (1 - train_portion))).

    Args:
        train_portion: fraction of data for training, e.g. 0.8 -> 5 splits.
        batch_size: batch size passed to both generators.

    Returns:
        (train_generator, val_generator) TrainGenerator pair.
    """
    mskf = MultilabelStratifiedKFold(n_splits=int(1 / (1 - train_portion)))
    train_set, val_set = mskf.split(X=self.labels, y=self.labels).__next__()
    train_generator = TrainGenerator(self, train_set, batch_size)
    # FIX: the validation generator was built from train_set, so validation
    # ran on the training indices; it must use val_set.
    val_generator = TrainGenerator(self, val_set, batch_size)
    return train_generator, val_generator
def kfold(
    n_splits: int,
    annotations: Annotations,
) -> t.List[t.Tuple[Annotations, Annotations]]:
    """Split `annotations` into (train, test) annotation lists per fold.

    FIX: shuffle=True is required — sklearn-style splitters raise ValueError
    when a random_state is supplied while shuffle is False.

    Args:
        n_splits: number of folds.
        annotations: sequence of annotation records.

    Returns:
        List of (train_annotations, test_annotations) pairs, one per fold.
    """
    multi_hot = to_multi_hot(annotations, size=3474)
    indecies = range(len(multi_hot))
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    return [([annotations[i] for i in train], [annotations[i] for i in test])
            for train, test in mskf.split(indecies, multi_hot)]
def run_train(seed): ''' Train model and Dump out of fold. ''' # X is already cut `ctl_vehicle` data. train = pd.read_pickle(f'{CACHE_DIR}/train_features.pkl') targets = pd.read_pickle(f'{CACHE_DIR}/train_targets.pkl') features = train.columns.difference(drop_cols).tolist() len_g_features = train.columns.str.contains('^g-').sum() y_train_size = targets.shape[0] if USE_PSEUDO: X_test = pd.read_pickle(f'{EXTEND_DIR}/v04002_test_features.pkl') y_test = pd.read_csv(f'{EXTEND_DIR}/v04002_test_targets.csv') y_test.iloc[:, 1:] = (y_test.iloc[:, 1:] > 0.5).astype(int) is_ctl_vehicle = (X_test['cp_type'] == 'ctl_vehicle') X_test = X_test[~is_ctl_vehicle] y_test = y_test[~is_ctl_vehicle] train = pd.concat([train, X_test], axis=0).reset_index(drop=True) targets = pd.concat([targets, y_test], axis=0).reset_index(drop=True) X, y = train[features], targets.iloc[:, 1:] oof = np.zeros(y.shape) from iterstrat.ml_stratifiers import MultilabelStratifiedKFold mskf = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=seed) for n_fold, (train_idx, valid_idx) in enumerate(mskf.split(X, y)): logger.info('>' * 5 + f" Fold {n_fold+1} / {NUM_FOLD}") logger.info( f"train_size: {len(train_idx)} valid_size: {len(valid_idx)}") # Split train and valid. X_train, y_train = X.iloc[train_idx], y.iloc[train_idx] X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx] # Fit model and cv pred. with timer(f'fold {n_fold+1} train time'): num_epoch = 50 if os.path.exists('/kaggle/input') else 5 modelpath = f"{CACHE_DIR}/{VERSION}_fold{n_fold}_seed{seed}.pth" model = MoaModel( CustomNet1D(X.shape[1], len_g_features, y.shape[1]).to(device)) oof[valid_idx] = model.fit(X_train, y_train, X_valid, y_valid, modelpath, num_epoch=num_epoch) oof = post_processing(train, oof) oof = oof[:y_train_size] y_np = y.iloc[:y_train_size].to_numpy() # Evaluation only original train targets, I mean exclude pesudo label. logger.info(f'\nMetric of oof: { moa_loss(oof, y_np) }\n') # Export oof dataframe. 
np.save(f'{CACHE_DIR}/oof_seed{seed}.npy', oof)