def iterative_train_test_split(X: pd.Series, y: np.ndarray, train_size: float = 0.7) -> Tuple:
    """Custom iterative train test split which 'maintains balanced representation
    with respect to order-th label combinations.'

    Args:
        X (pd.Series): Input features as a pandas Series object.
        y (np.ndarray): One-hot encoded labels.
        train_size (float, optional): Proportion of data for first split. Defaults to 0.7.

    Returns:
        Two stratified splits based on specified proportions, as
        (X_train, X_test, y_train, y_test).
    """
    # Two folds whose sizes follow sample_distribution_per_fold; the first
    # index array yielded by split() is used as the train side below.
    stratifier = IterativeStratification(
        n_splits=2,
        order=1,
        sample_distribution_per_fold=[
            1.0 - train_size,
            train_size,
        ],
    )
    train_indices, test_indices = next(stratifier.split(X, y))
    # NOTE(review): indexing a pd.Series with integer arrays is positional only
    # when the Series has a default RangeIndex — confirm upstream.
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    return X_train, X_test, y_train, y_test
def portion_split(data, portion, seed=1337, labels=None, label_info=None):
    """Perform a k% split to train-validation instances.

    Args:
        data: Input samples.
        portion: Fraction of samples assigned to the validation side.
        seed: Random state for the shuffle-based splitters.
        labels: Optional label assignments; enables stratification.
        label_info: Object exposing `multilabel` and `label_names`;
            required when `labels` is given.

    Returns:
        A list with a single (train_indices, validation_indices) pair.
    """
    msg = f"Portion-splitting with input data: {len(data)} samples on a {portion} validation portion"
    if labels is None:
        # No labels: plain random shuffle split.
        info(msg)
        return list(
            ShuffleSplit(n_splits=1,
                         test_size=portion,
                         random_state=seed).split(data))
    else:
        multilabel = label_info.multilabel
        num_labels = len(label_info.label_names)
        if multilabel:
            # Multi-label case: iterative stratification over one-hot labels.
            # X is a dummy zero vector; only the labels drive the split.
            stratifier = IterativeStratification(
                n_splits=2,
                order=2,
                sample_distribution_per_fold=[portion, 1.0 - portion])
            labels = one_hot(labels, num_labels, True)
            info(msg + " using iterative stratification.")
            train_indexes, test_indexes = next(
                stratifier.split(np.zeros(len(data)), labels))
            return [(train_indexes, test_indexes)]
        else:
            try:
                info(msg + " using stratification.")
                return list(
                    StratifiedShuffleSplit(n_splits=1,
                                           test_size=portion,
                                           random_state=seed).split(
                                               data, labels))
            except ValueError as ve:
                # NOTE(review): if error() does not raise/exit, this path
                # implicitly returns None — confirm callers handle that.
                error(f"Unable to complete a stratified split: {ve}")
def multilabel_pipeline_cross_val(pipeline, X, y, labels=None, n_splits=3, verbose=0):
    """Multi-label pipeline cross-validation

    Parameters
    ----------
    pipeline : `sklearn.pipeline.Pipeline` or custom pipeline
        Must have .fit and .predict methods
    X : array-like
    y : array-like (n_samples x n_labels)
    labels : array-like
        Label names (numerical if Default = None)
    n_splits : int
        Number of cross-validation splits (Default = 3)
    verbose : int
        0 = silent, 1 = per-fold summaries, >1 = full per-fold reports

    Returns
    -------
    mlc : `multilabel.MultiLabelClassification`
        Multi-label classification results built from the pooled
        out-of-fold predictions.
    """
    kfold = IterativeStratification(n_splits=n_splits, order=1, random_state=None)
    # Out-of-fold predictions accumulated across all folds.
    pred = np.zeros_like(y, dtype=float)
    # Per-label decision thresholds estimated on each validation fold.
    thresh_folds = np.zeros((y.shape[1], n_splits))
    for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
        if verbose > 0:
            print(f"\n--------\nFold {i+1}/{kfold.n_splits}")
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]
        pipeline.fit(X_train, y_train, labels=labels, verbose=verbose)
        valid_pred = pipeline.predict(X_valid)
        pred[valid_idx] = valid_pred
        mlc_valid = MultiLabelClassification(y_valid, valid_pred, labels=labels)
        # Threshold search via geometric mean on the held-out fold.
        thresh_folds[:, i] = mlc_valid.best_thresholds('gmean')
        if verbose > 0:
            mlc_valid.print_report(full=(verbose > 1))
    # Average the per-fold thresholds for the final pooled evaluation.
    threshold = thresh_folds.mean(axis=1)
    mlc = MultiLabelClassification(y, pred=pred, labels=labels,
                                   threshold=threshold)
    if verbose > 0:
        print("\n------------------------\nCross-validation results")
        mlc.print_report(full=True)  #(verbose > 1))
    return mlc
def get_test_train_split(xml_files, classes, classes_dict, test_size, test_train_split):
    """Split a list of annotation XML files into train and test sets.

    Args:
        xml_files: Paths to per-image annotation XML files.
        classes: Iterable of class names (only its length is used here).
        classes_dict: Mapping from class name to column index in the
            binary label matrix.
        test_size: Fraction of files to place in the test split.
        test_train_split: Either 'stratified' (multi-label iterative
            stratification on the annotated classes) or 'sequential'
            (ordered by the integer embedded in each file name).

    Returns:
        (train_xml_files, test_xml_files) as lists of file paths.

    Raises:
        ValueError: If `test_train_split` is not a recognized mode.
            (Previously an unknown mode silently returned None.)
    """
    if test_train_split == 'stratified':
        # Build a binary (n_files x n_classes) label matrix from the XMLs.
        label_array = np.zeros((len(xml_files), len(classes)))
        for i, file in enumerate(xml_files):
            labels, _, _ = read_content(file)
            sparse_labels = [classes_dict[label] for label in labels]
            label_array[i, sparse_labels] = 1
        # n_splits folds of roughly test_size each; first fold = test set.
        kf = IterativeStratification(n_splits=int(1 / test_size))
        train, test = next(kf.split(xml_files, label_array))
        files_arr = np.array(xml_files)
        return files_arr[train].tolist(), files_arr[test].tolist()
    elif test_train_split == 'sequential':
        # Sort by the integer embedded in the file name, then cut once.
        xml_files = sorted(xml_files, key=lambda x: int(''.join(
            filter(str.isdigit, os.path.basename(x)))))
        split = int(len(xml_files) * (1 - test_size))
        return xml_files[:split], xml_files[split:]
    else:
        raise ValueError(f"Unknown test_train_split mode: {test_train_split!r}")
def split_data(X, Y, n_splits=5, output_dir="splits"):
    """Write stratified k-fold train/test splits of (X, Y) to disk.

    Each fold directory receives the train/test SMILES strings plus the
    corresponding sparse label matrices.
    """
    stratifier = IterativeStratification(
        n_splits=n_splits,
        order=1,
        random_state=0,
    )
    for fold_no, (idx_train, idx_test) in enumerate(stratifier.split(X, Y)):
        print("processing fold", fold_no + 1, "/", n_splits)
        X_tr, Y_tr = X[idx_train], Y[idx_train]
        X_te, Y_te = X[idx_test], Y[idx_test]
        # Sanity checks: no row may be all-positive or all-negative.
        for label_block in (Y_tr, Y_te):
            assert not label_block.all(axis=-1).any()
            assert not (1 - label_block).all(axis=-1).any()
        fold_dir = os.path.join(output_dir, "split_{}".format(fold_no))
        os.makedirs(fold_dir, exist_ok=True)
        write_smiles(X_tr, os.path.join(fold_dir, "train.smi"))
        write_smiles(X_te, os.path.join(fold_dir, "test.smi"))
        sp.save_npz(os.path.join(fold_dir, "train.npz"), sp.csr_matrix(Y_tr))
        sp.save_npz(os.path.join(fold_dir, "test.npz"), sp.csr_matrix(Y_te))
def iterative_train_test_split(X, y, test_size, order=2, random_state=None):
    """Iteratively stratified train/test split.

    Parameters
    ----------
    test_size : float, [0,1]
        proportion of the dataset to include in the test split; the
        remainder goes to the train set
    order : int
        label-combination order used by the stratifier
    random_state : int or None
        seed forwarded to the stratifier

    Returns
    -------
    X_train, y_train, X_test, y_test
        stratified division into train/test split
    """
    splitter = IterativeStratification(
        n_splits=2,
        order=order,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
        random_state=random_state,
    )
    idx_train, idx_test = next(splitter.split(X, y))
    return (
        X[idx_train, :],
        y[idx_train, :],
        X[idx_test, :],
        y[idx_test, :],
    )
def _multilabel_stratified_kfold_dfs():
    """Yield (train_df, val_df) DataFrame pairs for each multilabel-stratified fold.

    Stratifies the training data on its binary multilabel representation,
    using the stratifier's default number of splits (3, per the inline note).
    """
    df = get_train_df()
    label_mat = multilabel_binary_representation(df, sparse=True)
    # NOTE(review): whether random_state has any effect (or is even accepted)
    # with a non-shuffling stratifier depends on the installed version — confirm.
    kf = IterativeStratification(random_state=1234)  # k=3
    for train_index, val_index in kf.split(df.index.values, label_mat):
        fold_train_df = df.iloc[train_index]
        fold_val_df = df.iloc[val_index]
        yield fold_train_df, fold_val_df
def stratify_train_test(y_label, n_splits=10, seed=42):
    """Yield (train_nodes, test_nodes) per fold, stratified on multi-labels.

    Args:
        y_label: pandas object mapping node index -> label collection.
        n_splits: Number of stratified folds.
        seed: Forwarded to the stratifier as random_state.

    Yields:
        Lists of node identifiers for the train and test side of each fold.
    """
    # Binarize the label collections into an indicator matrix.
    y_label_bin = MultiLabelBinarizer().fit_transform(y_label)
    k_fold = IterativeStratification(n_splits=n_splits, order=1,
                                     random_state=seed)
    for train, test in k_fold.split(y_label.index.to_list(),
                                    sps.lil_matrix(y_label_bin)):
        print("train", len(train), "test", len(test))
        # Map positional fold indices back to node identifiers.
        train_nodes = list(y_label.index[train])
        test_nodes = list(y_label.index[test])
        yield train_nodes, test_nodes
def iterative_train_test_split(X, y, test_size):
    """Split X into train/test parts via order-2 iterative stratification on y.

    Note: only the feature splits are returned; the label partition is
    used for stratification but not returned.
    """
    splitter = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
    )
    idx_train, idx_test = next(splitter.split(X, y))
    return X[idx_train], X[idx_test]
def gen_folds(df, img_mat, target_mat, n_folds):
    'Return dataframe with folds column'
    k_fold = IterativeStratification(n_splits=n_folds, order=1)
    splits = k_fold.split(img_mat, target_mat)
    df['fold'] = 0  # Generate folds column
    # Grab fold number and img indexes from splits, adjust fold column accordingly
    for fold, (_, fold_idxs) in enumerate(splits):
        # Rows whose cell_id appears among this fold's images get this fold id.
        valid_imgs = img_mat[fold_idxs]
        df.loc[df['cell_id'].isin(valid_imgs.reshape(-1)), 'fold'] = fold
    return df
def kfold(train_df, targets_df):
    """Assign a stratified fold number to each row of train_df.

    Shuffles train_df, then uses order-1 iterative stratification on the
    multi-label targets (all columns of targets_df except 'sig_id') to fill
    a 'kfold' column with values 0..config.KFOLD_NUMBER-1.
    """
    train_df['kfold'] = -1
    # Shuffle rows and reset the index so fold indices align positionally.
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    k_fold = IterativeStratification(n_splits=config.KFOLD_NUMBER, order=1)
    # NOTE(review): targets_df is not shuffled alongside train_df — confirm
    # the two frames are intended to stay row-aligned here.
    for f, (t_, v_) in enumerate(
            k_fold.split(X=train_df, y=targets_df.drop('sig_id', axis=1))):
        train_df.loc[v_, 'kfold'] = f
    return train_df
def objective(params):  # objective fn to be minimized
    """Hyperopt-style objective: K-fold CV F1-micro for one parameter set.

    Reads and mutates several module-level globals (paths, K, config_path,
    trials), writes temporary pickle files for each fold, and returns
    {"loss": 1 - mean(f1_micro), "status": STATUS_OK}.
    """
    global train_path, test_path, label_to_idx_path, K, config_path, trials
    # get stratisfied split
    df = docs_to_sheet(train_path, "tmp.csv", label_to_idx_path)
    df.drop(columns=["text"], inplace=True)
    df.reset_index(inplace=True)
    # hacky way to make use of SkMultiLearn
    X = df.index
    y = df[[col for col in df.columns if col != "index"]].values
    del df
    k_fold = IterativeStratification(n_splits=K, order=1)
    # get docs
    with open(train_path, "rb") as f:
        docs = pickle.load(f)
    scores = []
    tmp_tr_path = "temp_train.pkl"
    tmp_dev_path = "temp_dev.pkl"
    # train_eval() picks the fold data up from these paths via the config.
    params["train_path"] = tmp_tr_path
    params["dev_path"] = tmp_dev_path
    params["test_path"] = test_path
    set_params(params, config_path)
    for train_idx, dev_idx in k_fold.split(X, y):
        # get split
        train_docs = [docs[i] for i in train_idx]
        dev_docs = [docs[i] for i in dev_idx]
        # save docs in temp location and free memory
        with open(tmp_tr_path, "wb") as f:
            pickle.dump(train_docs, f)
        with open(tmp_dev_path, "wb") as f:
            pickle.dump(dev_docs, f)
        del train_docs, dev_docs
        gc.collect()
        # call main
        r_k, p_k, rp_k, ndcg_k, avg_loss, hamming, emr, f1_micro, f1_macro = train_eval(
            False)
        scores.append(f1_micro)
        # save trials object for safety
        with open("trials_tmp.pkl", "wb") as f:
            pickle.dump(trials, f)
    return {"loss": 1 - np.mean(scores), "status": STATUS_OK}
def binary_split(X, Y, split, order=2):
    """Split (X, Y) into two stratified subsets with proportions `split`.

    Args:
        X: Feature matrix (indexable as X[idx, :]).
        Y: Label matrix (indexable as Y[idx, :]).
        split: Two-element array of relative fold sizes; normalized to sum 1.
            NOTE(review): `split / split.sum()` assumes a numpy array —
            a plain list would fail; confirm callers.
        order: Label-combination order for the stratifier.

    Returns:
        (set1, set2), each an (X_subset, Y_subset) tuple, reordered if
        needed so set sizes track the requested proportions.
    """
    split = np.array(split / split.sum())
    strat = IterativeStratification(
        order=order,
        n_splits=len(split),
        sample_distribution_per_fold=split.tolist())
    idx1, idx2 = next(strat.split(X, Y))
    ## switch if out of order with split...
    if np.sign(split[0] - split[1]) != np.sign(len(idx1) - len(idx2)):
        idx1, idx2 = idx2, idx1
    set1 = X[idx1, :], Y[idx1, :]
    set2 = X[idx2, :], Y[idx2, :]
    return set1, set2
def main():
    """Generate 8 stratified K-fold index files for the training set.

    Saves each fold's train/test index arrays to DATA_DIR/KFold_<i>
    (np.savez appends the .npz extension automatically).
    """
    img, labels = load_data(dataset="train")
    stratifier = IterativeStratification(n_splits=8, random_state=1769)
    for i, (train_indexes, test_indexes) in enumerate(stratifier.split(X=img, y=labels)):
        print(train_indexes)
        print(test_indexes)
        split_filename = os.path.join(DATA_DIR, "KFold_{}".format(i))
        np.savez(file=split_filename,
                 train_indexes=train_indexes,
                 test_indexes=test_indexes)
def _iterative_train_test_split(self, X, y, test_size=0.25):
    """Splits X and y into train and test sets using stratification.

    Args:
        X: The samples as :class:`list`.
        y: The one hot encoded labels for as :class:`list`.
        test_size: Proportion of samples assigned to the test split.
            Defaults to 0.25, matching the previously hard-coded
            [0.25, 0.75] fold distribution, so existing callers are
            unaffected.

    Returns:
        The trainings data X_train, y_train and testing data X_test,
        y_test as :class:`list`.
    """
    stratifier = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size])
    train_indexes, test_indexes = next(stratifier.split(X, y))
    return (X[train_indexes], y[train_indexes, :],
            X[test_indexes], y[test_indexes, :])
def get_split(test_size=0.2):
    """Return (train_set, val_set) lists of ids, stratified on the targets."""
    data = pd.read_csv(LABELS)
    data = data.apply(convert_targets, axis=1)
    splitter = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size])
    names = list(data['Id'])
    # Stack the per-row target vectors into a 2-D array.
    targets = np.array(list(data['Target'].values))
    # Only the first yielded split is needed.
    train_idx, val_idx = next(splitter.split(names, targets))
    train_set = [names[i] for i in train_idx]
    val_set = [names[i] for i in val_idx]
    return train_set, val_set
def test_if_positive_evidence_does_not_include_negative_evidence(self):
    """After the positive pass, only the all-negative row remains unassigned."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    stratifier._distribute_positive_evidence(rows_used, folds, samples_with_combination, per_row_combinations)
    # Row 0 has no positive labels, so the positive-evidence pass skips it.
    self.assertFalse(rows_used[0])
    self.assertTrue(rows_used[1])
    self.assertTrue(rows_used[2])
    self.assertTrue(rows_used[3])
    # All per-combination demand should be fully satisfied (zero remaining).
    for combination, samples in stratifier.desired_samples_per_combination_per_fold.items():
        for desire in samples:
            self.assertEqual(desire, 0)
def evaluate_kfold_label_classification(embedding, labels, k=10):
    """k-fold node-classification evaluation of an embedding.

    Single-column labels -> StratifiedKFold with logistic regression;
    multi-column labels -> iterative stratification with a one-vs-rest
    wrapper.

    Returns:
        (mean F1-micro, mean F1-macro) over the k folds.
    """
    assert len(labels.shape) == 2
    model = LogisticRegressionCV(n_jobs=-1)
    #model=SVC(gamma='auto')
    if labels.shape[1] == 1:
        print("single label clasification")
        labels = labels.flatten()
        sss = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
    else:
        print("multi-label classification")
        # NOTE(review): whether random_state has any effect here depends on
        # the installed stratifier version — confirm.
        sss = IterativeStratification(n_splits=k, random_state=0, order=2)
        model = OneVsRestClassifier(model)
    f1_micros = []
    f1_macros = []
    i = 1
    for split_train, split_test in sss.split(embedding, labels):
        model.fit(embedding[split_train], labels[split_train])
        predictions = model.predict(embedding[split_test])
        f1_micro = f1_score(labels[split_test], predictions, average="micro")
        f1_macro = f1_score(labels[split_test], predictions, average="macro")
        f1_micros.append(f1_micro)
        f1_macros.append(f1_macro)
        print("Done {}/{} folds".format(i, k))
        i += 1
    return np.mean(f1_micros), np.mean(f1_macros)
def stratified_fold_split_for_rare(
        rare_samples,
        n_splits=5,
        interaction_order=1,
        random_state=42,
        least_representative_cols=("question_type_spelling", ),
):
    """Stratified fold split for rare samples, keyed on rare target columns.

    Args:
        rare_samples: DataFrame containing the rare rows.
        n_splits: Number of folds.
        interaction_order: Label-combination order for the stratifier.
        random_state: Forwarded to the stratifier.
        least_representative_cols: Target columns whose ordinal encoding
            drives the stratification.

    Returns:
        List of index arrays (one per fold) into rare_samples.
    """
    rare_ordinals = transform_target_columns_to_ordinals(
        rare_samples[list(least_representative_cols)])
    k_fold = IterativeStratification(n_splits=n_splits,
                                     order=interaction_order,
                                     random_state=random_state)
    folds_rare = []
    # The ordinals serve as both X and y; only each fold's (second) index
    # array is kept, mapped back to the DataFrame's own index values.
    for _, fold_ids in k_fold.split(rare_ordinals, rare_ordinals):
        folds_rare.append(rare_samples.index.values[fold_ids])
    return folds_rare
def test_if_variables_are_initialized_correctly(self):
    """_prepare_stratification sets up counts, folds, and per-combination demand."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    # Basic shape bookkeeping.
    self.assertEqual(stratifier.n_samples, 4)
    self.assertEqual(stratifier.n_labels, 2)
    self.assertEqual(len(rows), 4)
    self.assertEqual(len(rows_used), 4)
    self.assertEqual(len(stratifier.percentage_per_fold), 2)
    self.assertEqual(len(stratifier.desired_samples_per_fold), 2)
    self.assertEqual(len(folds), 2)
    # No row is assigned to any fold yet.
    self.assertTrue(not any(rows_used.values()))
    self.assertFalse(any(rows_used.values()))
    self.assertEqual(stratifier.order, 1)
    # Two equal folds: 50% each, i.e. 2 desired samples per fold.
    for d in stratifier.percentage_per_fold:
        self.assertEqual(d, 1 / 2.0)
    for d in stratifier.desired_samples_per_fold:
        self.assertEqual(d, y.shape[0] / 2.0)
    # Order-1 combinations: one per label. Row 0 has none, row 3 has both.
    self.assertEqual(len(all_combinations), 2)
    self.assertEqual(len(per_row_combinations[0]), 0)
    self.assertEqual(len(per_row_combinations[1]), 1)
    self.assertEqual(len(per_row_combinations[2]), 1)
    self.assertEqual(len(per_row_combinations[3]), 2)
    self.assertEqual(len(samples_with_combination), 2)
    self.assertEqual(
        len(stratifier.desired_samples_per_combination_per_fold), 2)
    for combination, samples in samples_with_combination.items():
        self.assertEqual(len(set(combination)), 1)
        self.assertEqual(len(samples), 2)
    # Each label combination should want exactly one sample in each fold.
    for combination, desirability in stratifier.desired_samples_per_combination_per_fold.items(
    ):
        self.assertEqual(len(set(combination)), 1)
        self.assertEqual(len(desirability), 2)
        for desire in desirability:
            self.assertEqual(desire, 1.0)
def load_datasets(path, drop_missing=True, n_tags=72, test_size=0.2,
                  random_state=42):
    """Load and split dataset from raw CiP data.

    Args:
        path: Path to raw CiP dataset
        drop_missing: Drop events with no description or title
        n_tags: Number of top tags to keep (passed to calculate_top_tags)
        test_size: Fraction of events to include in test set
        random_state: Random state for the split

    Returns:
        (events_train, tags_train, events_test, tags_test, top_tags,
         tags_train_stats)
    """
    events_df, tags_df = load_raw_normalized_dataset(path,
                                                     drop_missing=drop_missing)
    top_tags = calculate_top_tags(tags_df, n_tags=n_tags)
    # Only keep top tags
    tags_df = tags_df[tags_df['tag'].isin(top_tags)]
    tag_matrix = tags_to_matrix(events_df, tags_df, top_tags)
    # Split data into public training set and private test set
    stratifier = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
        random_state=random_state)
    train_indices, test_indices = next(stratifier.split(events_df, tag_matrix))
    events_train, tags_train = events_df.iloc[train_indices], \
        tag_matrix[train_indices, :]
    events_test, tags_test = events_df.iloc[test_indices], \
        tag_matrix[test_indices, :]
    # Per-tag positive counts in the training split, most frequent first.
    tags_train_stats = pd.DataFrame({
        'tag': top_tags,
        'count': tags_train.sum(axis=0)
    }).sort_values('count', ascending=False)
    return (events_train, tags_train, events_test, tags_test, top_tags,
            tags_train_stats)
def test_if_positive_evidence_does_not_include_negative_evidence(self):
    """After the positive pass, only the all-negative row remains unassigned."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    stratifier._distribute_positive_evidence(rows_used, folds,
                                             samples_with_combination,
                                             per_row_combinations)
    # Row 0 has no positive labels, so the positive-evidence pass skips it.
    self.assertFalse(rows_used[0])
    self.assertTrue(rows_used[1])
    self.assertTrue(rows_used[2])
    self.assertTrue(rows_used[3])
    # All per-combination demand should be fully satisfied (zero remaining).
    for combination, samples in stratifier.desired_samples_per_combination_per_fold.items(
    ):
        for desire in samples:
            self.assertEqual(desire, 0)
def run(self, labels_csv, models_dir):
    """Train one model per stratified CV fold.

    Args:
        labels_csv: CSV whose rows carry sample metadata in the first four
            columns and binary labels from column 4 onward.
        models_dir: Root directory for the per-fold model outputs.
    """
    dataset_df = pd.read_csv(labels_csv)
    dataset = dataset_df.values.tolist()
    splits = []
    # Label matrix: everything after the first four metadata columns.
    labels = np.array([d[4:] for d in dataset])
    k_fold = IterativeStratification(
        n_splits=self.cv, order=1, random_state=325
    )
    for i, (trainidx, valididx) in enumerate(k_fold.split(dataset, labels)):
        trainset = [dataset[k] for k in trainidx]
        validset = [dataset[k] for k in valididx]
        splits.append([trainset, validset])
        # Fold directories are numbered from 1.
        model_dir = self.__model_dir(models_dir, i + 1)
        self.__train(trainset, validset, model_dir)
    return
def test_if_variables_are_initialized_correctly(self):
    """_prepare_stratification sets up counts, folds, and per-combination demand."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    # Basic shape bookkeeping.
    self.assertEqual(stratifier.n_samples, 4)
    self.assertEqual(stratifier.n_labels, 2)
    self.assertEqual(len(rows), 4)
    self.assertEqual(len(rows_used), 4)
    self.assertEqual(len(stratifier.percentage_per_fold), 2)
    self.assertEqual(len(stratifier.desired_samples_per_fold), 2)
    self.assertEqual(len(folds), 2)
    # No row is assigned to any fold yet.
    self.assertTrue(not any(rows_used.values()))
    self.assertFalse(any(rows_used.values()))
    self.assertEqual(stratifier.order, 1)
    # Two equal folds: 50% each, i.e. 2 desired samples per fold.
    for d in stratifier.percentage_per_fold:
        self.assertEqual(d, 1 / 2.0)
    for d in stratifier.desired_samples_per_fold:
        self.assertEqual(d, y.shape[0] / 2.0)
    # Order-1 combinations: one per label. Row 0 has none, row 3 has both.
    self.assertEqual(len(all_combinations), 2)
    self.assertEqual(len(per_row_combinations[0]), 0)
    self.assertEqual(len(per_row_combinations[1]), 1)
    self.assertEqual(len(per_row_combinations[2]), 1)
    self.assertEqual(len(per_row_combinations[3]), 2)
    self.assertEqual(len(samples_with_combination), 2)
    self.assertEqual(len(stratifier.desired_samples_per_combination_per_fold), 2)
    for combination, samples in samples_with_combination.items():
        self.assertEqual(len(set(combination)), 1)
        self.assertEqual(len(samples), 2)
    # Each label combination should want exactly one sample in each fold.
    for combination, desirability in stratifier.desired_samples_per_combination_per_fold.items():
        self.assertEqual(len(set(combination)), 1)
        self.assertEqual(len(desirability), 2)
        for desire in desirability:
            self.assertEqual(desire, 1.0)
def run(self, labels2_csv, labels1_csv, models_dir):
    """Train per-fold models on dataset2 folds augmented with all of dataset1.

    Args:
        labels2_csv: CSV whose labels start at column 5; split into CV folds.
        labels1_csv: CSV whose rows are appended in full to every train fold.
        models_dir: Root directory for the per-fold model outputs.
    """
    dataset2_df = pd.read_csv(labels2_csv)
    dataset2 = dataset2_df.values.tolist()
    dataset1_df = pd.read_csv(labels1_csv)
    dataset1 = dataset1_df.values.tolist()
    # NOTE(review): labels start at column 5 here but at column 4 in the
    # sibling run() — confirm the two CSV schemas genuinely differ.
    labels2 = np.array([d[5:] for d in dataset2])
    k_fold = IterativeStratification(n_splits=self.cv, order=1,
                                     random_state=325)
    for i, (trainidx, valididx) in enumerate(k_fold.split(dataset2, labels2)):
        # Each fold trains on its dataset2 partition plus all of dataset1.
        trainset = [dataset2[k] for k in trainidx] + dataset1
        validset = [dataset2[k] for k in valididx]
        model_dir = self.__model_dir(models_dir, i + 1)
        self.__train(trainset, validset, model_dir)
    return
def stratified_fold_split_for_common(
        common_samples,
        n_splits=5,
        interaction_order=1,
        random_state=42,
        agg_func=pd.Series.mode,
):
    """Group-aware stratified fold split for common samples.

    Rows sharing the same question body form a group; each group's target
    ordinals are aggregated (default: mode), the groups are stratified into
    folds, and each group's row indices are expanded back out.

    Returns:
        List of row-index arrays (one per fold) into common_samples.
    """
    body_encoder = LabelEncoder()
    # Encode identical question bodies into a shared integer group id.
    common_samples["group_id"] = body_encoder.fit_transform(
        common_samples["question_body"].astype(str))
    common_ordinals = transform_target_columns_to_ordinals(common_samples)
    common_ordinals["group_id"] = common_samples["group_id"]
    common_groups = common_ordinals.groupby(["group_id"])
    # Aggregate each group's ordinals in parallel worker processes.
    with Pool(cpu_count()) as pool:
        aggregated_common_ordinals = list(
            tqdm(
                pool.imap(
                    functools.partial(aggregate_ordinals, agg_func=agg_func),
                    common_groups,
                ),
                total=len(common_groups),
                desc="Aggregate ordinals over groups",
            ))
    aggregated_common_ordinals = pd.concat(aggregated_common_ordinals,
                                           axis=1).transpose()
    aggregated_common_ordinals.index.rename("group_id", inplace=True)
    k_fold = IterativeStratification(n_splits=n_splits,
                                     order=interaction_order,
                                     random_state=random_state)
    folds_common = []
    # Stratify at group level, then expand group ids back to row ids.
    # NOTE(review): this equates the fold's positional indexes with group_id
    # values — holds while LabelEncoder yields contiguous 0..n-1 ids; confirm.
    for _, fold_groups in k_fold.split(aggregated_common_ordinals,
                                       aggregated_common_ordinals):
        fold_mask = common_ordinals["group_id"].isin(fold_groups)
        fold_ids = common_ordinals.index.values[fold_mask]
        folds_common.append(fold_ids)
    return folds_common
def split_stratified(self, dataset):
    """Split dataset into train/dev/test with two stratified cuts.

    The configured percentage is first carved off as the test set, then
    renormalized against the remainder so the dev set ends up the same
    fraction of the original dataset.

    Returns:
        (train, dev, test) as plain lists of samples.
    """
    Y = np.array([sample["Y"] for sample in dataset])
    dataset = np.array(dataset)
    percentage = self.args["split"]["percentage"]
    stratifier = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[percentage, 1.0 - percentage])
    remaining_idx, test_idx = next(stratifier.split(dataset, Y))
    X_test = dataset[test_idx]
    dataset = dataset[remaining_idx]
    Y = Y[remaining_idx]
    # e.g. 0.2 of the whole equals 0.2/0.8 = 0.25 of the remaining 80%.
    percentage = percentage / (1.0 - percentage)
    stratifier = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[percentage, 1.0 - percentage])
    train_idx, dev_idx = next(stratifier.split(dataset, Y))
    X_train = dataset[train_idx]
    X_dev = dataset[dev_idx]
    return list(X_train), list(X_dev), list(X_test)
def split(dataset_path, test_size, stratification):
    """Split the train CSV into train/valid sets with a chosen strategy.

    Args:
        dataset_path: Directory containing the train CSV.
        test_size: Fraction of rows assigned to the validation set.
        stratification: One of 'sklearn', 'sklearn_stratified',
            'iterstrat', or 'skmultilearn'.

    Returns:
        (train_set, valid_set) DataFrames.

    Raises:
        ValueError: For an unrecognized stratification mode.
    """
    df = get_csv(dataset_path, name="train")
    img_ids = df["image_id"]
    if stratification == "sklearn":
        # Plain random split, no stratification.
        train_set, valid_set = train_test_split(df[KEYS],
                                                test_size=test_size,
                                                random_state=SEED,
                                                shuffle=True)
    elif stratification == "sklearn_stratified":
        df['subset'] = np.nan
        splitter = StratifiedShuffleSplit(n_splits=1,
                                          test_size=test_size,
                                          random_state=SEED)
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
        # Persist the subset assignment next to the original CSV.
        df.iloc[train_indcs, -1] = 'train'
        df.iloc[valid_indcs, -1] = 'valid'
        df.to_csv(os.path.join(dataset_path, 'train_stratified.csv'),
                  index=None)
    elif stratification == "iterstrat":
        splitter = MultilabelStratifiedShuffleSplit(n_splits=1,
                                                    test_size=test_size,
                                                    random_state=SEED)
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
    elif stratification == "skmultilearn":
        splitter = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[test_size, 1.0 - test_size])
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
    else:
        raise ValueError("Try something else :)")
    return train_set, valid_set
def crossfold(n_rounds, n_splits, classifier, x, y):
    """Repeated stratified cross-validation for multi-label data.

    Args:
        n_rounds: Number of independent CV repetitions.
        n_splits: Number of folds per round. (Previously ignored — the
            stratifier hard-coded 5 folds; it is now honored.)
        classifier: Identifier passed through to ``get_model``.
        x: Feature DataFrame.
        y: Multi-label target DataFrame.

    Returns:
        List of per-fold results from ``performance_evaluation``.
    """
    perf = []
    x_columns = x.columns
    y_columns = y.columns
    # Convert once up front; the original re-converted inside the fold loop.
    x = np.array(x)
    y = np.array(y)
    for i in range(n_rounds):
        print("Round: ", i)
        folds = IterativeStratification(n_splits=n_splits, order=1)
        for train_index, test_index in folds.split(x, y):
            # Rebuild DataFrames so downstream code sees the original columns.
            x_train = pd.DataFrame(x[train_index], columns=x_columns)
            x_test = pd.DataFrame(x[test_index], columns=x_columns)
            y_train = pd.DataFrame(y[train_index], columns=y_columns)
            y_test = pd.DataFrame(y[test_index], columns=y_columns)
            if standardize == 1:
                # NOTE(review): `standardize` is read from enclosing scope —
                # confirm it is defined wherever this function is imported.
                scaler = StandardScaler()
                scaler.fit(x_train)
                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)
            print("Modelling")
            model = get_model(classifier, x_train, y_train)
            print("Prediction")
            predictions = get_predictions(model, x_test)
            fold_perf = performance_evaluation(y_test, predictions)
            perf.append(fold_perf)
    return perf
def kfold_split(data, num_folds, seed, labels=None, label_info=None):
    """Do K-fold cross-validation

    Args:
        data: Input samples.
        num_folds: Number of folds.
        seed: Random state for the shuffling splitters.
        labels: Optional labels; enables (iterative) stratification.
        label_info: Object exposing `multilabel` and `label_names`;
            required when `labels` is given.

    Returns:
        List of (train_indices, test_indices) pairs, one per fold.
    """
    num_data = len(data)
    msg = f"Splitting {num_data} input data to {num_folds} folds"
    if labels is None:
        info(msg)
        return list(
            KFold(num_folds, shuffle=True, random_state=seed).split(data))
    else:
        multilabel = label_info.multilabel
        num_labels = len(label_info.label_names)
        if multilabel:
            info(msg + " using iterative stratification.")
            # NOTE(review): `seed` is not forwarded to the iterative
            # stratifier — confirm determinism requirements.
            splitter = IterativeStratification(num_folds, order=1)
            oh_labels = one_hot(labels, num_labels, is_multilabel=True)
            # X is a dummy zero vector; only the labels drive the split.
            return list(splitter.split(np.zeros(len(labels)), oh_labels))
        else:
            try:
                info(msg + " using stratification.")
                return list(
                    StratifiedKFold(num_folds,
                                    shuffle=True,
                                    random_state=seed).split(data, labels))
            except ValueError as ve:
                # NOTE(review): if error() does not raise/exit, this path
                # implicitly returns None — confirm callers handle that.
                error(f"Unable to complete a stratified fold split: {ve}")
def test_if_negative_evidence_is_distributed(self):
    """Rows with no positive labels are assigned by the negative-evidence pass."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    stratifier._distribute_positive_evidence(rows_used, folds, samples_with_combination, per_row_combinations)
    # Row 0 (all-negative) is untouched by the positive pass...
    self.assertFalse(rows_used[0])
    stratifier._distribute_negative_evidence(rows_used, folds)
    # ...and picked up by the negative pass.
    self.assertTrue(rows_used[0])
def evaluate_kfold_label_classification( embedding, labels, k=10): assert len(labels.shape) == 2 # model = LogisticRegressionCV( # max_iter=1000, # n_jobs=-1) model = SVC(probability=True) if labels.shape[1] == 1: print ("single label clasification") labels = labels.flatten() sss = StratifiedKFold(n_splits=k, shuffle=True, random_state=0) else: print ("multi-label classification") sss = IterativeStratification(n_splits=k, order=1) model = OneVsRestClassifier(model, ) k_fold_rocs = np.zeros(k) k_fold_f1s = np.zeros(k) k_fold_precisions = np.zeros(k) k_fold_recalls = np.zeros(k) for i, (split_train, split_test) in enumerate(\ sss.split(embedding, labels, )): print ("Fold", i+1, "fitting model") model.fit(embedding[split_train], labels[split_train]) probs = model.predict_proba(embedding[split_test]) (k_fold_rocs[i], k_fold_f1s[i], k_fold_precisions[i], k_fold_recalls[i]) = compute_measures( labels[split_test], probs,) print ("Completed {}/{} folds".format(i+1, k)) return (np.mean(k_fold_rocs), np.mean(k_fold_f1s), np.mean(k_fold_precisions), np.mean(k_fold_recalls))
def test_if_negative_evidence_is_distributed(self):
    """Rows with no positive labels are assigned by the negative-evidence pass."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
        stratifier._prepare_stratification(y)
    stratifier._distribute_positive_evidence(rows_used, folds,
                                             samples_with_combination,
                                             per_row_combinations)
    # Row 0 (all-negative) is untouched by the positive pass...
    self.assertFalse(rows_used[0])
    stratifier._distribute_negative_evidence(rows_used, folds)
    # ...and picked up by the negative pass.
    self.assertTrue(rows_used[0])
def test_if_stratification_works(self):
    """Smoke test: a 2-fold order-1 stratifier yields exactly two splits."""
    stratifier = IterativeStratification(n_splits=2, order=1)
    X = np.matrix([[0], [1], [2], [3]])
    y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    splits = list(stratifier.split(X, y))
    self.assertEqual(len(splits), 2)