outcome_col_name))  # NOTE(review): tail of a statement that starts above this chunk; left untouched

# Prepare data for classification.
# Feature columns are pulled out as arrays via `.values`; outcome columns are
# flattened to 1-D with np.ravel.
# (assumes df_by_split maps a split name to a pandas DataFrame -- TODO confirm)
x_train = df_by_split['train'][feature_cols].values
y_train = np.ravel(df_by_split['train'][outcome_col_name])
x_test = df_by_split['test'][feature_cols].values
y_test = np.ravel(df_by_split['test'][outcome_col_name])

# True when the training labels contain more than two distinct values.
is_multiclass = len(np.unique(y_train)) > 2

# Perform hyper_searcher search
# Row / column counts of the training feature array (requires x_train to be 2-D).
n_examples = x_train.shape[0]
n_features = x_train.shape[1]

# Project-defined Splitter, configured entirely from command-line arguments.
splitter = Splitter(size=args.validation_size,
                    random_state=args.random_seed,
                    n_splits=args.n_splits,
                    cols_to_group=args.key_cols_to_group_when_splitting)
# Group labels for the training rows, built from the key columns and passed
# to splitter.split() below.
key_train = splitter.make_groups_from_df(df_by_split['train'][key_cols])

# hyper_searcher search on validation over possible threshold values
# Make sure all candidates at least provide
# one instance of each class (positive and negative)
yproba_class1_vals = list()
# Only the validation indices are used here; tr_inds is ignored in this loop.
for tr_inds, va_inds in splitter.split(x_train, groups=key_train):
    x_valid = x_train[va_inds]
    # The raw feature values themselves serve as the scores to threshold.
    # NOTE(review): presumably this script thresholds a single feature -- confirm.
    yproba_valid = x_valid
    yproba_class1_vals.extend(yproba_valid)

# np.unique flattens its input and returns the sorted distinct values.
unique_yproba_vals = np.unique(yproba_class1_vals)
if unique_yproba_vals.shape[0] == 1:
    # Only one distinct score was observed, so it is the sole candidate.
    nontrivial_thr_vals = unique_yproba_vals
else:
    # Try all thr values that would give at least one pos and one neg decision
    # NOTE(review): the body of this branch continues past the end of this chunk.
# NOTE(review): this first assignment appears to be the last statement of a
# per-split loop whose header lies above this chunk; original indentation was lost.
df_by_split[split_name] = cur_df

outcome_col_name = args.outcome_col_name

# Prepare data for classification.
# Features are cast to float32; outcomes are flattened to 1-D with np.ravel.
# (assumes df_by_split maps a split name to a pandas DataFrame -- TODO confirm)
x_train = df_by_split['train'][feature_cols].values.astype(np.float32)
y_train = np.ravel(df_by_split['train'][outcome_col_name])
x_test = df_by_split['test'][feature_cols].values.astype(np.float32)
y_test = np.ravel(df_by_split['test'][outcome_col_name])

# No validation CSV files were supplied, so a validation set is carved out of
# the training data instead.
if args.valid_csv_files is None:
    # get the validation set
    # NOTE(review): random_state is hard-coded to 41 rather than taken from a
    # command-line argument -- confirm this is intentional.
    splitter = Splitter(
        size=args.validation_size, random_state=41,
        n_splits=args.n_splits,
        cols_to_group=args.key_cols_to_group_when_splitting)
    # Assign training instances to splits by provided keys
    key_train = splitter.make_groups_from_df(
        df_by_split['train'][key_cols])

    # get the train and validation splits
    # Each iteration overwrites x_tr / y_tr / x_valid / y_valid, so only the
    # arrays from the last split remain once the loop finishes. The training
    # subsets are copied; the validation subsets are taken by index only.
    # `ss` (the split counter) is not used in the visible code.
    for ss, (tr_inds, va_inds) in enumerate(
            splitter.split(x_train, y_train, groups=key_train)):
        x_tr = x_train[tr_inds].copy()
        y_tr = y_train[tr_inds].copy()
        x_valid = x_train[va_inds]
        y_valid = y_train[va_inds]

    # Replace the full training features with the training portion of the split.
    # NOTE(review): placement after the loop is assumed (indentation was lost);
    # the matching update of y_train, if any, lies past the end of this chunk.
    x_train = x_tr