outcome_col_name))

    # Prepare data for classification
    x_train = df_by_split['train'][feature_cols].values
    y_train = np.ravel(df_by_split['train'][outcome_col_name])

    x_test = df_by_split['test'][feature_cols].values
    y_test = np.ravel(df_by_split['test'][outcome_col_name])
    is_multiclass = len(np.unique(y_train)) > 2
    
    # Perform hyperparameter search
    n_examples = x_train.shape[0]
    n_features = x_train.shape[1]
    

    splitter = Splitter(
        size=args.validation_size,
        random_state=args.random_seed,
        n_splits=args.n_splits,
        cols_to_group=args.key_cols_to_group_when_splitting)
    key_train = splitter.make_groups_from_df(df_by_split['train'][key_cols])
    # Hyperparameter search on the validation set over possible threshold values.
    # Make sure all candidate thresholds yield at least
    # one instance of each class (positive and negative).
    yproba_class1_vals = list()
    for tr_inds, va_inds in splitter.split(x_train, groups=key_train):
        x_valid = x_train[va_inds]
        # Validation feature values are used directly as class-1 probability scores
        yproba_valid = np.ravel(x_valid)
        yproba_class1_vals.extend(yproba_valid)

    unique_yproba_vals = np.unique(yproba_class1_vals)
    if unique_yproba_vals.shape[0] == 1:
        nontrivial_thr_vals = unique_yproba_vals
    else:
        # Try all thr values that would give at least one pos and one neg decision
        nontrivial_thr_vals = 0.5 * (unique_yproba_vals[:-1] + unique_yproba_vals[1:])
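
The first example is cut off before the candidate thresholds are actually evaluated. As an illustrative sketch only (not the original code), one way to finish the search is to score each candidate on the pooled validation probabilities and keep the best one; the helper name pick_best_threshold, the pooled label array y_valid_all (which would have to be collected in the same loop as yproba_class1_vals), and the choice of balanced accuracy as the metric are all assumptions here.

from sklearn.metrics import balanced_accuracy_score
import numpy as np


def pick_best_threshold(yproba_class1_vals, y_valid_all, nontrivial_thr_vals):
    """Return the candidate threshold with the highest balanced accuracy."""
    yproba_arr = np.asarray(yproba_class1_vals, dtype=np.float64)
    best_thr, best_score = None, -np.inf
    for thr in nontrivial_thr_vals:
        # Classify as positive when the probability reaches the threshold
        yhat = (yproba_arr >= thr).astype(int)
        score = balanced_accuracy_score(y_valid_all, yhat)
        if score > best_score:
            best_thr, best_score = thr, score
    return best_thr
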
Example #2
        df_by_split[split_name] = cur_df

    outcome_col_name = args.outcome_col_name

    # Prepare data for classification
    x_train = df_by_split['train'][feature_cols].values.astype(np.float32)
    y_train = np.ravel(df_by_split['train'][outcome_col_name])

    x_test = df_by_split['test'][feature_cols].values.astype(np.float32)
    y_test = np.ravel(df_by_split['test'][outcome_col_name])

    if args.valid_csv_files is None:
        # get the validation set
        splitter = Splitter(
            size=args.validation_size,
            random_state=41,
            n_splits=args.n_splits,
            cols_to_group=args.key_cols_to_group_when_splitting)
        # Assign training instances to splits by provided keys
        key_train = splitter.make_groups_from_df(
            df_by_split['train'][key_cols])

        # get the train and validation splits
        for ss, (tr_inds, va_inds) in enumerate(
                splitter.split(x_train, y_train, groups=key_train)):
            x_tr = x_train[tr_inds].copy()
            y_tr = y_train[tr_inds].copy()
            x_valid = x_train[va_inds]
            y_valid = y_train[va_inds]

    # Only the final split's partition survives the loop above
    x_train = x_tr
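
For reference, the grouped validation split that Splitter performs in both examples (rows sharing the same key_cols values never end up on opposite sides of the split) can be approximated with scikit-learn's GroupShuffleSplit. The sketch below is a loose stand-in under that assumption, reusing args.validation_size and random_state=41 from this example; it is not the project's own Splitter class.

from sklearn.model_selection import GroupShuffleSplit

# One group id per unique combination of the key columns
groups = (df_by_split['train'][key_cols]
          .astype(str).agg('_'.join, axis=1).values)

# x_train / y_train as prepared above, before any splitting
gss = GroupShuffleSplit(
    n_splits=1, test_size=args.validation_size, random_state=41)
tr_inds, va_inds = next(gss.split(x_train, y_train, groups=groups))
x_tr, y_tr = x_train[tr_inds], y_train[tr_inds]
x_valid, y_valid = x_train[va_inds], y_train[va_inds]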