def tree_bins(x, y, inf_edges, tree):
    """
    Bin a single feature using the leaves of a decision tree fitted on (x, y).

    Args:
        x: Feature values. Converted with ``assure_numpy_array`` and reshaped to one column.
        y: Target values. Converted with ``assure_numpy_array`` and reshaped to one column.
        inf_edges (bool): When falsy, the first and last boundaries are overwritten with the
            minimum and maximum of ``x``. When truthy, the outer boundaries reported by
            ``TreePathFinder`` are kept untouched.
        tree: Estimator exposing ``min_samples_leaf``, ``fit`` and ``apply``. It is fitted in place.

    Returns:
        tuple: ``(counts, boundaries, bin_count, tree)`` where ``counts`` is the array of sample
        counts per distinct leaf, ``boundaries`` is a list of bin edges, ``bin_count`` is the
        number of distinct leaves and ``tree`` is the fitted estimator.

    Raises:
        ValueError: If ``tree.min_samples_leaf`` is greater than or equal to the number of samples.
    """
    X_in = assure_numpy_array(x).reshape(-1, 1)
    y_in = assure_numpy_array(y).reshape(-1, 1)

    # Validate before fitting, so that an unusable configuration neither wastes a fit
    # nor leaves the caller's estimator fitted when the call is rejected.
    if tree.min_samples_leaf >= X_in.shape[0]:
        error_msg = (
            "Cannot Fit decision tree. min_samples_leaf must be < than the length of x. "
            f"Currently min_samples_leaf {tree.min_samples_leaf} "
            f"and the length of X is {X_in.shape[0]}"
        )
        raise ValueError(error_msg)

    tree.fit(X_in, y_in)

    # Each distinct leaf id is one bin; counts holds the number of samples per leaf.
    leaves = tree.apply(X_in)
    index, counts = np.unique(leaves, return_counts=True)
    bin_count = len(index)

    tpf = TreePathFinder(tree)
    leaf_bounds = tpf.get_boundaries()
    boundaries = [bound['min'] for bound in leaf_bounds.values()]
    # NOTE(review): the closing edge is taken from the leaf of the LAST sample in x, which is
    # only the right-most bin if x is ordered accordingly - confirm against TreePathFinder.
    boundaries += [leaf_bounds[leaves[-1]]['max']]

    if not inf_edges:
        boundaries[0] = np.min(X_in)
        boundaries[-1] = np.max(X_in)

    return counts, boundaries, bin_count, tree
def tree_bins(x, y, inf_edges, tree):
    """
    Bin a single feature using the split thresholds of a decision tree fitted on (x, y).

    Args:
        x: Feature values. Converted with ``assure_numpy_array`` and reshaped to one column.
        y: Target values. Converted with ``assure_numpy_array`` and reshaped to one column.
        inf_edges (bool): When truthy, the outer boundaries are ``-np.inf`` and ``np.inf``.
            Otherwise they are the minimum and maximum of ``x``.
        tree: Estimator exposing ``min_samples_leaf``, ``fit``, ``apply`` and, once fitted,
            ``tree_.threshold`` / ``tree_.feature``. It is fitted in place.

    Returns:
        tuple: ``(counts, boundaries, bin_count, tree)`` where ``counts`` is a list with the
        number of samples per distinct leaf, ``boundaries`` is a list made of the lower edge,
        the sorted unique split thresholds and the upper edge, ``bin_count`` is the number of
        distinct leaves and ``tree`` is the fitted estimator.

    Raises:
        ValueError: If ``tree.min_samples_leaf`` is greater than or equal to the number of samples.
    """
    X_in = assure_numpy_array(x).reshape(-1, 1)
    y_in = assure_numpy_array(y).reshape(-1, 1)

    # Validate before fitting, so that an unusable configuration neither wastes a fit
    # nor leaves the caller's estimator fitted when the call is rejected.
    if tree.min_samples_leaf >= X_in.shape[0]:
        error_msg = (
            "Cannot Fit decision tree. min_samples_leaf must be < than the length of x. "
            f"Currently min_samples_leaf {tree.min_samples_leaf} "
            f"and the length of X is {X_in.shape[0]}"
        )
        raise ValueError(error_msg)

    tree.fit(X_in, y_in)

    # Each distinct leaf id is one bin; counts holds the number of samples per leaf.
    leaves = tree.apply(X_in)
    index, counts = np.unique(leaves, return_counts=True)
    bin_count = len(index)

    # Nodes whose feature is TREE_UNDEFINED carry no split, so their threshold is skipped.
    is_split_node = tree.tree_.feature != _tree.TREE_UNDEFINED
    thresholds = np.unique(tree.tree_.threshold[is_split_node])

    if inf_edges:
        lower_edge, upper_edge = -np.inf, np.inf
    else:
        lower_edge, upper_edge = np.min(X_in), np.max(X_in)
    boundaries = [lower_edge] + thresholds.tolist() + [upper_edge]

    return counts.tolist(), boundaries, bin_count, tree
def test_assure_numpy_array_list():
    """
    Test conversion of flat and nested lists, and the 1d guard on nested input.
    """
    flat = [1, 2, 3]
    flat_array = assure_numpy_array(flat)
    assert isinstance(flat_array, np.ndarray)
    np.testing.assert_array_equal(flat_array, np.array(flat))

    nested = [[1, 2], [3, 4]]
    nested_array = assure_numpy_array(nested)
    np.testing.assert_array_equal(nested_array, np.array([[1, 2], [3, 4]]))

    # No `assert` around the call: if it ever stopped raising, evaluating the truth value of
    # the returned array would fail for an unrelated reason and hide the missing exception.
    with pytest.raises(DimensionalityError):
        assure_numpy_array(nested, assure_1d=True)
def test_assure_numpy_array_dataframe():
    """
    Test conversion of one- and two-column DataFrames, and the 1d guard on two columns.
    """
    single_column = pd.DataFrame({'x': [1, 2, 3]})
    single_array = assure_numpy_array(single_column)
    assert isinstance(single_array, np.ndarray)
    # The expected value is one-dimensional for a single-column frame.
    np.testing.assert_array_equal(single_array, np.array([1, 2, 3]))

    two_columns = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]})
    two_array = assure_numpy_array(two_columns)
    np.testing.assert_array_equal(two_array, np.array([[1, 1], [2, 2], [3, 3]]))

    # No `assert` around the call: if it ever stopped raising, evaluating the truth value of
    # the returned array would fail for an unrelated reason and hide the missing exception.
    with pytest.raises(DimensionalityError):
        assure_numpy_array(two_columns, assure_1d=True)
def test_assure_numpy_array_array():
    """
    Test that numpy arrays pass through unchanged, and the 1d guard on 2d input.
    """
    vector = np.array([1, 2, 3])
    vector_out = assure_numpy_array(vector)
    assert isinstance(vector_out, np.ndarray)
    np.testing.assert_array_equal(vector_out, vector)

    matrix = np.array([[1, 2], [3, 4]])
    matrix_out = assure_numpy_array(matrix)
    np.testing.assert_array_equal(matrix_out, matrix)

    # No `assert` around the call: if it ever stopped raising, evaluating the truth value of
    # the returned array would fail for an unrelated reason and hide the missing exception.
    with pytest.raises(DimensionalityError):
        assure_numpy_array(matrix, assure_1d=True)
def test_assure_numpy_array_series():
    """
    Test that a pandas Series is converted to a numpy array holding the same values.
    """
    series = pd.Series([1, 2, 3])
    expected = np.array([1, 2, 3])

    converted = assure_numpy_array(series)

    assert isinstance(converted, np.ndarray)
    np.testing.assert_array_equal(converted, expected)
def get_metric(X, y, clf, test_size, split_seed, scorers, train_sampling_type=None, test_sampling_type=None,
               train_sampling_fraction=1, test_sampling_fraction=1):
    """
    Draws random train/test sample from the data using random seed and calculates metric of interest.

    Args:
        X (np.array or pd.DataFrame): Dataset with features.
        y (np.array or pd.Series): Target of the prediction.
        clf (model object): Binary classification model or pipeline.
        test_size (float): Fraction of data used for testing the model.
        split_seed (int): Randomized seed used for splitting data.
        scorers (list of Scorers): List of Scorer objects used to score the trained model.
        train_sampling_type (str, optional): String indicating what type of sampling should be applied on train set:

            - `None`: indicates that no additional sampling is done after splitting data,
            - `'bootstrap'`: indicates that sampling with replacement will be performed on train data,
            - `'subsample'`: indicates that sampling without repetition will be performed on train data.

        test_sampling_type (str, optional): String indicating what type of sampling should be applied on test set:

            - `None`: indicates that no additional sampling is done after splitting data,
            - `'bootstrap'`: indicates that sampling with replacement will be performed on test data,
            - `'subsample'`: indicates that sampling without repetition will be performed on test data.

        train_sampling_fraction (float, optional): Fraction of train data sampled, if train_sampling_type is not
            None. Default value is 1.
        test_sampling_fraction (float, optional): Fraction of test data sampled, if test_sampling_type is not
            None. Default value is 1.

    Returns:
        (pd.DataFrame): Dataframe with results for a given model trained. Each row holds one metric and the
            columns hold its name, train score, test score and their difference. Every row carries the index
            label 0.
    """
    # Inputs of any other type are converted to numpy arrays; each argument is checked on its own.
    if not isinstance(X, (np.ndarray, pd.DataFrame)):
        X = assure_numpy_array(X)
    if not isinstance(y, (np.ndarray, pd.Series)):
        y = assure_numpy_array(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_seed, stratify=y)

    # Sample data based on the input arguments
    X_train, y_train = sample_data(X=X_train, y=y_train, sampling_type=train_sampling_type,
                                   sampling_fraction=train_sampling_fraction, dataset_name='train')
    X_test, y_test = sample_data(X=X_test, y=y_test, sampling_type=test_sampling_type,
                                 sampling_fraction=test_sampling_fraction, dataset_name='test')

    clf = clf.fit(X_train, y_train)

    results_columns = ['metric_name', 'train_score', 'test_score', 'delta_score']

    # Collect one single-row frame per scorer and concatenate once at the end; DataFrame.append
    # copied the whole frame on every iteration and no longer exists in pandas >= 2.0.
    score_frames = []
    for scorer in scorers:
        score_train = scorer.score(clf, X_train, y_train)
        score_test = scorer.score(clf, X_test, y_test)
        score_delta = score_train - score_test
        score_frames.append(
            pd.DataFrame([[scorer.metric_name, score_train, score_test, score_delta]], columns=results_columns))

    # pd.concat rejects an empty list, so without scorers return the empty frame with the expected columns.
    if not score_frames:
        return pd.DataFrame([], columns=results_columns)

    # The index is deliberately not reset, so the row labels stay as they were when rows were appended.
    return pd.concat(score_frames)