Esempio n. 1
0
    def tree_bins(x, y, inf_edges, tree):
        """
        Bin ``x`` using the leaves of a tree fitted on ``(x, y)``.

        Args:
            x: Feature values. Converted with ``assure_numpy_array`` and
                reshaped into a single column.
            y: Target values. Converted and reshaped the same way.
            inf_edges (bool): When falsy, the first and last boundaries are
                replaced by the minimum and maximum of ``x``. Otherwise the
                outer boundaries reported by ``TreePathFinder`` are kept.
            tree: Estimator exposing ``min_samples_leaf``, ``fit`` and
                ``apply``. It is fitted in place.

        Returns:
            tuple: ``(counts, boundaries, bin_count, tree)`` where ``counts``
            holds the number of samples per distinct leaf, ``boundaries`` is
            the list of bin edges, ``bin_count`` is the number of distinct
            leaves reached by ``x`` and ``tree`` is the fitted estimator.

        Raises:
            ValueError: If ``tree.min_samples_leaf`` is greater than or equal
                to the number of samples in ``x``.
        """
        X_in = assure_numpy_array(x).reshape(-1, 1)
        y_in = assure_numpy_array(y).reshape(-1, 1)

        # Validate before fitting so that an unusable configuration neither
        # pays for the fit nor leaves the estimator mutated.
        n_samples = X_in.shape[0]
        if tree.min_samples_leaf >= n_samples:
            raise ValueError(
                "Cannot Fit decision tree. min_samples_leaf must be < than the length of x. "
                f"Currently min_samples_leaf {tree.min_samples_leaf} "
                f"and the length of X is {n_samples}")

        tree.fit(X_in, y_in)

        leaves = tree.apply(X_in)
        index, counts = np.unique(leaves, return_counts=True)
        bin_count = len(index)

        # Query the boundaries once and reuse the mapping for both lookups.
        leaf_bounds = TreePathFinder(tree).get_boundaries()
        boundaries = [bound['min'] for bound in leaf_bounds.values()]
        # NOTE(review): the closing edge is taken from the leaf of the *last
        # sample*, which is only the right-most leaf if x is sorted or the
        # mapping is ordered accordingly - confirm against TreePathFinder.
        boundaries.append(leaf_bounds[leaves[-1]]['max'])

        if not inf_edges:
            boundaries[0] = np.min(X_in)
            boundaries[-1] = np.max(X_in)

        return counts, boundaries, bin_count, tree
Esempio n. 2
0
    def tree_bins(x, y, inf_edges, tree):
        """
        Bin ``x`` using the split thresholds of a tree fitted on ``(x, y)``.

        Args:
            x: Feature values. Converted with ``assure_numpy_array`` and
                reshaped into a single column.
            y: Target values. Converted and reshaped the same way.
            inf_edges (bool): When truthy, the outer bin edges are ``-inf``
                and ``+inf``. Otherwise they are the minimum and maximum of
                ``x``.
            tree: Estimator exposing ``min_samples_leaf``, ``fit``, ``apply``
                and a fitted ``tree_`` structure. It is fitted in place.

        Returns:
            tuple: ``(counts, boundaries, bin_count, tree)`` where ``counts``
            is a list with the number of samples per distinct leaf,
            ``boundaries`` is the list of bin edges (outer edges plus the
            unique split thresholds), ``bin_count`` is the number of distinct
            leaves reached by ``x`` and ``tree`` is the fitted estimator.

        Raises:
            ValueError: If ``tree.min_samples_leaf`` is greater than or equal
                to the number of samples in ``x``.
        """
        X_in = assure_numpy_array(x).reshape(-1, 1)
        y_in = assure_numpy_array(y).reshape(-1, 1)

        # Validate before fitting so that an unusable configuration neither
        # pays for the fit nor leaves the estimator mutated.
        n_samples = X_in.shape[0]
        if tree.min_samples_leaf >= n_samples:
            raise ValueError(
                "Cannot Fit decision tree. min_samples_leaf must be < than the length of x. "
                f"Currently min_samples_leaf {tree.min_samples_leaf} "
                f"and the length of X is {n_samples}")

        tree.fit(X_in, y_in)

        leaves = tree.apply(X_in)
        index, counts = np.unique(leaves, return_counts=True)
        bin_count = len(index)

        # Only nodes whose feature is not TREE_UNDEFINED contribute a
        # threshold; np.unique also sorts them.
        is_split = tree.tree_.feature != _tree.TREE_UNDEFINED
        thresholds = np.unique(tree.tree_.threshold[is_split]).tolist()

        if inf_edges:
            lower, upper = -np.inf, np.inf
        else:
            lower, upper = np.min(X_in), np.max(X_in)
        boundaries = [lower] + thresholds + [upper]

        return counts.tolist(), boundaries, bin_count, tree
Esempio n. 3
0
def test_assure_numpy_array_list():
    """
    Check list inputs: flat and nested lists convert to equal arrays, and a
    nested list is expected to raise when a 1-D result is requested.
    """
    flat = [1, 2, 3]
    converted = assure_numpy_array(flat)
    assert isinstance(converted, np.ndarray)
    np.testing.assert_array_equal(converted, np.array(flat))

    nested = [[1, 2], [3, 4]]
    expected_2d = np.array([[1, 2], [3, 4]])
    np.testing.assert_array_equal(assure_numpy_array(nested), expected_2d)

    with pytest.raises(DimensionalityError):
        assert assure_numpy_array(nested, assure_1d=True)
Esempio n. 4
0
def test_assure_numpy_array_dataframe():
    """
    Check DataFrame inputs: one- and two-column frames convert to the expected
    arrays, and a two-column frame is expected to raise when a 1-D result is
    requested.
    """
    single_col = pd.DataFrame({'x': [1, 2, 3]})
    converted = assure_numpy_array(single_col)
    assert isinstance(converted, np.ndarray)
    np.testing.assert_array_equal(converted, np.array([1, 2, 3]))

    two_cols = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]})
    expected_2d = np.array([[1, 1], [2, 2], [3, 3]])
    np.testing.assert_array_equal(assure_numpy_array(two_cols), expected_2d)

    with pytest.raises(DimensionalityError):
        assert assure_numpy_array(two_cols, assure_1d=True)
Esempio n. 5
0
def test_assure_numpy_array_array():
    """
    Check numpy inputs: 1-D and 2-D arrays come back equal to the input, and a
    2-D array is expected to raise when a 1-D result is requested.
    """
    vector = np.array([1, 2, 3])
    converted = assure_numpy_array(vector)
    assert isinstance(converted, np.ndarray)
    np.testing.assert_array_equal(converted, vector)

    matrix = np.array([[1, 2], [3, 4]])
    np.testing.assert_array_equal(assure_numpy_array(matrix), matrix)

    with pytest.raises(DimensionalityError):
        assert assure_numpy_array(matrix, assure_1d=True)
Esempio n. 6
0
def test_assure_numpy_array_series():
    """
    Check that a pandas Series converts to a numpy array with the same values.
    """
    series = pd.Series([1, 2, 3])
    converted = assure_numpy_array(series)
    expected = np.array([1, 2, 3])

    assert isinstance(converted, np.ndarray)
    np.testing.assert_array_equal(converted, expected)
Esempio n. 7
0
def get_metric(X,
               y,
               clf,
               test_size,
               split_seed,
               scorers,
               train_sampling_type=None,
               test_sampling_type=None,
               train_sampling_fraction=1,
               test_sampling_fraction=1):
    """
    Draws random train/test sample from the data using random seed and calculates metric of interest.

    Args:
        X (np.array or pd.DataFrame):
            Dataset with features.

        y (np.array or pd.Series):
            Target of the prediction.

        clf (model object):
            Binary classification model or pipeline.

        test_size (float):
            Fraction of data used for testing the model.

        split_seed (int):
            Randomized seed used for splitting data.

        scorers (list of Scorers):
            List of Scorer objects used to score the trained model.

        train_sampling_type (str, optional):
            String indicating what type of sampling should be applied on train set:

                - `None`: indicates that no additional sampling is done after splitting data,
                - `'bootstrap'`: indicates that sampling with replacement will be performed on train data,
                - `'subsample'`: indicates that sampling without repetition will be performed on train data.

        test_sampling_type (str, optional):
            String indicating what type of sampling should be applied on test set:

                - `None`: indicates that no additional sampling is done after splitting data,
                - `'bootstrap'`: indicates that sampling with replacement will be performed on test data,
                - `'subsample'`: indicates that sampling without repetition will be performed on test data.

        train_sampling_fraction (float, optional):
            Fraction of train data sampled, if train_sampling_type is not None. Default value is 1.

        test_sampling_fraction (float, optional):
            Fraction of test data sampled, if test_sampling_type is not None. Default value is 1.

    Returns:
        (pd.DataFrame):
            Dataframe with results for a given model trained. Each row holds one metric; the columns are
            `metric_name`, `train_score`, `test_score` and `delta_score` (train minus test).
    """

    if not isinstance(X, (np.ndarray, pd.DataFrame)):
        X = assure_numpy_array(X)
    # The target is converted based on its own type, not on the type of X.
    if not isinstance(y, (np.ndarray, pd.Series)):
        y = assure_numpy_array(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_seed, stratify=y)

    # Sample data based on the input arguments
    X_train, y_train = sample_data(X=X_train,
                                   y=y_train,
                                   sampling_type=train_sampling_type,
                                   sampling_fraction=train_sampling_fraction,
                                   dataset_name='train')
    X_test, y_test = sample_data(X=X_test,
                                 y=y_test,
                                 sampling_type=test_sampling_type,
                                 sampling_fraction=test_sampling_fraction,
                                 dataset_name='test')

    clf = clf.fit(X_train, y_train)

    results_columns = [
        'metric_name', 'train_score', 'test_score', 'delta_score'
    ]

    # One single-row frame per scorer, concatenated once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    per_scorer_frames = []
    for scorer in scorers:
        score_train = scorer.score(clf, X_train, y_train)
        score_test = scorer.score(clf, X_test, y_test)
        score_delta = score_train - score_test

        per_scorer_frames.append(
            pd.DataFrame(
                [[scorer.metric_name, score_train, score_test, score_delta]],
                columns=results_columns))

    if not per_scorer_frames:
        # pd.concat rejects an empty list; keep returning an empty frame with
        # the expected columns when no scorers are given.
        return pd.DataFrame([], columns=results_columns)

    # Like the former append-based code, row labels are not reset (every row
    # keeps the label 0 of its single-row frame).
    return pd.concat(per_scorer_frames)