def test_fewer_bins_than_n_discretize_bins():
    """Each fitted feature ends up with at most ``n_discretize_bins`` bins."""
    credit_df = pd.read_csv("credit.csv")
    # Bin limits exercised: 2, 7, 12, 17.
    for max_bins in range(2, 20, 5):
        transformer = BinTransformer(n_discretize_bins=max_bins)
        transformer.fit(credit_df)
        for ranges in transformer.bins_.values():
            n_ranges = len(ranges)
            assert n_ranges <= max_bins
def test_correct_min_max_bins():
    """The first bin starts at the column minimum and the last ends at its maximum."""
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for feature, ranges in transformer.bins_.items():
        # Each range is indexed as (floor, ceil).
        lowest_floor = ranges[0][0]
        assert lowest_floor == credit_df[feature].min()
        highest_ceil = ranges[-1][1]
        assert highest_ceil == credit_df[feature].max()
def test_each_bin_in_order():
    """Within every bin, the lower bound does not exceed the upper bound."""
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for ranges in transformer.bins_.values():
        for bin_range in ranges:
            lower, upper = bin_range[0], bin_range[1]
            assert lower <= upper
def test_bin_ranges_are_flush():
    """Consecutive bins share a boundary: each floor equals the previous ceil."""
    credit_df = pd.read_csv("credit.csv")
    transformer = BinTransformer()
    transformer.fit(credit_df)
    for ranges in transformer.bins_.values():
        last_ceiling = None  # nothing to compare against for the first bin
        for lower, upper in ranges:
            if last_ceiling is not None:
                assert lower == last_ceiling
            last_ceiling = upper
def preprocess_training_data(preprocess_params):
    """Validate the training inputs and build the binned training DataFrame.

    Parameters
    ----------
    preprocess_params : mapping
        Must provide the keys "trainset", "y", "class_feat", "pos_class",
        "feature_names", "n_discretize_bins" and "verbosity".

    Returns
    -------
    tuple
        ``(df, class_feat, pos_class, bin_transformer_)``: the DataFrame
        returned by ``BinTransformer.fit_transform``, the class feature name
        from ``_get_class_feat_name``, the positive class from
        ``_get_pos_class``, and the ``BinTransformer`` instance that was used.
    """
    # Pull out the settings (same lookup order as before, so a missing key
    # raises the same KeyError).
    trainset, y = preprocess_params["trainset"], preprocess_params["y"]
    requested_class_feat = preprocess_params["class_feat"]
    requested_pos_class = preprocess_params["pos_class"]
    feature_names = preprocess_params["feature_names"]
    n_bins = preprocess_params["n_discretize_bins"]
    verbosity = preprocess_params["verbosity"]

    # Input validation runs before anything is built.
    _check_valid_input_data(
        trainset,
        y,
        requested_class_feat,
        user_requested_feature_names=feature_names,
    )

    # Settle on the class feature name, then combine X and y in one frame.
    class_feat = _get_class_feat_name(requested_class_feat, y)
    combined_df = _convert_to_training_df(
        trainset,
        y,
        class_feat,
        user_requested_feature_names=feature_names,
    )

    # The positive class is resolved against the frame before dtype inference.
    pos_class = _get_pos_class(combined_df, class_feat, requested_pos_class)
    typed_df = combined_df.infer_objects()

    # Bin every feature except the class feature.
    binner = BinTransformer(n_discretize_bins=n_bins, verbosity=verbosity)
    binned_df = binner.fit_transform(typed_df, ignore_feats=[class_feat])

    return binned_df, class_feat, pos_class, binner
def test_no_bins():
    """With ``n_discretize_bins=0``, fit and transform leave the input frame unchanged.

    Only the frame passed in is compared against the pristine copy; the value
    returned by ``transform`` is not inspected.
    """
    pristine_df = pd.read_csv("credit.csv")
    working_df = pristine_df.copy()
    transformer = BinTransformer(n_discretize_bins=0)
    transformer.fit(working_df)
    transformer.transform(working_df)
    unchanged = working_df.equals(pristine_df)
    assert unchanged
def _upgrade_bin_transformer_ifdepr(obj):
    """Replace a dict-valued ``bin_transformer_`` on *obj* with a ``BinTransformer``.

    If ``obj.bin_transformer_`` is a dict, a new ``BinTransformer`` is created,
    the dict is stored as its ``bins_`` attribute, and the new instance is
    assigned back to ``obj.bin_transformer_``. Any other value is left as is.
    *obj* is modified in place and nothing is returned.

    Raises
    ------
    AttributeError
        If *obj* has no ``bin_transformer_`` attribute.
    """
    legacy_bins = obj.bin_transformer_
    # isinstance rather than an exact type comparison, so dict subclasses
    # (e.g. OrderedDict) are upgraded as well.
    if isinstance(legacy_bins, dict):
        upgraded = BinTransformer()
        upgraded.bins_ = legacy_bins
        obj.bin_transformer_ = upgraded