def get_discretizer(x, y, continuous_features=None, seed=None, min_depth=0) -> MDLP:
    """Build an MDLP discretizer and fit it on ``x`` / ``y``.

    Args:
        x: Feature data, forwarded unchanged to ``MDLP.fit``.
        y: Target labels, forwarded unchanged to ``MDLP.fit``.
        continuous_features: Selects the features to discretize. May be
            ``None``, a collection of column indices, or a boolean mask with
            one entry per feature. A boolean mask is converted to the
            integer indices of its ``True`` entries before fitting; anything
            else is forwarded to ``MDLP.fit`` as given.
        seed: Passed to ``MDLP`` as ``random_state``.
        min_depth: Passed to ``MDLP`` as ``min_depth``.

    Returns:
        The fitted ``MDLP`` instance.
    """
    discretizer = MDLP(random_state=seed, min_depth=min_depth)
    if continuous_features is not None:
        # np.asarray lets plain lists/tuples through; the comparison uses the
        # builtin ``bool`` because the ``np.bool`` alias no longer exists in
        # current NumPy releases.
        mask = np.asarray(continuous_features)
        if mask.dtype == bool:
            continuous_features = np.flatnonzero(mask)
    discretizer.fit(x, y, continuous_features)
    return discretizer
def num2cate_fit(df, min=2):
    """Fit an MDLP discretizer on the numeric feature columns of ``df``.

    Args:
        df: pandas DataFrame whose last column is the class label (the
            original contract states it must be int 0 or 1). Every other
            column of dtype ``int64`` or ``float64`` is used as a feature.
        min: Passed to ``MDLP`` as ``min_depth``, the minimum depth of the
            interval splitting.

    Returns:
        The fitted ``MDLP`` instance, usable to transform samples.
    """
    feature_frame = df.iloc[:, :-1]
    numeric_frame = feature_frame.select_dtypes(include=['int64', 'float64'])
    # Sort the column names so fit and transform see the features in the
    # same order.
    ordered_columns = sorted(numeric_frame.columns.tolist())

    labels = df.iloc[:, -1].values
    features = df[ordered_columns].values

    # MDLP is fitted on plain numpy arrays rather than on the DataFrame.
    fitted = MDLP(min_depth=min)
    fitted.fit(features, labels)
    return fitted
def get_raw_bins(column, target):
    """Return the MDLP cut points found for ``column`` against ``target``.

    A default-configured ``MDLP`` is fitted on the two arguments and the
    first entry of its ``cut_points_`` attribute is returned as a list.
    """
    fitted = MDLP().fit(column, target)
    first_feature_cuts = fitted.cut_points_[0]
    return list(first_feature_cuts)