def _woe_binning(X, y, q=None, bins=None, max_leaf_nodes=None,
                 cat_num_lim=0, **kwargs):
    '''used by Woe_encoder to get binning edges

    return
    ------
    edges : {colname : [-inf, point1, point2..., inf]}
    '''
    bin_edges = {}
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            label, bin_edges[name] = _binning(col_notna, bins, q,
                                              max_leaf_nodes, y_notna,
                                              **kwargs)
    return bin_edges
def restore_type(self, dtype, sample=None):
    """Restore type from Pandas
    """
    # Pandas types
    if pdc.is_bool_dtype(dtype):
        return 'boolean'
    elif pdc.is_datetime64_any_dtype(dtype):
        return 'datetime'
    elif pdc.is_integer_dtype(dtype):
        return 'integer'
    elif pdc.is_numeric_dtype(dtype):
        return 'number'
    # Python types
    if sample is not None:
        if isinstance(sample, (list, tuple)):
            return 'array'
        elif isinstance(sample, datetime.date):
            return 'date'
        elif isinstance(sample, isodate.Duration):
            return 'duration'
        elif isinstance(sample, dict):
            return 'object'
        elif isinstance(sample, six.string_types):
            return 'string'
        elif isinstance(sample, datetime.time):
            return 'time'
    return 'string'
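# Hedged usage sketch, assuming `restore_type` is reachable on an instance
# named `mapper` (hypothetical) and the module already imports datetime,
# isodate, six and the pandas dtype helpers as pdc.
import datetime
import pandas as pd

print(mapper.restore_type(pd.Series([1, 2]).dtype))   # 'integer'
print(mapper.restore_type(pd.Series([True]).dtype))   # 'boolean'
print(mapper.restore_type(pd.Series([1.5]).dtype))    # 'number'

# object dtype matches no pandas check, so the sample value decides
obj = pd.Series([datetime.date.today()])
print(mapper.restore_type(obj.dtype, sample=obj.iloc[0]))  # 'date'
print(mapper.restore_type(obj.dtype, sample=[1, 2]))       # 'array'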
def to_num_datetime(col, name='array', thresh=0.80, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain
    unchanged

    parameters
    ----------
    col : series, scalar or ndarray
        will be turned into a Series
    name : str
        name of the col series
    thresh : float, default 0.8
        if more than this fraction of X can be converted, commit the
        conversion
    **kwargs
        errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
            - If 'raise', then invalid parsing will raise an exception
            - If 'coerce', then invalid parsing will be set as NaN
            - If 'ignore', then invalid parsing will return the input
        other pandas to_datetime keywords

    return
    ------
    converted series or df
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    if api.is_categorical_dtype(col):
        return col
    if col.count() == 0:
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        params = {'errors': 'coerce', 'infer_datetime_format': True}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
        except Exception:
            pass

    return col
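# A minimal, hedged usage sketch of to_num_datetime, assuming a pandas
# version that still accepts `infer_datetime_format` (pre-2.0).
import pandas as pd

# 4 of 5 values parse as numbers (0.8 >= thresh), so conversion commits
nums = pd.Series(['1', '2', '3', 'oops', '5'])
print(to_num_datetime(nums).dtype)   # float64 ('oops' becomes NaN)

# 4 of 5 values parse as dates, so the column becomes datetime64
dates = pd.Series(['2020-01-01', '2020-02-01', '2020-03-01',
                   'n/a', '2020-04-01'])
print(to_num_datetime(dates).dtype)  # datetime64[ns]

# too few convertible values: returned unchanged
mixed = pd.Series(['a', 'b', '3'])
print(to_num_datetime(mixed).dtype)  # object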
def f(s):
    # `decimals` comes from the enclosing scope
    if api.is_numeric_dtype(s):
        if s.apply(abs).max() <= 1:
            # fractions are rendered as percentages
            s = s.apply(lambda x: str(round(x * 100, decimals)) + '%')
        else:
            # thousands separator with fixed decimal places
            fmt = "{" + ":,.{}f".format(decimals) + "}"
            s = s.apply(lambda x: fmt.format(x))
    return s
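# Hedged demo of the formatter: `decimals` is a free variable from the
# enclosing scope, so we assume decimals = 2 here for illustration.
import pandas as pd
from pandas.api import types as api

decimals = 2

ratios = pd.Series([0.1234, 0.5])
amounts = pd.Series([1234567.891, 42.0])

print(f(ratios).tolist())   # ['12.34%', '50.0%']
print(f(amounts).tolist())  # ['1,234,567.89', '42.00']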
def _mapping_col(col, na_values=['null', '缺失值', -999, -99999, -1]):
    '''encrypt categorical features
    '''
    col = col.replace(na_values, np.nan)
    if not api.is_numeric_dtype(col):
        uniq = col.unique()
        mapper = dict(
            zip(uniq, [''.join(['C', str(i)]) for i in range(len(uniq))]))
        if mapper.get(np.nan) is not None:
            mapper.pop(np.nan)
        col = col.map(mapper, na_action='ignore')
    return col
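# Hedged usage sketch of _mapping_col: category labels are replaced by
# opaque 'C<i>' codes, while values listed in na_values become NaN.
import numpy as np
import pandas as pd

s = pd.Series(['vip', 'normal', 'null', 'vip'])
print(_mapping_col(s).tolist())  # ['C0', 'C1', nan, 'C0']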
def f(ser, kwd):
    '''mask outliers of one column as NaN, using (low, high, flag) bounds
    looked up from ``kwd`` by column name
    '''
    param = kwd.get(ser.name)
    if param is None or not is_numeric_dtype(ser):
        return ser
    else:
        low, high, flag = param
        if low is not None:
            if flag == 'percentage':
                low = ser.quantile(low)
            ser = ser.where(low <= ser, np.nan)
        if high is not None:
            if flag == 'percentage':
                high = ser.quantile(high)
            ser = ser.where(ser <= high, np.nan)
        return ser
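# Hedged usage sketch: `kwd` maps column name -> (low, high, flag); with
# flag='percentage' the bounds are read as quantiles, anything else is
# treated as absolute bounds. Out-of-range values become NaN.
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

age = pd.Series([18, 25, 40, 200], name='age')

# absolute bounds: values outside [0, 120] are masked
print(f(age, {'age': (0, 120, 'absolute')}).tolist())
# -> [18.0, 25.0, 40.0, nan]

# quantile bounds: mask below the 5th / above the 95th percentile
print(f(age, {'age': (0.05, 0.95, 'percentage')}).tolist())
# -> [nan, 25.0, 40.0, nan]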
def _get_binning(X, y, q=None, bins=None, max_leaf_nodes=None, mono=None,
                 cat_num_lim=0, **kwargs):
    '''used by Woe_encoder to get binning edges

    Parameters
    ----------
    X : DataFrame
    y : binary target

    return
    ------
    edges : {colname : [-inf, point1, point2..., inf]}
    '''
    bin_edges = {}
    # reset index so that X and y align row by row
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            label, bin_edges[name] = binning(col_notna, bins, q,
                                             max_leaf_nodes, mono, y_notna,
                                             **kwargs)
    return bin_edges
def plotter_score_path(df_score, title=None, cm=None, style='-.o'):
    '''plot the score path of each metric

    df_score : DataFrame of scores of metrics
    '''
    # plot numeric columns only
    data = df_score.select_dtypes(include='number')
    n = len(data.columns)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    if cm is None:
        cm = plt.get_cmap('tab10')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]
    i = 0
    for ax0, col in zip(ax, data.columns):
        s = data[col]
        if api.is_numeric_dtype(s):
            s.plot(ax=ax0, color=cmlist[i], style=style)
            ax0.fill_between(s.index, s - s.std(), s + s.std(),
                             color='grey', alpha=.3,
                             label=r'{} = {}$\pm$ {}'.format(
                                 col, round(s.mean(), 4),
                                 round(s.std(), 4)))
            plt.setp(ax0, ylabel=col)
            h, l = ax0.get_legend_handles_labels()
            ax0.legend([h[-1]], [l[-1]])
            i += 1
    ax[0].set_title(title)
    ax[-1].set_xlabel('index')
    plt.tight_layout(rect=(0, 0, 0.98, 0.96))
    return fig
def plotter_cv_results_(results, train_style='mo-', test_style='go-.',
                        title=None):
    '''plot univariate-parameter cross-validated results after grid search
    of a model

    return
    ------
    ax, or tuple of ax
    '''
    scoring = results.filter(like='mean_train_').columns
    scoring = [i.replace('mean_train_', '') for i in scoring]
    df_param = results.filter(like='param_')
    param_array = df_param.columns
    if len(param_array) > 1:
        print('multi-parameter is encountered ... ')
        print(df_param.apply(lambda x: pd.Series(pd.unique(x))))
    # plot
    n = len(scoring)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    for s, ax0 in zip(scoring, ax):
        df = results[['mean_train_' + s, 'mean_test_' + s, 'std_test_' + s]]
        if len(param_array) == 1:
            df.index = results[param_array[0]]
            xlabel = param_array[0]
            num_param = api.is_numeric_dtype(df.index)
            if not num_param:
                df.index = np.arange(len(df.index))
        else:
            xlabel = ' + '.join([i.split('__')[-1] for i in param_array])
        df.sort_index(inplace=True)
        # plot train/test means with a band of +/- 1 std around test
        mean = df['mean_test_' + s].values
        std = df.pop('std_test_' + s)
        x = df.index.values
        df.plot.line(style=[train_style, test_style], ax=ax0)
        ax0.fill_between(x, mean - std, mean + std, color='grey', alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        # annotate the best mean test score
        x_max = df.index[np.argmax(mean)]
        best_score = np.max(mean)
        std = np.mean(std)
        h, l = ax0.get_legend_handles_labels()
        ax0.legend([h[-1]],
                   [r'score_max= %0.4f $\pm$ %0.2f' % (best_score, std)])
        ax0.axvline(x_max, linestyle='--', marker='x', color='y')
        ax0.annotate("%0.4f" % best_score, (x_max, best_score))
        ax0.set_xlim(x.min() - 0.5, x.max() + 0.5)
        plt.setp(ax0, ylabel=s)
    # set title
    ax[0].set_title(title, fontsize=13)
    # use fig legend
    fig.legend(h, ('train', 'test', r'$\pm$ 1 std. dev.'),
               loc='upper right', ncol=3, bbox_to_anchor=(0.98, 1))
    ax[-1].set_xlabel(xlabel)
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    return ax
def to_num_datetime(col, name='array', thresh=0.75, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain
    unchanged

    parameters
    ----------
    col : series, scalar or ndarray
        input sequence
    name : str
        name of the col series
    thresh : float
        default 0.75, if more than this fraction of X can be converted,
        commit the conversion

    keyword args
    ------------
    errors : {'ignore', 'raise', 'coerce'}
        default 'coerce'
        If 'raise', then invalid parsing will raise an exception
        If 'coerce', then invalid parsing will be set as NaN
        If 'ignore', then invalid parsing will return the input
    other pandas to_datetime keywords

    return
    ------
    s : series
        converted col
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    if api.is_categorical_dtype(col):
        return col
    if col.count() == 0:
        return col
    # keep zero-padded code strings (e.g. '007') as-is
    if col.astype(str).str.contains(r'^0\d+$').any():
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        params = {'errors': 'coerce', 'infer_datetime_format': True}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
            else:
                col = col.apply(lambda x: x if pd.isna(x) else str(x))
        except Exception:
            pass

    return col
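# Hedged sketch of the zero-padded guard added in this variant: columns
# that look like zero-padded code strings are never converted.
import pandas as pd

codes = pd.Series(['007', '042', '123'])
print(to_num_datetime(codes).tolist())  # ['007', '042', '123'] unchanged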
def bin_tree(X, y, cat_num_lim=0, max_leaf_nodes=10, min_samples_leaf=0.05,
             random_state=0, verbose=0, **kwargs):
    '''Discretize a feature matrix based on a binary DecisionTree classifier

    .. note::

        CART tree with gini impurity as criterion; non-numeric dtype
        columns will be ignored, as will columns whose unique value count
        does not exceed "cat_num_lim"

    parameters
    ----------
    X : 2d array or DataFrame
        feature matrix, should be numerical dtype
    y : str
        col of class label, binary
    cat_num_lim :
        number of unique values a column must exceed to be treated as a
        continuous feature
    max_leaf_nodes :
        max number of bins
    min_samples_leaf :
        minimum number of samples in a leaf node
    **kwargs :
        other tree keywords

    return
    ------
    bin_edges : dict
        {'col_name' : bin_edges}
    '''
    bin_edges = {}
    cols = []
    un_split = []
    # reset index so that X and y align row by row
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            # call _tree_univar_bin
            bin_edges[name] = _tree_univar_bin(
                col_notna,
                y_notna,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
                **get_kwargs(DecisionTreeClassifier, **kwargs))
            if len(bin_edges[name]) < 3:
                un_split.append(name)
        else:
            cols.append(name)
    # log process
    msg1 = ('total of {2} unchanged (unique counts less than {1} or '
            'categorical dtype) =\n "{0}"').format(
                pd.Index(cols), cat_num_lim, len(cols))
    msg2 = 'total of {1} unsplittable features = \n {0} ... '.format(
        pd.Index(un_split), len(un_split))
    if cols:
        logger.info(msg1)
    if un_split:
        logger.info(msg2)
    return bin_edges
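# Usage sketch on synthetic data, assuming the module's internal helpers
# (_tree_univar_bin, get_kwargs, logger, DecisionTreeClassifier) are in
# scope; the object column is skipped, the numeric one is binned.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({'score': rng.normal(size=1000),
                  'grade': rng.choice(list('ABC'), size=1000)})
y = pd.Series((X['score'] > 0).astype(int))

edges = bin_tree(X, y, max_leaf_nodes=4)
print(edges['score'])  # tree-chosen split points for 'score'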
def binning(y_pre=None, bins=None, q=None, max_leaf_nodes=None, mono=None,
            y_true=None, labels=None, **kwargs):
    '''supervised binning of y_pre based on y_true, if y_true is not None

    .. _binningmeth:

    parameters
    ----------
    y_pre : 1d array_like
        value of y to be cut
    y_true : 1d array_like
        binary target y_true for supervised cutting
    bins : int
        number of equal-width bins
    q : int
        number of equal-frequency bins
    max_leaf_nodes : int
        number of tree-node bins; if not None, use supervised cutting
        based on a decision tree
    mono : int
        number of bins that increase monotonically with the "y" mean value

        .. note::

            arguments [q, bins, max_leaf_nodes, mono] control the binning
            method and only 1 of them can be specified; if not valid,
            assign q=10 and bins=max_leaf_nodes=mono=None
    labels : bool
        see pd.cut; if False return integer indicators of bins, if True
        return arrays of labels (or labels can be supplied manually)

    Keyword args
    ------------
    kwargs : decision tree keywords
        min_impurity_decrease=0.001
        random_state=0

    return
    ------
    y_binlabel : array
        bin label of y_pre
    bin_edge : array
        ndarray of bin edges
    '''
    bins, q, max_leaf_nodes, mono = _check_binning_keywords(
        bins, q, max_leaf_nodes, mono)
    y_pre = to_num_datetime(y_pre)
    y_pre_input = y_pre.copy()
    if y_true is not None:
        y_true = to_num_datetime(y_true)
        y_true = np.array(y_true)
    # drop na values for y_pre & y_true pairs in case of supervised cutting
    df = pd.DataFrame({'ypre': np.array(y_pre), 'ytrue': y_true})
    df = df.dropna(subset=['ypre'])
    y_pre = df.pop('ypre')
    y_true = df.pop('ytrue')
    # if y_pre is not a numeric data type, do not perform cut
    if not api.is_numeric_dtype(y_pre):
        return y_pre_input, y_pre.unique()

    if q is not None:
        bins = np.percentile(y_pre, np.linspace(0, 100, q + 1))
        bins[0] = -np.inf
        bins[-1] = np.inf
        bins = np.unique(bins)

    if max_leaf_nodes is not None:
        if y_true.isna().sum() > 0:
            raise ValueError('non-nan y_true must be supplied for tree cut')
        y_pre0 = pd.DataFrame(y_pre)
        bins_dict = bin_tree(y_pre0, y_true,
                             max_leaf_nodes=max_leaf_nodes, **kwargs)
        bins = list(bins_dict.values())[0]

    if mono is not None:
        if y_true.isna().sum() > 0:
            raise ValueError('non-nan y_true must be supplied for mono cut')
        bins = _mono_cut(Y=y_true, X=y_pre)

    if isinstance(bins, int):
        bins = np.linspace(np.min(y_pre), np.max(y_pre), bins + 1)
        bins[0] = -np.inf
        bins[-1] = np.inf

    if bins is None:
        raise ValueError('no cutting bins supplied')

    if labels is True:
        labels = None

    y_binlabel, bin_edge = pd.cut(y_pre_input, bins, duplicates='drop',
                                  retbins=True, labels=labels)
    return y_binlabel, bin_edge
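# Hedged usage sketch, assuming the module's helpers
# (_check_binning_keywords, to_num_datetime, bin_tree, _mono_cut) are in
# scope. An unsupervised equal-frequency cut only needs `q`.
import numpy as np

scores = np.random.RandomState(0).uniform(300, 900, size=1000)

label, edges = binning(scores, q=5)
print(edges)                 # 6 edges, first/last forced to -inf/+inf
print(label.value_counts())  # roughly 200 observations per bin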
def dtype_specific_binary(left, right, numerics, datetimes, bools, strings,
                          categoricals, intervals, errors='ignore'):
    """
    A low-level base binary function that dispatches to different binary
    operations based on the dtypes of the left and right inputs. Can be
    used with functools.partial to create a custom binary function that
    can be passed to apply_columnwise or the higher-order compare_values
    utilities found elsewhere in this module. See examples for more
    details.

    This function supports 6 distinct groups of pandas dtypes, which are
    validated using the corresponding helpers provided by the
    pandas.core.dtypes API:

    1) numeric - is_numeric_dtype
    2) datetime-like - is_datetime64_any_dtype or is_timedelta64_dtype
    3) bool - is_bool_dtype
    4) string - is_string_dtype
    5) categorical - is_categorical_dtype
    6) interval - is_interval_dtype

    If no supported dtype is matched, or `left` & `right` do not have
    matching dtypes, a pd.Series of NaN values is returned, unless
    errors='raise' in which case a ValueError is raised.

    Parameters
    ----------
    left : pd.Series, pd.DataFrame, np.ndarray
    right : pd.Series, pd.DataFrame, np.ndarray
    numerics : binary callable applied to numeric dtypes
    datetimes : binary callable applied to datetime-like objects
    bools : binary callable applied to bool dtypes
    strings : binary callable applied to string-like dtypes
    categoricals : binary callable applied to Categorical dtype
    intervals : binary callable applied to Interval dtype
    errors : str
        default 'ignore' issues a warning and returns NaNs when the dtypes
        of left and right do not match; if 'raise' is passed, a ValueError
        is raised in such cases

    Returns
    -------
    result of applying a specific binary callable to `left` and `right`
    inputs based on dtype
    """
    _ld = left.dtype
    _rd = right.dtype
    if is_numeric_dtype(_ld) and is_numeric_dtype(_rd):
        return numerics(left, right)
    elif ((is_datetime64_any_dtype(_ld) or is_timedelta64_dtype(_ld))
          and (is_datetime64_any_dtype(_rd) or is_timedelta64_dtype(_rd))):
        return datetimes(left, right)
    elif is_bool_dtype(_ld) and is_bool_dtype(_rd):
        return bools(left, right)
    elif is_string_dtype(_ld) and is_string_dtype(_rd):
        return strings(left, right)
    elif is_categorical_dtype(_ld) and is_categorical_dtype(_rd):
        return categoricals(left, right)
    elif is_interval_dtype(_ld) and is_interval_dtype(_rd):
        return intervals(left, right)
    else:
        # by default when dtypes are mismatched we issue a warning and
        # return NaNs; raise if the user requires it
        if errors == 'raise':
            raise ValueError(
                f"left and right do not have matching supported dtypes: "
                f"{_ld.name}, {_rd.name}")
        else:
            warnings.warn(
                f"left: {left.name}, {_ld.name} and right: {right.name}, "
                f"{_rd.name} do not have comparable dtypes, returning NaNs")
            return pd.Series(np.nan, index=right.index)
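# The docstring points at functools.partial; a hedged sketch follows (the
# comparator choices here are illustrative, not this module's own):
import functools
import numpy as np
import pandas as pd

compare = functools.partial(
    dtype_specific_binary,
    numerics=lambda l, r: (l - r).abs(),   # numeric: absolute difference
    datetimes=lambda l, r: l == r,
    bools=lambda l, r: l == r,
    strings=lambda l, r: l == r,
    categoricals=lambda l, r: l == r,
    intervals=lambda l, r: l == r,
)

a = pd.Series([1.0, 2.0, 3.5], name='a')
b = pd.Series([1.0, 2.5, 3.0], name='b')
print(compare(a, b).tolist())      # [0.0, 0.5, 0.5]

# mismatched dtypes warn and return NaNs under errors='ignore'
c = pd.Series(['x', 'y', 'z'], name='c')
print(compare(a, c).isna().all())  # True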
def bin_tree(X, y, cat_num_lim=0, max_leaf_nodes=10, min_samples_leaf=0.05,
             random_state=0, verbose=0, **kwargs):
    '''discretize features based on univariate runs of a DecisionTree
    classifier (CART tree with gini impurity as criterion; non-numeric
    dtypes will be ignored, as will columns whose unique value count does
    not exceed "cat_num_lim")

    df_X : df, contains feature matrix, should be numerical dtype
    y : col of class label, binary
    cat_num_lim=10 : number of unique values a column must exceed to be
        treated as a continuous feature
    max_leaf_nodes=5 : max number of bins
    min_samples_leaf=0.1 : minimum number of samples in a leaf node
    **kwargs : other tree keywords

    return
    ------
    bin_edges : dict of {'col_name' : bin_edges}
    '''
    bin_edges = {}
    cols = []
    un_split = []
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            # call _tree_univar_bin
            bin_edges[name] = _tree_univar_bin(
                col_notna,
                y_notna,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
                **get_kwargs(DecisionTreeClassifier, **kwargs))
            if len(bin_edges[name]) < 3:
                un_split.append(name)
        else:
            cols.append(name)

    if verbose > 0:
        msg1 = ('total of {2} unchanged (unique counts less than {1} or '
                'categorical dtype) =\n "{0}"').format(
                    pd.Index(cols), cat_num_lim, len(cols))
        msg2 = 'total of {1} unsplittable features = \n {0} ... '.format(
            pd.Index(un_split), len(un_split))
        msg3 = 'total of {} bin_edges obtained \n'.format(len(bin_edges))
        if cols:
            print(msg1)
        if un_split:
            print(msg2)
        if bin_edges:
            print(msg3)
    return bin_edges
def fit(self, X, y=None):
    '''fit input_labels & out_labels
    '''
    X = self._fit(X)
    # drop all-null columns
    na_col = X.columns[X.apply(lambda x: all(x.isna()))]
    X.dropna(axis=1, how='all', inplace=True)
    # drop uid cols or too-discrete data
    # (iterate over a snapshot of columns since we drop inside the loop)
    uid_col = []
    for k in list(X.columns):
        col = X[k]
        if api.is_object_dtype(col):
            if len(pd.unique(col)) > 40:
                X.drop(k, axis=1, inplace=True)
                uid_col.append(k)
        elif api.is_integer_dtype(col):
            if len(pd.unique(col)) > 0.85 * len(col):
                X.drop(k, axis=1, inplace=True)
                uid_col.append(k)
    # drop constant columns
    const_col = []
    for k in list(X.columns):
        col = X[k]
        if (api.is_numeric_dtype(col) and col.std() < 0.01) \
                or len(pd.unique(col)) == 1:
            X.drop(k, axis=1, inplace=True)
            const_col.append(k)
    # filter dtypes
    options = {
        'not_datetime': X.select_dtypes(exclude='datetime').columns,
        'number': X.select_dtypes(include='number').columns,
        'object': X.select_dtypes(include='object').columns,
        'datetime': X.select_dtypes(include='datetime').columns,
        'all': X.columns
    }
    self.objcols = options.get('object')
    self.numcols = options.get('number')
    self.datetimecols = options.get('datetime')

    self.obj_na = _get_imputer(self.na1)
    self.num_na = _get_imputer(self.na2)

    if self.obj_na is not None and not self.objcols.empty:
        self.obj_na.fit(X.reindex(columns=self.objcols))
    if self.num_na is not None and not self.numcols.empty:
        self.num_na.fit(X.reindex(columns=self.numcols))

    self.out_labels = options.get(self.dtype_filter).tolist()
    # --
    if len(na_col) > 0:
        print('{} ...\n total {} columns are null, have been dropped\n'
              .format(na_col, len(na_col)))
    if len(uid_col) > 0:
        print('{} ...\n total {} columns are uid or have too many '
              'discrete categories (>40), have been dropped\n'
              .format(uid_col, len(uid_col)))
    if len(const_col) > 0:
        print('{} ...\n total {} columns are constant, have been dropped\n'
              .format(const_col, len(const_col)))
    if self.verbose > 0:
        for k, i in options.items():
            print('data has {} of {} columns'.format(len(i), k))
        if len(na_col) > 0:
            print('null columns:\n {}'.format(list(na_col)))
    return self
def fit(self, X, y=None):
    '''fit input_labels & out_labels
    '''
    X = self._fit(X, self.na_values)
    # record all-null columns
    na_col = X.columns[X.apply(lambda x: all(x.isna()))]
    length = len(X)
    # drop columns exceeding the na threshold
    thresh = self.na_thresh
    if api.is_integer(thresh):
        pass
    elif api.is_float(thresh):
        thresh = length * thresh
    else:
        msg = "'na_thresh' must be integer or float"
        logger.exception(msg, stack_info=True)
        raise ValueError(msg)
    # `thresh` is the minimum count of non-NA values a column must keep
    X.dropna(axis=1, thresh=thresh, inplace=True)
    # drop constant columns
    # (iterate over a snapshot of columns since we drop inside the loop)
    const_col = []
    for k in list(X.columns):
        col = X[k]
        if (api.is_numeric_dtype(col) and col.std() < 0.01) \
                or len(pd.unique(col)) == 1:
            X.drop(k, axis=1, inplace=True)
            const_col.append(k)
    # drop uid cols or too-discrete data
    uid_col = []
    if self.drop_uid:
        for k in list(X.columns):
            col = X[k]
            if api.is_object_dtype(col) or api.is_integer_dtype(col):
                if len(pd.unique(col)) > self.uniq_frac * len(col):
                    X.drop(k, axis=1, inplace=True)
                    uid_col.append(k)
    # drop categorical columns whose most frequent value is too rare
    count_frac = []
    for k in list(X.columns):
        col = X[k]
        if api.is_object_dtype(col):
            n = len(col)
            max_frac = col.value_counts().max() / n
            if max_frac < self.count_frac:
                X.drop(k, axis=1, inplace=True)
                count_frac.append(k)
    # filter dtypes
    options = {
        'not_datetime': X.select_dtypes(exclude='datetime').columns,
        'number': X.select_dtypes(include='number').columns,
        'object': X.select_dtypes(include='object').columns,
        'datetime': X.select_dtypes(include='datetime').columns,
        'all': X.columns
    }
    self.objcols = options.get('object')
    self.numcols = options.get('number')
    self.datetimecols = options.get('datetime')

    self.obj_na = _get_imputer(self.na1)
    self.num_na = _get_imputer(self.na2)

    # fit na imputer for obj dtype
    if self.obj_na is not None and not self.objcols.empty:
        self.obj_na.fit(X.reindex(columns=self.objcols))
    # fit na imputer for num dtype
    if self.num_na is not None and not self.numcols.empty:
        self.num_na.fit(X.reindex(columns=self.numcols))

    self.out_labels = options.get(self.dtype_filter).tolist()
    # --
    if len(na_col) > 0:
        msg = ('columns {}, total {} columns are null, '
               'have been dropped').format(na_col, len(na_col))
        logger.info(msg)
    if len(uid_col) > 0:
        msg = ('columns {}, total {} columns are uid, '
               'have been dropped').format(uid_col, len(uid_col))
        logger.info(msg)
    if len(const_col) > 0:
        msg = ('columns {}, total {} columns are constant, '
               'have been dropped').format(const_col, len(const_col))
        logger.info(msg)
    logger.info(
        'matrix has valid {} columns; {} numeric columns; '
        '{} categorical columns; {} datetime columns'.format(
            X.shape[-1], len(self.numcols), len(self.objcols),
            len(self.datetimecols)))
    return self
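# Self-contained sketch of the na_thresh logic this fit() relies on:
# dropna(thresh=...) keeps a column only if it has at least `thresh`
# non-NA values, and a float na_thresh is scaled by the row count.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [1, np.nan, np.nan, np.nan],
                   'c': [1, 2, np.nan, np.nan]})

na_thresh = 0.5
thresh = len(df) * na_thresh  # 2.0 -> need at least 2 non-NA values

print(df.dropna(axis=1, thresh=thresh).columns.tolist())  # ['a', 'c']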