def _woe_binning(X, y, q=None, bins=None, max_leaf_nodes=None,
                 cat_num_lim=0, **kwargs):
    '''used by Woe_encoder to get binning edges

    return
    ------
    edges : {colname : [-inf, point1, point2..., inf]}
    '''
    bin_edges = {}
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            label, bin_edges[name] = _binning(col_notna, bins, q,
                                              max_leaf_nodes, y_notna,
                                              **kwargs)
    return bin_edges
def restore_type(self, dtype, sample=None):
    """Restore type from Pandas
    """
    # Pandas types
    if pdc.is_bool_dtype(dtype):
        return 'boolean'
    elif pdc.is_datetime64_any_dtype(dtype):
        return 'datetime'
    elif pdc.is_integer_dtype(dtype):
        return 'integer'
    elif pdc.is_numeric_dtype(dtype):
        return 'number'
    # Python types
    if sample is not None:
        if isinstance(sample, (list, tuple)):
            return 'array'
        elif isinstance(sample, datetime.date):
            return 'date'
        elif isinstance(sample, isodate.Duration):
            return 'duration'
        elif isinstance(sample, dict):
            return 'object'
        elif isinstance(sample, six.string_types):
            return 'string'
        elif isinstance(sample, datetime.time):
            return 'time'
    return 'string'
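# Hedged usage sketch, assuming `restore_type` is reachable on an instance
# named `mapper` (hypothetical) and the module already imports datetime,
# isodate, six and the pandas dtype helpers as pdc.
import datetime
import pandas as pd

print(mapper.restore_type(pd.Series([1, 2]).dtype))   # 'integer'
print(mapper.restore_type(pd.Series([True]).dtype))   # 'boolean'
print(mapper.restore_type(pd.Series([1.5]).dtype))    # 'number'

# object dtype matches no pandas check, so the sample value decides
obj = pd.Series([datetime.date.today()])
print(mapper.restore_type(obj.dtype, sample=obj.iloc[0]))  # 'date'
print(mapper.restore_type(obj.dtype, sample=[1, 2]))       # 'array'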
def to_num_datetime(col, name='array', thresh=0.80, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain
    unchanged

    parameters
    ----------
    col : series, scalar or ndarray
        will be turned into a Series
    name : str
        name of the col series
    thresh : float, default 0.8
        if more than this fraction of X can be converted, commit the
        conversion
    **kwargs
        errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
            - If 'raise', then invalid parsing will raise an exception
            - If 'coerce', then invalid parsing will be set as NaN
            - If 'ignore', then invalid parsing will return the input
        other pandas to_datetime keywords

    return
    ------
    converted series or df
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    if api.is_categorical_dtype(col):
        return col
    if col.count() == 0:
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        params = {'errors': 'coerce', 'infer_datetime_format': True}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
        except Exception:
            pass

    return col
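# A minimal, hedged usage sketch of to_num_datetime, assuming a pandas
# version that still accepts `infer_datetime_format` (pre-2.0).
import pandas as pd

# 4 of 5 values parse as numbers (0.8 >= thresh), so conversion commits
nums = pd.Series(['1', '2', '3', 'oops', '5'])
print(to_num_datetime(nums).dtype)   # float64 ('oops' becomes NaN)

# 4 of 5 values parse as dates, so the column becomes datetime64
dates = pd.Series(['2020-01-01', '2020-02-01', '2020-03-01',
                   'n/a', '2020-04-01'])
print(to_num_datetime(dates).dtype)  # datetime64[ns]

# too few convertible values: returned unchanged
mixed = pd.Series(['a', 'b', '3'])
print(to_num_datetime(mixed).dtype)  # object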
def f(s):
    # `decimals` comes from the enclosing scope
    if api.is_numeric_dtype(s):
        if s.apply(abs).max() <= 1:
            # fractions are rendered as percentages
            s = s.apply(lambda x: str(round(x * 100, decimals)) + '%')
        else:
            # thousands separator with fixed decimal places
            fmt = "{" + ":,.{}f".format(decimals) + "}"
            s = s.apply(lambda x: fmt.format(x))
    return s
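# Hedged demo of the formatter: `decimals` is a free variable from the
# enclosing scope, so we assume decimals = 2 here for illustration.
import pandas as pd
from pandas.api import types as api

decimals = 2

ratios = pd.Series([0.1234, 0.5])
amounts = pd.Series([1234567.891, 42.0])

print(f(ratios).tolist())   # ['12.34%', '50.0%']
print(f(amounts).tolist())  # ['1,234,567.89', '42.00']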
def _mapping_col(col, na_values=['null', '缺失值', -999, -99999, -1]):
    '''encrypt categorical features
    '''
    col = col.replace(na_values, np.nan)
    if not api.is_numeric_dtype(col):
        uniq = col.unique()
        mapper = dict(
            zip(uniq, [''.join(['C', str(i)]) for i in range(len(uniq))]))
        if mapper.get(np.nan) is not None:
            mapper.pop(np.nan)
        col = col.map(mapper, na_action='ignore')
    return col
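# Hedged usage sketch of _mapping_col: category labels are replaced by
# opaque 'C<i>' codes, while values listed in na_values become NaN.
import numpy as np
import pandas as pd

s = pd.Series(['vip', 'normal', 'null', 'vip'])
print(_mapping_col(s).tolist())  # ['C0', 'C1', nan, 'C0']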
def f(ser, kwd):
    '''mask outliers of one column as NaN, using (low, high, flag) bounds
    looked up from ``kwd`` by column name
    '''
    param = kwd.get(ser.name)
    if param is None or not is_numeric_dtype(ser):
        return ser
    else:
        low, high, flag = param
        if low is not None:
            if flag == 'percentage':
                low = ser.quantile(low)
            ser = ser.where(low <= ser, np.nan)
        if high is not None:
            if flag == 'percentage':
                high = ser.quantile(high)
            ser = ser.where(ser <= high, np.nan)
        return ser
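# Hedged usage sketch: `kwd` maps column name -> (low, high, flag); with
# flag='percentage' the bounds are read as quantiles, anything else is
# treated as absolute bounds. Out-of-range values become NaN.
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

age = pd.Series([18, 25, 40, 200], name='age')

# absolute bounds: values outside [0, 120] are masked
print(f(age, {'age': (0, 120, 'absolute')}).tolist())
# -> [18.0, 25.0, 40.0, nan]

# quantile bounds: mask below the 5th / above the 95th percentile
print(f(age, {'age': (0.05, 0.95, 'percentage')}).tolist())
# -> [nan, 25.0, 40.0, nan]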
def _get_binning(X, y, q=None, bins=None, max_leaf_nodes=None, mono=None,
                 cat_num_lim=0, **kwargs):
    '''used by Woe_encoder to get binning edges

    Parameters
    ----------
    X : DataFrame
    y : binary target

    return
    ------
    edges : {colname : [-inf, point1, point2..., inf]}
    '''
    bin_edges = {}
    # reset index so that X and y align row by row
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            label, bin_edges[name] = binning(col_notna, bins, q,
                                             max_leaf_nodes, mono, y_notna,
                                             **kwargs)
    return bin_edges
def plotter_score_path(df_score, title=None, cm=None, style='-.o'):
    '''plot the score path of each metric

    df_score : DataFrame of scores of metrics
    '''
    # plot numeric columns only
    data = df_score.select_dtypes(include='number')
    n = len(data.columns)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    if cm is None:
        cm = plt.get_cmap('tab10')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]
    i = 0
    for ax0, col in zip(ax, data.columns):
        s = data[col]
        if api.is_numeric_dtype(s):
            s.plot(ax=ax0, color=cmlist[i], style=style)
            ax0.fill_between(s.index, s - s.std(), s + s.std(),
                             color='grey', alpha=.3,
                             label=r'{} = {}$\pm$ {}'.format(
                                 col, round(s.mean(), 4),
                                 round(s.std(), 4)))
            plt.setp(ax0, ylabel=col)
            h, l = ax0.get_legend_handles_labels()
            ax0.legend([h[-1]], [l[-1]])
            i += 1
    ax[0].set_title(title)
    ax[-1].set_xlabel('index')
    plt.tight_layout(rect=(0, 0, 0.98, 0.96))
    return fig
def plotter_cv_results_(results, train_style='mo-', test_style='go-.',
                        title=None):
    '''plot univariate-parameter cross-validated results after grid search
    of a model

    return
    ------
    ax, or tuple of ax
    '''
    scoring = results.filter(like='mean_train_').columns
    scoring = [i.replace('mean_train_', '') for i in scoring]
    df_param = results.filter(like='param_')
    param_array = df_param.columns
    if len(param_array) > 1:
        print('multi-parameter is encountered ... ')
        print(df_param.apply(lambda x: pd.Series(pd.unique(x))))
    # plot
    n = len(scoring)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    for s, ax0 in zip(scoring, ax):
        df = results[['mean_train_' + s, 'mean_test_' + s, 'std_test_' + s]]
        if len(param_array) == 1:
            df.index = results[param_array[0]]
            xlabel = param_array[0]
            num_param = api.is_numeric_dtype(df.index)
            if not num_param:
                df.index = np.arange(len(df.index))
        else:
            xlabel = ' + '.join([i.split('__')[-1] for i in param_array])
        df.sort_index(inplace=True)
        # plot train/test means with a band of +/- 1 std around test
        mean = df['mean_test_' + s].values
        std = df.pop('std_test_' + s)
        x = df.index.values
        df.plot.line(style=[train_style, test_style], ax=ax0)
        ax0.fill_between(x, mean - std, mean + std, color='grey', alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        # annotate the best mean test score
        x_max = df.index[np.argmax(mean)]
        best_score = np.max(mean)
        std = np.mean(std)
        h, l = ax0.get_legend_handles_labels()
        ax0.legend([h[-1]],
                   [r'score_max= %0.4f $\pm$ %0.2f' % (best_score, std)])
        ax0.axvline(x_max, linestyle='--', marker='x', color='y')
        ax0.annotate("%0.4f" % best_score, (x_max, best_score))
        ax0.set_xlim(x.min() - 0.5, x.max() + 0.5)
        plt.setp(ax0, ylabel=s)
    # set title
    ax[0].set_title(title, fontsize=13)
    # use fig legend
    fig.legend(h, ('train', 'test', r'$\pm$ 1 std. dev.'),
               loc='upper right', ncol=3, bbox_to_anchor=(0.98, 1))
    ax[-1].set_xlabel(xlabel)
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    return ax
def to_num_datetime(col, name='array', thresh=0.75, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain
    unchanged

    parameters
    ----------
    col : series, scalar or ndarray
        input sequence
    name : str
        name of the col series
    thresh : float
        default 0.75, if more than this fraction of X can be converted,
        commit the conversion

    keyword args
    ------------
    errors : {'ignore', 'raise', 'coerce'}
        default 'coerce'
        If 'raise', then invalid parsing will raise an exception
        If 'coerce', then invalid parsing will be set as NaN
        If 'ignore', then invalid parsing will return the input
    other pandas to_datetime keywords

    return
    ------
    s : series
        converted col
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    if api.is_categorical_dtype(col):
        return col
    if col.count() == 0:
        return col
    # keep zero-padded code strings (e.g. '007') as-is
    if col.astype(str).str.contains(r'^0\d+$').any():
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        params = {'errors': 'coerce', 'infer_datetime_format': True}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
            else:
                col = col.apply(lambda x: x if pd.isna(x) else str(x))
        except Exception:
            pass

    return col
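# Hedged sketch of the zero-padded guard added in this variant: columns
# that look like zero-padded code strings are never converted.
import pandas as pd

codes = pd.Series(['007', '042', '123'])
print(to_num_datetime(codes).tolist())  # ['007', '042', '123'] unchanged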
def bin_tree(X, y, cat_num_lim=0, max_leaf_nodes=10, min_samples_leaf=0.05,
             random_state=0, verbose=0, **kwargs):
    '''Discretize a feature matrix based on a binary DecisionTree classifier

    .. note::

        CART tree with gini impurity as criterion; non-numeric dtype
        columns will be ignored, as will columns whose unique value count
        does not exceed "cat_num_lim"

    parameters
    ----------
    X : 2d array or DataFrame
        feature matrix, should be numerical dtype
    y : str
        col of class label, binary
    cat_num_lim :
        number of unique values a column must exceed to be treated as a
        continuous feature
    max_leaf_nodes :
        max number of bins
    min_samples_leaf :
        minimum number of samples in a leaf node
    **kwargs :
        other tree keywords

    return
    ------
    bin_edges : dict
        {'col_name' : bin_edges}
    '''
    bin_edges = {}
    cols = []
    un_split = []
    # reset index so that X and y align row by row
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            # call _tree_univar_bin
            bin_edges[name] = _tree_univar_bin(
                col_notna,
                y_notna,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
                **get_kwargs(DecisionTreeClassifier, **kwargs))
            if len(bin_edges[name]) < 3:
                un_split.append(name)
        else:
            cols.append(name)
    # log process
    msg1 = ('total of {2} unchanged (unique counts less than {1} or '
            'categorical dtype) =\n "{0}"').format(
                pd.Index(cols), cat_num_lim, len(cols))
    msg2 = 'total of {1} unsplittable features = \n {0} ... '.format(
        pd.Index(un_split), len(un_split))
    if cols:
        logger.info(msg1)
    if un_split:
        logger.info(msg2)
    return bin_edges
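# Usage sketch on synthetic data, assuming the module's internal helpers
# (_tree_univar_bin, get_kwargs, logger, DecisionTreeClassifier) are in
# scope; the object column is skipped, the numeric one is binned.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({'score': rng.normal(size=1000),
                  'grade': rng.choice(list('ABC'), size=1000)})
y = pd.Series((X['score'] > 0).astype(int))

edges = bin_tree(X, y, max_leaf_nodes=4)
print(edges['score'])  # tree-chosen split points for 'score'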
def binning(y_pre=None, bins=None, q=None, max_leaf_nodes=None, mono=None,
            y_true=None, labels=None, **kwargs):
    '''supervised binning of y_pre based on y_true, if y_true is not None

    .. _binningmeth:

    parameters
    ----------
    y_pre : 1d array_like
        value of y to be cut
    y_true : 1d array_like
        binary target y_true for supervised cutting
    bins : int
        number of equal-width bins
    q : int
        number of equal-frequency bins
    max_leaf_nodes : int
        number of tree-node bins; if not None, use supervised cutting
        based on a decision tree
    mono : int
        number of bins that increase monotonically with the "y" mean value

        .. note::

            arguments [q, bins, max_leaf_nodes, mono] control the binning
            method and only 1 of them can be specified; if not valid,
            assign q=10 and bins=max_leaf_nodes=mono=None
    labels : bool
        see pd.cut; if False return integer indicators of bins, if True
        return arrays of labels (or labels can be supplied manually)

    Keyword args
    ------------
    kwargs : decision tree keywords
        min_impurity_decrease=0.001
        random_state=0

    return
    ------
    y_binlabel : array
        bin label of y_pre
    bin_edge : array
        ndarray of bin edges
    '''
    bins, q, max_leaf_nodes, mono = _check_binning_keywords(
        bins, q, max_leaf_nodes, mono)
    y_pre = to_num_datetime(y_pre)
    y_pre_input = y_pre.copy()
    if y_true is not None:
        y_true = to_num_datetime(y_true)
        y_true = np.array(y_true)
    # drop na values for y_pre & y_true pairs in case of supervised cutting
    df = pd.DataFrame({'ypre': np.array(y_pre), 'ytrue': y_true})
    df = df.dropna(subset=['ypre'])
    y_pre = df.pop('ypre')
    y_true = df.pop('ytrue')
    # if y_pre is not a numeric data type, do not perform cut
    if not api.is_numeric_dtype(y_pre):
        return y_pre_input, y_pre.unique()

    if q is not None:
        bins = np.percentile(y_pre, np.linspace(0, 100, q + 1))
        bins[0] = -np.inf
        bins[-1] = np.inf
        bins = np.unique(bins)

    if max_leaf_nodes is not None:
        if y_true.isna().sum() > 0:
            raise ValueError('non-nan y_true must be supplied for tree cut')
        y_pre0 = pd.DataFrame(y_pre)
        bins_dict = bin_tree(y_pre0, y_true,
                             max_leaf_nodes=max_leaf_nodes, **kwargs)
        bins = list(bins_dict.values())[0]

    if mono is not None:
        if y_true.isna().sum() > 0:
            raise ValueError('non-nan y_true must be supplied for mono cut')
        bins = _mono_cut(Y=y_true, X=y_pre)

    if isinstance(bins, int):
        bins = np.linspace(np.min(y_pre), np.max(y_pre), bins + 1)
        bins[0] = -np.inf
        bins[-1] = np.inf

    if bins is None:
        raise ValueError('no cutting bins supplied')

    if labels is True:
        labels = None

    y_binlabel, bin_edge = pd.cut(y_pre_input, bins, duplicates='drop',
                                  retbins=True, labels=labels)
    return y_binlabel, bin_edge
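# Hedged usage sketch, assuming the module's helpers
# (_check_binning_keywords, to_num_datetime, bin_tree, _mono_cut) are in
# scope. An unsupervised equal-frequency cut only needs `q`.
import numpy as np

scores = np.random.RandomState(0).uniform(300, 900, size=1000)

label, edges = binning(scores, q=5)
print(edges)                 # 6 edges, first/last forced to -inf/+inf
print(label.value_counts())  # roughly 200 observations per bin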
def dtype_specific_binary(left, right, numerics, datetimes, bools, strings,
                          categoricals, intervals, errors='ignore'):
    """
    A low-level base binary function that dispatches to different binary
    operations based on the dtypes of the left and right inputs. Can be
    used with functools.partial to create a custom binary function that
    can be passed to apply_columnwise or the higher-order compare_values
    utilities found elsewhere in this module. See examples for more
    details.

    This function supports 6 distinct groups of pandas dtypes, which are
    validated using the corresponding helpers provided by the
    pandas.core.dtypes API:

    1) numeric - is_numeric_dtype
    2) datetime-like - is_datetime64_any_dtype or is_timedelta64_dtype
    3) bool - is_bool_dtype
    4) string - is_string_dtype
    5) categorical - is_categorical_dtype
    6) interval - is_interval_dtype

    If no supported dtype is matched, or `left` & `right` do not have
    matching dtypes, a pd.Series of NaN values is returned, unless
    errors='raise' in which case a ValueError is raised.

    Parameters
    ----------
    left : pd.Series, pd.DataFrame, np.ndarray
    right : pd.Series, pd.DataFrame, np.ndarray
    numerics : binary callable applied to numeric dtypes
    datetimes : binary callable applied to datetime-like objects
    bools : binary callable applied to bool dtypes
    strings : binary callable applied to string-like dtypes
    categoricals : binary callable applied to Categorical dtype
    intervals : binary callable applied to Interval dtype
    errors : str
        default 'ignore' issues a warning and returns NaNs when the dtypes
        of left and right do not match; if 'raise' is passed, a ValueError
        is raised in such cases

    Returns
    -------
    result of applying a specific binary callable to `left` and `right`
    inputs based on dtype
    """
    _ld = left.dtype
    _rd = right.dtype
    if is_numeric_dtype(_ld) and is_numeric_dtype(_rd):
        return numerics(left, right)
    elif ((is_datetime64_any_dtype(_ld) or is_timedelta64_dtype(_ld))
          and (is_datetime64_any_dtype(_rd) or is_timedelta64_dtype(_rd))):
        return datetimes(left, right)
    elif is_bool_dtype(_ld) and is_bool_dtype(_rd):
        return bools(left, right)
    elif is_string_dtype(_ld) and is_string_dtype(_rd):
        return strings(left, right)
    elif is_categorical_dtype(_ld) and is_categorical_dtype(_rd):
        return categoricals(left, right)
    elif is_interval_dtype(_ld) and is_interval_dtype(_rd):
        return intervals(left, right)
    else:
        # by default when dtypes are mismatched we issue a warning and
        # return NaNs; raise if the user requires it
        if errors == 'raise':
            raise ValueError(
                f"left and right do not have matching supported dtypes: "
                f"{_ld.name}, {_rd.name}")
        else:
            warnings.warn(
                f"left: {left.name}, {_ld.name} and right: {right.name}, "
                f"{_rd.name} do not have comparable dtypes, returning NaNs")
            return pd.Series(np.nan, index=right.index)
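# The docstring points at functools.partial; a hedged sketch follows (the
# comparator choices here are illustrative, not this module's own):
import functools
import numpy as np
import pandas as pd

compare = functools.partial(
    dtype_specific_binary,
    numerics=lambda l, r: (l - r).abs(),   # numeric: absolute difference
    datetimes=lambda l, r: l == r,
    bools=lambda l, r: l == r,
    strings=lambda l, r: l == r,
    categoricals=lambda l, r: l == r,
    intervals=lambda l, r: l == r,
)

a = pd.Series([1.0, 2.0, 3.5], name='a')
b = pd.Series([1.0, 2.5, 3.0], name='b')
print(compare(a, b).tolist())      # [0.0, 0.5, 0.5]

# mismatched dtypes warn and return NaNs under errors='ignore'
c = pd.Series(['x', 'y', 'z'], name='c')
print(compare(a, c).isna().all())  # True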
def bin_tree(X, y, cat_num_lim=0, max_leaf_nodes=10, min_samples_leaf=0.05,
             random_state=0, verbose=0, **kwargs):
    '''discretize features based on univariate runs of a DecisionTree
    classifier (CART tree with gini impurity as criterion; non-numeric
    dtypes will be ignored, as will columns whose unique value count does
    not exceed "cat_num_lim")

    df_X : df, contains feature matrix, should be numerical dtype
    y : col of class label, binary
    cat_num_lim=10 : number of unique values a column must exceed to be
        treated as a continuous feature
    max_leaf_nodes=5 : max number of bins
    min_samples_leaf=0.1 : minimum number of samples in a leaf node
    **kwargs : other tree keywords

    return
    ------
    bin_edges : dict of {'col_name' : bin_edges}
    '''
    bin_edges = {}
    cols = []
    un_split = []
    for name, col in X.items():
        df = pd.DataFrame({'x': col, 'y': y})
        col_notna = df.dropna().x
        y_notna = df.dropna().y
        if (len(pd.unique(col_notna)) > cat_num_lim
                and api.is_numeric_dtype(col_notna)):
            # call _tree_univar_bin
            bin_edges[name] = _tree_univar_bin(
                col_notna,
                y_notna,
                max_leaf_nodes=max_leaf_nodes,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
                **get_kwargs(DecisionTreeClassifier, **kwargs))
            if len(bin_edges[name]) < 3:
                un_split.append(name)
        else:
            cols.append(name)

    if verbose > 0:
        msg1 = ('total of {2} unchanged (unique counts less than {1} or '
                'categorical dtype) =\n "{0}"').format(
                    pd.Index(cols), cat_num_lim, len(cols))
        msg2 = 'total of {1} unsplittable features = \n {0} ... '.format(
            pd.Index(un_split), len(un_split))
        msg3 = 'total of {} bin_edges obtained \n'.format(len(bin_edges))
        if cols:
            print(msg1)
        if un_split:
            print(msg2)
        if bin_edges:
            print(msg3)
    return bin_edges
def fit(self, X, y=None):
    '''fit input_labels & out_labels
    '''
    X = self._fit(X)
    # drop all-null columns
    na_col = X.columns[X.apply(lambda x: all(x.isna()))]
    X.dropna(axis=1, how='all', inplace=True)
    # drop uid cols or too-discrete data
    # (iterate over a snapshot of columns since we drop inside the loop)
    uid_col = []
    for k in list(X.columns):
        col = X[k]
        if api.is_object_dtype(col):
            if len(pd.unique(col)) > 40:
                X.drop(k, axis=1, inplace=True)
                uid_col.append(k)
        elif api.is_integer_dtype(col):
            if len(pd.unique(col)) > 0.85 * len(col):
                X.drop(k, axis=1, inplace=True)
                uid_col.append(k)
    # drop constant columns
    const_col = []
    for k in list(X.columns):
        col = X[k]
        if (api.is_numeric_dtype(col) and col.std() < 0.01) \
                or len(pd.unique(col)) == 1:
            X.drop(k, axis=1, inplace=True)
            const_col.append(k)
    # filter dtypes
    options = {
        'not_datetime': X.select_dtypes(exclude='datetime').columns,
        'number': X.select_dtypes(include='number').columns,
        'object': X.select_dtypes(include='object').columns,
        'datetime': X.select_dtypes(include='datetime').columns,
        'all': X.columns
    }
    self.objcols = options.get('object')
    self.numcols = options.get('number')
    self.datetimecols = options.get('datetime')

    self.obj_na = _get_imputer(self.na1)
    self.num_na = _get_imputer(self.na2)

    if self.obj_na is not None and not self.objcols.empty:
        self.obj_na.fit(X.reindex(columns=self.objcols))
    if self.num_na is not None and not self.numcols.empty:
        self.num_na.fit(X.reindex(columns=self.numcols))

    self.out_labels = options.get(self.dtype_filter).tolist()
    # --
    if len(na_col) > 0:
        print('{} ...\n total {} columns are null, have been dropped\n'
              .format(na_col, len(na_col)))
    if len(uid_col) > 0:
        print('{} ...\n total {} columns are uid or have too many '
              'discrete categories (>40), have been dropped\n'
              .format(uid_col, len(uid_col)))
    if len(const_col) > 0:
        print('{} ...\n total {} columns are constant, have been dropped\n'
              .format(const_col, len(const_col)))
    if self.verbose > 0:
        for k, i in options.items():
            print('data has {} of {} columns'.format(len(i), k))
        if len(na_col) > 0:
            print('null columns:\n {}'.format(list(na_col)))
    return self
def fit(self, X, y=None):
    '''fit input_labels & out_labels
    '''
    X = self._fit(X, self.na_values)
    # record all-null columns
    na_col = X.columns[X.apply(lambda x: all(x.isna()))]
    length = len(X)
    # drop columns exceeding the na threshold
    thresh = self.na_thresh
    if api.is_integer(thresh):
        pass
    elif api.is_float(thresh):
        thresh = length * thresh
    else:
        msg = "'na_thresh' must be integer or float"
        logger.exception(msg, stack_info=True)
        raise ValueError(msg)
    # `thresh` is the minimum count of non-NA values a column must keep
    X.dropna(axis=1, thresh=thresh, inplace=True)
    # drop constant columns
    # (iterate over a snapshot of columns since we drop inside the loop)
    const_col = []
    for k in list(X.columns):
        col = X[k]
        if (api.is_numeric_dtype(col) and col.std() < 0.01) \
                or len(pd.unique(col)) == 1:
            X.drop(k, axis=1, inplace=True)
            const_col.append(k)
    # drop uid cols or too-discrete data
    uid_col = []
    if self.drop_uid:
        for k in list(X.columns):
            col = X[k]
            if api.is_object_dtype(col) or api.is_integer_dtype(col):
                if len(pd.unique(col)) > self.uniq_frac * len(col):
                    X.drop(k, axis=1, inplace=True)
                    uid_col.append(k)
    # drop categorical columns whose most frequent value is too rare
    count_frac = []
    for k in list(X.columns):
        col = X[k]
        if api.is_object_dtype(col):
            n = len(col)
            max_frac = col.value_counts().max() / n
            if max_frac < self.count_frac:
                X.drop(k, axis=1, inplace=True)
                count_frac.append(k)
    # filter dtypes
    options = {
        'not_datetime': X.select_dtypes(exclude='datetime').columns,
        'number': X.select_dtypes(include='number').columns,
        'object': X.select_dtypes(include='object').columns,
        'datetime': X.select_dtypes(include='datetime').columns,
        'all': X.columns
    }
    self.objcols = options.get('object')
    self.numcols = options.get('number')
    self.datetimecols = options.get('datetime')

    self.obj_na = _get_imputer(self.na1)
    self.num_na = _get_imputer(self.na2)

    # fit na imputer for obj dtype
    if self.obj_na is not None and not self.objcols.empty:
        self.obj_na.fit(X.reindex(columns=self.objcols))
    # fit na imputer for num dtype
    if self.num_na is not None and not self.numcols.empty:
        self.num_na.fit(X.reindex(columns=self.numcols))

    self.out_labels = options.get(self.dtype_filter).tolist()
    # --
    if len(na_col) > 0:
        msg = ('columns {}, total {} columns are null, '
               'have been dropped').format(na_col, len(na_col))
        logger.info(msg)
    if len(uid_col) > 0:
        msg = ('columns {}, total {} columns are uid, '
               'have been dropped').format(uid_col, len(uid_col))
        logger.info(msg)
    if len(const_col) > 0:
        msg = ('columns {}, total {} columns are constant, '
               'have been dropped').format(const_col, len(const_col))
        logger.info(msg)
    logger.info(
        'matrix has valid {} columns; {} numeric columns; '
        '{} categorical columns; {} datetime columns'.format(
            X.shape[-1], len(self.numcols), len(self.objcols),
            len(self.datetimecols)))
    return self
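# Self-contained sketch of the na_thresh logic this fit() relies on:
# dropna(thresh=...) keeps a column only if it has at least `thresh`
# non-NA values, and a float na_thresh is scaled by the row count.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [1, np.nan, np.nan, np.nan],
                   'c': [1, 2, np.nan, np.nan]})

na_thresh = 0.5
thresh = len(df) * na_thresh  # 2.0 -> need at least 2 non-NA values

print(df.dropna(axis=1, thresh=thresh).columns.tolist())  # ['a', 'c']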