Example 1
def warm2Columns(dfSrc,column1,column2,badFlag,binSize=10,countLimit=200):
    df = dfSrc[[column1,column2,badFlag]].copy()
    bins1 = np.unique(algos.quantile(df[column1], np.linspace(0, 1, binSize+1)))
    bins2 = np.unique(algos.quantile(df[column2], np.linspace(0, 1, binSize+1)))
    df[column1+'_bin'] = pd.tools.tile._bins_to_cuts(df[column1], bins1, include_lowest=True)
    df[column2+'_bin'] = pd.tools.tile._bins_to_cuts(df[column2], bins2, include_lowest=True)
    pvMean = df.pivot_table(badFlag,column1+'_bin',column2+'_bin',np.mean).fillna(0)
    pvSize = df.pivot_table(badFlag,column1+'_bin',column2+'_bin',np.size).fillna(0)
    
    if checkIndex(pvSize):
        for ind in pvSize.index:
            for col in pvSize.columns:
                if np.isnan(pvSize.loc[ind,col].values[0][0]):
                    pvMean.loc[ind,col]=0
                elif pvSize.loc[ind,col].values[0][0]<countLimit:
                    pvMean.loc[ind,col]=0
    else:
        for ind in pvSize.index:
            for col in pvSize.columns:
                if np.isnan(pvSize.loc[ind,col]):
                    pvMean.loc[ind,col]=0
                elif pvSize.loc[ind,col] < countLimit:
                    pvMean.loc[ind,col]=0

    ss = sns.heatmap(pvMean,annot=True)
    return pvMean,pvSize
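
Example 1 leans on two private pandas helpers, algos.quantile and pd.tools.tile._bins_to_cuts, that no longer exist in modern pandas. A minimal sketch of the same two-way quantile binning using only public API (heat_two_columns is a hypothetical name; sns.heatmap(pv_mean, annot=True) can be called on the result exactly as above):

import pandas as pd

def heat_two_columns(df_src, col1, col2, bad_flag, bin_size=10, count_limit=200):
    df = df_src[[col1, col2, bad_flag]].copy()
    # pd.qcut(..., duplicates='drop') replaces algos.quantile + _bins_to_cuts
    df[col1 + '_bin'] = pd.qcut(df[col1], bin_size, duplicates='drop')
    df[col2 + '_bin'] = pd.qcut(df[col2], bin_size, duplicates='drop')
    pv_mean = df.pivot_table(values=bad_flag, index=col1 + '_bin',
                             columns=col2 + '_bin', aggfunc='mean', observed=False)
    pv_size = df.pivot_table(values=bad_flag, index=col1 + '_bin',
                             columns=col2 + '_bin', aggfunc='count', observed=False)
    # zero out empty or sparse cells, as the nested loops above do
    pv_mean = pv_mean.where(pv_size >= count_limit, 0).fillna(0)
    return pv_mean, pv_size
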
Example 2
def show_orders_hist(order_pd, s_list=None, q_default=10):

    if s_list is None:
        s_list = ['lowBkCnt', 'atr_std', 'jump_power', 'diff_days',
                  'wave_score1', 'wave_score2', 'wave_score3',
                  'deg_60WindowPd', 'deg_hisWindowPd', 'deg_windowPd']

    s_list = filter(lambda x: order_pd.columns.tolist().count(x) > 0, s_list)
    for sn in s_list:
        uq = len(np.unique(order_pd[sn]))
        if uq == 1:
            continue

        bins = 10
        bins = uq // 50 if uq // 50 > bins else bins
        order_pd[sn].hist(bins=bins)
        plt.show()

        try:
            cats = pd.qcut(order_pd[sn], q_default)
        except Exception:
            # a value repeats more often than the quantile width allows,
            # so qcut cannot split into q equal-frequency bins
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(order_pd[sn]), np.linspace(0, 1, q_default + 1))
            cats = pd.tools.tile._bins_to_cuts(order_pd[sn], bins, include_lowest=True)
            # ZLog.info(sn + ' qcut except use bins!')
        ZLog.info('{0} show hist and qcuts'.format(sn))
        ZLog.info(cats.value_counts())
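
The try/except in Example 2 is the classic workaround for pd.qcut raising on non-unique quantile edges. Since pandas 0.20 the same fallback is built in as duplicates='drop'; a tiny sketch:

import pandas as pd

s = pd.Series([0, 0, 0, 0, 0, 1, 2, 3])   # heavy ties collapse the decile edges
# pd.qcut(s, 10)                          # raises: bin edges must be unique
cats = pd.qcut(s, 10, duplicates='drop')  # drops the duplicate edges instead
print(cats.value_counts())
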
Example 3
def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]
    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3], dtype=int64)
    """
    x_is_series, series_index, name, x = _preprocess_for_cut(x)

    x, dtype = _coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype)

    return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name)

def rmg_qcut(x, q, labels=None, retbins=False, precision=3):
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    bins = np.unique(bins)
    return pandas.tools.tile._bins_to_cuts(x, bins, labels=labels, retbins=retbins,
        precision=precision, include_lowest=True)
Example 5
    def test_qcut(self):
        arr = np.random.randn(1000)

        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
        assert_almost_equal(bins, ex_bins)

        ex_levels = cut(arr, ex_bins, include_lowest=True)
        self.assert_(np.array_equal(labels, ex_levels))
Example 6
    def test_qcut(self):
        arr = np.random.randn(1000)

        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
        tm.assert_almost_equal(bins, ex_bins)

        ex_levels = cut(arr, ex_bins, include_lowest=True)
        self.assert_categorical_equal(labels, ex_levels)
Example 7
    def test_qcut(self):
        arr = np.random.randn(1000)

        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
        ex_bins[0] -= (arr.max() - arr.min()) * 0.001
        assert_almost_equal(bins, ex_bins)

        ex_levels = cut(arr, ex_bins)
        self.assert_(np.array_equal(labels, ex_levels))
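
The ex_bins[0] adjustment in Example 7 reproduces how early versions of qcut (see Example 13) widened the lowest edge by 0.1% of the data range so the minimum falls inside the first right-closed interval; later versions replaced the trick with include_lowest=True. A small sketch of the effect:

import numpy as np
import pandas as pd

arr = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
edges = np.quantile(arr, [0, .25, .5, .75, 1.])
edges[0] -= 0.001 * (arr.max() - arr.min())  # nudge the lower edge below the minimum
print(pd.cut(arr, edges).value_counts())     # the minimum is binned instead of NaN
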
Example 8
    def __init__(self, df):
        df.columns = [x.lower() for x in df.columns]

        target = [col for col in df.columns if 'target' in col]
        target = ''.join(target)

        dfdropped = df[['id', target]]
        to_drop = ['id', target]
        dfc = df.drop(to_drop, axis=1)

        #num_variables = len(dfc.columns.tolist()) - 5
        #r = random.randint(0, num_variables)
        #print(r, r+5)

        dfc_subset = pd.DataFrame()
        col_names = dfc.columns.tolist()
        print(col_names)
        print(type(col_names))
        for i in range(len(col_names)):
            if dfc[col_names[i]].max() - dfc[col_names[i]].min() > 5:
                dfc_subset[col_names[i]] = df[col_names[i]]
        print(dfc_subset)
        df_names = dfc_subset.columns.values
        index = len(df_names) - 1

        #CREATION OF VARIABLES
        for i in np.delete(df_names, index):
            for j in np.delete(df_names, index):
                dfc[i+"/"+j] = np.where(dfc[j]==0,0,dfc[i]/dfc[j])
                dfc[i+"*"+j] = dfc[i]*dfc[j]
                dfc[i+"-"+j] = dfc[i]-dfc[j]
                dfc[i+"+"+j] = dfc[i]+dfc[j]

        #DUMMY AND BINNING
        col_names_subset = dfc_subset.columns.tolist()
        dfc_bins = pd.DataFrame()
        for col in range(len(col_names_subset)):
            bins = algos.quantile(np.unique(dfc_subset[col_names_subset[col]]), np.linspace(0, 1, 11))
            result = pd.tools.tile._bins_to_cuts(dfc_subset[col_names_subset[col]], bins, include_lowest=True)
            dfc_bins[col_names_subset[col]] = result
            #dfc_bins = pd.qcut(dfc_subset[col_names_subset[col]], 5)


        print("\n Exported csv file with variables created from all possible ratios, binning, and created dummy variables for a subset of variables")

        autocreated = pd.concat([dfdropped, dfc, dfc_bins], axis=1)
        autocreated.to_csv("autocreatedvariablesdataset.csv")
Example 9
 def make_boost_dummies(self, orderPd, cats_ss, prefix, regex):
     try:
         cats = pd.qcut(cats_ss, self.qcut_bins)
     except Exception:
         # a value repeats more often than the quantile width allows, so qcut cannot split
         import pandas.core.algorithms as algos
         bins = algos.quantile(np.unique(cats_ss), np.linspace(0, 1, self.qcut_bins + 1))
         cats = pd.tools.tile._bins_to_cuts(cats_ss, bins, include_lowest=True)
         ZLog.info(prefix + ' qcut except use bins!')
Example 10
    def test_qcut(self):
        arr = np.random.randn(1000)

        # We store the bins as an Index that has been rounded,
        # so comparisons are a bit tricky.
        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
        result = labels.categories.left.values
        assert np.allclose(result, ex_bins[:-1], atol=1e-2)
        result = labels.categories.right.values
        assert np.allclose(result, ex_bins[1:], atol=1e-2)

        ex_levels = cut(arr, ex_bins, include_lowest=True)
        tm.assert_categorical_equal(labels, ex_levels)
Example 11
def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    cat : Categorical or Series
        Returns a Series of type category if input is a Series else Categorical.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    res = _bins_to_cuts(x, bins, labels=labels, retbins=retbins,precision=precision,
                        include_lowest=True)
    if isinstance(x, Series):
        res = Series(res, index=x.index)
    return res
Example 12
def qcutnew(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.
    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels
    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.
    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts_new(x, bins, labels=labels, retbins=retbins,
                         precision=precision, include_lowest=True)
Example 13
def qcut(x, q=4, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce 1000 integers from 0 to 9
    indicating the quantile membership of each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. Array of
        quantiles must span [0, 1]
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.

    Returns
    -------

    Notes
    -----

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    bins[0] -= 0.001 * (x.max() - x.min())

    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision)
Example 14
def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce 1000 integers from 0 to 9
    indicating the quantile membership of each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.

    Returns
    -------
    cat : Categorical

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    """
    if com.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision, include_lowest=True)
Example 15
def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)
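
A quick usage sketch of what this final version adds over the older ones: labels=False returns integer bin codes, retbins=True also returns the computed edges, and duplicates='drop' absorbs the tie-handling fallback that Examples 2, 9, 22 and 27 implement by hand:

import pandas as pd

codes, edges = pd.qcut(range(10), 4, labels=False, retbins=True)
print(codes)   # [0 0 0 1 1 2 2 3 3 3]: quartile membership per value
print(edges)   # the five quartile edges, lower bound nudged below the minimum
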
Example 16
    def autoBinarize(self):
        self.MinValue = self.data[self.var_name].min()
        self.AvgValue = self.data[self.var_name].mean()
        self.MedianValue = self.data[self.var_name].median()
        self.MaxValue = self.data[self.var_name].max()

        justmiss = self.data.loc[self.data[self.var_name].isnull(),
                                 [self.var_name, self.target]]
        notmiss = self.data.loc[self.data[self.var_name].notnull(),
                                [self.var_name, self.target]]

        r = 0
        n = 20
        best_r = 0
        best_n = 0

        if (notmiss.shape[0] < self.Total * 0.005):
            # non-null records are fewer than 0.5% of Total
            d1 = pd.DataFrame({
                "X":
                notmiss[self.var_name],
                "Y":
                notmiss[self.target],
                "Bucket":
                pd.qcut(notmiss[self.var_name], 1, duplicates='drop')
            })
            d2 = d1.groupby('Bucket', as_index=True)
        else:
            while ((np.abs(r) < 0.99999) and (n > 0)):
                try:
                    d1 = pd.DataFrame({
                        "X":
                        notmiss[self.var_name],
                        "Y":
                        notmiss[self.target],
                        "Bucket":
                        pd.qcut(notmiss[self.var_name], n, duplicates='drop')
                    })
                    d2 = d1.groupby('Bucket', as_index=True)
                    r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
                    if np.abs(r) > np.abs(best_r):
                        best_r = r
                        best_n = n
                    n = n - 1
                except Exception as e:
                    print("Exception for variable %s step n = %i: %s" %
                          (self.var_name, n, e))
                    n = n - 1

            if len(d2) == 1:
                try:
                    n = best_n
                    bins = algos.quantile(notmiss[self.var_name],
                                          np.linspace(0, 1, n))
                    if len(np.unique(bins)) == 2:
                        bins = np.insert(bins, 0, 1)
                        bins[1] = bins[1] - (bins[1] / 2)
                    d1 = pd.DataFrame({
                        "X":
                        notmiss[self.var_name],
                        "Y":
                        notmiss[self.target],
                        "Bucket":
                        pd.cut(notmiss[self.var_name],
                               np.unique(bins),
                               include_lowest=True)
                    })
                    d2 = d1.groupby('Bucket', as_index=True)
                except Exception as e:
                    print("Exception for variable %s step n = %i: %s" %
                          (self.var_name, n, e))
                    d1 = pd.DataFrame({
                        "X":
                        notmiss[self.var_name],
                        "Y":
                        notmiss[self.target],
                        "Bucket":
                        pd.qcut(notmiss[self.var_name], 1, duplicates='drop')
                    })
                    d2 = d1.groupby('Bucket', as_index=True)

        self.intervals['Variable'] = self.var_name
        self.intervals['MinValue'] = d2.min().X
        self.intervals['MaxValue'] = d2.max().X

        self.intervals['Interval'] = [
            ' - '.join(str(x) for x in y) for y in map(
                tuple, self.intervals[['MinValue', 'MaxValue']].values)
        ]

        self.intervals['Total'] = d2.count().Y
        self.intervals['Bads'] = d2.sum().Y
        self.intervals.loc[np.isnan(self.intervals['Bads']), 'Bads'] = 0
        self.intervals['Goods'] = d2.count().Y - d2.sum().Y
        self.intervals.loc[np.isnan(self.intervals['Goods']), 'Goods'] = 0

        if len(justmiss.index) > 0:
            d4 = pd.DataFrame({'MinValue': np.nan}, index=[0])
            d4['MaxValue'] = np.nan
            d4['Interval'] = "Missing Value"
            d4['Total'] = justmiss.count()[self.target]
            d4['Bads'] = justmiss.sum()[self.target]
            d4['Goods'] = justmiss.count()[self.target] - justmiss.sum()[
                self.target]
            self.intervals = self.intervals.append(d4,
                                                   ignore_index=True,
                                                   sort=True)

        # Here has to be the common code from Base class
        BinVariable.autoBinarize(self)
Example 17
def mono_bin(Y, X, max_bin, force_bin):
    """
    binning function for int and float variables that are not binary indicators
    
    Parameters
    ----------
    
    Y : pandas Series
        target vector

    X : pandas Series
        predictor vector
        
    max_bin : int
        the maximum number of bins (categories) for numeric variable binning. 
        
    force_bin : int
        For some numeric variables, the mono_bin function may produce only one
        category while binning. 'force_bin' ensures that at least two
        categories are produced.
        
    Return
    ------
    
    d3 : pandas dataframe
        Weight of evidence / information value table and other data used to 
        calculate WOE and IV for variable i in dataset
       
    """
    n = max_bin
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": \
                           pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    print(d3)
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)
    d3 = d3.drop(d3[d3.COUNT == 0].index)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_NON_EVENT / d3.DIST_EVENT)
    d3["IV"] = (d3.DIST_NON_EVENT - d3.DIST_EVENT) * np.log(
        d3.DIST_NON_EVENT / d3.DIST_EVENT)
    d3["VAR_NAME"] = "VAR"

    d3['max_range'] = d3['MAX_VALUE']
    d3["min_range"] = d3.groupby('VAR_NAME')['MAX_VALUE'].shift(1)
    d3.loc[d3['min_range'].isnull(), 'min_range'] = -np.inf
    d3.loc[d3['MIN_VALUE'].isnull(), 'min_range'] = np.nan

    _max = d3.loc[d3['max_range'].notna(), 'max_range'].max()
    d3.loc[d3['max_range'] == _max, 'max_range'] = np.inf

    d3['bucket'] = '(' + d3['min_range'].astype(
        str) + ', ' + d3['max_range'].astype(str) + ']'
    d3.loc[d3['bucket'] == '(nan, nan]', 'bucket'] = 'missing'
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'min_range', 'max_range', 'COUNT', 'EVENT', 'EVENT_RATE', \
             'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT', 'bucket', 'WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return (d3)
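
The heart of mono_bin is the loop that shrinks the number of quantile buckets until the bucket means of X and Y are perfectly rank-correlated. A self-contained toy version of that loop on synthetic data (not the author's exact code; duplicates='drop' stands in for the try/except):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = (x + rng.normal(scale=2, size=1000) > 0).astype(int)  # noisy binary target

n, r = 20, 0.0
while np.abs(r) < 1 and n > 1:
    d1 = pd.DataFrame({"X": x, "Y": y,
                       "Bucket": pd.qcut(x, n, duplicates="drop")})
    d2 = d1.groupby("Bucket", observed=True)
    r, _ = stats.spearmanr(d2.mean().X, d2.mean().Y)
    n -= 1
print(n + 1, "buckets give |r| =", abs(r))
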
Example 18
    def finger(X, y=None, cut_points=None,
               n_quantiles=4, labels=None,
               min_val=None, max_val=None, **params):
        """Manually bins continuous variable into the declared intervals.

        If the cut-off points are not declared the split is made using
        quantiles.

        Parameters
        ----------
        X: array-like, shape = (n_samples, )
            Vector passed as a one-dimensional array-like object where
            n_samples is the number of samples.

        y: Ignored

        cut_points: array-like, optional (default=None)
            Increasing monotonic sequence generating right-closed intervals.
            Values not allocated to any of the categories will be assigned to
            the empty set. For example given: cut_points=[1, 5, 9] will
            generate intervals: [X.min(), 1], (1, 5], (5, 9], (9, X.max()].
            If you want to specify lower and upper limitations, set parameters:
            "min_val", "max_val" to a specific value.

        n_quantiles: int, optional (default=4)
            When cut_points are not declared it sets the number of quantiles
            into which the variable will be split. For example setting
            n_quantiles = 4 will return quartiles of X values between min_val
            and max_val.

        labels: string: {'auto'} or list, optional (default=None)
            Specifies returned bucket names, needs to be the same length as the
            number of created buckets:

            - `auto`:

              Assigns default values to group names by numbering them.

        min_val: float, optional (default=None)
            Determines lower limit value. If not specified takes -np.inf.

        max_val: float, optional (default=None)
            Determines upper limit value. If not specified takes np.inf.

        Returns
        -------
        X_new: array, shape = (n_samples, )
            Input data with its original values being substituted with their
            respective labels.

        """
        X = np.asarray(X)
        x = X[~np.isnan(X)]

        if min_val is None:
            min_val = -np.inf

        if max_val is None:
            max_val = np.inf

        # Default break_points in case of no declaration of cut_points
        if cut_points is None:
            x = x[(x >= min_val) & (x <= max_val)]
            break_points = algos.quantile(
                np.unique(x),
                np.linspace(0, 1, n_quantiles + 1)
            )
        else:
            break_points = np.insert(
                cut_points.astype(float),
                [0, len(cut_points)],
                [min_val, max_val]
            )
        break_points = np.unique(break_points)

        if labels == 'auto':
            labels = range(len(break_points) - 1)

        X_new = pd.cut(
            X, bins=break_points, labels=labels, include_lowest=True
        )

        return X_new
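
finger still calls the removed algos.quantile. A minimal modern sketch of the same idea with np.quantile swapped in (finger_np is a hypothetical name; same semantics assumed):

import numpy as np
import pandas as pd

def finger_np(X, cut_points=None, n_quantiles=4, labels=None,
              min_val=-np.inf, max_val=np.inf):
    X = np.asarray(X, dtype=float)
    x = X[~np.isnan(X)]
    if cut_points is None:
        x = x[(x >= min_val) & (x <= max_val)]
        break_points = np.quantile(np.unique(x), np.linspace(0, 1, n_quantiles + 1))
    else:
        break_points = np.concatenate(([min_val], np.asarray(cut_points, float), [max_val]))
    break_points = np.unique(break_points)
    if labels == 'auto':
        labels = list(range(len(break_points) - 1))
    return pd.cut(X, bins=break_points, labels=labels, include_lowest=True)

print(finger_np([1, 2, 4, 7, 9, np.nan], n_quantiles=2, labels='auto'))
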
Example 19
meta = df.describe()

meta.to_excel(
    r'C:\Users\wanti\Desktop\MMA\MMA 831 Marketing Analytics\MMA831_midterm\meta2.xlsx',
    index=False)

###########################
# generate logit plot

for i in range(1, 100):
    # find the right size to bin x
    test = df.loc[:, [df.columns[i], 'target2']]
    test = test.dropna()
    bins = np.unique(
        algos.quantile(test.loc[:, df.columns[i]], np.linspace(0, 1, 11)))
    test['bin'] = pd.cut(test.loc[:, df.columns[i]], bins, right=False)

    # using bin to get logit y
    tt = test.groupby(['bin'], as_index=False).agg({
        'target2': ['count', 'sum'],
        df.columns[i]: 'mean'
    })
    tt['logity'] = np.log(
        (tt.iloc[:, 2] + 1) / (tt.iloc[:, 1] - tt.iloc[:, 2] + 1))

    # plot it
    plt.figure()
    plt.plot(tt.iloc[:, 3], tt['logity'], color='blue')
    plt.title(df.columns[i] + " vs target")
    plt.show()
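
The quantity plotted above is the per-bin empirical logit, log((events + 1) / (non-events + 1)), with +1 smoothing so empty cells never produce log(0). The inline computation, isolated as a helper:

import numpy as np

def empirical_logit(events, total):
    # +1 smoothing keeps bins with zero events or zero non-events finite
    return np.log((events + 1) / (total - events + 1))

print(empirical_logit(np.array([0, 5, 10]), np.array([10, 10, 10])))
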
Example 20
def quantile(x):
	vals = x.values
	return algos.quantile(vals, np.linspace(0,1,11))
Example 21
def qcut(x, q, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank or based on sample quantiles. For example
    1000 values for 10 quantiles would produce a Categorical object indicating
    quantile membership for each data point.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins.
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if bins is given
        as a scalar.
    precision : int
        The precision at which to store and display the bins labels

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]
    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3], dtype=int64)
    """
    x_is_series, series_index, name, x = _preprocess_for_cut(x)

    x, dtype = _coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = algos.quantile(x, quantiles)
    fac, bins = _bins_to_cuts(x, bins, labels=labels,
                              precision=precision, include_lowest=True,
                              dtype=dtype)

    return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                series_index, name)
Example 22
def show_orders_hist(df,
                     feature_columns,
                     show=True,
                     only_hist=True,
                     show_pie=False):
    """
    Visualize histograms or pie charts for the features of df named by
    feature_columns; the only_hist parameter controls whether pd.qcut
    statistics are also logged.

    eg:
        from abupy import AbuML, ml
        ttn_raw = AbuML.load_ttn_raw_df()
        ml.show_orders_hist(ttn_raw, ['Age', 'Fare', 'Pclass'])

    :param df: pd.DataFrame object
    :param feature_columns: sequence of feature names, eg: ['Age', 'Fare', 'Pclass']
    :param show: whether to draw the histograms or pie charts
    :param show_pie: whether to prefer pie charts, default False
    :param only_hist: if True, skip the pd.qcut statistics output
    """
    if not isinstance(df, pd.DataFrame):
        logging.info('df must be a pd.DataFrame, not type {}'.format(type(df)))
        return

    # step 1: drop feature_columns entries that are not columns of df
    feature_columns = list(
        filter(lambda x: df.columns.tolist().count(x) > 0, feature_columns))
    # step 2: drop entries whose column dtype is not int or float
    feature_columns = list(
        filter(
            lambda x: df[x].dtype == int or df[x].dtype == float or df[x].dtype
            == np.uint or df[x].dtype == np.uint8, feature_columns))
    # step 3: drop columns with a single unique value (eg all 1s or all 0s), which cannot be binned
    feature_columns = list(
        filter(lambda x: len(np.unique(df[x])) > 1, feature_columns))

    axs_list = None
    if len(feature_columns) == 0:
        # everything was filtered out, just return
        logging.info(
            '{}\n{}\ncolumns do not exist, have unique==1, or dtype not int/float'.
            format(df.columns, df.dtypes))
        return

    if show:
        # when plotting, lay out two subplots per row, rounding up: eg ceil(3 / 2) = 2
        n_rows = int(math.ceil(len(feature_columns) / 2))
        # each row is 5 units tall, so total height is n_rows * 5
        fig_h = n_rows * 5
        # plt.subplots creates the subplot grid
        _, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, fig_h))
        # with more than one row, flatten the axes grid into a 1d sequence
        axs_list = axs if n_rows == 1 else list(
            itertools.chain.from_iterable(axs))

    for ind, feature in enumerate(feature_columns):
        feature_unique = len(np.unique(df[feature]))
        ax = None
        if axs_list is not None:
            ax = axs_list[ind]
            ax.set_title(feature)
        if show_pie and feature_unique < 10:
            # fewer than 10 unique values: draw a pie chart from value_counts
            df[feature].value_counts().plot(ax=ax, kind='pie')
        else:
            # draw a histogram
            bins = int(feature_unique / 50) if feature_unique / 50 > 10 else 10
            df[feature].hist(ax=ax, bins=bins)

        if only_hist:
            # visualization only, so continue
            continue

        try:
            # qcut into 10 equal-frequency bins
            cats = pd.qcut(df[feature], 10)
        except Exception:
            # a value repeats more often than the quantile width allows, so qcut cannot split
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(df[feature]),
                                  np.linspace(0, 1, 10 + 1))
            # noinspection PyProtectedMember,PyUnresolvedReferences
            cats = pd.tools.tile._bins_to_cuts(df[feature],
                                               bins,
                                               include_lowest=True)

        logging.info('{0} show hist and qcuts'.format(feature))
        """
            Age show hist and qcuts
            (31.8, 36]    91
            (14, 19]      87
            (41, 50]      78
            [0.42, 14]    77
            (22, 25]      70
            (19, 22]      67
            (28, 31.8]    66
            (50, 80]      64
            (25, 28]      61
            (36, 41]      53
            Name: Age, dtype: int64
        """
        logging.info(cats.value_counts())
Example 23
# to generate the logit plot, create a binary target first
df['target2'] = np.where(df['target'] > 0, 1, 0)
df['target2'].value_counts()

df['target'].value_counts()

# logit plot

for i in range(1,100):
    # find the right size to bin x
    test = df.loc[:,[df.columns[i],'target2']]
    bins = np.unique(algos.quantile(test.loc[:,df.columns[i]], np.linspace(0, 1, 11)))
    test['bin'] = pd.cut(test.loc[:,df.columns[i]], bins ,right=False)
    
    # using bin to get logit y
    tt = test.groupby(['bin'],as_index= False).agg({ 'target2':['count','sum'], 
                         df.columns[i]:'mean'})
    tt['logity'] = np.log((tt.iloc[:,2] + 1)/(tt.iloc[:,1] -tt.iloc[:,2] + 1))
    
    # plot it
    plt.figure()
    plt.plot(tt.iloc[:,3],tt['logity'],color='blue')
    plt.title(df.columns[i]+" vs target")
    plt.show()

Example 24
def bucket_data(df, buckets, label=None, privacy=None, verbose=0,
                bin_features=None):
    df = df.copy()

    # partition continuous and integer data into buckets
    for col in df.columns:
        do_buckets = col != label and (
            df[col].dtype == 'float64' or (
                df[col].dtype == 'int64' and len(set(df[col])) > buckets)
        ) and (bin_features is None or col in bin_features)

        if not do_buckets:
            continue

        if verbose >= 2:
            print('bucketing column', repr(col))

        arr = np.nan_to_num(df[col].as_matrix())

        # this is here to mask out zeros, in case the majority of values are
        # zeros and it's impossible to do normal bucketing
        #mx = np.ma.masked_equal(arr, 0, copy=True)
        #bins = algos.quantile(arr[~mx.mask], np.linspace(0, 1, buckets+1))

        # then add back in a bucket specifically for zeros
        #bins = np.insert(bins, 0, 0)
        #bins[1] = bins[1] - bins[1] / 2


        epsilon = 1e-10 if df[col].dtype == 'float64' else 1
        bins = algos.quantile(arr, np.linspace(0, 1, buckets+1))

        if privacy is not None and privacy > 0:
            assert buckets == 2
            median = estimate_median_private(arr, privacy, min(arr), max(arr))
            bins = np.array([0, median, max(arr)])
            if verbose >= 2:
                print('median real', sorted(arr)[len(arr) // 2], 'estimate', median)

        for i in range(1, len(bins)):
            if bins[i] <= bins[i - 1]:
                bins[i] = bins[i - 1] + epsilon

        df[col] = pd.cut(arr, bins, labels=range(buckets), include_lowest=True)
        continue
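        # NOTE: the `continue` above makes everything below in this loop
        # unreachable; the remainder is older bucketing code left in place.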

        # tools.tile gone as of pandas 16 :(
        #cuts = pd.tools.tile._bins_to_cuts(arr, bins, labels=range(buckets),
                                           #include_lowest=True)

        #df[col] = pd.qcut(df[col], buckets, labels=range(buckets))

        # sample values until you get enough real ones. This doesn't work if
        # there are too many "NaN"s.
        #sample = df.sample(n=int(math.sqrt(len(df[col])) + 1))[col].copy()
        sample = df[col].sort_values(inplace=False, na_position='last')
        num_num = len(sample.dropna(inplace=False))
        n = float(num_num) / buckets

        # these are the percentiles of the numbers in the series - dictating
        # the boundaries of the buckets.

        # we only do this convoluted thing here to support the sampling step
        # above. Otherwise we would just sort everything and put elements
        # [n:i+n] into each bucket.
        bucket_list = [sample.iloc[int(i*n)] for i in range(1, buckets)]

        if False:    # simple method
            bucket_list = sorted(set(bucket_list))
            for i, row in df.iterrows():
                if np.isnan(row[col]):
                    val = 0
                else:
                    val = next((idx for idx, b in enumerate(bucket_list)
                                if b >= row[col]), len(bucket_list))
                df.set_value(i, col, val)

            print('Bucket values for %s:' % col)
            print('\tv <= %.3f' % bucket_list[0])
            for i in range(len(bucket_list) - 1):
                v = bucket_list[i]
                nv = bucket_list[i+1]
                print('\t%.3f < v <= %.3f' % (v, nv))
            print('\tv > %.3f' % bucket_list[-1])
            print()

        else:       # more complicated method
            bucket_vals = {}
            exact_vals = {}

            idx = 0
            for v in sorted(list(set(bucket_list))):
                idx += 1
                bucket_vals[v] = idx
                if bucket_list.count(v) > 1:
                    idx += 1
                    exact_vals[v] = idx

            sorted_vals = sorted(bucket_vals.items())

            print('Bucket values for %s:' % col)
            print('\tv <= %.3f' % sorted_vals[0][0])
            for i in range(len(sorted_vals) - 1):
                v = sorted_vals[i][0]
                nv = sorted_vals[i+1][0]
                if v in exact_vals:
                    print('\tv == %.3f' % v)

                if nv in exact_vals:
                    print('\t%.3f < v < %.3f' % (v, nv))
                else:
                    print('\t%.3f < v <= %.3f' % (v, nv))

            print('\tv > %.3f' % sorted_vals[-1][0])
            print()

            num_buckets = len(bucket_vals) + len(exact_vals)
            for i, row in df.iterrows():
                if np.isnan(row[col]):
                    val = 0
                elif row[col] in exact_vals:
                    val = exact_vals[row[col]]
                else:
                    val = next((idx for b, idx in sorted_vals
                                if b >= row[col]), num_buckets)
                df.set_value(i, col, val)

        df[col] = df[col].astype(int)

    return df
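
The epsilon loop in bucket_data is a generic way to force strictly increasing bin edges before pd.cut; isolated as a sketch:

import numpy as np

def make_strictly_increasing(edges, epsilon=1e-10):
    edges = np.asarray(edges, dtype=float).copy()
    for i in range(1, len(edges)):
        if edges[i] <= edges[i - 1]:
            edges[i] = edges[i - 1] + epsilon  # bump duplicates just above their neighbour
    return edges

print(make_strictly_increasing([0.0, 1.0, 1.0, 2.0]))
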
Example 25
def test_quantile():
    s = Series(np.random.randn(100))

    result = algos.quantile(s, [0, .25, .5, .75, 1.])
    expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
    tm.assert_almost_equal(result, expected)
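
The private pandas.core.algorithms.quantile under test here was later removed from pandas; on modern versions Series.quantile and np.quantile give the same linearly interpolated result:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(100))
qs = [0, .25, .5, .75, 1.]
np.testing.assert_allclose(s.quantile(qs).to_numpy(),
                           np.quantile(s.to_numpy(), qs))
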
Example 26
def mono_bin(Y, X, n=max_bin):

    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
            # print(r)
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({
            "X":
            notmiss.X,
            "Y":
            notmiss.Y,
            "Bucket":
            pd.cut(notmiss.X, np.unique(bins), include_lowest=True)
        })
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.sum().EVENT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.EVENT_RATE / d3.NON_EVENT_RATE)
    d3["IV"] = (d3.EVENT_RATE - d3.NON_EVENT_RATE) * np.log(
        d3.EVENT_RATE / d3.NON_EVENT_RATE)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[[
        'VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
        'NONEVENT', 'NON_EVENT_RATE', 'WOE', 'IV'
    ]]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return (d3)
Example 27
def show_orders_hist(df, feature_columns, show=True, only_hist=True, show_pie=False):
    """
    Visualize histograms or pie charts for the features of df named by
    feature_columns; the only_hist parameter controls whether pd.qcut
    statistics are also logged.

    eg:
        from abupy import AbuML, ml
        ttn_raw = AbuML.load_ttn_raw_df()
        ml.show_orders_hist(ttn_raw, ['Age', 'Fare', 'Pclass'])

    :param df: pd.DataFrame object
    :param feature_columns: sequence of feature names, eg: ['Age', 'Fare', 'Pclass']
    :param show: whether to draw the histograms or pie charts
    :param show_pie: whether to prefer pie charts, default False
    :param only_hist: if True, skip the pd.qcut statistics output
    """
    if not isinstance(df, pd.DataFrame):
        logging.info('df must be a pd.DataFrame, not type {}'.format(type(df)))
        return

    # step 1: drop feature_columns entries that are not columns of df
    feature_columns = list(filter(lambda x: df.columns.tolist().count(x) > 0, feature_columns))
    # step 2: drop entries whose column dtype is not int or float
    feature_columns = list(
        filter(
            lambda x: df[x].dtype == int or df[x].dtype == float or df[x].dtype == np.uint or df[x].dtype == np.uint8,
            feature_columns))
    # step 3: drop columns with a single unique value (eg all 1s or all 0s), which cannot be binned
    feature_columns = list(filter(lambda x: len(np.unique(df[x])) > 1, feature_columns))

    axs_list = None
    if len(feature_columns) == 0:
        # everything was filtered out, just return
        logging.info('{}\n{}\ncolumns do not exist, have unique==1, or dtype not int/float'.format(
            df.columns, df.dtypes))
        return

    if show:
        # when plotting, lay out two subplots per row, rounding up: eg ceil(3 / 2) = 2
        n_rows = int(math.ceil(len(feature_columns) / 2))
        # each row is 5 units tall, so total height is n_rows * 5
        fig_h = n_rows * 5
        # plt.subplots creates the subplot grid
        _, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, fig_h))
        # with more than one row, flatten the axes grid into a 1d sequence
        axs_list = axs if n_rows == 1 else list(itertools.chain.from_iterable(axs))

    for ind, feature in enumerate(feature_columns):
        feature_unique = len(np.unique(df[feature]))
        ax = None
        if axs_list is not None:
            ax = axs_list[ind]
            ax.set_title(feature)
        if show_pie and feature_unique < 10:
            # fewer than 10 unique values: draw a pie chart from value_counts
            df[feature].value_counts().plot(ax=ax, kind='pie')
        else:
            # draw a histogram
            bins = int(feature_unique / 50) if feature_unique / 50 > 10 else 10
            df[feature].hist(ax=ax, bins=bins)

        if only_hist:
            # visualization only, so continue
            continue

        try:
            # qcut into 10 equal-frequency bins
            cats = pd.qcut(df[feature], 10)
        except Exception:
            # a value repeats more often than the quantile width allows, so qcut cannot split
            import pandas.core.algorithms as algos
            bins = algos.quantile(np.unique(df[feature]), np.linspace(0, 1, 10 + 1))
            # noinspection PyProtectedMember,PyUnresolvedReferences
            cats = pd.tools.tile._bins_to_cuts(df[feature], bins, include_lowest=True)

        logging.info('{0} show hist and qcuts'.format(feature))
        """
            Age show hist and qcuts
            (31.8, 36]    91
            (14, 19]      87
            (41, 50]      78
            [0.42, 14]    77
            (22, 25]      70
            (19, 22]      67
            (28, 31.8]    66
            (50, 80]      64
            (25, 28]      61
            (36, 41]      53
            Name: Age, dtype: int64
        """
        logging.info(cats.value_counts())
Example 28
def resample_by_magnitude(input_shapefile,
                          output_shapefile,
                          target_field,
                          bins=10,
                          fields_to_keep=[],
                          bootstrap=True,
                          output_samples=None,
                          validation_file=None,
                          validation_points=100):
    """
    Parameters
    ----------
    input_shapefile: str
    output_shapefile: str
    target_field: str
        target field name based on which resampling is performed. Field must
        exist in the input_shapefile
    bins: int
        number of bins for sampling
    fields_to_keep: list
        of strings to store in the output shapefile
    bootstrap: bool, optional
        whether to sample with replacement or not
    output_samples: int, optional
        number of samples in the output shapefile. If not provided, the output
        samples will be assumed to be the same as the original shapefile
    validation_file: str, optional
        validation file name
    validation_points: int, optional
        approximate number of points in the validation shapefile
    Returns
    -------

    """
    log.info("resampling shapefile by values")
    if bootstrap and validation_file:
        raise ValueError('bootstrapping should not be used while '
                         'creating a validation shapefile.')

    if len(fields_to_keep):
        fields_to_keep.append(target_field)
    else:
        fields_to_keep = [target_field]
    gdf_out = filter_fields(fields_to_keep, input_shapefile)

    # the idea is stolen from pandas.qcut
    # pd.qcut does not work for cases where it results in non-unique bin edges
    target = gdf_out[target_field].values
    bin_edges = algos.quantile(np.unique(target), np.linspace(0, 1, bins + 1))
    result = pd.tools.tile._bins_to_cuts(target,
                                         bin_edges,
                                         labels=False,
                                         include_lowest=True)
    # add to output df for sampling
    gdf_out[BIN] = result

    dfs_to_concat = []
    validation_dfs_to_concat = []
    total_samples = output_samples if output_samples else gdf_out.shape[0]
    samples_per_bin = total_samples // bins

    validate_array = np.ones(bins, dtype=bool)
    if validation_file and bins > validation_points:
        validate_array[validation_points:] = False
        np.random.shuffle(validate_array)

    gb = gdf_out.groupby(BIN)
    for i, (b, gr) in enumerate(gb):
        if bootstrap:
            dfs_to_concat.append(
                gr.sample(n=samples_per_bin, replace=bootstrap))
        else:
            _df, v_df = _sample_without_replacement(gr, samples_per_bin,
                                                    validate_array[i])
            dfs_to_concat.append(_df)
            validation_dfs_to_concat.append(v_df)

    final_df = pd.concat(dfs_to_concat)
    final_df.sort_index(inplace=True)
    final_df.drop(BIN, axis=1).to_file(output_shapefile)
    if validation_file:
        validation_df = pd.concat(validation_dfs_to_concat)
        validation_df.to_file(validation_file)
        log.info('Wrote validation shapefile {}'.format(validation_file))
    return output_shapefile
Example 30
def resample_by_magnitude(input_data,
                          target_field,
                          bins=10,
                          interval='percentile',
                          fields_to_keep=[],
                          bootstrap=True,
                          output_samples=None,
                          validation=False,
                          validation_points=100):
    """
    Parameters
    ----------
    input_data : geopandas.GeoDataFrame
        Geopandas dataframe containing targets to be resampled.
    target_field : str
        target field name based on which resampling is performed. Field
        must exist in the input data
    bins : int
        number of bins for sampling
    fields_to_keep : list
        of strings to store in the output shapefile
    bootstrap : bool, optional
        whether to sample with replacement or not
    output_samples : int, optional
        number of samples in the output shapefile. If not provided, the
        output samples will be assumed to be the same as the original
        shapefile
    validation : bool, optional
        whether to also produce a validation set
    validation_points : int, optional
        approximate number of points in the validation set

    Returns
    -------

    """
    if bootstrap and validation:
        raise ValueError('bootstrapping should not be used while '
                         'creating a validation shapefile.')

    if interval not in ['percentile', 'linear']:
        _logger.warning(
            "Interval method '{}' not recognised, defaulting to 'percentile'".
            format(interval))
        interval = 'percentile'

    if len(fields_to_keep):
        fields_to_keep.append(target_field)
    else:
        fields_to_keep = [target_field]
    gdf_out = prepapre_dataframe(input_data, fields_to_keep)
    # the idea is stolen from pandas.qcut
    # pd.qcut does not work for cases where it results in non-unique bin edges
    target = gdf_out[target_field].values
    if interval == 'percentile':
        bin_edges = algos.quantile(np.unique(target),
                                   np.linspace(0, 1, bins + 1))
    elif interval == 'linear':
        bin_edges = np.linspace(np.min(target), np.max(target), bins + 1)
    result = pd.core.reshape.tile._bins_to_cuts(target,
                                                bin_edges,
                                                labels=False,
                                                include_lowest=True)

    # add to output df for sampling
    gdf_out[BIN] = result[0]

    dfs_to_concat = []
    validation_dfs_to_concat = []
    total_samples = output_samples if output_samples else gdf_out.shape[0]
    samples_per_bin = total_samples // bins

    validate_array = np.ones(bins, dtype=bool)
    if validation and bins > validation_points:
        validate_array[validation_points:] = False
        np.random.shuffle(validate_array)

    gb = gdf_out.groupby(BIN)
    for i, (b, gr) in enumerate(gb):
        if bootstrap:
            dfs_to_concat.append(
                gr.sample(n=samples_per_bin, replace=bootstrap))
        else:
            _df, v_df = _sample_without_replacement(gr, samples_per_bin,
                                                    validate_array[i])
            dfs_to_concat.append(_df)
            validation_dfs_to_concat.append(v_df)

    final_df = pd.concat(dfs_to_concat)
    final_df.sort_index(inplace=True)
    output_gdf = final_df.drop(BIN, axis=1)
    if validation:
        validation_df = pd.concat(validation_dfs_to_concat)
        return output_gdf, validation_df
    else:
        return output_gdf
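
On current pandas, where algos.quantile is gone and _bins_to_cuts lives under a private path that keeps moving, the binning-and-sampling core of both resamplers can be sketched with public API only (a toy stand-in that ignores the geodata handling; resample_by_quantile_bins is a hypothetical name):

import numpy as np
import pandas as pd

def resample_by_quantile_bins(df, target_field, bins=10, samples_per_bin=50, seed=0):
    # quantile edges on the unique values, deduplicated, as in the code above
    edges = np.quantile(np.unique(df[target_field]), np.linspace(0, 1, bins + 1))
    binned = pd.cut(df[target_field], np.unique(edges), labels=False,
                    include_lowest=True)
    # draw the same number of rows from every bin, with replacement
    return (df.groupby(binned, group_keys=False)
              .apply(lambda g: g.sample(n=samples_per_bin, replace=True,
                                        random_state=seed)))

toy = pd.DataFrame({"mag": np.random.default_rng(0).lognormal(size=1000)})
print(resample_by_quantile_bins(toy, "mag", bins=5, samples_per_bin=20).shape)  # (100, 1)
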