Example #1
 def fit(self, X, y, variable_types={}):
     self.X_shape = X.shape
     self.y_shape = y.shape
     if variable_types:
         variable_type_string = "".join([variable_types[col] for col in X.columns])
         self.model = KernelReg(y, X, variable_type_string, reg_type="ll")
     else:
         self.model = KernelReg(y, X, "c" * X.shape[1], reg_type="ll")
     return self
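For context, a minimal usage sketch of the pattern above (the DataFrame, column names, and type codes are invented for illustration; only KernelReg from statsmodels is assumed):

import numpy as np
import pandas as pd
from statsmodels.nonparametric.kernel_regression import KernelReg

X = pd.DataFrame({'age': np.random.rand(100) * 60,
                  'group': np.random.randint(0, 3, 100)})
y = np.random.rand(100)
variable_types = {'age': 'c', 'group': 'o'}
# The same string the method builds: one type code per column, in column order
variable_type_string = ''.join(variable_types[col] for col in X.columns)  # 'co'
model = KernelReg(y, X, variable_type_string, reg_type='ll')
y_hat, marginal_effects = model.fit()  # predictions at the training points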
Example #2
 def __init__(self,
              summaryfile=None,
              inwave=None,
              indata=None,
              inerrs=None,
              inmask=None,
              smooth=None):
     self.data = indata
     self.wave = inwave
     self.errs = inerrs
     self.mask = inmask
     if summaryfile:
         self.open_summary(summaryfile)
         # Interpolate masked areas
         self.data[self.mask] = np.nan
         self.nonnanidx = np.where(~self.mask)[0]
         self.interp = np.interp(self.wave, self.wave[self.nonnanidx],
                                 self.data[self.nonnanidx])
         self.interr = np.interp(self.wave, self.wave[self.nonnanidx],
                                 self.errs[self.nonnanidx])
     if smooth == 'll':
         lle = KernelReg(self.interp, self.wave, 'c', bw=[10])
         self.smoothed = lle.fit()[0]  # fit() returns (mean, marginal effects)
     elif smooth == 'box':
         # mode='same' keeps the smoothed array aligned with the wavelength grid
         self.smoothed = np.convolve(self.data, np.ones(3) / 3, mode='same')
     else:
         self.smoothed = self.data
     self._build_plot()
Example #3
def dataSmoothing3(changes):
    length = len(changes)
    x = np.linspace(1, length, num=length, endpoint=True)
    y = np.array(changes)
    kr = KernelReg(y, x, 'c')
    r_fit = kr.r_squared()
    #plt.figure(1)
    #plt.subplot(131)
    #plt.plot(x, y, 'go-')
    #plt.title("Original",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    if length < 20:
        x1 = np.linspace(1, length, num=3 * length, endpoint=True)
    else:
        x1 = x
    y_pred, mfx = kr.fit(x1)  # fit() returns (conditional mean, marginal effects)
    #plt.subplot(132)
    #plt.plot(x1, y_pred,'bo-')
    #plt.title("Smoothing",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    #plt.show()
    ynew = dataResampling(y_pred)
    xnew = np.linspace(1, 20, 20, endpoint=False)
    #plt.subplot(133)
    #plt.plot(xnew, ynew,'ro-')
    #plt.title("Resampling",fontsize=20)
    #plt.xlabel('Periods',fontsize=20)
    #plt.ylabel('Dockerfile Size',fontsize=20)
    #plt.grid(True)
    #plt.show()
    return ynew, r_fit
Example #4
def FWHM(wave, pertdata, mode='data', imin=False, ll_bw='cv_ls'):
    """ Mode can be data, ll, lc
    """
    fwhms = []
    imins = []
    mvels = []
    LLEs = []
    for i in tqdm(range(pertdata.shape[0])):
        data = pertdata[i, :]
        if mode in ['ll', 'lc']:
            lle = KernelReg(data, wave, 'c', reg_type=mode, bw=ll_bw)
            data = lle.fit()[0]
            LLEs.append(data)
            print('LLE bandwidth: ', lle.bw[0], end="\r")
        iplwave = np.linspace(wave.min(), wave.max(), 1000)
        ipldata = np.interp(iplwave, wave, data)
        iplidx = np.where(ipldata > ipldata.max() / 2)[0]
        vmin, vmax = iplidx.min(), iplidx.max()
        fwhms.append(iplwave[vmax] - iplwave[vmin])
        if imin:
            imins.append(1 - data.max())
            mvels.append(iplwave[ipldata.argmax()])
    if imin:
        return np.array(fwhms), np.array(mvels), np.array(imins), np.array(
            LLEs)
    return np.array(fwhms)
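A hedged usage sketch (the Gaussian profile is synthetic and invented here; tqdm and KernelReg are assumed imported as in the snippet's context). For a Gaussian with sigma = 12, the measured width should come out near 2.355 * 12 ≈ 28.3:

import numpy as np

wave = np.linspace(-50, 50, 200)                      # velocity grid
pertdata = np.exp(-wave**2 / (2 * 12.0**2))[None, :]  # one Gaussian profile, shape (1, 200)
widths = FWHM(wave, pertdata, mode='data')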
Example #5
def integrated_calibration_index_mod(y, p):
    """
    local reg 使うバージョン
    TOOD: statsmodels.nonparametric.kernel_regression.KernReg がとても遅い. C++とかで実装したほうが良いのでは?
    """
    ll = KernelReg(endog=y, exog=p, reg_type='ll', var_type='o')
    return mean_absolute_error(y, ll.fit()[0])
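A hedged usage sketch (synthetic outcomes and predicted probabilities, invented here; mean_absolute_error is assumed to come from sklearn.metrics, and per the TODO above the cross-validated bandwidth search can be slow):

import numpy as np

p = np.random.uniform(0.05, 0.95, 300)        # predicted probabilities
y = (np.random.rand(300) < p).astype(float)   # outcomes drawn at those probabilities
ici = integrated_calibration_index_mod(y, p)  # near 0 for a well-calibrated model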
Example #6
def get_fitted_values(week):
    """week selects which week's s_streams value to pair with the s_spotify ranks."""
    # Build the working_df the model will operate on
    working_df = pd.read_csv(get_paths()[1] + "all_spotify.csv")
    working_df = working_df.drop(working_df.columns[[0]], axis=1)

    # Run the regression
    y = np.array(list(working_df["streams"]))
    x_r = np.array(list(working_df["rank"]))
    x_s = np.array(list(working_df["s_streams"]))

    # Rule-of-thumb bandwidth for the continuous variable: sigma * n^(-1/5)
    var_cont = (np.var(x_s)) ** 0.5
    b_c = var_cont * (len(y) ** (-1 / 5))
    print(b_c)

    # Rule-of-thumb bandwidth for the ordered discrete variable: n^(-2/5)
    b_o = len(y) ** (-2 / 5)
    print(b_o)

    reg_new = KernelReg(y, [x_r, x_s], var_type="oc", reg_type="ll", bw=[b_o, b_c])

    df_of_needed_week = working_df[working_df["week_f_show"] == week]
    last_week_sstreams = df_of_needed_week["s_streams"][-1:].values[0]
    fit_values = reg_new.fit([[i for i in range(1, 201)],
                              [last_week_sstreams for h in range(1, 201)]])[0]

    return fit_values
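As a worked check of the two rules of thumb above: with n = 10,000 observations and a continuous-variable standard deviation of 2.0, b_c = 2.0 * 10000^(-1/5) ≈ 0.317, while b_o = 10000^(-2/5) ≈ 0.025.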
Example #7
def kreg_demo1(hs=None, fast=True, fun='hisj'):
    """Compare KRegression to KernelReg from statsmodels.nonparametric

    Examples
    --------
    >>> kreg_demo1()
    """
    N = 100
    # ei = np.random.normal(loc=0, scale=0.075, size=(N,))
    ei = np.array([
        -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525,
        0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784,
        -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097,
        0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183,
        -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959,
        0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057,
        -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139,
        0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877,
        0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899,
        -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498,
        0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802,
        0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446,
        -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111,
        -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788,
        -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828,
        -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477,
        -0.12458953, -0.11692081, 0.0413047, -0.09292439, -0.07042327,
        0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406,
        -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619,
        -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065])
    x = np.linspace(0, 1, N)

    va_1 = 0.3 ** 2
    va_2 = 0.7 ** 2
    y0 = np.exp(-x ** 2 / (2 * va_1)) + 1.3 * np.exp(-(x - 1) ** 2 / (2 * va_2))
    y = y0 + ei
    kernel = Kernel('gauss', fun=fun)
    hopt = kernel.hisj(x)
    kreg = KRegression(
        x, y, p=0, hs=hs, kernel=kernel, xmin=-2 * hopt, xmax=1 + 2 * hopt)
    if fast:
        kreg.__call__ = kreg.eval_grid_fast

    f = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    plt.figure(0)
    f.plot(label='p=0')

    kreg.p = 1
    f1 = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    f1.plot(label='p=1')
    # print(f1.data)
    plt.plot(x, y, '.', label='data')
    plt.plot(x, y0, 'k', label='True model')
    from statsmodels.nonparametric.kernel_regression import KernelReg
    kreg2 = KernelReg(y, x, 'c')
    y2 = kreg2.fit(x)
    plt.plot(x, y2[0], 'm', label='statsmodel')

    plt.legend()
Example #8
def smooth_xy(x, y):
    x = np.squeeze(x)
    y = np.squeeze(y)
    #v = lowess(y, x, frac=.05)
    # Passing bw directly avoids an unnecessary cross-validation run in __init__
    kernel_reg = KernelReg(y, x, var_type='c', reg_type='lc', bw=[.01])
    y = kernel_reg.fit(x)[0]
    return x, y
Example #9
    def pred_from_loess(self, train_x, train_y, x_to_pred):
        """
    	Trains simple loess regression and returns predictions
    	"""
        kr_model = KernelReg(endog=train_y,
                             exog=train_x,
                             var_type='c',
                             bw=[self.bandwidth])

        return kr_model.fit(x_to_pred)[0]
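A standalone sketch mirroring the method's call (the synthetic data and the fixed bandwidth 0.5 are invented; self.bandwidth above plays the role of the bw entry):

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

train_x = np.linspace(0, 10, 200)
train_y = np.sin(train_x) + np.random.normal(scale=0.2, size=200)
kr_model = KernelReg(endog=train_y, exog=train_x, var_type='c', bw=[0.5])
preds = kr_model.fit(np.array([2.5, 5.0, 7.5]))[0]  # conditional means at the query points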
Example #10
def fitData(X, y, method):
    if method == "simple-lr":
        model = LinearRegression().fit(X, y)
        return model.predict(X)
    elif method == "nonpara-lr":
        model = KernelRidge(kernel='linear').fit(X, y)
        return model.predict(X)
    elif method == "nonpara-poly":
        model = KernelReg(endog=y, exog=X, var_type='c', reg_type='ll')
        x2 = np.reshape(range(600), (-1, 1))
        return model.fit(x2)[0]
Example #11
 def __init__(self, x, y, yerr=None):
     reg = KernelReg([y], [x], var_type='c', reg_type='ll')
     vals = reg.fit(x)[0]
     self.spline = interp.UnivariateSpline(x,
                                           vals,
                                           w=np.isfinite(vals),
                                           ext='const')
     # calculate RMS and normalize to stop normalization drifting
     xs = np.linspace(np.min(x), np.max(x), 1000)
     ys = self.spline(xs)
     self.rms = np.sqrt(np.sum(ys**2) / 1000)
Example #12
def compute_arrival_rate(volume, duration, strikes):
    volume_duration = pd.concat([volume.sum(), duration.sum()],
                                keys=['Volume', 'Duration'],
                                axis=1)
    volume_duration_kernel = volume_duration.apply(
        lambda vd: vd.groupby('Half-spread').apply(lambda d: KernelReg(
            d.xs(d.name, level='Half-spread'),
            d.xs(d.name, level='Half-spread').index, 'c', 'lc')))
    arrival_rate = volume_duration_kernel.apply(
        lambda vd: vd.groupby('Half-spread').apply(lambda k: pd.Series(
            k.xs(k.name).fit(strikes)[0], strikes)))
    return np.log(arrival_rate['Volume'] / arrival_rate['Duration'])
Example #13
    def apply_code(self, mqb, ctx):
        predictions = []

        ctx['iter_count'] += 1

        for duration in self.DURATIONS:

            if ctx['iter_count'] > duration * self.TIMES_IN_WINDOW:
                close_mid_values = mqb['close_mid'].last_with_duration(
                    self.TIMES_IN_WINDOW, duration)
                indexes = linspace(1., len(close_mid_values),
                                   len(close_mid_values))
                close_prices = pd.Series(index=indexes, data=close_mid_values)
                prices = close_prices.copy()

                kr = KernelReg([prices.values], [prices.index.values],
                               var_type='c',
                               bw=[1.8])  # the second bandwidth entry was ignored for the single regressor

                max_mins = self.find_max_min(prices, kr)

                if max_mins.shape[0] == 5:
                    e1 = max_mins.iloc[0]
                    e2 = max_mins.iloc[1]
                    e3 = max_mins.iloc[2]
                    e4 = max_mins.iloc[3]
                    e5 = max_mins.iloc[4]

                    if e1 > e2 and e3 > e2 and e5 > e2 and e1 > e4 and e3 > e4 and e5 > e4:
                        if e5 > e3 > e1 and e2 < e4:
                            if close_mid_values[-1] > e5:
                                prediction = {'duration': duration, 'value': 1}
                                predictions.append(prediction)

                elif max_mins.shape[0] == 6:
                    e1 = max_mins.iloc[0]
                    e2 = max_mins.iloc[1]
                    e3 = max_mins.iloc[2]
                    e4 = max_mins.iloc[3]
                    e5 = max_mins.iloc[4]
                    e6 = max_mins.iloc[5]

                    if e1 > e2 and e3 > e2 and e5 > e2 and e1 > e4 and e3 > e4 and e5 > e4 and e1 > e6 and e3 > e6 and e5 > e6:
                        if e1 < e3 < e5 and e6 < e4 < e2:
                            if close_mid_values[-1] < e6:
                                prediction = {
                                    'duration': duration,
                                    'value': -1
                                }
                                predictions.append(prediction)

        return predictions
Example #14
def find_extrema(s, bw='cv_ls'):
    """
    Input:
        s: prices as pd.Series
        bw: bandwidth as str or array-like
    Returns:
        prices: with 0-based index as pd.Series
        extrema: extrema of prices as pd.Series
        smoothed_prices: smoothed prices using kernel regression as pd.Series
        smoothed_extrema: extrema of smoothed_prices as pd.Series
    """
    # Copy series so we can replace index and perform non-parametric
    # kernel regression.
    prices = s.copy()
    prices = prices.reset_index()
    prices.columns = ['date', 'price']
    prices = prices['price']

    kr = KernelReg([prices.values], [prices.index.to_numpy()],
                   var_type='c',
                   bw=bw)
    f = kr.fit([prices.index])

    # Use smoothed prices to determine local minima and maxima
    smooth_prices = pd.Series(data=f[0], index=prices.index)
    smooth_local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    smooth_local_min = argrelextrema(smooth_prices.values, np.less)[0]
    local_max_min = np.sort(
        np.concatenate([smooth_local_max, smooth_local_min]))
    smooth_extrema = smooth_prices.loc[local_max_min]

    # Iterate over extrema arrays returning datetime of passed
    # prices array. Uses idxmax and idxmin to window for local extrema.
    price_local_max_dt = []
    for i in smooth_local_max:
        if (i > 1) and (i < len(prices) - 1):
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in smooth_local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    maxima = pd.Series(prices.loc[price_local_max_dt])
    minima = pd.Series(prices.loc[price_local_min_dt])
    extrema = pd.concat([maxima, minima]).sort_index()

    # Return series for each with bar as index
    return extrema, prices, smooth_extrema, smooth_prices
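A hedged usage sketch (synthetic random-walk prices, invented here; pandas, numpy, KernelReg, and scipy.signal's argrelextrema are assumed imported as in the surrounding context):

import numpy as np
import pandas as pd

dates = pd.date_range('2020-01-01', periods=250)
s = pd.Series(100 + np.random.randn(250).cumsum(), index=dates)
extrema, prices, smooth_extrema, smooth_prices = find_extrema(s, bw=[5.0])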
Example #15
def find_max_min(prices):
    """
    Locate the local maxima and minima of a price series.
    """

    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    kr = KernelReg([prices_.values], [prices_.index.values],
                   var_type='c',
                   bw=[1.8])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())  # index label, not position

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price

    return max_min
Example #16
def calc_smooth(prices: pd.Series, *, bw: Union[np.ndarray, str] = 'cv_ls', a: float = None, use_array: bool = True) -> Union[pd.Series, np.ndarray]:
    """Compute the Nadaraya-Watson kernel estimate of the price data.

    Args:
        prices (pd.Series): price data
        bw (Union[np.ndarray, str]): Either a user-specified bandwidth or the method for bandwidth selection. Defaults to cv_ls.
        a (float, optional): bandwidth scaling factor described in the paper. Defaults to None.
        use_array (bool, optional): if True return an ndarray, otherwise a pd.Series. Defaults to True.

    Returns:
        Union[pd.Series, np.ndarray]
    """
    if not isinstance(prices, pd.Series):
        raise ValueError('prices must be a pd.Series')

    idx = np.arange(len(prices))

    kr = KernelReg(prices.values, idx,
                   var_type='c', reg_type='ll', bw=bw)

    if a is not None:
        kr.bw = a * kr.bw  # the paper uses 0.3 * h

    f = kr.fit(idx)[0]

    if use_array:
        return f
    return pd.Series(data=f, index=prices.index)
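A hedged usage sketch (synthetic prices, invented here; a=0.3 follows the paper reference in the comment above):

import numpy as np
import pandas as pd

prices = pd.Series(100 + np.random.randn(500).cumsum())
smoothed = calc_smooth(prices, bw='cv_ls', a=0.3, use_array=False)  # pd.Series aligned to prices.index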
Example #17
def estimator_nw(data, est_kwargs={}, **kwargs):
    from statsmodels.nonparametric.kernel_regression import KernelReg
    #http://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kernel_density.EstimatorSettings.html
    from statsmodels.nonparametric.kernel_regression import EstimatorSettings
    k = len(data['x']['Train'].T)
    #    n = len(data['x']['Train'])

    # Allow locally linear estimation; default is local constant (Nadaraya-Watson)
    reg_type = est_kwargs.get('reg_type', 'lc')

    #Estimate model
    nw = KernelReg(
        data['y']['Train'],
        data['x']['Train'],  #Fits regression
        var_type='c' * k,  #Continuous variables
        reg_type=reg_type,
        bw='aic',  # AIC Hurvich bandwidth selection; use 'cv_ls' for least-squares cross-validation
        defaults=EstimatorSettings(
            n_jobs=1,  #No parallel
            efficient=True,
            randomize=True,  #bw estimation random subsampling
            n_res=25,  #Number of resamples
            n_sub=50,  # Size of samples 
        ),
    )
    betahat = np.array([])  # the nonparametric model has no coefficients

    # Extract results
    prob, mrgeff = {}, {}
    for split in ('Train', 'Test'):
        prob[split], mrgeff[split] = nw.fit(data_predict=data['x'][split])

    return betahat, prob, mrgeff
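A sketch of the nested data layout this estimator expects (the 'Train'/'Test' split names come from the function body; the arrays themselves are invented):

import numpy as np

n, k = 200, 2
x = np.random.randn(n, k)
y = x @ np.array([1.0, -0.5]) + 0.1 * np.random.randn(n)
data = {'x': {'Train': x[:150], 'Test': x[150:]},
        'y': {'Train': y[:150], 'Test': y[150:]}}
betahat, prob, mrgeff = estimator_nw(data, est_kwargs={'reg_type': 'll'})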
Example #18
def find_max_min(prices):
    prices_ = prices.copy()
    prices_.index = linspace(1., len(prices_), len(prices_))
    #kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[1.8, 1])
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[2])  # small bw captures local detail, large bw the global shape!
    # Either a user-specified bandwidth or the method for bandwidth selection.
    # If a string, valid values are 'cv_ls' (least-squares cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
    # Default is 'cv_ls'.
    f = kr.fit([prices_.index.values])

    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]
    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())  # index label, not position

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price

    return max_min
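A quick sketch of the bandwidth trade-off noted in the comment above (synthetic data, invented here): a small bandwidth follows local wiggles, a large one recovers the global shape.

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

x = np.arange(200, dtype=float)
y = np.sin(x / 15) + np.random.normal(scale=0.3, size=200)
narrow = KernelReg([y], [x], var_type='c', bw=[2]).fit([x])[0]   # tracks local detail
wide = KernelReg([y], [x], var_type='c', bw=[20]).fit([x])[0]    # smooth global trend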
Example #19
 def fit(self, X_train, y_train):
     # By default, this function will do a local linear regression
     self.regression = KernelReg(y_train, X_train, var_type='c')
     return self
Example #20
    def __init__(self,
                 X,
                 causes,
                 effects,
                 admissable_set=[],
                 variable_types=None,
                 expectation=False,
                 density=True):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z) 
        for some admissable set of control variables, Z.  First we 
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
        else:
            self.variable_types = self.__infer_variable_types(X)
        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [
            self.variable_types[var] for var in conditional_density_vars
        ]
        density_types = [self.variable_types[var] for var in admissable_set]

        if 'c' not in self.variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c'
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))
Example #21
xin = pd.read_csv('xin.csv', names=['x'])
xin_list = []
for row in range(xin.shape[0]):
    xin_list.append(float(xin.iloc[row, 0]))

yin = pd.read_csv('yin.csv', names=['y'])
yin_list = []
for row in range(yin.shape[0]):
    yin_list.append(float(yin.iloc[row, 0]))

x = np.array(xin_list)
y = np.array(yin_list)

df = pd.concat([yin, xin], axis=1)

# Using statsmodels: regress y on x with a local linear kernel regression
kr = KernelReg(y, x, var_type='c', reg_type='ll', bw=[3.2])

estimator = kr.fit(x)
estimator = np.reshape(estimator[0], df.shape[0])

plt.scatter(x, y)
plt.scatter(x, estimator, c='r')
plt.show()

# Using SKFDA

df_grid=skfda.FDataGrid(df)

bandwidth = np.arange(0.1, 5, 0.2)

llr = val.SmoothingParameterSearch(
Example #22
    def __init__(
        self,
        X,
        causes,
        effects,
        admissable_set=[],
        variable_types=None,
        expectation=False,
        density=True,
    ):
        """
        We want to calculate the causal effect of X and Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z)
        for some admissable set of control variables, Z.  First we
        calculate the conditional density P(Y|X,Z), then the density
        P(Z).  We find the support of Z so we can properly sum over
        it later.  variable_types are a dictionary with the column name
        pointing to an element of set(['o', 'u', 'c']), for 'ordered',
        'unordered discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = list(
            admissable_set
        )  # uses a list internally; AdjustForDirectCauses.admissable_set returns a set
        self.conditional_density_vars = conditional_density_vars

        if (
            len(X) > 300
            or max(len(causes + admissable_set), len(effects + admissable_set)) >= 3
        ):
            self.defaults = EstimatorSettings(n_jobs=4, efficient=True)
        else:
            self.defaults = EstimatorSettings(n_jobs=-1, efficient=False)

        if variable_types:
            self.variable_types = variable_types
        else:
            self.variable_types = self.__infer_variable_types(X)
        dep_type = [self.variable_types[var] for var in effects]
        indep_type = [self.variable_types[var] for var in conditional_density_vars]
        density_types = [self.variable_types[var] for var in admissable_set]

        if "c" not in self.variable_types.values():
            bw = "cv_ml"
        else:
            bw = "normal_reference"

        if admissable_set:
            self.density = KDEMultivariate(
                X[admissable_set],
                var_type="".join(density_types),
                bw=bw,
                defaults=self.defaults,
            )

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type="".join(dep_type),
            indep_type="".join(indep_type),
            bw=bw,
            defaults=self.defaults,
        )
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                "".join(indep_type),
                bw="cv_ls",
            )

        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type in ["o", "u"]
        ]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set))
        )
        self.continuous_variables = [
            variable
            for variable, var_type in self.variable_types.items()
            if var_type == "c"
        ]
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set))
        )
Example #23
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 00:18:16 2019

KernelReg practice

@author: mbattley
"""

from statsmodels.nonparametric.kernel_regression import KernelReg
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x) + np.random.random(100) * 0.2
# The third parameter specifies the type of the variable x;
# 'c' stands for continuous
kr = KernelReg(y, x, 'c')
plt.plot(x, y, '+')
y_pred, mfx = kr.fit(x)  # fit() returns (conditional mean, marginal effects)
plt.plot(x, y_pred)
plt.show()
Example #24
def selector(case):
    if case == 1:
        results_dir = create_results_directory('./results/paper/dtr_vs_xgb')
        x, y = load_boston(return_X_y=True)
        x = pd.DataFrame(x,
                         columns=[
                             'crime', 'zn', 'indus', 'chas', 'nox', 'rm',
                             'age', 'dis', 'rad', 'tax', 'ptratio', 'blacks',
                             'lstat'
                         ])
        x = x[['rm', 'lstat']]
        df_all = x.copy()
        df_all['price'] = y

        # Plot 3D scatter
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_all['rm'], df_all['lstat'], df_all['price'])
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/scatter.png')
        plt.close()

        dtr = DecisionTreeRegressor(max_depth=2)
        dtr.fit(x, y)
        plot_tree(dtr, impurity=False)
        plt.savefig(f'{results_dir}/dtr_visual.png')
        plt.close()

        x_min = x.min(axis=0)
        x_max = x.max(axis=0)

        rm_linspace = np.linspace(x_min['rm'], x_max['rm'], 100)
        lstat_linspace = np.linspace(x_min['lstat'], x_max['lstat'], 100)

        rm, lstat = np.meshgrid(rm_linspace, lstat_linspace)
        points = np.stack(map(np.ravel, (rm, lstat)), axis=1)
        z = dtr.predict(points).reshape(rm.shape)

        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')  # gca(projection=...) was removed in Matplotlib 3.6
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/dtr_prediction.png')
        plt.close()

        # Linear regression
        lr = LinearRegression().fit(x, y)
        z = lr.predict(points).reshape(rm.shape)
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/lr_prediction.png')
        plt.close()

        # Kernel regression
        kr = KernelReg(exog=x, endog=y, var_type='cc')
        z = kr.fit(points)[0].reshape(rm.shape)
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/kr_prediction.png')
        plt.close()

        # XGB
        hparams = {
            'seed': 42,
            'booster': 'gbtree',
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'subsample': 1,
            'max_depth': 2,
            'colsample_bytree': 0.5,
        }
        dtrain = xgb.DMatrix(x.values, label=y)
        model = xgb.train(hparams,
                          dtrain=dtrain,
                          num_boost_round=100,
                          verbose_eval=False)
        z_xgb = model.predict(xgb.DMatrix(points)).reshape(rm.shape)

        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.plot_surface(rm,
                        lstat,
                        z_xgb,
                        cmap=plt.cm.BuGn,
                        linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/xgb_prediction.png')
Example #25
 def fit(self, X_train, y_train):
     N, p = X_train.shape
     self.kernel = KernelReg(y_train, X_train, var_type=p * 'c')

Example #26
target_dim = 5

from sklearn.decomposition import KernelPCA

kpca = KernelPCA(n_components=target_dim,
                 kernel="precomputed",
                 eigen_solver="auto",
                 tol=1e-9,
                 max_iter=3000,
                 n_jobs=-1)
feature_coords = kpca.fit_transform((sim_mat**2) * -0.5)

from statsmodels.nonparametric.kernel_regression import KernelReg

landfalls = np.array([float(h.made_landfall) for h in hurricane_list])

inds = np.argsort(feature_coords[:, 0])

feature_coords_sorted = feature_coords[inds]
landfalls_sorted = landfalls[inds]

vartypes = 'c' * target_dim  # one 'c' (continuous) per feature
reg = KernelReg(landfalls_sorted, feature_coords_sorted, vartypes)
[mean, mfx] = reg.fit()

plt.figure()
plt.scatter(feature_coords_sorted[:, 0], landfalls_sorted, color="green")
plt.plot(feature_coords_sorted[:, 0], mean, color="red")
plt.show()
Example #27
def get_regularized_params(
    model_parameters,
    genes,
    genes_step1,
    genes_log10_gmean_step1,
    genes_log10_gmean,
    cell_attr,
    umi,
    batch_var=None,
    bw_adjust=3,
    gmean_eps=1,
    theta_regularization="od_factor",
    exclude_poisson=False,
    poisson_genes=None,
    method="theta_ml",
):
    model_parameters = model_parameters.copy()

    model_parameters_fit = pd.DataFrame(
        npy.nan, index=genes, columns=model_parameters.columns
    )

    """
    exog_predict = genes_log10_gmean#.values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1#.values
        bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)#.values)
        reg = KernelReg(endog=endog, exog=exog_fit, var_type="c", reg_type="ll", bw=bw)
        model_parameters_fit[column] = reg.fit(exog_predict)[0]

    """
    x_points_df = pd.DataFrame({"gene_log10_gmean": genes_log10_gmean})
    x_points_df["min_gene_log10_gmean_step1"] = genes_log10_gmean_step1.min()

    x_points_df["x_points"] = npy.nanmax(x_points_df, axis=1)
    x_points_df["max_gene_log10_gmean_step1"] = npy.nanmax(genes_log10_gmean_step1)
    x_points_df["x_points"] = x_points_df[
        ["x_points", "max_gene_log10_gmean_step1"]
    ].min(1)
    x_points = x_points_df["x_points"].values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1  # .values
        if method == "glgmp":
            bw = bw_SJr(genes_log10_gmean_step1, bw_adjust=bw_adjust)  # .values)
            params = ksmooth(genes_log10_gmean, genes_log10_gmean_step1, endog, bw[0])
            index = model_parameters_fit.index.values[params["order"] - 1]
            model_parameters_fit.loc[index, column] = params["smoothed"]
        else:
            bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)  # .values)
            reg = KernelReg(endog=endog, exog=exog_fit, var_type="c", reg_type="ll", bw=bw)
            fit = reg.fit(x_points)
            model_parameters_fit[column] = npy.squeeze(fit[0])

    if theta_regularization == "theta":
        theta = npy.power(10, (model_parameters["od_factor"]))
    else:
        theta = npy.power(10, genes_log10_gmean) / (
            npy.power(10, model_parameters_fit["od_factor"]) - 1
        )
    model_parameters_fit["theta"] = theta
    if exclude_poisson:
        # replace theta with inf
        if poisson_genes is not None:
            model_parameters_fit.loc[poisson_genes, "theta"] = npy.inf

    return model_parameters_fit
Example #28
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.kernel_regression import KernelReg

x = np.sort(np.random.rand(400) * 10 - 2)
y = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * (x) + 14 + (
    (np.random.rand(len(x)) - 0.5) * 50)
y_clean = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * (x) + 14

reg = KernelReg(y, x, 'c')
[mean, mfx] = reg.fit()

plt.figure()
plt.scatter(x, y)
plt.plot(x, mean, color="red")
plt.plot(x, y_clean, color="green")
plt.show()
Example #29
fairK = np.array((3, 5, 9, 15, 20, 25, 30, 35, 40, 45))

event_lengths = durs_run1_new / fairK

unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()

test_x = np.linspace(min(x), max(x), num=100)
smooth_wva = np.zeros((len(unique_event_lengths), len(ROI_data), nBoots))

opt_bw_holder = np.zeros((nBoots, len(ROI_data)))

for ROI in range(len(ROI_data)):
    for b in range(nBoots):
        opt_bw = 0
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c')
        opt_bw += KR.bw / len(ROI_data)
        opt_bw_holder[b, ROI] = opt_bw
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_bandwidths', smooth_wva)
np.save(
    datadir + 'smooth_' + suffix + '_' + save_fn +
    '_auto_independent_optimal_bandwidth', opt_bw_holder)
Example #30
    def __init__(self,
                 f,
                 f2,
                 pts3d,
                 left_pts,
                 right_pts,
                 oldpts3d,
                 safety_check=False):
        self.f = f
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:, 2])
        self.maximum = np.max(self.pts3d[:, 2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts
        pts2d = []
        ptsz = []
        f3 = open("../calibration_data/camera_matrix.p", "rb")
        self.cmat = pickle.load(f3)
        f3.close()

        for pt in pts3d:
            pts2d.append(pt[:2])
            ptsz.append(np.ceil(pt[2] * 1000000))
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)
        self.f = scipy.interpolate.Rbf(np.matrix(pts3d)[:, 0].ravel(),
                                       np.matrix(pts3d)[:, 1].ravel(),
                                       np.matrix(pts3d)[:, 2].ravel(),
                                       function='linear',
                                       epsilon=.1)
        pts3d = np.array(pts3d).T
        print(pts3d.shape)
        print(pts3d[:2, :].shape, pts3d[2, :].shape)
        self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')