Example #1
def main(ifile, n=''):

    # Message to terminal
    print 'processing file:', ifile, '...'

    # Check for empty file
    if os.stat(ifile).st_size == 0:
        print 'input file is empty!'
        return

    print 'loading data ...'

    # Determine input file type
    if not ifile.endswith(('.h5', '.H5', '.hdf', '.hdf5')):
        print "input file must be in hdf5-format"
        return

    # Input variables names
    xvar, yvar, tvar, zvar, svar, ivar, ovar = icol

    # Load all 1d variables needed
    with h5py.File(ifile, 'r') as fi:

        # Read in needed variables
        lon = fi[xvar][:]  # Longitude (deg)
        lat = fi[yvar][:]  # Latitude  (deg)
        time = fi[tvar][:]  # Time      (yrs)
        elev = fi[zvar][:]  # Height    (meters)
        sigma = fi[svar][:]  # RMSE      (meters)
        mode = fi[ivar][:]  # Mission   (int)
        oind = fi[ovar][:] if ovar in fi else np.ones(
            lon.shape)  # Outliers  (int)

    # Mask of valid (non-NaN) observations
    inan = ~np.isnan(elev) & ~np.isnan(oind)

    # Remove NaN values from arrays
    lon, lat, time, elev, sigma, mode = lon[inan], lat[inan], time[inan], \
                                        elev[inan], sigma[inan], mode[inan]

    # Select only observations inside time interval
    itime = (time > t1lim) & (time < t2lim)

    # Select wanted time span
    lon, lat, time, elev, sigma, mode = lon[itime], lat[itime], time[itime], \
                                        elev[itime], sigma[itime], mode[itime]

    # Select only wanted missions - not mission 4
    imode = (mode != 4)

    # Select wanted modes
    lon, lat, time, elev, sigma, mode = lon[imode], lat[imode], time[imode], \
                                        elev[imode], sigma[imode], mode[imode]

    # EPSG number for lon/lat proj
    projGeo = '4326'

    # EPSG number for grid proj
    projGrd = proj

    print 'converting lon/lat to x/y ...'

    # Get geographic boundaries + max search radius
    if bbox:

        # Extract bounding box
        (xmin, xmax, ymin, ymax) = bbox

        # Transform coordinates
        (x, y) = transform_coord(projGeo, projGrd, lon, lat)

        # Select data inside bounding box
        ig = (x >= xmin - dmax) & (x <= xmax + dmax) & \
             (y >= ymin - dmax) & (y <= ymax + dmax)

        # Check bbox for obs.
        if len(x[ig]) == 0:
            print 'no data points inside bounding box!'
            return

        # Cut data to bounding box limits
        lon, lat, time, elev, sigma, mode = lon[ig], lat[ig], time[ig], \
                                            elev[ig], sigma[ig], mode[ig]

    else:

        # Convert into stereographic coordinates
        (x, y) = transform_coord(projGeo, projGrd, lon, lat)

        # Get bbox from data
        (xmin, xmax, ymin, ymax) = x.min(), x.max(), y.min(), y.max()

    # Construct solution grid - add border to grid
    (Xi, Yi) = make_grid(xmin - 10e3, xmax + 10e3, ymin - 10e3, ymax + 10e3,
                         dx, dy)

    # Convert to geographical coordinates
    (LON, LAT) = transform_coord(projGrd, projGeo, Xi, Yi)

    # Flatten prediction grid
    xi = Xi.ravel()
    yi = Yi.ravel()

    # Zip data to vector
    coord = zip(x.ravel(), y.ravel())

    print 'building the k-d tree ...'

    # Construct KD-Tree
    Tree = cKDTree(coord)

    # Number of months of time series
    months = len(np.arange(t1lim, t2lim + tstep, tstep))

    # Total number of columns
    ntot = months + 4

    # Create output array
    OFILE0 = np.ones((len(xi), 23)) * 9999
    OFILE1 = np.ones((len(xi), ntot)) * 9999
    OFILE2 = np.ones((len(xi), ntot)) * 9999
    OFILE3 = np.ones((len(xi), ntot)) * 9999
    OFILE4 = np.ones((len(xi), ntot)) * 9999

    # Save corrected rate
    b_rate = np.ones((len(xi), 1)) * np.nan
    # Set up search cap
    dr = np.arange(dmin, dmax + 2e3, 2e3)

    # Enter prediction loop
    for i in xrange(len(xi)):

        # Number of observations
        nobs = 0

        # Time difference
        dt = 0

        # Temporal sampling
        npct = 1

        # Number of sensors
        nsen = 0

        # Meet data constraints
        for ii in xrange(len(dr)):

            # Query the Tree with data coordinates
            idx = Tree.query_ball_point((xi[i], yi[i]), dr[ii])

            # Check for empty arrays
            if len(time[idx]) == 0:
                continue

            # Constraints parameters
            dt = np.max(time[idx]) - np.min(time[idx])
            nobs = len(time[idx])
            nsen = len(np.unique(mode[idx]))

            # Bin time vector
            t_sample = binning(time[idx], time[idx], t1lim, t2lim, 1.0 / 12.,
                               5, 5)[1]

            # Test for null vector
            if len(t_sample) == 0: continue

            # Sampling fraction
            npct = np.float(len(t_sample[~np.isnan(t_sample)])) / len(t_sample)

            # Constraints
            if nobs > nlim:
                if dt > dtlim:
                    if nsen >= nmlim:
                        if npct > 0.70:
                            break

        # Final test of data coverage
        if (nobs < nlim) or (dt < dtlim): continue

        # Parameters for model-solution
        xcap = x[idx]
        ycap = y[idx]
        tcap = time[idx]
        hcap = elev[idx]
        scap = sigma[idx]
        mcap = mode[idx]

        # Centroid of all data
        xc = np.median(xcap)
        yc = np.median(ycap)

        # Get reference
        mref = mref_

        # Reference to specific mission
        if len(hcap[mcap == mref]) > 0:

            # Tie to laser surface
            hcap -= np.median(hcap[mcap == mref])

        elif len(hcap[mcap == (mref + 1)]) > 0:

            # Tie to SARin surface
            hcap -= np.median(hcap[mcap == (mref + 1)])

            # Change mission tie index
            mref += 1

        else:

            # Tie to mean surface
            hcap -= np.median(hcap)

        #
        # Least-Squares Adjustment
        # ---------------------------------
        #
        # h =  x_t + x_j + x_s
        # x = (A' W A)^(-1) A' W y
        # r = y - Ax
        #
        # ---------------------------------
        #
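        # Here x_t is the polynomial trend (rate and acceleration), x_s the
        # seasonal Fourier terms and x_j the per-mission step biases; the
        # normal-equations form above is shown for reference, while the
        # system below is actually solved with robust iteratively
        # reweighted least squares (sm.RLM).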

        # Threshold for outliers in each bin
        alpha = 5.0

        # Convergence tolerance (%)
        tol = 3.0

        # Time series binning of each mission
        (tcap, hcap, scap, ncap, mcap) = bin_mission(tcap, hcap, mcap, scap,
                                                     t1lim, t2lim, tstep, tol,
                                                     alpha)

        # Size of original observational matrix
        (n, m) = hcap.T.shape

        # Unravel array to vectors
        tcap = tcap.T.ravel()
        hcap = hcap.T.ravel()
        scap = scap.T.ravel()
        mcap = mcap.T.ravel()

        # Additional outlier editing
        inan = np.isnan(
            binfilt(tcap.copy(), hcap.copy(), tcap.min(), tcap.max(), 3.0,
                    3. / 12.))

        # Set outliers to NaN
        hcap[inan] = np.nan
        scap[inan] = np.nan
        mcap[inan] = np.nan

        # Trend component
        dti = tcap - tref

        # Compute new statistics
        (nobs, tspan) = len(hcap[~np.isnan(hcap)]), tcap.max() - tcap.min()

        # Reject grid node if true
        if (nobs < nlim) & (tspan < dtlim): continue

        # Four-term fourier series for seasonality
        cos1 = np.cos(2 * np.pi * dti)
        sin1 = np.sin(2 * np.pi * dti)
        cos2 = np.cos(4 * np.pi * dti)
        sin2 = np.sin(4 * np.pi * dti)

        # Construct bias parameters
        b_ice1 = np.zeros(hcap.shape)
        b_csin = np.zeros(hcap.shape)
        b_clrm = np.zeros(hcap.shape)
        b_ra21 = np.zeros(hcap.shape)
        b_ra22 = np.zeros(hcap.shape)
        b_ers1 = np.zeros(hcap.shape)
        b_ers2 = np.zeros(hcap.shape)

        # Set unit-step functions (0/1)
        b_ers1[mcap == 6] = 1.
        b_ers2[mcap == 5] = 1.
        b_ice1[mcap == 0] = 1.
        b_ra21[mcap == 3] = 1.
        b_ra22[mcap == 4] = 1.
        b_csin[mcap == 1] = 1.
        b_clrm[mcap == 2] = 1.

        # Design matrix for adjustment procedure
        Acap = np.vstack((dti, 0.5*dti**2, cos1, sin1, cos2, sin2, b_ice1, \
                          b_csin, b_clrm, b_ra21, b_ra22, b_ers2, b_ers1)).T

        # Try to solve least-squares system
        try:

            # Least-squares bias adjustment
            linear_model = sm.RLM(hcap, Acap, missing='drop')

            # Fit the model to the data
            linear_model_fit = linear_model.fit(maxiter=10)

        # If not possible continue
        except:

            continue

        # Length post editing
        nsol = len(hcap)

        # Coefficients and standard errors
        Cm = linear_model_fit.params
        Ce = linear_model_fit.bse

        # Amplitude of annual seasonal signal
        amp = np.sqrt(Cm[2]**2 + Cm[3]**2)

        # Phase of annual seasonal signal
        phi = np.arctan2(Cm[3], Cm[2]) / (2.0 * np.pi)

        # Compute model residuals
        dh = hcap - np.dot(Acap, Cm)

        # Identify outliers
        inan = np.isnan(iterfilt(dh.copy(), -slim, slim, 5, 3.0))

        # Set outliers to NaN
        hcap[inan] = np.nan
        scap[inan] = np.nan
        mcap[inan] = np.nan

        # Compute RMSE of corrected residuals
        rmse = mad_std(dh[~inan])

        # Bias correction
        h_bias = np.dot(Acap[:, [-7, -6, -5, -4, -3, -2, -1]],
                        Cm[[-7, -6, -5, -4, -3, -2, -1]])

        # Save original uncorrected time series
        horg = hcap.copy()

        # Remove inter mission biases
        hcap -= h_bias

        # Initiate residual cross-calibration flag
        flag = 0

        # Apply post-fit cross-calibration in overlapping areas
        hcap, flag = cross_calibrate(tcap.copy(), hcap.copy(), dh.copy(),
                                     mcap.copy(), 1.0)

        # Binned time for plotting
        tbin = np.arange(t1lim, t2lim, tstep) + 0.5 * tstep

        # Re-format back to arrays
        hbo = horg.reshape(n, m).T
        hbi = hcap.reshape(n, m).T
        tbi = tcap.reshape(n, m).T
        ebi = scap.reshape(n, m).T
        mbi = mcap.reshape(n, m).T

        # Copy original vector
        hcor = np.copy(hbi)

        # Take the weighted average of all mission in each bin
        (hbi_w, ebi_w) = np.ma.average(np.ma.array(hbi, mask=np.isnan(hbi)), \
                         weights=np.ma.array(ebi, mask=np.isnan(ebi)), \
                         axis=0, returned=True)

        # Convert back to original array, with nan's
        hbi_w = np.ma.filled(hbi_w, np.nan)
        ebi_w = np.ma.filled(ebi_w, np.nan)

        # Number of rows to add
        n_add = 6 - len(hbi)

        # Construct binary mask
        binary = hbi_w.copy()

        # Set to zeros (data) and ones (nan)
        binary[~np.isnan(binary)] = 0
        binary[np.isnan(binary)] = 1

        # Apply distance transform
        bwd = distance_transform_edt(binary)

        # Set these values to nan's
        inoip = bwd >= 12

        # Pad by adding rows
        for kx in xrange(n_add):

            # Add rows to obs. matrix
            hbo = np.vstack((hbo, np.ones(hbi_w.shape) * np.nan))
            hbi = np.vstack((hbi, np.ones(hbi_w.shape) * np.nan))
            ebi = np.vstack((ebi, np.ones(hbi_w.shape) * np.nan))
            mbi = np.vstack((mbi, np.ones(hbi_w.shape) * np.nan))
            tbi = np.vstack((tbi, tbin))

        # Pad mission arrays using weighted averages
        hbi = fill(hbi, hbi_w)
        ebi = fill(ebi, ebi_w)

        # Reject grid node if true
        if len(hbi_w[~np.isnan(hbi_w)]) <= 2: continue

        #
        # Kalman state-space model
        # ------------------------
        #
        # z_t = H * x_t + v_t
        # x_t = A * x_t-1 + w_t-1
        #
        # ------------------------
        #
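        # State vector: [height, rate, seasonal cos, seasonal sin]. A advances
        # the height by the rate and rotates the seasonal pair by one monthly
        # step (pi/6), H maps the state onto the stacked per-mission
        # observations, and R/Q below hold the observation and transition
        # covariances.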

        # Create observational matrix
        Ht = np.eye(4)

        # Determine the number of rows to add
        n_add = len(hbi) - 4

        # Rows to observational matrix
        for ky in xrange(n_add):

            # Add rows to obs. matrix
            Ht = np.vstack((Ht, [0, 0, 0, 0]))

        # Populate observational matrix
        Ht[:, [0, 2]] = 1

        # Seasonal signal
        ck = np.cos(np.pi / 6)
        sk = np.sin(np.pi / 6)

        # Transition matrix
        At = [[1.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, +ck, +sk],
              [0.0, 0.0, -sk, +ck]]

        # Observational noise
        Rt = np.diag(np.nanmean(ebi**2, 1))

        # Initial start value of filter
        y0 = np.median(hbi_w[~np.isnan(hbi_w)][0:3])

        # Constrain only transition covariance
        params = ['transition_covariance']

        # Estimating transition covariance from individual missions
        if len(hcap[(mcap <= 3) & (~np.isnan(mcap))]) > 1:

            # Only good missions
            Ct = KalmanFilter(em_vars=params). \
                em(hcap[mcap <= 3], n_iter=2).transition_covariance

        else:

            # All missions
            Ct = KalmanFilter(em_vars=params). \
                em(hcap[~np.isnan(hcap)], n_iter=2).transition_covariance

        # Transition covariance
        Qt = np.diag([0.0, 1.0, 0.5, 0.5]) * tstep * Ct

        # Initial state vector
        m0 = [y0, Cm[0], Cm[2], Cm[3]]

        # Initial state covariance
        q0 = np.diag([0, Ce[0], Ce[2], Ce[3]])**2

        # Create kalman filter
        kf = KalmanFilter(initial_state_mean=m0,
                          initial_state_covariance=q0,
                          transition_matrices=At,
                          observation_matrices=Ht,
                          observation_covariance=Rt,
                          transition_covariance=Qt)

        # Estimate percentage of interpolated data
        n_per = 100 * float(len(hbi_w[np.isnan(hbi_w)])) / len(hbi_w)

        # Mask and transpose array
        hbi_masked = ma.masked_array(hbi, mask=np.isnan(hbi)).T

        # Apply Kalman filter with parameter learning on residuals
        (dh_ts, dh_es) = kf.smooth(hbi_masked)

        # Extract the variance of the height state at each time step
        dh_es = [dh_es[k, 0, 0] for k in xrange(len(dh_es))]

        # Keep the height state as the time series
        dh_ts = dh_ts[:, 0]

        # Compute standard deviation
        dh_es = np.sqrt(dh_es)

        # Mask output array
        dh_ts[inoip] = np.nan
        dh_es[inoip] = np.nan

        # Rename weighted solution
        dh_ws = hbi_w
        dh_ew = ebi_w

        # Convert back to geographical coordinates
        (lon_c, lat_c) = transform_coord(projGrd, projGeo, xc, yc)

        # Final search radius
        radius = dr[ii]

        # Compute new elevation change rate after post-fit residuals
        b_rate = np.polyfit(tbin[~np.isnan(dh_ws)] -
                            tbin[~np.isnan(dh_ws)].mean(),
                            dh_ws[~np.isnan(dh_ws)],
                            2,
                            w=1.0 / dh_ew[~np.isnan(dh_ws)]**2)[1]

        # Save data to output files
        OFILE0[i, :] = np.hstack(
            (lat_c, lon_c, Cm[0], Ce[0], Cm[1], Ce[1], rmse, dt, amp, phi,
             n_per, Cm[[-7, -6, -5, -4, -3, -2,
                        -1]], nobs, nsol, radius, flag, b_rate))
        OFILE1[i, :] = np.hstack(
            (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_ts))
        OFILE2[i, :] = np.hstack(
            (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_es))
        OFILE3[i, :] = np.hstack(
            (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_ws))
        OFILE4[i, :] = np.hstack(
            (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_ew))

        # Print progress
        print str(i) + "/" + str(len(xi))+" Radius: "+ str(np.around(dr[ii], 0)) +" Rate: "+str(np.around(Cm[0]*100,2))+\
              " (cm/yr)"+' Interp: '+str(np.around(n_per,0))+' Rate_adj: '+str(np.around(b_rate*100,2))+" (cm/yr)"

    # Identify unwanted data
    I0 = OFILE0[:, 0] != 9999
    I1 = OFILE1[:, 0] != 9999
    I2 = OFILE2[:, 0] != 9999
    I3 = OFILE3[:, 0] != 9999
    I4 = OFILE4[:, 0] != 9999

    # Remove unwanted data
    OFILE0 = OFILE0[I0, :]
    OFILE1 = OFILE1[I1, :]
    OFILE2 = OFILE2[I2, :]
    OFILE3 = OFILE3[I3, :]
    OFILE4 = OFILE4[I4, :]

    # Check if we have any data
    if len(OFILE0[:, 0]) == 0:
        # Print message
        print " No data to save! "
        return

    # Save solution to disk
    with h5py.File(ifile.replace('.h5', '_sf.h5'), 'w') as f0:

        # Save surface-fit parameters
        f0['sf'] = OFILE0

    with h5py.File(ifile.replace('.h5', '_ts.h5'), 'w') as f1:

        # Save adjusted and merged time series
        f1['ts'] = OFILE1

    with h5py.File(ifile.replace('.h5', '_es.h5'), 'w') as f2:

        # Save error estimate for time series
        f2['es'] = OFILE2

    with h5py.File(ifile.replace('.h5', '_tw.h5'), 'w') as f3:

        # Save weighted-average time series
        f3['tw'] = OFILE3

    with h5py.File(ifile.replace('.h5', '_ew.h5'), 'w') as f4:

        # Save error estimate for weighted time series
        f4['ew'] = OFILE4
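The core of the adjustment step above is a single robust regression. A minimal, self-contained sketch of that step on synthetic data (all names and values below are illustrative, not taken from the script above):

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
t = np.linspace(2003.0, 2009.0, 200)               # decimal years
bias = (t > 2006.0).astype(float)                  # hypothetical second mission
h = 0.05 * (t - 2006.0) + 0.1 * np.sin(2 * np.pi * t) \
    + 0.3 * bias + 0.02 * np.random.randn(t.size)  # synthetic heights

dt = t - t.mean()
A = np.vstack((np.ones_like(t), dt, 0.5 * dt**2,
               np.cos(2 * np.pi * dt), np.sin(2 * np.pi * dt),
               bias)).T                            # trend + seasonal + step bias

fit = sm.RLM(h, A, missing='drop').fit(maxiter=10)
print(fit.params)  # [intercept, rate, accel, cos, sin, mission bias]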
Example #2
def _extract_sdm_params(ee, tc, iph, io, rs, rsh, n, u, specs, const,
                        model):
    # Get single diode model parameters from five parameters iph, io, rs, rsh
    # and n vs. effective irradiance and temperature
    try:
        import statsmodels.api as sm
    except ImportError:
        raise ImportError(
            'Parameter extraction using Sandia method requires statsmodels')

    tck = tc + 273.15
    tok = const['T0'] + 273.15  # convert T0 to K

    params = {}

    if model == 'pvsyst':
        # Estimate I_o_ref and EgRef
        x_for_io = const['q'] / const['k'] * (1. / tok - 1. / tck[u]) / n[u]

        # Estimate R_sh_0, R_sh_ref and R_sh_exp
        # Initial guesses. R_sh_0 is value at ee=0.
        nans = np.isnan(rsh)
        if any(ee < 400):
            grsh0 = np.mean(rsh[np.logical_and(~nans, ee < 400)])
        else:
            grsh0 = np.max(rsh)
        # Rsh_ref is value at Ee = 1000
        if any(ee > 400):
            grshref = np.mean(rsh[np.logical_and(~nans, ee > 400)])
        else:
            grshref = np.min(rsh)
        # PVsyst default for Rshexp is 5.5
        R_sh_exp = 5.5

        # Find parameters for Rsh equation

        def fun_rsh(x, rshexp, ee, e0, rsh):
            tf = np.log10(_rsh_pvsyst(x, rshexp, ee, e0)) - np.log10(rsh)
            return tf

        x0 = np.array([grsh0, grshref])
        beta = optimize.least_squares(
            fun_rsh, x0, args=(R_sh_exp, ee[u], const['E0'], rsh[u]),
            bounds=np.array([[1., 1.], [1.e7, 1.e6]]), verbose=2)
        # Extract PVsyst parameter values
        R_sh_0 = beta.x[0]
        R_sh_ref = beta.x[1]

        # parameters unique to PVsyst
        params['R_sh_0'] = R_sh_0
        params['R_sh_exp'] = R_sh_exp

    elif model == 'desoto':
        dEgdT = 0.0002677
        x_for_io = const['q'] / const['k'] * (
            1. / tok - 1. / tck[u] + dEgdT * (tc[u] - const['T0']) / tck[u])

        # Estimate R_sh_ref
        nans = np.isnan(rsh)
        x = const['E0'] / ee[np.logical_and.reduce([u, ee > 400, ~nans])]
        y = rsh[np.logical_and.reduce([u, ee > 400, ~nans])]
        new_x = sm.add_constant(x)
        beta = sm.RLM(y, new_x).fit()
        R_sh_ref = beta.params[1]

        params['dEgdT'] = dEgdT

    # Estimate I_o_ref and EgRef
    y = np.log(io[u]) - 3. * np.log(tck[u] / tok)
    new_x = sm.add_constant(x_for_io)
    res = sm.RLM(y, new_x).fit()
    beta = res.params
    I_o_ref = np.exp(beta[0])
    EgRef = beta[1]

    # Estimate I_L_ref
    x = tc[u] - const['T0']
    y = iph[u] * (const['E0'] / ee[u])
    # average over non-NaN values of Y and X
    nans = np.isnan(y - specs['alpha_sc'] * x)
    I_L_ref = np.mean(y[~nans] - specs['alpha_sc'] * x[~nans])

    # Estimate R_s
    nans = np.isnan(rs)
    R_s = np.mean(rs[np.logical_and.reduce([u, ee > 400, ~nans])])

    params['I_L_ref'] = I_L_ref
    params['I_o_ref'] = I_o_ref
    params['EgRef'] = EgRef
    params['R_sh_ref'] = R_sh_ref
    params['R_s'] = R_s
    # save values for each IV curve
    params['iph'] = iph
    params['io'] = io
    params['rsh'] = rsh
    params['rs'] = rs
    params['u'] = u

    return params
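The I_o_ref / EgRef step above is a robust straight-line fit in transformed variables. A hedged, standalone sketch of just that step (the synthetic io and tc values are made up for illustration):

import numpy as np
import statsmodels.api as sm

q, k, T0 = 1.602e-19, 1.381e-23, 25.        # charge, Boltzmann constant, ref temp (C)
tc = np.array([15., 25., 35., 45., 55.])    # cell temperatures (C)
io = 1e-9 * np.exp(0.06 * (tc - T0))        # synthetic saturation currents (A)
n = 1.1                                     # diode ideality factor

tck, tok = tc + 273.15, T0 + 273.15
x = q / k * (1. / tok - 1. / tck) / n       # same regressor as x_for_io above
y = np.log(io) - 3. * np.log(tck / tok)

res = sm.RLM(y, sm.add_constant(x)).fit()
I_o_ref, EgRef = np.exp(res.params[0]), res.params[1]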
Example #3
 def setup(self):
     #fit for each test, because results will be changed by test
     x = self.exog
     np.random.seed(987689)
     y = x.sum(1) + np.random.randn(x.shape[0])
     self.results = sm.RLM(y, self.exog).fit()
               hue='SITE_ID',
               data=df_pheno_morpho)
    plt.show()
    # and assess whether the effects of each factor
    results = smf.ols(
        'surface_S_C_left ~ AGE_AT_SCAN + C(SITE_ID)+ AGE_AT_SCAN * C(SITE_ID)',
        data=df_pheno_morpho).fit()
    print(results.summary())

    # comparison between OLS and RLM
    y2 = df_pheno_morpho['surface_S_C_left']
    x1 = df_pheno_morpho['AGE_AT_SCAN']
    X = sm.add_constant(x1)
    ols_model = sm.OLS(y2, X).fit()
    print(ols_model.summary())
    rlm_model = sm.RLM(y2, X).fit()
    print(rlm_model.summary())

    # nice figure with confidence intervals
    prstd, iv_l, iv_u = wls_prediction_std(ols_model)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x1, y2, 'o', label="data")
    ax.plot(x1, ols_model.fittedvalues, 'r-', label="OLS")
    ax.plot(x1, iv_u, 'r--')
    ax.plot(x1, iv_l, 'r--')
    ax.plot(x1, rlm_model.fittedvalues, 'g.-', label="RLM")
    legend = ax.legend(loc="best")
    plt.show()

    # influence analysis for outliers detection
Example #5
def solve_iteratively(science, reference, mask_tolerance=10e-5, gain_tolerance=10e-6, max_iterations=5,
                      sigma_cut=5., use_pixels=False, show=False, percent=99, use_mask=True, size_cut=True,
                      pixstack_limit=None):
    """Solve for linear fit iteratively"""

    gain = 1.
    gain0 = 10e5
    i = 1
    # pad image to power of two to speed fft
    old_size = science.shape
    science_image = pad_to_power2(science)
    reference_image = pad_to_power2(reference)

    science_psf = center_psf(resize_psf(science.raw_psf, science_image.shape))
    science_psf /= np.sum(science.raw_psf)
    reference_psf = center_psf(resize_psf(reference.raw_psf, reference_image.shape))
    reference_psf /= np.sum(reference.raw_psf)

    science_std = pad_to_power2(science.background_std)
    reference_std = pad_to_power2(reference.background_std)

    science_mask = pad_to_power2(science.mask, value='bool')
    reference_mask = pad_to_power2(reference.mask, value='bool')

    # fft arrays
    science_image_fft = np.fft.fft2(science_image)
    reference_image_fft = np.fft.fft2(reference_image)
    science_psf_fft = np.fft.fft2(science_psf)
    reference_psf_fft = np.fft.fft2(reference_psf)

    while abs(gain - gain0) > gain_tolerance:

        # calculate the psf in the difference image to convolve masks
        # not a simple convolution of the two PSF's; see the paper for details
        difference_zero_point = gain / np.sqrt(science_std ** 2 + reference_std ** 2 * gain ** 2)
        denominator = science_std ** 2 * abs(reference_psf_fft) ** 2
        denominator += reference_std ** 2 * gain ** 2 * abs(science_psf_fft) ** 2
        difference_psf_fft = gain * science_psf_fft * reference_psf_fft / (difference_zero_point * np.sqrt(denominator))

        if use_mask:
            # convolve masks with difference psf to mask all pixels within a psf radius
            # this is important to prevent convolutions of saturated pixels from affecting the fit
            science_mask_convolved = np.fft.ifft2(difference_psf_fft * np.fft.fft2(science_mask))
            science_mask_convolved[science_mask_convolved > mask_tolerance] = 1
            science_mask_convolved = np.real(science_mask_convolved).astype(int)
            reference_mask_convolved = np.fft.ifft2(difference_psf_fft * np.fft.fft2(reference_mask))
            reference_mask_convolved[reference_mask_convolved > mask_tolerance] = 1
            reference_mask_convolved = np.real(reference_mask_convolved).astype(int)

        # do the convolutions on the images
        denominator = science_std ** 2 * abs(reference_psf_fft) ** 2
        denominator += gain ** 2 * reference_std ** 2 * abs(science_psf_fft) ** 2

        science_convolved_image_fft = reference_psf_fft * science_image_fft / np.sqrt(denominator)
        reference_convolved_image_fft = science_psf_fft * reference_image_fft / np.sqrt(denominator)

        science_convolved_image = np.real(np.fft.ifft2(science_convolved_image_fft))
        reference_convolved_image = np.real(np.fft.ifft2(reference_convolved_image_fft))

        # remove power of 2 padding
        science_convolved_image = science_convolved_image[: old_size[0], : old_size[1]]
        reference_convolved_image = reference_convolved_image[: old_size[0], : old_size[1]]
        if use_mask:
            science_mask_convolved = science_mask_convolved[: old_size[0], : old_size[1]]
            reference_mask_convolved = reference_mask_convolved[: old_size[0], : old_size[1]]
        else:
            science_mask_convolved = None
            reference_mask_convolved = None

        # do a linear robust regression between convolved image
        x, y = join_images(science_convolved_image, science_mask_convolved, reference_convolved_image, 
                           reference_mask_convolved, sigma_cut, use_pixels, show, percent, size_cut, pixstack_limit)
        robust_fit = stats.RLM(y, stats.add_constant(x), stats.robust.norms.TukeyBiweight()).fit()
        parameters = robust_fit.params
        gain0 = gain
        gain = parameters[-1]
        if show:
            import matplotlib.pyplot as plt
            xfit = np.logspace(np.log10(np.min(x)), np.log10(np.max(x)))
            plt.plot(xfit, robust_fit.predict(stats.add_constant(xfit)))
            plt.pause(0.1)

        logging.info('Iteration {0}: Gain = {1}'.format(i, gain))
        if i == max_iterations:
            logging.warning('Maximum regression ({0}) iterations reached'.format(max_iterations))
            break
        i += 1

    logging.info('Fit done in {0} iterations'.format(i))

    return gain
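Per iteration, the regression inside the loop above reduces to one robust straight-line fit between the two convolved images. A minimal sketch of that single step with synthetic pixel vectors (statsmodels is imported directly here rather than through the `stats` alias used above):

import numpy as np
import statsmodels.api as sm

np.random.seed(1)
x = np.random.rand(2000) * 100.             # reference-convolved pixel values
y = 1.8 * x + np.random.randn(2000)         # science-convolved pixels, true gain ~ 1.8
y[::50] += 50.                              # a few outliers (e.g. saturated pixels)

fit = sm.RLM(y, sm.add_constant(x), M=sm.robust.norms.TukeyBiweight()).fit()
gain = fit.params[-1]                       # slope = relative flux gain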
def dealData(bk, begd, endd, adjustPeriods, factorsInfo, FactorName, Path):
    # database connection engine

    tableName = factorsInfo.get("tableName")
    direction = factorsInfo.get("direction")  # factor direction: 1 = ascending, 0 = descending
    reciprocal = factorsInfo.get("reciprocal")  # whether to take the reciprocal of the factor value
    isLogDeal = factorsInfo.get("isLogDeal")  # whether to apply ln to the factor value

    #engine = create_engine('mysql://*****:*****@172.16.158.142/dwlh?charset=utf8')

    store = pd.HDFStore(tableName + '.h5', "r", complevel=9)

    periedValues = []
    # loop over each rebalancing date
    for i in adjustPeriods.index[:-1]:

        adjustDay = adjustPeriods.ix[i, "date"]
        nextAdjustDay = adjustPeriods.ix[i, "nextAdjustDay"]
        logging.warning(u"processing data for " + adjustDay)
        factor = store.select(
            '/' + tableName + '/' + FactorName,
            where=["Date='{date}'".format(date=adjustDay.replace('-', ''))])

        # cast the date field to a datetime type
        factor.con_date = pd.to_datetime(factor.Date)
        factor['stock_code'] = factor["stockID"].apply(lambda x: x[2:])

        # fetch sector info as of the rebalancing date via the TinySoft function getbkByName, which drops stocks that are limit-locked, suspended, or listed for fewer than 120 days on that date
        BKStocks = TSLPy2.RemoteCallFunc('getbkByName2', [
            bk,
            TSLPy2.EncodeDate(int(adjustDay[:4]), int(adjustDay[5:7]),
                              int(adjustDay[8:10]))
        ], {})
        BKStocks = pd.DataFrame(BKStocks[1])
        BKStocks["SWNAME"] = BKStocks["SWNAME"].apply(
            lambda x: x.decode('gbk'))
        BKStocks["stock_code"] = BKStocks["id"].apply(lambda x: x[2:])
        BKStocks["TotalValue"] = BKStocks["TotalValue"].apply(np.log)
        # merge the factor values with the sector info
        factor = factor.merge(BKStocks, on="stock_code")
        # take the reciprocal of the factor values if requested
        if reciprocal == 1:
            factor[FactorName] = factor[FactorName].apply(lambda x: 1 / x
                                                          if x != 0 else x)

        if isLogDeal == 1:
            factor[FactorName] = factor[FactorName].apply(np.log)

        # apply the factor direction (sign)
        factor[FactorName] = factor[FactorName] * direction

        # clip outliers (median +/- 3 * 1.4826 * MAD)
        factorMedia = factor[FactorName].median()
        MAD = (factor[FactorName] - factorMedia).apply(abs).median()
        factor.loc[factor[FactorName] > (factorMedia + 3 * 1.4826 * MAD),
                   FactorName] = factorMedia + 3 * 1.4826 * MAD
        factor.loc[factor[FactorName] < (factorMedia - 3 * 1.4826 * MAD),
                   FactorName] = factorMedia - 3 * 1.4826 * MAD

        # z-score standardization
        factorMean = factor[FactorName].mean()
        factorStd = factor[FactorName].std()
        factor[FactorName] = factor[FactorName].apply(
            lambda x: (x - factorMean) / factorStd
            if factorStd != 0 else (x - factorMean))

        # next-period return series:
        stokzf = pd.DataFrame(
            TSLPy2.RemoteCallFunc('getStockZF', [
                bk,
                TSLPy2.EncodeDate(int(adjustDay[:4]), int(adjustDay[5:7]),
                                  int(adjustDay[8:10])),
                TSLPy2.EncodeDate(int(nextAdjustDay[:4]),
                                  int(nextAdjustDay[5:7]),
                                  int(nextAdjustDay[8:10]))
            ], {})[1])
        factor = factor.merge(stokzf, on="stock_code")
        factor.set_index("stock_code", inplace=True)

        # neutralize the factor for industry and market cap via regression
        factor = factor.dropna()
        # method 1
        #y, X = dmatrices('{factorName} ~  SWNAME + TotalValue'.format(factorName=FactorName), data=factor, return_type='dataframe')
        # method 2
        y = factor[FactorName]
        X = pd.get_dummies(factor['SWNAME'])
        if FactorName != 'CAP':
            X['TotalValue'] = factor['TotalValue']
        X = sm.add_constant(X)

        #res = sm.OLS(y, X).fit()  # regression via OLS
        res2 = sm.RLM(y, X).fit()  # regression via RLM
        #res3= sm.WLS(y, X).fit()  # regression via WLS
        #factorParam = res2.params[FactorName]
        #factorT = res2.tvalues[FactorName]

        # the tinyedFactor column holds the regression residuals, treated as the new factor value
        factor["tinyedFactor2"] = factor[FactorName] - res2.fittedvalues
        factor["tinyedFactor"] = res2.resid
        # t-test between the new factor values and next-period returns, giving T and P values
        factorT, factorP = ttest_rel(factor["tinyedFactor"], factor["zf"])

        # compute the IC and rank-IC values
        IC = factor["zf"].corr(factor["tinyedFactor"])
        rankIC = factor["zf"].corr(factor["tinyedFactor"], method="spearman")

        periedValues.append(
            pd.DataFrame(
                {
                    "FactorName": FactorName,
                    "adjustDay": adjustDay,
                    "IC": IC,
                    "rankIC": rankIC,
                    "factorP": factorP,
                    "factorT": factorT
                },
                index=[0]))
        """
        fig, ax = plt.subplots(figsize=(8,6))

        ax.plot(factor["con_roe"], y, 'o', label="Data")
        #ax.plot(x["con_roe"], y_true, 'b-', label="True")
        ax.plot(factor["con_roe"], res2.fittedvalues, 'r--.', label="RLMPredicted")
        ax.plot(factor["con_roe"], res.fittedvalues, 'b--.', label="OLSPredicted")
        legend = ax.legend(loc="best")
        """
    store.close()
    return periedValues
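The neutralization step above amounts to regressing the factor on sector dummies plus log market cap with sm.RLM and keeping the residuals. A self-contained sketch with made-up data (column names follow the function above, but the values are synthetic):

import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(2)
df = pd.DataFrame({
    'SWNAME': np.random.choice(['bank', 'tech', 'energy'], 300),  # sector labels
    'TotalValue': 10. + np.random.randn(300),                     # log market cap
})
df['factor'] = 0.3 * df['TotalValue'] + np.random.randn(300)

X = pd.get_dummies(df['SWNAME'], drop_first=True).astype(float)
X['TotalValue'] = df['TotalValue']
X = sm.add_constant(X)

res = sm.RLM(df['factor'], X).fit()
df['factor_neutral'] = res.resid   # residuals = industry/size-neutral factor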
Example #7
def fit_linreg_robust(x,
                      y,
                      mask=None,
                      intercept=False,
                      r2=True,
                      est_method="rlm"):
    """Apply robust linear regression of y w.r.t x.

    Arguments
    ---------
    x: :class:`~numpy.ndarray` or sparse `csr_matrix`
        A vector of independent variables.
    y: :class:`~numpy.ndarray` or sparse `csr_matrix`
        A vector of dependent variables.
    intercept: bool
        If using steady state assumption for fitting, then:
        True -- the linear regression is performed with an unfixed intercept;
        False -- the linear regression is performed with a fixed zero intercept.
    est_method: str (default: `rlm`)
        The linear regression estimation method that will be used.

    Returns
    -------
    k: float
        The estimated slope.
    b: float
        The estimated intercept.
    r2: float
        Coefficient of determination or r square calculated with the extreme data points.
    all_r2: float
        The r2 calculated using all data points.
    """

    x = x.A if issparse(x) else x
    y = y.A if issparse(y) else y

    _mask = np.logical_and(~np.isnan(x), ~np.isnan(y))
    if mask is not None:
        _mask &= mask
    xx = x[_mask]
    yy = y[_mask]

    try:
        if est_method.lower() == "rlm":
            xx_ = sm.add_constant(xx) if intercept else xx
            res = sm.RLM(yy, xx_).fit()
            k, b = res.params[::-1] if intercept else (res.params[0], 0)
        elif est_method.lower() == "ransac":
            reg = RANSACRegressor(LinearRegression(fit_intercept=intercept),
                                  random_state=0)
            reg.fit(xx.reshape(-1, 1), yy.reshape(-1, 1))
            k, b = reg.estimator_.coef_[0, 0], (reg.estimator_.intercept_[0]
                                                if intercept else 0)
        else:
            raise ImportError(
                f"estimation method {est_method} is not implemented. "
                f"Currently supported linear regression methods include `rlm` and `ransac`."
            )
    except:
        if intercept:
            ym = np.mean(yy)
            xm = np.mean(xx)

            cov = np.mean(xx * yy) - xm * ym
            var_x = np.mean(xx * xx) - xm * xm
            k = cov / var_x
            b = ym - k * xm
            # # assume b is always positive
            # if b < 0:
            #     k, b = np.mean(xx * yy) / np.mean(xx * xx), 0
        else:
            # use uncentered cov and var_x
            cov = np.mean(xx * yy)
            var_x = np.mean(xx * xx)
            k = cov / var_x
            b = 0

    if r2:
        SS_tot_n, all_SS_tot_n = np.var(yy), np.var(y)
        SS_res_n, all_SS_res_n = (
            np.mean((yy - k * xx - b)**2),
            np.mean((y - k * x - b)**2),
        )
        r2, all_r2 = 1 - SS_res_n / SS_tot_n, 1 - all_SS_res_n / all_SS_tot_n

        return k, b, r2, all_r2
    else:
        return k, b
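A brief usage sketch for the function above, assuming its module-level imports (statsmodels, scipy.sparse, sklearn) are available; the data are synthetic:

import numpy as np

np.random.seed(3)
x = np.random.rand(500) * 10.
y = 2.5 * x + 0.5 + 0.2 * np.random.randn(500)
y[::25] += 8.                                # inject a few outliers

k, b, r2, all_r2 = fit_linreg_robust(x, y, intercept=True, est_method="rlm")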
Example #8
# There may be one or two x variables, and they must be laid out as
# columns raised to the corresponding powers.
# The following set of commands does exactly that:
x_var1 = create_var1(x_cols[0], centering)
x_array = x_var1
for i in range(2, x_degs[0] + 1):
    x_array = np.column_stack((x_array, x_var1**i))
if len(x_cols) > 1:
    x_var2 = create_var2(x_cols[1], centering)
    x_array = np.column_stack((x_array, x_var2))
    for i in range(2, x_degs[1] + 1):
        x_array = np.column_stack((x_array, x_var2**i))
x_array = sm.add_constant(x_array, prepend=True)

rlm_model = sm.RLM(y_array, x_array, M=sm.robust.norms.TukeyBiweight())
results = rlm_model.fit()

# -----------------------------------------------


def centering_back(params, degs):
    A, B1, C1, B2, C2, D2 = 0., 0., 0., 0., 0., 0.
    A = params[0]
    if len(degs) == 1:
        global m_x1, m_x2
        # Because m_x1 was obtained when x1 was built, while m_x2 stayed
        # zero. But here the single variable is treated as the second one
        # (as the 2's in the names of the variables used in this if block
        # confirm).
        m_x1, m_x2 = m_x2, m_x1
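Once `results` has been obtained as above, the coefficients and fitted values come straight from the results object (a short note; `x_array` is the design matrix built above):

coeffs = results.params            # constant first, then powers of x1 (and x2)
y_fit = results.predict(x_array)   # fitted values on the training design matrix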
Example #9
"""Robust Linear Models

Notes
-----
The syntax for the arguments will be shortened to accept string arguments
in the future.
"""

import statsmodels.api as sm

###Example for using Huber's T norm with the default
###median absolute deviation scaling

data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)
huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print hub_results.params
print hub_results.bse

###Or with the 'H2' covariance matrix
hub_results2 = huber_t.fit(cov="H2")
print hub_results2.params
print hub_results2.bse

###Example for using Andrew's Wave norm with
###Huber's Proposal 2 scaling and 'H3' covariance matrix
andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print andrew_results.params
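
###For comparison, an ordinary least-squares fit on the same stackloss data;
###the robust norms above typically down-weight the influential points that
###pull the OLS coefficients
ols_results = sm.OLS(data.endog, data.exog).fit()
print ols_results.params
print ols_results.bse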
Example #10
 def fit(self, y, X):
     model = sm.RLM(y, X, M=sm.robust.norms.HuberT())
     return model.fit()
Example #11
def xr_regression_resid(y):
    date = y.date
    model = sm.RLM(y.values, X.loc[date].values, M=sm.robust.norms.HuberT())
    results = model.fit()
    return xr.DataArray(results.resid)
Example #12
    dt_pos = idx_date.get_loc(cur_date)
    if dt_pos == 0:
        continue
    dt_pre_pos = dt_pos - 1
    # symbols having valid value(not nan)
    s = X[:, dt_pre_pos].notnull().all(axis=0)
    valid_x = X[:, dt_pre_pos, s].symbol.values

    w = y.loc[cur_date].notnull()
    valid_y = y.loc[cur_date, w].symbol.values

    valid_symbol = np.intersect1d(valid_x, valid_y)
    try:
        model = sm.RLM(
            y.loc[cur_date, valid_symbol].values,
            X.isel(date=dt_pre_pos,
                   symbol=idx_symbol.get_indexer(valid_symbol)).values.T,
            M=sm.robust.norms.HuberT())
        results = model.fit()
    except ValueError:
        continue
    params.loc[cur_date] = results.params
    residuals.loc[cur_date, valid_symbol] = results.resid


class RLMModel:
    """ create RLM regression module
    """
    def __init__(self):
        pass
Example #13
def main(files, n=''):

    # Input variables names
    xvar, yvar, tvar, zvar, evar, ivar = icol

    # If cubes for each mission are in separate files,
    # concatenate them and generate a single cube.
    # Each mission (on individual file) will be given a unique identifier.
    for nf, ifile in enumerate(files):
        print 'processing file:', ifile, '...'

        if nf == 0:
            with h5py.File(ifile, 'r') as fi:
                x = fi[xvar][:]  # 1d
                y = fi[yvar][:]  # 1d
                time = fi[tvar][:]  # 1d
                elev = fi[zvar][:]  # 3d
                mode = fi[ivar][:] if ivar in fi \
                        else np.full_like(time, nf)  # 1d
                sigma = fi[evar][:] if evar in fi \
                        else np.full_like(elev, np.nan)  # 3d
        else:
            with h5py.File(ifile, 'r') as fi:
                time = np.hstack((time, fi[tvar][:]))  # 1d
                elev = np.dstack((elev, fi[zvar][:]))  # 3d
                mode = np.hstack((mode, fi[ivar][:] if ivar in fi \
                        else np.full_like(fi[tvar][:], nf)))  # 1d
                sigma = np.dstack((sigma, fi[evar][:] if evar in fi \
                        else np.full_like(fi[zvar][:], np.nan)))  # 3d

    if len(np.unique(mode)) < 2:
        print 'it seems there is only one mission!'
        return

    t1, t2 = np.nanmin(time), np.nanmax(time)  ##TODO: Rethink this

    # Output containers
    zi = np.full_like(elev, np.nan)
    ei = np.full_like(elev, np.nan)
    ni = np.full_like(elev, np.nan)

    # Temporal coverage
    t_pct = np.zeros(elev.shape)

    # Minimum sampling for all mission < 81.5 deg
    nsam = 0.60

    # Enter prediction loop
    for i in xrange(elev.shape[0]):
        for j in xrange(elev.shape[1]):

            # Number of observations
            nobs = 0

            # Time difference
            dt = 0

            # Temporal sampling
            npct = 1

            # Number of sensors
            nsen = 0

            # Final test of data coverage
            #if (nobs < nlim) or (npct < 0.70): continue

            # Parameters for model-solution
            tcap = time[:]
            mcap = mode[:]
            hcap = elev[i, j, :]
            scap = sigma[i, j, :]

            torg = tcap.copy()
            morg = mcap.copy()
            horg = hcap.copy()
            sorg = scap.copy()

            # Least-Squares Adjustment
            # ---------------------------------
            #
            # h =  x_t + x_j + x_s
            # x = (A' A)^(-1) A' y
            # r = y - Ax
            #
            # ---------------------------------

            # Need to think of a smart way to filter out outliers.
            # In particular those at the end of each mission-record!!!
            # Also, need to plot and see how the model fit compares to the data.
            ##FIXME ############################################################

            # compute median series
            ##NOTE: Not needed for calibrating cube series (they are clean)
            if 0:
                hcap = binfilter(tcap,
                                 hcap,
                                 mcap,
                                 window=3,
                                 n_abs=5,
                                 interp=False)

            ##FIXME ############################################################

            if sum(~np.isnan(hcap)) < nlim: continue

            #plt.figure()
            ii = mcap == np.unique(mcap)[0]
            jj = mcap == np.unique(mcap)[1]

            plt.plot(tcap[ii], hcap[ii])
            plt.plot(tcap[jj], hcap[jj])

            dt = tcap - tref  # trend component

            # Create design matrix for alignment
            Acap, cols = design_matrix(dt, mcap)

            try:
                # Least-squares bias adjustment
                linear_model = sm.RLM(hcap, Acap, missing='drop')
                linear_model_fit = linear_model.fit(maxiter=niter)
            except:
                print "Solution invalid!"
                continue

            # Coefficients and standard errors
            Cm = linear_model_fit.params
            Ce = linear_model_fit.bse

            # Compute model residuals
            dh = hcap - np.dot(Acap, Cm)

            # Compute RMSE of corrected residuals (fit)
            rms_fit = mad_std(dh)

            # Bias correction (mission offsets)
            h_cal_fit = np.dot(Acap[:, cols], Cm[cols])

            # Remove inter satellite biases
            horg -= h_cal_fit

            # Plot
            if 1:
                plt.figure()
                plt.plot(torg[ii], horg[ii])
                plt.plot(torg[jj], horg[jj])
                plt.show()

            ##FIXME: This doesn't work. Think of a better strategy!!!!!!!!!!!!
            ##TODO: How/Where to do this??? <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            # Bin full calibrated record
            if 0:
                tmed, hmed, emed, nmed = binning(torg,
                                                 horg,
                                                 xmin=t1,
                                                 xmax=t2,
                                                 dx=1 / 12.,
                                                 window=3 / 12.,
                                                 median=True,
                                                 interp=False)[:4]

            # Interpolate
            '''
            try:
                i_valid = ~np.isnan(hmed)
                i_inval = np.isnan(hmed)
                hmed[i_inval] = np.interp(tmed[i_inval], tmed[i_valid], hmed[i_valid])
            except:
                continue
            '''

            # Reference final solution
            '''
            if 1:
                # To original discrete time step
                idx = find_nearest(tmed, tref)
                hmed -= hmed[idx]
            else:
                # To exact given time epoch 
                href = np.interp(tref, tmed[~np.isnan(hmed)], hmed[~np.isnan(hmed)])
            '''
            """
            zi[i,j,:] = hmed
            ei[i,j,:] = emed
            ni[i,j,:] = nmed
            """

            # Plot crosscal time series
            if 1:
                horg[np.abs(horg) > mad_std(horg) * 5] = np.nan

                plt.figure(figsize=(12, 4))
                plt.scatter(tcap, horg, s=10, c=mcap, alpha=0.7, cmap='tab10')
                plt.scatter(tcap, hcap, s=10, c=mcap, cmap='gray')

                try:
                    plt.figure(figsize=(12, 3.5))
                    plt.plot(tmed, hmed, '-', linewidth=2)
                    plt.ylim(np.nanmin(hmed), np.nanmax(hmed))
                    plt.xlim(t1, t2)
                except:
                    pass

                plt.show()
                continue
            '''
            # Transform coordinates
            (lon_i, lat_i) = transform_coord(projGrd, projGeo, xcap, ycap)
            (lon_0, lat_0) = transform_coord(projGrd, projGeo, xi[i], yi[i])
            
            # ********************** #
            
            # Apply calibration to original data points
            horg -= h_cal_fit
                
            # Save output variables to list for each solution
            lats.append(lat_i)
            lons.append(lon_i)
            lat0.append(lat_0)
            lon0.append(lon_0)
            dxy0.append(dxy)
            h_ts.append(horg)
            e_ts.append(sorg)
            m_id.append(morg)
            h_cf.append(h_cal_fit)
            f_cr.append(flag)
            tobs.append(torg)
            rmse.append(rms_fit)
            '''
            # Transform coordinates

            # Print meta data to terminal
            if (i % 1) == 0:
                print 'Progress:',str(i),'/',str(len(xi)), \
                      'Rate:', np.around(Cm[1],2), \
                      'Acceleration:', np.around(Cm[2],2)

    # Saving the data to file
    print 'Saving data to file ...'
    '''
    ofile = ifile.replace('.h5', '_XCAL_FUSED.h5')
    with h5py.File(ofile, 'w') as f:
        f['h_res'] = zi.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0])
        f['h_err'] = ei.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0])
        f['n_obs'] = ni.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0])
        f['x'] = Xi[0,:]
        f['y'] = Yi[:,0]
        f['t'] = tmed

    print 'out ->', ofile
    '''
    return
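The design_matrix helper called above is not shown in this snippet; a hedged sketch of what such a helper typically looks like for this kind of alignment (trend terms plus one unit-step column per mission, with `cols` indexing the bias columns):

import numpy as np

def design_matrix(dt, missions):
    """Illustrative only: trend, acceleration and per-mission step biases."""
    columns = [np.ones_like(dt), dt, 0.5 * dt**2]
    ids = np.unique(missions)
    for m in ids[1:]:                         # first mission acts as reference
        columns.append((missions == m).astype(float))
    A = np.vstack(columns).T
    cols = range(3, 3 + len(ids) - 1)         # indices of the bias columns
    return A, cols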
Example #14
def main(ifile, n='', robust_fit=True, n_iter=niter):

    # Check for empty file
    if is_empty(ifile):
        print 'SKIP FILE: EMPTY OR CORRUPTED FILE:', ifile
        return

    # Start timing of script
    startTime = datetime.now()

    print 'loading data ...'

    xvar, yvar, tvar, zvar, svar, ivar, cvar = names

    with h5py.File(ifile, 'r') as fi:
        lon = fi[xvar][:]
        lat = fi[yvar][:]
        time = fi[tvar][:]
        height = fi[zvar][:]
        sigma = fi[svar][:] if svar in fi else np.ones(lon.shape)
        id = fi[ivar][:] if ivar in fi else np.ones(lon.shape) * nmidx
        cal = fi[cvar][:] if cvar in fi else np.zeros(lon.shape)

    # Filter in time
    if 1:
        i_time, = np.where(
            (time > 1993.972) & (time < 1995.222))  ##NOTE: To remove ERS-1 GM
        if len(i_time) > 0: height[i_time] = np.nan

    ##NOTE: Filter data based on 'cal' but DO NOT REMOVE NANs!
    if sum(cal) != 0:
        cal[np.isnan(cal)] = 0.  # keep values w/o correction
        height -= cal  # correct absolute H for bs

    # Filter NaNs
    if 1:
        i_valid = ~np.isnan(height)
        lon = lon[i_valid]
        lat = lat[i_valid]
        time = time[i_valid]
        height = height[i_valid]
        sigma = sigma[i_valid]
        id = id[i_valid]
        cal = cal[i_valid]

    projGeo = '4326'  # EPSG number for lon/lat proj
    projGrd = projo  # EPSG number for grid proj

    print 'converting lon/lat to x/y ...'

    # If no bbox was given
    if bbox_ is None:
        try:
            bbox = get_bbox(ifile)  # Try reading bbox from file name
        except:
            bbox = None
    else:
        bbox = bbox_

    # Get geographic boundaries + max search radius
    if bbox:
        # Extract bounding box
        xmin, xmax, ymin, ymax = bbox

        # Transform coordinates
        x, y = transform_coord(projGeo, projGrd, lon, lat)

        # Select data inside bounding box
        Ig = (x >= xmin - dmax) & (x <= xmax + dmax) & (y >= ymin - dmax) & (
            y <= ymax + dmax)

        # Check bbox for obs.
        if len(x[Ig]) == 0:
            print 'SKIP FILE: NO DATA POINTS INSIDE BBOX:', ifile
            return

        print 'Number of obs. edited by bbox!', 'before:', len(
            x), 'after:', len(x[Ig])

        # Only select wanted data
        x = x[Ig]
        y = y[Ig]
        id = id[Ig]
        time = time[Ig]
        height = height[Ig]
        sigma = sigma[Ig]
    else:
        # Convert into stereographic coordinates
        x, y = transform_coord(projGeo, projGrd, lon, lat)

        # Get bbox from data
        xmin, xmax, ymin, ymax = x.min(), x.max(), y.min(), y.max()

    # Apply transformation to time
    if expr: time = eval(expr.replace('t', 'time'))

    # Define time interval of solution
    if tspan:
        # Time interval = given time span
        t1lim, t2lim = tspan

        # Select only observations inside time interval
        Itime = (time > t1lim) & (time < t2lim)

        # Keep only data inside time span
        x = x[Itime]
        y = y[Itime]
        id = id[Itime]
        time = time[Itime]
        height = height[Itime]
        sigma = sigma[Itime]
    else:
        # Time interval = all data
        t1lim, t2lim = time.min(), time.max()

    if mode == 'p':
        # Point solution - all points
        xi, yi = np.copy(x), np.copy(y)
    else:
        # Grid solution - defined by nodes
        Xi, Yi = make_grid(xmin, xmax, ymin, ymax, dx, dy)

        xi, yi = Xi.ravel(), Yi.ravel()
        coord = zip(x.ravel(), y.ravel())

        print 'building the k-d tree ...'
        Tree = cKDTree(coord)

    # Overall (fixed) mean time
    t_mean = np.round(np.nanmean(time), 2)

    # Number of nodes
    nodes = len(xi)

    # Initialize bias param
    bias = np.ones(lon.shape) * np.nan

    # Temporal resolution: months -> years
    tstep = tstep_ / 12.0

    # Expected max number of months in time series
    months = len(np.arange(t1lim, t2lim + tstep, tstep))
    M = 5

    # Create output containers (data matrix)
    DATA0 = np.full((nodes, 21), np.nan)
    DATA1 = np.full((nodes, months + M), np.nan)
    DATA2 = np.full((nodes, months + M), np.nan)

    # Search radius array (dmax is slightly increased by 1e-4)
    dr = np.arange(dmin, dmax, 500)

    # Enter prediction loop
    print 'predicting values ...'
    for i in xrange(len(xi)):

        xc, yc = xi[i], yi[i]  # Center coordinates

        # Loop through search radii
        for rad in dr:

            # Get indices of data within search radius (after relocation)
            i_cell, reloc_dist = get_radius_idx(x,
                                                y,
                                                xc,
                                                yc,
                                                rad,
                                                Tree,
                                                n_reloc=nreloc)

            if len(i_cell) < nlim: continue  # use larger radius

            tcap, hcap = time[i_cell], height[i_cell]

            Nb = sum(~np.isnan(hcap))  # length before editing

            # 3-sigma filter
            if SIGMAFILT:
                #hcap = sigma_filter(tcap, hcap, order=1, n_sigma=3, n_iter=3)  ##NOTE: It removes too much!!!
                hcap[np.abs(hcap - np.nanmedian(hcap)) > mad_std(hcap) *
                     3] = np.nan
                hcap[np.abs(hcap - np.nanmedian(hcap)) > 300] = np.nan

            Na = sum(~np.isnan(hcap))  # Length after editing

            n_mon, t_span = n_months(tcap, hcap, tstep=tstep)

            ##NOTE: Not using n_mon and t_span to constrain the solution! <<<<<<<<<<<<<<<<<<<<<
            # If enough data accept radius
            #if Na >= nlim and n_mon >= MINMONTHS and t_span >= dtlim:
            if Na >= nlim:
                break
            else:
                i_cell = []

        if not i_cell: continue

        # Parameters for model-solution
        xcap = x[i_cell]
        ycap = y[i_cell]
        tcap = time[i_cell]
        hcap = height[i_cell]
        mcap = id[i_cell]
        scap = sigma[i_cell]

        i_valid = ~np.isnan(hcap)
        if sum(i_valid) < nlim: continue

        xcap = xcap[i_valid]
        ycap = ycap[i_valid]
        tcap = tcap[i_valid]
        hcap = hcap[i_valid]
        mcap = mcap[i_valid]
        scap = scap[i_valid]

        if nreloc:
            xc = np.median(xcap)  # update inversion cell coords
            yc = np.median(ycap)

        # Define resolution param (a fraction of the accepted radius)
        dres = dres_ * rad

        # Estimate variance
        vcap = scap * scap

        # If reference time not given, use fixed or variable mean
        if tref_ == 'fixed':
            tref = t_mean
        elif tref_ == 'variable':
            tref = np.nanmean(tcap)
        else:
            tref = np.float(tref_)

        # Design matrix elements
        c0 = np.ones(len(xcap))  # intercept    (0)
        c1 = xcap - xc  # dx           (1)
        c2 = ycap - yc  # dy           (2)
        c3 = c1 * c2  # dx*dy
        c4 = c1 * c1  # dx**2
        c5 = c2 * c2  # dy**2
        c6 = tcap - tref  # trend        (6)
        c7 = 0.5 * (c6 * c6)  # acceleration (7)
        c8 = np.sin(2 * np.pi * c6)  # seasonal sin (8)
        c9 = np.cos(2 * np.pi * c6)  # seasonal cos (9)

        # Compute distance from prediction point to data inside cap
        dist = np.sqrt((xcap - xc) * (xcap - xc) + (ycap - yc) * (ycap - yc))

        # Add small value to stabilize SVD solution
        vcap += 1e-6

        # Weighting factor: distance and error
        Wcap = 1.0 / (vcap * (1.0 + (dist / dres) * (dist / dres)))
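        # i.e. inverse-variance weights tapered by a Cauchy-type distance
        # kernel, so far-away points contribute less than nearby ones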

        # Create some intermediate output variables
        sx, sy, at, ae, bi = np.nan, np.nan, np.nan, np.nan, np.nan

        # Setup design matrix
        if model == 0:
            # Trend and seasonal
            Acap = np.vstack((c0, c8, c9, c6)).T
            mcol = [1, 2, 3]  # columns to add back
        elif model == 1:
            # Trend, acceleration and seasonal
            Acap = np.vstack((c0, c7, c8, c9, c6)).T
            mcol = [1, 2, 3, 4]
        elif model == 2:
            # Trend, acceleration, seasonal and bi-linear surface
            Acap = np.vstack((c0, c1, c2, c7, c8, c9, c6)).T
            mcol = [3, 4, 5, 6]
        else:
            # Trend, acceleration, seasonal and bi-quadratic surface (full model)
            Acap = np.vstack((c0, c1, c2, c3, c4, c5, c7, c8, c9, c6)).T
            mcol = [6, 7, 8, 9]

        has_bias = False  # bias flag

        # Check if bias is needed
        if len(np.unique(mcap)) > 1:
            # Add bias to design matrix
            Acap = np.vstack((Acap.T, mcap)).T
            has_bias = True

        ##NOTE: Not using t_span to constrain solution! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        # Check constrains before solving model (min_pts and min_tspan)
        #if len(hcap) < nlim or np.max(tcap)-np.min(tcap) < dtlim: continue
        if len(hcap) < nlim: continue
        """ Least-squares fit """

        if robust_fit:
            # Robust least squares
            try:
                model_fit = sm.RLM(hcap, Acap,
                                   missing='drop').fit(maxiter=n_iter,
                                                       tol=0.001)
            except:
                print 'SOMETHING WRONG WITH THE FIT... SKIPPING CELL!!!'
                continue
        else:
            # Weighted Least squares
            model_fit = sm.WLS(hcap, Acap, weights=Wcap, missing='drop').fit()

        Cm = model_fit.params  # coeffs
        Ce = model_fit.bse  # std err
        resid = model_fit.resid  # data - model

        # Check rate and error
        if np.abs(Cm[-1]) > dhlim or np.isinf(Ce[-1]):
            continue  ##NOTE: Important for ICESat !!!

        # Residuals dH = H - A * Cm (remove linear trend)
        dh = hcap - np.dot(Acap, Cm)

        if robust_fit:
            chisq = chisquared(model_fit)
        else:
            chisq = rsquared(model_fit)

        # Compute amplitude of seasonal signal
        asea = np.sqrt(Cm[-2] * Cm[-2] + Cm[-3] * Cm[-3])

        # Compute phase offset
        psea = np.arctan2(Cm[-2], Cm[-3])

        # Convert phase to decimal years                                                   ##FIXME: Convert phase to days !!!
        psea /= (2 * np.pi)

        # Compute root-mean-square of full model residuals
        rms = mad_std(resid)

        # Add back wanted model parameters
        dh += np.dot(Acap[:, mcol], Cm[mcol])

        # Simple binning of residuals
        tb, hb, eb, nb = binning(
            tcap.copy(), dh.copy(), t1lim, t2lim,
            tstep)[:4]  ##FIXME: Use Median to construct time series

        # Convert centroid location to latitude and longitude
        lon_c, lat_c = transform_coord(projGrd, projGeo, xc, yc)

        # Position
        DATA0[i, 0] = lat_c
        DATA0[i, 1] = lon_c

        # Elevation Change
        DATA0[i, 2] = Cm[-1]  # trend
        DATA0[i, 3] = Ce[-1]  # trend error

        # Compute acceleration and error
        if model > 0:
            at, ae = Cm[-4], Ce[-4]

        DATA0[i, 4] = at  # acceleration
        DATA0[i, 5] = ae  # acceleration error

        # Surface Elevation
        DATA0[i, 6] = Cm[0]
        DATA0[i, 7] = Ce[0]

        # Model RMS
        DATA0[i, 8] = rms

        # Compute x,y slopes in degrees
        if model > 1:
            sx, sy = np.arctan(Cm[1]) * (180 / np.pi), np.arctan(
                Cm[2]) * (180 / np.pi)

        # Surface slope values
        DATA0[i, 9] = sx
        DATA0[i, 10] = sy

        # Time span of data in cap
        DATA0[i, 11] = t_span
        DATA0[i, 12] = tref

        # Seasonal signal
        DATA0[i, 13] = asea
        DATA0[i, 14] = psea

        # Bias magnitude
        if has_bias: bi = Cm[-1]

        # Aux-data from solution
        DATA0[i, 15] = len(hcap)
        DATA0[i, 16] = dmin
        DATA0[i, 17] = rad
        DATA0[i, 18] = Nb - Na
        DATA0[i, 19] = chisq
        DATA0[i, 20] = bi

        # Time series values
        DATA1[i, :] = np.hstack((lat_c, lon_c, t1lim, t2lim, len(tb),
                                 hb))  ##FIXME: Think how to do this better
        DATA2[i, :] = np.hstack((lat_c, lon_c, t1lim, t2lim, len(tb), eb))

        # Print progress (every N iterations)
        if (i % 200) == 0:
            print 'cell#', str(i) + "/" + str(len(xi)),  \
                  'trend:', np.around(Cm[mcol[-1]],2), 'm/yr', 'n_months:', n_mon, \
                  'n_pts:', len( resid), 'radius:', rad, 'reloc_dist:', reloc_dist

    # Remove invalid entries from data matrix
    if mode == 'p':
        i_nan = np.where(np.isnan(DATA0[:, 3]))
        DATA0 = np.delete(DATA0.T, i_nan, 1).T
        i_nan = np.where(np.isnan(DATA1[:, 3]))
        DATA1 = np.delete(DATA1.T, i_nan, 1).T
        i_nan = np.where(np.isnan(DATA2[:, 3]))
        DATA2 = np.delete(DATA2.T, i_nan, 1).T
    else:
        ##NOTE: NaNs are not removed in case a grid solution (n_reloc=0) is selected.
        if not nreloc:
            grids = [d.reshape(Xi.shape) for d in DATA0.T]  # 1d -> 2d (grids)

        variables = [
            'lat', 'lon', 'trend', 'trend_err', 'accel', 'accel_err', 'height',
            'height_err', 'model_rms', 'slope_x', 'slope_y', 't_span', 't_ref',
            'amp_seas', 'pha_seas', 'n_obs', 'd_min', 'd_ri', 'n_edited',
            'chi2', 'bias'
        ]

    # Check if output arrays are empty
    if np.isnan(DATA0[:, 3]).all():
        print 'SKIP FILE: NO PREDICTIONS TO SAVE:', ifile
        return

    # Define output file name
    if ofile:
        outfile = ofile
    else:
        outfile = ifile

    # Output file names - strings
    path, ext = os.path.splitext(outfile)
    ofile0 = path + '_sf.h5'
    ofile1 = path + '_ts.h5'
    ofile2 = path + '_es.h5'

    print 'saving data ...'

    # Save surface fit parameters
    with h5py.File(ofile0, 'w') as fo0:
        if mode == 'p':
            fo0['sf'] = DATA0  # data matrix
        elif nreloc:
            for v, a in zip(variables, DATA0.T):
                fo0[v] = a  # 1d arrays
        else:
            for v, g in zip(variables, grids):
                fo0[v] = g  # 2d arrays
            fo0['x'], fo0['y'] = Xi[0, :], Yi[:, 0]

    # Save binned time series values
    with h5py.File(ofile1, 'w') as fo1:
        fo1['ts'] = DATA1

    # Save binned time series errors
    with h5py.File(ofile2, 'w') as fo2:
        fo2['es'] = DATA2

    # Print some statistics
    print '*' * 70
    print('%s %.5f %s %.2f %s %.2f %s %.2f %s %s' %
          ('Mean:', np.nanmean(DATA0[:, 2]), 'Std:', np.nanstd(
              DATA0[:, 2]), 'Min:', np.nanmin(DATA0[:, 2]), 'Max:',
           np.nanmax(DATA0[:, 2]), 'Model:', model))
    print '*' * 70
    print 'Execution time: ' + str(datetime.now() - startTime)
    print 'Surface fit results ->', ofile0
    print 'Time series values -> ', ofile1
    print 'Time series errors -> ', ofile2
Exemple #15
0
# In[8]:

dataset.corr() > 0.98


# In[17]:

xtrain_dataframe = pd.DataFrame(xtrain)
ytrain_dataframe = pd.DataFrame(ytrain)
xtest_dataframe = pd.DataFrame(xtest)
ytest_dataframe = pd.DataFrame(ytest)
xtrain_dataframe.columns = [u'R1',u'R2',u'R3',u'R4',u'R5',u'R6',u'R7',u'R8',u'Temp.',u'Humidity']
ytrain_dataframe.columns = ['class']
xtest_dataframe.columns = [u'R1',u'R2',u'R3',u'R4',u'R5',u'R6',u'R7',u'R8',u'Temp.',u'Humidity']
ytest_dataframe.columns = ['class']
res = sm.RLM(ytrain_dataframe, xtrain_dataframe).fit()
res.summary()


# When you perform a hypothesis test in statistics, a p-value helps you determine the significance of your results. Hypothesis tests are used to test the validity of a claim that is made about a population. This claim that’s on trial, in essence, is called the null hypothesis.
# 
# The alternative hypothesis is the one you would believe if the null hypothesis is concluded to be untrue. The evidence in the trial is your data and the statistics that go along with it. All hypothesis tests ultimately use a p-value to weigh the strength of the evidence (what the data are telling you about the population). The p-value is a number between 0 and 1 and interpreted in the following way:
# 
# 1. A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis.
# 
# 2. A large p-value (> 0.05) indicates weak evidence against the null hypothesis, so you fail to reject the null hypothesis.
# 
# 3. p-values very close to the cutoff (0.05) are considered to be marginal (could go either way). Always report the p-value so your readers can draw their own conclusions.
# 
# 
# #### The p-value of R1 is very high, which means that this variable does not have an effect on the model. Hence we can remove this variable from our model.
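
# As a hedged sketch of acting on the conclusion above (assuming the
# `xtrain_dataframe`/`ytrain_dataframe` objects and the `sm` import from the
# cells above), drop R1 and refit the robust model to compare the coefficients:

xtrain_reduced = xtrain_dataframe.drop(columns=['R1'])
res_reduced = sm.RLM(ytrain_dataframe, xtrain_reduced).fit()
res_reduced.summary()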
Exemple #16
0
def OLS_plot(col_x,
             col_y,
             dat,
             hue=None,
             robust=False,
             title=None,
             color='blue',
             aspect=3):
    '''
    create correlation plot between two columns in a dataframe; add r2 and kendal tau stats to plot
    hue: name of column used to color the data points
    '''
    #Calculate correlation stats

    #OLS regression
    if robust == False:
        res = sm.OLS(dat[col_y], sm.add_constant(dat[col_x]),
                     missing='drop').fit()
        pval = res.pvalues[col_x]
        r2 = res.rsquared_adj
        slope = res.params[col_x]
    if robust:
        res = sm.RLM(dat[col_y], sm.add_constant(dat[col_x]),
                     missing='drop').fit()
        pval = res.pvalues[col_x]
        r2 = sm.OLS(
            dat[col_y], dat[col_x],
            missing='drop').fit().rsquared_adj  #same r2 as for non-robust
        slope = res.params[col_x]

    #Kendal-Tau (non-parametric)
    kt_dat = dat.dropna(subset=[col_x, col_y])
    kendall_tau, kt_pval_num = scipy.stats.stats.kendalltau(kt_dat[col_y],
                                                            kt_dat[col_x],
                                                            nan_policy="omit")
    kt_pval = pretty_p_val(kt_pval_num)

    #Build plot
    sns.lmplot(y=col_y,
               x=col_x,
               data=dat,
               hue=hue,
               robust=robust,
               line_kws={
                   'color': 'red',
                   'lw': 1,
                   'alpha': 0.8
               },
               scatter_kws={
                   'color': color,
                   'alpha': 0.6
               },
               aspect=aspect)
    #plt.xticks(rotation=-90)
    summary_text = "$r^2$=" + str(r2)[0:4] + "; " + pretty_p_val(
        pval) + ". Slope= " + str(slope.round(4))
    plt.tight_layout(pad=2)
    plt.figtext(0.93, 0.01, summary_text, horizontalalignment='right')
    plt.figtext(0.02,
                0.01,
                r"K. $\tau$ = " + str(kendall_tau)[0:5] + "; " + kt_pval,
                horizontalalignment='left')
    ax = plt.gca()
    ax.set_title(title)

    #store results for function to return
    d = {
        'r2': [r2],
        'r2_p': [pval],
        'slope': [slope],
        'kendall_tau': [kendall_tau],
        'kt_pval': [kt_pval_num]
    }
    df = pd.DataFrame(data=d)
    return (df)
Exemple #17
0
def relative2abs(adata,
                 dilution,
                 volume,
                 from_layer=None,
                 to_layers=None,
                 mixture_type=1,
                 ERCC_controls=None,
                 ERCC_annotation=None):
    """Converts FPKM/TPM data to transcript counts using ERCC spike-in. This is based on the relative2abs function from
    monocle 2 (Qiu, et. al, Nature Methods, 2017).

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object
        dilution: `float`
            the dilution of the spike-in transcript in the lysis reaction mix. Default is 40,000. The number of spike-in
            transcripts per single-cell lysis reaction is calculated from this dilution.
        volume: `float`
            the approximate volume of the lysis chamber (nanoliters). Default is 10
        from_layer: `str` or `None`
            The layer in which the ERCC TPM values will be used as the covariate for the ERCC based linear regression.
        to_layers: `str`, `None` or `list-like`
            The layers that our ERCC based transformation will be applied to.
        mixture_type:
            the type of spike-in transcripts from the spike-in mixture added in the experiments. By default, it is mixture 1.
            Note that the m/c values we inferred are also based on mixture 1.
        ERCC_controls:
            the FPKM/TPM matrix for each ERCC spike-in transcript in the cells if the user wants to perform the transformation
            based on their own spike-in data. Note that the row and column names should match up with the ERCC_annotation and
            relative_exprs_matrix, respectively.
        ERCC_annotation:
            the ERCC_annotation matrix from the Illumina user guide, which will be used for calculating the ERCC transcript
            copy number for performing the transformation.

    Returns
    -------
        An adata object with the data specified in the to_layers transformed into absolute counts.
    """

    if ERCC_annotation is None:
        ERCC_annotation = pd.read_csv(
            'https://www.dropbox.com/s/cmiuthdw5tt76o5/ERCC_specification.txt?dl=1',
            sep='\t')

    ERCC_id = ERCC_annotation['ERCC ID']

    ERCC_id = adata.var_names.intersection(ERCC_id)
    if len(ERCC_id) < 10 and ERCC_controls is None:
        raise Exception(
            'The adata object you provided has fewer than 10 ERCC genes.')

    if to_layers is not None:
        to_layers = [to_layers] if isinstance(to_layers, str) else to_layers
        to_layers = list(set(adata.layers.keys()).intersection(to_layers))
        if len(to_layers) == 0:
            raise Exception(
                f"The layers {to_layers} that will be converted to absolute counts doesn't match any layers"
                f"from the adata object.")

    mixture_name = "concentration in Mix 1 (attomoles/ul)" if mixture_type == 1 else "concentration in Mix 2 (attomoles/ul)"
    ERCC_annotation['numMolecules'] = ERCC_annotation.loc[:, mixture_name] * (
        volume * 10**(-3) * 1 / dilution * 10**(-18) * 6.02214129 * 10**(23))
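    # Note (added): volume is in nanoliters (1e-3 converts it to microliters),
    # 1e-18 converts attomoles to moles, and 6.02214129e23 is Avogadro's number,
    # so numMolecules is the expected spike-in copy number per lysis reaction.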

    ERCC_annotation['rounded_numMolecules'] = ERCC_annotation[
        'numMolecules'].astype(int)

    if from_layer in [None, 'X']:
        X, X_ercc = (adata.X, adata[:, ERCC_id].X
                     if ERCC_controls is None else ERCC_controls)
    else:
        X, X_ercc = (adata.layers[from_layer], adata[:, ERCC_id].layers[from_layer] \
            if ERCC_controls is None else ERCC_controls)

    logged = False if X.max() > 100 else True

    if not logged:
        X, X_ercc = (np.log1p(X.A) if issparse(X) else np.log1p(X), \
                     np.log1p(X_ercc.A) if issparse(X_ercc) else np.log1p(X_ercc))
    else:
        X, X_ercc = (X.A if issparse(X) else X,
                     X_ercc.A if issparse(X_ercc) else X_ercc)

    y = np.log1p(ERCC_annotation['numMolecules'])

    for i in range(adata.n_obs):
        X_i, X_ercc_i = X[i, :], X_ercc[i, :]

        X_i, X_ercc_i = sm.add_constant(X_i), sm.add_constant(X_ercc_i)
        res = sm.RLM(y, X_ercc_i).fit()
        k, b = res.params[::-1]

        if to_layers is None:
            X = adata.X
            logged = False if X.max() > 100 else True

            if not logged:
                X_i = np.log1p(X[i, :].A) if issparse(X) else np.log1p(X[i, :])
            else:
                X_i = X[i, :].A if issparse(X) else X[i, :]

            res = k * X_i + b
            res = res if logged else np.expm1(res)
            adata.X[i, :] = csr_matrix(res) if issparse(X) else res
        else:
            for cur_layer in to_layers:
                X = adata.layers[cur_layer]

                logged = False if X.max() > 100 else True
                if not logged:
                    X_i = np.log1p(X[i, :].A) if issparse(X) else np.log1p(
                        X[i, :])
                else:
                    X_i = X[i, :].A if issparse(X) else X[i, :]

                res = k * X_i + b if logged else np.expm1(k * X_i + b)
                adata.layers[cur_layer][i, :] = csr_matrix(res) if issparse(
                    X) else res
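
# A hypothetical usage sketch (added, not part of the source): convert the TPM
# values stored in `adata.X` to absolute counts using the built-in ERCC
# annotation table, assuming `adata` is an AnnData object whose var_names
# include the ERCC spike-in IDs:
#
#     relative2abs(adata, dilution=40000, volume=10, mixture_type=1)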
#Example: OLS
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

print(results.params)
print(results.cov_params())

infl = results.get_influence()
print(infl.summary_table())

#raise

#Example RLM
huber_t = sm.RLM(Y, X, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bcov_scaled)
print(hub_results.summary())

import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl


def plot_acf_multiple(ys, lags=20):
    """

    """
    from statsmodels.tsa.stattools import acf
print('corrected rsquared')
print((wls_fit3.uncentered_tss - wls_fit3.ssr) / wls_fit3.uncentered_tss)
plt.figure()
plt.title('WLS dropping heteroscedasticity variable from regressors')
plt.plot(data.endog, wls_fit3.fittedvalues, 'o')
plt.xlim([0, 2000])
# @savefig wls_drop_het.png
plt.ylim([0, 2000])
print('raw correlation of endog and fittedvalues')
print(np.corrcoef(data.endog, wls_fit.fittedvalues))
print('raw correlation coefficient of endog and fittedvalues squared')
print(np.corrcoef(data.endog, wls_fit.fittedvalues)[0, 1]**2)

# compare with robust regression,
# heteroscedasticity correction downweights the outliers
rlm_fit = sm.RLM(data.endog, data.exog).fit()
plt.figure()
plt.title('using robust for comparison')
plt.plot(data.endog, rlm_fit.fittedvalues, 'o')
plt.xlim([0, 2000])
# @savefig wls_robust_compare.png
plt.ylim([0, 2000])

# What is going on? A more systematic look at the data
# ----------------------------------------------------

# two helper functions


def getrsq(fitresult):
    '''calculates rsquared residual, total and explained sums of squares
Exemple #20
0
def rlsq(x, y, n=1):
    """ Fit a robust polynomial of n:th deg."""

    # Test solution
    if len(x[~np.isnan(y)]) <= (n + 1):

        if n == 0:
            p = np.nan
            s = np.nan
        else:
            p = np.zeros((1, n)) * np.nan
            s = np.nan

        return p, s

    # Empty array
    A = np.empty((0, len(x)))

    # Create counter
    i = 0

    # Determine if we need centering
    if n > 1:
        # Center x-axis
        x -= np.nanmean(x)

    # Special case
    if n == 0:

        # Mean offset
        A = np.ones(len(x))

    else:

        # Make design matrix
        while i <= n:
            # Stack coefficients
            A = np.vstack((A, x ** i))

            # Update counter
            i += 1

    # Test to see if we can solve the system
    try:

        # Robust least squares fit
        fit = sm.RLM(y, A.T, missing='drop').fit(maxiter=5, tol=0.001)

        # polynomial coefficients
        p = fit.params

        # RMS of the residuals
        s = mad_std(fit.resid)

    except:

        # Set output to NaN
        if n == 0:
            p = np.nan
            s = np.nan
        else:
            p = np.zeros((1, n)) * np.nan
            s = np.nan

    return p[::-1], s
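
# A small usage sketch with synthetic data (added; assumes `np`, `sm` and
# `mad_std` are imported/defined elsewhere in the source module): fit a robust
# first-degree polynomial to data containing one gross outlier.
x_demo = np.linspace(0.0, 10.0, 50)
y_demo = 2.0 * x_demo + 1.0
y_demo[10] = 100.0                         # inject an outlier
p_demo, s_demo = rlsq(x_demo, y_demo, n=1)
print(p_demo, s_demo)                      # p_demo ~ [slope, intercept]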
Exemple #21
0
if (context.vlevel >= 2):
    sys.stderr.write("%s\n" % fulldiv)
    sys.stderr.write("guess_ra:  %15.7f\n" % guess_ra)
    sys.stderr.write("guess_de:  %15.7f\n" % guess_de)

#afpars = [np.radians(guess_ra), np.radians(guess_de), ts_pmra_masyr/1e3, ts_pmde_masyr/1e3, 1.0]
afpars = [np.radians(guess_ra), np.radians(guess_de), 
        np.radians(ts_ra_model[1]), np.radians(ts_de_model[1]), 1.0]
appcoo = af.apparent_radec(use_epoch.tdb.jd, afpars, use_eph)

# proper fit:
design_matrix = np.column_stack((np.ones(syr.size), syr))
#de_design_matrix = np.column_stack((np.ones(syr.size), syr))
ra_ols_res = sm.OLS(sra, design_matrix).fit()
de_ols_res = sm.OLS(sde, design_matrix).fit()
ra_rlm_res = sm.RLM(sra, design_matrix).fit()
de_rlm_res = sm.RLM(sde, design_matrix).fit()
rlm_pmde_masyr = de_rlm_res.params[1] * 3.6e6
rlm_pmra_masyr = ra_rlm_res.params[1] * 3.6e6 \
        * np.cos(np.radians(de_rlm_res.params[0]))
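# Note (added): the fitted slopes are in deg/yr; 1 deg = 3600 arcsec = 3.6e6 mas,
# and the cos(Dec) factor projects the RA rate onto a true angular rate on the sky.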

if (context.vlevel >= 1):
    sys.stderr.write("%s\n" % fulldiv)
    sys.stderr.write("\nTheil-Sen intercepts:\n")
    sys.stderr.write("RA:   %15.7f\n" % ts_ra_model[0])
    sys.stderr.write("DE:   %15.7f\n" % ts_de_model[0])
    
    sys.stderr.write("\nTheil-Sen proper motions:\n")
    sys.stderr.write("RA:   %10.6f mas/yr\n" % ts_pmra_masyr)
    sys.stderr.write("DE:   %10.6f mas/yr\n" % ts_pmde_masyr)
    
Exemple #22
0
def main(ifile, n=''):

    # Check for empty file
    if os.stat(ifile).st_size == 0:
        print('input file is empty!')
        return

    # Start timing of script
    startTime = datetime.now()

    print('loading data ...')

    # Determine input file type
    if not ifile.endswith(('.h5', '.H5', '.hdf', '.hdf5')):
        print("Input file must be in hdf5-format")
        return

    # Input variables
    xvar, yvar, tvar, zvar = icol

    # Load all 1d variables needed
    with h5py.File(ifile, 'r') as fi:

        lon = fi[xvar][:]
        lat = fi[yvar][:]
        time = fi[tvar][:]
        height = fi[zvar][:]

    # EPSG number for lon/lat proj
    projGeo = '4326'

    # EPSG number for grid proj
    projGrd = proj

    print('converting lon/lat to x/y ...')

    # Convert into stereographic coordinates
    (x, y) = transform_coord(projGeo, projGrd, lon, lat)

    # Get bbox from data
    (xmin, xmax, ymin, ymax) = x.min(), x.max(), y.min(), y.max()

    # Apply transformation to time
    if expr: time = eval(expr.replace('t', 'time'))

    # Overall (fixed) mean time
    t_mean = np.round(np.nanmean(time), 2)

    # Grid solution - defined by nodes
    (Xi, Yi) = make_grid(xmin, xmax, ymin, ymax, dx, dy)

    # Flatten prediction grid
    xi = Xi.ravel()
    yi = Yi.ravel()

    # Zip data to vector
    coord = list(zip(x.ravel(), y.ravel()))

    # Construct cKDTree
    print('building the k-d tree ...')
    Tree = cKDTree(coord)

    # Create output containers
    dh_topo = np.full(height.shape, np.nan)
    de_topo = np.full(height.shape, 999999.)
    mi_topo = np.full(height.shape, np.nan)
    hm_topo = np.full(height.shape, np.nan)
    sx_topo = np.full(height.shape, np.nan)
    sy_topo = np.full(height.shape, np.nan)
    tr_topo = np.full(height.shape, np.nan)

    # Set slope limit
    slp_lim = np.tan(np.deg2rad(slplim))
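    # Note (added): slplim is assumed to be given in degrees; the tangent turns it
    # into a dimensionless gradient used below to clip the fitted surface slopes.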

    # Enter prediction loop
    print('predicting values ...')
    for i in range(len(xi)):

        x0, y0 = xi[i], yi[i]

        # Get indexes of data within search radius or cell bbox
        idx = get_radius_idx(
                x, y, x0, y0, dmax, Tree, n_reloc=nreloc,
                min_months=18, max_reloc=3, time=None, height=None)

        # Length of data in search cap
        nobs = len(x[idx])

        # Check data density
        if (nobs < nlim): continue

        # Parameters for model-solution
        xcap = x[idx]
        ycap = y[idx]
        tcap = time[idx]
        hcap = height[idx]

        # Copy original height vector
        h_org = hcap.copy()

        # Centroid node
        xc = np.median(xcap)
        yc = np.median(ycap)

        # If reference time not given, use fixed or variable mean
        if tref_ == 'fixed':
            tref = t_mean
        elif tref_ == 'variable':
            tref = np.nanmean(tcap)
        else:
            tref = float(tref_)

        # Design matrix elements
        c0 = np.ones(len(xcap))
        c1 = xcap - xc
        c2 = ycap - yc
        c3 = c1 * c2
        c4 = c1 * c1
        c5 = c2 * c2
        c6 = tcap - tref

        # Length before editing
        nb = len(hcap)

        # Determine model order
        if order == 2 and nb >= mlim * 2:

            # Biquadratic surface and linear trend
            Acap = np.vstack((c0, c1, c2, c3, c4, c5, c6)).T

            # Model identifier
            mi = 1

        # Set model order
        elif nb >= mlim:

            # Bilinear surface and linear trend
            Acap = np.vstack((c0, c1, c2, c6)).T

            # Model identifier
            mi = 2

        else:

            # Model identifier
            mi = 3

        # Modelled topography
        if mi == 1:

            # Construct model object
            linear_model = sm.RLM(hcap, Acap, M=sm.robust.norms.HuberT(), missing='drop')

            # Fit the model to the data,
            linear_model_fit = linear_model.fit(maxiter=niter, tol=0.001)

            # Coefficients
            Cm = linear_model_fit.params

            # Biquadratic surface
            h_model = np.dot(np.vstack((c0, c1, c2, c3, c4, c5)).T, Cm[[0, 1, 2, 3, 4, 5]])

            # Compute along and across track slope
            sx = np.sign(Cm[1]) * slp_lim if np.abs(Cm[1]) > slp_lim else Cm[1]
            sy = np.sign(Cm[2]) * slp_lim if np.abs(Cm[2]) > slp_lim else Cm[2]

            # Mean height
            h_avg = Cm[0]

        elif mi == 2:

            # Construct model object
            linear_model = sm.RLM(hcap, Acap, M=sm.robust.norms.HuberT(), missing='drop')

            # Fit the model to the data,
            linear_model_fit = linear_model.fit(maxiter=niter, tol=0.001)

            # Coefficients
            Cm = linear_model_fit.params

            # Bilinear surface
            h_model = np.dot(np.vstack((c0, c1, c2)).T, Cm[[0, 1, 2]])

            # Compute along and across track slope
            sx = np.sign(Cm[1]) * slp_lim if np.abs(Cm[1]) > slp_lim else Cm[1]
            sy = np.sign(Cm[2]) * slp_lim if np.abs(Cm[2]) > slp_lim else Cm[2]

            # Mean height
            h_avg = Cm[0]

        else:

            # Mean surface from median
            h_avg = np.median(hcap)

            # Compute distance estimates from centroid
            s_dx = (xcap - xc) + 1e-3
            s_dy = (ycap - yc) + 1e-3

            # Center surface height
            dh_i = h_org - h_avg

            # Compute along- and across-track slopes
            px, rms_x = rlsq(s_dx, dh_i, 1)
            py, rms_y = rlsq(s_dy, dh_i, 1)

            # Set along-track slope
            s_x = 0 if np.isnan(px[0]) else px[0]

            # Set across-track slope (zero if the fit failed)
            s_y = 0 if np.isnan(py[0]) else py[0]

            # Compute along and across track slope
            sx = np.sign(s_x) * slp_lim if np.abs(s_x) > slp_lim else s_x
            sy = np.sign(s_y) * slp_lim if np.abs(s_y) > slp_lim else s_y

            # Compute the surface height correction
            h_model = h_avg + (sx * s_dx) + (sy * s_dy)

        # Compute full slope
        slope = np.arctan(np.sqrt(sx**2 + sy**2)) * (180 / np.pi)

        # Compute residual
        dh = h_org - h_model

        # Number of observations
        na = len(dh)

        # RMSE of the residuals
        RMSE = mad_std(dh)

        # Overwrite errors
        iup = RMSE < de_topo[idx]

        # Create temporary variables
        dh_cap = dh_topo[idx].copy()
        de_cap = de_topo[idx].copy()
        hm_cap = hm_topo[idx].copy()
        mi_cap = mi_topo[idx].copy()
        tr_cap = tr_topo[idx].copy()

        # Update variables
        dh_cap[iup] = dh[iup]
        de_cap[iup] = RMSE
        hm_cap[iup] = h_avg
        mi_cap[iup] = mi
        tr_cap[iup] = tref

        # Update with current solution
        dh_topo[idx] = dh_cap
        de_topo[idx] = de_cap
        hm_topo[idx] = hm_cap
        mi_topo[idx] = mi_cap
        tr_topo[idx] = tr_cap
        sx_topo[idx] = np.arctan(sx) * (180 / np.pi)
        sy_topo[idx] = np.arctan(sy) * (180 / np.pi)

        # Print progress (every N iterations)
        if (i % 100) == 0 and diag is True:

            # Print message every i:th solution
            print(('%s %i %s %2i %s %i %s %03d %s %.3f %s %.3f' % \
                    ('#',i,'/',len(xi),'Model:',mi,'Nobs:',nb,'Slope:',\
                    np.around(slope,3),'Residual:',np.around(mad_std(dh),3))))

    # Print percentage of not filled
    print(('Total NaNs (percent): %.2f' % \
            (100 * float(len(dh_topo[np.isnan(dh_topo)])) / float(len(dh_topo)))))

    # Print percentage of each model
    one = np.sum(mi_topo == 1)
    two = np.sum(mi_topo == 2)
    tre = np.sum(mi_topo == 3)
    N = float(len(mi_topo))

    print(('Model types (percent): 1 = %.2f, 2 = %.2f, 3 = %.2f' % \
            (100 * one/N, 100 * two/N, 100 * tre/N)))

    # Append new columns to original file
    with h5py.File(ifile, 'a') as fi:

        # Check if we have variables in file
        try:

            # Save variables
            fi['h_res'] = dh_topo
            fi['h_mod'] = hm_topo
            fi['e_res'] = de_topo
            fi['m_deg'] = mi_topo
            fi['t_ref'] = tr_topo
            fi['slp_x'] = sx_topo
            fi['slp_y'] = sy_topo

        except:

            # Update variables
            fi['h_res'][:] = dh_topo
            fi['h_mod'][:] = hm_topo
            fi['e_res'][:] = de_topo
            fi['m_deg'][:] = mi_topo
            fi['t_ref'][:] = tr_topo
            fi['slp_x'][:] = sx_topo
            fi['slp_y'][:] = sy_topo

    # Rename file
    if ifile.find('TOPO') < 0:
        os.rename(ifile, ifile.replace('.h5', '_TOPO.h5'))

    # Print some statistics
    print(('*' * 75))
    print(('%s %s %.5f %s %.2f %s %.2f %s %.2f %s %.2f' % \
        ('Statistics',
         'Median:', np.nanmedian(dh_topo),
         'Std.dev:', mad_std(dh_topo),
         'Min:', np.nanmin(dh_topo),
         'Max:', np.nanmax(dh_topo),
         'RMSE:', np.nanmedian(de_topo[de_topo != 999999]),)))
    print(('*' * 75))
    print('')

    # Print execution time of algorithm
    print(('Execution time: '+ str(datetime.now()-startTime)))
Exemple #23
0
    def robust_linear(self, x, y):

        rlm_model = sm.RLM(y, x, M=sm.robust.norms.HuberT())
        rlm_results = rlm_model.fit()
        print(rlm_results.summary())
        print(rlm_results.params)
Exemple #24
0
    z = ['x2']
    alpha = 0.05
    size = 5000
    x1 = np.random.normal(size=size)
    x2 = np.random.normal(size=size) + x1
    x3 = np.random.normal(size=size) + x2
    X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})
    test = MixedChiSquaredTest(y,
                               x,
                               z,
                               X,
                               alpha,
                               variable_types={
                                   'x1': 'c',
                                   'x2': 'c',
                                   'x3': 'c'
                               })
    print 'null', test.chi2_bound
    print 'actual', test.chi2
    print test.independent()
    raise Exception
    X_sampled = test.generate_ci_sample()
    print X.corr()
    print X_sampled.corr()
    regression = sm.RLM(X[y], X[x + z])
    result = regression.fit()
    print result.summary()
    regression = sm.RLM(X_sampled[y], X_sampled[x + z])
    result = regression.fit()
    print result.summary()
    fontsize=16,
)
# annotate these with their index
for i, row in dta.loc[dta['log.Te'] < 3.8].iterrows():
    ax.annotate(i, row, row + .01, fontsize=14)
xlim, ylim = ax.get_xlim(), ax.get_ylim()

from IPython.display import Image
Image(filename='star_diagram.png')

y = dta['log.light']
X = sm.add_constant(dta['log.Te'], prepend=True)
ols_model = sm.OLS(y, X).fit()
abline_plot(model_results=ols_model, ax=ax)

rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
abline_plot(model_results=rlm_mod, ax=ax, color='red')

# * Why? Because M-estimators are not robust to leverage points.

infl = ols_model.get_influence()

h_bar = 2 * (ols_model.df_model + 1) / ols_model.nobs
hat_diag = infl.summary_frame()['hat_diag']
hat_diag.loc[hat_diag > h_bar]

sidak2 = ols_model.outlier_test('sidak')
sidak2.sort_values('unadj_p', inplace=True)
print(sidak2)

fdr2 = ols_model.outlier_test('fdr_bh')
Exemple #26
0
def linear_best_fit(data,
                    x_args,
                    y_args,
                    fillNaN=True,
                    robust=True,
                    printdata=False,
                    plot=False):
    """
    --------------------------------------------------------------------------
    Create linear line of best fit and get coefficients
    --------------------------------------------------------------------------
    Input:
    --------------------------------------------------------------------------
    Output:
    intercept - float, intercept of the linear equation (y=slope*x+intercept)
    slope - float, slope of the linear equation (y=slope*x+intercept)
    --------------------------------------------------------------------------
    WARNING:
    Input data cannot be negative - see first part of the code
    --------------------------------------------------------------------------
    """
    divider = '------------------------------------------------------------'
    #Filter data, get rid of NaN's
    data = data[(data[x_args] >= -1)]
    data = data[(data[y_args] >= -1)]
    #Set bounds
    x_min = data[x_args].min()
    x_max = data[x_args].max()
    #Use add_constants to get intercept
    x2_args = sm.add_constant(data[x_args])
    if robust:
        model = sm.RLM(data[y_args], x2_args, M=sm.robust.norms.LeastSquares())
    else:
        model = sm.OLS(data[y_args], x2_args)
    #Straight line equation coefficients
    parameters = model.fit().params
    intercept = parameters[0]
    slope = parameters[1]
    if printdata:
        #Get bounds of y-values
        y_min = data[y_args].min()
        y_max = data[y_args].max()
        print('Data for {} vs {}:'.format(y_args, x_args))
        print(divider)
        print('Range of x:{} - {}, y:{} - {}'.format(x_min, x_max, y_min,
                                                     y_max))
        print(divider)
        if robust:
            #Calculate OLS as well in order to get R^2 value
            model2 = sm.OLS(data[y_args], x2_args)
            parameters2 = model2.fit().params
            intercept2 = parameters2[0]
            slope2 = parameters2[1]
            #Calculate R^2
            r2 = model2.fit().rsquared
            print('OLS: Slope: {}, Intercept: {}'.format(slope2, intercept2))
            print(divider)
            print('R^2={:.3f}'.format(r2))
            print(divider)
            print('RLM: Slope: {}, Intercept: {}'.format(slope, intercept))
            print(divider)
        else:
            print('OLS: Slope: {}, Intercept: {}'.format(slope, intercept))
            print(divider)
            print('R^2=')
            print(divider)
        print('Extreme points: ({},{:.2f})({},{:.2f})'.format(
            x_min, (slope * x_min + intercept), x_max,
            (slope * x_max + intercept)))
        print(divider)
    if plot:
        ax = data.plot(x=x_args, y=y_args, kind='scatter')
        #Plot regression line on the same axes, set values
        x = [x_min, x_max]
        ax.plot(x, [intercept + x_min * slope, intercept + x_max * slope])
        ax.set_xlim([x_min, x_max])
    return intercept, slope
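
# A hypothetical usage sketch (added; column names and the `np`/`pd`/`sm`
# imports are assumed, not from the source): recover the slope and intercept
# of a nearly-linear relationship from a small DataFrame.
rng = np.random.default_rng(0)
demo = pd.DataFrame({'load': np.arange(20.0),
                     'power': 3.0 * np.arange(20.0) + 5.0 + rng.normal(0, 0.1, 20)})
b0, b1 = linear_best_fit(demo, 'load', 'power', robust=True)
# b0 ~ 5 (intercept), b1 ~ 3 (slope)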
Exemple #27
0
        func = getattr(wrapping, meth)
        wrapper = make_wrapper(func, how)
        setattr(klass, meth, wrapper)


if __name__ == '__main__':
    import statsmodels.api as sm
    from pandas import DataFrame
    data = sm.datasets.longley.load(as_pandas=False)
    df = DataFrame(data.exog, columns=data.exog_name)
    y = data.endog
    # data.exog = sm.add_constant(data.exog)
    df['intercept'] = 1.
    olsresult = sm.OLS(y, df).fit()
    rlmresult = sm.RLM(y, df).fit()

    # olswrap = RegressionResultsWrapper(olsresult)
    # rlmwrap = RLMResultsWrapper(rlmresult)

    data = sm.datasets.wfs.load(as_pandas=False)
    # get offset
    offset = np.log(data.exog[:, -1])
    exog = data.exog[:, :-1]

    # convert dur to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # drop reference category
    # convert res to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # convert edu to dummy
Exemple #28
0
k['rate'] = k['BUILDINGGARAGE'] / k['carnumbersum']
k = k.groupby('bc').agg({'rate': 'median', 'BUILDINGGARAGE': 'count'})

k = df[(df['carnumbersum'] > 0) & (df['GarageArea'] == 0) &
       (df['BUILDINGGARAGE'] == 0) &
       (df['parkinglots'] > 0)].reset_index(drop=True)
# 1428
k['rate'] = k['parkinglots'] / k['carnumbersum']
k = k.groupby('bc').agg({'rate': 'median', 'parkinglots': 'count'})

k = df[(df['carnumbersum'] > 0)
       & ((df['GarageArea'] > 0) | (df['BUILDINGGARAGE'] > 0)
          | (df['parkinglots'] > 0))].reset_index(drop=True)
X = k[['GarageArea', 'BUILDINGGARAGE', 'parkinglots']]
y = k['carnumbersum']
model = sm.RLM(y, X).fit()
model.summary()
k['predict'] = model.predict()
k.to_csv(path + 'k.csv', index=False)

k = df[(df['carnumbersum'] > 0) & (df['GarageArea'] == 0) &
       (df['BUILDINGGARAGE'] == 0) &
       (df['parkinglots'] == 0)].reset_index(drop=True)
k = k[(k['bc'] == 'A') | (k['bc'] == 'B') |
      (k['BldgClass'] == 'C0')].reset_index(drop=True)
k.to_csv(path + 'k.csv', index=False)

X = df[['GarageArea', 'BUILDINGGARAGE', 'parkinglots', 'LotArea']]
y = df['carnumbersum']
model = sm.OLS(y, X).fit()
model.summary()
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# ## Estimation
#
# Load data:

data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)

# Huber's T norm with the (default) median absolute deviation scaling

huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bse)
print(
    hub_results.summary(
        yname='y',
        xname=['var_%d' % i for i in range(len(hub_results.params))]))

# Huber's T norm with 'H2' covariance matrix

hub_results2 = huber_t.fit(cov="H2")
print(hub_results2.params)
print(hub_results2.bse)

# Andrew's Wave norm with Huber's Proposal 2 scaling and 'H3' covariance
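
# A hedged sketch of what the comment above describes (assuming the same
# `data` object as in the Huber fits): Andrew's Wave norm with Huber's
# Proposal 2 scale estimate and the 'H3' covariance matrix.
andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print(andrew_results.params)
print(andrew_results.bse)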
Exemple #30
0
    def regComb(self, dsReg, field='LSTM', opt=1, fTest=None):
        statSigma = dsReg.statCalSigma(field=field)
        # do regression
        if opt == 1:
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
            y = np.square(dsReg.LSTM-dsReg.SMAP) - \
                np.square(statSigma.sigmaX_mat)
            xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 2:
            x1 = np.square(statSigma.sigmaMC_mat)
            y = np.square(dsReg.LSTM-dsReg.SMAP) - \
                np.square(statSigma.sigmaX_mat)
            xx = x1.flatten().reshape(-1, 1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 3:
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = np.square(statSigma.sigmaX_mat)
            x3 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
            x4 = np.ones(x1.shape)
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = np.stack(
                (x1.flatten(), x2.flatten(), x3.flatten(), x4.flatten()),
                axis=1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 4:
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = np.square(statSigma.sigmaX_mat)
            x3 = np.ones(x1.shape)
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = np.stack((x1.flatten(), x2.flatten(), x3.flatten()), axis=1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 5:
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = np.square(statSigma.sigmaX_mat)
            x3 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = np.stack((x1.flatten(), x2.flatten(), x3.flatten()), axis=1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 6:
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = np.square(statSigma.sigmaX_mat)
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 7:
            x1 = np.square(statSigma.sigmaMC_mat)
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = x1.flatten().reshape(-1, 1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 8:
            x1 = np.square(statSigma.sigmaX_mat)
            y = np.square(dsReg.LSTM - dsReg.SMAP)
            xx = x1.flatten().reshape(-1, 1)
            yy = y.flatten().reshape(-1, 1)
        elif opt == 9:
            x1 = np.ones(statSigma.sigma_mat.shape)
            y = np.square(dsReg.LSTM-dsReg.SMAP) - \
                np.square(statSigma.sigma_mat)
            xx = x1.flatten().reshape(-1, 1)
            yy = y.flatten().reshape(-1, 1)

        ind = np.where(~np.isnan(yy))[0]
        xf = xx[ind, :]
        yf = yy[ind]
        # w, _, _, _ = np.linalg.lstsq(xf, yf)
        # model = sm.OLS(yf, xf)
        model = sm.RLM(yf, xf)
        result = model.fit()
        w = result.params
        if fTest is not None:
            ftestP = list()
            ftestF = list()
            for k in range(len(w)):
                ww = w.copy()
                ww[k] = fTest[k]
                ff = result.f_test(ww)
                ftestP.append(ff.pvalue)
                ftestF.append(ff.fvalue)

        if opt == 1:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                self.sigmaMC_mat * self.sigmaX_mat * w[1] +
                np.square(self.sigmaX_mat))
            k = -w[1] / 2
            a = w[0] - k**2
            out = [a, k]
        elif opt == 2:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                np.square(self.sigmaX_mat))
            x1 = np.square(statSigma.sigmaMC_mat)
            x2 = np.ones(x1.shape)
            y = np.square(statSigma.sigmaX_mat)
            xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
            yy = y.flatten().reshape(-1, 1)
            k, _, _, _ = np.linalg.lstsq(xx, yy)
            k = k[0]
            a = w[0] + k
            out = [a, k]
        elif opt == 3:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                np.square(self.sigmaX_mat) * w[1] +
                self.sigmaMC_mat * self.sigmaX_mat * w[2] +
                np.ones(self.sigmaX_mat.shape) * w[3])
            out = w
        elif opt == 4:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                np.square(self.sigmaX_mat) * w[1] +
                np.ones(self.sigmaX_mat.shape) * w[2])
            out = w
        elif opt == 5:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                np.square(self.sigmaX_mat) * w[1] +
                self.sigmaMC_mat * self.sigmaX_mat * w[2])
            out = w
        elif opt == 6:
            self.sigmaReg_mat = np.sqrt(
                np.square(self.sigmaMC_mat) * w[0] +
                np.square(self.sigmaX_mat) * w[1])
            out = w
        elif opt == 7:
            self.sigmaReg_mat = np.sqrt(np.square(self.sigmaMC_mat) * w[0])
            out = w
        elif opt == 8:
            self.sigmaReg_mat = np.sqrt(np.square(self.sigmaX_mat) * w[0])
            out = w
        elif opt == 9:
            self.sigmaReg_mat = np.sqrt(np.square(self.sigma_mat) + w[0])
            out = w
        self.sigmaReg = np.sqrt(np.mean(self.sigmaReg_mat**2, axis=1))
        if fTest is None:
            return result
        else:
            return (out, ftestP, ftestF)