def main(ifile, n=''): # Message to terminal print 'processing file:', ifile, '...' # Check for empty file if os.stat(ifile).st_size == 0: print 'input file is empty!' return print 'loading data ...' # Determine input file type if not ifile.endswith(('.h5', '.H5', '.hdf', '.hdf5')): print "input file must be in hdf5-format" return # Input variables names xvar, yvar, tvar, zvar, svar, ivar, ovar = icol # Load all 1d variables needed with h5py.File(ifile, 'r') as fi: # Read in needed variables lon = fi[xvar][:] # Longitude (deg) lat = fi[yvar][:] # Latitude (deg) time = fi[tvar][:] # Time (yrs) elev = fi[zvar][:] # Height (meters) sigma = fi[svar][:] # RMSE (meters) mode = fi[ivar][:] # Mission (int) oind = fi[ovar][:] if ovar in fi else np.ones( lon.shape) # Outliers (int) # Check for NaN-values inan = ~np.isnan(elev) & ~np.isnan(oind) # Remove NaN values from arrays lon, lat, time, elev, sigma, mode = lon[inan], lat[inan], time[inan], \ elev[inan], sigma[inan], mode[inan] # Select only observations inside time interval itime = (time > t1lim) & (time < t2lim) # Select wanted time span lon, lat, time, elev, sigma, mode = lon[itime], lat[itime], time[itime], \ elev[itime], sigma[itime], mode[itime] # Select only wanted missions - not mission 4 imode = (mode != 4) # Select wanted modes lon, lat, time, elev, sigma, mode = lon[imode], lat[imode], time[imode], \ elev[imode], sigma[imode], mode[imode] # EPSG number for lon/lat proj projGeo = '4326' # EPSG number for grid proj projGrd = proj print 'converting lon/lat to x/y ...' # Get geographic boundaries + max search radius if bbox: # Extract bounding box (xmin, xmax, ymin, ymax) = bbox # Transform coordinates (x, y) = transform_coord(projGeo, projGrd, lon, lat) # Select data inside bounding box ig = (x >= xmin - dmax) & (x <= xmax + dmax) & \ (y >= ymin - dmax) & (y <= ymax + dmax) # Check bbox for obs. if len(x[ig]) == 0: print 'no data points inside bounding box!' return # Cut data to bounding box limits lon, lat, time, elev, sigma, mode = lon[ig], lat[ig], time[ig], \ elev[g], sigma[ig], mode[ig] else: # Convert into stereographic coordinates (x, y) = transform_coord(projGeo, projGrd, lon, lat) # Get bbox from data (xmin, xmax, ymin, ymax) = x.min(), x.max(), y.min(), y.max() # Construct solution grid - add border to grid (Xi, Yi) = make_grid(xmin - 10e3, xmax + 10e3, ymin - 10e3, ymax + 10e3, dx, dy) # Convert to geographical coordinates (LON, LAT) = transform_coord(projGrd, projGeo, Xi, Yi) # Flatten prediction grid xi = Xi.ravel() yi = Yi.ravel() # Zip data to vector coord = zip(x.ravel(), y.ravel()) print 'building the k-d tree ...' 
# Construct KD-Tree Tree = cKDTree(coord) # Number of months of time series months = len(np.arange(t1lim, t2lim + tstep, tstep)) # Total number of columns ntot = months + 4 # Create output array OFILE0 = np.ones((len(xi), 23)) * 9999 OFILE1 = np.ones((len(xi), ntot)) * 9999 OFILE2 = np.ones((len(xi), ntot)) * 9999 OFILE3 = np.ones((len(xi), ntot)) * 9999 OFILE4 = np.ones((len(xi), ntot)) * 9999 # Save corrected rate b_rate = np.ones((len(xi), 1)) * np.nan # Set up search cap dr = np.arange(dmin, dmax + 2e3, 2e3) # Enter prediction loop for i in xrange(len(xi)): # Number of observations nobs = 0 # Time difference dt = 0 # Temporal sampling npct = 1 # Number of sensors nsen = 0 # Meet data constraints for ii in xrange(len(dr)): # Query the Tree with data coordinates idx = Tree.query_ball_point((xi[i], yi[i]), dr[ii]) # Check for empty arrays if len(time[idx]) == 0: continue # Constraints parameters dt = np.max(time[idx]) - np.min(time[idx]) nobs = len(time[idx]) nsen = len(np.unique(mode[idx])) # Bin time vector t_sample = binning(time[idx], time[idx], t1lim, t2lim, 1.0 / 12., 5, 5)[1] # Test for null vector if len(t_sample) == 0: continue # Sampling fraction npct = np.float(len(t_sample[~np.isnan(t_sample)])) / len(t_sample) # Constraints if nobs > nlim: if dt > dtlim: if nsen >= nmlim: if npct > 0.70: break # Final test of data coverage if (nobs < nlim) or (dt < dtlim): continue # Parameters for model-solution xcap = x[idx] ycap = y[idx] tcap = time[idx] hcap = elev[idx] scap = sigma[idx] mcap = mode[idx] # Centroid of all data xc = np.median(xcap) yc = np.median(ycap) # Get reference mref = mref_ # Reference to specific mission if len(hcap[mcap == mref]) > 0: # Tie to laser surface hcap -= np.median(hcap[mcap == mref]) elif len(hcap[mcap == (mref + 1)]) > 0: # Tie to SARin surface hcap -= np.median(hcap[mcap == (mref + 1)]) # Change mission tie index mref += 1 else: # Tie to mean surface hcap -= np.median(hcap) # # Least-Squares Adjustment # --------------------------------- # # h = x_t + x_j + x_s # x = (A' W A)^(-1) A' W y # r = y - Ax # # --------------------------------- # # Threshold for outliers in each bin alpha = 5.0 # Convergence tolerance (%) tol = 3.0 # Times series binning of each mission (tcap, hcap, scap, ncap, mcap) = bin_mission(tcap, hcap, mcap, scap, t1lim, t2lim, tstep, tol, alpha) # Size of original observational matrix (n, m) = hcap.T.shape # Unravel array to vectors tcap = tcap.T.ravel() hcap = hcap.T.ravel() scap = scap.T.ravel() mcap = mcap.T.ravel() # Additional outlier editing inan = np.isnan( binfilt(tcap.copy(), hcap.copy(), tcap.min(), tcap.max(), 3.0, 3. / 12.)) # Set outliers to NaN hcap[inan] = np.nan scap[inan] = np.nan mcap[inan] = np.nan # Trend component dti = tcap - tref # Compute new statistics (nobs, tspan) = len(hcap[~np.isnan(hcap)]), tcap.max() - tcap.min() # Reject grid node if true if (nobs < nlim) & (tspan < dtlim): continue # Four-term fourier series for seasonality cos1 = np.cos(2 * np.pi * dti) sin1 = np.sin(2 * np.pi * dti) cos2 = np.cos(4 * np.pi * dti) sin2 = np.sin(4 * np.pi * dti) # Construct bias parameters b_ice1 = np.zeros(hcap.shape) b_csin = np.zeros(hcap.shape) b_clrm = np.zeros(hcap.shape) b_ra21 = np.zeros(hcap.shape) b_ra22 = np.zeros(hcap.shape) b_ers1 = np.zeros(hcap.shape) b_ers2 = np.zeros(hcap.shape) # Set unit-step functions (0/1) b_ers1[mcap == 6] = 1. b_ers2[mcap == 5] = 1. b_ice1[mcap == 0] = 1. b_ra21[mcap == 3] = 1. b_ra22[mcap == 4] = 1. b_csin[mcap == 1] = 1. b_clrm[mcap == 2] = 1. 
# Design matrix for adjustment procedure Acap = np.vstack((dti, 0.5*dti**2, cos1, sin1, cos2, sin2, b_ice1, \ b_csin, b_clrm, b_ra21, b_ra22, b_ers2, b_ers1)).T # Try to solve least-squares system try: # Least-squares bias adjustment linear_model = sm.RLM(hcap, Acap, missing='drop') # Fit the model to the data linear_model_fit = linear_model.fit(maxiter=10) # If not possible continue except: continue # Length post editing nsol = len(hcap) # Coefficients and standard errors Cm = linear_model_fit.params Ce = linear_model_fit.bse # Amplitude of annual seasoanl signal amp = np.sqrt(Cm[2]**2 + Cm[3]**2) # Phase of annual seasoanl signal phi = np.arctan2(Cm[3], Cm[2]) / (2.0 * np.pi) # Compute model residuals dh = hcap - np.dot(Acap, Cm) # Identify outliers inan = np.isnan(iterfilt(dh.copy(), -slim, slim, 5, 3.0)) # Set outliers to NaN hcap[inan] = np.nan scap[inan] = np.nan mcap[inan] = np.nan # Compute RMSE of corrected residuals rmse = mad_std(dh[~inan]) # Bias correction h_bias = np.dot(Acap[:, [-7, -6, -5, -4, -3, -2, -1]], Cm[[-7, -6, -5, -4, -3, -2, -1]]) # Save original uncorrected time series horg = hcap.copy() # Remove inter mission biases hcap -= h_bias # Initiate residual cross-calibration flag flag = 0 # Apply post-fit cross-calibration in overlapping areas hcap, flag = cross_calibrate(tcap.copy(), hcap.copy(), dh.copy(), mcap.copy(), 1.0) # Binned time for plotting tbin = np.arange(t1lim, t2lim, tstep) + 0.5 * tstep # Re-format back to arrays hbo = horg.reshape(n, m).T hbi = hcap.reshape(n, m).T tbi = tcap.reshape(n, m).T ebi = scap.reshape(n, m).T mbi = mcap.reshape(n, m).T # Copy original vector hcor = np.copy(hbi) # Take the weighted average of all mission in each bin (hbi_w, ebi_w) = np.ma.average(np.ma.array(hbi, mask=np.isnan(hbi)), \ weights=np.ma.array(ebi, mask=np.isnan(ebi)), \ axis=0, returned=True) # Convert back to original array, with nan's hbi_w = np.ma.filled(hbi_w, np.nan) ebi_w = np.ma.filled(ebi_w, np.nan) # Number of rows to add n_add = 6 - len(hbi) # Construct binary mask binary = hbi_w.copy() # Set to zeros (data) and ones (nan) binary[~np.isnan(binary)] = 0 binary[np.isnan(binary)] = 1 # Apply distance transform bwd = distance_transform_edt(binary) # Set these values to nan's inoip = bwd >= 12 # Pad by adding rows for kx in xrange(n_add): # Add rows to obs. matrix hbo = np.vstack((hbo, np.ones(hbi_w.shape) * np.nan)) hbi = np.vstack((hbi, np.ones(hbi_w.shape) * np.nan)) ebi = np.vstack((ebi, np.ones(hbi_w.shape) * np.nan)) mbi = np.vstack((mbi, np.ones(hbi_w.shape) * np.nan)) tbi = np.vstack((tbi, tbin)) # Padd mission arrays using weighted averages hbi = fill(hbi, hbi_w) ebi = fill(ebi, ebi_w) # Reject grid node if true if len(hbi_w[~np.isnan(hbi_w)]) <= 2: continue # # Kalman state-space model # ------------------------ # # z_t = H * z_t + v_t # x_t = A * x_t-1 + w_t-1 # # ------------------------ # # Create observational matrix Ht = np.eye(4) # Determine the number of rows to add n_add = len(hbi) - 4 # Rows to observational matrix for ky in xrange(n_add): # Add rows to obs. 
matrix Ht = np.vstack((Ht, [0, 0, 0, 0])) # Populate observational matrix Ht[:, [0, 2]] = 1 # Seasonal signal ck = np.cos(np.pi / 6) sk = np.sin(np.pi / 6) # Transition matrix At = [[1.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, +ck, +sk], [0.0, 0.0, -sk, +ck]] # Observational noise Rt = np.diag(np.nanmean(ebi**2, 1)) # Initial start value of filter y0 = np.median(hbi_w[~np.isnan(hbi_w)][0:3]) # Constrain only transition covariance params = ['transition_covariance'] # Estimating transition covaraiance from individual missions if len(hcap[(mcap <= 3) & (~np.isnan(mcap))]) > 1: # Only good missions Ct = KalmanFilter(em_vars=params). \ em(hcap[mcap <= 3], n_iter=2).transition_covariance else: # All missions Ct = KalmanFilter(em_vars=params). \ em(hcap[~np.isnan(hcap)], n_iter=2).transition_covariance # Transition covariance Qt = np.diag([0.0, 1.0, 0.5, 0.5]) * tstep * Ct # Initial state vector m0 = [y0, Cm[0], Cm[2], Cm[3]] # Initial state covariance q0 = np.diag([0, Ce[0], Ce[2], Ce[3]])**2 # Create kalman filter kf = KalmanFilter(initial_state_mean=m0, initial_state_covariance=q0, transition_matrices=At, observation_matrices=Ht, observation_covariance=Rt, transition_covariance=Qt) # Estimate number percentage of interpolated data n_per = 100 * float(len(hbi_w[np.isnan(hbi_w)])) / len(hbi_w) # Mask and transpose array hbi_masked = ma.masked_array(hbi, mask=np.isnan(hbi)).T # Apply Kalman filter with parameter learning on residuals (dh_ts, dh_es) = kf.smooth(hbi_masked) # Compute the total RSS of all model parameters dh_es = [dh_es[k, 0, 0] for k in xrange(len(dh_es))] # Sum all parameters for time series dh_ts = dh_ts[:, 0] # Compute standard deviation dh_es = np.sqrt(dh_es) # Mask output array dh_ts[inoip] = np.nan dh_es[inoip] = np.nan # Rename weighted solution dh_ws = hbi_w dh_ew = ebi_w # Converte back to georaphical coordinates (lon_c, lat_c) = transform_coord(projGrd, projGeo, xc, yc) # Final search radius radius = dr[ii] # Compute new elevation change rate after post-fit residuals b_rate = np.polyfit(tbin[~np.isnan(dh_ws)] - tbin[~np.isnan(dh_ws)].mean(), dh_ws[~np.isnan(dh_ws)], 2, w=1.0 / dh_ew[[~np.isnan(dh_ws)]]**2)[1] # Save data to output files OFILE0[i, :] = np.hstack( (lat_c, lon_c, Cm[0], Ce[0], Cm[1], Ce[1], rmse, dt, amp, phi, n_per, Cm[[-7, -6, -5, -4, -3, -2, -1]], nobs, nsol, radius, flag, b_rate)) OFILE1[i, :] = np.hstack( (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_ts)) OFILE2[i, :] = np.hstack( (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_es)) OFILE3[i, :] = np.hstack( (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_ws)) OFILE4[i, :] = np.hstack( (lat_c, lon_c, t1lim, t2lim, len(hbi.T), dh_es)) # Print progress print str(i) + "/" + str(len(xi))+" Radius: "+ str(np.around(dr[ii], 0)) +" Rate: "+str(np.around(Cm[0]*100,2))+\ " (cm/yr)"+' Interp: '+str(np.around(n_per,0))+' Rate_adj: '+str(np.around(b_rate*100,2))+" (cm/yr)" # Identify unwanted data I0 = OFILE0[:, 0] != 9999 I1 = OFILE1[:, 0] != 9999 I2 = OFILE2[:, 0] != 9999 I3 = OFILE3[:, 0] != 9999 I4 = OFILE4[:, 0] != 9999 # Remove unwnated data OFILE0 = OFILE0[I0, :] OFILE1 = OFILE1[I1, :] OFILE2 = OFILE2[I2, :] OFILE3 = OFILE3[I3, :] OFILE4 = OFILE4[I4, :] # Check if we have any data if len(OFILE0[:, 0]) == 0: # Print message print " No data to save! 
" return # Save solution to disk with h5py.File(ifile.replace('.h5', '_sf.h5'), 'w') as f0: # Save meta data f0['sf'] = OFILE0 with h5py.File(ifile.replace('.h5', '_ts.h5'), 'w') as f1: # Save adjusted and merged time series f1['ts'] = OFILE1 with h5py.File(ifile.replace('.h5', '_es.h5'), 'w') as f2: # Save error estimate for time series f2['es'] = OFILE2 with h5py.File(ifile.replace('.h5', '_tw.h5'), 'w') as f3: # Save error estimate for time series f3['tw'] = OFILE3 with h5py.File(ifile.replace('.h5', '_ew.h5'), 'w') as f4: # Save error estimate for time series f4['ew'] = OFILE4
def _extract_sdm_params(ee, tc, iph, io, rs, rsh, n, u, specs, const, model): # Get single diode model parameters from five parameters iph, io, rs, rsh # and n vs. effective irradiance and temperature try: import statsmodels.api as sm except ImportError: raise ImportError( 'Parameter extraction using Sandia method requires statsmodels') tck = tc + 273.15 tok = const['T0'] + 273.15 # convert to to K params = {} if model == 'pvsyst': # Estimate I_o_ref and EgRef x_for_io = const['q'] / const['k'] * (1. / tok - 1. / tck[u]) / n[u] # Estimate R_sh_0, R_sh_ref and R_sh_exp # Initial guesses. R_sh_0 is value at ee=0. nans = np.isnan(rsh) if any(ee < 400): grsh0 = np.mean(rsh[np.logical_and(~nans, ee < 400)]) else: grsh0 = np.max(rsh) # Rsh_ref is value at Ee = 1000 if any(ee > 400): grshref = np.mean(rsh[np.logical_and(~nans, ee > 400)]) else: grshref = np.min(rsh) # PVsyst default for Rshexp is 5.5 R_sh_exp = 5.5 # Find parameters for Rsh equation def fun_rsh(x, rshexp, ee, e0, rsh): tf = np.log10(_rsh_pvsyst(x, R_sh_exp, ee, e0)) - np.log10(rsh) return tf x0 = np.array([grsh0, grshref]) beta = optimize.least_squares( fun_rsh, x0, args=(R_sh_exp, ee[u], const['E0'], rsh[u]), bounds=np.array([[1., 1.], [1.e7, 1.e6]]), verbose=2) # Extract PVsyst parameter values R_sh_0 = beta.x[0] R_sh_ref = beta.x[1] # parameters unique to PVsyst params['R_sh_0'] = R_sh_0 params['R_sh_exp'] = R_sh_exp elif model == 'desoto': dEgdT = 0.0002677 x_for_io = const['q'] / const['k'] * ( 1. / tok - 1. / tck[u] + dEgdT * (tc[u] - const['T0']) / tck[u]) # Estimate R_sh_ref nans = np.isnan(rsh) x = const['E0'] / ee[np.logical_and(u, ee > 400, ~nans)] y = rsh[np.logical_and(u, ee > 400, ~nans)] new_x = sm.add_constant(x) beta = sm.RLM(y, new_x).fit() R_sh_ref = beta.params[1] params['dEgdT'] = dEgdT # Estimate I_o_ref and EgRef y = np.log(io[u]) - 3. * np.log(tck[u] / tok) new_x = sm.add_constant(x_for_io) res = sm.RLM(y, new_x).fit() beta = res.params I_o_ref = np.exp(beta[0]) EgRef = beta[1] # Estimate I_L_ref x = tc[u] - const['T0'] y = iph[u] * (const['E0'] / ee[u]) # average over non-NaN values of Y and X nans = np.isnan(y - specs['alpha_sc'] * x) I_L_ref = np.mean(y[~nans] - specs['alpha_sc'] * x[~nans]) # Estimate R_s nans = np.isnan(rs) R_s = np.mean(rs[np.logical_and(u, ee > 400, ~nans)]) params['I_L_ref'] = I_L_ref params['I_o_ref'] = I_o_ref params['EgRef'] = EgRef params['R_sh_ref'] = R_sh_ref params['R_s'] = R_s # save values for each IV curve params['iph'] = iph params['io'] = io params['rsh'] = rsh params['rs'] = rs params['u'] = u return params
def setup(self):
    # fit for each test, because results will be changed by test
    x = self.exog
    np.random.seed(987689)
    y = x.sum(1) + np.random.randn(x.shape[0])
    self.results = sm.RLM(y, self.exog).fit()
            hue='SITE_ID', data=df_pheno_morpho)
plt.show()

# ... and assess the effect of each factor and their interaction
results = smf.ols(
    'surface_S_C_left ~ AGE_AT_SCAN + C(SITE_ID) + AGE_AT_SCAN * C(SITE_ID)',
    data=df_pheno_morpho).fit()
print(results.summary())

# comparison between OLS and RLM
y2 = df_pheno_morpho['surface_S_C_left']
x1 = df_pheno_morpho['AGE_AT_SCAN']
X = sm.add_constant(x1)
ols_model = sm.OLS(y2, X).fit()
print(ols_model.summary())
rlm_model = sm.RLM(y2, X).fit()
print(rlm_model.summary())

# nice figure with confidence intervals
prstd, iv_l, iv_u = wls_prediction_std(ols_model)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x1, y2, 'o', label="data")
ax.plot(x1, ols_model.fittedvalues, 'r-', label="OLS")
ax.plot(x1, iv_u, 'r--')
ax.plot(x1, iv_l, 'r--')
ax.plot(x1, rlm_model.fittedvalues, 'g.-', label="RLM")
legend = ax.legend(loc="best")
plt.show()

# influence analysis for outliers detection
def solve_iteratively(science, reference, mask_tolerance=10e-5, gain_tolerance=10e-6, max_iterations=5, sigma_cut=5., use_pixels=False, show=False, percent=99, use_mask=True, size_cut=True, pixstack_limit=None): """Solve for linear fit iteratively""" gain = 1. gain0 = 10e5 i = 1 # pad image to power of two to speed fft old_size = science.shape science_image = pad_to_power2(science) reference_image = pad_to_power2(reference) science_psf = center_psf(resize_psf(science.raw_psf, science_image.shape)) science_psf /= np.sum(science.raw_psf) reference_psf = center_psf(resize_psf(reference.raw_psf, reference_image.shape)) reference_psf /= np.sum(reference.raw_psf) science_std = pad_to_power2(science.background_std) reference_std = pad_to_power2(reference.background_std) science_mask = pad_to_power2(science.mask, value='bool') reference_mask = pad_to_power2(reference.mask, value='bool') # fft arrays science_image_fft = np.fft.fft2(science_image) reference_image_fft = np.fft.fft2(reference_image) science_psf_fft = np.fft.fft2(science_psf) reference_psf_fft = np.fft.fft2(reference_psf) while abs(gain - gain0) > gain_tolerance: # calculate the psf in the difference image to convolve masks # not a simple convolution of the two PSF's; see the paper for details difference_zero_point = gain / np.sqrt(science_std ** 2 + reference_std ** 2 * gain ** 2) denominator = science_std ** 2 * abs(reference_psf_fft) ** 2 denominator += reference_std ** 2 * gain ** 2 * abs(science_psf_fft) ** 2 difference_psf_fft = gain * science_psf_fft * reference_psf_fft / (difference_zero_point * np.sqrt(denominator)) if use_mask: # convolve masks with difference psf to mask all pixels within a psf radius # this is important to prevent convolutions of saturated pixels from affecting the fit science_mask_convolved = np.fft.ifft2(difference_psf_fft * np.fft.fft2(science_mask)) science_mask_convolved[science_mask_convolved > mask_tolerance] = 1 science_mask_convolved = np.real(science_mask_convolved).astype(int) reference_mask_convolved = np.fft.ifft2(difference_psf_fft * np.fft.fft2(reference_mask)) reference_mask_convolved[reference_mask_convolved > mask_tolerance] = 1 reference_mask_convolved = np.real(reference_mask_convolved).astype(int) # do the convolutions on the images denominator = science_std ** 2 * abs(reference_psf_fft) ** 2 denominator += gain ** 2 * reference_std ** 2 * abs(science_psf_fft) ** 2 science_convolved_image_fft = reference_psf_fft * science_image_fft / np.sqrt(denominator) reference_convolved_image_fft = science_psf_fft * reference_image_fft / np.sqrt(denominator) science_convolved_image = np.real(np.fft.ifft2(science_convolved_image_fft)) reference_convolved_image = np.real(np.fft.ifft2(reference_convolved_image_fft)) # remove power of 2 padding science_convolved_image = science_convolved_image[: old_size[0], : old_size[1]] reference_convolved_image = reference_convolved_image[: old_size[0], : old_size[1]] if use_mask: science_mask_convolved = science_mask_convolved[: old_size[0], : old_size[1]] reference_mask_convolved = reference_mask_convolved[: old_size[0], : old_size[1]] else: science_mask_convolved = None reference_mask_convolved = None # do a linear robust regression between convolved image x, y = join_images(science_convolved_image, science_mask_convolved, reference_convolved_image, reference_mask_convolved, sigma_cut, use_pixels, show, percent, size_cut, pixstack_limit) robust_fit = stats.RLM(y, stats.add_constant(x), stats.robust.norms.TukeyBiweight()).fit() parameters = robust_fit.params gain0 
= gain
        gain = parameters[-1]
        if show:
            import matplotlib.pyplot as plt
            xfit = np.logspace(np.log10(np.min(x)), np.log10(np.max(x)))
            plt.plot(xfit, robust_fit.predict(stats.add_constant(xfit)))
            plt.pause(0.1)
        logging.info('Iteration {0}: Gain = {1}'.format(i, gain))
        if i == max_iterations:
            logging.warning('Maximum regression ({0}) iterations reached'.format(max_iterations))
            break
        i += 1

    logging.info('Fit done in {0} iterations'.format(i))

    return gain
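# ----------------------------------------------------------------------------
# A stand-alone sketch of the robust gain fit at the heart of the loop above:
# regress the (flattened) convolved science image on the convolved reference
# with RLM and a Tukey biweight norm, and read the gain off the slope. Here
# `stats` is assumed to alias statsmodels.api, as in solve_iteratively();
# the flux arrays below are synthetic.
# ----------------------------------------------------------------------------
import numpy as np
import statsmodels.api as stats

rng = np.random.default_rng(1)
reference_flux = rng.uniform(100.0, 1.0e4, 5000)
true_gain = 1.7
science_flux = true_gain * reference_flux + rng.normal(0.0, 50.0, 5000)
science_flux[:20] *= 5.0                                  # a few saturated/variable pixels (outliers)

robust_fit = stats.RLM(science_flux, stats.add_constant(reference_flux),
                       stats.robust.norms.TukeyBiweight()).fit()
print('recovered gain:', robust_fit.params[-1])           # slope, robust to the outliers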
def dealData(bk, begd, endd, adjustPeriods, factorsInfo, FactorName, Path):
    # Database connection engine
    tableName = factorsInfo.get("tableName")
    direction = factorsInfo.get("direction")    # factor direction: 1 = normal order, 0 = reverse order
    reciprocal = factorsInfo.get("reciprocal")  # whether to take the reciprocal of the factor value
    isLogDeal = factorsInfo.get("isLogDeal")    # whether to apply a natural-log transform to the factor
    #engine = create_engine('mysql://*****:*****@172.16.158.142/dwlh?charset=utf8')
    store = pd.HDFStore(tableName + '.h5', "r", complevel=9)
    periedValues = []

    # Loop over each rebalancing date
    for i in adjustPeriods.index[:-1]:
        adjustDay = adjustPeriods.ix[i, "date"]
        nextAdjustDay = adjustPeriods.ix[i, "nextAdjustDay"]
        logging.warning(u"处理第" + adjustDay + u"天数据!")  # processing data for this rebalancing day
        factor = store.select(
            '/' + tableName + '/' + FactorName,
            where=["Date='{date}'".format(date=adjustDay.replace('-', ''))])

        # Cast the date field to datetime
        factor.con_date = pd.to_datetime(factor.Date)
        factor['stock_code'] = factor["stockID"].apply(lambda x: x[2:])

        # Fetch sector constituents as of the rebalancing date. The TinySoft function
        # getbkByName removes stocks that are limit-up/limit-down, suspended, or listed
        # for fewer than 120 days on that day.
        BKStocks = TSLPy2.RemoteCallFunc('getbkByName2', [
            bk,
            TSLPy2.EncodeDate(int(adjustDay[:4]), int(adjustDay[5:7]), int(adjustDay[8:10]))
        ], {})
        BKStocks = pd.DataFrame(BKStocks[1])
        BKStocks["SWNAME"] = BKStocks["SWNAME"].apply(lambda x: x.decode('gbk'))
        BKStocks["stock_code"] = BKStocks["id"].apply(lambda x: x[2:])
        BKStocks["TotalValue"] = BKStocks["TotalValue"].apply(np.log)

        # Merge the factor values with the sector information
        factor = factor.merge(BKStocks, on="stock_code")

        # Take the reciprocal of the factor values if requested
        if reciprocal == 1:
            factor[FactorName] = factor[FactorName].apply(lambda x: 1 / x if x != 0 else x)
        if isLogDeal == 1:
            factor[FactorName] = factor[FactorName].apply(np.log)

        # Apply the factor direction
        factor[FactorName] = factor[FactorName] * direction

        # Clip outliers at median +/- 3 * 1.4826 * MAD
        factorMedia = factor[FactorName].median()
        MAD = (factor[FactorName] - factorMedia).apply(abs).median()
        factor.loc[factor[FactorName] > (factorMedia + 3 * 1.4826 * MAD),
                   FactorName] = factorMedia + 3 * 1.4826 * MAD
        factor.loc[factor[FactorName] < (factorMedia - 3 * 1.4826 * MAD),
                   FactorName] = factorMedia - 3 * 1.4826 * MAD

        # z-score standardization
        factorMean = factor[FactorName].mean()
        factorStd = factor[FactorName].std()
        factor[FactorName] = factor[FactorName].apply(
            lambda x: (x - factorMean) / factorStd if factorStd != 0 else (x - factorMean))

        # Next-period return series
        stokzf = pd.DataFrame(
            TSLPy2.RemoteCallFunc('getStockZF', [
                bk,
                TSLPy2.EncodeDate(int(adjustDay[:4]), int(adjustDay[5:7]), int(adjustDay[8:10])),
                TSLPy2.EncodeDate(int(nextAdjustDay[:4]), int(nextAdjustDay[5:7]), int(nextAdjustDay[8:10]))
            ], {})[1])
        factor = factor.merge(stokzf, on="stock_code")
        factor.set_index("stock_code", inplace=True)

        # Neutralize the factor for industry and market cap via regression
        factor = factor.dropna()
        # Method 1
        #y, X = dmatrices('{factorName} ~ SWNAME + TotalValue'.format(factorName=FactorName),
        #                 data=factor, return_type='dataframe')
        # Method 2
        y = factor[FactorName]
        X = pd.get_dummies(factor['SWNAME'])
        if FactorName != 'CAP':
            X['TotalValue'] = factor['TotalValue']
        X = sm.add_constant(X)
        #res = sm.OLS(y, X).fit()    # regression via OLS
        res2 = sm.RLM(y, X).fit()    # regression via RLM
        #res3 = sm.WLS(y, X).fit()   # regression via WLS
        #factorParam = res2.params[FactorName]
        #factorT = res2.tvalues[FactorName]

        # The tinyedFactor column is the regression residual, used as the new (neutralized) factor value
        factor["tinyedFactor2"] = factor[FactorName] - res2.fittedvalues
        factor["tinyedFactor"] = res2.resid

        # T-test between the new factor values and next-period returns (t- and p-values)
        factorT, factorP = ttest_rel(factor["tinyedFactor"], factor["zf"])

        # Compute IC and rank IC
        IC = factor["zf"].corr(factor["tinyedFactor"])
        rankIC = factor["zf"].corr(factor["tinyedFactor"], method="spearman")

        periedValues.append(
            pd.DataFrame(
                {
                    "FactorName": FactorName,
                    "adjustDay": adjustDay,
                    "IC": IC,
                    "rankIC": rankIC,
                    "factorP": factorP,
                    "factorT": factorT
                },
                index=[0]))

        """
        fig, ax = plt.subplots(figsize=(8,6))
        ax.plot(factor["con_roe"], y, 'o', label="Data")
        #ax.plot(x["con_roe"], y_true, 'b-', label="True")
        ax.plot(factor["con_roe"], res2.fittedvalues, 'r--.', label="RLMPredicted")
        ax.plot(factor["con_roe"], res.fittedvalues, 'b--.', label="OLSPredicted")
        legend = ax.legend(loc="best")
        """

    store.close()
    return periedValues
def fit_linreg_robust(x, y, mask=None, intercept=False, r2=True, est_method="rlm"): """Apply robust linear regression of y w.r.t x. Arguments --------- x: :class:`~numpy.ndarray` or sparse `csr_matrix` A vector of independent variables. y: :class:`~numpy.ndarray` or sparse `csr_matrix` A vector of dependent variables. intercept: bool If using steady state assumption for fitting, then: True -- the linear regression is performed with an unfixed intercept; False -- the linear regresssion is performed with a fixed zero intercept. est_method: str (default: `rlm`) The linear regression estimation method that will be used. Returns ------- k: float The estimated slope. b: float The estimated intercept. r2: float Coefficient of determination or r square calculated with the extreme data points. all_r2: float The r2 calculated using all data points. """ x = x.A if issparse(x) else x y = y.A if issparse(y) else y _mask = np.logical_and(~np.isnan(x), ~np.isnan(y)) if mask is not None: _mask &= mask xx = x[_mask] yy = y[_mask] try: if est_method.lower() == "rlm": xx_ = sm.add_constant(xx) if intercept else xx res = sm.RLM(yy, xx_).fit() k, b = res.params[::-1] if intercept else (res.params[0], 0) elif est_method.lower() == "ransac": reg = RANSACRegressor(LinearRegression(fit_intercept=intercept), random_state=0) reg.fit(xx.reshape(-1, 1), yy.reshape(-1, 1)) k, b = reg.estimator_.coef_[0, 0], (reg.estimator_.intercept_[0] if intercept else 0) else: raise ImportError( f"estimation method {est_method} is not implemented. " f"Currently supported linear regression methods include `rlm` and `ransac`." ) except: if intercept: ym = np.mean(yy) xm = np.mean(xx) cov = np.mean(xx * yy) - xm * ym var_x = np.mean(xx * xx) - xm * xm k = cov / var_x b = ym - k * xm # # assume b is always positive # if b < 0: # k, b = np.mean(xx * yy) / np.mean(xx * xx), 0 else: # use uncentered cov and var_x cov = np.mean(xx * yy) var_x = np.mean(xx * xx) k = cov / var_x b = 0 if r2: SS_tot_n, all_SS_tot_n = np.var(yy), np.var(y) SS_res_n, all_SS_res_n = ( np.mean((yy - k * xx - b)**2), np.mean((y - k * x - b)**2), ) r2, all_r2 = 1 - SS_res_n / SS_tot_n, 1 - all_SS_res_n / all_SS_tot_n return k, b, r2, all_r2 else: return k, b
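# A minimal usage sketch for fit_linreg_robust() above, run on synthetic data
# with a few injected outliers; the demo variable names are illustrative only:
import numpy as np

x_demo = np.linspace(0.0, 10.0, 200)
y_demo = 2.0 * x_demo + np.random.normal(0.0, 0.5, 200)
y_demo[::50] += 30.0                    # gross outliers the robust fit should down-weight
k, b, r2, all_r2 = fit_linreg_robust(x_demo, y_demo, intercept=True, est_method="rlm")
print(k, b, r2, all_r2)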
# There may be one or two x variables, and they must be written as columns
# raised to the corresponding powers.
# The following set of commands does exactly that:
x_var1 = create_var1(x_cols[0], centering)
x_array = x_var1
for i in range(2, x_degs[0] + 1):
    x_array = np.column_stack((x_array, x_var1**i))
if len(x_cols) > 1:
    x_var2 = create_var2(x_cols[1], centering)
    x_array = np.column_stack((x_array, x_var2))
    for i in range(2, x_degs[1] + 1):
        x_array = np.column_stack((x_array, x_var2**i))
x_array = sm.add_constant(x_array, prepend=True)

rlm_model = sm.RLM(y_array, x_array, M=sm.robust.norms.TukeyBiweight())
results = rlm_model.fit()

# -----------------------------------------------


def centering_back(params, degs):
    A, B1, C1, B2, C2, D2 = 0., 0., 0., 0., 0., 0.
    A = params[0]
    if len(degs) == 1:
        global m_x1, m_x2
        # Because m_x1 was set when x1 was built, while m_x2 stayed zero.
        # Here, however, the single variable is treated as the second one
        # (hence the 2's in the names of the variables used in this if block).
        m_x1, m_x2 = m_x2, m_x1
Robust Linear Models

Notes
-----
The syntax for the arguments will be shortened to accept string arguments
in the future.
"""

import statsmodels.api as sm

### Example for using Huber's T norm with the default
### median absolute deviation scaling
data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)
huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print hub_results.params
print hub_results.bse

### Or with the 'H2' covariance matrix
hub_results2 = huber_t.fit(cov="H2")
print hub_results2.params
print hub_results2.bse

### Example for using Andrew's Wave norm with
### Huber's Proposal 2 scaling and 'H3' covariance matrix
andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print andrew_results.params
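### The fitted robust weights show how strongly each observation was
### down-weighted in the Huber fit above. This line is a small illustrative
### addition, not part of the original statsmodels example.
print hub_results.weights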
def fit(self, y, X):
    model = sm.RLM(y, X, M=sm.robust.norms.HuberT())
    return model.fit()
def xr_regression_resid(y):
    date = y.date
    model = sm.RLM(y.values, X.loc[date].values, M=sm.robust.norms.HuberT())
    results = model.fit()
    return xr.DataArray(results.resid)
        dt_pos = idx_date.get_loc(cur_date)
        if dt_pos == 0:
            continue
        dt_pre_pos = dt_pos - 1
        # symbols having valid value (not nan)
        s = X[:, dt_pre_pos].notnull().all(axis=0)
        valid_x = X[:, dt_pre_pos, s].symbol.values
        w = y.loc[cur_date].notnull()
        valid_y = y.loc[cur_date, w].symbol.values
        valid_symbol = np.intersect1d(valid_x, valid_y)
        try:
            model = sm.RLM(
                y.loc[cur_date, valid_symbol].values,
                X.isel(date=dt_pre_pos,
                       symbol=idx_symbol.get_indexer(valid_symbol)).values.T,
                M=sm.robust.norms.HuberT())
            results = model.fit()
        except ValueError:
            continue
        params.loc[cur_date] = results.params
        residuals.loc[cur_date, valid_symbol] = results.resid


class RLMModel:
    """
    create RLM regression module
    """
    def __init__(self):
        pass
def main(files, n=''): # Input variables names xvar, yvar, tvar, zvar, evar, ivar = icol # If cubes for each mission are in separate files, # concatenate them and generate a single cube. # Each mission (on individual file) will be given a unique identifier. for nf, ifile in enumerate(files): print 'processing file:', ifile, '...' if nf == 0: with h5py.File(ifile, 'r') as fi: x = fi[xvar][:] # 1d y = fi[yvar][:] # 1d time = fi[tvar][:] # 1d elev = fi[zvar][:] # 3d mode = fi[ivar][:] if ivar in fi \ else np.full_like(time, nf) # 1d sigma = fi[evar][:] if evar in fi \ else np.full_like(elev, np.nan) # 3d else: with h5py.File(ifile, 'r') as fi: time = np.hstack((time, fi[tvar][:])) # 1d elev = np.dstack((elev, fi[zvar][:])) # 3d mode = np.hstack((mode, fi[ivar][:] if ivar in fi \ else np.full_like(fi[tvar][:], nf))) # 1d sigma = np.dstack((sigma, fi[evar][:] if evar in fi \ else np.full_like(fi[zvar][:], np.nan))) # 3d if len(np.unique(mode)) < 2: print 'it seems there is only one mission!' return t1, t2 = np.nanmin(time), np.nanmax(time) ##TODO: Rethink this # Output containers zi = np.full_like(elev, np.nan) ei = np.full_like(elev, np.nan) ni = np.full_like(elev, np.nan) # Temporal coverage t_pct = np.zeros(elev.shape) # Minimum sampling for all mission < 81.5 deg nsam = 0.60 # Enter prediction loop for i in xrange(elev.shape[0]): for j in xrange(elev.shape[1]): # Number of observations nobs = 0 # Time difference dt = 0 # Temporal sampling npct = 1 # Number of sensors nsen = 0 # Final test of data coverage #if (nobs < nlim) or (npct < 0.70): continue # Parameters for model-solution tcap = time[:] mcap = mode[:] hcap = elev[i, j, :] scap = sigma[i, j, :] torg = tcap.copy() morg = mcap.copy() horg = hcap.copy() sorg = scap.copy() # Least-Squares Adjustment # --------------------------------- # # h = x_t + x_j + x_s # x = (A' A)^(-1) A' y # r = y - Ax # # --------------------------------- # Need to think of a smarth way to filter out outliears. # In particular those at the end of each mission-record!!! # Also, need to plot and see how the model fit compares to the data. ##FIXME ############################################################ # compute median series ##NOTE: Not needed for calibrating cube series (they are clean) if 0: hcap = binfilter(tcap, hcap, mcap, window=3, n_abs=5, interp=False) ##FIXME ############################################################ if sum(~np.isnan(hcap)) < nlim: continue #plt.figure() ii = mcap == np.unique(mcap)[0] jj = mcap == np.unique(mcap)[1] plt.plot(tcap[ii], hcap[ii]) plt.plot(tcap[jj], hcap[jj]) dt = tcap - tref # trend component # Create design matrix for alignment Acap, cols = design_matrix(dt, mcap) try: # Least-squares bias adjustment linear_model = sm.RLM(hcap, Acap, missing='drop') linear_model_fit = linear_model.fit(maxiter=niter) except: print "Solution invalid!" continue # Coefficients and standard errors Cm = linear_model_fit.params Ce = linear_model_fit.bse # Compute model residuals dh = hcap - np.dot(Acap, Cm) # Compute RMSE of corrected residuals (fit) rms_fit = mad_std(dh) # Bias correction (mission offsets) h_cal_fit = np.dot(Acap[:, cols], Cm[cols]) # Remove inter satellite biases horg -= h_cal_fit # Plot if 1: plt.figure() plt.plot(torg[ii], horg[ii]) plt.plot(torg[jj], horg[jj]) plt.show() ##FIXME: This doesn't work. Think of a better strategy!!!!!!!!!!!! ##TODO: How/Where to do this??? 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Bin full calibrated record if 0: tmed, hmed, emed, nmed = binning(torg, horg, xmin=t1, xmax=t2, dx=1 / 12., window=3 / 12., median=True, interp=False)[:4] # Interpolate ''' try: i_valid = ~np.isnan(hmed) i_inval = np.isnan(hmed) hmed[i_inval] = np.interp(tmed[i_inval], tmed[i_valid], hmed[i_valid]) except: continue ''' # Reference final solution ''' if 1: # To original discrete time step idx = find_nearest(tmed, tref) hmed -= hmed[idx] else: # To exact given time epoch href = np.interp(tref, tmed[~np.isnan(hmed)], hmed[~np.isnan(hmed)]) ''' """ zi[i,j,:] = hmed ei[i,j,:] = emed ni[i,j,:] = nmed """ # Plot crosscal time series if 1: horg[np.abs(horg) > mad_std(horg) * 5] = np.nan plt.figure(figsize=(12, 4)) plt.scatter(tcap, horg, s=10, c=mcap, alpha=0.7, cmap='tab10') plt.scatter(tcap, hcap, s=10, c=mcap, cmap='gray') try: plt.figure(figsize=(12, 3.5)) plt.plot(tmed, hmed, '-', linewidth=2) plt.ylim(np.nanmin(hmed), np.nanmax(hmed)) plt.xlim(t1, t2) except: pass plt.show() continue ''' # Transform coordinates (lon_i, lat_i) = transform_coord(projGrd, projGeo, xcap, ycap) (lon_0, lat_0) = transform_coord(projGrd, projGeo, xi[i], yi[i]) # ********************** # # Apply calibration to original data points horg -= h_cal_fit # Save output variables to list for each solution lats.append(lat_i) lons.append(lon_i) lat0.append(lat_0) lon0.append(lon_0) dxy0.append(dxy) h_ts.append(horg) e_ts.append(sorg) m_id.append(morg) h_cf.append(h_cal_fit) f_cr.append(flag) tobs.append(torg) rmse.append(rms_fit) ''' # Transform coordinates # Print meta data to terminal if (i % 1) == 0: print 'Progress:',str(i),'/',str(len(xi)), \ 'Rate:', np.around(Cm[1],2), \ 'Acceleration:', np.around(Cm[2],2) # Saveing the data to file print 'Saving data to file ...' ''' ofile = ifile.replace('.h5', '_XCAL_FUSED.h5') with h5py.File(ofile, 'w') as f: f['h_res'] = zi.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0]) f['h_err'] = ei.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0]) f['n_obs'] = ni.reshape(Xi.shape[0], Xi.shape[1], ti.shape[0]) f['x'] = Xi[0,:] f['y'] = Yi[:,0] f['t'] = tmed print 'out ->', ofile ''' return
def main(ifile, n='', robust_fit=True, n_iter=niter): # Check for empty file if is_empty(ifile): print 'SKIP FILE: EMPTY OR CORRUPTED FILE:', ifile return # Start timing of script startTime = datetime.now() print 'loading data ...' xvar, yvar, tvar, zvar, svar, ivar, cvar = names with h5py.File(ifile, 'r') as fi: lon = fi[xvar][:] lat = fi[yvar][:] time = fi[tvar][:] height = fi[zvar][:] sigma = fi[svar][:] if svar in fi else np.ones(lon.shape) id = fi[ivar][:] if ivar in fi else np.ones(lon.shape) * nmidx cal = fi[cvar][:] if cvar in fi else np.zeros(lon.shape) # Filter in time if 1: i_time, = np.where( (time > 1993.972) & (time < 1995.222)) ##NOTE: To remove ERS-1 GM if len(i_time) > 0: height[i_time] = np.nan ##NOTE: Filter data based on 'cal' but DO NOT REMOVE NANs! if sum(cal) != 0: cal[np.isnan(cal)] = 0. # keep values w/o correction height -= cal # correct absolute H for bs # Filter NaNs if 1: i_valid = ~np.isnan(height) lon = lon[i_valid] lat = lat[i_valid] time = time[i_valid] height = height[i_valid] sigma = sigma[i_valid] id = id[i_valid] cal = cal[i_valid] projGeo = '4326' # EPSG number for lon/lat proj projGrd = projo # EPSG number for grid proj print 'converting lon/lat to x/y ...' # If no bbox was given if bbox_ is None: try: bbox = get_bbox(ifile) # Try reading bbox from file name except: bbox = None else: bbox = bbox_ # Get geographic boundaries + max search radius if bbox: # Extract bounding box xmin, xmax, ymin, ymax = bbox # Transform coordinates x, y = transform_coord(projGeo, projGrd, lon, lat) # Select data inside bounding box Ig = (x >= xmin - dmax) & (x <= xmax + dmax) & (y >= ymin - dmax) & ( y <= ymax + dmax) # Check bbox for obs. if len(x[Ig]) == 0: print 'SKIP FILE: NO DATA POINTS INSIDE BBOX:', ifile return print 'Number of obs. edited by bbox!', 'before:', len( x), 'after:', len(x[Ig]) # Only select wanted data x = x[Ig] y = y[Ig] id = id[Ig] time = time[Ig] height = height[Ig] sigma = sigma[Ig] else: # Convert into stereographic coordinates x, y = transform_coord(projGeo, projGrd, lon, lat) # Get bbox from data xmin, xmax, ymin, ymax = x.min(), x.max(), y.min(), y.max() # Apply transformation to time if expr: time = eval(expr.replace('t', 'time')) # Define time interval of solution if tspan: # Time interval = given time span t1lim, t2lim = tspan # Select only observations inside time interval Itime = (time > t1lim) & (time < t2lim) # Keep only data inside time span x = x[Itime] y = y[Itime] id = id[Itime] time = time[Itime] height = height[Itime] sigma = sigma[Itime] else: # Time interval = all data t1lim, t2lim = time.min(), time.max() if mode == 'p': # Point solution - all points xi, yi = np.copy(x), np.copy(y) else: # Grid solution - defined by nodes Xi, Yi = make_grid(xmin, xmax, ymin, ymax, dx, dy) xi, yi = Xi.ravel(), Yi.ravel() coord = zip(x.ravel(), y.ravel()) print 'building the k-d tree ...' Tree = cKDTree(coord) # Overall (fixed) mean time t_mean = np.round(np.nanmean(time), 2) # Number of nodes nodes = len(xi) # Initialize bias param bias = np.ones(lon.shape) * np.nan # Temporal resolution: months -> years tstep = tstep_ / 12.0 # Expected max number of months in time series months = len(np.arange(t1lim, t2lim + tstep, tstep)) M = 5 # Create output containers (data matrix) DATA0 = np.full((nodes, 21), np.nan) DATA1 = np.full((nodes, months + M), np.nan) DATA2 = np.full((nodes, months + M), np.nan) # Search radius array (dmax is slightly increased by 1e-4) dr = np.arange(dmin, dmax, 500) # Enter prediction loop print 'predicting values ...' 
for i in xrange(len(xi)): xc, yc = xi[i], yi[i] # Center coordinates # Loop through search radii for rad in dr: # Get indices of data within search radius (after relocation) i_cell, reloc_dist = get_radius_idx(x, y, xc, yc, rad, Tree, n_reloc=nreloc) if len(i_cell) < nlim: continue # use larger radius tcap, hcap = time[i_cell], height[i_cell] Nb = sum(~np.isnan(hcap)) # length before editing # 3-sigma filter if SIGMAFILT: #hcap = sigma_filter(tcap, hcap, order=1, n_sigma=3, n_iter=3) ##NOTE: It removes too much!!! hcap[np.abs(hcap - np.nanmedian(hcap)) > mad_std(hcap) * 3] = np.nan hcap[np.abs(hcap - np.nanmedian(hcap)) > 300] = np.nan Na = sum(~np.isnan(hcap)) # Length after editing n_mon, t_span = n_months(tcap, hcap, tstep=tstep) ##NOTE: Not using n_mon and t_span to constrain the solution! <<<<<<<<<<<<<<<<<<<<< # If enough data accept radius #if Na >= nlim and n_mon >= MINMONTHS and t_span >= dtlim: if Na >= nlim: break else: i_cell = [] if not i_cell: continue # Parameters for model-solution xcap = x[i_cell] ycap = y[i_cell] tcap = time[i_cell] hcap = height[i_cell] mcap = id[i_cell] scap = sigma[i_cell] i_valid = ~np.isnan(hcap) if sum(i_valid) < nlim: continue xcap = xcap[i_valid] ycap = ycap[i_valid] tcap = tcap[i_valid] hcap = hcap[i_valid] mcap = mcap[i_valid] scap = scap[i_valid] if nreloc: xc = np.median(xcap) # update inversion cell coords yc = np.median(ycap) # Define resolution param (a fraction of the accepted radius) dres = dres_ * rad # Estimate variance vcap = scap * scap # If reference time not given, use fixed or variable mean if tref_ == 'fixed': tref = t_mean elif tref_ == 'variable': tref = np.nanmean(tcap) else: tref = np.float(tref_) # Design matrix elements c0 = np.ones(len(xcap)) # intercept (0) c1 = xcap - xc # dx (1) c2 = ycap - yc # dy (2) c3 = c1 * c2 # dx**2 c4 = c1 * c1 # dx**2 c5 = c2 * c2 # dy**2 c6 = tcap - tref # trend (6) c7 = 0.5 * (c6 * c6) # acceleration (7) c8 = np.sin(2 * np.pi * c6) # seasonal sin (8) c9 = np.cos(2 * np.pi * c6) # seasonal cos (9) # Compute distance from prediction point to data inside cap dist = np.sqrt((xcap - xc) * (xcap - xc) + (ycap - yc) * (ycap - yc)) # Add small value to stabilize SVD solution vcap += 1e-6 # Weighting factor: distance and error Wcap = 1.0 / (vcap * (1.0 + (dist / dres) * (dist / dres))) # Create some intermediate output variables sx, sy, at, ae, bi = np.nan, np.nan, np.nan, np.nan, np.nan # Setup design matrix if model == 0: # Trend and seasonal Acap = np.vstack((c0, c8, c9, c6)).T mcol = [1, 2, 3] # columns to add back elif model == 1: # Trend, acceleration and seasonal Acap = np.vstack((c0, c7, c8, c9, c6)).T mcol = [1, 2, 3, 4] elif model == 2: # Trend, acceleration, seasonal and bi-linear surface Acap = np.vstack((c0, c1, c2, c7, c8, c9, c6)).T mcol = [3, 4, 5, 6] else: # Trend, acceleration, seasonal and bi-quadratic surface (full model) Acap = np.vstack((c0, c1, c2, c3, c4, c5, c7, c8, c9, c6)).T mcol = [6, 7, 8, 9] has_bias = False # bias flag # Check if bias is needed if len(np.unique(mcap)) > 1: # Add bias to design matrix Acap = np.vstack((Acap.T, mcap)).T has_bias = True ##NOTE: Not using t_span to constrain solution! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Check constrains before solving model (min_pts and min_tspan) #if len(hcap) < nlim or np.max(tcap)-np.min(tcap) < dtlim: continue if len(hcap) < nlim: continue """ Least-squares fit """ if robust_fit: # Robust least squares try: model_fit = sm.RLM(hcap, Acap, missing='drop').fit(maxiter=n_iter, tol=0.001) except: print 'SOMETHING WRONG WITH THE FIT... SKIPPING CELL!!!' continue else: # Weighted Least squares model_fit = sm.WLS(hcap, Acap, weights=Wcap, missing='drop').fit() Cm = model_fit.params # coeffs Ce = model_fit.bse # std err resid = model_fit.resid # data - model # Check rate and error if np.abs(Cm[-1]) > dhlim or np.isinf(Ce[-1]): continue ##NOTE: Important for ICESat !!! # Residuals dH = H - A * Cm (remove linear trend) dh = hcap - np.dot(Acap, Cm) if robust_fit: chisq = chisquared(model_fit) else: chisq = rsquared(model_fit) # Compute amplitude of seasonal signal asea = np.sqrt(Cm[-2] * Cm[-2] + Cm[-3] * Cm[-3]) # Compute phase offset psea = np.arctan2(Cm[-2], Cm[-3]) # Convert phase to decimal years ##FIXME: Convert phase to days !!! psea /= (2 * np.pi) # Compute root-mean-square of full model residuals rms = mad_std(resid) # Add back wanted model parameters dh += np.dot(Acap[:, mcol], Cm[mcol]) # Simple binning of residuals tb, hb, eb, nb = binning( tcap.copy(), dh.copy(), t1lim, t2lim, tstep)[:4] ##FIXME: Use Median to construct time series # Convert centroid location to latitude and longitude lon_c, lat_c = transform_coord(projGrd, projGeo, xc, yc) # Position DATA0[i, 0] = lat_c DATA0[i, 1] = lon_c # Elevation Change DATA0[i, 2] = Cm[-1] # trend DATA0[i, 3] = Ce[-1] # trend error # Compute acceleration and error if model > 0: at, ae = Cm[-4], Ce[-4] DATA0[i, 4] = at # acceleration DATA0[i, 5] = ae # acceleration error # Surface Elevation DATA0[i, 6] = Cm[0] DATA0[i, 7] = Ce[0] # Model RMS DATA0[i, 8] = rms # Compute x,y slopes in degrees if model > 1: sx, sy = np.arctan(Cm[1]) * (180 / np.pi), np.arctan( Cm[2]) * (180 / np.pi) # Surface slope values DATA0[i, 9] = sx DATA0[i, 10] = sy # Time span of data in cap DATA0[i, 11] = t_span DATA0[i, 12] = tref # Seasonal signal DATA0[i, 13] = asea DATA0[i, 14] = psea # Bias magnitude if has_bias: bi = Cm[-1] # Aux-data from solution DATA0[i, 15] = len(hcap) DATA0[i, 16] = dmin DATA0[i, 17] = rad DATA0[i, 18] = Nb - Na DATA0[i, 19] = chisq DATA0[i, 20] = bi # Time series values DATA1[i, :] = np.hstack((lat_c, lon_c, t1lim, t2lim, len(tb), hb)) ##FIXME: Think how to do this better DATA2[i, :] = np.hstack((lat_c, lon_c, t1lim, t2lim, len(tb), eb)) # Print progress (every N iterations) if (i % 200) == 0: print 'cell#', str(i) + "/" + str(len(xi)), \ 'trend:', np.around(Cm[mcol[-1]],2), 'm/yr', 'n_months:', n_mon, \ 'n_pts:', len( resid), 'radius:', rad, 'reloc_dist:', reloc_dist # Remove invalid entries from data matrix if mode == 'p': i_nan = np.where(np.isnan(DATA0[:, 3])) DATA0 = np.delete(DATA0.T, i_nan, 1).T i_nan = np.where(np.isnan(DATA1[:, 3])) DATA1 = np.delete(DATA1.T, i_nan, 1).T i_nan = np.where(np.isnan(DATA2[:, 3])) DATA2 = np.delete(DATA2.T, i_nan, 1).T else: ##NOTE: NaNs are not removed in case a grid soluction (n_reloc=0) is selected. 
if not nreloc: grids = [d.reshape(Xi.shape) for d in DATA0.T] # 1d -> 2d (grids) variables = [ 'lat', 'lon', 'trend', 'trend_err', 'accel', 'accel_err', 'height', 'height_err', 'model_rms', 'slope_x', 'slope_y', 't_span', 't_ref', 'amp_seas', 'pha_seas', 'n_obs', 'd_min', 'd_ri', 'n_edited', 'chi2', 'bias' ] # Check if output arrays are empty if np.isnan(DATA0[:, 3]).all(): print 'SKIP FILE: NO PREDICTIONS TO SAVE:', ifile return # Define output file name if ofile: outfile = ofile else: outfile = ifile # Output file names - strings path, ext = os.path.splitext(outfile) ofile0 = path + '_sf.h5' ofile1 = path + '_ts.h5' ofile2 = path + '_es.h5' print 'saving data ...' # Save surface fit parameters with h5py.File(ofile0, 'w') as fo0: if mode == 'p': fo0['sf'] = DATA0 # data matrix elif nreloc: for v, a in zip(variables, DATA0.T): fo0[v] = a # 1d arrays else: for v, g in zip(variables, grids): fo0[v] = g # 2d arrays fo0['x'], fo0['y'] = Xi[0, :], Yi[:, 0] # Save binned time series values with h5py.File(ofile1, 'w') as fo1: fo1['ts'] = DATA1 # Save binned time series errors with h5py.File(ofile2, 'w') as fo2: fo2['es'] = DATA2 # Print some statistics print '*' * 70 print('%s %.5f %s %.2f %s %.2f %s %.2f %s %s' % ('Mean:', np.nanmean(DATA0[:, 2]), 'Std:', np.nanstd( DATA0[:, 2]), 'Min:', np.nanmin(DATA0[:, 2]), 'Max:', np.nanmax(DATA0[:, 2]), 'Model:', model)) print '*' * 70 print 'Execution time: ' + str(datetime.now() - startTime) print 'Surface fit results ->', ofile0 print 'Time series values -> ', ofile1 print 'Time series errors -> ', ofile2
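# ----------------------------------------------------------------------------
# A minimal sketch of the robust_fit switch used in the surface-fit loop above:
# the same design matrix can be fit either with sm.RLM (iteratively
# re-weighted, outlier-resistant) or with sm.WLS using distance/error weights.
# The data, design matrix and weights below are synthetic stand-ins.
# ----------------------------------------------------------------------------
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
t = rng.uniform(-3.0, 3.0, 400)                  # time relative to a reference epoch
h = 0.8 + 0.12 * t + rng.normal(0.0, 0.05, 400)  # intercept + trend + noise
h[:10] += 5.0                                    # a few gross outliers
A = np.vstack((np.ones_like(t), t)).T            # intercept + trend design matrix
w = np.ones_like(t)                              # stand-in for the distance/error weights (Wcap)

rlm_trend = sm.RLM(h, A, missing='drop').fit(maxiter=100, tol=0.001).params[-1]
wls_trend = sm.WLS(h, A, weights=w, missing='drop').fit().params[-1]
print('RLM trend:', round(rlm_trend, 3), ' WLS trend:', round(wls_trend, 3))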
# In[8]:

dataset.corr() > 0.98

# In[17]:

xtrain_dataframe = pd.DataFrame(xtrain)
ytrain_dataframe = pd.DataFrame(ytrain)
xtest_dataframe = pd.DataFrame(xtest)
ytest_dataframe = pd.DataFrame(ytest)
xtrain_dataframe.columns = [u'R1', u'R2', u'R3', u'R4', u'R5', u'R6', u'R7', u'R8', u'Temp.', u'Humidity']
ytrain_dataframe.columns = ['class']
xtest_dataframe.columns = [u'R1', u'R2', u'R3', u'R4', u'R5', u'R6', u'R7', u'R8', u'Temp.', u'Humidity']
ytest_dataframe.columns = ['class']

res = sm.RLM(ytrain_dataframe, xtrain_dataframe).fit()
res.summary()

# When you perform a hypothesis test in statistics, a p-value helps you determine
# the significance of your results. Hypothesis tests are used to test the validity
# of a claim that is made about a population. This claim that's on trial is called
# the null hypothesis.
#
# The alternative hypothesis is the one you would believe if the null hypothesis is
# concluded to be untrue. The evidence in the trial is your data and the statistics
# that go along with it. All hypothesis tests ultimately use a p-value to weigh the
# strength of the evidence (what the data are telling you about the population).
# The p-value is a number between 0 and 1, interpreted in the following way:
#
# 1. A small p-value (typically <= 0.05) indicates strong evidence against the null
#    hypothesis, so you reject the null hypothesis.
#
# 2. A large p-value (> 0.05) indicates weak evidence against the null hypothesis,
#    so you fail to reject the null hypothesis.
#
# 3. p-values very close to the cutoff (0.05) are considered marginal (they could go
#    either way). Always report the p-value so your readers can draw their own
#    conclusions.
#
# #### The p-value for R1 is large, which means this variable has no significant
# #### effect on the model, so we can remove it from the model.
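# A minimal sketch of how the p-values discussed above can be inspected
# programmatically; `res` is the fitted RLM result from the cell above and
# the 0.05 threshold is illustrative:
insignificant = res.pvalues[res.pvalues > 0.05]
print('Candidate columns to drop:', list(insignificant.index))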
def OLS_plot(col_x, col_y, dat, hue=None, robust=False, title=None, color='blue', aspect=3): ''' create correlation plot between two columns in a dataframe; add r2 and kendal tau stats to plot hue: name of column used to color the data points ''' #Calculate correlation stats #OLS regression if robust == False: res = sm.OLS(dat[col_y], sm.add_constant(dat[col_x]), missing='drop').fit() pval = res.pvalues[col_x] r2 = res.rsquared_adj slope = res.params[col_x] if robust: res = sm.RLM(dat[col_y], sm.add_constant(dat[col_x]), missing='drop').fit() pval = res.pvalues[col_x] r2 = sm.OLS( dat[col_y], dat[col_x], missing='drop').fit().rsquared_adj #same r2 as for non-robust slope = res.params[col_x] #Kendal-Tau (non-parametric) kt_dat = dat.dropna(subset=[col_x, col_y]) kendall_tau, kt_pval_num = scipy.stats.stats.kendalltau(kt_dat[col_y], kt_dat[col_x], nan_policy="omit") kt_pval = pretty_p_val(kt_pval_num) #Build plot sns.lmplot(y=col_y, x=col_x, data=dat, hue=hue, robust=robust, line_kws={ 'color': 'red', 'lw': 1, 'alpha': 0.8 }, scatter_kws={ 'color': color, 'alpha': 0.6 }, aspect=aspect) #plt.xticks(rotation=-90) summary_text = "$r^2$=" + str(r2)[0:4] + "; " + pretty_p_val( pval) + ". Slope= " + str(slope.round(4)) plt.tight_layout(pad=2) plt.figtext(0.93, 0.01, summary_text, horizontalalignment='right') plt.figtext(0.02, 0.01, r"K. $\tau$ = " + str(kendall_tau)[0:5] + "; " + kt_pval, horizontalalignment='left') ax = plt.gca() ax.set_title(title) #store results for function to return d = { 'r2': [r2], 'r2_p': [pval], 'slope': [slope], 'kendall_tau': [kendall_tau], 'kt_pval': [kt_pval_num] } df = pd.DataFrame(data=d) return (df)
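# A hypothetical usage sketch for OLS_plot() above; `df` and the column names
# 'age' and 'score' are illustrative stand-ins, not from the original code:
stats_df = OLS_plot('age', 'score', df, robust=True, title='score vs. age')
print(stats_df)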
def relative2abs(adata, dilution, volume, from_layer=None, to_layers=None, mixture_type=1, ERCC_controls=None, ERCC_annotation=None): """Converts FPKM/TPM data to transcript counts using ERCC spike-in. This is based on the relative2abs function from monocle 2 (Qiu, et. al, Nature Methods, 2017). Parameters ---------- adata: :class:`~anndata.AnnData` an Annodata object dilution: `float` the dilution of the spikein transcript in the lysis reaction mix. Default is 40, 000. The number of spike-in transcripts per single-cell lysis reaction was calculated from. volume: `float` the approximate volume of the lysis chamber (nanoliters). Default is 10 from_layer: `str` or `None` The layer in which the ERCC TPM values will be used as the covariate for the ERCC based linear regression. to_layers: `str`, `None` or `list-like` The layers that our ERCC based transformation will be applied to. mixture_type: the type of spikein transcripts from the spikein mixture added in the experiments. By default, it is mixture 1. Note that m/c we inferred are also based on mixture 1. ERCC_controls: the FPKM/TPM matrix for each ERCC spike-in transcript in the cells if user wants to perform the transformation based on their spike-in data. Note that the row and column names should match up with the ERCC_annotation and relative_ exprs_matrix respectively. ERCC_annotation: the ERCC_annotation matrix from illumina USE GUIDE which will be ued for calculating the ERCC transcript copy number for performing the transformation. Returns ------- An adata object with the data specified in the to_layers transformed into absolute counts. """ if ERCC_annotation is None: ERCC_annotation = pd.read_csv( 'https://www.dropbox.com/s/cmiuthdw5tt76o5/ERCC_specification.txt?dl=1', sep='\t') ERCC_id = ERCC_annotation['ERCC ID'] ERCC_id = adata.var_names.intersection(ERCC_id) if len(ERCC_id) < 10 and ERCC_controls is None: raise Exception( f'The adata object you provided has less than 10 ERCC genes.') if to_layers is not None: to_layers = [to_layers] if to_layers is str else to_layers to_layers = list(set(adata.layers.keys()).intersection(to_layers)) if len(to_layers) == 0: raise Exception( f"The layers {to_layers} that will be converted to absolute counts doesn't match any layers" f"from the adata object.") mixture_name = "concentration in Mix 1 (attomoles/ul)" if mixture_type == 1 else "concentration in Mix 2 (attomoles/ul)" ERCC_annotation['numMolecules'] = ERCC_annotation.loc[:, mixture_name] * ( volume * 10**(-3) * 1 / dilution * 10**(-18) * 6.02214129 * 10**(23)) ERCC_annotation['rounded_numMolecules'] = ERCC_annotation[ 'numMolecules'].astype(int) if from_layer in [None, 'X']: X, X_ercc = (adata.X, adata[:, ERCC_id].X if ERCC_controls is None else ERCC_controls) else: X, X_ercc = (adata.layers[from_layer], adata[:, ERCC_id] \ if ERCC_controls is None else ERCC_controls) logged = False if X.max() > 100 else True if not logged: X, X_ercc = (np.log1p(X.A) if issparse(X_ercc) else np.log1p(X), \ np.log1p(X_ercc.A) if issparse(X_ercc) else np.log1p(X_ercc)) else: X, X_ercc = (X.A if issparse(X_ercc) else X, X_ercc.A if issparse(X_ercc) else X_ercc) y = np.log1p(ERCC_annotation['numMolecules']) for i in range(adata.n_obs): X_i, X_ercc_i = X[i, :], X_ercc[i, :] X_i, X_ercc_i = sm.add_constant(X_i), sm.add_constant(X_ercc_i) res = sm.RLM(y, X_ercc_i).fit() k, b = res.params[::-1] if to_layers is None: X = adata.X logged = False if X.max() > 100 else True if not logged: X_i = np.log1p(X[i, :].A) if issparse(X) else np.log1p(X[i, :]) else: X_i = 
X[i, :].A if issparse(X) else X[i, :]

            res = k * X_i + b
            res = res if logged else np.expm1(res)
            adata.X[i, :] = csr_matrix(res) if issparse(X) else res
        else:
            for cur_layer in to_layers:
                X = adata.layers[cur_layer]
                logged = False if X.max() > 100 else True
                if not logged:
                    X_i = np.log1p(X[i, :].A) if issparse(X) else np.log1p(X[i, :])
                else:
                    X_i = X[i, :].A if issparse(X) else X[i, :]

                res = k * X_i + b if logged else np.expm1(k * X_i + b)
                adata.layers[cur_layer][i, :] = csr_matrix(res) if issparse(X) else res
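# A hypothetical usage sketch for relative2abs() above; the dilution and volume
# values mirror the defaults described in the docstring, and `adata` is assumed
# to be an AnnData object whose var_names include ERCC spike-in IDs:
relative2abs(adata, dilution=40000, volume=10, mixture_type=1)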
# Example: OLS
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
print(results.params)
print(results.cov_params())

infl = results.get_influence()
print(infl.summary_table())
#raise

# Example: RLM
huber_t = sm.RLM(Y, X, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bcov_scaled)
print(hub_results.summary())


import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl


def plot_acf_multiple(ys, lags=20):
    """
    """
    from statsmodels.tsa.stattools import acf
print('corrected rsquared')
print((wls_fit3.uncentered_tss - wls_fit3.ssr) / wls_fit3.uncentered_tss)
plt.figure()
plt.title('WLS dropping heteroscedasticity variable from regressors')
plt.plot(data.endog, wls_fit3.fittedvalues, 'o')
plt.xlim([0, 2000])
# @savefig wls_drop_het.png
plt.ylim([0, 2000])

print('raw correlation of endog and fittedvalues')
print(np.corrcoef(data.endog, wls_fit.fittedvalues))
print('raw correlation coefficient of endog and fittedvalues squared')
print(np.corrcoef(data.endog, wls_fit.fittedvalues)[0, 1]**2)

# compare with robust regression,
# heteroscedasticity correction downweights the outliers
rlm_fit = sm.RLM(data.endog, data.exog).fit()
plt.figure()
plt.title('using robust for comparison')
plt.plot(data.endog, rlm_fit.fittedvalues, 'o')
plt.xlim([0, 2000])
# @savefig wls_robust_compare.png
plt.ylim([0, 2000])


# What is going on? A more systematic look at the data
# ----------------------------------------------------

# two helper functions

def getrsq(fitresult):
    '''calculates rsquared residual, total and explained sums of squares
def rlsq(x, y, n=1):
    """ Fit a robust polynomial of n:th deg."""

    # Test solution
    if len(x[~np.isnan(y)]) <= (n + 1):
        if n == 0:
            p = np.nan
            s = np.nan
        else:
            p = np.zeros((1, n)) * np.nan
            s = np.nan
        return p, s

    # Empty array
    A = np.empty((0, len(x)))

    # Create counter
    i = 0

    # Determine if we need centering
    if n > 1:
        # Center x-axis
        x -= np.nanmean(x)

    # Special case
    if n == 0:
        # Mean offset
        A = np.ones(len(x))
    else:
        # Make design matrix
        while i <= n:
            # Stack coefficients
            A = np.vstack((A, x ** i))
            # Update counter
            i += 1

    # Test to see if we can solve the system
    try:
        # Robust least squares fit
        fit = sm.RLM(y, A.T, missing='drop').fit(maxiter=5, tol=0.001)
        # Polynomial coefficients
        p = fit.params
        # RMS of the residuals
        s = mad_std(fit.resid)
    except:
        # Set output to NaN
        if n == 0:
            p = np.nan
            s = np.nan
        else:
            p = np.zeros((1, n)) * np.nan
            s = np.nan

    return p[::-1], s
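# A small usage sketch for rlsq() above: fit a robust 1st-degree polynomial to
# synthetic data containing NaNs and one gross outlier (names are illustrative):
import numpy as np

t_demo = np.linspace(0.0, 5.0, 100)
h_demo = 1.5 - 0.3 * t_demo + np.random.normal(0.0, 0.05, 100)
h_demo[10:15] = np.nan
h_demo[50] += 10.0                      # outlier, down-weighted by the RLM fit
coefs, rms = rlsq(t_demo.copy(), h_demo, n=1)
print('slope, intercept:', coefs, ' residual MAD-std:', rms)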
if (context.vlevel >= 2):
    sys.stderr.write("%s\n" % fulldiv)
    sys.stderr.write("guess_ra: %15.7f\n" % guess_ra)
    sys.stderr.write("guess_de: %15.7f\n" % guess_de)

#afpars = [np.radians(guess_ra), np.radians(guess_de), ts_pmra_masyr/1e3, ts_pmde_masyr/1e3, 1.0]
afpars = [np.radians(guess_ra), np.radians(guess_de),
          np.radians(ts_ra_model[1]), np.radians(ts_de_model[1]), 1.0]
appcoo = af.apparent_radec(use_epoch.tdb.jd, afpars, use_eph)

# proper fit:
design_matrix = np.column_stack((np.ones(syr.size), syr))
#de_design_matrix = np.column_stack((np.ones(syr.size), syr))
ra_ols_res = sm.OLS(sra, design_matrix).fit()
de_ols_res = sm.OLS(sde, design_matrix).fit()
ra_rlm_res = sm.RLM(sra, design_matrix).fit()
de_rlm_res = sm.RLM(sde, design_matrix).fit()
rlm_pmde_masyr = de_rlm_res.params[1] * 3.6e6
rlm_pmra_masyr = ra_rlm_res.params[1] * 3.6e6 \
    * np.cos(np.radians(de_rlm_res.params[0]))

if (context.vlevel >= 1):
    sys.stderr.write("%s\n" % fulldiv)
    sys.stderr.write("\nTheil-Sen intercepts:\n")
    sys.stderr.write("RA: %15.7f\n" % ts_ra_model[0])
    sys.stderr.write("DE: %15.7f\n" % ts_de_model[0])
    sys.stderr.write("\nTheil-Sen proper motions:\n")
    sys.stderr.write("RA: %10.6f mas/yr\n" % ts_pmra_masyr)
    sys.stderr.write("DE: %10.6f mas/yr\n" % ts_pmde_masyr)
def main(ifile, n=''):

    # Check for empty file
    if os.stat(ifile).st_size == 0:
        print('input file is empty!')
        return

    # Start timing of script
    startTime = datetime.now()

    print('loading data ...')

    # Determine input file type
    if not ifile.endswith(('.h5', '.H5', '.hdf', '.hdf5')):
        print("Input file must be in hdf5-format")
        return

    # Input variables
    xvar, yvar, tvar, zvar = icol

    # Load all 1d variables needed
    with h5py.File(ifile, 'r') as fi:
        lon = fi[xvar][:]
        lat = fi[yvar][:]
        time = fi[tvar][:]
        height = fi[zvar][:]

    # EPSG number for lon/lat proj
    projGeo = '4326'

    # EPSG number for grid proj
    projGrd = proj

    print('converting lon/lat to x/y ...')

    # Convert into stereographic coordinates
    (x, y) = transform_coord(projGeo, projGrd, lon, lat)

    # Get bbox from data
    (xmin, xmax, ymin, ymax) = x.min(), x.max(), y.min(), y.max()

    # Apply transformation to time
    if expr:
        time = eval(expr.replace('t', 'time'))

    # Overall (fixed) mean time
    t_mean = np.round(np.nanmean(time), 2)

    # Grid solution - defined by nodes
    (Xi, Yi) = make_grid(xmin, xmax, ymin, ymax, dx, dy)

    # Flatten prediction grid
    xi = Xi.ravel()
    yi = Yi.ravel()

    # Zip data to vector
    coord = list(zip(x.ravel(), y.ravel()))

    # Construct cKDTree
    print('building the k-d tree ...')
    Tree = cKDTree(coord)

    # Create output containers
    dh_topo = np.full(height.shape, np.nan)
    de_topo = np.full(height.shape, 999999.)
    mi_topo = np.full(height.shape, np.nan)
    hm_topo = np.full(height.shape, np.nan)
    sx_topo = np.full(height.shape, np.nan)
    sy_topo = np.full(height.shape, np.nan)
    tr_topo = np.full(height.shape, np.nan)

    # Set slope limit
    slp_lim = np.tan(np.deg2rad(slplim))

    # Enter prediction loop
    print('predicting values ...')
    for i in range(len(xi)):

        x0, y0 = xi[i], yi[i]

        # Get indexes of data within search radius or cell bbox
        idx = get_radius_idx(
            x, y, x0, y0, dmax, Tree, n_reloc=nreloc,
            min_months=18, max_reloc=3, time=None, height=None)

        # Length of data in search cap
        nobs = len(x[idx])

        # Check data density
        if nobs < nlim:
            continue

        # Parameters for model-solution
        xcap = x[idx]
        ycap = y[idx]
        tcap = time[idx]
        hcap = height[idx]

        # Copy original height vector
        h_org = hcap.copy()

        # Centroid node
        xc = np.median(xcap)
        yc = np.median(ycap)

        # If reference time not given, use fixed or variable mean
        if tref_ == 'fixed':
            tref = t_mean
        elif tref_ == 'variable':
            tref = np.nanmean(tcap)
        else:
            tref = float(tref_)

        # Design matrix elements
        c0 = np.ones(len(xcap))
        c1 = xcap - xc
        c2 = ycap - yc
        c3 = c1 * c2
        c4 = c1 * c1
        c5 = c2 * c2
        c6 = tcap - tref

        # Length before editing
        nb = len(hcap)

        # Determine model order
        if order == 2 and nb >= mlim * 2:

            # Biquadratic surface and linear trend
            Acap = np.vstack((c0, c1, c2, c3, c4, c5, c6)).T

            # Model identifier
            mi = 1

        elif nb >= mlim:

            # Bilinear surface and linear trend
            Acap = np.vstack((c0, c1, c2, c6)).T

            # Model identifier
            mi = 2

        else:

            # Model identifier
            mi = 3

        # Modelled topography
        if mi == 1:

            # Construct model object
            linear_model = sm.RLM(hcap, Acap, M=sm.robust.norms.HuberT(), missing='drop')

            # Fit the model to the data
            linear_model_fit = linear_model.fit(maxiter=niter, tol=0.001)

            # Coefficients
            Cm = linear_model_fit.params

            # Biquadratic surface
            h_model = np.dot(np.vstack((c0, c1, c2, c3, c4, c5)).T, Cm[[0, 1, 2, 3, 4, 5]])

            # Compute along and across track slope
            sx = np.sign(Cm[1]) * slp_lim if np.abs(Cm[1]) > slp_lim else Cm[1]
            sy = np.sign(Cm[2]) * slp_lim if np.abs(Cm[2]) > slp_lim else Cm[2]

            # Mean height
            h_avg = Cm[0]

        elif mi == 2:

            # Construct model object
            linear_model = sm.RLM(hcap, Acap, M=sm.robust.norms.HuberT(), missing='drop')

            # Fit the model to the data
            linear_model_fit = linear_model.fit(maxiter=niter, tol=0.001)

            # Coefficients
            Cm = linear_model_fit.params

            # Bilinear surface
            h_model = np.dot(np.vstack((c0, c1, c2)).T, Cm[[0, 1, 2]])

            # Compute along and across track slope
            sx = np.sign(Cm[1]) * slp_lim if np.abs(Cm[1]) > slp_lim else Cm[1]
            sy = np.sign(Cm[2]) * slp_lim if np.abs(Cm[2]) > slp_lim else Cm[2]

            # Mean height
            h_avg = Cm[0]

        else:

            # Mean surface from median
            h_avg = np.median(hcap)

            # Compute distance estimates from centroid
            s_dx = (xcap - xc) + 1e-3
            s_dy = (ycap - yc) + 1e-3

            # Center surface height
            dh_i = h_org - h_avg

            # Compute along- and across-track slope
            px, rms_x = rlsq(s_dx, dh_i, 1)
            py, rms_y = rlsq(s_dy, dh_i, 1)

            # Set along-track slope
            s_x = 0 if np.isnan(px[0]) else px[0]

            # Set across-track slope
            s_y = 0 if np.isnan(py[0]) else py[0]

            # Compute along and across track slope
            sx = np.sign(s_x) * slp_lim if np.abs(s_x) > slp_lim else s_x
            sy = np.sign(s_y) * slp_lim if np.abs(s_y) > slp_lim else s_y

            # Compute the surface height correction
            h_model = h_avg + (sx * s_dx) + (sy * s_dy)

        # Compute full slope
        slope = np.arctan(np.sqrt(sx**2 + sy**2)) * (180 / np.pi)

        # Compute residual
        dh = h_org - h_model

        # Number of observations
        na = len(dh)

        # RMSE of the residuals
        RMSE = mad_std(dh)

        # Overwrite errors
        iup = RMSE < de_topo[idx]

        # Create temporary variables
        dh_cap = dh_topo[idx].copy()
        de_cap = de_topo[idx].copy()
        hm_cap = hm_topo[idx].copy()
        mi_cap = mi_topo[idx].copy()
        tr_cap = tr_topo[idx].copy()

        # Update variables
        dh_cap[iup] = dh[iup]
        de_cap[iup] = RMSE
        hm_cap[iup] = h_avg
        mi_cap[iup] = mi
        tr_cap[iup] = tref

        # Update with current solution
        dh_topo[idx] = dh_cap
        de_topo[idx] = de_cap
        hm_topo[idx] = hm_cap
        mi_topo[idx] = mi_cap
        tr_topo[idx] = tr_cap
        sx_topo[idx] = np.arctan(sx) * (180 / np.pi)
        sy_topo[idx] = np.arctan(sy) * (180 / np.pi)

        # Print progress (every N iterations)
        if (i % 100) == 0 and diag is True:

            # Print message every i:th solution
            print('%s %i %s %2i %s %i %s %03d %s %.3f %s %.3f' %
                  ('#', i, '/', len(xi), 'Model:', mi, 'Nobs:', nb, 'Slope:',
                   np.around(slope, 3), 'Residual:', np.around(mad_std(dh), 3)))

    # Print percentage of not filled
    print('Total NaNs (percent): %.2f' %
          (100 * float(len(dh_topo[np.isnan(dh_topo)])) / float(len(dh_topo))))

    # Print percentage of each model
    one = np.sum(mi_topo == 1)
    two = np.sum(mi_topo == 2)
    tre = np.sum(mi_topo == 3)
    N = float(len(mi_topo))
    print('Model types (percent): 1 = %.2f, 2 = %.2f, 3 = %.2f' %
          (100 * one / N, 100 * two / N, 100 * tre / N))

    # Append new columns to original file
    with h5py.File(ifile, 'a') as fi:

        # Check if we have variables in file
        try:
            # Save variables
            fi['h_res'] = dh_topo
            fi['h_mod'] = hm_topo
            fi['e_res'] = de_topo
            fi['m_deg'] = mi_topo
            fi['t_ref'] = tr_topo
            fi['slp_x'] = sx_topo
            fi['slp_y'] = sy_topo

        except Exception:
            # Update variables
            fi['h_res'][:] = dh_topo
            fi['h_mod'][:] = hm_topo
            fi['e_res'][:] = de_topo
            fi['m_deg'][:] = mi_topo
            fi['t_ref'][:] = tr_topo
            fi['slp_x'][:] = sx_topo
            fi['slp_y'][:] = sy_topo

    # Rename file
    if ifile.find('TOPO') < 0:
        os.rename(ifile, ifile.replace('.h5', '_TOPO.h5'))

    # Print some statistics
    print('*' * 75)
    print('%s %s %.5f %s %.2f %s %.2f %s %.2f %s %.2f' %
          ('Statistics', 'Mean:', np.nanmedian(dh_topo),
           'Std.dev:', mad_std(dh_topo),
           'Min:', np.nanmin(dh_topo),
           'Max:', np.nanmax(dh_topo),
           'RMSE:', np.nanmedian(de_topo[de_topo != 999999.])))
    print('*' * 75)
    print('')

    # Print execution time of algorithm
    print('Execution time: ' + str(datetime.now() - startTime))
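# Standalone sketch of the robust surface fit used above (model identifier 1):
# a biquadratic surface plus linear trend fitted with sm.RLM and Huber's T norm.
# All values are synthetic and the variable names are illustrative only.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 500
dx_ = rng.uniform(-1e3, 1e3, n)
dy_ = rng.uniform(-1e3, 1e3, n)
dt_ = rng.uniform(-3.0, 3.0, n)

# Synthetic heights: plane + curvature + temporal trend + noise + a few outliers
h = 100 + 0.01 * dx_ - 0.02 * dy_ + 1e-6 * dx_ * dy_ + 0.1 * dt_ + rng.normal(0, 0.2, n)
h[:10] += 20.0

# Same design as the biquadratic-plus-trend case (mi == 1)
A = np.vstack((np.ones(n), dx_, dy_, dx_ * dy_, dx_**2, dy_**2, dt_)).T
fit = sm.RLM(h, A, M=sm.robust.norms.HuberT()).fit()

print(fit.params)                    # [h_avg, sx, sy, xy, xx, yy, dh/dt]
h_topo = A[:, :6] @ fit.params[:6]   # modelled topography (trend term excluded)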
def robust_linear(self, x, y):
    rlm_model = sm.RLM(y, x, M=sm.robust.norms.HuberT())
    rlm_results = rlm_model.fit()
    print(rlm_results.summary())
    print(rlm_results.params)
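# For context, a self-contained sketch of the same HuberT call pattern as
# robust_linear above, on synthetic data; add_constant is included here only so
# the fit has an intercept term.
import numpy as np
import statsmodels.api as sm

x = sm.add_constant(np.linspace(0, 10, 100))
y = 1.0 + 0.5 * x[:, 1] + np.random.normal(0, 0.1, 100)
y[::20] += 5.0  # a few gross outliers for the Huber norm to downweight

rlm_results = sm.RLM(y, x, M=sm.robust.norms.HuberT()).fit()
print(rlm_results.params)  # intercept and slope, largely unaffected by the outliers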
z = ['x2']
alpha = 0.05
size = 5000

x1 = np.random.normal(size=size)
x2 = np.random.normal(size=size) + x1
x3 = np.random.normal(size=size) + x2

X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3})

test = MixedChiSquaredTest(y, x, z, X, alpha,
                           variable_types={'x1': 'c', 'x2': 'c', 'x3': 'c'})

print 'null', test.chi2_bound
print 'actual', test.chi2
print test.independent()

raise Exception

X_sampled = test.generate_ci_sample()

print X.corr()
print X_sampled.corr()

regression = sm.RLM(X[y], X[x + z])
result = regression.fit()
print result.summary()

regression = sm.RLM(X_sampled[y], X_sampled[x + z])
result = regression.fit()
print result.summary()
            fontsize=16,
            )

# annotate these with their index
for i, row in dta.loc[dta['log.Te'] < 3.8].iterrows():
    ax.annotate(i, row, row + .01, fontsize=14)
xlim, ylim = ax.get_xlim(), ax.get_ylim()

from IPython.display import Image
Image(filename='star_diagram.png')

y = dta['log.light']
X = sm.add_constant(dta['log.Te'], prepend=True)
ols_model = sm.OLS(y, X).fit()
abline_plot(model_results=ols_model, ax=ax)

rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
abline_plot(model_results=rlm_mod, ax=ax, color='red')

# * Why? Because M-estimators are not robust to leverage points.

infl = ols_model.get_influence()
h_bar = 2 * (ols_model.df_model + 1) / ols_model.nobs
hat_diag = infl.summary_frame()['hat_diag']
hat_diag.loc[hat_diag > h_bar]

sidak2 = ols_model.outlier_test('sidak')
sidak2.sort_values('unadj_p', inplace=True)
print(sidak2)

fdr2 = ols_model.outlier_test('fdr_bh')
def linear_best_fit(data, x_args, y_args, fillNaN=True, robust=True,
                    printdata=False, plot=False):
    """
    --------------------------------------------------------------------------
    Create linear line of best fit and get coefficients
    --------------------------------------------------------------------------
    Input:
        data      - pandas DataFrame containing the x and y columns
        x_args    - str, column name of the independent variable
        y_args    - str, column name of the dependent variable
        fillNaN   - bool, currently unused
        robust    - bool, use robust regression (RLM) instead of OLS
        printdata - bool, print fit diagnostics
        plot      - bool, plot the data and the fitted line
    --------------------------------------------------------------------------
    Output:
        intercept - float, intercept of the linear equation (y = slope*x + intercept)
        slope     - float, slope of the linear equation (y = slope*x + intercept)
    --------------------------------------------------------------------------
    WARNING: Input data cannot be negative - see first part of the code
    --------------------------------------------------------------------------
    """
    divider = '------------------------------------------------------------'

    # Filter data, get rid of NaN's
    data = data[(data[x_args] >= -1)]
    data = data[(data[y_args] >= -1)]

    # Set bounds
    x_min = data[x_args].min()
    x_max = data[x_args].max()

    # Use add_constant to get intercept
    x2_args = sm.add_constant(data[x_args])

    if robust:
        # Note: the LeastSquares norm makes RLM numerically equivalent to OLS
        model = sm.RLM(data[y_args], x2_args, M=sm.robust.norms.LeastSquares())
    else:
        model = sm.OLS(data[y_args], x2_args)

    # Straight-line equation coefficients
    parameters = model.fit().params
    intercept = parameters[0]
    slope = parameters[1]

    if printdata:
        # Get bounds of y-values
        y_min = data[y_args].min()
        y_max = data[y_args].max()

        print('Data for {} vs {}:'.format(y_args, x_args))
        print(divider)
        print('Range of x:{} - {}, y:{} - {}'.format(x_min, x_max, y_min, y_max))
        print(divider)

        if robust:
            # Calculate OLS as well in order to get R^2 value
            model2 = sm.OLS(data[y_args], x2_args)
            parameters2 = model2.fit().params
            intercept2 = parameters2[0]
            slope2 = parameters2[1]

            # Calculate R^2
            r2 = model2.fit().rsquared

            print('OLS: Slope: {}, Intercept: {}'.format(slope2, intercept2))
            print(divider)
            print('R^2={:.3f}'.format(r2))
            print(divider)
            print('RLM: Slope: {}, Intercept: {}'.format(slope, intercept))
            print(divider)
        else:
            print('OLS: Slope: {}, Intercept: {}'.format(slope, intercept))
            print(divider)
            print('R^2=')
            print(divider)

        print('Extreme points: ({},{:.2f})({},{:.2f})'.format(
            x_min, (slope * x_min + intercept),
            x_max, (slope * x_max + intercept)))
        print(divider)

    if plot:
        ax = data.plot(x=x_args, y=y_args, kind='scatter')

        # Plot regression line on the same axes, set values
        x = [x_min, x_max]
        ax.plot(x, [intercept + x_min * slope, intercept + x_max * slope])
        ax.set_xlim([x_min, x_max])

    return intercept, slope
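# Usage sketch for linear_best_fit() on a small made-up DataFrame; the column
# names and values are illustrative only and assume the function is importable.
import numpy as np
import pandas as pd

area = np.linspace(10, 100, 20)
df = pd.DataFrame({
    'area': area,
    'price': 5.0 + 2.0 * area + np.random.normal(0, 3, 20),
})

intercept, slope = linear_best_fit(df, 'area', 'price', robust=True, printdata=True)
print(intercept, slope)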
    func = getattr(wrapping, meth)
    wrapper = make_wrapper(func, how)
    setattr(klass, meth, wrapper)


if __name__ == '__main__':
    import statsmodels.api as sm
    from pandas import DataFrame

    data = sm.datasets.longley.load(as_pandas=False)
    df = DataFrame(data.exog, columns=data.exog_name)
    y = data.endog

    # data.exog = sm.add_constant(data.exog)
    df['intercept'] = 1.
    olsresult = sm.OLS(y, df).fit()
    rlmresult = sm.RLM(y, df).fit()

    # olswrap = RegressionResultsWrapper(olsresult)
    # rlmwrap = RLMResultsWrapper(rlmresult)

    data = sm.datasets.wfs.load(as_pandas=False)
    # get offset
    offset = np.log(data.exog[:, -1])
    exog = data.exog[:, :-1]

    # convert dur to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # drop reference category
    # convert res to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # convert edu to dummy
k['rate'] = k['BUILDINGGARAGE'] / k['carnumbersum']
k = k.groupby('bc').agg({'rate': 'median', 'BUILDINGGARAGE': 'count'})

k = df[(df['carnumbersum'] > 0) & (df['GarageArea'] == 0) &
       (df['BUILDINGGARAGE'] == 0) &
       (df['parkinglots'] > 0)].reset_index(drop=True)  # 1428
k['rate'] = k['parkinglots'] / k['carnumbersum']
k = k.groupby('bc').agg({'rate': 'median', 'parkinglots': 'count'})

k = df[(df['carnumbersum'] > 0) &
       ((df['GarageArea'] > 0) | (df['BUILDINGGARAGE'] > 0) |
        (df['parkinglots'] > 0))].reset_index(drop=True)
X = k[['GarageArea', 'BUILDINGGARAGE', 'parkinglots']]
y = k['carnumbersum']
model = sm.RLM(y, X).fit()
model.summary()
k['predict'] = model.predict()
k.to_csv(path + 'k.csv', index=False)

k = df[(df['carnumbersum'] > 0) & (df['GarageArea'] == 0) &
       (df['BUILDINGGARAGE'] == 0) &
       (df['parkinglots'] == 0)].reset_index(drop=True)
k = k[(k['bc'] == 'A') | (k['bc'] == 'B') |
      (k['BldgClass'] == 'C0')].reset_index(drop=True)
k.to_csv(path + 'k.csv', index=False)

X = df[['GarageArea', 'BUILDINGGARAGE', 'parkinglots', 'LotArea']]
y = df['carnumbersum']
model = sm.OLS(y, X).fit()
model.summary()
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# ## Estimation
#
# Load data:

data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)

# Huber's T norm with the (default) median absolute deviation scaling

huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bse)
print(
    hub_results.summary(
        yname='y',
        xname=['var_%d' % i for i in range(len(hub_results.params))]))

# Huber's T norm with 'H2' covariance matrix

hub_results2 = huber_t.fit(cov="H2")
print(hub_results2.params)
print(hub_results2.bse)

# Andrew's Wave norm with Huber's Proposal 2 scaling and 'H3' covariance
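# The excerpt stops at the comment above; a hedged sketch of what that step
# could look like with statsmodels' AndrewWave norm, Huber's Proposal 2 scale
# estimator and the 'H3' covariance (not copied from the original file):
andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave())
andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
print(andrew_results.params)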
def regComb(self, dsReg, field='LSTM', opt=1, fTest=None):
    statSigma = dsReg.statCalSigma(field=field)

    # do regression
    if opt == 1:
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
        y = np.square(dsReg.LSTM - dsReg.SMAP) - \
            np.square(statSigma.sigmaX_mat)
        xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 2:
        x1 = np.square(statSigma.sigmaMC_mat)
        y = np.square(dsReg.LSTM - dsReg.SMAP) - \
            np.square(statSigma.sigmaX_mat)
        xx = x1.flatten().reshape(-1, 1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 3:
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = np.square(statSigma.sigmaX_mat)
        x3 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
        x4 = np.ones(x1.shape)
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = np.stack(
            (x1.flatten(), x2.flatten(), x3.flatten(), x4.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 4:
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = np.square(statSigma.sigmaX_mat)
        x3 = np.ones(x1.shape)
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = np.stack((x1.flatten(), x2.flatten(), x3.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 5:
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = np.square(statSigma.sigmaX_mat)
        x3 = statSigma.sigmaMC_mat * statSigma.sigmaX_mat
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = np.stack((x1.flatten(), x2.flatten(), x3.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 6:
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = np.square(statSigma.sigmaX_mat)
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 7:
        x1 = np.square(statSigma.sigmaMC_mat)
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = x1.flatten().reshape(-1, 1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 8:
        x1 = np.square(statSigma.sigmaX_mat)
        y = np.square(dsReg.LSTM - dsReg.SMAP)
        xx = x1.flatten().reshape(-1, 1)
        yy = y.flatten().reshape(-1, 1)
    elif opt == 9:
        x1 = np.ones(statSigma.sigma_mat.shape)
        y = np.square(dsReg.LSTM - dsReg.SMAP) - \
            np.square(statSigma.sigma_mat)
        xx = x1.flatten().reshape(-1, 1)
        yy = y.flatten().reshape(-1, 1)

    ind = np.where(~np.isnan(yy))[0]
    xf = xx[ind, :]
    yf = yy[ind]

    # w, _, _, _ = np.linalg.lstsq(xf, yf)
    # model = sm.OLS(yf, xf)
    model = sm.RLM(yf, xf)
    result = model.fit()
    w = result.params

    if fTest is not None:
        ftestP = list()
        ftestF = list()
        for k in range(len(w)):
            ww = w.copy()
            ww[k] = fTest[k]
            ff = result.f_test(ww)
            ftestP.append(ff.pvalue)
            ftestF.append(ff.fvalue)

    if opt == 1:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            self.sigmaMC_mat * self.sigmaX_mat * w[1] +
            np.square(self.sigmaX_mat))
        k = -w[1] / 2
        a = w[0] - k**2
        out = [a, k]
    elif opt == 2:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            np.square(self.sigmaX_mat))
        x1 = np.square(statSigma.sigmaMC_mat)
        x2 = np.ones(x1.shape)
        y = np.square(statSigma.sigmaX_mat)
        xx = np.stack((x1.flatten(), x2.flatten()), axis=1)
        yy = y.flatten().reshape(-1, 1)
        k, _, _, _ = np.linalg.lstsq(xx, yy)
        k = k[0]
        a = w[0] + k
        out = [a, k]
    elif opt == 3:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            np.square(self.sigmaX_mat) * w[1] +
            self.sigmaMC_mat * self.sigmaX_mat * w[2] +
            np.ones(self.sigmaX_mat.shape) * w[3])
        out = w
    elif opt == 4:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            np.square(self.sigmaX_mat) * w[1] +
            np.ones(self.sigmaX_mat.shape) * w[2])
        out = w
    elif opt == 5:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            np.square(self.sigmaX_mat) * w[1] +
            self.sigmaMC_mat * self.sigmaX_mat * w[2])
        out = w
    elif opt == 6:
        self.sigmaReg_mat = np.sqrt(
            np.square(self.sigmaMC_mat) * w[0] +
            np.square(self.sigmaX_mat) * w[1])
        out = w
    elif opt == 7:
        self.sigmaReg_mat = np.sqrt(np.square(self.sigmaMC_mat) * w[0])
        out = w
    elif opt == 8:
        self.sigmaReg_mat = np.sqrt(np.square(self.sigmaX_mat) * w[0])
        out = w
    elif opt == 9:
        self.sigmaReg_mat = np.sqrt(np.square(self.sigma_mat) + w[0])
        out = w

    self.sigmaReg = np.sqrt(np.mean(self.sigmaReg_mat**2, axis=1))

    if fTest is None:
        return result
    else:
        return (out, ftestP, ftestF)
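# Self-contained illustration of the regression pattern in regComb (the opt == 6
# case: squared total error regressed on the two squared uncertainty components
# with sm.RLM). All arrays and weights below are synthetic.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
n = 2000
sigmaMC = rng.uniform(0.01, 0.05, n)   # ensemble-spread uncertainty
sigmaX = rng.uniform(0.01, 0.05, n)    # regression-based uncertainty
err = np.sqrt(1.5 * sigmaMC**2 + 0.8 * sigmaX**2) * rng.standard_normal(n)

xx = np.stack((sigmaMC**2, sigmaX**2), axis=1)
yy = (err**2).reshape(-1, 1)
w = sm.RLM(yy, xx).fit().params

sigmaReg = np.sqrt(sigmaMC**2 * w[0] + sigmaX**2 * w[1])
print(w)  # recovered weights (robust estimate; values depend on the noise draw)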