def test_df_match_match_on_window_border():
    """
    Check matching when a candidate lies exactly on the window border.

    The reference is daily (1-5 January 2007) and the window is half a day.
    The candidate at 2007-01-03 12:00 is exactly half a day away from the
    reference stamps of 3 and 4 January. Without ``asym_window`` both stamps
    are expected to pick it up; with ``"<="`` the 4 January match is expected
    to be NaN, and with ``">="`` the 3 January match is expected to be NaN.
    """
    reference = pd.DataFrame(
        {"data": np.arange(5)},
        index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"))
    candidate_times = [
        datetime(2007, 1, 1, 9),
        datetime(2007, 1, 2, 9),
        datetime(2007, 1, 3, 12),
        datetime(2007, 1, 5, 9),
    ]
    candidates = pd.DataFrame({"matched_data": np.arange(4)},
                              index=candidate_times)

    # (extra keyword arguments, expected distances, expected matched values)
    cases = [
        ({},
         [0.375, 0.375, 0.5, -0.5, 0.375],
         [0, 1, 2, 2, 3]),
        ({"asym_window": "<="},
         [0.375, 0.375, 0.5, np.nan, 0.375],
         [0, 1, 2, np.nan, 3]),
        ({"asym_window": ">="},
         [0.375, 0.375, np.nan, -0.5, 0.375],
         [0, 1, np.nan, 2, 3]),
    ]

    for extra, want_distance, want_data in cases:
        result = tmatching.df_match(reference, candidates, window=0.5,
                                    **extra)
        nptest.assert_allclose(np.array(want_distance),
                               result.distance.values)
        nptest.assert_allclose(want_data, result.matched_data)
def match(self, reference, *args):
    """
    Temporally match other datasets to ``reference`` and join the results.

    ``reference`` is passed to ``temp_match.df_match`` as the first argument,
    so it acts as the temporal reference; in this case the reference dataset
    for the grid is also the temporal reference dataset.

    Parameters
    ----------
    reference : pandas.DataFrame
        Reference dataset (anything accepted by ``pd.DataFrame``).
    *args
        Other datasets, forwarded unchanged to ``temp_match.df_match``.

    Returns
    -------
    pandas.DataFrame
        ``reference`` joined with every matched dataset (without their
        ``index`` and ``distance`` columns); rows that are entirely NaN are
        dropped.
    """
    matched_datasets = temp_match.df_match(reference, *args, dropna=True,
                                           dropduplicates=True,
                                           window=self.window)

    # A single matched dataset is wrapped so the loop below is uniform.
    # isinstance() also accepts tuple subclasses.
    if not isinstance(matched_datasets, tuple):
        matched_datasets = [matched_datasets]

    # Before pandas 0.23 the helper column is addressed as ('index', '').
    if LooseVersion(pd.__version__) < LooseVersion('0.23'):
        index_label = ('index', '')
    else:
        index_label = 'index'

    matched_data = pd.DataFrame(reference)
    for matched in matched_datasets:
        matched = matched.drop(index_label, axis=1)
        matched = matched.drop('distance', axis=1)
        matched_data = matched_data.join(matched)

    return matched_data.dropna(how='all')
def test_df_match_borders():
    """
    Border values can be problematic for temporal matching (issue #51).

    Every daily reference stamp has a candidate nine hours later, so each
    expected distance is 0.375 days and each reference row is expected to be
    paired with the candidate of the same day.
    """
    daily = pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D")
    nine_am = [datetime(2007, 1, day, 9) for day in (1, 2, 3, 4, 5)]

    ref_df = pd.DataFrame({"data": np.arange(5)}, index=daily)
    match_df = pd.DataFrame({"matched_data": np.arange(5)}, index=nine_am)

    matched = tmatching.df_match(ref_df, match_df)

    expected_distance = np.array([0.375, 0.375, 0.375, 0.375, 0.375])
    nptest.assert_allclose(expected_distance, matched.distance.values)
    nptest.assert_allclose(np.arange(5), matched.matched_data)
def test_df_match_match_on_window_border():
    """
    Check matching when a candidate lies exactly on the window border.

    Same scenario as a half-day window around daily reference stamps with one
    candidate at 2007-01-03 12:00, i.e. half a day from both 3 and 4 January.
    Every ``df_match`` call is additionally required to emit a deprecation
    warning.
    """
    ref_df = pd.DataFrame(
        {"data": np.arange(5)},
        index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"),
    )
    match_df = pd.DataFrame(
        {"matched_data": np.arange(4)},
        index=[
            datetime(2007, 1, 1, 9),
            datetime(2007, 1, 2, 9),
            datetime(2007, 1, 3, 12),
            datetime(2007, 1, 5, 9),
        ],
    )

    def match_expecting_deprecation(**extra):
        # The call itself must trigger the deprecation warning.
        with pytest.deprecated_call():
            return tmatching.df_match(ref_df, match_df, window=0.5, **extra)

    # Symmetric window: both neighbouring reference stamps get the candidate.
    symmetric = match_expecting_deprecation()
    nptest.assert_allclose(np.array([0.375, 0.375, 0.5, -0.5, 0.375]),
                           symmetric.distance.values)
    nptest.assert_allclose([0, 1, 2, 2, 3], symmetric.matched_data)

    # asym_window="<=": the 4 January reference stamp is left unmatched.
    lower_closed = match_expecting_deprecation(asym_window="<=")
    nptest.assert_allclose(np.array([0.375, 0.375, 0.5, np.nan, 0.375]),
                           lower_closed.distance.values)
    nptest.assert_allclose([0, 1, 2, np.nan, 3], lower_closed.matched_data)

    # asym_window=">=": the 3 January reference stamp is left unmatched.
    upper_closed = match_expecting_deprecation(asym_window=">=")
    nptest.assert_allclose(np.array([0.375, 0.375, np.nan, -0.5, 0.375]),
                           upper_closed.distance.values)
    nptest.assert_allclose([0, 1, np.nan, 2, 3], upper_closed.matched_data)
def PCA(Ser1, Ser2, window=1):
    """
    Principal component analysis of two temporally matched series.

    Both inputs are stripped of NaNs, the longer one is matched onto the
    shorter one with ``df_match`` and the matched pairs are rotated onto the
    eigenvectors of their 2x2 covariance matrix.

    Parameters
    ----------
    Ser1, Ser2 : pd.Series
        Input series; internally labelled 'ds1' and 'ds2'.
    window : number, optional
        Forwarded to ``df_match`` as ``window``.

    Returns
    -------
    pd.DataFrame
        Empty frame with columns 'PC-1' and 'PC-2' if there is too little
        data (fewer than 10 values in ``Ser1``, 10 or fewer in ``Ser2``, or
        fewer than 10 matched pairs).
    tuple
        Otherwise ``(frame, eigen_vals, eigen_vecs)`` where ``frame`` holds
        the matched inputs next to the 'PC-1' and 'PC-2' columns.
    """
    df1 = pd.DataFrame(Ser1).dropna()
    df1.columns = ['ds1']
    df2 = pd.DataFrame(Ser2).dropna()
    df2.columns = ['ds2']

    if len(df1) < 10 or len(df2) <= 10:
        return pd.DataFrame(columns=['PC-1', 'PC-2'])

    # The shorter data set serves as first argument of df_match.
    if len(df1) < len(df2):
        base, partner, partner_name = df1, df2, 'ds2'
    else:
        base, partner, partner_name = df2, df1, 'ds1'
    matched = df_match(base, partner, window=window)
    df = base.join(matched[partner_name]).dropna()

    if len(df) < 10:
        return pd.DataFrame(columns=['PC-1', 'PC-2'])

    # Centre the data and build the sample covariance matrix.
    X = df.values.copy()
    X_mean = X.mean(axis=0)
    X -= X_mean
    n_samples = len(X)
    C = (X.T @ X) / (n_samples - 1)

    eigen_vals, eigen_vecs = np.linalg.eig(C)

    # Rotate eigenvectors by 180 degrees if the major PC points in the
    # "wrong" direction: both components negative although the two series
    # are positively correlated.
    major = np.argmax(eigen_vals)
    both_negative = np.sign(eigen_vecs[:, major]).sum() == -2
    positively_correlated = np.sign(np.corrcoef(X.T)[0, 1]) == 1
    if both_negative & positively_correlated:
        eigen_vecs *= -1

    X_pca = X @ eigen_vecs

    # Put the component with the larger eigenvalue first.
    if eigen_vals[0] < eigen_vals[1]:
        X_pca = np.roll(X_pca, 1, axis=1)

    # Shift the first component by the mean of the two column means.
    X_pca[:, 0] += X_mean.mean()

    df_pca = pd.DataFrame(X_pca, columns=['PC-1', 'PC-2'], index=df.index)
    combined = pd.concat((df, df_pca), axis='columns')
    return combined, eigen_vals, eigen_vecs
def match_reverse(self, reference, other):
    """
    Match ``reference`` onto the time stamps of ``other`` and join both.

    Unlike a forward match, ``other`` is handed to ``temp_match.df_match`` as
    the first argument and ``reference`` as the second.

    Parameters
    ----------
    reference : pandas.DataFrame
        Dataset that is matched.
    other : pandas.DataFrame
        Dataset passed first to ``df_match``; also joined onto the result.

    Returns
    -------
    pandas.DataFrame
        Matched ``reference`` data (without the ``distance`` and ``index``
        columns) joined with ``other``; an empty frame if ``df_match`` raises
        ``ValueError``.
    """
    try:
        matched = temp_match.df_match(other, reference,
                                      window=self.window, dropna=True)
    except ValueError:
        # Matching was not possible; signal this with an empty frame.
        return pd.DataFrame()
    else:
        without_helpers = matched.drop(['distance', 'index'], axis=1)
        return without_helpers.join(other)
def test_df_match_borders_unequal_query_points():
    """
    Border values can be problematic for temporal matching (issue #51).

    Here the candidate series has no entry for 3 January. The expected
    result pairs that reference stamp with the 2 January candidate, at a
    distance of -0.625 days.
    """
    reference_index = pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05",
                                    freq="D")
    # Candidates at 09:00 on every day except 3 January.
    candidate_index = [datetime(2007, 1, day, 9) for day in (1, 2, 4, 5)]

    ref_df = pd.DataFrame({"data": np.arange(5)}, index=reference_index)
    match_df = pd.DataFrame({"matched_data": np.arange(4)},
                            index=candidate_index)

    matched = tmatching.df_match(ref_df, match_df)

    expected_distance = np.array([0.375, 0.375, -0.625, 0.375, 0.375])
    expected_data = np.array([0, 1, 1, 2, 3])
    nptest.assert_allclose(expected_distance, matched.distance.values)
    nptest.assert_allclose(expected_data, matched.matched_data)
def collocate(df):
    """
    Collocate the columns of ``df`` on a common daily time axis.

    Each hour of the day is tried as reference time; the hour that keeps the
    most rows after matching all columns (window of 0.5) is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Input data frame whose columns are collocated.

    Returns
    -------
    pd.DataFrame
        Collocated data; an empty frame with the columns of ``df`` if no
        hour produced any complete row.
    """
    best = pd.DataFrame(columns=df.columns.values)

    for hour in np.arange(24):
        # Daily reference stamps at the hour of day under test.
        first_day = df.index.min().date()
        last_day = df.index.max().date()
        stamps = pd.date_range(first_day, last_day) + pd.Timedelta(hour, 'h')
        ref_df = pd.DataFrame(index=stamps)

        series = [df[name].dropna() for name in df]
        matched = df_match(ref_df, *series, window=0.5)

        if len(df.columns) == 1:
            # With a single data set the match result is indexed directly.
            only = df.columns.values[0]
            ref_df[only] = matched[only]
        else:
            for pos, name in enumerate(df):
                ref_df[name] = matched[pos][name]

        ref_df.dropna(inplace=True)

        # Strictly more rows required, so the earliest best hour is kept.
        if len(ref_df) > len(best):
            best = ref_df.copy()

    return best
def collocate(df):
    """
    Collocate the columns of a pd.DataFrame.

    Data points are resampled to reference time stamps with 24 hr distance.
    The hour of day of these stamps is chosen such that the number of
    collocated data points is maximized.

    Parameters
    ----------
    df : pd.DataFrame
        Input Dataframe

    Returns
    -------
    pd.DataFrame
        Rows for which every column found a match; empty (with the columns
        of ``df``) if no reference hour produced such rows.
    """

    def collocate_at(hour):
        # Reference time steps for the given hour of the day.
        days = pd.date_range(df.index.min().date(), df.index.max().date())
        frame = pd.DataFrame(index=days + pd.Timedelta(hour, 'h'))

        # Match every data set (without NaNs) to the reference time steps.
        per_column = [df[name].dropna() for name in df]
        matched = df_match(frame, *per_column, window=0.5)

        if len(df.columns) == 1:
            name = df.columns.values[0]
            frame[name] = matched[name]
        else:
            for pos, name in enumerate(df):
                frame[name] = matched[pos][name]

        frame.dropna(inplace=True)
        return frame

    res_df = pd.DataFrame(columns=df.columns.values)

    # Test each hour of the day as potential reference time step.
    for d in np.arange(24):
        candidate = collocate_at(d)

        # Keep this hour only if it gave more matches than the best so far.
        if len(candidate) > len(res_df):
            res_df = candidate.copy()

    return res_df
def TCA_insitu_evaluation():
    """
    Run a triple collocation analysis (model / ASCAT / in situ) per station.

    For every ISMN station the model time series of three LDAS runs are
    masked (zero snow mass, ``soil_temp_layer1`` above 277.15), shifted by
    two hours, restricted to the analysis time steps and matched with the
    ASCAT and in situ series (window of 0.5). The outputs of ``TCA`` are
    stored under 'RMSE_*' and 'beta_*' column names, and one row per station
    is appended to the CSV result file; the header is written only when the
    file does not exist yet.

    A station that raises an exception is logged with its traceback and
    skipped, so one bad station does not abort the whole evaluation.
    """
    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\TCA_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')
    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    # Sorted time steps of the ObsFcstAna output.
    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna',
                'US_M36_SMOS40_DA_cal_scaled').timeseries.time.values
    ).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(
        r"D:\data_sets\ASCAT\warp5_grid\pointlist_warp_conus.csv",
        index_col=0)

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface', ]
    modes = ['absolute', ]

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_asc = ascat.read(gpi, resample_time=False)
            if ts_asc is None:
                continue
            ts_asc.name = 'ascat'
            ts_asc = pd.DataFrame(ts_asc)

            for var in variables:
                for mode in modes:
                    ts_ins = ts_insitu[var].dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    for run, ts_model in zip(runs, tss):
                        # Keep only snow-free time steps with soil
                        # temperature above 277.15 (presumably Kelvin -
                        # TODO confirm).
                        ind = (ts_model['snow_mass'][row, col].values == 0) & (
                            ts_model['soil_temp_layer1'][row, col].values > 277.15)
                        ts_mod = ts_model[var][row, col].to_series().loc[ind]
                        ts_mod.index += pd.to_timedelta('2 hours')

                        # reindex() instead of .loc[]: analysis time steps
                        # removed by the mask above must become NaN (and be
                        # dropped) rather than raise KeyError, which current
                        # pandas does for missing labels.
                        ts_mod = ts_mod.reindex(t_ana).dropna()
                        ts_mod.name = 'model'
                        ts_mod = pd.DataFrame(ts_mod)

                        matched = df_match(ts_mod, ts_asc, ts_ins, window=0.5)
                        data = ts_mod.join(matched[0][['ascat', ]]).join(
                            matched[1][['insitu', ]]).dropna()

                        tc_res = TCA(data['model'].values,
                                     data['ascat'].values,
                                     data['insitu'].values)

                        tag = run + '_' + mode + '_' + var
                        res['RMSE_model_' + tag] = tc_res[1][0]
                        res['RMSE_ascat_' + tag] = tc_res[1][1]
                        res['RMSE_insitu_' + tag] = tc_res[1][2]

                        res['beta_ascat_' + tag] = tc_res[2][1]
                        res['beta_insitu_' + tag] = tc_res[2][2]

                        # Not run-specific: holds the value of the last run.
                        res['len_' + mode + '_' + var] = len(data)

            # First station creates the file with header, later ones append.
            if not os.path.isfile(result_file):
                res.to_csv(result_file, float_format='%0.4f')
            else:
                res.to_csv(result_file, float_format='%0.4f', mode='a',
                           header=False)

        except Exception:
            # Best effort: record why the station failed, then move on.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            logging.exception('Skipping station %i', i)
            continue
def temp_resam(df, startdate, enddate, freq):
    """
    Match ``df`` onto a regular time axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to be matched.
    startdate, enddate
        Bounds of the reference axis, passed to ``pd.date_range``.
    freq
        Frequency of the reference axis, passed to ``pd.date_range``.

    Returns
    -------
    Result of ``tm.df_match`` for the regular reference axis and ``df``.
    """
    reference_index = pd.date_range(startdate, enddate, freq=freq)
    # The reference frame carries no data columns, only the target index.
    reference = pd.DataFrame(index=reference_index)
    return tm.df_match(reference, df)
# NOTE(review): this `return` is the tail of a definition that starts outside
# the visible excerpt; it is kept untouched.
return matched_df


if __name__ == '__main__':
    # Manual check (Python 2 print statements): read the time series of one
    # grid point, match it to a daily reference axis and plot the result.
    cur_path = os.path.abspath(os.path.curdir)
    input_dataidx_file = os.path.join(cur_path,'data/ascatssf/0673')
    gpi = 720360

    dat_obj = datasets.DatasetTs(input_dataidx_file)
    print dat_obj.dat_data.dtype
    print type(dat_obj.dat_data), dat_obj.dat_data.shape

    gpi_data = dat_obj.read_ts(gpi)
    print gpi_data.dtype, gpi_data.shape

    # Daily reference axis at 12:00 from 1970 to 2016, without data columns.
    ref_dr = pd.date_range('1970-01-01 12:00:00', '2016-01-01 12:00:00', freq='D')
    ref_df = pd.DataFrame(index=ref_dr)

    # The 'jd' field is converted to a datetime index for the observations.
    ascat_dr = julian.julian2datetimeindex(gpi_data['jd'])
    ascat_df = pd.DataFrame(gpi_data, index=ascat_dr)

    matched_df = tm.df_match(ref_df, ascat_df)
    matched_df.plot()
    plt.show()
    pass
def _absolute_or_anomaly(ts, mode):
    """Drop NaNs from ``ts``; unless ``mode`` is 'absolute', convert to anomalies first."""
    if mode == 'absolute':
        return ts.dropna()
    return calc_anom(ts, longterm=(mode == 'longterm')).dropna()


def _masked_model_frame(ds, var, row, col, times, mode, name):
    """Extract ``var`` at (row, col) from ``ds``, masked, shifted by 2 h and reindexed to ``times``."""
    # Keep only snow-free time steps with soil temperature above 277.15
    # (presumably Kelvin - TODO confirm).
    ind = (ds['snow_mass'].isel(lat=row, lon=col).values == 0) & \
          (ds['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
    ts = ds[var].isel(lat=row, lon=col).to_series().loc[ind]
    ts.index += pd.to_timedelta('2 hours')
    ts = _absolute_or_anomaly(ts.reindex(times), mode)
    ts.name = name
    return pd.DataFrame(ts)


def plot_suspicious_stations(root):
    """
    Plot open-loop, DA and in situ time series for suspicious stations.

    A station / mode / variable combination is plotted when the station list
    reports a negative 'diff_tcar2' together with a positive
    'diff_pearsonr2', and the correlation difference recomputed on the
    matched data (DA vs. in situ minus open loop vs. in situ) is not
    negative. One PNG per combination is written to the hard-coded
    'timeseries' directory.

    Stations that raise an exception are reported on stdout and skipped.

    Parameters
    ----------
    root
        Not used by this function; kept for interface compatibility.
    """
    statlist = pd.read_csv('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/station_list_r_diff.csv', index_col=0)
    fbase = Path('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/timeseries')

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_scaled_4K_obserr').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries

    ts_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries['obs_obs']
    t_ana = pd.DatetimeIndex(ts_ana.time.values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(ascat.root / 'warp5_grid' / 'pointlist_warp_conus.csv', index_col=0)

    ismn = ISMN_io()

    variables = ['sm_surface', 'sm_rootzone']
    modes = ['absolute', 'longterm', 'shortterm']

    # Restrict the ISMN station list to the stations of the suspicious list.
    ismn.list.index = ismn.list.network + '_' + ismn.list.station
    ismn.list = ismn.list.reindex(statlist.index)

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations(surface_only=False)):
        try:
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            # Check for missing data before doing arithmetic on the result.
            ts_ascat = ascat.read(gpi)
            if ts_ascat is None:
                continue
            ts_ascat = ts_ascat / 100 * 0.6

            station_rows = statlist[(statlist.network == meta.network) &
                                    (statlist.station == meta.station)]

            for mode in modes:
                for var in variables:
                    dpr = station_rows[f'diff_pearsonr2_{mode}_{var}'].values[0]
                    dtr = station_rows[f'diff_tcar2_{mode}_{var}'].values[0]

                    # Only cases where TCA and Pearson skill disagree.
                    if not ((dtr < 0) & (dpr > 0)):
                        continue

                    ts_asc = _absolute_or_anomaly(ts_ascat, mode)
                    ts_asc.name = 'ascat'
                    ts_asc = pd.DataFrame(ts_asc)

                    ts_ins = _absolute_or_anomaly(ts_insitu[var], mode)
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    # Analysis time steps with a non-NaN 'obs_obs' value in
                    # at least one entry along axis 1.
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    obs_times = t_ana[ind_obs]

                    ts_ol = _masked_model_frame(ds_ol, var, row, col, obs_times, mode, 'open_loop')
                    ts_da = _masked_model_frame(ds_da, var, row, col, obs_times, mode, 'DA_4K')

                    matched = df_match(ts_ol, ts_da, ts_asc, ts_ins, window=0.5)
                    data = ts_ol.join(matched[0]['DA_4K']).join(matched[1]['ascat']).join(matched[2]['insitu']).dropna()

                    corr = data.corr()
                    dpr_triplets = corr['DA_4K']['insitu'] - corr['open_loop']['insitu']
                    if dpr_triplets < 0:
                        continue

                    f = plt.figure(figsize=(15, 5))
                    try:
                        sns.lineplot(data=data[['open_loop', 'DA_4K', 'insitu']], dashes=False, linewidth=1.5, axes=plt.gca())
                        plt.title(f'{meta.network} / {meta.station} ({var}): d(Pearson R2) = {dpr_triplets:.3f} , d(TCA R2) = {dtr:.3f}')

                        fname = fbase / f'{mode}_{var}_{meta.network}_{meta.station}.png'
                        f.savefig(fname, dpi=300, bbox_inches='tight')
                    finally:
                        # Release the figure even if plotting or saving fails.
                        plt.close(f)

        except Exception as err:
            # Best effort: report the failing station and carry on.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(f'Skipping station {i}: {err!r}')
            continue
def mask_data():
    """
    Mask ASCAT and in situ soil moisture with ERA-Interim conditions.

    The function works on names that are not defined in this block
    (``ascat_data``, ``era_interim_data``, ``mask``, ``ISMN``,
    ``station_id``, ``ascat_label``, ``scaled_data``, ``masking_labels``,
    ``masking_values``); they must be available in the enclosing scope.

    Observations are kept where the matched ERA-Interim snow depth is at or
    below ``mask['snow_depth']`` and both soil temperature and air
    temperature exceed their thresholds in ``mask``. If ``mask['use_ssf']``
    is set, only observations with an ASCAT ``ssf`` value of 1 are kept.

    Returns
    -------
    tuple
        ``(0, -1)`` if the station has no soil moisture depth starting
        above roughly 0.051 (``depth_from - 0.05 < 0.001``).
    None
        Otherwise; no result is returned from the visible code.
    """

    def _era_conditions(era):
        # Boolean mask of the rows that satisfy all three ERA thresholds.
        return ((era['snow_depth'] <= mask['snow_depth']) &
                (era['st_l1'] > mask['st_l1']) &
                (era['air_temp'] > mask['air_temp']))

    era_matched = temp_match.df_match(ascat_data, era_interim_data, window=1)
    ascat_masked = ascat_data[_era_conditions(era_matched)]
    if mask['use_ssf'] == True:
        ascat_masked = ascat_masked[ascat_masked['ssf'] == 1]
    ascat_masked = ascat_masked[[ascat_label, 'jd']]

    # Pick the last listed depth that starts close enough to the surface.
    relevant_depth = None
    ISMN_station = ISMN.get_station_by_id(station_id)
    for depth in ISMN_station.sm_depths:
        if float(depth.depth_from) - 0.05 < 0.001:
            relevant_depth = depth

    # Identity check: does not depend on how the depth type implements ==.
    if relevant_depth is None:
        return 0, -1

    ISMN_data = ISMN_station.get_soil_moisture_for_depth(
        relevant_depth, start_date=datetime(2007, 1, 1))

    # Use the first sensor available at that depth.
    sensor = ISMN_data.keys()[0]
    ISMN_data = ISMN_data[sensor]
    ISMN_ts_name = 'insitu sm %.2f - %.2f m sensor: ' % (
        float(relevant_depth.depth_from),
        float(relevant_depth.depth_to)) + sensor

    era_insitu_matched = temp_match.df_match(ISMN_data, era_interim_data,
                                             window=1)
    insitu_masked = ISMN_data[_era_conditions(era_insitu_matched)]
    if mask['use_ssf'] == True:
        ascat_insitu_matched = temp_match.df_match(insitu_masked, ascat_data,
                                                   window=1)
        insitu_masked = insitu_masked[ascat_insitu_matched['ssf'] == 1]
    ISMN_data = insitu_masked[['insitu', 'jd']]

    # Slice to the period spanned by the index of scaled_data.
    era_matched = era_matched[
        scaled_data.index[0]:
        scaled_data.index[scaled_data.index.values.size - 1]]
    # Assign the renamed frame instead of renaming a slice in place.
    era_matched = era_matched.rename(columns={
        'st_l1': 'soil temperature layer 1',
        'air_temp': '2m air temperature'
    })
    era_matched = era_matched[[
        'snow_depth', 'soil temperature layer 1', '2m air temperature'
    ]]

    era_labels, era_values = era_matched.to_dygraph_format()

    # NOTE(review): masking_labels / masking_values are not defined in this
    # block; presumably era_labels / era_values were intended - confirm.
    masking_data = {'labels': masking_labels, 'data': masking_values}
def run(part):
    """
    Validate CCI soil moisture against ISMN stations for one station chunk.

    The station list is split into ``parts`` (6) consecutive chunks and only
    chunk number ``part`` (1-based) is processed. For every station, CCI mode,
    CCI version, period and frequency ('abs' or 'anom') the Pearson
    correlation, its p-value and the number of matches are stored, provided
    at least 10 matched values exist. The table is written to a CSV file
    whose name contains ``part``.

    Parameters
    ----------
    part : int
        Number of the chunk to process, used as ``subs[part - 1]:subs[part]``.
    """
    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\validation_%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # Chunk boundaries for parallel processing; the last boundary is pinned
    # to the list length.
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start, end = subs[part - 1], subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    periods = {
        'p1': ['2007-10-01', '2010-01-14'],
        'p2': ['2010-01-15', '2011-10-04'],
        'p3': ['2011-10-05', '2012-06-30'],
        'p4': ['2012-07-01', '2014-12-31']
    }

    freq = ['abs', 'anom']

    def result_columns(prefix):
        # One column per mode / version / period / frequency combination.
        return [prefix + '_' + m + '_' + v + '_' + p + '_' + f
                for m in cci.modes
                for v in cci.versions
                for p in periods.keys()
                for f in freq]

    corr_tags = result_columns('corr')
    p_tags = result_columns('p')
    n_tags = result_columns('n')

    res = ismn.list.copy()
    res = res.drop(['ease_col', 'ease_row'], axis='columns')
    for name in corr_tags + p_tags:
        res[name] = np.nan
    for name in n_tags:
        res[name] = 0

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        # Restrict to the overall span from the first to the last period.
        ts_insitu = ts_insitu[periods['p1'][0]:periods['p4'][1]]
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        for m in cci.modes:
            df_cci = cci.read(meta.lon, meta.lat, mode=m).dropna()
            if len(df_cci) < 10:
                print('No CCI ' + m + ' data for ' + meta.network + ' / ' +
                      meta.station)
                continue

            for f in freq:
                if f == 'abs':
                    matched = df_match(df_cci, df_insitu, window=0.5)
                else:
                    # Replace the CCI columns by their anomalies in place;
                    # 'abs' has already been evaluated at this point.
                    for v in cci.versions:
                        column = m + '_' + v
                        df_cci.loc[:, column] = calc_anomaly(df_cci[column])
                    df_cci.dropna(inplace=True)
                    if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                        print('No in situ or CCI ' + m +
                              ' anomaly data for ' + meta.network + ' / ' +
                              meta.station)
                        continue
                    matched = df_match(df_cci, df_insitu_anom, window=0.5)

                data = df_cci.join(matched['insitu']).dropna()

                for p, (first_day, last_day) in periods.items():
                    vals = data[first_day:last_day].values

                    n_matches = vals.shape[0]
                    if n_matches < 10:
                        continue

                    # Column k holds CCI version k, the last one in situ.
                    for k, v in enumerate(cci.versions):
                        corr, p_value = pearsonr(vals[:, k], vals[:, -1])
                        suffix = m + '_' + v + '_' + p + '_' + f
                        res.loc[meta.name, 'corr_' + suffix] = corr
                        res.loc[meta.name, 'p_' + suffix] = p_value
                        res.loc[meta.name, 'n_' + suffix] = n_matches

    res.to_csv(result_file, float_format='%0.4f')
mask_frozen_prob=5, mask_snow_prob=5)

# Drop NaN values before doing any matching.
ascat_time_series.data = ascat_time_series.data.dropna()
ISMN_time_series.data = ISMN_time_series.data.dropna()

# Rename the soil moisture column of ISMN_time_series.data to label_insitu
# to clearly differentiate the time series when they are plotted together.
ISMN_time_series.data.rename(columns={'soil moisture': label_insitu}, inplace=True)

# Get ISMN data that was observed within +- 1 hour (1/24. day) of the ASCAT
# observation; do not include those indexes where no observation was found.
matched_ISMN_data = temp_match.df_match(ascat_time_series.data, ISMN_time_series.data, window=1 / 24., dropna=True)

# matched_ISMN_data is now a dataframe with the same datetime index as
# ascat_time_series.data and the nearest in situ observation.
# Temporal matching also includes distance information, but we are not
# interested in it right now, so let's drop it.
matched_ISMN_data = matched_ISMN_data.drop(['distance'], axis=1)

# This joins the label_ascat column of the ASCAT data to the matched ISMN data.
matched_data = matched_ISMN_data.join( ascat_time_series.data[label_ascat])

# The plot shows that ISMN and ASCAT are observed in different units, hence
# the secondary y axis for the ASCAT column.
matched_data.plot(secondary_y=[label_ascat])
plt.show()
mask_ssf=True, mask_frozen_prob = 5, mask_snow_prob = 5)

# Drop NaN values before doing any matching.
ascat_time_series.data = ascat_time_series.data.dropna()
ISMN_time_series.data = ISMN_time_series.data.dropna()

# Rename the soil moisture column of ISMN_time_series.data to label_insitu
# to clearly differentiate the time series when they are plotted together.
ISMN_time_series.data.rename(columns={'soil moisture':label_insitu},inplace=True)

# Get ISMN data that was observed within +- 1 hour (1/24. day) of the ASCAT
# observation; do not include those indexes where no observation was found.
matched_ISMN_data = temp_match.df_match(ascat_time_series.data,ISMN_time_series.data, window=1/24.,dropna=True)

# matched_ISMN_data is now a dataframe with the same datetime index as
# ascat_time_series.data and the nearest in situ observation.
# Temporal matching also includes distance information, but we are not
# interested in it right now, so let's drop it.
matched_ISMN_data = matched_ISMN_data.drop(['distance'],axis=1)

# This joins the label_ascat column of the ASCAT data to the matched ISMN data.
matched_data = matched_ISMN_data.join(ascat_time_series.data[label_ascat])

# The plot shows that ISMN and ASCAT are observed in different units, hence
# the secondary y axis for the ASCAT column.
matched_data.plot(secondary_y=[label_ascat])
plt.show()

# This takes the matched_data DataFrame and adds a column
def run(part):
    """
    Estimate in situ skill from triple collocation for one station chunk.

    The station list is split into ``parts`` (6) consecutive chunks and only
    chunk number ``part`` (1-based) is processed. For absolute values and
    anomalies, in situ data are matched to the ACTIVE and PASSIVE CCI v04.4
    products. If at least 100 matched triplets exist and all three pairwise
    Pearson correlations are non-negative with p-values not above 0.05, the
    square root of ``tc(data)[1][2]`` is stored as 'r_abs' / 'r_anom'. The
    table is written to a CSV file whose name contains ``part``.

    Parameters
    ----------
    part : int
        Number of the chunk to process, used as ``subs[part - 1]:subs[part]``.
    """
    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\ismn_r2\ismn_r2_part%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # Chunk boundaries for parallel processing; the last boundary is pinned
    # to the list length.
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start, end = subs[part - 1], subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    freq = ['abs', 'anom']
    cci_columns = ('ACTIVE_v04.4', 'PASSIVE_v04.4')
    # Pairs whose correlation must be significant and non-negative.
    column_pairs = [('insitu', 'ACTIVE_v04.4'),
                    ('insitu', 'PASSIVE_v04.4'),
                    ('ACTIVE_v04.4', 'PASSIVE_v04.4')]

    res = ismn.list.copy()
    res = res.drop(['ease_col', 'ease_row'], axis='columns')
    res['r_abs'] = np.nan
    res['r_anom'] = np.nan

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        ts_insitu = ts_insitu['2007-10-01':'2014-12-31']
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        df_cci = cci.read(meta.lon, meta.lat, version='v04.4',
                          mode=['ACTIVE', 'PASSIVE']).dropna()
        if len(df_cci) < 10:
            print('No CCI data for ' + meta.network + ' / ' + meta.station)
            continue

        for f in freq:
            if f == 'abs':
                matched = df_match(df_cci, df_insitu, window=0.5)
            else:
                # Replace the CCI columns by their anomalies in place;
                # 'abs' has already been evaluated at this point.
                for column in cci_columns:
                    df_cci.loc[:, column] = calc_anomaly(df_cci[column])
                df_cci.dropna(inplace=True)
                if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                    print('No in situ or CCI anomaly data for ' +
                          meta.network + ' / ' + meta.station)
                    continue
                matched = df_match(df_cci, df_insitu_anom, window=0.5)

            data = df_cci.join(matched['insitu']).dropna()

            if len(data) < 100:
                continue

            # Pairwise correlation and p-value for the three column pairs.
            stats = []
            for first, second in column_pairs:
                vals = data[[first, second]].values
                stats.append(pearsonr(vals[:, 0], vals[:, 1]))

            if any((c < 0) | (p > 0.05) for c, p in stats):
                continue

            res.loc[meta.name, 'r_' + f] = np.sqrt(tc(data)[1][2])

    res.to_csv(result_file, float_format='%0.4f')