def load_site(cls, config, site, data_range=None, meta_src=None, data_src=None):
    """Create a proxy object for a single LMRdb site.

    Expects meta_src and data_src to be pickled pandas DataFrame
    objects; each is loaded from the configured LMRdb file when not
    supplied by the caller.

    Raises
    ------
    ValueError
        If the (archive type, measurement) pair has no mapping, or if
        no observations remain in the requested time range.
    """
    db_cfg = config.proxies.LMRdb

    if meta_src is None:
        meta_src = load_data_frame(db_cfg.metafile_proxy)
    if data_src is None:
        data_src = load_data_frame(db_cfg.datafile_proxy)
    data_src = data_src.to_dense()

    # Single-row metadata slice for the requested site.
    record = meta_src[meta_src['Proxy ID'] == site]
    pid = record['Proxy ID'].iloc[0]
    measurement = record['Proxy measurement'].iloc[0]
    archive = record['Archive type'].iloc[0]

    # Map (archive, measurement) onto the internal proxy-type key;
    # unknown combinations are reported and rejected.
    try:
        proxy_type = db_cfg.proxy_type_mapping[(archive, measurement)]
    except (KeyError, ValueError) as err:
        print('Proxy type/measurement not found in mapping: {}'.format(err))
        raise ValueError(err)

    start_yr = record['Youngest (C.E.)'].iloc[0]
    end_yr = record['Oldest (C.E.)'].iloc[0]
    lat = record['Lat (N)'].iloc[0]
    lon = record['Lon (E)'].iloc[0]
    elev = record['Elev'].iloc[0]
    series = data_src[site]

    # Seasonality may be stored as a stringified list in the metadata;
    # parse it back into a real list when needed.
    seasonality = record['Seasonality'].iloc[0]
    if type(seasonality) is not list:
        seasonality = ast.literal_eval(seasonality)

    if data_range is None:
        values = series
    else:
        start, finish = data_range
        in_range = (series.index >= start) & (series.index <= finish)
        values = series[in_range]

    # Drop missing observations before extracting the time axis.
    # Might need to remove following line
    values = values[values.notnull()]
    times = values.index.values

    # transform in "anomalies" (time-mean removed) if option activated
    if db_cfg.proxy_timeseries_kind == 'anom':
        values = values - values.mean()

    if len(values) == 0:
        raise ValueError('No observations in specified time range.')

    return cls(config, pid, proxy_type, start_yr, end_yr, lat, lon,
               elev, seasonality, values, times)
def load_all_annual_no_filtering(cls, config, meta_src=None, data_src=None):
    """Load every proxy record with annual resolution, skipping all
    other filtering.

    Note: This is still subject to constraints from the PSM calibration
    (i.e. if there is an r_crit or not enough calibration data the
    proxy will not be loaded).

    Returns
    -------
    proxy_objs: list(BaseProxyObject like)
    """
    # Fall back to the configured pickle files when sources are not given.
    if meta_src is None:
        meta_src = load_data_frame(config.proxies.LMRdb.metafile_proxy)
    if data_src is None:
        data_src = load_data_frame(config.proxies.LMRdb.datafile_proxy)
    data_src = data_src.to_dense()

    # TODO: For now hard coded to annual resolution - AP
    annual = meta_src['Resolution (yr)'] == 1.0
    site_ids = meta_src['Proxy ID'][annual].values

    loaded = []
    for site_id in site_ids:
        try:
            loaded.append(cls.load_site(config, site_id,
                                        meta_src=meta_src,
                                        data_src=data_src))
        except ValueError as err:
            # Site could not be turned into a proxy object; report and skip.
            print(err)

    return loaded
def load_all(cls, config, data_range, meta_src=None, data_src=None):
    """Select and load all NCDCdtda proxy records passing the configured
    filters.

    Expects meta_src, data_src to be pickled pandas DataFrame objects.

    Filtering stages (each produces a boolean mask over the metadata
    index; a record must pass all of them): simple column filters,
    availability over the reconstruction period, database membership,
    and a proxy-id blacklist.

    Returns
    -------
    proxy_id_by_type: dict
        Proxy ids that loaded successfully, keyed by proxy-type name
        (in the configured proxy_order).
    all_proxies: list
        The corresponding proxy objects built via ``cls.load_site``.
    """
    # Load source data files
    # NOTE(review): unlike the LMRdb/PAGES2kv1 loaders, data_src is not
    # passed through to_dense() here — confirm this is intended.
    if meta_src is None:
        meta_src = load_data_frame(config.proxies.NCDCdtda.metafile_proxy)
    if data_src is None:
        data_src = load_data_frame(config.proxies.NCDCdtda.datafile_proxy)

    # User-configured selection criteria.
    filters = config.proxies.NCDCdtda.simple_filters
    proxy_order = config.proxies.NCDCdtda.proxy_order
    ptype_filters = config.proxies.NCDCdtda.proxy_assim2
    dbase_filters = config.proxies.NCDCdtda.database_filter
    proxy_blacklist = config.proxies.NCDCdtda.proxy_blacklist
    availability_filter = config.proxies.NCDCdtda.proxy_availability_filter
    availability_fraction = config.proxies.NCDCdtda.proxy_availability_fraction

    # initial mask all true before filtering
    # (a comparison followed by |= True yields an all-True boolean
    # Series aligned with the metadata index)
    useable = meta_src[meta_src.columns[0]] == 0
    useable |= True
    availability_mask = meta_src[meta_src.columns[0]] == 0
    availability_mask |= True

    # Find indices matching simple filter specifications
    for colname, filt_list in filters.items():
        simple_mask = meta_src[colname] == 0
        simple_mask &= False
        for value in filt_list:
            if colname == 'Resolution (yr)' and type(value) is tuple:
                # A (lo, hi) tuple selects an inclusive resolution range.
                for i in range(len(meta_src[colname].index)):
                    simple_mask[i] |= (value[0] <= meta_src[colname][i] <= value[1])
            else:
                simple_mask |= meta_src[colname] == value
        useable &= simple_mask

    # Filtering proxy records on conditions of availability during
    # the reconstruction period (recon_period in configuration, or
    # data_range here).
    if availability_filter:  # if not None
        start, finish = data_range

        # Checking proxy metadata's period of availability against
        # reconstruction period.
        availability_mask = ((meta_src['Oldest (C.E.)'] <= start) &
                             (meta_src['Youngest (C.E.)'] >= finish))

        # Checking level of completeness of record within the reconstruction
        # period (ignore record if fraction of available data is below user-defined
        # threshold (proxy_availability_fraction in config).
        maxnb = (finish - start) + 1
        proxies_to_test = meta_src['Proxy ID'][availability_mask & useable].values
        for prx in proxies_to_test.tolist():
            values = data_src[prx][(data_src[:].index >= start) &
                                   (data_src[:].index <= finish)]
            values = values[values.notnull()]
            frac_available = float(len(values)) / float(maxnb)
            if frac_available < availability_fraction:
                availability_mask[meta_src[meta_src['Proxy ID'] == prx].index] = False

    # Find indices matching **database filter** specifications
    database_col = 'Databases'

    # dbase_filters not "None" or empty list (some selection on db has been activated)
    if dbase_filters:
        # define boolean array with right dimension & set all to False
        dbase_mask = meta_src[database_col] == 0
        # set mask to True for proxies matching all databases found in dbase_filters
        for i in range(len(meta_src[database_col])):
            if meta_src[database_col][i]:
                #dbase_mask[i] = set(meta_src[database_col][i]).isdisjoint(dbase_filters) # oldold code
                #dbase_mask[i] = set(dbase_filters).issubset(meta_src[database_col][i]) # old code
                # keep the proxy if it belongs to at least one requested database
                dbase_mask[i] = bool(
                    set(meta_src[database_col][i]).intersection(
                        set(dbase_filters)))
            else:
                dbase_mask[i] = False
    else:
        # selection on db has NOT been activated:
        # define boolean array with right dimension & set all to True
        dbase_mask = meta_src[database_col] != 0

    # Define mask of proxies listed in a user-defined "blacklist"
    # (see LMR_config).
    # boolean array set with right dimension & all set to True
    blacklist_mask = meta_src['Proxy ID'] != ' '
    if proxy_blacklist:
        # If site listed in blacklist, modify corresponding elements of
        # boolean array to False (prefix match on the proxy id)
        for pbl in proxy_blacklist:
            tmp = meta_src['Proxy ID'].map(lambda x: x.startswith(pbl))
            inds = meta_src['Proxy ID'][tmp].index
            blacklist_mask[inds] = False

    # Create proxy id lists
    proxy_id_by_type = {}
    all_proxy_ids = []

    type_col = 'Archive type'
    measure_col = 'Proxy measurement'
    for name in proxy_order:
        type_mask = meta_src[type_col] == 0
        type_mask |= True

        # Filter to proxies of a certain type
        ptype = name.split('_', 1)[0]
        type_mask &= meta_src[type_col] == ptype

        # Reduce to listed measures
        measure_mask = meta_src[measure_col] == 0
        measure_mask &= False

        for measure in ptype_filters[name]:
            measure_mask |= meta_src[measure_col] == measure

        # Extract proxy ids using mask and append to lists
        proxies = meta_src['Proxy ID'][measure_mask & type_mask & dbase_mask &
                                       blacklist_mask & availability_mask &
                                       useable].values

        # If we have ids after filtering add them to the type list
        if len(proxies) > 0:
            proxy_id_by_type[name] = proxies.tolist()
            all_proxy_ids += proxies.tolist()

    # Create proxy objects list
    all_proxies = []
    for site in all_proxy_ids:
        try:
            pobj = cls.load_site(config, site, data_range,
                                 meta_src=meta_src, data_src=data_src)
            all_proxies.append(pobj)
        except ValueError as e:
            # Proxy had no obs or didn't meet psm r crit; drop its id
            # from whichever type group it was placed in.
            for group in list(proxy_id_by_type.values()):
                if site in group:
                    group.remove(site)
                    break  # Should only be one instance

    return proxy_id_by_type, all_proxies
def load_all(cls, config, data_range, meta_src=None, data_src=None):
    """Build proxy objects for all PAGES2kv1 records passing the
    configured filters.

    Expects meta_src, data_src to be pickled pandas DataFrame objects;
    they are loaded from the configured files when not supplied.

    Returns
    -------
    proxy_id_by_type: dict
        Proxy ids that loaded successfully, keyed by proxy-type name.
    all_proxies: list
        The corresponding proxy objects built via ``cls.load_site``.
    """
    pages_cfg = config.proxies.PAGES2kv1

    # Load source data files
    if meta_src is None:
        meta_src = load_data_frame(pages_cfg.metafile_proxy)
    if data_src is None:
        data_src = load_data_frame(pages_cfg.datafile_proxy)
    data_src = data_src.to_dense()

    simple_filters = pages_cfg.simple_filters
    proxy_order = pages_cfg.proxy_order
    measures_by_type = pages_cfg.proxy_assim2
    availability_filter = pages_cfg.proxy_availability_filter
    availability_fraction = pages_cfg.proxy_availability_fraction

    # All-True boolean Series aligned with the metadata index.
    usable = (meta_src[meta_src.columns[0]] == 0) | True
    availability_mask = (meta_src[meta_src.columns[0]] == 0) | True

    # AND together one OR-of-allowed-values mask per filtered column.
    for colname, allowed in simple_filters.items():
        col_mask = (meta_src[colname] == 0) & False
        for allowed_value in allowed:
            col_mask |= meta_src[colname] == allowed_value
        usable &= col_mask

    # Filtering proxy records on conditions of availability during
    # the reconstruction period (recon_period in configuration, or
    # data_range here).
    if availability_filter:  # if not None
        start, finish = data_range

        # Metadata-declared availability must span the recon period.
        availability_mask = ((meta_src['Oldest (C.E.)'] <= start) &
                             (meta_src['Youngest (C.E.)'] >= finish))

        # Drop records whose fraction of non-null data within the
        # reconstruction period falls below the configured threshold.
        maxnb = (finish - start) + 1
        candidates = meta_src['Proxy ID'][availability_mask & usable].values
        for prx in candidates.tolist():
            in_period = ((data_src[:].index >= start) &
                         (data_src[:].index <= finish))
            obs = data_src[prx][in_period]
            obs = obs[obs.notnull()]
            if float(len(obs)) / float(maxnb) < availability_fraction:
                rows = meta_src[meta_src['Proxy ID'] == prx].index
                availability_mask[rows] = False

    # Collect ids per proxy type, honouring the configured ordering.
    proxy_id_by_type = {}
    all_proxy_ids = []
    type_col = 'Archive type'
    measure_col = 'Proxy measurement'
    for name in proxy_order:
        # Restrict to the archive type encoded in the type name.
        archive = name.split('_', 1)[0]
        type_mask = (((meta_src[type_col] == 0) | True) &
                     (meta_src[type_col] == archive))

        # Restrict to the measures listed for this proxy type.
        measure_mask = (meta_src[measure_col] == 0) & False
        for measure in measures_by_type[name]:
            measure_mask |= meta_src[measure_col] == measure

        selected = meta_src['Proxy ID'][measure_mask & type_mask &
                                        availability_mask & usable].values
        if len(selected) > 0:
            proxy_id_by_type[name] = selected.tolist()
            all_proxy_ids += selected.tolist()

    # Instantiate proxy objects; ids that fail to load are removed
    # from their type group.
    all_proxies = []
    for site in all_proxy_ids:
        try:
            all_proxies.append(cls.load_site(config, site, data_range,
                                             meta_src=meta_src,
                                             data_src=data_src))
        except ValueError:
            # Proxy had no obs or didn't meet psm r crit
            for group in list(proxy_id_by_type.values()):
                if site in group:
                    group.remove(site)
                    break  # Should only be one instance

    return proxy_id_by_type, all_proxies