def test_init_shape():
    """Shape reflects default size, extra columns and added chunks."""
    # default object: 10000 rows, 12 standard columns
    npt.assert_array_equal(UngriddedData().shape, (10000, 12))

    # two extra columns widen the array to 14 columns
    custom = UngriddedData(num_points=2, add_cols=['bla', 'blub'])
    npt.assert_array_equal(custom.shape, (2, 14))

    # adding a chunk grows the row count accordingly
    custom.add_chunk(1112)
    npt.assert_array_equal(custom.shape, (1114, 14))
def test_coordinate_access():
    """Station coordinates stored in metadata are exposed as arrays."""
    import string
    import numpy.testing as npt

    data = UngriddedData()
    names = list(string.ascii_lowercase)
    num = len(names)
    lons = np.arange(num)
    lats = np.arange(num) - 90
    alts = np.arange(num) * 13

    for pos, name in enumerate(names):
        data.metadata[pos] = dict(data_id='testcase',
                                  station_name=name,
                                  latitude=lats[pos],
                                  longitude=lons[pos],
                                  altitude=alts[pos])

    npt.assert_array_equal(data.station_name, names)
    npt.assert_array_equal(data.latitude, lats)
    npt.assert_array_equal(data.longitude, lons)
    npt.assert_array_equal(data.altitude, alts)

    # a station without any variable data cannot be converted
    raised = False
    try:
        data.to_station_data('a')
    except DataCoverageError:
        raised = True
    assert raised

    coords = data.station_coordinates
    npt.assert_array_equal(coords['station_name'], names)
    npt.assert_array_equal(coords['latitude'], lats)
    npt.assert_array_equal(coords['longitude'], lons)
    npt.assert_array_equal(coords['altitude'], alts)
def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
    """Read a slice of the file list into an :class:`UngriddedData` object.

    Parameters
    ----------
    vars_to_retrieve : str or list, optional
        variable name(s) to be read; a single name may be passed as string
    first_file : int, optional
        index of first file in the file list to read (defaults to 0)
    last_file : int, optional
        slice end index in the file list (defaults to the list length)

    Returns
    -------
    UngriddedData
        data merged from all read files
    """
    if isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    files = self.get_file_list()

    start = 0 if first_file is None else first_file
    stop = len(files) if last_file is None else last_file

    stats = self._read_files(files[start:stop], vars_to_retrieve)
    return UngriddedData.from_station_data(stats)
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, read_err=None, remove_outliers=True,
         file_pattern=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional,
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used
    read_err : bool
        if True, uncertainty data is also read (where available). If
        unspecified (None), then the default is used (cf. :attr:`READ_ERR`)
    remove_outliers : bool
        if True, outliers are removed during reading of individual files
        (passed on to :func:`read_file`)
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if read_err is None:
        read_err = self.READ_ERR

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(vars_to_retrieve, file_pattern=file_pattern)
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    col_idx = data_obj.index
    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_files = len(files)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    VAR_IDX = -1
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print("Reading file {} of {} ({})".format(i+1, num_files,
                                                      type(self).__name__))
        try:
            stat = self.read_file(_file,
                                  vars_to_retrieve=vars_to_retrieve,
                                  read_err=read_err,
                                  remove_outliers=remove_outliers)
            if not any([var in stat.contains_vars for var in vars_to_retrieve]):
                self.logger.info("Station {} contains none of the desired "
                                 "variables. Skipping station..."
                                 .format(stat.station_name))
                continue

            # one metadata entry per station / file
            meta_key += 1

            # fill the metadata dict; the location in the data set is time
            # step dependent, so one representative location is stored here
            metadata[meta_key] = od()
            metadata[meta_key].update(stat.get_meta())
            for add_meta in self.KEEP_ADD_META:
                if add_meta in stat:
                    metadata[meta_key][add_meta] = stat[add_meta]
            metadata[meta_key]['data_revision'] = self.data_revision
            metadata[meta_key]['variables'] = []
            metadata[meta_key]['var_info'] = od()
            # list with indices of this station for each variable
            meta_idx[meta_key] = od()

            # is floating point single value
            time = stat.dtime[0]
            for var in stat.contains_vars:
                # assign a global column index the first time a variable
                # is encountered
                if not var in data_obj.var_idx:
                    VAR_IDX += 1
                    data_obj.var_idx[var] = VAR_IDX

                var_idx = data_obj.var_idx[var]

                val = stat[var]

                metadata[meta_key]['var_info'][var] = vi = od()
                if isinstance(val, VerticalProfile):
                    altitude = val.altitude
                    data = val.data
                    add = len(data)
                    err = val.data_err
                    metadata[meta_key]['var_info']['altitude'] = via = od()

                    vi.update(val.var_info[var])
                    via.update(val.var_info['altitude'])
                else:
                    add = 1
                    altitude = np.nan
                    data = val
                    # BUGFIX: uncertainty values are stored in stat.data_err
                    # (the dict that is checked for membership); the original
                    # read from stat.err instead
                    if var in stat.data_err:
                        err = stat.data_err[var]
                    else:
                        err = np.nan
                    vi.update(stat.var_info[var])
                stop = idx + add
                # check if size of data object needs to be extended
                if stop >= data_obj._ROWNO:
                    # if totnum < data_obj._CHUNKSIZE, then the latter is used
                    data_obj.add_chunk(add)

                # write common meta info for this station
                data_obj._data[idx:stop, col_idx['latitude']] = stat['latitude']
                data_obj._data[idx:stop, col_idx['longitude']] = stat['longitude']
                data_obj._data[idx:stop, col_idx['altitude']] = stat['altitude']
                data_obj._data[idx:stop, col_idx['meta']] = meta_key

                # write data to data object
                data_obj._data[idx:stop, col_idx['time']] = time
                data_obj._data[idx:stop, col_idx['stoptime']] = stat.stopdtime[0]
                data_obj._data[idx:stop, col_idx['data']] = data
                data_obj._data[idx:stop, col_idx['dataaltitude']] = altitude
                data_obj._data[idx:stop, col_idx['varidx']] = var_idx

                if read_err:
                    data_obj._data[idx:stop, col_idx['dataerr']] = err

                if not var in meta_idx[meta_key]:
                    meta_idx[meta_key][var] = []
                meta_idx[meta_key][var].extend(list(range(idx, stop)))
                if not var in metadata[meta_key]['variables']:
                    metadata[meta_key]['variables'].append(var)

                idx += add

        except Exception as e:
            self.read_failed.append(_file)
            self.logger.exception('Failed to read file {} (ERR: {})'
                                  .format(os.path.basename(_file), repr(e)))

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]

    self.data = data_obj
    return data_obj
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional,
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list()
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    last_stat_code = ''

    num_files = len(files)
    for i, _file in enumerate(files):
        self.logger.info('File {} ({})'.format(i, num_files))
        try:
            station_data = self.read_file(
                _file, vars_to_retrieve=vars_to_retrieve)
            if not any([var in station_data.contains_vars
                        for var in vars_to_retrieve]):
                self.logger.info("Station {} contains none of the desired "
                                 "variables. Skipping station...".format(
                                     station_data.station_name))
                continue
            stat_code = station_data['stat_code']
            if last_stat_code != stat_code:
                # new station: create a new metadata entry. The location in
                # the data set is time step dependent, so one representative
                # location is stored here
                meta_key += 1
                metadata[meta_key] = od()
                metadata[meta_key].update(station_data.get_meta())
                metadata[meta_key].update(station_data.get_station_coords())
                metadata[meta_key]['dataset_name'] = self.DATASET_NAME
                metadata[meta_key]['variables'] = []
                # list with indices of this station for each variable
                meta_idx[meta_key] = od()
                last_stat_code = stat_code

            # is floating point single value
            time = station_data.dtime
            for var_idx, var in enumerate(station_data.contains_vars):
                val = station_data[var]
                if isinstance(val, VerticalProfile):
                    add = len(val)
                    altitude = val.altitude
                    data = val.data
                else:
                    add = 1
                    altitude = np.nan
                    data = val
                stop = idx + add
                # check if size of data object needs to be extended
                if stop >= data_obj._ROWNO:
                    # if totnum < data_obj._CHUNKSIZE, then the latter is used
                    data_obj.add_chunk(add)

                # write common meta info for this station
                data_obj._data[idx:stop,
                               data_obj._LATINDEX] = station_data['latitude']
                data_obj._data[idx:stop,
                               data_obj._LONINDEX] = station_data['longitude']
                data_obj._data[idx:stop,
                               data_obj._ALTITUDEINDEX] = station_data['altitude']
                data_obj._data[idx:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[idx:stop, data_obj._TIMEINDEX] = time
                data_obj._data[idx:stop, data_obj._DATAINDEX] = data
                data_obj._data[idx:stop, data_obj._DATAHEIGHTINDEX] = altitude
                data_obj._data[idx:stop, data_obj._VARINDEX] = var_idx

                if not var in meta_idx[meta_key]:
                    meta_idx[meta_key][var] = []
                meta_idx[meta_key][var].extend(list(range(idx, stop)))
                if not var in metadata[meta_key]['variables']:
                    metadata[meta_key]['variables'].append(var)
                if not var in data_obj.var_idx:
                    data_obj.var_idx[var] = var_idx

                idx += add

        # BUGFIX: use `except Exception` instead of a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not silently swallowed
        except Exception:
            self.read_failed.append(_file)
            self.logger.exception('Failed to read file {}'.format(
                os.path.basename(_file)))

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj

    return data_obj
def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional,
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if len(self.files) == 0:
        self.get_file_list(vars_to_retrieve)
    files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]
    files_contain = self.files_contain[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_files = len(files)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    vars_to_read, vars_to_compute = self.check_vars_to_retrieve(
        vars_to_retrieve)
    self.files_failed = []
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print("Reading file {} of {} ({})".format(
                i, num_files, type(self).__name__))
        vars_to_read = files_contain[i]
        try:
            station_data = self.read_file(_file,
                                          _vars_to_read=vars_to_read,
                                          _vars_to_compute=vars_to_compute)
        except (NotInFileError, EbasFileError) as e:
            self.files_failed.append(_file)
            self.logger.warning('Failed to read file {}. '
                                'Error: {}'.format(os.path.basename(_file),
                                                   repr(e)))
            continue
        # fill the metadata dict; the location in the data set is time step
        # dependent, so one representative location is stored here
        metadata[meta_key] = od()
        metadata[meta_key].update(station_data.get_meta())
        metadata[meta_key].update(station_data.get_station_coords())
        metadata[meta_key]['dataset_name'] = self.DATASET_NAME
        metadata[meta_key]['ts_type'] = station_data['ts_type']
        metadata[meta_key]['instrument_name'] = station_data[
            'instrument_name']
        metadata[meta_key]['var_info'] = od()
        # list with indices of this station for each variable
        meta_idx[meta_key] = {}

        num_times = len(station_data['dtime'])

        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all Aerocom data files are of type timeseries)
        times = np.float64(station_data['dtime'])

        totnum = num_times * len(station_data.contains_vars)

        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        vars_avail = station_data.contains_vars
        for var_idx, var in enumerate(vars_avail):
            if not var in data_obj.unit:
                data_obj.unit[var] = station_data.unit[var]
            elif station_data.unit[var] != data_obj.unit[var]:
                raise DataUnitError("Unit mismatch")
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['stat_lat']
            # BUGFIX: longitude column was erroneously filled with the
            # station latitude ('stat_lat')
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['stat_lon']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['stat_alt']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key

            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            var_info = station_data['var_info'][var]
            metadata[meta_key]['var_info'][var] = var_info.to_dict()

            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        metadata[meta_key]['variables'] = vars_avail
        idx += totnum
        meta_key = meta_key + 1.

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj = data_obj.merge_common_meta()
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj
    return data_obj
def test_init_add_cols():
    """Two extra columns yield a (2, 14) shaped data array."""
    data = UngriddedData(num_points=2, add_cols=['bla', 'blub'])
    npt.assert_array_equal(data.shape, (2, 14))
def ungridded_empty():
    """Provide a fresh, empty :class:`UngriddedData` instance."""
    empty = UngriddedData()
    return empty
def test_from_cache(aeronetsunv3lev2_subset, tempdir):
    """Data reloaded from cache keeps the shape of the original subset."""
    cached = UngriddedData.from_cache(
        data_dir=tempdir, file_name='ungridded_aeronet_subset.pkl')
    assert cached.shape == aeronetsunv3lev2_subset.shape
def _make_ungridded_data():
    """Build an :class:`UngriddedData` object for test purposes.

    Currently returns an empty instance.
    """
    # TODO: add some random data and metadata blocks
    return UngriddedData()
def read(self, vars_to_retrieve):
    """Template read method: fill an :class:`UngriddedData` from all files.

    Parameters
    ----------
    vars_to_retrieve : list
        variable names that are supposed to be read from each file

    Returns
    -------
    UngriddedData
        data object containing the data of all files
    """
    # create empty data object
    data_obj = UngriddedData()
    # data index pointer in numpy array
    index_pointer = 0
    # metadata key pointer for each file
    meta_key = 0.0
    # implemented in the base class, but may be overridden if needed
    files = self.get_file_list()

    for f in files:
        # load data from individual file (returns e.g. dictionary, or
        # StationData)
        # BUGFIX: the file path must be passed on to read_file (was called
        # without arguments, ignoring the loop variable)
        file_data = self.read_file(f)

        # add station / file metadata once per file
        # BUGFIX: the per-key dict must be created before item assignment
        # (and does not need to be rewritten for every variable)
        data_obj.metadata[meta_key] = {}
        data_obj.metadata[meta_key]['longitude'] = file_data['longitude']
        data_obj.metadata[meta_key]['latitude'] = file_data['latitude']
        data_obj.metadata[meta_key]['altitude'] = file_data['altitude']

        for var_idx, var in enumerate(vars_to_retrieve):
            # time stamps, assuming array or list of numpy.datetime64 objects
            time_stamps = file_data['dtime']
            # the actual data for this variable
            var_data = file_data[var]
            # the number of datapoints added to the UngriddedData object
            add_num = len(var_data)

            stop_idx = index_pointer + add_num

            if stop_idx >= data_obj._ROWNO:
                # add_chunk actually adds a minimum of 1000 datapoints, it
                # only uses add_num if add_num >= 1000
                data_obj.add_chunk(add_num)

            # now you can add the variable to the data numpy array
            data_obj._data[index_pointer:stop_idx,
                           data_obj._LATINDEX] = file_data['latitude']
            data_obj._data[index_pointer:stop_idx,
                           data_obj._LONINDEX] = file_data['longitude']
            data_obj._data[index_pointer:stop_idx,
                           data_obj._ALTITUDEINDEX] = file_data['altitude']
            data_obj._data[index_pointer:stop_idx,
                           data_obj._METADATAKEYINDEX] = meta_key

            # write data to data object
            data_obj._data[index_pointer:stop_idx,
                           data_obj._TIMEINDEX] = np.float64(time_stamps)
            data_obj._data[index_pointer:stop_idx,
                           data_obj._DATAINDEX] = var_data
            data_obj._data[index_pointer:stop_idx,
                           data_obj._DATAHEIGHTINDEX] = file_data['altitude']
            data_obj._data[index_pointer:stop_idx,
                           data_obj._VARINDEX] = var_idx

            index_pointer += add_num

        meta_key += 1.

    # trim the data array to the number of points actually written and
    # return the filled object (the original template returned nothing)
    data_obj._data = data_obj._data[:index_pointer]
    return data_obj