def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file in
        the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list()
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    # meta_key is stored as float in the data array's metadata column
    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    last_stat_code = ''

    num_files = len(files)
    for i, _file in enumerate(files):
        self.logger.info('File {} ({})'.format(i, num_files))
        try:
            station_data = self.read_file(_file,
                                          vars_to_retrieve=vars_to_retrieve)
            if not any(var in station_data.contains_vars
                       for var in vars_to_retrieve):
                self.logger.info("Station {} contains none of the desired "
                                 "variables. Skipping station...".format(
                                     station_data.station_name))
                continue
            stat_code = station_data['stat_code']
            # only create a new metadata entry when the station changes;
            # consecutive files from the same station share one entry
            if last_stat_code != stat_code:
                meta_key += 1
                # Fill the metadata dict
                # the location in the data set is time step dependent!
                # use the lat location here since we have to choose one
                # location in the time series plot
                metadata[meta_key] = od()
                metadata[meta_key].update(station_data.get_meta())
                metadata[meta_key].update(
                    station_data.get_station_coords())
                metadata[meta_key]['dataset_name'] = self.DATASET_NAME
                metadata[meta_key]['variables'] = []
                # this is a list with indices of this station for each
                # variable (used for fast variable-wise access later)
                meta_idx[meta_key] = od()
                last_stat_code = stat_code

            # Is floating point single value
            time = station_data.dtime
            for var_idx, var in enumerate(station_data.contains_vars):
                val = station_data[var]
                if isinstance(val, VerticalProfile):
                    # profile data: one row per altitude level
                    add = len(val)
                    altitude = val.altitude
                    data = val.data
                else:
                    # scalar value: single row, no data altitude
                    add = 1
                    altitude = np.nan
                    data = val
                stop = idx + add
                # check if size of data object needs to be extended
                if stop >= data_obj._ROWNO:
                    # if totnum < data_obj._CHUNKSIZE, then the latter is
                    # used
                    data_obj.add_chunk(add)

                # write common meta info for this station
                data_obj._data[
                    idx:stop,
                    data_obj._LATINDEX] = station_data['latitude']
                data_obj._data[
                    idx:stop,
                    data_obj._LONINDEX] = station_data['longitude']
                data_obj._data[
                    idx:stop,
                    data_obj._ALTITUDEINDEX] = station_data['altitude']
                data_obj._data[idx:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[idx:stop, data_obj._TIMEINDEX] = time
                data_obj._data[idx:stop, data_obj._DATAINDEX] = data
                data_obj._data[idx:stop,
                               data_obj._DATAHEIGHTINDEX] = altitude
                data_obj._data[idx:stop, data_obj._VARINDEX] = var_idx

                if var not in meta_idx[meta_key]:
                    meta_idx[meta_key][var] = []
                meta_idx[meta_key][var].extend(list(range(idx, stop)))

                if var not in metadata[meta_key]['variables']:
                    metadata[meta_key]['variables'].append(var)
                if var not in data_obj.var_idx:
                    data_obj.var_idx[var] = var_idx

                idx += add
        # NOTE(fix): was a bare "except:", which also swallowed
        # KeyboardInterrupt / SystemExit; narrowed to Exception (consistent
        # with the other reader implementations in this file)
        except Exception:
            self.read_failed.append(_file)
            self.logger.exception('Failed to read file {}'.format(
                os.path.basename(_file)))

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj
    return data_obj
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, read_err=None, remove_outliers=True,
         file_pattern=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file in
        the list is used
    read_err : bool
        if True, uncertainty data is also read (where available). If
        unspecified (None), then the default is used (cf. :attr:`READ_ERR`)
    remove_outliers : bool
        if True, outliers are removed in :func:`read_file` (default True)
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if read_err is None:
        read_err = self.READ_ERR

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(vars_to_retrieve,
                               file_pattern=file_pattern)
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    # mapping of column names -> column indices in the data array
    col_idx = data_obj.index
    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_files = len(files)
    # print progress roughly every 10% of the files (at least every file)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    # running counter assigning a unique index to each new variable
    VAR_IDX = -1
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print("Reading file {} of {} ({})".format(i+1, num_files,
                  type(self).__name__))
        try:
            stat = self.read_file(_file,
                                  vars_to_retrieve=vars_to_retrieve,
                                  read_err=read_err,
                                  remove_outliers=remove_outliers)
            if not any(var in stat.contains_vars
                       for var in vars_to_retrieve):
                self.logger.info("Station {} contains none of the desired "
                                 "variables. Skipping station..."
                                 .format(stat.station_name))
                continue
            # each file gets its own metadata entry
            meta_key += 1

            # Fill the metadata dict
            # the location in the data set is time step dependent!
            # use the lat location here since we have to choose one
            # location in the time series plot
            metadata[meta_key] = od()
            metadata[meta_key].update(stat.get_meta())
            for add_meta in self.KEEP_ADD_META:
                if add_meta in stat:
                    metadata[meta_key][add_meta] = stat[add_meta]
            metadata[meta_key]['data_revision'] = self.data_revision
            metadata[meta_key]['variables'] = []
            metadata[meta_key]['var_info'] = od()
            # this is a list with indices of this station for each variable
            # (used for fast variable-wise access later)
            meta_idx[meta_key] = od()

            # Is floating point single value
            time = stat.dtime[0]
            for var in stat.contains_vars:
                if var not in data_obj.var_idx:
                    VAR_IDX += 1
                    data_obj.var_idx[var] = VAR_IDX

                var_idx = data_obj.var_idx[var]

                val = stat[var]

                metadata[meta_key]['var_info'][var] = vi = od()
                if isinstance(val, VerticalProfile):
                    # profile data: one row per altitude level
                    altitude = val.altitude
                    data = val.data
                    add = len(data)
                    err = val.data_err
                    metadata[meta_key]['var_info']['altitude'] = via = od()
                    vi.update(val.var_info[var])
                    via.update(val.var_info['altitude'])
                else:
                    # scalar value: single row, no data altitude
                    add = 1
                    altitude = np.nan
                    data = val
                    # NOTE(fix): original read stat.err[var] after testing
                    # membership in stat.data_err; use the same container
                    # for both (mirrors the profile branch above)
                    if var in stat.data_err:
                        err = stat.data_err[var]
                    else:
                        err = np.nan
                    vi.update(stat.var_info[var])
                stop = idx + add
                # check if size of data object needs to be extended
                if stop >= data_obj._ROWNO:
                    # if totnum < data_obj._CHUNKSIZE, then the latter is
                    # used
                    data_obj.add_chunk(add)

                # write common meta info for this station
                data_obj._data[idx:stop,
                               col_idx['latitude']] = stat['latitude']
                data_obj._data[idx:stop,
                               col_idx['longitude']] = stat['longitude']
                data_obj._data[idx:stop,
                               col_idx['altitude']] = stat['altitude']
                data_obj._data[idx:stop,
                               col_idx['meta']] = meta_key

                # write data to data object
                data_obj._data[idx:stop, col_idx['time']] = time
                data_obj._data[idx:stop,
                               col_idx['stoptime']] = stat.stopdtime[0]
                data_obj._data[idx:stop, col_idx['data']] = data
                data_obj._data[idx:stop,
                               col_idx['dataaltitude']] = altitude
                data_obj._data[idx:stop, col_idx['varidx']] = var_idx

                if read_err:
                    data_obj._data[idx:stop, col_idx['dataerr']] = err

                if var not in meta_idx[meta_key]:
                    meta_idx[meta_key][var] = []
                meta_idx[meta_key][var].extend(list(range(idx, stop)))
                if var not in metadata[meta_key]['variables']:
                    metadata[meta_key]['variables'].append(var)

                idx += add
        except Exception as e:
            self.read_failed.append(_file)
            self.logger.exception('Failed to read file {} (ERR: {})'
                                  .format(os.path.basename(_file),
                                          repr(e)))

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    self.data = data_obj
    return data_obj
def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file in
        the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if len(self.files) == 0:
        self.get_file_list(vars_to_retrieve)
    files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]
    # keep per-file variable lists aligned with the sliced file list
    files_contain = self.files_contain[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_files = len(files)
    # print progress roughly every 10% of the files (at least every file)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    vars_to_read, vars_to_compute = self.check_vars_to_retrieve(
        vars_to_retrieve)
    self.files_failed = []
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print("Reading file {} of {} ({})".format(
                i, num_files, type(self).__name__))
        # restrict reading to the variables actually present in this file
        vars_to_read = files_contain[i]
        try:
            station_data = self.read_file(_file,
                                          _vars_to_read=vars_to_read,
                                          _vars_to_compute=vars_to_compute)
        except (NotInFileError, EbasFileError) as e:
            self.files_failed.append(_file)
            self.logger.warning('Failed to read file {}. '
                                'Error: {}'.format(os.path.basename(_file),
                                                   repr(e)))
            continue

        # Fill the metadata dict
        # the location in the data set is time step dependent!
        # use the lat location here since we have to choose one location
        # in the time series plot
        metadata[meta_key] = od()
        metadata[meta_key].update(station_data.get_meta())
        metadata[meta_key].update(station_data.get_station_coords())
        metadata[meta_key]['dataset_name'] = self.DATASET_NAME
        metadata[meta_key]['ts_type'] = station_data['ts_type']
        metadata[meta_key]['instrument_name'] = station_data[
            'instrument_name']
        metadata[meta_key]['var_info'] = od()
        # this is a list with indices of this station for each variable
        # (used for fast variable-wise access later)
        meta_idx[meta_key] = {}

        num_times = len(station_data['dtime'])

        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all Aerocom data files are of type timeseries)
        times = np.float64(station_data['dtime'])

        totnum = num_times * len(station_data.contains_vars)

        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        vars_avail = station_data.contains_vars
        for var_idx, var in enumerate(vars_avail):
            # register unit on first occurrence; fail loudly on mismatch
            if var not in data_obj.unit:
                data_obj.unit[var] = station_data.unit[var]
            elif station_data.unit[var] != data_obj.unit[var]:
                raise DataUnitError("Unit mismatch")
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['stat_lat']
            # NOTE(fix): original wrote 'stat_lat' into the longitude
            # column (copy-paste bug); use station longitude here
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['stat_lon']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data[
                               'stat_alt']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key

            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            var_info = station_data['var_info'][var]
            metadata[meta_key]['var_info'][var] = var_info.to_dict()

            if var not in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        metadata[meta_key]['variables'] = vars_avail
        idx += totnum
        meta_key = meta_key + 1.

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj = data_obj.merge_common_meta()
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj
    return data_obj