def test_bufr_read(self):
    """Test reading data and data quality on Metop-A MHS BUFR file"""
    test_file = os.path.join(test_dir, "metop_mhs.bufr")
    bfr = bufr.BUFRFile(test_file)
    data = bfr.next()
    self.assertEqual(data[0].name.strip(' '), "TOVS/ATOVS PRODUCT QUALIFIER")
    self.assertEqual(data[0].data[0], 3)
    self.assertEqual(data[0].unit.strip(' '), "CODE TABLE 8070")
    self.assertEqual(data[0].index, 0)
def test_bufr_interate(self):
    """Test reading all data, last entry"""
    test_file = os.path.join(test_dir, "metop_mhs.bufr")
    bfr = bufr.BUFRFile(test_file)
    for i, data in enumerate(bfr):
        pass
    self.assertEqual(i, 3)
    self.assertEqual(data[0].name.strip(' '), "TOVS/ATOVS PRODUCT QUALIFIER")
    self.assertEqual(data[0].data[0], 3)
    self.assertEqual(data[0].unit.strip(' '), "CODE TABLE 8070")
    self.assertEqual(data[0].index, 0)
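The two methods above assume a surrounding unittest harness and a test_dir pointing at the sample BUFR files. A minimal sketch of such a harness is shown below; the directory layout and the TestCase name are assumptions, not taken from the original test suite.

import os
import unittest

import bufr  # python-bufr bindings

# assumed location of sample files such as metop_mhs.bufr (hypothetical)
test_dir = os.path.join(os.path.dirname(__file__), 'test_data')


class BUFRFileTest(unittest.TestCase):
    """Hypothetical TestCase wrapping the two methods defined above."""

    # def test_bufr_read(self): ...      (as above)
    # def test_bufr_interate(self): ...  (as above)


if __name__ == '__main__':
    unittest.main()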
def open(self,
         view=None,
         datamodel_geolocation_dims=None,
         datamodel=None):
    """Open the file (or any other type of storage)

    Args:
        view (dict, optional): a dictionary where keys are dimension
            names and values are slices. A view can be set on a file,
            meaning that only the subset defined by this view will be
            accessible. Any later use of slices in :func:`read_values`
            will be relative to the view defined here.

            This view is expressed as any subset (see
            :func:`read_values`). For example:

            .. code-block:: python

                view = {'time': slice(0, 0),
                        'row': slice(200, 300),
                        'cell': slice(200, 300)}
    """
    self._handler = bufr.BUFRFile(self._url)
    super(BUFRFile, self).open(view=view, datamodel='Swath')
    if datamodel_geolocation_dims:
        self.datamodel_geolocation_dims = datamodel_geolocation_dims
    if datamodel is not None:
        self._feature_type = datamodel
    # force data buffering to get full file structure and content
    fieldnames = []
    units = []
    self._fields = {}
    data = []
    geolocdata = {}
    # read file content
    first = True
    nbrows = 0
    yearidx = None
    monthidx = None
    dayidx = None
    houridx = None
    minuteidx = None
    secondidx = None
    latidx = None
    lonidx = None
    indexes = {}
    duplicated_field_indice = {}
    for record in self._handler:
        nbrows += 1
        if first:
            nbcells = len(record[1].data)
            for entry in record:
                # create valid name
                entryname = (entry.name.lower().replace(' ', '_')
                             .strip('_').strip('*'))
                logging.debug('entry name %s', entryname)
                # identify geolocation information. We assume the first
                # entries met with geolocation names are the correct ones.
                # The next ones with the same names are processed as
                # additional variables
                if entryname == 'year' and yearidx is None:
                    yearidx = entry.index
                    logging.debug('entry index year = %s', yearidx)
                elif entryname == 'month' and monthidx is None:
                    monthidx = entry.index
                elif entryname == 'day' and dayidx is None:
                    dayidx = entry.index
                elif entryname == 'hour' and houridx is None:
                    houridx = entry.index
                elif entryname == 'minute' and minuteidx is None:
                    minuteidx = entry.index
                elif entryname == 'second' and secondidx is None:
                    secondidx = entry.index
                elif 'latitude' in entryname and latidx is None:
                    latidx = entry.index
                    geolocdata['lat'] = [entry.data]
                elif 'longitude' in entryname and lonidx is None:
                    geolocdata['lon'] = [entry.data]
                    lonidx = entry.index
                # decide whether the entry should be a field or an attribute
                elif (entryname not in self.ATTRIBUTE_ENTRIES and
                        entryname not in self.SKIPPED_ENTRIES):
                    if (entryname in fieldnames and
                            entryname not in duplicated_field_indice):
                        # case where different fields share the same name:
                        # we add an incremental number and change the name
                        # of the first occurrence
                        duplicated_field_indice[entryname] = 0
                        prev_occurence = fieldnames.index(entryname)
                        fieldnames[prev_occurence] = '_'.join([
                            entryname,
                            '%d' % duplicated_field_indice[entryname]
                        ])
                        # keep the index cache consistent with the rename
                        indexes[fieldnames[prev_occurence]] = \
                            indexes.pop(entryname)
                    if entryname in duplicated_field_indice:
                        duplicated_field_indice[entryname] += 1
                        entryname = '_'.join([
                            entryname,
                            '%d' % duplicated_field_indice[entryname]
                        ])
                    fieldnames.append(entryname)
                    indexes[entryname] = entry.index
                    # get unit in standard form
                    unit = entry.unit.strip()
                    if unit in UNITS:
                        units.append(UNITS[unit])
                    else:
                        units.append(unit)
                    data.append([entry.data])
                elif entry.name in self.ATTRIBUTE_ENTRIES:
                    self.attributes[entryname] = entry.data[0]
            first = False
            for i in range(nbcells):
                logging.debug('year %s', int(record[yearidx].data[i]))
            geolocdata['time'] = [
                numpy.array([
                    (datetime.datetime(int(record[yearidx].data[i]),
                                       int(record[monthidx].data[i]),
                                       int(record[dayidx].data[i]),
                                       int(record[houridx].data[i]),
                                       int(record[minuteidx].data[i]),
                                       int(record[secondidx].data[i]))
                     - REFERENCE_TIME).total_seconds()
                    for i in range(nbcells)
                ])
            ]
        else:
            # read arrays of data
            current_record_range_length = record[secondidx].data.shape[0]
            if nbcells == current_record_range_length:
                for i, fieldname in enumerate(fieldnames):
                    data[i].append(record[indexes[fieldname]].data)
                geolocdata['lon'].append(record[lonidx].data)
                geolocdata['lat'].append(record[latidx].data)
                geolocdata['time'].append(
                    numpy.array([
                        (datetime.datetime(int(record[yearidx].data[i]),
                                           int(record[monthidx].data[i]),
                                           int(record[dayidx].data[i]),
                                           int(record[houridx].data[i]),
                                           int(record[minuteidx].data[i]),
                                           int(record[secondidx].data[i]))
                         - REFERENCE_TIME).total_seconds()
                        for i in range(nbcells)
                    ]))
    del self._handler
    self._handler = self
    # get dimensions (take into account the view)
    self._dimensions = collections.OrderedDict([('row', nbrows),
                                                ('cell', nbcells)])
    newslices = cerbere.mapper.slices.get_absolute_slices(
        self.view,
        slices=None,
        dimnames=self._dimensions.keys(),
        dimsizes=self._dimensions.values())
    # get fields and cache data
    self._fields = {}
    for i, fieldname in enumerate(fieldnames):
        varobj = variable.Variable(fieldname, fieldname.replace('_', ' '))
        newfield = field.Field(
            variable=varobj,
            dimensions=self._dimensions,
            datatype=numpy.float32,
            values=numpy.ma.masked_equal(
                numpy.vstack(data[i]), self.FILLVALUE)[tuple(newslices)],
            fillvalue=self.FILLVALUE,
            units=units[i],
        )
        self._fields[fieldname] = newfield
    # geolocation
    self._geofields = {}
    varobj = variable.Variable(shortname='lat', description='latitude')
    newfield = field.Field(
        variable=varobj,
        dimensions=self._dimensions,
        datatype=numpy.float32,
        values=numpy.ma.masked_equal(
            numpy.vstack(geolocdata['lat']),
            self.FILLVALUE)[tuple(newslices)],
        fillvalue=self.FILLVALUE,
        units='degrees_north',
    )
    self._geofields['lat'] = newfield
    varobj = variable.Variable(shortname='lon', description='longitude')
    newfield = field.Field(
        variable=varobj,
        dimensions=self._dimensions,
        datatype=numpy.float32,
        values=numpy.ma.masked_equal(
            numpy.vstack(geolocdata['lon']),
            self.FILLVALUE)[tuple(newslices)],
        fillvalue=self.FILLVALUE,
        units='degrees_east',
    )
    self._geofields['lon'] = newfield
    var = variable.Variable(shortname='time', description='time')
    newfield = field.Field(
        variable=var,
        dimensions=self._dimensions,
        datatype=numpy.int32,
        values=numpy.vstack(geolocdata['time'])[tuple(newslices)],
        units=('seconds since %s' % datetime.datetime.strftime(
            REFERENCE_TIME, "%Y-%m-%d %H:%M:%S")))
    self._geofields['time'] = newfield
    return self._handler
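A minimal usage sketch for this mapper follows. The module path, the constructor signature (file path passed as url, as is common for cerbere mappers) and the sample file name are assumptions, not shown in the code above; only attributes that open() actually populates (_dimensions, _fields, _geofields) are inspected.

# a minimal sketch, assuming the class above is importable from a
# cerbere mapper module and that the constructor accepts url=...
from cerbere.mapper.bufrfile import BUFRFile

mapper = BUFRFile(url='metop_mhs.bufr')   # hypothetical sample file
mapper.open()

# dimensions and cached fields built by open()
print(mapper._dimensions)             # OrderedDict([('row', ...), ('cell', ...)])
print(sorted(mapper._fields.keys()))

# geolocation fields built by open()
lat_field = mapper._geofields['lat']
lon_field = mapper._geofields['lon']
time_field = mapper._geofields['time']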
def bufr2netcdf(instr_name, bufr_fn, nc_fn, dburl=None):
    """ Does the actual work of transforming the file """

    # Create file object and connect to database
    bfr = bufr.BUFRFile(bufr_fn)
    try:
        os.remove(nc_fn)
    except OSError:
        pass
    rootgrp = Dataset(nc_fn, 'w', format='NETCDF4')
    conn = None
    if dburl is not None:
        conn = bufrmetadb.BUFRDescDBConn(dburl)
    else:
        conn = bufrmetadb.BUFRDescDBConn()
    instr = conn.get_instrument(instr_name)

    # Get bufr record sections from database
    bstart = instr.bufr_record_start
    bend = instr.bufr_record_end
    transpose = instr.transposed
    logger.debug("start index: %s, end index: %s" % (bstart, bend))
    logger.debug("transposed : %s" % transpose)

    # Read BUFR file keys and get corresponding NetCDF names from
    # the database. Fast forward to record start.
    for i in range(bstart + 1):
        records = bfr.read()

    # Set up accounting for each variable, to be used when writing
    # variables to netcdf.
    bfr_count = []
    for r in records:
        bfr_count.append(0)

    vname_map = conn.get_netcdf_parameters_dict(instr_name)

    # get replication indices; they handle multiple records of the same
    # variable within a bufr subsection.
    replication_indicies = conn.get_replication_indicies(instr_name)

    # Create attributes
    _create_global_attributes(rootgrp, instr)

    # Create dimensions
    _create_dimensions(rootgrp, vname_map)

    #
    # Get list of variables which should be treated as constants and
    # global attributes. Notice these values should be constant for
    # every entry in the scanline, so below we just insert the value
    # from a random scanline
    #
    global_var_attrs = conn.get_netcdf_global_attrs(instr_name)
    for record in records:
        if record.name in global_var_attrs:
            setattr(rootgrp, global_var_attrs[record.name],
                    "%s" % record.data)

    # create variables
    _create_variables(rootgrp, vname_map)

    #
    # This section inserts data into the NetCDF variables
    #
    logger.debug("Closing netcdf handle after setup")
    rootgrp.close()
    logger.debug("Opening netcdf handle after setup")
    rootgrp = Dataset(nc_fn, 'a', format='NETCDF4')

    ##bfr.reset()
    del bfr
    bfr = bufr.BUFRFile(bufr_fn)

    scalars_handled = False

    #
    # Loop through all sections and dump to netcdf
    #
    for count, section in enumerate(bfr):

        # manage record boundaries. In some cases BUFR sections differ
        # within a file. This enables the user to only convert similar
        # sections
        if count < bstart:
            continue
        if bend != -1 and count > bend - 1:
            break

        mysection = section
        if transpose:
            # allocate container for new transposed data
            transposed_section = []

            # SSMIS BUFR SDR records from EUMETSAT are sorted by
            # field-of-view and not by scanline, hence the need to
            # collect and transpose the data.

            # Collect the record indices we need
            indicies = []

            # Structure for keeping track of base entry and replicated entries
            index_groups = {}
            for rec1 in section:
                try:
                    nc_name = vname_map[rec1.index]['netcdf_name']
                    nc_dim_length = \
                        vname_map[rec1.index]['netcdf_dimension_length']
                    index_groups[rec1.index] = []
                    # we need to collect lines matching by bufr
                    # replication; we assume that only the original entry
                    # has a netcdf_name assigned to it.
                    for rec2 in section:
                        linked_index = replication_indicies[rec2.index]
                        if linked_index == rec1.index:
                            if rec2.index not in indicies:
                                indicies.append(rec2.index)
                                index_groups[rec1.index].append(rec2.index)
                            else:
                                # Make sure that we don't have replicated
                                # entries that link to different base entries
                                raise BUFR2NetCDFError(
                                    "Unable to transpose section, "
                                    "ambiguous variable naming")
                except KeyError:
                    pass

            # Collect similar variables into a single array consisting of
            # stacked data rows. If the columns are ordered by
            # field-of-view we need to transpose the entire array to get
            # the data ordered by scanlines. This is the case with SSMIS
            # data from Eumetcast
            for key, index_group in index_groups.iteritems():
                old_entry = section[key]
                new_data = np.vstack(
                    [section[i].data for i in index_group]).transpose()
                # transpose all linked scanlines and hardlink the
                # replication base key
                for scanline in new_data:
                    transposed_section.append(
                        bufr.BUFRFileEntry(old_entry.index,
                                           old_entry.name,
                                           old_entry.unit,
                                           scanline[:nc_dim_length]))

            # reassign whole section to new transposed section
            mysection = transposed_section

        if transpose and (len(section) != len(transposed_section)):
            logger.debug("Different section lengths orig. %s transposed %s" %
                         (len(section), len(transposed_section)))

        for record in mysection:
            if record is None:
                # This record is set to None by the transpose
                # functionality, see above. Just ignore the record and
                # continue
                continue

            # linked index handles BUFR replication factors with multiple
            # data entries in one subsection.
            try:
                linked_index = replication_indicies[record.index]
            except KeyError:
                continue

            # only try to convert variables that define the netcdf_name
            # parameter
            try:
                nc_var = rootgrp.variables[
                    vname_map[linked_index]['netcdf_name']]
            except KeyError:
                continue

            _insert_record(vname_map, nc_var, record, scalars_handled,
                           bfr_count[linked_index], linked_index)

            # This counter determines into which record number of the
            # netcdf variable the data is stored
            bfr_count[linked_index] += 1

        # we have inserted the first bufr section and hence all variables
        # that can be packed into scalars or per-scan vectors should be
        # accounted for
        scalars_handled = True

        # We have inserted all data for this subsection. We need to level
        # out the differences in the unlimited dimension. The differences
        # stem from the bufr replication factor.

        # find the max index of the unlimited dimension
        max_record = max(bfr_count)
        for record in mysection:
            if record is None:
                # record set to None by transpose functionality above
                continue
            # only insert fill data for records with a netcdf_name attribute
            try:
                nc_var = rootgrp.variables[
                    vname_map[record.index]['netcdf_name']]
            except KeyError:
                continue
            fill_rows = max_record - bfr_count[record.index]
            for i in range(fill_rows):
                _insert_record(vname_map, nc_var, record, scalars_handled,
                               bfr_count[record.index], record.index)
                # This counter determines into which record number of the
                # netcdf variable the data is stored
                bfr_count[record.index] += 1
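As a rough illustration, the converter above could be driven as in the sketch below. The instrument key and the file paths are placeholders, and the instrument definition is assumed to already exist in the metadata database accessed through bufrmetadb.BUFRDescDBConn.

from netCDF4 import Dataset

# hypothetical instrument key and paths; 'ssmis' must already be
# described in the BUFR metadata database
bufr2netcdf('ssmis', '/tmp/sample_ssmis.bufr', '/tmp/sample_ssmis.nc')

# quick inspection of the result with the same netCDF4 API used above
nc = Dataset('/tmp/sample_ssmis.nc')
print(nc.dimensions.keys())
print(nc.variables.keys())
nc.close()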
import bufr

f = '/tmp/ascat_20150531_013300_metopa_44686_eps_o_coa_ovw.l2_bufr'
hh = bufr.BUFRFile(f)

var = []
unit = []
data = []
for record in hh:
    for entry in record:
        entryname = (entry.name.lower().replace(' ', '_')
                     .strip('_').strip('*'))
        if entryname not in var:
            var.append(entryname)
            unit.append(entry.unit)
            data.append(entry.data[0])
            print entryname, entry.unit, entry.data[0]
def bufr2netcdf(instr_name, bufr_fn, nc_fn, dburl=None):
    """ Does the actual work of transforming the file """

    # Create file object and connect to database
    ncf = NetCDF.NetCDFFile(nc_fn, 'w')
    bfr = bufr.BUFRFile(bufr_fn)
    conn = None
    if dburl is not None:
        conn = bufr.metadb.BUFRDescDBConn(dburl)
    else:
        conn = bufr.metadb.BUFRDescDBConn()
    instr = conn.get_instrument(instr_name)

    # Get bufr record sections from database
    bstart = instr.bufr_record_start
    bend = instr.bufr_record_end

    bfr.reset()
    # Read BUFR file keys and get corresponding NetCDF names from
    # the database. Fast forward to record start.
    for i in range(bstart + 1):
        records = bfr.read()

    bfr_keys = [r.index for r in records]
    vname_map = {}
    for k in bfr_keys:
        vname_map[k] = conn.get_netcdf_parameters(instr_name, k)

    # Create attributes
    _create_global_attributes(ncf, instr)

    # Create dimensions
    _create_dimensions(ncf, vname_map)

    #
    # Get list of variables which should be treated as constants and
    # global attributes. Notice these values should be constant for
    # every entry in the scanline, so below we just insert the value
    # from a random scanline
    #
    global_var_attrs = conn.get_netcdf_global_attrs(instr_name)
    for record in records:
        if record.name in global_var_attrs:
            print "attribute %s" % record.name
            setattr(ncf, global_var_attrs[record.name],
                    "%s" % record.data)

    # create variables
    _create_variables(ncf, vname_map)

    # close file and reopen in append mode
    ncf.close()
    ncf = NetCDF.NetCDFFile(nc_fn, 'a')

    #
    # Insert data into variables
    #
    bfr.reset()
    bfr.next()
    count = -1
    scalars_handled = False
    for section in bfr:
        count = count + 1
        ##ncf.sync()

        # manage record boundaries...
        if count < bstart:
            continue
        if bend != -1 and count > bend - 1:
            break

        for record in section:
            # only try to convert variables that define the netcdf_name
            # parameter
            try:
                nc_var = ncf.variables[
                    vname_map[record.index]['netcdf_name']]
            except KeyError:
                continue
            _insert_record(vname_map, nc_var, record, scalars_handled, count)

        # we have inserted the first bufr section and hence all variables
        # that can be packed into scalars or per-scan vectors should be
        # accounted for
        scalars_handled = True

    ncf.close()