/
caliop_cube.py
424 lines (338 loc) · 17.7 KB
/
caliop_cube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
import logging
from cis.data_io import hdf as hdf
from cis.data_io.hdf_vd import get_data, VDS
from cis.data_io.products import AProduct
import cis.utils as utils
# CALIOP Level 2 variables stored at mixed (sub-Level-2) resolution: these carry
# an extra trailing dimension, which is collapsed to the standard L2 profile
# resolution on read (see Caliop_L2_cube._get_mixed_resolution_calipso_data).
MIXED_RESOLUTION_VARIABLES = ['Atmospheric_Volume_Description', 'CAD_Score',
                              'Extinction_QC_Flag_1064', 'Extinction_QC_Flag_532']
class Caliop_L2_cube(AProduct):
    """CIS data product plugin reading CALIOP (CALIPSO lidar) Level 2 profile
    files as gridded (time x altitude) data cubes.
    """

    def get_file_signature(self):
        # Match CALIPSO lidar Level 2 product filenames (CAL_LID_L2*...hdf)
        return [r'CAL_LID_L2.*hdf']

    def get_variable_names(self, filenames, data_type=None):
        """Return the SD variable names which have the full 2D profile shape.

        The 'valid' shape (n_profiles, n_altitudes) is derived from the first
        file's 'Latitude' dataset and 'Lidar_Data_Altitudes' vdata; only
        variables matching that shape in the given files are reported.

        :param filenames: list of CALIOP L2 files to inspect
        :param data_type: unused, retained for interface compatibility
        :return: set of variable names
        :raises ImportError: if pyhdf is not installed
        """
        try:
            from pyhdf.SD import SD
        except ImportError:
            raise ImportError("HDF support was not installed, please reinstall with pyhdf to read HDF files.")

        variables = set()

        # Determine the valid shape for variables from the first file
        sd = SD(filenames[0])
        datasets = sd.datasets()
        len_x = datasets['Latitude'][1][0]  # Assumes that latitude shape == longitude shape (it should)
        alt_data = get_data(VDS(filenames[0], "Lidar_Data_Altitudes"), True)
        len_y = alt_data.shape[0]
        valid_shape = (len_x, len_y)

        for filename in filenames:
            sd = SD(filename)
            for var_name, var_info in sd.datasets().items():
                if var_info[1] == valid_shape:
                    variables.add(var_name)

        return variables

    def create_data_object(self, filenames, variable, index_offset=1):
        """Read *variable* from *filenames* and return it as a GriddedData cube.

        The cube has time (profile) and altitude as dimension coordinates, with
        latitude, longitude and pressure attached as auxiliary coordinates.

        :param filenames: list of CALIOP L2 files to read (concatenated in time)
        :param variable: name of the SD variable to read
        :param index_offset: which of the three per-profile latitude/longitude/
            time samples to use (the arrays hold three values per profile;
            the default of 1 selects the middle one)
        :return: a GriddedData object
        :raises IOError: if any file cannot be read as HDF4
        """
        from cis.data_io.hdf_vd import get_data
        from cis.data_io.hdf_vd import VDS
        from pyhdf.error import HDF4Error
        from cis.data_io import hdf_sd
        from iris.coords import DimCoord, AuxCoord
        from iris.cube import Cube
        from cis.data_io.gridded_data import GriddedData
        from cis.time_util import cis_standard_time_unit

        logging.debug("Creating data object for variable " + variable)

        # Coordinate variables read alongside the requested data variable
        variables = ['Latitude', 'Longitude', "Profile_Time", "Pressure"]
        logging.info("Listing coordinates: " + str(variables))
        variables.append(variable)

        # Read the SD datasets from every file, accumulating per-variable lists
        sdata = {}
        for filename in filenames:
            try:
                sds_dict = hdf_sd.read(filename, variables)
            except HDF4Error as e:
                raise IOError(str(e))
            for var in list(sds_dict.keys()):
                utils.add_element_to_list_in_dict(sdata, var, sds_dict[var])

        alt_name = "altitude"
        logging.info("Additional coordinates: '" + alt_name + "'")

        # NOTE: This assumes that all Caliop_L1 files have the same altitudes.
        # If this is not the case, then the following line will need to be changed
        # to concatenate the data from all the files and not just arbitrarily pick
        # the altitudes from the first file.
        alt_data = get_data(VDS(filenames[0], "Lidar_Data_Altitudes"), True)
        alt_coord = DimCoord(alt_data, standard_name='altitude', units='km')
        alt_coord.convert_units('m')

        # Latitude, longitude and pressure become auxiliary coordinates
        lat_data = hdf.read_data(sdata['Latitude'], self._get_calipso_data)[:, index_offset]
        lat_coord = AuxCoord(lat_data, standard_name='latitude')

        pres_data = hdf.read_data(sdata['Pressure'], self._get_calipso_data)
        pres_coord = AuxCoord(pres_data, standard_name='air_pressure', units='hPa')

        lon = sdata['Longitude']
        lon_data = hdf.read_data(lon, self._get_calipso_data)[:, index_offset]
        lon_coord = AuxCoord(lon_data, standard_name='longitude')

        # Profile time is the leading dimension coordinate; the raw values are
        # seconds since 1993-01-01 and are converted to the CIS standard unit
        time = sdata['Profile_Time']
        time_data = hdf.read_data(time, self._get_calipso_data)[:, index_offset]
        time_coord = DimCoord(time_data, long_name='Profile_Time', standard_name='time',
                              units="seconds since 1993-01-01 00:00:00")
        time_coord.convert_units(cis_standard_time_unit)

        # Retrieve the requested data and its metadata
        var = sdata[variable]
        metadata = hdf.read_metadata(var, "SD")

        if variable in MIXED_RESOLUTION_VARIABLES:
            logging.warning("Using Level 2 resolution profile for mixed resolution variable {}. See CALIPSO "
                            "documentation for more details".format(variable))
            data = hdf.read_data(var, self._get_mixed_resolution_calipso_data)
        else:
            data = hdf.read_data(var, self._get_calipso_data)

        cube = Cube(data, long_name=metadata.long_name, units=self.clean_units(metadata.units),
                    dim_coords_and_dims=[(alt_coord, 1), (time_coord, 0)],
                    aux_coords_and_dims=[(lat_coord, (0,)),
                                         (lon_coord, (0,)),
                                         (pres_coord, (0, 1))])
        gd = GriddedData.make_from_cube(cube)
        return gd

    @staticmethod
    def clean_units(units):
        """Map the non-standard unit strings found in CALIOP files onto
        parseable equivalents; unrecognised strings pass through unchanged."""
        lookup = {'NoUnits': '', 'sr^-1km^-1': 'sr^-1 km^-1', 'per kilometer': 'km-1',
                  'per kilometer per steradian': 'km^-1 sr^-1'}
        # Get the units from the lookup, if they're not in there use the original
        return lookup.get(units, units)

    def create_coords(self, filenames, variable=None):
        # This product only supports gridded reading via create_data_object.
        # Fixed: the original raised NotImplemented() - NotImplemented is a
        # sentinel value, not an exception class, so calling it produced a
        # misleading TypeError rather than NotImplementedError.
        raise NotImplementedError()

    def _get_mixed_resolution_calipso_data(self, sds):
        # The first slice of the last dimension corresponds to the higher (in altitude) element of each
        # sub-Level-2-resolution bin.
        return self._get_calipso_data(sds)[:, :, 0]

    def _get_calipso_data(self, sds):
        """
        Reads raw data from an SD instance. Automatically applies the
        scaling factors and offsets to the data arrays found in Calipso data.

        Returns:
            A numpy (masked) array containing the data, with missing values and
            values outside any declared valid range masked.

        Arguments:
            sds -- The specific sds instance to read
        """
        from cis.utils import create_masked_array_for_missing_data
        import numpy as np

        # Per-format fill values; formats whose fill value is variable
        # ('See SDS description') are resolved via the SDS attributes instead.
        calipso_fill_values = {'Float_32': -9999.0,
                               # 'Int_8' : 'See SDS description',
                               'Int_16': -9999,
                               'Int_32': -9999,
                               'UInt_8': -127,
                               # 'UInt_16' : 'See SDS description',
                               # 'UInt_32' : 'See SDS description',
                               'ExtinctionQC Fill Value': 32768,
                               'FeatureFinderQC No Features Found': 32767,
                               'FeatureFinderQC Fill Value': 65535}

        data = sds.get()
        attributes = sds.attributes()

        # Missing data: try the 'fillvalue' attribute first, then a lookup on
        # the declared 'format', and finally the CF-style '_FillValue'.
        missing_val = attributes.get('fillvalue', None)
        if missing_val is None:
            try:
                # Now try and lookup the fill value based on the data type
                missing_val = calipso_fill_values[attributes.get('format', None)]
            except KeyError:
                # Last guess
                missing_val = attributes.get('_FillValue', None)

        if missing_val is not None:
            data = create_masked_array_for_missing_data(data, missing_val)

        # Now handle valid range mask
        valid_range = attributes.get('valid_range', None)
        if valid_range is not None:
            # The range is stored as a single 'low...high' string; split it
            # into two numbers of the data's own type
            v_range = np.asarray(valid_range.split("..."), dtype=data.dtype)
            # Some valid_ranges appear to have only one value, so ignore those...
            if len(v_range) == 2:
                # Fixed message: masked_outside masks values *outside* the
                # range; the old text ("{} > v > {}") described an empty set.
                logging.debug("Masking all values outside the range {} to {}.".format(*v_range))
                data = np.ma.masked_outside(data, *v_range)
            else:
                logging.warning("Invalid valid_range: {}. Not masking values.".format(valid_range))

        # Offsets and scaling.
        offset = attributes.get('add_offset', 0)
        scale_factor = attributes.get('scale_factor', 1)
        data = self._apply_scaling_factor_CALIPSO(data, scale_factor, offset)

        return data

    def _apply_scaling_factor_CALIPSO(self, data, scale_factor, offset):
        """
        Apply scaling factor Calipso data. Note the data are DIVIDED by the
        scale factor (then offset), rather than multiplied.

        This isn't explicitly documented, but is referred to in the CALIOP docs here:
        http://www-calipso.larc.nasa.gov/resources/calipso_users_guide/data_summaries/profile_data.php#cloud_layer_fraction
        And also confirmed by email with jason.l.tackett@nasa.gov

        :param data: the packed data array
        :param scale_factor: divisor used to unpack the data
        :param offset: additive offset applied after scaling
        :return: the unpacked (science) data
        """
        logging.debug("Applying 'science_data = (packed_data / {scale}) + {offset}' "
                      "transformation to data.".format(scale=scale_factor, offset=offset))
        return (data / scale_factor) + offset

    def get_file_format(self, filename):
        """Return the file format identifier for this product."""
        return "HDF4/CaliopL2"
class Caliop_L3_cube(AProduct):
    """CIS data product plugin reading CALIOP (CALIPSO lidar) Level 3 monthly
    gridded files as (time x latitude x longitude [x altitude]) data cubes.
    """

    def get_file_signature(self):
        # Match CALIPSO lidar Level 3 product filenames (CAL_LID_L3*...hdf)
        return [r'CAL_LID_L3.*hdf']

    def get_variable_names(self, filenames, data_type=None):
        """Return the SD variable names which have the full 3D gridded shape.

        The 'valid' shape (n_lat, n_lon, n_alt) is derived from the
        Latitude/Longitude/Altitude _Midpoint datasets of the first file; only
        variables matching that shape in the given files are reported.

        :param filenames: list of CALIOP L3 files to inspect
        :param data_type: unused, retained for interface compatibility
        :return: set of variable names
        :raises ImportError: if pyhdf is not installed
        """
        try:
            from pyhdf.SD import SD
        except ImportError:
            raise ImportError("HDF support was not installed, please reinstall with pyhdf to read HDF files.")

        variables = set()

        # Determine the valid shape for variables from the first file
        sd = SD(filenames[0])
        datasets = sd.datasets()
        len_x = datasets['Latitude_Midpoint'][1][1]
        len_y = datasets['Longitude_Midpoint'][1][1]
        len_z = datasets['Altitude_Midpoint'][1][1]
        valid_shape = (len_x, len_y, len_z)

        for filename in filenames:
            sd = SD(filename)
            for var_name, var_info in sd.datasets().items():
                if var_info[1] == valid_shape:
                    variables.add(var_name)

        return variables

    def create_data_object(self, filenames, variable, index_offset=1):
        """Read *variable* from each monthly L3 file and concatenate over time.

        Each file contributes one time slice (nominally the 15th of its
        'Nominal_Year_Month'); 2D (lat, lon) and 3D (lat, lon, alt) variables
        are both supported, with a length-one time dimension prepended.

        :param filenames: list of CALIOP L3 files to read
        :param variable: name of the SD variable to read
        :param index_offset: unused, retained for interface compatibility
        :return: a GriddedData object
        :raises IOError: if any file cannot be read as HDF4
        :raises ValueError: if the variable is neither 2D nor 3D
        """
        from cis.data_io.hdf_vd import get_data
        from cis.data_io.hdf_vd import VDS
        from pyhdf.error import HDF4Error
        from cis.data_io import hdf_sd
        from iris.coords import DimCoord, AuxCoord
        from iris.cube import Cube, CubeList
        from cis.data_io.gridded_data import GriddedData
        from cis.time_util import cis_standard_time_unit
        from datetime import datetime
        from iris.util import new_axis
        import numpy as np

        logging.debug("Creating data object for variable " + variable)

        # Coordinate variables read alongside the requested data variable
        variables = ["Pressure_Mean"]
        logging.info("Listing coordinates: " + str(variables))
        variables.append(variable)

        # Read the SD datasets from every file, accumulating per-variable lists
        sdata = {}
        for filename in filenames:
            try:
                sds_dict = hdf_sd.read(filename, variables)
            except HDF4Error as e:
                raise IOError(str(e))
            for var in list(sds_dict.keys()):
                utils.add_element_to_list_in_dict(sdata, var, sds_dict[var])

        # NOTE: the grid coordinates are taken from the first file only - this
        # assumes all the files share the same lat/lon/altitude grid.
        alt_data = self._get_calipso_data(hdf_sd.HDF_SDS(filenames[0], 'Altitude_Midpoint'))[0, :]
        alt_coord = DimCoord(alt_data, standard_name='altitude', units='km')
        alt_coord.convert_units('m')

        lat_data = self._get_calipso_data(hdf_sd.HDF_SDS(filenames[0], 'Latitude_Midpoint'))[0, :]
        lat_coord = DimCoord(lat_data, standard_name='latitude', units='degrees_north')

        lon_data = self._get_calipso_data(hdf_sd.HDF_SDS(filenames[0], 'Longitude_Midpoint'))[0, :]
        lon_coord = DimCoord(lon_data, standard_name='longitude', units='degrees_east')

        # The variable's metadata is loop-invariant, so read it once up front
        # (hoisted out of the per-file loop below)
        var = sdata[variable]
        metadata = hdf.read_metadata(var, "SD")

        cubes = CubeList()
        for f in filenames:
            # 'Nominal_Year_Month' is a 'YYYYMM' string; use the 15th as a
            # nominal mid-month timestamp for this file's time slice
            t = get_data(VDS(f, "Nominal_Year_Month"), True)[0]
            time_data = cis_standard_time_unit.date2num(datetime(int(t[0:4]), int(t[4:6]), 15))
            time_coord = AuxCoord(time_data, long_name='Profile_Time', standard_name='time',
                                  units=cis_standard_time_unit)

            data = self._get_calipso_data(hdf_sd.HDF_SDS(f, variable))
            pres_data = self._get_calipso_data(hdf_sd.HDF_SDS(f, 'Pressure_Mean'))
            pres_coord = AuxCoord(pres_data, standard_name='air_pressure', units='hPa')

            if data.ndim == 2:
                cube = Cube(data, long_name=metadata.long_name or variable, units=self.clean_units(metadata.units),
                            dim_coords_and_dims=[(lat_coord, 0), (lon_coord, 1)],
                            aux_coords_and_dims=[(time_coord, ())])
                # Promote the time scalar coord to a length one dimension
                new_cube = new_axis(cube, 'time')
                cubes.append(new_cube)
            elif data.ndim == 3:
                cube = Cube(data, long_name=metadata.long_name or variable, units=self.clean_units(metadata.units),
                            dim_coords_and_dims=[(lat_coord, 0), (lon_coord, 1), (alt_coord, 2)],
                            aux_coords_and_dims=[(time_coord, ())])
                # Promote the time scalar coord to a length one dimension
                new_cube = new_axis(cube, 'time')
                # Then add the (extended) pressure coord so that it is explicitly a function of time
                new_cube.add_aux_coord(pres_coord[np.newaxis, ...], (0, 1, 2, 3))
                cubes.append(new_cube)
            else:
                raise ValueError("Unexpected number of dimensions for CALIOP data: {}".format(data.ndim))

        # Concatenate the cubes from each file into a single GriddedData object
        gd = GriddedData.make_from_cube(cubes.concatenate_cube())
        return gd

    @staticmethod
    def clean_units(units):
        """Map the non-standard unit strings found in CALIOP files onto
        parseable equivalents; unrecognised strings pass through unchanged."""
        lookup = {'NoUnits': '', 'sr^-1km^-1': 'sr^-1 km^-1', 'per kilometer': 'km-1',
                  'per kilometer per steradian': 'km^-1 sr^-1'}
        # Get the units from the lookup, if they're not in there use the original
        return lookup.get(units, units)

    def create_coords(self, filenames, variable=None):
        # This product only supports gridded reading via create_data_object.
        # Fixed: the original raised NotImplemented() - NotImplemented is a
        # sentinel value, not an exception class, so calling it produced a
        # misleading TypeError rather than NotImplementedError.
        raise NotImplementedError()

    def _get_calipso_data(self, sds):
        """
        Reads raw data from an SD instance. Automatically applies the
        scaling factors and offsets to the data arrays found in Calipso data.

        Returns:
            A numpy (masked) array containing the data, with missing values and
            values outside any declared valid range masked.

        Arguments:
            sds -- The specific sds instance to read
        """
        from cis.utils import create_masked_array_for_missing_data
        import numpy as np

        # Per-format fill values; formats whose fill value is variable
        # ('See SDS description') are resolved via the SDS attributes instead.
        calipso_fill_values = {'Float_32': -9999.0,
                               # 'Int_8' : 'See SDS description',
                               'Int_16': -9999,
                               'Int_32': -9999,
                               'UInt_8': -127,
                               # 'UInt_16' : 'See SDS description',
                               # 'UInt_32' : 'See SDS description',
                               'ExtinctionQC Fill Value': 32768,
                               'FeatureFinderQC No Features Found': 32767,
                               'FeatureFinderQC Fill Value': 65535}

        data = sds.get()
        attributes = sds.attributes()

        # Missing data: try the 'fillvalue' attribute first, then a lookup on
        # the declared 'format', and finally the CF-style '_FillValue'.
        missing_val = attributes.get('fillvalue', None)
        if missing_val is None:
            try:
                # Now try and lookup the fill value based on the data type
                missing_val = calipso_fill_values[attributes.get('format', None)]
            except KeyError:
                # Last guess
                missing_val = attributes.get('_FillValue', None)

        if missing_val is not None:
            data = create_masked_array_for_missing_data(data, missing_val)

        # Now handle valid range mask
        valid_range = attributes.get('valid_range', None)
        if valid_range is not None:
            # The range is stored as a single 'low...high' string; split it
            # into two numbers of the data's own type
            v_range = np.asarray(valid_range.split("..."), dtype=data.dtype)
            # Some valid_ranges appear to have only one value, so ignore those...
            if len(v_range) == 2:
                # Fixed message: masked_outside masks values *outside* the
                # range; the old text ("{} > v > {}") described an empty set.
                logging.debug("Masking all values outside the range {} to {}.".format(*v_range))
                data = np.ma.masked_outside(data, *v_range)
            else:
                logging.warning("Invalid valid_range: {}. Not masking values.".format(valid_range))

        # Offsets and scaling.
        offset = attributes.get('add_offset', 0)
        scale_factor = attributes.get('scale_factor', 1)
        data = self._apply_scaling_factor_CALIPSO(data, scale_factor, offset)

        return data

    def _apply_scaling_factor_CALIPSO(self, data, scale_factor, offset):
        """
        Apply scaling factor Calipso data. Note the data are DIVIDED by the
        scale factor (then offset), rather than multiplied.

        This isn't explicitly documented, but is referred to in the CALIOP docs here:
        http://www-calipso.larc.nasa.gov/resources/calipso_users_guide/data_summaries/profile_data.php#cloud_layer_fraction
        And also confirmed by email with jason.l.tackett@nasa.gov

        :param data: the packed data array
        :param scale_factor: divisor used to unpack the data
        :param offset: additive offset applied after scaling
        :return: the unpacked (science) data
        """
        logging.debug("Applying 'science_data = (packed_data / {scale}) + {offset}' "
                      "transformation to data.".format(scale=scale_factor, offset=offset))
        return (data / scale_factor) + offset

    def get_file_format(self, filename):
        """Return the file format identifier for this product.

        Fixed: this Level 3 product previously returned "HDF4/CaliopL2",
        an apparent copy-paste from the L2 class above.
        """
        return "HDF4/CaliopL3"