def test_cgrid_init_roms_depths(self):
    datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == "cgrid"

    # u grid
    coords = pd.get_coord_dict("u")
    assert str(pd._coordcache["u"]) == "[XY][Z][T]"
    names = pd.get_coord_names("u")
    assert names["tname"] == "ocean_time"
    assert names["zname"] == "s_rho"
    assert names["xname"] == "lon_u"
    assert names["yname"] == "lat_u"

    # v grid
    coords = pd.get_coord_dict("v")
    assert str(pd._coordcache["v"]) == "[XY][Z][T]"
    names = pd.get_coord_names("v")
    assert names["tname"] == "ocean_time"
    assert names["zname"] == "s_rho"
    assert names["xname"] == "lon_v"
    assert names["yname"] == "lat_v"

    # rho grid
    coords = pd.get_coord_dict("h")
    assert str(pd._coordcache["h"]) == "[XY]"
    names = pd.get_coord_names("h")
    assert names["tname"] is None
    assert names["zname"] is None
    assert names["xname"] == "lon_rho"
    assert names["yname"] == "lat_rho"

    pd.closenc()
def test_rgrid_fluid_var_bbox_time(self):
    datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == 'rgrid'
    newbbox = np.asarray(pd.getbbox("u")) - 1
    test = pd.restrict_vars("u").restrict_bbox(newbbox).nearest_time(datetime(2012, 3, 30, 4, tzinfo=pytz.utc))
    assert "v" not in set(test._current_variables)
    assert test.getbbox("u")[2] <= newbbox[2]
    assert test.getbbox("u")[3] <= newbbox[3]
    assert test.gettimebounds("u")[0] == datetime(2012, 3, 30, 4, 0, tzinfo=pytz.utc)
    assert test.gettimebounds("u")[1] == datetime(2012, 3, 30, 4, 0, tzinfo=pytz.utc)
    pd.closenc()
def test_aggregated_dataset(self):
    datafile = os.path.join(data_path, "pws_das_20140126*.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == 'rgrid'
    values = pd.get_values(var="u", bbox=[-149, 59, -144, 61.5], timeinds=0)
    assert values.size > 0
def load_dataset(self):
    self.cd = CommonDataset.open(self.service.get('url'))
    self.std_variables = None
    self.non_std_variables = None
    self.get_standards(self.cd)
    self.axis_names = DapHarvester.get_axis_variables(self.cd.nc)
    self.messages = []
    return self.cd
def test_cgrid_init(self):
    url = "http://testbedapps-dev.sura.org/thredds/dodsC/estuarine_hypoxia/chesroms/agg-1991.nc"
    pd = CommonDataset.open(url)
    assert pd._datasettype == 'cgrid'
    coords = pd.get_coord_dict('u')
    assert str(pd._coordcache['u']) == "[XY][Z][T]"
    names = pd.get_names('u')
    assert names["tname"] == "time"
    assert names["zname"] == "s_rho"
    assert names["xname"] == "lon_u"
    assert names["yname"] == "lat_u"
    pd.closenc()
def load_initial_dataset(self):
    """
    Initialize self.dataset, then close it.

    A cacher will have to wrap this in locks, while a straight
    runner will not.
    """
    try:
        self.dataset = CommonDataset.open(self.hydrodataset)
        if self.timevar is None:
            self.timevar = self.dataset.gettimevar(self.common_variables.get("u"))
    except Exception:
        logger.warn("No source dataset: %s. Particle exiting" % self.hydrodataset)
        raise
def test_ncell_init(self):
    url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/usf/fvcom/rita/ultralite/vardrag/nowave/3d"
    pd = CommonDataset.open(url)
    assert pd._datasettype == 'ncell'
    varname = pd.get_varname_from_stdname('sea_surface_height_above_geoid')
    assert varname == "zeta"
    names = pd.get_names(varname)
    assert names["tname"] == "time"
    assert names["zname"] is None
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def test_rgrid_fluid_var_bbox(self):
    datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == "rgrid"
    newbbox = np.asarray(pd.getbbox("u")) - 1
    test = pd.restrict_vars("u").restrict_bbox(newbbox)
    assert "v" not in set(test._current_variables)
    assert test.getbbox("u")[2] <= newbbox[2]
    assert test.getbbox("u")[3] <= newbbox[3]
    pd.closenc()
def test_fluid_test(self):
    url = "http://thredds.axiomalaska.com/thredds/dodsC/PWS_DAS.nc"
    pd = CommonDataset.open(url)
    assert pd._datasettype == 'rgrid'
    newbbox = np.asarray(pd.getbbox("u")) - 1
    test = pd.restrict_vars("u").restrict_bbox(newbbox).restrict_depth((3, 50)).nearest_time(datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc))
    assert "v" not in set(test._current_variables)
    assert test.getbbox("u")[2] <= newbbox[2]
    assert test.getbbox("u")[3] <= newbbox[3]
    assert test.getdepthbounds("u")[0] >= 3
    assert test.getdepthbounds("u")[1] <= 50
    assert test.gettimebounds("u")[0] == datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc)
    assert test.gettimebounds("u")[1] == datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc)
def test_rgrid_init_ncom_surface(self):
    datafile = os.path.join(data_path, "ncom_glb_sfc8_hind_2012033100.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == 'rgrid'
    coords = pd.get_coord_dict('water_u')
    assert str(pd._coordcache['water_u']) == "[XY][T]"
    names = pd.get_coord_names('water_u')
    assert names["tname"] == "time"
    assert names["zname"] is None
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def __init__(self, **kwargs):
    """
    Optional named arguments:
    * file (local path or dap to bathymetry netcdf file)
    """
    if kwargs.get("file", None) is not None:
        self._file = os.path.normpath(kwargs.pop('file'))
    else:
        raise ValueError("Must provide a path to the Bathymetry file")

    self._type = kwargs.pop("type", "hover")
    self._nc = CommonDataset.open(self._file)
    self._bathy_name = kwargs.pop("bathy", "z")
def test_cgrid_init_pom_depths(self):
    datafile = os.path.join(data_path, "m201310100.out3.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == "cgrid"
    coords = pd.get_coord_dict("u")
    assert str(pd._coordcache["u"]) == "[XY][Z][T]"
    names = pd.get_coord_names("u")
    assert names["tname"] == "time"
    assert names["zname"] == "sigma"
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def test_rgrid_init_hfradar_surface(self):
    datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == "rgrid"
    coords = pd.get_coord_dict("u")
    assert str(pd._coordcache["u"]) == "[XY][T]"
    names = pd.get_coord_names("u")
    assert names["tname"] == "time"
    assert names["zname"] is None
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def test_bounding_polygon_rgrid(self):
    datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
    pd = CommonDataset.open(datafile)
    bp = pd.getboundingpolygon("u")
    assert isinstance(bp, Polygon)
    bbox = pd.getbbox("u")
    shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
    # Shrink some and test if within bbox
    assert bp.buffer(-0.01).within(shape)
    # Expand to encompass the bbox
    assert bp.buffer(1).contains(shape)
    pd.closenc()
def test_rgrid_init_pws_depths(self):
    datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == 'rgrid'
    coords = pd.get_coord_dict('u')
    assert str(pd._coordcache['u']) == "[XY][Z][T]"
    names = pd.get_coord_names('u')
    assert names["tname"] == "time"
    assert names["zname"] == "depth"
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def test_rgrid_regrid_4d(self): from paegan.utils.asainterpolate import create_grid datafile = os.path.join(data_path, "pws_L2_2012040100.nc") pd = CommonDataset.open(datafile) assert pd._datasettype == 'rgrid' var = "u" lon = [-148.25, -148.24, -148.23, -148.22, -148.21, -148.2, -148.19, -148.18, -148.17, -148.16, -148.15, -148.14, -148.13, -148.12, -148.11, -148.1, -148.09, -148.08, -148.07, -148.06, -148.05, -148.04, -148.03, -148.02, -148.01, -148.0, -147.99, -147.98, -147.97, -147.96, -147.95, -147.94, -147.93, -147.92, -147.91, -147.9, -147.89, -147.88, -147.87, -147.86, -147.85, -147.84, -147.83, -147.82, -147.81, -147.8, -147.79, -147.78, -147.77, -147.76, -147.75, -147.74, -147.73, -147.72, -147.71, -147.7, -147.69, -147.68, -147.67, -147.66, -147.65, -147.64, -147.63, -147.62, -147.61, -147.6, -147.59, -147.58, -147.57, -147.56, -147.55, -147.54, -147.53, -147.52, -147.51, -147.5, -147.49, -147.48, -147.47, -147.46, -147.45, -147.44, -147.43, -147.42, -147.41, -147.4, -147.39, -147.38, -147.37, -147.36, -147.35, -147.34, -147.33, -147.32, -147.31, -147.3, -147.29, -147.28, -147.27, -147.26, -147.25, -147.24, -147.23, -147.22, -147.21, -147.2, -147.19, -147.18, -147.17, -147.16, -147.15, -147.14, -147.13, -147.12, -147.11, -147.1, -147.09, -147.08, -147.07, -147.06, -147.05, -147.04, -147.03, -147.02, -147.01, -147.0, -146.99, -146.98, -146.97, -146.96, -146.95, -146.94, -146.93, -146.92, -146.91, -146.9, -146.89, -146.88, -146.87, -146.86, -146.85, -146.84, -146.83, -146.82, -146.81, -146.8, -146.79, -146.78, -146.77, -146.76, -146.75, -146.74, -146.73, -146.72, -146.71, -146.7, -146.69, -146.68, -146.67, -146.66, -146.65, -146.64, -146.63, -146.62, -146.61, -146.6, -146.59, -146.58, -146.57, -146.56, -146.55, -146.54, -146.53, -146.52, -146.51, -146.5, -146.49, -146.48, -146.47, -146.46, -146.45, -146.44, -146.43, -146.42, -146.41, -146.4, -146.39, -146.38, -146.37, -146.36, -146.35, -146.34, -146.33, -146.32, -146.31, -146.3, -146.29, -146.28, -146.27, -146.26, -146.25, -146.24, -146.23, -146.22, -146.21, -146.2, -146.19, -146.18, -146.17, -146.16, -146.15, -146.14, -146.13, -146.12, -146.11, -146.1, -146.09, -146.08, -146.07, -146.06, -146.05, -146.04, -146.03, -146.02, -146.01, -146.0, -145.99, -145.98, -145.97, -145.96, -145.95, -145.94, -145.93, -145.92, -145.91, -145.9, -145.89, -145.88, -145.87, -145.86, -145.85, -145.84, -145.83, -145.82, -145.81, -145.8, -145.79, -145.78, -145.77, -145.76, -145.75, -145.74, -145.73, -145.72, -145.71, -145.7, -145.69, -145.68, -145.67, -145.66, -145.65, -145.64, -145.63, -145.62, -145.61, -145.6, -145.59, -145.58, -145.57, -145.56, -145.55, -145.54, -145.53, -145.52, -145.51, -145.5, -145.49, -145.48, -145.47, -145.46, -145.45, -145.44, -145.43, -145.42, -145.41, -145.4, -145.39, -145.38, -145.37, -145.36, -145.35, -145.34, -145.33, -145.32, -145.31, -145.3, -145.29, -145.28, -145.27, -145.26, -145.25, -145.24, -145.23, -145.22, -145.21, -145.2, -145.19, -145.18, -145.17, -145.16, -145.15, -145.14, -145.13, -145.12, -145.11, -145.1, -145.09, -145.08, -145.07, -145.06, -145.05, -145.04, -145.03, -145.02, -145.01, -145.0, -144.99, -144.98, -144.97, -144.96, -144.95, -144.94, -144.93, -144.92, -144.91, -144.9, -144.89, -144.88, -144.87, -144.86, -144.85, -144.84, -144.83, -144.82, -144.81, -144.8, -144.79] lat = [59.68, 59.69, 59.7, 59.71, 59.72, 59.73, 59.74, 59.75, 59.760002, 59.77, 59.78, 59.79, 59.8, 59.81, 59.82, 59.83, 59.84, 59.85, 59.86, 59.87, 59.88, 59.89, 59.9, 59.91, 59.920002, 59.93, 59.94, 59.95, 59.96, 59.97, 59.98, 59.99, 
60.0, 60.010002, 60.02, 60.03, 60.04, 60.05, 60.06, 60.07, 60.08, 60.09, 60.1, 60.11, 60.12, 60.13, 60.14, 60.15, 60.16, 60.170002, 60.18, 60.19, 60.2, 60.21, 60.22, 60.23, 60.24, 60.25, 60.260002, 60.27, 60.28, 60.29, 60.3, 60.31, 60.32, 60.33, 60.34, 60.35, 60.36, 60.37, 60.38, 60.39, 60.4, 60.41, 60.420002, 60.43, 60.44, 60.45, 60.46, 60.47, 60.48, 60.49, 60.5, 60.510002, 60.52, 60.53, 60.54, 60.55, 60.56, 60.57, 60.58, 60.59, 60.6, 60.61, 60.62, 60.63, 60.64, 60.65, 60.66, 60.670002, 60.68, 60.69, 60.7, 60.71, 60.72, 60.73, 60.74, 60.75, 60.760002, 60.77, 60.78, 60.79, 60.8, 60.81, 60.82, 60.83, 60.84, 60.85, 60.86, 60.87, 60.88, 60.89, 60.9, 60.91, 60.920002, 60.93, 60.94, 60.95, 60.96, 60.97, 60.98, 60.99, 61.0, 61.010002, 61.02, 61.03, 61.04, 61.05, 61.06, 61.07, 61.08, 61.09, 61.1, 61.11, 61.12, 61.13, 61.14, 61.15, 61.16, 61.170002, 61.18, 61.19, 61.2] lon, lat = np.asarray(lon), np.asarray(lat) data1 = pd.get_values(var, bbox = (-149, 59, -144, 61.5)) coords_struct = pd.sub_coords(var, bbox = (-149, 59, -144, 61.5)) data2 = pd.get_values_on_grid(var, coords_struct.x, coords_struct.y, t=coords_struct.time, z=coords_struct.z) pd.closenc() assert np.all(data1 == data2)
def test_slosh_test(self):
    url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/und/slosh/ike/egl3/swi"
    pd = CommonDataset.open(url)
    assert pd._datasettype == 'cgrid'
    grid = pd.getgridobj('eta')
    box = [i - 1 for i in grid.bbox]
    vals = pd.get_values('eta', bbox=box, zinds=1, timeinds=1)
    assert vals.shape[0] == 133 and vals.shape[1] == 72
    names = pd.get_names('eta')
    assert names["tname"] == "time"
    assert names["zname"] is None
    assert names["xname"] == "lon"
    assert names["yname"] == "lat"
    pd.closenc()
def calc(self):
    """ Compute bounds for this dataset """
    try:
        nc = CommonDataset.open(self.location)

        matches = nc.get_varname_from_stdname("eastward_sea_water_velocity")
        matches = matches + nc.get_varname_from_stdname("eastward_current")
        query_var = matches[0]

        # Set BBOX
        minx, miny, maxx, maxy = nc.getbbox(var=query_var)
        self.bbox = unicode(box(minx, miny, maxx, maxy).wkt)

        # Set Bounding Polygon
        poly = nc.getboundingpolygon(var=query_var)
        self.geometry = unicode(poly.wkt)

        # Set Time bounds
        mintime, maxtime = nc.gettimebounds(var=query_var)
        self.starting = mintime
        self.ending = maxtime

        def clean(value):
            try:
                str(type(value)).index("numpy")
            except:
                return value
            else:
                return value.tolist()

        cleaned_info = {}
        variables = nc.getvariableinfo()
        for k, v in variables.items():
            # Strip out numpy arrays into BSON encodable things.
            cleaned_var = {key: clean(value) for key, value in v.items()}
            cleaned_info[k] = cleaned_var
        self.variables = cleaned_info
    except:
        app.logger.warning("Could not calculate bounds for this dataset")
        raise
def load_initial_dataset(self):
    """
    Initialize self.dataset, then close it.

    A cacher will have to wrap this in locks, while a straight
    runner will not.
    """
    try:
        with self.read_lock:
            self.read_count.value += 1
            self.has_read_lock.append(os.getpid())
        self.dataset = CommonDataset.open(self.hydrodataset)
        self.dataset.closenc()
    except Exception:
        logger.warn("No source dataset: %s. Particle exiting" % self.hydrodataset)
        raise
    finally:
        with self.read_lock:
            self.read_count.value -= 1
            self.has_read_lock.remove(os.getpid())
def test_bounding_polygon_roms_cgrid(self):
    datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
    pd = CommonDataset.open(datafile)

    bp = pd.getboundingpolygon("u")
    assert isinstance(bp, Polygon)
    bbox = pd.getbbox("u")
    shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
    # Shrink some and test if within bbox
    assert bp.buffer(-0.01).within(shape)

    bp = pd.getboundingpolygon("h")
    assert isinstance(bp, Polygon)
    bbox = pd.getbbox("h")
    shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
    # Shrink some and test if within bbox
    assert bp.buffer(-0.01).within(shape)

    pd.closenc()
def setup_run(self, **kwargs):

    logger.setLevel(logging.PROGRESS)

    self.redis_url = None
    self.redis_log_channel = None
    self.redis_results_channel = None
    if "redis" in kwargs.get("output_formats", []):
        from paegan.logger.redis_handler import RedisHandler
        self.redis_url = kwargs.get("redis_url")
        self.redis_log_channel = kwargs.get("redis_log_channel")
        self.redis_results_channel = kwargs.get("redis_results_channel")
        rhandler = RedisHandler(self.redis_log_channel, self.redis_url)
        rhandler.setLevel(logging.PROGRESS)
        logger.addHandler(rhandler)

    # Relax.
    time.sleep(0.5)

    # Add ModelController description to logfile
    logger.info(unicode(self))

    # Add the model descriptions to logfile
    for m in self._models:
        logger.info(unicode(m))

    # Calculate the model timesteps
    # We need times = len(self._nstep) + 1 since data is stored one timestep
    # after a particle is forced with the final timestep's data.
    self.times = range(0, (self._step * self._nstep) + 1, self._step)
    # Calculate a datetime object for each model timestep
    # This method is duplicated in CachingDataController and CachingForcer
    # using the 'times' variables above.  Will be useful in those other
    # locations for particles released at different times
    # i.e. released over a few days
    self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start)

    logger.progress((1, "Setting up particle start locations"))
    point_locations = []
    if isinstance(self.geometry, Point):
        point_locations = [self.reference_location] * self._npart
    elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
        point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

    # Initialize the particles
    logger.progress((2, "Initializing particles"))
    for x in xrange(0, self._npart):
        p = LarvaParticle(id=x)
        p.location = point_locations[x]
        # We don't need to fill the location gaps here for environment variables
        # because the first data collected actually relates to this original
        # position.
        # We do need to fill in fields such as settled, halted, etc.
        p.fill_status_gap()
        # Set the initial note
        p.note = p.outputstring()
        p.notes.append(p.note)
        self.particles.append(p)

    if kwargs.get("manager", True):
        # Get the number of cores (may take some tuning) and create that
        # many workers then pass particles into the queue for the workers
        self.mgr = multiprocessing.Manager()

        # This tracks if the system is 'alive'.  Most looping whiles will check this
        # and break out if it is False.  This is True until something goes very wrong.
        self.active = self.mgr.Value('bool', True)

        # Each particle is a task, plus the CachingDataController
        self.number_of_tasks = self.get_number_of_tasks()

        # Either spin up the number of cores, or the number of tasks
        self.nproc = min(multiprocessing.cpu_count() - 1, self.number_of_tasks)

        # Number of tasks that we need to run.  This is decremented every time something exits.
        self.n_run = self.mgr.Value('int', self.number_of_tasks)

        # The lock that controls access to the 'n_run' variable
        self.nproc_lock = self.mgr.Lock()

        # Create the task queue for all of the particles and the CachingDataController
        self.tasks = multiprocessing.JoinableQueue(self.number_of_tasks)

        # Create the result queue for all of the particles and the CachingDataController
        self.results = self.mgr.Queue(self.number_of_tasks)

    logger.progress((3, "Initializing and caching hydro model's grid"))
    try:
        ds = CommonDataset.open(self.hydrodataset)
    except Exception:
        logger.exception("Failed to access dataset %s" % self.hydrodataset)
        raise BaseDataControllerError("Inaccessible Dataset: %s" % self.hydrodataset)

    # Query the dataset for common variable names
    # and the time variable.
    logger.debug("Retrieving variable information from dataset")
    self.common_variables = self.get_common_variables_from_dataset(ds)

    self.timevar = None
    try:
        assert self.common_variables.get("u") in ds._current_variables
        assert self.common_variables.get("v") in ds._current_variables
        assert self.common_variables.get("x") in ds._current_variables
        assert self.common_variables.get("y") in ds._current_variables

        self.timevar = ds.gettimevar(self.common_variables.get("u"))
    except AssertionError:
        logger.exception("Could not locate variables needed to run model: %s" % unicode(self.common_variables))
        raise BaseDataControllerError("A required data variable was not found in %s" % self.hydrodataset)

    model_start = self.timevar.get_dates()[0]
    model_end = self.timevar.get_dates()[-1]

    try:
        assert self.start > model_start
        assert self.start < model_end
    except AssertionError:
        raise BaseDataControllerError("Start time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[0], model_start, model_end))

    try:
        assert self.datetimes[-1] > model_start
        assert self.datetimes[-1] < model_end
    except AssertionError:
        raise BaseDataControllerError("End time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[-1], model_start, model_end))

    ds.closenc()
# ##### Get bounding polygons from each dataset

# <codecell>

from paegan.cdm.dataset import CommonDataset

lookup_standard_name = "sea_water_temperature"

# Filter out DAP servers that are taking FOREVER
dap_urls = [url for url in dap_urls if "data1.gfdl.noaa.gov" not in url]

dataset_polygons = {}
for i, dap in enumerate(dap_urls):
    print '(%d/%s)' % (i + 1, len(dap_urls)),
    try:
        cd = CommonDataset.open(dap)
    except BaseException:
        print "Could not access", dap
        # Skip to the next URL so we don't reuse a stale dataset object below
        continue
    try:
        var = cd.get_varname_from_stdname(standard_name=lookup_standard_name)[0]
        dataset_polygons[dap] = cd.getboundingpolygon(var=var)
        print "Retrieved bounding polygon from %s" % dap
    except (IndexError, AssertionError):
        print "No standard_name '%s' in '%s'" % (lookup_standard_name, dap)

# <markdowncell>

# ##### Overlay dataset polygons on top of Important Bird Area polygons
def setup_run(self, hydrodataset, **kwargs):

    self.hydrodataset = hydrodataset

    logger.setLevel(logging.PROGRESS)

    # Relax.
    time.sleep(0.5)

    # Add ModelController description to logfile
    logger.info(str(self))

    # Add the model descriptions to logfile
    for m in self._models:
        logger.info(str(m))

    # Calculate the model timesteps
    # We need times = len(self._nstep) + 1 since data is stored one timestep
    # after a particle is forced with the final timestep's data.
    self.times = list(range(0, (self._step * self._nstep) + 1, self._step))
    # Calculate a datetime object for each model timestep
    # This method is duplicated in CachingDataController and CachingForcer
    # using the 'times' variables above.  Will be useful in those other
    # locations for particles released at different times
    # i.e. released over a few days
    self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start)

    logger.progress((1, "Setting up particle start locations"))
    point_locations = []
    if isinstance(self.geometry, Point):
        point_locations = [self.reference_location] * self._npart
    elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
        point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

    # Initialize the particles
    logger.progress((2, "Initializing particles"))
    for x in range(0, self._npart):
        p = LarvaParticle(id=x)
        p.location = point_locations[x]
        # We don't need to fill the location gaps here for environment variables
        # because the first data collected actually relates to this original
        # position.
        # We do need to fill in fields such as settled, halted, etc.
        p.fill_status_gap()
        # Set the initial note
        p.note = p.outputstring()
        p.notes.append(p.note)
        self.particles.append(p)

    logger.progress((3, "Initializing and caching hydro model's grid %s" % self.hydrodataset))
    try:
        ds = CommonDataset.open(self.hydrodataset)
        # Query the dataset for common variable names
        # and the time variable.
        logger.debug("Retrieving variable information from dataset")
        self.common_variables = self.get_common_variables_from_dataset(ds)
    except Exception:
        logger.exception("Failed to access dataset %s" % self.hydrodataset)
        raise BaseDataControllerError("Inaccessible Dataset: %s" % self.hydrodataset)

    self.timevar = None
    try:
        assert self.common_variables.get("u") in ds._current_variables
        assert self.common_variables.get("v") in ds._current_variables
        assert self.common_variables.get("x") in ds._current_variables
        assert self.common_variables.get("y") in ds._current_variables

        self.timevar = ds.gettimevar(self.common_variables.get("u"))
        model_start = self.timevar.get_dates()[0]
        model_end = self.timevar.get_dates()[-1]
    except AssertionError:
        logger.exception("Could not locate variables needed to run model: %s" % str(self.common_variables))
        raise BaseDataControllerError("A required data variable was not found in %s" % self.hydrodataset)
    finally:
        ds.closenc()

    try:
        assert self.start > model_start
        assert self.start < model_end
    except AssertionError:
        raise BaseDataControllerError("Start time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[0], model_start, model_end))

    try:
        assert self.datetimes[-1] > model_start
        assert self.datetimes[-1] < model_end
    except AssertionError:
        raise BaseDataControllerError("End time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[-1], model_start, model_end))
def __call__(self, proc, active):

    self.active = active

    if self.usebathy == True:
        self._bathymetry = Bathymetry(file=self.bathy)

    self._shoreline = None
    if self.useshore == True:
        self._shoreline = Shoreline(file=self.shoreline_path, point=self.release_location_centroid, spatialbuffer=0.25)
        # Make sure we are not starting on land.  Raises exception if we are.
        self._shoreline.intersect(start_point=self.release_location_centroid, end_point=self.release_location_centroid)

    self.proc = proc
    part = self.part

    if self.active.value == True:
        while self.get_data.value == True:
            logger.debug("Waiting for DataController to start...")
            timer.sleep(10)

    # Initialize commondataset of local cache, then
    # close the related netcdf file
    try:
        with self.read_lock:
            self.read_count.value += 1
            self.has_read_lock.append(os.getpid())
        self.dataset = CommonDataset.open(self.localpath)
        self.dataset.closenc()
    except StandardError:
        logger.warn("No cache file: %s. Particle exiting" % self.localpath)
        raise
    finally:
        with self.read_lock:
            self.read_count.value -= 1
            self.has_read_lock.remove(os.getpid())

    # Calculate datetime at every timestep
    modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time)

    # Load Timevar from pickle serialization
    f = open(self.timevar_pickle_path, "rb")
    timevar = pickle.load(f)
    f.close()

    if self.time_method == 'interp':
        time_indexs = timevar.nearest_index(newtimes, select='before')
    elif self.time_method == 'nearest':
        time_indexs = timevar.nearest_index(newtimes)
    else:
        logger.warn("Method for computing u,v,w,temp,salt not supported!")

    try:
        assert len(newtimes) == len(time_indexs)
    except AssertionError:
        logger.error("Time indexes are messed up. Need to have equal datetime and time indexes")
        raise

    # loop over timesteps
    # We don't loop over the last time_index because
    # we need to query in the time_index and set the particle's
    # location as the 'newtime' object.
    for loop_i, i in enumerate(time_indexs[0:-1]):

        if self.active.value == False:
            raise ValueError("Particle exiting due to Failure.")

        newloc = None

        # if need a time that is outside of what we have
        #if self.active.value == True:
        #    while self.get_data.value == True:
        #        logger.info("Waiting for DataController to get out...")
        #        timer.sleep(4)
        #        pass

        # Get the variable data required by the models
        if self.time_method == 'nearest':
            u, v, w, temp, salt = self.data_nearest(i, newtimes[loop_i])
        elif self.time_method == 'interp':
            u, v, w, temp, salt = self.data_interp(i, timevar, newtimes[loop_i])
        else:
            logger.warn("Method for computing u,v,w,temp,salt not supported!")

        #logger.info("U: %.4f, V: %.4f, W: %.4f" % (u,v,w))
        #logger.info("Temp: %.4f, Salt: %.4f" % (temp,salt))

        # Get the bathy value at the particles location
        if self.usebathy == True:
            bathymetry_value = self._bathymetry.get_depth(part.location)
        else:
            bathymetry_value = -999999999999999

        # Age the particle by the modelTimestep (seconds)
        # 'Age' meaning the amount of time it has been forced.
        part.age(seconds=modelTimestep[loop_i])

        # loop over models - sort these in the order you want them to run
        for model in self.models:
            movement = model.move(part, u, v, w, modelTimestep[loop_i], temperature=temp, salinity=salt, bathymetry_value=bathymetry_value)
            newloc = Location4D(latitude=movement['latitude'], longitude=movement['longitude'], depth=movement['depth'], time=newtimes[loop_i + 1])
            logger.debug("%s - moved %.3f meters (horizontally) and %.3f meters (vertically) by %s with data from %s" % (part.logstring(), movement['distance'], movement['vertical_distance'], model.__class__.__name__, newtimes[loop_i].isoformat()))
            if newloc:
                self.boundary_interaction(particle=part, starting=part.location, ending=newloc, distance=movement['distance'], angle=movement['angle'], azimuth=movement['azimuth'], reverse_azimuth=movement['reverse_azimuth'], vertical_distance=movement['vertical_distance'], vertical_angle=movement['vertical_angle'])
            logger.debug("%s - was forced by %s and is now at %s" % (part.logstring(), model.__class__.__name__, part.location.logstring()))

        part.note = part.outputstring()
        # Each timestep, save the particles status and environmental variables.
        # This keeps fields such as temp, salt, halted, settled, and dead matched up with the number of timesteps
        part.save()

    # We won't pull data for the last entry in locations, but we need to populate it with fill data.
    part.fill_environment_gap()

    if self.usebathy == True:
        self._bathymetry.close()

    if self.useshore == True:
        self._shoreline.close()

    return part
def harvest(self): """ Identify the type of CF dataset this is: * UGRID * CGRID * RGRID * DSG """ try: cd = CommonDataset.open(self.service.get('url')) except Exception as e: app.logger.error("Could not open DAP dataset from '%s'\n" "Exception %s: %s" % (self.service.get('url'), type(e).__name__, e)) return 'Not harvested' # For DAP, the unique ID is the URL unique_id = self.service.get('url') with app.app_context(): dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } ) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) dataset['active'] = True # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = None try: name = unicode_or_none(cd.nc.getncattr('title')) except AttributeError: messages.append(u"Could not get dataset name. No global attribute named 'title'.") # DESCRIPTION description = None try: description = unicode_or_none(cd.nc.getncattr('summary')) except AttributeError: messages.append(u"Could not get dataset description. No global attribute named 'summary'.") # KEYWORDS keywords = [] try: keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(","))) except AttributeError: messages.append(u"Could not get dataset keywords. No global attribute named 'keywords' or was not comma seperated list.") # VARIABLES prefix = "" # Add additonal prefix mappings as they become available. try: standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary")) cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')] for reg in cf_regex: if reg.match(standard_name_vocabulary) is not None: prefix = "http://mmisw.org/ont/cf/parameter/" break except AttributeError: pass # Get variables with a standard_name std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in self.STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0] # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables])) axis_names = DapHarvest.get_axis_variables(cd.nc) """ var_to_get_geo_from = None if len(std_names) > 0: var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0] messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1])) else: # No idea which variable to generate geometry from... try to factor variables with a shape > 1. try: var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1] except IndexError: messages.append(u"Could not find any non-axis variables to compute geometry from.") else: messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." 
% var_to_get_geo_from) """ # LOCATION (from Paegan) # Try POLYGON and fall back to BBOX # paegan does not support ugrid, so try to detect this condition and skip is_ugrid = False is_trajectory = False for vname, v in cd.nc.variables.iteritems(): if 'cf_role' in v.ncattrs(): if v.getncattr('cf_role') == 'mesh_topology': is_ugrid = True break elif v.getncattr('cf_role') == 'trajectory_id': is_trajectory = True break gj = None if is_ugrid: messages.append(u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry.") elif is_trajectory: coord_names = {} # try to get info for x, y, z, t axes for v in itertools.chain(std_variables, non_std_variables): try: coord_names = cd.get_coord_names(v, **axis_names) if coord_names['xname'] is not None and \ coord_names['yname'] is not None: break except (AssertionError, AttributeError, ValueError, KeyError): pass else: messages.append(u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library.") if 'xname' in coord_names: try: xvar = cd.nc.variables[coord_names['xname']] yvar = cd.nc.variables[coord_names['yname']] # one less order of magnitude eg 390000 -> 10000 slice_factor = 10 ** (int(math.log10(xvar.size)) - 1) xs = np.concatenate((xvar[::slice_factor], xvar[-1:])) ys = np.concatenate((yvar[::slice_factor], yvar[-1:])) # both coords must be valid to have a valid vertex # get rid of any nans and unreasonable lon/lats valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) & (~np.isnan(ys)) & (np.absolute(ys) <= 90)) xs = xs[valid_idx] ys = ys[valid_idx] # Shapely seems to require float64 values or incorrect # values will propagate for the generated lineString # if the array is not numpy's float64 dtype lineCoords = np.array([xs, ys]).T.astype('float64') gj = mapping(asLineString(lineCoords)) messages.append(u"Variable %s was used to calculate " u"trajectory geometry, and is a " u"naive sampling." % v) except (AssertionError, AttributeError, ValueError, KeyError, IndexError) as e: app.logger.warn("Trajectory error occured: %s", e) messages.append(u"Trajectory discovered but could not create a geometry.") else: for v in itertools.chain(std_variables, non_std_variables): try: gj = mapping(cd.getboundingpolygon(var=v, **axis_names ).simplify(0.5)) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): try: # Returns a tuple of four coordinates, but box takes in four seperate positional argouments # Asterik magic to expland the tuple into positional arguments app.logger.exception("Error calculating bounding box") # handles "points" aka single position NCELLs bbox = cd.getbbox(var=v, **axis_names) gj = self.get_bbox_or_point(bbox) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): pass if gj is not None: # We computed something, break out of loop. messages.append(u"Variable %s was used to calculate geometry." 
% v) break if gj is None: # Try the globals gj = self.global_bounding_box(cd.nc) messages.append(u"Bounding Box calculated using global attributes") if gj is None: messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.") messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.") messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables))) # TODO: compute bounding box using global attributes final_var_names = [] if prefix == "": messages.append(u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities.") final_var_names = non_std_variables + std_variables else: final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables])) service = { 'name': name, 'description': description, 'service_type': self.service.get('service_type'), 'service_id': ObjectId(self.service.get('_id')), 'data_provider': self.service.get('data_provider'), 'metadata_type': u'ncml', 'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))), 'messages': map(unicode, messages), 'keywords': keywords, 'variables': map(unicode, final_var_names), 'asset_type': get_common_name(DapHarvest.get_asset_type(cd)), 'geojson': gj, 'updated': datetime.utcnow() } with app.app_context(): dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() ncdataset = Dataset(self.service.get('url')) scores = self.ccheck_dataset(ncdataset) metamap = self.metamap_dataset(ncdataset) try: metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap) except Exception as e: metadata_rec = None app.logger.error("could not save compliancecheck/metamap information", exc_info=True) return "Harvested"
def harvest(self):
    """
    Identify the type of CF dataset this is:
      * UGRID
      * CGRID
      * RGRID
      * DSG
    """

    METADATA_VAR_NAMES = [u'crs', u'projection']

    # CF standard names for Axis
    STD_AXIS_NAMES = [u'latitude', u'longitude', u'time', u'forecast_reference_time', u'forecast_period',
                      u'ocean_sigma', u'ocean_s_coordinate_g1', u'ocean_s_coordinate_g2', u'ocean_s_coordinate',
                      u'ocean_double_sigma', u'ocean_sigma_over_z', u'projection_y_coordinate', u'projection_x_coordinate']

    # Some datasets don't define standard_names on axis variables.  This is used to weed them out based on the
    # actual variable name
    COMMON_AXIS_NAMES = [u'x', u'y', u'lat', u'latitude', u'lon', u'longitude', u'time', u'time_run', u'time_offset',
                         u'ntimes', u'lat_u', u'lon_u', u'lat_v', u'lon_v ', u'lat_rho', u'lon_rho', u'lat_psi']

    cd = CommonDataset.open(self.service.get('url'))

    # For DAP, the unique ID is the URL
    unique_id = self.service.get('url')

    with app.app_context():
        dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
        if dataset is None:
            dataset = db.Dataset()
            dataset.uid = unicode(unique_id)

    # Find service reference in Dataset.services and remove (to replace it)
    tmp = dataset.services[:]
    for d in tmp:
        if d['service_id'] == self.service.get('_id'):
            dataset.services.remove(d)

    # Parsing messages
    messages = []

    # NAME
    name = None
    try:
        name = unicode_or_none(cd.nc.getncattr('title'))
    except AttributeError:
        messages.append(u"Could not get dataset name.  No global attribute named 'title'.")

    # DESCRIPTION
    description = None
    try:
        description = unicode_or_none(cd.nc.getncattr('summary'))
    except AttributeError:
        messages.append(u"Could not get dataset description.  No global attribute named 'summary'.")

    # KEYWORDS
    keywords = []
    try:
        keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(",")))
    except AttributeError:
        messages.append(u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list.")

    # VARIABLES
    prefix = ""
    # Add additional prefix mappings as they become available.
    try:
        standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))
        cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]
        for reg in cf_regex:
            if reg.match(standard_name_vocabulary) is not None:
                prefix = "http://mmisw.org/ont/cf/parameter/"
                break
    except AttributeError:
        pass

    # Get variables with a standard_name
    std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

    # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
    non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables]))

    """
    var_to_get_geo_from = None
    if len(std_names) > 0:
        var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
        messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
    else:
        # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
        try:
            var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
        except IndexError:
            messages.append(u"Could not find any non-axis variables to compute geometry from.")
        else:
            messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
    """

    # LOCATION (from Paegan)
    # Try POLYGON and fall back to BBOX
    gj = None
    for v in itertools.chain(std_variables, non_std_variables):
        try:
            gj = mapping(cd.getboundingpolygon(var=v))
        except (AttributeError, AssertionError, ValueError):
            try:
                # Returns a tuple of four coordinates, but box takes in four seperate positional argouments
                # Asterik magic to expland the tuple into positional arguments
                gj = mapping(box(*cd.get_bbox(var=v)))
            except (AttributeError, AssertionError, ValueError):
                pass

        if gj is not None:
            # We computed something, break out of loop.
            messages.append(u"Variable %s was used to calculate geometry." % v)
            break

    if gj is None:
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
        messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables)))

    # TODO: compute bounding box using global attributes

    final_var_names = []
    if prefix == "":
        messages.append(u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities.")
        final_var_names = non_std_variables + std_variables
    else:
        final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables]))

    service = {
        'name'           : name,
        'description'    : description,
        'service_type'   : self.service.get('service_type'),
        'service_id'     : ObjectId(self.service.get('_id')),
        'data_provider'  : self.service.get('data_provider'),
        'metadata_type'  : u'ncml',
        'metadata_value' : unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
        'messages'       : map(unicode, messages),
        'keywords'       : keywords,
        'variables'      : map(unicode, final_var_names),
        'asset_type'     : unicode(cd._datasettype).upper(),
        'geojson'        : gj,
        'updated'        : datetime.utcnow()
    }

    with app.app_context():
        dataset.services.append(service)
        dataset.updated = datetime.utcnow()
        dataset.save()

    return "Harvested"
def __call__(self, active):
    c = 0

    self.dataset = CommonDataset.open(self.hydrodataset)
    self.remote = self.dataset.nc

    # Calculate the datetimes of the model timesteps like
    # the particle objects do, so we can figure out unique
    # time indices
    modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time)

    timevar = self.dataset.gettimevar(self.uname)

    # Don't need to grab the last datetime, as it is not needed for forcing, only
    # for setting the time of the final particle forcing
    time_indexs = timevar.nearest_index(newtimes[0:-1], select='before')

    # Have to make sure that we get the plus 1 for the
    # linear interpolation of u,v,w,temp,salt
    self.inds = np.unique(time_indexs)
    self.inds = np.append(self.inds, self.inds.max() + 1)

    # While there is at least 1 particle still running,
    # stay alive, if not break
    while self.n_run.value > 1:

        if self.caching is False:
            logger.debug("Caching is False, not doing much.  Just hanging out until all of the particles finish.")
            timer.sleep(10)
            continue

        # If particle asks for data, do the following
        if self.get_data.value is True:
            logger.debug("Particle asked for data!")

            # Wait for particles to get out
            while True:
                self.read_lock.acquire()

                logger.debug("Read count: %d" % self.read_count.value)
                if self.read_count.value > 0:
                    logger.debug("Waiting for write lock on cache file (particles must stop reading)...")
                    self.read_lock.release()
                    timer.sleep(2)
                else:
                    break

            # Get write lock on the file.  Already have read lock.
            self.write_lock.acquire()
            self.has_write_lock.value = os.getpid()

            if c == 0:
                logger.debug("Creating cache file")
                try:
                    # Open local cache for writing, overwrites
                    # existing file with same name
                    self.local = netCDF4.Dataset(self.cache_path, 'w')

                    indices = self.dataset.get_indices(self.uname, timeinds=[np.asarray([0])], point=self.start)
                    self.point_get.value = [self.inds[0], indices[-2], indices[-1]]

                    # Create dimensions for u and v variables
                    self.local.createDimension('time', None)
                    self.local.createDimension('level', None)
                    self.local.createDimension('x', None)
                    self.local.createDimension('y', None)

                    # Create 3d or 4d u and v variables
                    if self.remote.variables[self.uname].ndim == 4:
                        self.ndim = 4
                        dimensions = ('time', 'level', 'y', 'x')
                        coordinates = "time z lon lat"
                    elif self.remote.variables[self.uname].ndim == 3:
                        self.ndim = 3
                        dimensions = ('time', 'y', 'x')
                        coordinates = "time lon lat"
                    shape = self.remote.variables[self.uname].shape

                    # If there is no FillValue defined in the dataset, use np.nan.
                    # Sometimes it will work out correctly and other times we will
                    # have a huge cache file.
                    try:
                        fill = self.remote.variables[self.uname].missing_value
                    except Exception:
                        fill = np.nan

                    # Create domain variable that specifies
                    # where there is data geographically/by time
                    # and where there is not data,
                    # Used for testing if particle needs to
                    # ask cache to update
                    domain = self.local.createVariable('domain', 'i', dimensions, zlib=False, fill_value=0)
                    domain.coordinates = coordinates

                    # Create local u and v variables
                    u = self.local.createVariable('u', 'f', dimensions, zlib=False, fill_value=fill)
                    v = self.local.createVariable('v', 'f', dimensions, zlib=False, fill_value=fill)
                    v.coordinates = coordinates
                    u.coordinates = coordinates

                    localvars = [u, v]
                    remotevars = [self.remote.variables[self.uname], self.remote.variables[self.vname]]

                    # Create local w variable
                    if self.wname is not None:
                        w = self.local.createVariable('w', 'f', dimensions, zlib=False, fill_value=fill)
                        w.coordinates = coordinates
                        localvars.append(w)
                        remotevars.append(self.remote.variables[self.wname])

                    if self.temp_name is not None and self.salt_name is not None:
                        # Create local temp and salt vars
                        temp = self.local.createVariable('temp', 'f', dimensions, zlib=False, fill_value=fill)
                        salt = self.local.createVariable('salt', 'f', dimensions, zlib=False, fill_value=fill)
                        temp.coordinates = coordinates
                        salt.coordinates = coordinates
                        localvars.append(temp)
                        localvars.append(salt)
                        remotevars.append(self.remote.variables[self.temp_name])
                        remotevars.append(self.remote.variables[self.salt_name])

                    # Create local lat/lon coordinate variables
                    if self.remote.variables[self.xname].ndim == 2:
                        lon = self.local.createVariable('lon', 'f', ("y", "x"), zlib=False)
                        lon[:] = self.remote.variables[self.xname][:, :]
                        lat = self.local.createVariable('lat', 'f', ("y", "x"), zlib=False)
                        lat[:] = self.remote.variables[self.yname][:, :]
                    if self.remote.variables[self.xname].ndim == 1:
                        lon = self.local.createVariable('lon', 'f', ("x"), zlib=False)
                        lon[:] = self.remote.variables[self.xname][:]
                        lat = self.local.createVariable('lat', 'f', ("y"), zlib=False)
                        lat[:] = self.remote.variables[self.yname][:]

                    # Create local z variable
                    if self.zname is not None:
                        if self.remote.variables[self.zname].ndim == 4:
                            z = self.local.createVariable('z', 'f', ("time", "level", "y", "x"), zlib=False)
                            remotez = self.remote.variables[self.zname]
                            localvars.append(z)
                            remotevars.append(remotez)
                        elif self.remote.variables[self.zname].ndim == 3:
                            z = self.local.createVariable('z', 'f', ("level", "y", "x"), zlib=False)
                            z[:] = self.remote.variables[self.zname][:, :, :]
                        elif self.remote.variables[self.zname].ndim == 1:
                            z = self.local.createVariable('z', 'f', ("level",), zlib=False)
                            z[:] = self.remote.variables[self.zname][:]

                    # Create local time variable
                    time = self.local.createVariable('time', 'f8', ("time",), zlib=False)
                    if self.tname is not None:
                        time[:] = self.remote.variables[self.tname][self.inds]

                    if self.point_get.value[0] + self.time_size > np.max(self.inds):
                        current_inds = np.arange(self.point_get.value[0], np.max(self.inds) + 1)
                    else:
                        current_inds = np.arange(self.point_get.value[0], self.point_get.value[0] + self.time_size)

                    # Get data from remote dataset and add
                    # to local cache.
                    # Try 20 times on the first attempt
                    current_attempt = 1
                    max_attempts = 20
                    while True:
                        try:
                            assert current_attempt <= max_attempts
                            self.get_remote_data(localvars, remotevars, current_inds, shape)
                        except AssertionError:
                            raise
                        except:
                            logger.warn("CachingDataController failed to get remote data.  Trying again in 20 seconds. %s attempts left." % str(max_attempts - current_attempt))
                            logger.exception("Data Access Error")
                            timer.sleep(20)
                            current_attempt += 1
                        else:
                            break

                    c += 1
                except (Exception, AssertionError):
                    logger.error("CachingDataController failed to get data (first request)")
                    raise
                finally:
                    self.local.sync()
                    self.local.close()
                    self.has_write_lock.value = -1
                    self.write_lock.release()
                    self.get_data.value = False
                    self.read_lock.release()
                    logger.debug("Done updating cache file, closing file, and releasing locks")

            else:
                logger.debug("Updating cache file")
                try:
                    # Open local cache dataset for appending
                    self.local = netCDF4.Dataset(self.cache_path, 'a')

                    # Create local and remote variable objects
                    # for the variables of interest
                    u = self.local.variables['u']
                    v = self.local.variables['v']
                    time = self.local.variables['time']
                    remoteu = self.remote.variables[self.uname]
                    remotev = self.remote.variables[self.vname]

                    # Create lists of variable objects for
                    # the data updater
                    localvars = [u, v]
                    remotevars = [remoteu, remotev]

                    if self.salt_name is not None and self.temp_name is not None:
                        salt = self.local.variables['salt']
                        temp = self.local.variables['temp']
                        remotesalt = self.remote.variables[self.salt_name]
                        remotetemp = self.remote.variables[self.temp_name]
                        localvars.append(salt)
                        localvars.append(temp)
                        remotevars.append(remotesalt)
                        remotevars.append(remotetemp)

                    if self.wname is not None:
                        w = self.local.variables['w']
                        remotew = self.remote.variables[self.wname]
                        localvars.append(w)
                        remotevars.append(remotew)

                    if self.zname is not None:
                        remotez = self.remote.variables[self.zname]
                        if remotez.ndim == 4:
                            z = self.local.variables['z']
                            localvars.append(z)
                            remotevars.append(remotez)

                    if self.tname is not None:
                        # remotetime = self.remote.variables[self.tname]
                        time[self.inds] = self.remote.variables[self.tname][self.inds]

                    if self.point_get.value[0] + self.time_size > np.max(self.inds):
                        current_inds = np.arange(self.point_get.value[0], np.max(self.inds) + 1)
                    else:
                        current_inds = np.arange(self.point_get.value[0], self.point_get.value[0] + self.time_size)

                    # Get data from remote dataset and add
                    # to local cache
                    while True:
                        try:
                            self.get_remote_data(localvars, remotevars, current_inds, shape)
                        except:
                            logger.warn("CachingDataController failed to get remote data.  Trying again in 30 seconds")
                            timer.sleep(30)
                        else:
                            break

                    c += 1
                except Exception:
                    logger.error("CachingDataController failed to get data (not first request)")
                    raise
                finally:
                    self.local.sync()
                    self.local.close()
                    self.has_write_lock.value = -1
                    self.write_lock.release()
                    self.get_data.value = False
                    self.read_lock.release()
                    logger.debug("Done updating cache file, closing file, and releasing locks")

        else:
            logger.debug("Particles are still running, waiting for them to request data...")
            timer.sleep(2)

    self.dataset.closenc()

    return "CachingDataController"
def run(self, hydrodataset, **kwargs):

    # Add ModelController description to logfile
    logger.info(self)

    # Add the model descriptions to logfile
    for m in self._models:
        logger.info(m)

    # Calculate the model timesteps
    # We need times = len(self._nstep) + 1 since data is stored one timestep
    # after a particle is forced with the final timestep's data.
    times = range(0, (self._step * self._nstep) + 1, self._step)
    # Calculate a datetime object for each model timestep
    # This method is duplicated in DataController and ForceParticle
    # using the 'times' variables above.  Will be useful in those other
    # locations for particles released at different times
    # i.e. released over a few days
    modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(times, start=self.start)

    time_chunk = self._time_chunk
    horiz_chunk = self._horiz_chunk
    low_memory = kwargs.get("low_memory", False)

    # Should we remove the cache file at the end of the run?
    remove_cache = kwargs.get("remove_cache", True)

    self.bathy_path = kwargs.get("bathy", None)

    self.cache_path = kwargs.get("cache", None)
    if self.cache_path is None:
        # Generate temp filename for dataset cache
        default_cache_dir = os.path.join(os.path.dirname(__file__), "_cache")
        temp_name = AsaRandom.filename(prefix=str(datetime.now().microsecond), suffix=".nc")
        self.cache_path = os.path.join(default_cache_dir, temp_name)

    logger.progress((1, "Setting up particle start locations"))
    point_locations = []
    if isinstance(self.geometry, Point):
        point_locations = [self.reference_location] * self._npart
    elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
        point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

    # Initialize the particles
    logger.progress((2, "Initializing particles"))
    for x in xrange(0, self._npart):
        p = LarvaParticle(id=x)
        p.location = point_locations[x]
        # We don't need to fill the location gaps here for environment variables
        # because the first data collected actually relates to this original
        # position.
        # We do need to fill in fields such as settled, halted, etc.
        p.fill_status_gap()
        # Set the initial note
        p.note = p.outputstring()
        p.notes.append(p.note)
        self.particles.append(p)

    # This is where it makes sense to implement the multiprocessing
    # looping for particles and models.  Can handle each particle in
    # parallel probably.
    #
    # Get the number of cores (may take some tuning) and create that
    # many workers then pass particles into the queue for the workers
    mgr = multiprocessing.Manager()

    nproc = multiprocessing.cpu_count() - 1
    if nproc <= 0:
        raise ValueError("Model does not run using less than two CPU cores")

    # Each particle is a task, plus the DataController
    number_of_tasks = len(self.particles) + 1

    # We need a process for each particle and one for the data controller
    nproc = min(number_of_tasks, nproc)

    # When a particle requests data
    data_request_lock = mgr.Lock()
    # PID of process with lock
    has_data_request_lock = mgr.Value('int', -1)

    nproc_lock = mgr.Lock()

    # Create the task queue for all of the particles and the DataController
    tasks = multiprocessing.JoinableQueue(number_of_tasks)
    # Create the result queue for all of the particles and the DataController
    results = mgr.Queue(number_of_tasks)

    # Create the shared state objects
    get_data = mgr.Value('bool', True)

    # Number of tasks
    n_run = mgr.Value('int', number_of_tasks)

    updating = mgr.Value('bool', False)

    # When something is reading from cache file
    read_lock = mgr.Lock()
    # list of PIDs that are reading
    has_read_lock = mgr.list()
    read_count = mgr.Value('int', 0)

    # When something is writing to the cache file
    write_lock = mgr.Lock()
    # PID of process with lock
    has_write_lock = mgr.Value('int', -1)

    point_get = mgr.Value('list', [0, 0, 0])
    active = mgr.Value('bool', True)

    logger.progress((3, "Initializing and caching hydro model's grid"))
    try:
        ds = CommonDataset.open(hydrodataset)
        # Query the dataset for common variable names
        # and the time variable.
        logger.debug("Retrieving variable information from dataset")
        common_variables = self.get_common_variables_from_dataset(ds)

        logger.debug("Pickling time variable to disk for particles")
        timevar = ds.gettimevar(common_variables.get("u"))
        f, timevar_pickle_path = tempfile.mkstemp()
        os.close(f)
        f = open(timevar_pickle_path, "wb")
        pickle.dump(timevar, f)
        f.close()
        ds.closenc()
    except:
        logger.warn("Failed to access remote dataset %s" % hydrodataset)
        raise DataControllerError("Inaccessible DAP endpoint: %s" % hydrodataset)

    # Add data controller to the queue first so that it
    # can get the initial data and is not blocked
    logger.debug('Starting DataController')
    logger.progress((4, "Starting processes"))
    data_controller = parallel.DataController(hydrodataset, common_variables, n_run, get_data, write_lock, has_write_lock, read_lock, read_count, time_chunk, horiz_chunk, times, self.start, point_get, self.reference_location, low_memory=low_memory, cache=self.cache_path)
    tasks.put(data_controller)

    # Create DataController worker
    data_controller_process = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="DataController")
    data_controller_process.start()

    logger.debug('Adding %i particles as tasks' % len(self.particles))
    for part in self.particles:
        forcing = parallel.ForceParticle(part, hydrodataset, common_variables, timevar_pickle_path, times, self.start, self._models, self.reference_location.point, self._use_bathymetry, self._use_shoreline, self._use_seasurface, get_data, n_run, read_lock, has_read_lock, read_count, point_get, data_request_lock, has_data_request_lock, reverse_distance=self.reverse_distance, bathy=self.bathy_path, shoreline_path=self.shoreline_path, shoreline_feature=self.shoreline_feature, cache=self.cache_path, time_method=self.time_method)
        tasks.put(forcing)

    # Create workers for the particles.
    procs = [parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="ForceParticle-%d" % i) for i in xrange(nproc - 1)]
    for w in procs:
        w.start()
        logger.debug('Started %s' % w.name)

    # Get results back from queue, test for failed particles
    return_particles = []
    retrieved = 0.
    error_code = 0

    logger.info("Waiting for %i particle results" % len(self.particles))
    logger.progress((5, "Running model"))
    while retrieved < number_of_tasks:
        try:
            # Returns a tuple of code, result
            code, tempres = results.get(timeout=240)
        except Queue.Empty:
            # Poll the active processes to make sure they are all alive and then continue with loop
            if not data_controller_process.is_alive() and data_controller_process.exitcode != 0:
                # Data controller is zombied, kill off other processes.
                get_data.value = False
                results.put((-2, "DataController"))

            new_procs = []
            old_procs = []
            for p in procs:
                if not p.is_alive() and p.exitcode != 0:
                    # Do what the Consumer would do if something finished.
                    # Add something to results queue
                    results.put((-3, "ZombieParticle"))
                    # Decrement nproc (DataController exits when this is 0)
                    with nproc_lock:
                        n_run.value = n_run.value - 1

                    # Remove task from queue (so they can be joined later on)
                    tasks.task_done()

                    # Start a new Consumer.  It will exit if there are no tasks available.
                    np = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name=p.name)
                    new_procs.append(np)
                    old_procs.append(p)

                    # Release any locks the PID had
                    if p.pid in has_read_lock:
                        with read_lock:
                            read_count.value -= 1
                            has_read_lock.remove(p.pid)

                    if has_data_request_lock.value == p.pid:
                        has_data_request_lock.value = -1
                        try:
                            data_request_lock.release()
                        except:
                            pass

                    if has_write_lock.value == p.pid:
                        has_write_lock.value = -1
                        try:
                            write_lock.release()
                        except:
                            pass

            for p in old_procs:
                try:
                    procs.remove(p)
                except ValueError:
                    logger.warn("Did not find %s in the list of processes.  Continuing on." % p.name)

            for p in new_procs:
                procs.append(p)
                logger.warn("Started a new consumer (%s) to replace a zombie consumer" % p.name)
                p.start()
        else:
            # We got one.
            retrieved += 1
            if code is None:
                logger.warn("Got an unrecognized response from a task.")
            elif code == -1:
                logger.warn("Particle %s has FAILED!!" % tempres.uid)
            elif code == -2:
                error_code = code
                logger.warn("DataController has FAILED!!
Removing cache file so the particles fail.") try: os.remove(self.cache_path) except OSError: logger.debug("Could not remove cache file, it probably never existed") pass elif code == -3: error_code = code logger.info("A zombie process was caught and task was removed from queue") elif isinstance(tempres, Particle): logger.info("Particle %d finished" % tempres.uid) return_particles.append(tempres) # We mulitply by 95 here to save 5% for the exporting logger.progress((round((retrieved / number_of_tasks) * 90.,1), "Particle %d finished" % tempres.uid)) elif tempres == "DataController": logger.info("DataController finished") logger.progress((round((retrieved / number_of_tasks) * 90.,1), "DataController finished")) else: logger.info("Got a strange result on results queue") logger.info(str(tempres)) logger.info("Retrieved %i/%i results" % (int(retrieved),number_of_tasks)) if len(return_particles) != len(self.particles): logger.warn("Some particles failed and are not included in the output") # The results queue should be empty at this point assert results.empty() is True # Should be good to join on the tasks now that the queue is empty logger.info("Joining the task queue") tasks.join() # Join all processes logger.info("Joining the processes") for w in procs + [data_controller_process]: # Wait 10 seconds w.join(10.) if w.is_alive(): # Process is hanging, kill it. logger.info("Terminating %s forcefully. This should have exited itself." % w.name) w.terminate() logger.info('Workers complete') self.particles = return_particles # Remove Manager so it shuts down del mgr # Remove pickled timevar os.remove(timevar_pickle_path) # Remove the cache file if remove_cache is True: try: os.remove(self.cache_path) except OSError: logger.debug("Could not remove cache file, it probably never existed") logger.progress((96, "Exporting results")) if len(self.particles) > 0: # If output_formats and path specified, # output particle run data to disk when completed if "output_formats" in kwargs: # Make sure output_path is also included if kwargs.get("output_path", None) != None: formats = kwargs.get("output_formats") output_path = kwargs.get("output_path") if isinstance(formats, list): for format in formats: logger.info("Exporting to: %s" % format) try: self.export(output_path, format=format) except: logger.error("Failed to export to: %s" % format) else: logger.warn('The output_formats parameter should be a list, not saving any output!') else: logger.warn('No output path defined, not saving any output!') else: logger.warn('No output format defined, not saving any output!') else: logger.warn("Model didn't actually do anything, check the log.") if error_code == -2: raise DataControllerError("Error in the DataController") else: raise ModelError("Error in the model") logger.progress((99, "Model Run Complete")) return
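# When results.get() times out, the loop above polls every worker and starts a
# replacement for any that died with a non-zero exit code. A stripped-down sketch
# of that poll-and-replace pattern using plain multiprocessing; the worker factory
# and task function are illustrative, not the project's Consumer class, and none
# of the lock bookkeeping is shown.

import multiprocessing

def replace_dead_workers(procs, make_worker):
    """Return a worker list where every process that died with a non-zero
    exit code has been replaced by a freshly started one."""
    refreshed = []
    for p in procs:
        if not p.is_alive() and p.exitcode != 0:
            # Worker died unexpectedly; start a replacement with the same name.
            replacement = make_worker(p.name)
            replacement.start()
            refreshed.append(replacement)
        else:
            refreshed.append(p)
    return refreshed

# Hypothetical usage with a trivial worker factory (some_task is a placeholder):
# workers = [multiprocessing.Process(target=some_task, name="ForceParticle-%d" % i) for i in range(4)]
# workers = replace_dead_workers(workers, lambda name: multiprocessing.Process(target=some_task, name=name))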
def test_aggregated_dataset(self):
    datafile = os.path.join(data_path, "pws_das_20140126*.nc")
    pd = CommonDataset.open(datafile)
    assert pd._datasettype == "rgrid"
    values = pd.get_values(var="u", bbox=[-149, 59, -144, 61.5], timeinds=0)
    assert values.size > 0
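# The test above hands CommonDataset.open a glob pattern ("pws_das_20140126*.nc"),
# so the library appears to aggregate every matching file into a single dataset.
# A hedged usage sketch of that behaviour; the directory, pattern, and bounding
# box below are illustrative only.

import os
from paegan.cdm.dataset import CommonDataset

def first_timestep_subset(data_dir, pattern, bbox):
    """Open all files matching the glob pattern as one (presumably aggregated)
    dataset and pull the first timestep of 'u' inside the bounding box."""
    pd = CommonDataset.open(os.path.join(data_dir, pattern))
    try:
        return pd.get_values(var="u", bbox=bbox, timeinds=0)
    finally:
        pd.closenc()

# Hypothetical usage mirroring the test above:
# values = first_timestep_subset("/data/pws", "pws_das_20140126*.nc", [-149, 59, -144, 61.5])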
def run(self, hydrodataset, **kwargs): # Add ModelController description to logfile logger.info(self) # Add the model descriptions to logfile for m in self._models: logger.info(m) # Calculate the model timesteps # We need times = len(self._nstep) + 1 since data is stored one timestep # after a particle is forced with the final timestep's data. times = range(0, (self._step * self._nstep) + 1, self._step) # Calculate a datetime object for each model timestep # This method is duplicated in DataController and ForceParticle # using the 'times' variables above. Will be useful in those other # locations for particles released at different times # i.e. released over a few days modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps( times, start=self.start) time_chunk = self._time_chunk horiz_chunk = self._horiz_chunk low_memory = kwargs.get("low_memory", False) # Should we remove the cache file at the end of the run? remove_cache = kwargs.get("remove_cache", True) self.bathy_path = kwargs.get("bathy", None) self.cache_path = kwargs.get("cache", None) if self.cache_path is None: # Generate temp filename for dataset cache default_cache_dir = os.path.join(os.path.dirname(__file__), "_cache") temp_name = AsaRandom.filename(prefix=str( datetime.now().microsecond), suffix=".nc") self.cache_path = os.path.join(default_cache_dir, temp_name) logger.progress((1, "Setting up particle start locations")) point_locations = [] if isinstance(self.geometry, Point): point_locations = [self.reference_location] * self._npart elif isinstance(self.geometry, Polygon) or isinstance( self.geometry, MultiPolygon): point_locations = [ Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points( goal=self._npart, polygon=self.geometry) ] # Initialize the particles logger.progress((2, "Initializing particles")) for x in xrange(0, self._npart): p = LarvaParticle(id=x) p.location = point_locations[x] # We don't need to fill the location gaps here for environment variables # because the first data collected actually relates to this original # position. # We do need to fill in fields such as settled, halted, etc. p.fill_status_gap() # Set the inital note p.note = p.outputstring() p.notes.append(p.note) self.particles.append(p) # This is where it makes sense to implement the multiprocessing # looping for particles and models. Can handle each particle in # parallel probably. 
# # Get the number of cores (may take some tuning) and create that # many workers then pass particles into the queue for the workers mgr = multiprocessing.Manager() nproc = multiprocessing.cpu_count() - 1 if nproc <= 0: raise ValueError( "Model does not run using less than two CPU cores") # Each particle is a task, plus the DataController number_of_tasks = len(self.particles) + 1 # We need a process for each particle and one for the data controller nproc = min(number_of_tasks, nproc) # When a particle requests data data_request_lock = mgr.Lock() # PID of process with lock has_data_request_lock = mgr.Value('int', -1) nproc_lock = mgr.Lock() # Create the task queue for all of the particles and the DataController tasks = multiprocessing.JoinableQueue(number_of_tasks) # Create the result queue for all of the particles and the DataController results = mgr.Queue(number_of_tasks) # Create the shared state objects get_data = mgr.Value('bool', True) # Number of tasks n_run = mgr.Value('int', number_of_tasks) updating = mgr.Value('bool', False) # When something is reading from cache file read_lock = mgr.Lock() # list of PIDs that are reading has_read_lock = mgr.list() read_count = mgr.Value('int', 0) # When something is writing to the cache file write_lock = mgr.Lock() # PID of process with lock has_write_lock = mgr.Value('int', -1) point_get = mgr.Value('list', [0, 0, 0]) active = mgr.Value('bool', True) logger.progress((3, "Initializing and caching hydro model's grid")) try: ds = CommonDataset.open(hydrodataset) # Query the dataset for common variable names # and the time variable. logger.debug("Retrieving variable information from dataset") common_variables = self.get_common_variables_from_dataset(ds) logger.debug("Pickling time variable to disk for particles") timevar = ds.gettimevar(common_variables.get("u")) f, timevar_pickle_path = tempfile.mkstemp() os.close(f) f = open(timevar_pickle_path, "wb") pickle.dump(timevar, f) f.close() ds.closenc() except: logger.warn("Failed to access remote dataset %s" % hydrodataset) raise DataControllerError("Inaccessible DAP endpoint: %s" % hydrodataset) # Add data controller to the queue first so that it # can get the initial data and is not blocked logger.debug('Starting DataController') logger.progress((4, "Starting processes")) data_controller = parallel.DataController(hydrodataset, common_variables, n_run, get_data, write_lock, has_write_lock, read_lock, read_count, time_chunk, horiz_chunk, times, self.start, point_get, self.reference_location, low_memory=low_memory, cache=self.cache_path) tasks.put(data_controller) # Create DataController worker data_controller_process = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="DataController") data_controller_process.start() logger.debug('Adding %i particles as tasks' % len(self.particles)) for part in self.particles: forcing = parallel.ForceParticle( part, hydrodataset, common_variables, timevar_pickle_path, times, self.start, self._models, self.reference_location.point, self._use_bathymetry, self._use_shoreline, self._use_seasurface, get_data, n_run, read_lock, has_read_lock, read_count, point_get, data_request_lock, has_data_request_lock, reverse_distance=self.reverse_distance, bathy=self.bathy_path, shoreline_path=self.shoreline_path, cache=self.cache_path, time_method=self.time_method) tasks.put(forcing) # Create workers for the particles. 
procs = [ parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="ForceParticle-%d" % i) for i in xrange(nproc - 1) ] for w in procs: w.start() logger.debug('Started %s' % w.name) # Get results back from queue, test for failed particles return_particles = [] retrieved = 0. error_code = 0 logger.info("Waiting for %i particle results" % len(self.particles)) logger.progress((5, "Running model")) while retrieved < number_of_tasks: try: # Returns a tuple of code, result code, tempres = results.get(timeout=240) except Queue.Empty: # Poll the active processes to make sure they are all alive and then continue with loop if not data_controller_process.is_alive( ) and data_controller_process.exitcode != 0: # Data controller is zombied, kill off other processes. get_data.value == False results.put((-2, "DataController")) new_procs = [] old_procs = [] for p in procs: if not p.is_alive() and p.exitcode != 0: # Do what the Consumer would do if something finished. # Add something to results queue results.put((-3, "ZombieParticle")) # Decrement nproc (DataController exits when this is 0) with nproc_lock: n_run.value = n_run.value - 1 # Remove task from queue (so they can be joined later on) tasks.task_done() # Start a new Consumer. It will exit if there are no tasks available. np = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name=p.name) new_procs.append(np) old_procs.append(p) # Release any locks the PID had if p.pid in has_read_lock: with read_lock: read_count.value -= 1 has_read_lock.remove(p.pid) if has_data_request_lock.value == p.pid: has_data_request_lock.value = -1 try: data_request_lock.release() except: pass if has_write_lock.value == p.pid: has_write_lock.value = -1 try: write_lock.release() except: pass for p in old_procs: try: procs.remove(p) except ValueError: logger.warn( "Did not find %s in the list of processes. Continuing on." % p.name) for p in new_procs: procs.append(p) logger.warn( "Started a new consumer (%s) to replace a zombie consumer" % p.name) p.start() else: # We got one. retrieved += 1 if code == None: logger.warn("Got an unrecognized response from a task.") elif code == -1: logger.warn("Particle %s has FAILED!!" % tempres.uid) elif code == -2: error_code = code logger.warn( "DataController has FAILED!! Removing cache file so the particles fail." 
) try: os.remove(self.cache_path) except OSError: logger.debug( "Could not remove cache file, it probably never existed" ) pass elif code == -3: error_code = code logger.info( "A zombie process was caught and task was removed from queue" ) elif isinstance(tempres, Particle): logger.info("Particle %d finished" % tempres.uid) return_particles.append(tempres) # We mulitply by 95 here to save 5% for the exporting logger.progress( (round((retrieved / number_of_tasks) * 90., 1), "Particle %d finished" % tempres.uid)) elif tempres == "DataController": logger.info("DataController finished") logger.progress((round((retrieved / number_of_tasks) * 90., 1), "DataController finished")) else: logger.info("Got a strange result on results queue") logger.info(str(tempres)) logger.info("Retrieved %i/%i results" % (int(retrieved), number_of_tasks)) if len(return_particles) != len(self.particles): logger.warn( "Some particles failed and are not included in the output") # The results queue should be empty at this point assert results.empty() is True # Should be good to join on the tasks now that the queue is empty logger.info("Joining the task queue") tasks.join() # Join all processes logger.info("Joining the processes") for w in procs + [data_controller_process]: # Wait 10 seconds w.join(10.) if w.is_alive(): # Process is hanging, kill it. logger.info( "Terminating %s forcefully. This should have exited itself." % w.name) w.terminate() logger.info('Workers complete') self.particles = return_particles # Remove Manager so it shuts down del mgr # Remove pickled timevar os.remove(timevar_pickle_path) # Remove the cache file if remove_cache is True: try: os.remove(self.cache_path) except OSError: logger.debug( "Could not remove cache file, it probably never existed") logger.progress((96, "Exporting results")) if len(self.particles) > 0: # If output_formats and path specified, # output particle run data to disk when completed if "output_formats" in kwargs: # Make sure output_path is also included if kwargs.get("output_path", None) != None: formats = kwargs.get("output_formats") output_path = kwargs.get("output_path") if isinstance(formats, list): for format in formats: logger.info("Exporting to: %s" % format) try: self.export(output_path, format=format) except: logger.error("Failed to export to: %s" % format) else: logger.warn( 'The output_formats parameter should be a list, not saving any output!' ) else: logger.warn( 'No output path defined, not saving any output!') else: logger.warn('No output format defined, not saving any output!') else: logger.warn("Model didn't actually do anything, check the log.") if error_code == -2: raise DataControllerError("Error in the DataController") else: raise ModelError("Error in the model") logger.progress((99, "Model Run Complete")) return
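# Both copies of run() read their optional behaviour from keyword arguments
# (low_memory, remove_cache, bathy, cache, output_formats, output_path). A hedged
# sketch of invoking a run with exports enabled; the controller instance, output
# directory, and the "NetCDF" format string are assumptions, since the accepted
# format names are not shown in this code.

def run_with_export(model, dap_url, out_dir):
    """Start a model run and export the particle results when it finishes."""
    return model.run(dap_url,
                     remove_cache=True,            # drop the cache file afterwards
                     output_formats=["NetCDF"],    # must be a list, or nothing is saved
                     output_path=out_dir)          # required alongside output_formats

# Hypothetical usage:
# run_with_export(model_controller, "http://example.com/thredds/dodsC/model.nc", "/tmp/run_output")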
def test_rgrid_regrid_4d(self): from paegan.utils.asainterpolate import create_grid datafile = os.path.join(data_path, "pws_L2_2012040100.nc") pd = CommonDataset.open(datafile) assert pd._datasettype == 'rgrid' var = "u" lon = [ -148.25, -148.24, -148.23, -148.22, -148.21, -148.2, -148.19, -148.18, -148.17, -148.16, -148.15, -148.14, -148.13, -148.12, -148.11, -148.1, -148.09, -148.08, -148.07, -148.06, -148.05, -148.04, -148.03, -148.02, -148.01, -148.0, -147.99, -147.98, -147.97, -147.96, -147.95, -147.94, -147.93, -147.92, -147.91, -147.9, -147.89, -147.88, -147.87, -147.86, -147.85, -147.84, -147.83, -147.82, -147.81, -147.8, -147.79, -147.78, -147.77, -147.76, -147.75, -147.74, -147.73, -147.72, -147.71, -147.7, -147.69, -147.68, -147.67, -147.66, -147.65, -147.64, -147.63, -147.62, -147.61, -147.6, -147.59, -147.58, -147.57, -147.56, -147.55, -147.54, -147.53, -147.52, -147.51, -147.5, -147.49, -147.48, -147.47, -147.46, -147.45, -147.44, -147.43, -147.42, -147.41, -147.4, -147.39, -147.38, -147.37, -147.36, -147.35, -147.34, -147.33, -147.32, -147.31, -147.3, -147.29, -147.28, -147.27, -147.26, -147.25, -147.24, -147.23, -147.22, -147.21, -147.2, -147.19, -147.18, -147.17, -147.16, -147.15, -147.14, -147.13, -147.12, -147.11, -147.1, -147.09, -147.08, -147.07, -147.06, -147.05, -147.04, -147.03, -147.02, -147.01, -147.0, -146.99, -146.98, -146.97, -146.96, -146.95, -146.94, -146.93, -146.92, -146.91, -146.9, -146.89, -146.88, -146.87, -146.86, -146.85, -146.84, -146.83, -146.82, -146.81, -146.8, -146.79, -146.78, -146.77, -146.76, -146.75, -146.74, -146.73, -146.72, -146.71, -146.7, -146.69, -146.68, -146.67, -146.66, -146.65, -146.64, -146.63, -146.62, -146.61, -146.6, -146.59, -146.58, -146.57, -146.56, -146.55, -146.54, -146.53, -146.52, -146.51, -146.5, -146.49, -146.48, -146.47, -146.46, -146.45, -146.44, -146.43, -146.42, -146.41, -146.4, -146.39, -146.38, -146.37, -146.36, -146.35, -146.34, -146.33, -146.32, -146.31, -146.3, -146.29, -146.28, -146.27, -146.26, -146.25, -146.24, -146.23, -146.22, -146.21, -146.2, -146.19, -146.18, -146.17, -146.16, -146.15, -146.14, -146.13, -146.12, -146.11, -146.1, -146.09, -146.08, -146.07, -146.06, -146.05, -146.04, -146.03, -146.02, -146.01, -146.0, -145.99, -145.98, -145.97, -145.96, -145.95, -145.94, -145.93, -145.92, -145.91, -145.9, -145.89, -145.88, -145.87, -145.86, -145.85, -145.84, -145.83, -145.82, -145.81, -145.8, -145.79, -145.78, -145.77, -145.76, -145.75, -145.74, -145.73, -145.72, -145.71, -145.7, -145.69, -145.68, -145.67, -145.66, -145.65, -145.64, -145.63, -145.62, -145.61, -145.6, -145.59, -145.58, -145.57, -145.56, -145.55, -145.54, -145.53, -145.52, -145.51, -145.5, -145.49, -145.48, -145.47, -145.46, -145.45, -145.44, -145.43, -145.42, -145.41, -145.4, -145.39, -145.38, -145.37, -145.36, -145.35, -145.34, -145.33, -145.32, -145.31, -145.3, -145.29, -145.28, -145.27, -145.26, -145.25, -145.24, -145.23, -145.22, -145.21, -145.2, -145.19, -145.18, -145.17, -145.16, -145.15, -145.14, -145.13, -145.12, -145.11, -145.1, -145.09, -145.08, -145.07, -145.06, -145.05, -145.04, -145.03, -145.02, -145.01, -145.0, -144.99, -144.98, -144.97, -144.96, -144.95, -144.94, -144.93, -144.92, -144.91, -144.9, -144.89, -144.88, -144.87, -144.86, -144.85, -144.84, -144.83, -144.82, -144.81, -144.8, -144.79 ] lat = [ 59.68, 59.69, 59.7, 59.71, 59.72, 59.73, 59.74, 59.75, 59.760002, 59.77, 59.78, 59.79, 59.8, 59.81, 59.82, 59.83, 59.84, 59.85, 59.86, 59.87, 59.88, 59.89, 59.9, 59.91, 59.920002, 59.93, 59.94, 59.95, 59.96, 59.97, 59.98, 59.99, 
60.0, 60.010002, 60.02, 60.03, 60.04, 60.05, 60.06, 60.07, 60.08, 60.09, 60.1, 60.11, 60.12, 60.13, 60.14, 60.15, 60.16, 60.170002, 60.18, 60.19, 60.2, 60.21, 60.22, 60.23, 60.24, 60.25, 60.260002, 60.27, 60.28, 60.29, 60.3, 60.31, 60.32, 60.33, 60.34, 60.35, 60.36, 60.37, 60.38, 60.39, 60.4, 60.41, 60.420002, 60.43, 60.44, 60.45, 60.46, 60.47, 60.48, 60.49, 60.5, 60.510002, 60.52, 60.53, 60.54, 60.55, 60.56, 60.57, 60.58, 60.59, 60.6, 60.61, 60.62, 60.63, 60.64, 60.65, 60.66, 60.670002, 60.68, 60.69, 60.7, 60.71, 60.72, 60.73, 60.74, 60.75, 60.760002, 60.77, 60.78, 60.79, 60.8, 60.81, 60.82, 60.83, 60.84, 60.85, 60.86, 60.87, 60.88, 60.89, 60.9, 60.91, 60.920002, 60.93, 60.94, 60.95, 60.96, 60.97, 60.98, 60.99, 61.0, 61.010002, 61.02, 61.03, 61.04, 61.05, 61.06, 61.07, 61.08, 61.09, 61.1, 61.11, 61.12, 61.13, 61.14, 61.15, 61.16, 61.170002, 61.18, 61.19, 61.2 ] lon, lat = np.asarray(lon), np.asarray(lat) data1 = pd.get_values(var, bbox=(-149, 59, -144, 61.5)) coords_struct = pd.sub_coords(var, bbox=(-149, 59, -144, 61.5)) data2 = pd.get_values_on_grid(var, coords_struct.x, coords_struct.y, t=coords_struct.time, z=coords_struct.z) pd.closenc() assert np.all(data1 == data2)
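# The regrid test above asserts that interpolating a variable back onto the
# dataset's own subset coordinates reproduces a plain bounding-box subset. A small
# helper restating that check with the same calls (get_values, sub_coords,
# get_values_on_grid); the helper itself is illustrative, not part of the library.

import numpy as np

def subset_matches_regrid(pd, var, bbox):
    """Return True when regridding onto the dataset's own subset coordinates
    yields exactly the same values as a direct bounding-box subset."""
    direct = pd.get_values(var, bbox=bbox)
    coords = pd.sub_coords(var, bbox=bbox)
    regridded = pd.get_values_on_grid(var, coords.x, coords.y,
                                      t=coords.time, z=coords.z)
    return np.all(direct == regridded)

# Hypothetical usage matching the test above:
# assert subset_matches_regrid(pd, "u", (-149, 59, -144, 61.5))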
def __call__(self, proc, active): c = 0 self.dataset = CommonDataset.open(self.url) self.proc = proc self.remote = self.dataset.nc cachepath = self.cache_path # Calculate the datetimes of the model timesteps like # the particle objects do, so we can figure out unique # time indices modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time) timevar = self.dataset.gettimevar(self.uname) # Don't need to grab the last datetime, as it is not needed for forcing, only # for setting the time of the final particle forcing time_indexs = timevar.nearest_index(newtimes[0:-1], select='before') # Have to make sure that we get the plus 1 for the # linear interpolation of u,v,w,temp,salt self.inds = np.unique(time_indexs) self.inds = np.append(self.inds, self.inds.max()+1) # While there is at least 1 particle still running, # stay alive, if not break while self.n_run.value > 1: logger.debug("Particles are still running, waiting for them to request data...") timer.sleep(2) # If particle asks for data, do the following if self.get_data.value == True: logger.debug("Particle asked for data!") # Wait for particles to get out while True: self.read_lock.acquire() logger.debug("Read count: %d" % self.read_count.value) if self.read_count.value > 0: logger.debug("Waiting for write lock on cache file (particles must stop reading)...") self.read_lock.release() timer.sleep(4) else: break # Get write lock on the file. Already have read lock. self.write_lock.acquire() self.has_write_lock.value = os.getpid() if c == 0: logger.debug("Creating cache file") try: # Open local cache for writing, overwrites # existing file with same name self.local = netCDF4.Dataset(cachepath, 'w') indices = self.dataset.get_indices(self.uname, timeinds=[np.asarray([0])], point=self.start) self.point_get.value = [self.inds[0], indices[-2], indices[-1]] # Create dimensions for u and v variables self.local.createDimension('time', None) self.local.createDimension('level', None) self.local.createDimension('x', None) self.local.createDimension('y', None) # Create 3d or 4d u and v variables if self.remote.variables[self.uname].ndim == 4: self.ndim = 4 dimensions = ('time', 'level', 'y', 'x') coordinates = "time z lon lat" elif self.remote.variables[self.uname].ndim == 3: self.ndim = 3 dimensions = ('time', 'y', 'x') coordinates = "time lon lat" shape = self.remote.variables[self.uname].shape # If there is no FillValue defined in the dataset, use np.nan. # Sometimes it will work out correctly and other times we will # have a huge cache file. 
try: fill = self.remote.variables[self.uname].missing_value except Exception: fill = np.nan # Create domain variable that specifies # where there is data geographically/by time # and where there is not data, # Used for testing if particle needs to # ask cache to update domain = self.local.createVariable('domain', 'i', dimensions, zlib=False, fill_value=0) domain.coordinates = coordinates # Create local u and v variables u = self.local.createVariable('u', 'f', dimensions, zlib=False, fill_value=fill) v = self.local.createVariable('v', 'f', dimensions, zlib=False, fill_value=fill) v.coordinates = coordinates u.coordinates = coordinates localvars = [u, v,] remotevars = [self.remote.variables[self.uname], self.remote.variables[self.vname]] # Create local w variable if self.wname != None: w = self.local.createVariable('w', 'f', dimensions, zlib=False, fill_value=fill) w.coordinates = coordinates localvars.append(w) remotevars.append(self.remote.variables[self.wname]) if self.temp_name != None and self.salt_name != None: # Create local temp and salt vars temp = self.local.createVariable('temp', 'f', dimensions, zlib=False, fill_value=fill) salt = self.local.createVariable('salt', 'f', dimensions, zlib=False, fill_value=fill) temp.coordinates = coordinates salt.coordinates = coordinates localvars.append(temp) localvars.append(salt) remotevars.append(self.remote.variables[self.temp_name]) remotevars.append(self.remote.variables[self.salt_name]) # Create local lat/lon coordinate variables if self.remote.variables[self.xname].ndim == 2: lon = self.local.createVariable('lon', 'f', ("y", "x"), zlib=False) lon[:] = self.remote.variables[self.xname][:, :] lat = self.local.createVariable('lat', 'f', ("y", "x"), zlib=False) lat[:] = self.remote.variables[self.yname][:, :] if self.remote.variables[self.xname].ndim == 1: lon = self.local.createVariable('lon', 'f', ("x"), zlib=False) lon[:] = self.remote.variables[self.xname][:] lat = self.local.createVariable('lat', 'f', ("y"), zlib=False) lat[:] = self.remote.variables[self.yname][:] # Create local z variable if self.zname != None: if self.remote.variables[self.zname].ndim == 4: z = self.local.createVariable('z', 'f', ("time","level","y","x"), zlib=False) remotez = self.remote.variables[self.zname] localvars.append(z) remotevars.append(remotez) elif self.remote.variables[self.zname].ndim == 3: z = self.local.createVariable('z', 'f', ("level","y","x"), zlib=False) z[:] = self.remote.variables[self.zname][:, :, :] elif self.remote.variables[self.zname].ndim ==1: z = self.local.createVariable('z', 'f', ("level",), zlib=False) z[:] = self.remote.variables[self.zname][:] # Create local time variable time = self.local.createVariable('time', 'f8', ("time",), zlib=False) if self.tname != None: time[:] = self.remote.variables[self.tname][self.inds] if self.point_get.value[0]+self.time_size > np.max(self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1) else: current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache while True: try: self.get_remote_data(localvars, remotevars, current_inds, shape) except: logger.warn("DataController failed to get remote data. 
Trying again in 30 seconds") timer.sleep(30) else: break c += 1 except StandardError: logger.error("DataController failed to get data (first request)") raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug("Done updating cache file, closing file, and releasing locks") else: logger.debug("Updating cache file") try: # Open local cache dataset for appending self.local = netCDF4.Dataset(cachepath, 'a') # Create local and remote variable objects # for the variables of interest u = self.local.variables['u'] v = self.local.variables['v'] time = self.local.variables['time'] remoteu = self.remote.variables[self.uname] remotev = self.remote.variables[self.vname] # Create lists of variable objects for # the data updater localvars = [u, v, ] remotevars = [remoteu, remotev, ] if self.salt_name != None and self.temp_name != None: salt = self.local.variables['salt'] temp = self.local.variables['temp'] remotesalt = self.remote.variables[self.salt_name] remotetemp = self.remote.variables[self.temp_name] localvars.append(salt) localvars.append(temp) remotevars.append(remotesalt) remotevars.append(remotetemp) if self.wname != None: w = self.local.variables['w'] remotew = self.remote.variables[self.wname] localvars.append(w) remotevars.append(remotew) if self.zname != None: remotez = self.remote.variables[self.zname] if remotez.ndim == 4: z = self.local.variables['z'] localvars.append(z) remotevars.append(remotez) if self.tname != None: remotetime = self.remote.variables[self.tname] time[self.inds] = self.remote.variables[self.inds] if self.point_get.value[0]+self.time_size > np.max(self.inds): current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1) else: current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size) # Get data from remote dataset and add # to local cache while True: try: self.get_remote_data(localvars, remotevars, current_inds, shape) except: logger.warn("DataController failed to get remote data. Trying again in 30 seconds") timer.sleep(30) else: break c += 1 except StandardError: logger.error("DataController failed to get data (not first request)") raise finally: self.local.sync() self.local.close() self.has_write_lock.value = -1 self.write_lock.release() self.get_data.value = False self.read_lock.release() logger.debug("Done updating cache file, closing file, and releasing locks") else: pass self.dataset.closenc() return "DataController"
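# Both data controllers coordinate cache access with a shared read counter plus
# separate read and write locks: the writer spins until the read count drops to
# zero, then takes the write lock while still holding the read lock. A minimal,
# self-contained sketch of that pattern with multiprocessing.Manager primitives;
# the names and sleep intervals are illustrative, not the project's.

import time
import multiprocessing

def writer_update(read_lock, write_lock, read_count):
    """Wait until no readers remain, then rewrite the cache under the write lock."""
    while True:
        read_lock.acquire()
        if read_count.value > 0:
            # Readers are still using the cache; back off and retry.
            read_lock.release()
            time.sleep(1)
        else:
            break
    write_lock.acquire()
    try:
        pass  # ... rewrite the cache file here ...
    finally:
        write_lock.release()
        read_lock.release()

def reader_access(read_lock, read_count):
    """Register as a reader, read from the cache, then deregister."""
    with read_lock:
        read_count.value += 1
    try:
        pass  # ... read from the cache file ...
    finally:
        with read_lock:
            read_count.value -= 1

if __name__ == '__main__':
    mgr = multiprocessing.Manager()
    read_lock, write_lock = mgr.Lock(), mgr.Lock()
    read_count = mgr.Value('int', 0)
    writer_update(read_lock, write_lock, read_count)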
# ##### Get bounding polygons from each dataset

# <codecell>

from paegan.cdm.dataset import CommonDataset

lookup_standard_name = "sea_water_temperature"

# Filter out DAP servers that are taking FOREVER
dap_urls = [url for url in dap_urls if "data1.gfdl.noaa.gov" not in url]

dataset_polygons = {}
for i, dap in enumerate(dap_urls):
    print '(%d/%s)' % (i + 1, len(dap_urls)),
    try:
        cd = CommonDataset.open(dap)
    except BaseException:
        print "Could not access", dap
        continue
    try:
        var = cd.get_varname_from_stdname(standard_name=lookup_standard_name)[0]
        dataset_polygons[dap] = cd.getboundingpolygon(var=var)
        print "Retrieved bounding polygon from %s" % dap
    except (IndexError, AssertionError):
        print "No standard_name '%s' in '%s'" % (lookup_standard_name, dap)

# <markdowncell>

# ##### Overlay dataset polygons on top of Important Bird Area polygons

# <codecell>
def setup_run(self, hydrodataset, **kwargs): self.hydrodataset = hydrodataset logger.setLevel(logging.PROGRESS) # Relax. time.sleep(0.5) # Add ModelController description to logfile logger.info(str(self)) # Add the model descriptions to logfile for m in self._models: logger.info(str(m)) # Calculate the model timesteps # We need times = len(self._nstep) + 1 since data is stored one timestep # after a particle is forced with the final timestep's data. self.times = list(range(0, (self._step*self._nstep)+1, self._step)) # Calculate a datetime object for each model timestep # This method is duplicated in CachingDataController and CachingForcer # using the 'times' variables above. Will be useful in those other # locations for particles released at different times # i.e. released over a few days self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start) logger.progress((1, "Setting up particle start locations")) point_locations = [] if isinstance(self.geometry, Point): point_locations = [self.reference_location] * self._npart elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon): point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)] # Initialize the particles logger.progress((2, "Initializing particles")) for x in range(0, self._npart): p = LarvaParticle(id=x) p.location = point_locations[x] # We don't need to fill the location gaps here for environment variables # because the first data collected actually relates to this original # position. # We do need to fill in fields such as settled, halted, etc. p.fill_status_gap() # Set the inital note p.note = p.outputstring() p.notes.append(p.note) self.particles.append(p) logger.progress((3, "Initializing and caching hydro model's grid %s" % self.hydrodataset)) try: ds = CommonDataset.open(self.hydrodataset) # Query the dataset for common variable names # and the time variable. logger.debug("Retrieving variable information from dataset") self.common_variables = self.get_common_variables_from_dataset(ds) except Exception: logger.exception("Failed to access dataset %s" % self.hydrodataset) raise BaseDataControllerError("Inaccessible Dataset: %s" % self.hydrodataset) self.timevar = None try: assert self.common_variables.get("u") in ds._current_variables assert self.common_variables.get("v") in ds._current_variables assert self.common_variables.get("x") in ds._current_variables assert self.common_variables.get("y") in ds._current_variables self.timevar = ds.gettimevar(self.common_variables.get("u")) model_start = self.timevar.get_dates()[0] model_end = self.timevar.get_dates()[-1] except AssertionError: logger.exception("Could not locate variables needed to run model: %s" % str(self.common_variables)) raise BaseDataControllerError("A required data variable was not found in %s" % self.hydrodataset) finally: ds.closenc() try: assert self.start > model_start assert self.start < model_end except AssertionError: raise BaseDataControllerError("Start time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[0], model_start, model_end)) try: assert self.datetimes[-1] > model_start assert self.datetimes[-1] < model_end except AssertionError: raise BaseDataControllerError("End time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[-1], model_start, model_end))
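# setup_run() above refuses to start unless the requested start and end times fall
# inside the source dataset's time coverage. A small hedged sketch of that check,
# reusing the gettimevar()/get_dates() calls seen above; the URL, the 'u' variable
# name, and the ValueError exception type are assumptions for illustration.

from paegan.cdm.dataset import CommonDataset

def check_time_coverage(url, uname, run_start, run_end):
    """Raise ValueError if [run_start, run_end] is not covered by the dataset."""
    ds = CommonDataset.open(url)
    try:
        timevar = ds.gettimevar(uname)
        model_start = timevar.get_dates()[0]
        model_end = timevar.get_dates()[-1]
    finally:
        ds.closenc()

    if not (model_start < run_start < model_end):
        raise ValueError("Start time %s outside dataset coverage (%s/%s)"
                         % (run_start, model_start, model_end))
    if not (model_start < run_end < model_end):
        raise ValueError("End time %s outside dataset coverage (%s/%s)"
                         % (run_end, model_start, model_end))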
def harvest(self): """ Identify the type of CF dataset this is: * UGRID * CGRID * RGRID * DSG """ try: cd = CommonDataset.open(self.service.get('url')) except Exception as e: app.logger.error("Could not open DAP dataset from '%s'\n" "Exception %s: %s" % (self.service.get('url'), type(e).__name__, e)) return 'Not harvested' # rely on times in the file first over global atts for calculating # start/end times of dataset. tmin, tmax = self.get_min_max_time(cd) # if nothing was returned, try to get from global atts if (tmin == None and tmax == None and 'time_coverage_start' in cd.metadata and 'time_coverage_end' in cd.metadata): try: tmin, tmax = (parse(cd.metadata[t]) for t in ('time_coverage_start', 'time_coverage_end')) except ValueError: tmin, tmax = None, None # For DAP, the unique ID is the URL unique_id = self.service.get('url') with app.app_context(): dataset = db.Dataset.find_one({'uid': unicode(unique_id)}) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) dataset['active'] = True # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = None try: name = unicode_or_none(cd.nc.getncattr('title')) except AttributeError: messages.append( u"Could not get dataset name. No global attribute named 'title'." ) # DESCRIPTION description = None try: description = unicode_or_none(cd.nc.getncattr('summary')) except AttributeError: messages.append( u"Could not get dataset description. No global attribute named 'summary'." ) # KEYWORDS keywords = [] try: keywords = sorted( map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(","))) except AttributeError: messages.append( u"Could not get dataset keywords. No global attribute named 'keywords' or was not comma seperated list." ) # VARIABLES prefix = "" # Add additonal prefix mappings as they become available. try: standard_name_vocabulary = unicode( cd.nc.getncattr("standard_name_vocabulary")) cf_regex = [ re.compile("CF-"), re.compile( 'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html' ) ] for reg in cf_regex: if reg.match(standard_name_vocabulary) is not None: prefix = "http://mmisw.org/ont/cf/parameter/" break except AttributeError: pass # Get variables with a standard_name std_variables = [ cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in self.STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0 ] # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable non_std_variables = list( set([ x for x in cd.nc.variables if x not in itertools.chain( _possibley, _possiblex, _possiblez, _possiblet, self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables ])) axis_names = DapHarvest.get_axis_variables(cd.nc) """ var_to_get_geo_from = None if len(std_names) > 0: var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0] messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1])) else: # No idea which variable to generate geometry from... try to factor variables with a shape > 1. 
try: var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1] except IndexError: messages.append(u"Could not find any non-axis variables to compute geometry from.") else: messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." % var_to_get_geo_from) """ # LOCATION (from Paegan) # Try POLYGON and fall back to BBOX # paegan does not support ugrid, so try to detect this condition and skip is_ugrid = False is_trajectory = False for vname, v in cd.nc.variables.iteritems(): if 'cf_role' in v.ncattrs(): if v.getncattr('cf_role') == 'mesh_topology': is_ugrid = True break elif v.getncattr('cf_role') == 'trajectory_id': is_trajectory = True break gj = None if is_ugrid: messages.append( u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry." ) elif is_trajectory: coord_names = {} # try to get info for x, y, z, t axes for v in itertools.chain(std_variables, non_std_variables): try: coord_names = cd.get_coord_names(v, **axis_names) if coord_names['xname'] is not None and \ coord_names['yname'] is not None: break except (AssertionError, AttributeError, ValueError, KeyError): pass else: messages.append( u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library." ) if 'xname' in coord_names: try: xvar = cd.nc.variables[coord_names['xname']] yvar = cd.nc.variables[coord_names['yname']] # one less order of magnitude eg 390000 -> 10000 slice_factor = 10**(int(math.log10(xvar.size)) - 1) if slice_factor < 1: slice_factor = 1 # TODO: don't split x/y as separate arrays. Refactor to # use single numpy array instead with both lon/lat # tabledap datasets must be treated differently than # standard DAP endpoints. Retrieve geojson instead of # trying to access as a DAP endpoint if 'erddap/tabledap' in unique_id: # take off 's.' from erddap gj = self.erddap_geojson_url(coord_names) # type defaults to MultiPoint, change to LineString coords = np.array(gj['coordinates'][::slice_factor] + gj['coordinates'][-1:]) xs = coords[:, 0] ys = coords[:, 1] else: xs = np.concatenate((xvar[::slice_factor], xvar[-1:])) ys = np.concatenate((yvar[::slice_factor], yvar[-1:])) # both coords must be valid to have a valid vertex # get rid of any nans and unreasonable lon/lats valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) & (~np.isnan(ys)) & (np.absolute(ys) <= 90)) xs = xs[valid_idx] ys = ys[valid_idx] # Shapely seems to require float64 values or incorrect # values will propagate for the generated lineString # if the array is not numpy's float64 dtype lineCoords = np.array([xs, ys]).T.astype('float64') gj = mapping(asLineString(lineCoords)) messages.append(u"Variable %s was used to calculate " u"trajectory geometry, and is a " u"naive sampling." % v) except (AssertionError, AttributeError, ValueError, KeyError, IndexError) as e: app.logger.warn("Trajectory error occured: %s", e) messages.append( u"Trajectory discovered but could not create a geometry." 
) else: for v in itertools.chain(std_variables, non_std_variables): try: gj = mapping( cd.getboundingpolygon(var=v, **axis_names).simplify(0.5)) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): try: # Returns a tuple of four coordinates, but box takes in four seperate positional argouments # Asterik magic to expland the tuple into positional arguments app.logger.exception("Error calculating bounding box") # handles "points" aka single position NCELLs bbox = cd.getbbox(var=v, **axis_names) gj = self.get_bbox_or_point(bbox) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): pass if gj is not None: # We computed something, break out of loop. messages.append( u"Variable %s was used to calculate geometry." % v) break if gj is None: # Try the globals gj = self.global_bounding_box(cd.nc) messages.append( u"Bounding Box calculated using global attributes") if gj is None: messages.append( u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset." ) messages.append( u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset." ) messages.append( u"Failed to calculate geometry using all of the following variables: %s" % ", ".join( itertools.chain(std_variables, non_std_variables))) # TODO: compute bounding box using global attributes final_var_names = [] if prefix == "": messages.append( u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities." ) final_var_names = non_std_variables + std_variables else: final_var_names = non_std_variables + list( map(unicode, [ "%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables ])) service = { 'name': name, 'description': description, 'service_type': self.service.get('service_type'), 'service_id': ObjectId(self.service.get('_id')), 'data_provider': self.service.get('data_provider'), 'metadata_type': u'ncml', 'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))), 'time_min': tmin, 'time_max': tmax, 'messages': map(unicode, messages), 'keywords': keywords, 'variables': map(unicode, final_var_names), 'asset_type': get_common_name(DapHarvest.get_asset_type(cd)), 'geojson': gj, 'updated': datetime.utcnow() } with app.app_context(): dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() ncdataset = Dataset(self.service.get('url')) scores = self.ccheck_dataset(ncdataset) metamap = self.metamap_dataset(ncdataset) try: metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap) except Exception as e: metadata_rec = None app.logger.error( "could not save compliancecheck/metamap information", exc_info=True) return "Harvested"
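# harvest() above tries getboundingpolygon() for each candidate variable and falls
# back to a bounding box when that fails. A trimmed sketch of that fallback for a
# single variable; the variable loop, axis-name hints, point handling, and
# trajectory branch are omitted, and the except tuples simply mirror the ones used
# above.

from shapely.geometry import box, mapping

def geometry_for_variable(cd, varname):
    """Return a GeoJSON-like geometry for one variable, or None on failure."""
    try:
        # Preferred: a simplified bounding polygon of the data footprint.
        return mapping(cd.getboundingpolygon(var=varname).simplify(0.5))
    except (AttributeError, AssertionError, ValueError, KeyError, IndexError):
        pass
    try:
        # Fallback: expand the 4-tuple bbox into box(minx, miny, maxx, maxy).
        return mapping(box(*cd.getbbox(var=varname)))
    except (AttributeError, AssertionError, ValueError, KeyError, IndexError):
        return None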
def harvest(self): """ Identify the type of CF dataset this is: * UGRID * CGRID * RGRID * DSG """ METADATA_VAR_NAMES = [u'crs', u'projection'] # CF standard names for Axis STD_AXIS_NAMES = [ u'latitude', u'longitude', u'time', u'forecast_reference_time', u'forecast_period', u'ocean_sigma', u'ocean_s_coordinate_g1', u'ocean_s_coordinate_g2', u'ocean_s_coordinate', u'ocean_double_sigma', u'ocean_sigma_over_z', u'projection_y_coordinate', u'projection_x_coordinate' ] # Some datasets don't define standard_names on axis variables. This is used to weed them out based on the # actual variable name COMMON_AXIS_NAMES = [ u'x', u'y', u'lat', u'latitude', u'lon', u'longitude', u'time', u'time_run', u'time_offset', u'ntimes', u'lat_u', u'lon_u', u'lat_v', u'lon_v ', u'lat_rho', u'lon_rho', u'lat_psi' ] cd = CommonDataset.open(self.service.get('url')) # For DAP, the unique ID is the URL unique_id = self.service.get('url') with app.app_context(): dataset = db.Dataset.find_one({'uid': unicode(unique_id)}) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = None try: name = unicode_or_none(cd.nc.getncattr('title')) except AttributeError: messages.append( u"Could not get dataset name. No global attribute named 'title'." ) # DESCRIPTION description = None try: description = unicode_or_none(cd.nc.getncattr('summary')) except AttributeError: messages.append( u"Could not get dataset description. No global attribute named 'summary'." ) # KEYWORDS keywords = [] try: keywords = sorted( map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(","))) except AttributeError: messages.append( u"Could not get dataset keywords. No global attribute named 'keywords' or was not comma seperated list." ) # VARIABLES prefix = "" # Add additonal prefix mappings as they become available. try: standard_name_vocabulary = unicode( cd.nc.getncattr("standard_name_vocabulary")) cf_regex = [ re.compile("CF-"), re.compile( 'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html' ) ] for reg in cf_regex: if reg.match(standard_name_vocabulary) is not None: prefix = "http://mmisw.org/ont/cf/parameter/" break except AttributeError: pass # Get variables with a standard_name std_variables = [ cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0 ] # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable non_std_variables = list( set([ x for x in cd.nc.variables if x not in itertools.chain( _possibley, _possiblex, _possiblez, _possiblet, METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables ])) """ var_to_get_geo_from = None if len(std_names) > 0: var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0] messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1])) else: # No idea which variable to generate geometry from... try to factor variables with a shape > 1. 
try: var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1] except IndexError: messages.append(u"Could not find any non-axis variables to compute geometry from.") else: messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." % var_to_get_geo_from) """ # LOCATION (from Paegan) # Try POLYGON and fall back to BBOX gj = None for v in itertools.chain(std_variables, non_std_variables): try: gj = mapping(cd.getboundingpolygon(var=v)) except (AttributeError, AssertionError, ValueError): try: # Returns a tuple of four coordinates, but box takes in four seperate positional argouments # Asterik magic to expland the tuple into positional arguments gj = mapping(box(*cd.get_bbox(var=v))) except (AttributeError, AssertionError, ValueError): pass if gj is not None: # We computed something, break out of loop. messages.append( u"Variable %s was used to calculate geometry." % v) break if gj is None: messages.append( u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset." ) messages.append( u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset." ) messages.append( u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables))) # TODO: compute bounding box using global attributes final_var_names = [] if prefix == "": messages.append( u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities." ) final_var_names = non_std_variables + std_variables else: final_var_names = non_std_variables + list( map(unicode, [ "%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables ])) service = { 'name': name, 'description': description, 'service_type': self.service.get('service_type'), 'service_id': ObjectId(self.service.get('_id')), 'data_provider': self.service.get('data_provider'), 'metadata_type': u'ncml', 'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))), 'messages': map(unicode, messages), 'keywords': keywords, 'variables': map(unicode, final_var_names), 'asset_type': unicode(cd._datasettype).upper(), 'geojson': gj, 'updated': datetime.utcnow() } with app.app_context(): dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() return "Harvested"