Example #1
    def test_cgrid_init_roms_depths(self):

        datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "cgrid"

        # u grid
        coords = pd.get_coord_dict("u")
        assert str(pd._coordcache["u"]) == "[XY][Z][T]"
        names = pd.get_coord_names("u")
        assert names["tname"] == "ocean_time"
        assert names["zname"] == "s_rho"
        assert names["xname"] == "lon_u"
        assert names["yname"] == "lat_u"

        # v grid
        coords = pd.get_coord_dict("v")
        assert str(pd._coordcache["v"]) == "[XY][Z][T]"
        names = pd.get_coord_names("v")
        assert names["tname"] == "ocean_time"
        assert names["zname"] == "s_rho"
        assert names["xname"] == "lon_v"
        assert names["yname"] == "lat_v"

        # rho grid
        coords = pd.get_coord_dict("h")
        assert str(pd._coordcache["h"]) == "[XY]"
        names = pd.get_coord_names("h")
        assert names["tname"] == None
        assert names["zname"] == None
        assert names["xname"] == "lon_rho"
        assert names["yname"] == "lat_rho"

        pd.closenc()
Example #2
    def test_rgrid_fluid_var_bbox_time(self):

        datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'

        newbbox = np.asarray(pd.getbbox("u")) - 1
        test = pd.restrict_vars("u").restrict_bbox(newbbox).nearest_time(
            datetime(2012, 3, 30, 4, tzinfo=pytz.utc))
        assert "v" not in set(test._current_variables)
        assert test.getbbox("u")[2] <= newbbox[2]
        assert test.getbbox("u")[3] <= newbbox[3]
        assert test.gettimebounds("u")[0] == datetime(2012,
                                                      3,
                                                      30,
                                                      4,
                                                      0,
                                                      tzinfo=pytz.utc)
        assert test.gettimebounds("u")[1] == datetime(2012,
                                                      3,
                                                      30,
                                                      4,
                                                      0,
                                                      tzinfo=pytz.utc)

        pd.closenc()
Example #3
    def test_cgrid_init_roms_depths(self):

        datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'cgrid'

        # u grid
        coords = pd.get_coord_dict('u')
        assert str(pd._coordcache['u']) == "[XY][Z][T]"
        names = pd.get_coord_names('u')
        assert names["tname"] == "ocean_time"
        assert names["zname"] == "s_rho"
        assert names["xname"] == "lon_u"
        assert names["yname"] == "lat_u"

        # v grid
        coords = pd.get_coord_dict('v')
        assert str(pd._coordcache['v']) == "[XY][Z][T]"
        names = pd.get_coord_names('v')
        assert names["tname"] == "ocean_time"
        assert names["zname"] == "s_rho"
        assert names["xname"] == "lon_v"
        assert names["yname"] == "lat_v"

        # rho grid
        coords = pd.get_coord_dict('h')
        assert str(pd._coordcache['h']) == "[XY]"
        names = pd.get_coord_names('h')
        assert names["tname"] == None
        assert names["zname"] == None
        assert names["xname"] == "lon_rho"
        assert names["yname"] == "lat_rho"

        pd.closenc()
Example #4
 def test_aggregated_dataset(self):
     datafile = os.path.join(data_path, "pws_das_20140126*.nc")
     pd = CommonDataset.open(datafile)
     assert pd._datasettype == 'rgrid'
     values = pd.get_values(var="u",
                            bbox=[-149, 59, -144, 61.5],
                            timeinds=0)
     assert values.size > 0
Example #5
 def load_dataset(self):
     self.cd = CommonDataset.open(self.service.get('url'))
     self.std_variables = None
     self.non_std_variables = None
     self.get_standards(self.cd)
     self.axis_names = DapHarvester.get_axis_variables(self.cd.nc)
     self.messages = []
     return self.cd
Example #6
 def test_cgrid_init(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/estuarine_hypoxia/chesroms/agg-1991.nc"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'cgrid'
     coords = pd.get_coord_dict('u')
     assert str(pd._coordcache['u']) == "[XY][Z][T]"
     names = pd.get_names('u')
     assert names["tname"] == "time"
     assert names["zname"] == "s_rho"
     assert names["xname"] == "lon_u"
     assert names["yname"] == "lat_u"
     pd.closenc()
Example #7
 def load_initial_dataset(self):
     """
     Initialize self.dataset, then close it
     A cacher will have to wrap this in locks, while a straight runner will not.
     """
     try:
         self.dataset = CommonDataset.open(self.hydrodataset)
         if self.timevar is None:
             self.timevar = self.dataset.gettimevar(self.common_variables.get("u"))
     except Exception:
         logger.warn("No source dataset: %s.  Particle exiting" % self.hydrodataset)
         raise
Example #8
 def test_cgrid_init(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/estuarine_hypoxia/chesroms/agg-1991.nc"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'cgrid'
     coords = pd.get_coord_dict('u')
     assert str(pd._coordcache['u']) == "[XY][Z][T]"
     names = pd.get_names('u')
     assert names["tname"] == "time"
     assert names["zname"] == "s_rho"
     assert names["xname"] == "lon_u"
     assert names["yname"] == "lat_u"
     pd.closenc()
Example #9
 def test_ncell_init(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/usf/fvcom/rita/ultralite/vardrag/nowave/3d"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'ncell'
     varname = pd.get_varname_from_stdname('sea_surface_height_above_geoid')
     assert varname == "zeta"
     names = pd.get_names(varname)
     assert names["tname"] == "time"
     assert names["zname"] == None
     assert names["xname"] == "lon"
     assert names["yname"] == "lat"
     pd.closenc()
Example #10
 def test_ncell_init(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/usf/fvcom/rita/ultralite/vardrag/nowave/3d"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'ncell'
     varname = pd.get_varname_from_stdname('sea_surface_height_above_geoid')
     assert varname == "zeta"
     names = pd.get_names(varname)
     assert names["tname"] == "time"
     assert names["zname"] == None
     assert names["xname"] == "lon"
     assert names["yname"] == "lat"
     pd.closenc()
Example #11
    def test_rgrid_fluid_var_bbox(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"

        newbbox = np.asarray(pd.getbbox("u")) - 1
        test = pd.restrict_vars("u").restrict_bbox(newbbox)
        assert "v" not in set(test._current_variables)
        assert test.getbbox("u")[2] <= newbbox[2]
        assert test.getbbox("u")[3] <= newbbox[3]

        pd.closenc()
Example #12
    def test_rgrid_fluid_var_bbox(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'

        newbbox = np.asarray(pd.getbbox("u")) - 1
        test = pd.restrict_vars("u").restrict_bbox(newbbox)
        assert "v" not in set(test._current_variables)
        assert test.getbbox("u")[2] <= newbbox[2]
        assert test.getbbox("u")[3] <= newbbox[3]

        pd.closenc()
Example #13
 def test_fluid_test(self):
     url = "http://thredds.axiomalaska.com/thredds/dodsC/PWS_DAS.nc"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'rgrid'
     newbbox = np.asarray(pd.getbbox("u")) - 1
     test = pd.restrict_vars("u").restrict_bbox(newbbox).restrict_depth((3, 50)).nearest_time(datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc))
     assert "v" not in set(test._current_variables)
     assert test.getbbox("u")[2] <= newbbox[2]
     assert test.getbbox("u")[3] <= newbbox[3]
     assert test.getdepthbounds("u")[0] >= 3
     assert test.getdepthbounds("u")[1] <= 50
     assert test.gettimebounds("u")[0] == datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc)
     assert test.gettimebounds("u")[1] == datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc)
Example #14
    def test_rgrid_init_ncom_surface(self):

        datafile = os.path.join(data_path, "ncom_glb_sfc8_hind_2012033100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'
        coords = pd.get_coord_dict('water_u')
        assert str(pd._coordcache['water_u']) == "[XY][T]"
        names = pd.get_coord_names('water_u')
        assert names["tname"] == "time"
        assert names["zname"] == None
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #15
    def test_rgrid_init_ncom_surface(self):

        datafile = os.path.join(data_path, "ncom_glb_sfc8_hind_2012033100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"
        coords = pd.get_coord_dict("water_u")
        assert str(pd._coordcache["water_u"]) == "[XY][T]"
        names = pd.get_coord_names("water_u")
        assert names["tname"] == "time"
        assert names["zname"] == None
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #16
 def load_initial_dataset(self):
     """
     Initialize self.dataset, then close it
     A cacher will have to wrap this in locks, while a straight runner will not.
     """
     try:
         self.dataset = CommonDataset.open(self.hydrodataset)
         if self.timevar is None:
             self.timevar = self.dataset.gettimevar(
                 self.common_variables.get("u"))
     except Exception:
         logger.warn("No source dataset: %s.  Particle exiting" %
                     self.hydrodataset)
         raise
Example #17
    def __init__(self, **kwargs):
        """
            Optional named arguments:
            * file (local path or DAP URL to a bathymetry netCDF file)

        """
        
        if kwargs.get("file", None) is not None:
            self._file = os.path.normpath(kwargs.pop('file'))
        else:
            raise ValueError("Must provide a path to the Bathymetry file")
        
        self._type = kwargs.pop("type", "hover")
        self._nc = CommonDataset.open(self._file)
        self._bathy_name = kwargs.pop("bathy", "z")
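
For reference, here is a minimal usage sketch of the constructor above; it is not taken from any of the projects in this listing. The module path and the file name are assumptions, and get_depth()/close() are the methods exercised on the Bathymetry object in the __call__ example near the end of this listing.

    # Hedged sketch; the import path and the bathymetry file name are assumptions.
    from paegan.transport.bathymetry import Bathymetry

    bathy = Bathymetry(file="/data/bathymetry/global_topo.nc", bathy="z")
    # depth = bathy.get_depth(location4d)  # location4d would be a Location4D, as in the __call__ example below
    bathy.close()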
Example #18
    def test_cgrid_init_pom_depths(self):

        datafile = os.path.join(data_path, "m201310100.out3.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "cgrid"

        coords = pd.get_coord_dict("u")
        assert str(pd._coordcache["u"]) == "[XY][Z][T]"
        names = pd.get_coord_names("u")
        assert names["tname"] == "time"
        assert names["zname"] == "sigma"
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #19
    def __init__(self, **kwargs):
        """
            Optional named arguments:
            * file (local path or DAP URL to a bathymetry netCDF file)

        """

        if kwargs.get("file", None) is not None:
            self._file = os.path.normpath(kwargs.pop('file'))
        else:
            raise ValueError("Must provide a path to the Bathymetry file")

        self._type = kwargs.pop("type", "hover")
        self._nc = CommonDataset.open(self._file)
        self._bathy_name = kwargs.pop("bathy", "z")
Example #20
    def test_rgrid_init_hfradar_surface(self):

        datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"

        coords = pd.get_coord_dict("u")
        assert str(pd._coordcache["u"]) == "[XY][T]"
        names = pd.get_coord_names("u")
        assert names["tname"] == "time"
        assert names["zname"] == None
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #21
    def test_bounding_polygon_rgrid(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)

        bp = pd.getboundingpolygon("u")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("u")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)
        # Expand to encompass the bbox
        assert bp.buffer(1).contains(shape)

        pd.closenc()
Example #22
    def test_bounding_polygon_rgrid(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)

        bp = pd.getboundingpolygon("u")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("u")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)
        # Expand to encompass the bbox
        assert bp.buffer(1).contains(shape)

        pd.closenc()
Example #23
    def test_rgrid_init_pws_depths(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'

        coords = pd.get_coord_dict('u')
        assert str(pd._coordcache['u']) == "[XY][Z][T]"
        names = pd.get_coord_names('u')
        assert names["tname"] == "time"
        assert names["zname"] == "depth"
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #24
    def test_rgrid_fluid_var_bbox_time(self):

        datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"

        newbbox = np.asarray(pd.getbbox("u")) - 1
        test = pd.restrict_vars("u").restrict_bbox(newbbox).nearest_time(datetime(2012, 3, 30, 4, tzinfo=pytz.utc))
        assert "v" not in set(test._current_variables)
        assert test.getbbox("u")[2] <= newbbox[2]
        assert test.getbbox("u")[3] <= newbbox[3]
        assert test.gettimebounds("u")[0] == datetime(2012, 3, 30, 4, 0, tzinfo=pytz.utc)
        assert test.gettimebounds("u")[1] == datetime(2012, 3, 30, 4, 0, tzinfo=pytz.utc)

        pd.closenc()
Example #25
    def test_rgrid_regrid_4d(self):
        from paegan.utils.asainterpolate import create_grid
        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'
        var = "u"
        lon = [-148.25, -148.24, -148.23, -148.22, -148.21, -148.2, -148.19, -148.18, -148.17, -148.16, -148.15, -148.14, -148.13, -148.12, -148.11, -148.1, -148.09, -148.08, -148.07, -148.06, -148.05, -148.04, -148.03, -148.02, -148.01, -148.0, -147.99, -147.98, -147.97, -147.96, -147.95, -147.94, -147.93, -147.92, -147.91, -147.9, -147.89, -147.88, -147.87, -147.86, -147.85, -147.84, -147.83, -147.82, -147.81, -147.8, -147.79, -147.78, -147.77, -147.76, -147.75, -147.74, -147.73, -147.72, -147.71, -147.7, -147.69, -147.68, -147.67, -147.66, -147.65, -147.64, -147.63, -147.62, -147.61, -147.6, -147.59, -147.58, -147.57, -147.56, -147.55, -147.54, -147.53, -147.52, -147.51, -147.5, -147.49, -147.48, -147.47, -147.46, -147.45, -147.44, -147.43, -147.42, -147.41, -147.4, -147.39, -147.38, -147.37, -147.36, -147.35, -147.34, -147.33, -147.32, -147.31, -147.3, -147.29, -147.28, -147.27, -147.26, -147.25, -147.24, -147.23, -147.22, -147.21, -147.2, -147.19, -147.18, -147.17, -147.16, -147.15, -147.14, -147.13, -147.12, -147.11, -147.1, -147.09, -147.08, -147.07, -147.06, -147.05, -147.04, -147.03, -147.02, -147.01, -147.0, -146.99, -146.98, -146.97, -146.96, -146.95, -146.94, -146.93, -146.92, -146.91, -146.9, -146.89, -146.88, -146.87, -146.86, -146.85, -146.84, -146.83, -146.82, -146.81, -146.8, -146.79, -146.78, -146.77, -146.76, -146.75, -146.74, -146.73, -146.72, -146.71, -146.7, -146.69, -146.68, -146.67, -146.66, -146.65, -146.64, -146.63, -146.62, -146.61, -146.6, -146.59, -146.58, -146.57, -146.56, -146.55, -146.54, -146.53, -146.52, -146.51, -146.5, -146.49, -146.48, -146.47, -146.46, -146.45, -146.44, -146.43, -146.42, -146.41, -146.4, -146.39, -146.38, -146.37, -146.36, -146.35, -146.34, -146.33, -146.32, -146.31, -146.3, -146.29, -146.28, -146.27, -146.26, -146.25, -146.24, -146.23, -146.22, -146.21, -146.2, -146.19, -146.18, -146.17, -146.16, -146.15, -146.14, -146.13, -146.12, -146.11, -146.1, -146.09, -146.08, -146.07, -146.06, -146.05, -146.04, -146.03, -146.02, -146.01, -146.0, -145.99, -145.98, -145.97, -145.96, -145.95, -145.94, -145.93, -145.92, -145.91, -145.9, -145.89, -145.88, -145.87, -145.86, -145.85, -145.84, -145.83, -145.82, -145.81, -145.8, -145.79, -145.78, -145.77, -145.76, -145.75, -145.74, -145.73, -145.72, -145.71, -145.7, -145.69, -145.68, -145.67, -145.66, -145.65, -145.64, -145.63, -145.62, -145.61, -145.6, -145.59, -145.58, -145.57, -145.56, -145.55, -145.54, -145.53, -145.52, -145.51, -145.5, -145.49, -145.48, -145.47, -145.46, -145.45, -145.44, -145.43, -145.42, -145.41, -145.4, -145.39, -145.38, -145.37, -145.36, -145.35, -145.34, -145.33, -145.32, -145.31, -145.3, -145.29, -145.28, -145.27, -145.26, -145.25, -145.24, -145.23, -145.22, -145.21, -145.2, -145.19, -145.18, -145.17, -145.16, -145.15, -145.14, -145.13, -145.12, -145.11, -145.1, -145.09, -145.08, -145.07, -145.06, -145.05, -145.04, -145.03, -145.02, -145.01, -145.0, -144.99, -144.98, -144.97, -144.96, -144.95, -144.94, -144.93, -144.92, -144.91, -144.9, -144.89, -144.88, -144.87, -144.86, -144.85, -144.84, -144.83, -144.82, -144.81, -144.8, -144.79]
        lat = [59.68, 59.69, 59.7, 59.71, 59.72, 59.73, 59.74, 59.75, 59.760002, 59.77, 59.78, 59.79, 59.8, 59.81, 59.82, 59.83, 59.84, 59.85, 59.86, 59.87, 59.88, 59.89, 59.9, 59.91, 59.920002, 59.93, 59.94, 59.95, 59.96, 59.97, 59.98, 59.99, 60.0, 60.010002, 60.02, 60.03, 60.04, 60.05, 60.06, 60.07, 60.08, 60.09, 60.1, 60.11, 60.12, 60.13, 60.14, 60.15, 60.16, 60.170002, 60.18, 60.19, 60.2, 60.21, 60.22, 60.23, 60.24, 60.25, 60.260002, 60.27, 60.28, 60.29, 60.3, 60.31, 60.32, 60.33, 60.34, 60.35, 60.36, 60.37, 60.38, 60.39, 60.4, 60.41, 60.420002, 60.43, 60.44, 60.45, 60.46, 60.47, 60.48, 60.49, 60.5, 60.510002, 60.52, 60.53, 60.54, 60.55, 60.56, 60.57, 60.58, 60.59, 60.6, 60.61, 60.62, 60.63, 60.64, 60.65, 60.66, 60.670002, 60.68, 60.69, 60.7, 60.71, 60.72, 60.73, 60.74, 60.75, 60.760002, 60.77, 60.78, 60.79, 60.8, 60.81, 60.82, 60.83, 60.84, 60.85, 60.86, 60.87, 60.88, 60.89, 60.9, 60.91, 60.920002, 60.93, 60.94, 60.95, 60.96, 60.97, 60.98, 60.99, 61.0, 61.010002, 61.02, 61.03, 61.04, 61.05, 61.06, 61.07, 61.08, 61.09, 61.1, 61.11, 61.12, 61.13, 61.14, 61.15, 61.16, 61.170002, 61.18, 61.19, 61.2]
        lon, lat = np.asarray(lon), np.asarray(lat)
        data1 = pd.get_values(var, bbox=(-149, 59, -144, 61.5))
        coords_struct = pd.sub_coords(var, bbox=(-149, 59, -144, 61.5))
        data2 = pd.get_values_on_grid(var, coords_struct.x, coords_struct.y, t=coords_struct.time, z=coords_struct.z)

        pd.closenc()
        assert np.all(data1 == data2)
Example #26
    def test_rgrid_init_pws_depths(self):

        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"

        coords = pd.get_coord_dict("u")
        assert str(pd._coordcache["u"]) == "[XY][Z][T]"
        names = pd.get_coord_names("u")
        assert names["tname"] == "time"
        assert names["zname"] == "depth"
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #27
    def test_rgrid_init_hfradar_surface(self):

        datafile = os.path.join(data_path, "marcooshfradar20120331.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'

        coords = pd.get_coord_dict('u')
        assert str(pd._coordcache['u']) == "[XY][T]"
        names = pd.get_coord_names('u')
        assert names["tname"] == "time"
        assert names["zname"] == None
        assert names["xname"] == "lon"
        assert names["yname"] == "lat"

        pd.closenc()
Example #28
 def test_slosh_test(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/und/slosh/ike/egl3/swi"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'cgrid'
     grid = pd.getgridobj('eta')
     box = [i - 1 for i in grid.bbox]
     vals = pd.get_values('eta',
                          bbox=box,
                          zinds=1,
                          timeinds=1)
     assert vals.shape[0] == 133 and vals.shape[1] == 72
     names = pd.get_names('eta')
     assert names["tname"] == "time"
     assert names["zname"] is None
     assert names["xname"] == "lon"
     assert names["yname"] == "lat"
     pd.closenc()
Example #29
    def calc(self):
        """
        Compute bounds for this dataset
        """
        try:

            nc = CommonDataset.open(self.location)

            matches = nc.get_varname_from_stdname("eastward_sea_water_velocity")
            matches = matches + nc.get_varname_from_stdname("eastward_current")
            query_var = matches[0]

            # Set BBOX
            minx, miny, maxx, maxy = nc.getbbox(var=query_var)
            self.bbox = unicode(box(minx, miny, maxx, maxy).wkt)

            # Set Bounding Polygon
            poly = nc.getboundingpolygon(var=query_var)
            self.geometry = unicode(poly.wkt)

            # Set Time bounds
            mintime, maxtime = nc.gettimebounds(var=query_var)
            self.starting = mintime
            self.ending = maxtime

            def clean(value):
                try:
                    str(type(value)).index("numpy")
                except ValueError:
                    return value
                else:
                    return value.tolist()

            cleaned_info = {}
            variables = nc.getvariableinfo()
            for k, v in variables.items():
                # Strip out numpy arrays into BSON encodable things.
                cleaned_var = { key : clean(value) for key, value in v.items() }
                cleaned_info[k] = cleaned_var

            self.variables = cleaned_info

        except:
            app.logger.warning("Could not calculate bounds for this dataset")
            raise
Example #30
 def load_initial_dataset(self):
     """
     Initialize self.dataset, then close it
     A cacher will have to wrap this in locks, while a straight runner will not.
     """
     try:
         with self.read_lock:
             self.read_count.value += 1
             self.has_read_lock.append(os.getpid())
         self.dataset = CommonDataset.open(self.hydrodataset)
         self.dataset.closenc()
     except Exception:
         logger.warn("No source dataset: %s.  Particle exiting" % self.hydrodataset)
         raise
     finally:
         with self.read_lock:
             self.read_count.value -= 1
             self.has_read_lock.remove(os.getpid())
Example #31
 def load_initial_dataset(self):
     """
     Initialize self.dataset, then close it
     A cacher will have to wrap this in locks, while a straight runner will not.
     """
     try:
         with self.read_lock:
             self.read_count.value += 1
             self.has_read_lock.append(os.getpid())
         self.dataset = CommonDataset.open(self.hydrodataset)
         self.dataset.closenc()
     except Exception:
         logger.warn("No source dataset: %s.  Particle exiting" %
                     self.hydrodataset)
         raise
     finally:
         with self.read_lock:
             self.read_count.value -= 1
             self.has_read_lock.remove(os.getpid())
Example #32
 def test_slosh_test(self):
     url = "http://testbedapps-dev.sura.org/thredds/dodsC/in/und/slosh/ike/egl3/swi"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'cgrid'
     grid = pd.getgridobj('eta')
     box = [i - 1 for i in grid.bbox]
     vals = pd.get_values(
         'eta',
         bbox=box,
         zinds=1,
         timeinds=1,
     )
     assert vals.shape[0] == 133 and vals.shape[1] == 72
     names = pd.get_names('eta')
     assert names["tname"] == "time"
     assert names["zname"] == None
     assert names["xname"] == "lon"
     assert names["yname"] == "lat"
     pd.closenc()
Example #33
    def test_bounding_polygon_roms_cgrid(self):

        datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
        pd = CommonDataset.open(datafile)

        bp = pd.getboundingpolygon("u")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("u")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)

        bp = pd.getboundingpolygon("h")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("h")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)

        pd.closenc()
Example #34
    def test_bounding_polygon_roms_cgrid(self):

        datafile = os.path.join(data_path, "ocean_avg_synoptic_seg22.nc")
        pd = CommonDataset.open(datafile)

        bp = pd.getboundingpolygon("u")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("u")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)

        bp = pd.getboundingpolygon("h")
        assert isinstance(bp, Polygon)
        bbox = pd.getbbox("h")
        shape = box(bbox[0], bbox[1], bbox[2], bbox[3])
        # Shrink some and test if within bbox
        assert bp.buffer(-0.01).within(shape)

        pd.closenc()
Example #35
 def test_fluid_test(self):
     url = "http://thredds.axiomalaska.com/thredds/dodsC/PWS_DAS.nc"
     pd = CommonDataset.open(url)
     assert pd._datasettype == 'rgrid'
     newbbox = np.asarray(pd.getbbox("u")) - 1
     test = pd.restrict_vars("u").restrict_bbox(newbbox).restrict_depth(
         (3, 50)).nearest_time(datetime(2011, 5, 1, 0, 0, tzinfo=pytz.utc))
     assert not "v" in set(test._current_variables)
     assert test.getbbox("u")[2] <= newbbox[2]
     assert test.getbbox("u")[3] <= newbbox[3]
     assert test.getdepthbounds("u")[0] >= 3
     assert test.getdepthbounds("u")[1] <= 50
     assert test.gettimebounds("u")[0] == datetime(2011,
                                                   5,
                                                   1,
                                                   0,
                                                   0,
                                                   tzinfo=pytz.utc)
     assert test.gettimebounds("u")[1] == datetime(2011,
                                                   5,
                                                   1,
                                                   0,
                                                   0,
                                                   tzinfo=pytz.utc)
Example #36
    def setup_run(self, **kwargs):

        logger.setLevel(logging.PROGRESS)

        self.redis_url             = None
        self.redis_log_channel     = None
        self.redis_results_channel = None
        if "redis" in kwargs.get("output_formats", []):
            from paegan.logger.redis_handler import RedisHandler
            self.redis_url             = kwargs.get("redis_url")
            self.redis_log_channel     = kwargs.get("redis_log_channel")
            self.redis_results_channel = kwargs.get("redis_results_channel")
            rhandler = RedisHandler(self.redis_log_channel, self.redis_url)
            rhandler.setLevel(logging.PROGRESS)
            logger.addHandler(rhandler)

        # Relax.
        time.sleep(0.5)

        # Add ModelController description to logfile
        logger.info(unicode(self))

        # Add the model descriptions to logfile
        for m in self._models:
            logger.info(unicode(m))

        # Calculate the model timesteps
        # We need times = len(self._nstep) + 1 since data is stored one timestep
        # after a particle is forced with the final timestep's data.
        self.times = range(0, (self._step*self._nstep)+1, self._step)
        # Calculate a datetime object for each model timestep
        # This method is duplicated in CachingDataController and CachingForcer
        # using the 'times' variables above.  Will be useful in those other
        # locations for particles released at different times
        # i.e. released over a few days
        self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start)

        logger.progress((1, "Setting up particle start locations"))
        point_locations = []
        if isinstance(self.geometry, Point):
            point_locations = [self.reference_location] * self._npart
        elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
            point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

        # Initialize the particles
        logger.progress((2, "Initializing particles"))
        for x in xrange(0, self._npart):
            p = LarvaParticle(id=x)
            p.location = point_locations[x]
            # We don't need to fill the location gaps here for environment variables
            # because the first data collected actually relates to this original
            # position.
            # We do need to fill in fields such as settled, halted, etc.
            p.fill_status_gap()
            # Set the initial note
            p.note = p.outputstring()
            p.notes.append(p.note)
            self.particles.append(p)

        if kwargs.get("manager", True):
            # Get the number of cores (may take some tuning) and create that
            # many workers then pass particles into the queue for the workers
            self.mgr = multiprocessing.Manager()

            # This tracks if the system is 'alive'.  Most looping whiles will check this
            # and break out if it is False.  This is True until something goes very wrong.
            self.active = self.mgr.Value('bool', True)

            # Each particle is a task, plus the CachingDataController
            self.number_of_tasks = self.get_number_of_tasks()

            # Either spin up the number of cores, or the number of tasks
            self.nproc = min(multiprocessing.cpu_count() - 1, self.number_of_tasks)

            # Number of tasks that we need to run.  This is decremented every time something exits.
            self.n_run = self.mgr.Value('int', self.number_of_tasks)
            # The lock that controls access to the 'n_run' variable
            self.nproc_lock = self.mgr.Lock()

            # Create the task queue for all of the particles and the CachingDataController
            self.tasks = multiprocessing.JoinableQueue(self.number_of_tasks)
            # Create the result queue for all of the particles and the CachingDataController
            self.results = self.mgr.Queue(self.number_of_tasks)

        logger.progress((3, "Initializing and caching hydro model's grid"))
        try:
            ds = CommonDataset.open(self.hydrodataset)
        except Exception:
            logger.exception("Failed to access dataset %s" % self.hydrodataset)
            raise BaseDataControllerError("Inaccessible Dataset: %s" % self.hydrodataset)
        # Query the dataset for common variable names
        # and the time variable.
        logger.debug("Retrieving variable information from dataset")
        self.common_variables = self.get_common_variables_from_dataset(ds)

        self.timevar = None
        try:
            assert self.common_variables.get("u") in ds._current_variables
            assert self.common_variables.get("v") in ds._current_variables
            assert self.common_variables.get("x") in ds._current_variables
            assert self.common_variables.get("y") in ds._current_variables

            self.timevar = ds.gettimevar(self.common_variables.get("u"))
        except AssertionError:
            logger.exception("Could not locate variables needed to run model: %s" % unicode(self.common_variables))
            raise BaseDataControllerError("A required data variable was not found in %s" % self.hydrodataset)

        model_start = self.timevar.get_dates()[0]
        model_end   = self.timevar.get_dates()[-1]

        try:
            assert self.start > model_start
            assert self.start < model_end
        except AssertionError:
            raise BaseDataControllerError("Start time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[0], model_start, model_end))

        try:
            assert self.datetimes[-1] > model_start
            assert self.datetimes[-1] < model_end
        except AssertionError:
            raise BaseDataControllerError("End time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[-1], model_start, model_end))

        ds.closenc()
Example #37
# ##### Get bounding polygons from each dataset

# <codecell>

from paegan.cdm.dataset import CommonDataset

lookup_standard_name = "sea_water_temperature"

# Filter out DAP servers that are taking FOREVER
dap_urls = [url for url in dap_urls if "data1.gfdl.noaa.gov" not in url]

dataset_polygons = {}
for i, dap in enumerate(dap_urls):
    print '(%d/%s)' % (i + 1, len(dap_urls)),
    try:
        cd = CommonDataset.open(dap)
    except BaseException:
        print "Could not access", dap

    try:
        var = cd.get_varname_from_stdname(
            standard_name=lookup_standard_name)[0]
        dataset_polygons[dap] = cd.getboundingpolygon(var=var)
        print "Retrieved bounding polygon from %s" % dap
    except (IndexError, AssertionError):
        print "No standard_name '%s' in '%s'" % (lookup_standard_name, dap)

# <markdowncell>

# ##### Overlay dataset polygons on top of Important Bird Area polygons
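
# The overlay code itself is not part of this excerpt; the cell below is only a
# hedged sketch of how the dataset_polygons collected above could be intersected
# with Important Bird Area geometries.  'iba_polygons' stands in (an assumption)
# for shapely Polygon objects loaded elsewhere in the notebook.

# <codecell>

iba_polygons = []  # assumption: IBA geometries as shapely Polygons, loaded elsewhere

overlapping_datasets = {}
for dap, ds_poly in dataset_polygons.items():
    hits = [iba for iba in iba_polygons if iba.intersects(ds_poly)]
    if hits:
        overlapping_datasets[dap] = hits
print '%i datasets overlap at least one Important Bird Area polygon' % len(overlapping_datasets)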
Example #38
    def setup_run(self, hydrodataset, **kwargs):

        self.hydrodataset = hydrodataset

        logger.setLevel(logging.PROGRESS)

        # Relax.
        time.sleep(0.5)

        # Add ModelController description to logfile
        logger.info(str(self))

        # Add the model descriptions to logfile
        for m in self._models:
            logger.info(str(m))

        # Calculate the model timesteps
        # We need times = len(self._nstep) + 1 since data is stored one timestep
        # after a particle is forced with the final timestep's data.
        self.times = list(range(0, (self._step * self._nstep) + 1, self._step))
        # Calculate a datetime object for each model timestep
        # This method is duplicated in CachingDataController and CachingForcer
        # using the 'times' variables above.  Will be useful in those other
        # locations for particles released at different times
        # i.e. released over a few days
        self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(
            self.times, start=self.start)

        logger.progress((1, "Setting up particle start locations"))
        point_locations = []
        if isinstance(self.geometry, Point):
            point_locations = [self.reference_location] * self._npart
        elif isinstance(self.geometry, Polygon) or isinstance(
                self.geometry, MultiPolygon):
            point_locations = [
                Location4D(latitude=loc.y,
                           longitude=loc.x,
                           depth=self._depth,
                           time=self.start)
                for loc in AsaTransport.fill_polygon_with_points(
                    goal=self._npart, polygon=self.geometry)
            ]

        # Initialize the particles
        logger.progress((2, "Initializing particles"))
        for x in range(0, self._npart):
            p = LarvaParticle(id=x)
            p.location = point_locations[x]
            # We don't need to fill the location gaps here for environment variables
            # because the first data collected actually relates to this original
            # position.
            # We do need to fill in fields such as settled, halted, etc.
            p.fill_status_gap()
            # Set the initial note
            p.note = p.outputstring()
            p.notes.append(p.note)
            self.particles.append(p)

        logger.progress((3, "Initializing and caching hydro model's grid %s" %
                         self.hydrodataset))
        try:
            ds = CommonDataset.open(self.hydrodataset)
            # Query the dataset for common variable names
            # and the time variable.
            logger.debug("Retrieving variable information from dataset")
            self.common_variables = self.get_common_variables_from_dataset(ds)
        except Exception:
            logger.exception("Failed to access dataset %s" % self.hydrodataset)
            raise BaseDataControllerError("Inaccessible Dataset: %s" %
                                          self.hydrodataset)

        self.timevar = None
        try:
            assert self.common_variables.get("u") in ds._current_variables
            assert self.common_variables.get("v") in ds._current_variables
            assert self.common_variables.get("x") in ds._current_variables
            assert self.common_variables.get("y") in ds._current_variables

            self.timevar = ds.gettimevar(self.common_variables.get("u"))
            model_start = self.timevar.get_dates()[0]
            model_end = self.timevar.get_dates()[-1]
        except AssertionError:
            logger.exception(
                "Could not locate variables needed to run model: %s" %
                str(self.common_variables))
            raise BaseDataControllerError(
                "A required data variable was not found in %s" %
                self.hydrodataset)
        finally:
            ds.closenc()

        try:
            assert self.start > model_start
            assert self.start < model_end
        except AssertionError:
            raise BaseDataControllerError(
                "Start time for model (%s) is not available in source dataset (%s/%s)"
                % (self.datetimes[0], model_start, model_end))

        try:
            assert self.datetimes[-1] > model_start
            assert self.datetimes[-1] < model_end
        except AssertionError:
            raise BaseDataControllerError(
                "End time for model (%s) is not available in source dataset (%s/%s)"
                % (self.datetimes[-1], model_start, model_end))
Example #39
    def __call__(self, proc, active):

        self.active = active

        if self.usebathy:
            self._bathymetry = Bathymetry(file=self.bathy)

        self._shoreline = None
        if self.useshore:
            self._shoreline = Shoreline(file=self.shoreline_path, point=self.release_location_centroid, spatialbuffer=0.25)
            # Make sure we are not starting on land.  Raises exception if we are.
            self._shoreline.intersect(start_point=self.release_location_centroid, end_point=self.release_location_centroid)

        self.proc = proc
        part = self.part

        if self.active.value:
            while self.get_data.value:
                logger.debug("Waiting for DataController to start...")
                timer.sleep(10)

        # Initialize commondataset of local cache, then
        # close the related netcdf file
        try:
            with self.read_lock:
                self.read_count.value += 1
                self.has_read_lock.append(os.getpid())
            self.dataset = CommonDataset.open(self.localpath)
            self.dataset.closenc()
        except StandardError:
            logger.warn("No cache file: %s.  Particle exiting" % self.localpath)
            raise
        finally:
            with self.read_lock:
                self.read_count.value -= 1
                self.has_read_lock.remove(os.getpid())

        # Calculate datetime at every timestep
        modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time)

        # Load Timevar from pickle serialization
        with open(self.timevar_pickle_path, "rb") as f:
            timevar = pickle.load(f)

        if self.time_method == 'interp':
            time_indexs = timevar.nearest_index(newtimes, select='before')
        elif self.time_method == 'nearest':
            time_indexs = timevar.nearest_index(newtimes)
        else:
            logger.warn("Method for computing u,v,w,temp,salt not supported!")
        try:
            assert len(newtimes) == len(time_indexs)
        except AssertionError:
            logger.error("Time indexes are messed up. Need to have equal datetime and time indexes")
            raise

        # loop over timesteps
        # We don't loop over the last time_index because
        # we need to query in the time_index and set the particle's
        # location as the 'newtime' object.
        for loop_i, i in enumerate(time_indexs[0:-1]):

            if not self.active.value:
                raise ValueError("Particle exiting due to Failure.")

            newloc = None

            # if need a time that is outside of what we have
            #if self.active.value == True:
            #    while self.get_data.value == True:
            #        logger.info("Waiting for DataController to get out...")
            #        timer.sleep(4)
            #        pass
                
            # Get the variable data required by the models
            if self.time_method == 'nearest':
                u, v, w, temp, salt = self.data_nearest(i, newtimes[loop_i])
            elif self.time_method == 'interp': 
                u, v, w, temp, salt = self.data_interp(i, timevar, newtimes[loop_i])
            else:
                logger.warn("Method for computing u,v,w,temp,salt not supported!")

            #logger.info("U: %.4f, V: %.4f, W: %.4f" % (u,v,w))
            #logger.info("Temp: %.4f, Salt: %.4f" % (temp,salt))

            # Get the bathy value at the particles location
            if self.usebathy:
                bathymetry_value = self._bathymetry.get_depth(part.location)
            else:
                bathymetry_value = -999999999999999

            # Age the particle by the modelTimestep (seconds)
            # 'Age' meaning the amount of time it has been forced.
            part.age(seconds=modelTimestep[loop_i])

            # loop over models - sort these in the order you want them to run
            for model in self.models:
                movement = model.move(part, u, v, w, modelTimestep[loop_i], temperature=temp, salinity=salt, bathymetry_value=bathymetry_value)
                newloc = Location4D(latitude=movement['latitude'], longitude=movement['longitude'], depth=movement['depth'], time=newtimes[loop_i+1])
                logger.debug("%s - moved %.3f meters (horizontally) and %.3f meters (vertically) by %s with data from %s" % (part.logstring(), movement['distance'], movement['vertical_distance'], model.__class__.__name__, newtimes[loop_i].isoformat()))
                if newloc:
                    self.boundary_interaction(particle=part, starting=part.location, ending=newloc,
                        distance=movement['distance'], angle=movement['angle'], 
                        azimuth=movement['azimuth'], reverse_azimuth=movement['reverse_azimuth'], 
                        vertical_distance=movement['vertical_distance'], vertical_angle=movement['vertical_angle'])
                logger.debug("%s - was forced by %s and is now at %s" % (part.logstring(), model.__class__.__name__, part.location.logstring()))

            part.note = part.outputstring()
            # Each timestep, save the particle's status and environmental variables.
            # This keeps fields such as temp, salt, halted, settled, and dead matched up with the number of timesteps
            part.save()

        # We won't pull data for the last entry in locations, but we need to populate it with fill data.
        part.fill_environment_gap()

        if self.usebathy:
            self._bathymetry.close()

        if self.useshore:
            self._shoreline.close()

        return part
Example #40
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        try:
            cd = CommonDataset.open(self.service.get('url'))
        except Exception as e:
            app.logger.error("Could not open DAP dataset from '%s'\n"
                             "Exception %s: %s" % (self.service.get('url'),
                                                   type(e).__name__, e))
            return 'Not harvested'


        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(u"Could not get dataset name.  No global attribute named 'title'.")

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(u"Could not get dataset description.  No global attribute named 'summary'.")

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list.")

        # VARIABLES
        prefix    = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in self.STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables]))

        axis_names = DapHarvest.get_axis_variables(cd.nc)
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX

        # paegan does not support ugrid, so try to detect this condition and skip
        is_ugrid = False
        is_trajectory = False
        for vname, v in cd.nc.variables.iteritems():
            if 'cf_role' in v.ncattrs():
                if v.getncattr('cf_role') == 'mesh_topology':
                    is_ugrid = True
                    break
                elif v.getncattr('cf_role') == 'trajectory_id':
                    is_trajectory = True
                    break

        gj = None

        if is_ugrid:
            messages.append(u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry.")
        elif is_trajectory:
            coord_names = {}
            # try to get info for x, y, z, t axes
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    coord_names = cd.get_coord_names(v, **axis_names)

                    if coord_names['xname'] is not None and \
                       coord_names['yname'] is not None:
                        break
                except (AssertionError, AttributeError, ValueError, KeyError):
                    pass
            else:
                messages.append(u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library.")

            if 'xname' in coord_names:
                try:
                    xvar = cd.nc.variables[coord_names['xname']]
                    yvar = cd.nc.variables[coord_names['yname']]

                    # one less order of magnitude eg 390000 -> 10000
                    slice_factor = 10 ** (int(math.log10(xvar.size)) - 1)

                    xs = np.concatenate((xvar[::slice_factor], xvar[-1:]))
                    ys = np.concatenate((yvar[::slice_factor], yvar[-1:]))
                    # both coords must be valid to have a valid vertex
                    # get rid of any nans and unreasonable lon/lats
                    valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) &
                                 (~np.isnan(ys)) & (np.absolute(ys) <= 90))

                    xs = xs[valid_idx]
                    ys = ys[valid_idx]
                    # Shapely seems to require float64 values or incorrect
                    # values will propagate for the generated lineString
                    # if the array is not numpy's float64 dtype
                    lineCoords = np.array([xs, ys]).T.astype('float64')

                    gj = mapping(asLineString(lineCoords))

                    messages.append(u"Variable %s was used to calculate "
                                    u"trajectory geometry, and is a "
                                    u"naive sampling." % v)

                except (AssertionError, AttributeError,
                        ValueError, KeyError, IndexError) as e:
                    app.logger.warn("Trajectory error occured: %s", e)
                    messages.append(u"Trajectory discovered but could not create a geometry.")

        else:
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    gj = mapping(cd.getboundingpolygon(var=v, **axis_names
                                                       ).simplify(0.5))
                except (AttributeError, AssertionError, ValueError,
                        KeyError, IndexError):
                    try:
                        # Returns a tuple of four coordinates, but box takes in four separate positional arguments
                        # Asterisk magic to expand the tuple into positional arguments
                        app.logger.exception("Error calculating bounding box")

                        # handles "points" aka single position NCELLs
                        bbox = cd.getbbox(var=v, **axis_names)
                        gj = self.get_bbox_or_point(bbox)

                    except (AttributeError, AssertionError, ValueError,
                            KeyError, IndexError):
                        pass

                if gj is not None:
                    # We computed something, break out of loop.
                    messages.append(u"Variable %s was used to calculate geometry." % v)
                    break

            if gj is None: # Try the globals
                gj = self.global_bounding_box(cd.nc)
                messages.append(u"Bounding Box calculated using global attributes")
            if gj is None:
                messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
                messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
                messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes

        final_var_names = []
        if prefix == "":
            messages.append(u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities.")
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables]))

        service = {
            'name':           name,
            'description':    description,
            'service_type':   self.service.get('service_type'),
            'service_id':     ObjectId(self.service.get('_id')),
            'data_provider':  self.service.get('data_provider'),
            'metadata_type':  u'ncml',
            'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages':       map(unicode, messages),
            'keywords':       keywords,
            'variables':      map(unicode, final_var_names),
            'asset_type':     get_common_name(DapHarvest.get_asset_type(cd)),
            'geojson':        gj,
            'updated':        datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        ncdataset = Dataset(self.service.get('url'))
        scores = self.ccheck_dataset(ncdataset)
        metamap = self.metamap_dataset(ncdataset)

        try:
            metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap)
        except Exception as e:
            metadata_rec = None
            app.logger.error("could not save compliancecheck/metamap information", exc_info=True)

        return "Harvested"
Example #41
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        METADATA_VAR_NAMES   = [u'crs',
                                u'projection']

        # CF standard names for Axis
        STD_AXIS_NAMES       = [u'latitude',
                                u'longitude',
                                u'time',
                                u'forecast_reference_time',
                                u'forecast_period',
                                u'ocean_sigma',
                                u'ocean_s_coordinate_g1',
                                u'ocean_s_coordinate_g2',
                                u'ocean_s_coordinate',
                                u'ocean_double_sigma',
                                u'ocean_sigma_over_z',
                                u'projection_y_coordinate',
                                u'projection_x_coordinate']

        # Some datasets don't define standard_names on axis variables.  This is used to weed them out based on the
        # actual variable name
        COMMON_AXIS_NAMES    = [u'x',
                                u'y',
                                u'lat',
                                u'latitude',
                                u'lon',
                                u'longitude',
                                u'time',
                                u'time_run',
                                u'time_offset',
                                u'ntimes',
                                u'lat_u',
                                u'lon_u',
                                u'lat_v',
                                u'lon_v',
                                u'lat_rho',
                                u'lon_rho',
                                u'lat_psi']

        cd = CommonDataset.open(self.service.get('url'))

        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(u"Could not get dataset name.  No global attribute named 'title'.")

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(u"Could not get dataset description.  No global attribute named 'summary'.")

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list.")

        # VARIABLES
        prefix    = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables]))

        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX
        gj = None
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                gj = mapping(cd.getboundingpolygon(var=v))
            except (AttributeError, AssertionError, ValueError):
                try:
                    # get_bbox returns a tuple of four coordinates, but box() takes four separate positional arguments.
                    # The asterisk unpacks the tuple into positional arguments.
                    gj = mapping(box(*cd.get_bbox(var=v)))
                except (AttributeError, AssertionError, ValueError):
                    pass

            if gj is not None:
                # We computed something, break out of loop.
                messages.append(u"Variable %s was used to calculate geometry." % v)
                break

        if gj is None:
            messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
            messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
            messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes


        final_var_names = []
        if prefix == "":
            messages.append(u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities.")
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables]))

        service = {
            'name'              : name,
            'description'       : description,
            'service_type'      : self.service.get('service_type'),
            'service_id'        : ObjectId(self.service.get('_id')),
            'data_provider'     : self.service.get('data_provider'),
            'metadata_type'     : u'ncml',
            'metadata_value'    : unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages'          : map(unicode, messages),
            'keywords'          : keywords,
            'variables'         : map(unicode, final_var_names),
            'asset_type'        : unicode(cd._datasettype).upper(),
            'geojson'           : gj,
            'updated'           : datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        return "Harvested"
Example #42
0
    def __call__(self, active):
        c = 0

        self.dataset = CommonDataset.open(self.hydrodataset)
        self.remote = self.dataset.nc

        # Calculate the datetimes of the model timesteps like
        # the particle objects do, so we can figure out unique
        # time indices
        modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(
            self.times, start=self.start_time)

        timevar = self.dataset.gettimevar(self.uname)

        # We don't need the last datetime; it is not used for forcing,
        # only for setting the time of the final particle forcing.
        time_indexs = timevar.nearest_index(newtimes[0:-1], select='before')

        # Make sure we include one extra time index (the plus 1) for the
        # linear interpolation of u, v, w, temp and salt.
        self.inds = np.unique(time_indexs)
        self.inds = np.append(self.inds, self.inds.max() + 1)

        # Stay alive while at least one particle is still running;
        # once only the data controller remains, exit the loop.
        while self.n_run.value > 1:

            if self.caching is False:
                logger.debug(
                    "Caching is False, not doing much.  Just hanging out until all of the particles finish."
                )
                timer.sleep(10)
                continue

            # If particle asks for data, do the following
            if self.get_data.value is True:
                logger.debug("Particle asked for data!")

                # Wait for particles to get out
                while True:
                    self.read_lock.acquire()

                    logger.debug("Read count: %d" % self.read_count.value)
                    if self.read_count.value > 0:
                        logger.debug(
                            "Waiting for write lock on cache file (particles must stop reading)..."
                        )
                        self.read_lock.release()
                        timer.sleep(2)
                    else:
                        break

                # Get write lock on the file.  Already have read lock.
                self.write_lock.acquire()
                self.has_write_lock.value = os.getpid()

                if c == 0:
                    logger.debug("Creating cache file")
                    try:
                        # Open local cache for writing, overwrites
                        # existing file with same name
                        self.local = netCDF4.Dataset(self.cache_path, 'w')

                        indices = self.dataset.get_indices(
                            self.uname,
                            timeinds=[np.asarray([0])],
                            point=self.start)
                        self.point_get.value = [
                            self.inds[0], indices[-2], indices[-1]
                        ]

                        # Create dimensions for u and v variables
                        self.local.createDimension('time', None)
                        self.local.createDimension('level', None)
                        self.local.createDimension('x', None)
                        self.local.createDimension('y', None)

                        # Create 3d or 4d u and v variables
                        if self.remote.variables[self.uname].ndim == 4:
                            self.ndim = 4
                            dimensions = ('time', 'level', 'y', 'x')
                            coordinates = "time z lon lat"
                        elif self.remote.variables[self.uname].ndim == 3:
                            self.ndim = 3
                            dimensions = ('time', 'y', 'x')
                            coordinates = "time lon lat"
                        shape = self.remote.variables[self.uname].shape

                        # If the remote variable does not define a missing_value
                        # attribute, fall back to np.nan as the fill value.
                        # Sometimes that works out fine; other times it results
                        # in a huge cache file.
                        try:
                            fill = self.remote.variables[
                                self.uname].missing_value
                        except Exception:
                            fill = np.nan

                        # Create a 'domain' variable that records where
                        # (geographically and in time) data has been cached
                        # and where it has not.  Particles use it to decide
                        # whether they need to ask the cache to update.
                        domain = self.local.createVariable('domain',
                                                           'i',
                                                           dimensions,
                                                           zlib=False,
                                                           fill_value=0)
                        domain.coordinates = coordinates

                        # Create local u and v variables
                        u = self.local.createVariable('u',
                                                      'f',
                                                      dimensions,
                                                      zlib=False,
                                                      fill_value=fill)
                        v = self.local.createVariable('v',
                                                      'f',
                                                      dimensions,
                                                      zlib=False,
                                                      fill_value=fill)

                        v.coordinates = coordinates
                        u.coordinates = coordinates

                        localvars = [
                            u,
                            v,
                        ]
                        remotevars = [
                            self.remote.variables[self.uname],
                            self.remote.variables[self.vname]
                        ]

                        # Create local w variable
                        if self.wname is not None:
                            w = self.local.createVariable('w',
                                                          'f',
                                                          dimensions,
                                                          zlib=False,
                                                          fill_value=fill)
                            w.coordinates = coordinates
                            localvars.append(w)
                            remotevars.append(
                                self.remote.variables[self.wname])

                        if self.temp_name is not None and self.salt_name is not None:
                            # Create local temp and salt vars
                            temp = self.local.createVariable('temp',
                                                             'f',
                                                             dimensions,
                                                             zlib=False,
                                                             fill_value=fill)
                            salt = self.local.createVariable('salt',
                                                             'f',
                                                             dimensions,
                                                             zlib=False,
                                                             fill_value=fill)
                            temp.coordinates = coordinates
                            salt.coordinates = coordinates
                            localvars.append(temp)
                            localvars.append(salt)
                            remotevars.append(
                                self.remote.variables[self.temp_name])
                            remotevars.append(
                                self.remote.variables[self.salt_name])

                        # Create local lat/lon coordinate variables
                        if self.remote.variables[self.xname].ndim == 2:
                            lon = self.local.createVariable('lon',
                                                            'f', ("y", "x"),
                                                            zlib=False)
                            lon[:] = self.remote.variables[self.xname][:, :]
                            lat = self.local.createVariable('lat',
                                                            'f', ("y", "x"),
                                                            zlib=False)
                            lat[:] = self.remote.variables[self.yname][:, :]
                        if self.remote.variables[self.xname].ndim == 1:
                            lon = self.local.createVariable('lon',
                                                            'f', ("x"),
                                                            zlib=False)
                            lon[:] = self.remote.variables[self.xname][:]
                            lat = self.local.createVariable('lat',
                                                            'f', ("y"),
                                                            zlib=False)
                            lat[:] = self.remote.variables[self.yname][:]

                        # Create local z variable
                        if self.zname is not None:
                            if self.remote.variables[self.zname].ndim == 4:
                                z = self.local.createVariable(
                                    'z',
                                    'f', ("time", "level", "y", "x"),
                                    zlib=False)
                                remotez = self.remote.variables[self.zname]
                                localvars.append(z)
                                remotevars.append(remotez)
                            elif self.remote.variables[self.zname].ndim == 3:
                                z = self.local.createVariable(
                                    'z', 'f', ("level", "y", "x"), zlib=False)
                                z[:] = self.remote.variables[
                                    self.zname][:, :, :]
                            elif self.remote.variables[self.zname].ndim == 1:
                                z = self.local.createVariable('z',
                                                              'f', ("level", ),
                                                              zlib=False)
                                z[:] = self.remote.variables[self.zname][:]

                        # Create local time variable
                        time = self.local.createVariable('time',
                                                         'f8', ("time", ),
                                                         zlib=False)
                        if self.tname is not None:
                            time[:] = self.remote.variables[self.tname][
                                self.inds]

                        if self.point_get.value[0] + self.time_size > np.max(
                                self.inds):
                            current_inds = np.arange(self.point_get.value[0],
                                                     np.max(self.inds) + 1)
                        else:
                            current_inds = np.arange(
                                self.point_get.value[0],
                                self.point_get.value[0] + self.time_size)

                        # Get data from remote dataset and add
                        # to local cache.
                        # Try 20 times on the first attempt
                        current_attempt = 1
                        max_attempts = 20
                        while True:
                            try:
                                assert current_attempt <= max_attempts
                                self.get_remote_data(localvars, remotevars,
                                                     current_inds, shape)
                            except AssertionError:
                                raise
                            except:
                                logger.warn(
                                    "CachingDataController failed to get remote data.  Trying again in 20 seconds. %s attempts left."
                                    % str(max_attempts - current_attempt))
                                logger.exception("Data Access Error")
                                timer.sleep(20)
                                current_attempt += 1
                            else:
                                break

                        c += 1
                    except (Exception, AssertionError):
                        logger.error(
                            "CachingDataController failed to get data (first request)"
                        )
                        raise
                    finally:
                        self.local.sync()
                        self.local.close()
                        self.has_write_lock.value = -1
                        self.write_lock.release()
                        self.get_data.value = False
                        self.read_lock.release()
                        logger.debug(
                            "Done updating cache file, closing file, and releasing locks"
                        )
                else:
                    logger.debug("Updating cache file")
                    try:
                        # Open local cache dataset for appending
                        self.local = netCDF4.Dataset(self.cache_path, 'a')

                        # Create local and remote variable objects
                        # for the variables of interest
                        u = self.local.variables['u']
                        v = self.local.variables['v']
                        time = self.local.variables['time']
                        remoteu = self.remote.variables[self.uname]
                        remotev = self.remote.variables[self.vname]

                        # Create lists of variable objects for
                        # the data updater
                        localvars = [
                            u,
                            v,
                        ]
                        remotevars = [
                            remoteu,
                            remotev,
                        ]
                        if self.salt_name is not None and self.temp_name is not None:
                            salt = self.local.variables['salt']
                            temp = self.local.variables['temp']
                            remotesalt = self.remote.variables[self.salt_name]
                            remotetemp = self.remote.variables[self.temp_name]
                            localvars.append(salt)
                            localvars.append(temp)
                            remotevars.append(remotesalt)
                            remotevars.append(remotetemp)
                        if self.wname is not None:
                            w = self.local.variables['w']
                            remotew = self.remote.variables[self.wname]
                            localvars.append(w)
                            remotevars.append(remotew)
                        if self.zname is not None:
                            remotez = self.remote.variables[self.zname]
                            if remotez.ndim == 4:
                                z = self.local.variables['z']
                                localvars.append(z)
                                remotevars.append(remotez)
                        if self.tname is not None:
                            remotetime = self.remote.variables[self.tname]
                            time[self.inds] = remotetime[self.inds]

                        if self.point_get.value[0] + self.time_size > np.max(
                                self.inds):
                            current_inds = np.arange(self.point_get.value[0],
                                                     np.max(self.inds) + 1)
                        else:
                            current_inds = np.arange(
                                self.point_get.value[0],
                                self.point_get.value[0] + self.time_size)

                        # Get data from remote dataset and add
                        # to local cache
                        while True:
                            try:
                                self.get_remote_data(localvars, remotevars,
                                                     current_inds, shape)
                            except:
                                logger.warn(
                                    "CachingDataController failed to get remote data.  Trying again in 30 seconds"
                                )
                                timer.sleep(30)
                            else:
                                break

                        c += 1
                    except Exception:
                        logger.error(
                            "CachingDataController failed to get data (not first request)"
                        )
                        raise
                    finally:
                        self.local.sync()
                        self.local.close()
                        self.has_write_lock.value = -1
                        self.write_lock.release()
                        self.get_data.value = False
                        self.read_lock.release()
                        logger.debug(
                            "Done updating cache file, closing file, and releasing locks"
                        )
            else:
                logger.debug(
                    "Particles are still running, waiting for them to request data..."
                )
                timer.sleep(2)

        self.dataset.closenc()

        return "CachingDataController"
    def run(self, hydrodataset, **kwargs):

        # Add ModelController description to logfile
        logger.info(self)

        # Add the model descriptions to logfile
        for m in self._models:
            logger.info(m)

        # Calculate the model timesteps
        # We need len(times) = self._nstep + 1, since data is stored one timestep
        # after a particle is forced with the final timestep's data.
        times = range(0,(self._step*self._nstep)+1,self._step)
        # Calculate a datetime object for each model timestep
        # This method is duplicated in DataController and ForceParticle
        # using the 'times' variables above.  Will be useful in those other
        # locations for particles released at different times
        # i.e. released over a few days
        modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(times, start=self.start)

        time_chunk = self._time_chunk
        horiz_chunk = self._horiz_chunk
        low_memory = kwargs.get("low_memory", False)

        # Should we remove the cache file at the end of the run?
        remove_cache = kwargs.get("remove_cache", True)

        self.bathy_path = kwargs.get("bathy", None)

        self.cache_path = kwargs.get("cache", None)
        if self.cache_path is None:
            # Generate temp filename for dataset cache
            default_cache_dir = os.path.join(os.path.dirname(__file__), "_cache")
            temp_name = AsaRandom.filename(prefix=str(datetime.now().microsecond), suffix=".nc")
            self.cache_path = os.path.join(default_cache_dir, temp_name)
        
        logger.progress((1, "Setting up particle start locations"))
        point_locations = []
        if isinstance(self.geometry, Point):
            point_locations = [self.reference_location] * self._npart
        elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
            point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

        # Initialize the particles
        logger.progress((2, "Initializing particles"))
        for x in xrange(0, self._npart):
            p = LarvaParticle(id=x)
            p.location = point_locations[x]
            # We don't need to fill the location gaps here for environment variables
            # because the first data collected actually relates to this original
            # position.
            # We do need to fill in fields such as settled, halted, etc.
            p.fill_status_gap()
            # Set the initial note
            p.note = p.outputstring()
            p.notes.append(p.note)
            self.particles.append(p)

        # This is where it makes sense to implement the multiprocessing
        # looping for particles and models. Can handle each particle in 
        # parallel probably.
        #
        # Get the number of cores (may take some tuning) and create that
        # many workers then pass particles into the queue for the workers
        mgr = multiprocessing.Manager()
        nproc = multiprocessing.cpu_count() - 1
        if nproc <= 0:
            raise ValueError("Model does not run using less than two CPU cores")

        # Each particle is a task, plus the DataController
        number_of_tasks = len(self.particles) + 1

        # We need a process for each particle and one for the data controller
        nproc = min(number_of_tasks, nproc)

        # When a particle requests data
        data_request_lock = mgr.Lock()
        # PID of process with lock
        has_data_request_lock = mgr.Value('int',-1)

        nproc_lock = mgr.Lock()
        
        # Create the task queue for all of the particles and the DataController
        tasks = multiprocessing.JoinableQueue(number_of_tasks)
        # Create the result queue for all of the particles and the DataController
        results = mgr.Queue(number_of_tasks)
        
        # Create the shared state objects
        get_data = mgr.Value('bool', True)
        # Number of tasks
        n_run = mgr.Value('int', number_of_tasks)
        updating = mgr.Value('bool', False)

        # When something is reading from cache file
        read_lock = mgr.Lock()
        # list of PIDs that are reading
        has_read_lock = mgr.list()
        read_count = mgr.Value('int', 0)

        # When something is writing to the cache file
        write_lock = mgr.Lock()
        # PID of process with lock
        has_write_lock = mgr.Value('int',-1)

        point_get = mgr.Value('list', [0, 0, 0])
        active = mgr.Value('bool', True)
        
        logger.progress((3, "Initializing and caching hydro model's grid"))
        try:
            ds = CommonDataset.open(hydrodataset)
            # Query the dataset for common variable names
            # and the time variable.
            logger.debug("Retrieving variable information from dataset")
            common_variables = self.get_common_variables_from_dataset(ds)

            logger.debug("Pickling time variable to disk for particles")
            timevar = ds.gettimevar(common_variables.get("u"))
            f, timevar_pickle_path = tempfile.mkstemp()
            os.close(f)
            f = open(timevar_pickle_path, "wb")
            pickle.dump(timevar, f)
            f.close()
            ds.closenc()
        except:
            logger.warn("Failed to access remote dataset %s" % hydrodataset)
            raise DataControllerError("Inaccessible DAP endpoint: %s" % hydrodataset)


        # Add data controller to the queue first so that it 
        # can get the initial data and is not blocked
        
        logger.debug('Starting DataController')
        logger.progress((4, "Starting processes"))
        data_controller = parallel.DataController(hydrodataset, common_variables, n_run, get_data, write_lock, has_write_lock, read_lock, read_count,
                                                  time_chunk, horiz_chunk, times,
                                                  self.start, point_get, self.reference_location,
                                                  low_memory=low_memory,
                                                  cache=self.cache_path)
        tasks.put(data_controller)
        # Create DataController worker
        data_controller_process = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="DataController")
        data_controller_process.start()
        
        logger.debug('Adding %i particles as tasks' % len(self.particles))
        for part in self.particles:
            forcing = parallel.ForceParticle(part,
                                        hydrodataset,
                                        common_variables,
                                        timevar_pickle_path,
                                        times,
                                        self.start,
                                        self._models,
                                        self.reference_location.point,
                                        self._use_bathymetry,
                                        self._use_shoreline,
                                        self._use_seasurface,
                                        get_data,
                                        n_run,
                                        read_lock,
                                        has_read_lock,
                                        read_count,
                                        point_get,
                                        data_request_lock,
                                        has_data_request_lock,
                                        reverse_distance=self.reverse_distance,
                                        bathy=self.bathy_path,
                                        shoreline_path=self.shoreline_path,
                                        shoreline_feature=self.shoreline_feature,
                                        cache=self.cache_path,
                                        time_method=self.time_method)
            tasks.put(forcing)

        # Create workers for the particles.
        procs = [ parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name="ForceParticle-%d"%i)
                  for i in xrange(nproc - 1) ]
        for w in procs:
            w.start()
            logger.debug('Started %s' % w.name)

        # Get results back from queue, test for failed particles
        return_particles = []
        retrieved = 0.
        error_code = 0

        logger.info("Waiting for %i particle results" % len(self.particles))
        logger.progress((5, "Running model"))
        while retrieved < number_of_tasks:
            try:
                # Returns a tuple of code, result
                code, tempres = results.get(timeout=240)
            except Queue.Empty:
                # Poll the active processes to make sure they are all alive and then continue with loop
                if not data_controller_process.is_alive() and data_controller_process.exitcode != 0:
                    # Data controller is zombied, kill off other processes.
                    get_data.value = False
                    results.put((-2, "DataController"))

                new_procs = []
                old_procs = []
                for p in procs:
                    if not p.is_alive() and p.exitcode != 0:
                        # Do what the Consumer would do if something finished.
                        # Add something to results queue
                        results.put((-3, "ZombieParticle"))
                        # Decrement nproc (DataController exits when this is 0)
                        with nproc_lock:
                            n_run.value = n_run.value - 1

                        # Remove task from queue (so they can be joined later on)
                        tasks.task_done()

                        # Start a new Consumer.  It will exit if there are no tasks available.
                        np = parallel.Consumer(tasks, results, n_run, nproc_lock, active, get_data, name=p.name)
                        new_procs.append(np)
                        old_procs.append(p)
                        
                        # Release any locks the PID had
                        if p.pid in has_read_lock:
                            with read_lock:
                                read_count.value -= 1
                                has_read_lock.remove(p.pid)

                        if has_data_request_lock.value == p.pid:
                            has_data_request_lock.value = -1
                            try:
                                data_request_lock.release()
                            except:
                                pass
                            
                        if has_write_lock.value == p.pid:
                            has_write_lock.value = -1
                            try:
                                write_lock.release()
                            except:
                                pass
                            

                for p in old_procs:
                    try:
                        procs.remove(p)
                    except ValueError:
                        logger.warn("Did not find %s in the list of processes.  Continuing on." % p.name)

                for p in new_procs:
                    procs.append(p)
                    logger.warn("Started a new consumer (%s) to replace a zombie consumer" % p.name)
                    p.start()
                
            else:
                # We got one.
                retrieved += 1
                if code == None:
                    logger.warn("Got an unrecognized response from a task.")
                elif code == -1:
                    logger.warn("Particle %s has FAILED!!" % tempres.uid)
                elif code == -2:
                    error_code = code
                    logger.warn("DataController has FAILED!!  Removing cache file so the particles fail.")
                    try:
                        os.remove(self.cache_path)
                    except OSError:
                        logger.debug("Could not remove cache file, it probably never existed")
                        pass
                elif code == -3:
                    error_code = code
                    logger.info("A zombie process was caught and task was removed from queue")
                elif isinstance(tempres, Particle):
                    logger.info("Particle %d finished" % tempres.uid)
                    return_particles.append(tempres)
                    # We multiply by 90 here to reserve the rest of the progress bar for exporting
                    logger.progress((round((retrieved / number_of_tasks) * 90.,1), "Particle %d finished" % tempres.uid))
                elif tempres == "DataController":
                    logger.info("DataController finished")
                    logger.progress((round((retrieved / number_of_tasks) * 90.,1), "DataController finished"))
                else:
                    logger.info("Got a strange result on results queue")
                    logger.info(str(tempres))

                logger.info("Retrieved %i/%i results" % (int(retrieved),number_of_tasks))
        
        if len(return_particles) != len(self.particles):
            logger.warn("Some particles failed and are not included in the output")

        # The results queue should be empty at this point
        assert results.empty() is True

        # Should be good to join on the tasks now that the queue is empty
        logger.info("Joining the task queue")
        tasks.join()

        # Join all processes
        logger.info("Joining the processes")
        for w in procs + [data_controller_process]:
                # Wait 10 seconds
                w.join(10.)
                if w.is_alive():
                    # Process is hanging, kill it.
                    logger.info("Terminating %s forcefully.  This should have exited itself." % w.name)
                    w.terminate()
                    
        logger.info('Workers complete')

        self.particles = return_particles

        # Remove Manager so it shuts down
        del mgr

        # Remove pickled timevar
        os.remove(timevar_pickle_path)

        # Remove the cache file
        if remove_cache is True:
            try:
                os.remove(self.cache_path)
            except OSError:
                logger.debug("Could not remove cache file, it probably never existed")

        logger.progress((96, "Exporting results"))

        if len(self.particles) > 0:
            # If output_formats and path specified,
            # output particle run data to disk when completed
            if "output_formats" in kwargs:
                # Make sure output_path is also included
                if kwargs.get("output_path", None) != None:
                    formats = kwargs.get("output_formats")
                    output_path = kwargs.get("output_path")
                    if isinstance(formats, list):
                        for format in formats:
                            logger.info("Exporting to: %s" % format)
                            try:
                                self.export(output_path, format=format)
                            except:
                                logger.error("Failed to export to: %s" % format)
                    else:
                        logger.warn('The output_formats parameter should be a list, not saving any output!')  
                else:
                    logger.warn('No output path defined, not saving any output!')  
            else:
                logger.warn('No output format defined, not saving any output!')
        else:
            logger.warn("Model didn't actually do anything, check the log.")
            if error_code == -2:
                raise DataControllerError("Error in the DataController")
            else:
                raise ModelError("Error in the model")

        logger.progress((99, "Model Run Complete"))
        return
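
The Queue.Empty branch above amounts to a watchdog: any consumer that died with a non-zero exit code still owes the results queue an entry and the task queue a task_done(), and it gets replaced with a fresh worker. A condensed sketch of that recovery step, with make_consumer as a hypothetical factory standing in for parallel.Consumer(...); the per-PID lock clean-up shown above is omitted for brevity.

def replace_dead_workers(procs, tasks, results, n_run, nproc_lock, make_consumer):
    replacements = []
    for p in list(procs):
        if not p.is_alive() and p.exitcode != 0:
            # Settle the bookkeeping the dead worker owed.
            results.put((-3, "ZombieParticle"))
            with nproc_lock:
                n_run.value -= 1
            tasks.task_done()
            procs.remove(p)
            replacements.append(make_consumer(name=p.name))
    for w in replacements:
        procs.append(w)
        w.start()
    return replacements
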
Example #44
0
    def test_aggregated_dataset(self):
        datafile = os.path.join(data_path, "pws_das_20140126*.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == "rgrid"
        values = pd.get_values(var="u", bbox=[-149, 59, -144, 61.5], timeinds=0)
        assert values.size > 0
    def run(self, hydrodataset, **kwargs):

        # Add ModelController description to logfile
        logger.info(self)

        # Add the model descriptions to logfile
        for m in self._models:
            logger.info(m)

        # Calculate the model timesteps
        # We need len(times) = self._nstep + 1, since data is stored one timestep
        # after a particle is forced with the final timestep's data.
        times = range(0, (self._step * self._nstep) + 1, self._step)
        # Calculate a datetime object for each model timestep
        # This method is duplicated in DataController and ForceParticle
        # using the 'times' variables above.  Will be useful in those other
        # locations for particles released at different times
        # i.e. released over a few days
        modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(
            times, start=self.start)

        time_chunk = self._time_chunk
        horiz_chunk = self._horiz_chunk
        low_memory = kwargs.get("low_memory", False)

        # Should we remove the cache file at the end of the run?
        remove_cache = kwargs.get("remove_cache", True)

        self.bathy_path = kwargs.get("bathy", None)

        self.cache_path = kwargs.get("cache", None)
        if self.cache_path is None:
            # Generate temp filename for dataset cache
            default_cache_dir = os.path.join(os.path.dirname(__file__),
                                             "_cache")
            temp_name = AsaRandom.filename(prefix=str(
                datetime.now().microsecond),
                                           suffix=".nc")
            self.cache_path = os.path.join(default_cache_dir, temp_name)

        logger.progress((1, "Setting up particle start locations"))
        point_locations = []
        if isinstance(self.geometry, Point):
            point_locations = [self.reference_location] * self._npart
        elif isinstance(self.geometry, Polygon) or isinstance(
                self.geometry, MultiPolygon):
            point_locations = [
                Location4D(latitude=loc.y,
                           longitude=loc.x,
                           depth=self._depth,
                           time=self.start)
                for loc in AsaTransport.fill_polygon_with_points(
                    goal=self._npart, polygon=self.geometry)
            ]

        # Initialize the particles
        logger.progress((2, "Initializing particles"))
        for x in xrange(0, self._npart):
            p = LarvaParticle(id=x)
            p.location = point_locations[x]
            # We don't need to fill the location gaps here for environment variables
            # because the first data collected actually relates to this original
            # position.
            # We do need to fill in fields such as settled, halted, etc.
            p.fill_status_gap()
            # Set the initial note
            p.note = p.outputstring()
            p.notes.append(p.note)
            self.particles.append(p)

        # This is where it makes sense to implement the multiprocessing
        # looping for particles and models. Can handle each particle in
        # parallel probably.
        #
        # Get the number of cores (may take some tuning) and create that
        # many workers then pass particles into the queue for the workers
        mgr = multiprocessing.Manager()
        nproc = multiprocessing.cpu_count() - 1
        if nproc <= 0:
            raise ValueError(
                "Model does not run using less than two CPU cores")

        # Each particle is a task, plus the DataController
        number_of_tasks = len(self.particles) + 1

        # We need a process for each particle and one for the data controller
        nproc = min(number_of_tasks, nproc)

        # When a particle requests data
        data_request_lock = mgr.Lock()
        # PID of process with lock
        has_data_request_lock = mgr.Value('int', -1)

        nproc_lock = mgr.Lock()

        # Create the task queue for all of the particles and the DataController
        tasks = multiprocessing.JoinableQueue(number_of_tasks)
        # Create the result queue for all of the particles and the DataController
        results = mgr.Queue(number_of_tasks)

        # Create the shared state objects
        get_data = mgr.Value('bool', True)
        # Number of tasks
        n_run = mgr.Value('int', number_of_tasks)
        updating = mgr.Value('bool', False)

        # When something is reading from cache file
        read_lock = mgr.Lock()
        # list of PIDs that are reading
        has_read_lock = mgr.list()
        read_count = mgr.Value('int', 0)

        # When something is writing to the cache file
        write_lock = mgr.Lock()
        # PID of process with lock
        has_write_lock = mgr.Value('int', -1)

        point_get = mgr.Value('list', [0, 0, 0])
        active = mgr.Value('bool', True)

        logger.progress((3, "Initializing and caching hydro model's grid"))
        try:
            ds = CommonDataset.open(hydrodataset)
            # Query the dataset for common variable names
            # and the time variable.
            logger.debug("Retrieving variable information from dataset")
            common_variables = self.get_common_variables_from_dataset(ds)

            logger.debug("Pickling time variable to disk for particles")
            timevar = ds.gettimevar(common_variables.get("u"))
            f, timevar_pickle_path = tempfile.mkstemp()
            os.close(f)
            f = open(timevar_pickle_path, "wb")
            pickle.dump(timevar, f)
            f.close()
            ds.closenc()
        except:
            logger.warn("Failed to access remote dataset %s" % hydrodataset)
            raise DataControllerError("Inaccessible DAP endpoint: %s" %
                                      hydrodataset)

        # Add data controller to the queue first so that it
        # can get the initial data and is not blocked

        logger.debug('Starting DataController')
        logger.progress((4, "Starting processes"))
        data_controller = parallel.DataController(hydrodataset,
                                                  common_variables,
                                                  n_run,
                                                  get_data,
                                                  write_lock,
                                                  has_write_lock,
                                                  read_lock,
                                                  read_count,
                                                  time_chunk,
                                                  horiz_chunk,
                                                  times,
                                                  self.start,
                                                  point_get,
                                                  self.reference_location,
                                                  low_memory=low_memory,
                                                  cache=self.cache_path)
        tasks.put(data_controller)
        # Create DataController worker
        data_controller_process = parallel.Consumer(tasks,
                                                    results,
                                                    n_run,
                                                    nproc_lock,
                                                    active,
                                                    get_data,
                                                    name="DataController")
        data_controller_process.start()

        logger.debug('Adding %i particles as tasks' % len(self.particles))
        for part in self.particles:
            forcing = parallel.ForceParticle(
                part,
                hydrodataset,
                common_variables,
                timevar_pickle_path,
                times,
                self.start,
                self._models,
                self.reference_location.point,
                self._use_bathymetry,
                self._use_shoreline,
                self._use_seasurface,
                get_data,
                n_run,
                read_lock,
                has_read_lock,
                read_count,
                point_get,
                data_request_lock,
                has_data_request_lock,
                reverse_distance=self.reverse_distance,
                bathy=self.bathy_path,
                shoreline_path=self.shoreline_path,
                cache=self.cache_path,
                time_method=self.time_method)
            tasks.put(forcing)

        # Create workers for the particles.
        procs = [
            parallel.Consumer(tasks,
                              results,
                              n_run,
                              nproc_lock,
                              active,
                              get_data,
                              name="ForceParticle-%d" % i)
            for i in xrange(nproc - 1)
        ]
        for w in procs:
            w.start()
            logger.debug('Started %s' % w.name)

        # Get results back from queue, test for failed particles
        return_particles = []
        retrieved = 0.
        error_code = 0

        logger.info("Waiting for %i particle results" % len(self.particles))
        logger.progress((5, "Running model"))
        while retrieved < number_of_tasks:
            try:
                # Returns a tuple of code, result
                code, tempres = results.get(timeout=240)
            except Queue.Empty:
                # Poll the active processes to make sure they are all alive and then continue with loop
                if not data_controller_process.is_alive(
                ) and data_controller_process.exitcode != 0:
                    # Data controller is zombied, kill off other processes.
                    get_data.value = False
                    results.put((-2, "DataController"))

                new_procs = []
                old_procs = []
                for p in procs:
                    if not p.is_alive() and p.exitcode != 0:
                        # Do what the Consumer would do if something finished.
                        # Add something to results queue
                        results.put((-3, "ZombieParticle"))
                        # Decrement nproc (DataController exits when this is 0)
                        with nproc_lock:
                            n_run.value = n_run.value - 1

                        # Remove task from queue (so they can be joined later on)
                        tasks.task_done()

                        # Start a new Consumer.  It will exit if there are no tasks available.
                        np = parallel.Consumer(tasks,
                                               results,
                                               n_run,
                                               nproc_lock,
                                               active,
                                               get_data,
                                               name=p.name)
                        new_procs.append(np)
                        old_procs.append(p)

                        # Release any locks the PID had
                        if p.pid in has_read_lock:
                            with read_lock:
                                read_count.value -= 1
                                has_read_lock.remove(p.pid)

                        if has_data_request_lock.value == p.pid:
                            has_data_request_lock.value = -1
                            try:
                                data_request_lock.release()
                            except:
                                pass

                        if has_write_lock.value == p.pid:
                            has_write_lock.value = -1
                            try:
                                write_lock.release()
                            except:
                                pass

                for p in old_procs:
                    try:
                        procs.remove(p)
                    except ValueError:
                        logger.warn(
                            "Did not find %s in the list of processes.  Continuing on."
                            % p.name)

                for p in new_procs:
                    procs.append(p)
                    logger.warn(
                        "Started a new consumer (%s) to replace a zombie consumer"
                        % p.name)
                    p.start()

            else:
                # We got one.
                retrieved += 1
                if code == None:
                    logger.warn("Got an unrecognized response from a task.")
                elif code == -1:
                    logger.warn("Particle %s has FAILED!!" % tempres.uid)
                elif code == -2:
                    error_code = code
                    logger.warn(
                        "DataController has FAILED!!  Removing cache file so the particles fail."
                    )
                    try:
                        os.remove(self.cache_path)
                    except OSError:
                        logger.debug(
                            "Could not remove cache file, it probably never existed"
                        )
                        pass
                elif code == -3:
                    error_code = code
                    logger.info(
                        "A zombie process was caught and task was removed from queue"
                    )
                elif isinstance(tempres, Particle):
                    logger.info("Particle %d finished" % tempres.uid)
                    return_particles.append(tempres)
                    # We multiply by 90 here to reserve the rest of the progress bar for exporting
                    logger.progress(
                        (round((retrieved / number_of_tasks) * 90.,
                               1), "Particle %d finished" % tempres.uid))
                elif tempres == "DataController":
                    logger.info("DataController finished")
                    logger.progress((round((retrieved / number_of_tasks) * 90.,
                                           1), "DataController finished"))
                else:
                    logger.info("Got a strange result on results queue")
                    logger.info(str(tempres))

                logger.info("Retrieved %i/%i results" %
                            (int(retrieved), number_of_tasks))

        if len(return_particles) != len(self.particles):
            logger.warn(
                "Some particles failed and are not included in the output")

        # The results queue should be empty at this point
        assert results.empty() is True

        # Should be good to join on the tasks now that the queue is empty
        logger.info("Joining the task queue")
        tasks.join()

        # Join all processes
        logger.info("Joining the processes")
        for w in procs + [data_controller_process]:
            # Wait 10 seconds
            w.join(10.)
            if w.is_alive():
                # Process is hanging, kill it.
                logger.info(
                    "Terminating %s forcefully.  This should have exited itself."
                    % w.name)
                w.terminate()

        logger.info('Workers complete')

        self.particles = return_particles

        # Remove Manager so it shuts down
        del mgr

        # Remove pickled timevar
        os.remove(timevar_pickle_path)

        # Remove the cache file
        if remove_cache is True:
            try:
                os.remove(self.cache_path)
            except OSError:
                logger.debug(
                    "Could not remove cache file, it probably never existed")

        logger.progress((96, "Exporting results"))

        if len(self.particles) > 0:
            # If output_formats and path specified,
            # output particle run data to disk when completed
            if "output_formats" in kwargs:
                # Make sure output_path is also included
                if kwargs.get("output_path", None) != None:
                    formats = kwargs.get("output_formats")
                    output_path = kwargs.get("output_path")
                    if isinstance(formats, list):
                        for format in formats:
                            logger.info("Exporting to: %s" % format)
                            try:
                                self.export(output_path, format=format)
                            except Exception:
                                logger.error("Failed to export to: %s" %
                                             format)
                    else:
                        logger.warn(
                            'The output_formats parameter should be a list, not saving any output!'
                        )
                else:
                    logger.warn(
                        'No output path defined, not saving any output!')
            else:
                logger.warn('No output format defined, not saving any output!')
        else:
            logger.warn("Model didn't actually do anything, check the log.")
            if error_code == -2:
                raise DataControllerError("Error in the DataController")
            else:
                raise ModelError("Error in the model")

        logger.progress((99, "Model Run Complete"))
        return
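
For illustration, a standalone sketch of the kwargs contract enforced by the export block above.  The helper name and the format string in the usage note are assumptions for this sketch, not part of the snippet.

def validate_export_kwargs(**kwargs):
    # Mirrors the checks above: output_formats must be a list and
    # output_path must also be supplied, otherwise nothing is exported.
    formats = kwargs.get("output_formats")
    output_path = kwargs.get("output_path", None)
    if formats is None:
        return "No output format defined, not saving any output!"
    if output_path is None:
        return "No output path defined, not saving any output!"
    if not isinstance(formats, list):
        return "The output_formats parameter should be a list, not saving any output!"
    return "OK: exporting %s to %s" % (", ".join(formats), output_path)

# e.g. validate_export_kwargs(output_formats=["NetCDF"], output_path="/tmp/output")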
Example #46
0
    def test_rgrid_regrid_4d(self):
        from paegan.utils.asainterpolate import create_grid
        datafile = os.path.join(data_path, "pws_L2_2012040100.nc")
        pd = CommonDataset.open(datafile)
        assert pd._datasettype == 'rgrid'
        var = "u"
        lon = [
            -148.25, -148.24, -148.23, -148.22, -148.21, -148.2, -148.19,
            -148.18, -148.17, -148.16, -148.15, -148.14, -148.13, -148.12,
            -148.11, -148.1, -148.09, -148.08, -148.07, -148.06, -148.05,
            -148.04, -148.03, -148.02, -148.01, -148.0, -147.99, -147.98,
            -147.97, -147.96, -147.95, -147.94, -147.93, -147.92, -147.91,
            -147.9, -147.89, -147.88, -147.87, -147.86, -147.85, -147.84,
            -147.83, -147.82, -147.81, -147.8, -147.79, -147.78, -147.77,
            -147.76, -147.75, -147.74, -147.73, -147.72, -147.71, -147.7,
            -147.69, -147.68, -147.67, -147.66, -147.65, -147.64, -147.63,
            -147.62, -147.61, -147.6, -147.59, -147.58, -147.57, -147.56,
            -147.55, -147.54, -147.53, -147.52, -147.51, -147.5, -147.49,
            -147.48, -147.47, -147.46, -147.45, -147.44, -147.43, -147.42,
            -147.41, -147.4, -147.39, -147.38, -147.37, -147.36, -147.35,
            -147.34, -147.33, -147.32, -147.31, -147.3, -147.29, -147.28,
            -147.27, -147.26, -147.25, -147.24, -147.23, -147.22, -147.21,
            -147.2, -147.19, -147.18, -147.17, -147.16, -147.15, -147.14,
            -147.13, -147.12, -147.11, -147.1, -147.09, -147.08, -147.07,
            -147.06, -147.05, -147.04, -147.03, -147.02, -147.01, -147.0,
            -146.99, -146.98, -146.97, -146.96, -146.95, -146.94, -146.93,
            -146.92, -146.91, -146.9, -146.89, -146.88, -146.87, -146.86,
            -146.85, -146.84, -146.83, -146.82, -146.81, -146.8, -146.79,
            -146.78, -146.77, -146.76, -146.75, -146.74, -146.73, -146.72,
            -146.71, -146.7, -146.69, -146.68, -146.67, -146.66, -146.65,
            -146.64, -146.63, -146.62, -146.61, -146.6, -146.59, -146.58,
            -146.57, -146.56, -146.55, -146.54, -146.53, -146.52, -146.51,
            -146.5, -146.49, -146.48, -146.47, -146.46, -146.45, -146.44,
            -146.43, -146.42, -146.41, -146.4, -146.39, -146.38, -146.37,
            -146.36, -146.35, -146.34, -146.33, -146.32, -146.31, -146.3,
            -146.29, -146.28, -146.27, -146.26, -146.25, -146.24, -146.23,
            -146.22, -146.21, -146.2, -146.19, -146.18, -146.17, -146.16,
            -146.15, -146.14, -146.13, -146.12, -146.11, -146.1, -146.09,
            -146.08, -146.07, -146.06, -146.05, -146.04, -146.03, -146.02,
            -146.01, -146.0, -145.99, -145.98, -145.97, -145.96, -145.95,
            -145.94, -145.93, -145.92, -145.91, -145.9, -145.89, -145.88,
            -145.87, -145.86, -145.85, -145.84, -145.83, -145.82, -145.81,
            -145.8, -145.79, -145.78, -145.77, -145.76, -145.75, -145.74,
            -145.73, -145.72, -145.71, -145.7, -145.69, -145.68, -145.67,
            -145.66, -145.65, -145.64, -145.63, -145.62, -145.61, -145.6,
            -145.59, -145.58, -145.57, -145.56, -145.55, -145.54, -145.53,
            -145.52, -145.51, -145.5, -145.49, -145.48, -145.47, -145.46,
            -145.45, -145.44, -145.43, -145.42, -145.41, -145.4, -145.39,
            -145.38, -145.37, -145.36, -145.35, -145.34, -145.33, -145.32,
            -145.31, -145.3, -145.29, -145.28, -145.27, -145.26, -145.25,
            -145.24, -145.23, -145.22, -145.21, -145.2, -145.19, -145.18,
            -145.17, -145.16, -145.15, -145.14, -145.13, -145.12, -145.11,
            -145.1, -145.09, -145.08, -145.07, -145.06, -145.05, -145.04,
            -145.03, -145.02, -145.01, -145.0, -144.99, -144.98, -144.97,
            -144.96, -144.95, -144.94, -144.93, -144.92, -144.91, -144.9,
            -144.89, -144.88, -144.87, -144.86, -144.85, -144.84, -144.83,
            -144.82, -144.81, -144.8, -144.79
        ]
        lat = [
            59.68, 59.69, 59.7, 59.71, 59.72, 59.73, 59.74, 59.75, 59.760002,
            59.77, 59.78, 59.79, 59.8, 59.81, 59.82, 59.83, 59.84, 59.85,
            59.86, 59.87, 59.88, 59.89, 59.9, 59.91, 59.920002, 59.93, 59.94,
            59.95, 59.96, 59.97, 59.98, 59.99, 60.0, 60.010002, 60.02, 60.03,
            60.04, 60.05, 60.06, 60.07, 60.08, 60.09, 60.1, 60.11, 60.12,
            60.13, 60.14, 60.15, 60.16, 60.170002, 60.18, 60.19, 60.2, 60.21,
            60.22, 60.23, 60.24, 60.25, 60.260002, 60.27, 60.28, 60.29, 60.3,
            60.31, 60.32, 60.33, 60.34, 60.35, 60.36, 60.37, 60.38, 60.39,
            60.4, 60.41, 60.420002, 60.43, 60.44, 60.45, 60.46, 60.47, 60.48,
            60.49, 60.5, 60.510002, 60.52, 60.53, 60.54, 60.55, 60.56, 60.57,
            60.58, 60.59, 60.6, 60.61, 60.62, 60.63, 60.64, 60.65, 60.66,
            60.670002, 60.68, 60.69, 60.7, 60.71, 60.72, 60.73, 60.74, 60.75,
            60.760002, 60.77, 60.78, 60.79, 60.8, 60.81, 60.82, 60.83, 60.84,
            60.85, 60.86, 60.87, 60.88, 60.89, 60.9, 60.91, 60.920002, 60.93,
            60.94, 60.95, 60.96, 60.97, 60.98, 60.99, 61.0, 61.010002, 61.02,
            61.03, 61.04, 61.05, 61.06, 61.07, 61.08, 61.09, 61.1, 61.11,
            61.12, 61.13, 61.14, 61.15, 61.16, 61.170002, 61.18, 61.19, 61.2
        ]
        lon, lat = np.asarray(lon), np.asarray(lat)
        data1 = pd.get_values(var, bbox=(-149, 59, -144, 61.5))
        coords_struct = pd.sub_coords(var, bbox=(-149, 59, -144, 61.5))
        data2 = pd.get_values_on_grid(var,
                                      coords_struct.x,
                                      coords_struct.y,
                                      t=coords_struct.time,
                                      z=coords_struct.z)

        pd.closenc()
        assert np.all(data1 == data2)
    def __call__(self, proc, active):
        c = 0
        
        self.dataset = CommonDataset.open(self.url)
        self.proc = proc
        self.remote = self.dataset.nc
        cachepath = self.cache_path
        
        # Calculate the datetimes of the model timesteps like
        # the particle objects do, so we can figure out unique
        # time indices
        modelTimestep, newtimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start_time)

        timevar = self.dataset.gettimevar(self.uname)

        # The last datetime is not needed for forcing; it is only used to
        # timestamp the final particle positions.
        time_indexs = timevar.nearest_index(newtimes[0:-1], select='before')
        
        # Have to make sure that we get the plus 1 for the
        # linear interpolation of u,v,w,temp,salt
        self.inds = np.unique(time_indexs)
        self.inds = np.append(self.inds, self.inds.max()+1)
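        # For example (hypothetical values), if time_indexs were [3, 3, 4, 5],
        # self.inds becomes [3, 4, 5, 6]: the unique indices plus one extra
        # trailing index to bracket the linear interpolation.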
        
        # Stay alive while particles are still running; break out of the
        # loop once they have all finished.
        while self.n_run.value > 1:
            logger.debug("Particles are still running, waiting for them to request data...")
            timer.sleep(2)
            # If particle asks for data, do the following
            if self.get_data.value == True:
                logger.debug("Particle asked for data!")

                # Wait for particles to get out
                while True:
                    self.read_lock.acquire()

                    logger.debug("Read count: %d" % self.read_count.value)
                    if self.read_count.value > 0:
                        logger.debug("Waiting for write lock on cache file (particles must stop reading)...")
                        self.read_lock.release()
                        timer.sleep(4)
                    else:
                        break
                    
                # Get write lock on the file.  Already have read lock.
                self.write_lock.acquire()
                self.has_write_lock.value = os.getpid()

                if c == 0:
                    logger.debug("Creating cache file")
                    try:
                        # Open local cache for writing, overwrites
                        # existing file with same name
                        self.local = netCDF4.Dataset(cachepath, 'w')

                        indices = self.dataset.get_indices(self.uname, timeinds=[np.asarray([0])], point=self.start)
                        self.point_get.value = [self.inds[0], indices[-2], indices[-1]]
                        
                        # Create dimensions for u and v variables
                        self.local.createDimension('time', None)
                        self.local.createDimension('level', None)
                        self.local.createDimension('x', None)
                        self.local.createDimension('y', None)
                        
                        # Create 3d or 4d u and v variables
                        if self.remote.variables[self.uname].ndim == 4:
                            self.ndim = 4
                            dimensions = ('time', 'level', 'y', 'x')
                            coordinates = "time z lon lat"
                        elif self.remote.variables[self.uname].ndim == 3:
                            self.ndim = 3
                            dimensions = ('time', 'y', 'x')
                            coordinates = "time lon lat"
                        shape = self.remote.variables[self.uname].shape

                        # If the remote variable does not define a missing_value,
                        # fall back to np.nan.  Sometimes this works out correctly
                        # and other times we end up with a huge cache file.
                        try:
                            fill = self.remote.variables[self.uname].missing_value
                        except Exception:
                            fill = np.nan
                        
                        # Create domain variable that specifies
                        # where there is data geographically/by time
                        # and where there is not data,
                        #   Used for testing if particle needs to 
                        #   ask cache to update
                        domain = self.local.createVariable('domain', 'i', dimensions, zlib=False, fill_value=0)
                        domain.coordinates = coordinates
                                
                        # Create local u and v variables
                        u = self.local.createVariable('u', 'f', dimensions, zlib=False, fill_value=fill)
                        v = self.local.createVariable('v', 'f', dimensions, zlib=False, fill_value=fill)
                        
                        v.coordinates = coordinates
                        u.coordinates = coordinates

                        localvars = [u, v,]
                        remotevars = [self.remote.variables[self.uname], self.remote.variables[self.vname]]
                        
                        # Create local w variable
                        if self.wname != None:
                            w = self.local.createVariable('w', 'f', dimensions, zlib=False, fill_value=fill)
                            w.coordinates = coordinates
                            localvars.append(w)
                            remotevars.append(self.remote.variables[self.wname])

                        if self.temp_name != None and self.salt_name != None: 
                            # Create local temp and salt vars       
                            temp = self.local.createVariable('temp', 'f', dimensions, zlib=False, fill_value=fill)
                            salt = self.local.createVariable('salt', 'f', dimensions, zlib=False, fill_value=fill)
                            temp.coordinates = coordinates
                            salt.coordinates = coordinates
                            localvars.append(temp)
                            localvars.append(salt)
                            remotevars.append(self.remote.variables[self.temp_name])
                            remotevars.append(self.remote.variables[self.salt_name])
                        
                        # Create local lat/lon coordinate variables
                        if self.remote.variables[self.xname].ndim == 2:
                            lon = self.local.createVariable('lon', 'f', ("y", "x"), zlib=False)
                            lon[:] = self.remote.variables[self.xname][:, :]
                            lat = self.local.createVariable('lat', 'f', ("y", "x"), zlib=False)
                            lat[:] = self.remote.variables[self.yname][:, :]
                        if self.remote.variables[self.xname].ndim == 1:
                            lon = self.local.createVariable('lon', 'f', ("x"), zlib=False)
                            lon[:] = self.remote.variables[self.xname][:]
                            lat = self.local.createVariable('lat', 'f', ("y"), zlib=False)
                            lat[:] = self.remote.variables[self.yname][:]                           
                            
                        # Create local z variable
                        if self.zname != None:            
                            if self.remote.variables[self.zname].ndim == 4:
                                z = self.local.createVariable('z', 'f', ("time","level","y","x"), zlib=False)  
                                remotez = self.remote.variables[self.zname]
                                localvars.append(z)
                                remotevars.append(remotez)
                            elif self.remote.variables[self.zname].ndim == 3:
                                z = self.local.createVariable('z', 'f', ("level","y","x"), zlib=False)
                                z[:] = self.remote.variables[self.zname][:, :, :]
                            elif self.remote.variables[self.zname].ndim == 1:
                                z = self.local.createVariable('z', 'f', ("level",), zlib=False)
                                z[:] = self.remote.variables[self.zname][:]
                                
                        # Create local time variable
                        time = self.local.createVariable('time', 'f8', ("time",), zlib=False)
                        if self.tname != None:
                            time[:] = self.remote.variables[self.tname][self.inds]
                        
                        if self.point_get.value[0]+self.time_size > np.max(self.inds):
                            current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1)
                        else:
                            current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size)
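                        # For example (hypothetical values), with point_get.value[0] = 4,
                        # time_size = 2 and max(inds) = 10, current_inds is [4, 5]; a window
                        # that would run past max(inds) is clipped to end at max(inds).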
                        
                        # Get data from remote dataset and add
                        # to local cache  
                        while True:
                            try:
                                self.get_remote_data(localvars, remotevars, current_inds, shape)
                            except Exception:
                                logger.warn("DataController failed to get remote data.  Trying again in 30 seconds")
                                timer.sleep(30)
                            else:
                                break
                        
                        c += 1
                    except StandardError:
                        logger.error("DataController failed to get data (first request)")
                        raise
                    finally:
                        self.local.sync()
                        self.local.close()
                        self.has_write_lock.value = -1
                        self.write_lock.release()
                        self.get_data.value = False
                        self.read_lock.release()
                        logger.debug("Done updating cache file, closing file, and releasing locks")
                else:
                    logger.debug("Updating cache file")
                    try:
                        # Open local cache dataset for appending
                        self.local = netCDF4.Dataset(cachepath, 'a')
                        
                        # Create local and remote variable objects
                        # for the variables of interest  
                        u = self.local.variables['u']
                        v = self.local.variables['v']
                        time = self.local.variables['time']
                        remoteu = self.remote.variables[self.uname]
                        remotev = self.remote.variables[self.vname]
                        
                        # Create lists of variable objects for
                        # the data updater
                        localvars = [u, v, ]
                        remotevars = [remoteu, remotev, ]
                        if self.salt_name != None and self.temp_name != None:
                            salt = self.local.variables['salt']
                            temp = self.local.variables['temp']
                            remotesalt = self.remote.variables[self.salt_name]
                            remotetemp = self.remote.variables[self.temp_name]
                            localvars.append(salt)
                            localvars.append(temp)
                            remotevars.append(remotesalt)
                            remotevars.append(remotetemp)
                        if self.wname != None:
                            w = self.local.variables['w']
                            remotew = self.remote.variables[self.wname]
                            localvars.append(w)
                            remotevars.append(remotew)
                        if self.zname != None:
                            remotez = self.remote.variables[self.zname]
                            if remotez.ndim == 4:
                                z = self.local.variables['z']
                                localvars.append(z)
                                remotevars.append(remotez)
                        if self.tname != None:
                            remotetime = self.remote.variables[self.tname]
                            time[:] = remotetime[self.inds]
                        
                        if self.point_get.value[0]+self.time_size > np.max(self.inds):
                            current_inds = np.arange(self.point_get.value[0], np.max(self.inds)+1)
                        else:
                            current_inds = np.arange(self.point_get.value[0],self.point_get.value[0] + self.time_size)
                        
                        # Get data from remote dataset and add
                        # to local cache
                        while True:
                            try:
                                self.get_remote_data(localvars, remotevars, current_inds, shape)
                            except Exception:
                                logger.warn("DataController failed to get remote data.  Trying again in 30 seconds")
                                timer.sleep(30)
                            else:
                                break
                        
                        c += 1
                    except StandardError:
                        logger.error("DataController failed to get data (not first request)")
                        raise
                    finally:
                        self.local.sync()
                        self.local.close()
                        self.has_write_lock.value = -1
                        self.write_lock.release()
                        self.get_data.value = False
                        self.read_lock.release()
                        logger.debug("Done updating cache file, closing file, and releasing locks")
            else:
                pass        

        self.dataset.closenc()

        return "DataController"
# ##### Get bounding polygons from each dataset 

# <codecell>

from paegan.cdm.dataset import CommonDataset

lookup_standard_name = "sea_water_temperature"

# Filter out DAP servers that are taking FOREVER
dap_urls = [url for url in dap_urls if "data1.gfdl.noaa.gov" not in url]

dataset_polygons = {}
for i, dap in enumerate(dap_urls):
    print '(%d/%d)' % (i+1, len(dap_urls)),
    try:
        cd = CommonDataset.open(dap)
    except BaseException:
        print "Could not access", dap
        continue
    
    try:
        var = cd.get_varname_from_stdname(standard_name=lookup_standard_name)[0]
        dataset_polygons[dap] = cd.getboundingpolygon(var=var)
        print "Retrieved bounding polygon from %s" % dap
    except (IndexError, AssertionError):
        print "No standard_name '%s' in '%s'" % (lookup_standard_name, dap)

# <markdowncell>

# ##### Overlay dataset polygons on top of Important Bird Area polygons

# <codecell>
    def setup_run(self, hydrodataset, **kwargs):

        self.hydrodataset = hydrodataset

        logger.setLevel(logging.PROGRESS)

        # Relax.
        time.sleep(0.5)

        # Add ModelController description to logfile
        logger.info(str(self))

        # Add the model descriptions to logfile
        for m in self._models:
            logger.info(str(m))

        # Calculate the model timesteps
        # We need len(times) == self._nstep + 1, since data is stored one timestep
        # after a particle is forced with the final timestep's data.
        self.times = list(range(0, (self._step*self._nstep)+1, self._step))
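        # For example (hypothetical values), self._step = 3600 and self._nstep = 4
        # gives self.times = [0, 3600, 7200, 10800, 14400], i.e. nstep + 1 entries.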
        # Calculate a datetime object for each model timestep
        # This method is duplicated in CachingDataController and CachingForcer
        # using the 'times' variables above.  Will be useful in those other
        # locations for particles released at different times
        # i.e. released over a few days
        self.modelTimestep, self.datetimes = AsaTransport.get_time_objects_from_model_timesteps(self.times, start=self.start)

        logger.progress((1, "Setting up particle start locations"))
        point_locations = []
        if isinstance(self.geometry, Point):
            point_locations = [self.reference_location] * self._npart
        elif isinstance(self.geometry, Polygon) or isinstance(self.geometry, MultiPolygon):
            point_locations = [Location4D(latitude=loc.y, longitude=loc.x, depth=self._depth, time=self.start) for loc in AsaTransport.fill_polygon_with_points(goal=self._npart, polygon=self.geometry)]

        # Initialize the particles
        logger.progress((2, "Initializing particles"))
        for x in range(0, self._npart):
            p = LarvaParticle(id=x)
            p.location = point_locations[x]
            # We don't need to fill the location gaps here for environment variables
            # because the first data collected actually relates to this original
            # position.
            # We do need to fill in fields such as settled, halted, etc.
            p.fill_status_gap()
            # Set the initial note
            p.note = p.outputstring()
            p.notes.append(p.note)
            self.particles.append(p)

        logger.progress((3, "Initializing and caching hydro model's grid %s" % self.hydrodataset))
        try:
            ds = CommonDataset.open(self.hydrodataset)
            # Query the dataset for common variable names
            # and the time variable.
            logger.debug("Retrieving variable information from dataset")
            self.common_variables = self.get_common_variables_from_dataset(ds)
        except Exception:
            logger.exception("Failed to access dataset %s" % self.hydrodataset)
            raise BaseDataControllerError("Inaccessible Dataset: %s" % self.hydrodataset)

        self.timevar = None
        try:
            assert self.common_variables.get("u") in ds._current_variables
            assert self.common_variables.get("v") in ds._current_variables
            assert self.common_variables.get("x") in ds._current_variables
            assert self.common_variables.get("y") in ds._current_variables

            self.timevar = ds.gettimevar(self.common_variables.get("u"))
            model_start = self.timevar.get_dates()[0]
            model_end = self.timevar.get_dates()[-1]
        except AssertionError:
            logger.exception("Could not locate variables needed to run model: %s" % str(self.common_variables))
            raise BaseDataControllerError("A required data variable was not found in %s" % self.hydrodataset)
        finally:
            ds.closenc()

        try:
            assert self.start > model_start
            assert self.start < model_end
        except AssertionError:
            raise BaseDataControllerError("Start time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[0], model_start, model_end))

        try:
            assert self.datetimes[-1] > model_start
            assert self.datetimes[-1] < model_end
        except AssertionError:
            raise BaseDataControllerError("End time for model (%s) is not available in source dataset (%s/%s)" % (self.datetimes[-1], model_start, model_end))
Example #50
0
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        try:
            cd = CommonDataset.open(self.service.get('url'))
        except Exception as e:
            app.logger.error("Could not open DAP dataset from '%s'\n"
                             "Exception %s: %s" %
                             (self.service.get('url'), type(e).__name__, e))
            return 'Not harvested'

        # Prefer the time values stored in the file over global attributes
        # when calculating the start/end times of the dataset.
        tmin, tmax = self.get_min_max_time(cd)
        # if nothing was returned, try to get from global atts
        if (tmin == None and tmax == None
                and 'time_coverage_start' in cd.metadata
                and 'time_coverage_end' in cd.metadata):
            try:
                tmin, tmax = (parse(cd.metadata[t])
                              for t in ('time_coverage_start',
                                        'time_coverage_end'))
            except ValueError:
                tmin, tmax = None, None
        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(
                u"Could not get dataset name.  No global attribute named 'title'."
            )

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(
                u"Could not get dataset description.  No global attribute named 'summary'."
            )

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(
                map(lambda x: unicode(x.strip()),
                    cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(
                u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list."
            )

        # VARIABLES
        prefix = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(
                cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [
                re.compile("CF-"),
                re.compile(
                    'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html'
                )
            ]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [
            cd.get_varname_from_stdname(x)[0]
            for x in self.get_standard_variables(cd.nc)
            if x not in self.STD_AXIS_NAMES and
            len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0
        ]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(
            set([
                x for x in cd.nc.variables if x not in itertools.chain(
                    _possibley, _possiblex, _possiblez, _possiblet,
                    self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and
                len(cd.nc.variables[x].shape) > 0 and x not in std_variables
            ]))

        axis_names = DapHarvest.get_axis_variables(cd.nc)
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX

        # paegan does not support ugrid, so try to detect this condition and skip
        is_ugrid = False
        is_trajectory = False
        for vname, v in cd.nc.variables.iteritems():
            if 'cf_role' in v.ncattrs():
                if v.getncattr('cf_role') == 'mesh_topology':
                    is_ugrid = True
                    break
                elif v.getncattr('cf_role') == 'trajectory_id':
                    is_trajectory = True
                    break

        gj = None

        if is_ugrid:
            messages.append(
                u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry."
            )
        elif is_trajectory:
            coord_names = {}
            # try to get info for x, y, z, t axes
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    coord_names = cd.get_coord_names(v, **axis_names)

                    if coord_names['xname'] is not None and \
                       coord_names['yname'] is not None:
                        break
                except (AssertionError, AttributeError, ValueError, KeyError):
                    pass
            else:
                messages.append(
                    u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library."
                )

            if 'xname' in coord_names:
                try:
                    xvar = cd.nc.variables[coord_names['xname']]
                    yvar = cd.nc.variables[coord_names['yname']]

                    # one less order of magnitude eg 390000 -> 10000
                    slice_factor = 10**(int(math.log10(xvar.size)) - 1)
                    if slice_factor < 1:
                        slice_factor = 1

                    # TODO: don't split x/y as separate arrays.  Refactor to
                    # use single numpy array instead with both lon/lat

                    # tabledap datasets must be treated differently than
                    # standard DAP endpoints.  Retrieve geojson instead of
                    # trying to access as a DAP endpoint
                    if 'erddap/tabledap' in unique_id:
                        # take off 's.' from erddap
                        gj = self.erddap_geojson_url(coord_names)
                        # type defaults to MultiPoint, change to LineString
                        coords = np.array(gj['coordinates'][::slice_factor] +
                                          gj['coordinates'][-1:])
                        xs = coords[:, 0]
                        ys = coords[:, 1]
                    else:
                        xs = np.concatenate((xvar[::slice_factor], xvar[-1:]))
                        ys = np.concatenate((yvar[::slice_factor], yvar[-1:]))
                    # both coords must be valid to have a valid vertex
                    # get rid of any nans and unreasonable lon/lats
                    valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) &
                                 (~np.isnan(ys)) & (np.absolute(ys) <= 90))

                    xs = xs[valid_idx]
                    ys = ys[valid_idx]
                    # Shapely seems to require float64 values or incorrect
                    # values will propagate for the generated lineString
                    # if the array is not numpy's float64 dtype
                    lineCoords = np.array([xs, ys]).T.astype('float64')

                    gj = mapping(asLineString(lineCoords))

                    messages.append(u"Variable %s was used to calculate "
                                    u"trajectory geometry, and is a "
                                    u"naive sampling." % v)

                except (AssertionError, AttributeError, ValueError, KeyError,
                        IndexError) as e:
                    app.logger.warn("Trajectory error occured: %s", e)
                    messages.append(
                        u"Trajectory discovered but could not create a geometry."
                    )

        else:
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    gj = mapping(
                        cd.getboundingpolygon(var=v,
                                              **axis_names).simplify(0.5))
                except (AttributeError, AssertionError, ValueError, KeyError,
                        IndexError):
                    try:
                        # Could not compute a bounding polygon; log the failure
                        # and fall back to a bounding box.
                        app.logger.exception("Error calculating bounding box")

                        # get_bbox_or_point handles "points", i.e. single-position NCELLs
                        bbox = cd.getbbox(var=v, **axis_names)
                        gj = self.get_bbox_or_point(bbox)

                    except (AttributeError, AssertionError, ValueError,
                            KeyError, IndexError):
                        pass

                if gj is not None:
                    # We computed something, break out of loop.
                    messages.append(
                        u"Variable %s was used to calculate geometry." % v)
                    break

            if gj is None:  # Try the globals
                gj = self.global_bounding_box(cd.nc)
                if gj is not None:
                    messages.append(
                        u"Bounding Box calculated using global attributes")
            if gj is None:
                messages.append(
                    u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset."
                )
                messages.append(
                    u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset."
                )
                messages.append(
                    u"Failed to calculate geometry using all of the following variables: %s"
                    % ", ".join(
                        itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes

        final_var_names = []
        if prefix == "":
            messages.append(
                u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities."
            )
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(
                map(unicode, [
                    "%s%s" %
                    (prefix, cd.nc.variables[x].getncattr("standard_name"))
                    for x in std_variables
                ]))

        service = {
            'name':
            name,
            'description':
            description,
            'service_type':
            self.service.get('service_type'),
            'service_id':
            ObjectId(self.service.get('_id')),
            'data_provider':
            self.service.get('data_provider'),
            'metadata_type':
            u'ncml',
            'metadata_value':
            unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'time_min':
            tmin,
            'time_max':
            tmax,
            'messages':
            map(unicode, messages),
            'keywords':
            keywords,
            'variables':
            map(unicode, final_var_names),
            'asset_type':
            get_common_name(DapHarvest.get_asset_type(cd)),
            'geojson':
            gj,
            'updated':
            datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        ncdataset = Dataset(self.service.get('url'))
        scores = self.ccheck_dataset(ncdataset)
        metamap = self.metamap_dataset(ncdataset)

        try:
            metadata_rec = self.save_ccheck_dataset('ioos', dataset._id,
                                                    scores, metamap)
        except Exception as e:
            metadata_rec = None
            app.logger.error(
                "could not save compliancecheck/metamap information",
                exc_info=True)

        return "Harvested"
Example #51
0
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        METADATA_VAR_NAMES = [u'crs', u'projection']

        # CF standard names for Axis
        STD_AXIS_NAMES = [
            u'latitude', u'longitude', u'time', u'forecast_reference_time',
            u'forecast_period', u'ocean_sigma', u'ocean_s_coordinate_g1',
            u'ocean_s_coordinate_g2', u'ocean_s_coordinate',
            u'ocean_double_sigma', u'ocean_sigma_over_z',
            u'projection_y_coordinate', u'projection_x_coordinate'
        ]

        # Some datasets don't define standard_names on axis variables.  This is used to weed them out based on the
        # actual variable name
        COMMON_AXIS_NAMES = [
            u'x', u'y', u'lat', u'latitude', u'lon', u'longitude', u'time',
            u'time_run', u'time_offset', u'ntimes', u'lat_u', u'lon_u',
            u'lat_v', u'lon_v', u'lat_rho', u'lon_rho', u'lat_psi'
        ]

        cd = CommonDataset.open(self.service.get('url'))

        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(
                u"Could not get dataset name.  No global attribute named 'title'."
            )

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(
                u"Could not get dataset description.  No global attribute named 'summary'."
            )

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(
                map(lambda x: unicode(x.strip()),
                    cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(
                u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list."
            )

        # VARIABLES
        prefix = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(
                cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [
                re.compile("CF-"),
                re.compile(
                    'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html'
                )
            ]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [
            cd.get_varname_from_stdname(x)[0]
            for x in self.get_standard_variables(cd.nc)
            if x not in STD_AXIS_NAMES and
            len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0
        ]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(
            set([
                x for x in cd.nc.variables if x not in itertools.chain(
                    _possibley, _possiblex, _possiblez, _possiblet,
                    METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and
                len(cd.nc.variables[x].shape) > 0 and x not in std_variables
            ]))
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX
        gj = None
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                gj = mapping(cd.getboundingpolygon(var=v))
            except (AttributeError, AssertionError, ValueError):
                try:
                    # get_bbox returns a tuple of four coordinates, but box() takes four
                    # separate positional arguments, so the asterisk unpacks the tuple.
                    gj = mapping(box(*cd.get_bbox(var=v)))
                except (AttributeError, AssertionError, ValueError):
                    pass

            if gj is not None:
                # We computed something, break out of loop.
                messages.append(
                    u"Variable %s was used to calculate geometry." % v)
                break

        if gj is None:
            messages.append(
                u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset."
            )
            messages.append(
                u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset."
            )
            messages.append(
                u"Failed to calculate geometry using all of the following variables: %s"
                % ", ".join(itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes

        final_var_names = []
        if prefix == "":
            messages.append(
                u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities."
            )
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(
                map(unicode, [
                    "%s%s" %
                    (prefix, cd.nc.variables[x].getncattr("standard_name"))
                    for x in std_variables
                ]))

        service = {
            'name':
            name,
            'description':
            description,
            'service_type':
            self.service.get('service_type'),
            'service_id':
            ObjectId(self.service.get('_id')),
            'data_provider':
            self.service.get('data_provider'),
            'metadata_type':
            u'ncml',
            'metadata_value':
            unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages':
            map(unicode, messages),
            'keywords':
            keywords,
            'variables':
            map(unicode, final_var_names),
            'asset_type':
            unicode(cd._datasettype).upper(),
            'geojson':
            gj,
            'updated':
            datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        return "Harvested"