def test_validate_uri_opendap_does_not_exist(self):
    """validate_uri must raise ConnectionError carrying the uri for a dead OPeNDAP url."""
    uri = 'http://www.ifremer.fr/opendap/cerdap1/globcurrent/' \
          'v2.0/global_012_deg/geostrophic/2014/001/' \
          '20140101000000-GLOBCURRENT-L4-CURgeo_0m-ALT_OI-v02.0-fv01.0.nc.tull'
    with self.assertRaises(ConnectionError) as raised:
        utils.validate_uri(uri)
    # The offending uri is expected as the first exception argument
    self.assertEqual(uri, raised.exception.args[0])
def get_or_create(self, uri, force):
    """Get or ingest a Dataset for <uri> using Nansat metadata.

    Parameters
    ----------
    uri : str
        URI to a file or stream openable by Nansat
    force : bool
        if True and <uri> is already ingested, delete the existing
        dataset and re-ingest it

    Returns
    -------
    dataset, created : Dataset, bool
    """
    # Validate uri - this should raise an exception if the uri doesn't
    # point to a valid file or stream
    validate_uri(uri)

    # Several datasets can refer to the same uri (e.g., scatterometers and svp drifters), so we
    # need to pass uri_filter_args
    uris = DatasetURI.objects.filter(uri=uri)

    # If the ingested uri is already in the database and not <force> ingestion then stop
    if uris.exists() and not force:
        return uris[0].dataset, False
    elif uris.exists() and force:
        uris[0].dataset.delete()

    # Open file with Nansat
    n = Nansat(nansat_filename(uri))

    # get metadata from Nansat and get objects from vocabularies
    n_metadata = n.get_metadata()

    # set compulsory metadata (source)
    platform, _ = Platform.objects.get_or_create(
        json.loads(n_metadata['platform']))
    instrument, _ = Instrument.objects.get_or_create(
        json.loads(n_metadata['instrument']))
    specs = n_metadata.get('specs', '')
    source, _ = Source.objects.get_or_create(platform=platform,
                                             instrument=instrument,
                                             specs=specs)

    footprint = Polygon(list(zip(*n.get_border())))
    geolocation = GeographicLocation.objects.get_or_create(
        geometry=footprint)[0]
    data_center = DataCenter.objects.get_or_create(
        json.loads(n_metadata['Data Center']))[0]
    iso_category = ISOTopicCategory.objects.get_or_create(
        pti.get_iso19115_topic_category('Oceans'))[0]
    location = Location.objects.get_or_create(
        json.loads(n_metadata['gcmd_location']))[0]

    # create dataset
    # BUG FIX: entry_id was previously a lambda, which get_or_create would
    # use as the literal field value instead of a string - generate it here
    ds, created = Dataset.objects.get_or_create(
        time_coverage_start=make_aware(n.time_coverage_start),
        time_coverage_end=make_aware(
            n.time_coverage_start + timedelta(hours=23, minutes=59, seconds=59)),
        source=source,
        geographic_location=geolocation,
        ISO_topic_category=iso_category,
        data_center=data_center,
        summary='',
        gcmd_location=location,
        access_constraints='',
        entry_id='NERSC_' + str(uuid.uuid4()))
    ds_uri, _ = DatasetURI.objects.get_or_create(
        name=FILE_SERVICE_NAME,
        service=LOCAL_FILE_SERVICE,
        uri=uri,
        dataset=ds)
    return ds, created
def crawl(url, **options):
    """Crawl a thredds catalog and register a MetObsStation per dataset.

    Parameters
    ----------
    url : str
        URL of the thredds catalog to crawl
    **options : dict
        accepted for call-compatibility with sibling crawlers; unused here

    Returns
    -------
    added : int
        number of newly created MetObsStation records
    """
    validate_uri(url)

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, skip=skips, debug=True)

    added = 0
    for ds in c.datasets:
        # Find the OPeNDAP access url without shadowing the <url> parameter;
        # skip datasets that have no OPeNDAP service instead of raising
        # IndexError (the previous [...][0] crashed on such datasets)
        opendap_url = next(
            (s.get('url') for s in ds.services
             if s.get('service').lower() == 'opendap'),
            None)
        if opendap_url is None:
            continue
        metno_obs_stat, cr = MetObsStation.objects.get_or_create(opendap_url)
        if cr:
            added += 1
            print('Added %s, no. %d/%d' % (opendap_url, added, len(c.datasets)))
    return added
def crawl(url, **options):
    """Crawl a thredds catalog and ingest matching datasets with Nansat.

    Parameters
    ----------
    url : str
        URL of the thredds catalog to crawl
    **options : dict
        date : str, optional
            only select datasets whose url contains <date> and ends in .nc
        filename : str, optional
            only select datasets whose url matches <filename>

    Returns
    -------
    added : int
        number of newly created NansatDataset records
    """
    validate_uri(url)

    date = options.get('date', None)
    filename = options.get('filename', None)

    # Raw strings so that '\.' is a regex escape, not an invalid
    # string escape (DeprecationWarning in Python 3)
    if date:
        select = [r'(.*%s.*\.nc)' % date]
    elif filename:
        select = [r'(.*%s)' % filename]
    else:
        select = None

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, select=select, skip=skips, debug=True)

    added = 0
    for ds in c.datasets:
        # Find the OPeNDAP access url without shadowing the <url> parameter;
        # skip datasets that have no OPeNDAP service instead of raising
        # IndexError (the previous [...][0] crashed on such datasets)
        opendap_url = next(
            (s.get('url') for s in ds.services
             if s.get('service').lower() == 'opendap'),
            None)
        if opendap_url is None:
            continue
        try:
            gds, cr = NansatDataset.objects.get_or_create(opendap_url)
        except (IOError, AttributeError) as e:
            # warnings.warn(e.message)
            continue
        else:
            if cr:
                added += 1
                print('Added %s, no. %d/%d' % (opendap_url, added, len(c.datasets)))
            # Connect all service uris to the dataset
            for s in ds.services:
                try:
                    ds_uri, _ = DatasetURI.objects.get_or_create(
                        name=s.get('name'),
                        service=s.get('service'),
                        uri=s.get('url'),
                        dataset=gds)
                except IntegrityError:
                    # There is no standard for the name (and possibly the service). This
                    # means that the naming defined by
                    # geospaas.catalog.managers.DAP_SERVICE_NAME (and assigned to the
                    # DatasetURI in
                    # geospaas.nansat_ingestor.managers.DatasetManager.get_or_create) may
                    # be different from s.get('name').
                    # Solution: ignore the error and continue the loop
                    continue
    return added
def crawl_and_ingest(url, **options):
    # Crawl the thredds catalog at <url>, create a NansatDataset for each
    # dataset exposing an OPeNDAP access point, and link all of the dataset's
    # service uris to it. Returns the number of newly created datasets.
    # options: 'date' (select *<date>*.nc urls) or 'filename' (select *<filename>).
    validate_uri(url)

    date = options.get('date', None)
    filename = options.get('filename', None)

    # Build thredds-crawler select patterns from the chosen filter
    if date:
        select = ['(.*%s.*\.nc)' % date]
    elif filename:
        select = ['(.*%s)' % filename]
    else:
        select = None

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, select=select, skip=skips, debug=True)

    added = 0
    for ds in c.datasets:
        for s in ds.services:
            if s.get('service').lower() == 'opendap':
                url = s.get('url')
                name = s.get('name')
                service = s.get('service')
                try:
                    # Create Dataset from OPeNDAP url - this is necessary to get all metadata
                    gds, cr = NansatDataset.objects.get_or_create(
                        url, uri_service_name=name, uri_service_type=service)
                except (IOError, AttributeError, ValueError) as e:
                    # warnings.warn(e.message)
                    continue
                if cr:
                    added += 1
                    print('Added %s, no. %d/%d' % (url, added, len(c.datasets)))
                # Connect all service uris to the dataset
                # NOTE(review): this loop rebinds <s>, shadowing the outer loop
                # variable - presumably intentional, but worth confirming
                for s in ds.services:
                    ds_uri, _ = DatasetURI.objects.get_or_create(
                        name=s.get('name'),
                        service=s.get('service'),
                        uri=s.get('url'),
                        dataset=gds)
                # NOTE(review): duplicate progress print - confirm whether
                # both messages are wanted
                print('Added %s, no. %d/%d' % (url, added, len(c.datasets)))
    return added
def get_or_create(self, uri, n_points=10, uri_filter_args=None, *args, **kwargs):
    ''' Create dataset and corresponding metadata

    Parameters:
    ----------
        uri : str
            URI to file or stream openable by Nansat
        n_points : int
            Number of border points (default is 10)
        uri_filter_args : dict
            Extra DatasetURI filter arguments if several datasets can refer to the same URI

    Returns:
    -------
        dataset and flag
    '''
    if not uri_filter_args:
        uri_filter_args = {}

    # Validate uri - this should raise an exception if the uri doesn't point to a valid
    # file or stream
    validate_uri(uri)

    # Several datasets can refer to the same uri (e.g., scatterometers and svp drifters), so we
    # need to pass uri_filter_args
    uris = DatasetURI.objects.filter(uri=uri, **uri_filter_args)
    if len(uris) > 0:
        return uris[0].dataset, False

    # Open file with Nansat
    n = Nansat(nansat_filename(uri), **kwargs)

    # get metadata from Nansat and get objects from vocabularies
    n_metadata = n.get_metadata()

    # set compulsory metadata (source)
    platform, _ = Platform.objects.get_or_create(
        json.loads(n_metadata['platform']))
    instrument, _ = Instrument.objects.get_or_create(
        json.loads(n_metadata['instrument']))
    specs = n_metadata.get('specs', '')
    source, _ = Source.objects.get_or_create(platform=platform,
                                             instrument=instrument,
                                             specs=specs)

    # Defaults are callables so that a fresh uuid is generated per dataset
    default_char_fields = {
        'entry_id': lambda: 'NERSC_' + str(uuid.uuid4()),
        'entry_title': lambda: 'NONE',
        'summary': lambda: 'NONE',
    }

    # set optional CharField metadata from Nansat or from default_char_fields
    options = {}
    for name in default_char_fields:
        if name not in n_metadata:
            warnings.warn('%s is not provided in Nansat metadata!' % name)
            options[name] = default_char_fields[name]()
        else:
            options[name] = n_metadata[name]

    default_foreign_keys = {
        'gcmd_location': {
            'model': Location,
            'value': pti.get_gcmd_location('SEA SURFACE')},
        'data_center': {
            'model': DataCenter,
            'value': pti.get_gcmd_provider('NERSC')},
        'ISO_topic_category': {
            'model': ISOTopicCategory,
            'value': pti.get_iso19115_topic_category('Oceans')},
    }

    # set optional ForeignKey metadata from Nansat or from default_foreign_keys
    for name in default_foreign_keys:
        value = default_foreign_keys[name]['value']
        model = default_foreign_keys[name]['model']
        if name not in n_metadata:
            warnings.warn('%s is not provided in Nansat metadata!' % name)
        else:
            try:
                value = json.loads(n_metadata[name])
            except ValueError:
                # was a bare "except:" - json.loads raises ValueError
                # (JSONDecodeError) on malformed input; keep the default value
                warnings.warn(
                    '%s value of %s metadata provided in Nansat is wrong!' %
                    (n_metadata[name], name))
        options[name], _ = model.objects.get_or_create(value)

    # Find coverage to set number of points in the geolocation
    if len(n.vrt.dataset.GetGCPs()) > 0:
        n.reproject_gcps()
    geolocation = GeographicLocation.objects.get_or_create(
        geometry=WKTReader().read(n.get_border_wkt(nPoints=n_points)))[0]

    # create dataset
    ds, created = Dataset.objects.get_or_create(
        time_coverage_start=n.get_metadata('time_coverage_start'),
        time_coverage_end=n.get_metadata('time_coverage_end'),
        source=source,
        geographic_location=geolocation,
        **options)

    # create dataset URI
    ds_uri, _ = DatasetURI.objects.get_or_create(uri=uri, dataset=ds)

    return ds, created
def get_or_create(self, uri, n_points=10, uri_filter_args=None,
                  uri_service_name=FILE_SERVICE_NAME,
                  uri_service_type=LOCAL_FILE_SERVICE,
                  *args, **kwargs):
    """ Create dataset and corresponding metadata

    Parameters:
    ----------
        uri : str
            URI to file or stream openable by Nansat
        n_points : int
            Number of border points (default is 10)
        uri_filter_args : dict
            Extra DatasetURI filter arguments if several datasets can refer to the same URI
        uri_service_name : str
            name of the service which is used
            ('dapService', 'fileService', 'http' or 'wms')
        uri_service_type : str
            type of the service which is used
            ('OPENDAP', 'local', 'HTTPServer' or 'WMS')

    Returns:
    -------
        dataset and flag
    """
    if not uri_filter_args:
        uri_filter_args = {}

    # Validate uri - this should raise an exception if the uri doesn't point to a valid
    # file or stream
    validate_uri(uri)

    # Several datasets can refer to the same uri (e.g., scatterometers and svp drifters), so we
    # need to pass uri_filter_args
    uris = DatasetURI.objects.filter(uri=uri, **uri_filter_args)
    if len(uris) > 0:
        return uris[0].dataset, False

    # Open file with Nansat
    n = Nansat(nansat_filename(uri), **kwargs)

    # get metadata from Nansat and get objects from vocabularies
    n_metadata = n.get_metadata()

    entry_id = n_metadata.get('entry_id', None)

    # set compulsory metadata (source)
    platform, _ = Platform.objects.get_or_create(
        json.loads(n_metadata['platform']))
    instrument, _ = Instrument.objects.get_or_create(
        json.loads(n_metadata['instrument']))
    specs = n_metadata.get('specs', '')
    source, _ = Source.objects.get_or_create(platform=platform,
                                             instrument=instrument,
                                             specs=specs)

    default_char_fields = {
        # Adding NERSC_ in front of the id violates the string representation of the uuid
        #'entry_id': lambda: 'NERSC_' + str(uuid.uuid4()),
        'entry_id': lambda: str(uuid.uuid4()),
        'entry_title': lambda: 'NONE',
        'summary': lambda: 'NONE',
    }

    # set optional CharField metadata from Nansat or from default_char_fields
    options = {}
    try:
        existing_ds = Dataset.objects.get(entry_id=entry_id)
    except Dataset.DoesNotExist:
        existing_ds = None
    for name in default_char_fields:
        if name not in n_metadata:
            warnings.warn('%s is not provided in Nansat metadata!' % name)
            # prevent overwriting of existing values by defaults
            if existing_ds:
                options[name] = existing_ds.__getattribute__(name)
            else:
                options[name] = default_char_fields[name]()
        else:
            options[name] = n_metadata[name]

    default_foreign_keys = {
        'gcmd_location': {
            'model': Location,
            'value': pti.get_gcmd_location('SEA SURFACE')},
        'data_center': {
            'model': DataCenter,
            'value': pti.get_gcmd_provider('NERSC')},
        'ISO_topic_category': {
            'model': ISOTopicCategory,
            'value': pti.get_iso19115_topic_category('Oceans')},
    }

    # set optional ForeignKey metadata from Nansat or from default_foreign_keys
    for name in default_foreign_keys:
        value = default_foreign_keys[name]['value']
        model = default_foreign_keys[name]['model']
        if name not in n_metadata:
            warnings.warn('%s is not provided in Nansat metadata!' % name)
        else:
            try:
                value = json.loads(n_metadata[name])
            except ValueError:
                # was a bare "except:" - json.loads raises ValueError
                # (JSONDecodeError) on malformed input; keep the default value
                warnings.warn(
                    '%s value of %s metadata provided in Nansat is wrong!' %
                    (n_metadata[name], name))
        # prevent overwriting of existing values by defaults
        if existing_ds:
            options[name] = existing_ds.__getattribute__(name)
        else:
            options[name], _ = model.objects.get_or_create(value)

    # Find coverage to set number of points in the geolocation
    if len(n.vrt.dataset.GetGCPs()) > 0:
        n.reproject_gcps()
    geolocation = GeographicLocation.objects.get_or_create(
        geometry=WKTReader().read(n.get_border_wkt(nPoints=n_points)))[0]

    # create dataset
    # - the get_or_create method should use get_or_create here as well,
    #   or its name should be changed - see issue #127
    ds, created = Dataset.objects.update_or_create(
        entry_id=options['entry_id'],
        defaults={
            'time_coverage_start': n.get_metadata('time_coverage_start'),
            'time_coverage_end': n.get_metadata('time_coverage_end'),
            'source': source,
            'geographic_location': geolocation,
            'gcmd_location': options["gcmd_location"],
            'ISO_topic_category': options["ISO_topic_category"],
            "data_center": options["data_center"],
            'entry_title': options["entry_title"],
            'summary': options["summary"]
        })

    # create parameter
    # Nansat band dictionaries are keyed by 1-based band number
    all_band_meta = n.bands()
    for band_id in range(1, len(all_band_meta) + 1):
        band_meta = all_band_meta[band_id]
        standard_name = band_meta.get('standard_name', None)
        short_name = band_meta.get('short_name', None)
        units = band_meta.get('units', None)
        if standard_name in ['latitude', 'longitude', None]:
            continue
        # Narrow down the candidate parameters step by step
        params = Parameter.objects.filter(standard_name=standard_name)
        if params.count() > 1 and short_name is not None:
            params = params.filter(short_name=short_name)
        if params.count() > 1 and units is not None:
            params = params.filter(units=units)
        if params.count() >= 1:
            ds.parameters.add(params[0])

    # create dataset URI
    DatasetURI.objects.get_or_create(name=uri_service_name,
                                     service=uri_service_type,
                                     uri=uri,
                                     dataset=ds)

    return ds, created
def test_validate_uri_opendap_does_not_exist(self):
    """validate_uri must raise OSError with a NetCDF message for a missing OPeNDAP file."""
    uri = 'http://www.ifremer.fr/opendap/cerdap1/cersat/' \
          '20140101000000-GLOBCURRENT-L4-CURgeo_0m-ALT_OI-v02.0-fv01.0.nc.tull'
    with self.assertRaises(OSError) as raised:
        utils.validate_uri(uri)
    # The second exception argument carries the NetCDF library message
    self.assertEqual('NetCDF: file not found', raised.exception.args[1])
def test_fail_invalid_uri(self):
    """A plain filesystem path is not a valid uri and must be rejected."""
    bad_uri = '/this/is/some/file/but/not/an/uri'
    with self.assertRaises(ValueError):
        utils.validate_uri(bad_uri)
def test__validate_uri__opendap_exists(self, mock_PoolManager):
    """validate_uri returns None when the http request reports status 200."""
    # Make the mocked connection pool report a successful request
    ok_response = PropertyMock(status=200)
    mock_PoolManager.return_value.request.return_value = ok_response
    uri = 'http://nbstds.met.no/thredds/catalog/NBS/S2A/test_catalog.html'
    self.assertIsNone(utils.validate_uri(uri))
def test_validate_uri_local_does_not_exist(self):
    """FileNotFoundError with the local path for a file:// uri to a missing file."""
    uri = 'file://localhost/some/folder/filename.ext'
    expected_path = '/some/folder/filename.ext'
    with self.assertRaises(FileNotFoundError) as raised:
        utils.validate_uri(uri)
    self.assertEqual(expected_path, raised.exception.args[0])
def test_validate_uri_local(self, mock_isfile):
    """validate_uri returns None for a file:// uri when the local file exists."""
    # Pretend the local file exists
    mock_isfile.return_value = True
    uri = 'file://localhost/some/folder/filename.ext'
    self.assertIsNone(utils.validate_uri(uri))
def save(self, *args, **kwargs):
    """Validate self.uri, then delegate to the regular model save.

    Raises whatever validate_uri raises for an invalid or unreachable uri.
    Validation is not usually done in the models but rather via form
    validation. We should discuss if we want it here or not.
    """
    validate_uri(self.uri)
    # Python-3 zero-argument super (file already uses py3-only features)
    super().save(*args, **kwargs)