def test_coverage_recovery(self):
    """Verify CoverageDoctor can repair a coverage's metadata files while
    preserving the stored time values.

    NOTE(review): the corruption steps are still TODO, so this currently
    exercises repair_metadata() on a healthy coverage.
    """
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    # Release the handle before repairing the on-disk metadata
    # (the original left this coverage open)
    orig_cov.close()

    # TODO: Destroy the metadata files
    # TODO: RE-analyze coverage
    # TODO: Should be corrupt, take action to repair if so

    # Repair the metadata files
    dr.repair_metadata()

    # TODO: Re-analyze fixed coverage

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    # Close before asserting so the handle is released even on failure reporting
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def test_coverage_recovery(self):
    """Corrupt a coverage's Master and Parameter metadata files, then verify
    CoverageDoctor detects the corruption, repairs it, and that the repaired
    coverage yields the original time values."""
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir
    cov.close()

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # TODO: Turn these into meaningful Asserts
    self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
    self.assertEqual(len(dr_result.get_brick_size_ratios()), 8)
    self.assertEqual(len(dr_result.get_corruptions()), 0)
    self.assertEqual(len(dr_result.get_master_corruption()), 0)
    self.assertEqual(len(dr_result.get_param_corruptions()), 0)
    self.assertEqual(len(dr_result.get_param_size_ratios()), 64)
    self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
    self.assertEqual(len(dr_result.get_size_ratios()), 73)
    self.assertEqual(dr_result.master_status[1], 'NORMAL')

    self.assertFalse(dr_result.is_corrupt)
    self.assertEqual(dr_result.param_file_count, 64)
    self.assertEqual(dr_result.brick_file_count, 8)
    self.assertEqual(dr_result.total_file_count, 73)

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    orig_cov.close()

    # Corrupt the Master File -- context managers so the handle is released
    # even if the write fails (original used bare open/write/close)
    with open(cov._persistence_layer.master_manager.file_path, "wb") as fo:
        fo.write('Junk')
    # Corrupt the lon Parameter file
    with open(cov._persistence_layer.parameter_metadata['lon'].file_path, "wb") as fo:
        fo.write('Junk')

    corrupt_res = dr.analyze(reanalyze=True)
    self.assertTrue(corrupt_res.is_corrupt)

    # Repair the metadata files
    dr.repair(reanalyze=True)

    fixed_res = dr.analyze(reanalyze=True)
    self.assertFalse(fixed_res.is_corrupt)

    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def _splice_coverage(cls, dataset_id, scov):
    """Splice the simplex coverage *scov* into the dataset's view coverage.

    If the view currently references a SimplexCoverage, a new
    temporal-aggregation ComplexCoverage is created that wraps both the old
    head coverage and *scov*; if the view already references a
    ComplexCoverage, *scov* is appended to it directly.
    """
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    # Open the dataset's (view) coverage for append
    vcov = cls._get_coverage(dataset_id, mode='a')
    scov_pth = scov.persistence_dir
    if isinstance(vcov.reference_coverage, SimplexCoverage):
        # First splice: promote the referenced simplex coverage to a complex
        # coverage that temporally aggregates the old head and the new coverage
        ccov = ComplexCoverage(file_root, uuid4().hex,
                               'Complex coverage for %s' % dataset_id,
                               reference_coverage_locs=[vcov.head_coverage_path, ],
                               parameter_dictionary=ParameterDictionary(),
                               complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
        log.info('Creating Complex Coverage: %s', ccov.persistence_dir)
        ccov.append_reference_coverage(scov_pth)
        ccov_pth = ccov.persistence_dir
        ccov.close()
        # Point the view at the newly created complex coverage
        vcov.replace_reference_coverage(ccov_pth)
    elif isinstance(vcov.reference_coverage, ComplexCoverage):
        log.info('Appending simplex coverage to complex coverage')
        #vcov.reference_coverage.append_reference_coverage(scov_pth)
        dir_path = vcov.reference_coverage.persistence_dir
        vcov.close()
        # Reopen the complex coverage directly (not through the view) to append
        ccov = AbstractCoverage.load(dir_path, mode='a')
        ccov.append_reference_coverage(scov_pth)
        ccov.refresh()
        ccov.close()
    # NOTE(review): in the ComplexCoverage branch vcov was already closed
    # above, so this refresh/close pair runs against a closed coverage --
    # confirm that refresh() tolerates (or reopens) a closed coverage.
    vcov.refresh()
    vcov.close()
def _splice_coverage(cls, dataset_id, scov):
    """Splice the simplex coverage *scov* into the dataset's view coverage.

    A SimplexCoverage reference is promoted to a temporal-aggregation
    ComplexCoverage containing the old head plus *scov*; an existing
    ComplexCoverage reference simply gets *scov* appended.
    """
    file_root = FileSystem.get_url(FS.CACHE, 'datasets')
    # Dataset's view coverage, opened for append
    vcov = cls._get_coverage(dataset_id, mode='a')
    scov_pth = scov.persistence_dir
    if isinstance(vcov.reference_coverage, SimplexCoverage):
        # First splice: wrap old head + new coverage in a complex coverage
        ccov = ComplexCoverage(
            file_root, uuid4().hex, 'Complex coverage for %s' % dataset_id,
            reference_coverage_locs=[
                vcov.head_coverage_path,
            ],
            parameter_dictionary=ParameterDictionary(),
            complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
        log.info('Creating Complex Coverage: %s', ccov.persistence_dir)
        ccov.append_reference_coverage(scov_pth)
        ccov_pth = ccov.persistence_dir
        ccov.close()
        # Repoint the view at the new complex coverage
        vcov.replace_reference_coverage(ccov_pth)
    elif isinstance(vcov.reference_coverage, ComplexCoverage):
        log.info('Appending simplex coverage to complex coverage')
        #vcov.reference_coverage.append_reference_coverage(scov_pth)
        dir_path = vcov.reference_coverage.persistence_dir
        vcov.close()
        # Open the complex coverage directly (not via the view) to append
        ccov = AbstractCoverage.load(dir_path, mode='a')
        ccov.append_reference_coverage(scov_pth)
        ccov.refresh()
        ccov.close()
    # NOTE(review): on the ComplexCoverage path vcov was closed above, so this
    # operates on a closed coverage -- verify refresh()'s behavior when closed.
    vcov.refresh()
    vcov.close()
def _get_nonview_coverage(cls, dataset_id, mode='r'):
    """Return the dataset's coverage, dereferencing a ViewCoverage to the
    underlying coverage it points at."""
    coverage = cls._get_coverage(dataset_id, mode)
    if not isinstance(coverage, ViewCoverage):
        return coverage
    # Record where the referenced coverage lives, drop the view handle,
    # then reopen the target coverage directly in the requested mode.
    referenced_dir = coverage.reference_coverage.persistence_dir
    coverage.close()
    return AbstractCoverage.load(referenced_dir, mode=mode)
def fill_temporal_gap(self, dataset_id, gap_coverage_path=None, gap_coverage_id=None):
    """Splice a gap-filling coverage into the dataset's coverage.

    The gap coverage may be identified either by filesystem path or by id;
    when only the id is supplied, the path is resolved from it. Ingestion on
    the dataset's stream is paused before the splice.

    Raises ValueError when neither path nor id is provided.
    """
    if gap_coverage_path is None:
        if gap_coverage_id is None:
            raise ValueError('Must specify either \'gap_coverage_path\' or \'gap_coverage_id\'')
        gap_coverage_path = self.get_coverage_path(gap_coverage_id)

    from coverage_model import AbstractCoverage
    gap_cov = AbstractCoverage.load(gap_coverage_path)
    # Stop ingestion so the coverage is not being written to during the splice
    self.pause_ingestion(self.get_stream_id(dataset_id))
    DatasetManagementService._splice_coverage(dataset_id, gap_cov)
def _get_coverage(cls, dataset_id, mode='w'):
    """Open the persisted coverage for *dataset_id* from the local datasets
    cache (read-write by default)."""
    dataset_root = FileSystem.get_url(FS.CACHE, 'datasets')
    return AbstractCoverage.load(dataset_root, dataset_id, mode=mode)
def get_dataset_xml(self, coverage_path, product_id, product_name='', available_fields=None):
    """Generate the ERDDAP datasets.xml <dataset> entry for a persisted coverage.

    Parameters are grouped by their dimension tuples (an ERDDAP dataset may
    only contain variables that share the same dimensions) and one
    EDDTableFromDapSequence <dataset> element is emitted per group.

    @param coverage_path     filesystem path of the persisted coverage
    @param product_id        data product id; the ERDDAP datasetID is 'data'+product_id
    @param product_name      optional title; falls back to the coverage name
    @param available_fields  optional whitelist of parameter names to include
    @return                  the generated XML string ('' when nothing produced)
    @raise BadRequest        coverage is empty, dimensionless, or a group
                             contains only the temporal dimension
    """
    #http://coastwatch.pfeg.noaa.gov/erddap/download/setupDatasetsXml.html
    result = ''
    paths = os.path.split(coverage_path)
    cov = AbstractCoverage.load(coverage_path)
    try:
        doc = xml.dom.minidom.Document()

        #Get lists of variables with unique sets of dimensions.
        #Datasets can only have variables with the same sets of dimensions
        if not cov.list_parameters():
            raise BadRequest('Attempting to register an empty dataset. The coverage (%s) has no definition.\n%s' % (coverage_path, cov))
        datasets = {}
        for key in cov.list_parameters():
            pc = cov.get_parameter_context(key)
            if available_fields and pc.name not in available_fields:
                continue
            param = cov.get_parameter(key)
            dims = (cov.temporal_parameter_name,)
            if len(param.shape) == 2:
                dims = (cov.temporal_parameter_name, cov.spatial_domain.shape.name)
            if dims not in datasets:  # was: `if not dims in datasets.keys()`
                datasets[dims] = []
            datasets[dims].append(key)

        if not datasets:
            raise BadRequest('Attempting to register a dimensionless dataset. The coverage (%s) has no dimension(s).\n%s' % (coverage_path, cov))

        for dims, vars in datasets.iteritems():
            erd_name_map = self.get_errdap_name_map(vars)
            if len(vars) == 1:
                raise BadRequest('A dataset needs a proper range, not just the temporal dimension. %s\n%s' % (coverage_path, cov))

            dataset_element = doc.createElement('dataset')
            dataset_element.setAttribute('type', 'EDDTableFromDapSequence')
            dataset_element.setAttribute('datasetID', 'data' + product_id)
            dataset_element.setAttribute('active', 'True')

            source_element = doc.createElement('sourceUrl')
            text_node = doc.createTextNode(self.pydap_url + paths[1])
            source_element.appendChild(text_node)
            dataset_element.appendChild(source_element)

            # Cached datasets reload daily; uncached ones every five minutes
            reload_element = doc.createElement('reloadEveryNMinutes')
            if self.CFG.get_safe('server.erddap.dataset_caching', True):
                text_node = doc.createTextNode('1440')
            else:
                text_node = doc.createTextNode('5')
            reload_element.appendChild(text_node)
            dataset_element.appendChild(reload_element)

            outer_element = doc.createElement('outerSequenceName')
            text_node = doc.createTextNode('data')
            outer_element.appendChild(text_node)
            dataset_element.appendChild(outer_element)

            default_element = doc.createElement('defaultDataQuery')
            text_node = doc.createTextNode('&time>=1970-01-01')
            default_element.appendChild(text_node)
            dataset_element.appendChild(default_element)

            # Dataset-level (global) attributes
            add_attributes_element = doc.createElement('addAttributes')
            atts = {}
            atts['title'] = product_name or urllib.unquote(cov.name)
            atts['infoUrl'] = self.ux_url + 'DataProduct/face/' + product_id
            atts['institution'] = 'OOI'
            atts['Conventions'] = "COARDS, CF-1.6, Unidata Dataset Discovery v1.0"
            atts['license'] = '''These data were collected by the Ocean Observatory Initiative (OOI) project purely for internal system development purposes during the construction phase of the project and are 
offered for release to the public with no assurance of data quality, consistency, temporal continuity or additional support. The OOI Program assumes no liability resulting from the use of these data for other than the intended purpose. No data quality assurance steps have been implemented on this data to date.'''
            atts['summary'] = cov.name
            atts['cdm_data_type'] = 'Other'
            atts['standard_name_vocabulary'] = 'CF-12'
            for key, val in atts.iteritems():
                self.xml_attr(doc, add_attributes_element, key, val)
            if len(add_attributes_element.childNodes) > 0:
                dataset_element.appendChild(add_attributes_element)

            for var_name in vars:
                var = cov.get_parameter_context(var_name)
                # Skip auto-generated parameters (name ends with a 32-char hex guid)
                if re.match(r'.*_[a-z0-9]{32}', var.name):
                    continue

                data_element = doc.createElement('dataVariable')
                source_name_element = doc.createElement('sourceName')
                text_node = doc.createTextNode(var.name)
                source_name_element.appendChild(text_node)
                data_element.appendChild(source_name_element)

                destination_name_element = doc.createElement('destinationName')
                text_node = doc.createTextNode(erd_name_map[var.name])
                destination_name_element.appendChild(text_node)
                data_element.appendChild(destination_name_element)

                add_attributes_element = doc.createElement('addAttributes')
                units = "unknown"
                if hasattr(var, 'uom') and var.uom:
                    units = var.uom
                self.xml_attr(doc, add_attributes_element, 'units', units)

                ioos_cat = self.get_ioos_category(var.name, units)
                self.xml_attr(doc, add_attributes_element, 'ioos_category', ioos_cat)

                if hasattr(var, 'display_name') and var.display_name is not None:
                    self.xml_attr(doc, add_attributes_element, 'long_name', var.display_name)
                if hasattr(var, 'standard_name') and var.standard_name is not None:
                    self.xml_attr(doc, add_attributes_element, 'standard_name', var.standard_name)
                if 'seconds' in units and 'since' in units:
                    self.xml_attr(doc, add_attributes_element, 'time_precision', '1970-01-01T00:00:00.000Z')

                if hasattr(var, 'ooi_short_name') and var.ooi_short_name:
                    sname = var.ooi_short_name
                    sname = re.sub('[\t\n ]+', ' ', sname)
                    self.xml_attr(doc, add_attributes_element, 'ooi_short_name', sname)
                    m = re.match(r'[A-Z0-9]{7}', sname)
                    if m:
                        reference_url = 'https://confluence.oceanobservatories.org/display/instruments/' + m.group()
                        self.xml_attr(doc, add_attributes_element, 'references', reference_url)
                    # Derive the data product level from the OOI short name
                    if 'L2' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L2')
                        self.xml_attr(doc, add_attributes_element, 'source', 'level 2 calibrated sensor observation')
                    elif 'L1' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L1')
                        self.xml_attr(doc, add_attributes_element, 'source', 'level 1 calibrated sensor observation')
                    elif 'L0' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L0')
                        self.xml_attr(doc, add_attributes_element, 'source', 'sensor observation')
                    elif 'QC' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'QC')
                elif not isinstance(var.param_type, ParameterFunctionType):
                    # No OOI short name: fall back to the parameter name and
                    # infer the product level from the units
                    self.xml_attr(doc, add_attributes_element, 'ooi_short_name', var.name)
                    if units == 'counts':
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L0')
                        self.xml_attr(doc, add_attributes_element, 'source', 'sensor observation')
                    elif 'seconds' in units and 'since' in units:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'axis')
                    else:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'unknown')

                if hasattr(var, 'reference_urls') and var.reference_urls:
                    if isinstance(var.reference_urls, list):
                        references = ','.join(var.reference_urls)
                    else:
                        references = var.reference_urls
                    self.xml_attr(doc, add_attributes_element, 'instrument_type', references)

                # Derived (function) parameters record their provenance
                if isinstance(var.param_type, ParameterFunctionType):
                    if isinstance(var.function, PythonFunction):
                        self.xml_attr(doc, add_attributes_element, 'function_module', var.function.owner or '')
                        self.xml_attr(doc, add_attributes_element, 'function_name', var.function.func_name or '')
                        if var.function.owner.startswith('ion_functions'):
                            url = var.function.owner.replace('.', '/') + '.py'
                            url = 'https://github.com/ooici/ion-functions/blob/master/' + url
                            self.xml_attr(doc, add_attributes_element, 'function_url', url)
                        elif var.function.egg_uri:
                            self.xml_attr(doc, add_attributes_element, 'function_url', var.function.egg_uri or '')
                    elif isinstance(var.function, NumexprFunction):
                        self.xml_attr(doc, add_attributes_element, 'function_name', var.function.name or '')

                data_element.appendChild(add_attributes_element)
                dataset_element.appendChild(data_element)

            # toprettyxml has known whitespace quirks:
            # http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
            result += dataset_element.toprettyxml() + '\n'

        if not result:
            log.error("Attempted to register empty dataset\nDims: %s\nDatasets: %s", dims, datasets)
        return result
    finally:
        # Always release the coverage handle -- the original leaked it
        # whenever one of the BadRequest paths raised
        cov.close()
def _get_coverage(cls, dataset_id, mode='r'):
    """Load the persisted coverage for *dataset_id* from the datasets cache
    (read-only by default)."""
    cache_root = FileSystem.get_url(FS.CACHE, 'datasets')
    return AbstractCoverage.load(cache_root, dataset_id, mode=mode)
def get_dataset_xml(self, coverage_path, product_id, product_name='', available_fields=None):
    """Build the ERDDAP datasets.xml <dataset> entry for a coverage.

    Groups the coverage's parameters by dimension tuple (ERDDAP datasets may
    only hold variables sharing the same dimensions) and emits one
    EDDTableFromDapSequence <dataset> element per group.

    @param coverage_path     filesystem path of the persisted coverage
    @param product_id        data product id; ERDDAP datasetID is 'data'+product_id
    @param product_name      optional title; falls back to the coverage name
    @param available_fields  optional whitelist of parameter names to include
    @return                  XML snippet string ('' if nothing was generated)
    @raise BadRequest        empty coverage, dimensionless coverage, or a
                             group containing only the temporal dimension
    """
    #http://coastwatch.pfeg.noaa.gov/erddap/download/setupDatasetsXml.html
    result = ''
    paths = os.path.split(coverage_path)
    cov = AbstractCoverage.load(coverage_path)
    try:
        doc = xml.dom.minidom.Document()

        #Get lists of variables with unique sets of dimensions.
        #Datasets can only have variables with the same sets of dimensions
        if not cov.list_parameters():
            raise BadRequest('Attempting to register an empty dataset. The coverage (%s) has no definition.\n%s' % (coverage_path, cov))
        datasets = {}
        for key in cov.list_parameters():
            pc = cov.get_parameter_context(key)
            if available_fields and pc.name not in available_fields:
                continue
            param = cov.get_parameter(key)
            dims = (cov.temporal_parameter_name,)
            if len(param.shape) == 2:
                dims = (cov.temporal_parameter_name, cov.spatial_domain.shape.name)
            if dims not in datasets:  # was: `if not dims in datasets.keys()`
                datasets[dims] = []
            datasets[dims].append(key)

        if not datasets:
            raise BadRequest('Attempting to register a dimensionless dataset. The coverage (%s) has no dimension(s).\n%s' % (coverage_path, cov))

        for dims, vars in datasets.iteritems():
            erd_name_map = self.get_errdap_name_map(vars)
            if len(vars) == 1:
                raise BadRequest('A dataset needs a proper range, not just the temporal dimension. %s\n%s' % (coverage_path, cov))

            dataset_element = doc.createElement('dataset')
            dataset_element.setAttribute('type', 'EDDTableFromDapSequence')
            dataset_element.setAttribute('datasetID', 'data' + product_id)
            dataset_element.setAttribute('active', 'True')

            source_element = doc.createElement('sourceUrl')
            text_node = doc.createTextNode(self.pydap_url + paths[1])
            source_element.appendChild(text_node)
            dataset_element.appendChild(source_element)

            # Cached datasets reload daily; uncached ones every five minutes
            reload_element = doc.createElement('reloadEveryNMinutes')
            if self.CFG.get_safe('server.erddap.dataset_caching', True):
                text_node = doc.createTextNode('1440')
            else:
                text_node = doc.createTextNode('5')
            reload_element.appendChild(text_node)
            dataset_element.appendChild(reload_element)

            outer_element = doc.createElement('outerSequenceName')
            text_node = doc.createTextNode('data')
            outer_element.appendChild(text_node)
            dataset_element.appendChild(outer_element)

            default_element = doc.createElement('defaultDataQuery')
            text_node = doc.createTextNode('&time>=1970-01-01')
            default_element.appendChild(text_node)
            dataset_element.appendChild(default_element)

            # Dataset-level (global) attributes
            add_attributes_element = doc.createElement('addAttributes')
            atts = {}
            atts['title'] = product_name or urllib.unquote(cov.name)
            atts['infoUrl'] = self.ux_url + 'DataProduct/face/' + product_id
            atts['institution'] = 'OOI'
            atts['Conventions'] = "COARDS, CF-1.6, Unidata Dataset Discovery v1.0"
            atts['license'] = '''These data were collected by the Ocean Observatory Initiative (OOI) project purely for internal system development purposes during the construction phase of the project and are 
offered for release to the public with no assurance of data quality, consistency, temporal continuity or additional support. The OOI Program assumes no liability resulting from the use of these data for other than the intended purpose. No data quality assurance steps have been implemented on this data to date.'''
            atts['summary'] = cov.name
            atts['cdm_data_type'] = 'Other'
            atts['standard_name_vocabulary'] = 'CF-12'
            for key, val in atts.iteritems():
                self.xml_attr(doc, add_attributes_element, key, val)
            if len(add_attributes_element.childNodes) > 0:
                dataset_element.appendChild(add_attributes_element)

            for var_name in vars:
                var = cov.get_parameter_context(var_name)
                # Skip auto-generated parameters (name ends with a 32-char hex guid)
                if re.match(r'.*_[a-z0-9]{32}', var.name):
                    continue

                data_element = doc.createElement('dataVariable')
                source_name_element = doc.createElement('sourceName')
                text_node = doc.createTextNode(var.name)
                source_name_element.appendChild(text_node)
                data_element.appendChild(source_name_element)

                destination_name_element = doc.createElement('destinationName')
                text_node = doc.createTextNode(erd_name_map[var.name])
                destination_name_element.appendChild(text_node)
                data_element.appendChild(destination_name_element)

                add_attributes_element = doc.createElement('addAttributes')
                units = "unknown"
                if hasattr(var, 'uom') and var.uom:
                    units = var.uom
                self.xml_attr(doc, add_attributes_element, 'units', units)

                ioos_cat = self.get_ioos_category(var.name, units)
                self.xml_attr(doc, add_attributes_element, 'ioos_category', ioos_cat)

                if hasattr(var, 'display_name') and var.display_name is not None:
                    self.xml_attr(doc, add_attributes_element, 'long_name', var.display_name)
                if hasattr(var, 'standard_name') and var.standard_name is not None:
                    self.xml_attr(doc, add_attributes_element, 'standard_name', var.standard_name)
                if 'seconds' in units and 'since' in units:
                    self.xml_attr(doc, add_attributes_element, 'time_precision', '1970-01-01T00:00:00.000Z')

                if hasattr(var, 'ooi_short_name') and var.ooi_short_name:
                    sname = var.ooi_short_name
                    sname = re.sub('[\t\n ]+', ' ', sname)
                    self.xml_attr(doc, add_attributes_element, 'ooi_short_name', sname)
                    m = re.match(r'[A-Z0-9]{7}', sname)
                    if m:
                        reference_url = 'https://confluence.oceanobservatories.org/display/instruments/' + m.group()
                        self.xml_attr(doc, add_attributes_element, 'references', reference_url)
                    # Derive the data product level from the OOI short name
                    if 'L2' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L2')
                        self.xml_attr(doc, add_attributes_element, 'source', 'level 2 calibrated sensor observation')
                    elif 'L1' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L1')
                        self.xml_attr(doc, add_attributes_element, 'source', 'level 1 calibrated sensor observation')
                    elif 'L0' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L0')
                        self.xml_attr(doc, add_attributes_element, 'source', 'sensor observation')
                    elif 'QC' in var.ooi_short_name:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'QC')
                elif not isinstance(var.param_type, ParameterFunctionType):
                    # No OOI short name: fall back to the parameter name and
                    # infer the product level from the units
                    self.xml_attr(doc, add_attributes_element, 'ooi_short_name', var.name)
                    if units == 'counts':
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'L0')
                        self.xml_attr(doc, add_attributes_element, 'source', 'sensor observation')
                    elif 'seconds' in units and 'since' in units:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'axis')
                    else:
                        self.xml_attr(doc, add_attributes_element, 'data_product_level', 'unknown')

                if hasattr(var, 'reference_urls') and var.reference_urls:
                    if isinstance(var.reference_urls, list):
                        references = ','.join(var.reference_urls)
                    else:
                        references = var.reference_urls
                    self.xml_attr(doc, add_attributes_element, 'instrument_type', references)

                # Derived (function) parameters record their provenance
                if isinstance(var.param_type, ParameterFunctionType):
                    if isinstance(var.function, PythonFunction):
                        self.xml_attr(doc, add_attributes_element, 'function_module', var.function.owner or '')
                        self.xml_attr(doc, add_attributes_element, 'function_name', var.function.func_name or '')
                        if var.function.owner.startswith('ion_functions'):
                            url = var.function.owner.replace('.', '/') + '.py'
                            url = 'https://github.com/ooici/ion-functions/blob/master/' + url
                            self.xml_attr(doc, add_attributes_element, 'function_url', url)
                        elif var.function.egg_uri:
                            self.xml_attr(doc, add_attributes_element, 'function_url', var.function.egg_uri or '')
                    elif isinstance(var.function, NumexprFunction):
                        self.xml_attr(doc, add_attributes_element, 'function_name', var.function.name or '')

                data_element.appendChild(add_attributes_element)
                dataset_element.appendChild(data_element)

            # toprettyxml has known whitespace quirks:
            # http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
            result += dataset_element.toprettyxml() + '\n'

        if not result:
            log.error("Attempted to register empty dataset\nDims: %s\nDatasets: %s", dims, datasets)
        return result
    finally:
        # Always release the coverage handle -- the original leaked it
        # whenever one of the BadRequest paths raised
        cov.close()
def repair(
    self,
    backup=True,
    copy_over=True,
    keep_temp=False,
    reanalyze=False,
    analyze_bricks=False,
    detailed_analysis=False,
):
    """
    Heavy repair tool that recreates a blank persisted Coverage from the broken
    coverage's original construction parameters, then reconstructs the Master and
    Parameter metadata files by inspection of the ION objects and "valid" brick files.

    @param backup: keep timestamped copies of the original Master/Parameter files
    @param copy_over: copy the rebuilt metadata files back over the broken coverage
    @param keep_temp: if True, keep the temporary coverage and return its directory
    @param reanalyze: force a fresh analysis before deciding whether to repair
    @param analyze_bricks: passed through to the analysis step
    @param detailed_analysis: passed through to the analysis step
    @return: temporary coverage directory when keep_temp is True, otherwise None
    @raise NotImplementedError: if any brick files themselves are corrupt
    @raise ValueError: if re-analysis shows the repair did not succeed
    """
    # Reuse a prior analysis result unless a fresh one is requested
    if self._ar is None or reanalyze:
        self._ar = self._do_analysis(analyze_bricks=analyze_bricks, detailed_analysis=detailed_analysis)
    if self._ar.is_corrupt:
        if len(self._ar.get_brick_corruptions()) > 0:
            # Only metadata (Master/Parameter) corruption is repairable
            raise NotImplementedError("Brick corruption. Cannot repair at this time!!!")
        else:
            # Repair the Master and Parameter metadata files
            # Need the ParameterDictionary, TemporalDomain and SpatialDomain
            pdict = ParameterDictionary.load(self._dso.parameter_dictionary)
            tdom = GridDomain.load(self._dso.temporal_domain)
            sdom = GridDomain.load(self._dso.spatial_domain)

            # Set up the working directory for the recovered coverage
            tempcov_dir = tempfile.mkdtemp("covs")

            # Create the temporary Coverage (same guid as the broken one, so the
            # rebuilt metadata files can be copied straight over)
            tempcov = SimplexCoverage(
                root_dir=tempcov_dir,
                persistence_guid=self._guid,
                name=self._guid,
                parameter_dictionary=pdict,
                spatial_domain=sdom,
                temporal_domain=tdom,
            )
            # Handle to persistence layer for tempcov
            pl = tempcov._persistence_layer

            # Set up the original and temporary coverage path strings
            orig_dir = os.path.join(self.cov_pth, self._guid)
            temp_dir = os.path.join(tempcov.persistence_dir, tempcov.persistence_guid)

            # Insert same number of timesteps into temporary coverage as in broken coverage
            brick_domains_new, new_brick_list, brick_list_spans, tD, bD, min_data_bound, max_data_bound = self.inspect_bricks(
                self.cov_pth, self._guid, "time"
            )
            # If None, there are no brick files --> no timesteps, empty coverage!
            empty_cov = brick_list_spans is None
            if not empty_cov:
                bls = [s.value for s in brick_list_spans]
                # NOTE(review): `maxes` is computed but never used below; a
                # sibling version of this routine calls
                # tempcov.insert_timesteps(sum(maxes)) at this point --
                # confirm whether the timestep insertion is required here.
                maxes = [sum(b[3]) for b in new_brick_list.values()]

                # Replace metadata is the Master file
                pl.master_manager.brick_domains = brick_domains_new
                pl.master_manager.brick_list = new_brick_list

                # Repair ExternalLinks to brick files: wipe each parameter group...
                with HDFLockingFile(pl.master_manager.file_path, "r+") as f:
                    for param_name in pdict.keys():
                        del f[param_name]
                        f.create_group(param_name)
                # ...then relink every brick file under every parameter group
                for param_name in pdict.keys():
                    for brick in bls:
                        link_path = "/{0}/{1}".format(param_name, brick[0])
                        brick_file_name = "{0}.hdf5".format(brick[0])
                        # Brick paths are stored relative to the coverage dir
                        brick_rel_path = os.path.join(
                            pl.parameter_metadata[param_name].root_dir.replace(tempcov.persistence_dir, "."),
                            brick_file_name,
                        )
                        log.debug("link_path: %s", link_path)
                        log.debug("brick_rel_path: %s", brick_rel_path)
                        pl.master_manager.add_external_link(link_path, brick_rel_path, brick[0])

            pl.flush_values()
            pl.flush()
            tempcov.close()

            # Remove 'rtree' dataset from Master file if it already exists (post domain expansion)
            # to make way for reconstruction
            with HDFLockingFile(pl.master_manager.file_path, "r+") as f:
                if "rtree" in f.keys():
                    del f["rtree"]

            # Reconstruct 'rtree' dataset
            # Open temporary Coverage and PersistenceLayer objects
            fixed_cov = AbstractCoverage.load(tempcov.persistence_dir, mode="r+")
            pl_fixed = fixed_cov._persistence_layer

            # Call update_rtree for each brick using PersistenceLayer builtin
            brick_count = 0
            if not empty_cov:
                for brick in bls:
                    rtree_extents, brick_extents, brick_active_size = pl_fixed.calculate_extents(
                        brick[1][1], bD, tD
                    )
                    pl_fixed.master_manager.update_rtree(brick_count, rtree_extents, obj=brick[0])
                    brick_count += 1

            # Update parameter_bounds property based on each parameter's brick data using deep inspection
            valid_bounds_types = ["BooleanType", "ConstantType", "QuantityType", "ConstantRangeType"]

            if not empty_cov:
                for param in pdict.keys():
                    if pdict.get_context(param).param_type.__class__.__name__ in valid_bounds_types:
                        brick_domains_new, new_brick_list, brick_list_spans, tD, bD, min_data_bound, max_data_bound = self.inspect_bricks(
                            self.cov_pth, self._guid, param
                        )
                        # Update the metadata
                        pl_fixed.update_parameter_bounds(param, [min_data_bound, max_data_bound])
            pl_fixed.flush()
            fixed_cov.close()

            # Create backup copy of original Master and Parameter files
            if backup:
                import datetime
                orig_master_file = os.path.join(self.cov_pth, "{0}_master.hdf5".format(self._guid))

                # Generate the timestamp
                tstamp_format = "%Y%m%d%H%M%S"
                tstamp = datetime.datetime.now().strftime(tstamp_format)

                backup_master_file = os.path.join(self.cov_pth, "{0}_master.{1}.hdf5".format(self._guid, tstamp))

                shutil.copy2(orig_master_file, backup_master_file)

                for param in pdict.keys():
                    param_orig = os.path.join(orig_dir, param, "{0}.hdf5".format(param))
                    param_backup = os.path.join(orig_dir, param, "{0}.{1}.hdf5".format(param, tstamp))
                    shutil.copy2(param_orig, param_backup)

            # Copy Master and Parameter metadata files back to original/broken coverage (cov_pth) location
            if copy_over == True:
                shutil.copy2(
                    os.path.join(tempcov.persistence_dir, "{0}_master.hdf5".format(self._guid)),
                    os.path.join(self.cov_pth, "{0}_master.hdf5".format(self._guid)),
                )
                for param in pdict.keys():
                    shutil.copy2(
                        os.path.join(temp_dir, param, "{0}.hdf5".format(param)),
                        os.path.join(orig_dir, param, "{0}.hdf5".format(param)),
                    )

            # Reanalyze the repaired coverage
            self._ar = self._do_analysis(analyze_bricks=True)

            # Verify repair worked, clean up if not
            if self._ar.is_corrupt:
                # If the files were backed up then revert
                if backup:
                    # Remove backed up files and clean up the repair attempt
                    log.info("Repair attempt failed. Reverting to pre-repair state.")
                    # Use backup copy to replace post-repair file.
                    shutil.copy2(backup_master_file, orig_master_file)
                    # Delete the backup
                    os.remove(backup_master_file)

                    # Iterate over parameters and revert to pre-repair state
                    for param in pdict.keys():
                        param_orig = os.path.join(orig_dir, param, "{0}.hdf5".format(param))
                        param_backup = os.path.join(orig_dir, param, "{0}.{1}.hdf5".format(param, tstamp))
                        # Use backup copy to replace post-repair file.
                        shutil.copy2(param_backup, param_orig)
                        # Delete the backup
                        os.remove(param_backup)

                raise ValueError("Coverage repair failed! Revert to stored backup version, if possible.")

            # Remove temporary coverage
            if keep_temp == False:
                shutil.rmtree(tempcov_dir)
            else:
                return tempcov_dir
    else:
        log.info("Coverage is not corrupt, nothing to repair!")
def repair(self, backup=True, copy_over=True, keep_temp=False, reanalyze=False, analyze_bricks=False, detailed_analysis=False):
    """
    Heavy repair tool that recreates a blank persisted Coverage from the
    broken coverage's original construction parameters, then reconstructs
    the Master and Parameter metadata files by inspection of the ION
    objects and "valid" brick files.

    @param backup  when True, keep timestamped copies of the original Master
                   and Parameter files so a failed repair can be reverted
    @param copy_over  when True, copy the rebuilt metadata files back over
                      the broken coverage's files
    @param keep_temp  when True, keep the temporary scratch coverage
                      directory and return its path instead of deleting it
    @param reanalyze  force a fresh analysis even if a result is cached
    @param analyze_bricks  forwarded to the analysis step
    @param detailed_analysis  forwarded to the analysis step
    @return: the temporary coverage directory path when a repair ran and
             keep_temp is True; otherwise None
    @raise NotImplementedError: if brick files themselves are corrupt
    @raise ValueError: if the coverage still analyzes as corrupt after repair
    """
    # Reuse the cached analysis unless the caller forces a re-run
    if self._ar is None or reanalyze:
        self._ar = self._do_analysis(analyze_bricks=analyze_bricks, detailed_analysis=detailed_analysis)
    if self._ar.is_corrupt:
        if len(self._ar.get_brick_corruptions()) > 0:
            # Only metadata (Master/Parameter) repair is supported; brick data
            # files cannot be reconstructed here
            raise NotImplementedError('Brick corruption. Cannot repair at this time!!!')
        else:
            # Repair the Master and Parameter metadata files

            # Need the ParameterDictionary, TemporalDomain and SpatialDomain
            pdict = ParameterDictionary.load(self._dso.parameter_dictionary)
            tdom = GridDomain.load(self._dso.temporal_domain)
            sdom = GridDomain.load(self._dso.spatial_domain)

            # Set up the working directory for the recovered coverage
            tempcov_dir = tempfile.mkdtemp('covs')

            # Create the temporary Coverage (blank, same guid/construction params)
            tempcov = SimplexCoverage(root_dir=tempcov_dir, persistence_guid=self._guid, name=self._guid, parameter_dictionary=pdict, spatial_domain=sdom, temporal_domain=tdom)
            # Handle to persistence layer for tempcov
            pl = tempcov._persistence_layer

            # Set up the original and temporary coverage path strings
            orig_dir = os.path.join(self.cov_pth, self._guid)
            temp_dir = os.path.join(tempcov.persistence_dir, tempcov.persistence_guid)

            # Insert same number of timesteps into temporary coverage as in broken coverage
            brick_domains_new, new_brick_list, brick_list_spans, tD, bD, min_data_bound, max_data_bound = self.inspect_bricks(self.cov_pth, self._guid, 'time')
            # If brick_list_spans is None, there are no brick files --> no timesteps, empty coverage!
            empty_cov = brick_list_spans is None
            if not empty_cov:
                bls = [s.value for s in brick_list_spans]
                # b[3] holds per-brick active sizes; their sum is the timestep count
                maxes = [sum(b[3]) for b in new_brick_list.values()]
                tempcov.insert_timesteps(sum(maxes))

                # Replace metadata in the Master file
                pl.master_manager.brick_domains = brick_domains_new
                pl.master_manager.brick_list = new_brick_list

                # Repair ExternalLinks to brick files
                # NOTE(review): this handle is re-bound below without an explicit
                # close -- presumably relying on h5py/GC to release it; confirm.
                f = h5py.File(pl.master_manager.file_path, 'a')
                for param_name in pdict.keys():
                    # Drop the parameter group and rebuild its brick links from scratch
                    del f[param_name]
                    f.create_group(param_name)
                    for brick in bls:
                        link_path = '/{0}/{1}'.format(param_name, brick[0])
                        brick_file_name = '{0}.hdf5'.format(brick[0])
                        # Link target is relative to the temp coverage's persistence dir
                        brick_rel_path = os.path.join(pl.parameter_metadata[param_name].root_dir.replace(tempcov.persistence_dir, '.'), brick_file_name)
                        log.debug('link_path: %s', link_path)
                        log.debug('brick_rel_path: %s', brick_rel_path)
                        pl.master_manager.add_external_link(link_path, brick_rel_path, brick[0])

            pl.flush_values()
            pl.flush()
            tempcov.close()

            # Remove 'rtree' dataset from Master file if it already exists (post domain expansion)
            # to make way for reconstruction
            f = h5py.File(pl.master_manager.file_path, 'a')
            if 'rtree' in f.keys():
                del f['rtree']
            f.close()

            # Reconstruct 'rtree' dataset
            # Open temporary Coverage and PersistenceLayer objects
            fixed_cov = AbstractCoverage.load(tempcov.persistence_dir, mode='a')
            pl_fixed = fixed_cov._persistence_layer

            # Call update_rtree for each brick using PersistenceLayer builtin
            brick_count = 0
            if not empty_cov:
                for brick in bls:
                    rtree_extents, brick_extents, brick_active_size = pl_fixed.calculate_extents(brick[1][1], bD, tD)
                    pl_fixed.master_manager.update_rtree(brick_count, rtree_extents, obj=brick[0])
                    brick_count += 1

            # Update parameter_bounds property based on each parameter's brick data using deep inspection
            valid_bounds_types = ['BooleanType', 'ConstantType', 'QuantityType', 'ConstantRangeType']
            if not empty_cov:
                for param in pdict.keys():
                    if pdict.get_context(param).param_type.__class__.__name__ in valid_bounds_types:
                        brick_domains_new, new_brick_list, brick_list_spans, tD, bD, min_data_bound, max_data_bound = self.inspect_bricks(self.cov_pth, self._guid, param)
                        # Update the metadata
                        pl_fixed.update_parameter_bounds(param, [min_data_bound, max_data_bound])
            pl_fixed.flush()
            fixed_cov.close()

            # Create backup copy of original Master and Parameter files
            if backup:
                import datetime
                orig_master_file = os.path.join(self.cov_pth, '{0}_master.hdf5'.format(self._guid))

                # Generate the timestamp used to name all backup files in this run
                tstamp_format = '%Y%m%d%H%M%S'
                tstamp = datetime.datetime.now().strftime(tstamp_format)

                backup_master_file = os.path.join(self.cov_pth, '{0}_master.{1}.hdf5'.format(self._guid, tstamp))

                shutil.copy2(orig_master_file, backup_master_file)

                for param in pdict.keys():
                    param_orig = os.path.join(orig_dir, param, '{0}.hdf5'.format(param))
                    param_backup = os.path.join(orig_dir, param, '{0}.{1}.hdf5'.format(param, tstamp))
                    shutil.copy2(param_orig, param_backup)

            # Copy Master and Parameter metadata files back to original/broken coverage (cov_pth) location
            if copy_over == True:
                shutil.copy2(os.path.join(tempcov.persistence_dir, '{0}_master.hdf5'.format(self._guid)), os.path.join(self.cov_pth, '{0}_master.hdf5'.format(self._guid)))
                for param in pdict.keys():
                    shutil.copy2(os.path.join(temp_dir, param, '{0}.hdf5'.format(param)), os.path.join(orig_dir, param, '{0}.hdf5'.format(param)))

            # Reanalyze the repaired coverage
            self._ar = self._do_analysis(analyze_bricks=True)

            # Verify repair worked, clean up if not
            if self._ar.is_corrupt:
                # If the files were backed up then revert
                if backup:
                    # Remove backed up files and clean up the repair attempt
                    log.info('Repair attempt failed. Reverting to pre-repair state.')
                    # Use backup copy to replace post-repair file.
                    shutil.copy2(backup_master_file, orig_master_file)
                    # Delete the backup
                    os.remove(backup_master_file)

                    # Iterate over parameters and revert to pre-repair state
                    for param in pdict.keys():
                        param_orig = os.path.join(orig_dir, param, '{0}.hdf5'.format(param))
                        param_backup = os.path.join(orig_dir, param, '{0}.{1}.hdf5'.format(param, tstamp))
                        # Use backup copy to replace post-repair file.
                        shutil.copy2(param_backup, param_orig)
                        # Delete the backup
                        os.remove(param_backup)

                raise ValueError('Coverage repair failed! Revert to stored backup version, if possible.')

            # Remove temporary coverage
            if keep_temp == False:
                shutil.rmtree(tempcov_dir)
            else:
                return tempcov_dir
    else:
        log.info('Coverage is not corrupt, nothing to repair!')
def test_coverage_recovery(self):
    """
    End-to-end check of CoverageDoctor: analyze a healthy coverage, corrupt
    its Master and one Parameter metadata file, verify the corruption is
    detected, repair it, and confirm the repaired coverage loads and returns
    the original time values.
    """
    # Create the coverage
    dp_id, stream_id, route, stream_def_id, dataset_id = self.load_data_product()
    self.populate_dataset(dataset_id, 36)
    dset = self.dataset_management.read_dataset(dataset_id)
    dprod = self.dpsc_cli.read_data_product(dp_id)
    cov = DatasetManagementService._get_simplex_coverage(dataset_id)
    cov_pth = cov.persistence_dir

    # BUGFIX: capture everything needed from the coverage BEFORE closing it.
    # The previous version called cov.list_parameters() and read persistence
    # layer file paths after cov.close(), i.e. on a closed coverage.
    num_params = len(cov.list_parameters())
    master_file = cov._persistence_layer.master_manager.file_path
    lon_param_file = cov._persistence_layer.parameter_metadata['lon'].file_path
    cov.close()

    num_bricks = 8
    # total file count = one metadata file per parameter + brick files + master
    total = num_params + num_bricks + 1

    # Analyze the valid coverage
    dr = CoverageDoctor(cov_pth, dprod, dset)
    dr_result = dr.analyze()

    # TODO: Turn these into meaningful Asserts
    self.assertEqual(len(dr_result.get_brick_corruptions()), 0)
    self.assertEqual(len(dr_result.get_brick_size_ratios()), num_bricks)
    self.assertEqual(len(dr_result.get_corruptions()), 0)
    self.assertEqual(len(dr_result.get_master_corruption()), 0)
    self.assertEqual(len(dr_result.get_param_corruptions()), 0)
    self.assertEqual(len(dr_result.get_param_size_ratios()), num_params)
    self.assertEqual(len(dr_result.get_master_size_ratio()), 1)
    self.assertEqual(len(dr_result.get_size_ratios()), total)
    self.assertEqual(dr_result.master_status[1], 'NORMAL')

    self.assertFalse(dr_result.is_corrupt)
    self.assertEqual(dr_result.param_file_count, num_params)
    self.assertEqual(dr_result.brick_file_count, num_bricks)
    self.assertEqual(dr_result.total_file_count, total)

    # Get original values (mock)
    orig_cov = AbstractCoverage.load(cov_pth)
    time_vals_orig = orig_cov.get_time_values()
    orig_cov.close()

    # Corrupt the Master File -- with-statements guarantee the handles are
    # closed even if the write fails
    with open(master_file, 'wb') as fo:
        fo.write('Junk')
    # Corrupt the lon Parameter file
    with open(lon_param_file, 'wb') as fo:
        fo.write('Junk')

    corrupt_res = dr.analyze(reanalyze=True)
    self.assertTrue(corrupt_res.is_corrupt)

    # Repair the metadata files
    dr.repair(reanalyze=True)

    fixed_res = dr.analyze(reanalyze=True)
    self.assertFalse(fixed_res.is_corrupt)

    # Repaired coverage must load and carry the original time values
    fixed_cov = AbstractCoverage.load(cov_pth)
    self.assertIsInstance(fixed_cov, AbstractCoverage)
    time_vals_fixed = fixed_cov.get_time_values()
    fixed_cov.close()
    self.assertTrue(np.array_equiv(time_vals_orig, time_vals_fixed))
def get_dataset_xml(self, coverage_path, product_id, product_name='', available_fields=None):
    """
    Build an ERDDAP 'EDDGridFromDap' <dataset> XML entry describing the
    coverage persisted at coverage_path, for registration with an ERDDAP
    server.

    @param coverage_path     filesystem path of the persisted coverage
    @param product_id        used as the ERDDAP datasetID attribute
    @param product_name      optional display title; falls back to the
                             (unquoted) coverage name
    @param available_fields  optional whitelist of parameter names to expose;
                             None exposes every parameter
    @return: XML string containing zero or more <dataset> elements
    @raise BadRequest: if the coverage has no parameters, produced no
                       dimension groups, or a group holds only the temporal axis
    """
    #http://coastwatch.pfeg.noaa.gov/erddap/download/setupDatasetsXml.html
    result = ''
    # paths[1] is the coverage directory basename, appended to pydap_url below
    paths = os.path.split(coverage_path)
    cov = AbstractCoverage.load(coverage_path)
    doc = xml.dom.minidom.Document()

    #erd_type_map = {'d':'double', 'f':"float", 'h':'short', 'i':'int', 'l':'int', 'q':'int', 'b':'byte', 'b':'char', 'S':'String'}

    # Get lists of variables with unique sets of dimensions.
    # Datasets can only have variables with the same sets of dimensions
    if not cov.list_parameters():
        raise BadRequest('Attempting to register an empty dataset. The coverage (%s) has no definition.\n%s' % (coverage_path, cov))

    # Group parameter names by their dimension tuple: 1-D params on the
    # temporal axis only, 2-D params on (time, spatial shape)
    datasets = {}
    for key in cov.list_parameters():
        pc = cov.get_parameter_context(key)
        #if getattr(pc, 'visible', None):
        #    continue
        if available_fields and pc.name not in available_fields:
            continue
        #if not isinstance(pc.param_type, QuantityType):
        #    continue
        param = cov.get_parameter(key)
        dims = (cov.temporal_parameter_name,)
        if len(param.shape) == 2:
            dims = (cov.temporal_parameter_name, cov.spatial_domain.shape.name)
        if not dims in datasets.keys():
            datasets[dims] = []
        datasets[dims].append(key)

    # NOTE(review): index is incremented per emitted dataset but never read --
    # appears to be leftover bookkeeping; confirm before removing.
    index = 0
    if not datasets:
        raise BadRequest('Attempting to register a dimensionless dataset. The coverage (%s) has no dimension(s).\n%s' % (coverage_path, cov))

    for dims, vars in datasets.iteritems():
        # Map coverage parameter names to ERDDAP-safe destination names
        erd_name_map = self.get_errdap_name_map(vars)
        if len(vars) == 1:
            raise BadRequest('A dataset needs a proper range, not just the temporal dimension. %s\n%s' % (coverage_path, cov))
        # Skip the degenerate group that contains only the temporal axis itself
        if not (len(dims) == 1 and dims[0] == vars[0]):
            dataset_element = doc.createElement('dataset')
            dataset_element.setAttribute('type', 'EDDGridFromDap')
            dataset_element.setAttribute('datasetID', product_id)
            dataset_element.setAttribute('active', 'True')

            # Data is served to ERDDAP through the PyDAP endpoint
            source_element = doc.createElement('sourceUrl')
            text_node = doc.createTextNode(self.pydap_url + paths[1])
            source_element.appendChild(text_node)
            dataset_element.appendChild(source_element)

            # With caching enabled reload daily; otherwise every 5 minutes
            reload_element = doc.createElement('reloadEveryNMinutes')
            if self.CFG.get_safe('server.erddap.dataset_caching', True):
                text_node = doc.createTextNode('1440')
            else:
                text_node = doc.createTextNode('5')
            reload_element.appendChild(text_node)
            dataset_element.appendChild(reload_element)

            if self.CFG.get_safe('server.erddap.dataset_caching', True):
                refresh_interval = self.CFG.get_safe('server.erddap.refresh_interval', 30000)
                update_element = doc.createElement('updateEveryNMillis')
                text_node = doc.createTextNode(str(refresh_interval))
                update_element.appendChild(text_node)
                dataset_element.appendChild(update_element)

            # Dataset-level global attributes
            add_attributes_element = doc.createElement('addAttributes')
            atts = {}
            atts['title'] = product_name or urllib.unquote(cov.name)
            atts['infoUrl'] = self.pydap_url + paths[1]
            atts['institution'] = 'OOI'
            atts['Conventions'] = "COARDS, CF-1.6, Unidata Dataset Discovery v1.0"
            atts['license'] = '[standard]'
            atts['summary'] = cov.name
            atts['cdm_data_type'] = 'Grid'
            atts['standard_name_vocabulary'] = 'CF-12'
            for key, val in atts.iteritems():
                att_element = doc.createElement('att')
                att_element.setAttribute('name', key)
                text_node = doc.createTextNode(val)
                att_element.appendChild(text_node)
                add_attributes_element.appendChild(att_element)
            if len(add_attributes_element.childNodes) > 0:
                dataset_element.appendChild(add_attributes_element)

            # One <dataVariable> per parameter in this dimension group
            for var_name in vars:
                var = cov.get_parameter_context(var_name)
                units = "unknown"
                if hasattr(var, 'uom') and var.uom:
                    units = var.uom
                #if len(param.shape) >=1 and not param.is_coordinate:
                #dataVariable
                data_element = doc.createElement('dataVariable')

                source_name_element = doc.createElement('sourceName')
                text_node = doc.createTextNode(var.name)
                source_name_element.appendChild(text_node)
                data_element.appendChild(source_name_element)

                destination_name_element = doc.createElement('destinationName')
                text_node = doc.createTextNode(erd_name_map[var.name])
                destination_name_element.appendChild(text_node)
                data_element.appendChild(destination_name_element)

                add_attributes_element = doc.createElement('addAttributes')
                # Copy any declared parameter attributes that exist on the context.
                # NOTE(review): createTextNode(val) assumes val is a string once
                # falsy values are coerced to '' -- a truthy non-string attribute
                # would break serialization; confirm ATTRS value types.
                if var.ATTRS is not None:
                    for key in var.ATTRS:
                        if not hasattr(var, key):
                            continue
                        val = getattr(var, key)
                        if not val:
                            val = ''
                        att_element = doc.createElement('att')
                        att_element.setAttribute('name', key)
                        text_node = doc.createTextNode(val)
                        att_element.appendChild(text_node)
                        add_attributes_element.appendChild(att_element)

                att_element = doc.createElement('att')
                att_element.setAttribute('name', 'ioos_category')
                text_node = doc.createTextNode(self.get_ioos_category(var.name, units))
                att_element.appendChild(text_node)
                add_attributes_element.appendChild(att_element)

                att_element = doc.createElement('att')
                att_element.setAttribute('name', 'long_name')
                long_name = ""
                if hasattr(var, 'display_name') and var.display_name is not None:
                    long_name = var.display_name
                text_node = doc.createTextNode(long_name)
                att_element.appendChild(text_node)
                add_attributes_element.appendChild(att_element)

                att_element = doc.createElement('att')
                standard_name = ""
                if hasattr(var, 'standard_name') and var.standard_name is not None:
                    standard_name = var.standard_name
                att_element.setAttribute('name', 'standard_name')
                text_node = doc.createTextNode(standard_name)
                att_element.appendChild(text_node)
                add_attributes_element.appendChild(att_element)

                att_element = doc.createElement('att')
                att_element.setAttribute('name', 'units')
                text_node = doc.createTextNode(units)
                att_element.appendChild(text_node)
                add_attributes_element.appendChild(att_element)

                data_element.appendChild(add_attributes_element)
                dataset_element.appendChild(data_element)
            index += 1

            #bug with prettyxml
            #http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
            #result += dataset_element.toprettyxml() + '\n'
            result += dataset_element.toxml() + '\n'

    cov.close()
    return result