def dest_path_anmn_nrs_realtime(filepath):
    """Return the relative path a given netCDF file should be published to.

    The destination is derived from the file name and from the file's
    global ``site_code`` attribute. Only works for ANMN NRS real-time files.

    :param filepath: full path of the file
    :return: relative destination path including file name
    :raises InvalidFileContentError: if the file has no site_code attribute
    :raises InvalidInputFileError: if the file name matches no known product
    """
    filename = os.path.basename(filepath)

    # Read the site code from the netCDF global attributes.
    with Dataset(filepath, mode='r') as dataset:
        site_code = getattr(dataset, 'site_code', '')
    if not site_code:
        raise InvalidFileContentError("File '{name}' has no site_code attribute!".format(name=filename))

    # Map each known file-name pattern to its product sub-directory.
    product_subdirs = (
        ('IMOS_ANMN-NRS_MT_.*-Surface-.*-MET', 'Meteorology'),
        ('IMOS_ANMN-NRS_W_.*-Surface-.*-wave', 'Wave'),
        ('IMOS_ANMN-NRS_TPSOBUE_.*-SubSurface-.*-WQM', 'Biogeochem_timeseries'),
    )
    for pattern, product in product_subdirs:
        if re.match(pattern, filename):
            break
    else:
        raise InvalidInputFileError(
            "File name '{name}' doesn't match pattern for any known NRS real-time product".format(name=filename)
        )

    return os.path.join('IMOS', 'ANMN', 'NRS', 'REAL_TIME', site_code, product, filename)
def get_product_type(netcdf_path):
    """Return the ``product_type`` global attribute of a netCDF file.

    :param netcdf_path: full path of the netCDF file
    :return: value of the file's product_type attribute
    :raises InvalidInputFileError: if the attribute is missing
    """
    with Dataset(netcdf_path, mode='r') as nc_obj:
        try:
            return nc_obj.product_type
        except AttributeError:
            # Fixed message: previously read "netCDF'<name>'" (missing space)
            # and used a misleading 'gzip' placeholder name.
            raise InvalidInputFileError(
                "Expecting 'product_type' attribute in netCDF '{name}'".format(
                    name=os.path.basename(netcdf_path)))
def preprocess(self):
    """
    Preprocessing for NRT and DM files
    - NRT: generate a NetCDF files based on input text file.
      Set the input file publish_type property to 'archive'
    - DM file collection: update the check_type and publish_type properties
      for non-NetCDF files. These files are not checked or harvested, but
      uploaded to S3
    """
    # Prefer a ship callsign list supplied via custom_params; otherwise fall
    # back to the list returned by ship_callsign_list().
    if self.custom_params is not None and self.custom_params.get(
            'ship_callsign_ls'):
        self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
    else:
        self.ship_callsign_ls = ship_callsign_list()

    # Delayed mode file submitted as a zip archive
    if self.file_extension == '.zip':
        nc_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        if len(nc_file) != 1:
            raise InvalidInputFileError(
                "Expecting one netCDF file in ZIP archive '{zip}'".format(
                    zip=os.path.basename(self.input_file)))

        # first process the NetCDF file to set the destination path for the file collection
        nc = nc_file[0]
        nc.dest_path = self.dest_path(nc.src_path)
        nc_dir_path = os.path.dirname(nc.dest_path)

        # SOOP-CO2 DM and FRMAP .txt,.pdf or/and .xml files.
        # Set check type to FORMAT_CHECK and publish type to UPLOAD_ONLY
        non_nc_files = PipelineFileCollection(
            f for f in self.file_collection
            if f.file_type is not FileType.NETCDF)
        for non_nc in non_nc_files:
            non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
            non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
            # Publish ancillary files alongside the NetCDF, in the same directory.
            non_nc.dest_path = os.path.join(nc_dir_path, non_nc.name)

    elif self.input_file.endswith('dat.txt'):
        # Single text file Realtime files (*dat.txt): archive the raw text,
        # then generate a NetCDF from it and harvest/upload that instead.
        rt_file = self.file_collection[0]
        rt_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

        nrt_nc_file_path = soop_co2_nrt_nc_generator.process_co2_rt(
            rt_file, self.products_dir, self.ship_callsign_ls)
        nrt_nc_file = PipelineFile(nrt_nc_file_path)
        self.file_collection.add(nrt_nc_file)
        nrt_nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
def preprocess(self):
    """
    Preprocessing of Zip archive and NetCDF files

    Preprocessing consists in setting the destination path AND deleting
    previous version files
    - Zip contains netcdf, images, text, doc, or xml file and raw file to
      archive. dest_path is generated based on info stored in FV01 NetCDF
      file. update check_type and publish_type according to destination:
      - raw files: move to archive => publish_type property to 'archive'
      - text, doc, xml, images: basic checks, uploaded to S3
        => set check_type and publish_type attributes accordingly
    """
    netcdf = self.file_collection.filter_by_attribute_id(
        'file_type', FileType.NETCDF)
    if len(netcdf) != 1:
        raise InvalidInputFileError(
            "Expecting one netCDF file from input file '{infile}'".format(
                infile=os.path.basename(self.input_file)))

    # Destination for the whole collection is derived from the FV01 NetCDF.
    nc = netcdf[0]
    destination = dest_path_soop_ba(nc)
    nc.dest_path = os.path.join(destination, nc.name)

    # List objects already on storage at the destination so superseded
    # versions can be scheduled for deletion.
    results = self.state_query.query_storage(destination).keys()
    files_to_delete = self.get_previous_version(results, destination, nc.name)
    if files_to_delete:
        self.file_collection.update(files_to_delete)

    if self.file_type is FileType.ZIP:
        non_nc_files = PipelineFileCollection(
            f for f in self.file_collection
            if f.file_type is not FileType.NETCDF)
        for non_nc in non_nc_files:
            non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
            if non_nc.extension in ['.ek5', '.out', '.raw']:
                # Raw instrument files are archived, not published to S3.
                non_nc.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                dest_archive = archive_path_soop_ba(nc)
                non_nc.archive_path = os.path.join(dest_archive, non_nc.name)
            else:
                # Ancillary documents go to S3 next to the NetCDF; delete any
                # previous version of the same document.
                non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                non_nc.dest_path = os.path.join(destination, non_nc.name)
                files_to_delete = self.get_previous_version(
                    results, destination, non_nc.name)
                if files_to_delete:
                    self.file_collection.update(files_to_delete)
def process_zip_common(self, mode):
    """Locate the primary NetCDF in a ZIP upload and set the upload destination.

    :param mode: 'RT' (expects one FV00 file) or 'DM' (expects one FV01 file)
    :raises ValueError: if mode is neither 'RT' nor 'DM'
    :raises InvalidInputFileError: if the archive does not hold exactly one
        matching NetCDF file
    """
    # Per-mode settings: (file-name regex, expected product level label).
    mode_settings = {
        'RT': (AnfogFileClassifier.ANFOG_RT_REGEX, 'FV00'),
        'DM': (AnfogFileClassifier.DM_REGEX, 'FV01'),
    }
    if mode not in mode_settings:
        raise ValueError("invalid mode '{mode}'".format(mode=mode))
    regex, expected_level = mode_settings[mode]

    matches = self.file_collection.filter_by_attribute_regex('name', regex)
    if len(matches) != 1:
        raise InvalidInputFileError(
            "Expecting one '{file_type}' NetCDF file in ZIP archive '{zip}'"
            .format(file_type=expected_level,
                    zip=os.path.basename(self.input_file)))

    # use the FV00/01 NetCDF file to set the destination path for the file collection
    primary = matches[0]
    self.primary_nc = primary
    self.upload_destination = AnfogFileClassifier.get_destination(
        primary.src_path)
def get_data_mode(self):
    """
    1) Set data mode based on NetCDF product type
       If FV01 => DM
       If FV00 is ANFOG_RT => RT, then also check the zip contains ancillary
       files (.png or position_summary.txt file); if not present => missing
       RT material
    2) Set format_check type specific to product type (FV00/01) and origin
       (ANFOG, DSTG or NRL)

    :return: 'DM' or 'RT'
    :raises InvalidFileContentError: RT upload without ancillary files
    :raises InvalidInputFileError: no FV00/FV01 NetCDF found in the upload
    """
    fv01 = self.file_collection.filter_by_attribute_regex(
        'name', AnfogFileClassifier.DM_REGEX)
    # Combined pattern matching both adapter and DSTG products.
    adapter_dstg = '%s|%s' % (AnfogFileClassifier.ADAPTER_REGEX,
                              AnfogFileClassifier.DSTG_REGEX)
    anfog_rt = self.file_collection.filter_by_attribute_regex(
        'name', AnfogFileClassifier.ANFOG_RT_REGEX)

    if fv01:
        if re.match(adapter_dstg, fv01[0].name):
            # Adapter and DSTG file not cf and imos compliant
            fv01[0].check_type = PipelineFileCheckType.FORMAT_CHECK
        return 'DM'
    elif anfog_rt:
        # RT file not compliant
        anfog_rt[0].check_type = PipelineFileCheckType.FORMAT_CHECK
        # RT uploads must also carry at least one PNG ancillary file.
        png = self.file_collection.filter_by_attribute_regex(
            'name', AnfogFileClassifier.RT_PNG_REGEX)
        if png:
            return "RT"
        else:
            raise InvalidFileContentError(
                "Missing ancillary files(PNGs or summary position file) in ZIP archive {name}"
                .format(name=os.path.basename(self.input_file)))
    else:
        raise InvalidInputFileError(
            "Expecting one NetCDF file in ZIP archive '{zip}'".format(
                zip=os.path.basename(self.input_file)))
def preprocess(self):
    """Prepare a GSLA upload (*.nc or *.nc.gz) for harvest/upload.

    - A bare NetCDF is gzipped first (historically files always arrived as
      *.nc.gz; since April 2021 they may arrive as *.nc).
    - The *.nc.gz is what gets harvested/uploaded; the plain *.nc is a
      NO_ACTION helper used only to compute the destination path.
    - If a previous version already exists on storage, creation dates are
      compared: an older incoming file is rejected, a newer one schedules
      deletion of the previous version.

    :raises InvalidInputFileError: if no single NetCDF is available
    :raises InvalidFileNameError: if the incoming file is older than the
        version already on storage
    """
    # if input file is a NetCDF, create a .nc.gz and harvest upload it.
    if self.file_type is FileType.NETCDF:
        self.file_collection.set_publish_types(
            PipelineFilePublishType.NO_ACTION)
        gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
        with open(self.input_file, 'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
            gz_out.writelines(f_in)

        # publish
        self.add_to_collection(
            gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

    if self.file_type is FileType.GZIP:
        # add nc_gz file to collection (not added by default)
        self.file_collection.add(self.input_file_object)

    netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id(
        'file_type', FileType.GZIP)
    netcdf_file_gz = netcdf_file_gz_collection[0]
    netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

    # some GSLA files are gzipped, so gunzip them before checking them
    # if uploaded file is GZIP check that GZIP contains a NetCDF
    netcdf_collection = self.file_collection.filter_by_attribute_id(
        'file_type', FileType.NETCDF)
    if len(netcdf_collection) != 1:
        raise InvalidInputFileError(
            "Expecting one netCDF file in GZIP archive '{gzip}'".format(
                gzip=os.path.basename(self.input_file)))

    netcdf_file_gz = self.file_collection.filter_by_attribute_id(
        'file_type', FileType.GZIP)[0]
    netcdf_file = self.file_collection.filter_by_attribute_id(
        'file_type', FileType.NETCDF)[0]

    # setting the path of the gz file with the gunzipped file
    netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)

    # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
    netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

    # we don't know the product type (DM00 or DM01) of the file already on
    # s3 in order to deduce its path. We need to get the product type from
    # the file in incoming
    result_previous_version_creation_date = self.get_previous_version_creation_date(
        netcdf_file.src_path)

    # default values: by default we push to the storage the file landed in
    # the pipeline (ie *.nc.gz)
    push_new_file = True
    remove_previous_version = False
    previous_file_path = None  # only set when a newer file supersedes an old one

    # compare creation dates with file already on storage
    if result_previous_version_creation_date:
        new_file_creation_date = get_creation_date(netcdf_file.name)
        if result_previous_version_creation_date > new_file_creation_date:
            push_new_file = False
        elif result_previous_version_creation_date == new_file_creation_date:
            push_new_file = True
        else:
            remove_previous_version = True
            previous_file_path = self.get_previous_version_object(
                netcdf_file.src_path)

    if push_new_file:
        if GSLA_REGEX_YEARLY.match(netcdf_file.name):
            # yearly file should never be harvested
            netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
    else:
        # BUG FIX: the message previously had no '{filename}' placeholder,
        # so the offending file name never appeared in the error.
        raise InvalidFileNameError(
            "file name: \"{filename}\" creation date is older than file already on "
            "storage".format(filename=netcdf_file_gz.name))

    # deletion of the previous file
    if remove_previous_version:
        previous_file_name = os.path.basename(previous_file_path)
        file_to_delete = PipelineFile(
            previous_file_name,
            is_deletion=True,
            dest_path=previous_file_path,
            file_update_callback=self._file_update_callback)

        if GSLA_REGEX_YEARLY.match(netcdf_file.name):
            # yearly files were never harvested, so no un-harvest needed
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
        else:
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

        self.file_collection.add(file_to_delete)
def preprocess(self):
    """
    Preprocessing for RT and DM files. Processes ZIP, single NetCDF and
    single TXT files. Set destination path based on info in NetCDF files.
    Update ANFOG deployment status record table stored in anforg_rt schema.

    Status are set by processing a status text file, except for status
    'in-progress' which is set by the pipeline. These status text files are
    either pushed to incoming by POs manually (delayed-mode, renamed), or by
    the facility. Difference in letter case reflects the origin of the
    status file:
      => lower case: status files manually pushed to incoming by POs, or set
         by the pipeline
      => upper case: status files uploaded by the facility
    Status are converted to lowercase and written to the harvest listing file.

    File requirements:
      1- File name like: PL-Mission_status.txt (PL platform: SG seaglider or
         SL slocum_glider), for example:
             SL-Portland20190218_renamed.txt
             SL-Portland20190218_RECOVERED.txt
         Note that the message cannot contain underscores, otherwise the
         process fails (see function get_destination)
      2- File must have size > 0, but its content is not relevant.

    Valid status are:
      'in-progress'      : set by pipeline upon reception of first NRT file of a
                           new NRT deployment. No further action
      'delayed-mode'     : set by pipeline upon reception of new DM dataset.
                           Triggers deletion of relevant NRT files from S3
      'renamed'          : uploaded by PO when error in deployment name (either
                           error in date or deployment name). Triggers clearing
                           of relevant NRT files from S3
      'RECOVERED'        : uploaded by facility within 12-24h of glider
                           recovery. No further action
      'ABORTED'          : uploaded by facility after aborting mission and
                           within 12-24h of glider recovery. No further action
      'POTENTIALLY-LOST' : uploaded by facility when glider becomes
                           irresponsive for an extended period. No further action
      'LOST'             : uploaded by facility when glider is definitely lost
                           at sea. NRT files of a lost glider should ultimately
                           be deleted by the PO within a couple of weeks using
                           the cleanup status message
      'clear-files'      : uploaded by PO. Triggers deletion of relevant NRT
                           files from S3. Used for cleaning the S3 REALTIME
                           folder of deployments that will not be processed in
                           delayed-mode (e.g. aborted mission with no valid
                           data, or lost glider)
                           NOTE(review): an earlier docstring called this
                           'cleanup-files'; the code checks 'clear-files' —
                           confirm against AnfogHandler.VALID_STATUS.
    """
    input_file_basename = os.path.basename(self.input_file)

    if self.input_file.endswith('.txt'):
        # Status message file: never checked or published, only interpreted.
        txt = self.file_collection.filter_by_attribute_regex(
            'extension', '.txt')
        txt[0].check_type = PipelineFileCheckType.NO_ACTION
        txt[0].publish_type = PipelineFilePublishType.NO_ACTION

        # BUG FIX: previously used .strip('.txt'), which strips a CHARACTER
        # SET ('.', 't', 'x') from both ends and would corrupt any status
        # word starting/ending with those characters. splitext removes only
        # the extension.
        message = os.path.splitext(input_file_basename.split('_')[1])[0]
        if message not in AnfogHandler.VALID_STATUS:
            raise InvalidInputFileError(
                "Invalid status message {m}."
                "Message can be either 'delayed-mode', 'renamed', 'RECOVERED'"
                "'POTENTIALLY_LOST', 'LOST', 'ABORTED' or 'clear-files'".
                format(m=message))

        self.upload_destination = AnfogFileClassifier.get_destination(
            self.input_file)
        if message in ['renamed', 'clear-files']:
            # Both statuses invalidate the NRT files currently on S3.
            self.delete_previous_version('RT', message)
        if message != 'clear-files':
            # the "clear-file" message is not harvested as it is not relevant to deployment status
            self.set_deployment_status(self.input_file, message)

    elif (self.file_type is FileType.ZIP) or re.match(
            AnfogFileClassifier.DM_REGEX, input_file_basename):
        mode = self.get_data_mode()
        if self.file_type is FileType.ZIP and mode == 'DM':
            self.process_zip_dm()
        elif self.file_type is FileType.ZIP and mode == 'RT':
            self.process_zip_rt()
        elif re.match(AnfogFileClassifier.DM_REGEX, input_file_basename):
            # In Delayed mode, single NetCDF file upload only valid for updates
            # => Check that deployment exists on S3
            self.primary_nc = self.file_collection[0]
            self.upload_destination = AnfogFileClassifier.get_destination(
                self.primary_nc.src_path)
            results = self.state_query.query_storage(self.upload_destination)
            if results:
                self.delete_previous_version('DM', 'update')
            else:
                raise MissingFileError(
                    "New delayed mode deployment. NetCDF file '{file}' "
                    "should have been submitted with ancillary material".
                    format(file=os.path.basename(self.primary_nc.src_path)))
    else:
        raise InvalidInputFileError(
            "Cannot process the uploaded file {name}.".format(
                name=input_file_basename))