Code example #1
File: classifiers.py  Project: aodn/python-aodndata
def dest_path_anmn_nrs_realtime(filepath):
    """Returns the relative path a given netCDF file should be published to, based on the name and content of the file.
    Only works for ANMN NRS real-time files.

    :param filepath: full path of the file
    :return: relative destination path including file name
    """

    filename = os.path.basename(filepath)

    # Start with base path for this sub-facility
    path_list = ['IMOS', 'ANMN', 'NRS', 'REAL_TIME']

    # add site code
    with Dataset(filepath, mode='r') as f:
        site_code = getattr(f, 'site_code', '')
    if not site_code:
        raise InvalidFileContentError("File '{name}' has no site_code attribute!".format(name=filename))
    path_list.append(site_code)

    # add product sub-directory
    if re.match('IMOS_ANMN-NRS_MT_.*-Surface-.*-MET', filename):
        path_list.append('Meteorology')
    elif re.match('IMOS_ANMN-NRS_W_.*-Surface-.*-wave', filename):
        path_list.append('Wave')
    elif re.match('IMOS_ANMN-NRS_TPSOBUE_.*-SubSurface-.*-WQM', filename):
        path_list.append('Biogeochem_timeseries')
    else:
        raise InvalidInputFileError(
            "File name '{name}' doesn't match pattern for any known NRS real-time product".format(name=filename)
        )

    path_list.append(filename)

    return os.path.join(*path_list)
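For context, a minimal usage sketch of this classifier. The file name and site_code below are hypothetical, and the imports (os, re, netCDF4.Dataset and the aodncore pipeline exceptions) are assumed to be present in the surrounding module:

from netCDF4 import Dataset

# Hypothetical source file; the name matches the Meteorology pattern above
src = '/tmp/IMOS_ANMN-NRS_MT_20190101T000000Z_NRSMAI-Surface-buoy-MET_FV00.nc'
with Dataset(src, mode='w') as f:
    f.site_code = 'NRSMAI'  # global attribute read back by the classifier

print(dest_path_anmn_nrs_realtime(src))
# -> IMOS/ANMN/NRS/REAL_TIME/NRSMAI/Meteorology/IMOS_ANMN-NRS_MT_..._FV00.nc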
Code example #2
def get_product_type(netcdf_path):
    with Dataset(netcdf_path, mode='r') as nc_obj:
        try:
            return nc_obj.product_type
        except AttributeError:
            raise InvalidInputFileError(
                "Expecting 'product_type' attribute in netCDF file '{name}'".format(
                    name=os.path.basename(netcdf_path)))
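The same check can be written with getattr and a default, mirroring the site_code lookup in example #1. A sketch under the same assumed imports (the function name here is hypothetical):

def get_product_type_alt(netcdf_path):
    # Read the attribute while the dataset is open, then validate outside
    # the context manager so the file is closed before raising
    with Dataset(netcdf_path, mode='r') as nc_obj:
        product_type = getattr(nc_obj, 'product_type', None)
    if product_type is None:
        raise InvalidInputFileError(
            "Expecting 'product_type' attribute in netCDF file '{name}'".format(
                name=os.path.basename(netcdf_path)))
    return product_type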
Code example #3
    def preprocess(self):
        """ Preprocessing for NRT and DM files
           - NRT: generate a NetCDF files based on input text file.
             Set the input file publish_type property to 'archive'
           - DM file collection: update the check_type and publish_type properties for non-NetCDF files.
             These files are not checked or harvested, but uploaded to S3

        """
        if self.custom_params is not None and self.custom_params.get(
                'ship_callsign_ls'):
            self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
        else:
            self.ship_callsign_ls = ship_callsign_list()

        # Delayed mode file submitted as a zip archive
        if self.file_extension == '.zip':
            nc_file = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(nc_file) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in ZIP archive '{zip}'".format(
                        zip=os.path.basename(self.input_file)))

            # first process the NetCDF file to set the destination path for the file collection
            nc = nc_file[0]
            nc.dest_path = self.dest_path(nc.src_path)
            nc_dir_path = os.path.dirname(nc.dest_path)

            # SOOP-CO2 DM and FRMAP .txt, .pdf and/or .xml files:
            # set check type to FORMAT_CHECK and publish type to UPLOAD_ONLY
            non_nc_files = PipelineFileCollection(
                f for f in self.file_collection
                if f.file_type is not FileType.NETCDF)
            for non_nc in non_nc_files:
                non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
                non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                non_nc.dest_path = os.path.join(nc_dir_path, non_nc.name)

        elif self.input_file.endswith('dat.txt'):
            # Real-time data arrives as a single text file (*dat.txt)
            rt_file = self.file_collection[0]
            rt_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

            nrt_nc_file_path = soop_co2_nrt_nc_generator.process_co2_rt(
                rt_file, self.products_dir, self.ship_callsign_ls)
            nrt_nc_file = PipelineFile(nrt_nc_file_path)
            self.file_collection.add(nrt_nc_file)
            nrt_nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
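The "exactly one NetCDF in the collection" guard above recurs in several of these handlers (see also examples #4 and #5). A hypothetical helper capturing the pattern; the name is illustrative, not part of the project:

def require_single_netcdf(collection, container_name):
    # Filter the collection down to NetCDF members and insist on exactly one
    nc_files = collection.filter_by_attribute_id('file_type', FileType.NETCDF)
    if len(nc_files) != 1:
        raise InvalidInputFileError(
            "Expecting one netCDF file in '{name}'".format(name=container_name))
    return nc_files[0]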
Code example #4
    def preprocess(self):
        """ Preprocessing of Zip archive and NetCDF files
            Preprocessing consist in setting the destination path AND deleting previous version files
            - Zip contains netcdf , images ,text, doc, or xml file and raw file to archive
             dest_path is generated based on info stored in FV01 NetCDF file.
             update check_type and publish_type according to destination :
             raw files :  move to archive =>publish_type property to 'archive'
            - text, doc, xml, images: basic checks
              uploaded to S3 => set check_type and publish_type attributesge accordingly
        """

        netcdf = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        if len(netcdf) != 1:
            raise InvalidInputFileError(
                "Expecting one netCDF file from input file '{infile}'".format(
                    infile=os.path.basename(self.input_file)))

        nc = netcdf[0]
        destination = dest_path_soop_ba(nc)
        nc.dest_path = os.path.join(destination, nc.name)

        results = self.state_query.query_storage(destination).keys()
        files_to_delete = self.get_previous_version(results, destination,
                                                    nc.name)
        if files_to_delete:
            self.file_collection.update(files_to_delete)

        if self.file_type is FileType.ZIP:
            non_nc_files = PipelineFileCollection(
                f for f in self.file_collection
                if f.file_type is not FileType.NETCDF)
            for non_nc in non_nc_files:
                non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
                if non_nc.extension in ['.ek5', '.out', '.raw']:
                    non_nc.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                    dest_archive = archive_path_soop_ba(nc)
                    non_nc.archive_path = os.path.join(dest_archive,
                                                       non_nc.name)
                else:
                    non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                    non_nc.dest_path = os.path.join(destination, non_nc.name)

                    files_to_delete = self.get_previous_version(
                        results, destination, non_nc.name)
                    if files_to_delete:
                        self.file_collection.update(files_to_delete)
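The extension-based routing in the loop above could be lifted into a named constant. A small sketch; the names are hypothetical, not part of the project:

# Raw acoustic data extensions that are archived rather than published
RAW_ARCHIVE_EXTENSIONS = ('.ek5', '.out', '.raw')

def is_raw_archive_file(pipeline_file):
    return pipeline_file.extension in RAW_ARCHIVE_EXTENSIONS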
Code example #5
File: handlers.py  Project: aodn/python-aodndata
    def process_zip_common(self, mode):
        if mode == 'RT':
            regex = AnfogFileClassifier.ANFOG_RT_REGEX
            file_type = 'FV00'
        elif mode == 'DM':
            regex = AnfogFileClassifier.DM_REGEX
            file_type = 'FV01'
        else:
            raise ValueError("invalid mode '{mode}'".format(mode=mode))

        netcdf_collection = self.file_collection.filter_by_attribute_regex(
            'name', regex)
        if len(netcdf_collection) != 1:
            raise InvalidInputFileError(
                "Expecting one '{file_type}' NetCDF file in ZIP archive '{zip}'"
                .format(file_type=file_type,
                        zip=os.path.basename(self.input_file)))

        nc = netcdf_collection[0]
        # use the FV00/01 NetCDF file to set the destination path for the file collection
        self.primary_nc = nc
        self.upload_destination = AnfogFileClassifier.get_destination(
            nc.src_path)
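Hypothetical call sites inside the handler class, per the mode argument handled above:

self.process_zip_common('RT')       # real-time ZIP: expects a single FV00 file
self.process_zip_common('DM')       # delayed-mode ZIP: expects a single FV01 file
self.process_zip_common('other')    # raises ValueError: "invalid mode 'other'"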
Code example #6
File: handlers.py  Project: aodn/python-aodndata
    def get_data_mode(self):
        """
            1) Set data mode based on NetCDF product type
            If FV01 => DM
            If FV00 is ANFOG_RT =>  RT, then also check zip contain ancillary files,
            (.png or position_summary.txt file)
            If not present => missing RT material
            2) Set format_check type specific to product type(FV00/01) and origin(ANFOG, DSTG or NRL)
        """
        fv01 = self.file_collection.filter_by_attribute_regex(
            'name', AnfogFileClassifier.DM_REGEX)
        adapter_dstg = '%s|%s' % (AnfogFileClassifier.ADAPTER_REGEX,
                                  AnfogFileClassifier.DSTG_REGEX)
        anfog_rt = self.file_collection.filter_by_attribute_regex(
            'name', AnfogFileClassifier.ANFOG_RT_REGEX)

        if fv01:
            if re.match(adapter_dstg, fv01[0].name):
                # Adapter and DSTG files are not CF- and IMOS-compliant
                fv01[0].check_type = PipelineFileCheckType.FORMAT_CHECK

            return 'DM'
        elif anfog_rt:
            # RT file not compliant
            anfog_rt[0].check_type = PipelineFileCheckType.FORMAT_CHECK
            png = self.file_collection.filter_by_attribute_regex(
                'name', AnfogFileClassifier.RT_PNG_REGEX)
            if png:
                return "RT"
            else:
                raise InvalidFileContentError(
                    "Missing ancillary files(PNGs or summary position file) in ZIP archive {name}"
                    .format(name=os.path.basename(self.input_file)))
        else:
            raise InvalidInputFileError(
                "Expecting one NetCDF file in ZIP archive '{zip}'".format(
                    zip=os.path.basename(self.input_file)))
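The adapter_dstg variable above is a plain regex alternation of the two class-level patterns. A self-contained illustration with made-up patterns (the real ones live on AnfogFileClassifier):

import re

ADAPTER_REGEX = r'^ADAPTER_'   # hypothetical stand-in
DSTG_REGEX = r'^DSTG_'         # hypothetical stand-in
combined = '%s|%s' % (ADAPTER_REGEX, DSTG_REGEX)

print(bool(re.match(combined, 'DSTG_mission01_FV01.nc')))   # True
print(bool(re.match(combined, 'IMOS_ANFOG_mission01.nc')))  # False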
Code example #7
    def preprocess(self):

        # If the input file is a NetCDF, create a .nc.gz and harvest/upload it.
        # Historically, files were always sent as *.nc.gz, but as of April 2021 files might be pushed as *.nc.
        # To be consistent, we transform this .nc into a .nc.gz
        if self.file_type is FileType.NETCDF:
            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)

            gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
            with open(self.input_file,
                      'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
                gz_out.writelines(f_in)

            # publish
            self.add_to_collection(
                gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

        if self.file_type is FileType.GZIP:
            # add the input .nc.gz file to the collection (it is not added by default)
            self.file_collection.add(self.input_file_object)
            netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.GZIP)
            netcdf_file_gz = netcdf_file_gz_collection[0]
            netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

            # some GSLA files are gzipped, so they are gunzipped before checking;
            # if the uploaded file is a GZIP, check that it contains one NetCDF
            netcdf_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(netcdf_collection) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in GZIP archive '{gzip}'".
                    format(gzip=os.path.basename(self.input_file)))

        netcdf_file_gz = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        netcdf_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        # set the dest_path of the .gz file based on the gunzipped NetCDF file
        netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)
        # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
        netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

        # we don't know the product type (DM00 or DM01) of the file already
        # on S3, which we need in order to deduce its path; get the product
        # type from the file in incoming instead
        result_previous_version_creation_date = self.get_previous_version_creation_date(
            netcdf_file.src_path)
        """ default values
        by default we push to the storage the file landed in the pipeline (ie *.nc.gz) """
        push_new_file = True
        remove_previous_version = False

        # compare creation dates with file already on storage
        if result_previous_version_creation_date:
            new_file_creation_date = get_creation_date(netcdf_file.name)
            if result_previous_version_creation_date > new_file_creation_date:
                push_new_file = False
            elif result_previous_version_creation_date == new_file_creation_date:
                push_new_file = True
            else:
                remove_previous_version = True
                previous_file_path = self.get_previous_version_object(
                    netcdf_file.src_path)

        if push_new_file:
            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                # yearly file should never be harvested
                netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
        else:
            raise InvalidFileNameError(
                "file name '{filename}': creation date is older than the file already on "
                "storage".format(filename=netcdf_file_gz.name))

        # deletion of the previous file
        if remove_previous_version:
            previous_file_name = os.path.basename(previous_file_path)
            file_to_delete = PipelineFile(
                previous_file_name,
                is_deletion=True,
                dest_path=previous_file_path,
                file_update_callback=self._file_update_callback)

            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

            self.file_collection.add(file_to_delete)
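The creation-date comparison above reduces to a small decision table. A hypothetical distillation (the handler itself interleaves these branches with the PipelineFile bookkeeping):

def version_decision(previous_date, new_date):
    # previous_date is None when no previous version exists on storage
    if previous_date is None:
        return 'push'              # first upload of this product
    if previous_date > new_date:
        return 'reject'            # incoming file is older than the published one
    if previous_date == new_date:
        return 'push'              # same creation date: overwrite in place
    return 'push_and_delete'       # newer file: publish it, delete the old object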
Code example #8
File: handlers.py  Project: aodn/python-aodndata
    def preprocess(self):
        """ Preprocessing for RT and DM files.
        Processes ZIP, single NetCDF and single TXT files
        Set destination path based on info in NetCDF files
        Update ANFOG deployment status record table stored in anforg_rt schema

        Status are set by processing a status text file except for status 'in-progress' set by the pipeline.

        These status text files are either pushed to incoming by POs manually(delayed-mode, renamed), or by facility.
        Difference in letter case reflect the origin of the status file:
        => lower case status files manually pushed in incoming by POs, or set by pipeline
        => uppercase status files uploded by facility
        Status are converted to lowercase and written to the HarvestLising file.

        File requirements:
        1- File name like : PL-Mission_status.txt (PL platform: SG seaglider or SL slocum_glider)
                           For ex:
                           SL-Portland20190218_renamed.txt
                           SL-Portland20190218_RECOVERED.txt

        Note that the message cannot contain undercores otherwise the process fails (see function get_destination)

        2- File must be size > 0 but its content is not relevant.

        Valid status are : 'in-progress' : set by pipeline upon reception of first NRT file of a new NRT deployment.
                                           No further action
                           'delayed-mode' : set py pipeline upon reception of new DM dataset. Triggers
                                           deletion of relevant NRT files from S3
                           'renamed' : uploaded by PO when error in deployment name
                                       (either error in date or deployement name). This status triggers clearing of
                                       relevant NRT files from S3.
                           'RECOVERED' : uploaded by facility within 12-24h of glider recovery. No further action.
                                         No further action
                           'ABORTED' : uploaded by facility after aborting mission and within 12-24h of glider recovery.
                                       No further action
                           'POTENTIALLY-LOST' : uploaded by facility when glider becomes irresponsive for extended
                                                period. No further action
                           'LOST' : uploaded by facility when glider is definitely lost at sea. No further action.
                                   Note however that NRT file of lost glider should ultimately be deleted by PO
                                   within a couple of week after reception of the lost status message using the
                                   'cleanup' status message
                           'cleanup-files' : uploaded by PO. Triggers deletion of relevant NRT files from S3. Used for
                                           cleaning S3 REATLIME folder from deployments that will not be processed in
                                           delayed-mode, for example: mission aborted with no valid data, or lost glider

        """
        input_file_basename = os.path.basename(self.input_file)
        if self.input_file.endswith('.txt'):
            txt = self.file_collection.filter_by_attribute_regex(
                'extension', r'\.txt')
            txt[0].check_type = PipelineFileCheckType.NO_ACTION
            txt[0].publish_type = PipelineFilePublishType.NO_ACTION
            # str.strip() removes a set of characters, not a suffix, so drop
            # the extension explicitly before extracting the status message
            message = os.path.splitext(input_file_basename)[0].split('_')[1]

            if message not in AnfogHandler.VALID_STATUS:
                raise InvalidInputFileError(
                    "Invalid status message '{m}'. "
                    "Message can be either 'delayed-mode', 'renamed', 'RECOVERED', "
                    "'POTENTIALLY-LOST', 'LOST', 'ABORTED' or 'clear-files'".format(
                        m=message))

            self.upload_destination = AnfogFileClassifier.get_destination(
                self.input_file)

            if message in ['renamed', 'clear-files']:
                self.delete_previous_version('RT', message)

            if message != 'clear-files':
                # the "clear-file"  message is not harvested as it is not relevant to deployment status
                self.set_deployment_status(self.input_file, message)

        elif (self.file_type is FileType.ZIP) or re.match(
                AnfogFileClassifier.DM_REGEX, input_file_basename):
            mode = self.get_data_mode()

            if self.file_type is FileType.ZIP and mode == 'DM':
                self.process_zip_dm()
            elif self.file_type is FileType.ZIP and mode == 'RT':
                self.process_zip_rt()
            elif re.match(AnfogFileClassifier.DM_REGEX, input_file_basename):
                # In Delayed mode, single NetCDF file upload only valid for updates
                # => Check that deployment exists on S3
                self.primary_nc = self.file_collection[0]
                self.upload_destination = AnfogFileClassifier.get_destination(
                    self.primary_nc.src_path)
                results = self.state_query.query_storage(
                    self.upload_destination)
                if results:
                    self.delete_previous_version('DM', 'update')
                else:
                    raise MissingFileError(
                        "New delayed mode deployment. NetCDF file '{file}' "
                        "should have been submitted with ancillary material".
                        format(
                            file=os.path.basename(self.primary_nc.src_path)))
        else:
            raise InvalidInputFileError(
                "Cannot process the uploaded file {name}.".format(
                    name=input_file_basename))
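To illustrate the status-file naming convention described in the docstring, a sketch of the message extraction used above (hypothetical file names):

for name in ('SL-Portland20190218_renamed.txt',
             'SL-Portland20190218_RECOVERED.txt'):
    message = os.path.splitext(name)[0].split('_')[1]
    print(message)  # -> renamed, then RECOVERED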