Example #1
    def _get_product_level(cls, input_file):
        """Determine the product level of the file, i.e. either 'non-QC' (FV00), 'burst-averaged'
        or 'gridded' (FV02 products), or empty for FV01 files.

        """
        name_field = cls._get_file_name_fields(input_file)

        if cls._get_data_category(input_file) == 'CO2':
            if 'realtime' in name_field[6]:
                return 'real-time'
            elif 'delayed' in name_field[6]:
                return 'delayed'
            else:
                raise InvalidFileNameError("Unknown CO2 file type '{input_file}'".format(input_file=input_file))

        if name_field[5] == 'FV00':
            return 'non-QC'

        if name_field[5] == 'FV02':
            if len(name_field) < 7:
                raise InvalidFileNameError(
                    "Can't determine product type from file name '{name}'".format(name=input_file)
                )
            if 'burst-averaged' in name_field[6]:
                return 'burst-averaged'
            if 'gridded' in name_field[6]:
                return 'gridded'

        return ''
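A note on the helper: _get_file_name_fields is not shown here, but the indexing above (fields 5 and 6) only makes sense if it splits the IMOS file name on underscores. A minimal sketch under that assumption, with a hypothetical file name invented purely to illustrate the indexing:

    import os

    def get_file_name_fields(input_file):
        # assumed behaviour of FileClassifier._get_file_name_fields
        return os.path.basename(input_file).split('_')

    name = 'IMOS_DWM-SOTS_W_20190101_SOFS_FV02_burst-averaged_END-20190201.nc'
    fields = get_file_name_fields(name)
    print(fields[5])  # 'FV02'
    print(fields[6])  # 'burst-averaged' -> product level 'burst-averaged'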
Example #2
    def dest_path(self, src_file):
        dir_list = []
        project = def_project(src_file)

        if project not in VALID_PROJECT:
            raise InvalidFileNameError(
                "Invalid project name '{project}'. "
                "Project should be one of IMOS, SOOP-CO2_RT or FutureReefMap.".format(
                    project=project))

        if project in ['IMOS', 'SOOP-CO2_RT']:
            fields = FileClassifier._get_file_name_fields(src_file)
            ship_code = fields[4]
            if ship_code not in self.ship_callsign_ls:
                raise InvalidFileNameError(
                    "Missing vessel callsign in file name '{name}'.".format(
                        name=src_file))

            project_base = 'IMOS'
            facility = fields[1][:4]
            sub_facility = fields[1]
            platform = "{ship_code}_{ship_name}".format(
                ship_code=ship_code,
                ship_name=self.ship_callsign_ls[ship_code])
            dir_list.extend([project_base, facility, sub_facility, platform])

        if project == 'FutureReefMap':
            fields = FileClassifier._get_file_name_fields(src_file,
                                                          min_fields=5)
            ship_code = fields[3]
            if ship_code not in self.ship_callsign_ls:
                raise InvalidFileNameError(
                    "Missing vessel callsign in file name '{name}'.".format(
                        name=src_file))

            dir_list.append('Future_Reef_MAP')
            data_type = 'underway'
            dir_list.extend([data_type, self.ship_callsign_ls[ship_code]])

        if project in ['IMOS', 'FutureReefMap']:
            att_list = FileClassifier._get_nc_att(
                src_file, ['cruise_id', 'time_coverage_start'])
            year = att_list[1][:4]
            cruise_id = att_list[0]
            dir_list.extend([year, cruise_id])

        if project == 'SOOP-CO2_RT':
            data_type = 'REALTIME'
            time_start = FileClassifier._get_nc_att(src_file,
                                                    'time_coverage_start')
            year = time_start[:4]
            month = time_start[5:7]
            month = month.lstrip('0')
            dir_list.extend([data_type, year, month])

        dir_path = FileClassifier._make_path(dir_list)
        return os.path.join(dir_path, os.path.basename(src_file))
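def_project is defined elsewhere in the pipeline; given the membership tests above, it evidently maps a source file onto one of the VALID_PROJECT labels. A rough sketch under that assumption (the first-field heuristic is a guess, not the real implementation):

    import os

    VALID_PROJECT = ['IMOS', 'SOOP-CO2_RT', 'FutureReefMap']

    def def_project(src_file):
        # assumption: the project can be read off the first underscore-separated
        # field of the file name; real-time SOOP-CO2 files use other prefixes
        first_field = os.path.basename(src_file).split('_')[0]
        if first_field in ('IMOS', 'FutureReefMap'):
            return first_field
        return 'SOOP-CO2_RT'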
Example #3
    def process(self):
        """Handle a zip file containing images and no NetCDF files. In this case we just want to publish the zip file
        itself, not the individual images. If we encounter a "mixed" zip file with images and netCDF files,
        we're just going to give up, for now.
        """
        images = PipelineFileCollection(f for f in self.file_collection
                                        if f.file_type.is_image_type)
        netcdfs = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        is_zip = self.file_type is FileType.ZIP
        have_images = len(images) > 0
        have_netcdfs = len(netcdfs) > 0
        if is_zip and have_images:
            if have_netcdfs:
                raise InvalidFileContentError(
                    "Zip file contains both images and netCDFs. Don't know what to do!"
                    " They are handled differently, so please upload only one at a time."
                )
            if not DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.match(
                    self.file_basename):
                raise InvalidFileNameError(
                    "Zip file contains images, but its name does not match pattern for images zip file "
                    "(regular expression '{p}')".format(
                        p=DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.pattern))

            self.logger.info(
                "Zip file contains images and no netCDF files. "
                "Publishing original zip file instead of its contents.")

            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)
            self.input_file_object.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(self.input_file_object)
Example #4
    def get_deployment_code(cls, src_path):
        """Depending on data mode :
           DM :  get deployment code from netcdf global attributes directly
               DSTG : no attribute deployment_code, extract deployment code from title instead
           RT :exctract deployment code from title
        """

        name = os.path.basename(src_path)
        if re.match(cls.DSTG_REGEX, name) or re.match(cls.ANFOG_RT_REGEX,
                                                      name):
            title = cls._get_nc_att(src_path, 'title')
            deployment_code = title.split()[-1]
            if deployment_code == 'mission':
                raise InvalidFileContentError(
                    "Missing deployment code in {file} ".format(file=name))

        elif re.match(cls.ANFOG_NC_REGEX, name) or re.match(
                cls.ADAPTER_REGEX, name):
            deployment_code = cls._get_nc_att(src_path, 'deployment_code')
        elif name.endswith('.txt'):
            # extract deployment code from filename like SL-Yamba20180609_completed.txt
            field = name.split('_')
            deployment_code = field[0].split('-')[1]
        else:
            raise InvalidFileNameError(
                "Invalidfile name {file} ".format(file=name))

        return deployment_code
Example #5
def dest_path_aatams_sattag_qc_ctd(filepath):

    with Dataset(filepath, mode='r') as nc_obj:
        try:
            deployment_code = nc_obj.deployment_code
        except AttributeError:
            raise InvalidFileNameError(
                'deployment_code attribute not found in NetCDF file to deduce path'
            )

    # the deployment code should match the start of the NetCDF file name
    netcdf_filename = os.path.basename(filepath)
    if deployment_code != netcdf_filename[0:len(deployment_code)]:
        raise InvalidFileNameError(
            'deployment_code attribute does not match the start of the file name')

    return os.path.join(AATAMS_MEOP_DIR, deployment_code, netcdf_filename)
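The slice comparison above is just a prefix test; str.startswith expresses the same check more directly:

    # equivalent prefix test (deployment code and file name are hypothetical)
    deployment_code = 'SL-Yamba20180609'
    netcdf_filename = 'SL-Yamba20180609_timeseries_FV01.nc'
    assert netcdf_filename[0:len(deployment_code)] == deployment_code
    assert netcdf_filename.startswith(deployment_code)  # same result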
Example #6
    def dest_path(cls, input_file):
        """
        Destination object path for a DWM file. Of the form:

          'IMOS/DWM/DA/<platform_code>/<data_category>/<product_level>'
          or
          'IMOS/DWM/SOTS/<year_of_deployment>/<product_type>'
          or
          'IMOS/DWM/SOTS/images'

        where
        <platform_code> is the value of the platform_code global attribute
        <data_category> is a broad category like 'Temperature', 'CTD_profiles', etc...
        <product_level> is
         - 'non-QC' for FV00 files
         - empty for FV01 files
        <year_of_deployment> is the year in which the deployment started
        <product_type> is
         - 'real-time';
         - empty (for delayed mode data)

        The basename of the input file is appended.

        """
        dir_list = [cls.PROJECT, cls.FACILITY]
        input_file_basename = os.path.basename(input_file)

        # deal with image zip files first, as they're simpler
        if cls.SOTS_IMAGES_ZIP_PATTERN.match(input_file_basename):
            dir_list.extend(['SOTS', 'images', input_file_basename])
            return cls._make_path(dir_list)

        fac, subfac = cls._get_facility(input_file)
        is_asfs_and_rt = subfac == 'ASFS' and cls._is_realtime(input_file)
        if subfac == 'DA':
            dir_list.append(subfac)
            dir_list.append(cls._get_nc_att(input_file, 'platform_code'))
            dir_list.append(cls._get_data_category(input_file))
            dir_list.append(cls._get_product_level(input_file))
        elif is_asfs_and_rt:  # RT files with old names, not migrated yet
            cat = cls._get_old_data_category(input_file)
            start_time = cls._get_nc_att(input_file, 'time_coverage_start', time_format=True)
            rt_folder_name = '{}_daily'.format(start_time.year)
            dir_list += ['ASFS', 'SOFS', cat, 'Real-time', rt_folder_name]
        elif subfac in ('SOTS', 'ASFS'):
            dir_list.append('SOTS')
            dir_list.append(cls._get_deployment_year(input_file))
            if cls._is_realtime(input_file):
                dir_list.append('real-time')
        else:
            raise InvalidFileNameError(
                "Unknown DWM sub-facility '{subfac}' for file '{input_file}'".format(
                    subfac=subfac, input_file=input_file)
            )

        dir_list.append(input_file_basename)

        return cls._make_path(dir_list)
Example #7
    def dest_path(filepath):
        filepath = re.sub(
            '_C-.*$', '.nc',
            filepath)  # strip creation date from filepath if exists
        netcdf_filename = os.path.basename(filepath)
        m = re.search(
            r'^IMOS_SRS-OC_F_([0-9]{8}T[0-9]{6}Z)_(.*)_FV0([0-2]{1})_DALEC_.*\.nc$',
            netcdf_filename)

        if m is None:
            raise InvalidFileNameError(
                "file name not matching regex to deduce dest_path")

        platform_code = m.group(2)
        file_version_code = 'FV0%s' % m.group(3)

        ships_dic = ship_callsign_list()

        if platform_code in ships_dic:
            vessel_name = ships_dic[platform_code]
        else:
            raise InvalidFileNameError(
                "Vessel name not known '{name}'".format(name=platform_code))

        if file_version_code not in ("FV00", "FV01", "FV02"):
            raise InvalidFileNameError(
                "File_version code is unknown for '{name}'".format(
                    name=filepath))

        year = datetime.strptime(m.group(1), '%Y%m%dT%H%M%SZ').strftime("%Y")
        relative_netcdf_path = os.path.join(
            'IMOS', 'SRS', 'OC', 'radiometer',
            '%s_%s' % (platform_code, vessel_name), year)

        if file_version_code == "FV02":
            relative_netcdf_path = os.path.join(relative_netcdf_path,
                                                'fv02-products',
                                                netcdf_filename)
        else:
            relative_netcdf_path = os.path.join(relative_netcdf_path,
                                                netcdf_filename)

        return relative_netcdf_path
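The DALEC regex is easiest to read against a concrete name. The file name below is hypothetical but conforms to the pattern:

    import re

    name = 'IMOS_SRS-OC_F_20180101T000000Z_VLMJ_FV01_DALEC_END-20180102T000000Z.nc'
    m = re.search(
        r'^IMOS_SRS-OC_F_([0-9]{8}T[0-9]{6}Z)_(.*)_FV0([0-2]{1})_DALEC_.*\.nc$',
        name)
    print(m.group(1))  # '20180101T000000Z' -> year 2018
    print(m.group(2))  # 'VLMJ' -> platform_code
    print(m.group(3))  # '1' -> FV01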
Example #8
def dest_path_srs_oc_ljco_aeronet(filepath):
    file_basename = os.path.basename(filepath)

    if file_basename == VALID_FILENAME:
        return os.path.join(PREFIX_PATH, file_basename)

    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not equal to {valid}, cannot deduce dest_path"
            .format(filename=file_basename, valid=VALID_FILENAME))
Example #9
    def _get_data_category(cls, input_file):
        if 'aggregated-timeseries' in input_file:
            return 'aggregated_timeseries'
        elif 'hourly-timeseries' in input_file:
            return 'hourly_timeseries'
        elif 'gridded-timeseries' in input_file:
            return 'gridded_timeseries'
        else:
            raise InvalidFileNameError(
                "Could not determine data category from {name}".format(
                    name=input_file))
Example #10
def dest_path_aodn_wave_dm(filepath):
    file_basename = os.path.basename(filepath)
    with Dataset(filepath, mode='r') as nc_obj:
        site_name = nc_obj.site_name

    if BOM_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(BOM_DIR, WAVERIDER_DIR, DELAYED_DIR)
        product_dir = site_name.replace(' ', '_')

    elif DES_QLD_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DES_QLD_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   DES_QLD_WAVERIDER)
        product_dir = fields['site_code']

    elif DOT_WA_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   DOT_WA_WAVERIDER)
        product_dir = os.path.join(site_name.replace(' ', '_'),
                                   fields['site_code'])

    elif MHL_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(MHL_DIR_BASE, MHL_DIR, MHL_WAVERIDER_DIR)
        product_dir = site_name.replace(' ', '_')

    elif DOT_WA_AWAC.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, AWAC_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename, DOT_WA_AWAC)
        product_dir = fields['site_code']

    elif DTA_NZ_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DTA_NZ_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if 'Wave Rider Buoy' not in site_name:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name does not contain "
                "'Wave Rider Buoy' string to deduce path".format(filename=file_basename))
        product_dir = site_name.replace('Wave Rider Buoy',
                                        '').strip().replace(' ', '_')

    elif NTP_WAVE.match(file_basename):
        data_base_dir = os.path.join(NTP_WAVE_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if len(site_name) == 0:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name is empty"
                .format(filename=file_basename))
        product_dir = site_name

    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce path".
            format(filename=file_basename))

    return os.path.join(data_base_dir, product_dir, os.path.basename(filepath))
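get_pattern_subgroups_from_string appears throughout these examples; since its result is always indexed by group name, it evidently returns the named subgroups of a regex match. A minimal sketch under that assumption (the pattern and file name below are invented for illustration):

    import re

    def get_pattern_subgroups_from_string(string, pattern):
        # assumed behaviour; the real helper may differ in error handling
        return pattern.match(string).groupdict()

    DES_QLD_WAVERIDER = re.compile(r'DES-QLD_W_(?P<site_code>[A-Za-z0-9-]+)_.*\.nc')
    fields = get_pattern_subgroups_from_string('DES-QLD_W_BRISBANE_dm.nc',
                                               DES_QLD_WAVERIDER)
    print(fields['site_code'])  # 'BRISBANE'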
Example #11
def dest_path_cars(filepath):
    pattern = r'CARS(\d+)_.*\.nc'
    try:
        year = re.search(pattern, filepath).group(1)
    except AttributeError:
        raise InvalidFileNameError(
            "invalid file name {filepath}. Not matching '{pattern}'".format(
                filepath=filepath, pattern=pattern))
    return os.path.join(
        'CSIRO', 'Climatology', 'CARS', year, 'AODN-product',
        os.path.basename(filepath))
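Worked through on a hypothetical file name that matches the CARS pattern:

    import os
    import re

    filepath = '/tmp/CARS2009_world_monthly.nc'
    year = re.search(r'CARS(\d+)_.*\.nc', filepath).group(1)   # '2009'
    print(os.path.join('CSIRO', 'Climatology', 'CARS', year, 'AODN-product',
                       os.path.basename(filepath)))
    # CSIRO/Climatology/CARS/2009/AODN-product/CARS2009_world_monthly.nc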
Example #12
    def archive_path(self, src_file):
        """
        Generate archive path for RT file based on vessel_code
            eg:IN_2017-165-0000dat.txt
              <Vessel_code>_yyyy-ddd-hhmmdat.txt
        :return: relative archive path- full path, including file name
        eg: 'IMOS/SOOP/SOOP-CO2/VLMJ_Investigator/REALTIME/2018/1/IN_2018-022-0000dat.txt'
        """
        dir_list = []
        project = 'IMOS'
        facility = 'SOOP'
        sub_facility = 'SOOP-CO2'
        data_type = 'REALTIME'
        dir_list.extend([project, facility, sub_facility])
        fields = FileClassifier._get_file_name_fields(
            os.path.basename(src_file), min_fields=2)
        if fields[0] in VESSEL_CODE:
            ship_code = VESSEL_CODE[fields[0]]
        else:
            raise InvalidFileNameError(
                "File {file} has an invalid vessel code or is not a valid SOOP-CO2 realtime file"
                .format(file=os.path.basename(src_file)))
        platform = "{ship_code}_{ship_name}".format(
            ship_code=ship_code, ship_name=self.ship_callsign_ls[ship_code])
        dir_list.extend([platform, data_type])
        year = int(fields[1][:4])
        jday = int(fields[1][5:8])
        if jday not in range(1, 367) or year < 2017:
            raise InvalidFileNameError(
                "Failed extracting valid [year, day] from file {file}".format(
                    file=os.path.basename(src_file)))
        dir_list.append(str(year))

        # Determine month from julian day (1-366); leap years are taken into account
        year_to_ordinal = datetime.date(year, 1, 1).toordinal() + jday - 1
        month = datetime.date.fromordinal(year_to_ordinal).month
        dir_list.append(str(month))
        dir_list.append(os.path.basename(src_file))
        archive_file_path = FileClassifier._make_path(dir_list)

        return archive_file_path
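The ordinal arithmetic is what turns a (year, julian day) pair into a calendar month, for example:

    import datetime

    year, jday = 2018, 60  # day 60 of a non-leap year
    ordinal = datetime.date(year, 1, 1).toordinal() + jday - 1
    print(datetime.date.fromordinal(ordinal))        # 2018-03-01
    print(datetime.date.fromordinal(ordinal).month)  # 3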
Example #13
def get_type(filepath):
    """return acorn_file_type, the file type of an ACORN file based on its filename"""
    file_basename = os.path.basename(filepath)
    unknown_product = False
    if ACORN_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   ACORN_FILE_PATTERN)
        product_type = fields['product_type']
        file_version = fields['file_version']
        platform_code = fields['platform_code']

        if product_type == 'radial' and file_version == 'FV00':
            acorn_file_type = "radial"

        elif product_type == 'radial' and file_version == 'FV01':
            acorn_file_type = "radial_quality_controlled"

        elif product_type == 'sea-state' and file_version == 'FV00':
            acorn_file_type = "vector"

        elif product_type == 'wavespec' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-wave-spectra_QC"

        elif product_type == 'windp' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-wind-map_QC"

        elif product_type == 'wavep' and file_version == 'FV01':
            site_map_station = ['CBG', 'SAG', 'ROT', 'COF']

            if platform_code in site_map_station:
                acorn_file_type = "gridded_1h-avg-wave-site-map_QC"
            else:
                acorn_file_type = "gridded_1h-avg-wave-station-map_QC"

        elif product_type == '1-hour-avg' and file_version == 'FV00':
            acorn_file_type = "gridded_1h-avg-current-map_non-QC"

        elif product_type == '1-hour-avg' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-current-map_QC"

        else:
            unknown_product = True
    else:
        unknown_product = True

    if unknown_product:
        raise InvalidFileNameError(
            "file name: \"{filename}\" Unknown product type from filename".
            format(filename=file_basename))

    return acorn_file_type
Example #14
    def preprocess(self):
        """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching
        file will be left with publish_type UNSET after the _resolve step.

        :return: None
        """
        self.logger.info("Checking for invalid files.")

        invalid_files = self.file_collection.filter_by_attribute_id(
            'publish_type', PipelineFilePublishType.UNSET)
        if invalid_files:
            raise InvalidFileNameError(
                "File name(s) don't match the pattern expected for this upload location: {names}"
                .format(names=invalid_files.get_attribute_list('name')))
Example #15
def get_gsla_type(filepath):
    """ :return:  gsla file type """
    file_basename = os.path.basename(filepath)
    if GSLA_REGEX.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, GSLA_REGEX)
        return fields['product_type']

    elif GSLA_REGEX_YEARLY.match(file_basename):
        return os.path.join(get_product_type(filepath), 'yearfiles')

    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce dest_path".
            format(filename=file_basename))
Example #16
def get_product_type(file_path):
    """Return a product type label for the given file (extracted from the file name).
    For example "PSAL-aggregated-timeseries", or "hourly-timeseries".

    :param file_path: str path or name of file
    :returns: str product type label
    """
    file_name = os.path.basename(file_path)
    name_match = PRODUCT_TYPE_PATTERN.search(file_name)
    if not name_match:
        raise InvalidFileNameError(
            "Could not extract produt type from '{file_name}'".format(
                file_name=file_name))
    return name_match.group(1)
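PRODUCT_TYPE_PATTERN is defined elsewhere; judging by the labels quoted in the docstring, a pattern along these lines would behave as described (an assumption, not the real definition):

    import re

    # hypothetical stand-in for the real PRODUCT_TYPE_PATTERN
    PRODUCT_TYPE_PATTERN = re.compile(r'_([A-Za-z0-9-]+-timeseries)_')

    m = PRODUCT_TYPE_PATTERN.search('IMOS_ANMN-NRS_PSAL-aggregated-timeseries_FV01.nc')
    print(m.group(1))  # 'PSAL-aggregated-timeseries'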
Example #17
def get_creation_date(filepath):
    """ :return: creation date    """
    file_basename = os.path.basename(filepath)
    if GSLA_REGEX.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, GSLA_REGEX)

    elif GSLA_REGEX_YEARLY.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   GSLA_REGEX_YEARLY)

    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce creation_date"
            .format(filename=file_basename))

    return datetime.strptime(fields['creation_date'], '%Y%m%dT%H%M%SZ')
Example #18
    def dest_path(filepath):
        sstaars_alt_dir = os.path.join('CSIRO', 'Climatology', 'SSTAARS',
                                       '2017')
        sstaars_aodn_dir = os.path.join(sstaars_alt_dir, 'AODN-product')
        netcdf_file_name = os.path.basename(filepath)

        regex_daily_files = re.compile(r'SSTAARS_daily_fit_[0-9]{3}\.nc')

        if netcdf_file_name == 'SSTAARS.nc':
            return os.path.join(sstaars_alt_dir, netcdf_file_name)
        elif (netcdf_file_name == 'SSTAARS_daily_fit.nc') or re.match(
                regex_daily_files, netcdf_file_name):
            return os.path.join(sstaars_aodn_dir, netcdf_file_name)
        else:
            raise InvalidFileNameError(
                r"invalid file name {filepath}. Not matching 'SSTAARS.*\.nc'".
                format(filepath=filepath))
Example #19
    def dest_path(filepath):
        file_basename = os.path.basename(filepath)

        # NON CONTRIBUTED DATA SET
        if IMOS_OC_FILE_PATTERN.match(file_basename):
            fields = get_pattern_subgroups_from_string(file_basename, IMOS_OC_FILE_PATTERN)
            nc_time_cov_start = datetime.strptime(fields['nc_time_cov_start'], '%Y%m%dT%H%M%SZ')
            data_parameter_code = fields['data_parameter_code']

            if data_parameter_code == 'A':
                product_name = 'aqua'
            elif data_parameter_code == 'S':
                product_name = 'seawifs'
            elif data_parameter_code == 'V':
                product_name = 'viirs'
            else:
                raise InvalidFileNameError(
                    "unknown data_parameter_code '{code}' in file name '{filename}'".format(
                        code=data_parameter_code, filename=file_basename))

            path = os.path.join(OC_GRIDDED_PREFIX_PATH, product_name, fields['time_coverage_resolution'],
                                '%d' % nc_time_cov_start.year, '%02d' % nc_time_cov_start.month,
                                file_basename)
            return path

        # CONTRIBUTED DATA SET
        elif RJOHNSON_FILE_PATTERN.match(file_basename):
            fields = get_pattern_subgroups_from_string(file_basename, RJOHNSON_FILE_PATTERN)
            data_parameter_code = fields['data_parameter_code']
            time_coverage_resolution = fields['time_coverage_resolution']

            if data_parameter_code == 'A':
                product_name = 'aqua'
            elif data_parameter_code == 'S':
                product_name = 'seawifs'
            else:
                raise InvalidFileNameError(
                    "unknown data_parameter_code '{code}' in file name '{filename}'".format(
                        code=data_parameter_code, filename=file_basename))

            if time_coverage_resolution == '8D':
                time_cov = '8d'
            elif time_coverage_resolution == 'MO':
                time_cov = '1m'
            else:
                raise InvalidFileNameError(
                    "unknown time_coverage_resolution '{res}' in file name '{filename}'".format(
                        res=time_coverage_resolution, filename=file_basename))

            return os.path.join(OC_GRIDDED_PREFIX_PATH, 'contributed', 'SO-Johnson',
                                'chl', time_cov, product_name, file_basename)

        else:
            raise InvalidFileNameError("file name: \"{filename}\" not matching regex to deduce dest_path".
                                       format(filename=file_basename))
Example #20
    def preprocess(self):
        """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching
        file will be left with publish_type UNSET after the _resolve step.

        If there are any netCDF files from burst-sampling instruments in the collection, create the burst-averaged
        version of each and add them to the collection.

        :return: None
        """
        self.logger.info(
            "Checking for invalid files and adjusting check/publish properties."
        )

        invalid_files = self.file_collection.filter_by_attribute_id(
            'publish_type', PipelineFilePublishType.UNSET)
        if invalid_files:
            raise InvalidFileNameError(
                "File name(s) don't match the pattern expected for this upload location: {names}"
                .format(names=invalid_files.get_attribute_list('name')))

        # Burst-processing for FV01 files with burst-sampling global attributes
        burst_files = (self.file_collection.filter_by_attribute_id(
            'file_type',
            FileType.NETCDF).filter_by_attribute_regex('name', r'.*_FV01_'))
        for f in burst_files:
            with Dataset(f.src_path, mode='r') as D:
                has_interval = hasattr(D, 'instrument_burst_interval')
                has_duration = hasattr(D, 'instrument_burst_duration')
                is_adcp = ('DIST_ALONG_BEAMS' in D.dimensions
                           or 'HEIGHT_ABOVE_SENSOR' in D.dimensions)
            if not (has_interval and has_duration) or is_adcp:
                continue

            self.logger.info("Burst-processing {f.name}".format(f=f))
            product_path = create_burst_average_netcdf(f.src_path,
                                                       self.products_dir)
            product_file = PipelineFile(
                product_path, file_update_callback=self._file_update_callback)
            product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(product_file)
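The burst test above reads two global attributes and the dimension names straight from the netCDF header. A self-contained illustration of the same check with netCDF4 (file name and values are made up):

    from netCDF4 import Dataset

    with Dataset('burst_demo.nc', mode='w') as ds:
        ds.instrument_burst_interval = 900.0   # hypothetical values
        ds.instrument_burst_duration = 120.0
        ds.createDimension('TIME', 10)

    with Dataset('burst_demo.nc', mode='r') as ds:
        has_interval = hasattr(ds, 'instrument_burst_interval')
        has_duration = hasattr(ds, 'instrument_burst_duration')
        is_adcp = ('DIST_ALONG_BEAMS' in ds.dimensions
                   or 'HEIGHT_ABOVE_SENSOR' in ds.dimensions)
    print(has_interval and has_duration and not is_adcp)  # True -> burst-process it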
Example #21
def dest_path_soop_ba(src_file):
    dir_list = []
    fields = FileClassifier._get_file_name_fields(src_file.name)
    ship_code = fields[4]
    ship_callsign_ls = ship_callsign_list()

    if ship_code not in ship_callsign_ls:
        raise InvalidFileNameError(
            "Missing vessel callsign in file name '{name}'.".format(
                name=src_file.name))

    project = fields[0]
    facility = fields[1][:4]
    sub_facility = fields[1]
    platform = "{ship_code}_{ship_name}".format(
        ship_code=ship_code, ship_name=ship_callsign_ls[ship_code])
    dir_list.extend([project, facility, sub_facility, platform])

    deployment_id = get_deployment_id(src_file, ship_code)

    dir_list.append(deployment_id)
    return FileClassifier._make_path(dir_list)
Example #22
def archive_path_soop_ba(src_file):
    """Define the archive path based on info from NetCDF"""
    dir_list = []
    fields = FileClassifier._get_file_name_fields(src_file.name)
    ship_code = fields[4]
    ship_callsign_ls = ship_callsign_list()

    if ship_code not in ship_callsign_ls:
        raise InvalidFileNameError(
            "Missing vessel callsign in file name '{name}'.".format(
                name=src_file.name))

    project = fields[0]
    facility = fields[1][:4]
    sub_facility = fields[1]
    raw_folder = 'raw'
    platform = "{ship_code}_{ship_name}".format(
        ship_code=ship_code, ship_name=ship_callsign_ls[ship_code])
    dir_list.extend([project, facility, sub_facility, raw_folder, platform])

    deployment_id = get_deployment_id(src_file, ship_code)
    dir_list.append(deployment_id)
    return FileClassifier._make_path(dir_list)
Example #23
def get_info_nc(filepath):
    file_basename = os.path.basename(filepath)

    if L3S_L3C_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   L3S_L3C_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
    elif L3U_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   L3U_FILE_PATTERN)
        day_time = None
        temporal_extent = None
    elif L3S_MULTISENSOR_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(
            file_basename, L3S_MULTISENSOR_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
        fields['product_type'] = '%sM' % fields['product_type']
    elif L3U_VIIRS_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   L3U_VIIRS_FILE_PATTERN)
        day_time = ''
        temporal_extent = None
        fields['sat_value'] = 'snpp'
    elif L3C_VIIRS_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   L3C_VIIRS_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
        fields['sat_value'] = 'snpp'
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce dest_path".
            format(filename=file_basename))

    prod_lev = fields['product_type']

    if day_time == 'night':
        day_time = 'ngt'

    date_nc = datetime.strptime(fields['nc_time_cov_start'], '%Y%m%d%H%M%S')

    sat_value = fields.get('sat_value', '')
    if sat_value.isdigit():
        sat_value = 'n%s' % sat_value

    if prod_lev != 'L3U':
        product_path = '%s-%s' % (prod_lev, temporal_extent)
    else:
        product_path = prod_lev

    if 'Southern' in filepath:
        if '-' in product_path:
            product_path = '%sS' % product_path
        else:
            product_path = '%s-%s' % (product_path, 'S')

    file_info = {
        'prod_level': prod_lev,
        'temporal_extent': temporal_extent,
        'day_time': day_time,
        'date_data': date_nc,
        'sat_value': sat_value,
        'product_path': product_path
    }

    return file_info
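The product_path assembly is easiest to follow on concrete values (inputs hypothetical):

    prod_lev, temporal_extent = 'L3S', '1d'
    product_path = '%s-%s' % (prod_lev, temporal_extent)  # 'L3S-1d'

    filepath = '/incoming/Southern/some_sst_file.nc'
    if 'Southern' in filepath:
        if '-' in product_path:
            product_path = '%sS' % product_path           # 'L3S-1dS'
        else:
            product_path = '%s-%s' % (product_path, 'S')  # L3U case: 'L3U-S'
    print(product_path)  # 'L3S-1dS'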
Example #24
    def dest_path(filepath):
        ljco_s3_base_dir = os.path.join('IMOS', 'SRS', 'OC', 'LJCO')

        netcdf_filename = os.path.basename(filepath)
        netcdf_filename = re.sub(r'_C-.*\.nc$', '.nc',
                                 netcdf_filename)  # remove creation date

        # looking for product_name
        m = re.search(
            r'^IMOS_SRS-OC-LJCO_.*_([0-9]{8}T[0-9]{6}Z)_(SRC|LJCO)_FV0([0-2]{1}).*\.nc$',
            netcdf_filename)

        if m is None:
            raise InvalidFileNameError(
                "file name not matching regex to deduce dest_path")

        # list of allowed products keywords
        products_type_ls = [
            'ACS', 'EcoTriplet', 'BB9', 'HyperOCR', 'WQM', 'DALEC'
        ]
        products_type = re.compile('|'.join(products_type_ls))
        nc_product_type = products_type.findall(netcdf_filename)

        # list of allowed time coverage keywords
        products_time_cov_ls = ['hourly', 'daily', 'monthly']
        products_time_cov = re.compile('|'.join(products_time_cov_ls))
        nc_product_time_cov = products_time_cov.findall(netcdf_filename)

        # netcdf qc value
        nc_product_qc = 'FV0%s' % m.group(3)

        nc_time_cov_start = datetime.strptime(m.group(1), '%Y%m%dT%H%M%SZ')
        nc_year = nc_time_cov_start.year
        nc_month = nc_time_cov_start.month
        nc_day = nc_time_cov_start.day

        if not nc_product_type:
            raise InvalidFileNameError(
                "can not find matching product type from allowed list: {product_type_ls}"
                .format(product_type_ls=products_type_ls))

        if nc_product_type[0] == 'DALEC':
            product_dir = nc_product_type[0]
        else:
            # products other than DALEC need to have product type AND time coverage info
            if len(nc_product_time_cov) == 0:
                raise InvalidFileNameError(
                    "can not find matching time coverage from allowed list: {products_time_cov_ls}"
                    .format(products_time_cov_ls=products_time_cov_ls))
            else:
                product_dir = '%s-%s' % (nc_product_type[0],
                                         nc_product_time_cov[0])

        nc_common_dir_structure_prefix = os.path.join(ljco_s3_base_dir,
                                                      product_dir,
                                                      '%d' % nc_year)

        # DALEC doesn't have nc_product_time_cov keywords, so we run this section first
        if nc_product_type[0] == 'DALEC':
            if nc_product_qc == 'FV02':
                return os.path.join(nc_common_dir_structure_prefix,
                                    '%02d' % nc_month, 'fv02-products',
                                    netcdf_filename)
            else:
                return os.path.join(nc_common_dir_structure_prefix,
                                    '%02d' % nc_month, netcdf_filename)

        if nc_product_time_cov[0] == 'hourly':
            return os.path.join(nc_common_dir_structure_prefix,
                                '%02d' % nc_month, '%02d' % nc_day,
                                netcdf_filename)

        if nc_product_time_cov[0] == 'daily':
            return os.path.join(nc_common_dir_structure_prefix,
                                netcdf_filename)

        # NB: 'monthly' is in the allowed time coverage list but has no branch here,
        # so monthly files currently fall through and the function returns None
Example #25
def netcdf_writer(log_path, output_dir, ship_name, meta_path=None):
    if meta_path is not None:
        with open(meta_path, 'r') as f:
            meta_data = json.loads('\n'.join([
                row for row in f.readlines() if len(row.split('#')) == 1
            ]))  # drop commented lines before parsing the JSON
            for ii in range(len(meta_data['calibration'])):
                if meta_data['calibration'][ii]['item'] == 'EFLO':
                    calibration_flo_a0 = float(
                        meta_data['calibration'][ii]['a0'])
                    calibration_flo_a1 = float(
                        meta_data['calibration'][ii]['a1'])
                if meta_data['calibration'][ii]['item'] == 'ESAL':
                    calibration_sal_a0 = float(
                        meta_data['calibration'][ii]['a0'])
                    calibration_sal_a1 = float(
                        meta_data['calibration'][ii]['a1'])
                if meta_data['calibration'][ii]['item'] == 'ETMP':
                    calibration_tmp_a0 = float(
                        meta_data['calibration'][ii]['a0'])
                    calibration_tmp_a1 = float(
                        meta_data['calibration'][ii]['a1'])
                if meta_data['calibration'][ii]['item'] == 'ETURB':
                    calibration_turb_a0 = float(
                        meta_data['calibration'][ii]['a0'])
                    calibration_turb_a1 = float(
                        meta_data['calibration'][ii]['a1'])

    df = parse_log_file(log_path)
    df = transform_count_to_real_val(df)
    log_filename = os.path.basename(log_path)

    fields = get_pattern_subgroups_from_string(log_filename,
                                               SOOP_NRT_LOG_PATTERN)
    product_code = fields['product_code']

    if product_code in ['D2M', 'M2D', 'S2M', 'M2S']:
        product_type = "transect"
        feature_type = "trajectory"
        template = DatasetTemplate.from_json(NC_JSON_TEMPLATE_TRAJECTORY)
    elif product_code in ['DEV', 'MEL', 'SYD']:
        product_type = "mooring"
        feature_type = "timeSeries"
        template = DatasetTemplate.from_json(NC_JSON_TEMPLATE_MOORING)
    else:
        raise InvalidFileNameError(
            "SOOP NRT input logfile has incorrect product_code '{product_code}'. Not belonging to any of "
            "('D2M', 'M2D', 'S2M', 'M2S','DEV', 'MEL', 'SYD').".format(
                product_code=product_code))

    template.global_attributes.update({'product_type': product_type})

    time_val_dateobj = date2num(df.index.to_pydatetime(),
                                template.variables['TIME']['units'],
                                template.variables['TIME']['calendar'])

    # replace all NaNs in the dataframe with the template's LATITUDE _FillValue
    df.replace(np.nan,
               template.variables['LATITUDE']['_FillValue'],
               inplace=True)

    template.variables['TIME']['_data'] = time_val_dateobj
    template.variables['LATITUDE']['_data'] = df.LATITUDE.values
    template.variables['LONGITUDE']['_data'] = df.LONGITUDE.values

    template.variables['TEMP']['_data'] = df.TEMP.values
    template.variables['PSAL']['_data'] = df.PSAL.values
    template.variables['TURB']['_data'] = df.TURB.values
    template.variables['CPHL']['_data'] = df.CPHL.values

    calibration_comment = 'Value=a0 + a1 x raw_value'
    if 'calibration_tmp_a0' in locals() and 'calibration_tmp_a1' in locals():
        template.variables['TEMP']['a0'] = calibration_tmp_a0
        template.variables['TEMP']['a1'] = calibration_tmp_a1
        template.variables['TEMP']['calibration_comment'] = calibration_comment

    if 'calibration_sal_a0' in locals() and 'calibration_sal_a1' in locals():
        template.variables['PSAL']['a0'] = calibration_sal_a0
        template.variables['PSAL']['a1'] = calibration_sal_a1
        template.variables['PSAL']['calibration_comment'] = calibration_comment

    if 'calibration_turb_a0' in locals() and 'calibration_turb_a1' in locals():
        template.variables['TURB']['a0'] = calibration_turb_a0
        template.variables['TURB']['a1'] = calibration_turb_a1
        template.variables['TURB']['calibration_comment'] = calibration_comment

    if 'calibration_flo_a0' in locals() and 'calibration_flo_a1' in locals():
        template.variables['CPHL']['a0'] = calibration_flo_a0
        template.variables['CPHL']['a1'] = calibration_flo_a1
        template.variables['CPHL']['calibration_comment'] = calibration_comment

    measurement_frequency = get_measurement_frequency(df)
    if measurement_frequency == 1:
        measurement_frequency_str = '1sec'
    elif measurement_frequency == 10:
        measurement_frequency_str = '10secs'
    else:
        raise ValueError(
            "unexpected measurement frequency {freq}".format(
                freq=measurement_frequency))

    template.global_attributes.update({
        'time_coverage_start':
        df.index.strftime('%Y-%m-%dT%H:%M:%SZ')[0],
        'time_coverage_end':
        df.index.strftime('%Y-%m-%dT%H:%M:%SZ')[-1],
        'featureType':
        feature_type,
        'date_created':
        datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'platform_code':
        SHIP_CODE,
        'vessel_name':
        ship_name,
        'geospatial_lat_min':
        df.LATITUDE.dropna().min(),
        'geospatial_lat_max':
        df.LATITUDE.dropna().max(),
        'geospatial_lon_min':
        df.LONGITUDE.dropna().min(),
        'geospatial_lon_max':
        df.LONGITUDE.dropna().max(),
        'measurement_frequency':
        measurement_frequency_str,
        'history':
        "File created {date_created}".format(
            date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
    })

    if measurement_frequency == 1:
        template.variables['CPHL'].update({
            'calibration_blank':
            CHLU_PARAMS['blank'],
            'calibration_scale':
            CHLU_PARAMS['scale']
        })

        template.variables['TURB'].update({
            'calibration_blank':
            TURB_PARAMS['blank'],
            'calibration_scale':
            TURB_PARAMS['scale']
        })

    nc_filename = 'IMOS_SOOP-TMV_TSUB_{time_start}_{vessel_code}_FV0{product_number}_{product_type}-{product_code}_END-{time_end}.nc'.format(
        time_start=df.index.strftime('%Y%m%dT%H%M%SZ')[0],
        time_end=df.index.strftime('%Y%m%dT%H%M%SZ')[-1],
        vessel_code=SHIP_CODE,
        product_number=0,
        product_type=product_type,
        product_code=product_code)

    netcdf_path = os.path.join(output_dir, nc_filename)
    template.to_netcdf(netcdf_path)
    return netcdf_path
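The four near-identical calibration branches could also be collected into a dict keyed by item code, which avoids the locals() checks later on. A sketch of that alternative, assuming the same JSON layout:

    # meta_data mirrors the JSON layout assumed by netcdf_writer above
    meta_data = {'calibration': [{'item': 'ETMP', 'a0': '0.0', 'a1': '0.001'},
                                 {'item': 'ESAL', 'a0': '0.0', 'a1': '0.01'}]}
    ITEM_TO_VAR = {'EFLO': 'CPHL', 'ESAL': 'PSAL', 'ETMP': 'TEMP', 'ETURB': 'TURB'}

    calibrations = {ITEM_TO_VAR[c['item']]: (float(c['a0']), float(c['a1']))
                    for c in meta_data['calibration'] if c['item'] in ITEM_TO_VAR}
    print(calibrations)  # {'TEMP': (0.0, 0.001), 'PSAL': (0.0, 0.01)}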
Example #26
    def preprocess(self):
        if self.custom_params is not None and self.custom_params.get(
                'ship_callsign_ls'):
            self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
        else:
            self.ship_callsign_ls = ship_callsign_list()

        if SHIP_CODE not in self.ship_callsign_ls:
            raise RuntimeError(
                "Missing vessel callsign {callsign} from vocabulary.".format(
                    callsign=SHIP_CODE))

        self.soop_tmv_dir = os.path.join(
            'IMOS', 'SOOP', 'SOOP-TMV', '{ship_code}_{ship_name}'.format(
                ship_code=SHIP_CODE,
                ship_name=self.ship_callsign_ls[SHIP_CODE]), 'realtime')

        txt_files = self.file_collection.filter_by_attribute_value(
            'extension', '.txt')
        log_files = self.file_collection.filter_by_attribute_value(
            'extension', '.log')
        nc_files = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        """
        * 10secs zip files (*.log + *.txt [calibration]) -> *.zip is pushed to ARCHIVE_DIR
                                                            (netcdf still needs to be generated to deduce path).
                                                            *.log, *.txt and *.nc NOT added to the collection
        * 1sec zip files (*.log only) -> *.log & *.nc pushed to S3. *.zip not added to the collection
        """

        if len(nc_files):
            # case where we re-push an existing NetCDF file
            f_nc = nc_files[0]
            f_nc.publish_type = PipelineFilePublishType.HARVEST_UPLOAD

        elif len(log_files):
            f_log = log_files[0]
            log_filename = os.path.basename(f_log.src_path)

            if SOOP_NRT_LOG_PATTERN.match(log_filename) is None:
                raise InvalidFileNameError(
                    "SOOP TMV NRT input logfile has incorrect naming '{name}'."
                    .format(name=log_filename))

            # case to create NetCDF file from log file
            f_txt = None
            if len(txt_files):
                f_txt = txt_files[0]
                netcdf_filepath = netcdf_writer(
                    f_log.src_path,
                    self.temp_dir,
                    self.ship_callsign_ls[SHIP_CODE],
                    meta_path=f_txt.src_path)
            else:
                netcdf_filepath = netcdf_writer(
                    f_log.src_path, self.temp_dir,
                    self.ship_callsign_ls[SHIP_CODE])

            # the path of logs and zips has to be deduced within the pre-process, as a
            # NetCDF has to be created first to get the correct info
            with Dataset(netcdf_filepath) as nc_open:
                measurement_frequency = nc_open.measurement_frequency
                product_type = nc_open.product_type
                year = datetime.strptime(nc_open.time_coverage_start,
                                         '%Y-%m-%dT%H:%M:%SZ').strftime("%Y")

            pre_path = os.path.join(self.soop_tmv_dir, product_type,
                                    measurement_frequency, year)

            if measurement_frequency == "1sec":
                f_log.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                f_log.dest_path = os.path.join(pre_path, 'logs', f_log.name)
                nc_file = PipelineFile(netcdf_filepath)
                nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
                self.file_collection.add(nc_file)

            elif measurement_frequency == "10secs":
                if self.input_file.endswith('zip'):
                    self.input_file_object.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                    self.input_file_object.archive_path = os.path.join(
                        pre_path, 'logs', self.input_file_object.name)
                    self.file_collection.add(self.input_file_object)
                    f_log.publish_type = PipelineFilePublishType.NO_ACTION
                    if f_txt:
                        f_txt.publish_type = PipelineFilePublishType.NO_ACTION
                else:
                    # case when a 10secs log file (and not a zip) is pushed to incoming
                    f_log.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                    f_log.archive_path = os.path.join(pre_path, 'logs',
                                                      f_log.name)
Example #27
    def preprocess(self):

        # if the input file is a NetCDF, create a .nc.gz and harvest-upload it.
        # historically, files were always sent as *.nc.gz, but as of April 2021 files
        # might be pushed as *.nc. To be consistent, we transform the .nc into a .nc.gz
        if self.file_type is FileType.NETCDF:
            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)

            gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
            with open(self.input_file,
                      'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
                gz_out.writelines(f_in)

            # publish
            self.add_to_collection(
                gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

        if self.file_type is FileType.GZIP:
            # add nc_gz file to collection (not by default)
            self.file_collection.add(self.input_file_object)
            netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.GZIP)
            netcdf_file_gz = netcdf_file_gz_collection[0]
            netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

            # some GSLA files are gzipped, so gunzip them before checking them
            # if uploaded file is GZIP check that GZIP contains a NetCDF
            netcdf_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(netcdf_collection) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in GZIP archive '{gzip}'".
                    format(gzip=os.path.basename(self.input_file)))

        netcdf_file_gz = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        netcdf_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        # setting the path of the gz file with the gunzipped file
        netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)
        # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
        netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

        # we don't know the product type (DM00 or DM01) of the file already
        # on s3 in order to deduce its path. We need to get the product
        # type from the file in incoming
        result_previous_version_creation_date = self.get_previous_version_creation_date(
            netcdf_file.src_path)
        """ default values
        by default we push to the storage the file landed in the pipeline (ie *.nc.gz) """
        push_new_file = True
        remove_previous_version = False

        # compare creation dates with file already on storage
        if result_previous_version_creation_date:
            new_file_creation_date = get_creation_date(netcdf_file.name)
            if result_previous_version_creation_date > new_file_creation_date:
                push_new_file = False
            elif result_previous_version_creation_date == new_file_creation_date:
                push_new_file = True
            else:
                remove_previous_version = True
                previous_file_path = self.get_previous_version_object(
                    netcdf_file.src_path)

        if push_new_file:
            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                # yearly file should never be harvested
                netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
        else:
            raise InvalidFileNameError(
                "file name: \"{filename}\"  creation date is older than file already on "
                "storage".format(filename=netcdf_file_gz.name))

        # deletion of the previous file
        if remove_previous_version:
            previous_file_name = os.path.basename(previous_file_path)
            file_to_delete = PipelineFile(
                previous_file_name,
                is_deletion=True,
                dest_path=previous_file_path,
                file_update_callback=self._file_update_callback)

            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

            self.file_collection.add(file_to_delete)