def check_parameters(dataf, vessel_code, input_param, src_file):
    """
    Checks parameters list contains all required parameter(vessel specific)
    Cast selected parameter data to correct type
    Checks that Lat/Lon are not all missing.
    Returns updated dataframe
    """
    rt_input_parameters = set.union(
        INPUT_RT_PARAMETERS, eval(vessel_code + '_SPECIFIC_INPUT_PARAMS'))
    if not all(param in input_param for param in rt_input_parameters):
        missing_param = [required_param for required_param in rt_input_parameters
                         if required_param not in input_param]
        raise InvalidFileContentError(
            "Missing parameter(s) '{missing_param}' in file '{src_file}'. Aborting"
            .format(missing_param=missing_param, src_file=src_file))
    else:  # required parameters all present. Change dtype to numeric where relevant
        # var TYPE conversion to string outside this function
        for param in rt_input_parameters:
            if param not in set(['Type', 'PcDate', 'PcTime']):
                dataf[param] = dataf[param].apply(
                    pd.to_numeric,
                    errors='coerce')  # convert bad non numeric to NaN

    if all(np.isnan(dataf['GpsShipLatitude'])) or all(
            np.isnan(dataf['GpsShipLongitude'])):
        raise InvalidFileContentError(
            "Latitude and/or Longitude values all missing in file '{src_file}'.Aborting"
            .format(src_file=src_file))
    return dataf
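
The numeric coercion above is just pandas' to_numeric with errors='coerce' applied column by column. A minimal, self-contained sketch of that step (the column names are illustrative, not the full vessel parameter list):

import numpy as np
import pandas as pd

# Hypothetical raw real-time rows; the second latitude is a bad, non-numeric reading
raw = pd.DataFrame({
    'PcDate': ['20/06/2023', '20/06/2023'],
    'GpsShipLatitude': ['-42.88', 'bad'],
    'GpsShipLongitude': ['147.33', '147.34'],
})

# Coerce everything except the date/time/type columns; bad values become NaN
for col in ('GpsShipLatitude', 'GpsShipLongitude'):
    raw[col] = pd.to_numeric(raw[col], errors='coerce')

assert np.isnan(raw['GpsShipLatitude'][1])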
Example #2
    def _read_manifest(self):
        """Read the manifest file and extract key parameters for product"""
        with open(self.input_file) as f:
            manifest = json.load(f)

        try:
            self.product_site_code = manifest['site_code']
            self.product_variables = manifest['variables']
        except KeyError:
            raise InvalidFileContentError(
                "manifest file '{self.input_file}' missing information (site_code, variables)"
                .format(self=self))
        if 'products' in manifest:
            invalid_products = set(manifest['products']) - self.VALID_PRODUCTS
            if invalid_products:
                raise InvalidFileContentError(
                    "invalid product(s) {invalid_products} requested "
                    "in manifest file '{self.input_file}'".format(
                        invalid_products=invalid_products, self=self))
            self.products_to_create = set(manifest['products'])

        # Even if only the gridded product is explicitly requested, we need to re-generate and publish the hourly too,
        # as it is the input file for the gridded.
        if 'gridded' in self.products_to_create:
            self.products_to_create.add('hourly')
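
A minimal manifest that would pass the checks above might look like the following sketch; the keys come from the code, but the values are hypothetical. Note that requesting only 'gridded' still triggers regeneration of 'hourly', per the comment above.

import json

manifest = {
    "site_code": "NRSROT",          # hypothetical site code
    "variables": ["TEMP", "PSAL"],  # hypothetical variable list
    "products": ["gridded"],        # optional; must be a subset of VALID_PRODUCTS
}

with open("manifest.json", "w") as f:
    json.dump(manifest, f, indent=2)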
Example #3
def dest_path_aodn_wave_dm(filepath):
    file_basename = os.path.basename(filepath)
    with Dataset(filepath, mode='r') as nc_obj:
        site_name = nc_obj.site_name

    if BOM_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(BOM_DIR, WAVERIDER_DIR, DELAYED_DIR)
        product_dir = site_name.replace(' ', '_')

    elif DES_QLD_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DES_QLD_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   DES_QLD_WAVERIDER)
        product_dir = fields['site_code']

    elif DOT_WA_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename,
                                                   DOT_WA_WAVERIDER)
        product_dir = os.path.join(site_name.replace(' ', '_'),
                                   fields['site_code'])

    elif MHL_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(MHL_DIR_BASE, MHL_DIR, MHL_WAVERIDER_DIR)
        product_dir = site_name.replace(' ', '_')

    elif DOT_WA_AWAC.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, AWAC_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename, DOT_WA_AWAC)
        product_dir = fields['site_code']

    elif DTA_NZ_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DTA_NZ_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if 'Wave Rider Buoy' not in site_name:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name does not contain 'Wave Rider Buoy' string to "
                "deduce path".format(filename=file_basename))
        product_dir = site_name.replace('Wave Rider Buoy',
                                        '').strip().replace(' ', '_')

    elif NTP_WAVE.match(file_basename):
        data_base_dir = os.path.join(NTP_WAVE_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if len(site_name) == 0:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name is empty"
                .format(filename=file_basename))
        product_dir = site_name

    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce path".
            format(filename=file_basename))

    return os.path.join(data_base_dir, product_dir, os.path.basename(filepath))
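
get_pattern_subgroups_from_string is not shown in this listing; a plausible sketch, assuming it simply returns the named groups of a compiled regex matched against the basename, would be:

import re

def get_pattern_subgroups_from_string(file_basename, pattern):
    # Sketch only; the real helper may differ
    match = pattern.match(file_basename)
    if match is None:
        raise ValueError("'{f}' does not match pattern".format(f=file_basename))
    return match.groupdict()

# Hypothetical pattern and filename with a 'site_code' named group
EXAMPLE_PATTERN = re.compile(r'.*_(?P<site_code>[A-Z0-9]+)\.nc')
fields = get_pattern_subgroups_from_string('WAVE_DM_CALOUNDRA.nc', EXAMPLE_PATTERN)
assert fields == {'site_code': 'CALOUNDRA'}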
Example #4
    def preprocess(self):
        nc_file = self.file_collection[0]

        # if file contains any of FILE_TYPE_NEED_INDEX, we index, otherwise publish only
        if any(s in get_type(nc_file.name) for s in FILE_TYPE_NEED_INDEX):
            nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
        else:
            nc_file.publish_type = PipelineFilePublishType.UPLOAD_ONLY

        # check if file with same dest_path already on s3. If yes, check its date_created nc attribute to know
        # if we need to overwrite this object or not
        destination_s3 = self.dest_path(nc_file.name)

        storage_query_res = self.state_query.query_storage(
            destination_s3).keys()

        if destination_s3 in storage_query_res:
            # creation date of the new file in the pipeline
            creation_date_nc_pipeline = get_creation_date(nc_file.src_path)

            # creation date of the file already published
            creation_date_nc_s3 = get_creation_date(
                os.path.join(self.opendap_root, destination_s3))

            if creation_date_nc_pipeline < creation_date_nc_s3:
                raise InvalidFileContentError(
                    "file name: \"{filename}\"  creation date is older than file already on storage"
                    .format(filename=nc_file.name))
Example #5
    def process(self):
        """Handle a zip file containing images and no NetCDF files. In this case we just want to publish the zip file
        itself, not the individual images. If we encounter a "mixed" zip file with images and netCDF files,
        we're just going to give up, for now.
        """
        images = PipelineFileCollection(f for f in self.file_collection
                                        if f.file_type.is_image_type)
        netcdfs = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        is_zip = self.file_type is FileType.ZIP
        have_images = len(images) > 0
        have_netcdfs = len(netcdfs) > 0
        if is_zip and have_images:
            if have_netcdfs:
                raise InvalidFileContentError(
                    "Zip file contains both images and netCDFs. Don't know what to do!"
                    " They are handled differently, so please upload only one at a time."
                )
            if not DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.match(
                    self.file_basename):
                raise InvalidFileNameError(
                    "Zip file contains images, but its name does not match pattern for images zip file "
                    "(regular expression '{p}')".format(
                        p=DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.pattern))

            self.logger.info(
                "Zip file contains images and no netCDF files. "
                "Publishing original zip file instead of its contents.")

            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)
            self.input_file_object.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(self.input_file_object)
Example #6
    def get_deployment_code(cls, src_path):
        """Depending on data mode :
           DM :  get deployment code from netcdf global attributes directly
               DSTG : no attribute deployment_code, extract deployment code from title instead
           RT :exctract deployment code from title
        """

        name = os.path.basename(src_path)
        if re.match(cls.DSTG_REGEX, name) or re.match(cls.ANFOG_RT_REGEX,
                                                      name):
            title = cls._get_nc_att(src_path, 'title')
            deployment_code = title.split()[-1]
            if deployment_code == 'mission':
                raise InvalidFileContentError(
                    "Missing deployment code in {file} ".format(file=name))

        elif re.match(cls.ANFOG_NC_REGEX, name) or re.match(
                cls.ADAPTER_REGEX, name):
            deployment_code = cls._get_nc_att(src_path, 'deployment_code')
        elif name.endswith('.txt'):
            # extract deployment code from filename like SL-Yamba20180609_completed.txt
            field = name.split('_')
            deployment_code = field[0].split('-')[1]
        else:
            raise InvalidFileNameError(
                "Invalid file name {file}".format(file=name))

        return deployment_code
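
The .txt branch above pulls the deployment code straight out of the filename; in isolation, using the filename from the comment in the code, that step is:

name = 'SL-Yamba20180609_completed.txt'
field = name.split('_')                      # ['SL-Yamba20180609', 'completed.txt']
deployment_code = field[0].split('-')[1]     # 'Yamba20180609'
assert deployment_code == 'Yamba20180609'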
Example #7
    def preprocess(self):
        pz = NSWOEHSurveyProcesor(self.input_file)
        report = pz.check_all()

        if len(report) > 0:
            raise_string = ''
            for heading, messages in report.items():
                raise_string += "\n{heading}:\n{messages}\n".format(
                    heading=heading,
                    messages="\n".join([
                        "{0}. {1}".format(i + 1, message)
                        for i, message in enumerate(messages)
                    ]))

            raise InvalidFileContentError(
                "Zip file '{name}' failed content checks:\n{raise_string}".
                format(name=os.path.basename(self.input_file),
                       raise_string=raise_string))

        # STAX & LIDAR surveys - publish zip file and shapefile only
        if pz.survey_methods in BASIC_PACKAGED_METHODS:
            for f in self.file_collection:
                if not SHAPEFILE_PATTERN.match(f.name):
                    f.publish_type = PipelineFilePublishType.NO_ACTION

            self.file_collection.add(self.input_file_object)
            self.input_file_object.publish_type = PipelineFilePublishType.HARVEST_UPLOAD

        # For multi-beam (MB) surveys the default (publish all zipfile contents) is correct

        self.survey_path = pz.get_dest_path()
Example #8
def dest_path_anmn_nrs_realtime(filepath):
    """Returns the relative path a given netCDF file should be published to, based on the name and content of the file.
    Only works for ANMN NRS real-time files.

    :param filepath: full path of the file
    :return: relative destination path including file name
    """

    filename = os.path.basename(filepath)

    # Start with base path for this sub-facility
    path_list = ['IMOS', 'ANMN', 'NRS', 'REAL_TIME']

    # add site code
    with Dataset(filepath, mode='r') as f:
        site_code = getattr(f, 'site_code', '')
    if not site_code:
        raise InvalidFileContentError("File '{name}' has no site_code attribute!".format(name=filename))
    path_list.append(site_code)

    # add product sub-directory
    if re.match('IMOS_ANMN-NRS_MT_.*-Surface-.*-MET', filename):
        path_list.append('Meteorology')
    elif re.match('IMOS_ANMN-NRS_W_.*-Surface-.*-wave', filename):
        path_list.append('Wave')
    elif re.match('IMOS_ANMN-NRS_TPSOBUE_.*-SubSurface-.*-WQM', filename):
        path_list.append('Biogeochem_timeseries')
    else:
        raise InvalidInputFileError(
            "File name '{name}' doesn't match pattern for any known NRS real-time product".format(name=filename)
        )

    path_list.append(filename)

    return os.path.join(*path_list)
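
For instance, with a hypothetical meteorology file and site code (the real site_code is read from the NetCDF global attributes), the path assembles as:

import os

filename = 'IMOS_ANMN-NRS_MT_20230601T000000Z_NRSMAI-Surface-01-MET_END-20230602T000000Z.nc'
site_code = 'NRSMAI'

path_list = ['IMOS', 'ANMN', 'NRS', 'REAL_TIME', site_code, 'Meteorology', filename]
print(os.path.join(*path_list))
# IMOS/ANMN/NRS/REAL_TIME/NRSMAI/Meteorology/IMOS_ANMN-NRS_MT_..._MET_....nc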
Example #9
def fzf_vessel_get_info(profile):
    """
    fuzzy search finder for vessel callsign.
    The vessel name found in the BUFR file format is slightly different from the platform code AODN vocabulary.
    This function finds the closest match and modifies the name of the vessel accordingly with a confidence of 0.9
    :param profile:
    :return: profile
    """
    callsign_list = ship_callsign_list()
    ship_name = profile['profile_metadata']['ship_name']
    ship_name_fuzzy_search = fwprocess.extractOne(ship_name,
                                                  list(callsign_list.values()),
                                                  score_cutoff=90)
    try:
        ship_name = ship_name_fuzzy_search[0]
    except TypeError:
        # extractOne returns None when no candidate reaches the score cutoff
        raise InvalidFileContentError(
            '{ship_name} could not be fuzzy-matched to an existing AODN '
            'vessel name'.format(ship_name=ship_name))

    callsign = next(k for k, v in callsign_list.items()
                    if v == ship_name)  # find callsign from key value
    # special case for the Astrolabe vessel. All NRT XBT data (post 2020) is collected by the new Astrolabe vessel,
    # which has a new callsign.
    # 2 callsigns are available in the ANDS vocabulary for this vessel; We simply force it to use the new vessel only to
    # avoid writing complicated and unnecessary logic
    if callsign == 'FHZI':
        callsign = 'FASB'

    profile['profile_metadata']['ship_name'] = ship_name
    profile['profile_metadata']['Callsign'] = callsign

    return profile
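
The lookup relies on fuzzywuzzy's process.extractOne, which returns None when no candidate reaches the score cutoff (hence the TypeError handled above). A standalone sketch, assuming fwprocess is fuzzywuzzy.process and using made-up callsigns and vessel names:

from fuzzywuzzy import process as fwprocess

callsign_list = {'VABC': 'Investigator', 'VDEF': 'Astrolabe'}   # hypothetical

best = fwprocess.extractOne('Astrolabe (new)', list(callsign_list.values()),
                            score_cutoff=90)
if best is None:
    raise ValueError('no vessel name close enough to fuzzy-match')
ship_name = best[0]                                                      # 'Astrolabe'
callsign = next(k for k, v in callsign_list.items() if v == ship_name)   # 'VDEF'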
Example #10
def transform_count_to_real_val(df):
    """ 1sec files measure FLU2 and TURB in counts. Transforming to CPHL and TURB
    10secs are already in CPHL and TURB
    Ref: https://github.com/aodn/imos-toolbox/blob/spirit/Preprocessing/spiritCountToEngPP.txt
    """

    measurement_frequency = get_measurement_frequency(df)
    if measurement_frequency == 1:
        # transform FLU count data to CPHL
        df['CPHL'] = (df['CPHL'].values -
                      CHLU_PARAMS['blank']) * CHLU_PARAMS['scale']

        # transform TURB count data to TURB
        df['TURB'] = (df['TURB'].values -
                      TURB_PARAMS['blank']) * TURB_PARAMS['scale']
    elif measurement_frequency == 10:
        # Nothing to transform
        pass
    else:
        raise InvalidFileContentError(
            "SOOP NRT input logfile has an incorrect delta time '{measurement_frequency}'; "
            "expected one of ('10 secs', '1 sec').".format(
                measurement_frequency=measurement_frequency))

    return df
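
The conversion itself is a linear rescale, value = (count - blank) * scale; a sketch with made-up calibration constants (the real ones live in CHLU_PARAMS and TURB_PARAMS):

import pandas as pd

CHLU_PARAMS = {'blank': 50.0, 'scale': 0.0121}   # hypothetical calibration values

counts = pd.Series([50.0, 150.0, 250.0])
cphl = (counts - CHLU_PARAMS['blank']) * CHLU_PARAMS['scale']
# 0.0, 1.21, 2.42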
Example #11
    def _get_data_category(cls, input_file):
        """Determine the category a file belongs to (Temperature,
        CTD_timeseires, Biogeochem_profile, etc..)

        """

        var_names = set(cls._get_variable_names(input_file))

        if var_names.intersection(cls.VELOCITY_VAR):
            return 'Velocity'

        if var_names.intersection(cls.WAVE_VAR):
            return 'Wave'

        if var_names.intersection(cls.CO2_VAR):
            return 'CO2'

        feature_type = cls._get_nc_att(input_file, 'featureType').lower()
        if feature_type == 'profile':
            if var_names.intersection(cls.BGC_VAR) or var_names.intersection(cls.SALINITY_VAR):
                return 'Biogeochem_profiles'
            else:
                raise InvalidFileContentError(
                    "Could not determine data category for '{name}'".format(name=input_file)
                )

        if feature_type == 'timeseries':
            if var_names.intersection(cls.BGC_VAR):
                return 'Biogeochem_timeseries'

            if var_names.intersection(cls.SALINITY_VAR):
                return 'CTD_timeseries'

        if feature_type == 'timeseriesprofile' and 'long-timeseries' in input_file:
            return 'aggregated_products'

        if var_names.intersection(cls.TEMP_VAR):
            return 'Temperature'

        raise InvalidFileContentError("Could not determine data category for '{name}'".format(name=input_file))
Example #12
    def _get_old_data_category(cls, input_file):
        """Determine the category a file belongs to."""

        var_names = set(cls._get_variable_names(input_file))
        if var_names.intersection(cls.WAVE_VAR):
            return 'Surface_waves'

        if var_names.intersection(cls.FLUX_VAR):
            return 'Surface_fluxes'

        if var_names.intersection(cls.MET_VAR):
            return 'Surface_properties'

        raise InvalidFileContentError("Could not determine data category for {input_file}".format(input_file=input_file))
Example #13
    def dest_path(self, filepath):
        with Dataset(filepath, mode='r') as nc_obj:
            measurement_frequency = nc_obj.measurement_frequency
            product_type = nc_obj.product_type
            year = datetime.strptime(nc_obj.time_coverage_start,
                                     '%Y-%m-%dT%H:%M:%SZ').strftime("%Y")

        if measurement_frequency != "1sec":
            raise InvalidFileContentError(
                "SOOP TMV NRT: NetCDF files with a measurement frequency of "
                "{measurement_frequency} aren't allowed to be harvested".
                format(measurement_frequency=measurement_frequency))

        return os.path.join(self.soop_tmv_dir, product_type,
                            measurement_frequency, year,
                            os.path.basename(filepath))
Example #14
def xbt_line_get_info(profile, url):
    """
    retrieve xbt line information from ANDS vocabulary and store new values in existing profile dictionary
    :param profile:
    :param url: url of the ANDS xbt line vocabulary
    :return:
    """

    helper = XbtLineVocabHelper(url)
    xbt_lines_info = helper.xbt_line_info()
    xbt_line = profile['profile_metadata']['XBT_line']

    # another weird case with missing line info
    if not xbt_line.strip():
        xbt_line = 'NOLINE'

        profile['profile_metadata']['XBT_line'] = xbt_line
        profile['profile_metadata']['XBT_line_description'] = xbt_line
        return profile

    # In some cases, the CSV file has the "pref_label" value of the XBT_line. This complicates the lookup because the
    # pref_label is a value in our dict, not a key. The next few lines reverse the search, looking up the key by value.
    for key, value in xbt_lines_info.items():
        if xbt_line == value['xbt_pref_label']:
            xbt_line = key
            break

    # look for xbt line value from BUFR file available in ANDS XBT line vocabulary
    if xbt_line not in xbt_lines_info:
        raise InvalidFileContentError(
            '{xbt_code} is not a known/correct XBT line value found in the ANDS XBT line vocabulary'
            .format(xbt_code=xbt_line))

    xbt_line_description = xbt_lines_info[xbt_line]['xbt_line_description']
    xbt_line = xbt_lines_info[xbt_line]['xbt_pref_label']

    # dealing with vocabulary inconsistencies between vocab/XBT_DM/XBT_NRT, forcing consistency ... somewhat
    if xbt_line == 'PX30':
        xbt_line = 'PX30-31'

    profile['profile_metadata']['XBT_line'] = xbt_line
    profile['profile_metadata']['XBT_line_description'] = xbt_line_description

    return profile
Example #15
    def _make_aggregated_timeseries(self):
        """For each variable, generate aggregated timeseries product and add to file_collection."""

        for var in self.product_variables:
            # Filter input_list to the files relevant for this var
            input_list = self._input_list_for_variables(var)
            if not input_list:
                raise InvalidFileContentError(
                    "No files to aggregate for {var}".format(var=var))
            self.logger.info("Aggregating {var} ({n} files)".format(
                var=var, n=len(input_list)))

            product_url, errors = main_aggregator(input_list, var,
                                                  self.product_site_code,
                                                  **self.product_common_kwargs)
            self._log_excluded_files(errors)
            self._add_to_collection(product_url)
            self._cleanup_previous_version(os.path.basename(product_url))
Example #16
    def _make_velocity_hourly_timeseries(self):
        """Generate velocity hourly product for the site and add to file_collection."""

        # Filter input list to just the velocity files, i.e. files with the variables
        # UCUR ("eastward_sea_water_velocity") or VCUR ("northward_sea_water_velocity")
        input_list = self._input_list_for_variables('UCUR', 'VCUR')
        if not input_list:
            raise InvalidFileContentError("No velocity files to aggregate")
        self.logger.info(
            "Creating velocity hourly products from {n} input files".format(
                n=len(input_list)))

        product_url, errors = velocity_hourly_aggregated(
            input_list, self.product_site_code, **self.product_common_kwargs)

        self._log_excluded_files(errors)
        self._add_to_collection(product_url)
        self._cleanup_previous_version(os.path.basename(product_url))
Example #17
    def _get_data_category(cls, input_file):
        """Determine the category a file belongs to (Temperature,
        CTD_timeseires, Velocity, etc..)

        """

        var_names = set(cls._get_variable_names(input_file))

        if var_names.intersection(cls.VELOCITY_VAR):
            return 'Velocity'

        if var_names.intersection(cls.BGC_VAR):
            return 'Biogeochem_timeseries'

        if var_names.intersection(cls.SALINITY_VAR):
            return 'CTD_timeseries'

        if var_names.intersection(cls.TEMP_VAR):
            return 'Temperature'

        raise InvalidFileContentError(
            "Could not determine data category for '{name}'".format(name=input_file)
        )
Example #18
    def get_data_mode(self):
        """
            1) Set data mode based on NetCDF product type:
            If FV01 => DM
            If FV00 and ANFOG_RT => RT; also check that the zip contains ancillary files
            (.png or position_summary.txt file). If not present => missing RT material
            2) Set format_check type specific to product type (FV00/01) and origin (ANFOG, DSTG or NRL)
        """
        fv01 = self.file_collection.filter_by_attribute_regex(
            'name', AnfogFileClassifier.DM_REGEX)
        adapter_dstg = '%s|%s' % (AnfogFileClassifier.ADAPTER_REGEX,
                                  AnfogFileClassifier.DSTG_REGEX)
        anfog_rt = self.file_collection.filter_by_attribute_regex(
            'name', AnfogFileClassifier.ANFOG_RT_REGEX)

        if fv01:
            if re.match(adapter_dstg, fv01[0].name):
                # Adapter and DSTG files are not CF and IMOS compliant
                fv01[0].check_type = PipelineFileCheckType.FORMAT_CHECK

            return 'DM'
        elif anfog_rt:
            # RT file not compliant
            anfog_rt[0].check_type = PipelineFileCheckType.FORMAT_CHECK
            png = self.file_collection.filter_by_attribute_regex(
                'name', AnfogFileClassifier.RT_PNG_REGEX)
            if png:
                return "RT"
            else:
                raise InvalidFileContentError(
                    "Missing ancillary files(PNGs or summary position file) in ZIP archive {name}"
                    .format(name=os.path.basename(self.input_file)))
        else:
            raise InvalidInputFileError(
                "Expecting one NetCDF file in ZIP archive '{zip}'".format(
                    zip=os.path.basename(self.input_file)))
Example #19
    def process_nrt(self):
        """Process NRT files, only allowing updates to occur if the new files for
        certain campaigns are more recent."""

        previous_files = self.state_query.query_storage(
            self.dest_path_function(""))
        if not previous_files:
            return

        try:
            old_zip_file = [
                x for x in previous_files if self.current_campaign in x.name
                and ".zip" in PurePath(x.name).suffix
            ][0]
        except IndexError:
            return

        old_metadata_file = self.get_remote_metadata_from_zip(old_zip_file)
        if not old_metadata_file:
            return

        new_metadata_file = self.get_file(self.file_collection,
                                          'metadata',
                                          campaign=self.current_campaign)

        logger.info(
            NRT_TIMESTAMP_COMPARISON_MSG.format(old_zip_file.dest_path,
                                                new_metadata_file.src_path))

        if not self.is_nrt_update_required(old_metadata_file.local_path):
            raise InvalidFileContentError(
                NRT_TIMESTAMP_DIFFERS_MSG.format(
                    incoming_file=self.input_file_object.src_path,
                    within_file=new_metadata_file.src_path,
                    archival_file=old_zip_file.dest_path,
                ))