Example No. 1
    def __init__(self,
                 suppress_file_output=False,
                 json_files=None,
                 facet_json=None,
                 **kwargs):
        """
        Initialise the ProcessDatasets class.

        @param suppress_file_output (boolean): if True do not write out the
                MOLES tags and ESGF DRS output files
        @param json_files (iterable): collection of JSON files to load
        @param facet_json (string): file path to a JSON file containing a dump
                of the facet object, used to save time when loading the tagger

        """
        self.logger = logging.getLogger(__name__)
        self.__suppress_fo = suppress_file_output

        if facet_json:
            with open(facet_json, 'r') as reader:
                self.__facets = Facets.from_json(json.load(reader))
                print(self.__facets)
        else:
            self.__facets = Facets()

        self.__file_drs = None
        self.__file_csv = None
        self._open_files()
        self.__not_found_messages = set()
        self.__error_messages = set()
        self.__dataset_json_values = DatasetJSONMappings(json_files)
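The facet_json argument above expects a dump produced with the same Facets class that Example No. 4 and Example No. 9 below create and consume. A minimal sketch of producing such a dump and handing it back, assuming only the to_json()/from_json() round trip shown in those examples (the file path is illustrative):

import json

from cci_tagger.facets import Facets

# One-off: dump the facet/vocab object so later runs can skip the vocab server.
facets = Facets()
with open('facets.json', 'w') as writer:
    json.dump(facets.to_json(), writer)

# Later: pass the dump back in to speed up tagger start-up
# (ProcessDatasets imported from wherever it lives in your checkout).
pd = ProcessDatasets(facet_json='facets.json', suppress_file_output=True)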
Example No. 2
    @staticmethod
    def _load_facets():
        """
        Load the facets from the server

        :return: vocab mapping
        :rtype: dict
        """

        data = {}
        try:
            data = Facets().to_json()
        except Exception as e:
            logger.error(f'Failed to get vocabs from vocab server: {e}')

        # If vocabs successfully retrieved, save to disk
        if data:
            try:
                with open(settings.VOCAB_CACHE_FILE, 'w') as writer:
                    json.dump(data, writer)
            except Exception as e:
                logger.warning(f'Failed to save vocab mapping: {e}')

        # If the vocabs could not be retrieved from the live server, try to
        # load the disk cache from the last successful attempt
        if not data:
            try:
                with open(settings.VOCAB_CACHE_FILE) as reader:
                    data = json.load(reader)

            except Exception as e:
                logger.critical(f'Unable to retrieve vocab cache from live server or disk: {e}')

        return data
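_load_facets reads and writes a single cache file whose location comes from Django settings; a hypothetical settings.py entry is sketched below (the path is purely illustrative):

# settings.py -- illustrative location for the vocab cache used above
VOCAB_CACHE_FILE = '/var/cache/cci-tagger/vocab_cache.json'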
Example No. 3
    def __init__(self,
                 checksum=True,
                 use_mapping=True,
                 verbose=0,
                 update_moles=False,
                 default_terms_file=None,
                 json_data=None):
        """
        Initialise the ProcessDatasets class.

        @param checksum (boolean): if True produce a checksum for each file
        @param use_mapping (boolean): if True use the local mapping to correct
                values so that they match those in the vocab server
        @param verbose (int): increase output verbosity
        @param update_moles (boolean): if True write tags and DRS mappings
                directly into MOLES rather than to the output files
        @param default_terms_file (string): path to a properties file of
                user-assigned default terms
        @param json_data (dict): JSON data providing user-assigned "defaults"
                and vocab "mappings"

        """
        self.__checksum = checksum
        self.__use_mapping = use_mapping
        self.__verbose = verbose
        self.__update_moles = update_moles
        if self.__update_moles:
            try:
                from tools.vocab_tools.tag_obs_with_vocab_terms import \
                    tag_observation
                self._tag_observation = tag_observation
            except ImportError:
                print('Oops. Looks like you have selected to write to MOLES '
                      'but we cannot find the MOLES library')
                exit(1)
        if self.__facets is None:
            self.__facets = Facets()
        self.__file_drs = None
        self.__file_csv = None
        self._open_files()
        self.__not_found_messages = set()
        self.__error_messages = set()
        self.__ds_drs_mapping = set()
        self.__drs_version = 'v{}'.format(strftime("%Y%m%d"))
        self.__user_assigned_defaults, self.__user_assigned_defaults_uris = (
            self._init_user_assigned_defaults(default_terms_file, json_data))
        if json_data and json_data.get("mappings"):
            self.__user_mappings = UserVocabMappings(json_data.get("mappings"))
        else:
            self.__user_mappings = None
Example No. 4
def main():
    args = get_args()
    facets = Facets()

    with open(args.output, 'w') as writer:
        json.dump(facets.to_json(), writer)
Example No. 5
    def handle(self, *args, **options):

        facets = Facets()
        with open(settings.VOCAB_CACHE_FILE, 'w') as writer:
            json.dump(facets.to_json(), writer)
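The handle method above looks like the body of a Django management command; a minimal sketch of the surrounding class, assuming the standard BaseCommand layout (the help text is illustrative):

import json

from django.conf import settings
from django.core.management.base import BaseCommand

from cci_tagger.facets import Facets


class Command(BaseCommand):
    help = 'Refresh the on-disk vocab cache used by the tagger'

    def handle(self, *args, **options):
        facets = Facets()
        with open(settings.VOCAB_CACHE_FILE, 'w') as writer:
            json.dump(facets.to_json(), writer)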
Example No. 6
class ProcessDatasets(object):
    """
    This class provides the process_datasets method to process datasets,
    extract data from file names and from within NetCDF files. It then
    produces files for input into MOLES and ESGF.

    Some data are extracted from the file name.

    The file name comes in two different formats. The values are '-'
    delimited.
    Format 1
        <Indicative Date>[<Indicative Time>]-ESACCI
        -<Processing Level>_<CCI Project>-<Data Type>-<Product String>
        [-<Additional Segregator>][-v<GDS version>]-fv<File version>.nc
    Format 2
        ESACCI-<CCI Project>-<Processing Level>-<Data Type>-
        <Product String>[-<Additional Segregator>]-
        <IndicativeDate>[<Indicative Time>]-fv<File version>.nc

    Values extracted from the file name:
        Processing Level
        CCI Project (ecv)
        Data Type
        Product String

    Other data are extracted from the NetCDF file attributes.
        time frequency
        sensor id
        platform id
        product version
        institute

    The DRS is made up of:
        project (hard coded "esacci")
        cci_project
        time frequency
        processing level
        data type
        sensor id
        platform id
        product string
        product version
        realization
        version (current date)

    Realization is used to distinguish between DRS entries that would otherwise
    be identical. When determining the realization, a file of mappings from
    dataset names to DRS is consulted. If the dataset already exists in the
    list then the existing realization value is reused.

    """
    ESACCI = 'ESACCI'
    DRS_ESACCI = 'esacci'

    # an instance of the facets class
    __facets = None

    __moles_facets = SINGLE_VALUE_FACETS + ALLOWED_GLOBAL_ATTRS

    def __init__(self,
                 suppress_file_output=False,
                 json_files=None,
                 facet_json=None,
                 **kwargs):
        """
        Initialise the ProcessDatasets class.

        @param suppress_file_output (boolean): if True do not write out the
                MOLES tags and ESGF DRS output files
        @param json_files (iterable): collection of JSON files to load
        @param facet_json (string): file path to a JSON file containing a dump
                of the facet object, used to save time when loading the tagger

        """
        self.logger = logging.getLogger(__name__)
        self.__suppress_fo = suppress_file_output

        if facet_json:
            with open(facet_json, 'r') as reader:
                self.__facets = Facets.from_json(json.load(reader))
                print(self.__facets)
        else:
            self.__facets = Facets()

        self.__file_drs = None
        self.__file_csv = None
        self._open_files()
        self.__not_found_messages = set()
        self.__error_messages = set()
        self.__dataset_json_values = DatasetJSONMappings(json_files)

    def _check_property_value(self, value, labels, facet, defaults_source):
        if value not in labels:
            print('ERROR "{value}" in {file} is not a valid value for '
                  '{facet}. Should be one of {labels}.'.format(
                      value=value,
                      file=defaults_source,
                      facet=facet,
                      labels=', '.join(sorted(labels))))
            exit(1)
        return True

    def get_dataset(self, dspath):
        """
        Return a dataset object for the requested path
        :param dspath: Path to the dataset
        :return: Dataset
        """

        dataset_id = self.__dataset_json_values.get_dataset(dspath)
        return Dataset(dataset_id, self.__dataset_json_values, self.__facets)

    def process_datasets(self, datasets, max_file_count=0):
        """
        Loop through the datasets pulling out data from file names and from
        within NetCDF files.

        @param datasets (List(str)): a list of dataset names; these are the
                full paths to the datasets
        @param max_file_count (int): how many .nc files to look at per dataset.
                If the value is less than 1 then all files will be
                processed.

        """

        ds_len = len(datasets)
        self.logger.info(
            f'Processing a maximum of {max_file_count if max_file_count > 0 else "unlimited"} files for each of {ds_len} datasets'
        )

        # A sanity check to let you see what files are being included in each dataset
        dataset_file_mapping = {}
        terms_not_found = set()

        for dspath in sorted(datasets):

            dataset = self.get_dataset(dspath)

            dataset_uris, ds_file_map = dataset.process_dataset(max_file_count)

            self._write_moles_tags(dataset.id, dataset_uris)

            dataset_file_mapping.update(ds_file_map)

            terms_not_found.update(dataset.not_found_messages)

        self._write_json(dataset_file_mapping)

        if len(terms_not_found) > 0:
            print("\nSUMMARY OF TERMS NOT IN THE VOCAB:\n")
            for message in sorted(terms_not_found):
                print(message)

        self._close_files()

    def get_file_tags(self, fpath):
        """
        Extracts the facet labels from the tags.
        USED BY THE FACET SCANNER FOR THE CCI PROJECT
        :param fpath: Path to the file to scan
        :return: TaggedDataset holding the DRS identifier (string), facet
                 labels (dict) and vocab URIs
        """

        # Get the dataset
        dataset = self.get_dataset(fpath)

        # Get the URIs for the dataset
        uris = dataset.get_file_tags(filepath=fpath)

        # Turn uris into human readable tags
        tags = self.__facets.process_bag(uris)

        # Get DRS labels
        drs_facets = dataset.get_drs_labels(tags)

        # Generate DRS id
        drs = dataset.generate_ds_id(drs_facets, fpath)

        return TaggedDataset(drs, tags, uris)

    def _write_moles_tags(self, ds, uris):
        """

        :param ds: Dataset (will be a file path)
        :param uris: Dictionary of extracted tags as URIS to the vocab service
        """

        for facet in self.__moles_facets:
            tags = uris.get(facet)
            if tags:
                self._write_moles_tags_out(ds, tags)

    def _write_moles_tags_out(self, ds, uris):

        if self.__suppress_fo:
            return
        else:
            for uri in uris:
                self.__file_csv.write(f'{ds},{uri}\n')

    def _write_json(self, drs):
        if self.__suppress_fo:
            return

        self.__file_drs.write(
            json.dumps(drs, sort_keys=True, indent=4, separators=(',', ': ')))

    def _open_files(self):
        # Do not open files if suppress output is true
        if self.__suppress_fo:
            return

        self.__file_csv = open(MOLES_TAGS_FILE, 'w')

        self.__file_drs = open(ESGF_DRS_FILE, 'w')

    def _close_files(self):
        if self.__suppress_fo:
            return

        self.__file_csv.close()

        self.__file_drs.close()
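A hedged usage sketch for the class above, following the flow that get_file_tags exposes to the facet scanner (the dataset JSON file and .nc path are illustrative, and it is assumed that TaggedDataset exposes the three values it is built from as drs, tags and uris):

# Illustrative inputs; substitute real dataset JSON mappings and a real .nc file.
pd = ProcessDatasets(suppress_file_output=True,
                     json_files=['/path/to/biomass.json'])

tagged = pd.get_file_tags(
    '/datasets/esacci/biomass/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv1.0.nc')

print(tagged.drs)   # DRS identifier string
print(tagged.tags)  # facet labels keyed by facet name
print(tagged.uris)  # vocab URIs backing the labels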
Example No. 7
__contact__ = '*****@*****.**'

from json_tagger import DatasetJSONMappings
from cci_tagger.dataset.dataset import Dataset
from cci_tagger.facets import Facets

PATH = '/Users/vdn73631/Documents/dev/CCI_KE_PROJECT/cci-tagger/cci_tagger/tests/ocean_colour/ESACCI-OC-L3S-CHLOR_A-MERGED-5D_DAILY_4km_GEO_PML_OCx-20000101-fv4.0.nc'
PATH = '/Users/vdn73631/Documents/dev/CCI_KE_PROJECT/cci-tagger/cci_tagger/tests/cloud/200707-ESACCI-L3C_CLOUD-CLD_PRODUCTS-AVHRR_METOPA-fv2.0.nc'
# PATH = '/Users/vdn73631/Documents/dev/CCI_KE_PROJECT/cci-tagger/cci_tagger/tests/sst/19960202120000-ESACCI-L4_GHRSST-SSTdepth-OSTIA-GLOB_CDR2.1-v02.0-fv01.0.nc'
PATH = '/Users/vdn73631/Documents/dev/CCI_KE_PROJECT/cci-tagger/cci_tagger/tests/biomass/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv1.0.nc'

# GET JSON FILES
mappings = DatasetJSONMappings([
    '/Users/vdn73631/Documents/dev/CCI_KE_PROJECT/cci-tagger/cci_tagger/tests/test_json_files/biomass.json'
])
facets = Facets()

dataset_id = mappings.get_dataset(PATH)

dataset = Dataset(dataset_id, mappings, facets)

uris = dataset.get_file_tags(filepath=PATH)

tags = facets.process_bag(uris)

drs_facets = dataset.get_drs_labels(tags)

drs = dataset.generate_ds_id(drs_facets, PATH)

print(uris)
Example No. 8
class ProcessDatasets(object):
    """
    This class provides the process_datasets method to process datasets,
    extract data from file names and from within NetCDF files. It then
    produces files for input into MOLES and ESGF.

    Some data are extracted from the file name.

    The file name comes in two different formats. The values are '-'
    delimited.
    Format 1
        <Indicative Date>[<Indicative Time>]-ESACCI
        -<Processing Level>_<CCI Project>-<Data Type>-<Product String>
        [-<Additional Segregator>][-v<GDS version>]-fv<File version>.nc
    Format 2
        ESACCI-<CCI Project>-<Processing Level>-<Data Type>-
        <Product String>[-<Additional Segregator>]-
        <IndicativeDate>[<Indicative Time>]-fv<File version>.nc

    Values extracted from the file name:
        Processing Level
        CCI Project (ecv)
        Data Type
        Product String

    Other data are extracted from the NetCDF file attributes.
        time frequency
        sensor id
        platform id
        product version
        institute

    The DRS is made up of:
        project (hard coded "esacci")
        cci_project
        time frequency
        processing level
        data type
        sensor id
        platform id
        product string
        product version
        realization
        version (current date)

    Realization is used to distinguish between DRS entries that would otherwise
    be identical. When determining the realization, a file of mappings from
    dataset names to DRS is consulted. If the dataset already exists in the
    list then the existing realization value is reused.

    """
    ESACCI = 'ESACCI'
    DRS_ESACCI = 'esacci'

    # an instance of the facets class
    __facets = None

    __allowed_net_cdf_attribs = [FREQUENCY, INSTITUTION, PLATFORM, SENSOR]
    __single_valued_facets = [
        BROADER_PROCESSING_LEVEL, DATA_TYPE, ECV, PROCESSING_LEVEL,
        PRODUCT_STRING
    ]
    __multi_valued_facet_labels = {
        FREQUENCY: 'multi-frequency',
        INSTITUTION: 'multi-institution',
        PLATFORM: 'multi-platform',
        SENSOR: 'multi-sensor'
    }

    def __init__(self,
                 checksum=True,
                 use_mapping=True,
                 verbose=0,
                 update_moles=False,
                 default_terms_file=None,
                 json_data=None):
        """
        Initialise the ProcessDatasets class.

        @param checksum (boolean): if True produce a checksum for each file
        @param use_mapping (boolean): if True use the local mapping to correct
                values so that they match those in the vocab server
        @param verbose (int): increase output verbosity
        @param update_moles (boolean): if True write tags and DRS mappings
                directly into MOLES rather than to the output files
        @param default_terms_file (string): path to a properties file of
                user-assigned default terms
        @param json_data (dict): JSON data providing user-assigned "defaults"
                and vocab "mappings"

        """
        self.__checksum = checksum
        self.__use_mapping = use_mapping
        self.__verbose = verbose
        self.__update_moles = update_moles
        if self.__update_moles:
            try:
                from tools.vocab_tools.tag_obs_with_vocab_terms import \
                    tag_observation
                self._tag_observation = tag_observation
            except ImportError:
                print('Oops. Looks like you have selected to write to MOLES '
                      'but we cannot find the MOLES library')
                exit(1)
        if self.__facets is None:
            self.__facets = Facets()
        self.__file_drs = None
        self.__file_csv = None
        self._open_files()
        self.__not_found_messages = set()
        self.__error_messages = set()
        self.__ds_drs_mapping = set()
        self.__drs_version = 'v{}'.format(strftime("%Y%m%d"))
        self.__user_assigned_defaults, self.__user_assigned_defaults_uris = (
            self._init_user_assigned_defaults(default_terms_file, json_data))
        if json_data and json_data.get("mappings"):
            self.__user_mappings = UserVocabMappings(json_data.get("mappings"))
        else:
            self.__user_mappings = None

    def _init_user_assigned_defaults(self, default_terms_file, json_data):
        if default_terms_file is None and (json_data is None or
                                           json_data.get("defaults") is None):
            return {}, {}

        tags = {}
        tag_uris = {}
        if json_data and json_data.get("defaults"):
            properties = json_data.get("defaults")
            defaults_source = "json file"
        else:
            properties = Properties(default_terms_file).properties()
            defaults_source = default_terms_file

        if self.__verbose >= 1:
            print("Using defaults from %s" % dafaults_source)
            if self.__verbose >= 2:
                for key, value in properties.items():
                    print("\t{key}: {value}".format(key=key, value=value))

        # validate the user values against data from the triple store
        for key in properties.keys():
            # get the values for a facet (property key)
            try:
                labels = self.__facets.get_labels(key.lower())
            except KeyError:
                print('ERROR "{key}" in {file} is not a valid facet value. '
                      'Should be one of {facets}.'.format(
                          key=key,
                          file=dafaults_source,
                          facets=', '.join(
                              sorted(self.__facets.get_facet_names()))))
                exit(1)

            if key in self.__multi_valued_facet_labels.keys():
                tag_uris[key] = set()
                values = properties[key].lower().split(',')
                for value in values:
                    value = value.strip().lower()
                    self._check_property_value(value, labels.keys(), key,
                                               defaults_source)
                    tag_uris[key].add(labels[value])
                if len(values) > 1:
                    tags[key] = self.__multi_valued_facet_labels[key]
                else:
                    tags[key] = values[0].strip().lower()
            else:
                value = properties[key].strip().lower()
                self._check_property_value(value, labels.keys(), key,
                                           defaults_source)
                tag_uris[key] = labels[value]
                tags[key] = value
        return tags, tag_uris

    def _check_property_value(self, value, labels, facet, defaults_source):
        if value not in labels:
            print('ERROR "{value}" in {file} is not a valid value for '
                  '{facet}. Should be one of {labels}.'.format(
                      value=value,
                      file=defaults_source,
                      facet=facet,
                      labels=', '.join(sorted(labels))))
            exit(1)
        return True

    def process_datasets(self, datasets, max_file_count):
        """
        Loop through the datasets pulling out data from file names and from
        within NetCDF files.

        @param datasets (List(str)): a list of dataset names; these are the
                full paths to the datasets
        @param max_file_count (int): how many .nc files to look at per dataset.
                If the value is less than 1 then all files will be
                processed.

        """
        ds_len = len(datasets)
        if self.__verbose >= 1:
            if max_file_count > 0:
                print("Processing a maximum of %s files for each of %s "
                      "datasets" % (max_file_count, ds_len))
            else:
                print("Processing %s datasets" % ds_len)
        drs = {}
        count = 0

        for ds in sorted(datasets):
            count = count + 1
            self._process_dataset(ds, count, drs, max_file_count)

        self._write_json(drs)

        if len(self.__not_found_messages) > 0:
            print("\nSUMMARY OF TERMS NOT IN THE VOCAB:\n")
            for message in sorted(self.__not_found_messages):
                print(message)

        with open(ERROR_FILE, 'w') as f:
            for message in sorted(self.__error_messages):
                f.write('%s\n' % message)

        self._write_moles_drs_mapping()

        self._close_files()

    def _process_dataset(self, ds, count, drs, max_file_count):
        """
        Pull out data from file names and from within NetCDF files.

        @param ds (str): the full path to the dataset
        @param count (int): the sequence number for this dataset
        @param drs {dict}: key (str) = DRS label
                           value (dict):
                               key = 'file', value = the file path
                               key = 'sha256', value = the sha256 of the file
                               key = 'size', value = the size of the file
                               key = 'mtime', value = the mtime of the file
        @param max_file_count (int): how many .nc files to look at per dataset.
                If the value is less than 1 then all files will be
                processed.

        """
        tags_ds = dict(self.__user_assigned_defaults_uris)
        drs_count = 0

        # key drs id, value realization
        current_drs_ids = {}

        # get a list of files
        nc_files = self._get_nc_files(ds, max_file_count)

        if self.__verbose >= 1:
            print("\nDataset %s Processing %s files from %s" %
                  (count, len(nc_files), ds))

        if len(nc_files) == 0:
            self.__error_messages.add('WARNING %s, no .nc files found' % (ds))
            return

        for fpath in nc_files:
            # the terms to be used to generate the DRS
            drs_facets = dict(self.__user_assigned_defaults)

            net_cdf_drs, net_cdf_tags = self._parse_file_name(ds, fpath)
            drs_facets.update(net_cdf_drs)
            tags_ds.update(net_cdf_tags)
            net_cdf_drs, net_cdf_tags = self._scan_net_cdf_file(
                fpath, ds, net_cdf_tags.get(PROCESSING_LEVEL))
            drs_facets.update(net_cdf_drs)
            tags_ds.update(net_cdf_tags)

            dataset_id = self._generate_ds_id(ds, drs_facets)
            # only add files with all of the drs data
            if dataset_id is None or drs_facets.get('error'):
                continue

            if dataset_id not in current_drs_ids.keys():
                current_drs_ids[dataset_id] = self._get_next_realization(
                    ds, dataset_id, drs)
                dataset_id = '%s.%s' % (dataset_id,
                                        current_drs_ids[dataset_id])
                self.__ds_drs_mapping.add((ds, dataset_id, self.__drs_version))
                dataset_id = '%s.%s' % (dataset_id, self.__drs_version)
            else:
                dataset_id = '%s.%s.%s' % (dataset_id,
                                           current_drs_ids[dataset_id],
                                           self.__drs_version)

            if self.__checksum:
                sha256 = self._sha256(fpath)
                mtime = os.path.getmtime(fpath)
                size = os.path.getsize(fpath)
                if dataset_id in drs.keys():
                    drs[dataset_id].append({
                        'file': fpath,
                        'sha256': sha256,
                        'mtime': mtime,
                        'size': size
                    })
                else:
                    drs_count = drs_count + 1
                    drs[dataset_id] = [{
                        'file': fpath,
                        'sha256': sha256,
                        'mtime': mtime,
                        'size': size
                    }]
            else:
                if dataset_id in drs.keys():
                    drs[dataset_id].append({'file': fpath})
                else:
                    drs_count = drs_count + 1
                    drs[dataset_id] = [{'file': fpath}]
                    if self.__verbose >= 1:
                        print('DRS = %s' % dataset_id)

        if drs_count == 0:
            self.__error_messages.add('ERROR in %s, no DRS entries created' %
                                      (ds))

        if self.__verbose >= 1:
            print("Created {count} DRS {entry}".format(
                count=drs_count,
                entry='entry' if drs_count == 1 else 'entries'))

        self._write_moles_tags(ds, tags_ds)

    def _sha256(self, fpath):
        """
        Generate the sha256 for the given file.

        @param fpath (str): the path to the file

        @return the sha256 of the file

        """
        if self.__verbose >= 2:
            print('Generating sha256')
        h = hashlib.sha256()
        # read the file in binary mode in fixed-size chunks so large files
        # do not need to be held in memory
        with open(fpath, 'rb') as f:
            while True:
                data = f.read(10240)
                if not data:
                    break
                h.update(data)
        return h.hexdigest()

    def _get_nc_files(self, dir_, max_file_count):
        """
        Get the list of NetCDF files in the given directory.

        @param dir_ (str): the name of the directory to scan
        @param max_file_count (int): how many .nc files to look at per dataset.
                If the value is less than 1 then all files will be
                returned.

        @return a list of file names complete with paths

        """
        file_list = []
        count = 1
        for root, _, files in os.walk(dir_):
            for name in files:
                if name.endswith('.nc'):
                    file_list.append(os.path.join(root, name))
                    count = count + 1
                    if max_file_count > 0 and count > max_file_count:
                        return file_list
        return file_list

    def _parse_file_name(self, ds, fpath):
        """
        Extract data from the file name.

        The file name comes in two different formats. The values are '-'
        delimited.
        Form 1
            <Indicative Date>[<Indicative Time>]-ESACCI
            -<Processing Level>_<CCI Project>-<Data Type>-<Product String>
            [-<Additional Segregator>][-v<GDS version>]-fv<File version>.nc
        Form 2
            ESACCI-<CCI Project>-<Processing Level>-<Data Type>-
            <Product String>[-<Additional Segregator>]-
            <IndicativeDate>[<Indicative Time>]-fv<File version>.nc

        Values extracted from the file name:
            Processing Level
            CCI Project (ecv)
            Data Type
            Product String

        @param ds (str): the full path to the dataset
        @param fpath (str): the path to the file

        @return drs and csv representations of the data

        """

        path_facet_bits = fpath.split('/')
        last_bit = len(path_facet_bits) - 1
        file_segments = path_facet_bits[last_bit].split('-')
        if len(file_segments) < 5:
            message_found = False
            # Do not add another message if we have already reported an invalid
            # file name for this dataset
            for message in self.__error_messages:
                if (message.startswith(
                        'ERROR in %s, invalid file name format' % (ds))):
                    message_found = True
            if not message_found:
                self.__error_messages.add(
                    'ERROR in %s, invalid file name format "%s"' %
                    (ds, path_facet_bits[last_bit]))
            return {}, {}

        if file_segments[1] == self.ESACCI:
            return self._process_form(
                ds, self._get_data_from_file_name_1(file_segments))
        elif file_segments[0] == self.ESACCI:
            return self._process_form(
                ds, self._get_data_from_file_name_2(file_segments))
        else:
            message_found = False
            # Do not add another message if we have already reported an invalid
            # file name for this dataset
            for message in self.__error_messages:
                if (message.startswith(
                        'ERROR in %s, invalid file name format' % (ds))):
                    message_found = True
            if not message_found:
                self.__error_messages.add(
                    'ERROR in %s, invalid file name format "%s"' %
                    (ds, path_facet_bits[last_bit]))
            return {}, {}

    def _get_data_from_file_name_1(self, file_segments):
        """
        Extract data from the file name of form 1.

        @param file_segments (List(str)): file segments

        @return a dict where:
                key = facet name
                value = file segment

        """
        form = {}
        form[PROCESSING_LEVEL] = file_segments[2].split('_')[0]
        form[ECV] = file_segments[2].split('_')[1]
        form[DATA_TYPE] = file_segments[3]
        form[PRODUCT_STRING] = file_segments[4]
        return form

    def _get_data_from_file_name_2(self, file_segments):
        """
        Extract data from the file name of form 2.

        @param file_segments (List(str)): file segments

        @return a dict where:
                key = facet name
                value = file segment

        """
        form = {}
        form[PROCESSING_LEVEL] = file_segments[2]
        form[ECV] = file_segments[1]
        form[DATA_TYPE] = file_segments[3]
        form[PRODUCT_STRING] = file_segments[4]
        return form

    def _process_form(self, ds, form):
        """
        Process form to generate drs and csv representations.

        @param ds (str): the full path to the dataset
        @param form (dict): data extracted from the file name

        @return drs and csv representations of the data

        """
        csv_rec = {}
        term = self._get_term_uri(PROCESSING_LEVEL, form[PROCESSING_LEVEL], ds)
        if term is not None:
            csv_rec[PROCESSING_LEVEL] = term
            # add broader terms for the processing level
            broader_proc_level = self.__facets.get_broader_proc_level(term)
            if broader_proc_level is not None:
                csv_rec[BROADER_PROCESSING_LEVEL] = broader_proc_level
        else:
            self.__not_found_messages.add(
                "%s: %s" % (PROCESSING_LEVEL, form[PROCESSING_LEVEL]))
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, PROCESSING_LEVEL, form[PROCESSING_LEVEL]))

        term = self._get_term_uri(ECV, form[ECV], ds)
        if term is not None:
            csv_rec[ECV] = term
        else:
            self.__not_found_messages.add("%s: %s" % (ECV, form[ECV]))
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, ECV, form[ECV]))

        term = self._get_term_uri(DATA_TYPE, form[DATA_TYPE], ds)
        if term is not None:
            csv_rec[DATA_TYPE] = term
        else:
            self.__not_found_messages.add("%s: %s" %
                                          (DATA_TYPE, form[DATA_TYPE]))
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, DATA_TYPE, form[DATA_TYPE]))

        term = self._get_term_uri(PRODUCT_STRING, form[PRODUCT_STRING], ds)
        if term is not None:
            csv_rec[PRODUCT_STRING] = term
        else:
            self.__not_found_messages.add(
                "%s: %s" % (PRODUCT_STRING, form[PRODUCT_STRING]))
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, PRODUCT_STRING, form[PRODUCT_STRING]))

        return self._create_drs_record(csv_rec), csv_rec

    def _create_drs_record(self, csv_rec):
        proc_lev_label = TripleStore.get_alt_label(
            csv_rec.get(PROCESSING_LEVEL))
        project_label = TripleStore.get_alt_label(csv_rec.get(ECV))
        data_type_label = TripleStore.get_alt_label(csv_rec.get(DATA_TYPE))
        print(csv_rec)
        print(PRODUCT_STRING)
        print(csv_rec.get(PRODUCT_STRING))
        if csv_rec.get(PRODUCT_STRING) is not None:
            product_label = TripleStore.get_pref_label(
                csv_rec.get(PRODUCT_STRING))
        else:
            product_label = ''
        drs = {}
        if project_label != '':
            drs[ECV] = project_label
        if proc_lev_label != '':
            drs[PROCESSING_LEVEL] = proc_lev_label
        if data_type_label != '':
            drs[DATA_TYPE] = data_type_label
        if product_label != '':
            drs[PRODUCT_STRING] = product_label
        return drs

    def _scan_net_cdf_file(self, fpath, ds, processing_level):
        """
        Extract data from the NetCDF file.

        The values to extract are taken from the list of allowed NetCDF
        attributes (__allowed_net_cdf_attribs).

        """
        drs = {}
        tags = {}
        try:
            nc = netCDF4.Dataset(fpath)
        except Exception:
            self.__error_messages.add(
                'ERROR in %s, extracting attributes from "%s"' % (ds, fpath))
            return drs, tags

        if self.__verbose >= 2:
            print("GLOBAL ATTRS for %s: " % fpath)
        for global_attr in nc.ncattrs():
            if self.__verbose >= 2:
                print(global_attr, "=", nc.getncattr(global_attr))

            if (global_attr.lower() == FREQUENCY
                    and processing_level is not None
                    and '2' in processing_level):
                # do something special for level 2 data
                drs[FREQUENCY] = (
                    TripleStore.get_pref_label(LEVEL_2_FREQUENCY))
                tags[FREQUENCY] = [LEVEL_2_FREQUENCY]
            elif global_attr.lower() in self.__allowed_net_cdf_attribs:
                attr = nc.getncattr(global_attr)
                a_drs, a_tags = self._process_file_attrib(
                    global_attr.lower(), attr, ds)
                drs.update(a_drs)
                tags.update(a_tags)
            # we don't have a vocab for product_version
            elif global_attr.lower() == PRODUCT_VERSION:
                attr = self._convert_term(PRODUCT_VERSION,
                                          nc.getncattr(global_attr))
                drs[PRODUCT_VERSION] = attr
                tags[PRODUCT_VERSION] = attr

        if self.__verbose >= 3:
            print("VARIABLES...")
        for (var_id, var) in nc.variables.items():
            if self.__verbose >= 3:
                print("\tVARIABLE ATTRIBUTES (%s)" % var_id)
            if var_id == 'time':
                if var.dimensions == ():
                    self.__error_messages.add(
                        'ERROR in %s, time has no dimensions' % ds)
                    drs['error'] = True
            for attr in var.ncattrs():
                if self.__verbose >= 3:
                    print("\t\t%s=%s" % (attr, var.getncattr(attr)))
                if (attr.lower() == 'long_name'
                        and len(var.getncattr(attr)) == 0):
                    self.__error_messages.add(
                        'WARNING in %s, long_name value has zero length' % ds)

        return drs, tags

    def _process_file_attrib(self, global_attr, attr, ds):
        drs = {}
        tags = {}
        if self.__user_mappings:
            attr = self.__user_mappings.split_attrib(attr)

        if self.__use_mapping:
            attr = LocalVocabMappings.split_attrib(attr)

        if global_attr == PLATFORM:
            if '<' in attr:
                bits = attr.split(', ')
            else:
                bits = attr.split(',')
        elif (global_attr == INSTITUTION or global_attr == SENSOR
              or global_attr == FREQUENCY):
            bits = attr.split(',')
        else:
            bits = [attr]

        # Hack to deal with multi platforms
        # TODO do in generic way
        if global_attr == PLATFORM:
            if 'NOAA-<12,14,15,16,17,18>' in bits:
                bits.remove('NOAA-<12,14,15,16,17,18>')
                bits.extend([
                    'NOAA-12', 'NOAA-14', 'NOAA-15', 'NOAA-16', 'NOAA-17',
                    'NOAA-18'
                ])
            if 'ERS-<1,2>' in bits:
                bits.remove('ERS-<1,2>')
                bits.extend(['ERS-1', 'ERS-2'])

        term_count = 0
        for bit in bits:
            term_uri = self._get_term_uri(global_attr, bit.strip())
            if term_uri is not None:
                # A term found in the vocab
                drs[global_attr] = (TripleStore.get_pref_label(term_uri))
                if term_count == 0:
                    tags[global_attr] = set()
                tags[global_attr].add(term_uri)
                term_count = term_count + 1

                if global_attr == PLATFORM and bit.strip() != "N/A":
                    # add the broader terms
                    for tag in self._get_programme_group(term_uri):
                        tags[global_attr].add(tag)

            elif global_attr == PLATFORM:
                # This is an unknown platform
                if bit.strip() == "N/A":
                    continue
                p_tags = self._get_platform_as_programme(bit.strip())
                if len(p_tags) > 0 and term_count == 0:
                    tags[PLATFORM] = set()
                    # we are adding a programme or group to the list of
                    # platforms, hence adding more than one platform to the
                    # count to ensure encoded as multi platform
                    term_count = term_count + 2
                if len(p_tags) == 0:
                    self._attrib_not_found_message(ds, global_attr, attr,
                                                   bit.strip())

                for tag in p_tags:
                    tags[PLATFORM].add(tag)

            elif global_attr == SENSOR and bit.strip() == "N/A":
                pass

            else:
                self._attrib_not_found_message(ds, global_attr, attr,
                                               bit.strip())

        if term_count > 1 and (global_attr == SENSOR or global_attr == PLATFORM
                               or global_attr == FREQUENCY):
            #             drs[global_attr] = self.MULTI_SENSOR
            drs[global_attr] = self.__multi_valued_facet_labels[global_attr]

        if (drs == {}
                and not ((global_attr == PLATFORM or global_attr == SENSOR) and
                         (attr == "N/A"))):
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, global_attr, attr))

        return drs, tags

    def _attrib_not_found_message(self, ds, global_attr, attr, value):
        self.__not_found_messages.add("%s: %s" % (global_attr, value))
        if value == attr:
            self.__error_messages.add(
                'ERROR in %s for %s, invalid value "%s"' %
                (ds, global_attr, value))
        else:
            self.__error_messages.add('ERROR in %s for %s, invalid value "%s" '
                                      'in "%s"' %
                                      (ds, global_attr, value, attr))

    def _get_programme_group(self, term_uri):
        # now add the platform programme and group
        tags = []
        programme = self.__facets.get_platforms_programme(term_uri)
        programme_uri = self._get_term_uri(PLATFORM_PROGRAMME, programme)
        tags.append(programme_uri)
        try:
            group = self.__facets.get_programmes_group(programme_uri)
            group_uri = self._get_term_uri(PLATFORM_GROUP, group)
            tags.append(group_uri)
        except KeyError:
            # not all programmes have groups
            pass
        return tags

    def _get_platform_as_programme(self, platform):
        tags = []
        # check if the platform is really a platform programme
        if (platform in self.__facets.get_programme_labels()):
            programme_uri = self._get_term_uri(PLATFORM_PROGRAMME, platform)
            tags.append(programme_uri)
            try:
                group = self.__facets.get_programmes_group(programme_uri)
                group_uri = self._get_term_uri(PLATFORM_GROUP, group)
                tags.append(group_uri)
            except KeyError:
                # not all programmes have groups
                pass

        # check if the platform is really a platform group
        elif (platform in self.__facets.get_group_labels()):
            group_uri = self._get_term_uri(PLATFORM_GROUP, platform)
            tags.append(group_uri)

        return tags

    def _generate_ds_id(self, ds, drs_facets):
        error = False
        facets = [
            ECV, FREQUENCY, PROCESSING_LEVEL, DATA_TYPE, SENSOR, PLATFORM,
            PRODUCT_STRING, PRODUCT_VERSION
        ]
        ds_id = self.DRS_ESACCI
        for facet in facets:
            try:
                if drs_facets[facet] == '':
                    error = True
                    message_found = False
                    # Do not add another message if we have already reported an
                    # invalid value
                    for message in self.__error_messages:
                        if (message.startswith(
                                'ERROR in %s for %s, invalid value' %
                            (ds, facet))):
                            message_found = True
                    if not message_found:
                        self.__error_messages.add(
                            'ERROR in %s for %s, value not found' %
                            (ds, facet))

                else:
                    facet_value = str(drs_facets[facet]).replace('.',
                                                                 '-').replace(
                                                                     ' ', '-')
                    if facet == FREQUENCY:
                        facet_value = facet_value.replace('month',
                                                          'mon').replace(
                                                              'year', 'yr')
                    ds_id = '%s.%s' % (ds_id, facet_value)
            except KeyError:
                error = True
                message_found = False
                # Do not add another message if we have already reported an
                # invalid value
                for message in self.__error_messages:
                    if (message.startswith(
                            'ERROR in %s for %s, invalid value' %
                        (ds, facet))):
                        message_found = True
                if not message_found:
                    self.__error_messages.add(
                        'ERROR in %s for %s, value not found' % (ds, facet))
        if error:
            return None

        return ds_id

    def _get_next_realization(self, ds, drs_id, drs):
        realization_no = 1
        while True:
            ds_id_r = '%s.r%s.%s' % (drs_id, realization_no,
                                     self.__drs_version)
            if ds_id_r not in drs.keys():
                return 'r%s' % (realization_no)
            realization_no = realization_no + 1

    def _write_moles_tags(self, ds, drs):
        if self.__update_moles:
            if self.__verbose >= 2:
                print('Updating MOLES tags')
        for value in self.__single_valued_facets:
            try:
                self._write_moles_tags_out(ds, drs[value])
            except KeyError:
                pass

        for value in self.__multi_valued_facet_labels.keys():
            try:
                for uri in drs[value]:
                    self._write_moles_tags_out(ds, uri)
            except KeyError:
                pass

    def _write_moles_tags_out(self, ds, uri):
        if self.__update_moles:
            self._tag_observation(ds, uri, 'clipc_skos_vocab')
        else:
            self.__file_csv.write('{ds},{uri}\n'.format(ds=ds, uri=uri))

    def _write_moles_drs_mapping(self):
        if self.__update_moles:
            self._write_moles_drs_mapping_to_moles()
        else:
            self._write_moles_drs_mapping_to_file()

    def _write_moles_drs_mapping_to_moles(self):
        if self.__verbose >= 2:
            print('Updating MOLES ESGF mapping')
        from tools.esgf_tools.add_drs_datasets import add_mapping
        for directory, drs_id, version in self.__ds_drs_mapping:
            add_mapping(directory, drs_id, version)

    def _write_moles_drs_mapping_to_file(self):
        with open(MOLES_ESGF_MAPPING_FILE, 'w') as f:
            for directory, drs_id, version in sorted(self.__ds_drs_mapping):
                f.write('{directory},{drs_id}.{version}\n'.format(
                    directory=directory, drs_id=drs_id, version=version))

    def _write_json(self, drs):
        self.__file_drs.write(
            json.dumps(drs, sort_keys=True, indent=4, separators=(',', ': ')))

    def _get_term_uri(self, facet, term, ds=None):
        facet = facet.lower()
        term_l = self._convert_term(facet, term)
        if term_l in self.__facets.get_labels(facet).keys():
            return self.__facets.get_labels(facet)[term_l]
        elif term_l in self.__facets.get_alt_labels(facet).keys():
            return self.__facets.get_alt_labels(facet)[term_l]
        return None

    def _convert_term(self, facet, term):
        term = term.lower()
        if self.__user_mappings:
            term = self.__user_mappings.get_mapping(facet, term)
        if self.__use_mapping:
            term = LocalVocabMappings.get_mapping(facet, term)
        return term

    def _open_files(self):
        if not self.__update_moles:
            self.__file_csv = open(MOLES_TAGS_FILE, 'w')
        self.__file_drs = open(ESGF_DRS_FILE, 'w')

    def _close_files(self):
        if not self.__update_moles:
            self.__file_csv.close()
        self.__file_drs.close()
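The filename handling described in the class docstring can be followed by hand. A minimal sketch of the Format 2 case, mirroring _get_data_from_file_name_2 above (the filename is borrowed from Example No. 7, and the plain-string keys are illustrative stand-ins for the ECV, PROCESSING_LEVEL, DATA_TYPE and PRODUCT_STRING constants used in the class):

# Format 2: ESACCI-<CCI Project>-<Processing Level>-<Data Type>-<Product String>-...
name = 'ESACCI-BIOMASS-L4-AGB-MERGED-100m-2017-fv1.0.nc'
segments = name.split('-')

facets = {
    'ecv': segments[1],               # BIOMASS -> CCI Project
    'processing_level': segments[2],  # L4      -> Processing Level
    'data_type': segments[3],         # AGB     -> Data Type
    'product_string': segments[4],    # MERGED  -> Product String
}
print(facets)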
Example No. 9
    def __init__(self):

        # Retrieve cached values; the cache lasts for 24 hours as the vocab server doesn't change much
        self.facets = Facets.from_json(cache.get_or_set('cci_vocabs', self._load_facets, timeout=86400))