Example #1
0
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            cdms_file = cdms_open(self.path)
            for key in splitLine(config.get(projectSection, config_key), ','):

                # check for mapped keys
                if ':' in key:
                    parts = key.split(':')
                    value = cdms_file.__getattribute__(parts[0])
                    result[parts[1]] = value

                else:
                    result[key] = cdms_file.__getattribute__(key)

        return result
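
A minimal sketch of the extract_global_attrs handling above, assuming the option is a comma-separated list where an entry of the form "src:dest" copies the file attribute named src into the result under the key dest. A plain dict stands in for the cdms file object, and extract_attrs is a hypothetical helper, not part of the handler API.

# Illustrative only: the real code reads attributes from a cdms file object.
def extract_attrs(attr_source, option_value):
    result = {}
    for key in [k.strip() for k in option_value.split(',')]:
        if ':' in key:
            src, dest = key.split(':')      # "file_attr:context_name"
            result[dest] = attr_source[src]
        else:
            result[key] = attr_source[key]
    return result

# 'institution' is copied as-is; 'model_id' is renamed to 'model'.
attrs = {'institution': 'XYZ Centre', 'model_id': 'XYZ-ESM-1'}
print(extract_attrs(attrs, 'institution, model_id:model'))
# {'institution': 'XYZ Centre', 'model': 'XYZ-ESM-1'}  (key order may vary)
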
Example #2
0
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            cdms_file = cdms_open(self.path)
            for key in splitLine(config.get(projectSection, config_key), ','):
                
                # check for mapped keys
                if ':' in key:
                    parts = key.split(':')
                    value = cdms_file.__getattribute__(parts[0])
                    result[parts[1]] = value

                else:
                    result[key] = cdms_file.__getattribute__(key)

        return result
Example #3
0
 def getDatasetIdFields(self):
     """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``.
     """
     config = getConfig()
     section = 'project:'+self.name
     dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
     idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
     return idfields, dataset_id_formats
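
The _patpat regular expression is defined elsewhere in the handler module; its use here suggests it matches %(name)s placeholders. A short sketch under that assumption, showing what re.findall returns for a dataset_id format string:

import re

# Assumed placeholder pattern; the real _patpat is defined elsewhere in the module.
_patpat = r'%\(([^()]*)\)s'

dataset_id_format = '%(project)s.%(model)s.%(experiment)s.%(time_frequency)s'
print(re.findall(_patpat, dataset_id_format))
# ['project', 'model', 'experiment', 'time_frequency']
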
Example #4
0
    def initializeFields(self, Session):
        """Initialize field names and options based on the configuration file."""
        from esgcet.model import Model, Experiment

        config = getConfig()
        projectSection = "project:" + self.name
        categoryOption = config.get(projectSection, "categories")
        categorySpecs = splitRecord(categoryOption)
        for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs:
            categoryType = getCategoryType(categoryTypeS)
            isMandatory = getBoolean(isMandatoryS)
            isThreddsProperty = getBoolean(isThreddsPropertyS)
            displayOrder = string.atoi(displayOrderS)
            self.fieldNames[category] = (categoryType, isMandatory, isThreddsProperty, displayOrder)

        categoryDefaultsOption = config.get(projectSection, "category_defaults", default=None, raw=True)
        if categoryDefaultsOption is not None:
            categoryDefaultsSpecs = splitRecord(categoryDefaultsOption)
            for category, categoryDefault in categoryDefaultsSpecs:
                self.categoryDefaults[category] = categoryDefault

        session = Session()

        # Find any new experiments. This allows experiments to be added to the config file without
        # running esginitialize.
        if self.fieldNames.has_key("experiment") and self.fieldNames["experiment"][WIDGET_TYPE] == ENUM:
            initializeExperiments(config, self.name, session)

        for category in self.getFieldNames():
            # At the moment some fields are predefined
            if category == "project":
                projects = splitRecord(config.get(projectSection, "project_options", default=""))
                self.validValues["project"] = [x[0] for x in projects]
            elif category == "model":
                models = session.query(Model).filter_by(project=self.name).all()
                self.validValues["model"] = [x.name for x in models]
            elif category == "experiment":
                experiments = session.query(Experiment).filter_by(project=self.name).all()
                self.validValues["experiment"] = [x.name for x in experiments]
            elif category == "creator":
                creators = splitRecord(config.get(projectSection, "creator_options", default=""))
                self.validValues["creator"] = [x[0] for x in creators]
                self.validMaps["creator"] = genMap(creators)
            elif category == "publisher":
                publishers = splitRecord(config.get(projectSection, "publisher_options", default=""))
                self.validValues["publisher"] = [x[0] for x in publishers]
                self.validMaps["publisher"] = genMap(publishers)
            else:
                categoryType = self.getFieldType(category)
                if categoryType == ENUM:
                    option = category + "_options"
                    self.validValues[category] = splitLine(config.get(projectSection, option), ",")

            self.context[category] = ""

        session.close()
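
For reference, a hedged sketch of how a 'categories' record of the shape consumed above might be parsed. splitRecord, getCategoryType and getBoolean are project helpers, so the stand-in below only mimics their likely behaviour (one record per line, fields separated by '|'); the option value is invented.

# Stand-in parse of a hypothetical 'categories' option value.
categories_option = """
experiment | enum   | true  | true  | 3
model      | enum   | true  | true  | 4
creator    | string | false | true  | 6
"""

field_names = {}
for record in categories_option.strip().splitlines():
    category, ctype, mandatory, thredds, order = [f.strip() for f in record.split('|')]
    field_names[category] = (ctype, mandatory.lower() == 'true',
                             thredds.lower() == 'true', int(order))

print(field_names['experiment'])
# ('enum', True, True, 3)
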
Example #5
0
 def getDatasetIdFields(self):
     """Get a list of (lists of) fields associated with the dataset ID. This may be passed to ``generateDatasetId``.
     """
     config = getConfig()
     section = 'project:' + self.name
     dataset_id_formats = splitLine(
         config.get(section, 'dataset_id', raw=True))
     idfields = [
         re.findall(_patpat, format) for format in dataset_id_formats
     ]
     return idfields, dataset_id_formats
Example #6
0
 def getDirectoryFormatFilters(self):
     """Return a list of regular expression filters associated with the ``directory_format`` option
     in the configuration file. This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = "project:" + self.name
     directory_format = config.get(section, "directory_format", raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace("\.", "__ESCAPE_DOT__")
         pat3 = pat2.replace(".", r"\.")
         pat4 = pat3.replace("__ESCAPE_DOT__", r"\.")
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r"(?P<\1>[^/]*)", pat4)
         filter = "^" + pattern + "$"
         filters.append(filter)
     return filters
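
A sketch of the substitution chain above, again assuming _patpat matches %(name)s placeholders. Each placeholder becomes a named group that stops at '/', so one filter matches one directory level per field; the directory_format value is invented.

import re

_patpat = r'%\(([^()]*)\)s'   # assumed placeholder pattern

directory_format = '/data/%(project)s/%(model)s/%(experiment)s/%(variable)s'

pat = directory_format.strip().replace('\\.', '__ESCAPE_DOT__')
pat = pat.replace('.', r'\.').replace('__ESCAPE_DOT__', r'\.')
pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat)
filter = '^' + pattern + '$'

m = re.match(filter, '/data/cmip5/MyModel/historical/tas')
print(m.groupdict())
# {'project': 'cmip5', 'model': 'MyModel', 'experiment': 'historical', 'variable': 'tas'}
# (key order may vary)
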
Example #7
0
 def getFilters(self, option='directory_format'):
     """Return a list of regular expression filters associated with the option in the configuration file.
      This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = 'project:'+self.name
     directory_format = config.get(section, option, raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace('\.','__ESCAPE_DOT__')
         pat3 = pat2.replace('.', r'\.')
         pat4 = pat3.replace('__ESCAPE_DOT__', r'\.')
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4)
         filter = '^'+pattern+'$'
         filters.append(filter)
     return filters
Example #8
0
 def getFilters(self, option='directory_format'):
     """Return a list of regular expression filters associated with the option in the configuration file.
      This can be passed to ``nodeIterator`` and ``processNodeMatchIterator``.
     """
     config = getConfig()
     section = 'project:' + self.name
     directory_format = config.get(section, option, raw=True)
     formats = splitLine(directory_format)
     filters = []
     for format in formats:
         pat = format.strip()
         pat2 = pat.replace('\.', '__ESCAPE_DOT__')
         pat3 = pat2.replace('.', r'\.')
         pat4 = pat3.replace('__ESCAPE_DOT__', r'\.')
         # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
         pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4)
         filter = '^' + pattern + '$'
         filters.append(filter)
     return filters
Example #9
0
 def getMaps(self):
     """Get a dictionary of maps from the project section.
     """
     config = getConfig()
     section = 'project:'+self.name
     if self.mapdict is None:
         mapdict = {}
         projectMaps = splitLine(config.get(section, 'maps', default=""), ',')
         for option in projectMaps:
             if option=="":
                 continue
             fromcat, tocat, projectMap = splitMap(config.get(section, option))
             for to_index, field in enumerate(tocat):
                 value = (fromcat, projectMap, to_index)
                 if mapdict.has_key(field):
                     mapdict[field].append(value)
                 else:
                     mapdict[field] = [value]
         self.mapdict = mapdict
     return self.mapdict
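
For orientation, a hedged illustration of the mapdict structure this method caches. Its use in validateContext (Example #12) suggests splitMap returns a tuple of from-field names, a tuple of to-field names and a lookup dictionary; the map values below are invented.

# Invented example of one map: model_id -> (model, institute).
fromcat = ('model_id',)
tocat = ('model', 'institute')
projectMap = {('XYZ-ESM-1',): ('XYZ-ESM-1', 'XYZ Centre')}

mapdict = {}
for to_index, field in enumerate(tocat):
    mapdict.setdefault(field, []).append((fromcat, projectMap, to_index))

print(mapdict['institute'])
# [(('model_id',), {('XYZ-ESM-1',): ('XYZ-ESM-1', 'XYZ Centre')}, 1)]
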
Example #10
0
 def getMaps(self):
     """Get a dictionary of maps from the project section.
     """
     config = getConfig()
     section = 'project:' + self.name
     if self.mapdict is None:
         mapdict = {}
         projectMaps = splitLine(config.get(section, 'maps', default=""),
                                 ',')
         for option in projectMaps:
             if option == "":
                 continue
             fromcat, tocat, projectMap = splitMap(
                 config.get(section, option))
             for to_index, field in enumerate(tocat):
                 value = (fromcat, projectMap, to_index)
                 if mapdict.has_key(field):
                     mapdict[field].append(value)
                 else:
                     mapdict[field] = [value]
         self.mapdict = mapdict
     return self.mapdict
Example #11
0
    def readContext(self, cdfile):
        "Get a dictionary of key/value pairs from an open file."
        f = cdfile.file

        result = {}
        if hasattr(f, 'title'):
            result['title'] = f.title
        if hasattr(f, 'Conventions'):
            result['Conventions'] = f.Conventions
        if hasattr(f, 'source'):
            result['source'] = f.source
        if hasattr(f, 'history'):
            result['history'] = f.history

        config = getConfig()
        projectSection = 'project:' + self.name

        config_key = "extract_global_attrs"

        if config.has_option(projectSection, config_key):
            for key in splitLine(config.get(projectSection, config_key), ','):
                result[key] = cdfile.getAttribute(key, None)

        return result
Example #12
0
    def validateContext(self, context):
        """
        Validate context values:

        - Mandatory values must be non-blank and, if enumerated, must be one of the valid values
        - Non-mandatory enumerated values must be either blank or one of the valid values

        Raises ESGPublishError if a validation error occurs

        If the validate configuration option is set to False in the project section,
        validation always succeeds.
        """
        if not self.validate:
            return
        
        for key in context.keys():
            fieldType = self.getFieldType(key)

            # Ignore non-configured fields
            if fieldType is None:
                continue
            
            isenum = (fieldType==ENUM)
            if isenum:
                options = self.getFieldOptions(key)
            value = context[key]

            config = getConfig()

            project_section = 'project:%s' % self.name
            delimiter = config.get(project_section, key + "_delimiter", default="")

            if value in ['', None]:
                # if value not in default context, try to get it from key_pattern or *_map
                option = '%s_pattern' % key
                if config.has_option(project_section, option):
                    value = config.get(project_section, option, False, context)
                    context[key] = value
                elif config.has_option(project_section, 'maps'):
                    for map_option in splitLine(config.get(project_section, 'maps', default=''), ','):
                        from_keys, to_keys, value_dict = splitMap(config.get(project_section, map_option))
                        if key in to_keys:
                            from_values = tuple(context[k] for k in from_keys)
                            to_values = value_dict[from_values]
                            value = to_values[to_keys.index(key)]
                            context[key] = value

            if self.isMandatory(key):
                if value in ['', None]:
                    if isenum:
                        raise ESGInvalidMandatoryField("Mandatory field '%s' not set, must be one of %s"%(key, `options`))
                    else:
                        raise ESGInvalidMandatoryField("Mandatory field '%s' not set"%key)
                elif isenum and not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGInvalidMandatoryField("Invalid value of mandatory field '%s': %s, must be one of %s"%(key, value, `validOptions`))
            elif isenum:     # non-mandatory field
                options += ['', None]
                if not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGPublishError("Invalid value of '%s': %s, must be one of %s"%(key, value, `validOptions`))
Example #13
0
def readDatasetMap(mappath, parse_extra_fields=False):
    """Read a dataset map.

    A dataset map is a text file, each line having the form:

    dataset_id | absolute_file_path | size [ | ``from_file`` =<path> [ | extra_field=extra_value ...]]

    where dataset_id has the form dataset_name[#version]

    Returns (if parse_extra_fields=False) a dataset map: a dictionary mapping (dataset_name, version) => [(path, size), (path, size), ...]
    If parse_extra_fields=True, returns a tuple (dataset_map, extra_dictionary). See parse_extra_fields.
      

    mappath
      Name of the dataset map.

    parse_extra_fields
      Boolean; if True then parse any extra fields of the form *extra_field=extra_value*, and return
      a dictionary with items of the form:

      extrafields[(dataset_name, version_number, absolute_file_path, *field_name*)] => field_value

      where *field_name* is one of:

      - ``from_file``
      - ``mod_time``

    """
    datasetMap = {}
    extraFieldMap = {}
    mapfile = open(mappath)
    for line in mapfile.readlines():
        if line[0] == '#' or line.strip() == '':
            continue

        if parse_extra_fields:
            fields = splitLine(line)
            versionName, path, size = fields[0:3]
            datasetName, versionno = parseDatasetVersionId(versionName)
            if len(fields) > 3:
                for field in fields[3:]:
                    efield, evalue = field.split('=')
                    extraFieldMap[(datasetName, versionno, path,
                                   efield.strip())] = evalue.strip()
            if datasetMap.has_key((datasetName, versionno)):
                datasetMap[(datasetName, versionno)].append((path, size))
            else:
                datasetMap[(datasetName, versionno)] = [(path, size)]
        else:
            datasetId, path, size = splitLine(line)[0:3]
            versionId = parseDatasetVersionId(datasetId)
            if datasetMap.has_key(versionId):
                datasetMap[versionId].append((path, size))
            else:
                datasetMap[versionId] = [(path, size)]

    mapfile.close()

    for value in datasetMap.values():
        value.sort()

    if parse_extra_fields:
        return (datasetMap, extraFieldMap)
    else:
        return datasetMap
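
A tiny in-memory sketch of the map format described in the docstring and the structure the reader builds from it. parseDatasetVersionId and splitLine are project helpers; they are replaced here by a '#' split and a '|' split, and note that the sizes are kept as strings.

sample = [
    "mymodel.historical.run1#1 | /data/run1/tas_2000.nc | 123456 | mod_time=1500000000.0",
    "mymodel.historical.run1#1 | /data/run1/tas_2001.nc | 123789 | mod_time=1500000100.0",
]

dataset_map = {}
for line in sample:
    dataset_id, path, size = [f.strip() for f in line.split('|')][0:3]
    name, version = dataset_id.split('#')
    dataset_map.setdefault((name, int(version)), []).append((path, size))

print(dataset_map[('mymodel.historical.run1', 1)])
# [('/data/run1/tas_2000.nc', '123456'), ('/data/run1/tas_2001.nc', '123789')]
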
Example #14
0
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP,
                       progressCallback=None, stopEvent=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None,
                       comment=None, useVersion=-1, forceRescan=False, nodbwrite=False, pid_connector=None, test_publication=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion=-1:
      Integer version number of the dataset version to modify. By default the latest version is modified.

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    pid_connector
        ESGF_PID_connector object to register PIDs

    test_publication
        Flag whether publication is for production or test

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)

        if not offline:
            if perVariable is None:
                perVariable = config.getboolean(section, 'variable_per_file', False)
            else:
                perVariable = False
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False

    exclude_variables = splitLine(config.get(section, 'thredds_exclude_variables', default=''), sep=',')

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType
    configOptions['exclude_variables'] = exclude_variables
    configOptions['perVariable'] = perVariable

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if (nodbwrite): 
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        info("dataset scan complete, not writing to database")
        return dset
       
    elif operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, useVersion=useVersion, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        if operation==REPLACE_OP:
            versionObj = dset.getVersionObj(-1)
        else:
            versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, useVersion=useVersion, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if useVersion == -1:
        if keepVersion:
            if existingVersion<=0:
                newVersion = getInitialDatasetVersion(versionByDate)
            else:
                newVersion = existingVersion
        elif newVersion is None:
            newVersion = getNextDatasetVersion(existingVersion, versionByDate)
    else:
        newVersion = useVersion

    dset.reaggregate = False

    if newVersion<existingVersion:
        versionList = dset.getVersionList()
        if newVersion in versionList:
            addNewVersion = False

    # Add a new version
    if addNewVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title

        # if project uses PIDs, generate PID for dataset
        dataset_pid = None
        if pid_connector:
            dataset_pid = pid_connector.make_handle_from_drsid_and_versionnumber(drs_id=datasetName, version_number=newVersion)
            info("Assigned PID to dataset %s.v%s: %s " % (datasetName, newVersion, dataset_pid))

        # if project uses citation, build citation url
        project_config_section = 'config:%s' %context.get('project')
        citation_url = handler.get_citation_url(project_config_section, config, datasetName, newVersion, test_publication)

        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes,
                                                  tech_notes_title=datasetTechNotesTitle, pid=dataset_pid, citation_url=citation_url)

        info("New dataset version = %d"%newDsetVersionObj.version)
        
        try:
            for var in dset.variables:
                session.delete(var)
        except IntegrityError as ie:
            debug("sqlalchemy IntegrityError: " + str(ie))
            raise ESGPublishError("Error in creating dataset version, did you already publish this version to the database?")
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
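
The version bookkeeping near the end of extractFromDataset is easy to lose in the surrounding database code. A stand-alone restatement of just that branch, assuming plain integer versions (the real code can also derive date-based versions through getInitialDatasetVersion and getNextDatasetVersion):

def resolve_version(existing, keep_version=False, new_version=None, use_version=-1):
    # Mirrors the keepVersion / newVersion / useVersion branch above.
    if use_version != -1:
        return use_version
    if keep_version:
        return existing if existing > 0 else 1
    if new_version is None:
        return existing + 1
    return new_version

print(resolve_version(0))                      # 1: first published version
print(resolve_version(3))                      # 4: normal increment
print(resolve_version(3, keep_version=True))   # 3: republish in place
print(resolve_version(3, use_version=2))       # 2: modify an older version
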
Example #15
0
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    else:
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList,
                                               filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList,
                                                      filefilt,
                                                      datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(
                directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath,
                                            filefilt=filefilt,
                                            followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (
                            csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (appendMap is
                            None) or (not appendMap.has_key(datasetId)) or (
                                (filepath, "%d" % size)
                                not in appendMap[datasetId]):
                        print >> output, "%s | %s | %d | %s" % (
                            datasetId, filepath, size, extraStuff)
    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
        else:
            raise ESGPublishError(
                "Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName,
                                         service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister,
                commandArgs,
                handler,
                filefilt=filefilt,
                datasetName=datasetName,
                offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or (
                (filepath, "%d" % size) not in appendMap[dsetName]):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size,
                                                      extrastuff)

    if output is not sys.stdout:
        output.close()
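
When appending to an existing map, the scan above only emits a line for files not already recorded. The skip condition, restated on its own:

def should_emit(append_map, dataset_id, filepath, size):
    # True when the map is new, the dataset is new, or the file is new.
    if append_map is None:
        return True
    entries = append_map.get(dataset_id)
    if entries is None:
        return True
    return (filepath, "%d" % size) not in entries

existing = {'mymodel.historical.run1': [('/data/run1/tas_2000.nc', '123456')]}
print(should_emit(existing, 'mymodel.historical.run1', '/data/run1/tas_2000.nc', 123456))  # False
print(should_emit(existing, 'mymodel.historical.run1', '/data/run1/tas_2001.nc', 123789))  # True
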
Example #16
0
    def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters
        
        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:'+self.name
            dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
            idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1]=='/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$', drsversion[0]): # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s'%drsversion
                    except:
                        allfields = reduce(lambda x, y: set(x) | set(y), idfields)
                        missingFields = list((set(allfields)-set(groupdict.keys()))-set(config.options(section)))
                        raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"%(`missingFields`, nodepath))
                else:
                    warning("Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini")
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if len(datasetMap) == 0:
            warning("Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini")
        return datasetMap
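
The use_version handling above strips a leading 'v' from DRS-style version directories before appending the version to the dataset id. Restated on its own:

import re

def append_drs_version(dataset_id, drsversion):
    # A DRS version such as 'v20120101' drops the leading 'v' (e.g. vYYYYMMDD).
    if not re.match('^[0-9]+$', drsversion[0]):
        drsversion = drsversion[1:]
    return dataset_id + '#%s' % drsversion

print(append_drs_version('cmip5.MyModel.historical', 'v20120101'))   # cmip5.MyModel.historical#20120101
print(append_drs_version('cmip5.MyModel.historical', '20120101'))    # cmip5.MyModel.historical#20120101
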
Example #17
0
    def generateDirectoryMap(self,
                             directoryList,
                             filefilt,
                             initContext=None,
                             datasetName=None,
                             use_version=False):
        """Generate a directory map. Recursively scan each directory in *directoryList*,
        locating each directory with at least one file matching filefilt.

        Returns a directory map (dictionary) mapping
        dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]
        where the dataset_id is generated by matching the 'directory_format' configuration option to
        each directory path. The map has one entry per directory, where it is assumed that
        all files in the directory belong to the same dataset.

        directoryList
          List of directories to scan. The scan searches for directories matching the 'directory_format'
          configuration file option for this project, and having at least one file matching *filefilt*.

        filefilt
          Regular expression as defined by the Python **re** module. Matched against the file basename.

        initContext
          Dictionary of field => value items. Entries override values found from matching the directory paths.

        datasetName
          Name of the dataset. If not specified, generate with ``generateDatasetId()``.
        """
        from esgcet.publish import nodeIterator

        # If the dataset name is specified, no need to get directory format filters

        if datasetName is None:
            # Get the dataset_id and filters
            filters = self.getFilters()
            config = getConfig()
            section = 'project:' + self.name
            dataset_id_formats = splitLine(
                config.get(section, 'dataset_id', raw=True))
            idfields = [
                re.findall(_patpat, format) for format in dataset_id_formats
            ]
        else:
            filters = [r'.*$']

        # Iterate over nodes
        mapdict = self.getMaps()
        datasetMap = {}
        for direc in directoryList:
            if direc[-1] == '/':
                direc = direc[:-1]
            nodeiter = nodeIterator(direc, filters, filefilt)
            for nodepath, filepath, groupdict in nodeiter:
                if initContext is not None:
                    groupdict.update(initContext)
                if not groupdict.has_key('project'):
                    groupdict['project'] = self.name
                if datasetName is None:
                    try:
                        datasetId = self.generateDatasetId(
                            'dataset_id',
                            idfields,
                            groupdict,
                            multiformat=dataset_id_formats)
                        if use_version and 'version' in groupdict:
                            drsversion = groupdict['version']
                            if not re.match('^[0-9]+$',
                                            drsversion[0]):  # e.g. vYYYYMMDD
                                drsversion = drsversion[1:]
                            datasetId += '#%s' % drsversion
                    except:
                        allfields = reduce(lambda x, y: set(x) | set(y),
                                           idfields)
                        missingFields = list((set(allfields) -
                                              set(groupdict.keys())) -
                                             set(config.options(section)))
                        raise ESGPublishError(
                            "Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s"
                            % (`missingFields`, nodepath))
                else:
                    warning(
                        "Empty dataset name.  Check that directory hierarchy format matches the configured format string in esg.ini"
                    )
                    datasetId = datasetName
                if datasetMap.has_key(datasetId):
                    datasetMap[datasetId].append((nodepath, filepath))
                else:
                    datasetMap[datasetId] = [(nodepath, filepath)]

        if (len(datasetMap) == 0):
            warning(
                "Empty datasetMap.  Check that directory hierarchy format matches the configured format string in esg.ini"
            )
        return datasetMap
Example #18
0
    def validateContext(self, context):
        """
        Validate context values:

        - Mandatory values must be non-blank and, if enumerated, must be one of the valid values
        - Non-mandatory enumerated values must be either blank or one of the valid values

        Raises ESGPublishError if a validation error occurs

        If the validate configuration option is set to False in the project section,
        validation always succeeds.
        """
        if not self.validate:
            return

        for key in context.keys():
            fieldType = self.getFieldType(key)

            # Ignore non-configured fields
            if fieldType is None:
                continue

            isenum = (fieldType == ENUM)
            if isenum:
                options = self.getFieldOptions(key)
            value = context[key]

            config = getConfig()

            project_section = 'project:%s' % self.name
            delimiter = config.get(project_section,
                                   key + "_delimiter",
                                   default="")

            if value in ['', None]:
                # if value not in default context, try to get it from key_pattern or *_map
                option = '%s_pattern' % key
                if config.has_option(project_section, option):
                    value = config.get(project_section, option, False, context)
                    context[key] = value
                elif config.has_option(project_section, 'maps'):
                    for map_option in splitLine(
                            config.get(project_section, 'maps', default=''),
                            ','):
                        from_keys, to_keys, value_dict = splitMap(
                            config.get(project_section, map_option))
                        if key in to_keys:
                            from_values = tuple(context[k] for k in from_keys)
                            to_values = value_dict[from_values]
                            value = to_values[to_keys.index(key)]
                            context[key] = value

            if self.isMandatory(key):
                if value in ['', None]:
                    if isenum:
                        raise ESGInvalidMandatoryField(
                            "Mandatory field '%s' not set, must be one of %s" %
                            (key, `options`))
                    else:
                        raise ESGInvalidMandatoryField(
                            "Mandatory field '%s' not set" % key)
                elif isenum and not self.compareEnumeratedValue(
                        value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGInvalidMandatoryField(
                        "Invalid value of mandatory field '%s': %s, must be one of %s"
                        % (key, value, `validOptions`))
            elif isenum:  # non-mandatory field
                options += ['', None]
                if not self.compareEnumeratedValue(value, options, delimiter):
                    validOptions = self.mapValidFieldOptions(key, options)
                    raise ESGPublishError(
                        "Invalid value of '%s': %s, must be one of %s" %
                        (key, value, `validOptions`))
Example #19
0
    def initializeFields(self, Session):
        """Initialize field names and options based on the configuration file."""
        from esgcet.model import Model, Experiment
        config = getConfig()
        projectSection = 'project:' + self.name
        categoryOption = config.get(projectSection, 'categories')
        categorySpecs = splitRecord(categoryOption)
        for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs:
            categoryType = getCategoryType(categoryTypeS)
            isMandatory = getBoolean(isMandatoryS)
            isThreddsProperty = getBoolean(isThreddsPropertyS)
            displayOrder = string.atoi(displayOrderS)
            self.fieldNames[category] = (categoryType, isMandatory,
                                         isThreddsProperty, displayOrder)

        categoryDefaultsOption = config.get(projectSection,
                                            'category_defaults',
                                            default=None,
                                            raw=True)
        if categoryDefaultsOption is not None:
            categoryDefaultsSpecs = splitRecord(categoryDefaultsOption)
            for category, categoryDefault in categoryDefaultsSpecs:
                self.categoryDefaults[category] = categoryDefault

        session = Session()

        # Find any new experiments. This allows experiments to be added to the config file without
        # running esginitialize.
        if self.fieldNames.has_key('experiment') and self.fieldNames[
                'experiment'][WIDGET_TYPE] == ENUM:
            initializeExperiments(config, self.name, session)

        for category in self.getFieldNames():
            # At the moment some fields are predefined
            if category == "project":
                projects = splitRecord(
                    config.get(projectSection, 'project_options', default=''))
                self.validValues['project'] = [x[0] for x in projects]
            elif category == "model":
                models = session.query(Model).filter_by(
                    project=self.name).all()
                self.validValues['model'] = [x.name for x in models]
            elif category == "experiment":
                experiments = session.query(Experiment).filter_by(
                    project=self.name).all()
                self.validValues['experiment'] = [x.name for x in experiments]
            elif category == "creator":
                creators = splitRecord(
                    config.get(projectSection, 'creator_options', default=''))
                self.validValues['creator'] = [x[0] for x in creators]
                self.validMaps['creator'] = genMap(creators)
            elif category == "publisher":
                publishers = splitRecord(
                    config.get(projectSection, 'publisher_options',
                               default=''))
                self.validValues['publisher'] = [x[0] for x in publishers]
                self.validMaps['publisher'] = genMap(publishers)
            else:
                categoryType = self.getFieldType(category)
                if categoryType == ENUM:
                    option = category + "_options"
                    self.validValues[category] = splitLine(
                        config.get(projectSection, option), ',')

            self.context[category] = ''

        session.close()
Example #20
0
def readDatasetMap(mappath, parse_extra_fields=False):
    """Read a dataset map.

    A dataset map is a text file, each line having the form:

    dataset_id | absolute_file_path | size [ | ``from_file`` =<path> [ | extra_field=extra_value ...]]

    where dataset_id has the form dataset_name[#version]

    Returns (if parse_extra_fields=False) a dataset map: a dictionary mapping (dataset_name, version) => [(path, size), (path, size), ...]
    If parse_extra_fields=True, returns a tuple (dataset_map, extra_dictionary). See parse_extra_fields.
      

    mappath
      Name of the dataset map.

    parse_extra_fields
      Boolean; if True then parse any extra fields of the form *extra_field=extra_value*, and return
      a dictionary with items of the form:

      extrafields[(dataset_name, version_number, absolute_file_path, *field_name*)] => field_value

      where *field_name* is one of:

      - ``from_file``
      - ``mod_time``

    """
    datasetMap = {}
    extraFieldMap = {}
    mapfile = open(mappath)
    for line in mapfile.readlines():
        if line[0]=='#' or line.strip()=='':
            continue

        if parse_extra_fields:
            fields = splitLine(line)
            versionName, path, size = fields[0:3]
            datasetName,versionno = parseDatasetVersionId(versionName)
            if len(fields)>3:
                for field in fields[3:]:
                    efield, evalue = field.split('=')
                    extraFieldMap[(datasetName, versionno, path, efield.strip())] = evalue.strip()
            if datasetMap.has_key((datasetName, versionno)):
                datasetMap[(datasetName, versionno)].append((path, size))
            else:
                datasetMap[(datasetName, versionno)] = [(path, size)]
        else:
            datasetId, path, size = splitLine(line)[0:3]
            versionId = parseDatasetVersionId(datasetId)
            if datasetMap.has_key(versionId):
                datasetMap[versionId].append((path, size))
            else:
                datasetMap[versionId] = [(path, size)]

    mapfile.close()

    for value in datasetMap.values():
        value.sort()
        
    if parse_extra_fields:
        return (datasetMap, extraFieldMap)
    else:
        return datasetMap
Example #21
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:' + self.name
        datasetIdFormatList = config.get(section,
                                         'dataset_id',
                                         raw=True,
                                         default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\.', idFormat.strip())

            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.) .
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions(
                'experiment', experimentOptions)
            if idFormat.find(
                    '%(experiment)s') != -1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x, y: x + '|' + y,
                                      experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)' % optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)',
                                              experimentPattern)

            if newinit[-1] != '$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value != context[key]:
                    warning("Dataset ID=%s, but %s=%s" %
                            (datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning(
                "Dataset ID: %s does not match the dataset_id format(s): %s" %
                (datasetName, `datasetIdFormats`))

        return context
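
A sketch of the format-to-regex conversion parseDatasetName performs, assuming _patpat matches %(name)s placeholders; the enumerated-experiment special case is omitted and the id format is invented.

import re

_patpat = r'%\(([^()]*)\)s'   # assumed placeholder pattern

id_format = '%(project)s.%(model)s.%(experiment)s.%(realm)s'

# '.' separators become literal and each placeholder becomes a named group
# that cannot itself contain a '.'.
pattern = re.sub(r'\.', r'\.', id_format.strip())
pattern = re.sub(_patpat, r'(?P<\1>[^.]*)', pattern)
if pattern[-1] != '$':
    pattern += '$'

m = re.match(pattern, 'cmip5.MyModel.historical.atmos')
print(m.groupdict())
# {'project': 'cmip5', 'model': 'MyModel', 'experiment': 'historical', 'realm': 'atmos'}
# (key order may vary)
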
Example #22
0
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError("No directory specified")

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, "a")
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", ".*\.nc$")
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, "w")
    else:
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get("DEFAULT", "checksum", default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
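                    # A resulting map line might look like (illustrative values only):
                    #   <datasetId> | /path/to/file.nc | 1048576 | mod_time=1399412345.000000 | checksum=... | checksum_type=MD5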
                    if (
                        (appendMap is None)
                        or (not appendMap.has_key(datasetId))
                        or ((filepath, "%d" % size) not in appendMap[datasetId])
                    ):
                        print >> output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)
    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, "offline_lister_executable")
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
            offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
        ):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (
                (appendMap is None)
                or (not appendMap.has_key(dsetName))
                or ((filepath, "%d" % size) not in appendMap[dsetName])
            ):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
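
A minimal usage sketch for the wrapper above; the directory, project name, and output path are placeholder values, and only keyword arguments that the function itself reads are shown.

    # Hypothetical call: paths and project name are placeholders.
    esgscanWrapper(
        ['/data/scans/my_project'],       # directoryList
        projectName='my_project',
        fileFilt=r'.*\.nc$',              # regex applied to candidate file names
        outputPath='my_project.map',      # dataset map is written here instead of stdout
        readFiles=False,                  # walk directories rather than individual files
    )
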
Example #23
0
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
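            # Dataset ids coming from version directories are expected to carry a
            # '#<version>' suffix, e.g. 'my.dataset.id#20140101' (illustrative).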
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)
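                # Each entry is expected to be a (filepath, checksum) pair; a falsy
                # checksum value marks a failed calculation and skips the dataset below.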

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry only if checksums exist for all files of the
                # dataset (when checksumming is enabled), and additionally:
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ((appendMap is None) or (not appendMap.has_key(ds_id)) or ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
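
For reference, each online map-file line written above is pipe-delimited. A minimal sketch of reading one back; the sample line and its field values are made up.

    # Illustrative only: the sample line is hypothetical.
    line = "my.dataset.id#20140101 | /data/file_1.nc | 1048576 | mod_time=1399412345.000000 | checksum=deadbeef | checksum_type=MD5"
    fields = [f.strip() for f in line.split('|')]
    dataset_id, path, size = fields[0], fields[1], int(fields[2])
    extras = dict(f.split('=', 1) for f in fields[3:])
    print("%s %s %d %s" % (dataset_id, path, size, extras.get('checksum_type')))
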
Example #24
0
    def parseDatasetName(self, datasetName, context):
        """Parse a dataset name.

        Returns a dictionary, mapping field => value. The config file option 'dataset_id'
        is used to parse the name into fields.

        datasetName
          String dataset identifier.

        context
          Initial context dictionary. This argument is altered on output.

        """
        config = getConfig()
        section = 'project:'+self.name
        datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None)
        if datasetIdFormatList is None:
            # warning("No dataset_id option found for project %s"%self.name)
            return context
        datasetIdFormats = splitLine(datasetIdFormatList)

        formatMatched = False
        for idFormat in datasetIdFormats:

            # '.' => '\.'
            newinit = re.sub(r'\.', r'\\.', idFormat.strip())

            # %(name)s => (?P<name>[^.]*)
            newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

            # If experiment is enumerated, match on the experiment options. This allows
            # experiment ids to contain periods (.).
            experimentOptions = self.getFieldOptions('experiment')

            # Map to case-sensitive options
            experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions)
            if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None:
                if len(experimentOptions) > 0:
                    optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions)
                    experimentPattern = r'(?P<experiment>%s)'%optionOr
                    newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern)
            
            if newinit[-1]!='$':
                newinit += '$'

            match = re.match(newinit, datasetName)

            if match is None:
                continue
            else:
                result = match.groupdict()
                formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value!=context[key]:
                    warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

        if not formatMatched:
            warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`))

        return context
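
A standalone sketch of the dataset_id-to-regex conversion performed above; the _patpat pattern, the format string, and the dataset name are illustrative assumptions rather than values taken from a real configuration.

    import re

    _patpat = r'%\(([^()]*)\)s'    # assumed: matches %(name)s tokens
    id_format = '%(project)s.%(model)s.%(experiment)s.%(run_name)s'

    pattern = re.sub(r'\.', r'\\.', id_format)            # '.'      => '\.'
    pattern = re.sub(_patpat, r'(?P<\1>[^.]*)', pattern)  # %(name)s => (?P<name>[^.]*)
    pattern += '$'

    match = re.match(pattern, 'myproj.my_model.historical.run1')
    if match is not None:
        print(match.groupdict())
        # {'project': 'myproj', 'model': 'my_model', 'experiment': 'historical', 'run_name': 'run1'}
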
Example #25
0
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1, forceRescan=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.

    fileIterator
      An iterator that yields (file_path, file_size) pairs, where file_size is an integer.

    dbSession
      A database Session.

    handler
      Project handler

    cfHandler  
      A CF handler instance

    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.

    offline
      Boolean, True if the files are offline, cannot be scanned.

    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.

    useVersion
      Integer version number of the dataset version to modify. By default (-1) the latest version is modified.

    forceRescan
      Boolean, if True force all files to be rescanned on an update.

    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset.

    """

    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s'%context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None

        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None

        versionByDate = config.getboolean(section, 'version_by_date', default=False)
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False
        
    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation==CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s"%datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset."%dset.name)

    createTime = datetime.datetime.now() # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if operation==CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s"%datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context)
        
    elif operation in [UPDATE_OP, REPLACE_OP]:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, **context)
         
    elif operation==RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
         
    elif operation==DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s"%`operation`)

    # Create a new dataset version if necessary
    if keepVersion:
        if existingVersion<=0:
            newVersion = getInitialDatasetVersion(versionByDate)
        else:
            newVersion = existingVersion
    elif newVersion is None:
        newVersion = getNextDatasetVersion(existingVersion, versionByDate)
        
    dset.reaggregate = False
    # Add a new version
    if addNewVersion and newVersion>existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title
        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle)
        info("New dataset version = %d"%newDsetVersionObj.version)
        for var in dset.variables:
            session.delete(var)
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    # Keep the current (latest) version
    elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d"%versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True
    elif masterGateway is not None:     # Force version set on replication
        info("Dataset version = %d"%newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
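
A hedged sketch of how extractFromDataset might be called, based only on the parameter descriptions above; Session, handler, and cfHandler are assumed to come from the publisher's own initialization code, and the file list and context values are placeholders.

    # Illustrative only: Session, handler and cfHandler are assumed to be set up
    # elsewhere by the publisher's initialization code; paths are placeholders.
    files = [('/data/my_project/file_1.nc', 1048576),
             ('/data/my_project/file_2.nc', 2097152)]

    dset = extractFromDataset(
        'my.dataset.id',
        iter(files),                    # iterator of (file_path, file_size) pairs
        Session,                        # database Session factory
        handler,                        # project handler
        cfHandler,                      # CF handler instance
        aggregateDimensionName='time',
        operation=CREATE_OP,
        project='my_project',           # context fields that identify the dataset
        model='my_model',
        experiment='my_experiment',
    )
    print(dset.name)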