Example #1
0
    def checksum(self, checksum_type):
        """
        Delegates the checksum computation to the shell so that Python
        memory limits are not hit on large files.

        :param str checksum_type: Checksum type
        :returns: The checksum
        :rtype: *str*
        :raises Error: If the checksum fails

        """
        # Module-level helper shells out instead of reading the file in Python
        digest = checksum(self.ffp, checksum_type)
        return digest
Example #2
0
    def check_uniqueness(self, checksum_type):
        """
        Checks tree upgrade uniqueness. Each data version to upgrade has to be
        strictly different from the latest version if one exists.

        :param str checksum_type: Checksum type
        :raises DuplicatedDataset: If the upgrade version duplicates the latest one

        """
        for latest_dset, entry in self.hash.items():
            latest_path = os.path.dirname(latest_dset)
            latest_version = os.path.basename(latest_dset)
            # Collect checksums of the upgrade version leaves, keyed by filename
            upgrade = entry['upgrade'] = dict()
            for leaf in self.leaves(root=os.path.join(latest_path, DRSPath.TREE_VERSION)):
                origin = leaf.data.origin
                upgrade[os.path.basename(origin)] = checksum(origin, checksum_type)
            # Identical filename->checksum maps mean the upgrade duplicates the latest version
            if entry['latest'] == upgrade:
                raise DuplicatedDataset(latest_path, latest_version)
Example #3
0
def process(source):
    """
    process(source)

    File process that:

     * Handles files,
     * Deduces facet key, values pairs from file attributes
     * Checks facet values against CV,
     * Applies the versioning
     * Populates the DRS tree creating the appropriate leaves,
     * Stores dataset statistics.

    :param str source: The file full path to process
    :returns: The populated file handler on success, None if the file is skipped or fails
    :rtype: *File* or *None*

    """
    # Get process content from process global env (set up by the pool initializer)
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(source)
        # Ignore files from incoming
        if fh.filename in pctx.ignore_from_incoming:
            msg = TAGS.SKIP + COLORS.HEADER(source)
            # Lock guards the shared print buffer across worker processes/threads
            with pctx.lock:
                Print.exception(msg, buffer=True)
            return None
        # Loads attributes from filename, netCDF attributes, command-line
        fh.load_attributes(root=pctx.root,
                           pattern=pctx.pattern,
                           set_values=pctx.set_values)
        # Checks the facet values provided by the loaded attributes
        fh.check_facets(facets=pctx.facets,
                        config=pctx.cfg,
                        set_keys=pctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(pctx.facets)
        # Instantiate file DRS path handler
        fh.drs = DRSPath(parts)
        # Ensure that the called project section is ALWAYS part of the DRS path elements (case insensitive)
        if not fh.drs.path().lower().startswith(pctx.project.lower()):
            raise InconsistentDRSPath(pctx.project, fh.drs.path())
        # Compute file checksum (only needed for duplicate detection against an existing latest version)
        if fh.drs.v_latest and not pctx.no_checksum:
            fh.checksum = checksum(fh.ffp, pctx.checksum_type)
        # Get file tracking id
        fh.tracking_id = get_tracking_id(fh.ffp, pctx.project)
        if fh.drs.v_latest:
            latest_file = os.path.join(fh.drs.path(latest=True, root=True),
                                       fh.filename)
            # Compute checksum of latest file version if exists
            if os.path.exists(latest_file) and not pctx.no_checksum:
                fh.latest_checksum = checksum(latest_file, pctx.checksum_type)
            # Get tracking_id of latest file version if exists
            if os.path.exists(latest_file):
                fh.latest_tracking_id = get_tracking_id(
                    latest_file, pctx.project)
        msg = TAGS.SUCCESS + 'Processing {}'.format(COLORS.HEADER(fh.ffp))
        Print.info(msg)
        return fh
    except KeyboardInterrupt:
        # Let Ctrl-C propagate so the pool can shut down
        raise
    except Exception:
        # Any other failure is logged and the file is skipped, not fatal
        exc = traceback.format_exc().splitlines()
        msg = TAGS.SKIP + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return None
    finally:
        # Always advance the shared progress counter, success or failure
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rScanning incoming file(s): ')
            msg += '{}% | {}/{} file(s)'.format(percentage,
                                                pctx.progress.value,
                                                pctx.nbsources)
            Print.progress(msg)
Example #4
0
def tree_builder(fh):
    """
    Builds the DRS tree according to a source.

    :param esgprep.drs.handler.File fh: The file handler object
    :returns: True on success, None if the build failed
    :rtype: *boolean* or *None*

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    try:
        # If a latest version already exists make some checks FIRST to stop file process
        if fh.drs.v_latest:
            # Latest version should be older than upgrade version
            if int(DRSPath.TREE_VERSION[1:]) <= int(fh.drs.v_latest[1:]):
                raise OlderUpgrade(DRSPath.TREE_VERSION, fh.drs.v_latest)
            # Walk through the latest dataset version to check its uniqueness with file checksums
            if not pctx.no_checksum:
                dset_nid = fh.drs.path(f_part=False, latest=True, root=True)
                if dset_nid not in tree.hash.keys():
                    tree.hash[dset_nid] = dict()
                    tree.hash[dset_nid]['latest'] = dict()
                    for root, _, filenames in os.walk(
                            fh.drs.path(f_part=False, latest=True, root=True)):
                        for filename in filenames:
                            tree.hash[dset_nid]['latest'][filename] = checksum(
                                os.path.join(root, filename),
                                pctx.checksum_type)
            # Pickup the latest file version
            latest_file = os.path.join(fh.drs.path(latest=True, root=True),
                                       fh.filename)
            # Check latest file if exists
            if os.path.exists(latest_file):
                if not pctx.no_checksum:
                    # If checksumming disabled duplicated files cannot be detected
                    # In this case, incoming files are assumed to be different in any cases
                    # Duplicated files should not exist.
                    # Check if processed file is a duplicate in comparison with latest version
                    if fh.latest_checksum == fh.checksum:
                        fh.is_duplicate = True
                if not fh.is_duplicate:
                    # If files are different check that PID/tracking_id is different from latest version
                    if fh.latest_tracking_id == fh.tracking_id:
                        raise UnchangedTrackingID(latest_file,
                                                  fh.latest_tracking_id,
                                                  fh.ffp, fh.tracking_id)

        # Start the tree generation
        if not fh.is_duplicate:
            # Add the processed file to the "vYYYYMMDD" node
            src = ['..'] * len(fh.drs.items(d_part=False))
            src.extend(fh.drs.items(d_part=False, file_folder=True))
            src.append(fh.filename)
            tree.create_leaf(nodes=fh.drs.items(root=True),
                             leaf=fh.filename,
                             label='{}{}{}'.format(fh.filename, LINK_SEPARATOR,
                                                   os.path.join(*src)),
                             src=os.path.join(*src),
                             mode='symlink',
                             origin=fh.ffp,
                             force=True)
            # Add the "latest" node for symlink
            tree.create_leaf(nodes=fh.drs.items(f_part=False,
                                                version=False,
                                                root=True),
                             leaf='latest',
                             label='{}{}{}'.format('latest', LINK_SEPARATOR,
                                                   fh.drs.v_upgrade),
                             src=fh.drs.v_upgrade,
                             mode='symlink')
            # Add the processed file to the "files" node
            tree.create_leaf(nodes=fh.drs.items(file_folder=True, root=True),
                             leaf=fh.filename,
                             label=fh.filename,
                             src=fh.ffp,
                             mode=pctx.mode)
            if fh.drs.v_latest and pctx.upgrade_from_latest:
                # Walk through the latest dataset version and create a symlink for each file with a different
                # filename than the processed one
                for root, _, filenames in os.walk(
                        fh.drs.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        # Add latest files as tree leaves with version to upgrade instead of latest version
                        # i.e., copy latest dataset leaves to Tree
                        # Except if file has be ignored from latest version (i.e., with known issue)
                        # Except if file leaf has already been created to avoid overwriting new version
                        # leaf will be not create if already exists
                        if filename != fh.filename and filename not in pctx.ignore_from_latest:
                            src = os.path.join(root, filename)
                            tree.create_leaf(nodes=fh.drs.items(root=True),
                                             leaf=filename,
                                             label='{}{}{}'.format(
                                                 filename, LINK_SEPARATOR,
                                                 os.readlink(src)),
                                             src=os.readlink(src),
                                             mode='symlink',
                                             origin=os.path.realpath(src))

        else:
            # Pickup the latest file version
            latest_file = os.path.join(fh.drs.path(latest=True, root=True),
                                       fh.filename)
            if pctx.upgrade_from_latest:
                # If upgrade from latest is activated, raise the error, no duplicated files allowed
                # Because incoming must only contain modified/corrected files
                raise DuplicatedFile(latest_file, fh.ffp)
            else:
                # If default behavior, the incoming contains all data for a new version
                # In the case of a duplicated file, just pass to the expected symlink creation
                # and records duplicated file for further removal only if migration mode is the
                # default (i.e., moving files). In the case of --copy or --link, keep duplicates
                # in place into the incoming directory
                src = os.readlink(latest_file)
                tree.create_leaf(nodes=fh.drs.items(root=True),
                                 leaf=fh.filename,
                                 label='{}{}{}'.format(fh.filename,
                                                       LINK_SEPARATOR, src),
                                 src=src,
                                 mode='symlink',
                                 origin=fh.ffp)
                if pctx.mode == 'move':
                    tree.duplicates.append(fh.ffp)
        # Record entry for list()
        record = {
            'src': fh.ffp,
            'dst': fh.drs.path(root=True),
            'filename': fh.filename,
            'latest': fh.drs.v_latest or 'Initial',
            'size': fh.size
        }
        if fh.drs.path(f_part=False) in tree.paths.keys():
            tree.paths[fh.drs.path(f_part=False)].append(record)
        else:
            tree.paths[fh.drs.path(f_part=False)] = [record]
        msg = TAGS.SUCCESS + 'DRS Path = {}'.format(
            COLORS.HEADER(fh.drs.path(f_part=False)))
        msg += ' <-- ' + fh.filename
        Print.info(msg)
        return True
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.FAIL + 'Build {}'.format(COLORS.HEADER(
            fh.drs.path())) + '\n'
        msg += '\n'.join(exc)
        Print.exception(msg, buffer=True)
        return None
    finally:
        # Always advance the progress counter, success or failure
        pctx.progress.value += 1
        percentage = int(pctx.progress.value * 100 / pctx.nbsources)
        msg = COLORS.OKBLUE('\rBuilding DRS tree: ')
        msg += '{}% | {}/{} file(s)'.format(percentage, pctx.progress.value,
                                            pctx.nbsources)
        Print.progress(msg)
Example #5
0
def process(collector_input):
    """
    process(collector_input)

    File process that:

     * Handles files,
     * Deduces facet key, values pairs from file attributes
     * Checks facet values against CV,
     * Applies the versioning
     * Populates the DRS tree creating the appropriate leaves,
     * Stores dataset statistics.

    :param tuple collector_input: A tuple with the file path and the processing context
    :return: True on success
    :rtype: *boolean*

    """
    # Deserialize inputs from collector
    ffp, ctx = collector_input
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp)
        # Loads attributes from filename, netCDF attributes, command-line
        fh.load_attributes(root=ctx.root,
                           pattern=ctx.pattern,
                           set_values=ctx.set_values)
        # Apply proper case to each attribute
        for key in fh.attributes:
            # Try to get the appropriate facet case for "category_default"
            try:
                fh.attributes[key] = ctx.cfg.get_options_from_pairs('category_defaults', key)
            except NoConfigKey:
                # If not specified keep facet case from local path, do nothing
                pass
        fh.check_facets(facets=ctx.facets,
                        config=ctx.cfg,
                        set_keys=ctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(ctx.facets)
        # Instantiate file DRS path handler
        fph = DRSPath(parts)
        # If a latest version already exists make some checks FIRST to stop files to not process
        if fph.v_latest:
            # Latest version should be older than upgrade version
            if int(DRSPath.TREE_VERSION[1:]) <= int(fph.v_latest[1:]):
                raise OlderUpgrade(DRSPath.TREE_VERSION, fph.v_latest)
            # Walk through the latest dataset version to check its uniqueness with file checksums
            dset_nid = fph.path(f_part=False, latest=True, root=True)
            if dset_nid not in ctx.tree.hash.keys():
                ctx.tree.hash[dset_nid] = dict()
                ctx.tree.hash[dset_nid]['latest'] = dict()
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        ctx.tree.hash[dset_nid]['latest'][filename] = checksum(os.path.join(root, filename),
                                                                               ctx.checksum_type)
            # Pickup the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            # Check latest file if exists
            if os.path.exists(latest_file):
                latest_checksum = checksum(latest_file, ctx.checksum_type)
                current_checksum = checksum(fh.ffp, ctx.checksum_type)
                # Check if processed file is a duplicate in comparison with latest version
                if latest_checksum == current_checksum:
                    fh.is_duplicate = True
        # Start the tree generation
        if not fh.is_duplicate:
            # Add the processed file to the "vYYYYMMDD" node
            src = ['..'] * len(fph.items(d_part=False))
            src.extend(fph.items(d_part=False, file_folder=True))
            src.append(fh.filename)
            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                 leaf=fh.filename,
                                 label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, os.path.join(*src)),
                                 src=os.path.join(*src),
                                 mode='symlink',
                                 origin=fh.ffp)
            # Add the "latest" node for symlink
            ctx.tree.create_leaf(nodes=fph.items(f_part=False, version=False, root=True),
                                 leaf='latest',
                                 label='{}{}{}'.format('latest', LINK_SEPARATOR, fph.v_upgrade),
                                 src=fph.v_upgrade,
                                 mode='symlink')
            # Add the processed file to the "files" node
            ctx.tree.create_leaf(nodes=fph.items(file_folder=True, root=True),
                                 leaf=fh.filename,
                                 label=fh.filename,
                                 src=fh.ffp,
                                 mode=ctx.mode)
            if ctx.upgrade_from_latest:
                # Walk through the latest dataset version and create a symlink for each file with a different
                # filename than the processed one
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        # Add latest files as tree leaves with version to upgrade instead of latest version
                        # i.e., copy latest dataset leaves to Tree
                        if filename != fh.filename:
                            src = os.path.join(root, filename)
                            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                                 leaf=filename,
                                                 label='{}{}{}'.format(filename, LINK_SEPARATOR, os.readlink(src)),
                                                 src=os.readlink(src),
                                                 mode='symlink',
                                                 origin=os.path.realpath(src))
        else:
            # Pickup the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            if ctx.upgrade_from_latest:
                # If upgrade from latest is activated, raise the error, no duplicated files allowed
                # Because incoming must only contain modified/corrected files
                raise DuplicatedFile(latest_file, fh.ffp)
            else:
                # If default behavior, the incoming contains all data for a new version
                # In the case of a duplicated file, just pass to the expected symlink creation
                # and records duplicated file for further removal only if migration mode is the
                # default (i.e., moving files). In the case of --copy or --link, keep duplicates
                # in place into the incoming directory
                src = os.readlink(latest_file)
                ctx.tree.create_leaf(nodes=fph.items(root=True),
                                     leaf=fh.filename,
                                     label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, src),
                                     src=src,
                                     mode='symlink',
                                     origin=fh.ffp)
                if ctx.mode == 'move':
                    ctx.tree.duplicates.append(fh.ffp)
        # Record entry for list()
        incoming = {'src': fh.ffp,
                    'dst': fph.path(root=True),
                    'filename': fh.filename,
                    'latest': fph.v_latest or 'Initial',
                    'size': fh.size}
        if fph.path(f_part=False) in ctx.tree.paths.keys():
            ctx.tree.paths[fph.path(f_part=False)].append(incoming)
        else:
            ctx.tree.paths[fph.path(f_part=False)] = [incoming]
        logging.info('{} <-- {}'.format(fph.path(f_part=False), fh.filename))
        return True
    except Exception as e:
        # NOTE: e.message is deprecated (PEP 352) and absent in Python 3;
        # format the exception object itself instead
        logging.error('{} skipped\n{}: {}'.format(ffp, e.__class__.__name__, e))
        return None
    finally:
        # Always advance the progress bar, success or failure
        ctx.pbar.update()