def checksum(self, checksum_type):
    """
    Computes the file checksum through the shell, avoiding Python memory limits.

    :param str checksum_type: Checksum type
    :returns: The checksum
    :rtype: *str*
    :raises Error: If the checksum fails

    """
    return checksum(self.ffp, checksum_type)
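
# Why the checksum is delegated to the shell: hashing multi-gigabyte netCDF files in
# Python requires chunked reads through hashlib, whereas piping the file through the
# system tool keeps memory usage flat. A minimal sketch of the idea, assuming a POSIX
# system providing sha256sum/md5sum (this helper is illustrative only, not the
# project's actual `checksum` implementation):
import subprocess

def _shell_checksum_sketch(ffp, checksum_type='sha256'):
    """Compute a file checksum with the system tool, without loading the file in memory."""
    tool = {'sha256': 'sha256sum', 'md5': 'md5sum'}[checksum_type]
    # Output format is '<digest>  <filename>'; keep the digest only
    return subprocess.check_output([tool, ffp]).split()[0].decode()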
def check_uniqueness(self, checksum_type):
    """
    Checks tree upgrade uniqueness. Each data version to upgrade has to be
    strictly different from the latest version, if any.

    """
    for latest_dset in self.hash.keys():
        latest_path, latest_version = os.path.dirname(latest_dset), os.path.basename(latest_dset)
        self.hash[latest_dset]['upgrade'] = dict()
        for dset_leaf in self.leaves(root=os.path.join(latest_path, DRSPath.TREE_VERSION)):
            filename = os.path.basename(dset_leaf.data.origin)
            self.hash[latest_dset]['upgrade'][filename] = checksum(dset_leaf.data.origin, checksum_type)
        if self.hash[latest_dset]['latest'] == self.hash[latest_dset]['upgrade']:
            raise DuplicatedDataset(latest_path, latest_version)
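
# The uniqueness test above reduces to dict equality: for each latest dataset node the
# tree keeps two {filename: checksum} mappings, and the upgrade is rejected when they
# match exactly. A minimal sketch of that layout, with made-up values:

def _duplicated_dataset_sketch():
    """Show the hash layout compared by check_uniqueness (values are hypothetical)."""
    hash_entry = {
        'latest': {'tas_day.nc': 'abc123', 'pr_day.nc': 'def456'},
        'upgrade': {'tas_day.nc': 'abc123', 'pr_day.nc': 'def456'},
    }
    # Equal mappings mean the upgrade brings nothing new -> DuplicatedDataset
    return hash_entry['latest'] == hash_entry['upgrade']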
def process(source):
    """
    process(source)

    File process that:

     * Handles files,
     * Deduces facet key/value pairs from file attributes,
     * Checks facet values against the CV,
     * Applies the versioning,
     * Populates the DRS tree creating the appropriate leaves,
     * Stores dataset statistics.

    :param str source: The file full path to process

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(source)
        # Ignore files from incoming
        if fh.filename in pctx.ignore_from_incoming:
            msg = TAGS.SKIP + COLORS.HEADER(source)
            with pctx.lock:
                Print.exception(msg, buffer=True)
            return None
        # Load attributes from filename, netCDF attributes and command-line
        fh.load_attributes(root=pctx.root,
                           pattern=pctx.pattern,
                           set_values=pctx.set_values)
        # Check the facet values provided by the loaded attributes
        fh.check_facets(facets=pctx.facets,
                        config=pctx.cfg,
                        set_keys=pctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(pctx.facets)
        # Instantiate file DRS path handler
        fh.drs = DRSPath(parts)
        # Ensure that the called project section is ALWAYS part of the DRS path elements (case-insensitive)
        if not fh.drs.path().lower().startswith(pctx.project.lower()):
            raise InconsistentDRSPath(pctx.project, fh.drs.path())
        # Compute file checksum
        if fh.drs.v_latest and not pctx.no_checksum:
            fh.checksum = checksum(fh.ffp, pctx.checksum_type)
        # Get file tracking id
        fh.tracking_id = get_tracking_id(fh.ffp, pctx.project)
        if fh.drs.v_latest:
            latest_file = os.path.join(fh.drs.path(latest=True, root=True), fh.filename)
            # Compute checksum of the latest file version if it exists
            if os.path.exists(latest_file) and not pctx.no_checksum:
                fh.latest_checksum = checksum(latest_file, pctx.checksum_type)
            # Get tracking_id of the latest file version if it exists
            if os.path.exists(latest_file):
                fh.latest_tracking_id = get_tracking_id(latest_file, pctx.project)
        msg = TAGS.SUCCESS + 'Processing {}'.format(COLORS.HEADER(fh.ffp))
        Print.info(msg)
        return fh
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.SKIP + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return None
    finally:
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rScanning incoming file(s): ')
            msg += '{}% | {}/{} file(s)'.format(percentage, pctx.progress.value, pctx.nbsources)
            Print.progress(msg)
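
# `process` reads its context from a module-level `pctx` rather than taking it as an
# argument: the usual pattern, suggested by the `assert 'pctx' in globals()` above, is
# a multiprocessing pool initializer that installs the context into each worker's
# globals. A minimal sketch, assuming a picklable context object `ctx` and a list of
# file paths `sources` (both hypothetical names here):
from multiprocessing import Pool

def _pool_initializer_sketch(context):
    """Expose the processing context to each worker as the global `pctx`."""
    globals()['pctx'] = context

# with Pool(processes=4, initializer=_pool_initializer_sketch, initargs=(ctx,)) as pool:
#     handlers = [fh for fh in pool.imap(process, sources) if fh is not None]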
def tree_builder(fh):
    """
    Builds the DRS tree according to a source.

    :param esgprep.drs.handler.File fh: The file handler object

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    try:
        # If a latest version already exists, make some checks FIRST to stop the file process
        if fh.drs.v_latest:
            # Latest version should be older than upgrade version
            if int(DRSPath.TREE_VERSION[1:]) <= int(fh.drs.v_latest[1:]):
                raise OlderUpgrade(DRSPath.TREE_VERSION, fh.drs.v_latest)
            # Walk through the latest dataset version to check its uniqueness with file checksums
            if not pctx.no_checksum:
                dset_nid = fh.drs.path(f_part=False, latest=True, root=True)
                if dset_nid not in tree.hash.keys():
                    tree.hash[dset_nid] = dict()
                    tree.hash[dset_nid]['latest'] = dict()
                    for root, _, filenames in os.walk(fh.drs.path(f_part=False, latest=True, root=True)):
                        for filename in filenames:
                            tree.hash[dset_nid]['latest'][filename] = checksum(os.path.join(root, filename),
                                                                               pctx.checksum_type)
            # Pick up the latest file version
            latest_file = os.path.join(fh.drs.path(latest=True, root=True), fh.filename)
            # Check the latest file if it exists
            if os.path.exists(latest_file):
                if not pctx.no_checksum:
                    # If checksumming is disabled, duplicated files cannot be detected.
                    # In that case, incoming files are assumed to be different in any case
                    # and duplicated files should not exist.
                    # Check if the processed file is a duplicate of the latest version
                    if fh.latest_checksum == fh.checksum:
                        fh.is_duplicate = True
                if not fh.is_duplicate:
                    # If files are different, check that the PID/tracking_id differs from the latest version
                    if fh.latest_tracking_id == fh.tracking_id:
                        raise UnchangedTrackingID(latest_file, fh.latest_tracking_id, fh.ffp, fh.tracking_id)
        # Start the tree generation
        if not fh.is_duplicate:
            # Add the processed file to the "vYYYYMMDD" node
            src = ['..'] * len(fh.drs.items(d_part=False))
            src.extend(fh.drs.items(d_part=False, file_folder=True))
            src.append(fh.filename)
            tree.create_leaf(nodes=fh.drs.items(root=True),
                             leaf=fh.filename,
                             label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, os.path.join(*src)),
                             src=os.path.join(*src),
                             mode='symlink',
                             origin=fh.ffp,
                             force=True)
            # Add the "latest" node for symlink
            tree.create_leaf(nodes=fh.drs.items(f_part=False, version=False, root=True),
                             leaf='latest',
                             label='{}{}{}'.format('latest', LINK_SEPARATOR, fh.drs.v_upgrade),
                             src=fh.drs.v_upgrade,
                             mode='symlink')
            # Add the processed file to the "files" node
            tree.create_leaf(nodes=fh.drs.items(file_folder=True, root=True),
                             leaf=fh.filename,
                             label=fh.filename,
                             src=fh.ffp,
                             mode=pctx.mode)
            if fh.drs.v_latest and pctx.upgrade_from_latest:
                # Walk through the latest dataset version and create a symlink for each file with a
                # filename different from the processed one
                for root, _, filenames in os.walk(fh.drs.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        # Add latest files as tree leaves with the version to upgrade instead of the
                        # latest version, i.e., copy latest dataset leaves to the tree,
                        # except if the file has been ignored from the latest version (i.e., with a known issue)
                        # and except if the file leaf has already been created, to avoid overwriting
                        # the new version (the leaf is not created if it already exists)
                        if filename != fh.filename and filename not in pctx.ignore_from_latest:
                            src = os.path.join(root, filename)
                            tree.create_leaf(nodes=fh.drs.items(root=True),
                                             leaf=filename,
                                             label='{}{}{}'.format(filename, LINK_SEPARATOR, os.readlink(src)),
                                             src=os.readlink(src),
                                             mode='symlink',
                                             origin=os.path.realpath(src))
        else:
            # Pick up the latest file version
            latest_file = os.path.join(fh.drs.path(latest=True, root=True), fh.filename)
            if pctx.upgrade_from_latest:
                # If upgrade-from-latest is activated, raise the error: no duplicated files are allowed
                # because the incoming directory must only contain modified/corrected files
                raise DuplicatedFile(latest_file, fh.ffp)
            else:
                # In the default behavior, the incoming directory contains all data for a new version.
                # In the case of a duplicated file, just pass to the expected symlink creation
                # and record the duplicated file for further removal, only if the migration mode is
                # the default (i.e., moving files). In the case of --copy or --link, keep duplicates
                # in place in the incoming directory.
                src = os.readlink(latest_file)
                tree.create_leaf(nodes=fh.drs.items(root=True),
                                 leaf=fh.filename,
                                 label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, src),
                                 src=src,
                                 mode='symlink',
                                 origin=fh.ffp)
                if pctx.mode == 'move':
                    tree.duplicates.append(fh.ffp)
        # Record entry for list()
        record = {'src': fh.ffp,
                  'dst': fh.drs.path(root=True),
                  'filename': fh.filename,
                  'latest': fh.drs.v_latest or 'Initial',
                  'size': fh.size}
        if fh.drs.path(f_part=False) in tree.paths.keys():
            tree.paths[fh.drs.path(f_part=False)].append(record)
        else:
            tree.paths[fh.drs.path(f_part=False)] = [record]
        msg = TAGS.SUCCESS + 'DRS Path = {}'.format(COLORS.HEADER(fh.drs.path(f_part=False)))
        msg += ' <-- ' + fh.filename
        Print.info(msg)
        return True
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.FAIL + 'Build {}'.format(COLORS.HEADER(fh.drs.path())) + '\n'
        msg += '\n'.join(exc)
        Print.exception(msg, buffer=True)
        return None
    finally:
        pctx.progress.value += 1
        percentage = int(pctx.progress.value * 100 / pctx.nbsources)
        msg = COLORS.OKBLUE('\rBuilding DRS tree: ')
        msg += '{}% | {}/{} file(s)'.format(percentage, pctx.progress.value, pctx.nbsources)
        Print.progress(msg)
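
# The "vYYYYMMDD" leaves above point back to the physical copy under the "files" node
# through a relative symlink target: climb out of the version directory with '..'
# components, then descend into the file folder. A minimal sketch of that construction,
# with a hypothetical depth and path parts:
import os

def _relative_link_target_sketch(depth, file_folder_items, filename):
    """Build a version-to-files relative symlink target such as '../../files/d1/tas.nc'."""
    parts = ['..'] * depth
    parts.extend(file_folder_items)
    parts.append(filename)
    return os.path.join(*parts)

# _relative_link_target_sketch(2, ['files', 'd20200101'], 'tas_day.nc')
# -> '../../files/d20200101/tas_day.nc'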
def process(collector_input):
    """
    process(collector_input)

    File process that:

     * Handles files,
     * Deduces facet key/value pairs from file attributes,
     * Checks facet values against the CV,
     * Applies the versioning,
     * Populates the DRS tree creating the appropriate leaves,
     * Stores dataset statistics.

    :param tuple collector_input: A tuple with the file path and the processing context
    :return: True on success
    :rtype: *boolean*

    """
    # Deserialize inputs from collector
    ffp, ctx = collector_input
    # Block to avoid program stop if a thread fails
    try:
        # Instantiate file handler
        fh = File(ffp)
        # Load attributes from filename, netCDF attributes and command-line
        fh.load_attributes(root=ctx.root,
                           pattern=ctx.pattern,
                           set_values=ctx.set_values)
        # Apply proper case to each attribute
        for key in fh.attributes:
            # Try to get the appropriate facet case from "category_defaults"
            try:
                fh.attributes[key] = ctx.cfg.get_options_from_pairs('category_defaults', key)
            except NoConfigKey:
                # If not specified, keep the facet case from the local path; do nothing
                pass
        fh.check_facets(facets=ctx.facets,
                        config=ctx.cfg,
                        set_keys=ctx.set_keys)
        # Get parts of DRS path
        parts = fh.get_drs_parts(ctx.facets)
        # Instantiate file DRS path handler
        fph = DRSPath(parts)
        # If a latest version already exists, make some checks FIRST to stop the file process
        if fph.v_latest:
            # Latest version should be older than upgrade version
            if int(DRSPath.TREE_VERSION[1:]) <= int(fph.v_latest[1:]):
                raise OlderUpgrade(DRSPath.TREE_VERSION, fph.v_latest)
            # Walk through the latest dataset version to check its uniqueness with file checksums
            dset_nid = fph.path(f_part=False, latest=True, root=True)
            if dset_nid not in ctx.tree.hash.keys():
                ctx.tree.hash[dset_nid] = dict()
                ctx.tree.hash[dset_nid]['latest'] = dict()
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        ctx.tree.hash[dset_nid]['latest'][filename] = checksum(os.path.join(root, filename),
                                                                               ctx.checksum_type)
            # Pick up the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            # Check the latest file if it exists
            if os.path.exists(latest_file):
                latest_checksum = checksum(latest_file, ctx.checksum_type)
                current_checksum = checksum(fh.ffp, ctx.checksum_type)
                # Check if the processed file is a duplicate of the latest version
                if latest_checksum == current_checksum:
                    fh.is_duplicate = True
        # Start the tree generation
        if not fh.is_duplicate:
            # Add the processed file to the "vYYYYMMDD" node
            src = ['..'] * len(fph.items(d_part=False))
            src.extend(fph.items(d_part=False, file_folder=True))
            src.append(fh.filename)
            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                 leaf=fh.filename,
                                 label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, os.path.join(*src)),
                                 src=os.path.join(*src),
                                 mode='symlink',
                                 origin=fh.ffp)
            # Add the "latest" node for symlink
            ctx.tree.create_leaf(nodes=fph.items(f_part=False, version=False, root=True),
                                 leaf='latest',
                                 label='{}{}{}'.format('latest', LINK_SEPARATOR, fph.v_upgrade),
                                 src=fph.v_upgrade,
                                 mode='symlink')
            # Add the processed file to the "files" node
            ctx.tree.create_leaf(nodes=fph.items(file_folder=True, root=True),
                                 leaf=fh.filename,
                                 label=fh.filename,
                                 src=fh.ffp,
                                 mode=ctx.mode)
            if ctx.upgrade_from_latest:
                # Walk through the latest dataset version and create a symlink for each file with a
                # filename different from the processed one
                for root, _, filenames in os.walk(fph.path(f_part=False, latest=True, root=True)):
                    for filename in filenames:
                        # Add latest files as tree leaves with the version to upgrade instead of the
                        # latest version, i.e., copy latest dataset leaves to the tree
                        if filename != fh.filename:
                            src = os.path.join(root, filename)
                            ctx.tree.create_leaf(nodes=fph.items(root=True),
                                                 leaf=filename,
                                                 label='{}{}{}'.format(filename, LINK_SEPARATOR, os.readlink(src)),
                                                 src=os.readlink(src),
                                                 mode='symlink',
                                                 origin=os.path.realpath(src))
        else:
            # Pick up the latest file version
            latest_file = os.path.join(fph.path(latest=True, root=True), fh.filename)
            if ctx.upgrade_from_latest:
                # If upgrade-from-latest is activated, raise the error: no duplicated files are allowed
                # because the incoming directory must only contain modified/corrected files
                raise DuplicatedFile(latest_file, fh.ffp)
            else:
                # In the default behavior, the incoming directory contains all data for a new version.
                # In the case of a duplicated file, just pass to the expected symlink creation
                # and record the duplicated file for further removal, only if the migration mode is
                # the default (i.e., moving files). In the case of --copy or --link, keep duplicates
                # in place in the incoming directory.
                src = os.readlink(latest_file)
                ctx.tree.create_leaf(nodes=fph.items(root=True),
                                     leaf=fh.filename,
                                     label='{}{}{}'.format(fh.filename, LINK_SEPARATOR, src),
                                     src=src,
                                     mode='symlink',
                                     origin=fh.ffp)
                if ctx.mode == 'move':
                    ctx.tree.duplicates.append(fh.ffp)
        # Record entry for list()
        incoming = {'src': fh.ffp,
                    'dst': fph.path(root=True),
                    'filename': fh.filename,
                    'latest': fph.v_latest or 'Initial',
                    'size': fh.size}
        if fph.path(f_part=False) in ctx.tree.paths.keys():
            ctx.tree.paths[fph.path(f_part=False)].append(incoming)
        else:
            ctx.tree.paths[fph.path(f_part=False)] = [incoming]
        logging.info('{} <-- {}'.format(fph.path(f_part=False), fh.filename))
        return True
    except Exception as e:
        logging.error('{} skipped\n{}: {}'.format(ffp, e.__class__.__name__, str(e)))
        return None
    finally:
        ctx.pbar.update()
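
# Both builders above reject upgrades that are not strictly newer than the latest
# version by comparing 'vYYYYMMDD' strings as integers after stripping the leading
# 'v'. A minimal sketch of that test (a hypothetical helper, not part of the project):

def _is_older_upgrade_sketch(upgrade_version, latest_version):
    """Return True when the upgrade version would not supersede the latest one."""
    return int(upgrade_version[1:]) <= int(latest_version[1:])

# _is_older_upgrade_sketch('v20200101', 'v20201231')  # -> True, OlderUpgrade would be raised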