def archive(rgc, registry_paths, force, remove, cfg_path, genomes_desc):
    """
    Build individual tar archives for assets so they can be served with
    'refgenieserver serve'.

    Takes the RefGenConf object and builds individual tar archives that can
    be then served with 'refgenieserver serve'. Additionally determines
    their md5 checksums, file sizes and updates the original refgenie
    config with these data. If the --asset and/or --genome options are used
    (specific build is requested) the archiver will check for the existence
    of config file saved in the path provided in `genome_server` in the
    original config and update it so that no archive metadata is lost.

    NOTE(review): this module defines ``archive`` twice; this earlier
    definition is shadowed by the later one of the same name and appears
    to be dead code — confirm and remove if so.

    :param RefGenConf rgc: configuration object with the data to build
        the servable archives for
    :param list[dict] registry_paths: a collection of mappings that
        identifies the assets to update
    :param bool force: whether to force the build of archive, regardless
        of its existence
    :param bool remove: whether remove specified genome/asset:tag from
        the archive
    :param str cfg_path: config file path
    :param str genomes_desc: path to a two-column CSV file mapping genome
        names to descriptions, or None to use config/placeholder values
    """
    # refuse to work with configs older than this archiver understands
    if float(rgc[CFG_VERSION_KEY]) < float(REQ_CFG_VERSION):
        raise ConfigNotCompliantError(
            "You need to update the genome config to v{} in order to use the archiver. "
            "The required version can be generated with refgenie >= {}".format(
                REQ_CFG_VERSION, REFGENIE_BY_CFG[REQ_CFG_VERSION]))
    # determine the server (archive) config path: either taken from the
    # config (relative values resolved against the original config's
    # directory), or derived from the archive dir + original config name
    if CFG_ARCHIVE_CONFIG_KEY in rgc:
        srp = rgc[CFG_ARCHIVE_CONFIG_KEY]
        server_rgc_path = srp if os.path.isabs(srp) \
            else os.path.join(os.path.dirname(rgc.file_path), srp)
    else:
        try:
            server_rgc_path = os.path.join(rgc[CFG_ARCHIVE_KEY],
                                           os.path.basename(cfg_path))
        except KeyError:
            raise GenomeConfigFormatError(
                "The config '{}' is missing a {} entry. Can't determine the desired archive."
                .format(cfg_path, " or ".join([CFG_ARCHIVE_KEY, CFG_ARCHIVE_KEY_OLD])))
    # fail early if the target config exists but is not writable
    if os.path.isfile(server_rgc_path) and not os.access(
            server_rgc_path, os.W_OK):
        raise OSError(
            "The determined archive config path is not writable: {}".format(
                server_rgc_path))
    if force:
        _LOGGER.info("Build forced; file existence will be ignored")
        if os.path.exists(server_rgc_path):
            _LOGGER.debug("'{}' file was found and will be updated".format(
                server_rgc_path))
    _LOGGER.debug("Registry_paths: {}".format(registry_paths))
    # original RefGenConf has been created in read-only mode,
    # make it RW compatible and point to new target path for server use or initialize a new object
    if os.path.exists(server_rgc_path):
        rgc_server = RefGenConf(filepath=server_rgc_path)
        if remove:
            # removal only makes sense against an existing server config,
            # and requires explicit registry paths
            if not registry_paths:
                _LOGGER.error(
                    "To remove archives you have to specify them. Use 'asset_registry_path' argument."
                )
                exit(1)
            with rgc_server as r:
                _remove_archive(r, registry_paths, CFG_ARCHIVE_KEY)
            exit(0)
    else:
        if remove:
            _LOGGER.error(
                "You can't remove archives since the genome_archive path does not exist yet."
            )
            exit(1)
        # seed the server config from the original one and persist it at the
        # server path; keep the in-memory object read-only afterwards
        rgc_server = RefGenConf(filepath=rgc.file_path)
        rgc_server.make_writable(filepath=server_rgc_path)
        rgc_server.make_readonly()
    # select targets: either the explicitly requested genome/asset:tag
    # triples (parallel lists) or all genomes in the config
    if registry_paths:
        genomes = _get_paths_element(registry_paths, "namespace")
        asset_list = _get_paths_element(registry_paths, "item")
        tag_list = _get_paths_element(registry_paths, "tag")
    else:
        genomes = rgc.genomes_list()
        asset_list, tag_list = None, None
    if not genomes:
        _LOGGER.error("No genomes found")
        exit(1)
    else:
        _LOGGER.debug("Genomes to be processed: {}".format(str(genomes)))
    # optionally read genome descriptions from a CSV: column 0 is the genome
    # name, column 1 is the description
    if genomes_desc is not None:
        if os.path.exists(genomes_desc):
            import csv
            _LOGGER.info("Found a genomes descriptions CSV file: {}".format(
                genomes_desc))
            with open(genomes_desc, mode='r') as infile:
                reader = csv.reader(infile)
                descs = {rows[0]: rows[1] for rows in reader}
        else:
            _LOGGER.error(
                "Genomes descriptions CSV file does not exist: {}".format(
                    genomes_desc))
            sys.exit(1)
    # counter indexes the parallel asset_list/tag_list selections
    # (one entry per requested registry path)
    counter = 0
    for genome in genomes:
        genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome)
        target_dir = os.path.join(rgc[CFG_ARCHIVE_KEY], genome)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        # prefer the CSV-provided description; otherwise keep (or seed) the
        # value already in the config
        genome_desc = rgc[CFG_GENOMES_KEY][genome].setdefault(CFG_GENOME_DESC_KEY, DESC_PLACEHOLDER) \
            if genomes_desc is None or genome not in descs else descs[genome]
        genome_checksum = rgc[CFG_GENOMES_KEY][genome].\
            setdefault(CFG_CHECKSUM_KEY, CHECKSUM_PLACEHOLDER)
        genome_attrs = {
            CFG_GENOME_DESC_KEY: genome_desc,
            CFG_CHECKSUM_KEY: genome_checksum
        }
        rgc_server.update_genomes(genome, genome_attrs)
        _LOGGER.debug("Updating '{}' genome attributes...".format(genome))
        asset = asset_list[counter] if asset_list is not None else None
        assets = asset or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()
        if not assets:
            _LOGGER.error("No assets found")
            continue
        else:
            _LOGGER.debug("Assets to be processed: {}".format(str(assets)))
        # a single requested asset is a plain string, hence the [assets]
        # wrapping. NOTE(review): this assumes .keys() above yields a list
        # (as attmap-backed mappings may) — verify, since a plain dict view
        # would fall into the non-list branch and be iterated as one item
        for asset_name in assets if isinstance(assets, list) else [assets]:
            asset_desc = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DESC_KEY,
                            DESC_PLACEHOLDER)
            default_tag = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DEFAULT_TAG_KEY, DEFAULT_TAG)
            asset_attrs = {
                CFG_ASSET_DESC_KEY: asset_desc,
                CFG_ASSET_DEFAULT_TAG_KEY: default_tag
            }
            _LOGGER.debug("Updating '{}/{}' asset attributes...".format(
                genome, asset_name))
            with rgc_server as r:
                r.update_assets(genome, asset_name, asset_attrs)
            tag = tag_list[counter] if tag_list is not None else None
            tags = tag or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name][CFG_ASSET_TAGS_KEY].keys()
            for tag_name in tags if isinstance(tags, list) else [tags]:
                # refuse to archive partially built/pulled assets
                if not rgc.is_asset_complete(genome, asset_name, tag_name):
                    raise MissingConfigDataError(
                        "Asset '{}/{}:{}' is incomplete. This probably means an"
                        " attempt to archive a partially pulled parent. "
                        "refgenieserver archive requires all assets to be built"
                        " prior to archiving.".format(genome, asset_name,
                                                      tag_name))
                file_name = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                        CFG_ASSET_PATH_KEY]
                target_file = os.path.join(
                    target_dir, "{}__{}".format(asset_name, tag_name) + ".tgz")
                input_file = os.path.join(genome_dir, file_name, tag_name)
                # these attributes have to be read from the original RefGenConf in case the archiver just increments
                # an existing server RefGenConf
                parents = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_PARENTS_KEY, [])
                children = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHILDREN_KEY, [])
                seek_keys = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_SEEK_KEYS_KEY, {})
                asset_digest = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHECKSUM_KEY, None)
                # build the archive unless it already exists (or --force)
                if not os.path.exists(target_file) or force:
                    _LOGGER.info(
                        "Creating archive '{}' from '{}' asset".format(
                            target_file, input_file))
                    try:
                        _check_tgz(input_file, target_file, asset_name)
                        _copy_recipe(input_file, target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, target_dir, asset_name, tag_name)
                    except OSError as e:
                        # best-effort: log and move on to the next tag
                        _LOGGER.warning(e)
                        continue
                    else:
                        # archive built successfully; record tag metadata
                        # (checksums/sizes) in the server config
                        _LOGGER.info(
                            "Updating '{}/{}:{}' tag attributes...".format(
                                genome, asset_name, tag_name))
                        tag_attrs = {
                            CFG_ASSET_PATH_KEY: file_name,
                            CFG_SEEK_KEYS_KEY: seek_keys,
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file),
                            CFG_ARCHIVE_SIZE_KEY: size(target_file),
                            CFG_ASSET_SIZE_KEY: size(input_file),
                            CFG_ASSET_PARENTS_KEY: parents,
                            CFG_ASSET_CHILDREN_KEY: children,
                            CFG_ASSET_CHECKSUM_KEY: asset_digest
                        }
                        _LOGGER.debug("attr dict: {}".format(tag_attrs))
                        with rgc_server as r:
                            for parent in parents:
                                # here we update any pre-existing parents' children attr with the newly added asset
                                _LOGGER.debug(
                                    "Updating {} children list with {}".format(
                                        parent, "{}/{}:{}".format(
                                            genome, asset_name, tag_name)))
                                rp = parse_registry_path(parent)
                                parent_genome = rp["namespace"]
                                parent_asset = rp["item"]
                                parent_tag = rp["tag"]
                                try:
                                    # existence check; raises if the parent
                                    # is not present on the server
                                    r.get_asset(parent_genome, parent_asset,
                                                parent_tag)
                                except RefgenconfError:
                                    _LOGGER.warning(
                                        "'{}/{}:{}'s parent '{}' does not exist, "
                                        "skipping relationship updates".format(
                                            genome, asset_name, tag_name,
                                            parent))
                                    continue
                                r.update_relatives_assets(
                                    parent_genome, parent_asset, parent_tag, [
                                        "{}/{}:{}".format(
                                            genome, asset_name, tag_name)
                                    ],
                                    children=True)
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
                else:
                    _LOGGER.debug("'{}' exists".format(target_file))
        counter += 1
    # drop entries that cannot be served, then write the final server config
    with rgc_server as r:
        _purge_nonservable(r)
        _LOGGER.info(
            "Builder finished; server config file saved to: '{}'".format(
                r.write(server_rgc_path)))
def archive(rgc, registry_paths, force, remove, cfg_path, genomes_desc):
    """
    Build individual tar archives for assets so they can be served with
    'refgenieserver serve'.

    Takes the RefGenConf object and builds individual tar archives that can
    be then served with 'refgenieserver serve'. Additionally determines
    their md5 checksums, file sizes and updates the original refgenie
    config with these data. If the --asset and/or --genome options are used
    (specific build is requested) the archiver will check for the existence
    of config file saved in the path provided in `genome_server` in the
    original config and update it so that no archive metadata is lost.

    NOTE(review): this module defines ``archive`` twice; this later
    definition shadows the earlier one of the same name.

    :param RefGenConf rgc: configuration object with the data to build
        the servable archives for
    :param list[dict] registry_paths: a collection of mappings that
        identifies the assets to update
    :param bool force: whether to force the build of archive, regardless
        of its existence
    :param bool remove: whether remove specified genome/asset:tag from
        the archive
    :param str cfg_path: config file path
    :param str genomes_desc: path to a two-column CSV file mapping genome
        names to descriptions, or None to use config/placeholder values
    """
    # refuse to work with configs older than this archiver understands
    if float(rgc[CFG_VERSION_KEY]) < float(REQ_CFG_VERSION):
        raise ConfigNotCompliantError(
            f"You need to update the genome config to v{REQ_CFG_VERSION} in order to use the archiver. "
            f"The required version can be generated with refgenie >= {REFGENIE_BY_CFG[REQ_CFG_VERSION]}"
        )
    # determine the server (archive) config path: either taken from the
    # config (relative values resolved against the original config's
    # directory), or derived from the archive dir + original config name
    if CFG_ARCHIVE_CONFIG_KEY in rgc:
        srp = rgc[CFG_ARCHIVE_CONFIG_KEY]
        server_rgc_path = (srp if os.path.isabs(srp) else os.path.join(
            os.path.dirname(rgc.file_path), srp))
    else:
        try:
            server_rgc_path = os.path.join(rgc[CFG_ARCHIVE_KEY],
                                           os.path.basename(cfg_path))
        except KeyError:
            raise GenomeConfigFormatError(
                f"The config '{cfg_path}' is missing a {' or '.join([CFG_ARCHIVE_KEY, CFG_ARCHIVE_KEY_OLD])} entry. "
                f"Can't determine the desired archive.")
    # fail early if the target config exists but is not writable
    if os.path.isfile(server_rgc_path) and not os.access(
            server_rgc_path, os.W_OK):
        raise OSError(
            "The determined archive config path is not writable: {}".format(
                server_rgc_path))
    if force:
        _LOGGER.info("Build forced; file existence will be ignored")
    _LOGGER.debug("Registry_paths: {}".format(registry_paths))
    # original RefGenConf has been created in read-only mode,
    # make it RW compatible and point to new target path for server use or initialize a new object
    if os.path.exists(server_rgc_path):
        _LOGGER.debug(
            f"'{server_rgc_path}' file was found and will be updated")
        rgc_server = RefGenConf(filepath=server_rgc_path)
        if remove:
            # removal only makes sense against an existing server config,
            # and requires explicit registry paths
            if not registry_paths:
                _LOGGER.error("To remove archives you have to specify them. "
                              "Use 'asset_registry_path' argument.")
                exit(1)
            with rgc_server as r:
                _remove_archive(r, registry_paths, CFG_ARCHIVE_KEY)
            exit(0)
    else:
        if remove:
            _LOGGER.error(
                "You can't remove archives since the genome_archive path does not exist yet."
            )
            exit(1)
        # seed the server config from the original one and persist it at the
        # server path; keep the in-memory object read-only afterwards
        _LOGGER.debug(
            f"'{server_rgc_path}' file was not found and will be created")
        rgc_server = RefGenConf(filepath=rgc.file_path)
        rgc_server.make_writable(filepath=server_rgc_path)
        rgc_server.make_readonly()
    # select targets: either the explicitly requested genome/asset:tag
    # triples (parallel lists) or all genomes in the config
    if registry_paths:
        genomes = _get_paths_element(registry_paths, "namespace")
        asset_list = _get_paths_element(registry_paths, "item")
        tag_list = _get_paths_element(registry_paths, "tag")
    else:
        genomes = rgc.genomes_list()
        asset_list, tag_list = None, None
    if not genomes:
        _LOGGER.error("No genomes found")
        exit(1)
    else:
        _LOGGER.debug(f"Genomes to be processed: {str(genomes)}")
    # archives are keyed by genome digest, so translate any aliases
    genomes = [rgc.get_genome_alias_digest(g) for g in genomes]
    # optionally read genome descriptions from a CSV: column 0 is the genome
    # name, column 1 is the description
    if genomes_desc is not None:
        if os.path.exists(genomes_desc):
            import csv
            _LOGGER.info(
                f"Found a genomes descriptions CSV file: {genomes_desc}")
            with open(genomes_desc, mode="r") as infile:
                reader = csv.reader(infile)
                descs = {rows[0]: rows[1] for rows in reader}
        else:
            _LOGGER.error(
                f"Genomes descriptions CSV file does not exist: {genomes_desc}"
            )
            sys.exit(1)
    # counter indexes the parallel asset_list/tag_list selections
    # (one entry per requested registry path)
    counter = 0
    for genome in genomes:
        genome_dir = os.path.join(rgc.data_dir, genome)
        target_dir = os.path.join(rgc[CFG_ARCHIVE_KEY], genome)
        # alias-named twin of target_dir, kept for pre-digest clients
        alias_target_dir = os.path.join(
            rgc[CFG_ARCHIVE_KEY],
            rgc.get_genome_alias(digest=genome, fallback=True))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir, exist_ok=True)
            # create legacy directory for archive
            # TODO: remove in the future
            os.makedirs(alias_target_dir, exist_ok=True)
        # prefer the CSV-provided description; otherwise keep (or seed) the
        # value already in the config
        genome_desc = (rgc[CFG_GENOMES_KEY][genome].setdefault(
            CFG_GENOME_DESC_KEY, DESC_PLACEHOLDER)
                       if genomes_desc is None or genome not in descs
                       else descs[genome])
        genome_aliases = rgc[CFG_GENOMES_KEY][genome].setdefault(
            CFG_ALIASES_KEY, [])
        genome_attrs = {
            CFG_GENOME_DESC_KEY: genome_desc,
            CFG_ALIASES_KEY: genome_aliases,
        }
        with rgc_server as r:
            # make sure the genome section exists before updating it
            r[CFG_GENOMES_KEY].setdefault(genome, PXAM())
            r[CFG_GENOMES_KEY][genome].update(genome_attrs)
        _LOGGER.debug(f"Updating '{genome}' genome attributes...")
        asset = asset_list[counter] if asset_list is not None else None
        assets = asset or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()
        if not assets:
            _LOGGER.error("No assets found")
            continue
        else:
            _LOGGER.debug(f"Assets to be processed: {str(assets)}")
        # a single requested asset is a plain string, hence the [assets]
        # wrapping. NOTE(review): this assumes .keys() above yields a list
        # (as attmap-backed mappings may) — verify, since a plain dict view
        # would fall into the non-list branch and be iterated as one item
        for asset_name in assets if isinstance(assets, list) else [assets]:
            asset_desc = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name].setdefault(CFG_ASSET_DESC_KEY, DESC_PLACEHOLDER)
            default_tag = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name].setdefault(CFG_ASSET_DEFAULT_TAG_KEY, DEFAULT_TAG)
            asset_attrs = {
                CFG_ASSET_DESC_KEY: asset_desc,
                CFG_ASSET_DEFAULT_TAG_KEY: default_tag,
            }
            _LOGGER.debug(
                f"Updating '{genome}/{asset_name}' asset attributes...")
            with rgc_server as r:
                r.update_assets(genome, asset_name, asset_attrs)
            tag = tag_list[counter] if tag_list is not None else None
            tags = (tag or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY]
                    [asset_name][CFG_ASSET_TAGS_KEY].keys())
            for tag_name in tags if isinstance(tags, list) else [tags]:
                # refuse to archive partially built/pulled assets
                if not rgc.is_asset_complete(genome, asset_name, tag_name):
                    raise MissingConfigDataError(
                        f"Asset '{genome}/{asset_name}:{tag_name}' is incomplete. "
                        f"This probably means an attempt to archive a partially "
                        f"pulled parent. refgenieserver archive requires all assets to "
                        f"be built prior to archiving.")
                file_name = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                        CFG_ASSET_PATH_KEY]
                target_file_core = os.path.join(target_dir,
                                                f"{asset_name}__{tag_name}")
                target_file = f"{target_file_core}.tgz"
                input_file = os.path.join(genome_dir, file_name, tag_name)
                # these attributes have to be read from the original RefGenConf in case the archiver just increments
                # an existing server RefGenConf
                parents = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_PARENTS_KEY, [])
                children = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_CHILDREN_KEY, [])
                seek_keys = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_SEEK_KEYS_KEY, {})
                asset_digest = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_CHECKSUM_KEY, None)
                # build the archive unless it already exists (or --force)
                if not os.path.exists(target_file) or force:
                    _LOGGER.info(
                        f"Creating archive '{target_file}' from '{input_file}' asset"
                    )
                    try:
                        _copy_asset_dir(input_file, target_file_core)
                        _get_asset_dir_contents(target_file_core, asset_name,
                                                tag_name)
                        _check_tgz(input_file, target_file)
                        _copy_recipe(input_file, target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, target_dir, asset_name, tag_name)
                        # TODO: remove the rest of this try block in the future
                        # (alias-named legacy archive for pre-digest clients)
                        _check_tgz_legacy(
                            input_file,
                            target_file,
                            asset_name,
                            rgc.get_genome_alias_digest(alias=genome,
                                                        fallback=True),
                            rgc.get_genome_alias(digest=genome, fallback=True),
                        )
                        _copy_recipe(input_file, alias_target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, alias_target_dir, asset_name,
                                  tag_name)
                    except OSError as e:
                        # best-effort: log and move on to the next tag
                        _LOGGER.warning(e)
                        continue
                    else:
                        # archive built successfully; record tag metadata
                        # (checksums/sizes) in the server config
                        _LOGGER.info(
                            f"Updating '{genome}/{asset_name}:{tag_name}' tag attributes"
                        )
                        tag_attrs = {
                            CFG_ASSET_PATH_KEY: file_name,
                            CFG_SEEK_KEYS_KEY: seek_keys,
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file),
                            CFG_ARCHIVE_SIZE_KEY: size(target_file),
                            CFG_ASSET_SIZE_KEY: size(input_file),
                            CFG_ASSET_PARENTS_KEY: parents,
                            CFG_ASSET_CHILDREN_KEY: children,
                            CFG_ASSET_CHECKSUM_KEY: asset_digest,
                        }
                        # TODO: legacy checksum generation and tag dictionary
                        # update to be removed in the future
                        legacy_digest = checksum(
                            replace_str_in_obj(
                                target_file,
                                x=rgc.get_genome_alias_digest(alias=genome,
                                                              fallback=True),
                                y=rgc.get_genome_alias(digest=genome,
                                                       fallback=True),
                            ))
                        tag_attrs.update(
                            {CFG_LEGACY_ARCHIVE_CHECKSUM_KEY: legacy_digest})
                        _LOGGER.debug(f"attr dict: {tag_attrs}")
                        with rgc_server as r:
                            for parent in parents:
                                # here we update any pre-existing parents' children
                                # attr with the newly added asset
                                _LOGGER.debug(
                                    f"Updating {parent} children list with "
                                    f"{genome}/{asset_name}:{tag_name}")
                                rp = parse_registry_path(parent)
                                parent_genome = rp["namespace"]
                                parent_asset = rp["item"]
                                parent_tag = rp["tag"]
                                try:
                                    # existence check; raises if the parent
                                    # is not present on the server
                                    r.seek(
                                        parent_genome,
                                        parent_asset,
                                        parent_tag,
                                        strict_exists=True,
                                    )
                                except RefgenconfError:
                                    _LOGGER.warning(
                                        f"'{genome}/{asset_name}:{tag_name}'s parent "
                                        f"'{parent}' does not exist, skipping relationship updates"
                                    )
                                    continue
                                r.update_relatives_assets(
                                    parent_genome,
                                    parent_asset,
                                    parent_tag,
                                    [f"{genome}/{asset_name}:{tag_name}"],
                                    children=True,
                                )
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
                else:
                    # archive already present: only backfill the archive
                    # digest if the server config does not have one yet
                    exists_msg = f"'{target_file}' exists."
                    try:
                        rgc_server[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                            asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                                CFG_ARCHIVE_CHECKSUM_KEY]
                        _LOGGER.debug(exists_msg + " Skipping")
                    except KeyError:
                        _LOGGER.debug(exists_msg + " Calculating archive digest")
                        tag_attrs = {
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file)
                        }
                        with rgc_server as r:
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
        counter += 1
    # NOTE(review): no explicit write call here — presumably each
    # `with rgc_server` context persists changes on exit; confirm
    _LOGGER.info(
        f"Builder finished; server config file saved: {rgc_server.file_path}")