Example #1
0
def archive(rgc, registry_paths, force, remove, cfg_path, genomes_desc):
    """
    Build servable tar archives for the assets described by a RefGenConf object.

    The archives can then be served with 'refgenieserver serve'. For every
    selected genome/asset:tag an archive is created in the directory stored
    under the archive key of `rgc` (skipped when the archive file already
    exists, unless `force` is set). The archive checksum, archive size and
    asset size are recorded in a separate server config. If that server config
    file already exists it is loaded and updated, so that archive metadata
    recorded by earlier runs is not lost; otherwise it is initialized from the
    original config.

    Note that this function terminates the process with `sys.exit` on usage
    errors (nothing to remove, no genomes, missing descriptions file) and
    after a successful removal.

    :param RefGenConf rgc: configuration object with the data to build the servable archives for
    :param list[dict] registry_paths: a collection of mappings that identifies the assets to update;
        the 'namespace', 'item' and 'tag' elements are used. If empty, all genomes in `rgc` are processed
    :param bool force: whether to force the build of archive, regardless of its existence
    :param bool remove: whether remove specified genome/asset:tag from the archive
    :param str cfg_path: config file path
    :param str genomes_desc: path to a CSV file with genome descriptions (first column: genome,
        second column: description), or None to use the descriptions stored in `rgc`
    :raise ConfigNotCompliantError: if the config version is lower than the required one
    :raise GenomeConfigFormatError: if the archive location can't be determined from the config
    :raise OSError: if the determined server config file exists but is not writable
    :raise MissingConfigDataError: if an asset selected for archiving is incomplete
    """
    if float(rgc[CFG_VERSION_KEY]) < float(REQ_CFG_VERSION):
        raise ConfigNotCompliantError(
            "You need to update the genome config to v{} in order to use the archiver. "
            "The required version can be generated with refgenie >= {}".format(
                REQ_CFG_VERSION, REFGENIE_BY_CFG[REQ_CFG_VERSION]))
    # determine the server config path: an explicit entry in the config wins
    # (relative paths are resolved against the config file location),
    # otherwise it is placed in the archive directory under the config file name
    if CFG_ARCHIVE_CONFIG_KEY in rgc:
        srp = rgc[CFG_ARCHIVE_CONFIG_KEY]
        server_rgc_path = srp if os.path.isabs(srp) \
            else os.path.join(os.path.dirname(rgc.file_path), srp)
    else:
        try:
            server_rgc_path = os.path.join(rgc[CFG_ARCHIVE_KEY],
                                           os.path.basename(cfg_path))
        except KeyError:
            raise GenomeConfigFormatError(
                "The config '{}' is missing a {} entry. Can't determine the desired archive."
                .format(cfg_path,
                        " or ".join([CFG_ARCHIVE_KEY, CFG_ARCHIVE_KEY_OLD])))
    if os.path.isfile(server_rgc_path) and not os.access(
            server_rgc_path, os.W_OK):
        raise OSError(
            "The determined archive config path is not writable: {}".format(
                server_rgc_path))
    if force:
        _LOGGER.info("Build forced; file existence will be ignored")
        if os.path.exists(server_rgc_path):
            _LOGGER.debug("'{}' file was found and will be updated".format(
                server_rgc_path))
    _LOGGER.debug("Registry_paths: {}".format(registry_paths))

    # original RefGenConf has been created in read-only mode,
    # make it RW compatible and point to new target path for server use or initialize a new object
    if os.path.exists(server_rgc_path):
        rgc_server = RefGenConf(filepath=server_rgc_path)
        if remove:
            if not registry_paths:
                _LOGGER.error(
                    "To remove archives you have to specify them. Use 'asset_registry_path' argument."
                )
                # sys.exit is used throughout: the bare 'exit' builtin is
                # injected by the 'site' module and is not guaranteed to exist
                sys.exit(1)
            with rgc_server as r:
                _remove_archive(r, registry_paths, CFG_ARCHIVE_KEY)
            sys.exit(0)
    else:
        if remove:
            _LOGGER.error(
                "You can't remove archives since the genome_archive path does not exist yet."
            )
            sys.exit(1)
        rgc_server = RefGenConf(filepath=rgc.file_path)
        rgc_server.make_writable(filepath=server_rgc_path)
        rgc_server.make_readonly()
    # the three lists below are index-aligned: element i of each one
    # comes from the i-th registry path
    if registry_paths:
        genomes = _get_paths_element(registry_paths, "namespace")
        asset_list = _get_paths_element(registry_paths, "item")
        tag_list = _get_paths_element(registry_paths, "tag")
    else:
        genomes = rgc.genomes_list()
        asset_list, tag_list = None, None
    if not genomes:
        _LOGGER.error("No genomes found")
        sys.exit(1)
    else:
        _LOGGER.debug("Genomes to be processed: {}".format(str(genomes)))
    if genomes_desc is not None:
        if os.path.exists(genomes_desc):
            import csv
            _LOGGER.info("Found a genomes descriptions CSV file: {}".format(
                genomes_desc))
            # newline='' is what the csv module requires for correct handling
            # of newlines embedded in quoted fields
            with open(genomes_desc, mode='r', newline='') as infile:
                reader = csv.reader(infile)
                # blank lines are yielded as empty rows; skip these and any
                # row without a description column instead of failing
                descs = {row[0]: row[1] for row in reader if len(row) > 1}
        else:
            _LOGGER.error(
                "Genomes descriptions CSV file does not exist: {}".format(
                    genomes_desc))
            sys.exit(1)
    # the position is taken from enumerate rather than a manually incremented
    # counter, so that skipping a genome ('continue' below) can't shift the
    # asset/tag selection of the genomes that follow
    for idx, genome in enumerate(genomes):
        genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome)
        target_dir = os.path.join(rgc[CFG_ARCHIVE_KEY], genome)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        # 'descs' exists only when a descriptions file was provided,
        # the first part of the condition guards the lookup
        genome_desc = rgc[CFG_GENOMES_KEY][genome].setdefault(CFG_GENOME_DESC_KEY, DESC_PLACEHOLDER) \
            if genomes_desc is None or genome not in descs else descs[genome]
        genome_checksum = rgc[CFG_GENOMES_KEY][genome].\
            setdefault(CFG_CHECKSUM_KEY, CHECKSUM_PLACEHOLDER)
        genome_attrs = {
            CFG_GENOME_DESC_KEY: genome_desc,
            CFG_CHECKSUM_KEY: genome_checksum
        }
        rgc_server.update_genomes(genome, genome_attrs)
        _LOGGER.debug("Updating '{}' genome attributes...".format(genome))
        # a specific asset requested for this genome, or all of its assets
        asset = asset_list[idx] if asset_list is not None else None
        assets = asset or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()
        if not assets:
            _LOGGER.error("No assets found")
            continue
        else:
            _LOGGER.debug("Assets to be processed: {}".format(str(assets)))
        # a single requested name is wrapped; any other collection of names
        # (list or a keys view) is iterated element by element
        asset_names = [assets] if isinstance(assets, str) else list(assets)
        for asset_name in asset_names:
            asset_desc = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DESC_KEY, DESC_PLACEHOLDER)
            default_tag = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DEFAULT_TAG_KEY, DEFAULT_TAG)
            asset_attrs = {
                CFG_ASSET_DESC_KEY: asset_desc,
                CFG_ASSET_DEFAULT_TAG_KEY: default_tag
            }
            _LOGGER.debug("Updating '{}/{}' asset attributes...".format(
                genome, asset_name))
            with rgc_server as r:
                r.update_assets(genome, asset_name, asset_attrs)

            # a specific tag requested for this asset, or all of its tags
            tag = tag_list[idx] if tag_list is not None else None
            tags = tag or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name][CFG_ASSET_TAGS_KEY].keys()
            tag_names = [tags] if isinstance(tags, str) else list(tags)
            for tag_name in tag_names:
                if not rgc.is_asset_complete(genome, asset_name, tag_name):
                    raise MissingConfigDataError(
                        "Asset '{}/{}:{}' is incomplete. This probably means an"
                        " attempt to archive a partially pulled parent. "
                        "refgenieserver archive requires all assets to be built"
                        " prior to archiving.".format(genome, asset_name,
                                                      tag_name))
                file_name = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                        CFG_ASSET_PATH_KEY]
                target_file = os.path.join(
                    target_dir, "{}__{}".format(asset_name, tag_name) + ".tgz")
                input_file = os.path.join(genome_dir, file_name, tag_name)
                # these attributes have to be read from the original RefGenConf in case the archiver just increments
                # an existing server RefGenConf
                parents = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_PARENTS_KEY, [])
                children = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHILDREN_KEY, [])
                seek_keys = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_SEEK_KEYS_KEY, {})
                asset_digest = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHECKSUM_KEY, None)
                if not os.path.exists(target_file) or force:
                    _LOGGER.info(
                        "Creating archive '{}' from '{}' asset".format(
                            target_file, input_file))
                    try:
                        _check_tgz(input_file, target_file, asset_name)
                        _copy_recipe(input_file, target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, target_dir, asset_name, tag_name)
                    except OSError as e:
                        # a failure for one tag is reported and the tag is
                        # skipped; the remaining tags are still processed
                        _LOGGER.warning(e)
                        continue
                    else:
                        _LOGGER.info(
                            "Updating '{}/{}:{}' tag attributes...".format(
                                genome, asset_name, tag_name))
                        tag_attrs = {
                            CFG_ASSET_PATH_KEY: file_name,
                            CFG_SEEK_KEYS_KEY: seek_keys,
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file),
                            CFG_ARCHIVE_SIZE_KEY: size(target_file),
                            CFG_ASSET_SIZE_KEY: size(input_file),
                            CFG_ASSET_PARENTS_KEY: parents,
                            CFG_ASSET_CHILDREN_KEY: children,
                            CFG_ASSET_CHECKSUM_KEY: asset_digest
                        }
                        _LOGGER.debug("attr dict: {}".format(tag_attrs))
                        with rgc_server as r:
                            for parent in parents:
                                # here we update any pre-existing parents' children attr with the newly added asset
                                _LOGGER.debug(
                                    "Updating {} children list with {}".format(
                                        parent, "{}/{}:{}".format(
                                            genome, asset_name, tag_name)))
                                rp = parse_registry_path(parent)
                                parent_genome = rp["namespace"]
                                parent_asset = rp["item"]
                                parent_tag = rp["tag"]
                                try:
                                    r.get_asset(parent_genome, parent_asset,
                                                parent_tag)
                                except RefgenconfError:
                                    _LOGGER.warning(
                                        "'{}/{}:{}'s parent '{}' does not exist, "
                                        "skipping relationship updates".format(
                                            genome, asset_name, tag_name,
                                            parent))
                                    continue
                                r.update_relatives_assets(
                                    parent_genome,
                                    parent_asset,
                                    parent_tag, [
                                        "{}/{}:{}".format(
                                            genome, asset_name, tag_name)
                                    ],
                                    children=True)
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
                else:
                    _LOGGER.debug("'{}' exists".format(target_file))
    with rgc_server as r:
        _purge_nonservable(r)
        _LOGGER.info(
            "Builder finished; server config file saved to: '{}'".format(
                r.write(server_rgc_path)))
Example #2
0
def archive(rgc, registry_paths, force, remove, cfg_path, genomes_desc):
    """
    Build servable tar archives for the assets described by a RefGenConf object.

    The archives can then be served with 'refgenieserver serve'. For every
    selected genome/asset:tag an archive is created in the directory stored
    under the archive key of `rgc` (skipped when the archive file already
    exists, unless `force` is set). The archive checksum, archive size and
    asset size are recorded in a separate server config. If that server config
    file already exists it is loaded and updated, so that archive metadata
    recorded by earlier runs is not lost; otherwise it is initialized from the
    original config. Genomes are processed by their digests; a legacy,
    alias-named archive directory is populated alongside the digest-named one.

    Note that this function terminates the process with `sys.exit` on usage
    errors (nothing to remove, no genomes, missing descriptions file) and
    after a successful removal.

    :param RefGenConf rgc: configuration object with the data to build the servable archives for
    :param list[dict] registry_paths: a collection of mappings that identifies the assets to update;
        the 'namespace', 'item' and 'tag' elements are used. If empty, all genomes in `rgc` are processed
    :param bool force: whether to force the build of archive, regardless of its existence
    :param bool remove: whether remove specified genome/asset:tag from the archive
    :param str cfg_path: config file path
    :param str genomes_desc: path to a CSV file with genome descriptions (first column: genome,
        second column: description), or None to use the descriptions stored in `rgc`
    :raise ConfigNotCompliantError: if the config version is lower than the required one
    :raise GenomeConfigFormatError: if the archive location can't be determined from the config
    :raise OSError: if the determined server config file exists but is not writable
    :raise MissingConfigDataError: if an asset selected for archiving is incomplete
    """
    if float(rgc[CFG_VERSION_KEY]) < float(REQ_CFG_VERSION):
        raise ConfigNotCompliantError(
            f"You need to update the genome config to v{REQ_CFG_VERSION} in order to use the archiver. "
            f"The required version can be generated with refgenie >= {REFGENIE_BY_CFG[REQ_CFG_VERSION]}"
        )
    # determine the server config path: an explicit entry in the config wins
    # (relative paths are resolved against the config file location),
    # otherwise it is placed in the archive directory under the config file name
    if CFG_ARCHIVE_CONFIG_KEY in rgc:
        srp = rgc[CFG_ARCHIVE_CONFIG_KEY]
        server_rgc_path = (srp if os.path.isabs(srp) else os.path.join(
            os.path.dirname(rgc.file_path), srp))
    else:
        try:
            server_rgc_path = os.path.join(rgc[CFG_ARCHIVE_KEY],
                                           os.path.basename(cfg_path))
        except KeyError:
            raise GenomeConfigFormatError(
                f"The config '{cfg_path}' is missing a {' or '.join([CFG_ARCHIVE_KEY, CFG_ARCHIVE_KEY_OLD])} entry. "
                f"Can't determine the desired archive.")
    if os.path.isfile(server_rgc_path) and not os.access(
            server_rgc_path, os.W_OK):
        raise OSError(
            "The determined archive config path is not writable: {}".format(
                server_rgc_path))
    if force:
        _LOGGER.info("Build forced; file existence will be ignored")
    _LOGGER.debug("Registry_paths: {}".format(registry_paths))
    # original RefGenConf has been created in read-only mode,
    # make it RW compatible and point to new target path for server use or initialize a new object
    if os.path.exists(server_rgc_path):
        _LOGGER.debug(
            f"'{server_rgc_path}' file was found and will be updated")
        rgc_server = RefGenConf(filepath=server_rgc_path)
        if remove:
            if not registry_paths:
                _LOGGER.error("To remove archives you have to specify them. "
                              "Use 'asset_registry_path' argument.")
                # sys.exit is used throughout: the bare 'exit' builtin is
                # injected by the 'site' module and is not guaranteed to exist
                sys.exit(1)
            with rgc_server as r:
                _remove_archive(r, registry_paths, CFG_ARCHIVE_KEY)
            sys.exit(0)
    else:
        if remove:
            _LOGGER.error(
                "You can't remove archives since the genome_archive path does not exist yet."
            )
            sys.exit(1)
        _LOGGER.debug(
            f"'{server_rgc_path}' file was not found and will be created")
        rgc_server = RefGenConf(filepath=rgc.file_path)
        rgc_server.make_writable(filepath=server_rgc_path)
        rgc_server.make_readonly()
    # the three lists below are index-aligned: element i of each one
    # comes from the i-th registry path
    if registry_paths:
        genomes = _get_paths_element(registry_paths, "namespace")
        asset_list = _get_paths_element(registry_paths, "item")
        tag_list = _get_paths_element(registry_paths, "tag")
    else:
        genomes = rgc.genomes_list()
        asset_list, tag_list = None, None
    if not genomes:
        _LOGGER.error("No genomes found")
        sys.exit(1)
    else:
        _LOGGER.debug(f"Genomes to be processed: {str(genomes)}")
    # from here on genomes are identified by their digests (order is kept,
    # so the alignment with asset_list/tag_list is preserved)
    genomes = [rgc.get_genome_alias_digest(g) for g in genomes]
    if genomes_desc is not None:
        if os.path.exists(genomes_desc):
            import csv

            _LOGGER.info(
                f"Found a genomes descriptions CSV file: {genomes_desc}")
            # newline='' is what the csv module requires for correct handling
            # of newlines embedded in quoted fields
            with open(genomes_desc, mode="r", newline="") as infile:
                reader = csv.reader(infile)
                # blank lines are yielded as empty rows; skip these and any
                # row without a description column instead of failing
                descs = {row[0]: row[1] for row in reader if len(row) > 1}
        else:
            _LOGGER.error(
                f"Genomes descriptions CSV file does not exist: {genomes_desc}"
            )
            sys.exit(1)
    # the position is taken from enumerate rather than a manually incremented
    # counter, so that skipping a genome ('continue' below) can't shift the
    # asset/tag selection of the genomes that follow
    for idx, genome in enumerate(genomes):
        genome_dir = os.path.join(rgc.data_dir, genome)
        target_dir = os.path.join(rgc[CFG_ARCHIVE_KEY], genome)
        alias_target_dir = os.path.join(
            rgc[CFG_ARCHIVE_KEY],
            rgc.get_genome_alias(digest=genome, fallback=True))
        os.makedirs(target_dir, exist_ok=True)
        # create legacy directory for archive; done regardless of whether the
        # digest-named directory already existed, since the recipe and log
        # are copied into it below
        # TODO: remove in the future
        os.makedirs(alias_target_dir, exist_ok=True)
        # 'descs' exists only when a descriptions file was provided,
        # the first part of the condition guards the lookup
        # NOTE(review): 'genome' is a digest at this point, so the CSV rows
        # presumably need to be keyed by digest to match -- confirm
        genome_desc = (rgc[CFG_GENOMES_KEY][genome].setdefault(
            CFG_GENOME_DESC_KEY, DESC_PLACEHOLDER) if genomes_desc is None
                       or genome not in descs else descs[genome])
        genome_aliases = rgc[CFG_GENOMES_KEY][genome].setdefault(
            CFG_ALIASES_KEY, [])
        genome_attrs = {
            CFG_GENOME_DESC_KEY: genome_desc,
            CFG_ALIASES_KEY: genome_aliases,
        }
        with rgc_server as r:
            r[CFG_GENOMES_KEY].setdefault(genome, PXAM())
            r[CFG_GENOMES_KEY][genome].update(genome_attrs)
        _LOGGER.debug(f"Updating '{genome}' genome attributes...")
        # a specific asset requested for this genome, or all of its assets
        asset = asset_list[idx] if asset_list is not None else None
        assets = asset or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()
        if not assets:
            _LOGGER.error("No assets found")
            continue
        else:
            _LOGGER.debug(f"Assets to be processed: {str(assets)}")
        # a single requested name is wrapped; any other collection of names
        # (list or a keys view) is iterated element by element
        asset_names = [assets] if isinstance(assets, str) else list(assets)
        for asset_name in asset_names:
            asset_desc = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name].setdefault(CFG_ASSET_DESC_KEY, DESC_PLACEHOLDER)
            default_tag = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name].setdefault(CFG_ASSET_DEFAULT_TAG_KEY, DEFAULT_TAG)
            asset_attrs = {
                CFG_ASSET_DESC_KEY: asset_desc,
                CFG_ASSET_DEFAULT_TAG_KEY: default_tag,
            }
            _LOGGER.debug(
                f"Updating '{genome}/{asset_name}' asset attributes...")
            with rgc_server as r:
                r.update_assets(genome, asset_name, asset_attrs)

            # a specific tag requested for this asset, or all of its tags
            tag = tag_list[idx] if tag_list is not None else None
            tags = (tag or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY]
                    [asset_name][CFG_ASSET_TAGS_KEY].keys())
            tag_names = [tags] if isinstance(tags, str) else list(tags)
            for tag_name in tag_names:
                if not rgc.is_asset_complete(genome, asset_name, tag_name):
                    raise MissingConfigDataError(
                        f"Asset '{genome}/{asset_name}:{tag_name}' is incomplete. "
                        f"This probably means an attempt to archive a partially "
                        f"pulled parent. refgenieserver archive requires all assets to "
                        f"be built prior to archiving.")
                file_name = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                        CFG_ASSET_PATH_KEY]
                target_file_core = os.path.join(target_dir,
                                                f"{asset_name}__{tag_name}")
                target_file = f"{target_file_core}.tgz"
                input_file = os.path.join(genome_dir, file_name, tag_name)
                # these attributes have to be read from the original RefGenConf in case the archiver just increments
                # an existing server RefGenConf
                parents = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_PARENTS_KEY, [])
                children = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_CHILDREN_KEY, [])
                seek_keys = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_SEEK_KEYS_KEY, {})
                asset_digest = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name].setdefault(
                        CFG_ASSET_CHECKSUM_KEY, None)
                if not os.path.exists(target_file) or force:
                    _LOGGER.info(
                        f"Creating archive '{target_file}' from '{input_file}' asset"
                    )
                    try:
                        _copy_asset_dir(input_file, target_file_core)
                        _get_asset_dir_contents(target_file_core, asset_name,
                                                tag_name)
                        _check_tgz(input_file, target_file)
                        _copy_recipe(input_file, target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, target_dir, asset_name, tag_name)
                        # TODO: remove the rest of this try block in the future
                        _check_tgz_legacy(
                            input_file,
                            target_file,
                            asset_name,
                            rgc.get_genome_alias_digest(alias=genome,
                                                        fallback=True),
                            rgc.get_genome_alias(digest=genome, fallback=True),
                        )
                        _copy_recipe(input_file, alias_target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, alias_target_dir, asset_name,
                                  tag_name)
                    except OSError as e:
                        # a failure for one tag is reported and the tag is
                        # skipped; the remaining tags are still processed
                        _LOGGER.warning(e)
                        continue
                    else:
                        _LOGGER.info(
                            f"Updating '{genome}/{asset_name}:{tag_name}' tag attributes"
                        )
                        tag_attrs = {
                            CFG_ASSET_PATH_KEY: file_name,
                            CFG_SEEK_KEYS_KEY: seek_keys,
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file),
                            CFG_ARCHIVE_SIZE_KEY: size(target_file),
                            CFG_ASSET_SIZE_KEY: size(input_file),
                            CFG_ASSET_PARENTS_KEY: parents,
                            CFG_ASSET_CHILDREN_KEY: children,
                            CFG_ASSET_CHECKSUM_KEY: asset_digest,
                        }
                        # TODO: legacy checksum generation and tag dictionary
                        #  update to be removed in the future
                        legacy_digest = checksum(
                            replace_str_in_obj(
                                target_file,
                                x=rgc.get_genome_alias_digest(alias=genome,
                                                              fallback=True),
                                y=rgc.get_genome_alias(digest=genome,
                                                       fallback=True),
                            ))
                        tag_attrs.update(
                            {CFG_LEGACY_ARCHIVE_CHECKSUM_KEY: legacy_digest})
                        _LOGGER.debug(f"attr dict: {tag_attrs}")
                        with rgc_server as r:
                            for parent in parents:
                                # here we update any pre-existing parents' children
                                # attr with the newly added asset
                                _LOGGER.debug(
                                    f"Updating {parent} children list with "
                                    f"{genome}/{asset_name}:{tag_name}")
                                rp = parse_registry_path(parent)
                                parent_genome = rp["namespace"]
                                parent_asset = rp["item"]
                                parent_tag = rp["tag"]
                                try:
                                    r.seek(
                                        parent_genome,
                                        parent_asset,
                                        parent_tag,
                                        strict_exists=True,
                                    )
                                except RefgenconfError:
                                    _LOGGER.warning(
                                        f"'{genome}/{asset_name}:{tag_name}'s parent "
                                        f"'{parent}' does not exist, skipping relationship updates"
                                    )
                                    continue
                                r.update_relatives_assets(
                                    parent_genome,
                                    parent_asset,
                                    parent_tag,
                                    [f"{genome}/{asset_name}:{tag_name}"],
                                    children=True,
                                )
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
                else:
                    # the archive is already there; only make sure the server
                    # config has its checksum recorded
                    exists_msg = f"'{target_file}' exists."
                    try:
                        rgc_server[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                            asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                                CFG_ARCHIVE_CHECKSUM_KEY]
                        _LOGGER.debug(exists_msg + " Skipping")
                    except KeyError:
                        _LOGGER.debug(exists_msg +
                                      " Calculating archive digest")
                        tag_attrs = {
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file)
                        }
                        with rgc_server as r:
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)

    _LOGGER.info(
        f"Builder finished; server config file saved: {rgc_server.file_path}")