Example #1
    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then update the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        """
        if args.map:
            # Performing a build map step.
            # The reduce step will need to be performed to get the built
            # asset metadata to the master config file
            genome_alias = rgc.get_genome_alias(digest=genome)
            # create an empty config file in the genome directory
            _LOGGER.info(f"Using new map genome config: {locked_map_gencfg}")
            make_sure_path_exists(os.path.dirname(locked_map_gencfg))
            open(locked_map_gencfg, "a").close()
            # initialize a new RefGenConf;
            # use the master location for data storage, but point the config
            # path to the map config file created above
            rgc_map = RefGenConf(
                entries={"genome_folder": rgc.genome_folder},
                filepath=locked_map_gencfg,
            )
            # set the alias first (if available), based on the master file

            rgc_map.set_genome_alias(
                digest=genome,
                genome=genome_alias,
                create_genome=True,
            )

            # copy the genome of interest section to the new RefGenConf,
            # so that possible dependencies can be satisfied
            rgc_map.update_genomes(
                genome=genome_alias,
                data=rgc[CFG_GENOMES_KEY][genome],
            )

        else:
            rgc_map = rgc

        _LOGGER.info(
            f"Saving outputs to:{block_iter_repr(['content: ' + genome_outfolder, 'logs: ' + build_stats_dir])}"
        )
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # args.volumes is assumed to be a list of user-provided volumes;
                # list.append() returns None, so don't re-assign its return value
                volumes = args.volumes + [genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                f"Insufficient permissions to write to output folder: {genome_outfolder}"
            )
            return False, rgc_map

        pm = pypiper.PipelineManager(name=PKG_NAME,
                                     outfolder=build_stats_dir,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys,
        # since these are not supported by str.format()
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(build_stats_dir,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False, rgc_map
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(build_stats_dir, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc_map.data_dir, *gat)
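            # e.g. <data_dir>/<genome>/<asset_key>/<tag>,
            # since gat == [genome, asset_key, tag]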
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info(f"Asset digest: {digest}")
            # add a 'dir' seek_key that points to the asset directory
            build_pkg[ASSETS].update({"dir": "."})
            # add updates to config file
            with rgc_map as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True, rgc_map
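
A minimal, self-contained sketch of the command-template population step used in _build_asset above; every name, path, and command is hypothetical and not taken from a real refgenie recipe:

asset_vars = {
    "genome": "mm10",                                  # hypothetical assembly
    "asset_outfolder": "/data/mm10/my_index/my_tag",   # hypothetical output dir
    "fasta.fasta": "/data/mm10/fasta/my_tag/mm10.fa",  # key carrying a seek_key suffix
}
cmd_templates = ["my-indexer {fasta} {asset_outfolder}/{genome}"]

# str.format() cannot resolve dotted placeholders, so each key is truncated
# at the first '.' before the templates are populated
populated = [
    t.format(**{k.split(".")[0]: v for k, v in asset_vars.items()})
    for t in cmd_templates
]
print(populated[0])
# my-indexer /data/mm10/fasta/my_tag/mm10.fa /data/mm10/my_index/my_tag/mm10
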
Example #2
def archive(rgc, registry_paths, force, remove, cfg_path, genomes_desc):
    """
    Takes the RefGenConf object and builds individual tar archives
    that can be then served with 'refgenieserver serve'. Additionally determines their md5 checksums, file sizes and
    updates the original refgenie config with these data. If the --asset and/or --genome options  are used (specific
    build is requested) the archiver will check for the existence of config file saved in the path provided in
    `genome_server` in the original config and update it so that no archive metadata is lost

    :param RefGenConf rgc: configuration object with the data to build the servable archives for
    :param list[dict] registry_paths: a collection of mappings that identifies the assets to update
    :param bool force: whether to force the build of archive, regardless of its existence
    :param bool remove: whether remove specified genome/asset:tag from the archive
    :param str cfg_path: config file path
    """
    if float(rgc[CFG_VERSION_KEY]) < float(REQ_CFG_VERSION):
        raise ConfigNotCompliantError(
            "You need to update the genome config to v{} in order to use the archiver. "
            "The required version can be generated with refgenie >= {}".format(
                REQ_CFG_VERSION, REFGENIE_BY_CFG[REQ_CFG_VERSION]))
    if CFG_ARCHIVE_CONFIG_KEY in rgc:
        srp = rgc[CFG_ARCHIVE_CONFIG_KEY]
        server_rgc_path = srp if os.path.isabs(srp) \
            else os.path.join(os.path.dirname(rgc.file_path), srp)
    else:
        try:
            server_rgc_path = os.path.join(rgc[CFG_ARCHIVE_KEY],
                                           os.path.basename(cfg_path))
        except KeyError:
            raise GenomeConfigFormatError(
                "The config '{}' is missing a {} entry. Can't determine the desired archive."
                .format(cfg_path,
                        " or ".join([CFG_ARCHIVE_KEY, CFG_ARCHIVE_KEY_OLD])))
    if os.path.isfile(server_rgc_path) and not os.access(
            server_rgc_path, os.W_OK):
        raise OSError(
            "The determined archive config path is not writable: {}".format(
                server_rgc_path))
    if force:
        _LOGGER.info("Build forced; file existence will be ignored")
        if os.path.exists(server_rgc_path):
            _LOGGER.debug("'{}' file was found and will be updated".format(
                server_rgc_path))
    _LOGGER.debug("Registry_paths: {}".format(registry_paths))

    # The original RefGenConf has been created in read-only mode;
    # either load the existing server config to update it, or initialize a new
    # object from the original config and point it at the server target path
    if os.path.exists(server_rgc_path):
        rgc_server = RefGenConf(filepath=server_rgc_path)
        if remove:
            if not registry_paths:
                _LOGGER.error(
                    "To remove archives you have to specify them. Use 'asset_registry_path' argument."
                )
                exit(1)
            with rgc_server as r:
                _remove_archive(r, registry_paths, CFG_ARCHIVE_KEY)
            exit(0)
    else:
        if remove:
            _LOGGER.error(
                "You can't remove archives since the genome_archive path does not exist yet."
            )
            exit(1)
        rgc_server = RefGenConf(filepath=rgc.file_path)
        rgc_server.make_writable(filepath=server_rgc_path)
        rgc_server.make_readonly()
    if registry_paths:
        genomes = _get_paths_element(registry_paths, "namespace")
        asset_list = _get_paths_element(registry_paths, "item")
        tag_list = _get_paths_element(registry_paths, "tag")
    else:
        genomes = rgc.genomes_list()
        asset_list, tag_list = None, None
    if not genomes:
        _LOGGER.error("No genomes found")
        exit(1)
    else:
        _LOGGER.debug("Genomes to be processed: {}".format(str(genomes)))
    if genomes_desc is not None:
        if os.path.exists(genomes_desc):
            import csv
            _LOGGER.info("Found a genomes descriptions CSV file: {}".format(
                genomes_desc))
            with open(genomes_desc, mode='r') as infile:
                reader = csv.reader(infile)
                descs = {rows[0]: rows[1] for rows in reader}
        else:
            _LOGGER.error(
                "Genomes descriptions CSV file does not exist: {}".format(
                    genomes_desc))
            sys.exit(1)
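    # The genome descriptions CSV read above is assumed to hold two columns per
    # row, e.g. "hg38,Reference assembly GRCh38" (hypothetical), which the
    # reader turns into a {genome: description} mapping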
    counter = 0
    for genome in genomes:
        genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome)
        target_dir = os.path.join(rgc[CFG_ARCHIVE_KEY], genome)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        if genomes_desc is not None and genome in descs:
            genome_desc = descs[genome]
        else:
            genome_desc = rgc[CFG_GENOMES_KEY][genome].setdefault(CFG_GENOME_DESC_KEY, DESC_PLACEHOLDER)
        genome_checksum = rgc[CFG_GENOMES_KEY][genome].\
            setdefault(CFG_CHECKSUM_KEY, CHECKSUM_PLACEHOLDER)
        genome_attrs = {
            CFG_GENOME_DESC_KEY: genome_desc,
            CFG_CHECKSUM_KEY: genome_checksum
        }
        rgc_server.update_genomes(genome, genome_attrs)
        _LOGGER.debug("Updating '{}' genome attributes...".format(genome))
        asset = asset_list[counter] if asset_list is not None else None
        assets = asset or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY].keys()
        if not assets:
            _LOGGER.error("No assets found")
            continue
        else:
            _LOGGER.debug("Assets to be processed: {}".format(str(assets)))
        for asset_name in assets if isinstance(assets, list) else [assets]:
            asset_desc = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DESC_KEY, DESC_PLACEHOLDER)
            default_tag = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name]\
                .setdefault(CFG_ASSET_DEFAULT_TAG_KEY, DEFAULT_TAG)
            asset_attrs = {
                CFG_ASSET_DESC_KEY: asset_desc,
                CFG_ASSET_DEFAULT_TAG_KEY: default_tag
            }
            _LOGGER.debug("Updating '{}/{}' asset attributes...".format(
                genome, asset_name))
            with rgc_server as r:
                r.update_assets(genome, asset_name, asset_attrs)

            tag = tag_list[counter] if tag_list is not None else None
            tags = tag or rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                asset_name][CFG_ASSET_TAGS_KEY].keys()
            for tag_name in tags if isinstance(tags, list) else [tags]:
                if not rgc.is_asset_complete(genome, asset_name, tag_name):
                    raise MissingConfigDataError(
                        "Asset '{}/{}:{}' is incomplete. This probably means an"
                        " attempt to archive a partially pulled parent. "
                        "refgenieserver archive requires all assets to be built"
                        " prior to archiving.".format(genome, asset_name,
                                                      tag_name))
                file_name = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][
                    asset_name][CFG_ASSET_TAGS_KEY][tag_name][
                        CFG_ASSET_PATH_KEY]
                target_file = os.path.join(
                    target_dir, "{}__{}".format(asset_name, tag_name) + ".tgz")
                input_file = os.path.join(genome_dir, file_name, tag_name)
                # these attributes have to be read from the original RefGenConf
                # in case the archiver is just updating an existing server RefGenConf
                parents = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_PARENTS_KEY, [])
                children = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHILDREN_KEY, [])
                seek_keys = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_SEEK_KEYS_KEY, {})
                asset_digest = rgc[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset_name][CFG_ASSET_TAGS_KEY][tag_name]. \
                    setdefault(CFG_ASSET_CHECKSUM_KEY, None)
                if not os.path.exists(target_file) or force:
                    _LOGGER.info(
                        "Creating archive '{}' from '{}' asset".format(
                            target_file, input_file))
                    try:
                        _check_tgz(input_file, target_file, asset_name)
                        _copy_recipe(input_file, target_dir, asset_name,
                                     tag_name)
                        _copy_log(input_file, target_dir, asset_name, tag_name)
                    except OSError as e:
                        _LOGGER.warning(e)
                        continue
                    else:
                        _LOGGER.info(
                            "Updating '{}/{}:{}' tag attributes...".format(
                                genome, asset_name, tag_name))
                        tag_attrs = {
                            CFG_ASSET_PATH_KEY: file_name,
                            CFG_SEEK_KEYS_KEY: seek_keys,
                            CFG_ARCHIVE_CHECKSUM_KEY: checksum(target_file),
                            CFG_ARCHIVE_SIZE_KEY: size(target_file),
                            CFG_ASSET_SIZE_KEY: size(input_file),
                            CFG_ASSET_PARENTS_KEY: parents,
                            CFG_ASSET_CHILDREN_KEY: children,
                            CFG_ASSET_CHECKSUM_KEY: asset_digest
                        }
                        _LOGGER.debug("attr dict: {}".format(tag_attrs))
                        with rgc_server as r:
                            for parent in parents:
                                # here we update any pre-existing parents' children attr with the newly added asset
                                _LOGGER.debug(
                                    "Updating {} children list with {}".format(
                                        parent, "{}/{}:{}".format(
                                            genome, asset_name, tag_name)))
                                rp = parse_registry_path(parent)
                                parent_genome = rp["namespace"]
                                parent_asset = rp["item"]
                                parent_tag = rp["tag"]
                                try:
                                    r.get_asset(parent_genome, parent_asset,
                                                parent_tag)
                                except RefgenconfError:
                                    _LOGGER.warning(
                                        "'{}/{}:{}'s parent '{}' does not exist, "
                                        "skipping relationship updates".format(
                                            genome, asset_name, tag_name,
                                            parent))
                                    continue
                                r.update_relatives_assets(
                                    parent_genome,
                                    parent_asset,
                                    parent_tag, [
                                        "{}/{}:{}".format(
                                            genome, asset_name, tag_name)
                                    ],
                                    children=True)
                            r.update_tags(genome, asset_name, tag_name,
                                          tag_attrs)
                else:
                    _LOGGER.debug("'{}' exists".format(target_file))
        counter += 1
    with rgc_server as r:
        _purge_nonservable(r)
        _LOGGER.info(
            "Builder finished; server config file saved to: '{}'".format(
                r.write(server_rgc_path)))
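
A minimal sketch of how the per-asset archive path assembled above is composed; the archive folder and registry values are hypothetical, with the real ones coming from the RefGenConf object and the registry paths being processed:

import os

genome_archive = "/srv/genome_archive"  # hypothetical stand-in for rgc[CFG_ARCHIVE_KEY]
genome, asset_name, tag_name = "hg38", "fasta", "default"

target_dir = os.path.join(genome_archive, genome)
target_file = os.path.join(target_dir, "{}__{}".format(asset_name, tag_name) + ".tgz")
print(target_file)  # /srv/genome_archive/hg38/fasta__default.tgz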