Example #1
0
def refgenie_init(genome_config_path, genome_server=DEFAULT_SERVER):
    """
    Create a new genome configuration file populated with default settings.

    The genome folder entry is set to the directory containing the config
    file and the genomes section is left empty. A path that already exists is
    not overwritten; a warning is logged instead.

    :param str genome_config_path: path to genome configuration file to
        create/initialize
    :param str genome_server: URL for a server
    """
    config_dir = os.path.dirname(os.path.abspath(genome_config_path))
    defaults = OrderedDict({
        CFG_FOLDER_KEY: config_dir,
        CFG_SERVER_KEY: genome_server,
        CFG_GENOMES_KEY: None,
    })
    rgc = RefGenConf(defaults)

    _LOGGER.debug("RGC: {}".format(rgc))

    # Nothing is written when no path was given or the target already exists.
    if not genome_config_path or os.path.exists(genome_config_path):
        _LOGGER.warning(
            "Can't initialize, file exists: {} ".format(genome_config_path))
        return

    rgc.write(genome_config_path)
    _LOGGER.info("Wrote new refgenie genome configuration file: {}".format(
        genome_config_path))
 def test_genome_folder_is_pwd_if_no_folder_key_and_raw_entries_passed(
         self, ro_rgc):
     """Raw entries lacking the folder key yield the working directory as genome folder."""
     entries_without_folder = {}
     for key, value in ro_rgc.items():
         if key != CFG_FOLDER_KEY:
             entries_without_folder[key] = value
     new_rgc = RefGenConf(entries=PathExAttMap(entries_without_folder))
     assert os.getcwd() == new_rgc[CFG_FOLDER_KEY]
Example #3
0
def main():
    """
    Command-line entry point: archive assets or start the API server.

    Parses the CLI arguments, configures the package logger, selects the
    genome configuration file and binds a read-only ``RefGenConf`` to the
    module-level ``rgc`` before dispatching on the requested subcommand.

    :raise AssertionError: if no genome configuration file could be selected
    """
    global rgc, _LOGGER
    parser = build_parser()
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        print("No subcommand given")
        sys.exit(1)

    # level 5 is only requested in debug mode; otherwise the logger default is used
    logger_args = dict(name=PKG_NAME, fmt=LOG_FORMAT)
    if args.debug:
        logger_args["level"] = 5
    _LOGGER = logmuse.setup_logger(**logger_args)

    selected_cfg = select_genome_config(args.config)
    if selected_cfg is None:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "You must provide a config file or set the {} environment variable"
            .format(" or ".join(CFG_ENV_VARS)))

    # this RefGenConf object will be used in the server, so it's read-only
    rgc = RefGenConf(filepath=selected_cfg, writable=False)
    if args.command == "archive":
        arp = ([parse_registry_path(x) for x in args.asset_registry_paths]
               if args.asset_registry_paths is not None else None)
        archive(rgc, arp, args.force, args.remove, selected_cfg,
                args.genomes_desc)
    elif args.command == "serve":
        # the router imports need to be after the RefGenConf object is declared
        with rgc as r:
            purge_nonservable(r)
        from .routers import private, version1, version2, version3

        # version3 is mounted twice: once without a prefix and once under /v3
        app.include_router(version3.router)
        app.include_router(version1.router, prefix="/v1")
        app.include_router(version2.router, prefix="/v2")
        app.include_router(version3.router, prefix="/v3")
        app.include_router(private.router, prefix=f"/{PRIVATE_API}")
        uvicorn.run(app, host="0.0.0.0", port=args.port)
Example #4
0
 def test_init_success(self):
     """Initializing a config file at a fresh path creates that file on disk."""
     rgc = RefGenConf()
     # let tempfile pick the platform's temp location instead of hard-coding /tmp
     dirpath = tempfile.mkdtemp()
     try:
         cfg_file_path = os.path.join(dirpath, "test.yaml")
         rgc.initialize_config_file(filepath=cfg_file_path)
         assert os.path.exists(cfg_file_path)
     finally:
         # remove the directory even when the assertion above fails
         shutil.rmtree(dirpath)
Example #5
0
 def test_force_overwrite_asset(self, cfg_file, gname, aname, tname):
     """Adding the same asset twice returns a truthy result when forced."""
     rgc = RefGenConf(filepath=cfg_file)
     fasta_dir = rgc.seek(
         genome_name=gname,
         asset_name="fasta",
         tag_name=tname,
         enclosing_dir=True,
     )
     # the second forced addition targets the asset added by the first one
     for _ in range(2):
         assert rgc.add(fasta_dir, gname, aname, tname, force=True)
Example #6
0
def test_pull_asset_works_with_nonwritable_and_writable_rgc(
        cfg_file, gname, aname, tname, state):
    """Pull an asset, after removing it first, with ``writable`` set to ``state``."""
    rgc = RefGenConf(filepath=cfg_file, writable=state)
    remove_asset_and_file(rgc, gname, aname, tname)
    banner = "\nPulling; genome: {}, asset: {}, tag: {}\n".format(
        gname, aname, tname)
    # answer any yes/no prompt affirmatively while pulling
    confirm_patch = mock.patch("refgenconf.refgenconf.query_yes_no",
                               return_value=True)
    with confirm_patch:
        print(banner)
        rgc.pull_asset(gname, aname, tname)
Example #7
0
def test_pull_asset_updates_genome_config(cfg_file, gname, aname, tname):
    """
    Two config objects that are identical before an asset pull must differ
    afterwards, and a config freshly read from the file must be able to seek
    the pulled asset.
    """
    ori_rgc = RefGenConf(filepath=cfg_file, writable=False)
    rgc = RefGenConf(filepath=cfg_file, writable=False)
    for conf in (rgc, ori_rgc):
        remove_asset_and_file(conf, gname, aname, tname)
    assert ori_rgc.to_dict() == rgc.to_dict()

    confirm_patch = mock.patch("refgenconf.refgenconf.query_yes_no",
                               return_value=True)
    with confirm_patch:
        print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(gname, aname, tname))
        rgc.pull(gname, aname, tname)
    assert not ori_rgc.to_dict() == rgc.to_dict()

    # re-read the file to check what was persisted
    post_rgc = RefGenConf(filepath=cfg_file, writable=False)
    located = post_rgc.seek(gname, aname, tname)
    assert isinstance(located, str)
Example #8
0
 def test_prelist_plugins_called(self, cfg_file):
     """Check the flag path after ``list()`` with mocked plugins, then delete that file."""
     plugins_patch = mock.patch(
         "refgenconf.refgenconf.RefGenConf.plugins",
         new_callable=mock.PropertyMock,
         return_value=PLUGINS_DICT,
     )
     with plugins_patch:
         rgc = RefGenConf(cfg_file, writable=False)
         rgc.list()
         assert get_flag_pth(rgc)
     # clean up the flag file and verify it is gone
     os.remove(get_flag_pth(rgc))
     assert not os.path.exists(get_flag_pth(rgc))
Example #9
0
 def test_correct_namespaces(self, namespaces, genome, cfg_file):
     """Every local asset of ``genome`` shows up in the populated refgenie namespace."""
     namespaces["pipeline"]["var_templates"]["refgenie_config"] = cfg_file
     ret = looper_refgenie_populate(namespaces=namespaces)
     assert "refgenie" in ret
     rgc = RefGenConf(filepath=cfg_file)
     local_assets = rgc.list_assets_by_genome(genome=genome)
     assert all(
         asset in ret["refgenie"][genome].keys() for asset in local_assets)
Example #10
0
 def test_seekr(self, remote_class, servers, reset, genome, asset):
     """A remote seek on a config created without a file returns a string."""
     unbound_rgc = RefGenConf()
     unbound_rgc.subscribe(servers, no_write=True, reset=reset)
     remote_path = unbound_rgc.seekr(genome_name=genome,
                                     asset_name=asset,
                                     remote_class=remote_class)
     # the isinstance() result used to be discarded, so nothing was verified
     assert isinstance(remote_path, str)
Example #11
0
 def test_cant_add_without_digest_set_first(self, cfg_file, gname, aname,
                                            tname):
     """Adding under an altered genome name (``gname + "_new"``) must return a falsy value."""
     rgc = RefGenConf(filepath=cfg_file)
     fasta_dir = rgc.seek(
         genome_name=gname,
         asset_name="fasta",
         tag_name=tname,
         enclosing_dir=True,
     )
     altered_genome = gname + "_new"
     added = rgc.add(fasta_dir, altered_genome, aname, tname)
     assert not added
Example #12
0
 def test_path_overrides(self, namespaces, genome, cfg_file):
     """A path override for the first local asset replaces its populated value."""
     rgc = RefGenConf(filepath=cfg_file)
     test_asset = rgc.list_assets_by_genome(genome=genome)[0]
     namespaces["pipeline"]["var_templates"]["refgenie_config"] = cfg_file
     # point the first configured override at the asset under test
     first_override = namespaces["project"]["refgenie"]["path_overrides"][0]
     first_override["registry_path"] = f"{genome}/{test_asset}"
     ret = looper_refgenie_populate(namespaces=namespaces)
     assert "refgenie" in ret
     populated_value = ret["refgenie"][genome][test_asset][test_asset]
     assert populated_value == "REPLACEMENT"
 def test_illegal_genomes_mapping_type_gets_converted_to_empty_mapping(
         self, genomes, tmpdir):
     """The genomes section built from the ``genomes`` fixture is an empty PathExAttMap."""
     entries = {
         CFG_FOLDER_KEY: tmpdir.strpath,
         CFG_GENOMES_KEY: genomes,
         CFG_SERVERS_KEY: DEFAULT_SERVER,
     }
     rgc = RefGenConf(entries=entries)
     res = rgc[CFG_GENOMES_KEY]
     assert isinstance(res, PathExAttMap)
     assert len(res) == 0
Example #14
0
 def test_nofile(self, cfg_file, gname, aname, tname):
     """A forced add with the seek key mapping ``{"file": "b"}`` returns a truthy result."""
     rgc = RefGenConf(filepath=cfg_file)
     fasta_dir = rgc.seek(
         genome_name=gname,
         asset_name="fasta",
         tag_name=tname,
         enclosing_dir=True,
     )
     add_options = dict(seek_keys={"file": "b"}, force=True)
     assert rgc.add(fasta_dir, gname, aname, tname, **add_options)
Example #15
0
    def test_listr(self, servers, reset, genome):
        """Remote listing has one entry per subscribed server, restricted to ``genome`` if given."""
        unbound_rgc = RefGenConf()
        unbound_rgc.subscribe(servers, no_write=True, reset=reset)
        remote_list = unbound_rgc.listr(genome=genome)

        assert len(remote_list) == len(unbound_rgc[CFG_SERVERS_KEY])
        if genome is None:
            assert isinstance(remote_list, dict)
            return
        # with a genome filter, each server must report exactly that genome
        assert all(
            len(assets_dict.keys()) == 1 and genome in assets_dict.keys()
            for _url, assets_dict in remote_list.items()
        )
Example #16
0
def test_list_remote(rgc, tmpdir):
    """ Remote listing, with the remote read mocked, reports the fixture's genomes. """
    entries = {
        CFG_FOLDER_KEY: tmpdir.strpath,
        CFG_SERVERS_KEY: DEFAULT_SERVER,
        CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY],
    }
    new_rgc = RefGenConf(entries=entries)
    new_rgc[CFG_SERVERS_KEY] = "http://staging.refgenomes.databio.org"
    print("NEW RGC KEYS: {}".format(list(new_rgc.keys())))
    # serve the fixture's genomes instead of contacting the server
    remote_patch = mock.patch("refgenconf.refgenconf._read_remote_data",
                              return_value=rgc.genomes)
    with remote_patch:
        genomes, _assets = new_rgc.list_remote()
    _assert_eq_as_sets(rgc.genomes_str(), genomes)
Example #17
0
def test_list_remote(rgc, tmpdir):
    """ Verify expected behavior of remote genome/asset listing. """
    new_rgc = RefGenConf(
        entries={
            CFG_FOLDER_KEY: tmpdir.strpath,
            CFG_SERVERS_KEY: [DEFAULT_SERVER],
            CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY]
        })
    result = new_rgc.listr()
    assert list(result.keys())[0].startswith(DEFAULT_SERVER)
    # The reference listing does not depend on the loop variable, so it is
    # downloaded once rather than once per listed server.
    expected_genomes = _download_json(DEFAULT_SERVER + "/genomes")
    for _server_url, asset_dict in result.items():
        assert isinstance(asset_dict, OrderedDict)
        assert len(asset_dict) == len(expected_genomes)
Example #18
0
 def test_populater(self, remote_class, servers, reset, genome, asset):
     """The remote seek result appears in the populated flat and nested demo dicts."""
     demo, nested_demo = get_demo_dicts(genome=genome,
                                        asset=asset,
                                        str_len=50)
     unbound_rgc = RefGenConf()
     unbound_rgc.subscribe(servers, no_write=True, reset=reset)
     # same check for both input shapes
     for glob in (demo, nested_demo):
         remote_path = unbound_rgc.seekr(genome_name=genome,
                                         asset_name=asset,
                                         remote_class=remote_class)
         populated = unbound_rgc.populater(glob=glob,
                                           remote_class=remote_class)
         assert remote_path in str(populated)
 def test_genome_folder_is_value_from_config_file_if_key_present(
         self, tmpdir_factory, tmpdir, made_genome_config_file):
     """The genome folder is read from the config file, not derived from the file's location."""
     conf_file = tmpdir_factory.mktemp("data2").join(
         "refgenie.yaml").strpath
     expected = tmpdir.strpath
     # Copy the prepared config, swapping in the expected folder value and
     # making sure a servers entry is present.
     with open(made_genome_config_file, 'r') as fin, \
             open(conf_file, 'w') as fout:
         has_servers = False
         for line in fin:
             if line.startswith(CFG_FOLDER_KEY):
                 fout.write("{}: {}\n".format(CFG_FOLDER_KEY, expected))
                 continue
             fout.write(line)
             if line.startswith(CFG_SERVERS_KEY):
                 has_servers = True
         if not has_servers:
             fout.write("{}: {}".format(CFG_SERVERS_KEY, DEFAULT_SERVER))
     rgc = RefGenConf(filepath=conf_file)
     assert expected != os.path.dirname(conf_file)
     assert expected == rgc[CFG_FOLDER_KEY]
Example #20
0
    def test_all_server_local_mix(self, cfg_file_old):
        """
        Test config upgrade from v0.3 to v0.4 when a mix of genomes in terms of
        remote digest availability is defined in the old config
        """
        old_rgc = _RefGenConfV03(cfg_file_old)
        # get some old asset data on disk
        g, a, t = "human_alu", "fasta", "default"
        try:
            # strict_exists=True is passed so a missing asset surfaces as an error
            pth = old_rgc.seek(g, "fasta", "default", strict_exists=True)
        except MissingGenomeError:
            # genome unknown to the old config: download the raw archive,
            # unpack it under /tmp/old and register it as an asset
            src_url = f"http://big.databio.org/refgenie_raw/files.{g}.{a}.{a}"
            target_archive = f"/tmp/old/{g}.fa.gz"
            target_file = f"/tmp/old/{g}.fa"
            target_dir = f"/tmp/old/{g}/{a}/{t}"
            os.makedirs(target_dir, exist_ok=True)
            urllib.request.urlretrieve(src_url, target_archive)
            from subprocess import run

            # NOTE(review): the exit status of this shell command is not
            # checked, so a failed gunzip/mv goes unnoticed here
            run(
                f"gunzip {target_archive}; "
                f"mv {target_file} {target_dir}",
                shell=True,
            )
            old_rgc.add(
                path=target_dir,
                genome=g,
                asset=a,
                tag="default",
                seek_keys={a: f"{g}.fa"},
                force=True,
            )
        else:
            print(f"{pth} exists")
        finally:
            # the upgrade runs on every path, including when the block above raised
            upgrade_config(filepath=cfg_file_old,
                           target_version="0.4",
                           force=True)
        # re-read the upgraded file and check the recorded config version
        rgc = RefGenConf(cfg_file_old)
        assert rgc[CFG_VERSION_KEY] == REQ_CFG_VERSION
Example #21
0
def ro_rgc(cfg_file):
    """Provide a genome config object bound to ``cfg_file`` with ``writable=False``."""
    read_only_conf = RefGenConf(filepath=cfg_file, writable=False)
    return read_only_conf
Example #22
0
def my_rgc(cfg_file):
    """Provide a genome config object bound to ``cfg_file`` with default settings."""
    conf = RefGenConf(filepath=cfg_file)
    return conf
Example #23
0
def rgc(made_genome_config_file):
    """Build a genome config instance from the entries stored in the prepared YAML file."""
    with open(made_genome_config_file, "r") as handle:
        entries = yaml.load(handle, yaml.SafeLoader)
    return RefGenConf(entries=entries)
Example #24
0
 def test_preexisting_asset_prompt(self, cfg_file, gname, aname, tname):
     """Re-adding an existing asset returns a falsy value when the prompt is answered 'no'."""
     rgc = RefGenConf(filepath=cfg_file)
     path = rgc.seek(genome_name=gname, asset_name=aname, tag_name=tname)
     decline_patch = mock.patch("refgenconf.refgenconf.query_yes_no",
                                return_value=False)
     with decline_patch:
         added = rgc.add(path, gname, aname, tname)
         assert not added
Example #25
0
 def test_nonexistent_file(self, cfg_file, pth, gname, aname, tname):
     """Adding an asset from the ``pth`` fixture is expected to raise OSError."""
     rgc = RefGenConf(filepath=cfg_file)
     expect_os_error = pytest.raises(OSError)
     with expect_os_error:
         rgc.add(pth, gname, aname, tname)
Example #26
0
def main():
    """
    Run the refgenie command-line workflow.

    Parses the CLI arguments, sets up logging, resolves the genome
    configuration file and then executes the requested subcommand.
    """
    global _LOGGER

    parser = logmuse.add_logging_options(build_argparser())
    args, _remaining = parser.parse_known_args()
    _LOGGER = logmuse.logger_via_cli(args)
    # a second logger is set up under the refgenconf package name
    logmuse.logger_via_cli(args, name=refgenconf.__name__)

    _LOGGER.debug("Args: {}".format(args))

    command = args.command
    if not command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    # the config file need not exist yet when it is about to be initialized
    is_init = command == INIT_CMD
    gencfg = yacman.select_config(args.genome_config,
                                  CFG_ENV_VARS,
                                  check_exist=not is_init,
                                  on_missing=lambda fp: fp)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    if is_init:
        _LOGGER.info("Initializing refgenie genome configuration")
        _writeable(os.path.dirname(gencfg), strict_exists=True)
        refgenie_init(gencfg, args.genome_server)
        sys.exit(0)

    rgc = RefGenConf(gencfg)

    if command == BUILD_CMD:
        refgenie_build(rgc, args)
        return

    if command == GET_ASSET_CMD:
        _LOGGER.debug("getting asset: '{}/{}'".format(args.genome, args.asset))
        asset_paths = [rgc.get_asset(args.genome, asset) for asset in args.asset]
        print(" ".join(asset_paths))
        return

    if command == INSERT_CMD:
        if len(args.asset) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        # recast from list to str
        args.asset = args.asset[0]
        refgenie_add(rgc, args)
        return

    if command == PULL_CMD:
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: "
                          "{}".format(target, outdir))
            return
        rgc.pull_asset(args.genome,
                       args.asset,
                       gencfg,
                       unpack=not args.no_untar)
        return

    if command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        list_remote = command == LIST_REMOTE_CMD
        pfx, genomes, assets = _exec_list(rgc, list_remote)
        _LOGGER.info("{} genomes: {}".format(pfx, genomes))
        _LOGGER.info("{} assets:\n{}".format(pfx, assets))
Example #27
0
def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
    """
    Runs the refgenie build recipe.

    For every requested asset the matching recipe is resolved, its required
    parent assets, files and parameters are collected, the recipe commands are
    run and, on success, the genome configuration file is updated. The
    function stops at the first asset that fails to build.

    :param str gencfg: path to the genome configuration file
    :param str genome: genome name (alias) the assets are built for
    :param list[dict] asset_list: assets to build; each item is read for its
        "asset" and "tag" keys
    :param str recipe_name: name of a built-in recipe or path to a JSON recipe
        file; when empty, the asset name is used as the recipe name
    :param argparse.Namespace args: parsed command-line options/arguments
    :raise ValueError: if a file or parameter required by the recipe was not
        provided
    """
    rgc = RefGenConf(
        filepath=gencfg,
        writable=False,
        skip_read_lock=_skip_lock(args.skip_read_lock, gencfg),
    )
    specified_args = _parse_user_build_input(args.files)
    specified_params = _parse_user_build_input(args.params)

    def _read_json_file(filepath):
        """
        Read a JSON file

        :param str filepath: path to the file to read
        :return dict: read data
        """
        with open(filepath, "r") as f:
            data = json.load(f)
        return data

    # a path to an existing .json file is replaced by the recipe it contains
    if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(
            ".json"):
        recipe_name = _read_json_file(filepath=recipe_name)

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.data_dir

    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually run the build commands in a given build package,
        and then update the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param str tag: tag of the asset being built
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        :param str genome_outfolder: folder the genome's assets are written to
        :param dict specific_args: user-provided input files
        :param dict specific_params: user-provided recipe parameters
        :param str alias: genome alias recorded when a fasta asset is built
        :param kwargs: paths of the required input assets, passed on to
            get_asset_vars
        :return bool | NoneType: True if the build succeeded, False if the
            build command failed, None if the output folder is not writable
        """

        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Collect the host paths to mount in the container; the output
            # folder is always included. Previously this branch called
            # ``.append`` on a not-yet-bound local, which raised at runtime.
            if args.volumes:
                user_volumes = ([args.volumes] if isinstance(
                    args.volumes, str) else list(args.volumes))
                volumes = user_volumes + [genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return

        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info("Asset digest: {}".format(digest))
            # add updates to config file
            with rgc as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True

    for a in asset_list:
        asset_key = a["asset"]
        asset_tag = a["tag"] or rgc.get_default_tag(
            genome, a["asset"], use_existing=False)
        recipe_name = recipe_name or asset_key

        if isinstance(
                recipe_name,
                dict) or (isinstance(recipe_name, str)
                          and recipe_name in asset_build_packages.keys()):
            if isinstance(recipe_name, dict):
                _LOGGER.info("Using custom recipe: \n{}".format(recipe_name))
                asset_build_package = _check_recipe(recipe_name)
                recipe_name = asset_build_package["name"]
            else:
                asset_build_package = _check_recipe(
                    asset_build_packages[recipe_name])
            # handle user-requested parents for the required assets
            input_assets = {}
            parent_assets = []
            specified_asset_keys, specified_assets = None, None
            if args.assets is not None:
                parsed_parents_input = _parse_user_build_input(args.assets)
                specified_asset_keys = list(parsed_parents_input.keys())
                specified_assets = list(parsed_parents_input.values())
                _LOGGER.debug(f"Custom assets requested: {args.assets}")
            if not specified_asset_keys and isinstance(args.assets, list):
                _LOGGER.warning(
                    "Specified parent assets format is invalid. Using defaults."
                )
            for req_asset in asset_build_package[REQ_ASSETS]:
                req_asset_data = parse_registry_path(req_asset[KEY])
                # for each req asset see if non-default parents were requested
                if (specified_asset_keys is not None
                        and req_asset_data["asset"] in specified_asset_keys):
                    parent_data = parse_registry_path(
                        specified_assets[specified_asset_keys.index(
                            req_asset_data["asset"])])
                    # NOTE(review): the default tag is looked up under
                    # ``genome`` rather than parent_data["genome"] -- confirm
                    # this is intended for parents from another genome
                    g, a, t, s = (
                        parent_data["genome"],
                        parent_data["asset"],
                        parent_data["tag"]
                        or rgc.get_default_tag(genome, parent_data["asset"]),
                        parent_data["seek_key"],
                    )
                else:  # if no custom parents requested for the req asset, use default one
                    default = parse_registry_path(req_asset[DEFAULT])
                    g, a, t, s = (
                        genome,
                        default["asset"],
                        rgc.get_default_tag(genome, default["asset"]),
                        req_asset_data["seek_key"],
                    )
                parent_assets.append("{}/{}:{}".format(
                    rgc.get_genome_alias_digest(g, fallback=True), a, t))
                input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s)
            _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets)))
            _LOGGER.debug("Provided files: {}".format(specified_args))
            _LOGGER.debug("Provided parameters: {}".format(specified_params))
            for required_file in asset_build_package[REQ_FILES]:
                if (specified_args is None
                        or required_file[KEY] not in specified_args.keys()):
                    raise ValueError(
                        "Path to the '{x}' input ({desc}) is required, but not provided. "
                        "Specify it with: --files {x}=/path/to/{x}_file".
                        format(x=required_file[KEY], desc=required_file[DESC]))
            for required_param in asset_build_package[REQ_PARAMS]:
                if specified_params is None:
                    specified_params = {}
                if required_param[KEY] not in specified_params.keys():
                    if required_param[DEFAULT] is None:
                        raise ValueError(
                            "Value for the parameter '{x}' ({desc}) is required, but not provided. "
                            "Specify it with: --params {x}=value".format(
                                x=required_param[KEY],
                                desc=required_param[DESC]))
                    else:
                        # fall back to the default declared by the recipe
                        specified_params.update(
                            {required_param[KEY]: required_param[DEFAULT]})
            _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(
                genome, asset_key, asset_tag, recipe_name))
            ori_genome = genome
            if recipe_name == "fasta":
                if (genome in rgc.genomes_list()
                        and "fasta" in rgc.list_assets_by_genome(genome)):
                    pretag = rgc.get_default_tag(genome, "fasta")
                    _LOGGER.warning(
                        "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})"
                        .format(g=genome, a=asset_key, t=pretag))
                    genome = rgc.get_genome_alias_digest(alias=genome,
                                                         fallback=True)
                else:
                    # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file
                    genome, _ = rgc.initialize_genome(
                        fasta_path=specified_args["fasta"],
                        alias=ori_genome,
                        skip_alias_write=True,
                    )
            else:
                try:
                    genome = rgc.get_genome_alias_digest(genome, fallback=True)
                except UndefinedAliasError:
                    _LOGGER.error("Genome '{}' has not been initialized yet; "
                                  "no key found for this alias".format(genome))
                    return
            # reset so the next asset in the list resolves its own recipe
            recipe_name = None
            genome_outfolder = os.path.join(args.outfolder, genome)
            if not _build_asset(
                    genome,
                    asset_key,
                    asset_tag,
                    asset_build_package,
                    genome_outfolder,
                    specified_args,
                    specified_params,
                    ori_genome,
                    **input_assets,
            ):
                log_path = os.path.abspath(
                    os.path.join(
                        genome_outfolder,
                        asset_key,
                        asset_tag,
                        BUILD_STATS_DIR,
                        ORI_LOG_NAME,
                    ))
                _LOGGER.info(
                    "'{}/{}:{}' was not added to the config, but directory has been left in place. "
                    "See the log file for details: {}".format(
                        genome, asset_key, asset_tag, log_path))
                return
            _LOGGER.info("Finished building '{}' asset".format(asset_key))
            with rgc as r:
                # update asset relationships
                r.update_relatives_assets(genome, asset_key, asset_tag,
                                          parent_assets)  # adds parents
                for i in parent_assets:
                    parsed_parent = parse_registry_path(i)
                    # adds child (currently built asset) to the parent
                    r.update_relatives_assets(
                        parsed_parent["genome"],
                        parsed_parent["asset"],
                        parsed_parent["tag"],
                        ["{}/{}:{}".format(genome, asset_key, asset_tag)],
                        True,
                    )
                if args.genome_description is not None:
                    _LOGGER.debug(
                        "adding genome ({}) description: '{}'".format(
                            genome, args.genome_description))
                    r.update_genomes(
                        genome, {CFG_GENOME_DESC_KEY: args.genome_description})
                if args.tag_description is not None:
                    _LOGGER.debug(
                        "adding tag ({}/{}:{}) description: '{}'".format(
                            genome, asset_key, asset_tag,
                            args.tag_description))
                    r.update_tags(
                        genome,
                        asset_key,
                        asset_tag,
                        {CFG_TAG_DESC_KEY: args.tag_description},
                    )
            rgc._symlink_alias(genome, asset_key, asset_tag)
        else:
            _raise_missing_recipe_error(recipe_name)
Example #28
0
def main():
    """
    Primary workflow: parse the command line and dispatch the subcommand.

    Sets up the module-level logger, determines the genome configuration
    file, turns any asset registry paths given by the user into a list of
    asset dicts (each with genome, asset, tag and seek_key entries) and then
    runs the branch that matches ``args.command``.

    :raise MissingGenomeConfigError: if no genome configuration file could
        be determined
    :raise MissingFolderError: if the genome folder recorded in the config
        does not exist when pulling assets
    :raise NotImplementedError: if more than one asset is passed to the
        insert or tag command, or a seek_key is specified for removal
    """
    global _LOGGER
    parser = logmuse.add_logging_options(build_argparser())
    args, _ = parser.parse_known_args()
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug("refgenie {}".format(__version__))
    _LOGGER.debug("Args: {}".format(args))

    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)

    # the config file is allowed to be missing only when it is being created
    gencfg = refgenconf.select_genome_config(
        filename=args.genome_config,
        check_exist=not args.command == INIT_CMD,
        on_missing=lambda fp: fp,
        strict_env=True)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))

    # From user input we want to construct a list of asset dicts, where each
    # asset has a genome name, asset name, and tag

    if "asset_registry_paths" in args and args.asset_registry_paths:
        _LOGGER.debug("Found registry_path: {}".format(
            args.asset_registry_paths))
        asset_list = [
            parse_registry_path(x) for x in args.asset_registry_paths
        ]

        for a in asset_list:
            # every asset must have a genome, either provided via registry path
            # or the args.genome arg.
            if not a["genome"]:
                if args.genome:
                    a["genome"] = args.genome
                else:
                    _LOGGER.error(
                        "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference."
                        .format(a["genome"], a["asset"], a["tag"]))
                    sys.exit(1)
            else:
                if args.genome and args.genome != a["genome"]:
                    # the genome from the registry path is kept
                    _LOGGER.warning(
                        "Two different genomes specified for asset '{}'.".
                        format(a["asset"]))

    else:
        # parser.error() normally terminates the process on its own; the
        # explicit exits are kept as a safeguard
        if args.command in GENOME_ONLY_REQUIRED and not args.genome:
            parser.error("You must provide either a genome or a registry path")
            sys.exit(1)
        if args.command in ASSET_REQUIRED:
            parser.error("You must provide an asset registry path")
            sys.exit(1)

    if args.command == INIT_CMD:
        _LOGGER.debug("Initializing refgenie genome configuration")
        rgc = RefGenConf(entries=OrderedDict(
            {
                CFG_VERSION_KEY: REQ_CFG_VERSION,
                CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
                CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
                CFG_GENOMES_KEY: None
            }))
        rgc.initialize_config_file(os.path.abspath(gencfg))

    elif args.command == BUILD_CMD:
        if not all(
                x["genome"] == asset_list[0]["genome"] for x in asset_list):
            _LOGGER.error("Build can only build assets for one genome")
            sys.exit(1)
        recipe_name = None
        if args.recipe:
            if len(asset_list) > 1:
                _LOGGER.error(
                    "Recipes cannot be specified for multi-asset builds")
                sys.exit(1)
            recipe_name = args.recipe
        if args.requirements:
            # only report what each recipe needs, do not build anything
            for a in asset_list:
                recipe = recipe_name or a["asset"]
                if recipe not in asset_build_packages.keys():
                    _raise_missing_recipe_error(recipe)
                _LOGGER.info("'{}' recipe requirements: ".format(recipe))
                _make_asset_build_reqs(recipe)
            sys.exit(0)
        refgenie_build(gencfg, asset_list[0]["genome"], asset_list,
                       recipe_name, args)

    elif args.command == GET_ASSET_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        check = args.check_exists or None
        for a in asset_list:
            _LOGGER.debug("getting asset: '{}/{}.{}:{}'".format(
                a["genome"], a["asset"], a["seek_key"], a["tag"]))
            print(
                rgc.seek(a["genome"],
                         a["asset"],
                         a["tag"],
                         a["seek_key"],
                         strict_exists=check))
        return

    elif args.command == INSERT_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            refgenie_add(rgc, asset_list[0], args.path, args.force)

    elif args.command == PULL_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        force = True if args.force else None
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: {}".format(
                target, outdir))
            return

        for a in asset_list:
            rgc.pull(a["genome"],
                     a["asset"],
                     a["tag"],
                     unpack=not args.no_untar,
                     force=force)

    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if args.command == LIST_REMOTE_CMD:
            num_servers = 0
            # Keep all servers so that child updates maintain server list
            server_list = rgc[CFG_SERVERS_KEY]
            bad_servers = []
            for server_url in rgc[CFG_SERVERS_KEY]:
                num_servers += 1
                try:
                    # query one server at a time by temporarily narrowing
                    # the configured servers to the current one
                    rgc[CFG_SERVERS_KEY] = server_url
                    pfx, genomes, assets, recipes = _exec_list(
                        rgc, args.command == LIST_REMOTE_CMD, args.genome)
                    if assets is None and genomes is None:
                        continue
                    _LOGGER.info("{} genomes: {}".format(pfx, genomes))
                    if args.command != LIST_REMOTE_CMD:  # Not implemented yet
                        _LOGGER.info("{} recipes: {}".format(pfx, recipes))
                    _LOGGER.info("{} assets:\n{}\n".format(pfx, assets))
                except (DownloadJsonError, ConnectionError):
                    bad_servers.append(server_url)
                    continue
            if num_servers >= len(server_list) and bad_servers:
                _LOGGER.error(
                    "Could not list assets from the following server(s): {}".
                    format(bad_servers))
            # Restore original server list, even when we couldn't find assets on a server
            rgc[CFG_SERVERS_KEY] = server_list
        else:  # Only check local assets once
            _LOGGER.info("Server subscriptions: {}".format(", ".join(
                rgc[CFG_SERVERS_KEY])))
            pfx, genomes, assets, recipes = _exec_list(
                rgc, args.command == LIST_REMOTE_CMD, args.genome)
            _LOGGER.info("{} genomes: {}".format(pfx, genomes))
            if args.command != LIST_REMOTE_CMD:  # Not implemented yet
                _LOGGER.info("{} recipes: {}".format(pfx, recipes))
            _LOGGER.info("{} assets:\n{}".format(pfx, assets))

    elif args.command == GETSEQ_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        # NOTE(review): rgc is passed both as the bound instance and as the
        # first positional argument -- confirm against the getseq signature
        rgc.getseq(rgc, args.genome, args.locus)

    elif args.command == REMOVE_CMD:
        force = args.force
        rgc = RefGenConf(filepath=gencfg)
        for a in asset_list:
            a["tag"] = a["tag"] or rgc.get_default_tag(
                a["genome"], a["asset"], use_existing=False)
            _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
            if a["seek_key"] is not None:
                raise NotImplementedError(
                    "You can't remove a specific seek_key.")
            bundle = [a["genome"], a["asset"], a["tag"]]
            try:
                # incomplete assets are only dropped from the config
                if not rgc.is_asset_complete(*bundle):
                    with rgc as r:
                        r.cfg_remove_assets(*bundle)
                    _LOGGER.info(
                        "Removed an incomplete asset '{}/{}:{}'".format(
                            *bundle))
                    return
            except (KeyError, MissingAssetError, MissingGenomeError):
                _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle))
                return
        if len(asset_list) > 1:
            # confirm once for the whole batch, then force each removal
            if not query_yes_no(
                    "Are you sure you want to remove {} assets?".format(
                        len(asset_list))):
                _LOGGER.info("Action aborted by the user")
                return
            force = True
        for a in asset_list:
            rgc.remove(genome=a["genome"],
                       asset=a["asset"],
                       tag=a["tag"],
                       force=force)

    elif args.command == TAG_CMD:
        rgc = RefGenConf(filepath=gencfg)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only tag 1 asset at a time")
        # refer to the single asset explicitly rather than relying on the
        # variable left over from the registry path loop above
        asset = asset_list[0]
        if args.default:
            # set the default tag and exit
            with rgc as r:
                r.set_default_pointer(asset["genome"], asset["asset"],
                                      asset["tag"], True)
            sys.exit(0)
        rgc.tag(asset["genome"], asset["asset"], asset["tag"], args.tag)

    elif args.command == ID_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) == 1:
            # a single asset prints just the identifier
            g, a = asset_list[0]["genome"], asset_list[0]["asset"]
            t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
            print(rgc.id(g, a, t))
            return
        # multiple assets are printed as 'genome/asset:tag,identifier' lines
        for asset in asset_list:
            g, a = asset["genome"], asset["asset"]
            t = asset["tag"] or rgc.get_default_tag(g, a)
            print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
        return
    elif args.command == SUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.subscribe(urls=args.genome_server, reset=args.reset)
        return
    elif args.command == UNSUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.unsubscribe(urls=args.genome_server)
        return
Example #29
0
def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
    """
    Runs the refgenie build recipe.

    Every asset in ``asset_list`` is built in turn. Processing stops at the
    first asset whose build fails.

    :param str gencfg: path to the genome configuration file
    :param str genome: name of the genome to build the assets for
    :param list[dict] asset_list: assets to build; the "asset" and "tag"
        entries of each dict are used
    :param str recipe_name: name of the recipe to use; when not provided,
        each asset is built with the recipe named like the asset
    :param argparse.Namespace args: parsed command-line options/arguments
    :raise ValueError: if a file or a parameter required by the recipe was
        not provided and has no default
    """
    rgc = RefGenConf(filepath=gencfg, writable=False)
    specified_args = _parse_user_build_input(args.files)
    specified_params = _parse_user_build_input(args.params)

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc[CFG_FOLDER_KEY]

    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder,
                    specific_args, specific_params, **kwargs):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build
        package, and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param str tag: tag of the asset being built
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        :param str genome_outfolder: folder the asset is built in
        :param dict specific_args: files provided by the user
        :param dict specific_params: parameters provided by the user
        :param kwargs: input assets, forwarded to get_asset_vars
        :return bool: whether the asset was built and registered
        """

        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # extend the user-requested volumes with the output folder;
                # a new list is created so that args.volumes is not mutated
                user_volumes = [args.volumes] \
                    if isinstance(args.volumes, str) else list(args.volumes)
                volumes = user_volumes + [genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return False

        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        gat = [genome, asset_key,
               tag]  # create a bundle list to simplify calls below
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(genome, asset_key, tag, genome_outfolder,
                                    specific_args, specific_params, **kwargs)
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      'w') as outfile:
                json.dump(build_pkg, outfile)
            # update and write refgenie genome configuration
            with rgc as r:
                r.update_assets(*gat[0:2],
                                data={CFG_ASSET_DESC_KEY: build_pkg[DESC]})
                r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key})
                r.update_seek_keys(*gat,
                                   keys={
                                       k: v.format(**asset_vars)
                                       for k, v in build_pkg[ASSETS].items()
                                   })
                # in order to conveniently get the path to digest we update the tags metadata in two steps
                digest = get_dir_digest(
                    r.get_asset(genome, asset_key, tag, enclosing_dir=True),
                    pm)
                r.update_tags(*gat, data={CFG_ASSET_CHECKSUM_KEY: digest})
                _LOGGER.info("Asset digest: {}".format(digest))
                r.set_default_pointer(*gat)
        pm.stop_pipeline()
        return True

    for asset in asset_list:
        asset_key = asset["asset"]
        asset_tag = asset["tag"] or rgc.get_default_tag(
            genome, asset["asset"], use_existing=False)
        # resolve the recipe per asset; overwriting recipe_name here would
        # make every following asset reuse the first asset's recipe
        asset_recipe = recipe_name or asset_key

        if asset_recipe in asset_build_packages.keys():
            asset_build_package = _check_recipe(
                asset_build_packages[asset_recipe])
            # handle user-requested parents for the required assets
            input_assets = {}
            parent_assets = []
            specified_asset_keys, specified_assets = None, None
            if args.assets is not None:
                parsed_parents_input = _parse_user_build_input(args.assets)
                specified_asset_keys, specified_assets = \
                    list(parsed_parents_input.keys()), list(parsed_parents_input.values())
                _LOGGER.debug("Custom assets requested: {}".format(
                    args.assets))
            if not specified_asset_keys and isinstance(args.assets, list):
                _LOGGER.warning(
                    "Specified parent assets format is invalid. Using defaults."
                )
            for req_asset in asset_build_package[REQ_ASSETS]:
                req_asset_data = parse_registry_path(req_asset[KEY])
                # for each req asset see if non-default parents were requested
                if specified_asset_keys is not None and req_asset_data[
                        "asset"] in specified_asset_keys:
                    parent_data = \
                        parse_registry_path(specified_assets[specified_asset_keys.index(req_asset_data["asset"])])
                    p_genome = parent_data["genome"]
                    p_asset = parent_data["asset"]
                    p_tag = parent_data["tag"] or rgc.get_default_tag(
                        genome, parent_data["asset"])
                    p_seek_key = parent_data["seek_key"]
                else:  # if no custom parents requested for the req asset, use default one
                    default = parse_registry_path(req_asset[DEFAULT])
                    p_genome = genome
                    p_asset = default["asset"]
                    p_tag = rgc.get_default_tag(genome, default["asset"])
                    p_seek_key = req_asset_data["seek_key"]
                parent_assets.append("{}/{}:{}".format(
                    p_genome, p_asset, p_tag))
                input_assets[req_asset[KEY]] = _seek(
                    rgc, p_genome, p_asset, p_tag, p_seek_key)
            _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets)))
            _LOGGER.debug("Provided files: {}".format(specified_args))
            _LOGGER.debug("Provided parameters: {}".format(specified_params))
            for required_file in asset_build_package[REQ_FILES]:
                if specified_args is None or required_file[
                        KEY] not in specified_args.keys():
                    raise ValueError(
                        "Path to the '{x}' input ({desc}) is required, but not provided. "
                        "Specify it with: --files {x}=/path/to/{x}_file".
                        format(x=required_file[KEY], desc=required_file[DESC]))
            for required_param in asset_build_package[REQ_PARAMS]:
                if specified_params is None:
                    specified_params = {}
                if required_param[KEY] not in specified_params.keys():
                    if required_param[DEFAULT] is None:
                        raise ValueError(
                            "Value for the parameter '{x}' ({desc}) is required, but not provided. "
                            "Specify it with: --params {x}=value".format(
                                x=required_param[KEY],
                                desc=required_param[DESC]))
                    else:
                        # fall back to the default declared by the recipe
                        specified_params.update(
                            {required_param[KEY]: required_param[DEFAULT]})
            genome_outfolder = os.path.join(args.outfolder, genome)
            _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(
                genome, asset_key, asset_tag, asset_recipe))
            if asset_recipe == 'fasta' and genome in rgc.genomes_list() \
                    and 'fasta' in rgc.list_assets_by_genome(genome):
                _LOGGER.warning(
                    "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). "
                    "It will be re-initialized.".format(g=genome,
                                                        a=asset_key,
                                                        t=asset_tag))
            if not build_asset(genome, asset_key, asset_tag,
                               asset_build_package, genome_outfolder,
                               specified_args, specified_params,
                               **input_assets):
                log_path = os.path.abspath(
                    os.path.join(genome_outfolder, asset_key, asset_tag,
                                 BUILD_STATS_DIR, ORI_LOG_NAME))
                _LOGGER.info(
                    "'{}/{}:{}' was not added to the config, but directory has been left in place. "
                    "See the log file for details: {}".format(
                        genome, asset_key, asset_tag, log_path))
                return
            # If the recipe was a fasta, we init the genome
            if asset_recipe == 'fasta':
                _LOGGER.info("Computing initial genome digest...")
                collection_checksum, content_checksums = \
                    fasta_checksum(_seek(rgc, genome, asset_key, asset_tag, "fasta"))
                _LOGGER.info("Initializing genome...")
                refgenie_initg(rgc, genome, content_checksums)
            _LOGGER.info("Finished building '{}' asset".format(asset_key))
            with rgc as r:
                # update asset relationships
                r.update_relatives_assets(genome, asset_key, asset_tag,
                                          parent_assets)  # adds parents
                for i in parent_assets:
                    parsed_parent = parse_registry_path(i)
                    # adds child (currently built asset) to the parent
                    r.update_relatives_assets(
                        parsed_parent["genome"], parsed_parent["asset"],
                        parsed_parent["tag"],
                        ["{}/{}:{}".format(genome, asset_key, asset_tag)],
                        True)
                if args.genome_description is not None:
                    _LOGGER.debug(
                        "adding genome ({}) description: '{}'".format(
                            genome, args.genome_description))
                    r.update_genomes(
                        genome, {CFG_GENOME_DESC_KEY: args.genome_description})
                if args.tag_description is not None:
                    _LOGGER.debug(
                        "adding tag ({}/{}:{}) description: '{}'".format(
                            genome, asset_key, asset_tag,
                            args.tag_description))
                    r.update_tags(genome, asset_key, asset_tag,
                                  {CFG_TAG_DESC_KEY: args.tag_description})
                if asset_recipe == "fasta":
                    # to save config lock time when building fasta assets
                    # (genome initialization takes some time for large genomes) we repeat the
                    # conditional here for writing the computed genome digest
                    r.update_genomes(
                        genome, data={CFG_CHECKSUM_KEY: collection_checksum})
        else:
            _raise_missing_recipe_error(asset_recipe)
Example #30
0
 def test_invalid_path(self, pth):
     """Initializing a config file with the given path must raise TypeError."""
     conf = RefGenConf()
     with pytest.raises(TypeError):
         conf.initialize_config_file(filepath=pth)