def refgenie_init(genome_config_path, genome_server=DEFAULT_SERVER):
    """
    Initialize a genome config file.

    :param str genome_config_path: path to genome configuration file to
        create/initialize
    :param str genome_server: URL for a server
    """
    cfg_dir = os.path.dirname(os.path.abspath(genome_config_path))
    # Default configuration skeleton: folder, server, empty genomes section.
    rgc = RefGenConf(
        OrderedDict({
            CFG_FOLDER_KEY: cfg_dir,
            CFG_SERVER_KEY: genome_server,
            CFG_GENOMES_KEY: None,
        }))
    _LOGGER.debug("RGC: {}".format(rgc))
    if not genome_config_path or os.path.exists(genome_config_path):
        # Refuse to clobber an existing config file.
        _LOGGER.warning(
            "Can't initialize, file exists: {} ".format(genome_config_path))
    else:
        rgc.write(genome_config_path)
        _LOGGER.info("Wrote new refgenie genome configuration file: {}".format(
            genome_config_path))
def test_genome_folder_is_pwd_if_no_folder_key_and_raw_entries_passed(
        self, ro_rgc):
    """Without a folder key in the entries, genome_folder falls back to cwd."""
    entries = PathExAttMap(
        {key: val for key, val in ro_rgc.items() if key != CFG_FOLDER_KEY})
    rebuilt = RefGenConf(entries=entries)
    assert rebuilt[CFG_FOLDER_KEY] == os.getcwd()
def main():
    """
    Entry point for the refgenieserver CLI: parse arguments, configure
    logging, load the (read-only) genome configuration, then either archive
    assets or start the HTTP server.
    """
    global rgc, _LOGGER
    parser = build_parser()
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        print("No subcommand given")
        sys.exit(1)
    # Debug mode raises logging verbosity to the most detailed level (5).
    logger_args = (dict(name=PKG_NAME, fmt=LOG_FORMAT, level=5)
                   if args.debug else dict(name=PKG_NAME, fmt=LOG_FORMAT))
    _LOGGER = logmuse.setup_logger(**logger_args)
    selected_cfg = select_genome_config(args.config)
    # Bug fix: this was an `assert`, which is silently stripped when Python
    # runs with -O; validate user input explicitly instead.
    if selected_cfg is None:
        _LOGGER.error(
            "You must provide a config file or set the {} environment variable".format(
                "or ".join(CFG_ENV_VARS)))
        sys.exit(1)
    # this RefGenConf object will be used in the server, so it's read-only
    rgc = RefGenConf(filepath=selected_cfg, writable=False)
    if args.command == "archive":
        arp = ([parse_registry_path(x) for x in args.asset_registry_paths]
               if args.asset_registry_paths is not None else None)
        archive(rgc, arp, args.force, args.remove, selected_cfg,
                args.genomes_desc)
    elif args.command == "serve":
        # the router imports need to be after the RefGenConf object is declared
        with rgc as r:
            purge_nonservable(r)
        from .routers import private, version1, version2, version3
        # Unprefixed routes serve the latest API version (v3).
        app.include_router(version3.router)
        app.include_router(version1.router, prefix="/v1")
        app.include_router(version2.router, prefix="/v2")
        app.include_router(version3.router, prefix="/v3")
        app.include_router(private.router, prefix=f"/{PRIVATE_API}")
        uvicorn.run(app, host="0.0.0.0", port=args.port)
def test_init_success(self):
    """Config file initialization creates the file at the requested path."""
    rgc = RefGenConf()
    # Fix: the original passed prefix="/tmp/" to place the directory under
    # /tmp, abusing the name-prefix parameter; `dir` is the right knob.
    dirpath = tempfile.mkdtemp(dir="/tmp")
    # Fix: cleanup now runs in `finally` so a failing assertion does not
    # leak the temporary directory.
    try:
        cfg_file_path = os.path.join(dirpath, "test.yaml")
        rgc.initialize_config_file(filepath=cfg_file_path)
        assert os.path.exists(cfg_file_path)
    finally:
        shutil.rmtree(dirpath)
def test_force_overwrite_asset(self, cfg_file, gname, aname, tname):
    """Adding with force=True succeeds even when the asset already exists."""
    rgc = RefGenConf(filepath=cfg_file)
    fasta_dir = rgc.seek(genome_name=gname,
                         asset_name="fasta",
                         tag_name=tname,
                         enclosing_dir=True)
    # The second add overwrites the asset registered by the first one.
    for _ in range(2):
        assert rgc.add(fasta_dir, gname, aname, tname, force=True)
def test_pull_asset_works_with_nonwritable_and_writable_rgc(
        cfg_file, gname, aname, tname, state):
    """A pull succeeds for both writable and read-only config objects."""
    conf = RefGenConf(filepath=cfg_file, writable=state)
    # Start from a clean slate so the pull actually has work to do.
    remove_asset_and_file(conf, gname, aname, tname)
    confirm_prompt = mock.patch("refgenconf.refgenconf.query_yes_no",
                                return_value=True)
    with confirm_prompt:
        print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(
            gname, aname, tname))
        conf.pull_asset(gname, aname, tname)
def test_pull_asset_updates_genome_config(cfg_file, gname, aname, tname):
    """
    Test that the object that was identical prior to the asset pull differs
    afterwards and the pulled asset metadata has been written to the config
    file
    """
    ori_rgc = RefGenConf(filepath=cfg_file, writable=False)
    rgc = RefGenConf(filepath=cfg_file, writable=False)
    # Strip the asset from both copies so they start out identical.
    for conf in (rgc, ori_rgc):
        remove_asset_and_file(conf, gname, aname, tname)
    assert ori_rgc.to_dict() == rgc.to_dict()
    confirm_prompt = mock.patch("refgenconf.refgenconf.query_yes_no",
                                return_value=True)
    with confirm_prompt:
        print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(
            gname, aname, tname))
        rgc.pull(gname, aname, tname)
    # Only the pulled-into object should have changed.
    assert rgc.to_dict() != ori_rgc.to_dict()
    # A fresh read of the config file must now expose the pulled asset.
    post_rgc = RefGenConf(filepath=cfg_file, writable=False)
    assert isinstance(post_rgc.seek(gname, aname, tname), str)
def test_prelist_plugins_called(self, cfg_file):
    """Listing assets triggers the registered pre-list plugin hooks."""
    patched_plugins = mock.patch("refgenconf.refgenconf.RefGenConf.plugins",
                                 new_callable=mock.PropertyMock)
    with patched_plugins as mock_plugins:
        mock_plugins.return_value = PLUGINS_DICT
        rgc = RefGenConf(cfg_file, writable=False)
        rgc.list()
        # The plugin drops a flag file; verify it exists, then clean it up.
        flag = get_flag_pth(rgc)
        assert flag
        os.remove(flag)
        assert not os.path.exists(flag)
def test_correct_namespaces(self, namespaces, genome, cfg_file):
    """Populated namespaces expose every asset known for the genome."""
    namespaces["pipeline"]["var_templates"]["refgenie_config"] = cfg_file
    populated = looper_refgenie_populate(namespaces=namespaces)
    assert "refgenie" in populated
    rgc = RefGenConf(filepath=cfg_file)
    genome_section = populated["refgenie"][genome].keys()
    # Every asset refgenie knows about must appear in the populated output.
    for asset in rgc.list_assets_by_genome(genome=genome):
        assert asset in genome_section
def test_seekr(self, remote_class, servers, reset, genome, asset):
    """Remote seek returns a string path for a subscribed, unbound config."""
    unbound_rgc = RefGenConf()
    unbound_rgc.subscribe(servers, no_write=True, reset=reset)
    # Bug fix: the original called isinstance() as a bare expression and
    # discarded the result, so the test could never fail on a wrong type.
    assert isinstance(
        unbound_rgc.seekr(genome_name=genome,
                          asset_name=asset,
                          remote_class=remote_class),
        str,
    )
def test_cant_add_without_digest_set_first(self, cfg_file, gname, aname,
                                           tname):
    """Adding under an unknown genome alias (no digest yet) must fail."""
    rgc = RefGenConf(filepath=cfg_file)
    fasta_dir = rgc.seek(genome_name=gname,
                         asset_name="fasta",
                         tag_name=tname,
                         enclosing_dir=True)
    # This alias has never been initialized, so no digest exists for it.
    unknown_genome = gname + "_new"
    assert not rgc.add(fasta_dir, unknown_genome, aname, tname)
def test_path_overrides(self, namespaces, genome, cfg_file):
    """A configured path override replaces the asset path in the output."""
    rgc = RefGenConf(filepath=cfg_file)
    test_asset = rgc.list_assets_by_genome(genome=genome)[0]
    namespaces["pipeline"]["var_templates"]["refgenie_config"] = cfg_file
    # Target the first override rule at the asset chosen above.
    override_rule = namespaces["project"]["refgenie"]["path_overrides"][0]
    override_rule["registry_path"] = f"{genome}/{test_asset}"
    populated = looper_refgenie_populate(namespaces=namespaces)
    assert "refgenie" in populated
    assert populated["refgenie"][genome][test_asset][test_asset] == "REPLACEMENT"
def test_illegal_genomes_mapping_type_gets_converted_to_empty_mapping(
        self, genomes, tmpdir):
    """A non-mapping genomes entry is coerced to an empty PathExAttMap."""
    entries = {
        CFG_FOLDER_KEY: tmpdir.strpath,
        CFG_GENOMES_KEY: genomes,
        CFG_SERVERS_KEY: DEFAULT_SERVER,
    }
    converted = RefGenConf(entries=entries)[CFG_GENOMES_KEY]
    assert isinstance(converted, PathExAttMap)
    assert len(converted) == 0
def test_nofile(self, cfg_file, gname, aname, tname):
    """Adding succeeds even when a seek key points at a nonexistent file."""
    rgc = RefGenConf(filepath=cfg_file)
    fasta_dir = rgc.seek(genome_name=gname,
                         asset_name="fasta",
                         tag_name=tname,
                         enclosing_dir=True)
    added = rgc.add(fasta_dir,
                    gname,
                    aname,
                    tname,
                    seek_keys={"file": "b"},
                    force=True)
    assert added
def test_listr(self, servers, reset, genome):
    """Remote listing covers every subscribed server."""
    unbound_rgc = RefGenConf()
    unbound_rgc.subscribe(servers, no_write=True, reset=reset)
    remote_list = unbound_rgc.listr(genome=genome)
    assert len(unbound_rgc[CFG_SERVERS_KEY]) == len(remote_list)
    if genome is None:
        assert isinstance(remote_list, dict)
    else:
        # Filtering by genome: each server reports exactly that one genome.
        for assets_dict in remote_list.values():
            genome_keys = assets_dict.keys()
            assert len(genome_keys) == 1 and genome in genome_keys
def test_list_remote(rgc, tmpdir):
    """ Verify expected behavior of remote genome/asset listing. """
    new_rgc = RefGenConf(
        entries={
            CFG_FOLDER_KEY: tmpdir.strpath,
            CFG_SERVERS_KEY: DEFAULT_SERVER,
            CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY],
        })
    # Point the fresh config at the staging server for the remote query.
    new_rgc[CFG_SERVERS_KEY] = "http://staging.refgenomes.databio.org"
    print("NEW RGC KEYS: {}".format(list(new_rgc.keys())))
    patched_read = mock.patch("refgenconf.refgenconf._read_remote_data",
                              return_value=rgc.genomes)
    with patched_read:
        genomes, assets = new_rgc.list_remote()
        _assert_eq_as_sets(rgc.genomes_str(), genomes)
def test_list_remote(rgc, tmpdir):
    """ Verify expected behavior of remote genome/asset listing. """
    new_rgc = RefGenConf(
        entries={
            CFG_FOLDER_KEY: tmpdir.strpath,
            CFG_SERVERS_KEY: [DEFAULT_SERVER],
            CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY],
        })
    listing = new_rgc.listr()
    # Result is keyed by server API URL, rooted at the subscribed server.
    first_server = next(iter(listing))
    assert first_server.startswith(DEFAULT_SERVER)
    for _, served_assets in listing.items():
        assert isinstance(served_assets, OrderedDict)
        # Every server entry mirrors the server's full genome inventory.
        assert len(served_assets) == len(
            _download_json(DEFAULT_SERVER + "/genomes"))
def test_populater(self, remote_class, servers, reset, genome, asset):
    """populater() substitutes remote asset paths into demo mappings."""
    demo, nested_demo = get_demo_dicts(genome=genome, asset=asset, str_len=50)
    unbound_rgc = RefGenConf()
    unbound_rgc.subscribe(servers, no_write=True, reset=reset)
    # Both the flat and the nested mapping must contain the remote asset
    # path after population.
    for glob in (demo, nested_demo):
        expected = unbound_rgc.seekr(genome_name=genome,
                                     asset_name=asset,
                                     remote_class=remote_class)
        populated = unbound_rgc.populater(glob=glob,
                                          remote_class=remote_class)
        assert expected in str(populated)
def test_genome_folder_is_value_from_config_file_if_key_present(
        self, tmpdir_factory, tmpdir, made_genome_config_file):
    """genome_folder from the config file wins over the config's location."""
    conf_file = tmpdir_factory.mktemp("data2").join("refgenie.yaml").strpath
    expected = tmpdir.strpath
    with open(made_genome_config_file, 'r') as fin, \
            open(conf_file, 'w') as fout:
        servers_key_seen = False
        for line in fin:
            if line.startswith(CFG_FOLDER_KEY):
                # Rewrite the folder entry to point at the expected path.
                fout.write("{}: {}\n".format(CFG_FOLDER_KEY, expected))
            else:
                fout.write(line)
            if line.startswith(CFG_SERVERS_KEY):
                servers_key_seen = True
        if not servers_key_seen:
            # Ensure the copied config still has a servers entry.
            fout.write("{}: {}".format(CFG_SERVERS_KEY, DEFAULT_SERVER))
    rgc = RefGenConf(filepath=conf_file)
    assert expected != os.path.dirname(conf_file)
    assert expected == rgc[CFG_FOLDER_KEY]
def test_all_server_local_mix(self, cfg_file_old):
    """
    Test config upgrade from v0.3 to v0.4 when a mix of genomes, in terms of
    remote digest availability, is defined in the old config
    """
    old_rgc = _RefGenConfV03(cfg_file_old)
    # get some old asset data on disk
    g, a, t = "human_alu", "fasta", "default"
    try:
        # strict_exists=True raises if the asset is not present on disk.
        pth = old_rgc.seek(g, "fasta", "default", strict_exists=True)
    except MissingGenomeError:
        # Asset is absent locally: download and register it so the upgrade
        # has real on-disk data to migrate.
        src_url = f"http://big.databio.org/refgenie_raw/files.{g}.{a}.{a}"
        target_archive = f"/tmp/old/{g}.fa.gz"
        target_file = f"/tmp/old/{g}.fa"
        target_dir = f"/tmp/old/{g}/{a}/{t}"
        os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(src_url, target_archive)
        from subprocess import run
        # NOTE(review): shell=True with interpolated paths is acceptable here
        # only because the paths are fixed literals; not a pattern to copy.
        run(
            f"gunzip {target_archive}; "
            f"mv {target_file} {target_dir}",
            shell=True,
        )
        old_rgc.add(
            path=target_dir,
            genome=g,
            asset=a,
            tag="default",
            seek_keys={a: f"{g}.fa"},
            force=True,
        )
    else:
        print(f"{pth} exists")
    finally:
        # The upgrade runs regardless of whether the asset had to be fetched.
        upgrade_config(filepath=cfg_file_old, target_version="0.4", force=True)
    rgc = RefGenConf(cfg_file_old)
    assert rgc[CFG_VERSION_KEY] == REQ_CFG_VERSION
def ro_rgc(cfg_file):
    """Read-only RefGenConf built from the test config file."""
    return RefGenConf(writable=False, filepath=cfg_file)
def my_rgc(cfg_file):
    """Writable RefGenConf built from the test config file."""
    conf = RefGenConf(filepath=cfg_file)
    return conf
def rgc(made_genome_config_file):
    """Provide test case with a genome config instance."""
    with open(made_genome_config_file, "r") as conf_stream:
        entries = yaml.load(conf_stream, yaml.SafeLoader)
    return RefGenConf(entries=entries)
def test_preexisting_asset_prompt(self, cfg_file, gname, aname, tname):
    """Declining the overwrite prompt aborts adding an existing asset."""
    rgc = RefGenConf(filepath=cfg_file)
    existing = rgc.seek(genome_name=gname, asset_name=aname, tag_name=tname)
    declined_prompt = mock.patch("refgenconf.refgenconf.query_yes_no",
                                 return_value=False)
    with declined_prompt:
        assert not rgc.add(existing, gname, aname, tname)
def test_nonexistent_file(self, cfg_file, pth, gname, aname, tname):
    """Adding a path that does not exist raises OSError."""
    conf = RefGenConf(filepath=cfg_file)
    with pytest.raises(OSError):
        conf.add(pth, gname, aname, tname)
def main():
    """ Primary workflow """
    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)
    # Also configures the refgenconf package logger with the same CLI options;
    # the returned logger is intentionally discarded.
    logmuse.logger_via_cli(args, name=refgenconf.__name__)
    _LOGGER.debug("Args: {}".format(args))
    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)
    # For `init` the config file may not exist yet, so the existence check is
    # skipped and the missing path is passed through unchanged.
    gencfg = yacman.select_config(args.genome_config,
                                  CFG_ENV_VARS,
                                  check_exist=not args.command == INIT_CMD,
                                  on_missing=lambda fp: fp)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))
    if args.command == INIT_CMD:
        _LOGGER.info("Initializing refgenie genome configuration")
        # Fail early if the target directory is missing or not writable.
        _writeable(os.path.dirname(gencfg), strict_exists=True)
        refgenie_init(gencfg, args.genome_server)
        sys.exit(0)
    rgc = RefGenConf(gencfg)
    if args.command == BUILD_CMD:
        refgenie_build(rgc, args)
    elif args.command == GET_ASSET_CMD:
        _LOGGER.debug("getting asset: '{}/{}'".format(args.genome, args.asset))
        # One space-separated line with the path of each requested asset.
        print(" ".join(
            [rgc.get_asset(args.genome, asset) for asset in args.asset]))
        return
    elif args.command == INSERT_CMD:
        if len(args.asset) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            # recast from list to str
            args.asset = args.asset[0]
        refgenie_add(rgc, args)
    elif args.command == PULL_CMD:
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        # Bail out silently (after logging inside the helpers) when the
        # genome folder lacks execute or write permission.
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: "
                          "{}".format(target, outdir))
            return
        rgc.pull_asset(args.genome,
                       args.asset,
                       gencfg,
                       unpack=not args.no_untar)
    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        pfx, genomes, assets = _exec_list(rgc, args.command == LIST_REMOTE_CMD)
        _LOGGER.info("{} genomes: {}".format(pfx, genomes))
        _LOGGER.info("{} assets:\n{}".format(pfx, assets))
def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
    """
    Runs the refgenie build recipe.

    :param str gencfg: path to the genome configuration file
    :param str genome: genome name/alias all requested assets belong to
    :param list[dict] asset_list: parsed registry paths (genome/asset/tag
        dicts) of the assets to build
    :param str | None recipe_name: recipe to use; may also be a path to a
        JSON recipe file, or None to derive the recipe from each asset name
    :param argparse.Namespace args: parsed command-line options/arguments
    """
    rgc = RefGenConf(
        filepath=gencfg,
        writable=False,
        skip_read_lock=_skip_lock(args.skip_read_lock, gencfg),
    )
    specified_args = _parse_user_build_input(args.files)
    specified_params = _parse_user_build_input(args.params)

    def _read_json_file(filepath):
        """
        Read a JSON file

        :param str filepath: path to the file to read
        :return dict: read data
        """
        with open(filepath, "r") as f:
            data = json.load(f)
        return data

    # A recipe given as a path to a .json file is loaded into a dict here;
    # downstream code branches on recipe_name being a dict vs a string.
    if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(
            ".json"):
        recipe_name = _read_json_file(filepath=recipe_name)

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.data_dir

    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually run the build commands in a given build package,
        and then update the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param str tag: tag to build the asset under
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        :param str genome_outfolder: folder the asset content is written into
        :param dict specific_args: user-provided input file paths
        :param dict specific_params: user-provided recipe parameters
        :param str alias: human-readable genome alias to register
        :param kwargs: parent asset paths, keyed by registry path
        :return bool | None: True on success, False on build failure, None
            when the output folder is not writable
        """
        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # TODO: is volumes list defined here?
                # NOTE(review): list.append() returns None, so this assignment
                # sets `volumes` to None, and `volumes` is referenced before
                # assignment on the first pass — this branch looks broken;
                # presumably it was meant to be `volumes = args.volumes +
                # [genome_outfolder]` or similar. Confirm before relying on
                # the docker-with-volumes path.
                volumes = volumes.append(genome_outfolder)
            else:
                volumes = genome_outfolder
        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return
        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])
        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info("Asset digest: {}".format(digest))
            # add updates to config file
            with rgc as r:
                # the fasta asset also registers the genome alias
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
            pm.stop_pipeline()
            return True

    for a in asset_list:
        asset_key = a["asset"]
        asset_tag = a["tag"] or rgc.get_default_tag(
            genome, a["asset"], use_existing=False)
        # no explicit recipe: the asset name doubles as the recipe name
        recipe_name = recipe_name or asset_key
        if isinstance(
                recipe_name,
                dict) or (isinstance(recipe_name, str)
                          and recipe_name in asset_build_packages.keys()):
            if isinstance(recipe_name, dict):
                _LOGGER.info("Using custom recipe: \n{}".format(recipe_name))
                asset_build_package = _check_recipe(recipe_name)
                recipe_name = asset_build_package["name"]
            else:
                asset_build_package = _check_recipe(
                    asset_build_packages[recipe_name])
            # handle user-requested parents for the required assets
            input_assets = {}
            parent_assets = []
            specified_asset_keys, specified_assets = None, None
            if args.assets is not None:
                parsed_parents_input = _parse_user_build_input(args.assets)
                specified_asset_keys = list(parsed_parents_input.keys())
                specified_assets = list(parsed_parents_input.values())
                _LOGGER.debug(f"Custom assets requested: {args.assets}")
            if not specified_asset_keys and isinstance(args.assets, list):
                _LOGGER.warning(
                    "Specified parent assets format is invalid. Using defaults."
                )
            for req_asset in asset_build_package[REQ_ASSETS]:
                req_asset_data = parse_registry_path(req_asset[KEY])
                # for each req asset see if non-default parents were requested
                if (specified_asset_keys is not None
                        and req_asset_data["asset"] in specified_asset_keys):
                    parent_data = parse_registry_path(
                        specified_assets[specified_asset_keys.index(
                            req_asset_data["asset"])])
                    g, a, t, s = (
                        parent_data["genome"],
                        parent_data["asset"],
                        parent_data["tag"]
                        or rgc.get_default_tag(genome, parent_data["asset"]),
                        parent_data["seek_key"],
                    )
                else:
                    # if no custom parents requested for the req asset, use default one
                    default = parse_registry_path(req_asset[DEFAULT])
                    g, a, t, s = (
                        genome,
                        default["asset"],
                        rgc.get_default_tag(genome, default["asset"]),
                        req_asset_data["seek_key"],
                    )
                # record the parent as digest/asset:tag for relationship updates
                parent_assets.append("{}/{}:{}".format(
                    rgc.get_genome_alias_digest(g, fallback=True), a, t))
                input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s)
            _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets)))
            _LOGGER.debug("Provided files: {}".format(specified_args))
            _LOGGER.debug("Provided parameters: {}".format(specified_params))
            # validate that every required input file was provided
            for required_file in asset_build_package[REQ_FILES]:
                if (specified_args is None
                        or required_file[KEY] not in specified_args.keys()):
                    raise ValueError(
                        "Path to the '{x}' input ({desc}) is required, but not provided. "
                        "Specify it with: --files {x}=/path/to/{x}_file".
                        format(x=required_file[KEY], desc=required_file[DESC]))
            # fill in missing params from recipe defaults, or fail
            for required_param in asset_build_package[REQ_PARAMS]:
                if specified_params is None:
                    specified_params = {}
                if required_param[KEY] not in specified_params.keys():
                    if required_param[DEFAULT] is None:
                        raise ValueError(
                            "Value for the parameter '{x}' ({desc}) is required, but not provided. "
                            "Specify it with: --params {x}=value".format(
                                x=required_param[KEY],
                                desc=required_param[DESC]))
                    else:
                        specified_params.update(
                            {required_param[KEY]: required_param[DEFAULT]})
            _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(
                genome, asset_key, asset_tag, recipe_name))
            ori_genome = genome
            if recipe_name == "fasta":
                if (genome in rgc.genomes_list()
                        and "fasta" in rgc.list_assets_by_genome(genome)):
                    pretag = rgc.get_default_tag(genome, "fasta")
                    _LOGGER.warning(
                        "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})"
                        .format(g=genome, a=asset_key, t=pretag))
                    genome = rgc.get_genome_alias_digest(alias=genome,
                                                         fallback=True)
                else:
                    # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file
                    genome, _ = rgc.initialize_genome(
                        fasta_path=specified_args["fasta"],
                        alias=ori_genome,
                        skip_alias_write=True,
                    )
            else:
                try:
                    genome = rgc.get_genome_alias_digest(genome, fallback=True)
                except UndefinedAliasError:
                    _LOGGER.error("Genome '{}' has not been initialized yet; "
                                  "no key found for this alias".format(genome))
                    return
            # reset so the next asset in the list derives its own recipe
            recipe_name = None
            genome_outfolder = os.path.join(args.outfolder, genome)
            if not _build_asset(
                    genome,
                    asset_key,
                    asset_tag,
                    asset_build_package,
                    genome_outfolder,
                    specified_args,
                    specified_params,
                    ori_genome,
                    **input_assets,
            ):
                log_path = os.path.abspath(
                    os.path.join(
                        genome_outfolder,
                        asset_key,
                        asset_tag,
                        BUILD_STATS_DIR,
                        ORI_LOG_NAME,
                    ))
                _LOGGER.info(
                    "'{}/{}:{}' was not added to the config, but directory has been left in place. "
                    "See the log file for details: {}".format(
                        genome, asset_key, asset_tag, log_path))
                return
            _LOGGER.info("Finished building '{}' asset".format(asset_key))
            with rgc as r:
                # update asset relationships
                r.update_relatives_assets(genome, asset_key, asset_tag,
                                          parent_assets)  # adds parents
                for i in parent_assets:
                    parsed_parent = parse_registry_path(i)
                    # adds child (currently built asset) to the parent
                    r.update_relatives_assets(
                        parsed_parent["genome"],
                        parsed_parent["asset"],
                        parsed_parent["tag"],
                        ["{}/{}:{}".format(genome, asset_key, asset_tag)],
                        True,
                    )
                if args.genome_description is not None:
                    _LOGGER.debug(
                        "adding genome ({}) description: '{}'".format(
                            genome, args.genome_description))
                    r.update_genomes(
                        genome,
                        {CFG_GENOME_DESC_KEY: args.genome_description})
                if args.tag_description is not None:
                    _LOGGER.debug(
                        "adding tag ({}/{}:{}) description: '{}'".format(
                            genome, asset_key, asset_tag,
                            args.tag_description))
                    r.update_tags(
                        genome,
                        asset_key,
                        asset_tag,
                        {CFG_TAG_DESC_KEY: args.tag_description},
                    )
            rgc._symlink_alias(genome, asset_key, asset_tag)
        else:
            _raise_missing_recipe_error(recipe_name)
def main():
    """ Primary workflow """
    parser = logmuse.add_logging_options(build_argparser())
    args, remaining_args = parser.parse_known_args()
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.debug("refgenie {}".format(__version__))
    _LOGGER.debug("Args: {}".format(args))
    if not args.command:
        parser.print_help()
        _LOGGER.error("No command given")
        sys.exit(1)
    # For `init` the config file may not exist yet, so the existence check is
    # skipped and the missing path is passed through unchanged.
    gencfg = refgenconf.select_genome_config(
        filename=args.genome_config,
        check_exist=not args.command == INIT_CMD,
        on_missing=lambda fp: fp,
        strict_env=True)
    if gencfg is None:
        raise MissingGenomeConfigError(args.genome_config)
    _LOGGER.debug("Determined genome config: {}".format(gencfg))
    # From user input we want to construct a list of asset dicts, where each
    # asset has a genome name, asset name, and tag
    if "asset_registry_paths" in args and args.asset_registry_paths:
        _LOGGER.debug("Found registry_path: {}".format(
            args.asset_registry_paths))
        asset_list = [
            parse_registry_path(x) for x in args.asset_registry_paths
        ]
        for a in asset_list:
            # every asset must have a genome, either provided via registry path
            # or the args.genome arg.
            if not a["genome"]:
                if args.genome:
                    a["genome"] = args.genome
                else:
                    _LOGGER.error(
                        "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference."
                        .format(a["genome"], a["asset"], a["tag"]))
                    sys.exit(1)
            else:
                if args.genome and args.genome != a["genome"]:
                    # NOTE(review): Logger.warn is deprecated in favor of
                    # Logger.warning.
                    _LOGGER.warn(
                        "Two different genomes specified for asset '{}'.".
                        format(a["asset"]))
    else:
        # no registry paths given; some commands cannot proceed without them
        if args.command in GENOME_ONLY_REQUIRED and not args.genome:
            parser.error("You must provide either a genome or a registry path")
            sys.exit(1)
        if args.command in ASSET_REQUIRED:
            parser.error("You must provide an asset registry path")
            sys.exit(1)
    if args.command == INIT_CMD:
        _LOGGER.debug("Initializing refgenie genome configuration")
        rgc = RefGenConf(entries=OrderedDict(
            {
                CFG_VERSION_KEY: REQ_CFG_VERSION,
                CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
                CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
                CFG_GENOMES_KEY: None
            }))
        rgc.initialize_config_file(os.path.abspath(gencfg))
    elif args.command == BUILD_CMD:
        if not all(
            [x["genome"] == asset_list[0]["genome"] for x in asset_list]):
            _LOGGER.error("Build can only build assets for one genome")
            sys.exit(1)
        recipe_name = None
        if args.recipe:
            if len(asset_list) > 1:
                _LOGGER.error(
                    "Recipes cannot be specified for multi-asset builds")
                sys.exit(1)
            recipe_name = args.recipe
        if args.requirements:
            # requirements listing only; exit before any build work
            for a in asset_list:
                recipe = recipe_name or a["asset"]
                if recipe not in asset_build_packages.keys():
                    _raise_missing_recipe_error(recipe)
                _LOGGER.info("'{}' recipe requirements: ".format(recipe))
                _make_asset_build_reqs(recipe)
            sys.exit(0)
        refgenie_build(gencfg, asset_list[0]["genome"], asset_list,
                       recipe_name, args)
    elif args.command == GET_ASSET_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        check = args.check_exists if args.check_exists else None
        for a in asset_list:
            _LOGGER.debug("getting asset: '{}/{}.{}:{}'".format(
                a["genome"], a["asset"], a["seek_key"], a["tag"]))
            print(
                rgc.seek(a["genome"],
                         a["asset"],
                         a["tag"],
                         a["seek_key"],
                         strict_exists=check))
        return
    elif args.command == INSERT_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only add 1 asset at a time")
        else:
            refgenie_add(rgc, asset_list[0], args.path, args.force)
    elif args.command == PULL_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        # force is tri-state downstream: None means "ask", True means "do it"
        force = None if not args.force else True
        outdir = rgc[CFG_FOLDER_KEY]
        if not os.path.exists(outdir):
            raise MissingFolderError(outdir)
        target = _key_to_name(CFG_FOLDER_KEY)
        if not perm_check_x(outdir, target):
            return
        if not _single_folder_writeable(outdir):
            _LOGGER.error("Insufficient permissions to write to {}: {}".format(
                target, outdir))
            return
        for a in asset_list:
            rgc.pull(a["genome"],
                     a["asset"],
                     a["tag"],
                     unpack=not args.no_untar,
                     force=force)
    elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if args.command == LIST_REMOTE_CMD:
            num_servers = 0
            # Keep all servers so that child updates maintain server list
            server_list = rgc[CFG_SERVERS_KEY]
            bad_servers = []
            # query each subscribed server individually, collecting failures
            for server_url in rgc[CFG_SERVERS_KEY]:
                num_servers += 1
                try:
                    rgc[CFG_SERVERS_KEY] = server_url
                    pfx, genomes, assets, recipes = _exec_list(
                        rgc, args.command == LIST_REMOTE_CMD, args.genome)
                    if assets is None and genomes is None:
                        continue
                    _LOGGER.info("{} genomes: {}".format(pfx, genomes))
                    if args.command != LIST_REMOTE_CMD:
                        # Not implemented yet
                        _LOGGER.info("{} recipes: {}".format(pfx, recipes))
                    _LOGGER.info("{} assets:\n{}\n".format(pfx, assets))
                except (DownloadJsonError, ConnectionError):
                    bad_servers.append(server_url)
                    continue
            if num_servers >= len(server_list) and bad_servers:
                _LOGGER.error(
                    "Could not list assets from the following server(s): {}".
                    format(bad_servers))
            # Restore original server list, even when we couldn't find assets on a server
            rgc[CFG_SERVERS_KEY] = server_list
        else:
            # Only check local assets once
            _LOGGER.info("Server subscriptions: {}".format(", ".join(
                rgc[CFG_SERVERS_KEY])))
            pfx, genomes, assets, recipes = _exec_list(
                rgc, args.command == LIST_REMOTE_CMD, args.genome)
            _LOGGER.info("{} genomes: {}".format(pfx, genomes))
            if args.command != LIST_REMOTE_CMD:
                # Not implemented yet
                _LOGGER.info("{} recipes: {}".format(pfx, recipes))
            _LOGGER.info("{} assets:\n{}".format(pfx, assets))
    elif args.command == GETSEQ_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        # NOTE(review): passing `rgc` as the first positional argument to a
        # bound method looks suspicious — verify getseq's signature; this may
        # shift genome/locus into the wrong parameters.
        rgc.getseq(rgc, args.genome, args.locus)
    elif args.command == REMOVE_CMD:
        force = args.force
        rgc = RefGenConf(filepath=gencfg)
        for a in asset_list:
            a["tag"] = a["tag"] or rgc.get_default_tag(
                a["genome"], a["asset"], use_existing=False)
            _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
            if a["seek_key"] is not None:
                raise NotImplementedError(
                    "You can't remove a specific seek_key.")
            bundle = [a["genome"], a["asset"], a["tag"]]
            try:
                # incomplete assets are removed immediately, without prompting
                if not rgc.is_asset_complete(*bundle):
                    with rgc as r:
                        r.cfg_remove_assets(*bundle)
                    _LOGGER.info(
                        "Removed an incomplete asset '{}/{}:{}'".format(
                            *bundle))
                    return
            except (KeyError, MissingAssetError, MissingGenomeError):
                _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle))
                return
        if len(asset_list) > 1:
            # one confirmation for the whole batch, then force the removals
            if not query_yes_no(
                    "Are you sure you want to remove {} assets?".format(
                        len(asset_list))):
                _LOGGER.info("Action aborted by the user")
                return
            force = True
        for a in asset_list:
            rgc.remove(genome=a["genome"],
                       asset=a["asset"],
                       tag=a["tag"],
                       force=force)
    elif args.command == TAG_CMD:
        rgc = RefGenConf(filepath=gencfg)
        if len(asset_list) > 1:
            raise NotImplementedError("Can only tag 1 asset at a time")
        # NOTE(review): `a` here relies on the loop variable leaked from the
        # registry-path loop near the top of this function; using
        # asset_list[0] explicitly would be clearer and safer — confirm.
        if args.default:
            # set the default tag and exit
            with rgc as r:
                r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
            sys.exit(0)
        rgc.tag(a["genome"], a["asset"], a["tag"], args.tag)
    elif args.command == ID_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        if len(asset_list) == 1:
            g, a = asset_list[0]["genome"], asset_list[0]["asset"]
            t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
            print(rgc.id(g, a, t))
            return
        for asset in asset_list:
            g, a = asset["genome"], asset["asset"]
            t = asset["tag"] or rgc.get_default_tag(g, a)
            print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
        return
    elif args.command == SUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.subscribe(urls=args.genome_server, reset=args.reset)
        return
    elif args.command == UNSUBSCRIBE_CMD:
        rgc = RefGenConf(filepath=gencfg, writable=False)
        rgc.unsubscribe(urls=args.genome_server)
        return
def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
    """
    Runs the refgenie build recipe.

    :param str gencfg: path to the genome configuration file
    :param str genome: genome/assembly key to build assets for; e.g. 'mm10'
    :param list[dict] asset_list: parsed asset registry paths; each dict has
        'asset', 'tag' and 'seek_key' entries (tag/seek_key may be None)
    :param str recipe_name: name of the recipe to use; falls back to the
        asset name when not provided
    :param argparse.Namespace args: parsed command-line options/arguments
    """
    # read-only here; writes below happen through the `with rgc as r` pattern,
    # which locks and reloads the config for each update
    rgc = RefGenConf(filepath=gencfg, writable=False)
    specified_args = _parse_user_build_input(args.files)
    specified_params = _parse_user_build_input(args.params)
    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc[CFG_FOLDER_KEY]
    _LOGGER.debug("Default config file: {}".format(default_config_file()))
    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder,
                    specific_args, specific_params, **kwargs):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually run the build commands in a given build package,
        and then update the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param str tag: tag to build the asset under
        :param dict build_pkg: A dict (see examples) specifying lists of
            required input_assets, commands to run, and outputs to register as
            assets.
        :param str genome_outfolder: folder the asset contents are written to
        :param dict specific_args: user-provided input files (--files)
        :param dict specific_params: user-provided parameters (--params)
        :return bool | None: True on success, False on build failure, None
            when the output folder is not writable
        """
        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # BUGFIX: the original did `volumes = volumes.append(...)`,
                # which raised NameError (volumes unbound) and would have
                # assigned None anyway (list.append returns None). Copy the
                # user-supplied list so the CLI namespace is never mutated,
                # and always mount the output folder as well.
                volumes = list(args.volumes) + [genome_outfolder]
            else:
                volumes = genome_outfolder
        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return
        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        gat = [genome, asset_key, tag]  # create a bundle list to simplify calls below
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(genome, asset_key, tag, genome_outfolder,
                                    specific_args, specific_params, **kwargs)
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since
        # these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])
        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command; SIGINT handler removes the partial asset
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      'w') as outfile:
                json.dump(build_pkg, outfile)
            # update and write refgenie genome configuration
            with rgc as r:
                r.update_assets(*gat[0:2],
                                data={CFG_ASSET_DESC_KEY: build_pkg[DESC]})
                r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key})
                r.update_seek_keys(*gat,
                                   keys={
                                       k: v.format(**asset_vars)
                                       for k, v in build_pkg[ASSETS].items()
                                   })
                # in order to conveniently get the path to digest we update
                # the tags metadata in two steps
                digest = get_dir_digest(
                    r.get_asset(genome, asset_key, tag, enclosing_dir=True),
                    pm)
                r.update_tags(*gat, data={CFG_ASSET_CHECKSUM_KEY: digest})
                _LOGGER.info("Asset digest: {}".format(digest))
                r.set_default_pointer(*gat)
        pm.stop_pipeline()
        return True

    for a in asset_list:
        asset_key = a["asset"]
        # use_existing=False: a brand-new asset gets the recipe's default tag
        asset_tag = a["tag"] or rgc.get_default_tag(
            genome, a["asset"], use_existing=False)
        recipe_name = recipe_name or asset_key
        if recipe_name in asset_build_packages:
            asset_build_package = _check_recipe(
                asset_build_packages[recipe_name])
            # handle user-requested parents for the required assets
            input_assets = {}
            parent_assets = []
            specified_asset_keys, specified_assets = None, None
            if args.assets is not None:
                parsed_parents_input = _parse_user_build_input(args.assets)
                specified_asset_keys, specified_assets = \
                    list(parsed_parents_input.keys()), \
                    list(parsed_parents_input.values())
                _LOGGER.debug("Custom assets requested: {}".format(
                    args.assets))
                if not specified_asset_keys and isinstance(args.assets, list):
                    _LOGGER.warning(
                        "Specified parent assets format is invalid. Using defaults."
                    )
            for req_asset in asset_build_package[REQ_ASSETS]:
                req_asset_data = parse_registry_path(req_asset[KEY])
                # for each req asset see if non-default parents were requested
                if specified_asset_keys is not None and req_asset_data[
                        "asset"] in specified_asset_keys:
                    parent_data = \
                        parse_registry_path(specified_assets[
                            specified_asset_keys.index(
                                req_asset_data["asset"])])
                    g, a, t, s = parent_data["genome"], \
                        parent_data["asset"], \
                        parent_data["tag"] or rgc.get_default_tag(
                            genome, parent_data["asset"]), \
                        parent_data["seek_key"]
                else:
                    # if no custom parents requested for the req asset,
                    # use default one
                    default = parse_registry_path(req_asset[DEFAULT])
                    g, a, t, s = genome, default["asset"], \
                        rgc.get_default_tag(genome, default["asset"]), \
                        req_asset_data["seek_key"]
                parent_assets.append("{}/{}:{}".format(g, a, t))
                input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s)
            _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets)))
            _LOGGER.debug("Provided files: {}".format(specified_args))
            _LOGGER.debug("Provided parameters: {}".format(specified_params))
            # every required input file must have been passed via --files
            for required_file in asset_build_package[REQ_FILES]:
                if specified_args is None or required_file[
                        KEY] not in specified_args:
                    raise ValueError(
                        "Path to the '{x}' input ({desc}) is required, but not provided. "
                        "Specify it with: --files {x}=/path/to/{x}_file".
                        format(x=required_file[KEY],
                               desc=required_file[DESC]))
            # required params fall back to recipe defaults when available
            for required_param in asset_build_package[REQ_PARAMS]:
                if specified_params is None:
                    specified_params = {}
                if required_param[KEY] not in specified_params:
                    if required_param[DEFAULT] is None:
                        raise ValueError(
                            "Value for the parameter '{x}' ({desc}) is required, but not provided. "
                            "Specify it with: --params {x}=value".format(
                                x=required_param[KEY],
                                desc=required_param[DESC]))
                    else:
                        specified_params.update(
                            {required_param[KEY]: required_param[DEFAULT]})
            genome_outfolder = os.path.join(args.outfolder, genome)
            _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(
                genome, asset_key, asset_tag, recipe_name))
            if recipe_name == 'fasta' and genome in rgc.genomes_list() \
                    and 'fasta' in rgc.list_assets_by_genome(genome):
                _LOGGER.warning(
                    "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). "
                    "It will be re-initialized.".format(g=genome,
                                                        a=asset_key,
                                                        t=asset_tag))
            if not build_asset(genome, asset_key, asset_tag,
                               asset_build_package, genome_outfolder,
                               specified_args, specified_params, **
                               input_assets):
                log_path = os.path.abspath(
                    os.path.join(genome_outfolder, asset_key, asset_tag,
                                 BUILD_STATS_DIR, ORI_LOG_NAME))
                _LOGGER.info(
                    "'{}/{}:{}' was not added to the config, but directory has been left in place. "
                    "See the log file for details: {}".format(
                        genome, asset_key, asset_tag, log_path))
                return
            # If the recipe was a fasta, we init the genome
            if recipe_name == 'fasta':
                _LOGGER.info("Computing initial genome digest...")
                collection_checksum, content_checksums = \
                    fasta_checksum(
                        _seek(rgc, genome, asset_key, asset_tag, "fasta"))
                _LOGGER.info("Initializing genome...")
                refgenie_initg(rgc, genome, content_checksums)
            _LOGGER.info("Finished building '{}' asset".format(asset_key))
            with rgc as r:
                # update asset relationships
                r.update_relatives_assets(genome, asset_key, asset_tag,
                                          parent_assets)  # adds parents
                for i in parent_assets:
                    parsed_parent = parse_registry_path(i)
                    # adds child (currently built asset) to the parent
                    r.update_relatives_assets(
                        parsed_parent["genome"], parsed_parent["asset"],
                        parsed_parent["tag"],
                        ["{}/{}:{}".format(genome, asset_key, asset_tag)],
                        True)
                if args.genome_description is not None:
                    _LOGGER.debug(
                        "adding genome ({}) description: '{}'".format(
                            genome, args.genome_description))
                    r.update_genomes(
                        genome,
                        {CFG_GENOME_DESC_KEY: args.genome_description})
                if args.tag_description is not None:
                    _LOGGER.debug(
                        "adding tag ({}/{}:{}) description: '{}'".format(
                            genome, asset_key, asset_tag,
                            args.tag_description))
                    r.update_tags(genome, asset_key, asset_tag,
                                  {CFG_TAG_DESC_KEY: args.tag_description})
                if recipe_name == "fasta":
                    # to save config lock time when building fasta assets
                    # (genome initialization takes some time for large
                    # genomes) we repeat the conditional here for writing the
                    # computed genome digest
                    r.update_genomes(
                        genome,
                        data={CFG_CHECKSUM_KEY: collection_checksum})
        else:
            _raise_missing_recipe_error(recipe_name)
def test_invalid_path(self, pth):
    """A non-path value passed as filepath makes initialization raise TypeError."""
    conf = RefGenConf()
    with pytest.raises(TypeError):
        conf.initialize_config_file(filepath=pth)