def main():
    global sc
    global rgc
    global _LOGGER
    parser = build_parser()
    parser = logmuse.add_logging_options(parser)
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        print("No subcommand given")
        sys.exit(1)
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    _LOGGER.info("Welcome to the SeqCol API app")
    scc = SeqColConf(filepath=args.config)
    _LOGGER.info(f"Connecting to database... {scc.database.host}")
    pgdb = RDBDict(
        scc.database.name,
        scc.database.user,
        scc.database.password,
        scc.database.host,
        scc.database.port,
    )
    rgc = refget.RefGetClient(scc.refget_provider_apis, pgdb)
    sc = SeqColClient(
        database=pgdb,
        api_url_base=scc.refget_provider_apis,
        schemas=scc.schemas,
    )
    seqcolapi_port = args.port if args.port else scc.server.port
    _LOGGER.info("Running on port {}".format(seqcolapi_port))
    uvicorn.run(app, host=scc.server.host, port=seqcolapi_port)
def test_silence(parser, cmdl, flag, hdlr_type):
    """ Log silencing generates a null handler. """
    opts = parser.parse_args(cmdl)
    assert getattr(opts, SILENCE_LOGS_OPTNAME.lstrip("-")) is flag
    logger = logger_via_cli(opts)
    hs = logger.handlers
    assert 1 == len(hs)
    assert isinstance(hs[0], hdlr_type)
def main(): """ Primary workflow """ from inspect import getdoc parser = logmuse.add_logging_options( build_argparser(getdoc(PipestatManager))) args = parser.parse_args() if args.command is None: parser.print_help(sys.stderr) sys.exit(1) global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) _LOGGER.debug("Args namespace:\n{}".format(args)) if args.database_config and not args.schema: parser.error("the following arguments are required: -s/--schema") psm = PipestatManager( name=args.namespace, schema_path=args.schema, results_file=args.results_file, database_config=args.database_config ) if args.command == REPORT_CMD: value = args.value result_metadata = psm.schema[args.result_identifier] if result_metadata[SCHEMA_TYPE_KEY] in ["object", "image", "file"] \ and os.path.exists(expandpath(value)): from json import load _LOGGER.info(f"Reading JSON file with object type value: " f"{expandpath(value)}") with open(expandpath(value), "r") as json_file: value = load(json_file) psm.report( result_identifier=args.result_identifier, record_identifier=args.record_identifier, value=value, force_overwrite=args.overwrite, strict_type=not args.try_convert ) sys.exit(0) if args.command == INSPECT_CMD: print("\n") print(psm) if args.data: print("\nData:") print(psm.data) sys.exit(0) if args.command == REMOVE_CMD: psm.remove( result_identifier=args.result_identifier, record_identifier=args.record_identifier ) sys.exit(0) if args.command == RETRIEVE_CMD: print(psm.retrieve( result_identifier=args.result_identifier, record_identifier=args.record_identifier )) sys.exit(0)
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(build_argparser()) args = parser.parse_args() global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) msg = "Input: {input}; Parameter: {parameter}" _LOGGER.info(msg.format(input=args.input, parameter=args.parameter))
def logger_via_cli(opts, **kwargs):
    """
    Build and initialize logger from CLI specification.

    :param argparse.Namespace opts: parse of command-line interface
    :param kwargs: keyword arguments to pass along to underlying logmuse function
    :return logging.Logger: newly created and configured logger
    """
    from copy import deepcopy
    import logmuse
    kwds = deepcopy(kwargs)
    # By default, don't require the logging options to have been added to the parser.
    kwds.setdefault("strict", False)
    return logmuse.logger_via_cli(opts, **kwds)
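# A minimal, hypothetical sketch (not taken from any of the projects above) of the
# wiring pattern these snippets share: add logmuse's logging options to an argparse
# parser, parse the CLI, then build the logger from the parsed namespace. The
# "demo_main" name and "--input" option are illustrative assumptions; only
# add_logging_options() and logger_via_cli() come from logmuse itself.
import argparse
import logmuse


def demo_main(argv=None):
    parser = argparse.ArgumentParser(description="Example tool using logmuse")
    parser.add_argument("--input", help="Path to an input file (illustrative only)")
    parser = logmuse.add_logging_options(parser)
    args = parser.parse_args(argv)
    # make_root=True configures the root logger, so library loggers propagate too.
    logger = logmuse.logger_via_cli(args, make_root=True)
    logger.info("Input: %s", args.input)


if __name__ == "__main__":
    demo_main()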
def main(cmdl): """ Run the script. """ args = _parse_cmdl(cmdl) global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) _LOGGER.debug("Creating counter") counter = ReadCounter(args.readsfile, cores=args.cores, outfile=args.outfile, action="CountReads", limit=args.limit) _LOGGER.debug("Registering files") counter.register_files() _LOGGER.info("Counting reads: {}".format(args.readsfile)) good_chromosomes = counter.run() _LOGGER.info("Collecting read counts: {}".format(args.outfile)) counter.combine(good_chromosomes, chrom_sep="\n")
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(arguments.build_argparser()) args, remaining_args = parser.parse_known_args() global _LOGGER _LOGGER = logmuse.logger_via_cli(args) _LOGGER.info("Welcome to bedshift version {}".format(__version__)) _LOGGER.info("Shifting file: '{}'".format(args.bedfile)) if not args.bedfile: parser.print_help() _LOGGER.error("No BED file given") sys.exit(1) if args.chrom_lengths: pass elif args.genome: try: import refgenconf rgc = refgenconf.RefGenConf(refgenconf.select_genome_config()) args.chrom_lengths = rgc.seek(args.genome, "fasta", None, "chrom_sizes") except ModuleNotFoundError: _LOGGER.error( "You must have package refgenconf installed to use a refgenie genome" ) sys.exit(1) msg = arguments.param_msg if args.repeat < 1: _LOGGER.error("repeats specified is less than 1") sys.exit(1) if args.outputfile: outfile_base = args.outputfile else: outfile_base = "bedshifted_{}".format(os.path.basename(args.bedfile)) _LOGGER.info( msg.format( bedfile=args.bedfile, chromsizes=args.chrom_lengths, droprate=args.droprate, dropfile=args.dropfile, addrate=args.addrate, addmean=args.addmean, addstdev=args.addstdev, addfile=args.addfile, valid_regions=args.valid_regions, shiftrate=args.shiftrate, shiftmean=args.shiftmean, shiftstdev=args.shiftstdev, shiftfile=args.shiftfile, cutrate=args.cutrate, mergerate=args.mergerate, outputfile=outfile_base, repeat=args.repeat, yaml_config=args.yaml_config, )) bedshifter = Bedshift(args.bedfile, args.chrom_lengths) _LOGGER.info(f"Generating {args.repeat} repetitions...") pct_reports = [int(x * args.repeat / 100) for x in [5, 25, 50, 75, 100]] for i in range(args.repeat): n = bedshifter.all_perturbations( args.addrate, args.addmean, args.addstdev, args.addfile, args.valid_regions, args.shiftrate, args.shiftmean, args.shiftstdev, args.shiftfile, args.cutrate, args.mergerate, args.droprate, args.dropfile, args.yaml_config, ) if args.repeat == 1: bedshifter.to_bed(outfile_base) _LOGGER.info( "REGION COUNT | original: {}\tnew: {}\tchanged: {}\t\noutput file: {}" .format( bedshifter.original_num_regions, bedshifter.bed.shape[0], str(n), outfile_base, )) else: basename, ext = os.path.splitext(os.path.basename(outfile_base)) dirname = os.path.dirname(outfile_base) digits = int(math.log10(args.repeat)) + 1 rep = str(i + 1).zfill(digits) modified_outfile_path = os.path.join(dirname, f"{basename}_rep{rep}{ext}") bedshifter.to_bed(modified_outfile_path) pct_finished = int((100 * (i + 1)) / args.repeat) if i + 1 in pct_reports: _LOGGER.info( f"Rep {i+1}. Finished: {pct_finished}%. Output file: {modified_outfile_path}" ) bedshifter.reset_bed()
def main(): """Primary workflow""" from inspect import getdoc parser = logmuse.add_logging_options( build_argparser(getdoc(PipestatManager))) args = parser.parse_args() if args.command is None: parser.print_help(sys.stderr) sys.exit(1) global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) _LOGGER.debug("Args namespace:\n{}".format(args)) if args.config and not args.schema and args.command != STATUS_CMD: parser.error("the following arguments are required: -s/--schema") psm = PipestatManager( namespace=args.namespace, schema_path=args.schema, results_file_path=args.results_file, config=args.config, database_only=args.database_only, status_schema_path=args.status_schema, flag_file_dir=args.flag_dir, ) if args.command == REPORT_CMD: value = args.value if psm.schema is None: raise SchemaNotFoundError(msg="report", cli=True) result_metadata = psm.schema[args.result_identifier] if (result_metadata[SCHEMA_TYPE_KEY] in [ "object", "image", "file", ] and os.path.exists(expandpath(value))): from json import load _LOGGER.info( f"Reading JSON file with object type value: {expandpath(value)}" ) with open(expandpath(value), "r") as json_file: value = load(json_file) psm.report( record_identifier=args.record_identifier, values={args.result_identifier: value}, force_overwrite=args.overwrite, strict_type=args.skip_convert, ) if args.command == INSPECT_CMD: print("\n") print(psm) if args.data and not args.database_only: print("\nData:") print(psm.data) if args.command == REMOVE_CMD: psm.remove( result_identifier=args.result_identifier, record_identifier=args.record_identifier, ) if args.command == RETRIEVE_CMD: print( psm.retrieve( result_identifier=args.result_identifier, record_identifier=args.record_identifier, )) if args.command == STATUS_CMD: if args.subcommand == STATUS_GET_CMD: print(psm.get_status(record_identifier=args.record_identifier)) if args.subcommand == STATUS_SET_CMD: psm.set_status( status_identifier=args.status_identifier, record_identifier=args.record_identifier, ) sys.exit(0)
def main(): """Run the script.""" cmdl = sys.argv[1:] args = _parse_cmdl(cmdl) global _LOGGER _LOGGER = logmuse.logger_via_cli(args) delete_sra = False # initialize to False # Name the pipeline run after the first element to convert. # Maybe we should just have a separate pipeline for each file? if args.sample_name: run_name = "_".join(uniqify(args.sample_name)) else: primary_srr_acc = os.path.splitext(os.path.basename(args.srr[0]))[0] run_name = primary_srr_acc if args.output_parent: outfolder = os.path.join(args.output_parent, run_name) else: outfolder = os.path.join(args.srafolder, "sra_convert_pipeline", run_name) _LOGGER.info("Using outfolder: {}".format(outfolder)) nfiles = len(args.srr) failed_files = [] pm = pypiper.PipelineManager(name="sra_convert", outfolder=outfolder, args=args) for i in range(nfiles): srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0] pm.info("Processing {} of {} files: {}".format(str(i + 1), str(nfiles), srr_acc)) bamfile = os.path.join(args.bamfolder, srr_acc + ".bam") fq_prefix = os.path.join(args.fqfolder, srr_acc) if args.mode == "convert": infile = args.srr[i] if not os.path.isfile(infile): pm.warning("Couldn't find sra file at: {}.".format(infile)) failed_files.append(args.srr[i]) if args.format == "fastq": # fastq-dump --split-files will produce *_1.fastq and *_2.fastq # for paired-end data, and only *_1.fastq for single-end data. outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix) cmd = "fastq-dump {data_source} --split-files --gzip -O {outfolder}".format( data_source=infile, outfolder=args.fqfolder, nofail=True) elif args.format == "bam": outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam") cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format( data_source=infile, outfile=outfile, nofail=True) else: raise KeyError("Unknown format: {}".format(args.format)) target = outfile ret = pm.run(cmd, target=target) if ret == 0: pm.info("Already completed files: {}".format(failed_files)) try: failed_files.remove(infile) except: pass elif args.mode == "delete_bam": pm.timestamp("Cleaning bam file: {}".format(bamfile)) pm.clean_add(bamfile) elif args.mode == "delete_fq": pm.timestamp("Cleaning fastq file(s): {}*".format(fq_prefix)) fq_prefix = os.path.join(args.fqfolder, srr_acc) pm.clean_add("{fq_prefix}.fastq.gz".format(fq_prefix=fq_prefix)) pm.clean_add( "{fq_prefix}_[0-9].fastq.gz".format(fq_prefix=fq_prefix)) elif args.mode == "delete_sra": delete_sra = True # if specifically requested to delete sra files if not args.keep_sra and os.path.isfile(outfile): # Only delete if the output file was created... # we can't trust the sra toolkit return codes because they # can return 0 even if the command didn't complete, causing us to # delete the sra file when we have no other copy of that data. delete_sra = True if delete_sra: pm.timestamp("Cleaning sra file: {}".format(infile)) pm.clean_add(infile) if len(failed_files) > 0: pm.fail_pipeline( Exception("Unable to locate the following files: {}".format( ",".join(failed_files)))) pm.stop_pipeline()
    parser.add_argument('--retain-temp', action='store_true',
                        default=False,
                        help="Retain temporary files? Default: False")
    parser = logmuse.add_logging_options(parser)
    args = parser.parse_args(cmdl)
    if not (args.exactbw or args.smoothbw):
        parser.error('No output requested, use --exactbw and/or --smoothbw')
    return args


if __name__ == "__main__":
    args = parse_args(sys.argv[1:])
    _LOGGER = logmuse.logger_via_cli(args, make_root=True)
    if args.mode == "dnase":
        shift_factor = {"+": 1, "-": 0}  # DNase
    elif args.mode == "atac":
        shift_factor = {"+": 4, "-": -5}  # ATAC
    else:
        shift_factor = {"+": 0, "-": 0}
    ct = CutTracer(reads_filename=args.infile,
                   chrom_sizes_file=args.chrom_sizes_file,
                   scale=args.scale,
                   variable_step=args.variable_step,
                   exactbw=args.exactbw,
                   smoothbw=args.smoothbw,
                   step_size=args.step_size,
                   bedout=args.bedout,
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(build_argparser()) args, remaining_args = parser.parse_known_args() global _LOGGER _LOGGER = logmuse.logger_via_cli(args) logmuse.logger_via_cli(args, name=refgenconf.__name__) _LOGGER.debug("Args: {}".format(args)) if not args.command: parser.print_help() _LOGGER.error("No command given") sys.exit(1) gencfg = yacman.select_config(args.genome_config, CFG_ENV_VARS, check_exist=not args.command == INIT_CMD, on_missing=lambda fp: fp) if gencfg is None: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) if args.command == INIT_CMD: _LOGGER.info("Initializing refgenie genome configuration") _writeable(os.path.dirname(gencfg), strict_exists=True) refgenie_init(gencfg, args.genome_server) sys.exit(0) rgc = RefGenConf(gencfg) if args.command == BUILD_CMD: refgenie_build(rgc, args) elif args.command == GET_ASSET_CMD: _LOGGER.debug("getting asset: '{}/{}'".format(args.genome, args.asset)) print(" ".join( [rgc.get_asset(args.genome, asset) for asset in args.asset])) return elif args.command == INSERT_CMD: if len(args.asset) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: # recast from list to str args.asset = args.asset[0] refgenie_add(rgc, args) elif args.command == PULL_CMD: outdir = rgc[CFG_FOLDER_KEY] if not os.path.exists(outdir): raise MissingFolderError(outdir) target = _key_to_name(CFG_FOLDER_KEY) if not perm_check_x(outdir, target): return if not _single_folder_writeable(outdir): _LOGGER.error("Insufficient permissions to write to {}: " "{}".format(target, outdir)) return rgc.pull_asset(args.genome, args.asset, gencfg, unpack=not args.no_untar) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: pfx, genomes, assets = _exec_list(rgc, args.command == LIST_REMOTE_CMD) _LOGGER.info("{} genomes: {}".format(pfx, genomes)) _LOGGER.info("{} assets:\n{}".format(pfx, assets))
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(build_argparser()) args, remaining_args = parser.parse_known_args() logger_kwargs = {"level": args.verbosity, "devmode": args.logdev} logmuse.init_logger(name="yacman", **logger_kwargs) global _LOGGER _LOGGER = logmuse.logger_via_cli(args) _LOGGER.debug("Command given: {}".format(args.command)) if not args.command: parser.print_help() _LOGGER.error("No command given") sys.exit(1) if args.command == "init": bulkercfg = args.config _LOGGER.debug("Initializing bulker configuration") _is_writable(os.path.dirname(bulkercfg), check_exist=False) bulker_init(bulkercfg, DEFAULT_CONFIG_FILEPATH, args.engine) sys.exit(0) bulkercfg = select_bulker_config(args.config) bulker_config = yacman.YacAttMap(filepath=bulkercfg, writable=False) if args.command == "list": # Output header via logger and content via print so the user can # redirect the list from stdout if desired without the header as clutter if args.simple: fmt = "{namespace}/{crate}:{tag}" else: _LOGGER.info("Available crates:") fmt = "{namespace}/{crate}:{tag} -- {path}" if bulker_config.bulker.crates: for namespace, crates in bulker_config.bulker.crates.items(): for crate, tags in crates.items(): for tag, path in tags.items(): print( fmt.format(namespace=namespace, crate=crate, tag=tag, path=path)) else: _LOGGER.info( "No crates available. Use 'bulker load' to load a crate.") sys.exit(1) # For all remaining commands we need a crate identifier _LOGGER.info("Bulker config: {}".format(bulkercfg)) if args.command == "activate": try: cratelist = parse_registry_paths( args.crate_registry_paths, bulker_config.bulker.default_namespace) _LOGGER.debug(cratelist) _LOGGER.info("Activating bulker crate: {}{}".format( args.crate_registry_paths, " (Strict)" if args.strict else "")) bulker_activate(bulker_config, cratelist, echo=args.echo, strict=args.strict, prompt=args.no_prompt) except KeyError as e: parser.print_help(sys.stderr) _LOGGER.error("{} is not an available crate".format(e)) sys.exit(1) except MissingCrateError as e: _LOGGER.error("Missing crate: {}".format(e)) sys.exit(1) except AttributeError as e: _LOGGER.error( "Your bulker config file is outdated, you need to re-initialize it: {}" .format(e)) sys.exit(1) if args.command == "run": try: cratelist = parse_registry_paths(args.crate_registry_paths) _LOGGER.info("Activating crate: {}\n".format( args.crate_registry_paths)) bulker_run(bulker_config, cratelist, args.cmd, strict=args.strict) except KeyError as e: parser.print_help(sys.stderr) _LOGGER.error("{} is not an available crate".format(e)) sys.exit(1) except MissingCrateError as e: _LOGGER.error("Missing crate: {}".format(e)) sys.exit(1) if args.command == "load": bulker_config.make_writable() manifest, cratevars = load_remote_registry_path( bulker_config, args.crate_registry_paths, args.manifest) exe_template_jinja = None build_template_jinja = None shell_template_jinja = None exe_template = mkabs(bulker_config.bulker.executable_template, os.path.dirname(bulker_config._file_path)) build_template = mkabs(bulker_config.bulker.build_template, os.path.dirname(bulker_config._file_path)) try: shell_template = mkabs(bulker_config.bulker.shell_template, os.path.dirname(bulker_config._file_path)) except AttributeError: _LOGGER.error( "You need to re-initialize your bulker config or add a 'shell_template' attribute." ) sys.exit(1) try: assert (os.path.exists(exe_template)) except AssertionError: _LOGGER.error( "Bulker config points to a missing executable template: {}". 
format(exe_template)) sys.exit(1) with open(exe_template, 'r') as f: # with open(DOCKER_TEMPLATE, 'r') as f: contents = f.read() exe_template_jinja = jinja2.Template(contents) try: assert (os.path.exists(shell_template)) except AssertionError: _LOGGER.error( "Bulker config points to a missing shell template: {}".format( shell_template)) sys.exit(1) with open(shell_template, 'r') as f: # with open(DOCKER_TEMPLATE, 'r') as f: contents = f.read() shell_template_jinja = jinja2.Template(contents) if args.build: try: assert (os.path.exists(build_template)) except AssertionError: _LOGGER.error( "Bulker config points to a missing build template: {}". format(build_template)) sys.exit(1) _LOGGER.info( "Building images with template: {}".format(build_template)) with open(build_template, 'r') as f: contents = f.read() build_template_jinja = jinja2.Template(contents) bulker_load(manifest, cratevars, bulker_config, exe_jinja2_template=exe_template_jinja, shell_jinja2_template=shell_template_jinja, crate_path=args.path, build=build_template_jinja, force=args.force) if args.command == "inspect": if args.crate_registry_paths == "": _LOGGER.error( "No active create. Inspect requires a provided crate, or a currently active create." ) sys.exit(1) manifest, cratevars = load_remote_registry_path( bulker_config, args.crate_registry_paths, None) manifest_name = cratevars['crate'] print("Bulker manifest: {}".format(args.crate_registry_paths)) crate_path = os.path.join(bulker_config.bulker.default_crate_folder, cratevars['namespace'], manifest_name, cratevars['tag']) if not os.path.isabs(crate_path): crate_path = os.path.join(os.path.dirname(bcfg._file_path), crate_path) print("Crate path: {}".format(crate_path)) import glob filenames = glob.glob(os.path.join(crate_path, "*")) available_commands = [ x for x in [os.path.basename(x) for x in filenames] if x[0] != "_" ] print("Available commands: {}".format(available_commands))
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(build_argparser()) args, remaining_args = parser.parse_known_args() global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}") _LOGGER.debug(f"Args: {args}") if not args.command: parser.print_help() _LOGGER.error("No command given") sys.exit(1) if args.command == ALIAS_CMD and not args.subcommand: parser.print_help() _LOGGER.error("No alias subcommand command given") sys.exit(1) gencfg = select_genome_config( filename=args.genome_config, check_exist=not args.command == INIT_CMD, on_missing=lambda fp: fp, strict_env=True, ) if gencfg is None: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) skip_read_lock = _skip_lock(args.skip_read_lock, gencfg) # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag if "asset_registry_paths" in args and args.asset_registry_paths: _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] for a in asset_list: # every asset must have a genome, either provided via registry path # or the args.genome arg. if not a["genome"]: if args.genome: a["genome"] = args.genome else: _LOGGER.error( "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format( a["genome"], a["asset"], a["tag"] ) ) sys.exit(1) else: if args.genome and args.genome != a["genome"]: _LOGGER.warn( "Two different genomes specified for asset '{}'.".format( a["asset"] ) ) else: if args.command in GENOME_ONLY_REQUIRED and not args.genome: parser.error("You must provide either a genome or a registry path") sys.exit(1) if args.command in ASSET_REQUIRED: parser.error("You must provide an asset registry path") sys.exit(1) if args.command == INIT_CMD: _LOGGER.debug("Initializing refgenie genome configuration") entries = OrderedDict( { CFG_VERSION_KEY: REQ_CFG_VERSION, CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], CFG_GENOMES_KEY: None, } ) if args.settings_json: if os.path.isfile(args.settings_json): with open(args.settings_json, "r") as json_file: data = json.load(json_file) entries.update(data) else: raise FileNotFoundError( "JSON file with config init settings does not exist: {}".format( args.settings_json ) ) if args.genome_folder: entries.update({CFG_FOLDER_KEY: args.genome_folder}) if args.remote_url_base: entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) if args.genome_archive_folder: entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) if args.genome_archive_config: entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) _LOGGER.debug("initializing with entries: {}".format(entries)) rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) rgc.initialize_config_file(os.path.abspath(gencfg)) elif args.command == BUILD_CMD: if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]): _LOGGER.error("Build can only build assets for one genome") sys.exit(1) recipe_name = None if args.recipe: if len(asset_list) > 1: _LOGGER.error("Recipes cannot be specified for multi-asset builds") sys.exit(1) recipe_name = args.recipe if args.requirements: for a in asset_list: recipe = recipe_name or a["asset"] if recipe not in asset_build_packages.keys(): _raise_missing_recipe_error(recipe) 
_LOGGER.info("'{}' recipe requirements: ".format(recipe)) _make_asset_build_reqs(recipe) sys.exit(0) refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug( "getting asset: '{}/{}.{}:{}'".format( a["genome"], a["asset"], a["seek_key"], a["tag"] ) ) print( rgc.seek( a["genome"], a["asset"], a["tag"], a["seek_key"], strict_exists=check, ) ) return elif args.command == INSERT_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: sk = args.seek_keys if sk: sk = json.loads(args.seek_keys) rgc.add( path=args.path, genome=asset_list[0]["genome"], asset=asset_list[0]["asset"], tag=asset_list[0]["tag"], seek_keys=sk, force=args.force, ) elif args.command == PULL_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) # existing assets overwriting if args.no_overwrite: force = False elif args.force_overwrite: force = True else: force = None # large archive pulling if args.no_large: force_large = False elif args.pull_large: force_large = True else: force_large = None # batch mode takes precedence over other choices if args.batch: force_large = True force = False outdir = rgc.data_dir if not os.path.exists(outdir): raise MissingFolderError(outdir) if not perm_check_x(outdir): return if not _single_folder_writeable(outdir): _LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) return for a in asset_list: rgc.pull( a["genome"], a["asset"], a["tag"], force=force, force_large=force_large, size_cutoff=args.size_cutoff, ) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) console = Console() if args.command == LIST_REMOTE_CMD: num_servers = 0 bad_servers = [] for server_url in rgc[CFG_SERVERS_KEY]: num_servers += 1 try: table = rgc.get_asset_table( genomes=args.genome, server_url=server_url ) except (DownloadJsonError, ConnectionError, MissingSchema): bad_servers.append(server_url) continue else: console.print(table) if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: _LOGGER.error( "Could not list assets from the following servers: {}".format( bad_servers ) ) else: if args.recipes: print(", ".join(sorted(list(asset_build_packages.keys())))) else: console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif args.command == REMOVE_CMD: force = args.force rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) for a in asset_list: a["tag"] = a["tag"] or rgc.get_default_tag( a["genome"], a["asset"], use_existing=False ) _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) if a["seek_key"] is not None: raise NotImplementedError("You can't remove a specific seek_key.") gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} try: if not rgc.is_asset_complete(**gat): with rgc as r: r.cfg_remove_assets(**gat) _LOGGER.info( "Removed an incomplete asset " "'{genome}/{asset}:{tag}'".format(*gat) ) return except (KeyError, MissingAssetError, MissingGenomeError): _LOGGER.info( "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat) ) 
return if len(asset_list) > 1: if not query_yes_no( "Are you sure you want to remove {} assets?".format(len(asset_list)) ): _LOGGER.info("Action aborted by the user") return force = True for a in asset_list: rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) elif args.command == TAG_CMD: rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: # set the default tag and exit with rgc as r: r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) sys.exit(0) rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) elif args.command == ID_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) print(rgc.id(g, a, t)) return for asset in asset_list: g, a = asset["genome"], asset["asset"] t = asset["tag"] or rgc.get_default_tag(g, a) print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) if args.subcommand == ALIAS_GET_CMD: if args.aliases is not None: for a in args.aliases: print(rgc.get_genome_alias_digest(alias=a)) return console = Console() console.print(rgc.genome_aliases_table) if args.subcommand == ALIAS_SET_CMD: rgc.set_genome_alias( digest=args.digest, genome=args.aliases, reset_digest=args.reset, create_genome=args.force, ) return elif args.subcommand == ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) return elif args.command == COMPARE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) res = rgc.compare( args.genome1[0], args.genome2[0], explain=not args.no_explanation ) if args.no_explanation: print(res) elif args.command == UPGRADE_CMD: upgrade_config( target_version=args.target_version, filepath=gencfg, force=args.force )
def create_logger():
    return logger_via_cli(opts, strict=strict)
def main(): """ Primary workflow """ parser = logmuse.add_logging_options(build_argparser()) args, remaining_args = parser.parse_known_args() global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) _LOGGER.debug("refgenie {}".format(__version__)) _LOGGER.debug("Args: {}".format(args)) if not args.command: parser.print_help() _LOGGER.error("No command given") sys.exit(1) gencfg = refgenconf.select_genome_config( filename=args.genome_config, check_exist=not args.command == INIT_CMD, on_missing=lambda fp: fp, strict_env=True) if gencfg is None: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag if "asset_registry_paths" in args and args.asset_registry_paths: _LOGGER.debug("Found registry_path: {}".format( args.asset_registry_paths)) asset_list = [ parse_registry_path(x) for x in args.asset_registry_paths ] for a in asset_list: # every asset must have a genome, either provided via registry path # or the args.genome arg. if not a["genome"]: if args.genome: a["genome"] = args.genome else: _LOGGER.error( "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference." .format(a["genome"], a["asset"], a["tag"])) sys.exit(1) else: if args.genome and args.genome != a["genome"]: _LOGGER.warn( "Two different genomes specified for asset '{}'.". format(a["asset"])) else: if args.command in GENOME_ONLY_REQUIRED and not args.genome: parser.error("You must provide either a genome or a registry path") sys.exit(1) if args.command in ASSET_REQUIRED: parser.error("You must provide an asset registry path") sys.exit(1) if args.command == INIT_CMD: _LOGGER.debug("Initializing refgenie genome configuration") rgc = RefGenConf(entries=OrderedDict( { CFG_VERSION_KEY: REQ_CFG_VERSION, CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], CFG_GENOMES_KEY: None })) rgc.initialize_config_file(os.path.abspath(gencfg)) elif args.command == BUILD_CMD: if not all( [x["genome"] == asset_list[0]["genome"] for x in asset_list]): _LOGGER.error("Build can only build assets for one genome") sys.exit(1) recipe_name = None if args.recipe: if len(asset_list) > 1: _LOGGER.error( "Recipes cannot be specified for multi-asset builds") sys.exit(1) recipe_name = args.recipe if args.requirements: for a in asset_list: recipe = recipe_name or a["asset"] if recipe not in asset_build_packages.keys(): _raise_missing_recipe_error(recipe) _LOGGER.info("'{}' recipe requirements: ".format(recipe)) _make_asset_build_reqs(recipe) sys.exit(0) refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug("getting asset: '{}/{}.{}:{}'".format( a["genome"], a["asset"], a["seek_key"], a["tag"])) print( rgc.seek(a["genome"], a["asset"], a["tag"], a["seek_key"], strict_exists=check)) return elif args.command == INSERT_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: refgenie_add(rgc, asset_list[0], args.path, args.force) elif args.command == PULL_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) force = None if not args.force else True outdir = rgc[CFG_FOLDER_KEY] if not os.path.exists(outdir): 
raise MissingFolderError(outdir) target = _key_to_name(CFG_FOLDER_KEY) if not perm_check_x(outdir, target): return if not _single_folder_writeable(outdir): _LOGGER.error("Insufficient permissions to write to {}: {}".format( target, outdir)) return for a in asset_list: rgc.pull(a["genome"], a["asset"], a["tag"], unpack=not args.no_untar, force=force) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: rgc = RefGenConf(filepath=gencfg, writable=False) if args.command == LIST_REMOTE_CMD: num_servers = 0 # Keep all servers so that child updates maintain server list server_list = rgc[CFG_SERVERS_KEY] bad_servers = [] for server_url in rgc[CFG_SERVERS_KEY]: num_servers += 1 try: rgc[CFG_SERVERS_KEY] = server_url pfx, genomes, assets, recipes = _exec_list( rgc, args.command == LIST_REMOTE_CMD, args.genome) if assets is None and genomes is None: continue _LOGGER.info("{} genomes: {}".format(pfx, genomes)) if args.command != LIST_REMOTE_CMD: # Not implemented yet _LOGGER.info("{} recipes: {}".format(pfx, recipes)) _LOGGER.info("{} assets:\n{}\n".format(pfx, assets)) except (DownloadJsonError, ConnectionError): bad_servers.append(server_url) continue if num_servers >= len(server_list) and bad_servers: _LOGGER.error( "Could not list assets from the following server(s): {}". format(bad_servers)) # Restore original server list, even when we couldn't find assets on a server rgc[CFG_SERVERS_KEY] = server_list else: # Only check local assets once _LOGGER.info("Server subscriptions: {}".format(", ".join( rgc[CFG_SERVERS_KEY]))) pfx, genomes, assets, recipes = _exec_list( rgc, args.command == LIST_REMOTE_CMD, args.genome) _LOGGER.info("{} genomes: {}".format(pfx, genomes)) if args.command != LIST_REMOTE_CMD: # Not implemented yet _LOGGER.info("{} recipes: {}".format(pfx, recipes)) _LOGGER.info("{} assets:\n{}".format(pfx, assets)) elif args.command == GETSEQ_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) rgc.getseq(rgc, args.genome, args.locus) elif args.command == REMOVE_CMD: force = args.force rgc = RefGenConf(filepath=gencfg) for a in asset_list: a["tag"] = a["tag"] or rgc.get_default_tag( a["genome"], a["asset"], use_existing=False) _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) if a["seek_key"] is not None: raise NotImplementedError( "You can't remove a specific seek_key.") bundle = [a["genome"], a["asset"], a["tag"]] try: if not rgc.is_asset_complete(*bundle): with rgc as r: r.cfg_remove_assets(*bundle) _LOGGER.info( "Removed an incomplete asset '{}/{}:{}'".format( *bundle)) return except (KeyError, MissingAssetError, MissingGenomeError): _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle)) return if len(asset_list) > 1: if not query_yes_no( "Are you sure you want to remove {} assets?".format( len(asset_list))): _LOGGER.info("Action aborted by the user") return force = True for a in asset_list: rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) elif args.command == TAG_CMD: rgc = RefGenConf(filepath=gencfg) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: # set the default tag and exit with rgc as r: r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) sys.exit(0) rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) elif args.command == ID_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) print(rgc.id(g, a, t)) return for asset in 
asset_list: g, a = asset["genome"], asset["asset"] t = asset["tag"] or rgc.get_default_tag(g, a) print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) rgc.unsubscribe(urls=args.genome_server) return
def test_opts_added_none_used(parser):
    """ Addition of logging options allows logger_via_cli to complete. """
    opts = parser.parse_args([])
    assert all(hasattr(opts, _rawopt(n)) for n in LOGGING_CLI_OPTDATA)
    logger = logger_via_cli(opts)
    assert isinstance(logger, logging.Logger)
def run_geofetch(cmdl): """ Main script driver/workflow """ args = _parse_cmdl(cmdl) global _LOGGER _LOGGER = logger_via_cli(args, name="geofetch") if args.name: project_name = args.name else: project_name = os.path.splitext(os.path.basename(args.input))[0] def render_env_var(ev): return "{} ({})".format(ev, expandpath(ev)) metadata_expanded = expandpath(args.metadata_folder) _LOGGER.info("Given metadata folder: {} ({})".format( args.metadata_folder, metadata_expanded)) if os.path.isabs(metadata_expanded): metadata_raw = args.metadata_folder else: metadata_expanded = os.path.abspath(metadata_expanded) metadata_raw = os.path.abspath(args.metadata_folder) _LOGGER.info("Initial raw metadata folder: {}".format( render_env_var(metadata_raw))) if not args.no_subfolder: metadata_expanded = os.path.join(metadata_expanded, project_name) metadata_raw = os.path.join(metadata_raw, project_name) _LOGGER.info("Final raw metadata folder: {}".format( render_env_var(metadata_raw))) # Some sanity checks before proceeding if args.bam_folder and not which("samtools"): raise SystemExit("For SAM/BAM processing, samtools should be on PATH.") acc_GSE_list = parse_accessions(args.input, metadata_expanded, args.just_metadata) # Loop through each accession. # This will process that accession, produce metadata and download files for # the GSM #s included in the list for each GSE#. # acc_GSE = "GSE61150" # example # This loop populates a list of metadata. metadata_dict = OrderedDict() subannotation_dict = OrderedDict() failed_runs = [] for acc_GSE in acc_GSE_list.keys(): _LOGGER.info("Processing accession: " + acc_GSE) if len(re.findall(GSE_PATTERN, acc_GSE)) != 1: print(len(re.findall(GSE_PATTERN, acc_GSE))) _LOGGER.warning("This does not appear to be a correctly formatted " "GSE accession! Continue anyway...") # Get GSM#s (away from sample_name) GSM_limit_list = list(acc_GSE_list[acc_GSE].keys() ) #[x[1] for x in acc_GSE_list[acc_GSE]] if (len(acc_GSE_list[acc_GSE]) > 0): _LOGGER.info("Limit to: {}".format(list( acc_GSE_list[acc_GSE]))) # a list of GSM#s if args.refresh_metadata: _LOGGER.info("Refreshing metadata...") # For each GSE acc, produce a series of metadata files file_gse = os.path.join(metadata_expanded, acc_GSE + '_GSE.soft') file_gsm = os.path.join(metadata_expanded, acc_GSE + '_GSM.soft') file_sra = os.path.join(metadata_expanded, acc_GSE + '_SRA.csv') file_srafilt = os.path.join(metadata_expanded, acc_GSE + '_SRA_filt.csv') # Grab the GSE and GSM SOFT files from GEO. # The GSE file has metadata describing the experiment, which includes # The SRA number we need to download the raw data from SRA # The GSM file has metadata describing each sample, which we will use to # produce a sample annotation sheet. if not os.path.isfile(file_gse) or args.refresh_metadata: Accession(acc_GSE).fetch_metadata(file_gse) else: _LOGGER.info("Found previous GSE file: " + file_gse) if not os.path.isfile(file_gsm) or args.refresh_metadata: Accession(acc_GSE).fetch_metadata(file_gsm, typename="GSM") else: _LOGGER.info("Found previous GSM file: " + file_gsm) # A simple state machine to parse SOFT formatted files (Here, the GSM file) gsm_metadata = OrderedDict() # For multi samples (samples with multiple runs), we keep track of these # relations in a separate table, which is called the subannotation table. 
gsm_multi_table = OrderedDict() # save the state current_sample_id = None current_sample_srx = False for line in open(file_gsm, 'r'): line = line.rstrip() if line[0] is "^": pl = parse_SOFT_line(line) if len(acc_GSE_list[acc_GSE] ) > 0 and pl['SAMPLE'] not in GSM_limit_list: #sys.stdout.write(" Skipping " + a['SAMPLE'] + ".") current_sample_id = None continue current_sample_id = pl['SAMPLE'] current_sample_srx = False columns_init = [("sample_name", ""), ("protocol", ""), ("organism", ""), ("read_type", ""), ("data_source", None), ("SRR", None), ("SRX", None)] gsm_metadata[current_sample_id] = OrderedDict(columns_init) _LOGGER.info("Found sample: {}".format(current_sample_id)) elif current_sample_id is not None: try: pl = parse_SOFT_line(line) except IndexError: # TODO: do we "fail the current sample" here and remove it # from gsm_metadata? Or just skip the line? _LOGGER.debug( "Failed to parse alleged SOFT line for sample " "ID {}; line: {}".format(current_sample_id, line)) continue gsm_metadata[current_sample_id].update(pl) # For processed data, here's where we would download it if args.processed and not args.just_metadata: found = re.findall(SUPP_FILE_PATTERN, line) if found: print(pl[pl.keys()[0]]) # Now convert the ids GEO accessions into SRX accessions if not current_sample_srx: found = re.findall(EXPERIMENT_PATTERN, line) if found: _LOGGER.info("(SRX accession: {})".format(found[0])) srx_id = found[0] gsm_metadata[srx_id] = gsm_metadata.pop( current_sample_id) gsm_metadata[srx_id][ "gsm_id"] = current_sample_id # save the GSM id current_sample_id = srx_id current_sample_srx = True # GSM SOFT file parsed, save it in a list metadata_dict[acc_GSE] = gsm_metadata # Parse out the SRA project identifier from the GSE file acc_SRP = None for line in open(file_gse, 'r'): found = re.findall(PROJECT_PATTERN, line) if found: acc_SRP = found[0] _LOGGER.info("Found SRA Project accession: {}".format(acc_SRP)) break # For processed data, here's where we would download it if args.processed and not args.just_metadata: found = re.findall(SER_SUPP_FILE_PATTERN, line) if found: pl = parse_SOFT_line(line) file_url = pl[pl.keys()[0]].rstrip() _LOGGER.info("File: " + str(file_url)) # download file if args.geofolder: data_folder = os.path.join(args.geofolder, acc_GSE) print(file_url, data_folder) subprocess.call(['wget', file_url, '-P', data_folder]) if not acc_SRP: # If I can't get an SRA accession, maybe raw data wasn't submitted to SRA # as part of this GEO submission. Can't proceed. _LOGGER.warning( "\033[91mUnable to get SRA accession (SRP#) from GEO GSE SOFT file. No raw data?\033[0m" ) # but wait; another possibility: there's no SRP linked to the GSE, but there # could still be an SRX linked to the (each) GSM. if len(gsm_metadata) == 1: acc_SRP = gsm_metadata.keys()[0] _LOGGER.warning("But the GSM has an SRX number; instead of an " "SRP, using SRX identifier for this sample: " + acc_SRP) else: # More than one sample? not sure what to do here. Does this even happen? continue # Now we have an SRA number, grab the SraRunInfo Metadata sheet: # The SRARunInfo sheet has additional sample metadata, which we will combine # with the GSM file to produce a single sample a if not os.path.isfile(file_sra) or args.refresh_metadata: Accession(acc_SRP).fetch_metadata(file_sra) else: _LOGGER.info("Found previous SRA file: " + file_sra) _LOGGER.info("SRP: {}".format(acc_SRP)) # Parse metadata from SRA # Produce an annotated output from the GSM and SRARunInfo files. 
# This will merge the GSM and SRA sample metadata into a dict of dicts, # with one entry per sample. # NB: There may be multiple SRA Runs (and thus lines in the RunInfo file) # Corresponding to each sample. if not args.processed: file_read = open(file_sra, 'r') file_write = open(file_srafilt, 'w') _LOGGER.info("Parsing SRA file to download SRR records") initialized = False input_file = csv.DictReader(file_read) for line in input_file: if not initialized: initialized = True w = csv.DictWriter(file_write, line.keys()) w.writeheader() #print(line) #print(gsm_metadata[line['SampleName']]) # SampleName is not necessarily the GSM number, though frequently it is #gsm_metadata[line['SampleName']].update(line) # Only download if it's in the include list: experiment = line["Experiment"] run_name = line["Run"] if experiment not in gsm_metadata: # print("Skipping: {}".format(experiment)) continue # local convenience variable # possibly set in the input tsv file sample_name = None # initialize to empty try: sample_name = acc_GSE_list[acc_GSE][ gsm_metadata[experiment]["gsm_id"]] except KeyError: pass if not sample_name or sample_name is "": temp = gsm_metadata[experiment]['Sample_title'] # Now do a series of transformations to cleanse the sample name temp = temp.replace(" ", "_") # Do people put commas in their sample names? Yes. temp = temp.replace(",", "_") temp = temp.replace("__", "_") sample_name = temp # Otherwise, record that there's SRA data for this run. # And set a few columns that are used as input to the Looper # print("Updating columns for looper") update_columns(gsm_metadata, experiment, sample_name=sample_name, read_type=line['LibraryLayout']) # Some experiments are flagged in SRA as having multiple runs. if gsm_metadata[experiment].get("SRR") is not None: # This SRX number already has an entry in the table. _LOGGER.info("Found additional run: {} ({})".format( run_name, experiment)) if isinstance(gsm_metadata[experiment]["SRR"], _STRING_TYPES) \ and experiment not in gsm_multi_table: # Only one has been stuck in so far, make a list gsm_multi_table[experiment] = [] # Add first the original one, which was stored as a string # previously gsm_multi_table[experiment].append([ sample_name, experiment, gsm_metadata[experiment]["SRR"] ]) # Now append the current SRR number in a list as [SRX, SRR] gsm_multi_table[experiment].append( [sample_name, experiment, run_name]) else: # this is the 3rd or later sample; the first two are done, # so just add it. gsm_multi_table[experiment].append( [sample_name, experiment, run_name]) if args.split_experiments: # Duplicate the gsm metadata for this experiment (copy to make sure # it's not just an alias). rep_number = len(gsm_multi_table[experiment]) new_SRX = experiment + "_" + str(rep_number) gsm_metadata[new_SRX] = copy.copy( gsm_metadata[experiment]) # gsm_metadata[new_SRX]["SRX"] = new_SRX gsm_metadata[new_SRX]["sample_name"] += "_" + str( rep_number) gsm_metadata[new_SRX]["SRR"] = run_name else: # Either way, set the srr code to multi in the main table. gsm_metadata[experiment]["SRR"] = "multi" else: # The first SRR for this SRX is added to GSM metadata gsm_metadata[experiment]["SRR"] = run_name #gsm_metadata[experiment].update(line) # Write to filtered SRA Runinfo file w.writerow(line) _LOGGER.info("Get SRR: {} ({})".format(run_name, experiment)) bam_file = "" if args.bam_folder == "" else os.path.join( args.bam_folder, run_name + ".bam") # TODO: sam-dump has a built-in prefetch. I don't have to do # any of this stuff... 
This also solves the bad sam-dump issues. if os.path.exists(bam_file): _LOGGER.info("BAM found:" + bam_file) else: if not args.just_metadata: # Use the 'prefetch' utility from the SRA Toolkit # to download the raw reads. # (http://www.ncbi.nlm.nih.gov/books/NBK242621/) # Set up a simple loop to try a few times in case of failure t = 0 while True: t = t + 1 subprocess_return = subprocess.call([ 'prefetch', run_name, '--max-size', '50000000' ]) if subprocess_return == 0: break if t >= NUM_RETRIES: _LOGGER.info( "Prefetch retries failed. Try this sample later" ) failed_runs.append(run_name) break _LOGGER.info( "Prefetch attempt failed, wait a few seconds to try again" ) time.sleep(t * 2) else: _LOGGER.info("Dry run (no data download)") if args.bam_conversion and args.bam_folder is not '': _LOGGER.info("Converting to bam: " + run_name) sra_file = os.path.join(args.sra_folder, run_name + ".sra") if not os.path.exists(sra_file): _LOGGER.info("SRA file doesn't exist, please " "download it first: " + sra_file) continue # The -u here allows unaligned reads, and seems to be # required for some sra files regardless of aligned state cmd = "sam-dump -u " + \ os.path.join(args.sra_folder, run_name + ".sra") + \ " | samtools view -bS - > " + bam_file #sam-dump -u SRR020515.sra | samtools view -bS - > test.bam _LOGGER.info("Conversion command: {}".format(cmd)) subprocess.call(cmd, shell=True) # check to make sure it worked # NS: Sometimes sam-dump fails, yielding an empty bam file, but # a fastq-dump works. This happens on files with bad quality # encodings. I contacted GEO about it in December 2015 # Here we check the file size and use fastq -> bam conversion # if the sam-dump failed. if args.bam_conversion and args.bam_folder is not '': st = os.stat(bam_file) # print("File size: " + str(st.st_size)) if st.st_size < 100: _LOGGER.warning( "Bam conversion failed with sam-dump. Trying fastq-dump..." ) # recreate? cmd = "fastq-dump --split-3 -O " + \ os.path.realpath(args.sra_folder) + " " + \ os.path.join(args.sra_folder, run_name + ".sra") _LOGGER.info("Command: {}".format(cmd)) subprocess.call(cmd, shell=True) if not args.picard_path: _LOGGER.warning( "Can't convert the fastq to bam without picard path" ) else: # was it paired data? 
you have to process it differently # so it knows it's paired end fastq0 = os.path.join(args.sra_folder, run_name + ".fastq") fastq1 = os.path.join(args.sra_folder, run_name + "_1.fastq") fastq2 = os.path.join(args.sra_folder, run_name + "_2.fastq") cmd = "java -jar " + args.picard_path + " FastqToSam" if os.path.exists(fastq1) and os.path.exists( fastq2): cmd += " FASTQ=" + fastq1 cmd += " FASTQ2=" + fastq2 else: cmd += " FASTQ=" + fastq0 cmd += " OUTPUT=" + bam_file cmd += " SAMPLE_NAME=" + run_name cmd += " QUIET=true" _LOGGER.info("Conversion command: {}".format(cmd)) subprocess.call(cmd, shell=True) file_read.close() file_write.close() # accumulate subannotations subannotation_dict[acc_GSE] = gsm_multi_table # Combine individual accessions into project-level annotations, and write # individual accession files (if requested) metadata_dict_combined = OrderedDict() for acc_GSE, gsm_metadata in metadata_dict.iteritems(): file_annotation = os.path.join(metadata_expanded, acc_GSE + '_annotation.csv') if args.acc_anno: write_annotation(gsm_metadata, file_annotation, use_key_subset=args.use_key_subset) metadata_dict_combined.update(gsm_metadata) subannotation_dict_combined = OrderedDict() for acc_GSE, gsm_multi_table in subannotation_dict.iteritems(): file_subannotation = os.path.join(metadata_expanded, acc_GSE + '_subannotation.csv') if args.acc_anno: write_subannotation(gsm_multi_table, file_subannotation) subannotation_dict_combined.update(gsm_multi_table) _LOGGER.info("Finished processing {} accession(s)".format( len(acc_GSE_list))) if (len(failed_runs) > 0): _LOGGER.warn( "The following samples could not be downloaded: {}".format( failed_runs)) # if user specified a pipeline interface path, add it into the project config if args.pipeline_interfaces: file_pipeline_interfaces = args.pipeline_interfaces else: file_pipeline_interfaces = "null" _LOGGER.info( "Creating complete project annotation sheets and config file...") # If the project included more than one GSE, we can now output combined # annotation tables for the entire project. # Write combined annotation sheet file_annotation = os.path.join(metadata_raw, project_name + '_annotation.csv') write_annotation(metadata_dict_combined, file_annotation, use_key_subset=args.use_key_subset) # Write combined subannotation table if len(subannotation_dict_combined) > 0: file_subannotation = os.path.join(metadata_raw, project_name + '_subannotation.csv') write_subannotation(subannotation_dict_combined, file_subannotation) else: file_subannotation = "null" # Write project config file if not args.config_template: geofetchdir = os.path.dirname(__file__) args.config_template = os.path.join(geofetchdir, "config_template.yaml") with open(args.config_template, 'r') as template_file: template = template_file.read() template_values = { "project_name": project_name, "annotation": file_annotation, "subannotation": file_subannotation, "pipeline_interfaces": file_pipeline_interfaces } for k, v in template_values.items(): placeholder = "{" + str(k) + "}" template = template.replace(placeholder, str(v)) config = os.path.join(metadata_raw, project_name + "_config.yaml") _write(config, template, msg_pre=" Config file: ")
def test_typical_verbosity(parser, verbosity):
    """ Typical verbosity specifications yield logger with expected level. """
    opts = parser.parse_args([VERBOSITY_OPTNAME, str(verbosity)])
    logger = logger_via_cli(opts)
    exp = getattr(logging, LEVEL_BY_VERBOSITY[verbosity - 1])
    _assert_level(logger, exp)
def main(): """Primary workflow""" parser = logmuse.add_logging_options(build_argparser()) # args, remaining_args = parser.parse_known_args() args = parser.parse_args() logger_kwargs = {"level": args.verbosity, "devmode": args.logdev} logmuse.init_logger("yacman", **logger_kwargs) global _LOGGER _LOGGER = logmuse.logger_via_cli(args) if not args.command: parser.print_help() _LOGGER.error("No command given") sys.exit(1) if args.command == "init": divcfg = args.config _LOGGER.debug("Initializing divvy configuration") is_writable(os.path.dirname(divcfg), check_exist=False) divvy_init(divcfg, DEFAULT_CONFIG_FILEPATH) sys.exit(0) _LOGGER.debug("Divvy config: {}".format(args.config)) divcfg = select_divvy_config(args.config) _LOGGER.info("Using divvy config: {}".format(divcfg)) dcc = ComputingConfiguration(filepath=divcfg) if args.command == "list": # Output header via logger and content via print so the user can # redirect the list from stdout if desired without the header as clutter _LOGGER.info("Available compute packages:\n") print("{}".format("\n".join(dcc.list_compute_packages()))) sys.exit(1) # Any non-divvy arguments will be passed along as key-value pairs # that can be used to populate the template. # keys = [str.replace(x, "--", "") for x in remaining_args[::2]] # cli_vars = dict(zip(keys, remaining_args[1::2])) if args.compute: cli_vars = {y[0]: y[1] for y in [x.split("=") for x in args.compute]} else: cli_vars = {} if args.command == "write" or args.command == "submit": try: dcc.activate_package(args.package) except AttributeError: parser.print_help(sys.stderr) sys.exit(1) if args.settings: _LOGGER.info("Loading settings file: %s", args.settings) with open(args.settings, "r") as f: vars_groups = [cli_vars, yaml.load(f, SafeLoader)] else: vars_groups = [cli_vars] _LOGGER.debug(vars_groups) if args.command == "write": dcc.write_script(args.outfile, vars_groups) elif args.command == "submit": dcc.submit(args.outfile, vars_groups)