def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                # classify only the first paired read file, redirecting
                # kraken's progress output into a stats file
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick",
                               "--preload", "--min-hits", "2",
                               "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def _prepare_samples(args):
    """Create a dict for each sample with all of its information."""
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    config = yaml.load(open(system_config))
    config['algorithm'] = {}
    data = []
    vcf_files = [fn for fn in args.files if fn.endswith('vcf')]
    bam_files = [fn for fn in args.files if fn.endswith('bam')]
    fastq_files = [fn for fn in args.files if is_fastq(fn)]
    # fall back to VCF inputs when no fastq files were given
    if not fastq_files:
        fastq_files = vcf_files
    for sample in fastq_files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['fastq'] = op.abspath(sample)
        if bam_files:
            dt['bam'] = _find_bam(bam_files, sample)
        data.append([dt])
    return data
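# A hedged usage sketch for _prepare_samples above, assuming an
# argparse-style namespace with `files` and `galaxy` attributes; the
# _ExampleArgs class and file names are illustrative, not part of the
# original code.
class _ExampleArgs(object):
    files = ["sampleA_1.fastq.gz", "sampleA.bam"]
    galaxy = None

# samples = _prepare_samples(_ExampleArgs())
# -> [[{'name': 'sampleA_1', 'config': {...},
#       'fastq': '/abs/path/sampleA_1.fastq.gz',
#       'bam': '/abs/path/sampleA.bam'}]]
# Each dict is wrapped in a one-element list so downstream parallel
# runners can treat every sample as an independent work unit.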
def _config(args):
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = op.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    config = yaml.load(open(system_config))
    # start with an empty algorithm section; callers fill it in later
    config['algorithm'] = {}
    return config
def update_samples(data, resources, args):
    """Update algorithm dict with new cores set."""
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    config = yaml.load(open(system_config))
    config['algorithm'] = {}
    new_data = []
    for sample in data:
        sample['config'] = config
        sample['config']['algorithm'] = resources
        new_data.append([sample])
    return new_data
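# A hedged sketch of calling update_samples: `resources` becomes each
# sample's algorithm section, e.g. to raise the per-sample core count
# (values here are illustrative):
#
#     data = update_samples(data, {"num_cores": 8}, args)
#     # every sample now has config['algorithm'] == {"num_cores": 8}
#
# Note all samples share the same config object after this call, so a
# later mutation through one sample is visible to the others.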
def _prepare_samples(args):
    """Create a dict for each sample with all of its information."""
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    config = yaml.load(open(system_config))
    config['algorithm'] = {}
    data = []
    for sample in args.files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['bed'] = op.abspath(sample)
        data.append([dt])
    return data
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                # stream the (possibly gzipped) fastq into kraken via stdin
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    # the kraken database may be configured as a single-element list
    if db and isinstance(db, (list, tuple)):
        db = db[0]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        if files[0].endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick",
                               "--preload", "--min-hits", "2",
                               "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
    Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin 2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
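# For reference, a hedged sketch of the shell command the kraken
# variants above render (paths and core counts are illustrative):
#
#     zcat sample_1.fastq.gz | kraken --db /data/genomes/kraken/minikraken \
#         --quick --preload --min-hits 2 --threads 8 \
#         --output kraken_out --fastq-input /dev/stdin 2> kraken_stats
#
# Piping through cat/zcat with --fastq-input /dev/stdin streams the
# reads into kraken without first decompressing the fastq to disk.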
parser.add_argument("-q", "--queue", help="Queue to submit jobs to.") parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep") parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"], default="local", help="Run with iptyhon") args = parser.parse_args() out_dir = os.path.abspath(args.out) utils.safe_makedir(out_dir) try: system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") except ValueError as err: print(err) print( "WARNING: Attempting to read bcbio_system.yaml in the current directory." ) system_config = "bcbio_system.yaml" with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log") parallel = clargs.to_parallel(args)
parser.add_argument("--timeout", default=15, help="Time to wait before giving up starting.") parser.add_argument("--retries", default=0, type=int, help=("Number of retries of failed tasks during " "distributed processing. Default 0 " "(no retries)")) parser.add_argument("-s", "--scheduler", help="Type of scheduler to use.", choices=["lsf", "slurm", "torque", "sge", "pbspro"]) parser.add_argument("-r", "--resources", help="Extra scheduler resource flags.", default=[], action="append") parser.add_argument("-q", "--queue", help="Queue to submit jobs to.") parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep") parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"], default="local", help="Run with iptyhon") args = parser.parse_args() system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") if args.galaxy: system_config = args.galaxy with open(system_config) as in_handle: config = yaml.load(in_handle) parallel = clargs.to_parallel(args) parallel.update({'progs': args.progs}) dirs = {'work': os.path.abspath(os.getcwd())} if args.sys_info.find(";") > -1: info = args.sys_info.split(";") sysinfo = {'cores': int(info[0]), 'memory': float(info[1])} else: if utils.file_exists(args.sys_info): sysinfo = yaml.load(open(args.sys_info))[0] print "system info %s" % sysinfo
raise ValueError("--mirbase and --srna_gtf both need a value.") env.hosts = ["localhost"] env.cores = args.cores os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable) cbl = get_cloudbiolinux(REMOTES) sys.path.insert(0, cbl["dir"]) genomemod = __import__("cloudbio.biodata", fromlist=["genomes"]) # monkey patch cloudbiolinux to use this indexing command instead genomes = getattr(genomemod, 'genomes') genomes._index_w_command = _index_w_command fabmod = __import__("cloudbio", fromlist=["fabutils"]) fabutils = getattr(fabmod, 'fabutils') fabutils.configure_runsudo(env) system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") with open(system_config) as in_handle: config = yaml.load(in_handle) env.picard_home = config_utils.get_program("picard", config, ptype="dir") genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes")) args.fasta = os.path.abspath(args.fasta) args.gtf = os.path.abspath(args.gtf) if args.gtf else None if args.gff3: args.gtf = gff3_to_gtf(args.gtf) # always make a sequence dictionary if "seq" not in args.indexes: args.indexes.append("seq")
def error(self, message):
    self.print_help()
    galaxy_base = os.path.join(_get_data_dir(), "galaxy")
    print("\nCurrent genomes\n")
    print(open(loc.get_loc_file(galaxy_base, "samtools")).read())
    sys.exit(0)
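# A hedged, self-contained sketch of the pattern above: subclass
# ArgumentParser and override error() so a bad invocation prints the
# help text plus the available genomes instead of the terse default
# message (class name and placeholder output are illustrative, not the
# original code):
import argparse

class _ListingParser(argparse.ArgumentParser):
    def error(self, message):
        self.print_help()
        print("\n(installed genomes would be listed here)")
        raise SystemExit(0)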
parser.add_argument("--mirbase", help="species in mirbase for smallRNAseq data.") parser.add_argument("--srna_gtf", help="gtf to use for smallRNAseq data.") args = parser.parse_args() # if not all([args.mirbase, args.srna_gtf]) and any([args.mirbase, args.srna_gtf]): # raise ValueError("--mirbase and --srna_gtf both need a value.") os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable) cbl = get_cloudbiolinux(REMOTES) sys.path.insert(0, cbl["dir"]) genomemod = __import__("cloudbio.biodata", fromlist=["genomes"]) # monkey patch cloudbiolinux to use this indexing command instead genomes = getattr(genomemod, 'genomes') genomes._index_w_command = _index_w_command genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes")) args.fasta = os.path.abspath(args.fasta) if not file_exists(args.fasta): print("%s does not exist, exiting." % args.fasta) sys.exit(1) args.gtf = os.path.abspath(args.gtf) if args.gtf else None if args.gtf and not file_exists(args.gtf): print("%s does not exist, exiting." % args.gtf) sys.exit(1) args.srna_gtf = os.path.abspath(args.srna_gtf) if args.srna_gtf else None gtf_file = args.gtf if args.gff3: gtf_file = extract_if_gzipped(gtf_file) gtf_file = gff3_to_gtf(gtf_file)
raise ValueError("--mirbase and --srna_gtf both need a value.") env.hosts = ["localhost"] env.cores = args.cores os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable) cbl = get_cloudbiolinux(REMOTES) sys.path.insert(0, cbl["dir"]) genomemod = __import__("cloudbio.biodata", fromlist=["genomes"]) # monkey patch cloudbiolinux to use this indexing command instead genomes = getattr(genomemod, 'genomes') genomes._index_w_command = _index_w_command fabmod = __import__("cloudbio", fromlist=["fabutils"]) fabutils = getattr(fabmod, 'fabutils') fabutils.configure_runsudo(env) system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") with open(system_config) as in_handle: config = yaml.load(in_handle) env.picard_home = config_utils.get_program("picard", config, ptype="dir") genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes")) args.fasta = os.path.abspath(args.fasta) args.gtf = os.path.abspath(args.gtf) if args.gtf else None if args.gff3: args.gtf = gff3_to_gtf(args.gtf) # always make a sequence dictionary if "seq" not in args.indexes: args.indexes.append("seq") env.system_install = genome_dir