def run(self, command):
    """
    run the command constructed by the command builder method
    :param command: compare pipeline-tool command to call
    :return: job id if LSF is used, otherwise an empty string
    """
    print('*' * 100)
    print("running the command")
    print(self.cgetools)
    print(command)
    print('*' * 100)
    job_id = ''
    processing_id = self.workdir.split('/')[-2]
    if not self.lsf:
        print('*' * 100)
        print("NO LSF MODE:\n Running Command: {}".format(command))
        print('*' * 100)
        # capture stderr separately so it can be decoded and inspected below
        sub_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        out, err = sub_process.communicate()
        if out:
            print('*' * 100)
            print("standard output of subprocess:")
            print(out.decode())
            print('*' * 100)
            data = out.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        if err:
            print('*' * 100)
            print("standard error of subprocess:")
            print("ERROR MESSAGE: {}".format(err.decode()))
            print('*' * 100)
            data = err.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        if sub_process.returncode != 0 and err:
            self.error_list.append(err.decode().replace("'", ""))
            print(err.decode(), file=sys.stderr)
    else:
        print("LSF option is true... PAIRED_END: {}, type_of: {}".format(self.pair, type(self.pair)))
        print(command)
        if self.pair.lower() == 'true':
            job_id = bsub('core_executor_' + processing_id, R=self.rmem, M=self.lmem,
                          g=self.bgroup, verbose=True)(command)
        else:
            print('*' * 100)
            print("PAIRED-END: {}".format(self.pair))
            job_id = bsub('core_executor_' + processing_id, P='singularity', R=self.rmem,
                          M=self.lmem, g=self.bgroup, verbose=True)(command)
    return [job_id]
def indexbams():
    for sample in SAMPLES:
        bams = getfilelist(RESULTS, sample + ".bam")
        assert(len(bams) == 1)
        for bam in bams:
            outdir = os.path.dirname(bam)
            cmd = "samtools index " + bam
            bsub("indexing", "-cwd " + outdir, verbose=True)(cmd)
def bsez(args):
    """
    automatically add -e and -o with reasonable paths given the job name
    """
    if not sys.stdin.isatty():
        _, f = tempfile.mkstemp(suffix=".sh")
        with open(f, 'w') as fh:
            fh.write(sys.stdin.read())
        atexit.register(os.unlink, f)
    else:
        sys.stderr.write("empty job\n")
        sys.exit(1)
    args = sys.argv[1:]
    assert "-J" in args
    ji = args.index("-J")
    _ = args.pop(ji)  # remove -J
    job_name = args.pop(ji)
    args2 = []
    for i, a in enumerate(args):
        if not a.startswith('-'):
            args2.append(a)
        # so a is a flag. if the next is also a flag, insert True
        elif i < len(args) - 1 and args[i + 1][0] == "-":
            args2.extend((a[1:], True))
        elif i == len(args) - 1:
            assert a[0] == "-", a
            args2.extend((a[1:], True))
        else:
            args2.append(a[1:])
    kwargs = dict(zip(args2[::2], args2[1::2]))
    print bsub(job_name, f, **kwargs)()
def fastqc(script, samples, data_path):
    for sample in samples:
        fastq = getfilelist(data_path, sample + ".fastq.gz")
        assert(len(fastq) == 1)
        fastq = fastq[0]
        qcresult = "%s/%s_fastqc.zip" % (data_path.rstrip("/"), sample)
        if op.exists(qcresult):
            continue
        cmd = "%s --outdir %s %s" % (script, data_path, fastq)
        bsub("qc", verbose=True)(cmd)
def main(args):
    if not op.exists(args.out):
        os.makedirs(args.out)
    jobids = []
    genelist = getfilelist(args.index, "*.pickle")
    for i, gene in enumerate(genelist):
        gname = op.splitext(op.basename(gene))[0]
        cmd = ("python %s --read-len %s --overhang-len %s --settings-filename %s "
               "--compute-gene-psi %s %s %s %s" % (args.miso_script, args.read_length,
                                                   args.overhang_length, args.miso_settings,
                                                   gname, gene, args.bam, args.out))
        bsub("miso_" + gname, q=args.queue_name)(cmd, job_cap=args.job_cap)
def cleanup():
    """take care of the mess left by rum."""
    sortjobs = []
    for sam in getfilelist(RESULTS, "RUM.sam"):
        outdir = os.path.dirname(sam)
        sample = outdir.rsplit("/", 1)[1]
        cmd = ("samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + sample + ".bam")
        sortjobs.append(bsub("sam2bam", "-cwd " + outdir, verbose=True)(cmd))
        cmd = "gzip *.fa RUM_Unique RUM_NU"
        bsub("compress", "-cwd " + outdir, verbose=True)(cmd)
    return sortjobs
def cleanup(path):
    """it'd be a good idea to not run this on the data dir"""
    exts = ['bed', 'xls']
    for ext in exts:
        for f in getfilelist(path, "*." + ext):
            cmd = "gzip -f " + f
            bsub("zip", q="idle")(cmd)
    try:
        [os.remove(sam) for sam in getfilelist(path, "*.sam")]
    except OSError:
        pass
def alignment_stats(results_path, picard_path, ref_fasta):
    for bam in getfilelist(results_path, "*.bam"):
        cmd = "samtools index %s" % bam
        if not op.exists("%s.bai" % bam):
            jobid = bsub("index", verbose=True)(cmd)
            bsub.poll(jobid)
        cmd = ("java -Xmx8g -jar %s/CollectMultipleMetrics.jar "
               "INPUT=%s REFERENCE_SEQUENCE=%s ASSUME_SORTED=true OUTPUT=metrics "
               "PROGRAM=CollectAlignmentSummaryMetrics "
               "PROGRAM=QualityScoreDistribution "
               "PROGRAM=MeanQualityByCycle" % (picard_path, bam, ref_fasta))
        bsub("alignment_summary", verbose=True)(cmd)
def cleanup(path):
    """it'd be a good idea not to run this on the data dir"""
    exts = ['bed', 'xls']
    for ext in exts:
        for f in getfilelist(path, "*." + ext):
            cmd = "gzip -f " + f
            bsub("zip", q="idle")(cmd)
    try:
        [os.remove(fastq) for fastq in getfilelist(path, "*.fastq")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.fq")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.csfasta")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.qual")]
    except OSError:
        pass
def fastqc():
    """qc for single or paired-end data."""
    fastqc = "/vol1/home/brownj/opt/fastqc/fastqc"
    for sample in SAMPLES:
        outdir = RESULTS + "/" + sample
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        for fastq in getfilelist(DATA, sample + "_*"):
            # see if result exists -- fastqc naming convention uses a portion of the read file name
            qcresult = getfilename(fastq) + "_fastqc.zip"
            if os.path.exists(outdir + "/" + qcresult):
                continue
            cmd = fastqc + " --outdir " + outdir + " --threads 4 " + fastq
            bsub("fastqc", verbose=True)(cmd)
def convert_gwas_vcf_to_tsv_with_cluster(vcf):
    sub = bsub("gwas_vcf2tsv", M="4000", R="rusage[mem=4000]", N="")
    command = "vcf2tsv -f {} -exec local".format(vcf)
    print(">>>> Submitting job to cluster, job id below")
    print(sub(command).job_id)
    print("You will receive an email when the job is finished. "
          "Formatted files will appear in the same directory as the input file.")
def fastqc(samples, datadir, resultsdir):
    """qc for single or paired-end data"""
    fastqc = "/vol1/home/brownj/opt/fastqc/fastqc"
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        if not op.exists(outdir):
            os.makedirs(outdir)
        qcresult = outdir + "/" + sample + "_fastqc.zip"
        if op.exists(qcresult):
            continue
        cmd = fastqc + " --outdir " + outdir + " " + fastqs[0]
        bsub("fastqc", verbose=True)(cmd)
def run(self, command):
    """
    run the command constructed by the command builder method
    :param command: compare pipeline-tool command to call
    :return: job id if LSF used otherwise none
    """
    print('*' * 100)
    print("IN RUN FUNCTION: running the command:", command)
    print("Requested memory: {}".format(self.rmem))
    print("Memory limits: {}".format(self.lmem))
    print('*' * 100)
    processing_id = self.workdir.split('/')[-2]
    job_id = ''
    if not self.lsf:
        sub_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        out, err = sub_process.communicate()
        if out:
            print('*' * 100)
            print("standard output of subprocess:\n", out.decode())
            print('*' * 100)
            data = out.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        if err:
            print('*' * 100)
            print("standard error of subprocess:\n", err.decode())
            print('*' * 100)
            data = err.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        # Comment this out after amending A above
        if sub_process.returncode != 0:
            if err:
                self.error_list.append(err.decode().replace("'", ""))
                print(err.decode(), file=sys.stderr)
    else:
        print("LSF option is set to true .....")
        print(command)
        job_id = bsub('core_executor_' + processing_id, R=self.rmem, M=self.lmem,
                      g=self.bgroup, verbose=True)(command)
    return [job_id]
def trim(path, pattern):
    """uses https://github.com/lh3/seqtk"""
    jobs = []
    for fastq in getfilelist(path, pattern):
        trimresult = "%s.trim.fastq.gz" % fastq.split(".fastq", 1)[0]
        if op.exists(trimresult):
            continue
        cmd = "seqtk trimfq %s | gzip -c > %s" % (fastq, trimresult)
        jobs.append(bsub("trim", verbose=True)(cmd))
    return jobs
def launch_lsf(self, command_strings, verbose=False, output='/dev/null'):
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    job_ids = [bsub('phyml_task', o='/dev/null', e='/dev/null', verbose=verbose)(cmd).job_id
               for cmd in command_strings]
    bsub.poll(job_ids)
    os.chdir(curr_dir)
def trim(path, pattern):
    """uses https://github.com/lh3/seqtk"""
    jobs = []
    for fastq in getfilelist(path, pattern):
        trimresult = fastq.split(".fastq", 1)[0] + ".trim.fastq.gz"
        if os.path.exists(trimresult):
            continue
        cmd = "seqtk trimfq " + fastq + " | gzip -c > " + trimresult
        jobs.append(bsub("seqtk", verbose=True)(cmd))
    return jobs
def postprocessrum(resultsdir):
    """take care of the mess left by rum."""
    jobs = []
    for sam in getfilelist(resultsdir, "RUM.sam"):
        outdir = op.dirname(sam)
        try:
            [os.remove(fastq) for fastq in getfilelist(outdir, "*.fastq")]
        except OSError:
            pass
        sample = outdir.rsplit("/", 1)[1]
        cmd = "gzip -f *.fa RUM_Unique RUM_NU RUM_NU.cov RUM_Unique.cov"
        bsub("postprocessrum", q="idle", cwd=outdir, verbose=True)(cmd)
        bam = outdir + "/" + sample + ".bam"
        if op.exists(bam):
            continue
        cmd = ("samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + bam)
        jobs.append(bsub("postprocessrum", cwd=outdir, verbose=True)(cmd))
    return jobs
def cleanup(genome):
    """take care of the mess left by rum."""
    jobs = []
    for sam in getfilelist(RESULTS, "RUM.sam"):
        outdir = op.dirname(sam)
        try:
            [os.remove(fastq) for fastq in getfilelist(outdir, "*.fastq")]
        except OSError:
            # no fastq found
            pass
        sample = outdir.rsplit("/", 1)[1]
        cmd = "gzip *.fa RUM_Unique RUM_NU RUM_NU.cov RUM_Unique.cov"
        bsub("compress", cwd=outdir, verbose=True)(cmd)
        bam = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(bam):
            continue
        cmd = ("samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + bam)
        jobs.append(bsub(PI + ".distill_rum", cwd=outdir, verbose=True)(cmd))
    return jobs
def trimadapters(datadir):
    """trim adapters using ea-utils"""
    jobs = []
    adapters = "/vol1/home/brownj/projects/walter/data/20121005/adapters.fa"
    for fastq in getfilelist(datadir, "*.fastq.gz"):
        trimresult = (op.dirname(fastq) + "/" +
                      op.basename(fastq).split(".fastq", 1)[0] + ".trm.fq.gz")
        if op.exists(trimresult):
            continue
        cmd = "fastq-mcf " + adapters + " " + fastq + " | gzip -c > " + trimresult
        jobid = bsub(PI + ".trimadapter", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def build_bsub(config, algorithm, **kwargs):
    """
    >>> from bsub import bsub
    >>> config = {'pipeline': {'filter': {1: 'idx', 'bsub': {'P': 'test', 'R': 'span[hosts=1]', 'n': 10}, 'p': 10}}, 'project_id': 'test'}
    >>> b = build_bsub(config, "filter")
    >>> print b.command.replace("logs/", "")
    bsub -e filter.%J.err -J filter -o filter.%J.out -n 10 -P test -R "span[hosts=1]"
    >>> b = build_bsub(config, "filter", **{'w': 10010})
    >>> print b.command.replace("logs/", "")
    bsub -e filter.%J.err -J filter -o filter.%J.out -n 10 -P test -R "span[hosts=1]" -w "done(10010)"
    """
    try:
        pid = config["project_id"]
    except KeyError:
        # this is required
        logging.critical("Define a Project ID (project_id) in the config")
        sys.exit(1)
    try:
        # args as defined in config:pipeline:algorithm:bsub
        config_kwargs = config["pipeline"][algorithm]["bsub"]
        # overwrite existing with new
        config_kwargs.update(kwargs)
    except KeyError:
        # LSF reservations not defined in config
        if not kwargs:
            return bsub(algorithm, P=pid, verbose=True)
        config_kwargs = kwargs
    # fix wait syntax
    if "w" in config_kwargs.keys():
        config_kwargs["w"] = '"exit({i},0)"'.format(i=config_kwargs["w"])
    if "P" not in config_kwargs.keys():
        config_kwargs["P"] = pid
    # args to strings
    for k, v in config_kwargs.items():
        if isinstance(v, int):
            config_kwargs[k] = str(v)
    return bsub(algorithm, verbose=True, **config_kwargs)
def main(arguments):
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-e', '--env', help="- Environment", default="dev")
    parser.add_argument('-p', '--project', help="- MetaboLights Labs project ID", required=True)
    parser.add_argument('-t', '--token', help="- MetaboLights Labs user token", required=True)
    parser.add_argument('-j', '--job', help="- LSF Job id", required=False)
    args = parser.parse_args(arguments)
    global project
    global token
    global job
    global env
    global userSpace
    project = args.project
    token = args.token
    job = args.job
    env = args.env
    baseDirectory = userSpace[env]
    inputLocation = baseDirectory + token + "/" + project
    outputLocation = baseDirectory + token + "/" + project
    if job is not None:
        # check the status of the job and return the value
        print _getJobStatus(_run("bjobs " + job))
    else:
        # check that the input and output locations are valid, then submit the job
        if not os.path.isdir(inputLocation) or not os.path.exists(inputLocation):
            raise Exception("Input folder doesn't exist")
        if not os.path.isdir(outputLocation) or not os.path.exists(outputLocation):
            raise Exception("Output folder doesn't exist")
        sub = bsub("mzml2isaJob", verbose=False)
        sub("mzml2isa -i " + inputLocation + " -o " + outputLocation + " -s ''")
        status = {"message": "Job submitted successfully", "code": "PEND"}
        status["jobID"] = sub.job_id
        print status
def apply_config_to_file_use_cluster(file, config_type, config_path, memory):
    sub = bsub("gwas_ss_format", M="{}".format(str(memory)),
               R="rusage[mem={}]".format(str(memory)), N="")
    command = "ss-format -f {} -t {} -c {} -m apply".format(file, config_type, config_path)
    print(">>>> Submitting job to cluster, job id below")
    print(sub(command).job_id)
    print("You will receive an email when the job is finished. Formatted files, md5sums and "
          "configs will appear in the same directory as the input file.")
def run(self, command):
    """
    pass
    :return:
    """
    processing_id = self.workdir.split('/')[-2]
    job_id = ''
    if not self.lsf:
        sub_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
        out, err = sub_process.communicate()
        if out:
            print('*' * 100)
            print("standard output of subprocess:\n", out.decode())
            print('*' * 100)
            data = out.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        if err:
            print('*' * 100)
            print("standard error of subprocess:\n", err.decode())
            print('*' * 100)
            data = err.decode().split('\n')
            for i, line in enumerate(data):
                if 'error' in line.lower():
                    message = data[i - 1] + '\n' + data[i]
                    self.error_list.append(message.replace("'", ""))
        # Comment this out after amending A above
        if sub_process.returncode != 0:
            if err:
                self.error_list.append(err.decode().replace("'", ""))
                print(err.decode(), file=sys.stderr)
    else:
        print("LSF option is set to true .....")
        print(command)
        job_id = bsub('core_executor_' + processing_id, R=self.rmem, M=self.lmem,
                      g=self.bgroup, verbose=True)(command)
    return [job_id]
def assemble(samples, data_dir, results_dir, seed_fa):
    """assemble using SSAKE."""
    # jobs = []
    for sample in samples:
        fastas = ngseq.getfilelist(data_dir, sample + ".jnd.fa.gz")
        assert(len(fastas) == 1)
        gzipfasta = fastas[0]
        outdir = "%s/%s" % (results_dir, sample)
        fasta = outdir + "/" + op.splitext(op.basename(gzipfasta))[0]
        if not op.exists(fasta):
            bsub.poll(ngseq.extract(gzipfasta, fasta))
        cmd = ("SSAKE -f " + fasta + " -s " + seed_fa + " -m 40 -o 50 -r 0.8 -b " + sample +
               " -p 1 -v 1 -d 200 -e 0.75 -k 10 -a 0.5 -x 50")
        jobid = bsub("3prime_seed_extension", cwd=outdir,
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
def align():
    """align reads using rum"""
    alignjobs = []
    for sample in SAMPLES:
        fastqs = getfilelist(DATA, sample + ".fq.gz")
        assert(len(fastqs) == 1)
        outdir = RESULTS + "/" + sample
        alignresult = outdir + "/" + sample + ".bam"
        if os.path.exists(alignresult):
            continue
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        cmd = ("rum_runner align -v -i " + INDEX + " -o " + outdir + " --chunks 5 --name " +
               sample + " " + DATA + "/" + fastqs[0])
        alignjobs.append(bsub("align_reads", "-n 5", verbose=True)(cmd))
def macs(samples, resultsdir):
    jobs = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + ".hg19.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        macsresult = outdir + "/" + sample + "_peaks.xls"
        if op.exists(macsresult) or op.exists(macsresult + ".gz"):
            continue
        # macs14 writes to the directory in which it was executed
        cmd = "macs14 -t " + bams[0] + " -f BAM -n " + sample + " -g hs -w --single-profile"
        jobid = bsub("macs", cwd=outdir,
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def solid2fastq(samples, datadir):
    script = "/vol2/home/brentp/src/bfast-git/scripts/solid2fastq"
    jobs = []
    for sample in samples:
        csfastas = getfilelist(datadir, sample + "*.csfasta.gz")
        quals = getfilelist(datadir, sample + "*.qual.gz")
        assert(len(csfastas) == 1)
        assert(len(quals) == 1)
        if op.exists(datadir + "/" + sample + ".fastq.gz"):
            continue
        cmd = script + " -z -Z -o " + sample + " " + csfastas[0] + " " + quals[0]
        jobid = bsub("solid2fastq", cwd=datadir, verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def unload_experiment(ae_id):
    unload_command = """%s -a %s""" % (UNLOADER_PATH, ae_id)
    clean_command = """%s -a %s -f %s""" % (UNLOAD_CLEAN_PATH, ae_id, FULL_FTP_LOCATION)
    print unload_command
    print clean_command
    j = bsub(unload_command)
    j2 = j.then(clean_command)
    print j
    print j2
    print dir(j)
    print dir(j2)
    print j.__dict__
    print j2.__dict__
def concat():
    """join reads from all lanes."""
    concatjobs = []
    for sample in SAMPLES:
        fastqs = getfilelist(DATA, sample + "_*")
        # check for output
        concatresult = DATA + "/" + sample + ".fq.gz"
        if os.path.exists(concatresult):
            continue
        assert(len(fastqs) == 2)
        cmd = "zcat " + " ".join(fastqs) + " | gzip -c > " + concatresult
        concatjobs.append(bsub("concat_reads", verbose=True)(cmd))
def join(samples, datadir, script):
    """joins paired-end data into SSAKE format."""
    jobs = []
    sub = bsub("join_reads", verbose=True)
    for sample in samples:
        # sort for ordering: R1 then R2
        fastqs = sorted(ngseq.getfilelist(datadir, sample + "_*.trm.fq.gz"))
        # check for output
        joinresult = datadir + "/" + sample + ".jnd.fa.gz"
        if op.exists(joinresult) or op.exists(joinresult + ".gz"):
            continue
        assert(len(fastqs) == 2)
        # usage: join_reads.py R1 R2 --insert 200
        cmd = "python " + script + " " + " ".join(fastqs) + " | gzip -c > " + joinresult
        jobs.append(sub(cmd))
    return jobs
def bowtiealign(samples, datadir, resultsdir, index, genome):
    """align to index using bowtie"""
    jobs = []
    for sample in samples:
        fasta = datadir + "/" + sample + ".csfasta"
        qual = datadir + "/" + sample + ".qual"
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult):
            continue
        cmd = ("bowtie -p4 -m1 -v1 -f -C --best --strata --chunkmbs 512 --trim3 25 --sam " +
               index + " -Q " + qual + " " + fasta +
               " | samtools view -ShuF4 - | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + alignresult)
        jobid = bsub("bowtie", n="4",
                     R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def launch_lsf(self, command_strings, verbose=False):
    curr_dir = os.getcwd()
    os.chdir(self.tmpdir)
    job_launcher = bsub('treeCl_gtp_task', o='/dev/null', e='/dev/null', verbose=verbose)
    if not self.debug:
        job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'
    job_ids = [job_launcher(cmd).job_id for cmd in command_strings]
    self.job_ids.update(job_ids)
    bsub.poll(job_ids)
    os.chdir(curr_dir)
def counts(samples, resultsdir):
    """get counts over peaks regions for each sample"""
    jobs = []
    # the merged peaks file
    consensus = getfilelist(BASE + "/results", "consensus.bed*")
    assert(len(consensus) == 1)
    consensus = consensus[0]
    for sample in samples:
        bams = getfilelist(resultsdir, sample + "*.hg19.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        if op.exists(countsresult):
            continue
        cmd = "bedtools coverage -abam " + bams[0] + " -b " + consensus + " > " + countsresult
        jobid = bsub(PI + ".counts",
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def apply_config_bsub(self):
    self.get_options_data()
    self.get_split_data()
    self.get_col_shuffle_data()
    print("File to format: {}\nConfig: {}".format(
        str(self.table.file), str(json.dumps(self.config, sort_keys=True, indent=4))))
    config_out = self.filename + ".tabman_config.json"
    with open(config_out, "w") as f:
        json.dump(self.config, f)
    sub = bsub("gwas_ss_format", M="24000", R="rusage[mem=24000]", N="")
    command = "tabman -f {} -config {}".format(self.filename, config_out)
    print(">>>> Submitting job to cluster, job id below")
    print(sub(command).job_id)
    print("You will receive an email when the job is finished. Formatted files, md5sums and "
          "configs will appear in the same directory as the input file.")
    sys.exit()
def gsnap(samples, reads_path, results_path, gmap_db, cmd_str):
    """align reads for each sample according to the command string."""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(reads_path, sample + ".trim.fastq.gz")
        assert(len(fastqs) == 1)
        fastq = fastqs[0]
        out = "%s/%s" % (results_path, sample)
        if not op.exists(out):
            os.makedirs(out)
        align_result = "%s/%s.bam" % (out, sample)
        if op.exists(align_result):
            continue
        cmd = cmd_str.format(gmap_db, fastq, sample, align_result)
        jobid = bsub("align", n="5",
                     R="select[mem>28] rusage[mem=28] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def novoalign(samples, datadir, resultsdir, index, genome):
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult):
            continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = ("novoalignCS -c 1 -d " + index + " -f " + fastq +
               " -F BFASTQ -o SAM -r Random -e 100 -s 8 -l 20" +
               " | samtools view -ShuF4 - | samtools sort -o - " + sample +
               ".temp -m 9500000000 > " + alignresult)
        jobid = bsub("novoalign", n="1",
                     R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def counts(samples, result_path, peak_ext, bam_ext):
    # get the consensus peaks
    f = open("%s/peak_coordinates.bed" % result_path, 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(result_path, "*%s" % peak_ext))
    for c in consensus:
        # fixing formatting from bedtool object; the name field holds the replicate count
        replicate_counts = int(c.name)
        if replicate_counts < 2:
            continue
        fields = [c.chrom, c.start, c.stop,
                  "%s:%d-%d\n" % (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(result_path, sample + "*%s" % bam_ext)
        assert(len(bams) == 1)
        outdir = result_path.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult):
            continue
        cmd = "bedtools coverage -abam %s -b %s > %s" % (bams[0], f.name, countsresult)
        jobid = bsub(sample + "_counts",
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header=("chrom start stop name a_overlaps_in_b "
                                       "b_with_nonzero length_b frac_b_nonzero").split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(sys.stdout, sep="\t", header=True)
def macs(samples, resultsdir, control):
    jobs = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + ".bam")
        assert(len(bams) == 1)
        # control
        if control in bams[0]:
            continue
        controlbam = getfilelist(resultsdir, control + ".bam")
        assert(len(controlbam) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        macsresult = outdir + "/" + sample + "_peaks.xls"
        if op.exists(macsresult) or op.exists(macsresult + ".gz"):
            continue
        cmd = ("macs14 -t " + bams[0] + " -c " + controlbam[0] + " -f BAM -n " + sample +
               " -g mm -w --single-profile")
        jobid = bsub("macs", cwd=outdir,
                     R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
def fetch_datahub_metadatafile(account, workdir, lsf):
    """
    Fetch the datahub metadata file via the pathogen portal, e.g.:

    curl -o output.txt -X GET --header 'Accept: application/json' -u dcc_beethoven:xxxx
    'https://www.ebi.ac.uk/ena/portal/api/search?result=read_run&dataPortal=pathogen&dccDataOnly=true&fields=tax_id,
    scientific_name,sample_accession,secondary_sample_accession,experiment_accession,
    study_accession,secondary_study_accession,run_accession,center_name,fastq_ftp,
    fastq_md5&sortFields=scientific_name,sample_accession&limit=0'

    :param account: datahub account
    :param workdir: working directory
    :param lsf: boolean
    :return: metadata file
    """
    print("In fetch_datahub_metadatafile")
    error_list = list()
    datahub = account['account_id']
    password = account['password']
    # inputfile = datahub.replace('dcc_', '') + "_run_metadata_*.tsv"
    outputfile = workdir + datahub.replace("dcc_", "") + '_run_metadata.tsv'
    retrieved_fields = (
        "fields=tax_id,scientific_name,sample_accession,secondary_sample_accession,experiment_accession,"
        "study_accession,secondary_study_accession,run_accession,center_name,instrument_platform,fastq_ftp,"
        "fastq_md5&sortFields=scientific_name,sample_accession&limit=0' -k")
    correct_ftp_path = " && perl -p -i -e '~s/ftp\.sra\.ebi\.ac\.uk\/vol1\/|ftp\.dcc\-private\.ebi\.ac\.uk\/vol1\///g' {} ".format(outputfile)
    """
    The correct_ftp_path below reflects a quick hack to be able to process dcc_bromhead ...
    Should resume to the above for the normal mode of action.....
    correct_ftp_path = " && egrep -v -e 'SAMEA104423915|SAMEA4058395|SAMEA4058397|SAMEA4058405|SAMEA4058441'" \
        "'SELECTA_REMOVE' {} >{}.tmp && mv {}.tmp {} && " \
        "perl -p -i -e '~s/ftp\.sra\.ebi\.ac\.uk\/vol1\///g' {} ".format(outputfile, outputfile,
                                                                         outputfile, outputfile, outputfile)
    """
    base_command = "curl -o {} -X GET --header 'Accept: application/json' -u {}:{} ".format(
        outputfile, datahub, password)
    base_command = base_command + "'https://www.ebi.ac.uk/ena/portal/api/search?"
    base_command = base_command + "result=read_run&dataPortal=pathogen&dccDataOnly=true&"
    command = base_command + retrieved_fields + correct_ftp_path
    if os.path.isfile(outputfile):
        os.remove(outputfile)
    if not lsf:
        print(ruler)
        print("FETCHMETADATA COMMAND:\n\t", command)
        print("LSF VALUE=", lsf)
        print(ruler)
        sp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        if out:
            print(ruler, "\nstandard output of subprocess: {}".format(out), file=sys.stdout)
        if err:
            print(ruler, "\nstandard error of subprocess: {}".format(err), file=sys.stderr)
        if sp.returncode != 0:
            error_list.append(err)
            print(err, end="", file=sys.stderr)
        print(ruler, "\nreturncode of subprocess: {}".format(sp.returncode), file=sys.stdout)
    else:
        print("LSF value is YES, still needs implementation at the moment ...")
        print("Working dir: {}".format(workdir))
        print(ruler)
        print("Running: ", command)
        print(ruler)
        try:
            job_id = bsub.bsub("selection_2_attribute", verbose=True)(command)  # , R="rusage[mem=1]")
            bsub.bsub.poll(job_id)
        except:
            message = str(sys.exc_info()[1])
            error_list.append(message)
            print(ruler, "ERROR MESSAGE:\n{}".format(message), "\n", ruler)
    if lsf:
        return [outputfile, job_id]
    else:
        return [outputfile, None]
def _run_cmds_on_cluster(args, commands, out_dir, results_file):
    """This method will submit a certain number of jobs onto an LSF cluster
    and wait for these jobs to complete before starting new jobs. This allows
    to run several jobs in parallel.

    Args:
        args: Command-line arguments.
        commands: List of command dictionaries.
        out_dir: Output directory.
        results_file: CSV file to store summary.
    """
    from bsub import bsub

    def check_running(jobs):
        rjobs = bsub.running_jobs()
        tmp_jobs = jobs
        jobs = []
        for job, cmd_dict, ind in tmp_jobs:
            if job.job_id in rjobs:
                jobs.append((job, cmd_dict, ind))
                continue
            print('Job %d finished.' % ind)
            cmd_out_dir = cmd_dict[_OUT_ARG]
            try:
                # We store the command used for execution. This might be helpful
                # for the user in case he wants to manually continue the
                # simulation.
                with open(os.path.join(cmd_out_dir, 'hpsearch_command.sh'), 'w') as f:
                    f.write('#!/bin/sh\n')
                    f.write('%s' % (_args_to_cmd_str(cmd_dict)))
                # Get training results.
                performance_dict = _SUMMARY_PARSER_HANDLE(cmd_out_dir, i)
                for k, v in performance_dict.items():
                    cmd_dict[k] = v
                # Create or update the CSV file summarizing all runs.
                panda_frame = pandas.DataFrame.from_dict(cmd_dict)
                if os.path.isfile(results_file):
                    old_frame = pandas.read_csv(results_file, sep=';')
                    panda_frame = pandas.concat([old_frame, panda_frame], sort=True)
                panda_frame.to_csv(results_file, sep=';', index=False)
                # Check whether simulation has finished successfully.
                has_finished = int(cmd_dict['finished'][0])
                if has_finished == 1:
                    _CMD_FINISHED[ind] = True
                else:
                    _CMD_FINISHED[ind] = False
            except Exception:
                traceback.print_exc(file=sys.stdout)
                warnings.warn('Could not assess whether run %d has been ' % (ind + 1) +
                              'completed.')
        return jobs

    jobs = []
    i = -1
    while len(commands) > 0:
        jobs = check_running(jobs)
        while len(jobs) >= args.num_jobs:
            time.sleep(10)
            jobs = check_running(jobs)
        cmd_dict = commands.pop()
        i += 1
        # FIXME quick and dirty solution.
        folder_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
        if os.path.exists(os.path.join(out_dir, folder_name)):
            time.sleep(1.1)
            folder_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
        cmd_out_dir = os.path.join(out_dir, folder_name)
        assert (not os.path.exists(cmd_out_dir))
        cmd_str = _args_to_cmd_str(cmd_dict, out_dir=cmd_out_dir)
        cmd_dict[_OUT_ARG] = cmd_out_dir
        # Execute the program.
        print('Starting training run %d/%d -- "%s"' % (i + 1, len(commands), cmd_str))
        job_name = 'job_%s' % folder_name
        # FIXME the bsub module ignores the pathnames we set. Hence, all output
        # files are simply stored in the local directory. For now, we will
        # capture this in the postprocessing script.
        job_error_file = os.path.join(cmd_out_dir, job_name + '.err')
        job_out_file = os.path.join(cmd_out_dir, job_name + '.out')
        sub = bsub(job_name, R=args.resources, n=1, W='%d:00' % args.num_hours,
                   e=job_error_file, o=job_out_file, verbose=True)
        sub(cmd_str)
        jobs.append((sub, cmd_dict, i))
    # Wait for all jobs to complete.
    while len(jobs) > 0:
        time.sleep(10)
        jobs = check_running(jobs)
def download_datahub_file(account_name, password, files, outdir, process_id, lsf, dryrun=True):
    """
    Fetch datahub data files from the Pathogen portal given datahub credentials.

    :param account_name: datahub name
    :param password: datahub password
    :param files: datahub metadata file
    :param outdir: directory where to store the metadata file, defined in the configuration file
    :param process_id: process_id of the runs
    :param lsf: boolean (run through LSF or not)
    :param dryrun: dry run
    :return: LSF job ids or empty list
    """
    jobids = []
    for file in files:
        outputfile = outdir + '/' + os.path.basename(file)
        print(file)
        # For some reason the data folder is empty, fastqs are now in the vol1 folder :(
        url = "ftp://{}:{}@ftp.dcc-private.ebi.ac.uk/vol1/{}".format(account_name, password, file)
        command = "wget -t 2 {} -O {}".format(url, outputfile)
        print('*' * 100)
        print(command)
        print('*' * 100)
        if not dryrun:
            if not lsf:
                sub_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                out, err = sub_process.communicate()
                if out:
                    print("standard output of subprocess:")
                    print(out)
                if err:
                    print("standard error of subprocess:")
                    print(err)
                if sub_process.returncode != 0:
                    error_list.append(err)
                    print(err, end="", file=sys.stderr)
            else:
                print("LSF value is YES, still needs implementation at the moment...")
                print('*' * 100)
                print("Running: ", command)
                print('*' * 100)
                try:
                    job_id = bsub('data_provider_' + process_id, g='/SELECTA', verbose=True)(command)
                    jobids.append(job_id)
                except:
                    message = str(sys.exc_info()[1])
                    error_list.append(message)
    return jobids
err_dir = ('/icgc/dkfzlsdf/analysis/B240/kong/Projects/PANCSTRAT/Err_Out/DeepLearning/Survival/LSTM/'
           + 'Features_' + feat + '_Boots/')
if os.path.isdir(err_dir):
    shutil.rmtree(err_dir)
os.mkdir(err_dir)

####################################################################################################
####################################################################################################
# Get parameters for the model of choice.
model_id = 'dense_16_0.1_l1_0.5_lstm_0.3_Adam_learning_0.01'
model_dir = models_dir + model_id + '/'

####################################################################################################
# Bootstrap.
for i_boots in range(n_bootstrap):
    boots_dir = bootstrap_dir + 'boots_' + str(i_boots) + '/'
    if os.path.isdir(boots_dir):
        shutil.rmtree(boots_dir)
    os.mkdir(boots_dir)

    # Submit one job to find the error for one bootstrap.
    job_name = err_dir + 'boots_' + str(i_boots)
    job = bsub(job_name, W='50:00', M='10G', verbose=True)
    args = model_dir + ' ' + boots_dir
    job('module load anaconda3/2019.07; source activate TensorFlow_CPU; '
        'python /icgc/dkfzlsdf/analysis/B240/kong/Projects/PANCSTRAT/Code/WGS/Kipoi/MMSplice/DeepLearning/Survival/LSTM/Bootstrap_Error_bsub.py'
        + ' ' + args)

####################################################################################################
####################################################################################################