Example #1
	def run(self, command):
		"""
		run the command constructed by the command builder method
		:param command: compare pipeline-tool name to call
		:return: job id if LSF used otherwise none
		"""
		print('*' * 100)
		print("running the command")
		print(self.cgetools)
		print(command)
		print('*' * 100)
		job_id = ''
		processing_id = self.workdir.split('/')[-2]
		if not self.lsf:
			print('*'*100)
			print("NO LSF MODE: \n Running Command: {}".format(command))
			print('*'* 100)
			# capture stderr in its own pipe so the `err` checks below actually see error output
			sub_process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out, err = sub_process.communicate()
			print(out)
			print(err)
			if out:
				print('*' * 100)
				print("standard output of subprocess:")
				print(out.decode())
				print('*' * 100)
				data = out.decode().split('\n')
				for i, line in enumerate(data):
					if 'error' in line.lower():
						# record the matching line plus the preceding line for context
						message = data[i - 1] + '\n' + data[i]
						self.error_list.append(message.replace("'", ""))
			if err:
				print('*' * 100)
				print("standard error of subprocess:")
				print("ERROR MESSAGE: {} ".format(err))
				print(err.decode())
				print('*' * 100)

				data = err.decode().split('\n')
				for i, line in enumerate(data):
					if 'error' in line.lower():
						message = data[i - 1] + '\n' + data[i]
						self.error_list.append(message.replace("'", ""))
			if sub_process.returncode != 0 and err:
				self.error_list.append(err.decode().replace("'", ""))
				print(err.decode(), file=sys.stderr)
		elif self.lsf:
			print("LSF option is true... PAIRED_END{} , type_of:{} ".format(self.pair, type(self.pair)))
			print(command)
			if self.pair.lower() == 'true':
				job_id = bsub('core_executor_' + processing_id, R=self.rmem, M=self.lmem, g=self.bgroup, verbose=True)(command)
			else:
				print('*'*100)
				print("PAIRED-END:{}".format(self.pair))
				job_id = bsub('core_executor_' + processing_id, P='singularity', R=self.rmem, M=self.lmem, g=self.bgroup, verbose=True)(command)
		return [job_id]
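The stdout and stderr scanning in this run() method repeats the same loop twice; a distilled helper with identical behavior (hypothetical, not part of the source class):

def collect_error_context(text, error_list):
    """Append each line containing 'error', plus its preceding line, to error_list."""
    data = text.split('\n')
    for i, line in enumerate(data):
        if 'error' in line.lower():
            message = data[i - 1] + '\n' + data[i]
            error_list.append(message.replace("'", ""))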
Example #2
def indexbams():
    for sample in SAMPLES:
        bams = getfilelist(RESULTS, sample + ".bam")
        # expect exactly one bam per sample
        assert len(bams) == 1
        bam = bams[0]
        outdir = os.path.dirname(bam)
        cmd = "samtools index " + bam
        bsub("indexing", "-cwd " + outdir, verbose=True)(cmd)
Example #3
def bsez(args):
    """
    automatically add -e and -o with reasonable paths
    given the job name
    """
    if not sys.stdin.isatty():
        _, f = tempfile.mkstemp(suffix=".sh")
        with open(f, 'w') as fh:
            fh.write(sys.stdin.read())
        atexit.register(os.unlink, f)
    else:
        sys.stderr.write("empty job\n")
        sys.exit(1)
    args = sys.argv[1:]
    assert "-J" in args
    ji = args.index("-J")
    _ = args.pop(ji) # remove -J
    job_name = args.pop(ji)


    args2 = []
    for i, a in enumerate(args):
        if not a.startswith('-'):
            args2.append(a)
        # so a is a flag. if the next is also a flag, insert True
        elif i < len(args) - 1 and args[i + 1][0] == "-":
            args2.extend((a[1:], True))
        elif i == len(args) - 1:
            assert a[0] == "-", a
            args2.extend((a[1:], True))
        else:
            args2.append(a[1:])
    kwargs = dict(zip(args2[::2], args2[1::2]))
    print(bsub(job_name, f, **kwargs)())
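The pairing loop above converts the remaining raw flags into keyword arguments for bsub, inserting True for valueless flags. A worked trace with hypothetical input:

# hypothetical argv tail after "-J <name>" has been removed:
args = ["-q", "normal", "-N", "-n", "4"]
# the loop yields args2 == ["q", "normal", "N", True, "n", "4"], so
# dict(zip(args2[::2], args2[1::2])) == {"q": "normal", "N": True, "n": "4"}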
Example #5
def fastqc(script, samples, data_path):
    for sample in samples:
        fastq = getfilelist(data_path, sample + ".fastq.gz")
        assert(len(fastq) == 1)
        fastq = fastq[0]
        qcresult = "%s/%s_fastqc.zip" % (data_path.rstrip("/"), sample)
        if op.exists(qcresult): continue
        cmd = "%s --outdir %s %s" % (script, data_path, fastq)
        bsub("qc", verbose=True)(cmd)
Example #6
def main(args):
    if not op.exists(args.out):
        os.makedirs(args.out)
    
    jobids = []
    genelist = getfilelist(args.index, "*.pickle")
    for i, gene in enumerate(genelist):
        gname = op.splitext(op.basename(gene))[0]
        cmd = "python %s --read-len %s --overhang-len %s --settings-filename %s --compute-gene-psi %s %s %s %s" % (args.miso_script, args.read_length, args.overhang_length, args.miso_settings, gname, gene, args.bam, args.out)
        bsub("miso_" + gname, q=args.queue_name)(cmd, job_cap=args.job_cap)
Example #7
def cleanup():
    """take care of the mess left by rum."""
    sortjobs = []
    for sam in getfilelist(RESULTS, "RUM.sam")
        outdir = os.path.dirname(sam)
        sample = outdir.rsplit("/", 1)[1]
        cmd = "samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample + ".temp -m 9500000000 > " + sample + ".bam"
        sortjobs.append(bsub("sam2bam", "-cwd " + outdir, verbose=True)(cmd))
        cmd = "gzip *.fa RUM_Unique RUM_NU"
        bsub("compress", "-cwd " + outdir, verbose=True)(cmd)
    return sortjobs
Example #8
def cleanup(path):
    """it'd be a good idea to not run this on the data dir"""
    exts = ['bed', 'xls']
    for ext in exts:
        for f in getfilelist(path, "*." + ext):
            cmd = "gzip -f " + f
            bsub("zip", q="idle")(cmd)
    try:
        [os.remove(sam) for sam in getfilelist(path, "*.sam")]
    except OSError:
        pass
Example #9
def alignment_stats(results_path, picard_path, ref_fasta):
    for bam in getfilelist(results_path, "*.bam"):
        cmd = "samtools index %s" % bam
        if not op.exists("%s.bai" % bam):
            jobid = bsub("index", verbose=True)(cmd)
            bsub.poll(jobid)
        cmd = "java -Xmx8g -jar %s/CollectMultipleMetrics.jar \
                INPUT=%s REFERENCE_SEQUENCE=%s ASSUME_SORTED=true OUTPUT=metrics \
                PROGRAM=CollectAlignmentSummaryMetrics \
                PROGRAM=QualityScoreDistribution \
                PROGRAM=MeanQualityByCycle" % (picard_path, bam, ref_fasta)
        bsub("alignment_summary", verbose=True)(cmd)
Example #10
def cleanup(path):
    """it'd be a good idea not to run this on the data dir"""
    exts = ['bed', 'xls']
    for ext in exts:
        for f in getfilelist(path, "*." + ext):
            cmd = "gzip -f " + f
            bsub("zip", q="idle")(cmd)
    try:
        [os.remove(fastq) for fastq in getfilelist(path, "*.fastq")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.fq")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.csfasta")]
        [os.remove(fastq) for fastq in getfilelist(path, "*.qual")]
    except OSError:
        pass
Example #11
def fastqc():
    """qc for single or paired-end data."""
    fastqc="/vol1/home/brownj/opt/fastqc/fastqc"
    for sample in SAMPLES:
        outdir = RESULTS + "/" + sample
        
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        
        for fastq in getfilelist(DATA, sample + "_*"):
            # see if result exists -- fastqc naming convention uses a portion of the read file name
            qcresult = getfilename(fastq) + "_fastqc.zip"
            if os.path.exists(outdir + "/" + qcresult): continue
            
            cmd = fastqc + " --outdir " + outdir + " --threads 4 " + fastq
            bsub("fastqc", verbose=True)(cmd)
Example #12
def convert_gwas_vcf_to_tsv_with_cluster(vcf):
    sub = bsub("gwas_vcf2tsv", M="4000", R="rusage[mem=4000]", N="")
    command = "vcf2tsv -f {} -exec local".format(vcf)
    print(">>>> Submitting job to cluster, job id below")
    print(sub(command).job_id)
    print(
        "You will receive an email when the job is finished. Formatted files will appear in the same directory as the input file."
    )
Example #13
def fastqc(samples, datadir, resultsdir):
    """qc for single or paired-end data"""
    fastqc="/vol1/home/brownj/opt/fastqc/fastqc"
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        
        outdir = resultsdir.rstrip("/") + "/" + sample
        
        if not op.exists(outdir):
            os.makedirs(outdir)
        
        
        qcresult = outdir + "/" + sample + "_fastqc.zip"
        if op.exists(qcresult): continue
        cmd = fastqc + " --outdir " + outdir + " " + fastqs[0]
        bsub("fastqc", verbose=True)(cmd)
Example #14
    def run(self, command):
        """
		run the command constructed by the command builder method
		:param command: compare pipeline-tool name to call
		:return: job id if LSF used otherwise none
	   """
        print('*' * 100)
        print("IN RUN FUNCTION: running the command:", command)
        print("Requested memory: {}".format(self.rmem))
        print("Memory limits: {}".format(self.lmem))
        print('*' * 100)
        processing_id = self.workdir.split('/')[-2]
        job_id = ''
        if not self.lsf:
            sub_process = subprocess.Popen(command,
                                           shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            out, err = sub_process.communicate()

            if out:
                print('*' * 100)
                print("standard output of subprocess:\n", out.decode())
                print('*' * 100)
                data = out.decode().split('\n')
                for i, line in enumerate(data):
                    if 'error' in line.lower():
                        message = data[i - 1] + '\n' + data[i]
                        self.error_list.append(message.replace("'", ""))

            if err:
                print('*' * 100)
                print("standard error of subprocess:\n", err.decode())
                print('*' * 100)
                data = err.decode().split('\n')
                for i, line in enumerate(data):
                    if 'error' in line.lower():
                        message = data[i - 1] + '\n' + data[i]
                        self.error_list.append(message.replace("'", ""))
            # Comment this out after amending A above
            if sub_process.returncode != 0:
                if err:
                    self.error_list.append(err.decode().replace("'", ""))
                    print(err.decode(), file=sys.stderr)
        else:
            print("LSF option is set to true .....")
            print(command)
            job_id = bsub('core_executor_' + processing_id,
                          R=self.rmem,
                          M=self.lmem,
                          g=self.bgroup,
                          verbose=True)(command)
        return [job_id]
Example #15
def trim(path, pattern):
    """uses https://github.com/lh3/seqtk"""
    jobs = []
    for fastq in getfilelist(path, pattern):
        trimresult = "%s.trim.fastq.gz" % fastq.split(".fastq", 1)[0]
        if op.exists(trimresult): continue
        cmd = "seqtk trimfq %s | gzip -c > %s" % (fastq, trimresult)
        jobs.append(bsub("trim", verbose=True)(cmd))
    return jobs
Example #16
 def launch_lsf(self, command_strings, verbose=False, output='/dev/null'):
     curr_dir = os.getcwd()
     os.chdir(self.tmpdir)
     job_ids = [bsub('phyml_task',
                     o='/dev/null',
                     e='/dev/null',
                     verbose=verbose)(cmd).job_id
                for cmd in command_strings]
     bsub.poll(job_ids)
     os.chdir(curr_dir)
Example #17
def trim(path, pattern):
    """uses https://github.com/lh3/seqtk"""
    jobs = []
    for fastq in getfilelist(path, pattern):
        trimresult = fastq.split(".fastq", 1)[0] + ".trim.fastq.gz"
        if os.path.exists(trimresult): continue

        cmd = "seqtk trimfq " + fastq + " | gzip -c > " + trimresult
        jobs.append(bsub("seqtk", verbose=True)(cmd))
    return jobs
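Both trim variants return the list of submitted jobs so a driver script can block until trimming finishes before the next stage; a hedged usage sketch with a hypothetical path:

# hypothetical driver code: wait for every trim job before aligning
jobs = trim("/path/to/reads", "*.fastq.gz")
bsub.poll(jobs)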
Example #18
def postprocessrum(resultsdir):
    """take care of the mess left by rum."""
    jobs = []
    for sam in getfilelist(resultsdir, "RUM.sam"):
        outdir = op.dirname(sam)
        try:
            [os.remove(fastq) for fastq in getfilelist(outdir, "*.fastq")]
        except OSError:
            pass
        sample = outdir.rsplit("/", 1)[1]
        
        cmd = "gzip -f *.fa RUM_Unique RUM_NU RUM_NU.cov RUM_Unique.cov"
        bsub("postprocessrum", q="idle", cwd=outdir, verbose=True)(cmd)
        
        bam = outdir + "/" + sample + ".bam"
        if op.exists(bam): continue
        cmd = "samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample + ".temp -m 9500000000 > " + bam
        jobs.append(bsub("postprocessrum", cwd=outdir, verbose=True)(cmd))
    return jobs
Example #19
def cleanup(genome):
    """take care of the mess left by rum."""
    jobs = []
    for sam in getfilelist(RESULTS, "RUM.sam"):
        outdir = op.dirname(sam)
        try:
            [os.remove(fastq) for fastq in getfilelist(outdir, "*.fastq")]
        except OSError:
            # no fastq found
            pass
        sample = outdir.rsplit("/", 1)[1]
        
        cmd = "gzip *.fa RUM_Unique RUM_NU RUM_NU.cov RUM_Unique.cov"
        bsub("compress", cwd=outdir, verbose=True)(cmd)
        
        bam = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(bam): continue
        cmd = "samtools view -ShuF 4 " + sam + " | samtools sort -o - " + sample + ".temp -m 9500000000 > " + bam
        jobs.append(bsub(PI + ".distill_rum", cwd=outdir, verbose=True)(cmd))
    return jobs
Example #20
def trimadapters(datadir):
    """trim adapters using ea-utils"""
    jobs = []
    adapters = "/vol1/home/brownj/projects/walter/data/20121005/adapters.fa"
    for fastq in getfilelist(datadir, "*.fastq.gz"):
        trimresult = op.dirname(fastq) + "/" + op.basename(fastq).split(".fastq", 1)[0] + ".trm.fq.gz"
        if op.exists(trimresult): continue
        cmd = "fastq-mcf " + adapters + " " + fastq + " | gzip -c > " + trimresult
        jobid = bsub(PI + ".trimadapter", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #21
def build_bsub(config, algorithm, **kwargs):
    """
    >>> from bsub import bsub
    >>> config = {'pipeline': {'filter': {1: 'idx', 'bsub': {'P': 'test', 'R': 'span[hosts=1]', 'n': 10}, 'p': 10}}, 'project_id': 'test'}
    >>> b = build_bsub(config, "filter")
    >>> print b.command.replace("logs/", "")
    bsub -e filter.%J.err -J filter -o filter.%J.out -n 10 -P test -R "span[hosts=1]"
    >>> b = build_bsub(config, "filter", **{'w':10010})
    >>> print b.command.replace("logs/", "")
    bsub -e filter.%J.err -J filter -o filter.%J.out -n 10 -P test -R "span[hosts=1]" -w "done(10010)"
    """
    try:
        pid = config["project_id"]
    except KeyError:
        # this is required
        logging.critical("Define a Project ID (project_id) in the config")
        sys.exit(1)

    try:
        # args as defined in config:pipeline:algorithm:bsub
        config_kwargs = config["pipeline"][algorithm]["bsub"]
        # overwrite existing with new
        config_kwargs.update(kwargs)
    except KeyError:
        # LSF reservations not defined in config
        if not kwargs:
            return bsub(algorithm, P=pid, verbose=True)
        config_kwargs = kwargs

    # fix wait syntax
    if "w" in config_kwargs.keys():
        config_kwargs["w"] = '"exit({i},0)"'.format(i=config_kwargs["w"])
    if not "P" in config_kwargs.keys():
        config_kwargs["P"] = pid

    # args to strings
    for k, v in config_kwargs.items():
        if isinstance(v, int):
            config_kwargs[k] = str(v)

    return bsub(algorithm, verbose=True, **config_kwargs)
Example #22
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-e', '--env', help="- Environment", default="dev")
    parser.add_argument('-p',
                        '--project',
                        help="- MetaboLights Labs project ID",
                        required=True)
    parser.add_argument('-t',
                        '--token',
                        help="- MetaboLights Labs user token",
                        required=True)
    parser.add_argument('-j', '--job', help="- LSF Job id", required=False)
    args = parser.parse_args(arguments)

    global project
    global token
    global job
    global env
    global userSpace

    project = args.project
    token = args.token
    job = args.job
    env = args.env

    baseDirectory = userSpace[env]

    inputLocation = baseDirectory + token + "/" + project
    outputLocation = baseDirectory + token + "/" + project

    if job is not None:
        # check the status of the job and return the value
        print(_getJobStatus(_run("bjobs " + job)))
    else:
        # check that the folder locations are valid and then submit the job
        # check that the input location / output location exists
        if not os.path.isdir(inputLocation) or not os.path.exists(
                inputLocation):
            raise Exception("Input folder doesn't exist")

        if not os.path.isdir(outputLocation) or not os.path.exists(
                outputLocation):
            raise Exception("Output folder doesnt exist")

        sub = bsub("mzml2isaJob", verbose=False)
        sub("mzml2isa -i " + inputLocation + " -o " + outputLocation +
            " -s ''")
        status = {"message": "Job submitted successfully", "code": "PEND"}
        status["jobID"] = sub.job_id
        print(status)
Example #23
def apply_config_to_file_use_cluster(file, config_type, config_path, memory):
    sub = bsub("gwas_ss_format",
               M="{}".format(str(memory)),
               R="rusage[mem={}]".format(str(memory)),
               N="")
    command = "ss-format -f {} -t {} -c {} -m apply".format(
        file, config_type, config_path)
    print(">>>> Submitting job to cluster, job id below")
    print(sub(command).job_id)
    print(
        "You will receive an email when the job is finished. Formatted files, md5sums and configs will appear in "
        "the same directory as the input file.")
Example #24
    def run(self, command):
        """
		pass
		:return:
		"""
        processing_id = self.workdir.split('/')[-2]
        job_id = ''

        if not self.lsf:
            sub_process = subprocess.Popen(command,
                                           shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            out, err = sub_process.communicate()

            if out:
                print('*' * 100)
                print("standard output of subprocess:\n", out.decode())
                print('*' * 100)
                data = out.decode().split('\n')
                for i, line in enumerate(data):
                    if 'error' in line.lower():
                        message = data[i - 1] + '\n' + data[i]
                        self.error_list.append(message.replace("'", ""))

            if err:
                print('*' * 100)
                print("standard error of subprocess:\n", err.decode())
                print('*' * 100)
                data = err.decode().split('\n')
                for i, line in enumerate(data):
                    if 'error' in line.lower():
                        message = data[i - 1] + '\n' + data[i]
                        self.error_list.append(message.replace("'", ""))
            # Comment this out after amending A above
            if sub_process.returncode != 0:
                if err:
                    self.error_list.append(err.decode().replace("'", ""))
                    print(err.decode(), file=sys.stderr)
        else:
            print("LSF option is set to true .....")
            print(command)
            job_id = bsub('core_executor_' + processing_id,
                          R=self.rmem,
                          M=self.lmem,
                          g=self.bgroup,
                          verbose=True)(command)
        return [job_id]
Example #25
def assemble(samples, data_dir, results_dir, seed_fa):
    """assemble using SSAKE."""
    # jobs = []
    for sample in samples:
        fastas = ngseq.getfilelist(data_dir, sample + ".jnd.fa.gz")
        assert(len(fastas) == 1)
        gzipfasta = fastas[0]
        outdir = "%s/%s" % (results_dir, sample)
        fasta = outdir + "/" + op.splitext(op.basename(gzipfasta))[0]
        if not op.exists(fasta):
            bsub.poll(ngseq.extract(gzipfasta, fasta))
        cmd = "SSAKE -f " + fasta + " -s " + seed_fa + " -m 40 -o 50 -r 0.8 -b " + sample + " -p 1 -v 1 -d 200 -e 0.75 -k 10 -a 0.5 -x 50"
        jobid = bsub("3prime_seed_extension", cwd=outdir, R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
Example #26
def align():
    """align reads using rum"""
    alignjobs = []
    for sample in SAMPLES:
        fastqs = getfilelist(DATA, sample + ".fq.gz")
        assert(len(fastqs) == 1)
        outdir = RESULTS + "/" + sample
        alignresult = outdir + "/" + sample + ".bam"
        if os.path.exists(alignresult): continue
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        cmd = "rum_runner align -v -i " + INDEX + " -o " + outdir + " --chunks 5 --name " + sample + " " + DATA + "/" + fastq[0]
        alignjobs.append(bsub("align_reads", "-n 5", verbose=True)(cmd))
Example #27
def macs(samples, resultsdir):
    jobs = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + ".hg19.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        macsresult = outdir + "/" + sample + "_peaks.xls"
        if op.exists(macsresult) or op.exists(macsresult + ".gz"): continue
        cmd = "macs14 -t " + bams[0] + " -f BAM -n " + sample + " -g hs -w --single-profile"
        # writes to directory in which it was executed
        jobid = bsub("macs", cwd=outdir, R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #28
def solid2fastq(samples, datadir):
    script = "/vol2/home/brentp/src/bfast-git/scripts/solid2fastq"
    jobs = []
    for sample in samples:
        csfastas = getfilelist(datadir, sample + "*.csfasta.gz")
        quals = getfilelist(datadir, sample + "*.qual.gz")
        assert(len(csfastas) == 1)
        assert(len(quals) == 1)
        if op.exists(datadir + "/" + sample + ".fastq.gz"): continue
        cmd = script + " -z -Z -o " + sample + " " + csfastas[0] + " " + quals[0]
        jobid = bsub("solid2fastq", cwd=datadir, verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #29
def unload_experiment(ae_id):
    unload_command = """%s -a %s""" % (UNLOADER_PATH, ae_id)
    clean_command = """%s -a %s -f %s""" % (UNLOAD_CLEAN_PATH, ae_id,
                                            FULL_FTP_LOCATION)
    print(unload_command)
    print(clean_command)
    j = bsub(unload_command)
    j2 = j.then(clean_command)
    print(j)
    print(j2)
    print(dir(j))
    print(dir(j2))
    print(j.__dict__)
    print(j2.__dict__)
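The then call chains a dependent job: the second command should only start once the first completes. A minimal sketch of the pattern, under the assumption (suggested by the usage above) that then() on a job handle queues the follow-up with a completion dependency:

from bsub import bsub

first = bsub("unload", verbose=True)("echo unload")
second = first.then("echo clean")  # runs only after `first` finishes
print(first.job_id, second.job_id)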
Example #30
def concat():
    """join reads from all lanes."""
    concatjobs = []
    for sample in SAMPLES:
        fastqs = getfilelist(DATA, sample + "_*")
        
        # check for output
        concatresult = DATA + "/" + sample + ".fq.gz"
        if os.path.exists(concatresult): continue
        
        assert(len(fastqs) == 2)
        
        cmd = "zcat " + " ".join(fastqs) + " | gzip -c > " + concatresult
        concatjobs.append(bsub("concat_reads", verbose=True)(cmd))
Example #31
def join(samples, datadir, script):
    """joins paired-end data into SSAKE format."""
    jobs = []
    sub = bsub("join_reads", verbose=True)
    for sample in samples:
        # sort for ordering: R1 then R2
        fastqs = sorted(ngseq.getfilelist(datadir, sample + "_*.trm.fq.gz"))
        # check for output
        joinresult = datadir + "/" + sample + ".jnd.fa.gz"
        if op.exists(joinresult) or op.exists(joinresult + ".gz"): continue
        assert(len(fastqs) == 2)
        # usage: join_reads.py R1 R2 --insert 200
        cmd = "python " + script + " " + " ".join(fastqs) + " | gzip -c > " + joinresult
        jobs.append(sub(cmd))
    return jobs
Example #32
def bowtiealign(samples, datadir, resultsdir, index, genome):
    """align to index using bowtie"""
    jobs = []
    for sample in samples:
        fasta = datadir + "/" + sample + ".csfasta"
        qual = datadir + "/" + sample + ".qual"
        
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult): continue
        
        cmd = "bowtie -p4 -m1 -v1 -f -C --best --strata --chunkmbs 512 --trim3 25 --sam " + index + " -Q " + qual + " " + fasta + " | samtools view -ShuF4 - | samtools sort -o - " + sample + ".temp -m 9500000000 > " + alignresult        
        jobid = bsub("bowtie", n="4", R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #33
    def launch_lsf(self, command_strings, verbose=False):
        curr_dir = os.getcwd()
        os.chdir(self.tmpdir)
        job_launcher = bsub('treeCl_gtp_task',
                            o='/dev/null',
                            e='/dev/null',
                            verbose=verbose)

        if not self.debug:
            job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'

        job_ids = [job_launcher(cmd).job_id
                   for cmd in command_strings]
        self.job_ids.update(job_ids)
        bsub.poll(job_ids)
        os.chdir(curr_dir)
Example #34
def counts(samples, resultsdir):
    """get counts over peaks regions for each sample"""
    jobs = []
    # the merged peaks file
    consensus = getfilelist(BASE + "/results", "consensus.bed*")
    assert(len(consensus) == 1)
    consensus = consensus[0]
    for sample in samples:
        bams = getfilelist(resultsdir, sample + "*.hg19.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        if op.exists(countsresult): continue
        cmd = "bedtools coverage -abam " + bams[0] + " -b " + consensus + " > " + countsresult
        jobid = bsub(PI + ".counts", R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #35
 def apply_config_bsub(self):
     self.get_options_data()
     self.get_split_data()
     self.get_col_shuffle_data()
     print("File to format: {}\nConfig: {}".format(
         str(self.table.file),
         str(json.dumps(self.config, sort_keys=True, indent=4))))
     config_out = self.filename + ".tabman_config.json"
     with open(config_out, "w") as f:
         json.dump(self.config, f)
     sub = bsub("gwas_ss_format", M="24000", R="rusage[mem=24000]", N="")
     command = "tabman -f {} -config {}".format(self.filename, config_out)
     print(">>>> Submitting job to cluster, job id below")
     print(sub(command).job_id)
     print(
         "You will receive an email when the job is finished. Formatted files, md5sums and configs will appear in the same directory as the input file."
     )
     sys.exit()
Example #36
def gsnap(samples, reads_path, results_path, gmap_db, cmd_str):
    """align reads for each sample according to the command string."""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(reads_path, sample + ".trim.fastq.gz")
        assert(len(fastqs) == 1)
        fastq = fastqs[0]
        
        out = "%s/%s" % (results_path, sample)
        if not op.exists(out):
            os.makedirs(out)
        
        align_result = "%s/%s.bam" % (out, sample)
        if op.exists(align_result): continue
        
        cmd = cmd_str.format(gmap_db, fastq, sample, align_result)
        jobid = bsub("align", n="5", R="select[mem>28] rusage[mem=28] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #37
def novoalign(samples, datadir, resultsdir, index, genome):
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult): continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = "novoalignCS -c 1 -d " + index + " -f " + fastq + " -F BFASTQ -o SAM -r Random -e 100 -s 8 -l 20 | samtools view -ShuF4 - | samtools sort -o - " + sample + ".temp -m 9500000000 > " + alignresult
        jobid = bsub("novoalign", n="1", R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #38
def counts(samples, result_path, peak_ext, bam_ext):
    # get the consensus peaks
    f = open("%s/peak_coordinates.bed" % result_path, 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(result_path, "*%s" % peak_ext))
    for c in consensus:
        # the name field of multi_intersect output is the number of files
        # overlapping this interval; require support from at least two replicates
        replicate_counts = int(c.name)
        if replicate_counts < 2: continue
        
        fields = [c.chrom, c.start, c.stop, "%s:%d-%d\n" % \
                    (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(result_path, sample + "*%s" % bam_ext)
        assert(len(bams) == 1)
        outdir = result_path.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult): continue
        cmd = "bedtools coverage -abam %s -b %s > %s" % \
                    (bams[0], f.name, countsresult)
        jobid = bsub(sample + "_counts", 
                        R="select[mem>16] rusage[mem=16] span[hosts=1]",
                        verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header="chrom start stop name a_overlaps_in_b \
                    b_with_nonzero length_b frac_b_nonzero".split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(sys.stdout, sep="\t", header=True)
Example #39
def macs(samples, resultsdir, control):
    jobs = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + ".bam")
        assert(len(bams) == 1)
        
        # control
        if control in bams[0]: continue
        controlbam = getfilelist(resultsdir, control + ".bam")
        assert(len(controlbam) == 1)
        
        outdir = resultsdir.rstrip("/") + "/" + sample
        macsresult = outdir + "/" + sample + "_peaks.xls"
        
        if op.exists(macsresult) or op.exists(macsresult + ".gz"): continue
        
        cmd = "macs14 -t " + bams[0] + " -c " + controlbam[0] + " -f BAM -n " + sample + " -g mm -w --single-profile"
        jobid = bsub("macs", cwd=outdir, R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #40
def fetch_datahub_metadatafile(account, workdir, lsf):
	"""
	Fetch datahub metadata file via pathogen portal
	curl -o output.txt -X GET --header 'Accept: application/json' -u
	dcc_beethoven:xxxx 'https://www.ebi.ac.uk/ena/portal/api/search?
	result=read_run&dataPortal=pathogen&dccDataOnly=true&fields=tax_id,
	scientific_name,sample_accession,secondary_sample_accession,experiment_accession,
	study_accession,secondary_study_accession,run_accession,center_name,fastq_ftp,
	fastq_md5&sortFields=scientific_name,sample_accession&limit=0'
	:param account: datahub account
	:param workdir: working directory
	:param lsf: Boolean (run through LSF or not)
	:return: Metadata file
	"""
	print("In fetch_datahub_metadatafile")
	error_list = list()
	datahub = account['account_id']
	password = account['password']
	#inputfile = datahub.replace('dcc_', '') + "_run_metadata_*.tsv"
	outputfile = workdir + datahub.replace("dcc_", "") + '_run_metadata.tsv'
	retrieved_fields = (
		"fields=tax_id,scientific_name,sample_accession,secondary_sample_accession,experiment_accession,"
		"study_accession,secondary_study_accession,run_accession,center_name,instrument_platform,fastq_ftp,"
		"fastq_md5&sortFields=scientific_name,sample_accession&limit=0' -k")
	correct_ftp_path = " && perl -p -i -e '~s/ftp\.sra\.ebi\.ac\.uk\/vol1\/|ftp\.dcc\-private\.ebi\.ac\.uk\/vol1\///g' {} ".format(outputfile)

	"""
	The correct_ftp_path below reflects a quick hack to be able to process dcc_bromhead;
	resume the version above for the normal mode of action:
	correct_ftp_path = " && egrep -v -e 'SAMEA104423915|SAMEA4058395|SAMEA4058397|SAMEA4058405|SAMEA4058441'" \
					   "'SELECTA_REMOVE' \
					   {} >{}.tmp && mv {}.tmp {} && \
						perl -p -i -e '~s/ftp\.sra\.ebi\.ac\.uk\/vol1\///g' {} ".format(outputfile,
																						outputfile,
																						outputfile,
																						outputfile,
																						outputfile)
	"""

	base_command = ("curl -o {} -X GET --header 'Accept:\
	 application/json' -u {}:{} ").format(outputfile,
										  datahub,
										  password)
	base_command = base_command + (" 'https://www.ebi.ac.uk/ena/portal/api/search?")
	base_command = base_command + ("result=read_run&dataPortal=pathogen&dccDataOnly=true&")

	command = base_command + retrieved_fields + correct_ftp_path

	if os.path.isfile(outputfile):
		os.remove(outputfile)
	if not lsf:
		print(ruler)
		print("FETCHMETADATA COMMAND:\n\t", command)
		print("LSF VALUE=", lsf)
		print(ruler)
		sp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		out, err = sp.communicate()
		if out:
			print(ruler, "\nstandard output of subprocess: {}".format(out), file=sys.stdout)
		if err:
			print(ruler, "\nstandard error of subprocess: {}".format(err), file=sys.stderr)
		if sp.returncode != 0:
			error_list.append(err)
			print(err, end="", file=sys.stderr)
		print(ruler, "\nreturncode of subprocess:{}".format(sp.returncode), file=sys.stdout)
	else:
		print("LSF value is YES, still need implementation at the moment ...")
		print("Working dir: {}".format(workdir))
		print(ruler)
		print("Running: ", command)
		print(ruler)
		job_id = None
		try:
			job_id = bsub.bsub("selection_2_attribute", verbose=True)(command)  # , R="rusage[mem=1]")
			bsub.bsub.poll(job_id)
		except:
			message = str(sys.exc_info()[1])
			error_list.append(message)
			print(ruler, "ERROR MESSAGE:\n{}".format(message), "\n", ruler)
	if lsf:
		return [outputfile, job_id]
	else:
		return [outputfile, None]
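A hedged sketch of a caller consuming this function's return value, waiting on the LSF job only when one was submitted (account and workdir are assumed to exist as in the function above; the module-style bsub.bsub usage mirrors it):

# hypothetical caller
outputfile, job_id = fetch_datahub_metadatafile(account, workdir, lsf=True)
if job_id is not None:
    bsub.bsub.poll(job_id)  # wait for the LSF job before reading the file
print("metadata written to", outputfile)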
Example #41
def _run_cmds_on_cluster(args, commands, out_dir, results_file):
    """This method will submit a certain number of jobs onto an LSF cluster and
    wait for these jobs to complete before starting new jobs. This allows to
    run several jobs in parallel.

    Args:
        args: Command-line arguments.
        commands: List of command dictionaries.
        out_dir: Output directory.
        results_file: CSV file to store summary.
    """
    from bsub import bsub

    def check_running(jobs):
        rjobs = bsub.running_jobs()
        tmp_jobs = jobs
        jobs = []
        for job, cmd_dict, ind in tmp_jobs:
            if job.job_id in rjobs:
                jobs.append((job, cmd_dict, ind))
                continue

            print('Job %d finished.' % ind)
            cmd_out_dir = cmd_dict[_OUT_ARG]

            try:
                # We store the command used for execution. This might be helpful
                # if the user wants to manually continue the simulation.
                with open(os.path.join(cmd_out_dir, 'hpsearch_command.sh'),
                          'w') as f:
                    f.write('#!/bin/sh\n')
                    f.write('%s' % (_args_to_cmd_str(cmd_dict)))

                # Get training results.
                performance_dict = _SUMMARY_PARSER_HANDLE(cmd_out_dir, i)
                for k, v in performance_dict.items():
                    cmd_dict[k] = v

                # Create or update the CSV file summarizing all runs.
                panda_frame = pandas.DataFrame.from_dict(cmd_dict)
                if os.path.isfile(results_file):
                    old_frame = pandas.read_csv(results_file, sep=';')
                    panda_frame = pandas.concat([old_frame, panda_frame],
                                                sort=True)
                panda_frame.to_csv(results_file, sep=';', index=False)

                # Check whether simulation has finished successfully.
                has_finished = int(cmd_dict['finished'][0])
                if has_finished == 1:
                    _CMD_FINISHED[ind] = True
                else:
                    _CMD_FINISHED[ind] = False

            except Exception:
                traceback.print_exc(file=sys.stdout)
                warnings.warn('Could not assess whether run %d has been ' \
                              % (ind+1) + 'completed.')

        return jobs

    jobs = []
    i = -1
    while len(commands) > 0:
        jobs = check_running(jobs)
        while len(jobs) >= args.num_jobs:
            time.sleep(10)
            jobs = check_running(jobs)

        cmd_dict = commands.pop()
        i += 1

        # FIXME quick and dirty solution.
        folder_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
        if os.path.exists(os.path.join(out_dir, folder_name)):
            time.sleep(1.1)
            folder_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
        cmd_out_dir = os.path.join(out_dir, folder_name)
        assert (not os.path.exists(cmd_out_dir))

        cmd_str = _args_to_cmd_str(cmd_dict, out_dir=cmd_out_dir)
        cmd_dict[_OUT_ARG] = cmd_out_dir

        # Execute the program.
        print('Starting training run %d/%d -- "%s"' %
              (i + 1, len(commands), cmd_str))

        job_name = 'job_%s' % folder_name
        # FIXME the bsub module ignores the pathnames we set. Hence, all output
        # files are simply stored in the local directory. For now, we will
        # capture this in the postprocessing script.
        job_error_file = os.path.join(cmd_out_dir, job_name + '.err')
        job_out_file = os.path.join(cmd_out_dir, job_name + '.out')
        sub = bsub(job_name,
                   R=args.resources,
                   n=1,
                   W='%d:00' % args.num_hours,
                   e=job_error_file,
                   o=job_out_file,
                   verbose=True)
        sub(cmd_str)
        jobs.append((sub, cmd_dict, i))

    # Wait for all jobs to complete.
    while len(jobs) > 0:
        time.sleep(10)
        jobs = check_running(jobs)
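The throttling idea above distils to: poll bsub.running_jobs() and hold off submitting while too many of our jobs are in flight. A minimal sketch under the same API assumptions as this example (job handles expose .job_id; bsub.running_jobs() lists active job ids):

import time
from bsub import bsub

def submit_throttled(commands, job_name, cap):
    """Submit commands, keeping at most `cap` jobs running at once."""
    jobs = []
    for cmd in commands:
        # wait while `cap` or more of our jobs are still running
        while sum(j.job_id in bsub.running_jobs() for j in jobs) >= cap:
            time.sleep(10)
        jobs.append(bsub(job_name, verbose=True)(cmd))
    return jobs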
Example #42
def download_datahub_file(account_name,
                          password,
                          files,
                          outdir,
                          process_id,
                          lsf,
                          dryrun=True):
    """
    Fetch datahub metadata from the Pathogen portal given datahub credentials
    :param account_name: datahub name
    :param password:  datahub password
    :param files: datahub metadata file
    :param outdir: directory where to store the metadata file, define in configuration file
    :param process_id: process_id of the runs
    :param lsf:  Boolean (run through LSF or not)
    :param dryrun: dry run
    :return:  LSF job ids or empty string
    """
    jobids = []
    for file in files:
        outputfile = outdir + '/' + os.path.basename(file)
        print(file)
        """ For some reason the data folder is empty, fastqs are now in vol1 folder :( """
        url = "ftp://{}:{}@ftp.dcc-private.ebi.ac.uk/vol1/{}".format(
            account_name, password, file)
        command = "wget -t 2 {} -O {}".format(url, outputfile)
        print('*' * 100)
        print(command)
        print('*' * 100)
        if not dryrun:
            if not lsf:

                sub_process = subprocess.Popen(command,
                                               shell=True,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                out, err = sub_process.communicate()
                if out:
                    print("standard output of subprocess:")
                    print(out)
                if err:
                    print("standard error of subprocess:")
                    print(err)
                if sub_process.returncode != 0:
                    error_list.append(err)
                    print(err, end="", file=sys.stderr)
            else:
                print(
                    "LSF value is YES, still need implementation at the moment..."
                )
                print('*' * 100)
                print("Running: ", command)
                print('*' * 100)
                try:
                    job_id = bsub('data_provider_' + process_id,
                                  g='/SELECTA',
                                  verbose=True)(command)
                    jobids.append(job_id)
                except:
                    message = str(sys.exc_info()[1])
                    error_list.append(message)

    return jobids
Example #43
err_dir = '/icgc/dkfzlsdf/analysis/B240/kong/Projects/PANCSTRAT/Err_Out/DeepLearning/Survival/LSTM/' + 'Features_' + feat + '_Boots/'
if os.path.isdir(err_dir):
    shutil.rmtree(err_dir)

os.mkdir(err_dir)

####################################################################################################################
####################################################################################################################
# Get parameters for the model of choice.
model_id = 'dense_16_0.1_l1_0.5_lstm_0.3_Adam_learning_0.01'
model_dir = models_dir + model_id + '/'

####################################################################################################################
# Bootstrap.
for i_boots in range(n_bootstrap):
    boots_dir = bootstrap_dir + 'boots_' + str(i_boots) + '/'
    if os.path.isdir(boots_dir):
        shutil.rmtree(boots_dir)
    os.mkdir(boots_dir)

    # Submit one job to find error for one bootstrap.
    job_name = err_dir + 'boots_' + str(i_boots)
    job = bsub(job_name, W='50:00', M='10G', verbose=True)
    args = model_dir + ' ' + boots_dir
    job('module load anaconda3/2019.07; source activate TensorFlow_CPU; python /icgc/dkfzlsdf/analysis/B240/kong/Projects/PANCSTRAT/Code/WGS/Kipoi/MMSplice/DeepLearning/Survival/LSTM/Bootstrap_Error_bsub.py'
        + ' ' + args)

####################################################################################################################
####################################################################################################################