Example #1
def main(args):
    samples = ["MDX_22_AGTTCC_L003_R1_001",
               "MDX_23_ATGTCA_L003_R1_001",
               "MDX_24_CCGTCC_L003_R1_001",
               "WT_21_AGTCAA_L003_R1_001",
               "WT_25_GTAGAG_L003_R1_001",
               "WT_42_GTCCGC_L003_R1_001"]
    datadir = "/vol1/home/brownj/projects/leinwand/data/20121101"
    adapters = "%s/adapters.fa" % datadir
    resultsdir = "/vol1/home/brownj/projects/leinwand/results/common"
    fastqc_script = "/vol1/home/brownj/opt/fastqc/fastqc"
    picard = "/vol1/home/brownj/opt/picard-tools-1.79"
    reference_fasta = "/vol1/home/brownj/ref/mm9/mm9.fa"
    gmapdb = "/vol1/home/brownj/ref/gmapdb"
    gsnapcmd = "gsnap -D {} -d mm9 --gunzip \
                --batch=5 --nofails --nthreads=4 --format=sam -v snp128_strict_wholeChrs {} \
                | samtools view -ShuF 4 - \
                | samtools sort -o - {}.temp -m 9500000000 > {}"
    chrom_sizes = "/vol1/home/brownj/ref/mm9/mm9.sizes"
    
    if args.clobber:
        ngseq.clobber_previous(resultsdir)
    
    # ngseq.fastqc(fastqc_script, samples, datadir)
    bsub.poll(ngseq.trimadapter(datadir, adapters))
    # bsub.poll(ngseq.gsnap(samples, datadir, resultsdir, gmapdb, gsnapcmd))
    # ngseq.alignment_stats(resultsdir, picard, reference_fasta)
    ngseq.cleanup(resultsdir)

    # create genomedata archive in results/common
    bam_pattern = "/vol1/home/brownj/projects/leinwand/results/common/*/*.bam"
    output_dir = "/vol1/home/brownj/projects/leinwand/results/common"
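The four {} slots in gsnapcmd are filled later, presumably by ngseq.gsnap; a hypothetical illustration of the expansion (file names invented here):

# Hypothetical expansion of gsnapcmd's four {} slots:
# gmapdb directory, input fastq, temp prefix for samtools sort, output bam.
cmd = gsnapcmd.format(gmapdb, "sample.fastq.gz", "sample", "sample.mm9.bam")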
Example #2
def readoutfile(file, jobid):
    """
    parse lsf log (.out, .err) file
    :param file: lsf log file (.out)
    :param jobid: lsf job id
    :return: exitcode of lsf job
    """
    exitcode = None
    if not os.path.isfile(file):
        # the log is not there yet; wait for the job and let the caller retry
        bsub.poll(jobid)
    else:
        with open(file) as myfile:
            lines = myfile.readlines()
            for line in lines:
                hits = regexes['exit_code'].search(line)
                if hits is None:
                    pass
                elif hits.group(1) is not None:
                    exitcode = 0
                elif hits.group(2) is not None:
                    exitcode = int(hits.group(2))
            print("Final exit code is ", exitcode)
            print(type(exitcode))

    return exitcode
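The regexes lookup used above is defined elsewhere; a plausible sketch, assuming the standard LSF log phrases ("Successfully completed." / "Exited with exit code N."), is:

import re

# Hypothetical module-level lookup assumed by readoutfile(): group(1)
# matches the success phrase, group(2) captures a numeric exit code.
regexes = {
    'exit_code': re.compile(r"(Successfully completed)|Exited with exit code (\d+)")
}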
Example #3
    def launch_lsf(self, command_strings, verbose=False, output='/dev/null'):
        curr_dir = os.getcwd()
        os.chdir(self.tmpdir)
        job_ids = [bsub('phyml_task',
                        o=output,
                        e=output,
                        verbose=verbose)(cmd).job_id
                   for cmd in command_strings]
        bsub.poll(job_ids)
        os.chdir(curr_dir)
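All of these examples share the same submit-then-block idiom from the bsub package; a minimal standalone sketch:

from bsub import bsub

# Submit one job and block until LSF reports it done. Keyword
# arguments become bsub flags (o -> -o, e -> -e, R -> -R, ...).
job = bsub('example_task', o='/dev/null', e='/dev/null', verbose=True)('sleep 1')
bsub.poll(job.job_id)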
Example #4
def main():
    base = "/vol1/home/brownj/projects/davidson"
    data = base + "/data/20120924"
    results = base + "/results/common"
    samples = ["1","2","3","4","5","6"]
    joinscript = base + "/bin/join_reads.py"
    seeds = "/vol1/home/brownj/projects/davidson/data/20120924/tr_ab_v.fa"

    # bsub.poll(trim(samples, data))
    bsub.poll(join(samples, data, joinscript))
    assemble(samples, data, results, seeds)
Example #5
def alignment_stats(results_path, picard_path, ref_fasta):
    for bam in getfilelist(results_path, "*.bam"):
        cmd = "samtools index %s" % bam
        if not op.exists("%s.bai" % bam):
            jobid = bsub("index", verbose=True)(cmd)
            bsub.poll(jobid)
        cmd = "java -Xmx8g -jar %s/CollectMultipleMetrics.jar \
                INPUT=%s REFERENCE_SEQUENCE=%s ASSUME_SORTED=true OUTPUT=metrics \
                PROGRAM=CollectAlignmentSummaryMetrics \
                PROGRAM=QualityScoreDistribution \
                PROGRAM=MeanQualityByCycle" % (picard_path, bam, ref_fasta)
        bsub("alignment_summary", verbose=True)(cmd)
Example #6
def assemble(samples, data_dir, results_dir, seed_fa):
    """assemble using SSAKE."""
    jobs = []
    for sample in samples:
        fastas = ngseq.getfilelist(data_dir, sample + ".jnd.fa.gz")
        assert(len(fastas) == 1)
        gzipfasta = fastas[0]
        outdir = "%s/%s" % (results_dir, sample)
        fasta = outdir + "/" + op.splitext(op.basename(gzipfasta))[0]
        if not op.exists(fasta):
            bsub.poll(ngseq.extract(gzipfasta, fasta))
        cmd = "SSAKE -f " + fasta + " -s " + seed_fa + " -m 40 -o 50 -r 0.8 -b " + sample + " -p 1 -v 1 -d 200 -e 0.75 -k 10 -a 0.5 -x 50"
        jobid = bsub("3prime_seed_extension", cwd=outdir, R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #7
    def launch_lsf(self, command_strings, verbose=False):
        curr_dir = os.getcwd()
        os.chdir(self.tmpdir)
        job_launcher = bsub('treeCl_gtp_task',
                            verbose=verbose)

        # suppress the log files unless debugging
        if not self.debug:
            job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'

        job_ids = [job_launcher(cmd).job_id
                   for cmd in command_strings]
        self.job_ids.update(job_ids)
        bsub.poll(job_ids)
        os.chdir(curr_dir)
Example #8
def novoalign(samples, datadir, resultsdir, index, genome):
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".fastq.gz")
        assert(len(fastqs) == 1)
        
        outdir = resultsdir.rstrip("/") + "/" + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult): continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        cmd = "novoalignCS -c 1 -d " + index + " -f " + fastq + " -F BFASTQ -o SAM -r Random -e 100 -s 8 -l 20 | samtools view -ShuF4 - | samtools sort -o - " + sample + ".temp -m 9500000000 > " + alignresult
        jobid = bsub("novoalign", n="1", R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
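Because novoalign returns the submitted job ids, a caller can block on the whole batch; a hypothetical invocation (arguments invented here):

# Hypothetical invocation: wait for every alignment in the batch to finish.
bsub.poll(novoalign(samples, datadir, resultsdir, "/path/to/index", "mm9"))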
Example #9
def readoutfile(file, jobid):
    exitcode = None
    if not os.path.isfile(file):
        bsub.poll(jobid)
    else:
        with open(file) as f:
            lines = f.readlines()
            for line in lines:
                hits = regexes['exit_code'].search(line)
                if hits is None:
                    pass
                elif hits.group(1) is not None:
                    exitcode = 0
                elif hits.group(2) is not None:
                    exitcode = int(hits.group(2))
            print("Final exit code is ", exitcode)
            print(type(exitcode))

    return exitcode
Example #10
def counts(samples, result_path, peak_ext, bam_ext):
    # get the consensus peaks
    f = open("%s/peak_coordinates.bed" % result_path, 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(result_path, "*%s" % peak_ext))
    for c in consensus:
        # the name field from multi_intersect holds the replicate count
        replicate_counts = int(c.name)
        if replicate_counts < 2: continue
        
        fields = [c.chrom, c.start, c.stop, "%s:%d-%d\n" % \
                    (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(result_path, sample + "*%s" % bam_ext)
        assert(len(bams) == 1)
        outdir = result_path.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult): continue
        cmd = "bedtools coverage -abam %s -b %s > %s" % \
                    (bams[0], f.name, countsresult)
        jobid = bsub(sample + "_counts", 
                        R="select[mem>16] rusage[mem=16] span[hosts=1]",
                        verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header="chrom start stop name a_overlaps_in_b \
                    b_with_nonzero length_b frac_b_nonzero".split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(sys.stdout, sep="\t", header=True)
Example #11
    def _launch_lsf_dynamic_memory(self, command_strings, verbose=False):
        curr_dir = os.getcwd()
        os.chdir(self.tmpdir)

        memory = self.get_memory_requirements()
        job_ids = []
        for i, cmd in enumerate(command_strings):
            memory_reqd = memory[i]
            job_launcher = bsub('treeCl_dynamic_phyml_task',
                                R='rusage[mem={}]'.format(memory_reqd),
                                M=memory_reqd,
                                verbose=verbose)
            if not self.debug:
                job_launcher.kwargs['o'] = '/dev/null'
                job_launcher.kwargs['e'] = '/dev/null'

            job_ids.append(job_launcher(cmd).job_id)
        self.job_ids.update(job_ids)
        bsub.poll(job_ids)
        os.chdir(curr_dir)
Example #12
    def _launch_lsf_fixed_memory(self, command_strings, minmem=4096,
                                 verbose=False):
        """ Uses bsub package to send phyml jobs to lsf """
        curr_dir = os.getcwd()
        os.chdir(self.tmpdir)

        job_launcher = bsub('treeCl_static_phyml_task',
                           R='rusage[mem={}]'.format(minmem),
                           M=minmem,
                           verbose=verbose)

        # overwrite kwargs pertaining to output log files
        if not self.debug:
            job_launcher.kwargs['o'] = job_launcher.kwargs['e'] = '/dev/null'

        job_ids = [job_launcher(cmd).job_id
                   for cmd in command_strings]
        self.job_ids.update(job_ids)
        bsub.poll(job_ids)
        os.chdir(curr_dir)
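Both memory-aware variants attach limits through LSF's -R and -M flags; a minimal standalone sketch of the same request (note that -M units depend on the cluster's LSF_UNIT_FOR_LIMITS setting):

# Minimal sketch: ask LSF for roughly 4 GB for a single short job.
job = bsub('mem_task', R='rusage[mem=4096]', M=4096, verbose=True)('sleep 1')
bsub.poll(job.job_id)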
Example #13
def rum(samples, datadir, resultsdir, index):
    """align to index using rum"""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(datadir, sample + ".trim.fastq.gz")
        assert(len(fastqs) == 1)
        
        outdir = resultsdir + "/" + sample
        alignresult = outdir + "/" + sample + ".bam"
        alternatealignresult = outdir + "/RUM.sam"
        if op.exists(alignresult) or op.exists(alternatealignresult): continue
        
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + op.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))

        cmd = "rum_runner align -v -i " + index + " -o " + outdir + " --chunks 5 --dna --nu-limit 2 --variable-length-reads --name " + sample + " " + fastq
        jobid = bsub("rum", n="5", R="select[mem>28] rusage[mem=28] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
Example #14
def main():
    fastqc()
    # does this actually work?
    bsub.poll(concat())
    bsub.poll(align())
    bsub.poll(cleanup())
    indexbams()
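The chaining above works only if each helper returns the id(s) of the jobs it submitted, as the aligners in the other examples do; a hypothetical concat written in that style:

def concat():
    # Hypothetical helper in the same style as novoalign()/rum():
    # submit one job and return its id so bsub.poll() can block on it.
    return bsub("concat", verbose=True)("cat a.fastq b.fastq > all.fastq").job_id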
Example #15
def main(args):
    samples = ['2Som_chip1_GCCAAT_L006_R1_001',
               '2Som_chip2_GTCCGC_L006_R1_001',
               '2Som_Input_GTGAAA_L006_R1_001',
               '31hpt_Chip1_CAGATC_L006_R1_001',
               '31hpt_Chip2_ACAGTG_L006_R1_001',
               '31hpt_Input_TGACCA_L006_R1_001']
    controls = ['2Som_Input_GTGAAA_L006_R1_001',
                '31hpt_Input_TGACCA_L006_R1_001']
    datadir = "/vol1/home/brownj/projects/artinger/data/20121101"
    resultsdir = "/vol1/home/brownj/projects/artinger/results/common"
    fastqc_script = "/vol1/home/brownj/opt/fastqc/fastqc"
    picard = "/vol1/home/brownj/opt/picard-tools-1.79"
    rumindex = "/vol1/home/brownj/ref/rum/zebrafish"
    reference_fasta = "/vol1/home/brownj/ref/zebrafish/Danio_rerio.Zv9.68.fa"
    gmapdb = "/vol1/home/brownj/ref/gmapdb"
    
    rumcmd = "rum_runner align -v -i %s -o {} --chunks 5 --dna --nu-limit 2 --variable-length-reads --name {} {}" % rumindex
    macscmd = "macs14 -t {} -f BAM -n {} -g 1400000000 -w --single-profile --call-subpeaks"
    gsnapcmd = "gsnap -D {} -d zebrafish --gunzip --npaths=1 --quiet-if-excessive --batch=5 --nofails --nthreads=4 --format=sam {} | samtools view -ShuF 4 - | samtools sort -o - {}.temp -m 9500000000 > {}"
    
    if args.clobber:
        clobber_previous(resultsdir)
    
    fastqc(fastqc_script, samples, datadir)
    bsub.poll(trim(datadir, "*R1_001.fastq.gz"))
    bsub.poll(gsnap(samples, datadir, resultsdir, gmapdb, gsnapcmd))
    # alignment_stats(resultsdir, picard, reference_fasta)
    bsub.poll(macs(samples, resultsdir, controls, macscmd))
    cleanup(resultsdir)
Example #16
def counts(samples, resultsdir):
    """docstring"""
    # get the consensus peaks
    f = open(resultsdir + "/peak_coordinates.bed", 'w')
    x = BedTool()
    consensus = x.multi_intersect(i=getfilelist(resultsdir, "*peaks.bed.gz"))
    for c in consensus:
        replicate_counts = int(c.name)
        if replicate_counts < 2: continue
        
        fields = [c.chrom, c.start, c.stop, "%s:%d-%d\n" % (c.chrom, c.start, c.stop)]
        f.write("\t".join(map(str, fields)))
    f.close()
    
    # get counts for each sample
    jobs = []
    countfiles = []
    for sample in samples:
        bams = getfilelist(resultsdir, sample + "*.hg19_novoalign.bam")
        assert(len(bams) == 1)
        outdir = resultsdir.rstrip("/") + "/" + sample
        countsresult = outdir + "/" + sample + ".counts"
        countfiles.append(countsresult)
        if op.exists(countsresult): continue
        cmd = "bedtools coverage -abam " + bams[0] + " -b " + f.name + " > " + countsresult
        jobid = bsub(sample + "_counts", R="select[mem>16] rusage[mem=16] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    bsub.poll(jobs)
    
    # counts to matrix
    allcounts = {}
    for cf in countfiles:
        cfname = op.basename(cf).split(".counts")[0]
        casecounts = {}
        for toks in reader(cf, header="chrom start stop name a_overlaps_in_b b_with_nonzero length_b frac_b_nonzero".split()):
            casecounts[toks['name']] = int(toks['a_overlaps_in_b'])
        allcounts[cfname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    countsdf.to_csv(resultsdir + "/sample_counts.csv", sep=",", header=True)
Example #17
def main():
    samples = ['RS_input_CCGTCC_L005_R1_001',
               'RS_iso_ATGTCA_L005_R1_001',
               'RS_tbet_CTTGTA_L005_R1_001']
    control = 'RS_input_CCGTCC_L005_R1_001'
    datadir = "/vol1/home/brownj/projects/marrack/data/20121101"
    resultsdir = "/vol1/home/brownj/projects/marrack/results/common"
    rumindex = "/vol1/home/brownj/ref/rum/mm9"
    
    fastqc(samples, datadir, resultsdir)
    bsub.poll(trim(datadir, "*R1_001.fastq.gz"))
    bsub.poll(rum(samples, datadir, resultsdir, rumindex))
    bsub.poll(postprocessrum(resultsdir))
    bsub.poll(macs(samples, resultsdir, control))
    cleanup(resultsdir)
Example #18
def bowtiealign(samples, index, genome):
    """align to index using bowtie"""
    jobs = []
    for sample in samples:
        fastqs = getfilelist(DATA, sample + "_*.trm.fq.gz")
        # single end
        assert(len(fastqs) == 1)
        
        outdir = RESULTS + sample
        alignresult = outdir + "/" + sample + "." + genome + ".bam"
        if op.exists(alignresult): continue
        if not op.exists(outdir):
            os.makedirs(outdir)
        
        gzipfastq = fastqs[0]
        fastq = outdir + "/" + os.path.splitext(op.basename(gzipfastq))[0]
        if not op.exists(fastq):
            bsub.poll(extract(gzipfastq, fastq))
        
        cmd = "bowtie -p4 --best --sam -q " + index + " " + fastq + " | samtools view -ShuF4 - | samtools sort -o - " + sample + ".temp -m 9500000000 > " + alignresult
        jobid = bsub(PI + ".bowtie", n="4", R="select[mem>20] rusage[mem=20] span[hosts=1]", verbose=True)(cmd)
        jobs.append(jobid)
    return jobs
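bowtiealign reads module-level globals that are not shown; plausible definitions (paths invented here; note RESULTS needs a trailing slash since outdir is built as RESULTS + sample):

# Hypothetical module-level globals assumed by bowtiealign() and main().
DATA = "/path/to/data"
RESULTS = "/path/to/results/"  # trailing slash: outdir = RESULTS + sample
PI = "example_pi"
SAMPLES = ["sample1", "sample2"]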
Example #19
def main():
    hairpinindex = "/vol1/home/brownj/ref/mirbase/19/hairpin19"
    matureindex = "/vol1/home/brownj/ref/mirbase/19/mature19"
    tuberculosisindex = "/vol1/home/brownj/ref/tuberculosis/H37Rv"
    fastqc(SAMPLES)
    bsub.poll(trimadapters(DATA))
            
    # Bowtie
    bsub.poll(bowtiealign(SAMPLES, matureindex, "mature"))
    bsub.poll(bowtiealign(SAMPLES, tuberculosisindex, "H37Rv"))
    removefastqs()
Example #20
def main():
    """
    Main call to the data_provider scripts.
    :return: None
    """
    error_list = list()
    get_args()
    prop = properties(properties_file)
    lsf = prop.lsf
    #print(prop)
    conn = get_connection(prop.dbuser, prop.dbpassword, prop.dbhost,
                          prop.dbname, prop.dbport)
    data_provider_list = get_list(conn)
    print(data_provider_list)
    process_jobids = {}
    for data_provider_stage in data_provider_list:
        print(data_provider_stage.process_id, data_provider_stage.selection_id,
              data_provider_stage.stage_list)
        if not data_provider_stage.check_started(conn):
            print("\nTo be started job: process_id:{}\
             collection id: {} dataprovider id: {} ".format(
                data_provider_stage.process_id,
                data_provider_stage.selection_id,
                data_provider_stage.stage_list))
            data_provider_stage.set_started(conn)
            process_dir = prop.workdir + data_provider_stage.process_id
            print("Creating process directory:{}".format(process_dir))
            create_processing_dir(process_dir)
            account_name = get_datahub_names(conn,
                                             data_provider_stage.process_id)
            print("account to be processed:{}".format(account_name))
            files = get_file_names(conn, data_provider_stage.process_id)
            print("Files to be downloaded:{}".format(files))
            pass_word = get_datahub_account_password(conn, account_name)
            process_id = data_provider_stage.process_id
            jobids = download_datahub_file(account_name,
                                           pass_word,
                                           files,
                                           process_dir,
                                           process_id,
                                           lsf,
                                           dryrun=False)
            """
            We should be able to capture the .err and .out lsf output into the
            database. Maybe define a a generic lsf_stat class, that will match in
            .out the "Successfully completed" string if true set length of error_list to 0
            other wise logs the full path to the .out file in database
            """
            if not lsf:
                #if len(error_list) != 0:
                if len(error_list):
                    final_errors = '\n'.join(
                        str(v).replace("'", "") for v in error_list)
                    data_provider_stage.set_error(conn, final_errors)
                else:
                    data_provider_stage.set_finished(conn)
            elif lsf:
                err = [
                    os.getcwd() + '/data_provider_' + process_id + '.' + y
                    for y in [x + '.err' for x in jobids]
                ]
                out = [
                    os.getcwd() + '/data_provider_' + process_id + '.' + y
                    for y in [x + '.out' for x in jobids]
                ]
                final_errors = '\n'.join(str(v).replace("'", "") for v in out)
                print(final_errors)
                process_jobids[process_id] = out
        error_list = list()
        if lsf:
            print(process_jobids)
    """
    We should check for the content of lsmyfile.out file and store the 
    full path of the error and out file in DB
    """
    if lsf:
        for data_provider_stage in data_provider_list:
            process_id = data_provider_stage.process_id
            for lsf_out in process_jobids[process_id]:
                print('*' * 100)
                print(lsf_out)
                print('*' * 100)
                jobid = lsf_out.split('.')[-2]
                bsub.poll(jobid)
                if os.path.isfile(lsf_out):
                    print(
                        "Processing LSF .out for jobid {}".format(jobid))
                    print("Processing: {}".format(lsf_out))
                    print('*' * 100)
                    localexitcode = readoutfile(lsf_out, jobid)
                    print(localexitcode)
                    if localexitcode != 0:
                        final_errors = lsf_out + ' with exit code ' + str(
                            localexitcode)
                        data_provider_stage.set_error(conn, final_errors)
                    else:
                        data_provider_stage.set_finished(conn)
                    print('*' * 100)
                else:
                    print("Awaiting completion of: jobid {}".format(jobid))
                    print("Processing: {}".format(lsf_out))
                    print('*' * 100)
                    #bsub.poll(jobid)
                    if os.path.isfile(lsf_out):
                        localexitcode = readoutfile(lsf_out, jobid)
                        print(localexitcode)
                        if localexitcode != 0:
                            final_errors = lsf_out + ' with exit code ' + str(
                                localexitcode)
                            data_provider_stage.set_error(conn, final_errors)
                        else:
                            data_provider_stage.set_finished(conn)
                    else:
                        bsub.poll(jobid)

    conn.close()
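Tying the pieces together, a hypothetical minimal driver that submits a job, waits for it, and parses the log it leaves behind (log name follows the <name>.<jobid>.out convention used above):

from bsub import bsub

# Hypothetical end-to-end check: submit, wait, then parse the .out log.
job = bsub('data_provider_demo', verbose=True)('echo done')
bsub.poll(job.job_id)
exitcode = readoutfile('data_provider_demo.%s.out' % job.job_id, job.job_id)
print('job {} exited with {}'.format(job.job_id, exitcode))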