def gatk(timestamp, path_base, folder, samples, nproc, wt, q, genome_build, args):
    """Write and submit the per-sample GATK variant-calling jobs.

    ``args`` is a ``|``-separated string.  Only when it splits into exactly
    two fields are the defaults overridden: the first field switches
    multithreading on when it equals ``"yes"`` and the second replaces the
    default filter value ``"30"``.

    A sample is processed only if ``<sample>.sorted.bam`` is listed in
    ``results_sam2sortbam``; otherwise a warning is printed and the sample is
    skipped.  Returns the value produced by ``submit_job_super``.
    """
    work_dir = path_base + folder
    output = "results_gatk"

    fields = args.split("|")
    has_two_fields = len(fields) == 2
    multithread = has_two_fields and fields[0] == "yes"
    filt = fields[1] if has_two_fields else "30"

    secure_mkdir(work_dir, output)
    print("## Variang calling with GATK")
    print("> Writing jobs for GATK...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    bam_dir = work_dir + "/results_sam2sortbam/"
    available = os.listdir(bam_dir)
    for sample in ordered_samples:
        bam_name = sample + ".sorted.bam"
        if bam_name not in available:
            print("Warning: [GATK] SORTED BAM output file not found -> " + bam_dir + bam_name)
            continue
        # gatk_commands yields several shell lines; they become one job entry.
        steps = gatk_commands(work_dir, sample, genome_build, multithread, filt)
        commands.append("\n".join(steps))

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("gatk", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def picardqc(timestamp, path_base, folder, samples, nproc, wt, q, annots, strand):
    """Write and submit Picard CollectRnaSeqMetrics jobs for every sample.

    Parameters
    ----------
    timestamp, wt, q : forwarded unchanged to ``submit_job_super``.
    path_base, folder : concatenated to form the project directory.
    samples : mapping of sample name to its input files; only the names and
        the number of samples are used here.
    nproc : resource specification handed to ``manager.get_bsub_arg``.
    annots : sequence of refFlat annotation paths, indexed in parallel with
        ``config.nannots``.
    strand : strand option string, one of ``" --stranded=no"``,
        ``" --stranded=yes"`` or ``" --stranded=reverse"`` (leading space
        included).  Any other value raises ``KeyError`` once a sample with a
        STAR alignment is reached.

    For each sample whose ``<sample>_Aligned.out.sam`` is listed in
    ``results_star`` one command per annotation is queued; samples without
    that file are reported and skipped.  Returns the value produced by
    ``submit_job_super``.
    """
    # Translate the strand option into Picard's STRAND_SPECIFICITY value.
    # Each of the three options needs its own key: a repeated key in a dict
    # literal silently keeps only the last value.
    nstrand = {
        " --stranded=no": "NONE",
        " --stranded=yes": "FIRST_READ_TRANSCRIPTION_STRAND",
        " --stranded=reverse": "SECOND_READ_TRANSCRIPTION_STRAND",
    }
    output = "results_picard"
    secure_mkdir(path_base + folder, output)
    print("## Alignment QC Picard-CollectRnaSeqMetrics")
    print("> Writing jobs for Picard QC...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
    commands = list()
    ksamp = sortbysize(samples)
    proc_files = os.listdir(path_base + folder + "/results_star/")
    last_index = len(config.nannots) - 1
    for sample in ksamp:
        in_file = path_base + folder + "/results_star/" + sample + "_Aligned.out.sam"
        if sample + "_Aligned.out.sam" not in proc_files:
            print("Warning: [Picard] STAR output file not found -> " + in_file)
            continue
        specificity = nstrand[strand]
        for i, label in enumerate(config.nannots):
            annot = annots[i]
            out_file = in_file.replace(".sam", "." + label + ".qc").replace("results_star/", "results_picard/").replace("_Aligned.out", "")
            call = "java -jar " + config.path_picard + "/CollectRnaSeqMetrics.jar REF_FLAT=" + annot + " STRAND_SPECIFICITY=" + specificity + " INPUT=" + in_file + " OUTPUT=" + out_file
            if i == last_index:
                # The per-sample completion check is attached only to the
                # command for the last annotation.
                call = call + sample_checker.replace("#FOLDER", path_base + folder + "/results_picard").replace("#SAMPLE", sample)
            commands.append(call)
    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("picard", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def jsplice(timestamp, path_base, folder, samples, nproc, wt, q, genomebuild, pheno, extra_args, strand):
    """Write the jSplice experiment design and submit the jSplice jobs.

    Parameters
    ----------
    timestamp, wt, q : forwarded unchanged to ``submit_job_super``.
    path_base, folder : concatenated to form the project directory.
    samples : mapping of sample name to its input files; only the names and
        the number of samples are used here.
    nproc : ignored on input; the resource request is always built from the
        fixed specification ``'1/NA/NA'``.
    genomebuild : substituted for ``#LABEL`` in ``config.path_annotation``.
    pheno : mapping of sample name to an ``"<exp>:<cond>"`` string.
    extra_args : extra command-line text appended to the jSplice call.
    strand : strand option string (leading space included).

    A sample is included only when its STAR junction file and its sorted BAM
    both exist and its ``pheno`` entry splits into exactly two ``:`` fields;
    otherwise a warning is printed.  ``results_jsplice/expdesign.txt`` gets
    one tab-separated line per included sample.  Returns the value produced
    by ``submit_job_super``.
    """
    output_dir = path_base + folder + '/results_jsplice'
    secure_mkdir(path_base + folder, 'results_jsplice')
    print("## jSPLICE")
    print("> Writing jobs for jSPLICE...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg('1/NA/NA', len(samples))
    commands = list()
    ksamp = sortbysize(samples)
    # The context manager closes the design file even if a sample raises
    # (for instance a name missing from pheno).
    with open(output_dir + '/expdesign.txt', 'w') as out:
        out.write('#exp\tcond\tjxnFile\tbamFile\n')
        for sample in ksamp:
            # Junction file created by STAR
            sj_file = path_base + folder + '/results_star/' + sample + '_SJ.out.tab'
            sj_out_file = output_dir + '/' + sample + '.SJ.bed'
            # BAM file created by STAR/Picard(AddOrReplaceReadGroups)
            bam_file = path_base + folder + '/results_sam2sortbam/' + sample + '.sorted.bam'
            have_inputs = os.path.exists(sj_file) and os.path.exists(bam_file)
            # pheno is consulted only for samples whose input files exist.
            pheno_fields = pheno[sample].split(':') if have_inputs else []
            if len(pheno_fields) == 2:
                command = 'python ' + config.path_jsplice + '/starJxn2bed.py -f ' + sj_file + ' -o ' + sj_out_file
                commands.append(command + sample_checker.replace("#FOLDER", output_dir).replace("#SAMPLE", sample))
                out.write('\t'.join([pheno_fields[0], pheno_fields[1], sj_out_file, bam_file]) + '\n')
            else:
                print("Warning: [JSPLICE] STAR output files not found -> " + sample)
    # NOTE(review): '-s' is prepended when strand is ' --stranded=no';
    # confirm against the jSplice documentation that this polarity is intended.
    if strand == " --stranded=no":
        extra_args = '-s ' + extra_args
    commands.append('python ' + config.path_jsplice + '/jSplice.py -d ' + output_dir + '/expdesign.txt -o ' + output_dir + ' -a ' + config.path_annotation.replace("#LABEL", genomebuild) + ' ' + extra_args)
    create_scripts(nchild, commands, path_base, folder, 'results_jsplice')
    return submit_job_super("jsplice", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def star(timestamp, path_base, folder, samples, nproc, wt, q, path_genome, star_params, tg):
    """Write and submit one STAR alignment job per sample.

    Parameters
    ----------
    timestamp, wt, q : forwarded unchanged to ``submit_job_super``.
    path_base, folder : concatenated to form the project directory.
    samples : mapping of sample name to its file list.  A list of length two
        contributes only its first entry as read file; any other length
        contributes the first two entries (presumably read pairs -- confirm
        against the code that builds ``samples``).
    nproc : resource specification handed to ``manager.get_bsub_arg``; the
        resolved value is passed to STAR as ``--runThreadN``.
    path_genome : value for STAR's ``--genomeDir``.
    star_params : extra command-line text appended verbatim when non-empty.
    tg : when true, reads are taken from ``results_trimgalore`` (by base
        name, with ``.gz`` appended if missing) and are always decompressed
        with ``zcat``; when false the original paths are used.

    Returns the value produced by ``submit_job_super``.
    """
    output = "results_star"
    secure_mkdir(path_base + folder, output)
    print("## RNAseq alignment with STAR...")
    print("> Writing jobs for STAR alignment...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
    commands = list()
    ksamp = sortbysize(samples)
    for sample in ksamp:
        gg = ""
        files = samples[sample]
        single = len(files) == 2
        if not tg:
            if single:
                fn = files[0]
            else:
                fn = files[0] + " " + files[1]
            # Any gzip-compressed input needs zcat, whatever precedes the
            # ".gz" extension (".fastq.gz", ".fq.gz", ...).
            if files[0].endswith(".gz"):
                gg = " --readFilesCommand zcat"
        else:
            gg = " --readFilesCommand zcat"
            g = path_base + folder + "/results_trimgalore/"
            suf = ""
            if not files[0].split("/")[-1].endswith(".gz"):
                suf = ".gz"
            if single:
                fn = g + files[0].split("/")[-1] + suf
            else:
                fn = g + files[0].split("/")[-1] + suf + " " + g + files[1].split("/")[-1] + suf
        command = config.path_star + " --quantMode TranscriptomeSAM GeneCounts --runThreadN " + str(nproc) + " --genomeDir " + path_genome
        command = command + " --readFilesIn " + fn + " --outFileNamePrefix " + path_base + folder + "/results_star/" + sample + "_" + gg
        if star_params:
            command = command + star_params
        commands.append(command + sample_checker.replace("#FOLDER", path_base + folder + "/results_star").replace("#SAMPLE", sample))
    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("star", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def starfusion(timestamp, path_base, folder, samples, nproc, wt, q, path_star_fusion, star_fusion_params, tg):
    """Write and submit STAR-Fusion jobs that start from the read files.

    The first two entries of each sample's file list are passed as
    ``--left_fq`` and ``--right_fq``.  With ``tg`` true they are looked up by
    base name inside ``results_trimgalore`` (``.gz`` is appended when the
    first file's name lacks it); otherwise the listed paths are used as-is.
    ``star_fusion_params`` is appended verbatim when non-empty.  Returns the
    value produced by ``submit_job_super``.

    NOTE(review): later definitions named ``starfusion`` in this file take
    ``genomebuild`` in place of the last three parameters here; the last
    definition executed is the one bound to the name.
    """
    work_dir = path_base + folder
    output = "results_star-fusion"
    secure_mkdir(work_dir, output)
    print("## Identification of gene fusions with star-fusion")
    print("> Writing jobs for Star-Fusion...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    for sample in ordered_samples:
        files = samples[sample]
        if tg:
            trimmed_dir = work_dir + "/results_trimgalore/"
            first_name = files[0].split("/")[-1]
            extension = "" if first_name.endswith(".gz") else ".gz"
            reads = [trimmed_dir + first_name + extension,
                     trimmed_dir + files[1].split("/")[-1] + extension]
        else:
            reads = files
        out_prefix = work_dir + "/results_star-fusion/" + sample
        pieces = [config.path_starfusion,
                  " --output_dir ", out_prefix,
                  " --genome_lib_dir ", path_star_fusion,
                  " --left_fq ", reads[0],
                  " --right_fq ", reads[1],
                  " --CPU ", str(nproc)]
        call = "".join(pieces)
        if len(star_fusion_params) > 0:
            call += star_fusion_params
        checker = sample_checker.replace("#FOLDER", work_dir + "/results_star-fusion").replace("#SAMPLE", sample)
        commands.append(call + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("star-fusion", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def fastqc(timestamp, path_base, folder, samples, nproc, wt, q, tg):
    """Write and submit one FastQC job per sample.

    A file list of length four contributes its first two entries to the
    FastQC command; any other length contributes only the first.  With
    ``tg`` true the files are looked up by base name inside
    ``results_trimgalore`` (``.gz`` is appended when the first file's name
    lacks it).  Reports go to ``results_fastqc``.  Returns the value produced
    by ``submit_job_super``.
    """
    print("## QC: FastQC")
    print("> Quality control with fastQC...")
    work_dir = path_base + folder
    output = "results_fastqc"
    secure_mkdir(work_dir, "results_fastqc")
    output_folder = work_dir + "/results_fastqc"
    print("> Writing jobs for fastqc analysis...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    for sample in ordered_samples:
        files = samples[sample]
        two_inputs = len(files) == 4
        if tg:
            trimmed_dir = work_dir + "/results_trimgalore/"
            first_name = files[0].split("/")[-1]
            extension = "" if first_name.endswith(".gz") else ".gz"
            fnames = trimmed_dir + first_name + extension
            if two_inputs:
                fnames = fnames + " " + trimmed_dir + files[1].split("/")[-1] + extension
        elif two_inputs:
            fnames = files[0] + " " + files[1]
        else:
            fnames = files[0]
        call = "".join([config.path_fastqc, " -q -o ", output_folder, " ", fnames])
        checker = sample_checker.replace("#FOLDER", output_folder).replace("#SAMPLE", sample)
        commands.append(call + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("fastqc", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def trimgalore(timestamp, path_base, folder, samples, nproc, wt, q, extra_args):
    """Write and submit one Trim Galore job per sample.

    A file list of length four is trimmed as a pair (``--paired`` is added
    and the first two entries are passed); any other length passes only the
    first entry.  ``extra_args`` is inserted right after the executable, with
    a leading space added when it is non-empty and lacks one.  Each job also
    carries the text returned by ``rename_tg_output`` on a following line.
    Output goes to ``results_trimgalore``.  Returns the value produced by
    ``submit_job_super``.
    """
    print("## Trim-galore: Quality and adapter trimming")
    print("> Quality and adapter trimming with Trim Galore...")
    work_dir = path_base + folder
    output = "results_trimgalore"
    secure_mkdir(work_dir, "results_trimgalore")
    output_folder = work_dir + "/results_trimgalore"
    print("> Writing jobs for TrimGalore analysis...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    for sample in ordered_samples:
        files = samples[sample]
        if len(files) == 4:
            options = extra_args + " --paired"
            fnames = files[0] + " " + files[1]
        else:
            options = extra_args
            fnames = files[0]
        # Keep the options separated from the executable path.
        if options != "" and not options.startswith(" "):
            options = " " + options
        call = "".join([config.path_trimgalore, options,
                        " --gzip --path_to_cutadapt ", config.path_cutadapt,
                        " -o ", output_folder, " ", fnames])
        checker = sample_checker.replace("#FOLDER", output_folder).replace("#SAMPLE", sample)
        commands.append(call + checker + "\n" + rename_tg_output(sample, files, work_dir))

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("trimgalore", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def htseq(timestamp, path_base, folder, samples, path_annotation, nproc, wt, q, mode, strand, countmode):
    """Write and submit one htseq-count job per sample.

    Counts are redirected to ``results_htseq-<mode>/<sample>.tab``.  When
    ``mode`` is anything other than ``"gene"`` the option ``-i exon_id`` is
    added.  ``strand`` is appended directly to the executable path, so it is
    expected to carry its own leading space.  Samples whose
    ``<sample>_Aligned.out.sam`` is not listed in ``results_star`` are
    reported and skipped.  Returns the value produced by
    ``submit_job_super``.

    NOTE(review): this file contains another definition of ``htseq`` with the
    same signature; the last one executed is the one bound to the name.
    """
    work_dir = path_base + folder
    output = "results_htseq-" + mode
    secure_mkdir(work_dir, output)
    print("## HTseq-count")
    print("> Writing jobs for HTseq-count " + mode + " analysis...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    star_dir = work_dir + "/results_star/"
    available = os.listdir(star_dir)
    id_option = "" if mode == "gene" else " -i exon_id"
    for sample in ordered_samples:
        sam_name = sample + "_Aligned.out.sam"
        in_file = star_dir + sam_name
        if sam_name not in available:
            print("Warning: [HTseq-" + mode + "] STAR output file not found -> " + in_file)
            continue
        counts_file = work_dir + "/results_htseq-" + mode + "/" + sample + ".tab"
        call = "".join([config.path_htseq, strand, " -m ", countmode, id_option,
                        " -q ", in_file, " ", path_annotation,
                        " > ", counts_file])
        checker = sample_checker.replace("#FOLDER", work_dir + "/" + output).replace("#SAMPLE", sample)
        commands.append(call + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("htseq-" + mode, work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def starfusion(timestamp, path_base, folder, samples, nproc, wt, q, genomebuild):
    """Write and submit STAR-Fusion jobs that start from STAR chimeric output.

    A sample is queued only when both ``<sample>_Chimeric.out.junction`` and
    ``<sample>_Chimeric.out.sam`` exist in ``results_star``; otherwise a
    warning naming the junction file is printed.  The annotation passed with
    ``-G`` is ``config.path_annotation`` with ``#LABEL`` replaced by
    ``genomebuild``.  Returns the value produced by ``submit_job_super``.

    NOTE(review): ``starfusion`` is defined more than once in this file; the
    last definition executed is the one bound to the name.
    """
    work_dir = path_base + folder
    output = "results_star-fusion"
    secure_mkdir(work_dir, output)
    print("## Identification of gene fusions with star-fusion")
    print("> Writing jobs for Star-Fusion...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    star_dir = work_dir + "/results_star/"
    # The listing itself is not used below, but the call still raises when
    # the STAR result directory does not exist.
    proc_files = os.listdir(star_dir)
    ref_file = config.path_annotation.replace("#LABEL", genomebuild)
    for sample in ordered_samples:
        junction_file = star_dir + sample + "_Chimeric.out.junction"
        chimeric_sam = star_dir + sample + "_Chimeric.out.sam"
        out_prefix = work_dir + "/results_star-fusion/" + sample
        if not (os.path.exists(junction_file) and os.path.exists(chimeric_sam)):
            print("Warning: [Star-Fusion] STAR output file not found -> " + junction_file)
            continue
        call = "".join([config.path_starfusion,
                        " -J ", junction_file,
                        " -S ", chimeric_sam,
                        " -G ", ref_file,
                        " --out_prefix ", out_prefix])
        checker = sample_checker.replace("#FOLDER", work_dir + "/results_star-fusion").replace("#SAMPLE", sample)
        commands.append(call + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("star-fusion", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def kallisto(timestamp, path_base, folder, samples, path_index, bootstrap, nproc, wt, q, tg):
    """Write and submit one Kallisto quantification job per sample.

    If the first sample's file list has length two, fragment-length
    statistics are computed first: ``compute_mean_std`` is launched, its log
    is awaited with ``vcrparser.job_wait`` and ``results_kallisto/stats.txt``
    is parsed (first line skipped; space-separated fields 0, 2 and 3 are kept
    as sample, ``-l`` value and ``-s`` value).

    A file list of length four passes its first two entries with no extra
    options; any other length passes the first entry with
    ``--single -l <..> -s <..>``.  With ``tg`` true the files are looked up
    by base name inside ``results_trimgalore`` (``.gz`` appended when the
    first file's name lacks it).  Returns the value produced by
    ``submit_job_super``.

    NOTE(review): this file contains another definition of ``kallisto`` with
    the same signature; the last one executed is the one bound to the name.
    """
    work_dir = path_base + folder
    output = "results_kallisto"
    secure_mkdir(work_dir, "results_kallisto")
    print("## RNAseq pseudoalignment with Kallisto")

    # Fragment-length statistics are needed only for single-read data.
    if len(samples[samples.keys()[0]]) == 2:
        print("> Estimating average and STD of fragment lengh required by Kalisto on single-read data...")
        stats_path = work_dir + "/" + output + "/stats.txt"
        tid, log = compute_mean_std(path_base, folder, samples, stats_path, "1", wt, q)
        vcrparser.job_wait(log, 10)
        handle = open(stats_path, 'r')
        handle.readline()  # the first line is skipped
        stats = {}
        for line in handle:
            cols = line.strip("\n").split(" ")
            stats[cols[0]] = [cols[2], cols[3]]
        handle.close()

    print("> Writing jobs for Kallisto pseudoalignment")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
    commands = []
    ordered_samples = sortbysize(samples)
    for sample in ordered_samples:
        files = samples[sample]
        if tg:
            trimmed_dir = work_dir + "/results_trimgalore/"
            extension = "" if files[0].split("/")[-1].endswith(".gz") else ".gz"
            first = trimmed_dir + files[0].split("/")[-1] + extension
        else:
            first = files[0]
        if len(files) == 4:
            args = ""
            if tg:
                second = trimmed_dir + files[1].split("/")[-1] + extension
            else:
                second = files[1]
            fnames = first + " " + second
        else:
            single_opts = " --single -l mean -s var"
            args = single_opts.replace("mean", stats[sample][0]).replace("var", stats[sample][1])
            fnames = first
        cmd = "".join([config.path_kallisto, " quant -b ", bootstrap,
                       " -i ", path_index,
                       " -o ", work_dir, "/results_kallisto/", sample,
                       args, " ", fnames])
        checker = sample_checker.replace("#FOLDER", work_dir + "/" + output).replace("#SAMPLE", sample)
        commands.append(cmd + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("kallisto", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def kallisto(timestamp, path_base, folder, samples, path_index, bootstrap, nproc, wt, q, tg):
    """Write and submit one Kallisto quantification job per sample.

    Parameters
    ----------
    timestamp, wt, q : forwarded unchanged to ``submit_job_super`` (``wt``
        and ``q`` are also given to ``compute_mean_std``).
    path_base, folder : concatenated to form the project directory.
    samples : mapping of sample name to its file list.  A list of length four
        passes its first two entries to Kallisto; any other length passes the
        first entry in ``--single`` mode.
    path_index : value for Kallisto's ``-i`` option.
    bootstrap : value for Kallisto's ``-b`` option (a string).
    nproc : resource specification handed to ``manager.get_bsub_arg``.
    tg : when true, reads are looked up by base name inside
        ``results_trimgalore`` (``.gz`` appended when the first file's name
        lacks it); when false the listed paths are used.

    If the first sample's file list has length two, ``compute_mean_std`` is
    launched and awaited, and ``results_kallisto/stats.txt`` is parsed (first
    line skipped; space-separated fields 0, 2 and 3 are kept as sample,
    ``-l`` value and ``-s`` value).

    Returns the value produced by ``submit_job_super``.  A single-read sample
    without an entry in the statistics raises ``KeyError``.
    """
    output = "results_kallisto"
    secure_mkdir(path_base + folder, "results_kallisto")
    print("## RNAseq pseudoalignment with Kallisto")
    stats = dict()
    # Estimate counts in single-end datasets
    if len(samples[samples.keys()[0]]) == 2:
        print("> Estimating average and STD of fragment lengh required by Kalisto on single-read data...")
        outputT = path_base + folder + "/" + output + "/stats.txt"
        tid, log = compute_mean_std(path_base, folder, samples, outputT, "1", wt, q)
        vcrparser.job_wait(log, 10)
        # The context manager closes the file even if a line is malformed.
        with open(outputT, 'r') as f:
            f.readline()  # the first line is skipped
            for line in f:
                cols = line.strip("\n").split(" ")
                stats[cols[0]] = [cols[2], cols[3]]
    print("> Writing jobs for Kallisto pseudoalignment")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
    commands = list()
    ksamp = sortbysize(samples)
    for sample in ksamp:
        files = samples[sample]
        paired = len(files) == 4
        if paired:
            args = ""
        else:
            # Built by concatenation so the two values are inserted verbatim.
            args = " --single -l " + stats[sample][0] + " -s " + stats[sample][1]
        if not tg:
            if paired:
                fnames = files[0] + " " + files[1]
            else:
                fnames = files[0]
        else:
            g = path_base + folder + "/results_trimgalore/"
            suf = ""
            if not files[0].split("/")[-1].endswith(".gz"):
                suf = ".gz"
            if paired:
                fnames = g + files[0].split("/")[-1] + suf + " " + g + files[1].split("/")[-1] + suf
            else:
                fnames = g + files[0].split("/")[-1] + suf
        cmd = config.path_kallisto + " quant -b " + bootstrap + " -i " + path_index + " -o " + path_base + folder + "/results_kallisto/" + sample + args + " " + fnames
        commands.append(cmd + sample_checker.replace("#FOLDER", path_base + folder + "/" + output).replace("#SAMPLE", sample))
    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("kallisto", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def sam2sortbam(timestamp, path_base, folder, samples, nproc, wt, q):
    """Write and submit jobs that turn STAR SAM files into sorted BAM files.

    For every sample whose ``<sample>_Aligned.out.sam`` is listed in
    ``results_star`` a Picard AddOrReplaceReadGroups command is queued that
    writes ``results_sam2sortbam/<sample>.sorted.bam`` and redirects stderr
    to ``<sample>.sorted.bam.log``.  Other samples are reported and skipped.
    Returns the value produced by ``submit_job_super``.
    """
    work_dir = path_base + folder
    output = "results_sam2sortbam"
    secure_mkdir(work_dir, output)
    print("## SAM2SORTEDBAM")
    print("> Writing jobs for SAM2SORTEDBAM...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    star_dir = work_dir + "/results_star/"
    available = os.listdir(star_dir)
    for sample in ordered_samples:
        sam_name = sample + "_Aligned.out.sam"
        in_file = star_dir + sam_name
        if sam_name not in available:
            print("Warning: [SAM2SORTEDBAM] STAR output file not found -> " + in_file)
            continue
        out_file = work_dir + "/results_sam2sortbam/" + sample + ".sorted.bam"
        com = "".join(["java -jar ", config.path_picard, "/AddOrReplaceReadGroups.jar I=", in_file,
                       " O=", out_file,
                       " SO=coordinate RGID=id RGLB=library RGPL=ILLUMINA RGPU=machine RGSM=sample 2> ",
                       out_file, ".log"])
        checker = sample_checker.replace("#FOLDER", work_dir + "/results_sam2sortbam").replace("#SAMPLE", sample)
        commands.append(com + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("sam2sortbam", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def varscan(timestamp, path_base, folder, samples, nproc, wt, q, genome_build, args):
    """Write and submit one VarScan variant-calling job per sample.

    Each job pipes ``samtools mpileup -B -f <ref>`` on the sample's sorted
    BAM into ``mpileup2cns`` with ``args`` appended, redirecting the result
    to ``results_varscan/<sample>.vcf``.  The reference is
    ``config.path_fasta`` with ``#LABEL`` replaced by ``genome_build``.
    Samples whose ``<sample>.sorted.bam`` is not listed in
    ``results_sam2sortbam`` are reported and skipped.  Returns the value
    produced by ``submit_job_super``.
    """
    ref = config.path_fasta.replace("#LABEL", genome_build)
    work_dir = path_base + folder
    output = "results_varscan"
    secure_mkdir(work_dir, output)
    print("## Variang calling with VARSCAN")
    print("> Writing jobs for VARSCAN...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    ordered_samples = sortbysize(samples)
    bam_dir = work_dir + "/results_sam2sortbam/"
    available = os.listdir(bam_dir)
    for sample in ordered_samples:
        bam_name = sample + ".sorted.bam"
        in_file = bam_dir + bam_name
        if bam_name not in available:
            print("Warning: [VARSCAN] SORTED BAM output file not found -> " + in_file)
            continue
        out_file = work_dir + "/results_varscan/" + sample + ".vcf"
        com = "".join([config.path_samtools, " mpileup -B -f ", ref, " ", in_file,
                       " | java -jar ", config.path_varscan, " mpileup2cns ", args,
                       " > ", out_file])
        checker = sample_checker.replace("#FOLDER", work_dir + "/results_varscan").replace("#SAMPLE", sample)
        commands.append(com + checker)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("varscan", work_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def picard_IS(timestamp, path_base, folder, samples, nproc, wt, q):
    """Write and submit one Picard CollectInsertSizeMetrics job per sample.

    Parameters
    ----------
    timestamp, wt, q : forwarded unchanged to ``submit_job_super``.
    path_base, folder : concatenated to form the project directory.
    samples : mapping of sample name to its input files; only the names and
        the number of samples are used here.
    nproc : resource specification handed to ``manager.get_bsub_arg``.

    For every sample whose ``<sample>.sorted.bam`` is listed in
    ``results_sam2sortbam`` a single command is queued that writes
    ``results_picard_IS/<sample>.txt`` and ``<sample>.pdf``.  Other samples
    are reported and skipped.  Returns the value produced by
    ``submit_job_super``.
    """
    output = "results_picard_IS"
    secure_mkdir(path_base + folder, output)
    print("## Picard-InsertSize")
    print("> Writing jobs for Picard InsertSize...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
    commands = list()
    ksamp = sortbysize(samples)
    proc_files = os.listdir(path_base + folder + "/results_sam2sortbam/")
    for sample in ksamp:
        in_file = path_base + folder + "/results_sam2sortbam/" + sample + ".sorted.bam"
        if sample + ".sorted.bam" not in proc_files:
            print("Warning: [Picard] Sorted BAM file not found -> " + in_file)
            continue
        # The command does not depend on any annotation, so it is queued
        # exactly once per sample.
        out_file = in_file.replace("results_sam2sortbam/", "results_picard_IS/").replace(".sorted.bam", "")
        call = "java -jar " + config.path_picard + "/CollectInsertSizeMetrics.jar I=" + in_file + " O=" + out_file + ".txt H=" + out_file + ".pdf"
        commands.append(call + sample_checker.replace("#FOLDER", path_base + folder + "/results_picard_IS").replace("#SAMPLE", sample))
    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("picard_IS", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def starfusion(timestamp, path_base, folder, samples, nproc, wt, q, genomebuild):
    """Queue STAR-Fusion on the chimeric output STAR left for each sample.

    Both ``<sample>_Chimeric.out.junction`` and ``<sample>_Chimeric.out.sam``
    must exist in ``results_star`` for a sample to be queued; if either is
    missing a warning naming the junction file is printed instead.  ``-G``
    receives ``config.path_annotation`` with ``#LABEL`` replaced by
    ``genomebuild``; results use the prefix
    ``results_star-fusion/<sample>``.  Returns the value produced by
    ``submit_job_super``.

    NOTE(review): earlier definitions named ``starfusion`` appear in this
    file; a later definition replaces an earlier one when the module loads.
    """
    project_dir = path_base + folder
    output = "results_star-fusion"
    fusion_dir = project_dir + "/results_star-fusion"
    secure_mkdir(project_dir, output)
    print("## Identification of gene fusions with star-fusion")
    print("> Writing jobs for Star-Fusion...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    by_size = sortbysize(samples)
    # Result unused, but listing a missing directory raises, so the call
    # is kept.
    proc_files = os.listdir(project_dir + "/results_star/")
    ref_file = config.path_annotation.replace("#LABEL", genomebuild)
    for sample in by_size:
        star_prefix = project_dir + "/results_star/" + sample
        in_file1 = star_prefix + "_Chimeric.out.junction"
        in_file2 = star_prefix + "_Chimeric.out.sam"
        prefix = fusion_dir + "/" + sample
        both_present = os.path.exists(in_file1) and os.path.exists(in_file2)
        if both_present:
            call = config.path_starfusion + " -J " + in_file1
            call += " -S " + in_file2
            call += " -G " + ref_file
            call += " --out_prefix " + prefix
            call += sample_checker.replace("#FOLDER", fusion_dir).replace("#SAMPLE", sample)
            commands.append(call)
        else:
            print("Warning: [Star-Fusion] STAR output file not found -> " + in_file1)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("star-fusion", project_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
def htseq(timestamp, path_base, folder, samples, path_annotation, nproc, wt, q, mode, strand, countmode):
    """Queue htseq-count on the STAR alignment of each sample.

    Output is redirected to ``results_htseq-<mode>/<sample>.tab``.  ``mode``
    equal to ``"gene"`` uses htseq-count's default feature id; any other
    value adds ``-i exon_id``.  ``strand`` is concatenated straight after the
    executable path and ``countmode`` is the value for ``-m``.  A sample is
    skipped with a warning when ``<sample>_Aligned.out.sam`` is not listed in
    ``results_star``.  Returns the value produced by ``submit_job_super``.

    NOTE(review): an earlier definition of ``htseq`` with the same signature
    appears in this file; a later definition replaces an earlier one when the
    module loads.
    """
    project_dir = path_base + folder
    output = "results_htseq-" + mode
    secure_mkdir(project_dir, output)
    print("## HTseq-count")
    print("> Writing jobs for HTseq-count " + mode + " analysis...")
    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))

    commands = []
    by_size = sortbysize(samples)
    proc_files = os.listdir(project_dir + "/results_star/")
    for sample in by_size:
        in_file = project_dir + "/results_star/" + sample + "_Aligned.out.sam"
        if sample + "_Aligned.out.sam" in proc_files:
            outputf = project_dir + "/results_htseq-" + mode + "/" + sample + ".tab"
            call = config.path_htseq + strand + " -m " + countmode
            if mode != "gene":
                call += " -i exon_id"
            call += " -q " + in_file + " " + path_annotation
            call += " > " + outputf
            call += sample_checker.replace("#FOLDER", project_dir + "/" + output).replace("#SAMPLE", sample)
            commands.append(call)
        else:
            print("Warning: [HTseq-" + mode + "] STAR output file not found -> " + in_file)

    create_scripts(nchild, commands, path_base, folder, output)
    return submit_job_super("htseq-" + mode, project_dir, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)