def get_picard_string(basedir, bam_file, aligned_files_dir, base_filename):
    metrics_dir = aligned_files_dir + "picard_metrics/"
    cluster.check_dir(metrics_dir)
    # sge_filename is built here but is currently unused in this function
    sge_filename = basedir + "sge_files/picard_" + base_filename + ".sge"
    picard_string1 = 'PICARD_SETTINGS="VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000"'
    picard_string2 = ("java -Xmx16G -jar ${PICARD_ROOT}/MarkDuplicates.jar $PICARD_SETTINGS"
                      " REMOVE_DUPLICATES=true ASSUME_SORTED=false CREATE_INDEX=true"
                      " INPUT=" + bam_file +
                      " OUTPUT=" + aligned_files_dir + base_filename + ".dedup.bam"
                      " METRICS_FILE=" + metrics_dir + base_filename + ".dups.txt")
    return picard_string1, picard_string2
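# A usage sketch, following the qsub pattern used in the pipeline functions
# below; the process name, module list, and thread count here are assumptions,
# not values taken from this module:
#   picard_string1, picard_string2 = get_picard_string(basedir, bam_file, aligned_files_dir, sample)
#   cluster.qsub_sge_file(basedir, "markdups", ["picard-tools/1.88"], sample,
#                         [picard_string1, picard_string2], "4")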
def get_htseqcount_string(basedir, sorted_bam_filename, gtf_file, sample):
    htseq_counts_dir = basedir + "htseq_counts_third/"
    cluster.check_dir(htseq_counts_dir)
    # use this with a GTF file:
    htseq_string = ("htseq-count --stranded=no --format=bam " + sorted_bam_filename + " " + gtf_file +
                    " > " + htseq_counts_dir + sample + "_counts.txt")
    # use this with a GFF file:
    #htseq_string = ("htseq-count --stranded=no --format=bam --idattr=ID --type=gene " +
    #                sorted_bam_filename + " " + gtf_file +
    #                " > " + htseq_counts_dir + sample + "_counts.txt")
    return htseq_string
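# A usage sketch, assuming a sorted BAM and the qsub wrapper used elsewhere in
# this module; the module name and thread count are assumptions:
#   htseq_string = get_htseqcount_string(basedir, sorted_bam, annotation, sample)
#   cluster.qsub_sge_file(basedir, "htseq", ["htseq"], sample, [htseq_string], "1")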
def tuxedo_pipeline_tophat_remdup_cufflinks(basedir, files_dictionary, files_dir, annotation, reference, reference_fasta):
    threads = "8"
    process_name = "tophat"
    #module_list = ["cufflinks/2.2.0"]
    module_list = ["bowtie2/2.1.0", "tophat/2.0.9", "cufflinks/2.2.0", "samtools", "picard-tools/1.88"]
    # make the directories for file output:
    #tuxedo_files_dir = basedir + "tuxedo_4_Steve_gtf/"
    #tuxedo_files_dir = basedir + "tuxedo_3_UCSC/"
    #tuxedo_files_dir = basedir + "tuxedo_2/"
    tuxedo_files_dir = basedir + "tuxedo/"
    cluster.check_dir(tuxedo_files_dir)
    tophat_dir = tuxedo_files_dir + "tophat/"
    cluster.check_dir(tophat_dir)
    cufflinks_dir = tuxedo_files_dir + "cufflinks/"
    cluster.check_dir(cufflinks_dir)
    transcripts_output = {}            # sample -> transcripts.gtf (consumed later by cuffmerge/cuffquant)
    duplicates_removed_bam_files = {}  # sample -> deduplicated BAM
    for sample in files_dictionary.keys():
        file_list = files_dictionary[sample]  # [R1 fastq, R2 fastq]
        fastq_filename1 = file_list[0]
        fastq_filename2 = file_list[1]
        base_filename = files.get_base_filename(fastq_filename1)
        # Separate directories have to be made under each process dir because the
        # output files are all generically named (e.g. transcripts.gtf, accepted_hits.bam).
        # Tuxedo pipeline:
        # tophat --> remove duplicates (2 picard-tools strings) --> cufflinks
        #   --> cuffcompare --> cuffmerge --> cuffquant --> cuffdiff
        # 1. tophat
        tophat_outputdir = tophat_dir + base_filename + "/"
        cluster.check_dir(tophat_outputdir)
        fastq_files_string = files.get_file_string(file_list)
        #print fastq_files_string[1:]
        tophat_string = tuxedo.get_tophat_string_annotation(reference, annotation, fastq_files_string[1:], tophat_outputdir)
        tophat_output = tuxedo.get_tophat_output(tophat_outputdir)
        # 2. remove duplicates with picard-tools
        picard_string1, picard_string2 = samtools_picard.get_picard_string(basedir, tophat_output, tophat_outputdir, base_filename)
        duplicates_removed_bam = tuxedo.get_dupremoved_output(base_filename, tophat_outputdir)
        # 3. cufflinks
        cufflinks_outputdir = cufflinks_dir + base_filename + "/"
        cluster.check_dir(cufflinks_outputdir)
        cufflinks_output = cufflinks_outputdir + "transcripts.gtf"
        transcripts_output[sample] = cufflinks_output
        duplicates_removed_bam_files[sample] = duplicates_removed_bam
        cufflinks_string = tuxedo.get_cufflinks_string(cufflinks_outputdir, duplicates_removed_bam)
        process_list = [tophat_string, picard_string1, picard_string2, cufflinks_string]
        #cluster.qsub_sge_file(basedir, process_name, module_list, base_filename, process_list, threads)
    #return transcripts_output
    return duplicates_removed_bam_files
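# files.get_base_filename lives in another module. A minimal sketch of the
# behavior this pipeline assumes (strip the directory, the read suffix, and
# the fastq extension so that R1/R2 of one sample resolve to the same name);
# the exact suffix handling is a guess:
import os

def get_base_filename_sketch(fastq_filename):
    name = os.path.basename(fastq_filename)
    name = name.replace(".fastq.gz", "").replace(".fastq", "")
    # drop a trailing _R1/_R2 read indicator if present
    if name.endswith("_R1") or name.endswith("_R2"):
        name = name[:-3]
    return name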
def run_cuffdiff(basedir, cxb_output, reference_fasta, cuffmerge_file):
    cuffdiff_outputdir = basedir + "tuxedo/cuffdiff/"
    cluster.check_dir(cuffdiff_outputdir)
    Mutant, Control = get_lists_of_files_by_group(cxb_output)
    labels = "Mutant,Control"
    groupsoffiles = groups_files(Mutant, Control)
    cuffdiff_string = tuxedo.get_cuffdiff_string(basedir, cuffdiff_outputdir, groupsoffiles, reference_fasta, labels, cuffmerge_file)
    threads = "32"
    process_name = "cuffdiff"
    module_list = ["cufflinks/2.2.0"]
    base_filename = "dasen_MutantvsControl"
    process_list = [cuffdiff_string]
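# The two grouping helpers used above are not defined in this section. A
# minimal sketch of what they are assumed to do, based on how cuffdiff expects
# its inputs (one comma-joined list of .cxb files per condition, with the
# condition lists separated by a space); the "Mutant" sample-name test is a
# guess and would need to match the real naming scheme:
def get_lists_of_files_by_group(cxb_output):
    # split the sample -> cxb dictionary into two lists by condition
    mutant, control = [], []
    for sample, cxb_filename in cxb_output.items():
        if sample.startswith("Mutant"):
            mutant.append(cxb_filename)
        else:
            control.append(cxb_filename)
    return mutant, control

def groups_files(Mutant, Control):
    # cuffdiff takes replicates comma-joined, conditions space-separated
    return ",".join(Mutant) + " " + ",".join(Control)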
def tuxedo_pipeline_cuffmerge(basedir, annotation, reference, reference_fasta, assembly_filename):
    threads = "8"
    process_name = "cuffmerge"
    module_list = ["cufflinks/2.2.0"]
    base_filename = "dasen2"
    cuffmerge_dir = basedir + "tuxedo/cufflinks/merged_asm/"
    cluster.check_dir(cuffmerge_dir)
    # 5. cuffmerge - THIS HAS TO BE RUN AFTER CUFFLINKS, AS A SEPARATE STEP, JUST ONCE!
    # It produces a GTF file that contains all of the merged assemblies.
    cuffmerge_string = tuxedo.get_cuffmerge_string(reference_fasta, annotation, assembly_filename, cuffmerge_dir)
    cuffmerge_output = cuffmerge_dir + "merge/merged.gtf"
    process_list = [cuffmerge_string]
    #cluster.qsub_sge_file(basedir, process_name, module_list, base_filename, process_list, threads)
    return cuffmerge_output
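# cuffmerge's assembly_filename argument is a plain-text manifest listing one
# cufflinks transcripts.gtf path per line. A minimal sketch of building that
# manifest from the transcripts_output dictionary returned by the
# tophat/cufflinks step; write_assembly_list is a hypothetical helper, not
# part of this module, and the manifest path is an assumption:
def write_assembly_list(basedir, transcripts_output):
    assembly_filename = basedir + "tuxedo/assemblies.txt"
    with open(assembly_filename, "w") as manifest:
        for sample in sorted(transcripts_output.keys()):
            manifest.write(transcripts_output[sample] + "\n")
    return assembly_filename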
def get_RnaSeqMetrics(basedir, bam_file, aligned_files_dir, base_filename):
    # picard CollectRnaSeqMetrics
    # http://broadinstitute.github.io/picard/picard-metric-definitions.html#RnaSeqMetrics
    metrics_dir = aligned_files_dir + "picard_metrics/"
    cluster.check_dir(metrics_dir)
    metrics_txt = metrics_dir + base_filename + ".txt"
    metrics_pdf = metrics_dir + base_filename + ".pdf"
    process_list = []
    # settings
    process_list.append('REFFLAT="/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/refFlat.txt.gz"')
    process_list.append('PICARD_SETTINGS="VERBOSITY=WARNING QUIET=true VALIDATION_STRINGENCY=LENIENT MAX_RECORDS_IN_RAM=2500000"')
    process_list.append("java -Xmx16G -jar ${PICARD_ROOT}/CollectRnaSeqMetrics.jar \\")
    process_list.append("$PICARD_SETTINGS \\")
    process_list.append("REF_FLAT=${REFFLAT} \\")
    process_list.append("STRAND_SPECIFICITY=NONE \\")
    process_list.append("INPUT=" + bam_file + " \\")
    process_list.append("CHART_OUTPUT=" + metrics_pdf + " \\")
    process_list.append("OUTPUT=" + metrics_txt)
    return process_list
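# A usage sketch, following the qsub pattern used elsewhere in this module;
# the process name, module version, and thread count are assumptions:
#   process_list = get_RnaSeqMetrics(basedir, bam_file, aligned_files_dir, sample)
#   cluster.qsub_sge_file(basedir, "rnaseqmetrics", ["picard-tools/1.88"], sample, process_list, "4")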
with open(alignment_table_filename, "w") as datafile:
    datafile.write("\t".join(header))
    datafile.write("\n")
    for sample in sample_data_dictionary:
        print "Sample:", sample
        filename = sample_data_dictionary[sample]
        data_list = tuxedo.get_bowtie1_alignment_data(filename)
        print data_list
        duplicates_removed = get_duplicates_removed(aligned_files_dir, sample)
        datafile.write(sample + "\t")
        datafile.write(duplicates_removed + "\t")
        datafile.write("\t".join(data_list))
        datafile.write("\n")
# the with-block closes the file; no explicit datafile.close() is needed
print "Alignment stats written:", alignment_table_filename

filesdir = "/ifs/home/cohenl06/data/sequencing/dasen/thoracic/fastq/"
basedir = "/ifs/home/cohenl06/data/sequencing/dasen/thoracic/htseq/"
reference = "/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/BowtieIndex/genome"
annotation = "/local/data/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
#reference = "/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/BowtieIndex/genome"
#annotation = "/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
aligned_files_dir = basedir + "bowtie1_aligned_files/"
cluster.check_dir(aligned_files_dir)
fileslist_all = os.listdir(filesdir)
fileslist = get_fileslist(fileslist_all, filesdir)
print fileslist
# gunzip first if the fastq files are still compressed:
#gunzip_files_list = gunzip_files(basedir, fileslist)
#files_dictionary = get_file_dictionaries(gunzip_files_list)
files_dictionary = get_file_dictionaries(fileslist)
print files_dictionary
run_bowtie1_htseq(files_dictionary, annotation, reference, basedir, aligned_files_dir)
#get_alignment_data(get_sample_data(), aligned_files_dir)
def tuxedo_pipeline_cuffcompare_cuffquant(basedir, transcripts_output, annotation, reference, reference_fasta, cuffmerge_file):
    threads = "4"
    process_name = "cuffquant"
    module_list = ["cufflinks/2.2.0"]
    # make the directories for file output:
    tuxedo_files_dir = basedir + "tuxedo/"
    cluster.check_dir(tuxedo_files_dir)
    tophat_dir = tuxedo_files_dir + "tophat/"
    cluster.check_dir(tophat_dir)
    cufflinks_dir = tuxedo_files_dir + "cufflinks/"
    cluster.check_dir(cufflinks_dir)
    cuffcompare_dir = tuxedo_files_dir + "cuffcompare/"
    cluster.check_dir(cuffcompare_dir)
    cuffdiff_outputdir = tuxedo_files_dir + "cuffdiff/"
    cluster.check_dir(cuffdiff_outputdir)
    cuffquant_dir = tuxedo_files_dir + "cuffquant/"
    cluster.check_dir(cuffquant_dir)
    cxb_output = {}
    for sample in transcripts_output.keys():
        base_filename = sample
        tophat_outputdir = tophat_dir + base_filename + "/"
        duplicates_removed_bam = tuxedo.get_dupremoved_output(base_filename, tophat_outputdir)
        # 4. cuffcompare (currently disabled)
        #cuffcompare_outputdir = cuffcompare_dir + base_filename + "/"
        #cluster.check_dir(cuffcompare_outputdir)
        #cuffcompare_string = tuxedo.get_cuffcompare_string(annotation, cuffcompare_outputdir, cufflinks_outputdir)
        # 6. cuffquant
        cuffquant_outputdir = cuffquant_dir + base_filename + "/"
        cluster.check_dir(cuffquant_outputdir)
        cuffquant_string = tuxedo.get_cuffquant_string(cuffquant_outputdir, duplicates_removed_bam, cuffmerge_file)
        # populate a dictionary with cuffquant .cxb filenames, to be used by cuffdiff
        cxb_filename = tuxedo.get_cuffquant_cxb_output(cuffquant_outputdir)
        #cxb_output[sample] = duplicates_removed_bam
        cxb_output[sample] = cxb_filename
        # pipeline: tophat --> remove duplicates (2 picard-tools strings) --> cufflinks
        #   --> cuffcompare --> cuffmerge --> cuffquant --> cuffdiff
        process_list = [cuffquant_string]
        #cluster.qsub_sge_file(basedir, process_name, module_list, base_filename, process_list, threads)
    return cxb_output
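# A sketch of how the later pipeline steps chain together, using the function
# signatures defined above; assembly_filename would be a cuffmerge manifest
# like the one built in the cuffmerge section:
#   cuffmerge_file = tuxedo_pipeline_cuffmerge(basedir, annotation, reference, reference_fasta, assembly_filename)
#   cxb_output = tuxedo_pipeline_cuffcompare_cuffquant(basedir, transcripts_output, annotation, reference, reference_fasta, cuffmerge_file)
#   run_cuffdiff(basedir, cxb_output, reference_fasta, cuffmerge_file)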
## reference files:
reference = "/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/Bowtie2Index/genome"
annotation = "/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Annotation/Genes/genes.gtf"
reference_fasta = "/phoenix/iGenomes/Mus_musculus/Ensembl/NCBIM37/Sequence/Bowtie2Index/genome.fa"
#reference = "/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome"
#annotation = "/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Annotation/Genes/genes.gtf"
#reference_fasta = "/phoenix/iGenomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome.fa"
#annotation = "/ifs/data/sequence/share/GTC/Steve/mm9.gtf"

# fastq files:
results_files_dir = "/ifs/data/sequence/results/dasenlab/2014-08-27/fastq/"
files_dir = "/ifs/home/cohenl06/data/sequencing/dasen/merged/"
basedir = "/ifs/home/cohenl06/data/sequencing/dasen/"
cluster.check_dir(files_dir)

# the list of raw files we're working with:
fileslist = os.listdir(results_files_dir)

# take the raw files from the original directory, merge them, and write the
# output to a subdirectory in my home directory.
# TODO: write a separate function so that merge_files doesn't have to be run
# each time; files need to be indexed in order, with R1 and R2 per sample.
files = dasen_files.merge_files(results_files_dir, files_dir, fileslist)

#transcripts_files = tuxedo_dasen.tuxedo_pipeline_tophat_remdup_cufflinks(basedir, files, files_dir, annotation, reference, reference_fasta)
duplicates_removed_bam = tuxedo_dasen.tuxedo_pipeline_tophat_remdup_cufflinks(basedir, files, files_dir, annotation, reference, reference_fasta)
print duplicates_removed_bam
igv.run_igvtools(basedir, duplicates_removed_bam)