def gq_seq_utils_exploratory_analysis_rnaseq_denovo_filtered(self):
    """
    Exploratory analysis using the gqSeqUtils R package using a subset of filtered transcripts.

    Returns three jobs, in order:

    1. `filter_annotated_components_exploratory`: prepends a header line to the filtered
       trinotate report, then merges it (left join) with the isoform counts and lengths tables
       to produce their filtered counterparts under `filtered_assembly`.
    2. `gq_seq_utils_exploratory_analysis_rnaseq_denovo`: runs the gqSeqUtils exploratory
       analysis on the filtered counts and lengths.
    3. The Rmarkdown report rendering job.
    """
    exploratory_output_dir = os.path.join("filtered_assembly", "exploratory")

    # Filtered annotation report and the header-prefixed copy derived from it
    trinotate_annotation_report_filtered = os.path.join("trinotate", "trinotate_annotation_report.tsv" + ".isoforms_filtered.tsv")
    trinotate_annotation_report_filtered_header = "trinotate/trinotate_annotation_report.tsv.isoforms_filtered_header.tsv"

    # Isoform tables: unfiltered sources and filtered destinations
    counts_source = os.path.join("differential_expression", "isoforms.counts.matrix")
    counts_file = os.path.join("filtered_assembly", "isoforms.counts.matrix")
    lengths_file = os.path.join("differential_expression", "isoforms.lengths.tsv")
    lengths_filtered_file = os.path.join("filtered_assembly", "isoforms.lengths.tsv")

    # --- Extract filtered components from the counts and lengths files ---
    make_filter_dir = Job(command="mkdir -p " + exploratory_output_dir)
    # sed inserts a line holding a single space before the first line of the filtered report
    add_header = Job(
        [trinotate_annotation_report_filtered],
        [trinotate_annotation_report_filtered_header],
        command="sed '1s/^/ \\n/' " + trinotate_annotation_report_filtered + " > " + trinotate_annotation_report_filtered_header
    )
    filter_counts = tools.py_parseMergeCsv(
        [trinotate_annotation_report_filtered_header, counts_source],
        "\\\\t",
        counts_file,
        "\'\'",
        left_join=True,
        exclude="\'\'"
    )
    filter_lengths = tools.py_parseMergeCsv(
        [trinotate_annotation_report_filtered_header, lengths_file],
        "\\\\t",
        lengths_filtered_file,
        "\'\' transcript_id",
        left_join=True,
        exclude="\' \'"
    )
    filter_job = concat_jobs(
        [make_filter_dir, add_header, filter_counts, filter_lengths],
        name="filter_annotated_components_exploratory"
    )

    # --- gqSeqUtils function call on the filtered tables ---
    make_analysis_dir = Job(command="mkdir -p " + exploratory_output_dir)
    exploratory = gq_seq_utils.exploratory_analysis_rnaseq_denovo(
        counts_file,
        lengths_filtered_file,
        exploratory_output_dir
    )
    analysis_job = concat_jobs(
        [make_analysis_dir, exploratory],
        name="gq_seq_utils_exploratory_analysis_rnaseq_denovo"
    )

    # --- Render Rmarkdown Report ---
    report_job = rmarkdown.render(
        job_input=os.path.join(exploratory_output_dir, "index.tsv"),
        job_name="gq_seq_utils_exploratory_analysis_rnaseq_denovo_filtered_report",
        input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.gq_seq_utils_exploratory_analysis_rnaseq_filtered.Rmd"),
        render_output_dir='report',
        module_section='report',
        prerun_r='report_dir="report/filtered_assembly"; exploratory_dir="' + exploratory_output_dir + '";'
    )

    return [filter_job, analysis_job, report_job]
def differential_expression(self):
    """
    Performs differential gene expression analysis using [DESEQ](http://bioconductor.org/packages/release/bioc/html/DESeq.html) and [EDGER](http://www.bioconductor.org/packages/release/bioc/html/edgeR.html).
    Merge the results of the analysis in a single csv file. Also, performs Gene Ontology analysis for RNA-Seq denovo Assembly using the Bioconductor's R package [goseq](http://www.bioconductor.org/packages/release/bioc/html/goseq.html).
    Generates GO annotations for differential genes and isoforms expression analysis, based on associated GOTERMS generated by trinotate.

    Returns one concatenated job per item ("genes", then "isoforms") followed by the
    Rmarkdown report job, whose inputs are the output files of the preceding jobs.
    """
    output_directory = "differential_expression"
    report_dir = 'report'
    trinotate_annotation_report = os.path.join("trinotate", "trinotate_annotation_report.tsv")
    input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.differential_expression_goseq.Rmd")

    # Run DGE and merge dge results with annotations
    jobs = [
        concat_jobs(
            self.differential_expression_and_goseq_rsem(output_directory, item, trinotate_annotation_report),
            name="differential_expression_" + item
        )
        for item in ("genes", "isoforms")
    ]

    # The report depends on every output of the DGE jobs; a file already collected
    # from an earlier job is not added again.
    output_files = []
    for dge_job in jobs:
        not_yet_seen = [path for path in dge_job.output_files if path not in output_files]
        output_files += not_yet_seen

    # R statements executed before rendering the report
    contrast_names = '","'.join(contrast.name for contrast in self.contrasts)
    prerun_r = (
        'design_file="' + os.path.relpath(self.args.design.name, self.output_dir) + '"; '
        + 'report_dir="' + report_dir + '"; '
        + 'source_dir="' + output_directory + '"; '
        + 'top_n_results=10; contrasts=c("' + contrast_names + '");'
    )

    # DGE Report
    # Render Rmarkdown Report
    jobs.append(
        rmarkdown.render(
            job_input=output_files,
            job_name="differential_expression_goseq_rnaseq_denovo_report",
            input_rmarkdown_file=input_rmarkdown_file,
            render_output_dir='report',
            module_section='report',
            prerun_r=prerun_r
        )
    )

    return jobs
def gq_seq_utils_exploratory_analysis_rnaseq_denovo(self):
    """
    Exploratory analysis using the gqSeqUtils R package.

    Returns two jobs: the gqSeqUtils exploratory analysis run on the gene counts and
    lengths found under `differential_expression`, and the Rmarkdown report rendering job.
    """
    exploratory_dir = "exploratory"
    source_dir = "differential_expression"

    # gqSeqUtils function call
    analysis_job = concat_jobs(
        [
            Job(command="mkdir -p " + exploratory_dir),
            gq_seq_utils.exploratory_analysis_rnaseq_denovo(
                os.path.join(source_dir, "genes.counts.matrix"),
                os.path.join(source_dir, "genes.lengths.tsv"),
                exploratory_dir
            )
        ],
        name="gq_seq_utils_exploratory_analysis_rnaseq_denovo"
    )

    # Render Rmarkdown Report
    report_job = rmarkdown.render(
        job_input=os.path.join(exploratory_dir, "index.tsv"),
        job_name="gq_seq_utils_exploratory_analysis_rnaseq_denovo_report",
        input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.gq_seq_utils_exploratory_analysis_rnaseq.Rmd"),
        render_output_dir='report',
        module_section='report',  # TODO: this or exploratory?
        prerun_r='report_dir="report";'  # TODO: really necessary or should be hard-coded in exploratory.Rmd?
    )

    return [analysis_job, report_job]
def gq_seq_utils_exploratory_analysis_rnaseq_light(self):
    """
    Exploratory analysis using the gqSeqUtils R package adapted for RnaSeqLight.

    Returns two jobs: the gqSeqUtils exploratory analysis run on the merged kallisto gene
    abundance table, and the Rmarkdown report rendering job. Both are tagged with all samples.
    """
    exploratory_dir = "exploratory"
    abundance_file = os.path.join(self.output_dir, "kallisto/All_readsets", "all_readsets.abundance_genes.csv")

    # gqSeqUtils function call
    make_dir = Job(command="mkdir -p " + exploratory_dir)
    exploratory = gq_seq_utils.exploratory_analysis_rnaseq_light(
        abundance_file,
        config.param('gq_seq_utils_exploratory_analysis_rnaseq_light', 'genes', type='filepath'),
        exploratory_dir
    )
    analysis_job = concat_jobs(
        [make_dir, exploratory],
        name="gq_seq_utils_exploratory_analysis_rnaseq_light",
        samples=self.samples
    )

    # Render Rmarkdown Report
    report_job = rmarkdown.render(
        job_input=os.path.join(exploratory_dir, "index.tsv"),
        job_name="report.gq_seq_utils_exploratory_analysis_rnaseq",
        input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqLight.gq_seq_utils_exploratory_analysis_rnaseq_light.Rmd"),
        samples=self.samples,
        render_output_dir='report',
        module_section='report',
        prerun_r='report_dir="report";'
    )

    return [analysis_job, report_job]
def gq_seq_utils_exploratory_analysis_rnaseq(self):
    """
    Exploratory analysis using the gqSeqUtils R package.

    Returns three jobs, in order:

    1. `gq_seq_utils_exploratory_analysis_rnaseq`: creates the `exploratory` directory and
       runs the gqSeqUtils exploratory analysis on `DGE/rawCountMatrix.csv` and the
       `cuffnorm` directory, using the configured `genes` file.
    2. The Rmarkdown report rendering job for the exploratory analysis.
    3. `cuffnorm_report`: zips the cufflinks/cuffdiff/cuffnorm directories into the report
       directory and copies the cuffnorm report template next to them.
    """
    jobs = []

    # gqSeqUtils function call
    # (the per-sample fpkm/readcount list formerly built here was never used and is gone)
    jobs.append(concat_jobs([
        Job(command="mkdir -p exploratory"),
        gq_seq_utils.exploratory_analysis_rnaseq(
            os.path.join("DGE", "rawCountMatrix.csv"),
            "cuffnorm",
            config.param('gq_seq_utils_exploratory_analysis_rnaseq', 'genes', type='filepath'),
            "exploratory"
        )
    ], name="gq_seq_utils_exploratory_analysis_rnaseq"))

    # Render Rmarkdown Report
    jobs.append(
        rmarkdown.render(
            job_input=os.path.join("exploratory", "index.tsv"),
            job_name="gq_seq_utils_exploratory_analysis_rnaseq_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeq.gq_seq_utils_exploratory_analysis_rnaseq.Rmd"),
            render_output_dir='report',
            module_section='report',  # TODO: this or exploratory?
            prerun_r='report_dir="report";'  # TODO: really necessary or should be hard-coded in exploratory.Rmd?
        )
    )

    # Cuffnorm report: the template file shares its basename with the report file
    report_file = os.path.join("report", "RnaSeq.cuffnorm.md")
    jobs.append(
        Job(
            [os.path.join("cufflinks", "AllSamples", "merged.gtf")],
            [report_file],
            command="""\
mkdir -p report && \\
zip -r report/cuffAnalysis.zip cufflinks/ cuffdiff/ cuffnorm/ && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                report_template_dir=self.report_template_dir,
                basename_report_file=os.path.basename(report_file),
                report_file=report_file
            ),
            report_files=[report_file],
            name="cuffnorm_report"
        )
    )

    return jobs
def kallisto_count_matrix(self):
    """
    Merge the per-readset kallisto abundance files into count matrices and report on them.

    Returns four jobs, in order:

    1. `kallisto_count_matrix.transcripts`: builds the transcript-level matrix from every
       readset's `abundance_transcripts.tsv`.
    2. `kallisto_count_matrix.genes`: builds the gene-level matrix from every readset's
       `abundance_genes.tsv`.
    3. `report.copy_tx2genes_file`: copies the configured transcript-to-gene file into `report`.
    4. `report.kallisto_count_matrix`: renders the kallisto Rmarkdown report.

    All four jobs are tagged with the full sample list.
    """
    jobs = []

    kallisto_directory = "kallisto"
    all_readset_directory = "All_readsets"
    output_dir = os.path.join(self.output_dir, kallisto_directory, all_readset_directory)

    # Merged tables that the copy and report jobs take as inputs
    merged_abundance_files = [
        os.path.join(output_dir, "all_readsets.abundance_genes.csv"),
        os.path.join(output_dir, "all_readsets.abundance_transcripts.csv")
    ]

    # per transcripts
    input_abundance_files_transcripts = [
        os.path.join(self.output_dir, kallisto_directory, readset.sample.name, "abundance_transcripts.tsv")
        for readset in self.readsets
    ]
    job = tools.r_create_kallisto_count_matrix(
        input_abundance_files_transcripts,
        output_dir,
        "transcripts",
        "kallisto_count_matrix.transcripts"
    )
    job.samples = self.samples
    jobs.append(job)

    # per genes
    input_abundance_files_genes = [
        os.path.join(self.output_dir, kallisto_directory, readset.sample.name, "abundance_genes.tsv")
        for readset in self.readsets
    ]
    job = tools.r_create_kallisto_count_matrix(
        input_abundance_files_genes,
        output_dir,
        "genes",
        "kallisto_count_matrix.genes"
    )
    # This job aggregates every readset, so it is tagged with all samples, like the
    # transcripts job. The previous `[readset.sample]` relied on the comprehension
    # variable leaking out of its scope (last readset only on Python 2, NameError on Python 3).
    job.samples = self.samples
    jobs.append(job)

    # copy tx2genes file
    # NOTE(review): the command assumes the `report` directory already exists — confirm.
    jobs.append(
        Job(
            merged_abundance_files,
            [],
            command="""\
cp \\
  {tx2genes_file} \\
  {report_dir}""".format(
                tx2genes_file=config.param('kallisto', 'transcript2genes', type="filepath"),
                report_dir="report"
            ),
            name="report.copy_tx2genes_file",
            samples=self.samples
        )
    )

    # Create kallisto report
    jobs.append(
        rmarkdown.render(
            job_input=list(merged_abundance_files),
            job_name="report.kallisto_count_matrix",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqLight.kallisto.Rmd"),
            samples=self.samples,
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="report";'
        )
    )

    return jobs
def trinotate(self):
    """
    Perform transcriptome functional annotation and analysis using [Trinotate](http://trinotate.sourceforge.net/).
    All functional annotation data is integrated into a SQLite database and a whole annotation report is created.

    Returns two jobs: the trinotate job (tagged with all samples) and the Rmarkdown report
    job that takes the annotation report as input.
    """
    trinotate_dir = "trinotate"

    swissprot_db = os.path.basename(config.param("blastx_trinity_uniprot", "swissprot_db", type='prefixpath'))
    transdecoder_pep = os.path.join(trinotate_dir, "transdecoder", "Trinity.fasta.transdecoder.pep")
    annotation_report = os.path.join(trinotate_dir, "trinotate_annotation_report.tsv")

    # The blastp result name embeds both the peptide file basename and the database name
    swissprot_blastp = os.path.join(
        trinotate_dir,
        "blastp",
        "blastp_" + os.path.basename(transdecoder_pep) + "_" + swissprot_db + ".tsv"
    )

    annotation_job = trinotate.trinotate(
        swissprot_db=swissprot_db,
        trinity_fasta=os.path.join("trinity_out_dir", "Trinity.fasta"),
        swissprot_blastx=os.path.join("blast", "blastx_Trinity_" + swissprot_db + ".tsv"),
        transdecoder_pep=transdecoder_pep,
        transdecoder_pfam=os.path.join(trinotate_dir, "transdecoder", "Trinity.fasta.transdecoder.pfam"),
        swissprot_blastp=swissprot_blastp,
        rnammer=os.path.join(trinotate_dir, "rnammer", "Trinity.fasta.rnammer.gff"),
        signalp=os.path.join(trinotate_dir, "signalp", "signalp.gff"),
        tmhmm=os.path.join(trinotate_dir, "tmhmm", "tmhmm.out"),
        trinotate_sqlite=os.path.join(trinotate_dir, "Trinotate.sqlite"),
        trinotate_report=annotation_report
    )
    annotation_job.samples = self.samples

    # Render Rmarkdown Report
    report_job = rmarkdown.render(
        job_input=annotation_report,
        job_name="trinotate_report",
        input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.trinotate.Rmd"),
        render_output_dir='report',
        module_section='report',
        prerun_r='report_dir="report"; source_dir="trinotate";'
    )

    return [annotation_job, report_job]
def trinotate(self):
    """
    Perform transcriptome functional annotation and analysis using [Trinotate](http://trinotate.sourceforge.net/).
    All functional annotation data is integrated into a SQLite database and a whole annotation report is created.

    Returns two jobs: the trinotate job and the Rmarkdown report job that takes the
    annotation report as input.
    """
    swissprot_db = os.path.basename(config.param("blastx_trinity_uniprot", "swissprot_db", type='prefixpath'))
    transdecoder_pep = os.path.join("trinotate", "transdecoder", "Trinity.fasta.transdecoder.pep")

    # Keyword arguments handed to the trinotate wrapper, gathered in one place
    trinotate_arguments = {
        "swissprot_db": swissprot_db,
        "trinity_fasta": os.path.join("trinity_out_dir", "Trinity.fasta"),
        "swissprot_blastx": os.path.join("blast", "blastx_Trinity_" + swissprot_db + ".tsv"),
        "transdecoder_pep": transdecoder_pep,
        "transdecoder_pfam": os.path.join("trinotate", "transdecoder", "Trinity.fasta.transdecoder.pfam"),
        "swissprot_blastp": os.path.join("trinotate", "blastp", "blastp_" + os.path.basename(transdecoder_pep) + "_" + swissprot_db + ".tsv"),
        "rnammer": os.path.join("trinotate", "rnammer", "Trinity.fasta.rnammer.gff"),
        "signalp": os.path.join("trinotate", "signalp", "signalp.gff"),
        "tmhmm": os.path.join("trinotate", "tmhmm", "tmhmm.out"),
        "trinotate_sqlite": os.path.join("trinotate", "Trinotate.sqlite"),
        "trinotate_report": os.path.join("trinotate", "trinotate_annotation_report.tsv"),
    }

    jobs = [trinotate.trinotate(**trinotate_arguments)]

    # Render Rmarkdown Report
    jobs.append(
        rmarkdown.render(
            job_input=os.path.join("trinotate", "trinotate_annotation_report.tsv"),
            job_name="trinotate_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.trinotate.Rmd"),
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="report"; source_dir="trinotate";'
        )
    )

    return jobs
def verify_bam_id(self):
    """
    verifyBamID is a software that verifies whether the reads in particular file match previously known
    genotypes for an individual (or group of individuals), and checks whether the reads are contaminated
    as a mixture of two samples. verifyBamID can detect sample contamination and swaps when external
    genotypes are available. When external genotypes are not available, verifyBamID still robustly
    detects sample swaps.

    Returns one directory-creation job, one verifyBamID job per sample, and a final
    Rmarkdown report job that takes every sample's `.selfSM` file as input.
    """
    # Known variants file
    population_AF = config.param('verify_bam_id', 'population_AF', required=False)
    known_variants_annotated = config.param('verify_bam_id', 'verifyBamID_variants_file', required=False)

    verify_bam_id_directory = "verify_bam_id"
    variants_directory = "variants"

    jobs = []
    verify_bam_results = []
    # Bound before the loop so the report job can still be built when there are no
    # samples (previously this raised NameError).
    coverage_bed = None

    jobs.append(
        Job(
            [known_variants_annotated],
            [variants_directory, verify_bam_id_directory],
            command="mkdir -p " + variants_directory + " " + verify_bam_id_directory,
            name="verify_bam_id_create_directories"
        )
    )

    for sample in self.samples:
        alignment_directory = os.path.join("alignment", sample.name)

        # Candidate BAMs handed to select_input_files, in this order
        candidate_input_files = [
            [os.path.join(alignment_directory, sample.name + ".sorted.dup.recal.bam")],
            [os.path.join(alignment_directory, sample.name + ".sorted.dedup.bam")],
            [os.path.join(alignment_directory, sample.name + ".sorted.mdup.bam")],  # this one is for RnaSeq pipeline
        ]
        [input_bam] = self.select_input_files(candidate_input_files)

        output_prefix = os.path.join(verify_bam_id_directory, sample.name)
        coverage_bed = bvatools.resolve_readset_coverage_bed(sample.readsets[0])

        # Run verifyBamID
        job = verify_bam_id.verify(input_bam, known_variants_annotated, output_prefix)
        job.name = "verify_bam_id_" + sample.name
        job.samples = [sample]
        jobs.append(job)

        verify_bam_results.append(output_prefix + ".selfSM")

    # Coverage bed is null if whole genome experiment.
    # The value reported is the one resolved for the last sample's first readset.
    target_bed = coverage_bed if coverage_bed else ""

    # Render Rmarkdown Report
    jobs.append(
        rmarkdown.render(
            job_input=verify_bam_results,
            job_name="verify_bam_id_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "Illumina.verify_bam_id.Rmd"),
            render_output_dir='report',
            module_section='report',
            prerun_r='source_dir="' + verify_bam_id_directory + '"; report_dir="report" ; params=list(verifyBamID_variants_file="' + known_variants_annotated + '", dbnsfp_af_field="' + population_AF + '", coverage_bed="' + target_bed + '");'
        )
    )

    return jobs
def differential_expression_filtered(self):
    """
    Differential Expression and GOSEQ analysis based on filtered transcripts and genes.

    Returns, in order:

    1. `differential_expression_filtered_get_trinotate`: derives header-prefixed lists of
       filtered genes and isoforms from the filtered trinotate annotation report.
    2. One `differential_expression_filtered_<item>` job per item ("genes", then "isoforms"):
       merges the filtered list with the item's count matrix (left join), copies the
       item's lengths file into the filtered directory, then runs the DGE and goseq jobs.
    3. The Rmarkdown report job, whose inputs are the output files of all preceding jobs.
    """
    output_directory = os.path.join("filtered_assembly", "differential_expression")
    source_directory = "differential_expression"
    jobs = []

    trinotate_annotation_report = os.path.join("trinotate", "trinotate_annotation_report.tsv")
    report_dir = os.path.join("report", "filtered_assembly")
    input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.differential_expression_goseq_filtered.Rmd")

    # Filter input files
    trinotate_annotation_report_filtered = trinotate_annotation_report + ".isoforms_filtered.tsv"
    trinotate_annotation_report_filtered_header = {
        "isoforms": trinotate_annotation_report + ".isoforms_filtered_header.tsv",
        "genes": trinotate_annotation_report + ".genes_filtered_header.tsv"
    }
    # Second join column name passed to the merge, per item
    counts_ids = {
        'genes': "Genes",
        'isoforms': "Isoforms"
    }
    # (the `filters_trinotate` configuration value formerly read here was never used)

    # Create the files containing filtered isoforms and genes with headers
    jobs.append(concat_jobs([
        Job(command="mkdir -p " + output_directory),
        # Genes: keep the first two "_"-separated fields of each line, collapse
        # adjacent duplicates, then prepend the header line
        Job(
            [trinotate_annotation_report_filtered],
            [trinotate_annotation_report_filtered_header["genes"]],
            command="cat " + trinotate_annotation_report_filtered + " | awk 'BEGIN{OFS=\"_\";FS=\"_\"}{print $1,$2}' | uniq | sed '1s/^/ \\n/' " + " > " + trinotate_annotation_report_filtered_header["genes"]
        ),
        # Isoforms: only prepend the header line
        Job(
            [trinotate_annotation_report_filtered],
            [trinotate_annotation_report_filtered_header["isoforms"]],
            command="sed '1s/^/ \\n/' " + trinotate_annotation_report_filtered + " > " + trinotate_annotation_report_filtered_header["isoforms"]
        )
    ], name="differential_expression_filtered_get_trinotate"))

    # Run DGE and merge dge results with annotations
    for item in "genes", "isoforms":
        matrix = os.path.join(output_directory, item + ".counts.matrix.symbol")
        lengths_source = os.path.join(source_directory, item + ".lengths.tsv.noheader.tsv")
        lengths_filtered = os.path.join(output_directory, item + ".lengths.tsv.noheader.tsv")

        job = tools.py_parseMergeCsv(
            [
                trinotate_annotation_report_filtered_header[item],
                os.path.join(source_directory, item + ".counts.matrix.symbol")
            ],
            "\\\\t",
            matrix,
            "\'\' " + counts_ids[item],
            left_join=True,
            exclude="\' \'"
        )

        jobs.append(concat_jobs([
            job,
            Job(
                [lengths_source],
                [lengths_filtered],
                command="cp " + lengths_source + " " + lengths_filtered
            ),
            concat_jobs(
                self.differential_expression_and_goseq_rsem(output_directory, item, trinotate_annotation_report),
                name="differential_expression_filtered_" + item
            )
        ], name="differential_expression_filtered_" + item))

    # Dependencies for report: every output of the jobs above; a file already
    # collected from an earlier job is not added again.
    output_files = []
    for job_item in jobs:
        output_files.extend([output_file for output_file in job_item.output_files if output_file not in output_files])

    # DGE Report
    # Render Rmarkdown Report
    jobs.append(
        rmarkdown.render(
            job_input=output_files,
            job_name="differential_expression_goseq_rnaseq_denovo_filtered_report",
            input_rmarkdown_file=input_rmarkdown_file,
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="' + report_dir + '"; source_dir="' + output_directory + '"; ' + 'top_n_results=10; contrasts=c("' + '","'.join(contrast.name for contrast in self.contrasts) + '");'
        )
    )

    return jobs