def buildAllStats(infiles, outfile):
    """Paste several stats tables side by side into a single file.

    Arguments
    ---------
    infiles : list
        Filenames handed to ``paste``, in the order given.
    outfile : string
        Filename that receives the output of ``paste``.
    """
    # join() accepts any iterable of strings, no intermediate list is needed
    statement = '''paste %s > %s''' % (" ".join(infiles), outfile)
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)
def buildCDS(infile, outfile):
    """Extract CDS features of protein-coding genes from a gene set.

    The gene set is filtered with ``--filter-method=proteincoding``,
    reduced to lines whose third column is ``CDS`` and passed through
    ``gtf2gtf --method=remove-duplicates --duplicate-feature=gene``.
    Only the coding parts of exons are output - UTRs are removed.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Output filename in :term:`gtf` format (gzip-compressed).
    """
    # the %(...)s placeholders are left for P.run to resolve
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter
    --filter-method=proteincoding
    --log=%(outfile)s.log
    | awk '$3 == "CDS"'
    | cgat gtf2gtf
    --method=remove-duplicates
    --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run(statement)
def buildAnnotatorSlicedSegments(tmpdir, outfile, track, slice):
    """Write the annotator segments for one slice of a track.

    Arguments
    ---------
    tmpdir : string
        Directory in which the file ``segments`` is created.
    outfile : string
        Only used to name the log file (``<outfile>.log``).
    track : string
        Prefix of the ``<track>_gtf`` and ``<track>_annotation`` tables.
    slice : string
        ``all`` selects every joined row; any other value adds the
        condition ``is_<slice>`` to the query.

    Returns
    -------
    string or None
        Path of the segments file, or ``None`` if that file is empty.
    """
    tmpsegments = os.path.join(tmpdir, "segments")
    # not referenced below; presumably read by P.run from the caller's
    # local variables - TODO confirm
    to_cluster = True

    where = "'1'" if slice == "all" else "is_%s" % (slice,)

    statement = '''
    %(cmd-sql)s %(database)s
    "SELECT g.* FROM %(track)s_gtf as g, %(track)s_annotation AS a WHERE a.gene_id = g.gene_id AND %(where)s"
    | cgat gtf2tsv --invert
    | cgat gff2annotator2tsv
    --remove-regex='%(annotator_remove_pattern)s'
    --log=%(outfile)s.log
    --section=segments
    > %(tmpsegments)s
    '''
    P.run(statement)

    # an empty result is reported as "no segments" instead of a path
    if os.path.getsize(tmpsegments) == 0:
        return None
    return tmpsegments
def loadPeptideSequences(infile, outfile):
    """Load an ENSEMBL peptide file into the database.

    Empty sequences are removed (``--filter-method=min-length=1``; see
    for example transcript:ENSMUST00000151316, ENSMUSP00000118372).

    The created table contains the columns ``protein_id``, ``length``
    and ``sequence``.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format.
    outfile : string
        Filename with logging information. The table name is derived
        from ``outfile``.
    """
    # The two option literals used to be concatenated without a
    # separating space ("--add-protein_id--map=..."). They are now two
    # separate options and the index option follows the ``--add-index=``
    # form used by the other load tasks in this file.
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=protein_id "
        "--map=protein_id:str")

    # The first perl call truncates header lines at the first space. The
    # condition is a real regex match now; the previous ``if ("^>")``
    # tested a constant string and was therefore true for every line.
    statement = '''gunzip < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | cgat fasta2fasta --method=filter --filter-method=min-length=1
    | cgat fasta2table --section=length --section=sequence
    | perl -p -e 's/id/protein_id/'
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
def buildNonCodingExons(infile, outfile):
    """Extract exons of non-coding genes from an ENSEMBL gene set.

    The protein-coding filter is applied with ``--invert-filter``, the
    result is reduced to lines whose third column is ``exon`` and then
    passed through ``gtf2gtf --method=remove-duplicates
    --duplicate-feature=gene``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Output filename in :term:`gtf` format (gzip-compressed).
    """
    # the %(...)s placeholders are left for P.run to resolve
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter
    --filter-method=proteincoding
    --invert-filter
    --log=%(outfile)s.log
    | awk '$3 == "exon"'
    | cgat gtf2gtf
    --method=remove-duplicates
    --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run(statement)
def buildPromotorRegions(infile, outfile, promotor_size=1000):
    """Annotate promotor regions from a reference gene set.

    The gene set is sanitized against the genome
    (``gff2gff --method=sanitize``) and promotor regions are built with
    ``gtf2gff --method=promotors``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Filename in :term:`gff` format (gzip-compressed).
    promotor_size : int
        Size of the promotor region (nucleotides upstream of TSS).
        It reaches the command through the ``%(promotor_size)s``
        placeholder.
    """
    # the backslash at the end of the --promotor-size line is a string
    # line continuation: it joins that line with the next one
    statement = """
    gunzip < %(infile)s
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | cgat gtf2gff
    --method=promotors
    --promotor-size=%(promotor_size)s \
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    P.run(statement)
def sortGTF(infile, outfile, order="contig+gene", job_memory="8G"):
    """Sort a gtf file.

    Arguments
    ---------
    infile : string
        Geneset in :term:`gtf` format. Read with ``zcat`` if the name
        ends in ``.gz``, otherwise with ``cat``.
    outfile : string
        Geneset in :term:`gtf` format. Written through ``gzip`` if the
        name ends in ``.gz``, otherwise through ``cat``.
    order : string
        Sort order. See :mod:`scripts/gtf2gtf` for valid options for
        `order`.
    job_memory : string
        Forwarded to ``P.run`` as ``job_memory``.
    """
    # "cat" is a pass-through (wasteful, but keeps the pipeline uniform)
    uncompress = "zcat" if infile.endswith(".gz") else "cat"
    compress = "gzip" if outfile.endswith(".gz") else "cat"

    statement = '''%(uncompress)s %(infile)s
    | cgat gtf2gtf
    --method=sort --sort-order=%(order)s --log=%(outfile)s.log
    | %(compress)s > %(outfile)s'''
    P.run(statement, job_memory=job_memory)
def buildBigWig(infile, outfile):
    """Build a :term:`bigWig` file from a :term:`bam` alignment file.

    Runs ``cgat bam2wiggle --output-format=bigwig``; the tool's standard
    output is written to ``<outfile>.log``.

    Parameters
    ----------
    infile : str
        Input filename in :term:`bam` format
    outfile : str
        Output filename in :term:`bigwig` format
    """
    # wigToBigWig observed to use 16G
    # (not referenced below; presumably read by P.run from the caller's
    # local variables - TODO confirm)
    job_memory = "16G"

    statement = '''cgat bam2wiggle
    --output-format=bigwig
    %(bigwig_options)s
    %(infile)s
    %(outfile)s
    > %(outfile)s.log'''
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)
def loadRepeats(infile, outfile):
    """Load genomic locations of repeats into the database.

    The input is converted to bed with the name column set to ``class``.
    Lines containing ``#`` are dropped and the first four columns are
    loaded as ``contig``, ``start``, ``stop`` and ``class``, with an
    index on ``class``.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations
        (read with ``zcat``).
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.
    """
    tablename = P.to_table(outfile)
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])
def runFastqScreen(infiles, outfile):
    """Run FastqScreen on the input files.

    A ``fastq_screen.conf`` file is written into a temporary directory
    with one ``DATABASE`` line per ``PARAMS`` entry whose key starts
    with ``fastq_screen_database``. The temporary directory is removed
    and `outfile` is touched after the run.

    Arguments
    ---------
    infiles : string or list
        Input passed on to ``PipelineMapping.FastqScreen.build``.
    outfile : string
        Filename that is touched once the run has finished.

    Raises
    ------
    ValueError
        If ``PARAMS['fastq_screen_options']`` does not contain exactly
        one ``--threads <n>`` option.
    """
    # variables required for statement built by FastqScreen()
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    thread_flags = re.findall(r'--threads \d+',
                              PARAMS['fastq_screen_options'])
    if len(thread_flags) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    job_threads = int(re.sub(r'--threads ', '', thread_flags[0]))

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.yml
    conf_path = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_path, "w") as conf_file:
        for param_name, param_value in list(PARAMS.items()):
            if not param_name.startswith("fastq_screen_database"):
                continue
            # the first 22 characters of the key are dropped to form
            # the database name
            conf_file.write(
                "DATABASE\t%s\t%s\n" % (param_name[22:], param_value))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles, ), outfile)
    P.run(statement, job_memory="8G")

    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def callMethylationStatus(infile, outfile):
    """Run ``bismark_methylation_extractor`` and gzip its context files.

    Arguments
    ---------
    infile : string
        Alignment file. Names ending in ``bismark_bt2.bam`` or
        ``bismark_bt.bam`` are run with ``--single-end``, all others
        with ``--paired-end``.
    outfile : string
        Its basename without the ``.bismark.cov`` suffix is used as the
        sample name of the three context files that are gzipped.
    """
    if infile.endswith(("bismark_bt2.bam", "bismark_bt.bam")):
        options = " --single-end "
    else:
        options = " --paired-end "

    if PARAMS["bismark_extraction_options"]:
        options += PARAMS["bismark_extraction_options"]

    outdir = "methylation.dir"
    sample = P.snip(os.path.basename(outfile), ".bismark.cov")

    # Build each context filename from its own prefix. Deriving CHG/CHH
    # by substituting "CpG" in the CpG path would also rewrite any "CpG"
    # that occurs inside the sample name.
    CG, CHG, CHH = (
        "methylation.dir/%s_context_%s.txt" % (context, sample)
        for context in ("CpG", "CHG", "CHH"))

    index_dir = PARAMS["bismark_index_dir"]
    genome = PARAMS["bismark_genome"]

    statement = '''bismark_methylation_extractor %(options)s
    --comprehensive --output %(outdir)s --counts
    --cytosine_report --bedGraph
    --genome_folder %(index_dir)s/%(genome)s/
    %(infile)s;
    gzip -f %(CG)s;
    gzip -f %(CHG)s;
    gzip -f %(CHH)s
    ''' % locals()
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)
def runFastqc(infiles, outfile):
    """Run Fastqc on each input file.

    Performs quality control checks on reads. The original
    documentation also mentions converting sra files to fastq and
    checking the quality format; that would happen inside
    ``PipelineMapping.FastQc`` - TODO confirm.

    Arguments
    ---------
    infiles : string
        Input filename. If ``PARAMS["general_reconcile"] == 1`` the
        path component ``processed.dir/trimmed`` is replaced by
        ``reconciled.dir/trimmed``.
    outfile : string
        Output filename passed to the statement builder.
    """
    # MM: only pass the contaminants file list if requested by user,
    # do not make this the default behaviour
    custom_contaminants = PARAMS['use_custom_contaiminants']

    fastqc_kwargs = {
        "nogroup": PARAMS["readqc_no_group"],
        "outdir": PARAMS["exportdir"] + "/fastqc",
    }
    if custom_contaminants:
        fastqc_kwargs["contaminants"] = PARAMS['contaminants_path']
    fastqc_kwargs["qual_format"] = PARAMS['qual_format']

    m = PipelineMapping.FastQc(**fastqc_kwargs)

    if PARAMS["general_reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles, ), outfile)
    P.run(statement)
def quantifyWithSailfish(infiles, outfile):
    """Quantify gene/transcript expression with sailfish.

    NOTE(review): a second function with the same name is defined
    further down in this source. If both live in the same module the
    later definition replaces this one - confirm which is intended.

    Arguments
    ---------
    infiles : sequence
        ``infiles[0]`` holds the fastq file(s), ``infiles[1][0]`` a
        path inside the index directory and ``infiles[1][1]`` the
        gene set passed as ``--gene-gtf``.
    outfile : string
        Output filename. Everything before its last ``.`` is used as
        the output directory.
    """
    fastqs = infiles[0]
    index_file, geneset = infiles[1][0], infiles[1][1]

    # need to check that fastq2 file exists
    # if not, run as single-end

    # the index directory is the path up to the last "/"
    index_dir = index_file.rpartition("/")[0]
    # the output directory is outfile up to the last "."
    out_dir = outfile.rpartition(".")[0]

    job_threads = 8
    job_memory = "6G"

    count_file = out_dir + "/" + "quant.sf"

    statement = '''
    cgat fastq2tpm
    --log=%(outfile)s.log
    --program=sailfish
    --method=quant
    --gene-gtf=%(geneset)s
    --index-file=%(index_dir)s
    --output-directory=%(out_dir)s
    --library-type=%(sailfish_library)s
    --threads=%(job_threads)s
    %(fastqs)s;
    '''
    P.run(statement)
def quantifyWithSailfish(infiles, outfile):
    """Quantify gene/transcript expression with sailfish (paired-end).

    NOTE(review): an earlier function with the same name is defined
    above in this source. If both live in the same module this
    definition replaces the earlier one - confirm which is intended.

    Arguments
    ---------
    infiles : sequence
        ``infiles[0]`` is the first fastq file; ``infiles[1]`` holds
        the index (position 0), the second fastq file (position 1)
        and the gene set (position 2).
    outfile : string
        Output filename. Its directory part (everything before the
        last ``/``) is used as the output directory and as the stem of
        the log file.
    """
    fastq1 = infiles[0]

    # need to check that fastq2 file exists
    # if not, run as single-end
    index_dir, fastq2, geneset = infiles[1][0], infiles[1][1], infiles[1][2]

    out_dir = outfile.rpartition("/")[0]

    job_threads = 6
    job_memory = "1.5G"

    fastqs = ",".join((fastq1, fastq2))

    statement = '''
    cgat fastq2tpm
    --log=%(out_dir)s.log
    --program=sailfish
    --method=quant
    --paired-end
    --gene-gtf=%(geneset)s
    --index-file=%(index_dir)s
    --output-directory=%(out_dir)s
    --library-type=%(sailfish_library)s
    --threads=%(job_threads)s
    %(fastqs)s'''
    P.run(statement)
def buildCoverageStats(infile, outfile):
    """Generate coverage statistics for regions of interest with Picard.

    The baits file from ``PARAMS["roi_baits"]`` is rewritten before
    Picard is called: its first two lines are skipped, columns 1-3, a
    ``+`` and column 4 are written tab-separated, and the header of
    `infile` (``samtools view -H``) is prepended. The modified baits
    file is zapped afterwards.

    Arguments
    ---------
    infile : string
        Alignment file readable by ``samtools view``.
    outfile : string
        Output filename passed to
        ``PipelineMappingQC.buildPicardCoverageStats``.
    """
    # TS check whether this is always required or specific to current
    # baits file

    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics
    to_cluster = USECLUSTER

    baits = PARAMS["roi_baits"]
    modified_baits = infile + "_temp_baits_final.bed"
    # NOTE(review): `regions` is read but never passed on; the call
    # below hands `modified_baits` over twice - confirm this is intended
    regions = PARAMS["roi_regions"]

    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
    awk 'NR>2' %(baits)s
    | awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
    > %(infile)s_temp_baits.bed;
    cat %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
    > %(modified_baits)s;
    rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
    '''
    P.run(statement)

    PipelineMappingQC.buildPicardCoverageStats(
        infile, outfile, modified_baits, modified_baits)

    IOTools.zap_file(modified_baits)
def buildCpGBed(infile, outfile):
    """Build a tabix-indexed :term:`BED` file of CpG locations.

    Two commands are run one after the other, both with
    ``PARAMS["job_highmemory"]`` as job memory: ``cgat fasta2bed
    --method=cpg`` piped through ``bgzip``, then ``tabix -p bed`` on
    the result.

    Parameters
    ----------
    infile: str
        Genome in :term:`fasta` format, fed to ``fasta2bed`` on stdin.
    outfile: str
        bgzip-compressed :term:`BED` file containing the location of
        CpGs across the genome.
    """
    high_memory = PARAMS["job_highmemory"]

    # step 1: locate CpGs and compress the bed output
    statement = '''
    cgat fasta2bed
    --method=cpg
    --log=%(outfile)s.log
    < %(infile)s
    | bgzip
    > %(outfile)s
    '''
    P.run(statement, job_memory=high_memory)

    # step 2: index the compressed bed file
    statement = '''
    tabix -p bed %(outfile)s
    '''
    P.run(statement, job_memory=PARAMS["job_highmemory"])
def run_test(infile, outfile):
    """Run a test pipeline.

    Multiple targets are run iteratively, one ``P.run`` call per
    target. Errors are ignored (``ignore_errors=True``).

    Arguments
    ---------
    infile : string
        Not used by this function.
    outfile : string
        Log file of the test. The track is `outfile` without its
        ``.log`` suffix; stdout and stderr of the pipeline are written
        to ``<outfile>.stdout`` and ``<outfile>.stderr``.
    """
    track = P.snip(outfile, ".log")

    # the pipeline defaults to the track name without its "test_" prefix
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    # (the corresponding ``to_cluster = False`` is disabled)

    # "%%(...)s" survives the target substitution below as "%(...)s"
    # and is left for P.run to resolve
    template_statement = (
        "cd %%(track)s.dir; "
        "xvfb-run -d cgatflow %%(pipeline_name)s "
        "%%(pipeline_options)s "
        "%%(workflow_options)s make %s "
        "-L ../%%(outfile)s "
        "-S ../%%(outfile)s.stdout "
        "-E ../%%(outfile)s.stderr")

    # One run per target. The previous multi-target branch collected
    # the statements in a list but then ran the undefined name
    # ``statement``, which raised a NameError.
    for pipeline_target in pipeline_targets:
        statement = template_statement % pipeline_target
        P.run(statement, ignore_errors=True, job_memory="unlimited")
def buildTranscriptRegions(infile, outfile):
    """Build a :term:`bed` file of transcript regions.

    The gene set is processed with ``gtf2gtf --method=join-exons`` and
    converted to bed with the name column set to the `transcript_id`
    (presumably one interval per transcript - TODO confirm against the
    gtf2gtf documentation).

    Arguments
    ---------
    infile : string
        Input filename with geneset in :term:`gtf` format
        (gzip-compressed).
    outfile : string
        Output filename with genomic regions in :term:`bed` format
        (gzip-compressed).
    """
    memory = PARAMS["job_memory"]

    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=join-exons
    --log=%(outfile)s.log
    | cgat gff2bed
    --is-gtf
    --set-name=transcript_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    P.run(statement, job_memory=memory)
def buildOverlapWithEnsembl(infile, outfile, filename_bed):
    """Compute the overlap of genes with intervals.

    Transcripts are merged (``gtf2gtf --method=merge-transcripts``),
    converted to bed and compared against `filename_bed` with
    ``cgat bed2graph``. If `filename_bed` has multiple tracks the
    overlap will be computed for each track separately.

    The output is a tab-separated table with pairs of overlapping
    features between `infile` and `filename_bed`.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Output file in :term:`tsv` format.
    filename_bed : string
        Filename in :term:`bed` format.
    """
    # "-" makes bed2graph read the converted gene set from stdin
    statement = '''gunzip < %(infile)s
    | cgat gtf2gtf --method=merge-transcripts
    | cgat gff2bed --is-gtf
    | cgat bed2graph
    --output-section=name
    --log=%(outfile)s.log
    - %(filename_bed)s
    > %(outfile)s
    '''
    P.run(statement)
def buildGeneRegions(infile, outfile):
    """Build a :term:`bed` file of regions spanning whole gene models.

    A single interval is output per gene, spanning the genomic region
    that covers all transcripts of that gene. The name column of the
    :term:`bed` file is set to the `gene_id`.

    Arguments
    ---------
    infile : string
        Input filename with geneset in :term:`gtf` format
        (gzip-compressed).
    outfile : string
        Output filename with genomic regions in :term:`bed` format
        (gzip-compressed).
    """
    memory = PARAMS["job_memory"]

    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=merge-transcripts
    --log=%(outfile)s.log
    | cgat gff2bed
    --is-gtf
    --set-name=gene_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    P.run(statement, job_memory=memory)
def buildPeptideFasta(infile, outfile):
    """Index an ENSEMBL peptide FASTA file.

    The descriptions in the fasta file are truncated at the first space
    to contain only the sequence identifier.

    Arguments
    ---------
    infile : string
        ENSEMBL ``.pep.all.fa.gz`` file in :term:`fasta` format
    outfile : string
        Indexed file in :term:`fasta` format. The name is expected to
        end in ``.fasta``: its last six characters are removed
        unconditionally to form the database name.
    """
    dbname = outfile[:-len(".fasta")]

    # Only header lines (starting with ">") are truncated. The previous
    # perl condition ``if ("^>")`` tested a constant string, so it was
    # true for every line instead of matching headers.
    statement = '''gunzip < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | cgat index_fasta
    --force-output
    %(dbname)s -
    > %(dbname)s.log
    '''
    P.run(statement)
def buildTranscriptTTS(infile, outfile):
    """Build a :term:`bed` file with transcription termination sites.

    All transcription termination sites within a geneset are output
    (``gtf2gff --method=tts --promotor-size=1``). The termination site
    is derived from the most downstream coordinate of each transcript.
    The name column of the :term:`bed` file is set to the
    `transcript_id`.

    Arguments
    ---------
    infile : string
        Input filename with geneset in :term:`gtf` format
        (gzip-compressed).
    outfile : string
        Output filename with genomic regions in :term:`bed` format
        (gzip-compressed).
    """
    memory = PARAMS["job_memory"]

    statement = """
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=join-exons
    --log=%(outfile)s.log
    | cgat gtf2gff
    --method=tts
    --promotor-size=1
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | cgat gff2bed
    --is-gtf
    --set-name=transcript_id
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    """
    P.run(statement, job_memory=memory)
def loadGeneStats(infile, outfile):
    """Compute gene statistics and load them into the database.

    Gene statistics are computed by :doc:`gtf2table` with the
    following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    The table is indexed on ``gene_id`` and ``gene_name`` is mapped to
    a string column.

    Parameters
    ----------
    infile : string
        A :term:`gtf` file (gzip-compressed).
    outfile : string
        A log file. The table name is derived from `outfile`.
    """
    tablename = P.to_table(outfile)
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=gene_id "
        "--map=gene_name:str")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2table
    --log=%(outfile)s.log
    --genome=%(genome_dir)s/%(genome)s
    --counter=position
    --counter=length
    --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
def mapReadsWithTophatFusion(infiles, outfile):
    """Map reads from .fastq or .sra files and find candidate fusions.

    If ``PARAMS['tophatfusion_reference_junctions']`` is set, the file
    of known splice junctions (expected from the rnaseq pipeline) is
    appended to the tophat options as ``--raw-juncs``.

    Arguments
    ---------
    infiles : string
        Input passed on to ``PipelineMapping.TopHat_fusion.build``.
    outfile : string
        Output filename passed to the statement builder.
    """
    # NOTE(review): the locals job_threads, job_options, to_cluster,
    # tophat_options and tophatfusion_options are not referenced below;
    # presumably they are read by P.run / the built statement from the
    # caller's local variables - TODO confirm.
    job_threads = PARAMS["tophat_threads"]

    if "--butterfly-search" in PARAMS["tophat_options"]:
        # for butterfly search - require insane amount of RAM.
        # This used to be ``job_options += ...`` on a name that was
        # never assigned in this function, which raises
        # UnboundLocalError; the option is now assigned directly.
        job_options = " -l mem_free=50G"

    to_cluster = USECLUSTER
    m = PipelineMapping.TopHat_fusion()
    infile = infiles

    # if a file of reference junctions, as generated by the rnaseq
    # pipeline, has been specified in the ini, then pass this to
    # tophat-fusion
    if PARAMS['tophatfusion_reference_junctions'] is not None:
        reffile = PARAMS['tophatfusion_reference_junctions']
        tophat_options = PARAMS["tophat_options"] + \
            " --raw-juncs %s" % (reffile,)

    tophatfusion_options = PARAMS["tophatfusion_options"]

    statement = m.build((infile,), outfile)
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)
def buildLincRNAExons(infile, outfile):
    """Extract the lincRNA exons of an ENSEMBL gene set.

    The gene set is filtered with ``--filter-method=lincrna``, reduced
    to lines whose third column is ``exon`` and passed through
    ``gtf2gtf --method=remove-duplicates --duplicate-feature=gene``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Output filename in :term:`gtf` format (gzip-compressed).
    """
    # the %(...)s placeholders are left for P.run to resolve
    statement = '''
    gunzip < %(infile)s
    | cgat gtf2gtf
    --method=filter
    --filter-method=lincrna
    --log=%(outfile)s.log
    | awk '$3 == "exon"'
    | cgat gtf2gtf
    --method=remove-duplicates
    --duplicate-feature=gene
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run(statement)
def postprocessTopHatFusion(infiles, outfile):
    """Postprocess all tophat-fusion output into one report.

    Uses tophat-fusion-post to postprocess and filter all of the
    tophat-fusion output. Slow as it is not cluster aware and spawns a
    large number of blast tasks. The resulting ``tophatfusion_out``
    directory is moved into ``export``, replacing any directory of
    that name already there.

    Arguments
    ---------
    infiles : list
        Not used by this function.
    outfile : string
        Not used by this function.
    """
    # not referenced below; presumably read by P.run from the caller's
    # local variables - TODO confirm
    job_options = ' -l mem_free=50G'
    job_threads = PARAMS["tophatfusion_postthreads"]

    statement = '''
    module load bio/tophatfusion;
    tophat-fusion-post -p %(tophatfusion_postthreads)s
    %(tophatfusion_postoptions)s
    %(bowtie_index_dir)s/%(genome)s
    &> tophatfusion_out.log
    '''
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)

    # put the results in the export directory.
    export_target = 'export/tophatfusion_out'

    # if the export directory doesn't exist, create it
    if not os.path.exists('export'):
        os.mkdir('export')
    # otherwise if it does, then delete any out directory that is
    # already there. The removal used to target
    # 'export/tophatfusion.out', which is not the directory that was
    # just checked.
    elif os.path.isdir(export_target):
        shutil.rmtree(export_target)

    shutil.move('tophatfusion_out', 'export')
def loadTranscriptStats(infile, outfile):
    """Compute transcript properties and load them into the database.

    The method calls :doc:`gtf2table` with ``--reporter=transcripts``
    and the following counters:

    * length - gene/exon lengths
    * position - gene position
    * composition-na - gene nucleotide composition

    The table is indexed on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        ENSEMBL geneset in :term:`gtf` format (gzip-compressed).
    outfile : string
        Logfile. The table name is derived from `outfile`.
    """
    tablename = P.to_table(outfile)
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=gene_id "
        "--add-index=transcript_id "
        "--map=gene_id:str")

    # the trailing backslashes are string line continuations: they join
    # each line with the one that follows
    statement = '''
    gunzip < %(infile)s |\
    cgat gtf2table \
    --log=%(outfile)s.log \
    --genome=%(genome_dir)s/%(genome)s \
    --reporter=transcripts \
    --counter=position \
    --counter=length \
    --counter=composition-na
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement)
def edgeR_analysis(infile, outfile):
    """Run the edgeR GLM analysis script once per design file.

    `infile` is used as the exon counts and every file matching
    ``PARAMS['edgeR_design']`` as a design. Options to the script are
    taken from ``PARAMS``. Logs are written to ``edgeR_output``.

    Arguments
    ---------
    infile : string
        Exon counts; the name is expected to end in
        ``.exon_counts.tsv.gz``.
    outfile : string
        Not used by this function.
    """
    # not referenced below; presumably read by P.run from the caller's
    # local variables - TODO confirm
    to_cluster = USECLUSTER

    R_path = PARAMS['R_path']
    # named to match the %(R_scriptdir)s placeholder in the statement
    R_scriptdir = PARAMS['R_scriptdir']
    # was ``PARAM['R_args']``: a NameError
    R_args = PARAMS['R_args']
    # was the literal list ``['edgeR_args']`` rather than the setting
    edgeR_args = PARAMS['edgeR_args']

    # ``P.snip`` as used elsewhere in this file (was a bare ``snip``)
    baseName = P.snip(infile, ".exon_counts.tsv.gz") + "_"

    if not os.path.exists('edgeR_output'):
        os.mkdir('edgeR_output')

    for design in glob.iglob(PARAMS['edgeR_design']):
        # ``%(R_scriptdir)s`` previously lacked its ``s`` conversion,
        # which made the ``% locals()`` formatting fail
        statement = '''
        %(R_path)s CMD BATCH %(R_args)s
        \"--args count_file='%(infile)s' conditions_file='%(design)s' out_file='%(baseName)s' %(edgeR_args)s \"
        %(R_scriptdir)s/edgeR-GLM.R
        edgeR_output/%(infile)s.edgeR.log
        ''' % locals()
        # one run per design; the statement is passed explicitly, as
        # the other tasks in this file do
        P.run(statement)
def runAnnotator(tmpdir, outfile, tmpannotations, tmpsegments,
                 tmpworkspaces, tmpsynonyms, options=""):
    """Run annotator.

    Arguments
    ---------
    tmpdir : string
        Not used by this function.
    outfile : string
        File that receives the standard output of Annotator.
    tmpannotations : string
        Passed as ``-annotation``.
    tmpsegments : string
        Passed as ``-segments``.
    tmpworkspaces : list
        The first entry is passed as ``-workspace``, the following ones
        as ``-workspace2``, ``-workspace3``, ...
    tmpsynonyms : string
        Passed as ``-synonyms`` if it is not empty.
    options : string
        Additional options appended to the command line.
    """
    # not referenced below; presumably read by P.run from the caller's
    # local variables - TODO confirm
    to_cluster = True
    job_queue = "medium_jobs.q"
    job_options = "-l mem_free=8000M"

    workspace_flags = [
        " -workspace %s" % workspace if position == 0
        else " -workspace%i %s" % (position + 1, workspace)
        for position, workspace in enumerate(tmpworkspaces)]

    if tmpsynonyms:
        workspace_flags.append(" -synonyms %s" % tmpsynonyms)

    workspace_options = "".join(workspace_flags)

    statement = '''
    java -Xmx8000M -cp %(annotator_dir)s/commons-cli-1.0.jar:%(annotator_dir)s/Annotator.jar app.Annotator
    -verbose 4 -iterations %(annotator_iterations)s
    -annotation %(tmpannotations)s
    -segments %(tmpsegments)s
    -bucketsize %(annotator_bucketsize)i
    %(workspace_options)s
    %(options)s
    > %(outfile)s
    '''
    P.run(statement)
def loadNumberExonsLengthSummaryStats(infile, outfile):
    """Load the table of exon counts and transcript lengths.

    Arguments
    ---------
    infile : string
        Table that is fed to ``cgat csv2db`` on stdin.
    outfile : string
        Logfile. The table name is derived from `outfile` with ``/``
        replaced by ``_`` and ``_stats`` appended.
    """
    # ``P.to_table`` is the spelling used by the other load tasks in
    # this file (was ``P.toTable``)
    tablename = P.to_table(outfile.replace("/", "_")) + "_stats"

    statement = '''cgat csv2db
    -t %(tablename)s
    --log=%(outfile)s.log
    < %(infile)s
    > %(outfile)s'''
    # pass the statement explicitly, as the other tasks in this file do
    P.run(statement)