def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
    '''Append a pairwise alignment for every maf file to `outfile`.

    A pre-existing `outfile` is deleted first because each input file
    is converted and appended (``>>``) in turn.  The query genome file
    is looked up in PARAMS under ``<track>_genome``.
    '''
    # start from an empty output; a missing file is not an error
    try:
        os.remove(outfile)
    except OSError:
        pass

    # P.run() is called without arguments; presumably it resolves
    # ``statement`` and the %(...)s fields from these locals and PARAMS
    # (TODO confirm), so the local names are kept as they are.
    genomefile = PARAMS["%s_genome" % track]
    to_cluster = True

    for infile in infiles:
        E.info("adding %s" % infile)
        statement = (
            "gunzip < %(infile)s "
            "| cgat maf2psl "
            "--query=%(track)s "
            "--target=%(maf_master)s "
            "--log=%(outfile)s.log "
            "| cgat psl2psl "
            "--method=filter-fasta "
            "--method=sanitize "
            "--queries-tsv-file=%(genomefile)s "
            "--target-psl-file=%(genome)s "
            "--log=%(outfile)s.log "
            "| gzip >> %(outfile)s "
        )
        P.run()
def convertPslToChain(infile, outfile):
    '''Convert a psl file to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver

    The alignments are swapped, converted to chains and sorted into
    ``<outfile>.sorted.chain.gz``; that file is then netted against the
    contig sizes of both genomes and subset into `outfile`.

    Arguments
    ---------
    infile : string
        Gzip-compressed psl file; the genome pair is derived from its
        name via extractGenomes().
    outfile : string
        Gzip-compressed output chain file.
    '''
    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    try:
        writeContigSizes(target, tmpfilename1)
        writeContigSizes(query, tmpfilename2)

        # netChainSubset re-reads the sorted chains; name the .gz file
        # that was actually written instead of relying on zcat guessing
        # the suffix.
        statement = (
            "gunzip < %(infile)s "
            "| pslSwap stdin stdout "
            "| cgat psl2chain --log=%(outfile)s.log "
            "| chainSort stdin stdout "
            "| gzip > %(outfile)s.sorted.chain.gz; "
            "checkpoint; "
            "gunzip < %(outfile)s.sorted.chain.gz "
            "| chainNet stdin %(tmpfilename1)s %(tmpfilename2)s "
            "stdout /dev/null "
            "| netChainSubset stdin "
            "<( zcat %(outfile)s.sorted.chain.gz ) stdout "
            "| gzip > %(outfile)s"
        )
        P.run()
    finally:
        # remove the contig-size files even if the command failed
        for sizes_file in (tmpfilename1, tmpfilename2):
            if os.path.exists(sizes_file):
                os.unlink(sizes_file)
def buildIndirectMaps(infile, outfile, track):
    '''Chain several psl maps into one query-to-target map.

    The list of intermediate stages is read from PARAMS under
    ``<track>_path``.  Stage 0 is decompressed to stdout; every further
    stage is applied with ``pslMap``.  The result is gzipped into
    `outfile`.

    Raises ValueError if the ``.over.psl.gz`` file of a stage is missing.
    '''
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])
    E.info("path=%s" % str(path))

    stages = []
    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) not exist." %
                (filename, outfile, stage))

        if stage == 0:
            stages.append("gunzip < %s" % filename)
        else:
            stages.append(
                " pslMap stdin <(gunzip < %s) stdout " % filename)

    stages.append("gzip")

    statement = " | ".join(stages) + " > %s " % outfile
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into a single table and load it.

    For every file in `infiles` the comment (``#``) and blank lines are
    dropped.  The first remaining line of the first usable file becomes
    the header (prefixed with a ``track`` column); all further lines are
    prefixed with the track name derived from the file name.  The merged
    table is loaded with ``cgat csv2db``.

    Missing files and files without any data lines are skipped with a
    warning.
    '''
    tablename = P.toTable(outfile)

    outf = P.getTempFile()
    tmpfilename = outf.name

    first = True
    try:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".dedup.alignstats")
            if not os.path.exists(f):
                E.warn("File %s missing" % f)
                continue

            # close the input promptly instead of leaking the handle
            with open(f, "r") as inf:
                lines = [x for x in inf
                         if not x.startswith("#") and x.strip()]

            if not lines:
                # nothing but comments/blank lines: no header to take
                E.warn("File %s contains no data" % f)
                continue

            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False

            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))
    finally:
        outf.close()

    statement = (
        "cat %(tmpfilename)s "
        "| cgat csv2db "
        "--add-index=track "
        "--table=%(tablename)s "
        "> %(outfile)s "
    )
    try:
        P.run()
    finally:
        # do not leave the merged table behind if loading fails
        os.unlink(tmpfilename)
def spikeInCounts(infiles, outfile):
    '''Generate a spiked-in counts table with ``counts2counts.py``.

    `infiles` supplies the gzipped counts table (first item) and the
    design file (second item).  The spike-in bins are fixed in the
    command below; the random seed and the number of iterations come
    from the ``random_seed`` and ``spike_iterations`` placeholders.
    The result is gzipped into `outfile`.
    '''
    counts_file, design_file = infiles[0], infiles[1]

    statement = (
        " zcat %(counts_file)s "
        "| python %(scriptsdir)s/counts2counts.py "
        "--design-tsv-file=%(design_file)s "
        '--method="spike" '
        '--spike-type="row" '
        "--spike-change-bin-max=3.0 "
        "--spike-change-bin-width=0.1 "
        "--spike-change-bin-min=0.1 "
        "--spike-initial-bin-width=1 "
        "--spike-initial-bin-min=1 "
        "--spike-initial-bin-max=200000 "
        "--spike-minimum=1 "
        "--spike-maximum=1000000 "
        "--random-seed=%(random_seed)i "
        "--spike-iterations=%(spike_iterations)i "
        "-v 5 "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s "
    )
    P.run()
def buildBAMStats(infile, outfile):
    '''Run ``cgat bam2stats`` on `infile` (read from stdin).

    The main table goes to `outfile`; additional tables use the output
    pattern ``<outfile>.%s``.
    '''
    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]

    # "%%s" is kept escaped so that a literal "%s" survives the
    # placeholder substitution.
    statement = (
        "cgat bam2stats "
        "--force-output "
        "--output-filename-pattern=%(outfile)s.%%s "
        "< %(infile)s "
        "> %(outfile)s"
    )
    P.run()
def buildPicardAlignStats(infile, outfile):
    '''Collect alignment summary metrics for a BAM file with Picard.

    `infile` and `outfile` are substituted here; the reference
    (``samtools_genome``) stays a placeholder for later substitution.
    '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")

    template = (
        "CollectAlignmentSummaryMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%%(samtools_genome)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
    )
    # first pass fills in the file names and unescapes "%%(" to "%("
    statement = template % {"infile": infile, "outfile": outfile}
    P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''Collect multiple Picard metrics for a BAM file.

    If `infile` contains no reads, an empty `outfile` is created and no
    metrics are computed.
    '''
    job_options = getPicardOptions()
    job_threads = 3

    n_reads = getNumReadsFromBAMFile(infile)
    if n_reads == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard appears to struggle when sequence/quality information is
    # absent from the BAM file, so bam2bam.py adds it before the data
    # is piped into Picard as SAM.
    statement = (
        "cat %(infile)s "
        "| python %(scriptsdir)s/bam2bam.py -v 0 "
        "--method=set-sequence --output-sam "
        "| CollectMultipleMetrics "
        "INPUT=/dev/stdin "
        "REFERENCE_SEQUENCE=%(genome_file)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT "
        ">& %(outfile)s"
    )
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into a single table and load it.

    For each BAM file in `infiles` the companion ``.dupstats`` file is
    read, comment (``#``) and blank lines are dropped, and the second
    remaining line is written with the track name in front.  The header
    is taken from the first usable file.  The merged table is loaded
    with ``cgat csv2db``.

    Missing stats files and files with fewer than two data lines are
    skipped with a warning.
    '''
    tablename = P.toTable(outfile)

    # Use a private temporary file (as loadPicardAlignStats does) rather
    # than a fixed 'dupstats.txt' in the working directory, which would
    # collide between concurrent tasks and was never cleaned up.
    outf = P.getTempFile()
    tmpfilename = outf.name

    first = True
    try:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".dedup.bam")
            statfile = P.snip(f, ".bam") + ".dupstats"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue

            with open(statfile, "r") as inf:
                lines = [x for x in inf
                         if not x.startswith("#") and x.strip()]

            if len(lines) < 2:
                # header and value row are both required
                E.warn("File %s contains no data" % statfile)
                continue

            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            outf.write("%s\t%s" % (track, lines[1]))
    finally:
        outf.close()

    statement = (
        "cat %(tmpfilename)s "
        "| cgat csv2db "
        "--add-index=track "
        "--table=%(tablename)s "
        "> %(outfile)s "
    )
    try:
        P.run()
    finally:
        os.unlink(tmpfilename)
def extractLncRNAFastaAlignments(infiles, outfile):
    """Extract aligned sequence blocks for gene models from a MAF file.

    `infiles` is a pair ``(bed_file, maf_file)``.  The gzipped MAF file
    is decompressed to a temporary file, handed to
    PipelineLncRNA.extractMAFGeneBlocks() together with the target and
    query genome names from PARAMS, and the temporary file is removed
    afterwards.  The number of extracted gene models is logged.
    """
    bed_file, maf_file = infiles

    # work on an uncompressed copy of the alignment
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = "gunzip -c %(maf_file)s > %(maf_tmp)s"
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]
    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(
        bed_file,
        maf_tmp,
        genome_file,
        outfile,
        target_genome,
        query_genome,
        keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def buildRawGenomeAlignment(infiles, outfile):
    '''Append a pairwise alignment for each maf file to `outfile`.

    A pre-existing `outfile` is deleted first.  Input files whose name
    contains "other" or "supercontig" are skipped.
    '''
    try:
        os.remove(outfile)
    except OSError:
        pass

    for infile in infiles:
        # skip maf files without Hsap on top.
        if any(tag in infile for tag in ("other", "supercontig")):
            continue

        E.info("adding %s" % infile)

        genome_query, genome_target = getGenomes()

        statement = (
            "gunzip < %(infile)s "
            "| python %(scriptsdir)s/maf2psl.py "
            "--query=%(maf_name_query)s "
            "--target=%(maf_name_target)s "
            "--log=%(outfile)s.log "
            "| python %(scriptsdir)s/psl2psl.py "
            "--method=filter-fasta "
            "--method=sanitize "
            "--queries-tsv-file=%(genome_query)s "
            "--target-psl-file=%(genome_target)s "
            "--log=%(outfile)s.log "
            "| gzip >> %(outfile)s "
        )
        P.run()
def buildFilteredLncRNAGeneSet(infile, outfile):
    '''Filter the lncRNA gene set on single-exon status.

    Controlled by PARAMS["filtering_remove_single_exon"]:

    - false/empty: `infile` is copied to `outfile` unchanged
    - "loci": lines are selected on the ``exon_status_locus`` attribute
    - "transcripts": lines are selected on the ``exon_status`` attribute

    Raises ValueError for any other value.
    '''
    mode = PARAMS["filtering_remove_single_exon"]

    # NOTE(review): the log messages of the "loci" and "transcripts"
    # branches look swapped, and grep (without -v) keeps the lines whose
    # status is "s" instead of dropping them - confirm the intended
    # polarity against the values written upstream.
    if not mode:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = "cp %(infile)s %(outfile)s"
    elif mode == "loci":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        # the pipe before gzip was missing, which made grep treat
        # "gzip" as an input file name
        statement = ("zcat %(infile)s |"
                     " grep 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif mode == "transcripts":
        E.info("Warning: removing loci with only single-exon transcripts")
        statement = ("zcat %(infile)s |"
                     " grep 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unregocnised parameter %s" % mode)

    P.run()
def createMAFAlignment(infiles, outfile):
    """Build a length-filtered MAF alignment from a directory of axt files.

    All ``*net.axt.gz`` files in PARAMS["phyloCSF_location_axt"] that do
    not match the PARAMS["phyloCSF_ignore"] regular expression are
    concatenated and converted with ``axtToMaf``.  The MAF file is then
    filtered by PipelineLncRNA.filterMAF(); the kept and the removed
    blocks are written to separate files, both of which are gzipped.
    """
    outfile = P.snip(outfile, ".gz")

    # collect the axt files that are not excluded by the ignore pattern
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])
    axt_files = [os.path.join(axt_dir, name)
                 for name in os.listdir(axt_dir)
                 if name.endswith("net.axt.gz")
                 and not to_ignore.search(name)]
    axt_files = " ".join(sorted(axt_files))
    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")

    # every command in this task runs locally
    to_cluster = False

    # tmpf1: concatenated axt input, tmpf2: unfiltered maf output
    statement = (
        "zcat %(axt_files)s > %(tmpf1)s; "
        "axtToMaf  -tPrefix=%(target_genome)s. -qPrefix=%(query_genome)s. "
        "%(tmpf1)s %(target_contigs)s %(query_contigs)s %(tmpf2)s"
    )
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    filtered = PipelineLncRNA.filterMAF(
        tmpf2,
        outfile,
        removed,
        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" %
           filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    statement = "gzip %(outfile)s; gzip %(removed)s"
    P.run()
def loadGO(infile, outfile, tablename):
    """Load the concatenated GO results of one analysis into a table.

    All ``*.overall`` files inside ``<infile>.dir`` are concatenated
    with ``cat_tables.py`` and loaded into `tablename`, indexed on
    ``category`` and ``goid``.  If the directory does not exist, an
    empty `outfile` is created and nothing is loaded.
    """
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_options = ("--allow-empty-file "
                    "--add-index=category "
                    "--add-index=goid ")
    load_statement = P.build_load_statement(
        tablename=tablename,
        options=load_options)

    statement = (
        " python %(toolsdir)s/cat_tables.py %(indir)s/*.overall "
        "| %(load_statement)s "
        "> %(outfile)s "
    )
    P.run()
def renameTranscriptsInPreviousSets(infile, outfile):
    '''Rename and sort the transcripts of a previous gene set.

    Transcripts are renamed because they may reuse the cufflinks
    identifiers of the current analysis.  Gene sets with ENSEMBL gene
    identifiers ("ENSG") are only sorted; all others get renumbered
    gene and transcript identifiers derived from `outfile` before
    sorting.

    As before, the decision is taken from the last entry of `infile`.

    Raises ValueError if `infile` contains no GTF entries.
    '''
    # Scan the file to classify it; close the handle afterwards instead
    # of leaking it.
    has_ensembl_ids = None
    inf = IOTools.openFile(infile)
    try:
        for gtf in GTF.iterator(inf):
            has_ensembl_ids = "ENSG" in gtf.gene_id
    finally:
        inf.close()

    if has_ensembl_ids is None:
        # previously this fell through to P.run() without a statement
        raise ValueError("no GTF entries found in %s" % infile)

    if has_ensembl_ids:
        statement = (
            "zcat %(infile)s "
            '| grep -v "#" '
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--method=sort "
            "--sort-order=gene "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s"
        )
    else:
        gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
        transcript_pattern = gene_pattern.replace("GEN", "TRAN")
        # "%%i" stays escaped so gtf2gtf.py receives a literal "%i"
        statement = (
            " zcat %(infile)s "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--method=renumber-genes "
            "--pattern-identifier=%(gene_pattern)s%%i "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--method=renumber-transcripts "
            "--pattern-identifier=%(transcript_pattern)s%%i "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--method=sort "
            "--sort-order=gene "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s"
        )

    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    """Merge two-column tables and load them row-wise into a database.

    Column names are the input file names with `suffix` removed.  Only
    the first two columns of each file are used; gzipped input is
    selected when `suffix` ends in ".gz".  The combined table is
    transposed before loading, with ``track`` as index column.
    """
    header = ",".join(P.tablequote(P.snip(x, suffix)) for x in infiles)

    # pick the reader that matches the compression of the inputs
    reader = "zcat" if suffix.endswith(".gz") else "cat"
    filenames = " ".join(
        "<( %s %s | cut -f 1,2 )" % (reader, x) for x in infiles)

    tablename = P.toTable(outfile)

    statement = (
        "python %(scriptsdir)s/combine_tables.py "
        "--header-names=%(header)s "
        "--missing-value=0 "
        "--ignore-empty "
        "%(filenames)s "
        '| perl -p -e "s/bin/track/" '
        "| python %(scriptsdir)s/table2table.py --transpose "
        "| python %(scriptsdir)s/csv2db.py "
        "--add-index=track "
        "--table=%(tablename)s "
        "> %(outfile)s "
    )
    P.run()
def buildAllStats(infiles, outfile):
    '''Paste all stats files side by side into `outfile`.'''
    joined_inputs = " ".join(infiles)
    statement = "paste %s > %s" % (joined_inputs, outfile)
    P.run()
def mapReadsWithBowtie(infiles, outfile):
    """Map reads with bowtie and write gzipped SAM output.

    `infiles` is a pair; only its second item (the gzipped reads) is
    used.  The reads are decompressed to a temporary file, mapped, post
    processed with ``bam2bam.py --method=set-nh`` and the temporary file
    is removed by the command itself.
    """
    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = (
        " gunzip < %(infile)s > %(tmpfile)s; "
        "checkpoint; "
        "bowtie -q --sam -C "
        "--threads %(bowtie_threads)s "
        "%(bowtie_options)s "
        "%(bowtie_genome_dir)s/%(genome)s_cs "
        "%(tmpfile)s "
        "| python %(scriptsdir)s/bam2bam.py "
        "--output-sam --method=set-nh "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmpfile)s "
    )
    P.run()
def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrate base quality scores with GATK.

    Runs BaseRecalibrator into a temporary directory, applies the
    recalibration table with PrintReads to produce `outfile`, and
    removes the temporary directory.
    '''
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    # the three steps are glued together without separating whitespace,
    # exactly as the individual pieces end in ";"
    statement = (
        "GenomeAnalysisTK -T BaseRecalibrator "
        "--out %(tmpdir_gatk)s/%(track)s.recal.grp "
        "-R %(genome)s "
        "-I %(infile)s "
        "--knownSites %(dbsnp)s %(solid_options)s ; "
        "checkpoint ;"
        "GenomeAnalysisTK -T PrintReads "
        "-o %(outfile)s "
        "-BQSR %(tmpdir_gatk)s/%(track)s.recal.grp "
        "-R %(genome)s "
        "-I %(infile)s ; "
        "checkpoint ;"
        "rm -rf %(tmpdir_gatk)s ;"
    ) % locals()
    P.run()
def buildOverlapWithEnsembl(infile, outfile, filename_bed):
    '''Compute the overlap between genes and a set of intervals.

    The gene set is merged per gene, converted to bed and compared with
    `filename_bed` by ``bed2graph.py``.

    Arguments
    ---------
    infile : string
        Gzip-compressed gene set in :term:`gtf` format.
    outfile : string
        Output table written by ``bed2graph.py``.
    filename_bed : string
        Intervals in :term:`bed` format.
    '''
    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts "
        "| python %(scriptsdir)s/gff2bed.py --is-gtf "
        "| python %(scriptsdir)s/bed2graph.py "
        "--output-section=name "
        "--log=%(outfile)s.log "
        "- %(filename_bed)s "
        "> %(outfile)s "
    )
    P.run()
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorder a BAM file by the reference and add read groups.

    The reordered intermediate lives in a temporary directory that the
    command removes at the end.  When `track` is left at "unknown" it
    is derived from the name of `infile`.
    '''
    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")

    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    # the steps are concatenated without separating whitespace, exactly
    # as the individual pieces end in ";"
    statement = (
        "ReorderSam "
        "INPUT=%(infile)s "
        "OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam "
        "REFERENCE=%(genome)s "
        "ALLOW_INCOMPLETE_DICT_CONCORDANCE=true "
        "VALIDATION_STRINGENCY=SILENT ; "
        "checkpoint ;"
        "samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; "
        "checkpoint ;"
        "AddOrReplaceReadGroups "
        "INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam "
        "OUTPUT=%(outfile)s "
        "RGLB=%(library)s "
        "RGPL=%(platform)s "
        "RGPU=%(platform_unit)s "
        "RGSM=%(track)s "
        "VALIDATION_STRINGENCY=SILENT ; "
        "checkpoint ;"
        "samtools index %(outfile)s ; "
        "checkpoint ;"
        "rm -rf %(tmpdir_gatk)s ;"
    ) % locals()
    P.run()
def buildPromotorRegions(infile, outfile, promotor_size=1000):
    '''Annotate promotor regions from a reference gene set.

    The gene set is sanitized against the genome and promotor regions
    are built with ``gtf2gff.py --method=promotors``.

    Arguments
    ---------
    infile : string
        Gzip-compressed gene set in :term:`gtf` format.
    outfile : string
        Gzip-compressed output file.
    promotor_size : int
        Value passed to ``--promotor-size``.
    '''
    # The command used to contain a stray "\ " after --promotor-size.
    # That is an invalid Python escape and, in the shell, an escaped
    # space that glued " --genome-file=..." onto the preceding argument.
    statement = (
        " gunzip < %(infile)s "
        "| python %(scriptsdir)s/gff2gff.py "
        "--method=sanitize "
        "--sanitize-method=genome "
        "--skip-missing "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(outfile)s.log "
        "| python %(scriptsdir)s/gtf2gff.py "
        "--method=promotors "
        "--promotor-size=%(promotor_size)s "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s "
    )
    P.run()
def buildBenchmarkInput(infile, outfile):
    '''Build benchmark input from the coding sequences in ``cds.fasta``.

    The distinct (transcript_id, protein_id) pairs of the
    ``peptide_info`` table are dumped to a temporary map file, which
    ``substitute_tokens.py`` applies to the variants generated by
    ``fasta2variants.py``.
    '''
    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    try:
        cc = dbhandle.cursor()
        statement = '''
        SELECT DISTINCT transcript_id, protein_id FROM peptide_info
        '''
        cc.execute(statement)

        tmpfile.write("transcript_id\tprotein_id\n")
        tmpfile.write("\n".join(["\t".join(x) for x in cc]))
        tmpfile.write("\n")
    finally:
        # close (and thereby flush) the map before the command reads it
        tmpfile.close()
        dbhandle.close()

    # Fixes: "%(scripstdir)s" was a typo for "%(scriptsdir)s", and the
    # pipe between extract_fasta.pl and fasta2variants.py was missing.
    statement = (
        " perl %(scriptsdir)s/extract_fasta.pl %(infile)s "
        "< cds.fasta "
        "| python %(scriptsdir)s/fasta2variants.py --is-cds "
        "| python %(scriptsdir)s/substitute_tokens.py "
        "--map-tsv-file=%(tmpfilename)s "
        "> %(outfile)s "
    )
    try:
        P.run()
    finally:
        os.unlink(tmpfilename)
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''Split lincRNA transcripts into multi-exon and single-exon sets.

    Transcripts of `infile` (``*.gtf.gz``) with more than one entry are
    written to ``<prefix>.multi_exon.gtf.gz``, those with exactly one
    entry to ``<prefix>.single_exon.gtf.gz``.  Afterwards, for every
    name in `outfiles` that does not exist, the uncompressed file of
    the same name is gzipped.
    '''
    prefix = P.snip(infile, ".gtf.gz")

    # NOTE(review): under Python 3 gzip mode "w" is binary while str is
    # written below - confirm the interpreter this module targets.
    inf = gzip.open(infile)
    multi = gzip.open(prefix + ".multi_exon.gtf.gz", "w")
    single = gzip.open(prefix + ".single_exon.gtf.gz", "w")

    try:
        for entry in GTF.transcript_iterator(GTF.iterator(inf)):
            if len(entry) > 1:
                sink = multi
            elif len(entry) == 1:
                sink = single
            else:
                continue

            for exon in entry:
                fields = [exon.contig, exon.source, exon.feature,
                          exon.start, exon.end, ".", exon.strand, "."]
                sink.write("\t".join(map(str, fields)) +
                           "\t" + exon.attributes + "\n")
    finally:
        # close explicitly so the compressed output is complete on disk
        # before anything else looks at it
        single.close()
        multi.close()
        inf.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''Load the summary of reads contributing to transcripts.

    The table name is derived from `outfile` with "/" replaced by "_".
    '''
    flattened_name = outfile.replace("/", "_")
    tablename = P.toTable(flattened_name)

    statement = (
        "cgat csv2db "
        "-t %(tablename)s "
        "--log=%(outfile)s.log "
        "< %(infile)s "
        "> %(outfile)s"
    )
    P.run()
def loadTranscripts(infile, outfile):
    '''Load the transcripts of a GTF file into the database.

    The table is indexed on ``gene_id`` and ``transcript_id``.

    Arguments
    ---------
    infile : string
        Gzip-compressed gene set in :term:`gtf` format.
    outfile : string
        Output of the load command.  The table name is derived from
        `outfile`.
    '''
    index_options = ("--add-index=gene_id "
                     "--add-index=transcript_id "
                     "--allow-empty-file ")
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options=index_options)

    statement = (
        " gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2tsv.py "
        "| %(load_statement)s "
        "> %(outfile)s"
    )
    P.run()
def calculateSequenceComposition(interval_names, sequence_file, outfile,
                                 header_line=True):
    '''Compute sequence composition for a named subset of a fasta file.

    Records of `sequence_file` whose identifier (title up to the first
    space) is listed in `interval_names` are copied to a temporary
    fasta file, on which ``cgat fasta2table -s na -s cpg -s length`` is
    run.

    Arguments
    ---------
    interval_names : string
        File with one interval name per line.
    sequence_file : string
        Fasta file with the sequences.
    outfile : string
        Output table.
    header_line : bool
        If true, the first line of `interval_names` is skipped.
    '''
    with open(interval_names) as interval_file:
        if header_line:
            interval_file.readline()
        # strip only the newline: slicing off the last character would
        # truncate a final name that has no trailing newline
        interval_set = set(line.rstrip("\n") for line in interval_file)

    temp = P.getTempFile("/ifs/scratch")
    inf = temp.name
    try:
        with open(sequence_file) as fasta:
            for record in FastaIterator.iterate(fasta):
                seq_id = record.title.split(" ")[0]
                if seq_id in interval_set:
                    temp.write(">%s\n%s\n" %
                               (record.title, record.sequence))
    finally:
        temp.close()

    statement = (
        " cat %(inf)s "
        "| cgat fasta2table "
        "-s na -s cpg -s length "
        "--log=%(outfile)s.log "
        "> %(outfile)s"
    )
    try:
        P.run()
    finally:
        # the subset fasta file is only needed for this one command
        os.unlink(inf)
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''Load the table of exon counts and transcript lengths.

    The table name is derived from `outfile` ("/" replaced by "_") with
    the suffix "_stats".
    '''
    flattened_name = outfile.replace("/", "_")
    tablename = P.toTable(flattened_name) + "_stats"

    statement = (
        "cgat csv2db "
        "-t %(tablename)s "
        "--log=%(outfile)s.log "
        "< %(infile)s "
        "> %(outfile)s"
    )
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''Load the counts of multi-exon and single-exon lincRNA.

    The table name is derived from `outfile` ("/" replaced by "_") with
    the suffix ".count".
    '''
    # NOTE(review): the suffix contains a dot, unlike the "_stats"
    # suffix used by loadNumberExonsLengthSummaryStats - confirm the
    # database accepts this table name.
    flattened_name = outfile.replace("/", "_")
    tablename = P.toTable(flattened_name) + ".count"

    statement = (
        "cgat csv2db "
        "-t %(tablename)s "
        "--log=%(outfile)s.log "
        "< %(infile)s "
        "> %(outfile)s"
    )
    P.run()
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes,
                          genome):
    '''Import repeats from a UCSC database as a :term:`gff` file.

    Arguments
    ---------
    infile : string
        Not used by this function.
    outfile : string
        Gzip-compressed output file.
    ucsc_database : string
        Name of the UCSC database to query.
    repeattypes : string
        Comma-separated list of repeat classes to select.
    genome : string
        Genome file passed to ``gff2gff --genome-file``.

    Raises
    ------
    ValueError
        If no ``rmsk`` table exists or no repeat was found.
    '''
    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)
    tmpfilename = tmpfile.name

    try:
        total_repeats = 0
        try:
            for table in tables:
                E.info("%s: loading repeats from %s" %
                       (ucsc_database, table))
                cc = dbhandle.execute(
                    """SELECT genoName, 'repeat', 'exon', genoStart+1, """
                    """genoEnd, '.', strand, '.', """
                    """CONCAT('class \\"', repClass, '\\"; family \\"', """
                    """repFamily, '\\";') """
                    """FROM %(table)s """
                    """WHERE repClass in ('%(repclasses)s') """
                    % locals())
                n = 0
                for data in cc.fetchall():
                    n += 1
                    tmpfile.write("\t".join(map(str, data)) + "\n")
                E.info("%s: %s=%i repeats downloaded" %
                       (ucsc_database, table, n))
                total_repeats += n
        finally:
            tmpfile.close()

        if total_repeats == 0:
            # was "ValueErrror", which raised NameError instead
            raise ValueError("did not find any repeats for %s" %
                             ucsc_database)

        statement = (
            "cat %(tmpfilename)s "
            "| %(pipeline_scriptsdir)s/gff_sort pos "
            "| cgat gff2gff "
            "--method=sanitize "
            "--sanitize-method=genome "
            "--skip-missing "
            "--genome-file=%(genome)s "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s "
        )
        P.run()
    finally:
        # remove the download buffer on success and on failure
        os.unlink(tmpfilename)
def runControlLncRNAPhyloCSF(infile, outfile):
    '''Run PhyloCSF on `infile` and write its output to `outfile`.

    Phylogeny, number of frames, extra options and the species list
    are taken from the ``phyloCSF_*`` entries of PARAMS.  Each item of
    ``phyloCSF_map_species_names`` has the form ``a:b``; the part after
    the colon is passed on as species name.
    '''
    phylogeny = PARAMS["phyloCSF_phylogeny"]
    n_frames = int(PARAMS["phyloCSF_n_frames"])

    # an empty/false option value is passed on as the empty string
    options = PARAMS["phyloCSF_options"] or ""

    species = ",".join(
        mapping.split(":")[1]
        for mapping in PARAMS["phyloCSF_map_species_names"].split(","))

    to_cluster = True
    statement = (
        "PhyloCSF %(phylogeny)s %(infile)s "
        "--frames=%(n_frames)s "
        "--species=%(species)s "
        "%(options)s "
        "> %(outfile)s"
    )
    P.run()
def mergeSummaries(infiles, summaryfile):
    '''Combine the last line of every summary file into one table.

    The header is taken from the first line of the first input file and
    prefixed with the columns "file" and "assembler".  For every input
    (expected as ``<assembler>_.../<file>``) the last line is appended,
    prefixed with the file name (second path component) and the
    assembler name (first path component up to the first "_").

    All paths are interpreted relative to the current working
    directory.
    '''
    cwd = os.getcwd()

    # file to store all the stats combined
    combstats = cwd + "/" + summaryfile
    in0 = cwd + "/" + infiles[0]

    statementlist = [
        "touch {}".format(combstats),
        "head -1 {} >>{}".format(in0, combstats),
        "sed -i '1s/^/{}\\t{}\\t /' {}".format(
            "file", "assembler", combstats),
    ]

    # extract filenames and assembler names to add to summary text file
    for infile in infiles:
        indir = cwd + "/" + infile
        insplit = infile.split("/")
        filen = insplit[1]
        assem = insplit[0].split("_")[0]

        # just append the last line and add filename and assembler name
        statementlist.append("tail -1 {} >> {}".format(indir, combstats))
        statementlist.append(
            "sed -i '$s/^/{}\\t{}\\t /' {}".format(filen, assem, combstats))

    statement = " && ".join(statementlist)
    P.run()
def getCoverageStats(outfile):
    '''Extract a stats table from the mapping pipeline database.

    Database and table are given by the ``mapping_db`` and
    ``mapping_picard_dups`` placeholders.
    '''
    # NOTE(review): the function name refers to coverage stats but the
    # table placeholder is ``mapping_picard_dups`` - confirm.
    statement = (
        " python %(cgat_scripts)s/extract_stats.py "
        "--task=extract_table "
        "--log=%(outfile)s.log "
        "--database=%(mapping_db)s "
        "--table-name=%(mapping_picard_dups)s "
        "> %(outfile)s "
    )
    P.run()
def mergeAllAssemblies(infiles, outfile):
    '''Merge assemblies with ``stringtie --merge`` and sort the result.

    The last item of `infiles` is used as reference (``-G``); all other
    items are merged.  Every file is fed through ``zcat`` process
    substitution.
    '''
    wrapped = ["<(zcat %s)" % infile for infile in infiles]
    reference = wrapped[-1]
    infiles = " ".join(wrapped[:-1])

    job_threads = PARAMS["stringtie_merge_threads"]

    statement = (
        "stringtie --merge "
        "-G %(reference)s "
        "-p %(stringtie_merge_threads)s "
        "%(stringtie_merge_options)s "
        "%(infiles)s "
        "2> %(outfile)s.log "
        "| python %(scriptsdir)s/gtf2gtf.py "
        "--method=sort "
        "--sort-order=gene+transcript "
        "-S %(outfile)s "
        "-L %(outfile)s.log"
    )
    P.run()
def buildRepeatsRates(infile, outfile):
    '''Compute rates for individual aligned repeats.

    The sorted alignments are processed in chunks of 10000 lines by the
    ``cmd-farm`` wrapper, which adds sequences and computes counts and
    baseml rates for each chunk.
    '''
    genome_query, genome_target = getGenomes()

    statement = (
        "gunzip < %(infile)s "
        "| sort -k10,10 -k14,14 -k9,9 -k12,12n "
        "| %(cmd-farm)s "
        "--split-at-lines=10000 "
        "--output-header "
        "--log=%(outfile)s.log "
        '"cgat psl2psl '
        "--log=%(outfile)s.log "
        "--method=add-sequence "
        "--queries-tsv-file=%(genome_query)s "
        "--target-psl-file=%(genome_target)s "
        "| cgat psl2table "
        "--method=query-counts "
        "--method=baseml "
        '--baseml-model=REV" '
        "| gzip > %(outfile)s "
    )
    P.run()
def buildGenomeAlignment(infiles, outfile):
    '''Append a pairwise alignment for every axt file to `outfile`.

    A pre-existing `outfile` is deleted first; each input is converted
    with ``axtToPsl``, swapped with ``pslSwap`` and appended gzipped.
    '''
    # NOTE(review): a second function with this name is defined further
    # down in this file and replaces this definition at import time.
    try:
        os.remove(outfile)
    except OSError:
        pass

    for infile in infiles:
        E.info("adding %s" % infile)
        statement = (
            "gunzip < %(infile)s "
            "| axtToPsl /dev/stdin "
            "%(query)s.sizes %(target)s.sizes "
            "/dev/stdout "
            "| pslSwap /dev/stdin /dev/stdout "
            "| gzip >> %(outfile)s "
        )
        P.run()
def convertChainToPsl(infile, outfile):
    '''Convert a gzipped chain file into a gzipped psl file.

    The input is split at lines starting with "chain" by the
    ``cmd-farm`` wrapper; every chunk is converted with
    ``cgat chain2psl`` and swapped with ``pslSwap``.
    '''
    to_cluster = False

    target, query = extractGenomes(infile)
    E.debug("query=%s, target=%s" % (query, target))

    statement = (
        "gunzip < %(infile)s "
        "| %(cmd-farm)s "
        '--split-at-regex="^chain" '
        "--chunk-size=1000 "
        "--max-lines=1000000 "
        "--log=%(outfile)s.log "
        '" cgat chain2psl --log=%(outfile)s.log '
        '| pslSwap stdin stdout " '
        "| gzip > %(outfile)s "
    )
    P.run()
def variantAnnotatorIndels(infiles, outfile):
    '''Annotate a variant file with GATK VariantAnnotator.

    `infiles` is a triple of the variant file, the list of BAM files
    and the SnpEff file.  The variant file also defines the intervals
    (``-L``).
    '''
    to_cluster = USECLUSTER
    infile, bamlist, effFile = infiles

    statement = (
        "GenomeAnalysisTK -T VariantAnnotator "
        "-R %(bwa_index_dir)s/%(genome)s.fa "
        "-I %(bamlist)s "
        "-A SnpEff --snpEffFile %(effFile)s "
        "-o %(outfile)s "
        "--variant %(infile)s "
        "-L %(infile)s "
        "-A Coverage "
        "-A FisherStrand "
        "-A HaplotypeScore "
        "-A MappingQualityRankSumTest "
        "-A ReadPosRankSumTest "
        "-A AlleleBalanceBySample "
        "-A RMSMappingQuality"
    )
    P.run()
def loadPolyphen(infile, outfile):
    '''Load polyphen results into the database.

    Only columns 1-55 are loaded.  The ``o_acc`` column is renamed to
    ``protein_id`` and runs of spaces are removed before loading.
    '''
    table = P.toTable(outfile)

    statement = (
        "gunzip < %(infile)s "
        '| perl -p -e "s/o_acc/protein_id/; s/ +//g" '
        "| cut -f 1-55 "
        "|python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--add-index=snp_id "
        "--add-index=protein_id "
        "--table=%(table)s "
        "--map=effect:str "
        "> %(outfile)s "
    )
    P.run()
def merge_by_tissue(infiles, outfile):
    '''Merge assemblies of one tissue with ``stringtie --merge``.

    Each item of `infiles` is a sequence whose first element is a
    gzipped file.  The first file doubles as reference (``-G``) and is
    also part of the merged set.
    '''
    first_files = [infile[0] for infile in infiles]

    reference = "<(zcat %s)" % infiles[0][0]
    infiles = " ".join("<(zcat %s)" % name for name in first_files)

    job_threads = PARAMS["stringtie_merge_threads"]

    statement = (
        "stringtie --merge "
        "-G %(reference)s "
        "-p %(stringtie_merge_threads)s "
        "%(stringtie_merge_options)s "
        "%(infiles)s "
        "2> %(outfile)s.log "
        "| python %(scriptsdir)s/gtf2gtf.py "
        "--method=sort "
        "--sort-order=gene+transcript "
        "-S %(outfile)s "
        "-L %(outfile)s.log"
    )
    P.run()
def buildGenomeAlignment(infiles, outfile):
    '''Remove non-unique alignments from the genomic input files.

    All inputs are concatenated; alignments overlapping on the query
    and then on the target are removed with ``cgat psl2psl``.
    '''
    # NOTE(review): the result is appended (">>") to `outfile`, and an
    # earlier function of the same name in this file is replaced by
    # this definition - confirm both are intended.
    to_cluster = True
    infiles = " ".join(infiles)

    statement = (
        "zcat %(infiles)s "
        "| sort -k10,10 -k12,12n "
        "| cgat psl2psl "
        "--method=remove-overlapping-query "
        "--log=%(outfile)s.log "
        "| sort -k14,14 -k16,16n "
        "| cgat psl2psl "
        "--method=remove-overlapping-target "
        "--log=%(outfile)s.log "
        "| gzip >> %(outfile)s "
    )
    P.run()
def haplotypeCaller(infile, outfile, genome, dbsnp, intervals, padding,
                    options):
    '''Call variants in GVCF mode with GATK HaplotypeCaller.

    `intervals` and `padding` are passed as ``-L`` and ``-ip``;
    `options` is appended verbatim.
    '''
    job_options = getGATKOptions()
    job_threads = 3

    template = (
        "GenomeAnalysisTK -T HaplotypeCaller "
        "-ERC GVCF "
        "-variant_index_type LINEAR "
        "-variant_index_parameter 128000 "
        "-o %(outfile)s "
        "-R %(genome)s "
        "-I %(infile)s "
        "--dbsnp %(dbsnp)s "
        "-L %(intervals)s "
        "-ip %(padding)s "
        "%(options)s"
    )
    statement = template % {
        "outfile": outfile,
        "genome": genome,
        "infile": infile,
        "dbsnp": dbsnp,
        "intervals": intervals,
        "padding": padding,
        "options": options,
    }
    P.run()
def copyBamFile(infile, outfile):
    '''Soft-link a BAM file and index the link.

    Arguments
    ---------
    infile : string
        Input file in :term:`BAM` format; linked as ``../<infile>``.
    outfile : string
        Name of the link to create.
    '''
    # step 1: create the link
    statement = "ln -s ../%(infile)s %(outfile)s"
    P.run()

    # step 2: index the linked file
    statement = "samtools index %(outfile)s "
    P.run()
def loadRates(infile, outfile):
    '''Load rates into the database.

    The sequence and block columns are removed, rows are sorted by
    ``qName`` and by ``aligned`` (reverse numeric), and only the first
    row per value of column 10 is kept - i.e. the longest aligned
    stretch.  ``qName`` is renamed to ``gene_id``.

    The table name is `outfile` without its ".load" suffix.
    '''
    track = outfile[:-len(".load")]

    # csv_cut.py used to log into %(outfile)s itself, the same file the
    # final redirect writes; it now logs to %(outfile)s.log like the
    # other tasks.
    statement = (
        " gunzip < %(infile)s "
        "| python %(toolsdir)s/csv_cut.py "
        "--large "
        "--remove qStarts tStarts blockSizes qSequence tSequence "
        "--log=%(outfile)s.log "
        "| csort -k:qName: -k:aligned:rn "
        '| perl -p -e "s/qName/gene_id/" '
        "| awk '{if (l==$10) {next;} l = $10; print; }' "
        "|cgat csv2db %(csv2db_options)s "
        "--map gene_id:str "
        "--table=%(track)s "
        "--add-index=gene_id "
        "--allow-empty-file "
        "> %(outfile)s "
    )
    P.run()
def generateClusterSpikeIns(infile, outfile):
    '''Generate cluster spike-ins with ``cgat data2spike``.

    The output is written to ``<outfile>_tmp`` and then moved to
    `outfile`.  The design file name (``design.tsv``) and all binning
    options are fixed in the command.
    '''
    # parametrise binning in pipeline.ini
    job_options = "-l mem_free=4G"

    template = (
        "cat %(infile)s "
        "| cgat data2spike "
        "--method=spike "
        "--design-tsv-file=design.tsv "
        "--difference-method=relative "
        "--spike-shuffle-column-suffix=-perc "
        "--spike-keep-column-suffix=-meth,-unmeth "
        "--spike-minimum=100 "
        "--spike-maximum=100 "
        "--spike-output-method=seperate "
        "--spike-cluster-maximum-distance=150 "
        "--spike-cluster-minimum-size=10 "
        "--spike-iterations=50 "
        "--spike-type=cluster "
        "--spike-change-bin-min=-100 "
        "--spike-change-bin-max=100 "
        "--spike-change-bin-width=10 "
        "--spike-initial-bin-min=0 "
        "--spike-initial-bin-max=100 "
        "--spike-initial-bin-width=100 "
        "--spike-subcluster-min-size=1 "
        "--spike-subcluster-max-size=9 "
        "--spike-subcluster-bin-width=1 "
        "> %(outfile)s_tmp; "
        "mv %(outfile)s_tmp %(outfile)s"
    )
    statement = template % {"infile": infile, "outfile": outfile}
    P.run()
def runGsea(infile, outfile):
    '''Run ``cgat runGSEA`` on `infile` inside a fresh directory.

    The directory is named after `infile` without its ".processed"
    suffix; the command changes into it and refers to the input as
    ``../<infile>``.  Gene set, size limits, seed, permutation count
    and display settings are read from PARAMS.
    '''
    geneset = PARAMS['geneset_name']
    idtype = PARAMS['id_gsea_type']
    id_conversion = PARAMS['id_gsea_to_convert']

    # statistics settings
    min_size = PARAMS['stats_gsea_min_size']
    max_size = PARAMS['stats_gsea_max_size']
    seed = PARAMS['stats_gsea_seed']
    no = PARAMS['stats_gsea_permut']
    p_no = PARAMS['stats_gsea_display_num']
    l_no = PARAMS['stats_gsea_ngeneset']

    statement = (
        "dir=$(basename %(infile)s .processed "
        '''| awk '{split($0,a,"/"); print a[1]}') '''
        "&& mkdir $dir "
        "&& cd $dir "
        "&& xvfb-run cgat runGSEA "
        "-f ../%(infile)s "
        "-g %(geneset)s "
        "-m %(min_size)s "
        "-x %(max_size)s "
        "-s %(seed)s "
        "-n %(no)s "
        "-d %(p_no)s "
        "-l %(l_no)s"
    )
    P.run()
def buildIntergenicRegions(infiles, outfile):
    """Build the complement of a gene set with ``complementBed``.

    Arguments
    ---------
    infiles : list
       - Gzip-compressed input file with the gene set.
       - Input file with chromosome sizes, passed as ``-g``.
    outfile : string
       Gzip-compressed output file.
    """
    infile, contigs = infiles

    statement = (
        "zcat %(infile)s "
        "| sort -k1,1 -k2,2n "
        "| complementBed -i stdin -g %(contigs)s "
        "| gzip > %(outfile)s"
    )
    P.run()
def subtractBedFiles(infile, subtractfile, outfile):
    '''Remove intervals overlapping *subtractfile* from *infile*.

    If *subtractfile* is empty, *infile* is copied to *outfile*; if
    *infile* is empty, an empty *outfile* is created.  Otherwise the
    first five columns of the non-overlapping intervals are kept,
    column 4 is replaced by a running number, and the result is
    compressed with bgzip and indexed with tabix.
    '''
    # nothing to subtract: the output equals the input
    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return

    # nothing to subtract from
    if IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = (
        " intersectBed -v -a %(infile)s -b %(subtractfile)s "
        "| cut -f 1,2,3,4,5 "
        '''| awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' '''
        "| bgzip > %(outfile)s ; "
        "tabix -p bed %(outfile)s "
    )
    P.run()
def makeTagDirectoryChips(infile, outfile):
    '''Create a homer tag directory for a ChIP-seq BAM file.

    The BAM file is converted to SAM inside ``homer/Tag.dir/`` and
    ``makeTagDirectory`` is run there.  The tag directory is named
    after `infile` without its ".bam" suffix; a ``<name>.txt`` file is
    touched inside it at the end.
    '''
    # str.strip(".bam") removes any of the characters ".", "b", "a",
    # "m" from BOTH ends of the name (e.g. "bam_a.bam" -> "_"); only
    # the suffix must go.
    bamstrip = infile[:-len(".bam")] if infile.endswith(".bam") else infile
    samfile = bamstrip + ".sam"

    statement = (
        " samtools view %(infile)s > homer/Tag.dir/%(samfile)s "
        "&& cd homer/Tag.dir/ "
        "&& makeTagDirectory %(bamstrip)s %(samfile)s "
        "-genome %(homer_maketagdir_genome)s -checkGC "
        "&> %(bamstrip)s.makeTagChip.log "
        "&& touch %(bamstrip)s/%(bamstrip)s.txt "
        "&& sleep 60"
    )
    P.run()
def runFastqScreen(infiles, outfile):
    '''Run FastqScreen on the input files.

    A ``fastq_screen.conf`` file is written into a temporary directory
    with one DATABASE line per PARAMS entry whose key starts with
    "fastq_screen_database".  The temporary directory is removed after
    the run and `outfile` is touched.
    '''
    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    conf_path = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.openFile(conf_path, "w") as f:
        for i, k in list(PARAMS.items()):
            if not i.startswith("fastq_screen_database"):
                continue
            # the text after the 22-character key prefix names the
            # database
            f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles, ), outfile)
    P.run()

    shutil.rmtree(tempdir)
    P.touch(outfile)
def loadGeneCoordinates(infile, outfile):
    '''Load per-gene coordinates obtained by merging transcripts.

    The table is indexed on ``gene_id``; the table name is derived from
    `outfile`.
    '''
    # TS. remove transcript_id column as this is now meaningless
    load_options = ("--add-index=gene_id "
                    "--ignore-column=transcript_id "
                    "--allow-empty-file ")
    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options=load_options)

    statement = (
        " gunzip < %(infile)s "
        "| cgat gtf2gtf --method=merge-transcripts "
        "| cgat gtf2tsv "
        "| %(load_statement)s "
        "> %(outfile)s"
    )
    P.run()
def plotHeatmap(infile, outfile):
    '''Draw a heatmap of scores over genomic regions with ``plotHeatmap``.

    *infile* is the matrix file produced by the tool computeMatrix and
    *outfile* is the image to write. All remaining options come from the
    ``deep_*`` placeholders in the command.
    '''
    # A sequence of strings is concatenated into one name; a plain
    # string passes through unchanged.
    infile = "".join(infile)

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals and the pipeline configuration.
    statement = " ".join((
        "plotHeatmap",
        "-m %(infile)s",
        "-o %(outfile)s",
        "--outFileNameMatrix %(deep_out_namematrix)s",
        "--outFileSortedRegions %(deep_out_sorted)s",
        "--dpi %(deep_dpi)s",
        "--colorMap %(deep_colormap)s",
        "--kmeans %(deep_kmeans)s",
        "--legendLocation %(deep_legendlocation)s",
        "--refPointLabel %(deep_refpointlabel)s",
    ))
    P.run()
def buildSequinsReferenceTranscriptome(infiles, outfile):
    '''Build a reference transcriptome fasta from a GTF geneset.

    *infiles* is a pair ``(gtf, genome_file)``. Only GTF rows whose third
    column equals ``exon`` are passed to ``cgat gff2fasta``, which writes
    their sequences to *outfile*; ``--fold-at=60`` sets the line length
    of that fasta file. The result is then indexed with
    ``samtools faidx``.
    '''
    infile, genome_file = infiles

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals, so the local names must stay as they are.
    statement = (
        " zcat %(infile)s "
        '''| awk '$3=="exon"'| cgat gff2fasta --is-gtf '''
        "--genome-file=%(genome_file)s --fold-at=60 -v 0 "
        "--log=%(outfile)s.log > %(outfile)s; "
        "checkpoint; "
        "samtools faidx %(outfile)s "
    )
    P.run()
def mapReadsWithShrimp(infiles, outfile):
    '''Map reads with SHRiMP (``gmapper-cs``).

    *infiles* is a pair ``(inifile, infile)``; only *infile* appears in
    the command. Standard output is gzipped into *outfile* and standard
    error goes to ``<outfile>.log``.
    '''
    inifile, infile = infiles

    # Not referenced below; presumably job settings read by P.run().
    job_threads = PARAMS["shrimp_threads"]
    job_options = "-l mem_free=64G"

    # "80%%" is an escaped literal percent sign: the template is
    # %-interpolated (note the %(name)s placeholders) before it runs.
    statement = (
        " gmapper-cs --full-threshold 80%% "
        "--threads %(shrimp_threads)i "
        "--fastq --output-report --sam-unaligned "
        "%(shrimp_options)s "
        "%(infile)s "
        "%(genome_dir)s/%(genome)s.fa "
        "2> %(outfile)s.log "
        "| gzip > %(outfile)s "
    )
    P.run()
def getDuplicationStats(outfile):
    '''Fetch the picard duplication stats table from the mapping database.

    Runs ``cgat extract_stats --task=extract_table`` against the database
    and table named by the ``mapping_db`` / ``mapping_picard_dups``
    placeholders and writes the result to *outfile*. The database port is
    fixed at 3306 in the command.
    '''
    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals and the pipeline configuration.
    statement = (
        " cgat extract_stats "
        "--log=%(outfile)s.log "
        "--task=extract_table "
        "--database-port=3306 "
        "--database-backend=%(database_backend)s "
        "--database-hostname=%(database_host)s "
        "--database-username=%(database_username)s "
        "--database=%(mapping_db)s "
        "--table-name=%(mapping_picard_dups)s "
        "> %(outfile)s "
    )
    P.run()
def buildReadCorrespondence(infiles, outfile):
    '''Count reads mapped, duplicates, etc. across several BAM files.

    Every file in *infiles* is streamed through ``samtools view -h`` and
    the ``hsort`` script from ``PARAMS["scriptsdir"]`` via a process
    substitution, and the streams are handed to ``cgat diff_bam``. The
    gzipped result is written to *outfile*.
    '''
    # Not referenced below; presumably a job setting read by P.run().
    to_cluster = USECLUSTER

    # Column names: the input file names without their ".bam" suffix.
    headers = ",".join(P.snip(x, ".bam") for x in infiles)

    # One "<( ... )" process substitution per input file.
    sorters = " ".join(
        "<( samtools view -h %s | %s/hsort 0 )" % (x, PARAMS["scriptsdir"])
        for x in infiles)

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals, so the local names must stay as they are.
    statement = (
        " cgat diff_bam "
        "--header-names=%(headers)s "
        "--log=%(outfile)s.log "
        "%(sorters)s "
        "| gzip > %(outfile)s "
    )
    P.run()
def makeSailfishIndex(infile, outfile):
    '''Build a sailfish index from a multi-fasta of spliced transcripts.

    ``fastq2tpm.py --method=make_index --program=sailfish`` is run on
    *infile*. The index is written to the directory part of *outfile*
    and the log to ``<outfile>.log``.
    '''
    # Used in the command as --threads; presumably also the number of
    # threads requested for the job by P.run().
    job_threads = 8

    # Everything before the last "/" of outfile (empty if there is none).
    outdir = outfile.rpartition("/")[0]

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals, so the local names must stay as they are.
    statement = (
        " python %(cgat_scripts)s/fastq2tpm.py "
        "--method=make_index "
        "--program=sailfish "
        "--index-fasta=%(infile)s "
        "--kmer-size=%(sailfish_kmer)s "
        "--threads=%(job_threads)s "
        "--output-directory=%(outdir)s "
        "--log=%(outfile)s.log "
    )
    P.run()
def spikeVsGenome(infile, outfile):
    '''Summarise uniquely mapped reads on spike-ins versus the genome.

    Writes a header line and one data line to *outfile* with the columns
    ``nreads_uniq_map_genome``, ``nreads_uniq_map_spike`` and
    ``fraction_spike`` (spike / (spike + genome)). Only alignments
    carrying the tag ``NH:i:1`` are counted. When no read matches either
    pattern the fraction is reported as 0.
    '''
    # The separator is a literal backslash-t; `echo -e` expands it.
    header = "\\t".join(
        ["nreads_uniq_map_genome",
         "nreads_uniq_map_spike",
         "fraction_spike"])

    # Fixes relative to the previous command:
    #  * `grep -w` - plain `grep NH:i:1` also matched NH:i:10, NH:i:11,
    #    ..., so multi-mapping reads were counted as unique.
    #  * guarded division - awk aborted with "division by zero" when no
    #    read matched either pattern.
    # NOTE(review): /chr*/ and /ERCC*/ are unanchored regexes (they match
    # any reference name containing "ch" resp. "ERC"); left unchanged.
    #
    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals, so the local names must stay as they are.
    statement = (
        ' echo -e "%(header)s" > %(outfile)s; '
        "checkpoint; "
        "samtools view %(infile)s "
        "| grep -w NH:i:1 "
        '''| awk 'BEGIN{OFS="\\t";ercc=0;genome=0}; '''
        "$3~/chr*/{genome+=1}; "
        "$3~/ERCC*/{ercc+=1}; "
        "END{total=ercc+genome; "
        "frac=(total>0) ? ercc/total : 0; "
        "print genome,ercc,frac};' "
        ">> %(outfile)s "
    )
    P.run()
def mergeSailfishCounts(infiles, outfile):
    '''Merge the raw sailfish counts of all conditions into one table.

    ``cgat combine_tables`` joins the files in *infiles* on column 1,
    taking column 5 from each, and names the columns after the part of
    each file name captured by ``(.+).quant``. The merged table is
    written to *outfile*.
    '''
    # Not referenced below; presumably a job setting read by P.run().
    job_memory = "4G"

    # The command expects the inputs as one space-separated string under
    # the same name, hence the rebinding of the parameter.
    infiles = " ".join(infiles)

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals, so the local names must stay as they are.
    statement = (
        " cgat combine_tables --columns=1 --take=5 --use-file-prefix "
        "--regex-filename='(.+).quant' "
        "--log=%(outfile)s.log "
        "%(infiles)s "
        "> %(outfile)s"
    )
    P.run()
def grepPrimers(infile, outfile):
    '''Count reads starting with successively shorter primer prefixes.

    Primer "b" is used when ``"_b."`` occurs in *infile* at an index
    greater than 0, primer "a" otherwise; the sequences come from
    ``PARAMS["grep_primer_a"]`` / ``PARAMS["grep_primer_b"]``. For every
    prefix length from the full primer down to 6 the prefix and the
    number of lines in *infile* starting with it are appended to
    *outfile*. Finally the total read count (lines / 4) is appended and
    consecutive line pairs are joined with a tab.
    '''
    # Not referenced below; presumably a job setting read by P.run().
    to_cluster = False

    primer = "b" if infile.find("_b.") > 0 else "a"
    primer_seq = (PARAMS["grep_primer_a"] if primer == "a"
                  else PARAMS["grep_primer_b"])

    # P.run() is called without arguments; presumably it resolves
    # `statement` and its %(name)s placeholders from this function's
    # locals. The template is the same for every prefix, only
    # `primer_subseq` changes between runs.
    statement = (
        'echo "%(primer_subseq)s" >> %(outfile)s; '
        "zcat %(infile)s | grep ^%(primer_subseq)s | wc -l >> %(outfile)s;"
    )
    # Prefix lengths len(primer_seq), len(primer_seq) - 1, ..., 6.
    for i in reversed(range(6, len(primer_seq) + 1)):
        primer_subseq = primer_seq[:i]
        P.run()

    # reformat out file
    statement = (
        'echo "Total reads" >> %(outfile)s; '
        "echo `zcat %(infile)s | wc -l` / 4 | bc >> %(outfile)s; "
        "sed -i '{N;s/\\n/\\t/}' %(outfile)s; "
    )
    P.run()