def computeOverlapCoding(infile, outfile):
    """Count coding markers per window.

    Two shell steps are run:

    1. ``gtf2gtf.py --rename=gene --apply=ensembl.diff.genes_ovl`` rewrites
       the markers in *infile* into a temporary file. Per the original
       author, this sets gene_id/transcript_id of each marker to the
       overlapping ENSEMBL ids and drops markers without an overlap.
    2. ``gff2table.py --decorator=counts`` tabulates the temporary file
       against the windows read from ``windows.bed`` and writes *outfile*
       (log goes to ``<outfile>.log``).

    The temporary file is deleted once both steps have run.
    """
    to_cluster = True
    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s '''
    # Locals and PARAMS are merged into one keyword set; PARAMS wins on
    # a name clash because it comes last.
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    statement = '''python %(scriptsdir)s/gff2table.py --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) --decorator=counts --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    os.unlink(tmpfilename)
def buildDownstreamFlankBed(infile, outfile):
    """Build the flank on the right-hand side of every bed interval.

    ``flankBed`` is run strand-aware (``-s``) with a left flank of 0 and a
    right flank of ``PARAMS["geneset_flank"]`` bases, using the genome
    index in ``PARAMS["faidx"]``. The result is passed through
    ``bed2bed.py --method=filter-genome`` and written to *outfile*.
    """
    faidx = PARAMS["faidx"]
    window = PARAMS["geneset_flank"]

    statement = (
        "flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s "
        "| python %(scriptsdir)s/bed2bed.py --method=filter-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log %(outfile)s.log "
        "> %(outfile)s")
    P.run()
def countReadsWithinWindows(bedfile, windowfile, outfile,
                            counting_method="midpoint"):
    """Count the reads of *bedfile* that fall into the intervals of
    *windowfile*.

    Both inputs are :term:`bed` formatted; *bedfile* is read through
    ``zcat``. Counting is done with ``coverageBed`` and the sorted,
    gzip-compressed result is written to *outfile*.

    *counting_method* selects how a read is represented:

    ``"midpoint"``
        each read is replaced by a one-base interval at its midpoint
        (computed with awk) before counting.
    ``"nucleotide"``
        reads are passed to ``coverageBed`` unchanged.

    Raises :class:`ValueError` for any other value.
    """
    job_options = "-l mem_free=4G"

    if counting_method not in ("midpoint", "nucleotide"):
        raise ValueError("unknown counting method: %s" % counting_method)

    # ``f`` is the optional extra pipeline stage spliced in after zcat.
    f = ""
    if counting_method == "midpoint":
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''

    statement = (
        " zcat %(bedfile)s %(f)s "
        "| coverageBed -a stdin -b %(windowfile)s -split "
        "| sort -k1,1 -k2,2n "
        "| gzip > %(outfile)s ")
    P.run()
def ExtendRegion(infile, outfile):
    """Extend every interval of a gzipped bed file by 1000 bases.

    ``slopBed -b 1000`` grows each interval on both sides; ``%(faidx)s``
    supplies the genome file. The output is gzip-compressed into
    *outfile*.
    """
    statement = (
        "gunzip < %(infile)s "
        "| slopBed -i stdin -g %(faidx)s -b 1000 "
        "| gzip > %(outfile)s ")
    P.run()
def getNoncodingGeneset(infile, outfile):
    """Keep the transcripts that do not overlap the ENSEMBL coding set.

    All entries of *infile* without a same-strand overlap
    (``intersectBed -v -s``) with ``PARAMS["ensembl_transcripts"]`` are
    assumed to be non-coding and written to *outfile*. The number of
    lines kept is recorded in ``<outfile>.count``.
    """
    ensembl_transcripts = PARAMS["ensembl_transcripts"]

    statement = (
        "cat %(infile)s "
        "| intersectBed -a stdin -b %(ensembl_transcripts)s -v -s "
        "> %(outfile)s; "
        'echo "transcripts without ensembl coding overlap: " '
        "> %(outfile)s.count; "
        "cat %(outfile)s | wc -l >> %(outfile)s.count;")
    P.run()
def addMissingNoncodingTranscripts(infile, outfile):
    """Add ENSEMBL non-coding transcripts that are absent from *infile*.

    Entries of ``PARAMS["ensembl_noncoding_gtf"]`` that have no same-strand,
    reciprocal full-length match in *infile*
    (``intersectBed -v -s -f 1 -r``) are saved to
    ``transcripts/missing_ensembl_noncoding_transcripts.gtf``. They are
    then concatenated with *infile*, sorted by column 1 and numerically
    by column 4, and written to *outfile*.
    """
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]

    statement = (
        "intersectBed -a %(ensembl_noncoding)s -b %(infile)s "
        "-v -s -f 1 -r "
        "> transcripts/missing_ensembl_noncoding_transcripts.gtf; "
        "cat %(infile)s "
        "transcripts/missing_ensembl_noncoding_transcripts.gtf "
        "| sort -k1,1 -k4,4n > %(outfile)s;")
    P.run()
def buildBAMStats(infile, outfile):
    """Collect read statistics for a BAM file with ``bam2stats.py``.

    Per the original author this counts mapped reads, duplicates, etc.
    The main table goes to *outfile*; additional files are written
    using the pattern ``<outfile>.%s``.
    """
    scriptsdir = PARAMS["general_scriptsdir"]
    to_cluster = USECLUSTER

    statement = (
        "python %(scriptsdir)s/bam2stats.py --force "
        "--output-filename-pattern=%(outfile)s.%%s "
        "< %(infile)s > %(outfile)s")
    P.run()
def loadEffects(infile, outfile):
    """Load transcript effects into database tables.

    *infile* (``<root>.effects.gz``) is loaded into ``<root>_effects``.
    The companion files ``<infile>.cds.gz``, ``<infile>.intron.gz``,
    ``<infile>.splicing.gz`` and ``<infile>.translation.gz`` are loaded
    into ``<root>_effects_<suffix>``. All csv2db output is collected in
    *outfile*.
    """
    root = infile[:-len(".effects.gz")]

    statement = ''' python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
    --from-zipped \
    --index=transcript_id \
    --table=%(root)s_effects \
    < %(infile)s > %(outfile)s '''
    P.run()

    # The per-suffix command is the same template on every iteration;
    # only the value of ``suffix`` changes between runs.
    statement = (
        " gunzip < %(infile)s.%(suffix)s.gz "
        "| python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--allow-empty "
        "--index=transcript_id "
        "--table=%(root)s_effects_%(suffix)s "
        "--ignore-column=seq_na "
        "--ignore-column=seq_aa "
        ">> %(outfile)s ")
    for suffix in ("cds", "intron", "splicing", "translation"):
        P.run()
def loadPicardDuplicateStats(infiles, outfile):
    """Merge Picard duplicate stats into a single table and load it.

    For every ``<name>.bam`` in *infiles* the file ``<name>.dupstats`` is
    read. Comment lines (``#``) and blank lines are ignored; the first
    remaining line is taken as the header and the second as the data
    row. Rows are prefixed with the track name (basename without
    ``.dedup.bam``) and the merged table is loaded with ``csv2db.py``
    into the table derived from *outfile*.

    Missing stats files, and stats files without a data row, are
    reported with ``E.warn`` and skipped.

    The merged table is staged in a temporary file (as done in
    ``loadPicardAlignStats``) rather than a fixed ``dupstats.txt`` in
    the working directory, and is removed after loading.
    """
    # Join data for all tracks into a single file
    outf = P.getTempFile()
    tmpfilename = outf.name
    try:
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".dedup.bam")
            statfile = P.snip(f, ".bam") + ".dupstats"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue

            with open(statfile, "r") as inf:
                lines = [x for x in inf
                         if not x.startswith("#") and x.strip()]
            if len(lines) < 2:
                # header plus one data row are required
                E.warn("File %s contains no data row" % statfile)
                continue

            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            outf.write("%s\t%s" % (track, lines[1]))
        outf.close()

        # Load into database
        tablename = P.toTable(outfile)
        statement = (
            "cat %(tmpfilename)s "
            "| python %(scriptsdir)s/csv2db.py "
            "--index=track "
            "--table=%(tablename)s "
            "> %(outfile)s ")
        P.run()
    finally:
        if not outf.closed:
            outf.close()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def loadPicardAlignStats(infiles, outfile):
    """Merge Picard alignment stats into a single table and load it.

    Each existing file in *infiles* contributes all of its data rows
    (every non-comment, non-blank line after the first), prefixed with
    the track name (basename without ``.alignstats``). The header is
    taken from the first file that exists. Missing files are reported
    with ``E.warn`` and skipped. The merged table is written to a
    temporary file, loaded with ``csv2db.py`` and then removed.
    """
    # Join data for all tracks into a single file
    outf = P.getTempFile()
    header_pending = True

    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        with open(f, "r") as handle:
            lines = [entry for entry in handle.readlines()
                     if not entry.startswith("#") and entry.strip()]

        if header_pending:
            outf.write("%s\t%s" % ("track", lines[0]))
            header_pending = False
        for data_row in lines[1:]:
            outf.write("%s\t%s" % (track, data_row))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = (
        "cat %(tmpfilename)s "
        "| python %(scriptsdir)s/csv2db.py "
        "--index=track "
        "--table=%(tablename)s "
        "> %(outfile)s")
    P.run()

    os.unlink(tmpfilename)
def createMAFAlignment(infiles, outfile):
    """Build a filtered, gzipped MAF alignment from a directory of axt files.

    All ``*net.axt.gz`` files in ``PARAMS["phyloCSF_location_axt"]`` whose
    name does not match the regular expression ``PARAMS["phyloCSF_ignore"]``
    are concatenated and converted to a single MAF file with ``axtToMaf``.
    ``PipelineLncRNA.filterMAF`` then splits that file into *outfile*
    (kept blocks) and ``<outfile stem>_removed.maf`` using
    ``PARAMS["phyloCSF_filter_alignments"]``. Both results are
    gzip-compressed at the end and the two temporary files are removed.
    """
    outfile = P.snip(outfile, ".gz")

    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])
    axt_files = " ".join(sorted(
        os.path.join(axt_dir, name)
        for name in os.listdir(axt_dir)
        if name.endswith("net.axt.gz") and not to_ignore.search(name)))
    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")

    # concatenate the axt files, then convert them to MAF
    to_cluster = False
    statement = (
        "zcat %(axt_files)s > %(tmpf1)s;"
        " axtToMaf "
        " -tPrefix=%(target_genome)s. -qPrefix=%(query_genome)s."
        " %(tmpf1)s %(target_contigs)s %(query_contigs)s %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(
        tmpf2, outfile, removed, PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    for scratch in (tmpf1, tmpf2):
        os.unlink(scratch)

    to_cluster = False
    statement = "gzip %(outfile)s; gzip %(removed)s"
    P.run()
def runGLAM2SCAN(infiles, outfile):
    """Run glam2scan with every motif against intervals plus controls.

    *infiles* is a 3-tuple; its second element is the sequence database
    (``*.fasta``) and its third the list of motif files. The control
    sequences are expected next to the database as ``*.controlfasta``
    (the first tuple element is not used). For every motif a
    ``:: motif = <name> ::`` marker line followed by the glam2scan
    output is appended to *outfile*, which is recreated from scratch.

    Raises ``P.PipelineError`` if the control file does not exist.
    """
    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    _, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # start from an empty result, everything below appends
    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        motif = os.path.splitext(motiffile)[0]
        marker = IOTools.openFile(outfile, "a")
        marker.write(":: motif = %s ::\n" % motif)
        marker.close()

        statement = (
            " cat %(dbfile)s %(controlfile)s "
            "| %(execglam2scan)s -2 -n %(glam2scan_results)i "
            "n %(motiffile)s - "
            ">> %(outfile)s ")
        P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """Extract the aligned sequence of every interval from a MAF file.

    *infiles* is ``(bed_file, maf_file)``: a file of intervals and a
    gzipped MAF file of pairwise alignments. The MAF file is
    decompressed to a temporary file and handed to
    ``PipelineLncRNA.extractMAFGeneBlocks`` (with ``keep_gaps=False``),
    which writes *outfile*. The number of gene models it reports is
    logged and the temporary file is removed.
    """
    bed_file, maf_file = infiles

    # work on an uncompressed copy of the alignment
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = "gunzip -c %(maf_file)s > %(maf_tmp)s"
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]
    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(
        bed_file,
        maf_tmp,
        genome_file,
        outfile,
        target_genome,
        query_genome,
        keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def buildFilteredLncRNAGeneSet(infile, outfile):
    """Filter the lncRNA gene set by exon status.

    The behaviour depends on ``PARAMS["filtering_remove_single_exon"]``:

    false value
        *infile* is copied to *outfile* unchanged.
    ``"loci"``
        lines are selected on the ``exon_status_locus "s"`` attribute.
    ``"transcripts"``
        lines are selected on the ``exon_status "s"`` attribute.

    Any other value raises :class:`ValueError`.

    Fixes relative to the previous version: the ``grep`` stage is now
    actually piped into ``gzip`` (``gzip`` used to be passed to grep as
    a file name), and the log messages of the two filtering modes are
    no longer swapped.
    """
    # NOTE(review): both filters KEEP the lines matching status "s"
    # (plain grep, no -v). If "s" marks single-exon entries, as the
    # messages suggest, the selection looks inverted - confirm against
    # the tool that writes the exon_status attributes before changing.
    mode = PARAMS["filtering_remove_single_exon"]

    if not mode:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = "cp %(infile)s %(outfile)s"
    elif mode == "loci":
        E.info("Warning: removing loci with only single-exon transcripts")
        statement = ("zcat %(infile)s |"
                     " grep 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif mode == "transcripts":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        statement = ("zcat %(infile)s |"
                     " grep 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unrecognised parameter %s" % mode)

    P.run()
def exportMotifLocations(infiles, outfile):
    """Export motif locations, one bed file per motif.

    For every motif listed in the ``motif_info`` table, the matches with
    a non-NULL start are collected from the table of each file in
    *infiles* (track name = table name without ``_mast``). They are
    written to a temporary bed file and merged with ``mergeBed -nms``
    into ``<exportdir>/motifs/<motif>.bed.gz``, so overlapping matches
    from different tracks end up merged.

    Note that the *outfile* argument is overwritten with the per-motif
    destination inside the loop.
    """
    dbh = connect()
    cc = dbh.cursor()

    motifs = [record[0] for record in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:
        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            query = ("""SELECT contig, start, end, '%(track)s', evalue FROM %(table)s WHERE motif = '%(motif)s' AND start IS NOT NULL"""
                     % {"track": track, "table": table, "motif": motif})
            for match in cc.execute(query):
                fields = [str(value) for value in match]
                tmpf.write("\t".join(fields) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = "mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s"
        P.run()

        os.unlink(tmpfname)
def collectMEMEResults(tmpdir, target_path, outfile):
    """Move the output of a MEME run to its final location.

    The directory *tmpdir* is moved to *target_path* (replacing whatever
    was there before), ``meme.txt`` is copied to *outfile*, and every
    ``.eps`` image in the target directory is converted to ``.png`` with
    ``convert``.
    """
    # make sure the parent of the destination exists
    parent = os.path.dirname(target_path)
    try:
        os.makedirs(parent)
    except OSError:
        # ignore "file exists" exception
        pass

    # replace any previous results
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # convert images to png
    for epsfile in glob.glob(os.path.join(target_path, "*.eps")):
        pngfile = os.path.splitext(epsfile)[0] + ".png"
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()
def runTomTom(infile, outfile):
    """Compare ab-initio motifs against the tomtom databases.

    ``tomtom`` is run on *infile* against the databases listed in
    ``PARAMS["tomtom_databases"]``, writing into a temporary directory.
    That directory is then moved to ``<exportdir>/tomtom/<outfile>``
    (replacing previous results) and its ``tomtom.txt`` is copied to
    *outfile*.

    If *infile* is empty nothing is computed: a warning is logged and
    *outfile* is touched.
    """
    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Request the scratch directory only once we know tomtom will run;
    # requesting it before the empty-input check left an unused
    # directory behind on every early return.
    tmpdir = P.getTempDir(".")

    statement = (
        " tomtom %(tomtom_options)s "
        "-oc %(tmpdir)s %(infile)s %(databases)s "
        "> %(outfile)s.log ")
    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def makeSegments(infile, outfile):
    """Compute segment histograms for a gzipped gene set.

    ``gff2histogram.py --method=values`` is run twice:

    * on the position-sorted entries of *infile*; side files follow the
      pattern ``<outfile>.%s`` and stdout starts *outfile*;
    * on the same entries after merging transcripts per gene; side files
      follow ``<outfile>_genes.%s`` and stdout is appended to *outfile*.
    """
    to_cluster = True

    # per-entry values
    statement = (
        "gunzip < %(infile)s "
        "| %(scriptsdir)s/gff_sort pos "
        "| python %(scriptsdir)s/gff2histogram.py "
        "--method=values "
        '--output-filename-pattern="%(outfile)s.%%s" '
        "--force "
        "--log=%(outfile)s.log "
        "> %(outfile)s ")
    P.run()

    # per-gene values, transcripts merged
    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=position+gene "
        "| python %(scriptsdir)s/gtf2gtf.py --merge-transcripts "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=gene "
        "| python %(scriptsdir)s/gff2histogram.py "
        "--method=values "
        "--force "
        '--output-filename-pattern="%(outfile)s_genes.%%s" '
        "--log=%(outfile)s.log "
        ">> %(outfile)s")
    P.run()
def loadRepeatInformation(infiles, outfile):
    """Load per-contig repeat coverage into the database.

    *infiles* is ``(repeatsfile, indexfile)``. Columns 1 and 4 of
    *indexfile* are turned into one bed interval ``contig 0 <col4>`` per
    line. The gzipped repeats are converted to bed and
    ``coverageBed`` reports their coverage of those intervals. The
    result, with a header line added, is loaded into the table named
    after *outfile* (without ``.load``).
    """
    to_cluster = True
    repeatsfile, indexfile = infiles
    table = outfile[:-len(".load")]

    # bed file with one interval per line of the index
    tmpfilename = P.getTempFilename(".")
    statement = (
        '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' '''
        "< %(indexfile)s > %(tmpfilename)s")
    P.run()

    statement = (
        " gunzip < %(repeatsfile)s "
        "| python %(scriptsdir)s/gff2bed.py -v 0 "
        "| coverageBed -a stdin -b %(tmpfilename)s "
        '''| awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}' '''
        "|python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--table=%(table)s "
        "> %(outfile)s ")
    P.run()

    os.unlink(tmpfilename)
def buildTranscriptLevelReadCounts(infiles, outfile):
    """Count reads per transcript with ``gtf2table.py``.

    *infiles* is ``(bamfile, geneset)``. ``gtf2table.py`` is run with
    ``--reporter=transcripts`` and the counters ``length`` (prefix
    ``exons_``), the read counter (no prefix) and ``read-coverage``
    (prefix ``coverage_``). The read counter is ``readpair-counts`` for
    paired BAM files and ``read-counts`` otherwise. Multi-mapping reads
    are ignored.

    .. note::
       The original author notes that in paired-end data sets each mate
       is counted, so read counts are approximately twice the fragment
       counts.
    """
    bamfile, geneset = infiles

    counter = ('readpair-counts' if BamTools.isPaired(bamfile)
               else 'read-counts')

    statement = (
        " zcat %(geneset)s "
        "| python %(scriptsdir)s/gtf2table.py "
        "--reporter=transcripts "
        "--bam-file=%(bamfile)s "
        '--counter=length --prefix="exons_" '
        '--counter=%(counter)s --prefix="" '
        "--counter=read-coverage --prefix=coverage_ "
        "--min-mapping-quality=%(counting_min_mapping_quality)i "
        "--multi-mapping=ignore "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s ")
    P.run()
def buildPicardAlignStats(infile, outfile):
    """Gather alignment statistics for a BAM file with Picard.

    Runs ``CollectAlignmentSummaryMetrics`` on *infile* and writes the
    metrics to *outfile*. The reference placeholder
    ``%(samtools_genome)s`` is left in the command after the local
    substitution, to be resolved later.
    """
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")

    # first pass: fill in the file names; ``%%`` keeps the reference
    # placeholder intact for the second pass
    template = (
        "CollectAlignmentSummaryMetrics "
        "INPUT=%(infile)s "
        "REFERENCE_SEQUENCE=%%(samtools_genome)s "
        "ASSUME_SORTED=true "
        "OUTPUT=%(outfile)s "
        "VALIDATION_STRINGENCY=SILENT ")
    statement = template % {"infile": infile, "outfile": outfile}
    P.run()
def runBioProspector(infiles, outfile, dbhandle):
    """Run BioProspector for motif discovery on one track.

    The sequences are written by ``writeSequencesForIntervals`` with
    dust masking; ``PARAMS["bioprospector_proportion"]`` selects the
    proportion of intervals used. If no sequence is written,
    BioProspector is skipped and *outfile* is touched. The temporary
    fasta file is always removed at the end.
    """
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    track = outfile[:-len(".bioprospector")]
    tmpfasta = P.getTempFilename(".")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq != 0:
        statement = (
            " BioProspector -i %(tmpfasta)s %(bioprospector_options)s "
            "-o %(outfile)s > %(outfile)s.log ")
        P.run()
    else:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)

    os.unlink(tmpfasta)
def buildGeneLevelReadCounts(infiles, outfile):
    """Compute read counts and read coverage per gene.

    *infiles* is ``(bamfile, exons)``. ``gtf2table.py`` is run with
    ``--reporter=genes`` and the counters ``length`` (prefix
    ``exons_``), the read counter (no prefix) and ``read-coverage``
    (prefix ``coverage_``). The read counter is ``readpair-counts`` for
    paired BAM files and ``read-counts`` otherwise.
    """
    bamfile, exons = infiles

    counter = ('readpair-counts' if BamTools.isPaired(bamfile)
               else 'read-counts')

    # ignore multi-mapping reads
    statement = (
        " zcat %(exons)s "
        "| python %(scriptsdir)s/gtf2table.py "
        "--reporter=genes "
        "--bam-file=%(bamfile)s "
        '--counter=length --prefix="exons_" '
        '--counter=%(counter)s --prefix="" '
        "--counter=read-coverage --prefix=coverage_ "
        "--min-mapping-quality=%(counting_min_mapping_quality)i "
        "--multi-mapping=ignore "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s ")
    P.run()
def renameTranscriptsInPreviousSets(infile, outfile):
    """Sort a previous gene set by gene, renumbering ids where needed.

    Transcripts need to be renamed because they may use the same
    cufflinks identifiers as this analysis does. Sets carrying ENSEMBL
    ids (``ENSG`` in the gene_id) are only stripped of lines containing
    ``#`` and sorted by gene. All other sets additionally get
    their genes renumbered as ``GEN<stem>%i`` and their transcripts as
    ``TRAN<stem>%i``, where ``<stem>`` is *outfile* without ``.gtf.gz``.

    Which case applies is decided by the LAST record of *infile*, as
    before.

    Raises :class:`ValueError` if *infile* contains no GTF records
    (previously no command was defined in that case).
    """
    # NOTE(review): letting the last record decide mirrors the previous
    # loop, which rebuilt the command for every record. Presumably a set
    # is either entirely ENSEMBL or not - confirm.
    has_ensembl_id = None
    inf = IOTools.openFile(infile)
    try:
        for gtf in GTF.iterator(inf):
            has_ensembl_id = gtf.gene_id.find("ENSG") != -1
    finally:
        inf.close()

    if has_ensembl_id is None:
        raise ValueError("no GTF records found in %s" % infile)

    if has_ensembl_id:
        statement = (
            "zcat %(infile)s "
            '| grep -v "#" '
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--sort=gene "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s")
    else:
        gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
        transcript_pattern = gene_pattern.replace("GEN", "TRAN")
        statement = (
            "zcat %(infile)s "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--renumber-genes=%(gene_pattern)s%%i "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--renumber-transcripts=%(transcript_pattern)s%%i "
            "| python %(scriptsdir)s/gtf2gtf.py "
            "--sort=gene "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s")

    P.run()
def runSpades(infile, outfile):
    """Assemble one track with SPAdes.

    The command line is produced by
    ``PipelineMetagenomeAssembly.Spades().build`` for *infile*; the job
    requests 30G of free memory.
    """
    job_options = " -l mem_free=30G"
    assembler = PipelineMetagenomeAssembly.Spades()
    statement = assembler.build(infile)
    P.run()
def loadLncRNAClass(infile, outfile):
    """Load the lncRNA classification of every transcript.

    For each transcript in the GTF file *infile*, the transcript_id,
    gene_id and source field of its first entry are written to a
    temporary table, which ``csv2db.py`` loads with the column names
    ``transcript_id``, ``gene_id`` and ``class``. The table name is
    derived from *infile* (without ``.gtf.gz``).

    The input handle is closed after reading and the temporary table is
    removed once loading has finished or failed (previously it was left
    behind).
    """
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))
    to_cluster = False

    # just load each transcript with its classification
    temp = P.getTempFile()
    inf_1 = temp.name
    try:
        inf = IOTools.openFile(infile)
        try:
            for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
                first_entry = transcript[0]
                temp.write("%s\t%s\t%s\n" % (
                    first_entry.transcript_id,
                    first_entry.gene_id,
                    first_entry.source))
        finally:
            inf.close()
        # flush to disk before the shell command reads the file
        temp.close()

        statement = ("python %(scriptsdir)s/csv2db.py"
                     " -t %(tablename)s"
                     " --log=%(outfile)s.log"
                     " --header=transcript_id,gene_id,class"
                     " < %(inf_1)s > %(outfile)s")
        P.run()
    finally:
        if not temp.closed:
            temp.close()
        if os.path.exists(inf_1):
            os.unlink(inf_1)
def runSoapdenovo(infile, outfile):
    """Assemble one track with SOAPdenovo.

    The command line is produced by
    ``PipelineMetagenomeAssembly.SoapDenovo2().build`` for *infile*; the
    job requests 30G of free memory.
    """
    job_options = "-l mem_free=30G"
    assembler = PipelineMetagenomeAssembly.SoapDenovo2()
    statement = assembler.build(infile)
    P.run()
def runIdba(infile, outfile):
    """Assemble one track with IDBA.

    The command line is produced by
    ``PipelineMetagenomeAssembly.Idba().build`` for *infile*; the job
    requests 30G of free memory.
    """
    job_options = " -l mem_free=30G"
    assembler = PipelineMetagenomeAssembly.Idba()
    statement = assembler.build(infile)
    P.run()
def buildAnnotations(infiles, outfile):
    """Annotate transcripts by location (intergenic, intronic, ...).

    *infiles* is ``(infile, annotation)``. The gzipped gene set is sorted
    by gene and split on its first column by ``%(cmd-farm)s`` (at most
    60 files); each part is processed by ``gtf2table.py`` with the
    position, classifier, length, splice, composition-na and
    splice-comparison counters against the gff file *annotation*. The
    combined table is gzip-compressed into *outfile*.
    """
    infile, annotation = infiles

    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=gene "
        "| %(cmd-farm)s --split-at-column=1 --output-header "
        "--log=%(outfile)s.log --max-files=60 "
        '"python %(scriptsdir)s/gtf2table.py '
        "--counter=position "
        "--counter=classifier "
        "--section=exons "
        "--section=introns "
        "--counter=length "
        "--counter=splice "
        "--counter=composition-na "
        "--counter=splice-comparison "
        "--log=%(outfile)s.log "
        "--filename-format=gff "
        "--filename-gff=%(annotation)s "
        '--genome-file=%(genome_dir)s/%(genome)s" '
        "| gzip > %(outfile)s ")
    P.run()
def buildBenchmarkInput(infile, outfile):
    """Build benchmark input from the CDS sequences selected by *infile*.

    The distinct ``(transcript_id, protein_id)`` pairs of the
    ``peptide_info`` table are written to a temporary two-column map.
    ``extract_fasta.pl`` then pulls sequences out of ``cds.fasta``, they
    are passed through ``fasta2variants.py --is-cds`` and finally
    ``substitute_tokens.py`` applies the map; the result is *outfile*.

    Fixes relative to the previous version:

    * the ``%(scripstdir)s`` placeholder typo is corrected to
      ``%(scriptsdir)s``;
    * the missing ``|`` between ``extract_fasta.pl`` and
      ``fasta2variants.py`` is added (the second command used to be
      passed to perl as extra arguments);
    * the map file is closed, and therefore flushed, before the shell
      command reads it, and it is removed even if the command fails;
    * the database connection is closed.
    """
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        cc.execute(
            ''' SELECT DISTINCT transcript_id, protein_id FROM peptide_info ''')
        rows = cc.fetchall()
    finally:
        dbhandle.close()

    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name
    try:
        tmpfile.write("transcript_id\tprotein_id\n")
        tmpfile.write("\n".join(["\t".join(x) for x in rows]))
        tmpfile.write("\n")
        tmpfile.close()

        statement = (
            " perl %(scriptsdir)s/extract_fasta.pl %(infile)s < cds.fasta "
            "| python %(scriptsdir)s/fasta2variants.py --is-cds "
            "| python %(scriptsdir)s/substitute_tokens.py "
            "--apply=%(tmpfilename)s "
            "> %(outfile)s ")
        P.run()
    finally:
        if not tmpfile.closed:
            tmpfile.close()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def buildFullGeneSet(infiles, outfile):
    """Concatenate gene sets into one final, gene-sorted gene set.

    Per the original author the result is meant for differential
    expression analysis and for comparing protein coding with lncRNA
    transcripts. Every occurrence of ``Cufflinks`` is replaced by
    ``protein_coding`` so the source agrees with the transcript
    classification.
    """
    # all inputs go to a single zcat call
    infs = " ".join(infiles)

    statement = (
        "zcat %(infs)s "
        "| sed 's/Cufflinks/protein_coding/g' "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=gene "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s")
    P.run()
def lowerStringencyDeNovos(infiles, outfile):
    """Select lower-stringency de novo variants with a JEXL expression.

    *infiles* is ``(infile, pedfile)``. *pedfile* is a tab-separated
    pedigree with the columns family, sample, father, mother, sex and
    status. The last row with status ``2`` supplies the child, father
    and mother sample names. GATK ``SelectVariants`` then keeps variants
    where the child's PL for genotype 1 is 0, both parents' PL for
    genotype 0 is 0 and the SnpEff impact is HIGH or MODERATE.
    """
    to_cluster = USECLUSTER
    infile, pedfile = infiles

    with open(pedfile) as ped_handle:
        pedigree = csv.DictReader(
            ped_handle,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother',
                        'sex', 'status'])
        for row in pedigree:
            if row['status'] != '2':
                continue
            father = row['father']
            mother = row['mother']
            child = row['sample']

    select = "&&".join([
        'vc.getGenotype("%(child)s").getPL().1==0',
        'vc.getGenotype("%(father)s").getPL().0==0',
        'vc.getGenotype("%(mother)s").getPL().0==0',
        '(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")',
    ])
    # ``%%`` keeps the genome placeholders for the later substitution
    statement = (
        "GenomeAnalysisTK -T SelectVariants "
        "-R %%(bwa_index_dir)s/%%(genome)s.fa "
        "--variant %(infile)s "
        "-select '" + select + "' "
        "> %(outfile)s") % locals()
    P.run()
def alignContigsToReference(infile, outfile, param):
    """Align contigs to a reference genome with nucmer.

    *infile* is the reference and *param* the contig file. ``nucmer``
    writes its output under the prefix taken from the basename of
    *outfile* (without ``.delta``); the file named like the basename of
    *outfile* is then moved into ``alignment.dir``.
    """
    print("%s %s" % (infile, param))

    to_cluster = True
    reffile, contigfile = infile, param

    outf = os.path.basename(outfile)
    pattern = P.snip(outf, ".delta")

    statement = "nucmer -p %(pattern)s %(reffile)s %(contigfile)s"
    P.run()

    statement = "mv %(outf)s alignment.dir"
    P.run()
def filterVariants(infiles, outfile):
    """Select candidate de novo variants with a JEXL expression.

    *infiles* is ``(infile, pedfile)``. *pedfile* is a tab-separated
    pedigree with the columns family, sample, father, mother, sex and
    status. The last row with status ``2`` supplies the child, father
    and mother sample names that are inserted into the GATK
    ``SelectVariants`` expression (depth, allele balance, PL and AD
    thresholds for the trio).

    Raises :class:`ValueError` if the pedigree has no row with status
    ``2``.

    Fix relative to the previous version: the pedigree was opened as
    ``open("%(pedfile)s")``, i.e. a file literally named
    ``%(pedfile)s``; it is now opened from *pedfile* and closed after
    reading.
    """
    to_cluster = USECLUSTER
    infile, pedfile = infiles

    father = mother = child = None
    with open(pedfile) as ped_handle:
        pedigree = csv.DictReader(
            ped_handle,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother',
                        'sex', 'status'])
        for row in pedigree:
            if row['status'] == '2':
                father = row['father']
                mother = row['mother']
                child = row['sample']

    if child is None:
        raise ValueError(
            "no sample with status 2 found in pedigree %s" % pedfile)

    select = "&&".join([
        'vc.getGenotype("%(father)s").getDP()>=10',
        'vc.getGenotype("%(mother)s").getDP()>=10',
        'vc.getGenotype("%(father)s").getAB()<0.05',
        'vc.getGenotype("%(mother)s").getAB()<0.05',
        'vc.getGenotype("%(child)s").getAB()>=0.25',
        'vc.getGenotype("%(child)s").getPL().0>20',
        'vc.getGenotype("%(child)s").getPL().1==0',
        'vc.getGenotype("%(child)s").getPL().2>0',
        'vc.getGenotype("%(father)s").getPL().0==0',
        'vc.getGenotype("%(father)s").getPL().1>20',
        'vc.getGenotype("%(father)s").getPL().2>20',
        'vc.getGenotype("%(mother)s").getPL().0==0',
        'vc.getGenotype("%(mother)s").getPL().1>20',
        'vc.getGenotype("%(mother)s").getPL().2>20',
        'vc.getGenotype("%(child)s").getAD().1>=3',
    ])
    # ``%%`` keeps the genome placeholders for the later substitution
    statement = (
        "GenomeAnalysisTK -T SelectVariants "
        "-R %%(bwa_index_dir)s/%%(genome)s.fa "
        "--variant %(infile)s "
        "-select '" + select + "' "
        "> %(outfile)s") % locals()
    P.run()
def loadOverlap(infile, outfile):
    """Load the results of an overlap computation into the database.

    Lines of *infile* containing the word ``na`` are dropped; the rest
    is loaded with ``csv2db.py`` (columns ``set1`` and ``set2`` as
    indexed strings) into the table named after *outfile* without
    ``_table.load``.
    """
    tablename = outfile[:-len("_table.load")]

    statement = (
        ' grep -v "\\bna\\b" < %(infile)s '
        "|python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--map set1:str "
        "--map set2:str "
        "--index=set1 "
        "--index=set2 "
        "--table=%(tablename)s "
        "> %(outfile)s ")
    P.run()
def runFrameFinder(infile, outfile):
    """Run FrameFinder on *infile*, searching both strands.

    ``-r TRUE`` enables the reverse strand; the original author notes
    that the CPC default is forward strand only. The binary and its
    model are taken from the CPC installation under
    ``/ifs/apps/bio/cpc-0.9-r2``. Output is gzip-compressed into
    *outfile*.
    """
    cpc_dir = "/ifs/apps/bio/cpc-0.9-r2"

    statement = (
        " cat %(infile)s "
        "| %(cpc_dir)s/libs/estate/bin/framefinder -r TRUE "
        "-w %(cpc_dir)s/data/framefinder.model /dev/stdin "
        "| gzip > %(outfile)s ")
    P.run()
def buildCodingExons(infile, outfile):
    """Build the set of exons of protein-coding entries.

    From the gzipped gene set *infile*, lines whose second column is
    ``protein_coding`` and whose third column is ``exon`` are kept, and
    ``gtf2gtf.py --remove-duplicates=gene`` is applied. The result is
    gzip-compressed into *outfile*.
    """
    to_cluster = True

    statement = (
        " gunzip < %(infile)s "
        "| awk '$2 == \"protein_coding\"' "
        "| awk '$3 == \"exon\"' "
        "| python %(scriptsdir)s/gtf2gtf.py --remove-duplicates=gene "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s ")
    P.run()
def makeDistances(infiles, outfile):
    """Compute distances of transcripts to annotated genes.

    *infiles* is ``(infile, annotation)``. The gzipped gene set is sorted
    by gene and split on its first column by ``%(cmd-farm)s`` (at most
    60 files); each part is processed by
    ``gtf2table.py --counter=distance-genes`` against the gunzipped
    *annotation*. The combined table is written to *outfile*.
    """
    infile, annotation = infiles

    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=gene "
        "| %(cmd-farm)s --split-at-column=1 --output-header "
        "--log=%(outfile)s.log --max-files=60 "
        '"python %(scriptsdir)s/gtf2table.py '
        "--counter=distance-genes "
        "--log=%(outfile)s.log "
        '--filename-gff=<( gunzip < %(annotation)s ) " '
        "> %(outfile)s ")
    P.run()
def buildCodingGeneSet(infile, outfile):
    """Build a gene set restricted to protein-coding entries.

    Lines of the gzipped *infile* whose second column equals
    ``protein_coding`` are kept. Per the original author this selects
    by gene biotype, so all transcripts of protein coding genes are
    included (also processed transcripts), with both UTR and CDS.
    """
    to_cluster = True

    statement = (
        " zcat %(infile)s "
        "| awk '$2 == \"protein_coding\"' "
        "| gzip > %(outfile)s ")
    P.run()
def convertBed2Psl(infile, outfile):
    """Convert a gzipped bed file to a gzipped psl file.

    The genome is looked up as ``PARAMS["<track>_genome"]`` inside
    ``PARAMS["genome_dir"]``, where ``<track>`` is *outfile* with its
    last seven characters (the length of ``.bed.gz``) removed.

    Raises :class:`IOError` if ``<genome>.fasta`` does not exist.
    """
    track = outfile[:-len(".bed.gz")]
    genome_name = PARAMS["%s_genome" % track]
    genomefile = os.path.join(PARAMS["genome_dir"], genome_name)

    fasta_present = os.path.exists(genomefile + ".fasta")
    if not fasta_present:
        raise IOError("genome %s does not exist" % genomefile)

    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/bed2psl.py "
        "--genome=%(genomefile)s "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s ")
    P.run()
def buildAnnotatorSegments(tmpdir, infile, outfile):
    """Convert bed segments to annotator format.

    *infile* is converted with ``bed2gff.py`` and ``gff2annotator.py
    --section=segments``; the result is written to ``<tmpdir>/segments``
    (the log goes to ``<outfile>.log``).

    Returns the path of the segments file.
    """
    to_cluster = True
    tmpsegments = os.path.join(tmpdir, "segments")

    statement = ''' python %(scriptsdir)s/bed2gff.py < %(infile)s |\
    python %(scriptsdir)s/gff2annotator.py --log=%(outfile)s.log --section=segments > %(tmpsegments)s \
    '''
    # Locals and PARAMS are merged into one keyword set; PARAMS wins on
    # a name clash because it comes last.
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    return tmpsegments
def mergeDMRWindows(infile, outfile):
    """Merge overlapping windows with ``medip_merge_intervals.py``.

    The script reads the gzipped *infile* and is run with ``--invert``.
    Its stdout is gzip-compressed into *outfile*; additional files
    follow the pattern ``<outfile>.%s.bed.gz``.
    """
    to_cluster = True

    statement = (
        " zcat %(infile)s "
        "| python %(scriptsdir)s/medip_merge_intervals.py "
        "--log=%(outfile)s.log "
        "--invert "
        "--output-filename-pattern=%(outfile)s.%%s.bed.gz "
        "| gzip > %(outfile)s ")
    P.run()
def reportTotalRNAFunctions(infiles, outfiles):
    """Report total RNA functions.

    *infiles* is ``(rpkm_filename, annotations_filename)`` and
    *outfiles* is ``(expression_filename, diff_filename)``; all four
    names are passed, in that order, to
    ``report_totalRNA_annotations.py``.
    """
    to_cluster = USECLUSTER

    expression_filename, diff_filename = outfiles
    rpkm_filename, annotations_filename = infiles

    statement = (
        " python %(rmaadir)s/report_totalRNA_annotations.py "
        "%(rpkm_filename)s %(annotations_filename)s "
        "%(expression_filename)s %(diff_filename)s ")
    P.run()
def copyEnsemblDb(infile, outfile):
    """Copy tables from the ENSEMBL database into the pipeline database.

    The database ``PARAMS["ensembl_db"]`` is attached as ``ensembl`` to
    ``PARAMS["database"]`` and every table listed in
    ``PARAMS["ensembl_tables"]`` is recreated with
    ``CREATE TABLE ... AS SELECT``. Each query is printed before it is
    executed. *outfile* is touched at the end.
    """
    table_list = P.asList(PARAMS["ensembl_tables"])

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    query = """ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"]
    cc.execute(query)

    copy_template = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;"""
    for table in table_list:
        cc = dbhandle.cursor()
        query = copy_template % (table, table)
        print(query)
        cc.execute(query)
        cc.close()

    statement = "touch %(outfile)s;"
    P.run()
def buildGeneTables(infile, outfile):
    """Build a gzipped gene table from *infile*.

    ``*.gff.gz`` input
        the gff lines are copied unchanged into *outfile* behind a
        header line naming the nine gff columns.
    any other input
        the file is passed through ``fasta2table.py -s sequence`` and
        the result is gzip-compressed into *outfile*.

    The gff branch streams line by line instead of loading the whole
    file, closes both handles even on error, and writes bytes so that it
    behaves the same on Python 2 and Python 3.
    """
    if infile.endswith(".gff.gz"):
        header = (b"chr\tsource\tfeature\tstart\tend\tscore\t"
                  b"strand\tframe\tattributes\n")
        with gzip.open(infile, "rb") as inf:
            with gzip.open(outfile, "wb") as outf:
                outf.write(header)
                for line in inf:
                    outf.write(line)
    else:
        statement = (
            "zcat %(infile)s "
            "| python %(scriptsdir)s/fasta2table.py -s sequence "
            "--log=%(outfile)s.log "
            "| gzip > %(outfile)s")
        P.run()
def runMACS(infile, outfile):
    """Call peaks with MACS on *infile* against its control.

    The control BAM file is ``pipeline_vitaminD.getControl(track) +
    ".bam"``, where ``track`` is *infile* without its last seven
    characters (the length of ``normbam``). If ``getControl`` raises
    :class:`AssertionError` the function returns without running MACS.
    MACS output (stdout and stderr) is redirected to *outfile*.
    """
    to_cluster = False

    track = infile[:-len("normbam")]
    try:
        control = pipeline_vitaminD.getControl(track) + ".bam"
    except AssertionError:
        # no control for this track - nothing to do
        return

    statement = ''' macs -t %(infile)s -c %(control)s \
    --name=%(outfile)s \
    --format=bam --tsize=35 --bw=110 --mfold=8 --gsize=6000000 >& %(outfile)s'''
    # Locals and PARAMS are merged into one keyword set; PARAMS wins on
    # a name clash because it comes last.
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))
def assignEssentialGenesToContigs(infile, outfile):
    """Assign essential genes to contigs with ``hmmsearch``.

    The gzipped sequences in *infile* are unpacked into a temporary
    directory and searched with the HMMs in ``PARAMS["hmmer_hmm"]``
    (``--cut_tc``). From the tabular hits (after the first three lines)
    runs of spaces are squeezed and fields 1 and 4 are kept,
    gzip-compressed into *outfile*. The temporary directory is removed
    afterwards.
    """
    dirname = os.path.dirname(infile)
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    statement = (
        "zcat %(infile)s > %(tempdir)s/orfs.fa; "
        "hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc --notextw "
        "%(essential)s %(tempdir)s/orfs.fa; "
        "tail -n+4 %(tempdir)s/hmm.out "
        "| sed 's/ * / /g' "
        '| cut -f 1,4 -d " " '
        "| gzip > %(outfile)s")
    P.run()

    # clean up
    statement = "rm -rf %(tempdir)s"
    P.run()
def buildGenomeAlignment(infile, outfile):
    """Remove non-unique alignments from a gzipped psl file.

    The alignments are sorted by columns 10/12 and filtered with
    ``psl2psl.py --method=remove-overlapping-query``, then sorted by
    columns 14/16 and filtered with ``--method=remove-overlapping-target``.
    The gzip-compressed result is appended (``>>``) to *outfile*.
    """
    statement = (
        "gunzip < %(infile)s "
        "| sort -k10,10 -k12,12n "
        "| python %(scriptsdir)s/psl2psl.py "
        "--method=remove-overlapping-query "
        "--log=%(outfile)s.log "
        "| sort -k14,14 -k16,16n "
        "| python %(scriptsdir)s/psl2psl.py "
        "--method=remove-overlapping-target "
        "--log=%(outfile)s.log "
        "| gzip >> %(outfile)s ")
    P.run()
def loadPolyphenMap(infile, outfile):
    """Load the polyphen input map into the database.

    ``<infile>.map`` is loaded with ``csv2db.py`` into the table derived
    from *outfile*, indexed on snp_id, (track, transcript_id),
    (contig, pos), protein_id and transcript_id.
    """
    table = P.toTable(outfile)

    statement = (
        " python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--index=snp_id "
        "--index=track,transcript_id "
        "--index=contig,pos "
        "--index=protein_id "
        "--index=transcript_id "
        "--table=%(table)s "
        "< %(infile)s.map "
        "> %(outfile)s ")
    P.run()
def createRealignIntervals(infiles, outfile):
    """Create realignment target intervals with GATK.

    *infiles* is ``(infile, reference)``. ``RealignerTargetCreator`` is
    run from the GATK 2.7-2 jar on *infile* against *reference* and
    writes *outfile*.
    """
    infile, reference = infiles

    # need to unload java before running GATK as it now runs on java
    # version 7
    template = (
        "module unload apps/java/jre1.6.0_26; "
        "java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar "
        "-T RealignerTargetCreator "
        "-R %(reference)s "
        "-I %(infile)s "
        "-o %(outfile)s ")
    statement = template % {"reference": reference,
                            "infile": infile,
                            "outfile": outfile}
    P.run()
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    """Map short reads against transcriptome data with bowtie.

    *infiles* is ``(infile, reffile, contigs)``. The gzipped reads are
    unpacked to a temporary file and mapped in colour space (``-C``)
    against the index ``<reffile without .fa>_cs``. The SAM output is
    passed through ``bam2bam.py --set-nh``, its ``@HD`` sort-order tag
    is rewritten, and it is converted, sorted and indexed with samtools
    into *outfile* (``<track>.bam``).

    Fix relative to the previous version: a ``;`` was missing after
    ``samtools index %(outfile)s``, so the following ``checkpoint`` was
    passed to samtools as an extra argument.
    """
    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # NOTE(review): "\b" in the replacement half of the perl s///
    # is not a word boundary there - confirm the intended header text.
    statement = (
        " gunzip < %(infile)s > %(tmpfile)s; "
        "checkpoint; "
        "bowtie -q --sam -C --un /dev/null "
        "--threads %(bowtie_threads)s "
        "%(transcriptome_options)s "
        "--best --strata -a "
        "%(prefix)s_cs %(tmpfile)s "
        "| python %(scriptsdir)s/bam2bam.py --sam --set-nh "
        "--log=%(outfile)s.log "
        '| perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/\\bSO:coordinate/}" '
        "| samtools import %(contigs)s - - "
        "| samtools sort - %(track)s; "
        "checkpoint; "
        "samtools index %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmpfile)s ")
    P.run()
def buildCDNAFasta(infile, outfile):
    """Index an ENSEMBL cdna FASTA file.

    *infile* is a gzipped ENSEMBL cdna file. Header lines are truncated
    at the first space and the sequences are indexed with
    ``index_fasta.py`` under the name *outfile* without ``.fasta``; the
    script's output goes to ``<name>.log``.

    Fix relative to the previous version: the perl filter tested
    ``if ("^>")``, a non-empty string that is always true, so the
    truncation was applied to every line. It now matches header lines
    with ``/^>/``.
    """
    dbname = outfile[:-len(".fasta")]

    statement = (
        "gunzip < %(infile)s "
        "| perl -p -e 'if (/^>/) { s/ .*//};' "
        "| python %(scriptsdir)s/index_fasta.py --force %(dbname)s - "
        "> %(dbname)s.log ")
    P.run()
def buildTileStats(infile, outfile):
    """Compute window size statistics from a gzipped bed file.

    ``gff2histogram.py`` is run in bed mode on the interval sizes with
    the ``hist`` and ``stats`` methods. Its stdout goes to *outfile*;
    additional files follow the pattern ``<outfile>.%s.tsv``.
    """
    # NOTE(review): the other tasks set ``to_cluster``; ``use_cluster``
    # may be a typo that has no effect - confirm before renaming.
    use_cluster = True

    statement = (
        " zcat %(infile)s "
        "| python %(scriptsdir)s/gff2histogram.py "
        "--force "
        "--format=bed "
        "--data=size "
        "--method=hist "
        "--method=stats "
        "--output-filename-pattern=%(outfile)s.%%s.tsv "
        "> %(outfile)s ")
    P.run()
def exportSequences(infile, outfile):
    """Collect the sequences of a gzipped gtf file into an indexed fasta.

    The entries are sorted by gene, their sequences are extracted from
    the genome with ``gff2fasta.py --is-gtf`` and indexed with
    ``index_fasta.py`` under the name *outfile* without ``.fasta``. The
    indexer's output goes to ``<outfile>.log``.
    """
    to_cluster = True
    prefix = outfile[:-len(".fasta")]

    statement = (
        "gunzip < %(infile)s "
        "| python %(scriptsdir)s/gtf2gtf.py --sort=gene "
        "| python %(scriptsdir)s/gff2fasta.py "
        "--is-gtf "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(outfile)s.log "
        "| python %(toolsdir)s/index_fasta.py --force %(prefix)s - "
        "> %(outfile)s.log")
    P.run()
def buildPeptideFasta(infile, outfile):
    """Index an ENSEMBL peptide FASTA file.

    *infile* is an ENSEMBL ``.pep.all.fa.gz`` file. Header lines are
    truncated at the first space and the sequences are indexed with
    ``index_fasta.py`` under the name *outfile* without ``.fasta``; the
    script's output goes to ``<name>.log``.

    Fix relative to the previous version: the perl filter tested
    ``if ("^>")``, a non-empty string that is always true, so the
    truncation was applied to every line. It now matches header lines
    with ``/^>/``.
    """
    dbname = outfile[:-len(".fasta")]

    statement = (
        "gunzip < %(infile)s "
        "| perl -p -e 'if (/^>/) { s/ .*//};' "
        "| python %(scriptsdir)s/index_fasta.py --force %(dbname)s - "
        "> %(dbname)s.log ")
    P.run()
def annotateVariantsSNPsift(infile, outfile):
    """Add annotations to a VCF file using SnpSift.

    Two steps are chained: ``SnpSift.sh dbnsfp`` with the database
    ``PARAMS["annotation_snpsift_dbnsfp"]`` writes
    ``variants/<track>_temp1.vcf``, and ``SnpSift.sh annotate`` with the
    1000 Genomes ``00-All.vcf`` writes *outfile*. ``<track>`` is the
    basename of *infile* without ``.vqsr.vcf``.
    """
    to_cluster = USECLUSTER
    job_options = "-pe dedicated 4 -R y -l mem_free=6G"
    track = P.snip(os.path.basename(infile), ".vqsr.vcf")
    dbNSFP = PARAMS["annotation_snpsift_dbnsfp"]

    # The following statement is not fully implemented yet
    # statement = '''SnpSift.sh geneSets -v /ifs/projects/proj016/data/1000Genomes/msigdb.v4.0.symbols.gmt %(infile)s > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    steps = [
        "SnpSift.sh dbnsfp -v %(dbNSFP)s %(infile)s "
        "> variants/%(track)s_temp1.vcf; checkpoint;",
        "SnpSift.sh annotate "
        "/ifs/projects/proj016/data/1000Genomes/00-All.vcf "
        "variants/%(track)s_temp1.vcf > %(outfile)s ;",
    ]
    substitutions = {"dbNSFP": dbNSFP, "infile": infile,
                     "track": track, "outfile": outfile}
    statement = "".join(step % substitutions for step in steps)

    # statement += '''rm -f variants/*temp*vcf;'''
    P.run()
def loadPolyphen(infile, outfile):
    """Load polyphen results into the database.

    The gzipped *infile* is cleaned up with perl (``o_acc`` is renamed
    to ``protein_id``, runs of spaces are removed, a leading ``#`` is
    stripped) and loaded with ``csv2db.py`` into the table derived from
    *outfile*, indexed on snp_id and protein_id.
    """
    table = P.toTable(outfile)

    statement = (
        " gunzip < %(infile)s "
        '| perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;" '
        "|python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--index=snp_id "
        "--index=protein_id "
        "--table=%(table)s "
        "--map=effect:str "
        "> %(outfile)s ")
    P.run()
def mergeGeneLists(infiles, outfile):
    """Merge gene lists into a single table in the SQLite database.

    The annotation databases in ``PARAMS["annotations_db"]`` are
    attached under the matching names from ``PARAMS["species"]``. For
    every file in *infiles* the protein-coding genes of its
    ``<track>_genelist`` table are selected via the species'
    ``transcript_info`` table (the species is the first two characters
    of the track name). The selections are combined with ``UNION`` into
    the table derived from *outfile*, which is dropped first if present
    and indexed on gene_id and species afterwards. All SQL is printed
    before execution. *outfile* is touched at the end.
    """
    tablename = P.toTable(outfile)

    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_lookup:
        species_db = species_lookup[species]
        cc = dbhandle.cursor()
        statement = ("ATTACH DATABASE '%(species_db)s' as %(species)s"
                     % locals())
        print(statement)
        cc.execute(statement)
        cc.close()

    # Build union statement: one SELECT per gene list
    selects = []
    for f in infiles:
        track = P.snip(os.path.basename(f), ".genelist.load")
        track = track.replace("-", "_").replace(".", "_")
        species = track[:2]
        genelist_id = PARAMS["genelist_id"]
        selects.append(
            '''SELECT distinct t.gene_id, t.gene_name, "%(species)s" AS species FROM %(track)s_genelist g, %(species)s.transcript_info t WHERE g.gene_id=t.%(genelist_id)s and t.gene_biotype='protein_coding' '''
            % locals())

    if selects:
        statement = ("CREATE TABLE %s AS " % tablename +
                     " UNION ".join(selects))
    else:
        statement = ""
    print(statement)

    cc = dbhandle.cursor()
    cc.execute("DROP TABLE IF EXISTS %(tablename)s" % locals())
    cc.execute(statement)
    for index_name, column in (("glm_idx1", "gene_id"),
                               ("glm_idx2", "species")):
        cc.execute('CREATE INDEX "%s" ON "%s" ("%s" ASC) ' %
                   (index_name, tablename, column))
    cc.close()

    statement = "touch %s" % outfile
    P.run()
def loadRepeatsRates(infile, outfile):
    """Load repeat overlap into the database.

    Lines of the gzipped *infile* whose fourth column is greater than 0
    are kept, the columns ``exons_lengths`` and ``exons_values`` are
    removed, and the rest is loaded with ``csv2db.py`` into the table
    named after *outfile* without ``.load``.
    """
    table = outfile[:-len(".load")]

    statement = (
        "gunzip < %(infile)s "
        "| awk '$4 > 0' "
        "| python %(toolsdir)s/csv_cut.py "
        "--remove exons_lengths exons_values "
        "|python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
        "--index=gene_id "
        "--map=gene_id:str "
        "--table=%(table)s "
        "--allow-empty "
        "> %(outfile)s")
    P.run()
def loadSegments(infile, outfile):
    """Load the segment tables into the database.

    For each of the six files ``<infile><x>`` (``.distances``,
    ``.sizes``, ``.overlaps`` and the ``_genes`` variants of these),
    ``csv2db.py`` loads the table ``<table><y>``, where ``<table>`` is
    *outfile* without ``.load`` and ``<y>`` is ``<x>`` with dots
    replaced by underscores. The csv2db output is appended to
    *outfile*.
    """
    table = outfile[:-len(".load")]

    suffixes = (".distances", ".sizes", ".overlaps",
                "_genes.distances", "_genes.sizes", "_genes.overlaps")
    for x in suffixes:
        # table names must not contain dots
        y = x.replace(".", "_")

        statement = (
            " python %(scriptsdir)s/csv2db.py %(csv2db_options)s "
            "--index=gene_id "
            "--map=gene_id:str "
            "--table=%(table)s%(y)s "
            "< %(infile)s%(x)s "
            ">> %(outfile)s")
        P.run()