def buildUniformityOfCoverage(infiles, outfile):
    '''Build a matrix of read coverage over contigs.

    ``infiles[0]`` is a BAM file; every further entry of *infiles* is an
    iterable of ``*.lengths.tsv`` files.  The lengths file whose name,
    minus that suffix, equals the BAM track name is converted into a BED
    file of whole-contig intervals (its first line is skipped), and
    ``bam2peakshape.py`` is run on the mapped reads over these intervals.

    :raises ValueError: if no lengths file matches the track.

    ``P.run()`` is called without arguments, so ``statement`` and the
    names interpolated into it are presumably read from this function's
    local variables - keep those names stable.
    '''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")

    # Find the contig lengths before doing any work; previously a missing
    # file left `length_file` undefined. The last match wins, as before.
    length_file = None
    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf
    if length_file is None:
        raise ValueError(
            "no .lengths.tsv file found for track '%s'" % track)

    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"

    try:
        # filter for mapped reads
        statement = (
            '''cat %(bam)s | python %(scriptsdir)s/bam2bam.py '''
            '''--filter=mapped --log=/dev/null > %(tmp_bam)s ; '''
            '''samtools index %(tmp_bam)s''')
        P.run()

        # one BED interval per contig, spanning its full length
        statement = '''cat %(length_file)s | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}' > %(tmp_bed)s'''
        P.run()

        statement = (
            '''python %(scriptsdir)s/bam2peakshape.py '''
            '''--only-interval %(tmp_bam)s %(tmp_bed)s '''
            '''--log=%(outfile)s.log '''
            '''--output-filename-pattern=%(track)s.%%s''')
        P.run()
    finally:
        # Remove temporary files even on failure, including the index
        # written by `samtools index`, which used to be left behind.
        for fn in (tmp_bed, tmp_bam, tmp_bam + ".bai"):
            if os.path.exists(fn):
                os.unlink(fn)
def createMAFAlignment(infiles, outfile):
    """Build a filtered, gzipped MAF alignment from a directory of axt files.

    All ``*net.axt.gz`` files in ``PARAMS["phyloCSF_location_axt"]`` whose
    name does not match the regular expression ``PARAMS["phyloCSF_ignore"]``
    are concatenated and converted into a single MAF file with
    ``axtToMaf``.  The result is filtered with ``PipelineLncRNA.filterMAF``
    using ``PARAMS["phyloCSF_filter_alignments"]``; kept blocks go to
    *outfile*, the others to ``<outfile stem>_removed.maf``.  Both files are
    gzipped at the end.

    *infiles* is not used.

    :raises ValueError: if no axt file is selected (``zcat`` without
        arguments would otherwise wait on stdin).

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = sorted(
        os.path.join(axt_dir, axt_file)
        for axt_file in os.listdir(axt_dir)
        if axt_file.endswith("net.axt.gz") and
        not to_ignore.search(axt_file))
    if not axt_files:
        raise ValueError(
            "no *net.axt.gz files selected in directory '%s'" % axt_dir)
    axt_files = " ".join(axt_files)
    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])
    removed = P.snip(outfile, ".maf") + "_removed.maf"

    # all statements of this task run locally
    to_cluster = False

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    try:
        # concatenate the axt files, then convert them to MAF
        statement = ("zcat %(axt_files)s"
                     " > %(tmpf1)s;"
                     " axtToMaf "
                     " -tPrefix=%(target_genome)s."
                     " -qPrefix=%(query_genome)s."
                     " %(tmpf1)s"
                     " %(target_contigs)s"
                     " %(query_contigs)s"
                     " %(tmpf2)s")
        P.run()

        E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
        E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

        filtered = PipelineLncRNA.filterMAF(
            tmpf2, outfile, removed, PARAMS["phyloCSF_filter_alignments"])
        E.info("%s blocks were ignored in MAF alignment"
               " because length of target alignment was too short" %
               filtered[0])
        E.info("%s blocks were output to filtered MAF alignment" %
               filtered[1])
    finally:
        # remove the temporary files even if a step above failed
        for tmpf in (tmpf1, tmpf2):
            if os.path.exists(tmpf):
                os.unlink(tmpf)

    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
def convertPslToChain(infile, outfile):
    '''Convert a gzipped psl file into a gzipped chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver

    The psl alignments are swapped, converted to chains and sorted into
    ``<outfile>.sorted.chain.gz``; these are then netted against the contig
    sizes of the two genomes returned by ``extractGenomes(infile)`` and the
    chain subset covered by the net is written to *outfile*.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    try:
        writeContigSizes(target, tmpfilename1)
        writeContigSizes(query, tmpfilename2)

        # netChainSubset now reads the file that the first step actually
        # writes (.sorted.chain.gz) instead of relying on zcat guessing
        # the missing .gz suffix.
        statement = (
            '''gunzip < %(infile)s '''
            '''| pslSwap stdin stdout '''
            '''| python %(scriptsdir)s/psl2chain.py --log=%(outfile)s.log '''
            '''| chainSort stdin stdout '''
            '''| gzip > %(outfile)s.sorted.chain.gz; checkpoint; '''
            '''gunzip < %(outfile)s.sorted.chain.gz '''
            '''| chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null '''
            '''| netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout '''
            '''| gzip > %(outfile)s''')
        P.run()
    finally:
        # remove the contig size files even if a step above failed
        for fn in (tmpfilename1, tmpfilename2):
            if os.path.exists(fn):
                os.unlink(fn)
def remapWithBowtie(infiles, outfile):
    '''Re-map unaligned reads with bowtie.

    *infiles* is a pair ``(infile, subsequence)``.  Reads whose flag column
    equals 4 are extracted from *infile*, converted to fastq and mapped
    with bowtie against *subsequence*.  Positive alignment positions are
    shifted by the module-level ``START`` before the alignments are
    imported, sorted into *outfile* and indexed.
    '''
    to_cluster = True

    tmpfilename = P.getTempFilename()

    prefix = outfile[:-len(".bam")]

    infile, subsequence = infiles
    start = START

    statement = (
        ''' samtools view %(infile)s | '''
        '''awk '$2 == 4 {printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$10,$11);}' | '''
        '''bowtie --sam -n 3 %(subsequence)s - 2>%(outfile)s.log | '''
        '''awk -v OFS="\\t" '/^@/ {print;next;} {if ($4 > 0) { $4 += %(start)s } print; }' | '''
        '''samtools import %(genome)s - %(tmpfilename)s >& %(outfile)s.log; '''
        '''samtools sort %(tmpfilename)s %(prefix)s; '''
        '''samtools index %(outfile)s; '''
        '''rm -f %(tmpfilename)s ''')

    # Locals first, PARAMS second, so PARAMS wins on a name clash as before.
    # The list() wrappers make the concatenation work on Python 3 as well,
    # where dict.items() returns a view that does not support "+".
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)
def computeOverlapCoding(infile, outfile):
    '''Compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers to the
    ENSEMBL gene id and transcript_id that it overlaps with.  Markers not
    overlapping an ENSEMBL gene id are removed.

    The renamed markers are written to a temporary file by ``gtf2gtf.py``
    (map ``ensembl.diff.genes_ovl``) and then counted per window of
    ``windows.bed`` with ``gff2table.py``.
    '''
    to_cluster = True
    tmpfilename = P.getTempFilename(dir=".")

    try:
        statement = (
            '''python %(scriptsdir)s/gtf2gtf.py '''
            '''--rename=gene '''
            '''--apply=ensembl.diff.genes_ovl '''
            '''< %(infile)s > %(tmpfilename)s ''')
        # list() keeps the merge working on Python 3, where dict.items()
        # returns a view that does not support "+".
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))

        statement = (
            '''python %(scriptsdir)s/gff2table.py '''
            '''--filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) '''
            '''--decorator=counts '''
            '''--filename-data=%(tmpfilename)s '''
            '''--skip-empty '''
            '''--is-gtf '''
            '''--log=%(outfile)s.log '''
            '''< %(genome)s.fasta > %(outfile)s''')
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))
    finally:
        # remove the temporary file even if one of the steps failed
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def exportEnsembl(infile, outfile):
    '''Export a gzipped gtf file with ensembl transcripts.

    ``ensembl2gtf.pl`` dumps the transcripts into a temporary file, which
    is then gzipped into *outfile*.  *infile* is not used.

    ``P.run()`` is called without arguments, so ``statement`` and the
    interpolated names are presumably read from this function's local
    variables - keep those names stable.

    NOTE(review): the database password is passed on the command line and
    is therefore visible in the process list.
    '''
    tmpfile = P.getTempFilename()

    try:
        statement = (
            ''' perl %(scriptsdir)s/ensembl2gtf.pl '''
            '''-dbname %(mysql_database_ensembl)s '''
            '''-host %(mysql_host)s '''
            '''-user %(mysql_user)s '''
            '''-dbpass %(mysql_pass)s '''
            '''-dnadbname %(mysql_database_ensembl)s '''
            '''-dnahost %(mysql_host)s '''
            '''-dnauser %(mysql_user)s '''
            '''-dnapass %(mysql_pass)s '''
            '''-gtffile %(tmpfile)s '''
            '''-schema '%(ensembl_schema)s' '''
            '''-coordsystem %(ensembl_coordsystem)s '''
            '''-genetypes %(ensembl_genetypes)s > %(outfile)s.log''')
        P.run()

        statement = 'gzip < %(tmpfile)s > %(outfile)s'
        P.run()
    finally:
        # remove the temporary dump even if one of the steps failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def runBioProspector(infiles, outfile, dbhandle):
    '''Run BioProspector for motif discovery.

    Sequences for the track (``outfile`` minus ``.bioprospector``) are
    written to a temporary fasta file by ``writeSequencesForIntervals``,
    restricted to the proportion ``PARAMS["bioprospector_proportion"]``
    and masked with "dust".  If no sequences are written, BioProspector is
    skipped and an empty *outfile* is created.

    *infiles* is not used.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]

    try:
        nseq = writeSequencesForIntervals(
            track, tmpfasta, dbhandle,
            full=True, masker="dust",
            proportion=PARAMS["bioprospector_proportion"])

        if nseq == 0:
            E.warn("%s: no sequences - bioprospector skipped" % track)
            P.touch(outfile)
        else:
            statement = ''' BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log '''
            P.run()
    finally:
        # remove the temporary fasta file even if a step above failed
        if os.path.exists(tmpfasta):
            os.unlink(tmpfasta)
def mapReadsWithBowtie(infiles, outfile):
    '''Map reads with bowtie and write gzip-compressed SAM to *outfile*.

    *infiles* is a pair ``(inifile, infile)``; only *infile*, a gzipped
    read file, is used here.  It is decompressed into a temporary file,
    aligned with ``bowtie --sam``, passed through ``bam2bam.py --set-nh``
    and gzipped into *outfile*.  The temporary file is removed by the last
    step of the shell statement.
    '''
    inifile, infile = infiles

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster`, `job_options` and the interpolated
    # names from this function's locals - confirm before renaming any.
    to_cluster = USECLUSTER
    # request as many slots as bowtie is given threads
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = ''' gunzip < %(infile)s > %(tmpfile)s; checkpoint; bowtie -q --sam -C --threads %(bowtie_threads)s %(bowtie_options)s %(bowtie_genome_dir)s/%(genome)s_cs %(tmpfile)s | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log | gzip > %(outfile)s; checkpoint; rm -f %(tmpfile)s '''

    P.run()
def aggregateWindowsReadCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''Aggregate several results from coverageBed into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:
    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    The output has a column ``interval_id`` (``contig:start-end``) followed
    by one count column per input file.  Windows without any counts will
    not be output.

    ``P.run()`` is called without arguments, so ``statement`` and the
    interpolated names are presumably read from this function's local
    variables - keep those names stable.
    '''
    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one "interval<TAB>count" stream per input file
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [re.search(regex, os.path.basename(x)).groups()[0]
                  for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # columns alternate between interval id and count
                    data = line.rstrip("\n").split("\t")
                    genes = set(data[0::2])
                    values = [int(v) for v in data[1::2]]
                    if sum(values) == 0:
                        continue
                    # all inputs must describe the same window on a row
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" %
                               (genes.pop(), "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # remove the pasted table even if a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def aggregateWindowsReadCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''Aggregate several results from coverageBed into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:
    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    The output has a column ``interval_id`` (``contig:start-end``) followed
    by one count column per input file.  Windows without any counts will
    not be output.

    ``P.run()`` is called without arguments, so ``statement`` and the
    interpolated names are presumably read from this function's local
    variables - keep those names stable.
    '''
    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one "interval<TAB>count" stream per input file
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [re.search(regex, os.path.basename(x)).groups()[0]
                  for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # columns alternate between interval id and count
                    data = line.rstrip("\n").split("\t")
                    genes = set(data[0::2])
                    values = [int(v) for v in data[1::2]]
                    if sum(values) == 0:
                        continue
                    # all inputs must describe the same window on a row
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" %
                               (genes.pop(), "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # remove the pasted table even if a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def mapReadsWithBowtie(infiles, outfile):
    """Map reads with bowtie and write gzip-compressed SAM to *outfile*.

    *infiles* is a pair ``(inifile, infile)``; only *infile*, a gzipped
    read file, is used here.  It is decompressed into a temporary file,
    aligned with ``bowtie --sam``, passed through ``bam2bam.py --set-nh``
    and gzipped into *outfile*.  The temporary file is removed by the last
    step of the shell statement.
    """
    inifile, infile = infiles

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster`, `job_options` and the interpolated
    # names from this function's locals - confirm before renaming any.
    to_cluster = USECLUSTER
    # request as many slots as bowtie is given threads
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = """ gunzip < %(infile)s > %(tmpfile)s; checkpoint; bowtie -q --sam -C --threads %(bowtie_threads)s %(bowtie_options)s %(bowtie_genome_dir)s/%(genome)s_cs %(tmpfile)s | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log | gzip > %(outfile)s; checkpoint; rm -f %(tmpfile)s """

    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """Extract aligned sequence for each interval from a pairwise MAF file.

    *infiles* is a pair ``(bed_file, maf_file)``: a file with intervals and
    a gzipped MAF file with pairwise alignments.  The MAF file is
    decompressed into a temporary file and handed, together with the
    intervals, to ``PipelineLncRNA.extractMAFGeneBlocks``, which writes a
    single fasta file (*outfile*) with the aligned sequence per interval.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False

    try:
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(
            bed_file,
            maf_tmp,
            genome_file,
            outfile,
            target_genome,
            query_genome,
            keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # remove the decompressed MAF even if a step above failed
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def runBioProspector(infiles, outfile, dbhandle):
    '''Run BioProspector for motif discovery.

    Sequences for the track (``outfile`` minus ``.bioprospector``) are
    written to a temporary fasta file by ``writeSequencesForIntervals``,
    restricted to the proportion ``PARAMS["bioprospector_proportion"]``
    and masked with "dust".  If no sequences are written, BioProspector is
    skipped and an empty *outfile* is created.

    *infiles* is not used.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]

    try:
        nseq = writeSequencesForIntervals(
            track, tmpfasta, dbhandle,
            full=True, masker="dust",
            proportion=PARAMS["bioprospector_proportion"])

        if nseq == 0:
            E.warn("%s: no sequences - bioprospector skipped" % track)
            P.touch(outfile)
        else:
            statement = ''' BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log '''
            P.run()
    finally:
        # remove the temporary fasta file even if a step above failed
        if os.path.exists(tmpfasta):
            os.unlink(tmpfasta)
def buildGeneModels(infile, outfile):
    '''Build transcript models by running cufflinks on the reads in *infile*.

    The alignments are dumped with ``samtools view``, sorted by the third
    and fourth column and passed to cufflinks inside a temporary directory;
    the resulting ``transcripts.gtf`` is moved to *outfile*.  Insert size
    mean and standard deviation come from ``getInsertSizes`` for
    ``reads/<track>``.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster` and the interpolated names from this
    # function's locals - confirm before renaming any.
    to_cluster = USECLUSTER

    track = os.path.basename(outfile[:-len(".gtf")])
    ins_size, std_dev = getInsertSizes("reads/%s" % track)

    tmpfilename = P.getTempFilename()
    nslots = 4

    # the temporary name is re-used as a directory by `mkdir` in the
    # statement below, so an existing file of that name is removed first
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # absolute paths are needed because the statement changes directory
    infile = os.path.abspath(infile)
    outfile = os.path.abspath(outfile)

    statement = '''mkdir %(tmpfilename)s; samtools view %(infile)s | sort -k3,3 -k4,4n 2> %(outfile)s.log1 > %(tmpfilename)s/temp.sam; cd %(tmpfilename)s; cufflinks --inner-dist-mean %(ins_size)i --inner-dist-stddev %(std_dev)i --label %(track)s --num-threads %(nslots)i --min-isoform-fraction %(cuff_min_isoform)f --pre-mrna-fraction %(cuff_pre_mrna)f %(tmpfilename)s/temp.sam >& %(outfile)s.log2; mv transcripts.gtf %(outfile)s >& %(outfile)s.log3; rm -rf %(tmpfilename)s >& %(outfile)s.log4 '''

    P.run()
def computeOverlapCoding(infile, outfile):
    '''Compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers to the
    ENSEMBL gene id and transcript_id that it overlaps with.  Markers not
    overlapping an ENSEMBL gene id are removed.

    The renamed markers are written to a temporary file by ``gtf2gtf.py``
    (map ``ensembl.diff.genes_ovl``) and then counted per window of
    ``windows.bed`` with ``gff2table.py``.
    '''
    to_cluster = True
    tmpfilename = P.getTempFilename(dir=".")

    try:
        statement = (
            '''python %(scriptsdir)s/gtf2gtf.py '''
            '''--rename=gene '''
            '''--apply=ensembl.diff.genes_ovl '''
            '''< %(infile)s > %(tmpfilename)s ''')
        # list() keeps the merge working on Python 3, where dict.items()
        # returns a view that does not support "+".
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))

        statement = (
            '''python %(scriptsdir)s/gff2table.py '''
            '''--filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) '''
            '''--decorator=counts '''
            '''--filename-data=%(tmpfilename)s '''
            '''--skip-empty '''
            '''--is-gtf '''
            '''--log=%(outfile)s.log '''
            '''< %(genome)s.fasta > %(outfile)s''')
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))
    finally:
        # remove the temporary file even if one of the steps failed
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def loadRepeatInformation(infiles, outfile):
    '''Load the coverage of contigs by repeats into a database table.

    *infiles* is a pair ``(repeatsfile, indexfile)``.  Columns 1 and 4 of
    *indexfile* are turned into one whole-contig BED interval per line;
    ``coverageBed`` then computes the coverage of these intervals by the
    features of the gzipped *repeatsfile*, and the result is loaded with
    ``csv2db.py`` into the table named after *outfile* (minus ``.load``).

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    to_cluster = True
    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

    try:
        statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
        P.run()

        statement = (
            ''' gunzip < %(repeatsfile)s '''
            '''| python %(scriptsdir)s/gff2bed.py -v 0 '''
            '''| coverageBed -a stdin -b %(tmpfilename)s '''
            '''| awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}' '''
            '''|python %(scriptsdir)s/csv2db.py %(csv2db_options)s '''
            '''--table=%(table)s > %(outfile)s ''')
        P.run()
    finally:
        # remove the contig intervals even if one of the steps failed
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def loadRepeatInformation(infiles, outfile):
    '''Load the coverage of contigs by repeats into a database table.

    *infiles* is a pair ``(repeatsfile, indexfile)``.  Columns 1 and 4 of
    *indexfile* are turned into one whole-contig BED interval per line;
    ``coverageBed`` then computes the coverage of these intervals by the
    features of the gzipped *repeatsfile*, and the result is loaded with
    ``csv2db.py`` into the table named after *outfile* (minus ``.load``).

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    to_cluster = True
    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

    try:
        statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
        P.run()

        statement = (
            ''' gunzip < %(repeatsfile)s '''
            '''| python %(scriptsdir)s/gff2bed.py -v 0 '''
            '''| coverageBed -a stdin -b %(tmpfilename)s '''
            '''| awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}' '''
            '''|python %(scriptsdir)s/csv2db.py %(csv2db_options)s '''
            '''--table=%(table)s > %(outfile)s ''')
        P.run()
    finally:
        # remove the contig intervals even if one of the steps failed
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def mapReadsWithBowtie(infiles, outfile):
    '''Map reads with bowtie and write gzip-compressed SAM to *outfile*.

    *infiles* is a pair ``(inifile, infile)``; only *infile*, a gzipped
    read file, is used here.  It is decompressed into a temporary file,
    aligned with ``bowtie --sam``, passed through ``bam2bam.py --set-nh``
    and gzipped into *outfile*.  The temporary file is removed by the last
    step of the shell statement.
    '''
    inifile, infile = infiles

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `job_options`, `job_threads` and the interpolated
    # names from this function's locals - confirm before renaming any.
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = ''' gunzip < %(infile)s > %(tmpfile)s; checkpoint; bowtie -q --sam -C --threads %(bowtie_threads)s %(bowtie_options)s %(bowtie_genome_dir)s/%(genome)s_cs %(tmpfile)s | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log | gzip > %(outfile)s; checkpoint; rm -f %(tmpfile)s '''

    P.run()
def buildGeneModels(infile, outfile):
    '''Build transcript models by running cufflinks on the reads in *infile*.

    The alignments are dumped with ``samtools view``, sorted by the third
    and fourth column and passed to cufflinks inside a temporary directory;
    the resulting ``transcripts.gtf`` is moved to *outfile*.  Insert size
    mean and standard deviation come from ``getInsertSizes`` for
    ``reads/<track>``.
    '''
    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster` and the interpolated names from this
    # function's locals - confirm before renaming any.
    to_cluster = USECLUSTER

    track = os.path.basename( outfile[:-len(".gtf")] )
    ins_size, std_dev = getInsertSizes( "reads/%s" % track )

    tmpfilename = P.getTempFilename()
    nslots = 4

    # the temporary name is re-used as a directory by `mkdir` in the
    # statement below, so an existing file of that name is removed first
    if os.path.exists( tmpfilename ):
        os.unlink( tmpfilename )

    # absolute paths are needed because the statement changes directory
    infile = os.path.abspath( infile )
    outfile = os.path.abspath( outfile )

    statement = '''mkdir %(tmpfilename)s; samtools view %(infile)s | sort -k3,3 -k4,4n 2> %(outfile)s.log1 > %(tmpfilename)s/temp.sam; cd %(tmpfilename)s; cufflinks --inner-dist-mean %(ins_size)i --inner-dist-stddev %(std_dev)i --label %(track)s --num-threads %(nslots)i --min-isoform-fraction %(cuff_min_isoform)f --pre-mrna-fraction %(cuff_pre_mrna)f %(tmpfilename)s/temp.sam >& %(outfile)s.log2; mv transcripts.gtf %(outfile)s >& %(outfile)s.log3; rm -rf %(tmpfilename)s >& %(outfile)s.log4 '''

    P.run()
def prepareBAMs(infile, outfile):
    '''Filter bam files for medip-seq analysis.

    Optional steps include:

    * quality score filtering - remove reads below the quality score
      ``PARAMS["filtering_quality"]`` (applied if present and > 0).
    * deduplication - remove duplicate reads with the method given in
      ``PARAMS["filtering_dedup_method"]`` (``samtools`` or ``picard``;
      applied if ``PARAMS["filtering_dedup"]`` is set).

    Each step writes a new BAM file into a temporary directory; the last
    one is moved to *outfile* and indexed.

    :raises ValueError: if the deduplication method is unknown.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the names interpolated at run time are presumably
    read from this function's local variables - keep those names stable.

    NOTE(review): if no filtering step is configured, *infile* itself is
    moved to *outfile* - confirm that this is intended.
    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()
    # The temporary name is re-used as a directory by `mkdir` below; remove
    # an existing file of that name first, as the other tasks in this file
    # do before creating a directory from getTempFilename().
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)

    current_file = infile
    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    # Templates below are interpolated twice: "%(x)s" is filled in right
    # away from locals(), "%%(x)s" is left for P.run().
    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append(
            '''samtools view -q %%(filtering_quality)i -b %(current_file)s 2>> %%(outfile)s.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            # The steps are joined with ";" and not piped, so rmdup needs
            # explicit input and output files. The previous "rmdup - -"
            # read from stdin and never created next_file.
            statement.append(
                '''samtools rmdup %(current_file)s %(next_file)s 2>> %%(outfile)s.log ''' % locals())
        elif dedup_method == 'picard':
            statement.append(
                '''MarkDuplicates INPUT=%(current_file)s OUTPUT=%(next_file)s ASSUME_SORTED=true METRICS_FILE=%(outfile)s.duplicate_metrics REMOVE_DUPLICATES=TRUE VALIDATION_STRINGENCY=SILENT 2>> %%(outfile)s.log ''' % locals())
        else:
            raise ValueError(
                "unknown deduplication method '%s'" % dedup_method)
        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)
    P.run()

    # The statement already removes the directory; clean up whatever is
    # left without failing on a path that no longer exists.
    if os.path.isdir(tmpdir):
        import shutil
        shutil.rmtree(tmpdir)
    elif os.path.exists(tmpdir):
        os.unlink(tmpdir)
def computeOverlapGO(infile, outfile):
    '''Compute overlap between coding markers and windows.

    Only markers of certain GO categories are counted.

    This is done by setting the gene_id and transcript_id of markers to the
    ENSEMBL gene that it overlaps with.  The marker-to-gene list
    (``ensembl.diff.genes_ovl``) is filtered first to keep only genes
    associated with one of the GO ids listed in
    ``PARAMS["filename_gofilter"]``; the associations are read from
    ``PARAMS["filename_go"]`` (tab-separated, gene id in the second and GO
    id in the third column).
    '''
    to_cluster = False

    with open(PARAMS["filename_gofilter"]) as inf:
        filter_goid = set(IOTools.readList(inf))
    E.info("number of goids: %i" % len(filter_goid))

    filter_genes = set()
    with open(PARAMS["filename_go"]) as inf:
        for l in inf:
            # unpacking five fields also rejects truncated lines
            f, gene_id, goid, desc, evd = l[:-1].split("\t")[:5]
            if goid in filter_goid:
                filter_genes.add(gene_id)

    tmpfile1 = P.getTempFile(dir=".")
    tmpfilename1 = tmpfile1.name
    tmpfilename = None
    try:
        try:
            with open("ensembl.diff.genes_ovl") as inf:
                for line in inf:
                    a, b = line[:-1].split("\t")
                    if b not in filter_genes:
                        continue
                    tmpfile1.write(line)
        finally:
            tmpfile1.close()

        E.info("number of genes taken: %i" % len(filter_genes))

        tmpfilename = P.getTempFilename(dir=".")

        statement = (
            '''python %(scriptsdir)s/gtf2gtf.py '''
            '''--rename=gene '''
            '''--apply=%(tmpfilename1)s '''
            '''< %(infile)s > %(tmpfilename)s ''')
        # list() keeps the merge working on Python 3, where dict.items()
        # returns a view that does not support "+".
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))

        statement = (
            '''python %(scriptsdir)s/gff2table.py '''
            '''--filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) '''
            '''--decorator=counts '''
            '''--filename-data=%(tmpfilename)s '''
            '''--skip-empty '''
            '''--is-gtf '''
            '''--log=%(outfile)s.log '''
            '''< %(genome)s.fasta > %(outfile)s''')
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))
    finally:
        # Remove both temporary files, also on failure. The filtered gene
        # map used to be left behind.
        for fn in (tmpfilename1, tmpfilename):
            if fn and os.path.exists(fn):
                os.unlink(fn)
def prepareBAMs(infile, outfile):
    '''Filter bam files for medip-seq analysis.

    Optional steps include:

    * quality score filtering - remove reads below the quality score
      ``PARAMS["filtering_quality"]`` (applied if present and > 0).
    * deduplication - remove duplicate reads with the method given in
      ``PARAMS["filtering_dedup_method"]`` (``samtools`` or ``picard``;
      applied if ``PARAMS["filtering_dedup"]`` is set).

    Each step writes a new BAM file into a temporary directory; the last
    one is moved to *outfile* and indexed.

    :raises ValueError: if the deduplication method is unknown.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the names interpolated at run time are presumably
    read from this function's local variables - keep those names stable.

    NOTE(review): if no filtering step is configured, *infile* itself is
    moved to *outfile* - confirm that this is intended.
    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()
    # The temporary name is re-used as a directory by `mkdir` below; remove
    # an existing file of that name first, as the other tasks in this file
    # do before creating a directory from getTempFilename().
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)

    current_file = infile
    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    # Templates below are interpolated twice: "%(x)s" is filled in right
    # away from locals(), "%%(x)s" is left for P.run().
    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append(
            '''samtools view -q %%(filtering_quality)i -b %(current_file)s 2>> %%(outfile)s.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            # The steps are joined with ";" and not piped, so rmdup needs
            # explicit input and output files. The previous "rmdup - -"
            # read from stdin and never created next_file.
            statement.append(
                '''samtools rmdup %(current_file)s %(next_file)s 2>> %%(outfile)s.log ''' % locals())
        elif dedup_method == 'picard':
            statement.append(
                '''MarkDuplicates INPUT=%(current_file)s OUTPUT=%(next_file)s ASSUME_SORTED=true METRICS_FILE=%(outfile)s.duplicate_metrics REMOVE_DUPLICATES=TRUE VALIDATION_STRINGENCY=SILENT 2>> %%(outfile)s.log ''' % locals())
        else:
            raise ValueError(
                "unknown deduplication method '%s'" % dedup_method)
        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)
    P.run()

    # The statement already removes the directory; clean up whatever is
    # left without failing on a path that no longer exists.
    if os.path.isdir(tmpdir):
        import shutil
        shutil.rmtree(tmpdir)
    elif os.path.exists(tmpdir):
        os.unlink(tmpdir)
def computeOverlapGO(infile, outfile):
    '''Compute overlap between coding markers and windows.

    Only markers of certain GO categories are counted.

    This is done by setting the gene_id and transcript_id of markers to the
    ENSEMBL gene that it overlaps with.  The marker-to-gene list
    (``ensembl.diff.genes_ovl``) is filtered first to keep only genes
    associated with one of the GO ids listed in
    ``PARAMS["filename_gofilter"]``; the associations are read from
    ``PARAMS["filename_go"]`` (tab-separated, gene id in the second and GO
    id in the third column).
    '''
    to_cluster = False

    with open(PARAMS["filename_gofilter"]) as inf:
        filter_goid = set(IOTools.readList(inf))
    E.info("number of goids: %i" % len(filter_goid))

    filter_genes = set()
    with open(PARAMS["filename_go"]) as inf:
        for l in inf:
            # unpacking five fields also rejects truncated lines
            f, gene_id, goid, desc, evd = l[:-1].split("\t")[:5]
            if goid in filter_goid:
                filter_genes.add(gene_id)

    tmpfile1 = P.getTempFile(dir=".")
    tmpfilename1 = tmpfile1.name
    tmpfilename = None
    try:
        try:
            with open("ensembl.diff.genes_ovl") as inf:
                for line in inf:
                    a, b = line[:-1].split("\t")
                    if b not in filter_genes:
                        continue
                    tmpfile1.write(line)
        finally:
            tmpfile1.close()

        E.info("number of genes taken: %i" % len(filter_genes))

        tmpfilename = P.getTempFilename(dir=".")

        statement = (
            '''python %(scriptsdir)s/gtf2gtf.py '''
            '''--rename=gene '''
            '''--apply=%(tmpfilename1)s '''
            '''< %(infile)s > %(tmpfilename)s ''')
        # list() keeps the merge working on Python 3, where dict.items()
        # returns a view that does not support "+".
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))

        statement = (
            '''python %(scriptsdir)s/gff2table.py '''
            '''--filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) '''
            '''--decorator=counts '''
            '''--filename-data=%(tmpfilename)s '''
            '''--skip-empty '''
            '''--is-gtf '''
            '''--log=%(outfile)s.log '''
            '''< %(genome)s.fasta > %(outfile)s''')
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))
    finally:
        # Remove both temporary files, also on failure. The filtered gene
        # map used to be left behind.
        for fn in (tmpfilename1, tmpfilename):
            if fn and os.path.exists(fn):
                os.unlink(fn)
def aggregateWindowsReadCounts(infiles, outfile):
    '''Aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    The output has a column ``interval_id`` (``contig:start-end``) followed
    by one count column per input file, named after the file name up to
    its first dot.  Tiles with no counts will not be output.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one "interval<TAB>count" stream per input file
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # columns alternate between interval id and count
                    data = line.rstrip("\n").split("\t")
                    genes = set(data[0::2])
                    values = [int(v) for v in data[1::2]]
                    if sum(values) == 0:
                        continue
                    # all inputs must describe the same window on a row
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" %
                               (genes.pop(), "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # remove the pasted table even if a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def mapReads(infiles, outfile):
    '''Map reads using all known junctions and all junctions found before.

    This method requires the explicit genome in bowtiedir together with the
    samtools index. Using a flattened genome file will not work due to the
    limit of a line length of 65536 in samtools.

    ``infiles[0]`` is a pair of gzipped fastq files.  Both are decompressed
    next to a temporary name, which is also used as the tophat output
    directory; ``accepted_hits.bam`` is moved to *outfile* and all
    temporary data are removed by the shell statement.

    Raises ValueError if ``<bowtiedir>/<genome>.fa`` does not exist.
    '''
    if not os.path.exists("%(bowtiedir)s/%(genome)s.fa" % PARAMS):
        raise ValueError(
            "genome %(bowtiedir)s/%(genome)s.fa does not exist - create with bowtie-inspect first" % PARAMS)

    ins_size, std_dev = getInsertSizes(os.path.dirname(outfile[:-len(".bam")]))

    nslots = 4
    fastq1, fastq2 = infiles[0]

    tmpfilename = P.getTempFilename()

    # the temporary name is used as the tophat output directory, so an
    # existing file of that name is removed first
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster`, `job_options` and the interpolated
    # names from this function's locals - confirm before renaming any.
    job_options = "-pe dedicated 4-8 -l mem_free=3G -R y"

    to_cluster = USECLUSTER

    junctions_file = "reads/all.junctions"

    # WARNING: contents of tmpfile can get large (20Gb or more)

    statement = ''' gunzip < %(fastq1)s > %(tmpfilename)s.1.fq; gunzip < %(fastq2)s > %(tmpfilename)s.2.fq; tophat --output-dir %(tmpfilename)s --min-isoform-fraction 0.0 --mate-inner-dist %(ins_size)i --mate-std-dev %(std_dev)i --raw-juncs %(junctions_file)s -p %(nslots)i %(bowtiedir)s/%(genome)s %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq >& %(outfile)s.log; mv %(tmpfilename)s/accepted_hits.bam %(outfile)s 2>> %(outfile)s.log; rm -rf %(tmpfilename)s 2>> %(outfile)s.log; rm -f %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq 2>> %(outfile)s.log '''

    P.run()
def indexForSailfish(infile, outfile):
    '''Create a sailfish index.

    The gzipped *infile* is decompressed into a temporary file and indexed
    with ``sailfish index`` into the directory obtained by removing
    ``/transcriptome.sfi`` from *outfile*.  The k-mer size is taken from
    ``PARAMS["sailfish_kmer_size"]``.  The temporary file is removed by the
    last step of the shell statement.
    '''
    outdir = P.snip(outfile, "/transcriptome.sfi")
    kmer = int(PARAMS["sailfish_kmer_size"])
    tmp = P.getTempFilename()

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement` and the interpolated names from this function's
    # locals - confirm before renaming any.
    statement = '''gunzip -c %(infile)s > %(tmp)s; module load bio/sailfish; sailfish index -t %(tmp)s -k %(kmer)i -o %(outdir)s; rm -f %(tmp)s'''

    P.run()
def mapReadsWithTophat(infiles, outfile):
    '''Map reads with tophat.

    *infiles* is a pair ``(inifile, infile)``.  Parameters loaded from
    *inifile* are passed to ``P.run`` and so overwrite the default ones.
    The gzipped reads in *infile* are converted by ``fastq2solid.py`` into
    ``<tmpfile>.csfasta`` and ``<tmpfile>.qual``, mapped with tophat, and
    the resulting ``accepted_hits.bam`` is moved to *outfile* and indexed.

    ``statement``, ``to_cluster``, ``job_options`` and the interpolated
    names are presumably read by ``P.run`` from this function's local
    variables - keep those names stable.
    '''
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    to_cluster = USECLUSTER
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "tophat_threads"]

    tmpfile = P.getTempFilename(".")

    statement = (
        ''' zcat %(infile)s '''
        '''| python %(scriptsdir)s/fastq2solid.py '''
        '''--change-format=integer '''
        '''--pattern="%(tmpfile)s.%%s" >& %(outfile)s.log; checkpoint; '''
        '''tophat --output-dir %(outfile)s.dir '''
        '''--num-threads %(tophat_threads)s '''
        '''--library-type %(tophat_library_type)s '''
        '''--color --quals --integer-quals '''
        '''%(tophat_options)s '''
        '''%(tophat_genome_dir)s/%(genome)s_cs '''
        '''%(tmpfile)s.csfasta %(tmpfile)s.qual >> %(outfile)s.log; checkpoint; '''
        '''mv %(outfile)s.dir/accepted_hits.bam %(outfile)s; checkpoint; '''
        '''samtools index %(outfile)s; checkpoint; '''
        '''rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual ''')

    try:
        # use local parameters to overwrite default ones.
        P.run(**local_params)
    finally:
        # Remove the temporary file and, if the statement stopped before
        # its final `rm`, the converted read files as well.
        for fn in (tmpfile, tmpfile + ".csfasta", tmpfile + ".qual"):
            if os.path.exists(fn):
                os.unlink(fn)
def makeCodingPotential(infile, outfile):
    '''Run CPC to predict coding potential.

    ``cpc.sh`` is run on the sequences in *infile* and on their reverse
    complement (written to a temporary file by ``fasta2fasta.py``).  The
    forward and reverse tables and ORF evidence are combined into
    *outfile*, below a header line written here.  Finally the output of
    the homology searches is gzipped.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    statement = (
        ''' cpc.sh %(infile)s '''
        '''%(outfile)s.forward.table '''
        '''%(outfile)s.tmp.dir '''
        '''%(outfile)s.forward.evidence '''
        '''%(codingpotential_database)s > %(outfile)s.log''')
    P.run()

    tmpfilename = P.getTempFilename(".")
    try:
        statement = '''python %(toolsdir)s/fasta2fasta.py --method=reverse-complement -v 0 < %(infile)s > %(tmpfilename)s'''
        P.run()

        statement = (
            ''' cpc.sh %(tmpfilename)s '''
            '''%(outfile)s.reverse.table '''
            '''%(outfile)s.tmp.dir '''
            '''%(outfile)s.reverse.evidence '''
            '''%(codingpotential_database)s >> %(outfile)s.log''')
        P.run()

        # write the header; the combined table is appended below
        with open(outfile, "w") as outf:
            outf.write(
                "gene_id\tlength\tf_iscoding\tf_value\tf_orfstart\tf_orfend\tf_orfval1\tf_orfval2\tf_orf\tr_iscoding\tr_value\tr_orfstart\tr_orfend\tr_orfval1\tr_orfval2\tr_orf\n")

        to_cluster = True

        statement = (
            ''' python %(toolsdir)s/combine_tables.py -v 0 '''
            '''%(outfile)s.forward.table '''
            '''%(outfile)s.forward.evidence.orf '''
            '''%(outfile)s.reverse.table '''
            '''%(outfile)s.reverse.evidence.orf | '''
            '''cut -f 1,2,3,4,6,7,8,9,10,12,13,15- >> %(outfile)s ''')
        P.run()

        # save space by compressing the result of the homology searches
        E.info("compressing CPC output")
        statement = '''rm -f %(outfile)s.*h**o.gz; gzip %(outfile)s.*h**o'''
        P.run()
    finally:
        # remove the reverse-complemented sequences even on failure
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
def aggregateTiledReadCounts(infiles, outfile):
    '''Aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This implementation reads the count from the fourth column of every
    input file.  The output has a column ``interval_id``
    (``contig:start-end``) followed by one count column per input file,
    named after the file name up to its first dot.

    Tiles with no counts will not be output.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster`` and the interpolated names are presumably read from this
    function's local variables - keep those names stable.
    '''
    to_cluster = True

    # one "interval<TAB>count" stream per input file
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) ''' % x
        for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # columns alternate between interval id and count
                    data = line.rstrip("\n").split("\t")
                    genes = set(data[0::2])
                    values = [int(v) for v in data[1::2]]
                    if sum(values) == 0:
                        continue
                    # all inputs must describe the same tile on a row
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" %
                               (genes.pop(), "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # remove the pasted table even if a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def mapReads(infiles, outfile):
    '''Map reads using all known junctions and all junctions found before.

    This method requires the explicit genome in bowtiedir together with the
    samtools index. Using a flattened genome file will not work due to the
    limit of a line length of 65536 in samtools.

    ``infiles[0]`` is a pair of gzipped fastq files.  Both are decompressed
    next to a temporary name, which is also used as the tophat output
    directory; ``accepted_hits.bam`` is moved to *outfile* and all
    temporary data are removed by the shell statement.

    Raises ValueError if ``<bowtiedir>/<genome>.fa`` does not exist.
    '''
    if not os.path.exists( "%(bowtiedir)s/%(genome)s.fa" % PARAMS ):
        raise ValueError(
            "genome %(bowtiedir)s/%(genome)s.fa does not exist - create with bowtie-inspect first" % PARAMS)

    ins_size, std_dev = getInsertSizes( os.path.dirname( outfile[:-len(".bam") ] ) )

    nslots = 4
    fastq1, fastq2 = infiles[0]

    tmpfilename = P.getTempFilename()

    # the temporary name is used as the tophat output directory, so an
    # existing file of that name is removed first
    if os.path.exists( tmpfilename ):
        os.unlink( tmpfilename )

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `to_cluster`, `job_options` and the interpolated
    # names from this function's locals - confirm before renaming any.
    job_options= "-pe dedicated 4-8 -l mem_free=3G -R y"

    to_cluster = USECLUSTER

    junctions_file = "reads/all.junctions"

    # WARNING: contents of tmpfile can get large (20Gb or more)

    statement = ''' gunzip < %(fastq1)s > %(tmpfilename)s.1.fq; gunzip < %(fastq2)s > %(tmpfilename)s.2.fq; tophat --output-dir %(tmpfilename)s --min-isoform-fraction 0.0 --mate-inner-dist %(ins_size)i --mate-std-dev %(std_dev)i --raw-juncs %(junctions_file)s -p %(nslots)i %(bowtiedir)s/%(genome)s %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq >& %(outfile)s.log; mv %(tmpfilename)s/accepted_hits.bam %(outfile)s 2>> %(outfile)s.log; rm -rf %(tmpfilename)s 2>> %(outfile)s.log; rm -f %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq 2>> %(outfile)s.log '''

    P.run()
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''Map reads from short read archive sequence using bowtie against
    transcriptome data.

    *infiles* is a triple ``(infile, reffile, contigs)``: gzipped reads,
    the transcriptome fasta file (its name minus ``.fa`` plus ``_cs`` is
    used as the bowtie index) and the contig file passed to
    ``samtools import``.  The result is a sorted and indexed BAM file,
    *outfile*.

    ``P.run()`` is called without arguments, so ``statement``,
    ``to_cluster``, ``job_options`` and the interpolated names are
    presumably read from this function's local variables - keep those
    names stable.
    '''
    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    to_cluster = USECLUSTER
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # Fixes relative to the previous statement:
    # * a ";" was missing after "samtools index", so "checkpoint" was
    #   passed to samtools as an extra argument;
    # * the perl replacement text started with "\b", which is a backspace
    #   character in a substitution replacement;
    # * "\S" is now written as "\\S" (same string value, valid escape).
    statement = (
        ''' gunzip < %(infile)s > %(tmpfile)s; checkpoint; '''
        '''bowtie -q --sam -C --un /dev/null '''
        '''--threads %(bowtie_threads)s '''
        '''%(transcriptome_options)s '''
        '''--best --strata -a '''
        '''%(prefix)s_cs '''
        '''%(tmpfile)s '''
        '''| python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log '''
        '''| perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/SO:coordinate/}" '''
        '''| samtools import %(contigs)s - - '''
        '''| samtools sort - %(track)s; checkpoint; '''
        '''samtools index %(outfile)s; checkpoint; '''
        '''rm -f %(tmpfile)s ''')

    P.run()
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.

    ``infiles`` is a pair ``(gtffile, bamfile)``. cufflinks is run inside
    a scratch directory created below the current directory. Its filtered
    output goes to ``outfile``; the transcript models and the isoform and
    gene tracking tables are gzipped to ``outfile + ".gtf.gz"``,
    ``outfile + ".fpkm_tracking.gz"`` and ``outfile + ".genes_tracking.gz"``.
    The scratch directory is removed afterwards.
    '''
    gtffile, bamfile = infiles

    job_threads = PARAMS["cufflinks_threads"]

    # label for cufflinks: gtf file name without directory and suffix
    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    # the temporary name is re-used as a directory by the statement below,
    # hence a file of that name must not exist.
    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # the statement changes directory, so all paths are made absolute
    gtffile, bamfile, outfile = map(
        os.path.abspath, (gtffile, bamfile, outfile))

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.
    # NOTE(review): no --max-bundle-length is set in this statement;
    # presumably it arrives through cufflinks_options - confirm.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = (
        "mkdir %(tmpfilename)s;"
        " cd %(tmpfilename)s;"
        " cufflinks --label %(track)s"
        " --GTF <(gunzip < %(gtffile)s)"
        " --num-threads %(cufflinks_threads)i"
        " --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa"
        " --library-type %(cufflinks_library_type)s"
        " %(cufflinks_options)s"
        " %(bamfile)s"
        " | grep -v 'BAM record error'"
        " >& %(outfile)s;"
        ' perl -p -e "s/\\0/./g" < transcripts.gtf'
        ' | gzip > %(outfile)s.gtf.gz;'
        " gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;"
        " gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz; ")

    P.run()

    shutil.rmtree(tmpfilename)
def indexForSailfish(infile, outfile):
    '''create a sailfish index.

    ``infile`` is the transcript file handed to ``sailfish index -t``.
    If its name ends in ``.gz`` it is first unpacked into a temporary
    file. ``outfile`` must end in ``/transcriptome.sfi``; the part before
    that suffix is used as the sailfish output directory. The k-mer size
    is read from ``PARAMS["sailfish_kmer_size"]``.
    '''

    outdir = P.snip(outfile, "/transcriptome.sfi")
    kmer = int(PARAMS["sailfish_kmer_size"])
    tmp = P.getTempFilename()

    # 'zipped' used to be referenced without ever being assigned, which
    # raised a NameError. Derive it from the file name instead.
    zipped = infile.endswith(".gz")

    if zipped:
        statement = '''gunzip -c %(infile)s > %(tmp)s; checkpoint; sailfish index -t %(tmp)s'''
    else:
        statement = '''sailfish index -t %(infile)s'''

    # the leading blank is required: without it "-k" was glued to the
    # file name given to -t.
    statement += ''' -k %(kmer)i -o %(outdir)s; checkpoint; rm -f %(tmp)s'''

    P.run()
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.

    Takes a ``(gtffile, bamfile)`` pair and runs cufflinks on it from
    within a scratch directory. The (filtered) cufflinks output is stored
    in ``outfile``; ``transcripts.gtf``, ``isoforms.fpkm_tracking`` and
    ``genes.fpkm_tracking`` are gzipped next to it with the suffixes
    ``.gtf.gz``, ``.fpkm_tracking.gz`` and ``.genes_tracking.gz``.
    '''
    gtffile, bamfile = infiles

    job_options = "-pe dedicated %i -R y" % PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    # only the name is needed: the statement creates a directory under
    # this name, so an existing file is deleted first.
    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # absolute paths, because the statement changes into the scratch
    # directory before calling cufflinks
    gtffile, bamfile, outfile = map(
        os.path.abspath, (gtffile, bamfile, outfile))

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    cufflinks_call = (
        " cufflinks --label %(track)s"
        " --GTF <(gunzip < %(gtffile)s)"
        " --num-threads %(cufflinks_threads)i"
        " --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa"
        " --library-type %(cufflinks_library_type)s"
        " %(cufflinks_options)s"
        " %(bamfile)s"
        " | grep -v 'BAM record error'"
        " >& %(outfile)s;")

    collect_results = (
        ' perl -p -e "s/\\0/./g" < transcripts.gtf'
        ' | gzip > %(outfile)s.gtf.gz;'
        " gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;"
        " gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz; ")

    statement = ("mkdir %(tmpfilename)s;"
                 " cd %(tmpfilename)s;" +
                 cufflinks_call +
                 collect_results)

    P.run()

    shutil.rmtree(tmpfilename)
def findJunctions(infiles, outfile):
    '''map reads using all known junctions in order to identify new
    possible junctions - cat the junctions together and delete the tophat
    output directories.

    The gzipped fastq pair in ``infiles[0]`` is unpacked and run through
    tophat. ``junctions.bed`` from the tophat output directory is moved
    to ``outfile``, the tophat ``logs`` directory to
    ``outfile + ".logs"``, and the temporary files are deleted.
    '''
    # insert size statistics are looked up for the directory of the
    # output name with ".junctions" cut off.
    suffix_length = len(".junctions")
    ins_size, std_dev = getInsertSizes(
        os.path.dirname(outfile[:-suffix_length]))

    fastq1, fastq2 = infiles[0]

    # NOTE(review): max_intron and junctions_file are used by the
    # statement but not set here - presumably they come from PARAMS.
    nslots = 4
    job_options = "-pe dedicated 4-8 -l mem_free=3G -R y"
    to_cluster = USECLUSTER

    # tophat writes into a directory of this name; get rid of any file
    # that already occupies it.
    tmpfilename = P.getTempFilename()
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # tophat does a seek operation on the fq files, hence they
    # need to unpacked into real files
    statement = (
        " gunzip < %(fastq1)s > %(tmpfilename)s.1.fq;"
        " gunzip < %(fastq2)s > %(tmpfilename)s.2.fq;"
        " tophat --output-dir %(tmpfilename)s"
        " --butterfly-search"
        " --min-anchor-length 5"
        " --closure-search"
        " --microexon-search"
        " --min-isoform-fraction 0.0"
        " --mate-inner-dist %(ins_size)i"
        " --mate-std-dev %(std_dev)i"
        " --max-intron-length %(max_intron)i"
        " --raw-juncs %(junctions_file)s"
        " -p %(nslots)i"
        " %(bowtiedir)s/%(genome)s"
        " %(tmpfilename)s.1.fq"
        " %(tmpfilename)s.2.fq"
        " >& %(outfile)s.log;"
        " mv %(tmpfilename)s/junctions.bed %(outfile)s"
        " >& %(outfile)s.log2;"
        " mv %(tmpfilename)s/logs %(outfile)s.logs"
        " >& %(outfile)s.log3;"
        " rm -rf %(tmpfilename)s %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq"
        " >& %(outfile)s.log4 ")

    P.run()
def findJunctions(infiles, outfile):
    '''map reads using all known junctions in order to identify new
    possible junctions - cat the junctions together and delete the tophat
    output directories.

    ``infiles[0]`` holds the two gzipped fastq files of a read pair.
    After tophat has run, its ``junctions.bed`` becomes ``outfile`` and
    its ``logs`` directory becomes ``outfile + ".logs"``. Output of the
    individual shell steps is captured in ``outfile + ".log"`` up to
    ``outfile + ".log4"``.
    '''
    stem = outfile[:-len(".junctions")]
    ins_size, std_dev = getInsertSizes(os.path.dirname(stem))

    fastq1, fastq2 = infiles[0]

    # settings referenced by the statement template / the job submission.
    # NOTE(review): max_intron and junctions_file are not defined in this
    # function; presumably they are provided through PARAMS - confirm.
    nslots = 4
    job_options = "-pe dedicated 4-8 -l mem_free=3G -R y"
    to_cluster = USECLUSTER

    # the temporary name doubles as tophat output directory, so it must
    # not exist as a plain file.
    tmpfilename = P.getTempFilename()
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # tophat does a seek operation on the fq files, hence they
    # need to unpacked into real files
    unpack = (
        " gunzip < %(fastq1)s > %(tmpfilename)s.1.fq;"
        " gunzip < %(fastq2)s > %(tmpfilename)s.2.fq;")

    align = (
        " tophat --output-dir %(tmpfilename)s"
        " --butterfly-search"
        " --min-anchor-length 5"
        " --closure-search"
        " --microexon-search"
        " --min-isoform-fraction 0.0"
        " --mate-inner-dist %(ins_size)i"
        " --mate-std-dev %(std_dev)i"
        " --max-intron-length %(max_intron)i"
        " --raw-juncs %(junctions_file)s"
        " -p %(nslots)i"
        " %(bowtiedir)s/%(genome)s"
        " %(tmpfilename)s.1.fq"
        " %(tmpfilename)s.2.fq"
        " >& %(outfile)s.log;")

    collect_and_clean = (
        " mv %(tmpfilename)s/junctions.bed %(outfile)s"
        " >& %(outfile)s.log2;"
        " mv %(tmpfilename)s/logs %(outfile)s.logs"
        " >& %(outfile)s.log3;"
        " rm -rf %(tmpfilename)s %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq"
        " >& %(outfile)s.log4 ")

    statement = unpack + align + collect_and_clean

    P.run()
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets

    The counts are written to ``outfile`` as a tab-separated table with
    one header line and one data line (no trailing newline).
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    # column name and the counter class that fills it, in output order
    columns = [
        ("no_transcripts", PipelineLncRNA.CounterTranscripts),
        ("no_genes", PipelineLncRNA.CounterGenes),
        ("no_exons_per_transcript", PipelineLncRNA.CounterExonsPerTranscript),
        ("no_exons_per_gene", PipelineLncRNA.CounterExonsPerGene),
        ("no_single_exon_transcripts",
         PipelineLncRNA.CounterSingleExonTranscripts),
        ("no_multi_exon_transcripts",
         PipelineLncRNA.CounterMultiExonTranscripts),
        ("no_single_exon_genes", PipelineLncRNA.CounterSingleExonGenes),
        ("no_multi_exon_genes", PipelineLncRNA.CounterMultiExonGenes),
    ]

    # the output handle used to be left open; the with-block guarantees
    # it is flushed and closed, also if one of the counters fails.
    with open(outfile, "w") as outf:
        outf.write("\t".join([name for name, _ in columns]) + "\n")
        values = [str(counter(tmpf).count()) for _, counter in columns]
        outf.write("\t".join(values))

    # remove the flagged gene set, the matching ".log" file (presumably
    # written by flagExonStatus) and the file named by the bare temporary
    # name.
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
def buildCoverageOverContigs(infiles, outfile):
    '''
    build histograms of the coverage over each of the contigs

    ``infiles[0]`` is the bam file. A position-filtered copy of it is
    created in a temporary directory and passed to genomeCoverageBed; the
    gzipped per-base output is written to ``outfile``. If
    ``PARAMS["coverage_scale"]`` is set, coverage is scaled by
    ``1 / (reads_mapped / 1000000)``, with ``reads_mapped`` read from the
    alignment stats table of the bam file.

    Raises ValueError if scaling is requested but the stats table has no
    'reads_mapped' row.
    '''

    bam = infiles[0]

    # genomecoveragebed does not like some of the
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"

    P.submit("CGATPipelines.PipelineMetagenomeAssembly",
             "filterBamOnPos",
             infiles=bam,
             outfiles=tempname)

    # tablename where alignment stats live
    tablename = os.path.dirname(
        bam)[:-len(".dir")] + "_" + P.snip(
            os.path.basename(bam), ".bam") + "_alignment_stats"

    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")

    # connect to database
    dbh = connect()
    cc = dbh.cursor()

    # get number of reads aligned from bam2stats
    if PARAMS.get("coverage_scale"):
        row = cc.execute("""SELECT counts FROM %s WHERE category == 'reads_mapped'""" %
                         tablename).fetchone()
        if row is None:
            raise ValueError(
                "no 'reads_mapped' entry found in table %s" % tablename)
        scale_factor = 1 / (float(row[0]) / 1000000)
        # format the number right here: the option used to be the
        # template "-scale %(scale_factor)f", which only works if the
        # statement is interpolated a second time after
        # %(scale_options)s has been filled in.
        scale_options = "-scale %f" % scale_factor
    else:
        scale_options = ""

    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d | gzip > %(outfile)s; rm -rf %(tempdir)s'''
    P.run()
def buildBigBed(infile, outfile):
    '''bed file with intervals that are covered by reads in any of the
    experiments.

    The gzipped bed file ``infile`` is unpacked into a temporary file and
    converted to ``outfile`` with bedToBigBed. Contig sizes are taken
    from the file ``PARAMS_ANNOTATIONS["interface_contigs"]`` inside
    ``PARAMS["annotations_dir"]``.
    '''
    # run locally
    to_cluster = False

    contig_sizes = os.path.join(PARAMS["annotations_dir"],
                                PARAMS_ANNOTATIONS["interface_contigs"])

    tmpfile = P.getTempFilename()

    statement = (
        " zcat %(infile)s > %(tmpfile)s;"
        " bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;"
        " rm -f %(tmpfile)s ")

    P.run()

    # the statement already deletes the temporary file; this is a
    # best-effort second attempt, so a missing file is not an error.
    try:
        os.unlink(tmpfile)
    except OSError:
        pass
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.

    All ``infiles`` are passed to ``annotator.py --method=fdr-table``.
    Each line of the resulting table is prefixed with the ``subset``,
    ``workspace`` and ``slice`` values (the header line gets the matching
    column names and its ``id`` column is renamed to ``track``) and the
    table is loaded into ``table`` with ``csv2db.py``, whose output is
    written to ``outfile``.
    '''

    infile = " ".join(infiles)
    # suffix of the first file; used to strip the suffix from track ids
    suffix = os.path.splitext(infiles[0])[1]

    tmpfilename = P.getTempFilename()

    statement = (
        ' python %(scriptsdir)s/annotator.py'
        ' --method=fdr-table'
        ' --fdr-method=%(fdr_method)s'
        ' --log=%(outfile)s.log'
        ' --regex-id="(.*)%(suffix)s"'
        ' %(infile)s > %(tmpfilename)s ')

    P.run()

    tmpfile = P.getTempFile()

    # both handles are closed deterministically now; the input handle
    # used to be left to the garbage collector.
    try:
        with open(tmpfilename, "r") as inf:
            for line in inf:
                if line.startswith("id"):
                    line = "subset\tworkspace\tslice\t" + \
                        re.sub("^id", "track", line)
                else:
                    line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
                tmpfile.write(line)
    finally:
        tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = (
        ' python %(scriptsdir)s/csv2db.py %(csv2db_options)s'
        ' --table=%(table)s < %(tmpfilename2)s > %(outfile)s')

    P.run()

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets

    ``outfile`` receives a tab-separated header line followed by a single
    tab-separated line of counts (written without a trailing newline).
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    header = [
        "no_transcripts",
        "no_genes",
        "no_exons_per_transcript",
        "no_exons_per_gene",
        "no_single_exon_transcripts",
        "no_multi_exon_transcripts",
        "no_single_exon_genes",
        "no_multi_exon_genes",
    ]

    # counter classes in the same order as the header columns
    counters = [
        PipelineLncRNA.CounterTranscripts,
        PipelineLncRNA.CounterGenes,
        PipelineLncRNA.CounterExonsPerTranscript,
        PipelineLncRNA.CounterExonsPerGene,
        PipelineLncRNA.CounterSingleExonTranscripts,
        PipelineLncRNA.CounterMultiExonTranscripts,
        PipelineLncRNA.CounterSingleExonGenes,
        PipelineLncRNA.CounterMultiExonGenes,
    ]

    # previously the file object was never closed; use a context manager
    # so that the table is flushed to disk before the task returns.
    with open(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")
        outf.write("\t".join(
            [str(counter(tmpf).count()) for counter in counters]))

    # clean up: flagged gene set, the ".log" file next to it (presumably
    # created by flagExonStatus) and the bare temporary file.
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
def mapReadsWithBowtieAgainstJunctions(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    splice junctions.

    The reads are converted to genomic coordinates.

    ``infiles`` is a triple ``(infile, reffile, contigs)``: the gzipped
    reads, the junction fasta file (its name without ``.fa`` plus ``_cs``
    is given to bowtie as the index) and the contig sizes passed to
    ``rnaseq_junction_bam2bam.py``. The sorted alignments are written to
    ``outfile`` (which must end in ``.bam``) and indexed.
    '''

    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    to_cluster = USECLUSTER

    # "samtools index" is now terminated with ";" - without it the word
    # "checkpoint" was passed to samtools as an additional argument.
    statement = (
        " gunzip < %(infile)s > %(tmpfile)s;"
        " checkpoint;"
        " bowtie -q --sam"
        " -C"
        " --un /dev/null"
        " --threads %(bowtie_threads)s"
        " %(transcriptome_options)s"
        " --best --strata -a"
        " %(prefix)s_cs"
        " %(tmpfile)s"
        " | python %(scriptsdir)s/bam2bam.py --set-nh"
        " --log=%(outfile)s.log"
        " | python %(scriptsdir)s/rnaseq_junction_bam2bam.py"
        " --contig-sizes=%(contigs)s --log=%(outfile)s.log"
        " | samtools sort - %(track)s;"
        " checkpoint;"
        " samtools index %(outfile)s;"
        " checkpoint;"
        " rm -f %(tmpfile)s ")

    P.run()

    # The statement has already removed the temporary file if it ran on
    # this file system, in which case a plain unlink raises OSError.
    # Treat the second removal as best effort.
    try:
        os.unlink(tmpfile)
    except OSError:
        pass
def extractControllLncRNAFastaAlignments(infiles, outfile):
    '''extract alignment blocks for gene models from a MAF file.

    ``infiles`` is a pair ``(bed_file, maf_file)``. The gzipped MAF file
    is unpacked into a temporary file below ``/ifs/scratch`` and handed,
    together with the bed file and the genome, to
    ``PipelineLncRNA.extractMAFGeneBlocks`` (called with
    ``keep_gaps=False``), which receives ``outfile`` as its output
    argument. The value it returns is logged as the number of gene models
    extracted.
    '''
    bed_file, maf_file = infiles

    # unpack the alignment locally
    to_cluster = False
    maf_tmp = P.getTempFilename("/ifs/scratch")
    statement = "gunzip -c %(maf_file)s > %(maf_tmp)s"
    P.run()

    target = PARAMS["genome"]
    query = PARAMS["phyloCSF_query_genome"]
    genome_path = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    n_extracted = PipelineLncRNA.extractMAFGeneBlocks(
        bed_file,
        maf_tmp,
        genome_path,
        outfile,
        target,
        query,
        keep_gaps=False)

    E.info("%i gene_models extracted" % n_extracted)

    os.unlink(maf_tmp)
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.

    All ``infiles`` are passed to ``annotator2tsv.py --method=fdr-table``.
    Each line of the resulting table is prefixed with the ``subset``,
    ``workspace`` and ``slice`` values (the header line gets the matching
    column names and its ``id`` column is renamed to ``track``) and the
    table is loaded into ``table`` with ``csv2db.py``, whose output is
    written to ``outfile``.
    '''

    infile = " ".join(infiles)
    # suffix of the first file; used to strip the suffix from track ids
    suffix = os.path.splitext(infiles[0])[1]

    tmpfilename = P.getTempFilename()

    statement = (
        ' python %(scriptsdir)s/annotator2tsv.py'
        ' --method=fdr-table'
        ' --fdr-method=%(fdr_method)s'
        ' --log=%(outfile)s.log'
        ' --regex-id="(.*)%(suffix)s"'
        ' %(infile)s > %(tmpfilename)s ')

    # locals first, PARAMS second: on duplicate keys PARAMS wins, as
    # before. The item views are turned into lists so that the
    # concatenation also works where items() does not return a list.
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    tmpfile = P.getTempFile()

    # both handles are closed deterministically now; the input handle
    # used to be left to the garbage collector.
    try:
        with open(tmpfilename, "r") as inf:
            for line in inf:
                if line.startswith("id"):
                    line = "subset\tworkspace\tslice\t" + \
                        re.sub("^id", "track", line)
                else:
                    line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
                tmpfile.write(line)
    finally:
        tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = (
        ' python %(scriptsdir)s/csv2db.py %(csv2db_options)s'
        ' --table=%(table)s < %(tmpfilename2)s > %(outfile)s')

    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.

    ``infiles`` is a pair ``(gtffile, bamfile)``. cufflinks is started in
    a scratch directory and its output is captured in ``outfile``. The
    transcript models (with \\0 bytes replaced) and the isoform and gene
    tracking tables are gzipped to ``outfile + ".gtf.gz"``,
    ``outfile + ".fpkm_tracking.gz"`` and ``outfile + ".genes_tracking.gz"``.
    '''
    gtffile, bamfile = infiles

    to_cluster = True
    job_options = "-pe dedicated %i -R y" % PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    # the statement creates a directory with the temporary name, so a
    # file of that name has to go first.
    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # paths must survive the "cd" in the statement
    gtffile, bamfile, outfile = map(
        os.path.abspath, (gtffile, bamfile, outfile))

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb intron.
    statement = (
        "mkdir %(tmpfilename)s;"
        " cd %(tmpfilename)s;"
        " cufflinks --label %(track)s"
        " --GTF <(gunzip < %(gtffile)s)"
        " --num-threads %(cufflinks_threads)i"
        " --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa"
        " --library-type %(cufflinks_library_type)s"
        " %(cufflinks_options)s"
        " %(bamfile)s"
        " >& %(outfile)s;"
        ' perl -p -e "s/\\0/./g" < transcripts.gtf'
        ' | gzip > %(outfile)s.gtf.gz;'
        " gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;"
        " gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz; ")

    P.run()

    shutil.rmtree(tmpfilename)
def buildBigBed(infile, outfile):
    '''bed file with intervals that are covered by reads in any of the
    experiments.

    Unpacks the gzipped bed file ``infile`` into a temporary file and
    runs bedToBigBed on it to create ``outfile``. The contig sizes file
    is ``PARAMS_ANNOTATIONS["interface_contigs"]`` located in
    ``PARAMS["annotations_dir"]``.
    '''
    tmpfile = P.getTempFilename()

    annotations_dir = PARAMS["annotations_dir"]
    contig_sizes = os.path.join(annotations_dir,
                                PARAMS_ANNOTATIONS["interface_contigs"])

    # the conversion is executed locally
    to_cluster = False

    unpack = " zcat %(infile)s > %(tmpfile)s;"
    convert = " bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;"
    cleanup = " rm -f %(tmpfile)s "
    statement = unpack + convert + cleanup

    P.run()

    # normally the statement has removed the file already - ignore the
    # error in that case.
    try:
        os.unlink(tmpfile)
    except OSError:
        pass
def difference_to2(infile, outfile):
    '''compare several bed-files.

    ``infile`` must look like ``version<N>_<track>.bed``. For every entry
    of ``VERSIONS`` a histogram of column 5 is built: for ``version2``
    over the bed file itself, for all other versions over those intervals
    of ``version2_<track>.bed`` that do not overlap the bed file of that
    version. The histograms are combined into ``outfile``.

    Raises ValueError if ``infile`` does not match the expected pattern.
    '''
    import os

    # raw string: same pattern as before, without relying on "\d" being
    # passed through as an unknown string escape.
    matched = re.match(r"version\d+_(.*).bed", infile)
    if matched is None:
        raise ValueError(
            "can not derive track from file name '%s'" % infile)
    track = matched.groups()[0]

    tmpfile = P.getTempFilename()
    histogram_files = []

    for version in VERSIONS:
        t = tmpfile + "%s" % version
        histogram_files.append(t)

        if version == "version2":
            statement = (
                'cut -f 5 < %(version)s_%(track)s.bed |'
                ' python %(toolsdir)s/data2histogram.py'
                ' --headers=%(version)s --bin-size=1 --min-value=1'
                ' > %(t)s ')
        else:
            statement = (
                ' intersectBed -v -a version2_%(track)s.bed'
                ' -b %(version)s_%(track)s.bed | cut -f 5 |'
                ' python %(toolsdir)s/data2histogram.py'
                ' --headers=%(version)s --bin-size=1 --min-value=1'
                ' > %(t)s ')

        # locals first, PARAMS second: PARAMS wins on duplicate keys, as
        # before. list() keeps the concatenation working where items()
        # returns a view instead of a list.
        P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    statement = (
        ' python %(toolsdir)s/combine_tables.py'
        ' --sort-keys=numeric %(tmpfile)s* > %(outfile)s ')
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    # the temporary histograms used to be left behind; remove them (and
    # the file created for the bare temporary name) on a best-effort basis.
    for filename in [tmpfile] + histogram_files:
        try:
            os.unlink(filename)
        except OSError:
            pass