def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''
    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    statement = '''gunzip < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)

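# The %(name)s placeholders in the statements throughout this module are not
# filled by the functions themselves. A minimal sketch of the interpolation,
# assuming (as in the CGAT pipeline framework) that P.run() resolves
# placeholders from the caller's local variables and the global PARAMS
# dictionary. run_statement_sketch is a hypothetical illustration, not the
# framework's API.
def run_statement_sketch(statement, local_vars, params):
    '''fill %(name)s placeholders from locals first, then params.'''
    namespace = dict(params)
    namespace.update(local_vars)
    # the real P.run() would submit the interpolated command to the
    # cluster or a local subshell
    return statement % namespace
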
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf
    file using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = " ".join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 " -tPrefix=%(target_genome)s."
                 " -qPrefix=%(query_genome)s."
                 " %(tmpf1)s"
                 " %(target_contigs)s"
                 " %(query_contigs)s"
                 " %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)

def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a
    file are merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            | bgzip > %%(outfile)s
            ''' % (infiles[0], infiles[1])
            P.run()

    else:
        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            # check the current file, not the first one
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin
            > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)

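# A minimal pure-Python sketch of the incremental strategy used by
# intersectBedFiles for three or more files: normalise the first file by
# merging overlapping intervals, then repeatedly keep only the intervals
# that overlap each subsequent (merged) file. Helper names are illustrative;
# the pipeline delegates this work to mergeBed and intersectBed.
def merge_overlapping(intervals):
    '''merge overlapping (start, end) intervals on a single contig.'''
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def incremental_intersection(interval_sets):
    '''keep intervals of the first set that overlap all other sets
    (the semantics of repeated ``intersectBed -u``).'''
    result = merge_overlapping(interval_sets[0])
    for other in (merge_overlapping(s) for s in interval_sets[1:]):
        result = [(start, end) for start, end in result
                  if any(start < oend and ostart < end
                         for ostart, oend in other)]
    return result
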
def mapReadsWithBowtie(infiles, outfile):
    '''map reads with bowtie'''
    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | cgat bam2bam --output-sam --method=set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''
    P.run()

def loadRepeatInformation(infiles, outfile):
    '''load repeat information.'''
    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

    # build a bed file with an interval spanning each contig
    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}'
    < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
    gunzip < %(repeatsfile)s
    | cgat gff2bed -v 0
    | coverageBed -a stdin -b %(tmpfilename)s
    | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );}
           {print;}'
    | cgat csv2db %(csv2db_options)s
          --table=%(table)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)

def loadSleuthTableGenes(infile, outfile,
                         gene_info, gene_biotypes,
                         database, annotations_database):

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT gene_id, gene_name
    FROM annotations.%(table)s %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=gene_id"
    P.load(tmpfile, outfile, options=options)

    os.unlink(tmpfile)

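# A worked example of the WHERE clause assembled above: a gene_biotypes
# value of "protein_coding,lincRNA" yields
#   WHERE gene_biotype = 'protein_coding' OR gene_biotype = 'lincRNA'
# build_where_clause is an illustrative stand-alone version of that logic,
# not a helper used by the pipeline.
def build_where_clause(gene_biotypes):
    '''build the biotype filter used in loadSleuthTableGenes.'''
    if not gene_biotypes:
        return ""
    return "WHERE " + " OR ".join(
        "gene_biotype = '%s'" % x for x in gene_biotypes.split(","))
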
def mergeSingleExpressionTables(infile, outfile):
    '''
    Merge refcoding and lncRNA count tables from a single condition
    if there are separate input reference gtfs.
    '''
    file1 = infile[0]
    file2 = infile[1]
    tmpfile = P.getTempFilename(shared=True)

    df1 = pd.read_table(file1, sep="\t",
                        index_col=0, header=0,
                        compression="gzip")
    df2 = pd.read_table(file2, sep="\t",
                        index_col=0, header=0,
                        compression="gzip")

    out_frame = df1.append(df2)

    out_frame.to_csv(tmpfile, sep="\t")

    statement = '''cat %(tmpfile)s | gzip > %(outfile)s;
    rm -rf %(tmpfile)s'''
    P.run()

def prepareBAMs(infile, outfile):
    '''filter bam files for medip-seq analysis.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality score.
    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()

    current_file = infile

    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %%(filtering_quality)i -b
        %(current_file)s
        2>> %%(outfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            # samtools rmdup needs explicit input/output files here, as the
            # statement fragments are joined with ";" rather than piped
            statement.append(
                '''samtools rmdup %(current_file)s %(next_file)s''' %
                locals())

        elif dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=true
            METRICS_FILE=%(outfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(outfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    # the shell statement removes tmpdir, so no cleanup is needed afterwards
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)

    P.run()

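# A worked example of the command assembled by prepareBAMs above, assuming
# filtering_quality=10 and the picard dedup method; after joining on " ; "
# the statement is roughly (paths abbreviated, Picard options elided):
#
#   mkdir <tmpdir> ;
#   samtools view -q 10 -b <infile> 2>> <outfile>.log > <tmpdir>/bam_0.bam ;
#   MarkDuplicates INPUT=<tmpdir>/bam_0.bam OUTPUT=<tmpdir>/bam_1.bam ... ;
#   mv <tmpdir>/bam_1.bam <outfile> ;
#   rm -rf <tmpdir> ;
#   samtools index <outfile>
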
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''make a BAM file suitable for peak calling.

    Infiles are merged and unmapped reads removed.

    If *dedup* is specified, duplicate reads are removed.
    This method uses Picard.

    If a *mask* is specified, reads falling within the mask are
    filtered out. This uses bedtools. The mask is a quicksect object
    containing the regions from which reads are to be excluded.
    '''
    # open the infiles, if more than one merge and sort first using samtools.
    samfiles = []
    num_reads = 0
    nfiles = 0

    statement = []

    tmpfile = P.getTempFilename(".")

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % " ".join(infiles))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()

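# A minimal sketch of the @IN@/@OUT@ convention used above, assuming that
# P.joinStatements chains the statement list by rewriting @OUT@ to a
# per-step temporary file and @IN@ to the previous step's output.
# join_statements_sketch is a hypothetical stand-in for the real helper.
def join_statements_sketch(statements, infile):
    '''chain shell fragments through per-step temporary files.'''
    chained = []
    current = infile
    for step, stmt in enumerate(statements):
        step_out = "step_%i.tmp" % step
        chained.append(
            stmt.replace("@IN@", current).replace("@OUT@", step_out))
        if "@OUT@" in stmt:
            # the next fragment reads from this step's output
            current = step_out
    return "; ".join(chained)
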
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:

    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features
       in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval
    as the tag count.

    Tiles with no counts will not be output.
    '''
    to_cluster = True

    src = " ".join(['''<( zcat %s
    | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' )''' % x
                    for x in infiles])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % \
            line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)

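# A small worked example of the pasted intermediate file parsed above,
# assuming two input tracks; tab-separated columns alternate interval_id
# and count:
#
#   chr1:0-1000     12   chr1:0-1000     7
#   chr1:1000-2000   0   chr1:1000-2000  0
#
# The first row yields "chr1:0-1000\t12\t7"; the second is skipped because
# its counts sum to zero. parse_paste_line is an illustrative helper, not
# part of the pipeline.
def parse_paste_line(line):
    '''parse one pasted row into (interval_id, counts).'''
    fields = line.rstrip("\n").split("\t")
    interval_ids = set(fields[0::2])
    counts = [int(x) for x in fields[1::2]]
    assert len(interval_ids) == 1, "interval ids disagree across tracks"
    return interval_ids.pop(), counts

# e.g. parse_paste_line("chr1:0-1000\t12\tchr1:0-1000\t7")
#      -> ("chr1:0-1000", [12, 7])
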
def mapReadsWithTophat(infiles, outfile):
    '''map reads with tophat'''
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["tophat_threads"]

    tmpfile = P.getTempFilename(".")

    # commented-out variant kept for reference:
    # qualfile = P.snip(infile, "csfasta.gz") + "qual.gz"
    # gunzip < %(infile)s > %(tmpfile)s.csfasta; checkpoint;
    # gunzip < %(qualfile)s > %(tmpfile)s.qual; checkpoint;

    statement = '''
    zcat %(infile)s
    | cgat fastq2solid
           --method=change-format
           --target-format=integer
           --pattern-identifier="%(tmpfile)s.%%s" >& %(outfile)s.log;
    checkpoint;
    tophat --output-dir %(outfile)s.dir
           --num-threads %(tophat_threads)s
           --library-type %(tophat_library_type)s
           --color
           --quals
           --integer-quals
           %(tophat_options)s
           %(tophat_genome_dir)s/%(genome)s_cs
           %(tmpfile)s.csfasta
           %(tmpfile)s.qual
           >> %(outfile)s.log;
    checkpoint;
    mv %(outfile)s.dir/accepted_hits.bam %(outfile)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual
    '''

    # use local parameters to overwrite default ones.
    P.run(**local_params)

    os.unlink(tmpfile)

def bed2BigWig(infiles, outfile):
    '''convert a coverage file to bigWig via a sorted bedGraph.'''
    infile, sizes = infiles
    infile = infile.replace(".bismark.cov", ".bedGraph")

    # need to sort first, can do this with tmp file
    tmp_infile = P.getTempFilename()

    statement = '''
    sort -k1,1 -k2,2n %(infile)s
    | awk '{OFS="\\t"; $3 = $3 + 1; print $1,$2,$3,$4}'
    > %(tmp_infile)s;
    checkpoint;
    bedGraphToBigWig %(tmp_infile)s %(sizes)s %(outfile)s;
    checkpoint;
    rm -rf %(tmp_infile)s'''
    P.run()

def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    transcriptome data.
    '''
    # Mapping will permit up to one mismatch. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches to be less than the genomic number of
    # mismatches. Change this, if the number of permitted mismatches
    # for the genome increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | cgat bam2bam --output-sam --method=set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/\\bSO:coordinate/}"
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''
    P.run()

def buildFeatureCounts(infiles, outfile):
    '''count reads falling into "features", which by default are genes.

    A read overlaps if at least one bp overlaps.

    Pairs and strandedness can be used to resolve reads falling into
    more than one feature. Reads that cannot be resolved to a single
    feature are ignored.
    '''
    infile, annotations = infiles

    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    annotations_tmp = P.getTempFilename()

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    if PARAMS['featurecounts_paired'] == "1":
        paired = "-p -B"
    else:
        paired = ""

    job_options = "-pe dedicated %i" % PARAMS['featurecounts_threads']

    statement = '''
    zcat %(annotations)s > %(annotations_tmp)s;
    checkpoint;
    featureCounts %(featurecounts_options)s
                  -T %(featurecounts_threads)s
                  -s %(featurecounts_strand)s
                  %(paired)s
                  -a %(annotations_tmp)s
                  -o %(outfile)s
                  %(infile)s
    > %(outfile)s.log;
    checkpoint;
    gzip %(outfile)s;
    checkpoint;
    rm %(annotations_tmp)s
    '''
    P.run()

def mapReadsWithBowtieAgainstJunctions(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    splice junctions.

    The reads are converted to genomic coordinates.
    '''
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # the statement removes tmpfile itself
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | cgat bam2bam --method=set-nh --log=%(outfile)s.log
    | cgat rnaseq_junction_bam2bam
           --contigs-tsv-file=%(contigs)s
           --log=%(outfile)s.log
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''
    P.run()

def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''
    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(["no_transcripts",
                          "no_genes",
                          "no_exons_per_transcript",
                          "no_exons_per_gene",
                          "no_single_exon_transcripts",
                          "no_multi_exon_transcripts",
                          "no_single_exon_genes",
                          "no_multi_exon_genes"]) + "\n")
    outf.write("\t".join(map(str, [
        PipelineLncRNA.CounterTranscripts(tmpf).count(),
        PipelineLncRNA.CounterGenes(tmpf).count(),
        PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
        PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
        PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
        PipelineLncRNA.CounterMultiExonGenes(tmpf).count()])) + "\n")
    outf.close()

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))

def extractControllLncRNAFastaAlignments(infiles, outfile):
    '''as extractLncRNAFastaAlignments, but for control gene models.'''
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)

def buildBigBed(infile, outfile):
    '''build a bigBed file of intervals that are covered by reads
    in any of the experiments.
    '''
    to_cluster = False

    tmpfile = P.getTempFilename()

    contig_sizes = os.path.join(PARAMS["annotations_dir"],
                                PARAMS_ANNOTATIONS["interface_contigs"])

    statement = '''
    zcat %(infile)s > %(tmpfile)s;
    bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;
    rm -f %(tmpfile)s
    '''
    P.run()

    # the statement normally removes the temporary file already
    try:
        os.unlink(tmpfile)
    except OSError:
        pass

def plotHeatmap(results, norm_matrix,
                threshold_stat, p_threshold, fc_threshold, outfile):
    '''plot heatmap of differentially abundant genes.'''
    if threshold_stat == "p":
        p = "P.Value"
    elif threshold_stat == "padj":
        p = "adj.P.Val"
    else:
        p = "adj.P.Val"

    temp = P.getTempFilename(".")

    R('''library(gplots)''')
    R('''library(gtools)''')
    E.info("reading data")
    R('''mat <- read.csv("%s",
                         header=T,
                         stringsAsFactors=F,
                         sep="\t")''' % norm_matrix)
    R('''rownames(mat) <- mat$taxa
         mat <- as.matrix(mat[, 1:ncol(mat) - 1])''')
    R('''dat <- read.csv("%s",
                         header=T,
                         stringsAsFactors=F,
                         sep="\t")''' % results)
    E.info("data loaded")

    R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' % (
        p, p_threshold, fc_threshold))
    R('''diff.genes <- unique(t)''')

    # this is a hack to avoid errors when only a single differentially
    # abundant feature is found
    R('''write.table(diff.genes,
                     file="%s",
                     row.names=F,
                     sep="\t")''' % temp)

    tmp = open(temp)
    tmp.readline()
    if len(tmp.readlines()) == 1:
        P.touch(outfile)
    else:
        R('''mat <- mat[as.character(diff.genes), ]
             samples <- colnames(mat)
             mat <- as.data.frame(t(apply(mat, 1, scale)))
             colnames(mat) <- samples
             mat <- mat[, mixedsort(colnames(mat))]
             colours = colorRampPalette(c("blue", "white", "red"))(75)
             pdf("%s", height=12, width=12)
             heatmap.2(as.matrix(mat),
                       trace="none",
                       scale="none",
                       col=colours,
                       Colv=F,
                       dendrogram="row",
                       margins=c(18, 18))
             dev.off()''' % outfile)
    tmp.close()

    os.unlink(temp)

def loadLncRNAPhyloCSF(infile, outfile):
    tmpf = P.getTempFilename("/ifs/scratch")
    PipelineLncRNA.parsePhyloCSF(infile, tmpf)
    P.load(tmpf, outfile, options="--add-index=gene_id")
    os.unlink(tmpf)

def buildNormalizedBAM(infiles, outfile, normalize=True):
    '''build a normalized BAM file.

    Infiles are merged and duplicated reads are removed. If *normalize*
    is set, reads are removed such that all files will have approximately
    the same number of reads.

    Note that duplicate removal here is imperfect, as no sense of
    strandedness is preserved.
    '''
    min_reads = getMinimumMappedReads(glob.glob("*.readstats"))

    samfiles = []
    num_reads = 0
    for infile, statsfile in infiles:
        samfiles.append(pysam.Samfile(infile, "rb"))
        num_reads += getMappedReads(statsfile)

    threshold = float(min_reads) / num_reads
    E.info("%s: min reads: %i, total reads=%i, threshold=%f" %
           (infiles, min_reads, num_reads, threshold))

    pysam_out = pysam.Samfile(outfile, "wb", template=samfiles[0])

    ninput, noutput, nduplicates = 0, 0, 0

    # iterate over mapped reads
    last_contig, last_pos = None, None
    for pysam_in in samfiles:
        for read in pysam_in.fetch():

            ninput += 1
            if read.rname == last_contig and read.pos == last_pos:
                nduplicates += 1
                continue

            # without normalization, keep every non-duplicate read
            if not normalize or random.random() <= threshold:
                pysam_out.write(read)
                noutput += 1

            last_contig, last_pos = read.rname, read.pos

        pysam_in.close()

    pysam_out.close()

    logs = IOTools.openFile(outfile + ".log", "w")
    logs.write("# min_reads=%i, threshold=%5.2f\n" %
               (min_reads, threshold))
    logs.write("set\tcounts\tpercent\n")
    logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0))
    nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" %
               (nduplicates, 100.0 * nduplicates / ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" %
               (nwithout_dups, 100.0 * nwithout_dups / ninput))
    logs.write("target\t%i\t%5.2f%%\n" %
               (min_reads, 100.0 * min_reads / nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" %
               (noutput, 100.0 * noutput / nwithout_dups))
    logs.close()

    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename(".")
        pysam.sort(outfile, tmpfilename)
        shutil.move(tmpfilename + ".bam", outfile)
        os.unlink(tmpfilename)

    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, min_reads))

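# A small worked example of the downsampling threshold used above: with
# min_reads = 1e6 and num_reads = 4e6, threshold = 0.25, so each
# non-duplicate read is written with probability 0.25, leaving roughly
# min_reads reads in expectation. keep_read is an illustrative helper,
# not part of the pipeline.
import random  # already imported at module level; repeated so the sketch stands alone


def keep_read(min_reads, num_reads):
    '''return True if a read should be written to the normalized BAM.'''
    threshold = float(min_reads) / num_reads
    return random.random() <= threshold
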
def buildIntervalCounts(infile, outfile, track, fg_replicates,
                        bg_replicates):
    '''count read density in bed files comparing stimulated versus
    unstimulated binding.
    '''
    samfiles_fg, samfiles_bg = [], []

    # collect foreground and background bam files
    for replicate in fg_replicates:
        samfiles_fg.append("%s.call.bam" % replicate.asFile())

    for replicate in bg_replicates:
        samfiles_bg.append("%s.call.bam" % replicate.asFile())

    samfiles_fg = [x for x in samfiles_fg if os.path.exists(x)]
    samfiles_bg = [x for x in samfiles_bg if os.path.exists(x)]

    samfiles_fg = ",".join(samfiles_fg)
    samfiles_bg = ",".join(samfiles_bg)

    tmpfile1 = P.getTempFilename(os.getcwd()) + ".fg"
    tmpfile2 = P.getTempFilename(os.getcwd()) + ".bg"

    # start counting
    to_cluster = True

    statement = """
    zcat < %(infile)s
    | cgat bed2gff --as-gtf
    | cgat gtf2table
          --counter=read-coverage
          --log=%(outfile)s.log
          --bam-file=%(samfiles_fg)s
    > %(tmpfile1)s"""
    P.run()

    if samfiles_bg:
        statement = """
        zcat < %(infile)s
        | cgat bed2gff --as-gtf
        | cgat gtf2table
              --counter=read-coverage
              --log=%(outfile)s.log
              --bam-file=%(samfiles_bg)s
        > %(tmpfile2)s"""
        P.run()

        statement = '''
        python %(toolsdir)s/combine_tables.py
               --add-file-prefix
               --regex-filename="[.](\\S+)$"
               %(tmpfile1)s %(tmpfile2)s
        > %(outfile)s
        '''
        P.run()

        os.unlink(tmpfile2)

    else:
        statement = '''
        python %(toolsdir)s/combine_tables.py
               --add-file-prefix
               --regex-filename="[.](\\S+)$"
               %(tmpfile1)s
        > %(outfile)s
        '''
        P.run()

    os.unlink(tmpfile1)
