def createMAFAlignment(infiles, outfile):
    """Build a gzipped, length-filtered MAF alignment from net.axt.gz files.

    All files in ``PARAMS["phyloCSF_location_axt"]`` whose name ends in
    ``net.axt.gz`` and does not match the ``phyloCSF_ignore`` regular
    expression are concatenated and converted into a single MAF file with
    ``axtToMaf``.  The MAF file is then filtered with
    ``PipelineLncRNA.filterMAF`` and both the filtered file and the file of
    removed blocks are gzipped.

    Parameters
    ----------
    infiles :
        Not used by this function.
    outfile : str
        Output filename; its ``.gz`` suffix is snipped off to obtain the
        name of the uncompressed MAF file written before gzipping.

    NOTE(review): the ``%(name)s`` placeholders are never filled in here, so
    ``P.run()`` presumably resolves them from the locals of this function
    and from PARAMS - keep local names in sync with the statements.
    """
    to_cluster = False

    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    ignore_pattern = re.compile(PARAMS["phyloCSF_ignore"])

    selected = sorted(
        os.path.join(axt_dir, name)
        for name in os.listdir(axt_dir)
        if name.endswith("net.axt.gz") and not ignore_pattern.search(name))
    axt_files = " ".join(selected)
    E.info("axt files from which MAF alignment will be created: %s"
           % axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(
        PARAMS["annotations_dir"],
        PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(
        PARAMS["phyloCSF_query_assembly"],
        PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")

    # concatenate the axt files, then convert the result to maf
    statement = (
        "zcat %(axt_files)s > %(tmpf1)s; "
        "axtToMaf  -tPrefix=%(target_genome)s. -qPrefix=%(query_genome)s. "
        "%(tmpf1)s %(target_contigs)s %(query_contigs)s %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    filtered = PipelineLncRNA.filterMAF(
        tmpf2,
        outfile,
        removed,
        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment because length of "
           "target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    for tmp_name in (tmpf1, tmpf2):
        os.unlink(tmp_name)

    statement = "gzip %(outfile)s; gzip %(removed)s"
    P.run()
def convertPslToChain(infile, outfile):
    '''Convert a gzipped psl file into a gzipped, netted chain file.

    The psl alignments are swapped, converted to chains and sorted; the
    sorted chains are then netted with ``chainNet`` and subset with
    ``netChainSubset``.  Contig sizes for the two genomes returned by
    ``extractGenomes(infile)`` are written to temporary files that are
    removed once the statement has run.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver

    NOTE(review): the placeholders in ``statement`` are presumably resolved
    by ``P.run()`` from this function's locals and PARAMS.
    '''
    to_cluster = True

    target, query = extractGenomes(infile)

    # one contig-size table per genome, both consumed by chainNet below
    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")
    size_tables = ((target, tmpfilename1), (query, tmpfilename2))
    for assembly, sizes_file in size_tables:
        writeContigSizes(assembly, sizes_file)

    statement = (
        "gunzip < %(infile)s "
        "| pslSwap stdin stdout "
        "| cgat psl2chain --log=%(outfile)s.log "
        "| chainSort stdin stdout "
        "| gzip > %(outfile)s.sorted.chain.gz; "
        "checkpoint; "
        "gunzip < %(outfile)s.sorted.chain.gz "
        "| chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null "
        "| netChainSubset stdin <( zcat %(outfile)s.sorted.chain ) stdout "
        "| gzip > %(outfile)s")
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
def extractLncRNAFastaAlignments(infiles, outfile):
    """Extract aligned sequence for each interval of a bed file from a MAF.

    The gzipped MAF file is decompressed into a temporary file, which is
    handed to ``PipelineLncRNA.extractMAFGeneBlocks`` together with the bed
    file, the genome file and the target/query genome names (gaps are not
    kept).  The number of extracted gene models is logged and the temporary
    file is removed.

    Parameters
    ----------
    infiles : tuple
        ``(bed_file, maf_file)``; ``maf_file`` is gzip-compressed.
    outfile : str
        Output filename passed on to ``extractMAFGeneBlocks``.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")

    to_cluster = False
    statement = "gunzip -c %(maf_file)s > %(maf_tmp)s"
    P.run()

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(
        bed_file,
        maf_tmp,
        os.path.join(PARAMS["genomedir"], PARAMS["genome"]),
        outfile,
        PARAMS["genome"],
        PARAMS["phyloCSF_query_genome"],
        keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def runBioProspector(infiles, outfile, dbhandle):
    '''Run BioProspector motif discovery on the sequences of one track.

    Sequences are written by ``writeSequencesForIntervals`` (dust-masked,
    ``full=True``) using the proportion configured in
    ``PARAMS["bioprospector_proportion"]``.  When no sequence is written
    the output file is only touched and BioProspector is skipped.

    The track name is ``outfile`` without its ``.bioprospector`` suffix;
    ``infiles`` is not used.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq != 0:
        statement = (
            " BioProspector -i %(tmpfasta)s %(bioprospector_options)s "
            "-o %(outfile)s > %(outfile)s.log ")
        P.run()
    else:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)

    os.unlink(tmpfasta)
def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):
    """Annotate a sleuth result table with transcript info and load it.

    The tab-separated ``infile`` is indexed by ``transcript_id`` and joined
    with transcript/gene annotation selected from the ``annotations``
    schema (table name = basename of ``transcript_info``).  The joined
    table is written to a temporary file and loaded with ``P.load`` with an
    index on ``transcript_id``.

    Parameters
    ----------
    gene_biotypes : str
        Comma-separated gene biotypes to restrict the annotation to; an
        empty value selects all rows.
    database, annotations_database :
        Passed to ``connect`` to obtain the database connection.
    """
    tmpfile = P.getTempFilename("/ifs/scratch/")
    table = os.path.basename(transcript_info)

    where_cmd = ""
    if gene_biotypes:
        conditions = ["gene_biotype = '%s'" % biotype
                      for biotype in gene_biotypes.split(",")]
        where_cmd = "WHERE " + " OR ".join(conditions)

    select = (
        "SELECT DISTINCT transcript_id, transcript_biotype, gene_id, "
        "gene_name FROM annotations.%s %s" % (table, where_cmd))

    sleuth = pd.read_table(infile, sep="\t")
    sleuth = sleuth.set_index("transcript_id", drop=True)

    annotation = pd.read_sql(select, connect(database, annotations_database))
    # keep transcript_id as a column as well as the index
    annotation = annotation.set_index("transcript_id", drop=False)

    sleuth.join(annotation).to_csv(tmpfile, sep="\t", index=True)

    P.load(tmpfile, outfile, options="--add-index=transcript_id")
    os.unlink(tmpfile)
def mapReadsWithBowtie(infiles, outfile):
    """Map colour-space reads with bowtie and write gzipped SAM output.

    The gzipped reads are decompressed into a temporary file, mapped with
    ``bowtie -C`` against the ``<genome>_cs`` index, post-processed with
    ``bam2bam.py --method=set-nh`` and gzipped into ``outfile``.  The
    temporary file is removed by the statement itself.

    Parameters
    ----------
    infiles : tuple
        ``(inifile, infile)``; only ``infile`` is used.
    outfile : str
        Gzipped output file.

    NOTE(review): ``job_options``/``job_threads`` and the placeholders in
    ``statement`` are presumably picked up by ``P.run()`` from this
    function's locals and PARAMS.
    """
    inifile, infile = infiles

    job_threads = PARAMS["bowtie_threads"]
    job_options = "-l mem_free=16G"

    tmpfile = P.getTempFilename()

    statement = (
        " gunzip < %(infile)s > %(tmpfile)s; "
        "checkpoint; "
        "bowtie -q --sam -C "
        "--threads %(bowtie_threads)s "
        "%(bowtie_options)s "
        "%(bowtie_genome_dir)s/%(genome)s_cs "
        "%(tmpfile)s "
        "| python %(scriptsdir)s/bam2bam.py --output-sam --method=set-nh "
        "--log=%(outfile)s.log "
        "| gzip > %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmpfile)s ")
    P.run()
def testMotifDisruptingSnpsNotEnriched(infiles, outfile):
    '''Test SNPs outside the annotation set for motif disruption.

    Lines of ``infiles[0]`` that do not occur in column 4 of the gzipped
    annotation file ``infiles[1]`` are selected with ``comm -23`` and
    passed to ``snps2motif.py`` (SNP ids in column 0).  The temporary SNP
    list is removed by the statement.
    '''
    infile, annot_file = infiles[0], infiles[1]

    job_memory = "6G"
    tmp = P.getTempFilename(shared=True)

    statement = (
        " comm -23 %(infile)s <(zcat %(annot_file)s | cut -f 4 | sort) "
        "> %(tmp)s; "
        "checkpoint; "
        "python /ifs/devel/projects/proj045/enrichment_pipeline/snps2motif.py "
        "--log=%(outfile)s.log "
        "--snp-column=0 "
        "--R-scripts-directory=%(r_scripts)s "
        "--R-script=%(motifs_script)s "
        "--additional-motif=%(motifs_pwms)s "
        "--image-directory=plots.dir "
        "%(tmp)s "
        "> %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmp)s")
    P.run()
def buildGff(infile, outfile):
    '''Create a flattened gff for DEXSeq from a gzipped gtf file.

    The gtf is decompressed into a temporary file and converted with
    ``dexseq_prepare_annotation.py`` found in ``PYTHONSCRIPTSDIR``.

    Parameters
    ----------
    infile : string
        Gzipped input filename in :term:`gtf` format
    outfile : string
        A :term:`gff` file for use in DEXSeq
    '''
    tmpgff = P.getTempFilename(".")

    # step 1: decompress the annotation
    statement = "gunzip -c %(infile)s > %(tmpgff)s"
    P.run()

    # step 2: flatten it with the script shipped with DEXSeq;
    # ``ps`` is referenced by the statement below
    ps = PYTHONSCRIPTSDIR
    statement = (
        "python %(ps)s/dexseq_prepare_annotation.py "
        "%(tmpgff)s %(outfile)s")
    P.run()

    os.unlink(tmpgff)
def loadRepeatInformation(infiles, outfile):
    '''Load per-contig repeat coverage into the database.

    A temporary bed file with one interval per line of ``indexfile``
    (columns 1 and 4 of the index become name and end, start is 0) is
    built first.  The gzipped repeats are then converted to bed, their
    coverage of those intervals is computed with ``coverageBed`` and the
    result is loaded with ``csv2db`` into the table named after ``outfile``
    without its ``.load`` suffix.

    Parameters
    ----------
    infiles : tuple
        ``(repeatsfile, indexfile)``.
    outfile : str
        Load log; must end in ``.load``.
    '''
    to_cluster = True

    repeatsfile, indexfile = infiles
    table = outfile[:-len(".load")]
    tmpfilename = P.getTempFilename(".")

    statement = (
        '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' '''
        "< %(indexfile)s > %(tmpfilename)s")
    P.run()

    statement = (
        " gunzip < %(repeatsfile)s "
        "| cgat gff2bed -v 0 "
        "| coverageBed -a stdin -b %(tmpfilename)s "
        '''| awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}' '''
        "|cgat csv2db %(csv2db_options)s --table=%(table)s "
        "> %(outfile)s ")
    P.run()

    os.unlink(tmpfilename)
def testMotifDisruptingSnpsEnrichedAnnotations(infile, outfile):
    '''Test SNPs in selected annotations for motif disruption.

    Lines of the gzipped ``infile`` matching both the
    ``annotations_regex`` and ``annotations_cell_regex`` patterns are
    written to a temporary file and passed to ``snps2motif.py`` (SNP ids in
    column 3).  The temporary file is removed by the statement.
    '''
    job_memory = "6G"
    tmp = P.getTempFilename(shared=True)

    statement = (
        " zcat %(infile)s "
        "| grep -P %(annotations_regex)s "
        "| grep -P %(annotations_cell_regex)s "
        "> %(tmp)s; "
        "checkpoint; "
        "python /ifs/devel/projects/proj045/enrichment_pipeline/snps2motif.py "
        "--log=%(outfile)s.log "
        "--snp-column=3 "
        "--R-scripts-directory=%(r_scripts)s "
        "--R-script=%(motifs_script)s "
        "--additional-motif=%(motifs_pwms)s "
        "--image-directory=plots.dir "
        "%(tmp)s "
        "> %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmp)s")
    P.run()
def aggregateWindowsReadCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate several results from coverageBed into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.

    Parameters
    ----------
    infiles : list
        Gzipped coverageBed outputs, one per track.  All files must list
        the same windows in the same order as they are pasted side by side.
    outfile : str
        Output table with an ``interval_id`` column followed by one count
        column per track.
    regex : str
        Pattern whose first group, applied to the basename of each input
        file, gives the track name.

    Raises
    ------
    AssertionError
        If a pasted line with non-zero counts refers to more than one
        window.
    '''
    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join(
        ['''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
         for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        # ``src`` and ``tmpfile`` are referenced by the statement
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [re.search(regex, os.path.basename(x)).groups()[0]
                  for x in infiles]

        # context managers close both handles even when a line is malformed
        with IOTools.openFile(outfile, "w") as outf, \
                open(tmpfile, "r") as inf:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            for line in inf:
                # strip only the newline so that an unterminated last line
                # does not lose its final digit
                data = line.rstrip("\n").split("\t")
                # fields alternate between window id and count
                genes = set(data[0::2])
                values = [int(x) for x in data[1::2]]
                if sum(values) == 0:
                    continue
                assert len(genes) == 1, \
                    "paste command failed, wrong number of genes per line: '%s'" % line
                outf.write("%s\t%s\n" % (data[0], "\t".join(map(str, values))))
    finally:
        # remove the temporary file on success and on failure
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Behaviour depends on the number of input files:

    * one file: the file is copied unchanged.
    * two files: intervals of the first file overlapping the second are
      kept (``intersectBed -u``), renumbered from 1 and bgzipped.
    * more files: each file is normalized with ``mergeBed`` (overlapping
      intervals within a file are merged) and intersected incrementally;
      the result is renumbered from 1 and bgzipped.

    If any input file is empty the intersection is empty and *outfile* is
    only touched.

    Parameters
    ----------
    infiles : list
        Filenames of :term:`bed` files.
    outfile : str
        Output filename.
    '''
    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    # An empty input makes the whole intersection empty.  Checking every
    # file up front (the previous loop re-tested only the first file) also
    # avoids creating a temporary file that is never cleaned up.
    if any(IOTools.isEmpty(x) for x in infiles):
        P.touch(outfile)
        return

    if len(infiles) == 2:
        statement = (
            " intersectBed -u -a %s -b %s "
            "| cut -f 1,2,3,4,5 "
            '''| awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' '''
            "| bgzip > %%(outfile)s ") % (infiles[0], infiles[1])
        P.run()
        return

    # need to merge incrementally; ``fn`` and ``tmpfile`` are referenced
    # by the statements below
    fn = infiles[0]
    tmpfile = P.getTempFilename(".")

    statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
    P.run()

    for fn in infiles[1:]:
        statement = (
            "mergeBed -i %(fn)s "
            "| intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; "
            "mv %(tmpfile)s.tmp %(tmpfile)s")
        P.run()

    statement = (
        "cat %(tmpfile)s "
        "| cut -f 1,2,3,4,5 "
        '''| awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' '''
        "| bgzip > %(outfile)s ")
    P.run()

    os.unlink(tmpfile)
def mergeSingleExpressionTables(infile, outfile):
    '''Merge refcoding and lncRNA count tables from a single condition.

    Used when there are separate input reference gtfs.  Both gzipped,
    tab-separated tables are read with their first column as index,
    stacked row-wise and written gzipped to ``outfile``.

    Parameters
    ----------
    infile : sequence
        The two gzipped count tables to merge (first two elements are
        used).
    outfile : str
        Gzipped merged table.
    '''
    file1 = infile[0]
    file2 = infile[1]

    tmpfile = P.getTempFilename(shared=True)

    df1 = pd.read_table(file1, sep="\t", index_col=0,
                        header=0, compression="gzip")
    df2 = pd.read_table(file2, sep="\t", index_col=0,
                        header=0, compression="gzip")

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
    # rows of both tables in the same way
    out_frame = pd.concat([df1, df2])
    out_frame.to_csv(tmpfile, sep="\t")

    # compress the table and remove the temporary file in one statement
    statement = '''cat %(tmpfile)s | gzip > %(outfile)s; rm -rf %(tmpfile)s'''
    P.run()
def buildReferenceGeneSet(infile, outfile):
    """Filter the full gene set and add attributes to build the reference set.

    The work is delegated to two helpers:

    * ``PipelineMapping.mergeAndFilterGTF`` merges/filters ``infile`` into
      a temporary file using ``max_intron_size`` and
      ``geneset_remove_contigs`` from PARAMS; removed entries go to
      ``<outfile>.removed.gz``.  Transcripts overlapping the repetitive RNA
      annotation are only considered when the key
      ``geneset_remove_repetetive_rna`` is present in PARAMS.
    * ``PipelineMapping.resetGTFAttributes`` then writes ``outfile`` from
      the temporary file (adding tss_id and p_id according to the original
      comment).

    Parameters
    ----------
    infile : str
        Input filename in :term:`gtf` format
    outfile : str
        Output filename in :term:`gtf` format
    annotations_interface_rna_gff : str
        :term:`PARAMS`. Filename of :term:`gtf` file containing
        repetitive rna annotations
    genome_dir : str
        :term:`PARAMS`. Directory of :term:fasta formatted files
    genome : str
        :term:`PARAMS`. Genome name (e.g hg38)
    """
    tmp_mergedfiltered = P.getTempFilename(".")

    # only the presence of the key is tested, not its value
    rna_file = (PARAMS["annotations_interface_rna_gff"]
                if "geneset_remove_repetetive_rna" in PARAMS
                else None)

    removed_file = "%s.removed.gz" % outfile
    gene_ids = PipelineMapping.mergeAndFilterGTF(
        infile,
        tmp_mergedfiltered,
        removed_file,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        max_intron_size=PARAMS["max_intron_size"],
        remove_contigs=PARAMS["geneset_remove_contigs"],
        rna_file=rna_file)

    # Add tss_id and p_id
    PipelineMapping.resetGTFAttributes(
        infile=tmp_mergedfiltered,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        gene_ids=gene_ids,
        outfile=outfile)

    os.unlink(tmp_mergedfiltered)
def prepareBAMs(infile, outfile):
    '''filter bam files for medip-seq analysis.

    Optional steps include:

    * quality score filtering - remove reads below a certain quality
      score (``PARAMS["filtering_quality"]`` > 0).
    * deduplication - remove duplicate reads
      (``PARAMS["filtering_dedup"]``) with the tool named in
      ``PARAMS["filtering_dedup_method"]`` (``samtools`` or ``picard``).

    Intermediate files live in a temporary directory that is removed by
    the statement.  The final file is moved to ``outfile`` and indexed.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown method.

    NOTE(review): when no filtering step is enabled the statement moves
    ``infile`` itself to ``outfile`` - confirm this is intended.
    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()
    # The temporary name is used as a directory: remove a placeholder file,
    # if one was created, so that ``mkdir`` below can succeed.
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)

    current_file = infile
    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append(
            '''samtools view -q %%(filtering_quality)i -b %(current_file)s 2>> %%(outfile)s.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            # read and write explicit files, as the picard branch does;
            # "rmdup - -" had no input and never produced next_file
            statement.append(
                '''samtools rmdup %(current_file)s %(next_file)s 2>> %%(outfile)s.log ''' % locals())
        elif dedup_method == 'picard':
            statement.append(
                '''MarkDuplicates INPUT=%(current_file)s OUTPUT=%(next_file)s ASSUME_SORTED=true METRICS_FILE=%(outfile)s.duplicate_metrics REMOVE_DUPLICATES=TRUE VALIDATION_STRINGENCY=SILENT 2>> %%(outfile)s.log ''' % locals())
        else:
            # fail early instead of moving a file that was never written
            raise ValueError(
                "unknown deduplication method '%s'" % dedup_method)

        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)

    P.run()

    # The statement already removes the directory; os.unlink() cannot
    # remove directories, so only clean up if something was left behind.
    if os.path.exists(tmpdir):
        import shutil
        shutil.rmtree(tmpdir, ignore_errors=True)
def download(self, genes=None, fields=None, scope=None, species=None):
    '''
    download an up to date ontology file, parse the xml data into a
    Python "ElementTree" and delete the ontology file.

    The file is fetched from ``self.datasource`` with ``wget`` and the
    parsed tree is stored in ``self.dataset``.  The arguments ``genes``,
    ``fields``, ``scope`` and ``species`` are accepted but not used here.

    The exit status of ``wget`` is not checked; a failed download surfaces
    as a parse error from ``ET.parse``.
    '''
    import subprocess

    ontologyfile = P.getTempFilename(".")
    try:
        # An argument list (no shell) keeps URLs containing characters
        # such as "&" or "?" intact.
        subprocess.call(["wget", "-O", ontologyfile, self.datasource])
        tree = ET.parse(ontologyfile)
    finally:
        # remove the download even when parsing fails
        if os.path.exists(ontologyfile):
            os.remove(ontologyfile)
    self.dataset = tree
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    ''' Make a BAM file suitable for peak calling.

    Infiles are merged and sorted with samtools if there is more than
    one.  If specified, duplicate reads are removed with Picard's
    MarkDuplicates.  If a mask is specified, reads falling within the mask
    are filtered out with ``intersectBed -v``.  The result is moved to
    ``outfile`` and indexed.

    Parameters
    ----------
    infiles : str or list
        A single BAM filename or a list of BAM filenames.
    outfile : str
        Output BAM filename.
    dedup : bool
        Remove duplicate reads if true.
    mask :
        Regions to exclude; passed to ``intersectBed -b``.
        NOTE(review): the original documentation called this a quicksect
        object, but it is interpolated into the command as a filename.

    The ``@IN@``/``@OUT@`` markers are resolved by ``P.joinStatements``.
    '''
    statement = []

    # open the infiles, if more than one merge and sort first using samtools.
    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates INPUT=@IN@ ASSUME_SORTED=true REMOVE_DUPLICATES=true QUIET=true OUTPUT=@OUT@ METRICS_FILE=%(outfile)s.picardmetrics VALIDATION_STRINGENCY=SILENT > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()
def clustersToBigBed(infile, genome_file, outfile):
    '''Convert a gzipped bed file of clusters into a bigBed file.

    The intervals are sorted by contig and start, column 5 is set to 1,
    and the result is converted with ``bedToBigBed`` using ``genome_file``.
    The temporary bed file is removed by the statement.
    '''
    checkParams()

    tmp = P.getTempFilename()

    statement = (
        " zcat %(infile)s "
        "| sort -k1,1 -k2,2n "
        '''| awk 'BEGIN{OFS="\\t"} $5=1' '''
        "> %(tmp)s; "
        "checkpoint; "
        "bedToBigBed %(tmp)s %(genome_file)s %(outfile)s; "
        "checkpoint; "
        "rm %(tmp)s")
    P.run()
def loadManualAnnotations(infile, outfile):
    """Load a manual annotation file, prefixing each row with its name.

    The annotation name is ``infile`` without the ``_annotations.tsv``
    suffix.  A temporary table is written whose header is
    ``<annotation>\\tgene_id`` and whose rows are the lines of ``infile``
    prefixed with the annotation name; it is loaded with an index on
    ``gene_id`` and then removed.
    """
    tmp = P.getTempFilename(".")
    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            # lines keep their own line terminator
            outf.writelines(
                "%s\t%s" % (annotation, row) for row in inf)

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def aggregateAdaptors(infiles, outfile):
    '''Collate fasta files into one contaminants file for adapter removal.

    The reverse complement of all input sequences is written to a
    temporary file; it is then concatenated with the original sequences
    and collapsed with ``fastx_collapser`` into ``outfile``.  The temporary
    file is removed by the statement.
    '''
    # both names are referenced by the statement below
    tempfile = P.getTempFilename()
    infiles = " ".join(infiles)

    statement = (
        " cat %(infiles)s "
        "| fastx_reverse_complement > %(tempfile)s; "
        "cat %(tempfile)s %(infiles)s "
        "| fastx_collapser > %(outfile)s; "
        "rm -f %(tempfile)s ")
    P.run()
def mapReadsWithTophat(infiles, outfile):
    """Map colour-space reads with tophat.

    The gzipped reads are converted with ``fastq2solid.py`` into
    ``<tmpfile>.csfasta`` and ``<tmpfile>.qual``, mapped with
    ``tophat --color`` against the ``<genome>_cs`` index, and the accepted
    hits are moved to ``outfile`` and indexed.

    Parameters
    ----------
    infiles : tuple
        ``(inifile, infile)``; parameters loaded from ``inifile`` are
        passed to ``P.run`` to overwrite the default ones.
    outfile : str
        Output BAM file.
    """
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    job_threads = PARAMS["tophat_threads"]
    job_options = "-l mem_free=16G"

    tmpfile = P.getTempFilename(".")

    # Earlier variant, kept for reference:
    # qualfile = P.snip(infile, "csfasta.gz" ) + "qual.gz"
    # gunzip < %(infile)s > %(tmpfile)s.csfasta; checkpoint;
    # gunzip < %(qualfile)s > %(tmpfile)s.qual; checkpoint;

    statement = (
        " zcat %(infile)s "
        "| python %(scriptsdir)s/fastq2solid.py "
        "--method=change-format --target-format=integer "
        '--pattern-identifier="%(tmpfile)s.%%s" '
        ">& %(outfile)s.log; "
        "checkpoint; "
        "tophat --output-dir %(outfile)s.dir "
        "--num-threads %(tophat_threads)s "
        "--library-type %(tophat_library_type)s "
        "--color --quals --integer-quals "
        "%(tophat_options)s "
        "%(tophat_genome_dir)s/%(genome)s_cs "
        "%(tmpfile)s.csfasta %(tmpfile)s.qual "
        ">> %(outfile)s.log; "
        "checkpoint; "
        "mv %(outfile)s.dir/accepted_hits.bam %(outfile)s; "
        "checkpoint; "
        "samtools index %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual ")

    # use local parameters to overwrite default ones.
    P.run(**local_params)

    os.unlink(tmpfile)
def buildRefFlat(infile, outfile):
    '''Build a flat gene set for Picard RnaSeqMetrics.

    ``gtfToGenePred`` converts ``infile`` into an extended genePred file;
    column 12 is then placed in front of columns 1-10 to give the layout
    written to ``outfile``.
    '''
    job_memory = PARAMS["job_memory"]
    tmpflat = P.getTempFilename(".")

    statement = (
        " gtfToGenePred -genePredExt -geneNameAsName2 "
        "%(infile)s %(tmpflat)s; "
        "paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s) "
        "> %(outfile)s ")
    P.run()

    os.unlink(tmpflat)
def makeCytoscapeInputs(infiles, outfile):
    """Write a term table for Cytoscape from an enrichment result table.

    Columns 1, 12, 8 and 9 of ``infiles[1]`` plus a constant ``+1`` column
    are extracted with awk.  Only rows whose ``term_id`` is listed in the
    file configured as ``PARAMS["cytoscape_<typ>"]`` are kept, where
    ``<typ>`` is the third-from-last ``_``-separated part of the input
    filename.  The columns are renamed to ``ID``, ``Description``,
    ``pvalue``, ``padj`` and ``Phenotype``.

    NOTE(review): filtering relies on the first line of the input being a
    header whose first field is ``term_id`` - confirm against the caller.
    """
    infile = infiles[1]
    T = P.getTempFilename(".")

    # "%%%%s" survives this interpolation and the one done when the
    # statement is run, leaving "%s" for awk's printf.  The stray backslash
    # that used to sit inside the awk program has been removed.
    statement = (
        ''' awk -F "\\t" '{printf("%%%%s\\t%%%%s\\t%%%%s\\t%%%%s\\t+1\\n", '''
        '''$1, $12, $8, $9)}' %(infile)s > %(T)s'''
    ) % {"infile": infile, "T": T}
    P.run()

    typ = infile.split("_")[-3]

    # close the list of terms to keep once it has been read
    with IOTools.openFile(PARAMS['cytoscape_%s' % typ]) as inf:
        keep = [line.strip() for line in inf]

    tab = pd.read_csv(T, sep="\t")
    tab = tab[tab['term_id'].isin(keep)]
    tab.columns = ['ID', 'Description', 'pvalue', 'padj', 'Phenotype']
    tab.to_csv(outfile, sep="\t", index=False)

    os.remove(T)
def loadClusterCounts(infiles, outfile):
    '''Load the number of significant clusters found in each sample.

    For every input file the number of lines is counted, and the method
    and track are parsed from a path of the form
    ``dedup_<method>.dir/<track>.clusters.bedgraph``.  The resulting
    (method, track, count) table is loaded into the database.
    '''
    name_pattern = re.compile(r"dedup_(.+).dir/(.+)\.clusters.bedgraph")

    tmp = P.getTempFilename(shared=True)

    results = []
    for infile in infiles:
        count = IOTools.getNumLines(infile)
        method, track = name_pattern.match(infile).groups()
        results.append((method, track, count))

    IOTools.writeLines(tmp, results, header=["method", "track", "count"])

    P.load(tmp, outfile)
    os.unlink(tmp)
def validateCramFiles(infile, outfiles):
    '''Validate CRAM files by exit status of cramtools qstat.

    Save the quality scores of cram files.

    Parameters
    ----------
    infile : str
        CRAM file to validate.
    outfiles : tuple
        ``(outfile, outfile_quality)``: the exit status of
        ``cramtools qstat`` is written to the first file and the first two
        columns of its output to the second.
    '''
    outfile, outfile_quality = outfiles

    temp_quality = P.getTempFilename()

    # the last command removes the temporary file, which was previously
    # left behind
    statement = (
        "cramtools qstat -I %(infile)s > %(temp_quality)s; "
        "echo $? > %(outfile)s; "
        '''cat %(temp_quality)s | awk '{OFS="\\t"} {print $1,$2}' '''
        "> %(outfile_quality)s; "
        "rm -f %(temp_quality)s")
    P.run()
def clustersToBigBed(infile, outfile):
    '''Convert a gzipped bed file of clusters into a bigBed file.

    The intervals are sorted by contig and start, column 5 is set to 1,
    and the result is converted with ``bedToBigBed`` using the contig
    sizes file from the annotations directory.  The temporary bed file is
    removed by the statement.
    '''
    checkParams()

    tmp = P.getTempFilename()

    # referenced by the statement below
    genome_file = os.path.join(
        PARAMS["annotations_dir"],
        PARAMS_ANNOTATIONS["interface_contigs_tsv"])

    statement = (
        " zcat %(infile)s "
        "| sort -k1,1 -k2,2n "
        '''| awk 'BEGIN{OFS="\\t"} $5=1' '''
        "> %(tmp)s; "
        "checkpoint; "
        "bedToBigBed %(tmp)s %(genome_file)s %(outfile)s; "
        "checkpoint; "
        "rm %(tmp)s")
    P.run()
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This implementation takes column 4 of each input file as the tag
    count.  Tiles with no counts will not be output.

    Parameters
    ----------
    infiles : list
        Gzipped coverage files, one per track, listing the same tiles in
        the same order.  The track name is the basename up to the first
        dot.
    outfile : str
        Output table with an ``interval_id`` column followed by one count
        column per track.

    Raises
    ------
    AssertionError
        If a pasted line with non-zero counts refers to more than one
        tile.
    '''
    to_cluster = True

    src = " ".join(
        ['''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) ''' % x
         for x in infiles])

    tmpfile = P.getTempFilename(".")
    try:
        # ``src`` and ``tmpfile`` are referenced by the statement
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

        # context managers close both handles even when a line is malformed
        with IOTools.openFile(outfile, "w") as outf, \
                open(tmpfile, "r") as inf:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            for line in inf:
                # strip only the newline so that an unterminated last line
                # does not lose its final digit
                data = line.rstrip("\n").split("\t")
                # fields alternate between tile id and count
                genes = set(data[0::2])
                values = [int(x) for x in data[1::2]]
                if sum(values) == 0:
                    continue
                assert len(genes) == 1, \
                    "paste command failed, wrong number of genes per line: '%s'" % line
                outf.write("%s\t%s\n" % (data[0], "\t".join(map(str, values))))
    finally:
        # remove the temporary file on success and on failure
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def bed2BigWig(infiles, outfile):
    """Convert a bedGraph file derived from bismark coverage to bigWig.

    The bedGraph filename is obtained by replacing ``.bismark.cov`` with
    ``.bedGraph`` in the first input name.  The file is sorted, the end
    coordinate is incremented by one, and the first four columns are
    converted with ``bedGraphToBigWig``.

    Parameters
    ----------
    infiles : tuple
        ``(infile, sizes)`` - coverage filename and chromosome sizes file.
    outfile : str
        Output bigWig file.
    """
    infile, sizes = infiles
    infile = infile.replace(".bismark.cov", ".bedGraph")

    # need to sort first, can do this with tmp file
    tmp_infile = P.getTempFilename()

    # "\\t" hands the escape to awk, as the other awk statements in this
    # file do, instead of embedding a literal tab in the command line
    statement = (
        " sort -k1,1 -k2,2n %(infile)s "
        '''| awk '{OFS="\\t"; $3 = $3 + 1; print $1,$2,$3,$4}' '''
        "> %(tmp_infile)s; "
        "checkpoint; "
        "bedGraphToBigWig %(tmp_infile)s %(sizes)s %(outfile)s; "
        "checkpoint; "
        "rm -rf %(tmp_infile)s")
    P.run()
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    """map reads from short read archive sequence using bowtie against
    transcriptome data.

    Parameters
    ----------
    infiles : tuple
        ``(infile, reffile, contigs)`` - gzipped reads, transcriptome
        fasta file (``.fa``; the bowtie index is ``<prefix>_cs``) and the
        contigs file passed to ``samtools import``.
    outfile : str
        Sorted output BAM file (``.bam``); it is indexed after sorting.
    """
    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # Changes to the statement:
    # * "samtools index" is now terminated with ";" - previously the
    #   following "checkpoint" was passed to samtools as an argument.
    # * the perl replacement no longer starts with "\b", which perl
    #   interprets as a backspace character in replacement text.
    statement = (
        " gunzip < %(infile)s > %(tmpfile)s; "
        "checkpoint; "
        "bowtie -q --sam -C --un /dev/null "
        "--threads %(bowtie_threads)s "
        "%(transcriptome_options)s "
        "--best --strata -a "
        "%(prefix)s_cs "
        "%(tmpfile)s "
        "| python %(scriptsdir)s/bam2bam.py --output-sam --method=set-nh "
        "--log=%(outfile)s.log "
        '| perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/SO:coordinate/}" '
        "| samtools import %(contigs)s - - "
        "| samtools sort - %(track)s; "
        "checkpoint; "
        "samtools index %(outfile)s; "
        "checkpoint; "
        "rm -f %(tmpfile)s ")
    P.run()
def runCufflinks(infiles, outfile):
    '''Estimate expression levels for one gene set / BAM file pair.

    cufflinks is run inside a temporary directory; its log goes to
    ``outfile`` and the transcripts and the isoform and gene tracking files
    are gzipped next to it.  The temporary directory is removed afterwards.

    Parameters
    ----------
    infiles : tuple
        ``(gtffile, bamfile)``; the gtf file is gzipped (``.gtf.gz``).
    outfile : str
        Log/output prefix.
    '''
    gtffile, bamfile = infiles

    job_threads = PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    # the temporary name is used as a directory by the statement
    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    # the statement changes directory, so all paths must be absolute
    gtffile, bamfile, outfile = [
        os.path.abspath(path) for path in (gtffile, bamfile, outfile)]

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = (
        "mkdir %(tmpfilename)s; "
        "cd %(tmpfilename)s; "
        "cufflinks --label %(track)s "
        "--GTF <(gunzip < %(gtffile)s) "
        "--num-threads %(cufflinks_threads)i "
        "--frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa "
        "--library-type %(cufflinks_library_type)s "
        "%(cufflinks_options)s "
        "%(bamfile)s "
        "| grep -v 'BAM record error' "
        ">& %(outfile)s; "
        'perl -p -e "s/\\0/./g" < transcripts.gtf '
        "| gzip > %(outfile)s.gtf.gz; "
        "gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz; "
        "gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz; ")
    P.run()

    shutil.rmtree(tmpfilename)
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    The output is only touched (left empty) when no mitochondrial contig
    is configured in ``PARAMS["numts_mitochrom"]``, when that contig is not
    found in the indexed genome, or when the extracted sequence is empty.

    The raw exonerate hits are kept in ``<outfile>.links.gz``; hits scoring
    below ``PARAMS["numts_score"]`` are not written to ``outfile``.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    # extract the mitochondrial sequence into a temporary fasta file
    tmpfile_mito = P.getTempFilename(".")

    statement = ''' cgat index_fasta --extract=%(numts_mitochrom)s --log=%(outfile)s.log %(genome_dir)s/%(genome)s > %(tmpfile_mito)s '''
    P.run()

    if IOTools.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    # exonerate --ryo fields, in the order they are unpacked per line below
    format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi", "C")
    # tab-separated "%<field>" codes; the backslashes are doubled because
    # the value passes through further quoting layers in the statement
    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    # score threshold handed to exonerate (--score); the threshold from
    # PARAMS is applied separately when converting to gtf below
    min_score = 100
    statement = ''' cat %(genome_dir)s/%(genome)s.fasta | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1 --log=%(outfile)s.log "exonerate --target %%STDIN%% --query %(tmpfile_mito)s --model affine:local --score %(min_score)i --showalignment no --showsugar no --showcigar no --showvulgar no --ryo \\"%(format)s\\n\\" " | grep -v -e "exonerate" -e "Hostname" | gzip > %(outfile)s.links.gz '''
    P.run()

    # convert to gtf
    inf = IOTools.openFile("%s.links.gz" % outfile)
    outf = IOTools.openFile(outfile, "w")

    # from here on min_score is the reporting threshold
    min_score = PARAMS["numts_score"]

    # counts input, skipped and output hits
    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")
        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        # swap the coordinates of minus-strand hits so that start < end
        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end
        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        # gene and transcript ids both encode the matching query interval
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the UCSC,
    ribosomal protein coding genes (selected by GO term) and CpG islands.
    The annotations can be partially or fully overlapping.

    Adjacent features (less than ``distance`` bp apart) of the same type
    are merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetetive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
            annotations, see :func:`annotateGenome`.

       4. ``geneset_flat``, a flattened gene set in :term:`gtf` format, see
            :func:`buildFlatGeneSet`.

       5. ``cpgisland``, a gzipped :term:`bed` formatted file with CpG
            islands; the first three columns are used.

       6. ``go``, a gzipped tab-separated file with GO assignments; lines
            matching the ribosomal GO id are selected and column 2 is
            used as the gene list.

    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.

    '''
    repeats_gff, rna_gff, annotations_gtf, geneset_flat_gff, \
        cpgisland_bed, go_tsv = infiles

    # one intermediate file per annotation source: <tmpfile>_0 .. _5
    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(6)]

    # add ENSEMBL annotations
    statement = (
        " zcat %(annotations_gtf)s "
        "| cgat gtf2gtf --method=sort --sort-order=gene "
        "| cgat gtf2gtf --method=merge-exons --log=%(outfile)s.log "
        "| cgat gff2bed --set-name=gene_biotype --is-gtf "
        "--log=%(outfile)s.log "
        "| sort -k 1,1 -k2,2n "
        "| cgat bed2bed --method=merge --merge-by-name "
        "--merge-distance=%(distance)i --log=%(outfile)s.log "
        "> %(tmpfile)s_0 ")
    P.run()

    # rna
    statement = (
        " zcat %(repeats_gff)s %(rna_gff)s "
        "| cgat gff2bed --set-name=family --is-gtf -v 0 "
        "| sort -k1,1 -k2,2n "
        "| cgat bed2bed --method=merge --merge-by-name "
        "--merge-distance=%(distance)i --log=%(outfile)s.log "
        "> %(tmpfile)s_1")
    P.run()

    # add aggregate intervals for repeats
    statement = (
        " zcat %(repeats_gff)s "
        "| cgat gff2bed --set-name=family --is-gtf -v 0 "
        '''| awk -v OFS="\\t" '{$4 = "repeats"; print}' '''
        "| sort -k1,1 -k2,2n "
        "| cgat bed2bed --method=merge --merge-by-name "
        "--merge-distance=%(distance)i --log=%(outfile)s.log "
        "> %(tmpfile)s_2")
    P.run()

    # add aggregate intervals for rna
    statement = (
        " zcat %(rna_gff)s "
        "| cgat gff2bed --set-name=family --is-gtf -v 0 "
        '''| awk -v OFS="\\t" '{$4 = "repetetive_rna"; print}' '''
        "| sort -k1,1 -k2,2n "
        "| cgat bed2bed --method=merge --merge-by-name "
        "--merge-distance=%(distance)i --log=%(outfile)s.log "
        "> %(tmpfile)s_3 ")
    P.run()

    # add ribosomal protein coding genes
    goids = ("GO:0003735", )

    # one "-e <id>" option per GO id; joining with "-e " alone would glue
    # the ids together as soon as more than one is listed
    patterns = " ".join("-e %s" % goid for goid in goids)

    statement = (
        " zcat %(geneset_flat_gff)s "
        "| cgat gtf2gtf "
        "--map-tsv-file=<(zcat %(go_tsv)s | grep %(patterns)s "
        "| cut -f 2 | sort | uniq) "
        "--method=filter --filter-method=gene --log=%(outfile)s.log "
        "| cgat gff2bed --log=%(outfile)s.log "
        '''| awk -v OFS="\\t" '{$4 = "ribosomal_coding"; print}' '''
        "| sort -k1,1 -k2,2n "
        "| cgat bed2bed --method=merge --merge-by-name "
        "--merge-distance=%(distance)i --log=%(outfile)s.log "
        "> %(tmpfile)s_4 ")
    P.run()

    # CpG islands
    statement = (
        " zcat %(cpgisland_bed)s "
        '''| awk '{printf("%%s\\t%%i\\t%%i\\tcpgisland\\n", $1,$2,$3 )}' '''
        "> %(tmpfile)s_5 ")
    P.run()

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different number of field
    files = " ".join(tmpfiles)
    statement = (
        " sort --merge -k1,1 -k2,2n %(files)s "
        "| cut -f 1-4 "
        "| gzip > %(outfile)s ")
    P.run()

    for x in tmpfiles:
        os.unlink(x)

    # the base temporary file only serves as a prefix; remove it as well
    # so that it does not accumulate in the shared temporary directory
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase "pseudo".
      This is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding transcripts.

    For genomes whose name starts with ``dm`` the exonerate step is
    skipped and only records whose source column matches ``pseudogene``
    are written.

    Arguments
    ---------
    infiles : list
        Filenames of ENSEMBL geneset in :term:`gtf` format
        and associated peptide sequences in :term:`fasta` format.
    outfile : filename
        Output in :term:`gtf` format with inferred or annotated
        pseudogenes. A side file ``<outfile>.links.gz`` with the raw
        peptide/transcript/score links is also written.
    dbhandle : object
        Database handle for extracting transcript biotypes.
    '''
    # NOTE(review): ``P.run()`` takes no arguments, so ``statement`` and
    # the names it interpolates are presumably read from this function's
    # local scope - keep local names in sync with the templates.
    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = (
            "zcat %(infile_gtf)s "
            "|awk '$2 ~ /pseudogene/' "
            "| gzip > %(outfile)s")
        P.run()
        return

    tmpfile1 = P.getTempFilename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = (
        " zcat %(infile_gtf)s "
        "| awk '$2 ~ /processed/' "
        "| cgat gff2fasta "
        "--is-gtf "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(outfile)s.log "
        "> %(tmpfile1)s ")
    P.run()

    if IOTools.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = (
        ' cat %(tmpfile1)s '
        '| %(cmd-farm)s '
        '--split-at-regex="^>(\\S+)" '
        '--chunk-size=100 '
        '--log=%(outfile)s.log '
        '"exonerate --target %%STDIN%% '
        '--query %(infile_peptides_fasta)s '
        '--model %(model)s '
        '--bestn 1 '
        '--score 200 '
        '--ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\" '
        '--showalignment no --showsugar no '
        '--showcigar no --showvulgar no " '
        '| grep -v -e "exonerate" -e "Hostname" '
        '| gzip > %(outfile)s.links.gz ')
    P.run()

    os.unlink(tmpfile1)

    # keep the highest scoring peptide per transcript; on ties the
    # later link in the file wins
    best_matches = {}
    inf = IOTools.openFile("%s.links.gz" % outfile)
    try:
        for line in inf:
            peptide_id, transcript_id, score = line[:-1].split("\t")
            score = int(score)
            if transcript_id in best_matches and \
               best_matches[transcript_id][0] > score:
                continue
            best_matches[transcript_id] = (score, peptide_id)
    finally:
        inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    query = ("SELECT DISTINCT transcript_id "
             "FROM transcript_info "
             "WHERE transcript_biotype like '%pseudo%' "
             "OR gene_biotype like '%pseudo%' ")
    known_pseudos = set([x[0] for x in cc.execute(query)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               (len(new_pseudos),
                len(known_pseudos),
                len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    # write every GTF record that belongs to a pseudogenic transcript
    outf = IOTools.openFile(outfile, "w")
    try:
        gtf_handle = IOTools.openFile(infile_gtf)
        try:
            for gtf in GTF.iterator(gtf_handle):
                c.input += 1
                if gtf.transcript_id not in all_pseudos:
                    continue
                c.output += 1
                outf.write("%s\n" % gtf)
        finally:
            gtf_handle.close()
    finally:
        outf.close()

    E.info("exons: %s" % str(c))
def plotHeatmap(results, norm_matrix, threshold_stat, p_threshold,
                fc_threshold, outfile):
    '''plot heatmap of differentially abundant genes.

    The features to plot are those rows of `results` whose p-value
    column is below `p_threshold` and whose absolute ``logFC`` exceeds
    `fc_threshold`. If fewer than two such features are found, an empty
    `outfile` is created instead of a plot, as ``heatmap.2`` cannot
    draw a matrix with fewer than two rows.

    Arguments
    ---------
    results : string
        Filename of a tab-separated table with the columns ``taxa``,
        ``logFC`` and the selected p-value column.
    norm_matrix : string
        Filename of a tab-separated abundance matrix with a ``taxa``
        column.
    threshold_stat : string
        ``"p"`` selects the ``P.Value`` column; any other value
        (e.g. ``"padj"``) selects ``adj.P.Val``.
    p_threshold : float
        Upper bound (exclusive) on the selected p-value column.
    fc_threshold : float
        Lower bound (exclusive) on the absolute log fold change.
    outfile : string
        Filename of the output pdf.
    '''
    p = "P.Value" if threshold_stat == "p" else "adj.P.Val"

    temp = P.getTempFilename(".")
    try:
        R('''library(gplots)''')
        R('''library(gtools)''')

        E.info("reading data")
        R('''mat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % norm_matrix)
        R('''rownames(mat) <- mat$taxa
             mat <- as.matrix(mat[,1:ncol(mat)-1])''')
        R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % results)
        E.info("data loaded")

        R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' %
          (p, p_threshold, fc_threshold))
        R('''diff.genes <- unique(t)''')

        ##############################
        # this is a hack
        # to avoid errors when
        # too few differentially
        # abundant features are found:
        # the selection is written to
        # a file and counted in python
        ##############################
        R('''write.table(diff.genes, file = "%s", row.names = F, sep = "\t")''' % temp)

        with open(temp) as tmp:
            # first line is the header written by write.table
            tmp.readline()
            n_features = len(tmp.readlines())

        if n_features < 2:
            P.touch(outfile)
        else:
            R('''mat <- mat[as.character(diff.genes), ]
                 samples <- colnames(mat)
                 mat <- as.data.frame(t(apply(mat, 1, scale)))
                 colnames(mat) <- samples
                 mat <- mat[, mixedsort(colnames(mat))]
                 colours = colorRampPalette(c("blue", "white", "red"))(75)
                 pdf("%s", height = 12, width = 12)
                 heatmap.2(as.matrix(mat), trace = "none", scale = "none", col = colours, Colv = F, dendrogram = "row", margins = c(18, 18))
                 dev.off()''' % outfile)
    finally:
        # always remove the scratch file, even if R fails
        os.unlink(temp)
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    If the directory ``<infile>.dir`` does not exist, an empty
    `outfile` is created and nothing is loaded.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)
    try:
        # ignore promoters and splicing - no fold change column, but
        # sqrt(JS)
        for fn, level in (("cds_exp.diff.gz", "cds"),
                          ("gene_exp.diff.gz", "gene"),
                          ("isoform_exp.diff.gz", "isoform"),
                          # ("promoters.diff.gz", "promotor"),
                          # ("splicing.diff.gz", "splice"),
                          ("tss_group_exp.diff.gz", "tss")):

            tablename = prefix + "_" + level + "_diff"

            infile = os.path.join(indir, fn)

            results = parseCuffdiff(infile, min_fpkm=min_fpkm)
            Expression.writeExpressionResults(tmpname, results)
            P.load(tmpname, outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   "--add-index=treatment_name "
                   "--add-index=control_name "
                   "--add-index=test_id")
    finally:
        # the scratch file is only needed while the diff tables load
        if os.path.exists(tmpname):
            os.unlink(tmpname)

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    sample_lookup = {}
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    try:
        # skip header
        inf.readline()
        for line in inf:
            line = line.split("\t")
            our_sample_name = IOTools.snip(line[0])
            our_sample_name = re.sub("-", "_", our_sample_name)
            cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
            sample_lookup[cuffdiff_sample_name] = our_sample_name
    finally:
        inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        # NOTE(review): there is no "_" between level and
        # "sample_fpkms"; kept as is because the table names may be
        # referenced elsewhere - confirm before changing.
        tablename = prefix + "_" + level + "sample_fpkms"

        inf = IOTools.openFile(os.path.join(indir, fn))
        try:
            lines = inf.readlines()
        finally:
            inf.close()

        samples = set()
        genes = {}

        # the first line is the header
        for line in lines[1:]:
            fields = line.split()
            gene_id = fields[0]
            condition = fields[1]
            replicate = fields[2]
            fpkm = fields[6]
            status = fields[8]

            sample_id = condition + "_" + replicate
            samples.add(sample_id)

            # NOTE(review): the first record seen for a gene stores the
            # fpkm without looking at `status`; later records store the
            # status instead of the fpkm when it is not "OK". This
            # asymmetry is preserved from the original code.
            if gene_id not in genes:
                genes[gene_id] = {sample_id: fpkm}
            elif sample_id in genes[gene_id]:
                raise ValueError(
                    'sample_id %s appears twice in file for gene_id %s'
                    % (sample_id, gene_id))
            elif status != "OK":
                genes[gene_id][sample_id] = status
            else:
                genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty
        if not samples:
            continue

        tmpf = P.getTempFilename(".")
        try:
            outf = IOTools.openFile(tmpf, "w")
            try:
                headers = "gene_id\t" + "\t".join(
                    [sample_lookup[x] for x in samples])
                outf.write(headers + "\n")

                # one row per gene, one tab-separated value per sample
                for gene in genes:
                    values = "\t".join(
                        [genes[gene][s] for s in samples])
                    outf.write(gene + "\t" + values + "\n")
            finally:
                outf.close()

            P.load(tmpf,
                   outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   " --add-index=gene_id")
        finally:
            os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Every non-empty motif file is searched against the foreground
    database and, separately, against the control sequences. The
    ``mast.txt`` outputs are concatenated, each preceded by a
    ``:: motif = ... ::`` header line, and the result is gzipped into
    `outfile`.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    Arguments
    ---------
    infiles : tuple
        ``(controlfile, dbfile, motiffiles)``; `motiffiles` is an
        iterable of motif filenames.
    outfile : string
        Output filename (gzip compressed).
    '''

    # job_options = "-l mem_free=8000M"

    def _append_header(path, template, motif_path):
        # write one section header line to the end of `path`
        stem = os.path.splitext(motif_path)[0]
        handle = IOTools.openFile(path, "a")
        handle.write(template % stem)
        handle.close()

    controlfile, dbfile, motiffiles = infiles

    # nothing to search: produce an empty result
    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    # NOTE(review): ``P.run()`` takes no arguments; the names used in
    # the templates below are presumably read from this scope.
    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        _append_header(tmpfile,
                       ":: motif = %s - foreground ::\n",
                       motiffile)

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = (
            " cat %(dbfile)s "
            "| mast %(motiffile)s - "
            "-nohtml "
            "-oc %(tmpdir)s "
            "-ev %(mast_evalue)f "
            "%(mast_options)s "
            ">> %(outfile)s.log 2>&1; "
            "cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 ")
        P.run()

        _append_header(tmpfile,
                       ":: motif = %s - background ::\n",
                       motiffile)

        statement = (
            " cat %(controlfile)s "
            "| mast %(motiffile)s - "
            "-nohtml "
            "-oc %(tmpdir)s "
            "-ev %(mast_evalue)f "
            "%(mast_options)s "
            ">> %(outfile)s.log 2>&1; "
            "cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 ")
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
def resetGTFAttributes(infile, genome, gene_ids, outfile):
    """set GTF attributes in :term:`gtf` formatted file so that they are
    compatible with cufflinks.

    This method runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id. Afterwards the original
    transcript and gene identifiers are restored and ``tss_id`` /
    ``p_id`` values are made unique per gene.

    Arguments
    ---------
    infile : string
        Filename of :term:`gtf`-formatted input file
    genome : string
        Filename (without extension) of indexed genome file
        in :term:`fasta` format.
    gene_ids : dict
        Dictionary mapping transcript ids to gene ids.
    outfile : string
        Output filename in :term:`gtf` format
    """

    def _assign_unique(entry, attribute, value, owners, owner):
        # Extend `value` with "a" until it is either unused or already
        # registered for `owner`, then record and set it.
        while value in owners and owners[value] != owner:
            value += "a"
        owners[value] = owner
        entry.setAttribute(attribute, value)

    tmpfile1 = P.getTempFilename(".")
    tmpfile2 = P.getTempFilename(".")

    #################################################
    E.info("adding tss_id and p_id")

    # The p_id attribute is set if the fasta sequence is given.
    # However, there might be some errors in cuffdiff downstream:
    #
    # cuffdiff: bundles.cpp:479: static void HitBundle::combine(const std::
    # vector<HitBundle*, std::allocator<HitBundle*> >&, HitBundle&): Assertion
    # `in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id()' failed.
    #
    # I was not able to resolve this, it was a complex
    # bug dependent on both the read libraries and the input reference gtf
    # files
    job_memory = "5G"

    # NOTE(review): ``P.run()`` takes no arguments; `statement`,
    # `job_memory` and the interpolated names are presumably read from
    # this scope.
    statement = (
        " cuffcompare -r <( gunzip < %(infile)s ) "
        "-T "
        "-s %(genome)s.fa "
        "-o %(tmpfile1)s "
        "<( gunzip < %(infile)s ) "
        "<( gunzip < %(infile)s ) "
        "> %(outfile)s.log ")
    P.run()

    #################################################
    E.info("resetting gene_id and transcript_id")

    # reset gene_id and transcript_id to ENSEMBL ids
    # cufflinks patch:
    # make tss_id and p_id unique for each gene id
    writer = IOTools.openFile(tmpfile2, "w")
    map_tss2gene = {}
    map_pid2gene = {}
    combined = IOTools.openFile(tmpfile1 + ".combined.gtf")

    for gtf in GTF.iterator(combined):
        # oId carries the identifier the transcript had in the input
        transcript_id = gtf.oId
        gene_id = gene_ids[transcript_id]
        gtf.setAttribute("transcript_id", transcript_id)
        gtf.setAttribute("gene_id", gene_id)

        # both attributes are optional
        tss_id = getattr(gtf, "tss_id", None)
        p_id = getattr(gtf, "p_id", None)

        if tss_id:
            _assign_unique(gtf, "tss_id", tss_id, map_tss2gene, gene_id)
        if p_id:
            _assign_unique(gtf, "p_id", p_id, map_pid2gene, gene_id)

        writer.write(str(gtf) + "\n")

    writer.close()

    # sort gtf file
    PipelineGeneset.sortGTF(tmpfile2, outfile)

    # make sure tmpfile1 is NEVER empty
    assert tmpfile1
    for leftover in glob.glob(tmpfile1 + "*"):
        os.unlink(leftover)
    os.unlink(tmpfile2)
def summarizeTagsWithinContext(tagfile, contextfile, outfile,
                               min_overlap=0.5, job_memory="15G"):
    '''count occurrences of tags in genomic context.

    Examines the genomic context to where tags align.

    A tag is assigned to the genomic context that it overlaps by at
    least `min_overlap` (a fraction, 50% by default). Thus some reads
    mapping several contexts might be dropped.

    Tags that do not intersect any interval in `contextfile` are
    counted under the name ``intergenic``.

    Arguments
    ---------
    tagfile : string
        Filename with tags. The file can be :term:`bam` or :term:`bed` format.
    contextfile : string
        Filename of :term:`bed` formatted files with named intervals (BED4).
    outfile : string
        Output in :term:`tsv` format (gzip compressed).
    min_overlap : float
        Minimum overlap (fraction) to count features as overlapping.
    job_memory : string
        Memory to reserve.
    '''
    # NOTE(review): ``P.run()`` takes no arguments; `statement`,
    # `job_memory` and the interpolated names are presumably read from
    # this function's local scope.
    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(2)]

    try:
        statement = (
            " cgat bam_vs_bed "
            "--min-overlap=%(min_overlap)f "
            "--log=%(outfile)s.log "
            "%(tagfile)s %(contextfile)s "
            "> %(tmpfile)s_0 ")
        P.run()

        # the intergenic row is assembled in two steps: label, then count
        statement = ''' printf "intergenic\\t" >> %(tmpfile)s_1'''
        P.run()

        statement = (
            " bedtools intersect -a %(tagfile)s "
            "-b %(contextfile)s -bed -v "
            "| wc -l "
            "| xargs printf "
            ">> %(tmpfile)s_1 ")
        P.run()

        files = " ".join(tmpfiles)
        statement = (
            " sort --merge %(files)s "
            "| gzip > %(outfile)s ")
        P.run()
    finally:
        # remove the per-step files and the placeholder created by
        # P.getTempFilename itself
        for x in tmpfiles + [tmpfile]:
            if os.path.exists(x):
                os.unlink(x)
def aggregateWindowsTagCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

       1 Contig
       2 Start
       3 Stop
       4 Name
       5 The number of features in A that overlapped (by at least one
         base pair) the B interval.
       6 The number of bases in B that had non-zero coverage from features in A.
       7 The length of the entry in B.
       8 The fraction of bases in B that had non-zero coverage from
         features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    i.e. the fourth column from the end, as reported by
    ``Bed.getNumColumns`` for the first input file.

    All input files are pasted side by side, so they are expected to
    list the same intervals in the same order. Consecutive lines with
    the same interval are written only once.

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename.  The default removes any suffix.

    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one process substitution per input: "contig:start-end<TAB>count"
    template = (
        """<( zcat %s """
        """| awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""")
    src = " ".join([template % (x, column) for x in infiles])

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    c = E.Counter()

    tmpfile = P.getTempFilename(".")
    try:
        # NOTE(review): ``P.run()`` takes no arguments; `statement`,
        # `src` and `tmpfile` are presumably read from this scope.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            # filter for uniqueness - keys with the same value as the
            # previous line will be ignored.
            last_gene = None
            with open(tmpfile, "r") as inf:
                for line in inf:
                    c.input += 1
                    data = line[:-1].split("\t")
                    # even fields are interval ids, odd fields counts
                    genes = list(set(data[0::2]))
                    values = [int(v) for v in data[1::2]]

                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    if genes[0] == last_gene:
                        c.duplicates += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%s\n" %
                               (genes[0], "\t".join(map(str, values))))
                    last_gene = genes[0]
        finally:
            outf.close()
    finally:
        os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include:

    For paired end data, pairs are merged and optionally
    filtered by insert size.

    The individual steps are chained into a single statement; each
    filtering step writes an intermediate bam file into a temporary
    directory that is removed by the last step. The resulting bed file
    is bgzip-compressed and indexed with tabix.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set to a value above 0, remove reads with a quality score
        below given threshold.
    filtering_dedup : bool
        Deduplication is performed whenever this is not ``None``.
        NOTE(review): this means ``False`` also triggers
        deduplication; kept for backwards compatibility - confirm
        against callers.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown
        `filtering_dedup_method`.
    '''
    # NOTE(review): ``P.run()`` takes no arguments; `statement` and the
    # names left as ``%(name)s`` in it (`bedfile`, `current_file`,
    # `tmpdir`, ...) are presumably read from this scope. Templates
    # formatted with ``% locals()`` here use ``%%(bedfile)s`` to defer
    # that placeholder to ``P.run()``.
    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile

    # P.getTempFilename creates a file; replace it by a directory of
    # the same name inside the statement
    tmpdir = P.getTempFilename()
    os.unlink(tmpdir)
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view -q %(filtering_quality)i -b %(current_file)s 2>> %%(bedfile)s.quality.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s | cgat bam2bam --method=filter --filter-method=unique,mapped --log=%%(bedfile)s.nonunique.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requries an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            # rmdup only handles paired-end reads unless -s is given
            rmdup_options = "" if is_paired else "-s "
            statement.append('''samtools rmdup %(rmdup_options)s%(current_file)s %(next_file)s ''' % locals())
        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates INPUT=%(current_file)s OUTPUT=%(next_file)s ASSUME_SORTED=TRUE METRICS_FILE=%(bedfile)s.duplicate_metrics REMOVE_DUPLICATES=TRUE VALIDATION_STRINGENCY=SILENT 2>> %%(bedfile)s.markdup.log ''' % locals())
        else:
            raise ValueError(
                "unknown deduplication method: %s" % filtering_dedup_method)

        nfiles += 1
        current_file = next_file

    # pairs are merged into a single interval, subject to the
    # configured insert size range
    if is_paired:
        bam2bed_options = (
            "--merge-pairs "
            "--min-insert-size=%(filtering_min_insert_size)i "
            "--max-insert-size=%(filtering_max_insert_size)i ")
    else:
        bam2bed_options = ""

    statement.append(
        "cat %(current_file)s "
        "| cgat bam2bed " +
        bam2bed_options +
        "--log=%(bedfile)s.bam2bed.log - "
        "| cgat bed2bed "
        "--method=sanitize-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(bedfile)s.sanitize.log "
        "| cut -f 1,2,3,4 "
        "| sort -k1,1 -k2,2n "
        "| bgzip > %(bedfile)s")

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")

    statement = " ; checkpoint; ".join(statement)
    P.run()
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    * With a single input file, the file is copied unchanged.
    * With two input files, intervals of the first file overlapping
      the second are kept.
    * With more than two input files, each file is normalized
      (overlapping intervals within a file are merged) before the
      files are intersected incrementally.

    In the latter two cases intervals are renumbered starting from 1.
    If any input file is empty, an empty *outfile* is created.

    Arguments
    ---------
    infiles : list
        Filenames of the :term:`bed` files to intersect.
    outfile : string
        Output filename in :term:`bed` format.

    Raises
    ------
    ValueError
        If *infiles* is empty.
    '''
    if len(infiles) == 0:
        raise ValueError("no input files given")

    # NOTE(review): ``P.run()`` takes no arguments; `statement`,
    # `to_cluster` and the interpolated names are presumably read from
    # this function's local scope.
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    if len(infiles) == 2:
        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = ''' intersectBed -u -a %s -b %s | cut -f 1,2,3,4,5 | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' > %%(outfile)s ''' % (infiles[0], infiles[1])
            P.run()
        return

    # an empty input means an empty intersection
    for fn in infiles:
        if IOTools.isEmpty(fn):
            P.touch(outfile)
            return

    tmpfile = P.getTempFilename(".")
    try:
        # need to merge incrementally
        fn = infiles[0]
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s | cut -f 1,2,3,4,5 | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' > %(outfile)s '''
        P.run()
    finally:
        os.unlink(tmpfile)
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the UCSC
    as well as UTR and intron annotations.
    The annotations can be partially or fully overlapping.

    Adjacent features (less than `distance` bp apart, 10 by default)
    of the same type are merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
            annotations, see :func:`annotateGenome`.

       4. ``utr``, a :term:`gtf` formatted file; intervals are named
          by their feature column.

       5. ``intron``, a :term:`gtf` formatted file; intervals are
          named by their feature column.

    outfile : string
       Output filename in :term:`bed` format (gzip compressed).
    distance : int
       Merge adjacent features of the same type within this distance.

    '''
    # NOTE(review): ``P.run()`` takes no arguments; `statement` and the
    # interpolated names are presumably read from this function's local
    # scope.
    repeats_gff, rna_gff, annotations_gtf, utr_gtf, intron_gtf = infiles

    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(4)]

    try:
        # add ENSEMBL annotations
        statement = (
            " zcat %(annotations_gtf)s "
            "| cgat gtf2gtf "
            "--method=sort --sort-order=gene "
            "| cgat gtf2gtf "
            "--method=merge-exons "
            "--log=%(outfile)s.log "
            "| cgat gff2bed "
            "--set-name=gene_biotype "
            "--is-gtf "
            "--log=%(outfile)s.log "
            "| sort -k 1,1 -k2,2n "
            "| cgat bed2bed "
            "--method=merge "
            "--merge-by-name "
            "--merge-distance=%(distance)i "
            "--log=%(outfile)s.log "
            "> %(tmpfile)s_0 ")
        P.run()

        # repeats and rna, named by family
        statement = (
            " zcat %(repeats_gff)s %(rna_gff)s "
            "| cgat gff2bed "
            "--set-name=family "
            "--is-gtf "
            "-v 0 "
            "| sort -k1,1 -k2,2n "
            "| cgat bed2bed "
            "--method=merge "
            "--merge-by-name "
            "--merge-distance=%(distance)i "
            "--log=%(outfile)s.log "
            "> %(tmpfile)s_1")
        P.run()

        # utr
        statement = (
            "zcat %(utr_gtf)s "
            "| cgat gff2bed "
            "--is-gtf "
            "--set-name=feature "
            "| sort -k1,1 -k2,2n "
            "| cgat bed2bed "
            "--method=merge "
            "--merge-by-name "
            "--merge-distance=%(distance)i "
            "--log=%(outfile)s.log "
            "> %(tmpfile)s_2")
        P.run()

        # intron
        statement = (
            "zcat %(intron_gtf)s "
            "| cgat gff2bed "
            "--is-gtf "
            "--set-name=feature "
            "| sort -k1,1 -k2,2n "
            "| cgat bed2bed "
            "--method=merge "
            "--merge-by-name "
            "--merge-distance=%(distance)i "
            "--log=%(outfile)s.log "
            "> %(tmpfile)s_3")
        P.run()

        # sort and merge
        # remove strand information as bedtools
        # complains if there are annotations with
        # different number of field
        files = " ".join(tmpfiles)
        statement = (
            " sort --merge -k1,1 -k2,2n %(files)s "
            "| cut -f 1-4 "
            "| gzip "
            "> %(outfile)s ")
        P.run()
    finally:
        # remove the per-annotation files and the placeholder created
        # by P.getTempFilename itself
        for x in tmpfiles + [tmpfile]:
            if os.path.exists(x):
                os.unlink(x)
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
               for each bin of expression and fold-change

    .unspiked.gz: The corresponding counts for the observed
               (not spiked-in) intervals, using the same bins.

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data
                    at given level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    If the spike file derived from `infile` (``.tsv.gz`` replaced by
    ``.spike.gz``) does not exist, an empty `outfile` is created.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''
    # NOTE(review): ``P.run()`` takes no arguments; `statement` and the
    # interpolated names are presumably read from this function's local
    # scope.
    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        P.touch(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.getTempFilename(shared=True)

    try:
        statement = '''zcat %(spikefile)s | grep -e "^spike" -e "^test_id" > %(tmpfile_name)s '''
        P.run()

        E.debug("outputting spiked counts")
        (spiked, spiked_d2hist_counts, xedges, yedges,
         spiked_l10average, spiked_l2fold) = \
            outputSpikeCounts(
                outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
                infile_name=tmpfile_name,
                expression_nbins=expression_nbins,
                fold_nbins=fold_nbins)

        ########################################
        # output and load unspiked results
        statement = '''zcat %(infile)s | grep -v -e "^spike" > %(tmpfile_name)s '''
        P.run()
        E.debug("outputting unspiked counts")

        (unspiked, unspiked_d2hist_counts, unspiked_xedges,
         unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
            outputSpikeCounts(
                outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
                infile_name=tmpfile_name,
                expression_bins=xedges,
                fold_bins=yedges)

        E.debug("computing power")

        # spiked and unspiked histograms must share their expression
        # bins, otherwise bins can not be matched up below
        assert numpy.array_equal(xedges, unspiked_xedges), \
            "expression bins differ between spiked and unspiked data"

        fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
        power_thresholds = numpy.arange(0.1, 1.1, 0.1)

        unspiked_total = float(unspiked_d2hist_counts.sum().sum())

        # the scratch file is re-used for the per-bin table that is
        # uploaded into the database
        tmpfile = IOTools.openFile(tmpfile_name, "w")
        try:
            tmpfile.write("\t".join(
                ("expression",
                 "fold",
                 "fdr",
                 "counts",
                 "percent")) + "\n")

            outf = IOTools.openFile(outfile, "w")
            try:
                outf.write("fdr\tpower\tintervals\tintervals_percent\n")

                # significant results
                for fdr in fdr_thresholds:
                    take = spiked['qvalue'] < fdr

                    # compute 2D histogram in spiked data below fdr
                    # threshold
                    spiked_d2hist_fdr, xedges, yedges = \
                        numpy.histogram2d(spiked_l10average[take],
                                          spiked_l2fold[take],
                                          bins=(xedges, yedges))

                    # convert to percentage of spike-ins per bin
                    spiked_d2hist_fdr_normed = \
                        spiked_d2hist_fdr / spiked_d2hist_counts
                    spiked_d2hist_fdr_normed = numpy.nan_to_num(
                        spiked_d2hist_fdr_normed)

                    # set values without data to -1
                    spiked_d2hist_fdr_normed[
                        spiked_d2hist_counts == 0] = -1.0

                    # output to table for database upload
                    for x, y in itertools.product(
                            range(len(xedges) - 1),
                            range(len(yedges) - 1)):
                        tmpfile.write("\t".join(map(
                            str,
                            (xedges[x], yedges[y],
                             fdr,
                             spiked_d2hist_fdr[x, y],
                             100.0 * spiked_d2hist_fdr_normed[x, y]))) +
                            "\n")

                    # take elements in spiked_hist_fdr above a certain
                    # threshold
                    for power in power_thresholds:
                        # select 2D bins at a given power level
                        power_take = spiked_d2hist_fdr_normed >= power

                        # select the counts in the unspiked data
                        # according to this level
                        power_counts = unspiked_d2hist_counts[power_take]

                        outf.write("\t".join(map(
                            str,
                            (fdr, power,
                             power_counts.sum().sum(),
                             100.0 * power_counts.sum().sum() /
                             unspiked_total))) + "\n")
            finally:
                outf.close()
        finally:
            tmpfile.close()

        # upload into table
        method = P.snip(os.path.dirname(outfile), ".dir")
        tablename = P.toTable(
            P.snip(outfile, "power.gz") + method + ".spike.load")

        P.load(tmpfile_name,
               outfile + ".log",
               tablename=tablename,
               options="--add-index=fdr")
    finally:
        if os.path.exists(tmpfile_name):
            os.unlink(tmpfile_name)
def loadLncRNAPhyloCSF(infile, outfile):
    '''parse PhyloCSF output and load it into the database.

    `infile` is converted by ``PipelineLncRNA.parsePhyloCSF`` into a
    temporary file, which is then loaded with an index on ``gene_id``.
    The temporary file is removed afterwards.

    Arguments
    ---------
    infile : string
        Input filename with PhyloCSF results.
    outfile : string
        Output filename passed to ``P.load``.
    '''
    # NOTE(review): the scratch location is hard-coded - confirm that
    # /ifs/scratch exists on all systems this runs on.
    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        if os.path.exists(tmpf):
            os.unlink(tmpf)