def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads
                    -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run()
def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''

    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    genomefile = PARAMS["%s_genome" % track]

    to_cluster = True

    for infile in infiles:

        E.info("adding %s" % infile)

        statement = '''gunzip < %(infile)s
             | cgat maf2psl
                  --query=%(track)s
                  --target=%(maf_master)s
                  --log=%(outfile)s.log
             | cgat psl2psl
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genomefile)s
                  --target-psl-file=%(genome)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True
    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
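# A hypothetical pipeline.ini entry consumed by buildIndirectMaps: for an
# indirect map "hg19ToMm10" routed via panTro4, one might configure:
#
#   [hg19ToMm10]
#   path=hg19ToPanTro4,panTro4ToMm10
#
# Each part listed in path must have a corresponding "<part>.over.psl.gz"
# file on disk. The track, assembly and file names above are illustrative
# only, not taken from an actual configuration.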
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id
                           FROM (SELECT contig_id, AVG(coverage) as ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])

    outf = open(outfile, "w")
    E.info("%i contigs pass the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def spikeInCounts(infiles, outfile):
    '''
    Perform spike-in across a specific range of fold changes or
    absolute count differences.  Counts table generated from
    original input counts data.
    '''

    counts_file = infiles[0]
    design_file = infiles[1]

    statement = '''
    zcat %(counts_file)s |
    python %(scriptsdir)s/counts2counts.py
    --design-tsv-file=%(design_file)s
    --method="spike"
    --spike-type="row"
    --spike-change-bin-max=3.0
    --spike-change-bin-width=0.1
    --spike-change-bin-min=0.1
    --spike-initial-bin-width=1
    --spike-initial-bin-min=1
    --spike-initial-bin-max=200000
    --spike-minimum=1
    --spike-maximum=1000000
    --random-seed=%(random_seed)i
    --spike-iterations=%(spike_iterations)i
    -v 5
    --log=%(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
def calculateFalsePositiveRate(infiles, outfile):
    '''
    taxonomy false positives and negatives etc
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):] ==
         os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute(
                    """SELECT taxa FROM %s
                       WHERE level == '%s' AND relab > %f"""
                    % (tablename_true, level, float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute(
                    """SELECT taxon FROM %s
                       WHERE taxon_level == '%s' AND rel_abundance > %f"""
                    % (tablename_estimate, level, float(cutoff))):
                estimate_set.add(taxa[0])

            total_true = len(true_set)
            total_estimate = len(estimate_set)

            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true

            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
def buildBAMStats(infile, outfile):
    '''Count number of reads mapped, duplicates, etc.'''

    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]

    statement = '''cgat bam2stats
                   --force-output
                   --output-filename-pattern=%(outfile)s.%%s
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")

    statement = '''CollectAlignmentSummaryMetrics
                   INPUT=%(infile)s
                   REFERENCE_SEQUENCE=%%(samtools_genome)s
                   ASSUME_SORTED=true
                   OUTPUT=%(outfile)s
                   VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''

    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
             FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        # use a separate name for the file handle so that `outfile`
        # still refers to the filename if the query fails
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        P.touch(outfile)
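# Usage sketch for getCpGIslandsFromUCSC (illustrative only): this assumes
# the PipelineUCSC helper used elsewhere in this codebase, valid UCSC mysql
# credentials in PARAMS, and a hypothetical output filename.
#
#   import CGATPipelines.PipelineUCSC as PipelineUCSC
#
#   dbhandle = PipelineUCSC.connectToUCSC(
#       host=PARAMS["ucsc_host"],
#       user=PARAMS["ucsc_user"],
#       database="hg19")
#   getCpGIslandsFromUCSC(dbhandle, "cpgislands.bed.gz")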
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def buildRawGenomeAlignment(infiles, outfile):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    for infile in infiles:
        # skip maf files without Hsap on top.
        if "other" in infile or "supercontig" in infile:
            continue

        E.info("adding %s" % infile)

        genome_query, genome_target = getGenomes()

        statement = '''gunzip < %(infile)s
             | python %(scriptsdir)s/maf2psl.py
                  --query=%(maf_name_query)s
                  --target=%(maf_name_target)s
                  --log=%(outfile)s.log
             | python %(scriptsdir)s/psl2psl.py
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genome_query)s
                  --target-psl-file=%(genome_target)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard'''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
          --method=set-sequence --output-sam
    | CollectMultipleMetrics
          INPUT=/dev/stdin
          REFERENCE_SEQUENCE=%(genome_file)s
          ASSUME_SORTED=true
          OUTPUT=%(outfile)s
          VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """Receives a MAF file containing pairwise alignments and a bed12
    file containing intervals. Outputs a single fasta file containing
    aligned sequence for each interval.
    """

    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def loadAnnotations(infile, outfile):
    '''load variant annotations into database'''
    P.load(infile, outfile,
           options="--map=gene_id:str "
           "--add-index=gene_id "
           "--map=base_qualities:text ")
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=sort --sort-order=gene
                          --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py
                          --method=renumber-genes
                          --pattern-identifier=%(gene_pattern)s%%i
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=renumber-transcripts
                          --pattern-identifier=%(transcript_pattern)s%%i
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=sort --sort-order=gene
                          --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        # the first record is sufficient to decide which branch to take
        break

    P.run()
def buildFilteredLncRNAGeneSet(infile, outfile):
    '''
    Depending on filtering_remove_single_exon, will:
    i) remove all single exon transcripts from all lncrna models
       (transcripts)
    ii) remove lncrna loci that only contain single exon transcripts
       (loci)
    iii) leave all single-exon and multi-exon loci in outfile
       (None)

    An illustrative GTF record showing the status attributes the filters
    below rely on is given after this function.
    '''

    if not PARAMS["filtering_remove_single_exon"]:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = ("cp %(infile)s %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "loci":
        E.info("Warning: removing loci that contain only single-exon"
               " transcripts")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "transcripts":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unrecognised parameter %s"
                         % PARAMS["filtering_remove_single_exon"])
    P.run()
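# The grep filters above assume status attributes of the kind set earlier in
# the lncRNA pipeline. An illustrative (made-up) GTF record:
#
#   chr1  lncRNA  exon  1000  2000  .  +  .  gene_id "G1"; transcript_id "T1"; exon_status "s"; exon_status_locus "m";
#
# 'exon_status "s"' marks a single-exon transcript and 'exon_status_locus "s"'
# a locus whose transcripts are all single-exon; grep -v drops matching lines.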
def mapReadsWithBowtie(infiles, outfile):
    """map reads with bowtie"""

    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = """
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py
           --output-sam
           --method=set-nh
           --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    """
    P.run()
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from a GO analysis and
    uploads them into a single table.
    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run()
def buildAllStats(infiles, outfile):
    '''
    paste stats together
    '''
    statement = '''paste %s > %s''' % (" ".join(infiles), outfile)
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts
    '''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA
    transcripts from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig,
                                                exon.source,
                                                exon.feature,
                                                exon.start,
                                                exon.end,
                                                ".",
                                                exon.strand,
                                                "."])) +
                            "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig,
                                                 exon.source,
                                                 exon.feature,
                                                 exon.start,
                                                 exon.end,
                                                 ".",
                                                 exon.strand,
                                                 "."])) +
                             "\t" + exon.attributes + "\n")

    # close the gzip handles so the output files are not truncated
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a fasta file,
    write a CpG content file for those intervals
    '''

    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
                  -s na -s cpg -s length
                  --log=%(outfile)s.log
    > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def importRepeatsFromUCSC(infile, outfile,
                          ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
               '.', strand, '.',
               CONCAT('class \\"', repClass, '\\"; family \\"',
                      repFamily, '\\";')
               FROM %(table)s
               WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
      --method=sanitize
      --sanitize-method=genome
      --skip-missing
      --genome-file=%(genome)s
      --log=%(outfile)s.log
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                      %(filenames)s
                | perl -p -e "s/bin/track/"
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
            """
    P.run()
def publish_report():
    '''publish report in the CGAT downloads directory.'''
    E.info("publishing report")
    P.publish_report()
def loadConsensusMetrics(infile, outfile):
    P.load(infile, outfile)


def loadConsensusClustering(infile, outfile):
    P.load(infile, outfile)


def loadClusterEigengenes(infile, outfile):
    '''
    Load module eigengene expression profiles into DB
    '''
    P.load(infile, outfile)


def loadMatchClusterExpression(infile, outfile):
    P.load(infile, outfile)
import itertools
import os
import re
import sqlite3
import glob
import pandas as pd
import rpy2.robjects as ro
import CGAT.Experiment as E
import CGAT.Timeseries as Timeseries
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################
# Pipeline configuration
# load options from the config file
import CGATPipelines.Pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), "(\S+).gtf.gz")
TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), "(\S+).bam")
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
def matchBgSequenceComposition(gc_load_files,
                               background,
                               foreground,
                               fasta_file,
                               outfile,
                               database="csvdb",
                               header_line=True,
                               bg_stat="pCpG",
                               stat="fisher"):
    '''
    take the background set and subset it for intervals with a
    sequence composition distribution that is the same as the
    foreground set.

    Subsetting is done without replacement. This requires that the
    background set is sufficiently large; if the returned
    matched_background set is <90% of the size of the foreground set,
    the pipeline will crash.
    '''

    background_file = open(background)
    foreground_file = open(foreground)
    if header_line:
        background_file.readline()
        foreground_file.readline()

    background_set = set()
    foreground_set = set()
    for interval_id in background_file.readlines():
        background_set.add(interval_id[:-1])
    for interval_id in foreground_file.readlines():
        foreground_set.add(interval_id[:-1])

    dbh = sqlite3.connect(database)
    cc = dbh.cursor()

    tablenames = [filenameToTablename(P.snip(os.path.basename(x), ".load"))
                  for x in gc_load_files]

    # jj: cpg scores rounded to three dp.
    # background dict - key <cpg score>: val <set of gene_ids with that score>
    # foreground dict - key <gene_id>: val <cpg score>
    gc = {"background": collections.defaultdict(set), "foreground": {}}

    for tablename in tablenames:
        # MM: need to make sure `-` in filenames don't break the sql statement
        tablename = tablename.replace("-", "_")
        for data in cc.execute("""SELECT * FROM %s;""" % tablename):
            interval_id = data[3].split(" ")[0]
            cpg = data[2]
            # jj: bin background by cpg score rounded to three dp
            cpg_str = "%.3f" % cpg
            if re.search("background", tablename):
                if interval_id in background_set:
                    gc["background"][cpg_str].add(interval_id)
            elif re.search("foreground", tablename):
                if interval_id in foreground_set:
                    gc["foreground"][interval_id] = cpg_str
            else:
                raise ValueError("Unrecognized table name %s. Should contain"
                                 " 'foreground' or 'background'" % tablename)

    # debug: pickle and dump the gc dict
    pickle_file = P.snip(foreground, ".foreground.tsv") + ".p"
    pickle.dump(gc, open(pickle_file, "wb"))

    # match the background set to the foreground set by taking a random
    # background interval with the same sequence composition as each
    # foreground interval.
    outf = open(outfile, "w")
    if header_line:
        outf.write("gene_id\n")

    # jj: sample background gene_ids without replacement
    matched_background = set()
    X = 0
    for interval, cpg in gc["foreground"].iteritems():
        if cpg in gc["background"]:
            # get set of bg gene_ids with relevant cpg score,
            # excluding any foreground genes
            bg_gene_ids = gc["background"][cpg] - foreground_set
            if bg_gene_ids:
                # select one gene_id to add to matched_background
                bg_id = random.sample(bg_gene_ids, 1)[0]
                matched_background.add(bg_id)
                # remove selected background gene_id from set
                gc["background"][cpg].remove(bg_id)
            else:
                X += 1
                E.warn("Missing background gene for %s %s, no gene with"
                       " matching %s" %
                       (foreground_file.name, interval, bg_stat))
        else:
            X += 1
            E.warn("Missing background gene for %s %s, no gene with"
                   " matching %s" %
                   (foreground_file.name, interval, bg_stat))

    # MM: only need to check sufficient background size for Fisher's exact
    # test: the matched background may be at most 10% smaller than the
    # foreground
    if stat == "fisher":
        assert len(matched_background) > 0.9 * len(foreground_set), (
            "There are insufficient genes with matched background to perform"
            " test for sample %s" % foreground_file.name)

    E.info("Number of genes with no available background: %i" % X)
    E.info("Foreground set: %i" % len(foreground_set))
    E.info("Background set: %i" % len(matched_background))

    outf.write("\n".join(matched_background) + "\n")
    outf.close()
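# A minimal, self-contained sketch of the matching strategy used in
# matchBgSequenceComposition: foreground intervals are binned by a rounded
# composition score and, for each one, a background interval with the same
# binned score is drawn without replacement. Identifiers and scores below
# are invented for illustration.


def match_background_sketch(foreground, background):
    '''foreground: dict of id -> binned score;
    background: dict of id -> binned score.
    Returns a background set matched on the binned score,
    sampled without replacement.'''
    import collections
    import random

    by_score = collections.defaultdict(set)
    for interval_id, score in background.items():
        by_score[score].add(interval_id)

    matched = set()
    for interval_id, score in foreground.items():
        # never match a foreground interval to another foreground interval
        candidates = by_score[score] - set(foreground)
        if candidates:
            choice = random.sample(sorted(candidates), 1)[0]
            matched.add(choice)
            # remove the chosen id so it cannot be drawn again
            by_score[score].remove(choice)
    return matched


# example: each foreground gene finds a distinct background partner
# print(match_background_sketch({"fg1": "0.450", "fg2": "0.500"},
#                               {"bg1": "0.450", "bg2": "0.500",
#                                "bg3": "0.500"}))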
Code
====

"""

from ruffus import *

import sys
import os
import sqlite3
import CGAT.Experiment as E
import CGATPipelines.Pipeline as P

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# if necessary, update the PARAMS dictionary in any modules file.
def update_report():
    '''update report.'''
    E.info("updating report")
    P.run_report(clean=False)


def loadWordCounts(infile, outfile):
    '''load results of word counting into database.'''
    P.load(infile, outfile, "--add-index=word")


def loadCpGIslands(infile, outfile):
    '''load CpG Islands information.'''
    P.load(infile, outfile, "--add-index=transcript_id")


def publish_report():
    '''publish report.'''
    E.info("publishing report")
    P.publish_report()
def findTATABox(infiles, outfile):
    '''find TATA box in promoters. There are several matrices to
    choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | python %(scriptsdir)s/bed2fasta.py
           --use-strand
           --genome=%(genome_dir)s/%(genome)s
           --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s
    %(match_matrix)s
    %(outfile)s.fasta
    %(outfile)s.match
    %(match_profile)s -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity"
        " sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    "Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = \
                [x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity),
                                  sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(
            IOTools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(str, (
                contig, genome_start, genome_end,
                transcript_id, strand,
                match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
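# A standalone sketch of the strand-dependent arithmetic used in findTATABox
# to place a motif match back onto the genome: on '+' the match position
# counts forward from the extracted sequence start, on '-' it counts
# backwards from the sequence end. Coordinates below are invented for
# illustration.


def _match_to_genome_sketch(seq_start, seq_end, strand, match_pos, match_len):
    '''convert a match position within an extracted (stranded) sequence
    into genomic coordinates.'''
    if strand == "+":
        genome_start = seq_start + match_pos
    else:
        genome_start = seq_end - match_pos - match_len
    return genome_start, genome_start + match_len


# a 6 bp match at offset 10 within a promoter spanning 1000-1300:
assert _match_to_genome_sketch(1000, 1300, "+", 10, 6) == (1010, 1016)
assert _match_to_genome_sketch(1000, 1300, "-", 10, 6) == (1284, 1290)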
def build_report():
    '''build report from scratch.'''
    E.info("starting report build process from scratch")
    P.run_report(clean=True)
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.ini in the current
# directory and the common one.

# PATH where code for pipelines is stored
pipelinesdir = os.path.dirname(CGATPipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(CGATPipelines.__file__),
                       'configuration',
                       'pipeline.ini')

PARAMS = P.getParameters([inifile, "pipeline.ini"])

# Definition now part of CGATReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
def loadTATABox(infile, outfile):
    '''load TATA box information.'''
    P.load(infile + ".table.gz",
           outfile,
           "--add-index=transcript_id")
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
import os

import CGAT.Experiment as E
import CGAT.IOTools as IOTools
import CGAT.FastaIterator as FastaIterator
import CGAT.Bed as Bed
import CGATPipelines.PipelineGeneset as PipelineGeneset

###################################################
# Pipeline configuration
###################################################
# load options from the config file
import CGATPipelines.Pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

PipelineGeneset.PARAMS = PARAMS

###################################################################


def connect():
    '''connect to database.
def loadGeneListMatrix(infile, outfile):
    '''load gene list matrix into table.'''
    track = P.snip(infile, ".tsv.gz")
    P.load(infile,
           outfile,
           tablename="%s_foreground" % track)
    P.load(infile + ".bg.tsv.gz",
           outfile,
           tablename="%s_background" % track)
def loadHypergeometricResultsSummary(infiles, outfile):
    '''load GO summary results.'''
    infiles = glob.glob("hypergeometric.dir/*/*.parameters")
    P.mergeAndLoad(infiles, outfile)
def loadGeneLists(infile, outfile):
    '''load gene list data into database.'''
    P.load(infile, outfile,
           tablename="genelist_%s" % P.toTable(outfile))


def loadPathways(infile, outfile):
    '''load pathway information into database.'''
    P.load(infile, outfile,
           "--add-index=gene_id --add-index=go_id")


def convertBedGraph(infile, outfile):
    '''convert bedgraph to bigwig.'''
    contig_file = os.path.join(PARAMS["annotations_dir"], "contigs.tsv")
    statement = ("bedGraphToBigWig %(infile)s %(contig_file)s %(outfile)s")
    P.run()
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by applying thresholds
    to the input data set. The thresholds are defined in the
    configuration file.
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        genelist = pandas.read_csv(
            IOTools.openFile(infile),
            index_col=0,
            sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        genesets.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        backgrounds.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
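# A compact sketch of the thresholding performed in buildGeneListMatrix on a
# toy data frame; the column name and cut-offs below are invented and simply
# mirror the "<track>_foreground_field" / min / max values read from the
# configuration file.
#
#   import pandas
#
#   genelist = pandas.DataFrame(
#       {"l2fold": [-2.0, 0.1, 1.5, 3.2]},
#       index=["geneA", "geneB", "geneC", "geneD"])
#   field, min_threshold, max_threshold = "l2fold", 1.0, 10.0
#   foreground = set(genelist[(genelist[field] >= min_threshold) &
#                             (genelist[field] <= max_threshold)].index)
#   # foreground == set(["geneC", "geneD"])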
def loadNPeaksForPooledPseudoreplicates(infile, outfile):
    P.load(infile, outfile)


def publish():
    '''publish report and data.'''
    E.info("publishing report")
    P.publish_report()


def loadIDROnPooledPseudoreplicates(infile, outfile):
    P.load(infile, outfile)
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement,
                                  dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement,
                                   dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure %s:"
                         " don't know which column to sort on" %
                         PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
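# For reference: the sort columns above follow the narrowPeak convention,
# where column 7 is signalValue, column 8 is -log10(p-value) and column 9 is
# -log10(q-value). A table-driven sketch equivalent in effect to the
# dispatch above:


NARROWPEAK_SORT = {
    "signal.value": "sort -k7nr,7nr",
    "p.value": "sort -k8nr,8nr",
    "q.value": "sort -k9nr,9nr",
}


def _get_sort_statement_sketch(measure):
    '''return the sort command for a ranking measure, as above.'''
    try:
        return NARROWPEAK_SORT[measure]
    except KeyError:
        raise ValueError("Unrecognised ranking_measure %s" % measure)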
def loadIDROnIndividualReplicates(infile, outfile):
    P.load(infile, outfile)


def loadNPeaksForIndividualReplicates(infile, outfile):
    P.load(infile, outfile)