def buildExpectedGenomeCoverage(infiles, outfile):
    """Build the expected coverage over the genomes in the sample.

    The expectation is derived from read depth and read length; the
    actual computation is submitted to the cluster via P.submit.
    """
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "buildExpectedCoverageOverGenomes"
    P.submit(target_module,
             target_function,
             infiles=infiles,
             outfiles=outfile)
def buildCoverageOverGenomes(infiles, outfile):
    """Create a file with the coverage over each simulated genome.

    Delegates the work to PipelineMetagenomeBenchmark via P.submit.
    """
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "buildCoverageOverGenomes"
    P.submit(target_module,
             target_function,
             infiles=infiles,
             outfiles=outfile)
def filterContigsByCoverage(infiles, outfile):
    """Filter contigs by their average base coverage.

    The filtering itself runs remotely through P.submit.
    """
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "filterByCoverage"
    P.submit(target_module,
             target_function,
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    """Count peaks passing the pooled-consistency IDR threshold.

    Submits IDR.findNPeaks to the cluster with the threshold taken
    from the pipeline configuration.
    """
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    # IDR.__file__ may end in ".py" (source) or ".pyc" (bytecode)
    # depending on how the module was loaded; P.snip raises ValueError
    # when the suffix does not match, so try both (same pattern as
    # elsewhere in this file).
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForIndividualReplicates(infiles, outfile):
    """Count peaks passing the inter-replicate IDR threshold.

    Submits IDR.findNPeaks to the cluster with the threshold taken
    from the pipeline configuration.
    """
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    # IDR.__file__ may end in ".py" (source) or ".pyc" (bytecode)
    # depending on how the module was loaded; P.snip raises ValueError
    # when the suffix does not match, so try both (same pattern as
    # elsewhere in this file).
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    """Count peaks passing the pooled-consistency IDR threshold.

    Submits IDR.findNPeaks to the cluster with the threshold taken
    from the pipeline configuration.
    """
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    # Original snipped only ".pyc", which fails with ValueError when
    # the IDR module is loaded from source (the common case). Try the
    # ".py" suffix first and fall back, as elsewhere in this file.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForIndividualReplicates(infiles, outfile):
    """Count peaks passing the inter-replicate IDR threshold.

    Submits IDR.findNPeaks to the cluster with the threshold taken
    from the pipeline configuration.
    """
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    # Original snipped only ".pyc", which fails with ValueError when
    # the IDR module is loaded from source (the common case). Try the
    # ".py" suffix first and fall back, as elsewhere in this file.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForPseudoreplicates(infiles, outfile):
    """Count peaks passing the self-consistency IDR threshold.

    Submits IDR.findNPeaks to the cluster; the threshold comes from
    the pipeline configuration.
    """
    threshold = PARAMS["idr_options_self_consistency_threshold"]
    # The IDR module path may carry a ".py" or ".pyc" suffix depending
    # on how it was loaded; strip whichever is present.
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")
    P.submit(idr_module,
             "findNPeaks",
             params=[str(threshold), ],
             infiles=infiles,
             outfiles=outfile)
def splitPooledBamfiles(infile, sentinel):
    """Split the pooled bamfile (derived from the sentinel name) in two.

    Submits IDR.splitBam with '2' as the split parameter, then touches
    the sentinel file to mark completion.
    """
    bam_path = P.snip(infile, ".sentinel") + ".bam"
    out_stub = P.snip(sentinel, ".sentinel")
    n_splits = '2'
    # The IDR module path may carry a ".py" or ".pyc" suffix depending
    # on how it was loaded; strip whichever is present.
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")
    P.submit(idr_module, "splitBam", n_splits, bam_path, out_stub)
    P.touch(sentinel)
def splitPooledBamfiles(infile, sentinal):
    """Split the pooled bamfile (derived from the sentinel name) in two.

    Submits IDR.splitBam with '2' as the split parameter, then touches
    the sentinel file to mark completion.

    Note: the "sentinal"/".sentinal" spelling is historical and kept
    because both the parameter name and the file extension are part of
    this task's interface.
    """
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    # Original snipped only ".py", which fails with ValueError when the
    # IDR module is loaded from bytecode; try both suffixes, matching
    # the pattern used elsewhere in this file.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def splitPooledBamfiles(infile, sentinal):
    """Split the pooled bamfile (derived from the sentinel name) in two.

    Submits IDR.splitBam with '2' as the split parameter, then touches
    the sentinel file to mark completion.

    Note: the "sentinal"/".sentinal" spelling is historical and kept
    because both the parameter name and the file extension are part of
    this task's interface.
    """
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    # Original snipped only ".pyc", which fails with ValueError when
    # the IDR module is loaded from source (the common case); try both
    # suffixes, matching the pattern used elsewhere in this file.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def findNPeaksForPseudoreplicates(infiles, outfile):
    """Count peaks passing the self-consistency IDR threshold.

    Submits IDR.findNPeaks to the cluster; the threshold comes from
    the pipeline configuration.
    """
    threshold = PARAMS["idr_options_self_consistency_threshold"]
    # The IDR module path may carry a ".py" or ".pyc" suffix depending
    # on how it was loaded; strip whichever is present.
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")
    P.submit(idr_module,
             "findNPeaks",
             params=[str(threshold), ],
             infiles=infiles,
             outfiles=outfile)
def splitBamfiles(infile, sentinel):
    """For all tracks, split the filtered bamfile in two using pysam.

    The bam path is derived from the sentinel name; on completion the
    sentinel file is touched to mark the task done.
    """
    bam_path = P.snip(infile, ".sentinel") + ".bam"
    out_stub = P.snip(sentinel, ".sentinel")
    n_splits = '2'
    # The IDR module path may carry a ".py" or ".pyc" suffix depending
    # on how it was loaded; strip whichever is present.
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")
    P.submit(idr_module, "splitBam", n_splits, bam_path, out_stub)
    P.touch(sentinel)
def splitBamfiles(infile, sentinal):
    """For all tracks, split the filtered bamfile in two using pysam.

    The bam path is derived from the sentinel name; on completion the
    sentinel file is touched to mark the task done.

    Note: the "sentinal"/".sentinal" spelling is historical and kept
    because both the parameter name and the file extension are part of
    this task's interface.
    """
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    # Original snipped only ".pyc", which fails with ValueError when
    # the IDR module is loaded from source (the common case); try both
    # suffixes, matching the pattern used elsewhere in this file.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")
    P.submit(module, "splitBam", params, infile, outfile)
    P.touch(sentinal)
def buildChimerasBasedOnReads(infile, outfile):
    '''Score each contig with a chimericity score.

    This is an alternative to counting a contig as a chimera whenever
    it aligns to more than one genome: a contig is likely to align to
    multiple genomes with high identity if the sample contains very
    similar genomes (true of our simulation, which contains subspecies
    of the same species, e.g. B. fragilis subspecies).

    The chimericity score is the ratio of "good" to "bad" alignments,
    where an alignment is "good" if it comes from the species from
    which the majority of that contig's alignments derive.
    '''
    # PEP8 keyword-argument spacing, consistent with the other
    # P.submit calls in this file.
    P.submit("CGATPipelines.PipelineMetagenomeBenchmark",
             "buildChimerasBasedOnReads",
             infiles=infile,
             outfiles=outfile)
def buildChimerasBasedOnReads(infile, outfile):
    '''Score each contig with a chimericity score.

    This is an alternative to counting a contig as a chimera whenever
    it aligns to more than one genome: a contig is likely to align to
    multiple genomes with high identity if the sample contains very
    similar genomes (true of our simulation, which contains subspecies
    of the same species, e.g. B. fragilis subspecies).

    The chimericity score is the ratio of "good" to "bad" alignments,
    where an alignment is "good" if it comes from the species from
    which the majority of that contig's alignments derive.
    '''
    target_module = "CGATPipelines.PipelineMetagenomeBenchmark"
    target_function = "buildChimerasBasedOnReads"
    P.submit(target_module,
             target_function,
             infiles=infile,
             outfiles=outfile)
def buildCoverageOverContigs(infiles, outfile):
    ''' build histograms of the coverage over each of the contigs '''
    # First input is the bam; remaining infiles (presumably the
    # alignment-stats load target) are only used via the derived
    # tablename below — TODO confirm against the pipeline decorator.
    bam = infiles[0]
    # genomecoveragebed does not like some of the
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"
    # Remote job writes the position-filtered bam to tempname.
    P.submit("CGATPipelines.PipelineMetagenomeAssembly",
             "filterBamOnPos",
             infiles = bam,
             outfiles = tempname)
    # tablename where alignment stats live: "<dir-without-.dir>_<bam-basename>_alignment_stats"
    tablename = os.path.dirname(
        bam)[:-len(".dir")] + "_" + P.snip(os.path.basename(bam), ".bam") + "_alignment_stats"
    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")
    # connect to database
    dbh = connect()
    cc = dbh.cursor()
    # get number of reads aligned from bam2stats; used to scale
    # coverage to reads-per-million when coverage_scale is configured
    if PARAMS.get("coverage_scale"):
        scale_factor = cc.execute("""SELECT counts FROM %s WHERE category == 'reads_mapped'""" % tablename).fetchone()[0]
        scale_factor = 1 / (float(scale_factor) / 1000000)
        # %(scale_factor)f is deliberately NOT expanded here: P.run()
        # interpolates local variables into the statement at execution.
        scale_options = "-scale %(scale_factor)f"
    else:
        scale_options = ""
    # Per-base depth over the temporary bam, gzipped; the tempdir is
    # removed in the same shell command once the output is written.
    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d | gzip > %(outfile)s; rm -rf %(tempdir)s'''
    P.run()
def filterContigsByCoverage(infiles, outfile):
    '''Filter contigs by their average base coverage.

    The filtering itself is submitted to the cluster via P.submit.
    '''
    # PEP8 keyword-argument spacing, consistent with the other
    # P.submit calls in this file.
    P.submit("PipelineMetagenomeBenchmark",
             "filterByCoverage",
             infiles=infiles,
             outfiles=outfile)