def buildExpectedGenomeCoverage(infiles, outfile):
    '''
    Submit the calculation of the expected coverage over the genomes
    in the sample, which is based on read depth and length.

    The work is handed to P.submit, which is given the function
    buildExpectedCoverageOverGenomes of PipelineMetagenomeBenchmark;
    infiles and outfile are forwarded unchanged.
    '''
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "buildExpectedCoverageOverGenomes"
    P.submit(target_module,
             target_function,
             infiles=infiles,
             outfiles=outfile)
def buildCoverageOverGenomes(infiles, outfile):
    '''
    Submit the job that writes a file with the coverage over each
    of the simulated genomes.

    P.submit is called with the function buildCoverageOverGenomes of
    PipelineMetagenomeBenchmark; infiles and outfile are passed through
    as given.
    '''
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "buildCoverageOverGenomes"
    P.submit(target_module,
             target_function,
             infiles=infiles,
             outfiles=outfile)
def filterContigsByCoverage(infiles, outfile):
    '''
    Submit the filtering of contigs by their average base coverage.

    P.submit is called with the function filterByCoverage of
    PipelineMetagenomeBenchmark; infiles and outfile are passed through
    as given.
    '''
    target_module = "PipelineMetagenomeBenchmark"
    target_function = "filterByCoverage"
    P.submit(target_module, target_function,
             infiles=infiles, outfiles=outfile)
def buildCoverageOverGenomes(infiles, outfile):
    '''
    Create the file holding the coverage over each of the simulated
    genomes by submitting the corresponding benchmark function.

    The call goes through P.submit with the function
    buildCoverageOverGenomes of PipelineMetagenomeBenchmark; infiles
    and outfile are forwarded unchanged.
    '''
    benchmark_module = "PipelineMetagenomeBenchmark"
    coverage_function = "buildCoverageOverGenomes"
    P.submit(benchmark_module, coverage_function,
             infiles=infiles, outfiles=outfile)
Beispiel #5
0
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    '''
    Submit the IDR module's findNPeaks function for the pooled
    pseudoreplicates.

    The threshold is read from the PARAMS entry
    "idr_options_pooled_consistency_threshold" and handed over as a
    one-element list containing its string form. infiles and outfile
    are forwarded unchanged.
    '''
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    # IDR.__file__ may name the source (.py) or the compiled (.pyc) file.
    # P.snip raises ValueError when the suffix does not match, so fall
    # back to the other extension instead of failing.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Beispiel #6
0
def findNPeaksForIndividualReplicates(infiles, outfile):
    '''
    Submit the IDR module's findNPeaks function for the individual
    replicates.

    The threshold is read from the PARAMS entry
    "idr_options_inter_replicate_threshold" and handed over as a
    one-element list containing its string form. infiles and outfile
    are forwarded unchanged.
    '''
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    # IDR.__file__ may name the source (.py) or the compiled (.pyc) file.
    # P.snip raises ValueError when the suffix does not match, so fall
    # back to the other extension instead of failing.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Beispiel #7
0
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    '''
    Submit the IDR module's findNPeaks function for the pooled
    pseudoreplicates.

    The threshold is read from the PARAMS entry
    "idr_options_pooled_consistency_threshold" and handed over as a
    one-element list containing its string form. infiles and outfile
    are forwarded unchanged.
    '''
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    # IDR.__file__ is not guaranteed to end in ".pyc": it names the
    # source file when the module was loaded from source. P.snip raises
    # ValueError on a suffix mismatch, so try ".py" first and only then
    # ".pyc".
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Beispiel #8
0
def findNPeaksForIndividualReplicates(infiles, outfile):
    '''
    Submit the IDR module's findNPeaks function for the individual
    replicates.

    The threshold is read from the PARAMS entry
    "idr_options_inter_replicate_threshold" and handed over as a
    one-element list containing its string form. infiles and outfile
    are forwarded unchanged.
    '''
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    # IDR.__file__ is not guaranteed to end in ".pyc": it names the
    # source file when the module was loaded from source. P.snip raises
    # ValueError on a suffix mismatch, so try ".py" first and only then
    # ".pyc".
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
def buildExpectedGenomeCoverage(infiles, outfile):
    '''
    Build the expected coverage over the genomes in the sample, based
    on read depth and length, by submitting the benchmark function.

    P.submit receives the function buildExpectedCoverageOverGenomes of
    PipelineMetagenomeBenchmark together with the unchanged infiles and
    outfile.
    '''
    benchmark_module = "PipelineMetagenomeBenchmark"
    expected_coverage_function = "buildExpectedCoverageOverGenomes"
    P.submit(benchmark_module, expected_coverage_function,
             infiles=infiles, outfiles=outfile)
Beispiel #10
0
def findNPeaksForPseudoreplicates(infiles, outfile):
    '''
    Submit the IDR module's findNPeaks function for the pseudoreplicates.

    The threshold comes from the PARAMS entry
    "idr_options_self_consistency_threshold" and is passed as a
    one-element list holding its string form.
    '''
    threshold = PARAMS["idr_options_self_consistency_threshold"]

    # strip the extension of the IDR module file: ".py" first, ".pyc"
    # when P.snip rejects that with ValueError
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    submit_params = [str(threshold)]
    P.submit(idr_module, "findNPeaks",
             params=submit_params,
             infiles=infiles,
             outfiles=outfile)
Beispiel #11
0
def splitPooledBamfiles(infile, sentinel):
    '''
    Submit the IDR module's splitBam function for a pooled bam file
    and touch the sentinel afterwards.

    The bam file name is infile with ".sentinel" replaced by ".bam";
    the output name is sentinel without its ".sentinel" suffix.
    '''
    bam_file = P.snip(infile, ".sentinel") + ".bam"
    split_prefix = P.snip(sentinel, ".sentinel")
    split_params = '2'

    # strip the extension of the IDR module file: ".py" first, ".pyc"
    # when P.snip rejects that with ValueError
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    P.submit(idr_module,
             "splitBam",
             split_params,
             bam_file,
             split_prefix)

    # mark the task as done
    P.touch(sentinel)
Beispiel #12
0
def splitPooledBamfiles(infile, sentinal):
    '''
    Submit the IDR module's splitBam function for a pooled bam file
    and touch the sentinal file afterwards.

    The bam file name is infile with ".sentinal" replaced by ".bam";
    the output name is sentinal without its ".sentinal" suffix.
    '''
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    # NOTE(review): presumably the number of parts to split into -
    # confirm against IDR.splitBam
    params = '2'
    # IDR.__file__ may name the source (.py) or the compiled (.pyc) file.
    # P.snip raises ValueError when the suffix does not match, so fall
    # back to the other extension instead of failing.
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    # mark the task as done
    P.touch(sentinal)
Beispiel #13
0
def splitPooledBamfiles(infile, sentinal):
    '''
    Submit the IDR module's splitBam function for a pooled bam file
    and touch the sentinal file afterwards.

    The bam file name is infile with ".sentinal" replaced by ".bam";
    the output name is sentinal without its ".sentinal" suffix.
    '''
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    # NOTE(review): presumably the number of parts to split into -
    # confirm against IDR.splitBam
    params = '2'
    # IDR.__file__ is not guaranteed to end in ".pyc": it names the
    # source file when the module was loaded from source. P.snip raises
    # ValueError on a suffix mismatch, so try ".py" first and only then
    # ".pyc".
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    # mark the task as done
    P.touch(sentinal)
Beispiel #14
0
def findNPeaksForPseudoreplicates(infiles, outfile):
    '''
    Run findNPeaks from the IDR module on the pseudoreplicates via
    P.submit.

    The PARAMS entry "idr_options_self_consistency_threshold" supplies
    the threshold, which is converted to a string and passed as the
    only element of the params list.
    '''
    self_consistency = PARAMS["idr_options_self_consistency_threshold"]

    # the module name is the IDR file without its extension; ".pyc" is
    # tried when snipping ".py" raises ValueError
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    P.submit(idr_module, "findNPeaks", params=[str(self_consistency)],
             infiles=infiles, outfiles=outfile)
Beispiel #15
0
def splitBamfiles(infile, sentinel):
    """
    Split the filtered bamfile of a track in two using pysam.

    The splitting itself is submitted as the IDR module's splitBam
    function. The bam file name is infile with ".sentinel" replaced by
    ".bam", the output name is sentinel without ".sentinel"; the
    sentinel is touched once the submission returns.
    """
    bam_file = P.snip(infile, ".sentinel") + ".bam"
    split_prefix = P.snip(sentinel, ".sentinel")
    split_params = '2'

    # strip the extension of the IDR module file: ".py" first, ".pyc"
    # when P.snip rejects that with ValueError
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    P.submit(idr_module,
             "splitBam",
             split_params,
             bam_file,
             split_prefix)

    # mark the task as done
    P.touch(sentinel)
Beispiel #16
0
def splitBamfiles(infile, sentinal):
    """
    For all tracks, split the filtered bamfile in two using pysam.

    The splitting itself is submitted as the IDR module's splitBam
    function. The bam file name is infile with ".sentinal" replaced by
    ".bam", the output name is sentinal without ".sentinal"; the
    sentinal file is touched once the submission returns.
    """
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    # IDR.__file__ is not guaranteed to end in ".pyc": it names the
    # source file when the module was loaded from source. P.snip raises
    # ValueError on a suffix mismatch, so try ".py" first and only then
    # ".pyc".
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    # mark the task as done
    P.touch(sentinal)
def buildChimerasBasedOnReads(infile, outfile):
    '''
    Score contigs for chimericity based on read alignments.

    This is an alternative to calling a contig a chimera whenever it
    aligns to more than one genome. A contig is likely to align to
    several genomes with high identity if the sample contains very
    similar genomes, as is the case for our simulation, which holds
    subspecies of the same species (e.g. B.fragilis subspecies).

    A more appropriate measure is a chimericity score per contig: the
    ratio of "good" alignments / "bad" alignments, where an alignment
    is "good" if it comes from the species that the majority of the
    alignments of that contig are derived from.

    The calculation is submitted through P.submit as the function
    buildChimerasBasedOnReads of
    CGATPipelines.PipelineMetagenomeBenchmark.
    '''
    benchmark_module = "CGATPipelines.PipelineMetagenomeBenchmark"
    chimera_function = "buildChimerasBasedOnReads"
    P.submit(benchmark_module,
             chimera_function,
             infiles=infile,
             outfiles=outfile)
Beispiel #18
0
def splitPooledBamfiles(infile, sentinel):
    '''
    Hand a pooled bam file to the IDR module's splitBam function via
    P.submit, then touch the sentinel.

    The input name is derived by swapping the ".sentinel" suffix of
    infile for ".bam"; the output name is sentinel with ".sentinel"
    removed.
    '''
    pooled_bam = P.snip(infile, ".sentinel") + ".bam"
    output_stub = P.snip(sentinel, ".sentinel")
    n_splits = '2'

    # the module name is the IDR file without its extension; ".pyc" is
    # tried when snipping ".py" raises ValueError
    try:
        idr_module = P.snip(IDR.__file__, ".py")
    except ValueError:
        idr_module = P.snip(IDR.__file__, ".pyc")

    P.submit(idr_module, "splitBam", n_splits, pooled_bam, output_stub)

    # mark the task as done
    P.touch(sentinel)
def buildChimerasBasedOnReads(infile, outfile):
    '''
    Submit the read-based chimericity scoring of contigs.

    Counting a contig as a chimera because it aligns to more than one
    genome is problematic: when the sample contains very similar
    genomes a contig is likely to align to several of them with high
    identity. Our simulation is such a case, as it contains subspecies
    of the same species, e.g. B.fragilis subspecies.

    The alternative used here gives each contig a chimericity score,
    the ratio of "good" alignments / "bad" alignments. An alignment
    counts as "good" if it is from the species from which the majority
    of that contig's alignments are derived.

    P.submit receives the function buildChimerasBasedOnReads of
    CGATPipelines.PipelineMetagenomeBenchmark with infile and outfile
    forwarded unchanged.
    '''
    module_name = "CGATPipelines.PipelineMetagenomeBenchmark"
    function_name = "buildChimerasBasedOnReads"
    P.submit(module_name, function_name,
             infiles=infile, outfiles=outfile)
Beispiel #20
0
def buildCoverageOverContigs(infiles, outfile):
    '''
    Build the coverage over each of the contigs.

    The first entry of infiles is used as the bam file. It is first
    filtered into a temporary bam file (filterBamOnPos), then
    genomeCoverageBed -d is run on the temporary file and the gzipped
    result is written to outfile. The temporary directory is removed
    by the same shell statement.

    If the PARAMS entry "coverage_scale" is set, the output is scaled
    by 1 / (mapped reads / 1,000,000), with the number of mapped reads
    read from the alignment stats table in the database.
    '''
    # only the first input is used
    bam = infiles[0]
    # genomeCoverageBed does not like some of the
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"
    P.submit("CGATPipelines.PipelineMetagenomeAssembly", 
             "filterBamOnPos", 
             infiles = bam, 
             outfiles = tempname)

    # name of the table where the alignment stats live: directory of the
    # bam file minus its last four characters (assumes the directory
    # name ends in ".dir" - TODO confirm), "_", bam basename without
    # ".bam", "_alignment_stats"
    tablename = os.path.dirname(
        bam)[:-len(".dir")] + "_" + P.snip(os.path.basename(bam), ".bam") + "_alignment_stats"

    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")
    
    # connect to database
    # NOTE(review): the connection is never closed in this function
    dbh = connect()
    cc = dbh.cursor()

    # get number of reads aligned from bam2stats
    if PARAMS.get("coverage_scale"):
        # NOTE(review): fetchone() returns None when no row matches, and
        # a count of 0 makes the division below fail - neither is handled
        scale_factor = cc.execute("""SELECT counts FROM %s
                                     WHERE category == 'reads_mapped'""" % tablename).fetchone()[0]
        # scale to "per million mapped reads"
        scale_factor = 1 / (float(scale_factor) / 1000000)
        scale_options = "-scale %(scale_factor)f"
    else:
        scale_options = ""

    # P.run() is called without arguments; presumably it picks up
    # `statement` and fills the %(name)s placeholders from the local
    # variables of this function, so the local names must stay as
    # they are - confirm against P.run
    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d | gzip > %(outfile)s;
                   rm -rf %(tempdir)s'''
    P.run()
def filterContigsByCoverage(infiles, outfile):
    '''
    Filter contigs by their average base coverage.

    The filtering is submitted through P.submit as the function
    filterByCoverage of PipelineMetagenomeBenchmark; infiles and
    outfile are forwarded unchanged.
    '''
    benchmark_module = "PipelineMetagenomeBenchmark"
    filter_function = "filterByCoverage"
    P.submit(benchmark_module,
             filter_function,
             infiles=infiles,
             outfiles=outfile)