Esempio n. 1
0
def model_error(model_counts, bin=1):
    """Pool per-allele stutter histograms, export them to R, then exit.

    The stated goal is to derive, from the error profile of the X and Y
    chromosome, an error model for stutter introduced into the library
    during MDA.  In its current form the function stops after the export:
    it saves the R workspace to "hists.RData" and calls exit(1), so the
    mean/variance modelling at the bottom is never reached.

    model_counts -- dict mapping repeat unit -> dict keyed by major allele
                    (keys are converted with int()); the inner dict is
                    handed to bin_data() as a whole.
    bin          -- pooling half-width, in repeat units, passed to
                    bin_data().
    """

    # Step 1.  Most major alleles have too few observations to model on
    # their own, so each allele's histogram is pooled with those of its
    # neighbours in the window
    #    | major_allele - bin | ... | major_allele + bin |
    # which adds data while keeping some allele-specific character.
    # IMPORTANT: `bin` counts repeat units, not neighbours.  A major allele
    # at 39 with nothing observed at 38 or 40 gains no extra data for
    # bin=1: "alleles within n repeat units" is not the same as "the n
    # closest alleles".
    binned_model = defaultdict(dict)
    for unit, allele_counts in model_counts.iteritems():
        for allele_key in allele_counts:
            allele = int(allele_key)
            pooled = bin_data(allele, bin, allele_counts)
            binned_model[unit][allele] = pooled

    # Copy every pooled histogram into the embedded R session as a pair of
    # variables named <unit>.<allele>.freq and <unit>.<allele>.allele.
    for unit, profiles in binned_model.iteritems():
        n_fits = len(profiles)
        # Row count for an 8-column plot grid; only the disabled plotting
        # code below consumes it.
        vert_rows = ceil(n_fits / 8.0)
        # Disabled plotting setup:
        #rpy.r.png('%s_piecewise_pmfs.png' % unit, width=8*250,
                  #height=300*vert_rows)
        #rpy.r.library('plotrix')
        #rpy.r.layout(rpy.r.matrix(range(1, vert_rows*8 + 1), ncol=8, byrow=True))
        #rpy.r.par(mar=[3,2,2,1])
        print("opening layout for %s, %d fits" % (unit, n_fits))
        for allele, profile in profiles.iteritems():
            #rpy.r.barp(profile.values(), x=[ allele + a for a in profile ])
            freq_name = "%s.%d.freq" % (unit, allele)
            rpy.r.assign(freq_name, profile.values())
            # Keys of the pooled profile are added to the major allele --
            # presumably they are offsets from it; confirm against bin_data.
            allele_name = "%s.%d.allele" % (unit, allele)
            rpy.r.assign(allele_name, [allele + offset for offset in profile])
        #rpy.r.dev_off()

    rpy.r.save_image(file="hists.RData")
    # NOTE(review): exit status 1 is used even though the export succeeded,
    # and everything after this call is dead code unless `exit` has been
    # rebound -- looks like a temporary debugging stop.
    exit(1)

    # Step 2 (currently unreachable).  From the binned profiles, determine
    # the empirical mean and variance of the errors for each major allele
    # size.  Gains and losses must be treated separately because the final
    # piecewise PMF is built from two negbin models, one per direction.
    for unit, binned_data in binned_model.iteritems():
        model_mean_and_variance(binned_data)
        # Block until one character arrives on stdin before moving on.
        sys.stdin.read(1)
Esempio n. 2
0
def model_error(model_counts, bin=2, save_rdata=None, interactive=False):
    """Fit gain/loss stutter-error models per repeat unit.

    Given the error profile of the X and Y chromosome, derive an error
    model for stutter introduced into the library during MDA.  Raw and
    binned histograms, the mean/variance estimates and the fitted models
    are all assigned into the embedded R session as the function runs.

    model_counts -- dict mapping repeat unit -> dict keyed by major allele
                    (keys are converted with int()); the inner dict is
                    handed to bin_data() as a whole.
    bin          -- pooling half-width, in repeat units, passed to
                    bin_data().
    save_rdata   -- if truthy, used as the file name under which the R
                    workspace is saved (RData format) at the end.
    interactive  -- forwarded unchanged to PlotContext.

    NOTE(review): the fitted `model` dict is built but there is no return
    statement, so callers receive None -- confirm whether that is intended.
    """

    # Step 1.  Most major alleles have too few observations to model on
    # their own, so each allele's histogram is pooled with those of its
    # neighbours in the window
    #    | major_allele - bin | ... | major_allele + bin |
    # which adds data while keeping some allele-specific character.
    # IMPORTANT: `bin` counts repeat units, not neighbours.  A major allele
    # at 39 with nothing observed at 38 or 40 gains no extra data for
    # bin=1: "alleles within n repeat units" is not the same as "the n
    # closest alleles".
    pooled_by_unit = defaultdict(dict)
    raw_by_unit = defaultdict(dict)
    for unit, allele_counts in model_counts.iteritems():
        raw = raw_by_unit[unit]
        pooled = pooled_by_unit[unit]
        for allele_key in allele_counts:
            allele = int(allele_key)
            # A window of 0 yields just this allele's own (unbinned) data.
            raw[allele] = bin_data(allele, 0, allele_counts)
            # NB. bin_data needs the whole per-unit dict, not the single
            # allele's entry.
            pooled[allele] = bin_data(allele, bin, allele_counts)
        #plot_binned_data("%s_binned_hists.png" % unit, pooled,
                         #interactive=interactive)
        rpy.r.assign('%s.raw.data' % unit, raw)
        rpy.r.assign('%s.binned.data' % unit, pooled)

    # Step 2.  From the binned profiles, determine the empirical mean and
    # variance of the errors for each major allele size.  Gains and losses
    # are handled separately because the final piecewise PMF is built from
    # two negbin models, one per direction.
    model = defaultdict(dict)
    for unit, bdata in pooled_by_unit.iteritems():
        tot_ests, gain_ests, loss_ests = estimate_mean_and_variance(bdata)
        exported_estimates = (
            ("%s.tot.ests", tot_ests),
            ("%s.gain.ests", gain_ests),
            ("%s.loss.ests", loss_ests),
        )
        for name_format, estimates in exported_estimates:
            rpy.r.assign(name_format % unit, estimates)

        print("unit=" + unit)
        by_direction = (("gain", gain_ests), ("loss", loss_ests))
        fits = model[unit]
        # Fit both directions before exporting either model to R.
        for direction, estimates in by_direction:
            fits[direction] = model_mean_and_variance(estimates)
        rpy.r.assign("%s.gain.model" % unit, fits['gain'])
        rpy.r.assign("%s.loss.model" % unit, fits['loss'])

        # One 1000x1000 figure per unit, laid out as a 2-column grid of
        # four panels.
        png_options = dict(filename="%s_mean_and_var.png" % unit,
                           width=1000, height=1000)
        with PlotContext(interactive, rpy.r.png, **png_options):
            rpy.r.layout(rpy.r.matrix([1, 2, 3, 4], ncol=2))
            rpy.r.par(mar=[3, 2, 2, 1])
            for direction, estimates in by_direction:
                plot_mean_and_variance(unit, direction, estimates,
                                       fits[direction])

    if save_rdata:
        rpy.r.save_image(file=save_rdata)