Example #1
0
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit test of 'data' against the model 'cdf'

    The sorted data is cut into consecutive bins of equal sample count and
    the observed bin counts are compared with the counts predicted by
    cdf(x, params).

    cdf        -- function cdf(x, params) giving the cumulative probability
    params     -- parameters handed through to 'cdf'
    data       -- sequence of samples
    ndivs      -- desired number of bins
    minsamples -- minimum samples per bin; 'ndivs' is lowered when the
                  bins would otherwise be smaller than this
    plot       -- if true, plot observed and expected counts with gnuplot
    start, end -- only bins overlapping [start, end] enter the test

    Returns (chi2, pval) as computed by scipy.stats.chisquare.

    NOTE: at least two bins are needed (IndexError otherwise), and fewer
    than 'minsamples' samples makes 'ndivs' zero (ZeroDivisionError).
    """
    import scipy
    import scipy.stats

    # determine ndiv and binsize
    # floor division keeps binsize an integer on Python 2 and 3 alike; it
    # is used below as a slice bound and as a range step
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(b) for b in bins])
    # keep the bins whose value range overlaps [start, end]
    # (util.find presumably returns the matching positions -- they are
    # used as indices by util.mget below)
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # x[i] is the smallest sample of bin i and serves as its lower edge
    x = [b[0] for b in bins]
    # first bin: everything below the second bin's lower edge
    expected = [len(data) * cdf(x[1], params)]
    expected.extend([len(data) *
                     (cdf(x[i+1], params) - cdf(x[i], params))
                     for i in range(1, len(x)-1)])
    # last bin: everything from its lower edge upwards
    expected.append(len(data) * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so the test itself does not depend on gnuplot
        from rasmus import gnuplot

        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
Example #2
0
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of 'cdf' to 'data' by minimizing chi-square

    The sorted data is cut into consecutive bins of equal sample count;
    scipy.optimize.fmin then searches for the parameters whose predicted
    bin counts give the smallest chi-square statistic.

    cdf         -- function cdf(x, params) giving the cumulative probability
    params_init -- starting point for the optimizer
    data        -- sequence of samples
    ndivs       -- desired number of bins
    minsamples  -- minimum samples per bin; 'ndivs' is lowered when the
                   bins would otherwise be smaller than this
    start, end  -- only bins overlapping [start, end] are fitted

    Returns (params, pval): the fitted parameters as a list and the
    p-value reported by chi_square_fit for them.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize
    # floor division keeps binsize an integer on Python 2 and 3 alike
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(b) for b in bins])
    # keep the bins whose value range overlaps [start, end]
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # lower edge of every bin; it does not depend on the parameters, so
    # it is computed once instead of on every objective evaluation
    x = [b[0] for b in bins]

    def optfunc(params):
        # objective: chi-square between observed and predicted counts
        expected = [len(data) * cdf(x[1], params)]
        expected.extend([len(data) *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(len(data) * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    # score the fit on the same [start, end] window that was optimized;
    # without it the p-value would be taken over all bins
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples,
                                start=start, end=end)

    return list(params), pval
Example #3
0
def read_length_matrix(filename, minlen=.0001, maxlen=1.0,
                       nooutliers=True):
    """Read a length matrix made by spidir-prep

    The file is tab-delimited.  The first row is a header whose columns
    from the third onward name the species.  Every following row holds a
    file name, a gene size and then one length per species.

    filename   -- path of the matrix file
    minlen     -- lengths below this value are raised to it
    maxlen     -- not used by this function; kept so existing calls work
    nooutliers -- if true, drop rows whose summed length is not below
                  five times the mean summed length

    Returns (species, lens, gene_sizes, files).
    """

    from rasmus import util

    # read everything up front so the handle is closed right away
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]
    species = dat[0][2:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5*m, treelens)
        # the tuple is parenthesized so this also parses on Python 3
        files, gene_sizes, lens = [util.mget(x, ind)
                                   for x in (files, gene_sizes, lens)]

    # clamp very short lengths in place
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
Example #4
0
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep

    Expects a tab-delimited file: a header row naming the species from
    the third column onward, followed by one row per entry consisting of
    a file name, a gene size and one length for each species.

    filename   -- path of the matrix file
    minlen     -- floor applied to every length
    maxlen     -- not used by this function; kept so existing calls work
    nooutliers -- if true, keep only rows whose summed length is below
                  five times the mean summed length

    Returns (species, lens, gene_sizes, files).
    """

    from rasmus import util

    # the context manager closes the file even if a row fails to parse
    with open(filename) as handle:
        dat = [line.rstrip().split("\t") for line in handle]

    header, rows = dat[0], dat[1:]
    species = header[2:]
    lens = util.map2(
        float, util.submatrix(dat, range(1, len(dat)), range(2, len(header))))
    gene_sizes = [int(size) for size in util.cget(rows, 1)]
    files = util.cget(rows, 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        # parenthesized tuple: the bare form is a syntax error on Python 3
        files, gene_sizes, lens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens)
        ]

    # raise too-small lengths to the floor, modifying the rows in place
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
Example #5
0
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of 'cdf' to 'data' by minimizing chi-square

    The sorted data is cut into consecutive bins of equal sample count;
    scipy.optimize.fmin then searches for the parameters whose predicted
    bin counts give the smallest chi-square statistic.

    cdf         -- function cdf(x, params) giving the cumulative probability
    params_init -- starting point for the optimizer
    data        -- sequence of samples
    ndivs       -- desired number of bins
    minsamples  -- minimum samples per bin; 'ndivs' is lowered when the
                   bins would otherwise be smaller than this
    start, end  -- only bins overlapping [start, end] are fitted

    Returns (params, pval): the fitted parameters as a list and the
    p-value reported by chi_square_fit for them.
    """

    import scipy
    import scipy.optimize
    import scipy.stats

    nsamples = len(data)

    # determine ndiv and binsize
    # must be floor division: binsize is a slice bound and a range() step,
    # and both reject the float that "/" produces on Python 3
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, nsamples, binsize)]
    obs = scipy.array([len(b) for b in bins])
    # keep the bins whose value range overlaps [start, end]
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # lower edges of the second bin onward; the first bin is open below,
    # so its own lower edge is never needed
    edges = [b[0] for b in bins][1:]

    def optfunc(params):
        # objective: chi-square between observed and predicted counts;
        # each edge's CDF is evaluated once and reused by both neighbours
        cum = [cdf(edge, params) for edge in edges]
        expected = [nsamples * cum[0]]
        expected.extend([nsamples * (hi - lo)
                         for lo, hi in zip(cum, cum[1:])])
        expected.append(nsamples * (1.0 - cum[-1]))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    # score the fit on the same [start, end] window that was optimized;
    # without it the p-value would be taken over all bins
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples,
                                start=start, end=end)

    return list(params), pval
Example #6
0
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit test of 'data' against the model 'cdf'

    The sorted data is cut into consecutive bins of equal sample count and
    the observed bin counts are compared with the counts predicted by
    cdf(x, params).

    cdf        -- function cdf(x, params) giving the cumulative probability
    params     -- parameters handed through to 'cdf'
    data       -- sequence of samples
    ndivs      -- desired number of bins
    minsamples -- minimum samples per bin; 'ndivs' is lowered when the
                  bins would otherwise be smaller than this
    plot       -- if true, plot observed and expected counts with gnuplot
    start, end -- only bins overlapping [start, end] enter the test

    Returns (chi2, pval) as computed by scipy.stats.chisquare.

    NOTE: at least two bins are needed (IndexError otherwise), and fewer
    than 'minsamples' samples makes 'ndivs' zero (ZeroDivisionError).
    """

    import scipy
    import scipy.stats

    nsamples = len(data)

    # determine ndiv and binsize
    # must be floor division: binsize is a slice bound and a range() step,
    # and both reject the float that "/" produces on Python 3
    binsize = nsamples // ndivs
    if binsize < minsamples:
        ndivs = nsamples // minsamples
        binsize = nsamples // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, nsamples, binsize)]
    obs = scipy.array([len(b) for b in bins])
    # keep the bins whose value range overlaps [start, end]
    # (util.find presumably returns the matching positions -- they are
    # used as indices by util.mget below)
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # x[i] is the smallest sample of bin i and serves as its lower edge
    x = [b[0] for b in bins]
    # CDF at the lower edge of the second bin onward, evaluated once each
    cum = [cdf(edge, params) for edge in x[1:]]
    # first bin is open below, last bin is open above
    expected = [nsamples * cum[0]]
    expected.extend([nsamples * (hi - lo) for lo, hi in zip(cum, cum[1:])])
    expected.append(nsamples * (1.0 - cum[-1]))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so the test itself does not depend on gnuplot
        from rasmus import gnuplot

        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
Example #7
0
def remove_gapped_columns(aln):
    """Removes every column from an alignment 'aln' that contains a gap

       Columns are taken across all sequences of 'aln'; only those
       without a "-" are kept.  A new alignment is returned
    """
    def is_ungapped(column):
        return "-" not in column

    columns = zip(*aln.values())
    keep = util.find(is_ungapped, columns)
    return subalign(aln, keep)
def remove_gapped_columns(aln):
    """Removes any column from an alignment 'aln' that contains a gap

       A column is kept only if none of the sequences has "-" at that
       position.  A new alignment is returned
    """
    # transpose the sequences so each element is one alignment column
    by_column = zip(*aln.values())
    ungapped = util.find(lambda column: "-" not in column, by_column)
    return subalign(aln, ungapped)
Example #9
0
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of the binned values 'ybins' against the curve 'func'

    xbins      -- bin positions; bin i spans xbins[i] to xbins[i+1], so
                  the last position only closes the preceding bin
    ybins      -- observed value per bin; it is multiplied by the bin
                  width and 'nsamples' to obtain a count (i.e. it is
                  treated as a density)
    func       -- function of x giving the expected density
    nsamples   -- total number of samples
    nparams    -- handed through to chiSquare
    minsamples -- bins with a smaller expected count are left out

    Returns (result, counts, expected) where 'result' is whatever
    chiSquare returns, or [0, 1] when no bin qualifies.
    """
    # number of complete bins; zero when fewer than two positions are given
    nbins = max(len(xbins) - 1, 0)
    sizes = [xbins[i + 1] - xbins[i] for i in range(nbins)]

    # observed count per bin
    counts = [ybins[i] * sizes[i] * nsamples for i in range(nbins)]

    # expected count per bin: mean of func at the two bin edges, times
    # the bin width and the number of samples
    expected = [
        (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples
        for i in range(nbins)]

    # ensure we have enough expected samples in each bin
    # (util.gefunc(minsamples) presumably tests "value >= minsamples")
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    else:
        return chiSquare([counts], [expected], nparams), counts, expected
Example #10
0
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of the binned values 'ybins' against the curve 'func'

    xbins      -- bin positions; consecutive positions delimit one bin,
                  so n positions describe n-1 bins
    ybins      -- observed value per bin; it is scaled by the bin width
                  and 'nsamples' to obtain a count (i.e. it is treated as
                  a density)
    func       -- function of x giving the expected density
    nsamples   -- total number of samples
    nparams    -- handed through to chiSquare
    minsamples -- bins with a smaller expected count are left out

    Returns (result, counts, expected) where 'result' is whatever
    chiSquare returns, or [0, 1] when no bin qualifies.
    """
    # (lower, upper) edge of every bin; empty for fewer than two positions
    edges = list(zip(xbins, xbins[1:]))
    sizes = [upper - lower for lower, upper in edges]

    # observed count per bin
    counts = [ybins[i] * sizes[i] * nsamples for i in range(len(sizes))]

    # expected count per bin: mean of func at the two edges, times the
    # bin width and the number of samples
    expected = []
    for (lower, upper), size in zip(edges, sizes):
        expected.append((func(lower) + func(upper)) / 2.0 *
                        size * nsamples)

    # ensure we have enough expected samples in each bin
    # (util.gefunc(minsamples) presumably tests "value >= minsamples")
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    else:
        return chiSquare([counts], [expected], nparams), counts, expected