Example #1
from rpy import r  # classic RPy interface to R


def regression(data):
    """Calls R's lm to fit a linear regression of the first column of
    `data` on the second, returning the fitted coefficients."""
    reg = r.lm(r('x ~ y'),
               data=r.data_frame(x=data[:, 0], y=data[:, 1]))['coefficients']
    return reg
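
A minimal usage sketch, assuming classic RPy and NumPy are installed (the sample array is illustrative):

import numpy as np

# Column 0 is the response x, column 1 the predictor y.
data = np.array([[1.0, 2.1], [2.0, 3.9], [3.0, 6.2], [4.0, 8.1]])
coeffs = regression(data)
print(coeffs['(Intercept)'], coeffs['y'])  # intercept and slope on y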
Example #2
    def calibrate(self):
        """
        Performs a calibration based on the available data points.
        """
        from rpy import r

        if len(self.pts) < 2:
            return False

        in_x = []
        in_y = []
        in_z = []
        out_x = []
        out_y = []
        out_z = []

        # Split the point pairs into per-coordinate lists so they can be fed
        # into the R multiple linear regression.
        for in_pt, out_pt in self.pts:
            in_x.append(in_pt[0])
            in_y.append(in_pt[1])
            in_z.append(in_pt[2])
            out_x.append(out_pt[0])
            out_y.append(out_pt[1])
            out_z.append(out_pt[2])

        # Perform the regression analysis.  Each fit estimates four
        # coefficients, so at least four non-degenerate point pairs are
        # needed for a fully determined solution.
        fx = r.lm(r("x ~ a + b + c"),
                  data=r.data_frame(a=in_x, b=in_y, c=in_z, x=out_x))["coefficients"]
        fy = r.lm(r("y ~ a + b + c"),
                  data=r.data_frame(a=in_x, b=in_y, c=in_z, y=out_y))["coefficients"]
        fz = r.lm(r("z ~ a + b + c"),
                  data=r.data_frame(a=in_x, b=in_y, c=in_z, z=out_z))["coefficients"]

        self.fx = fx["(Intercept)"], fx["a"], fx["b"], fx["c"]
        self.fy = fy["(Intercept)"], fy["a"], fy["b"], fy["c"]
        self.fz = fz["(Intercept)"], fz["a"], fz["b"], fz["c"]

        self.calibrated = True

        return True
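
The three coefficient tuples define per-axis affine maps from input to output coordinates. A minimal sketch of applying them (the apply_calibration helper is illustrative, not part of the original class):

def apply_calibration(fx, fy, fz, pt):
    """Map an input point (a, b, c) through the (intercept, ca, cb, cc)
    tuples produced by calibrate()."""
    a, b, c = pt
    x = fx[0] + fx[1] * a + fx[2] * b + fx[3] * c
    y = fy[0] + fy[1] * a + fy[2] * b + fy[3] * c
    z = fz[0] + fz[1] * a + fz[2] * b + fz[3] * c
    return x, y, z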
Example #3
def LinearRegression_lm(ls1, ls2, return_rsqrd):
    intercept = 0  # the intercept is zero when the fit is forced through the origin
    from rpy import r
    d = r.data_frame(x=ls1, y=ls2)
    model = r("y ~ x - 1")  # use r("y ~ x") when not forced through the origin
    fitted_model = r.lm(model, data=d)
    slope = fitted_model['coefficients']['x']
    # intercept = fitted_model['coefficients']['(Intercept)']
    if return_rsqrd == 'yes':
        import math
        from scipy import stats
        rsqrd = math.pow(stats.linregress(ls1, ls2)[2], 2)
        return slope, rsqrd
    else:
        return slope
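
A hedged usage sketch (the sample lists are illustrative). Note that stats.linregress fits with an intercept, so the reported R-squared describes the ordinary fit rather than the through-origin fit:

xs = [1.0, 2.0, 3.0, 4.0]
ys = [2.1, 3.9, 6.2, 7.8]
slope, rsqrd = LinearRegression_lm(xs, ys, 'yes')
print(slope, rsqrd)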
Example #4
def funcion(dato, variable, caso, opciones):  # TODO: rename cosa to caso
    from rpy import r  # pylint: disable=import-error
    variable1 = variable[0]
    variable2 = variable[1]
    lista1 = dato.query(variable1, caso)
    lista2 = dato.query(variable2, caso)
    # lista2 = [float(x) for x in dato.getCol(variable2, caso=caso)]
    resultadoprueba = r.lm(r("y ~ x"), data=r.data_frame(x=lista1, y=lista2))
    sumario = r.summary_lm(resultadoprueba, True)
    anova = r.anova_lm(resultadoprueba)
    # resultadoprueba = r.lsfit(lista1, lista2)
    midiccionario = {"resultado": resultadoprueba,
                     "sumario": sumario,
                     "anova": anova}
    return midiccionario
Example #5
    def __init__(self, y, design, model_type=r.lm):
        """ Set up and estimate R model with data and design """
        self.y = y
        self.design = design
        self.model_type = model_type
        self._design_cols = ["x.%d" % (i + 1) for i in range(self.design.shape[1])]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r("y ~ %s-1" % "+".join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        self.results = self.model_type(self.formula, data=self.frame)
        # Provide compatible interface with scipy models
        coeffs = self.results["coefficients"]
        self.beta = np.array([coeffs[c] for c in self._design_cols])
        self.resid = self.results["residuals"]
        self.predict = self.results["fitted.values"]
        self.df_resid = self.results["df.residual"]
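
A hedged sketch of instantiating this wrapper, assuming the enclosing class is named RModel and that numpy (as np) and rpy's r are imported at module level (the class name and data are assumptions):

import numpy as np

n = 50
design = np.column_stack((np.ones(n), np.linspace(0.0, 1.0, n)))
y = np.dot(design, np.array([1.0, 2.0])) + 0.1 * np.random.randn(n)
model = RModel(y, design)  # model_type defaults to r.lm
print(model.beta)          # estimated coefficients for x.1 and x.2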
Example #6
def LinearRegression(ls1, ls2, return_rsqrd):
    intercept = 0  # the intercept is zero when the fit is forced through the origin
    from rpy import r
    r.library('MASS')
    k = r.options(warn=-1)  # suppress all warning messages from R
    # print(ls1); print(ls2)
    d = r.data_frame(x=ls1, y=ls2)
    model = r("y ~ x - 1")  # use r("y ~ x") when not forced through the origin
    # if rlm fails to converge in 20 steps, pass maxit=21
    fitted_model = r.rlm(model, data=d)
    slope = fitted_model['coefficients']['x']
    # intercept = fitted_model['coefficients']['(Intercept)']
    if return_rsqrd == 'yes':
        import math
        from scipy import stats
        rsqrd = math.pow(stats.linregress(ls1, ls2)[2], 2)
        return slope, rsqrd
    else:
        return slope
Example #7
def check_R(model, g):
  import rpy
  from rpy import r
  from numpy import array, allclose

  vars = [v.replace(':', '.').replace('+', 'p').replace('-', 'm').replace('_', '.')
          for v in model.vars[1:]]
  frame = dict((v, model.X[:, i + 1].reshape(-1)) for i, v in enumerate(vars))
  frame['y'] = model.y.reshape(-1)
  formula = 'y ~ ' + ' + '.join(v.replace(':', '.') for v in vars)

  rpy.set_default_mode(rpy.NO_CONVERSION)
  mod = r.glm(r(formula), data=r.data_frame(**frame), family=r.binomial('logit'))
  rpy.set_default_mode(rpy.BASIC_CONVERSION)
  pmod = mod.as_py()

  coef = r.coefficients(mod)
  coef = array([coef['(Intercept)']] + [coef[v] for v in vars], dtype=float)
  coef2 = g.beta.reshape(-1)
  # The snippet is truncated here; comparing the two coefficient vectors is
  # the likely intent:
  return allclose(coef, coef2)
Example #8
def fitPoly(xarray, yarray, order):
    import rpy
    from rpy import r

    # Evaluate r.lm without converting its result back to Python objects.
    r.lm.local_mode(rpy.NO_CONVERSION)

    xl = list(xarray)
    yl = list(yarray)

    modelDef = "y ~ poly(x,%d)" % order
    model = r.lm(r(modelDef), data=r.data_frame(x=xl, y=yl))

    pred = r.predict(model)

    # pred is now a dict with keys from '1' to 'N', where N is the size of xl
    predvals = []
    for i in range(len(xl)):
        predvals.append(pred[str(i + 1)])

    return xl, predvals
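
A hedged usage sketch for fitPoly (the arrays are illustrative):

import numpy as np

x = np.linspace(-1.0, 1.0, 20)
y = 2.0 * x ** 2 - 0.5 * x + np.random.normal(0.0, 0.05, x.shape)
xs, fitted = fitPoly(x, y, 2)  # quadratic fit via R's poly()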
Example #9
def krige_to_grid(grid_fname, obs_x, obs_y, obs_data, vgm_par):
    """Interpolate point data onto a grid using Kriging.

    Interpolate point data onto a regular rectangular grid of square cells using
    Kriging with a predefined semi-variogram.  The observed data locations must
    be specified in the same projection and coordinate system as the grid, which
    is defined in an ArcGIS raster file.

    Parameters
    ----------
    grid_fname : string
        Filename of an ArcGIS float grid raster defining the required grid to
        Krige onto.  All cells are included regardless of their value.
    obs_x : array_like
        The x coordinates of the observation locations.
    obs_y : array_like
        The y coordinates of the observation locations.
    obs_data : array_like
        The data values at the observation locations.
    vgm_par : dict
        A dictionary describing the semi-variogram model.  Required keys are:
        'model' can be one of {'Lin', 'Exp', 'Sph', 'Gau'}
        'nugget' must be a scalar
        'range' must be a scalar
        'sill' must be a scalar

    Returns
    -------
    kriged_est : 2darray
        A 2D array containing the Kriged estimates at each point on the
        specified rectangular grid.

    Notes
    -----
    This function requires that R, RPy and the R gstat library are correctly
    installed.

    """
    grid, headers = arcfltgrid.read(grid_fname)
    cols = headers[0]
    rows = headers[1]
    x0 = headers[2]
    y0 = headers[3]
    cell_size = headers[4]
    # TO DO: adjust x0, y0 by 0.5*cell_size if llcorner..

    # define the grid (pixel centres)
    xt, yt = np.meshgrid(
        np.linspace(x0, x0 + (cols - 1) * cell_size, num=cols),
        np.linspace(y0 + (rows - 1) * cell_size, y0, num=rows))

    xt = xt.flatten()
    yt = yt.flatten()

    # Krige using gstat via RPy
    r.library('gstat')
    rpy.set_default_mode(rpy.NO_CONVERSION)

    obs_frame = r.data_frame(x=obs_x, y=obs_y, data=obs_data)
    target_grid = r.data_frame(x=xt, y=yt)

    v = r.vgm(vgm_par['sill'], vgm_par['model'], vgm_par['range'],
              vgm_par['nugget'])

    result = r.krige(r('data ~ 1'),
                     r('~ x + y'),
                     obs_frame,
                     target_grid,
                     model=v)

    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    result = result.as_py()

    kriged_est = np.array(result['var1.pred'])
    kriged_est = kriged_est.reshape(rows, cols)

    return kriged_est
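
A hedged call sketch, assuming R, RPy, the gstat library, and the arcfltgrid reader are available; the filename, coordinates, and variogram parameters below are purely illustrative:

import numpy as np

obs_x = np.array([1000.0, 1500.0, 2000.0])
obs_y = np.array([4800.0, 5000.0, 5200.0])
obs_data = np.array([3.2, 2.8, 4.1])
vgm_par = {'model': 'Exp', 'nugget': 0.1, 'range': 800.0, 'sill': 1.0}
est = krige_to_grid('grid.flt', obs_x, obs_y, obs_data, vgm_par)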
Example #10
There are also R scripts included with most of the datasets to run
some basic models for comparisons of results to statsmodels.
'''

from rpy import r
import numpy as np
import scikits.statsmodels.api as sm

examples = [1, 2]

if 1 in examples:
    data = sm.datasets.longley.load()
    y, x = data.endog, sm.add_constant(data.exog)
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print(results.keys())
    print(results['coefficients'])

if 2 in examples:
    data2 = sm.datasets.star98.load()
    y2, x2 = data2.endog, sm.add_constant(data2.exog)
    import rpy
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    params_est = [
        results2['coefficients'][k] for k in sorted(results2['coefficients'])]
Example #11
    def __init__(self, y, design, model_type=r.lm, **kwds):
        """ Set up and estimate R model with data and design """
        r.library("MASS")  # still needs to be in test, but also here for
        # logical tests at the end not to show an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = ["x.%d" % (i + 1) for i in range(self.design.shape[1])]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r("y ~ %s-1" % "+".join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        rpy.set_default_mode(rpy.NO_CONVERSION)
        results = self.model_type(self.formula, data=self.frame, **kwds)
        self.robj = results  # keep the Robj model so it can be
        # used in the tests
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

        #        coeffs = self.results['coefficients']
        #        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        self.nobs = len(self.results["residuals"])
        if isinstance(self.results["residuals"], dict):
            self.resid = np.zeros((len(list(self.results["residuals"].keys()))))
            for i in list(self.results["residuals"].keys()):
                self.resid[int(i) - 1] = self.results["residuals"][i]
        else:
            self.resid = self.results["residuals"]
        self.fittedvalues = self.results["fitted.values"]
        self.df_resid = self.results["df.residual"]
        self.params = rsum["coefficients"][:, 0]
        self.bse = rsum["coefficients"][:, 1]
        self.bt = rsum["coefficients"][:, 2]
        try:
            self.pvalues = rsum["coefficients"][:, 3]
        except (KeyError, IndexError, TypeError):
            pass  # the summary may not report a p-value column
        self.rsquared = rsum.setdefault("r.squared", None)
        self.rsquared_adj = rsum.setdefault("adj.r.squared", None)
        self.aic_R = rsum.setdefault("aic", None)
        self.fvalue = rsum.setdefault("fstatistic", None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault("value", None)  # for wls
        df = rsum.setdefault("df", None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0] - 1  # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault("cov.unscaled", None)
        self.bcov = rsum.setdefault("cov.scaled", None)
        if "sigma" in rsum:
            self.scale = rsum["sigma"]
        elif "dispersion" in rsum:
            self.scale = rsum["dispersion"]
        else:
            self.scale = None
        self.llf = r.logLik(results)

        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
Example #12
    for id in py_id:  # `i` is a running index assumed initialized before this fragment
        x1 = poly_x_vals[i,0]
        x2 = poly_x_vals[i,1]
        y1 = poly_y_vals[i,0]
        y2 = poly_y_vals[i,1]
        xy = poly_xy_vals[i]
        if poly_values: poly_values = poly_values + ","
        poly_values += "(%s, %f, %f, %f, %f, %f)" % (id, x1, x2, y1, y2, xy)
        i = i+1

    query = query + poly_values
#    print query
    c.execute(query)

    
    model = r.lm(r("delta ~ poly(x, 2) + poly(y, 2) + poly(x*y, 1)"), data=r.data_frame(x=py_x, y=py_y, delta=py_delta), weights=py_wt)
    model_summary = r.summary(model)
    model_coeff = array(model_summary['coefficients'])
    if not model_coeff.shape == (6,4):
        print "Bad model for %s" % exp
        continue
    
    c0 = model_coeff[0][0]
    c0_sigma = model_coeff[0][1]
    cx1 = model_coeff[1][0]
    cx1_sigma = model_coeff[1][1]
    cx2 = model_coeff[2][0]
    cx2_sigma = model_coeff[2][1]
    cy1 = model_coeff[3][0]
    cy1_sigma = model_coeff[3][1]
    cy2 = model_coeff[4][0]
Example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson",
                               "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not
                        # possible to use cor.test or lsfit directly,
                        # as you have to perform a regression through
                        # the origin.

                        # uncomment to check pearson r against
                        # phylip's value r =
                        # calculateCorrelationCoefficient(columns[x],
                        # columns[y])

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(
                            R("y ~ x - 1"), data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(
                            map(lambda x: options.value_format % x, d)) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2, ):

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute the standard deviation of the contrast
                    # (the square root of the summed branch lengths)
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(
                                node_id, node.data.branchlength, c1, c2)
                        else:
                            assert(node_id == tree.root)
                            assert(len(node.succ) == 3)
                            update_data(
                                node_id, node.data.branchlength,
                                node.succ[0], node.succ[1])
                            update_data(
                                max_index - 1, node.data.branchlength,
                                node_id, node.succ[2])
                    else:
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write(
                    "node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[
                            node_id],
                        "\t".join(
                            map(lambda x: options.value_format % x,
                                contrasts[node_id])),
                    ))

    E.Stop()