Example 1
def readRates( infile ):
    """read rates and G+C from a tab-separated file."""
    from rpy import r as R
    import rpy
    import os
    import tempfile

    handle, name = tempfile.mkstemp()
    os.close(handle)
    outfile = open(name, "w")

    first = True
    headers = []
    for line in infile:
        if line[0] == "#": continue
        data = line[:-1].split("\t")
        if first:
            headers = data
            first = False
            continue

        outfile.write( line )
    outfile.close()

    assert len(headers) == 3, "malformatted file of rates, please supply id, g+c, rate"

    rpy.set_default_mode(rpy.NO_CONVERSION)
    matrix = R.read_table( name, na_string = ("NA", "na"), col_names=headers ) 
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    os.remove( name )
    return matrix, headers
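Every example on this page toggles rpy's global conversion mode by hand around an R call and then switches it back. As a minimal sketch (assuming the classic rpy 1.x API, whose set_default_mode/get_default_mode functions appear throughout these examples), the same pattern can be wrapped in a context manager so the previous mode is always restored, even if the R call raises:

import contextlib

import rpy


@contextlib.contextmanager
def r_conversion_mode(mode):
    """Temporarily switch rpy's global conversion mode, restoring the old one on exit."""
    previous = rpy.get_default_mode()
    rpy.set_default_mode(mode)
    try:
        yield
    finally:
        rpy.set_default_mode(previous)


# hypothetical usage: keep lm()'s result as an R object, then convert the summary
# with r_conversion_mode(rpy.NO_CONVERSION):
#     model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=x, y=y))
# with r_conversion_mode(rpy.BASIC_CONVERSION):
#     summary = rpy.r.summary(model)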
Example 2
    def randomForest_predict(self, fit_model, data):
        """
		03-17-06
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        if self.debug:
            sys.stderr.write("Predicting by randomForest...\n")
        data = array(data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": data[:, 0],
                "recurrence": data[:, 1],
                "connectivity": data[:, 2],
                "cluster_size": data[:, 3],
                "gradient": data[:, 4],
                "avg_degree": data[:, 5],
                "unknown_ratio": data[:, 6],
                "is_correct": r.factor(data[:, -1]),
            }
        )
        set_default_mode(BASIC_CONVERSION)
        pred = r.predict(fit_model, data_frame)
        del data_frame
        if self.debug:
            sys.stderr.write("Done randomForest prediction.\n")
        return pred
Example 3
    def rpart_predict(self, fit_model, data):
        """
		11-23-05
			split from rpart_fit_and_predict()
		"""
        if self.debug:
            sys.stderr.write("Doing rpart_predict...\n")
        data = array(data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": data[:, 0],
                "recurrence": data[:, 1],
                "connectivity": data[:, 2],
                "cluster_size": data[:, 3],
                "gradient": data[:, 4],
                "is_correct": data[:, -1],
            }
        )
        set_default_mode(BASIC_CONVERSION)
        pred = r.predict(fit_model, data_frame, type=["class"])  # 11-17-05 type=c("class")
        del data_frame
        if self.debug:
            sys.stderr.write("Done rpart_predict.\n")
        return pred
Example 4
def readRates(infile):
    """read rates and G+C from a tab-separated file."""
    from rpy import r as R
    import rpy
    import os
    import tempfile

    handle, name = tempfile.mkstemp()
    os.close(handle)
    outfile = open(name, "w")

    first = True
    headers = []
    for line in infile:
        if line[0] == "#":
            continue
        data = line[:-1].split("\t")
        if first:
            headers = data
            first = False
            continue

        outfile.write(line)
    outfile.close()

    assert len(headers) == 3, "malformatted file of rates, please supply id, g+c, rate"

    rpy.set_default_mode(rpy.NO_CONVERSION)
    matrix = R.read_table(name, na_string=("NA", "na"), col_names=headers)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    os.remove(name)
    return matrix, headers
Example 5
    def generateCorStructForGLSFromVarianceMatrix(cls, variance_matrix):
        """
		2009-12-23
			generate the corStruct for gls()
		"""
        sys.stderr.write("Generating corStruct for gls() from variance_matrix ...")

        rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
        rpy.r.library("nlme")

        # bring the lower-triangle of variance_matrix into a list, row by row
        no_of_rows, no_of_cols = variance_matrix.shape
        lower_triangle_cor_vector = []
        for i in range(1, no_of_rows):
            for j in range(i):
                lower_triangle_cor_vector.append(
                    variance_matrix[i][j] / math.sqrt(variance_matrix[i][i] * variance_matrix[j][j])
                )

        csSymm = rpy.r.corSymm(value=lower_triangle_cor_vector)

        data_frame = rpy.r.as_data_frame({"fakedata": [1] * no_of_rows})
        csSymm = rpy.r.Initialize(csSymm, data=data_frame)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        sys.stderr.write("Done.\n")
        return csSymm
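The nested loop above only rescales each covariance by the two standard deviations, corr_ij = var_ij / sqrt(var_ii * var_jj). A small self-contained check of that step (numpy only, no rpy; the 3x3 matrix is made up for illustration):

import math

import numpy

# hypothetical 3x3 variance-covariance matrix
variance_matrix = numpy.array([[4.0, 1.0, 0.5],
                               [1.0, 9.0, 1.5],
                               [0.5, 1.5, 1.0]])

lower_triangle_cor_vector = []
no_of_rows = variance_matrix.shape[0]
for i in range(1, no_of_rows):
    for j in range(i):
        # correlation = covariance divided by both standard deviations
        lower_triangle_cor_vector.append(
            variance_matrix[i][j] / math.sqrt(variance_matrix[i][i] * variance_matrix[j][j]))

print(lower_triangle_cor_vector)  # [0.1666..., 0.25, 0.5]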
Example 6
def regress(dv, iv):
    # Performs regression using R's linear model function (lm)
    if all(type(x) is list for x in dv.values()) and all(type(x) is list for x in iv.values()):
        # First check that all of the data is in list form, otherwise RPy will throw an error
        rpy.set_default_mode(
            rpy.NO_CONVERSION)  # Keeps values in R format until we need them
        R_string, frame = make_R_strings(
            dv, iv)  # Create strings used by RPy to run regression

        # R runs the linear regression
        OLS_model = eval('rpy.r.lm(R_string, data=rpy.r.data_frame(' + frame +
                         '))')
        rpy.set_default_mode(
            rpy.BASIC_CONVERSION)  # Now convert back to usable format

        model_summary = rpy.r.summary(OLS_model)  # Store results

        # Extract all of the data of interest
        coeff = model_summary['coefficients'][:, 0]  # Regression coefficients
        std_err = model_summary['coefficients'][:, 1]  # Standard Errors
        t_stat = model_summary['coefficients'][:, 2]  # t-statistics
        p_val = model_summary['coefficients'][:, 3]  # p-values
        r_sqr = model_summary['r.squared']  # R-squared
        asj_r_sqr = model_summary['adj.r.squared']  # Adjusted R-squared

        return coeff, std_err, t_stat, p_val, r_sqr, asj_r_sqr
    else:
        raise TypeError("All variables must be of type 'list'")
Example 7
 def lm(self, l, h):
     for i in range(l, h + 1):
         data_frame, data_model = self.mount_reg_params(i)
         print data_model
         rpy.set_default_mode(rpy.NO_CONVERSION)
         linear_model = r.lm(r(data_model), data=data_frame)
         rpy.set_default_mode(rpy.BASIC_CONVERSION)
         print r.summary(linear_model)['r.squared']
Example 8
def VegetationClassify(Elev_arr, River_arr):

  rpy.r.library("rpart")
  # Read the dictionary from the pickle file
  pkl_file = open('decision_tree.pkl','rb')
  rpy.set_default_mode(rpy.NO_CONVERSION)
  traing_data = pickle.load(pkl_file)
  pkl_file.close()

  # Create Decision tree for predicting landcover class
  # create the decision tree using rpart 
  fit = rpy.r.rpart(formula='Class ~ Elevation + RiverDistance + Slope \
      + Aspect_x + Aspect_y',data = traing_data, method = "class")

  # calculate River distance using River_arr
  River_dist_arr = dist.CityBlock(River_arr)  
  # calculate slope and aspect
  (Slope_arr, Aspect_arr) = Slope_aspect.Slope_aspect(Elev_arr)

  (x_len, y_len) = Elev_arr.shape
  # Allocate vegetation array for holding predicted landcover values
  Veg_arr = numpy.zeros((x_len, y_len), dtype = "uint8")

  # Normalize the elevation data
  minimum_elev = numpy.min(Elev_arr)
  factor = numpy.max(Elev_arr) - minimum_elev
  Elev_arr = (Elev_arr[:,:] - minimum_elev)*100/factor

  # Create various list to hold test data
  Elevation = []
  Slope = []
  RiverDistance = []
  Aspect_x = []
  Aspect_y = []

  # Append the data into respective list
  for i in range(0,x_len):
    for j in range(0,y_len):
      Elevation.append(int(Elev_arr[i][j]))
      Slope.append(int(Slope_arr[i][j]))
      RiverDistance.append(int(River_dist_arr[i][j]))
      Aspect_x.append(int(Aspect_arr[i][j][0]))
      Aspect_y.append(int(Aspect_arr[i][j][1]))

  # Create dictionary so as to apply R's predict command on it 
  Test_data ={'Elevation':Elevation ,'Slope':Slope ,'RiverDistance':RiverDistance,\
             'Aspect_x':Aspect_x,'Aspect_y':Aspect_y}

  rpy.set_default_mode(rpy.BASIC_CONVERSION)
  # values contain probability values of the predicted landcover classes
  values = rpy.r.predict(fit,newdata=Test_data,method="class")
  for i in range(0,x_len):
    for j in range(0,y_len):
      # Get the class having max probability for each test data point
      a = ndimage.maximum_position(values[i*x_len + j])
      Veg_arr[i,j] = (a[0]*25) # Assign them some value to facilitate visualization
  return Veg_arr
Example 9
    def randomForest_fit(self, known_data, parameter_list, bit_string="1111111"):
        """
		03-17-06
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        if self.debug:
            sys.stderr.write("Fitting randomForest...\n")
        mty = parameter_list[0]

        from rpy import r

        r._libPaths(
            os.path.join(lib_path, "R")
        )  # better than r.library("randomForest", lib_loc=os.path.join(lib_path, "R")) (see plone doc)
        r.library("randomForest")

        coeff_name_list = [
            "p_value",
            "recurrence",
            "connectivity",
            "cluster_size",
            "gradient",
            "avg_degree",
            "unknown_ratio",
        ]  # 2006-10-30
        formula_list = []
        for i in range(len(bit_string)):
            if bit_string[i] == "1":
                formula_list.append(coeff_name_list[i])
        formula = r("is_correct~%s" % "+".join(formula_list))

        known_data = array(known_data)
        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": known_data[:, 0],
                "recurrence": known_data[:, 1],
                "connectivity": known_data[:, 2],
                "cluster_size": known_data[:, 3],
                "gradient": known_data[:, 4],
                "avg_degree": known_data[:, 5],
                "unknown_ratio": known_data[:, 6],
                "is_correct": r.factor(known_data[:, -1]),
            }
        )  # 03-17-06, watch r.factor	#2006-10-30

        if mty > 0:
            fit = r.randomForest(formula, data=data_frame, mty=mty)
        else:
            fit = r.randomForest(formula, data=data_frame)

        del data_frame
        if self.debug:
            sys.stderr.write("Done fitting randomForest.\n")
        return fit
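Example 2 and Example 9 are two halves of the same workflow: randomForest_fit() returns the fitted model as an unconverted R object, and randomForest_predict() hands that object straight back to r.predict(). A hedged usage sketch (the instance name and the toy rows are assumptions; each row holds the seven features followed by is_correct in the last column):

# 'predictor' is a hypothetical instance of the class these two methods belong to
known_data = [
    [0.01, 3, 0.8, 12, 0.5, 4.2, 0.1, 1],   # last column = is_correct
    [0.20, 1, 0.3,  5, 0.1, 2.0, 0.6, 0],
]
fit = predictor.randomForest_fit(known_data, parameter_list=[0], bit_string="1111111")
pred = predictor.randomForest_predict(fit, known_data)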
Example 10
def interpolazionelineare(x, y):
    rpy.set_default_mode(rpy.NO_CONVERSION)  # needed for the last part in R
    linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=x, y=y))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary = rpy.r.summary(linear_model)
    # slope, slope error, intercept, intercept error
    risultati = (summary['coefficients'][0][0], \
                    summary['coefficients'][0][1], \
                    summary['coefficients'][1][0], \
                    summary['coefficients'][1][1])
    return risultati
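A short usage sketch (the two toy lists are made up; the returned tuple is read off rows 0 and 1 of R's coefficient table as value/standard-error pairs):

x = [1.0, 2.0, 3.0, 4.0]
y = [2.1, 3.9, 6.2, 8.1]
coef0, err0, coef1, err1 = interpolazionelineare(x, y)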
Example 11
 def _setup(self):
     try:
         import rpy
         self.__rpy = rpy
         self.__rpy_version = 1
     except ImportError:
         import rpy2.rpy_classic as rpy
         rpy.set_default_mode(rpy.BASIC_CONVERSION)
         self.__rpy = rpy
         self.__rpy_version = 2
         self._process_events()
     globals()[self.__name] = rpy.r
Example 13
    def estimate_pi0(self, lambda_list, pi0_list):
        """
		01-19-06
			Storey2003, (natural) cubic spline, df=3
		"""
        sys.stderr.write("Estimating pi0...\n")
        rpy.set_default_mode(rpy.NO_CONVERSION)
        s = r.smooth_spline(lambda_list, pi0_list, df=3)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        estimated_pi0 = r.predict(s, 1)['y']
        print "\t estimated_pi0:", estimated_pi0
        sys.stderr.write("Done.\n")
        return estimated_pi0
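The lambda_list/pi0_list inputs follow Storey (2003): for a grid of lambda values, pi0(lambda) is the fraction of p-values above lambda, rescaled by 1/(1 - lambda), and the cubic spline smooths that curve before it is evaluated at lambda = 1. A hedged sketch of how those inputs might be built (p_value_list and the instance name are assumptions):

# assumed: p_value_list is a list of raw p-values, estimator has the method above
lambda_list = [i / 20.0 for i in range(19)]   # 0.00, 0.05, ..., 0.90
pi0_list = []
for lam in lambda_list:
    above = sum(1 for p in p_value_list if p > lam)
    pi0_list.append(above / (len(p_value_list) * (1.0 - lam)))
estimated_pi0 = estimator.estimate_pi0(lambda_list, pi0_list)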
Example 14
 def interpolazionelineare(self, other):
         """x.interpolazionelineare(y) esegue l'i.l. con x in ascissa e y in ordinata.
         x e y devono essere due oggetti della classe DatiSperimentali."""
         rpy.set_default_mode(rpy.NO_CONVERSION)
         linear_model = rpy.r.lm(rpy.r("y ~ x"), data = rpy.r.data_frame(x=self.valori, y=other.valori))
         rpy.set_default_mode(rpy.BASIC_CONVERSION)
         summary = rpy.r.summary(linear_model)
         # slope, slope error, intercept, intercept error
         risultati = (summary['coefficients'][0][0], \
                     summary['coefficients'][0][1], \
                     summary['coefficients'][1][0], \
                     summary['coefficients'][1][1])
         return risultati
Example 15
	def estimate_pi0(self, lambda_list, pi0_list):
		"""
		01-19-06
			Storey2003, (natural) cubic spline, df=3
		"""
		sys.stderr.write("Estimating pi0...\n")
		rpy.set_default_mode(rpy.NO_CONVERSION)
		s = r.smooth_spline(lambda_list, pi0_list, df=3)
		rpy.set_default_mode(rpy.BASIC_CONVERSION)
		estimated_pi0 = r.predict(s,1)['y']
		print "\t estimated_pi0:", estimated_pi0
		sys.stderr.write("Done.\n")
		return estimated_pi0
Example 16
 def interpolazionelineare(self, other):
     rpy.set_default_mode(rpy.NO_CONVERSION)
     linear_model = rpy.r.lm(rpy.r("y ~ x"),
                             data=rpy.r.data_frame(x=self.valori,
                                                   y=other.valori))
     rpy.set_default_mode(rpy.BASIC_CONVERSION)
     summary = rpy.r.summary(linear_model)
      # slope, slope error, intercept, intercept error
     risultati = (summary['coefficients'][0][0], \
                 summary['coefficients'][0][1], \
                 summary['coefficients'][1][0], \
                 summary['coefficients'][1][1])
     return risultati
Example 17
    def _train(self, dataset):
        """Train the classifier using `data` (`Dataset`).
        """
        # process the labels based on the model family
        if self.params.family == 'gaussian':
            # do nothing, just save the labels as a list
            labels = dataset.labels.tolist()
            pass
        elif self.params.family == 'multinomial':
            # turn labels into list of range values starting at 1
            labels = _label2indlist(dataset.labels,
                                    dataset.uniquelabels)
        self.__ulabels = dataset.uniquelabels.copy()

        # process the pmax
        if self.params.pmax is None:
            # set it to the num features
            pmax = dataset.nfeatures
        else:
            # use the value
            pmax = self.params.pmax

        # train with specifying max_steps
        # must not convert trained model to dict or we'll get segfault
        rpy.set_default_mode(rpy.NO_CONVERSION)
        self.__trained_model = rpy.r.glmnet(dataset.samples,
                                            labels,
                                            family=self.params.family,
                                            alpha=self.params.alpha,
                                            nlambda=self.params.nlambda,
                                            standardize=self.params.standardize,
                                            thresh=self.params.thresh,
                                            pmax=pmax,
                                            maxit=self.params.maxit,
                                            type=self.params.model_type)
        rpy.set_default_mode(rpy.NO_DEFAULT)

        # get a dict version of the model
        self.__trained_model_dict = rpy.r.as_list(self.__trained_model)

        # save the lambda of last step
        self.__last_lambda = self.__trained_model_dict['lambda'][-1]

        # set the weights to the last step
        weights = rpy.r.coef(self.__trained_model, s=self.__last_lambda)
        if self.params.family == 'multinomial':
            self.__weights = N.hstack([rpy.r.as_matrix(weights[str(i)])[1:]
                                       for i in range(1,len(self.__ulabels)+1)])
        elif self.params.family == 'gaussian':
            self.__weights = rpy.r.as_matrix(weights)[1:]
Example 18
def make_L(data,direction='S',z=None,):
    """ Define the along track distance from one reference

        direction define the cardinal direction priority (N,S,W or E).
         S means that the reference will be the southern most point

        z define the bathymetry, if defined, the closest point to that
         bathymetry will be the reference. In case of cross this bathymetry
         more than once, the direction criteria is used to distinguish.
    """
    from fluid.common.distance import distance
    all_cycles_data = join_cycles(data)

    if z==None:
        import rpy
        #for t in topex.invert_keys(data):
        for t in all_cycles_data:
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"), data = rpy.r.data_frame(x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef=rpy.r.coef(linear_model)
            if direction=='S':
                lat0=all_cycles_data[t]['Latitude'].min()-1
                lon0 = (lat0-coef['(Intercept)'])/coef['x']
                L_correction = distance(all_cycles_data[t]['Latitude'],all_cycles_data[t]['Longitude'],lat0,lon0).min()
            for c in invert_keys(data)[t]:
                data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],lat0,lon0)- L_correction
    # This bathymetric method was only copied from an old code. It should at least
    #  be changed, if not removed.
    elif method=='bathymetric':
        import rpy
        for t in all_cycles_data:
            # First define the near coast values.
            idSouth=numpy.argmin(all_cycles_data[t]['Latitude'])
            L_tmp = distance(all_cycles_data[t]['Latitude'],all_cycles_data[t]['Longitude'],all_cycles_data[t]['Latitude'][idSouth],all_cycles_data[t]['Longitude'][idSouth])
            idNearCoast = L_tmp.data<400e3
            if min(all_cycles_data[t]['Bathy'][idNearCoast]) > -z:
                idNearCoast = L_tmp.data<600e3
            # Then calculate the distance to a reference
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"), data = rpy.r.data_frame(x=all_cycles_data[t]['Longitude'], y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef=rpy.r.coef(linear_model)
            lat0 = all_cycles_data[t]['Latitude'].min()-1
            lon0 = (lat0-coef['(Intercept)'])/coef['x']
            #L = distance(,lon,lat0,lon0)
            #
            #id0 = numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast]))
            idref=numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast]+z))
            #L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref])
            L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],lat0,lon0)
            for c in topex.invert_keys(data)[t]:
                #data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],all_cycles_data[t]['Latitude'][idNearCoast][id0],all_cycles_data[t]['Longitude'][idNearCoast][id0]) - L_correction
                data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],lat0,lon0) - L_correction
    #
    return
Example 19
    def pure_linear_model_via_R(cls, non_NA_genotype_ls, non_NA_phenotype_ls, non_NA_phenotype2count=None):
        """
		2010-2-25
			use createDesignMatrix() to generate a design matrix
		2009-8-28
			split out of pure_linear_model(). same functionality as pure_linear_model(), but invoke R to run regression.
		"""

        genotype_matrix = cls.createDesignMatrix(non_NA_genotype_ls)
        # 2008-11-10 do linear regression by R
        genotype_var = numpy.var(genotype_matrix[:, 0])  # 2008-11-10 var=\sum(x_i-\bar{x})^2/(n-1)
        rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
        # data_frame = rpy.r.as_data_frame({"phenotype":non_NA_phenotype_ls, "genotype":rpy.r.as_factor(genotype_matrix[:,1])})
        formula_list = []
        data_frame_dict = {"phenotype": non_NA_phenotype_ls}
        for i in range(genotype_matrix.shape[1]):
            var_name = "genotype%s" % i
            formula_list.append(var_name)
            data_frame_dict.update({var_name: genotype_matrix[:, i]})
        data_frame = rpy.r.as_data_frame(data_frame_dict)
        formula = "phenotype~%s" % "+".join(formula_list)

        if non_NA_phenotype2count and len(non_NA_phenotype2count) == 2:  # binary phenotype, use logistic regression
            lm_result = rpy.r.glm(rpy.r(formula), data=data_frame, family=rpy.r("binomial"))
        else:
            lm_result = rpy.r.glm(rpy.r(formula), data=data_frame)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state
        summary_stat = rpy.r.summary(lm_result)

        # 06-30-05	index 0 in summary_stat['coefficients'] is intercept
        coeff_list = []
        coeff_p_value_list = []
        for i in range(len(summary_stat["coefficients"])):
            coeff_list.append(summary_stat["coefficients"][i][0])  # 0 is the coefficient
            coeff_p_value_list.append(summary_stat["coefficients"][i][-1])  # -1 is the corresponding p-value
            # 06-30-05	fill in other coefficients based on bit_string, NOTE i+1
        pvalue = coeff_p_value_list[1]
        residuals = summary_stat["deviance"]
        geno_effect_var = genotype_var * coeff_list[1] * coeff_list[1] * (no_of_rows - 1)
        var_perc = geno_effect_var / (residuals + geno_effect_var)

        pdata = PassingData(
            pvalue=pvalue, var_perc=var_perc, coeff_list=coeff_list, coeff_p_value_list=coeff_p_value_list
        )
        return pdata
Example 20
    def plot(self,filename=None,format='pdf',**kwargs):
        """Plot the heatmap and save to an image file.

          plot()             # display using windowing system
          plot('hm')         # --> hm.pdf
          plot('hm.png')     # --> hm.png
          plot('hm','png')   # --> hm.png

        By default a clustered heat map is constructed using R's heatmap.2
        function. If R cannot be found, an unclustered heat map is
        plotted. **kwargs can be used to customize the output.

        :Arguments:
        filename       name of the image file; may contain extension
                       If empty use the windowing system.
        format         eps,pdf,png... whatever matplotlib understands

        **kwargs for R:
        scale          Determines the coloring. Choose between 'none' (the
                       actual values in the heat map (possibly already normalized)),
                       'row' or 'column' (z-score across the dimension)
        N_colors       Number of color levels; default is 32.

        **kwargs for matplotlib:
           The kwargs are applied to the matplotlib.text() method and
           are typically used to set font properties. See the
           pylab/matplotlib documentation.
        """
        if filename:
            format = hop.utilities.fileextension(filename,default=format)
        labels = self.labels()
        try:
            try:
                import rpy
            except ImportError:
                from rpy2 import rpy_classic as rpy
                # http://www.mail-archive.com/[email protected]/msg01893.html
                rpy.set_default_mode(rpy.BASIC_CONVERSION)
            self._heatmap_R(labels,filename=filename,format=format,**kwargs)
        except ImportError:
            msg(0,"rpy package missing: cannot plot clustered heat map, defaulting to "
                "an unclustered heat map")
            self._heatmap_matplotlib(labels,filename=filename,format=format,**kwargs)
        if filename:
            msg(1,"Wrote image to file %s.\n" % self.filename(filename,ext=format))
Example 21
def check_R(model,g):
  import rpy
  from   rpy   import r
  from   numpy import array,allclose

  vars = [ v.replace(':','.').replace('+','p').replace('-','m').replace('_','.') for v in model.vars[1:] ]
  frame = dict( (v,model.X[:,i+1].reshape(-1)) for i,v in enumerate(vars) )
  frame['y'] = model.y.reshape(-1)
  formula = 'y ~ ' + ' + '.join(v.replace(':','.') for v in vars)

  rpy.set_default_mode(rpy.NO_CONVERSION)
  mod = r.glm(r(formula),data=r.data_frame(**frame),family=r.binomial('logit'))
  rpy.set_default_mode(rpy.BASIC_CONVERSION)
  pmod = mod.as_py()

  coef  = r.coefficients(mod)
  coef  = array([coef['(Intercept)']] + [ coef[v] for v in vars ],dtype=float)
  coef2 = g.beta.reshape(-1)
Example 22
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        # predict with standard method
        values = rpy.r.predict(self.__trained_model,
                               newx=data,
                               type='link',
                               s=self.__last_lambda)

        # predict with the final state (i.e., the last step)
        classes = None
        if self.params.family == 'multinomial':
            # remove last dimension of values
            values = values[:,:,0]

            # get the classes too (they are 1-indexed)
            rpy.set_default_mode(rpy.NO_CONVERSION)
            class_ind = rpy.r.predict(self.__trained_model,
                                      newx=data,
                                      type='class',
                                      s=self.__last_lambda)
            rpy.set_default_mode(rpy.NO_DEFAULT)
            class_ind = rpy.r.as_vector(class_ind)

            # convert the strings to ints and subtract 1
            class_ind = N.array([int(float(c))-1 for c in class_ind])

            # convert to actual labels
            classes = self.__ulabels[class_ind]
        else:
            # is gaussian, so just remove last dim of values
            values = values[:,0]

        # values need to be set anyways if values state is enabled
        self.values = values
        if classes is not None:
            # set the values and return none
            return classes
        else:
            # return the values as predictions
            return values
Example 23
 def _mcmc_betas_same_sources(self, tag_list):
     """
     The given tag_list contains tags that all have the same features
     available. Train on the tags in tag_list using only the songs
     in self.only_these_songs, or all available songs if
     self.only_these_songs is None.
     """
     if not self.production_run:
         self.mcmc_reps = 75 # save time
     rc.library("bayesm")
     data = []
     for tag in tag_list:
         data.append(rc.list(X=self.X[tag],y=self.y[tag]))
     rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
     data = rc.list(*data)
     if self.regtype in ["Hierarchical Linear", "Hierarchical Mixture"]:
         Data = rc.list(regdata=data)
     elif self.regtype=="Hierarchical Logistic":
         Data = rc.list(lgtdata=data)
     if self.regtype=="Hierarchical Mixture":
         Prior = rc.list(ncomp=self.ncomp)
     Mcmc=rc.list(R=self.mcmc_reps)
     rpy.set_default_mode(rpy.BASIC_CONVERSION)
     try:
         if self.regtype=="Hierarchical Linear":
             output = rc.rhierLinearModel(Data=Data,Mcmc=Mcmc)
         elif self.regtype=="Hierarchical Logistic":
             output = rc.rhierBinLogit(Data=Data,Mcmc=Mcmc)
         elif self.regtype=="Hierarchical Mixture":
             output = rc.rhierLinearMixture(Data=Data,Prior=Prior,Mcmc=Mcmc)
     except:
         #pdb.set_trace()
         self._info_about_r_error(tag_list)
         return
     beta_matrix = output['betadraw'].mean(axis=2) # nregressions x ncoeffs, averaged along third dim
     matrix_index = 0
     for tag in tag_list:
         cur_tag_beta_vec = beta_matrix[matrix_index,:]
         beta_dict_list = [dict([('beta', coeff)]) for coeff in cur_tag_beta_vec]
         self.beta[tag] = dict(zip(self.sorted_sources[tag],beta_dict_list))
         self.stats[tag] = dict() # I'm not currently storing any stats for hierarchical regressions.
         matrix_index += 1
Example 24
 def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
     times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
     SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
     for tag in tag_list:
         self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
         rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
         data = rc.list(y=self.y[tag],X=self.X[tag])
         model = "y~X-1" # Use -1 because X has an intercept already
         if self.regtype=="Independent Linear":
             try:
                 result = rc.lm(model,data=data)
             except:
                 pdb.set_trace()
         elif self.regtype=="Independent Logistic":
             result = rc.glm(model,family=rc.binomial("logit"),data=data)
         rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode.
         summary = rc.summary(result,correlation=rc.TRUE)
         self._record_regression_stats(tag, summary)
         beta_dict = dict()
         sorted_sources = self.sorted_sources[tag]
         coeff_matrix = summary["coefficients"]
         for i in range(len(sorted_sources)):
             try:
                 cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
             except IndexError:
                 util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                 if remove_tags_when_bad_regression:
                     self._remove_tag(tag)
                     break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features....
                 continue
             try:
                 cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
             except OverflowError:
                 pass
             beta_dict[sorted_sources[i]] = cur_source_dict
         if tag not in self.features: # We've removed this tag a few lines above, so skip it.
             continue
         self.beta[tag] = beta_dict
         if times_showed_summary < n_times_show_summary:
             self._print_regression_summary(tag, summary)
             times_showed_summary += 1
Example 25
def regress(dv,iv):
# Performs regression using R's linear model function (lm)
    if all(type(x) is list for x in dv.values()) and all(type(x) is list for x in iv.values()):
    # First check that all of the data is in list form, otherwise RPy will throw an error
        rpy.set_default_mode(rpy.NO_CONVERSION) # Keeps values in R format until we need them
        R_string,frame=make_R_strings(dv,iv)    # Create strings used by RPy to run regression
        
        # R runs the linear regression
        OLS_model=eval('rpy.r.lm(R_string, data=rpy.r.data_frame('+frame+'))')
        rpy.set_default_mode(rpy.BASIC_CONVERSION)  # Now convert back to usable format
        
        model_summary=rpy.r.summary(OLS_model)      # Store results
        
        # Extract all of the data of interest
        coeff=model_summary['coefficients'][:,0]    # Regression coefficients
        std_err=model_summary['coefficients'][:,1]  # Standard Errors
        t_stat=model_summary['coefficients'][:,2]   # t-statistics
        p_val=model_summary['coefficients'][:,3]    # p-values
        r_sqr=model_summary['r.squared']            # R-squared
        asj_r_sqr=model_summary['adj.r.squared']    # Adjusted R-squared
        
        return coeff,std_err,t_stat,p_val,r_sqr,asj_r_sqr
    else:
        raise TypeError("All variables must be of type 'list'")
Example 26
	def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix, prior_prob, bit_string='11111'):
		"""
		11-09-05
			1st use known_data to get the fit model
			2nd use the fit model to do prediction on all_data, result is prob for each class
		11-09-05 add rpart_cp
		11-17-05
			add loss_matrix, prior_prob
			return two pred
		"""
		sys.stderr.write("rpart fitting and predicting...\n")
		r.library("rpart")
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
		formula_list = []
		for i in range(len(bit_string)):
			if bit_string[i] == '1':
				formula_list.append(coeff_name_list[i])
		#11-17-05 transform into array
		all_data = array(all_data)
		known_data = array(known_data)
		
		set_default_mode(NO_CONVERSION)
		data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
			"cluster_size":known_data[:,3], "gradient":known_data[:,4], "is_correct":known_data[:,-1]})
		if prior_prob:
			prior_prob = [prior_prob, 1-prior_prob]	#get the full list
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix) ) )
		else:
			fit = r.rpart(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, method="class", control=r.rpart_control(cp=rpart_cp),\
				parms=r.list(loss=r.matrix(loss_matrix) ) )
		
		set_default_mode(BASIC_CONVERSION)
		pred_training = r.predict(fit, data_frame, type=["class"])
		del data_frame
		
		set_default_mode(NO_CONVERSION)
		all_data_frame = r.as_data_frame({"p_value":all_data[:,0], "recurrence":all_data[:,1], "connectivity":all_data[:,2], \
			"cluster_size":all_data[:,3], "gradient":all_data[:,4], "is_correct":all_data[:,-1]})
		set_default_mode(BASIC_CONVERSION)
		pred = r.predict(fit, all_data_frame, type=["class"])	#11-17-05 type=c("class")
		del all_data_frame
		sys.stderr.write("Done rpart fitting and predicting.\n")
		return pred, pred_training
Example 27
	# if you have rpy installed, use it to test the results
	have_rpy =  False
	try:
	    print("\n")
	    print("="*30)
	    print("Validating OLS results in R")
	    print("="*30)

	    import rpy
	    have_rpy = True
	except ImportError:
	    print("\n")
	    print("="*30)
	    print("Validating OLS-class results in R")
	    print("="*30)
	    print("rpy is not installed")
	    print("="*30)

	if have_rpy:
	    y = data[:,0]
	    x1 = data[:,1]
	    x2 = data[:,2]
	    x3 = data[:,3]
	    x4 = data[:,4]
	    rpy.set_default_mode(rpy.NO_CONVERSION)
	    linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y))
	    rpy.set_default_mode(rpy.BASIC_CONVERSION)
	    print(linear_model.as_py()['coefficients'])
	    summary = rpy.r.summary(linear_model)
	    print(summary)
Example 28
    def __init__(self, y, design, model_type=r.lm, **kwds):
        """ Set up and estimate R model with data and design """
        r.library("MASS")  # still needs to be in test, but also here for
        # logical tests at the end not to show an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = ["x.%d" % (i + 1) for i in range(self.design.shape[1])]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r("y ~ %s-1" % "+".join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        rpy.set_default_mode(rpy.NO_CONVERSION)
        results = self.model_type(self.formula, data=self.frame, **kwds)
        self.robj = results  # keep the Robj model so it can be
        # used in the tests
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

        #        coeffs = self.results['coefficients']
        #        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        self.nobs = len(self.results["residuals"])
        if isinstance(self.results["residuals"], dict):
            self.resid = np.zeros((len(list(self.results["residuals"].keys()))))
            for i in list(self.results["residuals"].keys()):
                self.resid[int(i) - 1] = self.results["residuals"][i]
        else:
            self.resid = self.results["residuals"]
        self.fittedvalues = self.results["fitted.values"]
        self.df_resid = self.results["df.residual"]
        self.params = rsum["coefficients"][:, 0]
        self.bse = rsum["coefficients"][:, 1]
        self.bt = rsum["coefficients"][:, 2]
        try:
            self.pvalues = rsum["coefficients"][:, 3]
        except:
            pass
        self.rsquared = rsum.setdefault("r.squared", None)
        self.rsquared_adj = rsum.setdefault("adj.r.squared", None)
        self.aic_R = rsum.setdefault("aic", None)
        self.fvalue = rsum.setdefault("fstatistic", None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault("value", None)  # for wls
        df = rsum.setdefault("df", None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0] - 1  # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault("cov.unscaled", None)
        self.bcov = rsum.setdefault("cov.scaled", None)
        if "sigma" in rsum:
            self.scale = rsum["sigma"]
        elif "dispersion" in rsum:
            self.scale = rsum["dispersion"]
        else:
            self.scale = None
        self.llf = r.logLik(results)

        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
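A hedged construction sketch for the wrapper above (RModel is a hypothetical name for the class; only its __init__ is shown). The design matrix already carries the intercept column, which is why the formula ends in '-1':

import numpy as np

from rpy import r

# hypothetical data: 20 observations, intercept plus one regressor in the design
y = np.random.randn(20)
design = np.column_stack([np.ones(20), np.random.randn(20)])

model = RModel(y, design, model_type=r.lm)
print(model.params)   # coefficient estimates taken from R's summary()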
Example 29
from rpy import r, set_default_mode, NO_CONVERSION, PROC_CONVERSION

import numpy as N
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d

from ppgplot_spb import *
import gaussian

set_default_mode(PROC_CONVERSION)

def bootdensity(data, min, max, nboot, ci):
    """ Calculate density and confidence intervals on density
    for a 1D array of points.  Bandwidth is selected automatically.
    """
    r("""
      limdensity <- function(data, weights=NULL, bw="nrd0")
      {
        density(data, from=%f, to=%f, weights=weights, bw=bw)
      }
      """%(min, max))
    density = r.limdensity(data)
    xdens = N.array(density['x'])
    ydens = N.array(density['y'])
    bw = density['bw']
    #print 'bandwidth:', bw
    ydensboot = N.zeros((nboot, len(xdens)), N.float)
    ndata = len(data)
    ran = N.random.uniform(0, ndata, (nboot,ndata)).astype(N.int)
    for i in range(nboot):
        den = r.limdensity(data[ran[i]])
Example 30
def DecisionTree(output_dir, elev_filename, landcover_filename, river_filename):
    """
  This module generate decision tree used to allocate landcover classes.
  It imports rpart library from rpy package.
  Reads the training data, creates a sample data and use rpart libray to build decision tree. 
  """
    rpy.r.library("rpart")  # rpart library used for creating Decision tree
    # Read Elevation Data from ascii file
    file_name = "training_data/%s" % (elev_filename)
    Elev_arr = numpy.loadtxt(file_name, unpack=True)
    # Read Landcover Data from ascii file
    file_name = "training_data/%s" % (landcover_filename)
    Landcover = numpy.loadtxt(file_name, unpack=True)
    # Read River Data from ascii file
    file_name = "training_data/%s" % (river_filename)
    River = numpy.loadtxt(file_name, unpack=True)
    # Compute City block distance from River data
    River_dist_arr = city_block_dist.CityBlock(River)
    # Compute Slope and Aspect from Elevation data
    (Slope_arr, Aspect_arr) = Slope_aspect.Slope_aspect(Elev_arr)

    (x_len, y_len) = Elev_arr.shape
    no_of_veg_class = 10  # no of vegetation class in Landcover matrix
    # Generating Lists for different Landcover classes
    # Create list of lists to hold pixels of each landcover class - no of list in
    # list L is equal to no_of_veg_class
    L = []
    for i in range(0, no_of_veg_class):
        L.append([])

    # Now append the pixel co-ordinates into respective list of lists
    for i in range(1, x_len - 1):  # Ignoring boundary cells
        for j in range(1, y_len - 1):  # because we don't have slope and aspect for them
            # nodata values already gets handled since we are ignoring it
            if Landcover[i][j] == 0:
                L[0].append((i, j))
            elif Landcover[i][j] == 1:
                L[1].append((i, j))
            elif Landcover[i][j] == 2:
                L[2].append((i, j))
            elif Landcover[i][j] == 3:
                L[3].append((i, j))
            elif Landcover[i][j] == 4:
                L[4].append((i, j))
            elif Landcover[i][j] == 5:
                L[5].append((i, j))
            elif Landcover[i][j] == 6:
                L[6].append((i, j))
            elif Landcover[i][j] == 7:
                L[7].append((i, j))
            elif Landcover[i][j] == 8:
                L[8].append((i, j))
            elif Landcover[i][j] == 9:
                L[9].append((i, j))

    # Sample Data for decision tree
    # normalizing elevation data
    minimum_elev = numpy.min(Elev_arr)
    factor = numpy.max(Elev_arr) - minimum_elev
    Elev_arr = (Elev_arr[:, :] - minimum_elev) * 100 / factor

    # Create various list to hold sample training data
    Elevation = []
    Slope = []
    RiverDistance = []
    Aspect_x = []
    Aspect_y = []
    Class = []

    # Now sampling the data
    for i in range(0, no_of_veg_class):
        if len(L[i]) < 500:
            limit = len(L[i])
        else:
            limit = 500
        for j in range(0, limit):
            Elevation.append(int(Elev_arr[L[i][j][0]][L[i][j][1]]))
            Slope.append(int(Slope_arr[L[i][j][0]][L[i][j][1]]))
            RiverDistance.append(int(River_dist_arr[L[i][j][0]][L[i][j][1]]))
            Aspect_x.append(int(Aspect_arr[L[i][j][0]][L[i][j][1]][0]))
            Aspect_y.append(int(Aspect_arr[L[i][j][0]][L[i][j][1]][1]))
            Class.append(i)

    # create dictionary of sample data which will be needed to generate decision tree
    traing_data = {
        "Elevation": Elevation,
        "Slope": Slope,
        "RiverDistance": RiverDistance,
        "Aspect_x": Aspect_x,
        "Aspect_y": Aspect_y,
        "Class": Class,
    }

    # write dictionary into pickle file for further use(reusability)
    output = open("decision_tree.pkl", "wb")
    pickle.dump(traing_data, output)
    output.close()

    rpy.set_default_mode(rpy.NO_CONVERSION)
    print "Creating Decision tree"
    # Using rpart create the decision tree
    fit = rpy.r.rpart(
        formula="Class ~ Elevation + RiverDistance + Slope + Aspect_x + Aspect_y", data=traing_data, method="class"
    )

    # output a png image of the decision tree
    file_name = "%s/DecisionTree.png" % (output_dir)
    rpy.r.png(file_name)
    rpy.r.plot(fit)
    rpy.r.text(fit)
    rpy.r.dev_off()
Example 31
def calc_stratified_rates(summset,
                          popset,
                          conflev=0.95,
                          basepop=100000,
                          timeinterval='years',
                          ci_method='dobson',
                          popset_popcol='_freq_',
                          debug=False):
    """
    Calculate stratified population rates

    summset     is a stratified summary dataset of counts of events for
                the population-of-interest
    popset      is the stratified population counts for the
                population-of-interest
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) '
                    'methods for confidence intervals currently '
                    'implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)

        # The population dataset must have at least as many dimensions as
        # summary dataset. Any additional axes are eliminated by summing.
        # Any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)

        # Manufacture a CrossTab for the result
        result = summtab.empty_copy()

        basepop = float(basepop)

        for table, name, n_add, l_add in just_freq_tables(summtab):
            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)

            strata_rate = summfreq / popfreq

            result.add_table('summfreq' + n_add,
                             data=summfreq,
                             label='Events' + l_add)
            result.add_table('popfreq' + n_add,
                             data=popfreq,
                             label='Person-' + timeinterval + ' at risk' +
                             l_add)
            result.add_table('sr' + n_add,
                             data=strata_rate * basepop,
                             label='Strata-specific Rate per ' +
                             '%d' % basepop + ' person-' + timeinterval +
                             l_add)

            if alpha is not None:
                # CIs for stratified rates
                summfreq_shape = summfreq.shape
                summfreq_flat = MA.ravel(summfreq)
                assert popfreq.shape == summfreq.shape
                popfreq_flat = MA.ravel(popfreq)

                sr_ll = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ul = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ll_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)
                sr_ul_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)

                for i, v in enumerate(summfreq_flat):
                    try:
                        if v == 0:
                            sr_ll[i] = 0.0
                        else:
                            sr_ll[i] = (
                                (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) /
                                popfreq_flat[i]) * basepop
                        sr_ul[i] = (
                            (r.qchisq(1. - alpha / 2., df=2.0 *
                                      (v + 1)) / 2.0) /
                            popfreq_flat[i]) * basepop
                    except:
                        sr_ll[i] = 0.0
                        sr_ul[i] = 0.0
                        sr_ll_mask[i] = 1
                        sr_ul_mask[i] = 1

                sr_ll = MA.array(sr_ll, mask=sr_ll_mask, typecode=MA.Float64)
                sr_ul = MA.array(sr_ul, mask=sr_ul_mask, typecode=MA.Float64)
                sr_ll.shape = summfreq_shape
                sr_ul.shape = summfreq_shape

                sr_base = 'Stratified rate %s%%' % (100.0 * conflev)
                result.add_table('sr_ll' + n_add,
                                 data=sr_ll,
                                 label=sr_base + ' lower confidence limit ' +
                                 l_add)
                result.add_table('sr_ul' + n_add,
                                 data=sr_ul,
                                 label=sr_base + ' upper confidence limit ' +
                                 l_add)

    finally:
        set_default_mode(r_mode)
    soom.info('calc_stratified_rates took %.03f' % (time.time() - st))
    name = 'stratified_rates_' + summset.name
    label = 'Stratified Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
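The per-stratum limits computed in the loop above are the exact Poisson confidence limits obtained through the chi-squared quantile identity, scaled by the person-time denominator and basepop. A sketch of that one step in isolation (the helper name is an assumption; r.qchisq is used exactly as in the loop):

from rpy import r


def poisson_rate_ci(events, person_time, basepop=100000, alpha=0.05):
    """Exact Poisson confidence limits for a rate per `basepop`, via the qchisq identity."""
    if events == 0:
        lower = 0.0
    else:
        lower = (r.qchisq(alpha / 2.0, df=2.0 * events) / 2.0) / person_time * basepop
    upper = (r.qchisq(1.0 - alpha / 2.0, df=2.0 * (events + 1)) / 2.0) / person_time * basepop
    return lower, upper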
Example 32
    def gls_via_R(cls, non_NA_genotype_ls, non_NA_phenotype_ls, non_NA_phenotype2count=None, variance_matrix=None):
        """
		2009-12-23
			generalized least squares model via calling the equivalent function in R.
			
		"""
        genotype_matrix = cls.createDesignMatrix(non_NA_genotype_ls)  # no need to add a constant vector.

        if hasattr(cls, "corStruct"):
            corStruct = cls.corStruct
        else:
            if variance_matrix is not None:
                corStruct = cls.generateCorStructForGLSFromVarianceMatrix(variance_matrix)
                setattr(cls, "corStruct", corStruct)
            else:
                corStruct = None
                # 2008-11-10 do linear regression by R
        genotype_var = numpy.var(genotype_matrix[:, 0])  # 2008-11-10 var=\sum(x_i-\bar{x})^2/(n-1)
        rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
        rpy.r.library("nlme")

        # data_frame = rpy.r.as_data_frame({"phenotype":non_NA_phenotype_ls, "genotype":rpy.r.as_factor(genotype_matrix[:,1])})
        formula_list = []
        data_frame_dict = {"phenotype": non_NA_phenotype_ls}
        for i in range(genotype_matrix.shape[1]):
            var_name = "genotype%s" % i
            formula_list.append(var_name)
            data_frame_dict.update({var_name: genotype_matrix[:, i]})
        data_frame = rpy.r.as_data_frame(data_frame_dict)
        formula = "phenotype~%s" % "+".join(formula_list)

        lm_result = rpy.r.gls(rpy.r(formula), data=data_frame, correlation=corStruct)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state
        summary_stat = rpy.r.summary(lm_result)

        rpy.set_default_mode(rpy.NO_CONVERSION)
        summary_stat1 = rpy.r.summary(lm_result)

        rpy.set_default_mode(rpy.VECTOR_CONVERSION)
        summary_stat2 = rpy.r.summary(lm_result)

        rpy.set_default_mode(rpy.TOP_CONVERSION)
        summary_stat3 = rpy.r.summary(lm_result)

        # 06-30-05	index 0 in summary_stat['coefficients'] is intercept
        coeff_list = []
        coeff_p_value_list = []
        for i in range(len(summary_stat["coefficients"])):
            coeff_list.append(summary_stat["coefficients"][i][0])  # 0 is the coefficient
            coeff_p_value_list.append(summary_stat["coefficients"][i][-1])  # -1 is the corresponding p-value
            # 06-30-05	fill in other coefficients based on bit_string, NOTE i+1
        pvalue = coeff_p_value_list[1]
        residuals = summary_stat["deviance"]
        geno_effect_var = genotype_var * coeff_list[1] * coeff_list[1] * (no_of_rows - 1)
        var_perc = geno_effect_var / (residuals + geno_effect_var)

        pdata = PassingData(
            pvalue=pvalue, var_perc=var_perc, coeff_list=coeff_list, coeff_p_value_list=coeff_p_value_list
        )
        return pdata
Example 33
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree",
                      dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branch lengths")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not possible to use
                        # cor.test or lsfit directly, as you have to perform a
                        # regression through the origin.

                        # uncomment to check pearson r against phylip's value
                        ## r = calculateCorrelationCoefficient( columns[x], columns[y] )

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"),
                                            data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y], options.value_format %
                             phy_r, options.pvalue_format % p, code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(map(lambda x: options.value_format % x, d)) +
                        "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength, c1,
                                        c2)
                        else:
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1, node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            map(lambda x: options.value_format % x,
                                contrasts[node_id])),
                    ))

    E.Stop()
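
The comment in the pearson/spearman branch above points out that cor.test cannot be used directly because the contrast regression must pass through the origin, so the p-value is taken from an R lm fit of "y ~ x - 1". As a cross-check, the same no-intercept slope test can be computed without R; this is a minimal numpy/scipy sketch, not part of the original script, and the helper name is illustrative.

import numpy
from scipy import stats

def origin_regression_pvalue(x, y):
    """Slope and two-sided p-value for a regression through the origin (y ~ x - 1)."""
    x = numpy.asarray(x, dtype=float)
    y = numpy.asarray(y, dtype=float)
    beta = numpy.dot(x, y) / numpy.dot(x, x)        # least-squares slope, no intercept
    resid = y - beta * x
    df = len(x) - 1                                 # one parameter estimated
    se = numpy.sqrt(numpy.dot(resid, resid) / df / numpy.dot(x, x))
    t_stat = beta / se
    p_value = 2.0 * stats.t.sf(abs(t_stat), df)     # two-sided p-value
    return beta, p_value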
Ejemplo n.º 34
0
def calc_indirectly_std_ratios(summset,
                               popset,
                               stdsummset,
                               stdpopset,
                               conflev=0.95,
                               baseratio=100,
                               timeinterval='years',
                               popset_popcol='_freq_',
                               stdpopset_popcol='_stdpop_',
                               ci_method='daly',
                               debug=False):
    """
    Calculate Indirectly Standardised Population Event Ratios

    - summset is a summary dataset of counts of events for the
      population-of-interest being compared to the standard population.
    - popset is the stratified population counts for the
      population-of-interest
    - stdsummset is a summary dataset of counts of events for the
      standard population
    - stdpopset is the stratified population counts for the standard
      population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method != 'daly':
        raise Error("Only Daly method for confidence intervals "
                    "currently implemented")
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))
    if not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' %
                    (stdpopset.label or stdpopset.name, stdpopset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        shape = shape_union(stdsummset, summset)

        summtab = CrossTab.from_summset(summset, shaped_like=shape)

        stdsummtab = CrossTab.from_summset(stdsummset, shaped_like=shape)

        stdpoptab = CrossTab.from_summset(stdpopset, shaped_like=shape)
        stdpoptab.collapse_axes_not_in(stdsummtab)

        stdsummtab.replicate_axes(shape)
        stdpoptab.replicate_axes(shape)

        poptab = CrossTab.from_summset(popset, shaped_like=shape)
        poptab.collapse_axes_not_in(shape)
        if poptab.get_shape() != stdsummtab.get_shape():
            raise Error(
                'Observed population does not have all the required columns')
        popfreq = poptab[popset_popcol].data.astype(MA.Float64)

        result = stdsummtab.empty_copy()
        result.add_table('popfreq',
                         data=popfreq,
                         label='Total person-' + timeinterval + ' at risk')

        expected_cols = []
        for table, name, n_add, l_add in just_freq_tables(stdsummtab):
            stdsummfreq = stdsummtab[name].data.astype(MA.Float64)
            stdpopfreq = stdpoptab[stdpopset_popcol].data.astype(MA.Float64)
            std_strata_rates = stdsummfreq / stdpopfreq
            strata_expected_freq = std_strata_rates * popfreq
            #            print stdsummfreq[0,0,0], stdpopfreq[0,0,0], popfreq[0,0,0]
            result.add_table('expected' + n_add,
                             data=strata_expected_freq,
                             label='Expected events' + l_add)
            expected_cols.append('expected' + n_add)

        result.collapse_axes_not_in(summtab)

        axis = 0
        baseratio = float(baseratio)

        for table, name, n_add, l_add in just_freq_tables(summtab):
            observed = table.data.astype(Numeric.Float64)
            result.add_table('observed' + n_add,
                             data=observed,
                             label='Observed events' + l_add)

            expected = result['expected' + n_add].data

            isr = observed / expected
            result.add_table('isr' + n_add,
                             data=isr * baseratio,
                             label='Indirectly Standardised Event Ratio')

            # Confidence Intervals
            if alpha is None or name != '_freq_':
                # Can only calculate confidence intervals on freq cols
                continue

            conflev_l = (1 - conflev) / 2.0
            conflev_u = (1 + conflev) / 2.0

            # get shape of observed
            observed_shape = observed.shape
            # flattened version
            observed_flat = MA.ravel(observed)

            # sanity check on shapes - should be the same!
            assert expected.shape == observed.shape

            # flattened version of expected
            expected_flat = MA.ravel(expected)

            # lists to hold results
            isr_ll = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ul = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ll_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)
            isr_ul_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)

            obs_mask = MA.getmaskarray(observed_flat)
            exp_mask = MA.getmaskarray(expected_flat)

            for i, v in enumerate(observed_flat):
                if obs_mask[i] or exp_mask[i]:
                    isr_ll[i] = 0.0
                    isr_ul[i] = 0.0
                    isr_ll_mask[i] = 1
                    isr_ul_mask[i] = 1
                else:
                    if v == 0.:
                        obs_ll = 0.0
                        obs_ul = -math.log(1 - conflev)
                    else:
                        obs_ll = r.qgamma(conflev_l, v, scale=1.)
                        obs_ul = r.qgamma(conflev_u, v + 1., scale=1.)
                    isr_ll[i] = obs_ll / expected_flat[i]
                    isr_ul[i] = obs_ul / expected_flat[i]

            isr_ll = MA.array(isr_ll, typecode=MA.Float64, mask=isr_ll_mask)
            isr_ul = MA.array(isr_ul, typecode=MA.Float64, mask=isr_ul_mask)
            isr_ll.shape = observed_shape
            isr_ul.shape = observed_shape

            isr_base = 'ISR %d%%' % (100.0 * conflev)
            result.add_table('isr_ll' + n_add,
                             data=isr_ll * baseratio,
                             label=isr_base + ' lower confidence limit' +
                             l_add)
            result.add_table('isr_ul' + n_add,
                             data=isr_ul * baseratio,
                             label=isr_base + ' upper confidence limit' +
                             l_add)
    finally:
        set_default_mode(r_mode)
    soom.info('calc_indirectly_std_ratios took %.03f' % (time.time() - st))
    name = 'indir_std_ratios_' + summset.name
    label = 'Indirectly Standardised Ratios for ' + (summset.label
                                                     or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)

    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
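
The confidence-interval loop above is the Daly method in miniature: each observed count gets a gamma-based interval from R's qgamma, which is then divided by the expected count. Below is a single-cell sketch of that step using the same rpy call as the loop above; observed and expected are plain floats here, and the helper is an illustration rather than part of the library.

import math
from rpy import r

def daly_isr_limits(observed, expected, conflev=0.95):
    """Daly-style confidence limits for one indirectly standardised ratio cell."""
    conflev_l = (1 - conflev) / 2.0
    conflev_u = (1 + conflev) / 2.0
    if observed == 0.0:
        obs_ll = 0.0
        obs_ul = -math.log(1 - conflev)                 # special case for a zero count
    else:
        obs_ll = r.qgamma(conflev_l, observed, scale=1.)
        obs_ul = r.qgamma(conflev_u, observed + 1., scale=1.)
    return obs_ll / expected, obs_ul / expected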
Ejemplo n.º 35
0
def main(parameter_file):
    """
    It performs the following actions:
    1. Gets the parameters required for the simulation from the parameter.yaml file.
    2. Calls DEM_creator() to generate the DEM grid.
    3. Erosion modelling
    4. Flow modelling
    5. Landcover class allocation using a decision tree
    6. Geometric feature development
    7. Road mapping
    """
    time1 = time.time()
    #*****************parameter handling *************************************
    # Get the parameters from parameter.yaml file
    yaml_file  = open(parameter_file, 'r')
    stream     = yaml.load(yaml_file)
    resolution = stream['resolution']
    H          = stream['H']
    H_wt       =  stream['H_wt']
    seed       = stream['seed']
    sigma      = stream['sigma']
    elev_range = stream['elev_range']
    max_level  = stream['max_level']
    DEMcreator_option = stream['DEMcreator_option']
    output_dir = stream['output_dir']
    river_drop = stream['river_drop']
    Erosion_permission = stream['Erosion_permission']
    decision_tree = stream['decision_tree']
    counter    = stream['counter']
    elev_filename      = stream['training_elev_filename']
    landcover_filename = stream['training_landcover_filename']
    river_filename     = stream['training_river_filename']
    no_of_veg_class    = stream['no_of_veg_class']
    min_area     = stream['min_area']
    max_area     = stream['max_area']
    aspect_ratio = stream['aspect_ratio']
    agri_area_limit = stream['agri_area_limit']
    yaml_file.close() 

    #**************************print statistics***********************************
    print ("Running simulation with follwing parameters")
    print ("H: %s" % H)
    print ("H_wt: %s" % H_wt)
    print ("seed: %s" % seed)
    print ("sigma: %f" % sigma) 
    print ("elev_range: %s" % elev_range)
    print ("max_level: %s" % max_level)
    print ("DEMcreator_option: %s" % DEMcreator_option)
    print ("output_dir: %s" % output_dir)
    print ("River drop: %d" % river_drop)
    print ("counter: %d" % counter)
    print ("no of vegetation class %d" % no_of_veg_class)
    print ("min area: %f" % min_area)
    print ("max area: %f" % max_area)
    print ("aspect ratio: %f" % aspect_ratio)
    print ("agricultural area limit: %f" % agri_area_limit)
    gradient = 0   #fixed for now TODO incorporate gradient in next version
    #*****************************DEM generation************************************
    # Generate DEM using FM2D/SS algorithm by calling DEM_creator(args...) function
    DEM_Result = DEM_generator.DEM_creator(H, H_wt, seed, elev_range,sigma,gradient,max_level, DEMcreator_option)
    pathname = os.path.dirname(sys.argv[0])
    fullpath = os.path.abspath(pathname)
    filename = fullpath + "/" + output_dir
    if not os.path.exists(filename):
        os.makedirs(filename)          # create output directory if it doesn't exist 
    DEM_arr = DEM_Result[0]
    DEM_Result = 0 #free space
    #****************************region adjustment***********************************
    # We create a temporary region that is only valid in this python session
    g.use_temp_region()
    rows = DEM_arr.shape[0]
    cols = DEM_arr.shape[1]
    n = 4928050 #some arbitrary value
    s = n - resolution*rows
    e = 609000  #some arbitrary value
    w = e - resolution*cols
    g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = resolution, rows = rows ,cols = cols)   
    #*************************Flow accumulation with Erosion modelling****************************
    filename = fullpath + "/ascii_files"
    if not os.path.exists(filename):
        os.makedirs(filename)
    if not Erosion_permission:
        counter = 0
        DEM_arr_to_ascii(DEM_arr,resolution)
        g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath +'/'+'ascii_files' +'/DEM.asc', output='test_DEM')
        #Flow computation for massive grids (float version) 
        g.run_command('r.terraflow', overwrite = True, elevation = 'test_DEM@user1', filled = 'flooded_DEM',\
          direction = 'DEM_flow_direction',swatershed = 'DEM_sink_watershed', accumulation = 'DEM_flow_accum', tci = 'DEM_tci')
        g.run_command('r.out.ascii',flags='h',input='DEM_flow_accum@user1',output=fullpath +'/ascii_files'+ '/DEM_flow_accum',null='0')
        f = open(fullpath +'/ascii_files'+ '/DEM_flow_accum', 'r')
        Flow_accum_arr = numpy.loadtxt(f)
        f.close()
    for iteration in range(0,counter):
        DEM_arr_to_ascii(DEM_arr,resolution)
        #Input the DEM ascii file into grass
        g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath +'/'+'ascii_files' +'/DEM.asc', output='test_DEM')
        #Flow computation for massive grids (float version) 
        g.run_command('r.terraflow', overwrite = True, elevation = 'test_DEM@user1', filled = 'flooded_DEM',\
          direction = 'DEM_flow_direction',swatershed = 'DEM_sink_watershed', accumulation = 'DEM_flow_accum', tci = 'DEM_tci')
        g.run_command('r.out.ascii',flags='h',input='DEM_flow_accum@user1',output=fullpath +'/ascii_files'+ '/DEM_flow_accum',null='0')
        f = open(fullpath +'/ascii_files'+ '/DEM_flow_accum', 'r')
        Flow_accum_arr = numpy.loadtxt(f)
        f.close()
        #call erosion modelling function
        DEM_arr = Erosion(Flow_accum_arr, DEM_arr, river_drop)
    output=fullpath +'/'+output_dir+ '/DEM.asc'
    arr_to_ascii(DEM_arr,output)
    output=fullpath +'/'+output_dir+ '/flow_accum.asc'
    arr_to_ascii(Flow_accum_arr,output)
    #****************************landcover allocation using decision tree********************************
    # Get slope and Aspect using grass functions
    g.run_command('r.slope.aspect',overwrite=True,elevation='test_DEM@user1',slope='DEM_Slope',aspect='DEM_Aspect')
    g.run_command('r.out.ascii',flags='h',input='DEM_Slope@user1',output=fullpath + '/ascii_files'+'/DEM_Slope',null='0')
    f = open('ascii_files/DEM_Slope', 'r')
    DEM_Slope_arr = numpy.loadtxt(f)
    f.close()
    g.run_command('r.out.ascii',flags='h',input='DEM_Aspect@user1',output=fullpath +'/ascii_files'+'/DEM_Aspect',null='0')
    f = open('ascii_files/DEM_Aspect', 'r')
    DEM_Aspect_arr = numpy.loadtxt(f)
    f.close()
    Distance_arr = dist.CityBlock(Flow_accum_arr,flag = 0)
    # Normalize the elevation values to use decision tree
    minimum_elev = numpy.min(DEM_arr)
    factor = numpy.max(DEM_arr) - minimum_elev
    Elev_arr = (DEM_arr[:,:] - minimum_elev)*100/factor
    # Create various list to hold test data
    Elevation = []
    Slope = []
    RiverDistance = []
    Aspect = []
    # Append the data into respective list
    x_len = DEM_arr.shape[0]
    y_len = DEM_arr.shape[1]
    for i in range(0,x_len):
        for j in range(0,y_len):
            Elevation.append(int(Elev_arr[i][j]))
            Slope.append(int(DEM_Slope_arr[i][j]))
            RiverDistance.append(int(Distance_arr[i][j]))
            Aspect.append(int(DEM_Aspect_arr[i][j]))
    Elev_arr = 0 #free space
    DEM_slope_arr = 0 #free space
    DEM_Aspect_arr = 0 #free space
    Distance_arr = 0 #free space
    # Create a dictionary on which to apply R's predict command
    Test_data = {'Elevation':Elevation ,'Slope':Slope ,'RiverDistance':RiverDistance,'Aspect':Aspect}
    #free spaces
    Elevation = []
    Slope = []
    RiverDistance = []
    Aspect = []
    # create decision tree from training data
    fit = DecisionTree(no_of_veg_class,elev_filename, landcover_filename, river_filename,decision_tree)
    g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = resolution, rows = rows ,cols = cols)
    # Allocate vegetation array for holding predicted landcover values
    Veg_arr = numpy.zeros(DEM_arr.shape, dtype = "uint8")
    rpy.r.library("rpart")
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    # values contain probability values of the predicted landcover classes
    values = rpy.r.predict(fit,newdata=Test_data,method="class")
    Test_data = 0 #free space
    x_len = Veg_arr.shape[0]
    y_len = Veg_arr.shape[1]
    for i in range(0,x_len):
        for j in range(0,y_len):
            # Get the class having max probability for each test data point
            a = ndimage.maximum_position(values[i*y_len + j])
            Veg_arr[i,j] = (a[0]) # Assign them some value to facilitate visualization
    values = 0 #free space
    filename=fullpath +'/'+output_dir+ "/landcover.asc"
    arr_to_ascii(Veg_arr,filename)
    # Allocate and initialize Suitability map
    Suitability = numpy.zeros( DEM_arr.shape, dtype = "uint8")
    for i in range(0,DEM_arr.shape[0]):
        for j in range(0,DEM_arr.shape[1]):
            #TODO can use mask here, needs to be generalised
            if Veg_arr[i][j] == 0: # Ignore
                Suitability[i][j] = 0 
            elif Veg_arr[i][j] == 25: # Deciduous woodland
                Suitability[i][j] = 60 
            elif Veg_arr[i][j] == 50: # Coniferous woodland
                Suitability[i][j] = 55 
            elif Veg_arr[i][j] == 75: # Agriculture including pasture
                Suitability[i][j] = 98 
            elif Veg_arr[i][j] == 100: # Semi-natural grassland
                Suitability[i][j] = 90 
            elif Veg_arr[i][j] == 125: # Bog and swamp
                Suitability[i][j] = 50
            elif Veg_arr[i][j] == 150: # Heath
                Suitability[i][j] = 75 
            elif Veg_arr[i][j] == 175: # Montane habitat
                Suitability[i][j] = 20 
            elif Veg_arr[i][j] == 200: # Rock and quarry
                Suitability[i][j] = 30 
            elif Veg_arr[i][j] == 225: # Urban
                Suitability[i][j] = 80
    Display_fields = Geometry.GeometricFeature(Suitability, min_area,max_area ,aspect_ratio ,agri_area_limit)
    f = open('fields_arr', 'w')
    numpy.save(f,Display_fields)
    f.close()
    pylab.imsave(output_dir+"/fields.png",Display_fields)
    time2 = time.time()
    print "time taken", time2-time1
    shutil.rmtree(fullpath+'/ascii_files')
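
The nested loop over the grid above assigns each cell the landcover class with the highest predicted probability, one cell at a time. Assuming the BASIC_CONVERSION result of rpy.r.predict is a (rows*cols, n_classes) array of class probabilities, the same assignment can be written as one vectorised step; this is a sketch only, with an illustrative helper name.

import numpy

def assign_classes(class_probs, grid_shape):
    """Pick the most probable class per sample and reshape to the DEM grid."""
    probs = numpy.asarray(class_probs, dtype=float)    # shape: (rows*cols, n_classes)
    return numpy.argmax(probs, axis=1).astype("uint8").reshape(grid_shape)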
Ejemplo n.º 36
0
def DecisionTree(no_of_veg_class, elev_filename, landcover_filename, river_filename,decision_tree):
    """
    Generates a decision tree given the training data
    Input:
        no_of_veg_class: Number of landcover classes in the training data
        elev_filename  : Name of the training file with elevation values
        landcover_filename: Name of the training file with landcover values
        river_filename: Name of the training file with river presence/absence information
    """
    rpy.r.library("rpart")
    g.use_temp_region()
    #TODO generalize no of rows and columns for training data
    rows = 2001
    cols = 1201
    resolution = 50
    n = 4928050 #some arbitrary value
    s = n - resolution*rows
    e = 609000  #some arbitrary value
    w = e - resolution*cols
    g.run_command('g.region', flags = 'ap', n = n ,s = s, e = e, w = w,res = 50, rows = 2001 ,cols = 1201)
    pathname = os.path.dirname(sys.argv[0])        
    fullpath = os.path.abspath(pathname)
    if decision_tree:
    # Convert ascii DEM into grass raster map that will help in getting slope and aspect
        file_name = "/Training/%s" % elev_filename
        g.run_command('r.in.ascii', overwrite = True, flags='i', input = fullpath + file_name, output='training_DEM')
    # TODO read training DEM into array without writing another file 
        g.run_command('r.out.ascii',flags='h',input='training_DEM@user1',output=fullpath + '/ascii_files'+'/training_DEM',null='0')
        f = open('ascii_files/training_DEM', 'r')
        Elev_arr = numpy.loadtxt(f)
        f.close() 
        file_name = "Training/%s" % (landcover_filename)
        Landcover = numpy.loadtxt(file_name) # Read Landcover Data from ascii file
        file_name = "Training/%s" % (river_filename)
        River     = numpy.loadtxt(file_name) # Read River Data from ascii file
        River_dist_arr = dist.CityBlock(River,flag = 1)   #Compute distance from River data
        g.run_command('r.slope.aspect',overwrite=True,elevation='training_DEM@user1',slope='Slope',aspect='Aspect')
        g.run_command('r.out.ascii',flags='h',input='Slope@user1',output=fullpath + '/ascii_files'+'/Slope',null='0')
        f = open('ascii_files/Slope', 'r')
        Slope_arr = numpy.loadtxt(f)  #Get Slope into an array
        f.close()
        g.run_command('r.out.ascii',flags='h',input='Aspect@user1',output=fullpath +'/ascii_files'+ '/Aspect',null='0')
        f = open('ascii_files/Aspect', 'r')
        Aspect_arr = numpy.loadtxt(f) #Get Aspect into an array
        f.close()

        (x_len,y_len) = Elev_arr.shape
        L = [ [] for i in range(0,no_of_veg_class)]
        for i in range(1,x_len-1):   # Ignoring boundary cells 
            for j in range(1,y_len-1):
                # Append the pixel co-ordinates to the respective list of lists;
                # nodata values are handled automatically since they match no class and are ignored
                for k in range(0, no_of_veg_class):
                    if Landcover[i][j] == k:
                        L[k].append( (i,j) )
                        break

        minimum_elev = numpy.min(Elev_arr)
        factor = numpy.max(Elev_arr) - minimum_elev      # normalize elevation data
        Elev_arr = (Elev_arr[:,:]-minimum_elev)*100/factor
    # Sample the training data for the decision tree; using the entire dataset would take too long to process
    # various lists to hold sample training data
        Elevation = []
        Slope = []
        RiverDistance = []
        Aspect = []
        Class = []
    # Sample the data
        for i in range(0,no_of_veg_class):  
            if len(L[i]) < 1000:
                limit = len(L[i])
            else:
                limit = 1000
            for j in range(0,limit):
                Elevation.append( int(Elev_arr[ L[i][j][0] ][ L[i][j][1] ]))
                Slope.append(int(Slope_arr[ L[i][j][0] ][ L[i][j][1] ]))
                RiverDistance.append(int(River_dist_arr[ L[i][j][0] ][ L[i][j][1] ]))
                Aspect.append(int(Aspect_arr[ L[i][j][0] ][ L[i][j][1] ]))
                Class.append(i)
    #free space
        Elev_arr = 0
        Slope_arr = 0
        River_dist_arr = 0
        Aspect_arr = 0
    # create dictionary of sample data which will be needed to generate decision tree 
        training_data = {'Elevation':Elevation,'Slope':Slope,'RiverDistance':RiverDistance,'Aspect':Aspect,'Class':Class}
    #free space
        Elevation = []
        Slope = []
        RiverDistance = []
        Aspect = []
        Class = []
        f = open( 'save.p', 'w' )
        pickle.dump(training_data, f )
        f.close()
    else:
        f = open( 'save.p', 'r' )
        training_data = pickle.load( f )
        f.close()
    rpy.set_default_mode(rpy.NO_CONVERSION)
    #Using rpart create the decision tree
    fit = rpy.r.rpart(formula='Class ~ Elevation + RiverDistance + Slope + Aspect',data=training_data,method="class")
    training_data = 0
    #rpy.r.png("DecisionTree.png")  # Output a png image of the decision tree
    #rpy.r.plot(fit)
    #rpy.r.text(fit)
    #rpy.r.dev_off()
    return fit
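
The function above relies on rpy's two conversion modes: the rpart fit is made under NO_CONVERSION so the model comes back as an opaque R object that can later be handed to r.predict, while predictions are read under BASIC_CONVERSION so they arrive as plain Python values. A minimal sketch of that fit/predict pattern with toy data follows; the column names and values are made up for illustration.

import rpy
from rpy import r

r.library("rpart")

# toy training data; in the function above this is the sampled terrain dictionary
training = {'x1': [1, 2, 3, 4, 5, 6], 'x2': [0, 1, 0, 1, 0, 1], 'Class': [0, 0, 0, 1, 1, 1]}

rpy.set_default_mode(rpy.NO_CONVERSION)      # keep the fit as an R object reference
fit = r.rpart(formula='Class ~ x1 + x2', data=training, method="class")

rpy.set_default_mode(rpy.BASIC_CONVERSION)   # convert predictions back to Python values
pred = r.predict(fit, newdata={'x1': [2, 5], 'x2': [1, 0]}, method="class")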
Ejemplo n.º 37
0
    def __init__(self, y, design, model_type=r.lm, **kwds):
        ''' Set up and estimate R model with data and design '''
        r.library('MASS')  # still needed in the tests, but also loaded here so the
        # model_type comparisons at the end do not raise an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = [
            'x.%d' % (i + 1) for i in range(self.design.shape[1])
        ]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r('y ~ %s-1' % '+'.join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        rpy.set_default_mode(rpy.NO_CONVERSION)
        results = self.model_type(self.formula, data=self.frame, **kwds)
        self.robj = results  # keep the Robj model so it can be
        # used in the tests
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

        #        coeffs = self.results['coefficients']
        #        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        self.nobs = len(self.results['residuals'])
        if isinstance(self.results['residuals'], dict):
            self.resid = np.zeros((len(self.results['residuals'].keys())))
            for i in self.results['residuals'].keys():
                self.resid[int(i) - 1] = self.results['residuals'][i]
        else:
            self.resid = self.results['residuals']
        self.fittedvalues = self.results['fitted.values']
        self.df_resid = self.results['df.residual']
        self.params = rsum['coefficients'][:, 0]
        self.bse = rsum['coefficients'][:, 1]
        self.bt = rsum['coefficients'][:, 2]
        try:
            self.pvalues = rsum['coefficients'][:, 3]
        except:
            pass
        self.rsquared = rsum.setdefault('r.squared', None)
        self.rsquared_adj = rsum.setdefault('adj.r.squared', None)
        self.aic_R = rsum.setdefault('aic', None)
        self.fvalue = rsum.setdefault('fstatistic', None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault('value', None)  # for wls
        df = rsum.setdefault('df', None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0] - 1  # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault('cov.unscaled', None)
        self.bcov = rsum.setdefault('cov.scaled', None)
        if 'sigma' in rsum:
            self.scale = rsum['sigma']
        elif 'dispersion' in rsum:
            self.scale = rsum['dispersion']
        else:
            self.scale = None
        self.llf = r.logLik(results)

        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
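
The constructor above does its extraction from the converted output of r.summary: under BASIC_CONVERSION the 'coefficients' entry comes back as an array whose columns are the estimate, standard error, t value and p-value. A minimal sketch of that extraction step on a toy lm fit (toy data, illustrative only):

import rpy
from rpy import r

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [1.1, 1.9, 3.2, 3.9, 5.1]

rpy.set_default_mode(rpy.NO_CONVERSION)      # keep the fit as an R object
fit = r.lm(r("y ~ x"), data=r.data_frame(x=x, y=y))
rpy.set_default_mode(rpy.BASIC_CONVERSION)   # convert the summary to Python values

rsum = r.summary(fit)
coef = rsum['coefficients']                  # rows: terms; columns: estimate, std. error, t, p
params, bse, pvalues = coef[:, 0], coef[:, 1], coef[:, 3]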
Ejemplo n.º 38
0
if __name__ == '__main__':
    modules = os.listdir('.')

    if '--random' in sys.argv:
        shuffle=True
        sys.argv.remove('--random')
    else:
        shuffle=False

    if '--loop' in sys.argv:
        niter = 1000
        sys.argv.remove('--loop')
    else:
        niter = 1


    modules = filter( lambda x: not x.endswith('.pyc'), modules)
    modules = filter( lambda x: x.startswith('test_'), modules)
    modules = filter( lambda x: x.endswith('.py'), modules)

    print "Modules to be tested:", modules
    
    for iter in range(niter):
        if shuffle: random.shuffle(modules)
        for module in modules:
            name = module[:-3]
            print 'Testing:', name
            rpy.set_default_mode(rpy.NO_DEFAULT)  # reset to base case
            run(name)
Ejemplo n.º 39
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson",
                               "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not
                        # possible to use cor.test or lsfit directly,
                        # as you have to perform a regression through
                        # the origin.

                        # uncomment to check pearson r against
                        # phylip's value r =
                        # calculateCorrelationCoefficient(columns[x],
                        # columns[y])

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(
                            R("y ~ x - 1"), data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(
                            map(lambda x: options.value_format % x, d)) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(
                                node_id, node.data.branchlength, c1, c2)
                        else:
                            assert(node_id == tree.root)
                            assert(len(node.succ) == 3)
                            update_data(
                                node_id, node.data.branchlength,
                                node.succ[0], node.succ[1])
                            update_data(
                                max_index - 1, node.data.branchlength,
                                node_id, node.succ[2])
                    else:
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write(
                    "node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[
                            node_id],
                        "\t".join(
                            map(lambda x: options.value_format % x,
                                contrasts[node_id])),
                    ))

    E.Stop()
Ejemplo n.º 40
0
    def rpart_fit(self, known_data, parameter_list, bit_string="11111"):
        """
		11-09-05
			1st use known_data to get the fit model
			2nd use the fit model to do prediction on all_data, result is prob for each class
		11-09-05 add rpart_cp
		11-17-05
			add loss_matrix, prior_prob
			return two pred
		11-23-05
			split fit and predict. rpart_fit_and_predict() is split into rpart_fit() and rpart_predict()
		11-27-05
			r cleanup
		03-17-06
			use parameter_list instead
		"""
        if self.debug:
            sys.stderr.write("Doing rpart_fit...\n")
            # 03-17-06
        rpart_cp, loss_matrix, prior_prob = parameter_list

        # 11-27-05 r cleanup
        from rpy import r

        r.library("rpart")

        coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size", "gradient"]
        formula_list = []
        for i in range(len(bit_string)):
            if bit_string[i] == "1":
                formula_list.append(coeff_name_list[i])
                # 11-17-05 transform into array
        known_data = array(known_data)

        set_default_mode(NO_CONVERSION)
        data_frame = r.as_data_frame(
            {
                "p_value": known_data[:, 0],
                "recurrence": known_data[:, 1],
                "connectivity": known_data[:, 2],
                "cluster_size": known_data[:, 3],
                "gradient": known_data[:, 4],
                "is_correct": known_data[:, -1],
            }
        )
        if prior_prob:
            prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)),
            )
        else:
            fit = r.rpart(
                r("is_correct~%s" % "+".join(formula_list)),
                data=data_frame,
                method="class",
                control=r.rpart_control(cp=rpart_cp),
                parms=r.list(loss=r.matrix(loss_matrix)),
            )
        del data_frame
        if self.debug:
            sys.stderr.write("Done rpart_fit.\n")
        return fit
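
As the changelog docstring notes, the tuning values arrive through parameter_list in the order rpart complexity parameter, loss matrix, prior probability, while bit_string selects which of the five features enter the formula. A hedged usage sketch follows; the instance and data names and the parameter values are illustrative only, not taken from the snippet.

# illustrative parameter_list for rpart_fit above
rpart_cp = 0.01                    # complexity parameter handed to rpart_control
loss_matrix = [[0, 1], [2, 0]]     # illustrative loss matrix passed through r.matrix()
prior_prob = 0.3                   # prior for the positive class; a falsy value skips priors

fit = predictor.rpart_fit(known_data, [rpart_cp, loss_matrix, prior_prob],
                          bit_string="11111")      # predictor / known_data: illustrative names
pred = predictor.rpart_predict(fit, unknown_data)  # unknown_data: illustrative name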
Ejemplo n.º 41
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"

        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "

        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "

        shstr += " --permutationFilter=" + str(permutationFilter) + " "

        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(
                srWindowSize) + " "

        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)

                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)

                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)

                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][
                    2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(
        accIndicesToKeep
    ), "accessions removed from genotype data, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(
        phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1

    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:", kinshipDatafile

    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)

        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(
            accIndicesToKeep
        ), "accessions removed from kinship genotype data, leaving", len(
            accIndicesToKeep), "accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append(
                    (kinshipSnpsds[0].accessions.index(acc), i))
                i += 1

        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(
                missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds

    else:
        kinshipSnpsds = newSnpsds

    print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Load additional regression co-factors from a pickle file
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co-factors into both the full and the null model
            for factor in co_factors:
                d0[factor] = co_factors[factor]
                dh[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect()  #Calling the garbage collector in an attempt to free memory.
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]
                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 /
                                           (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(
                str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," +
                str(chr_pos_pval[2]) + "," + str(stat[0]) + "," +
                str(stat[1]) + "\n")
        f.close()

        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
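
# The loop above scores each SNP with a nested-model F-test: the residual sum of squares
# of the full model (SNP plus any co-factors) is compared against that of the null model
# without the SNP.  A minimal standalone sketch of the same pattern, assuming rpy is
# installed and using made-up phenotype and genotype vectors (illustrative values only):

import rpy

phen = [2.3, 1.9, 3.1, 2.8, 2.2, 3.0, 2.5, 3.2]  # hypothetical phenotype values
snp = [0, 0, 1, 1, 0, 1, 0, 1]                   # hypothetical SNP coded 0/1

rpy.set_default_mode(rpy.NO_CONVERSION)
full_frame = rpy.r.as_data_frame({"phen": phen, "snp": snp})
null_frame = rpy.r.as_data_frame({"phen": phen})
aov_full = rpy.r.aov(rpy.r("phen ~ ."), data=full_frame)  # phen ~ snp
aov_null = rpy.r.aov(rpy.r("phen ~ 1"), data=null_frame)  # intercept only
rpy.set_default_mode(rpy.BASIC_CONVERSION)
s_full = rpy.r.summary(aov_full)
s_null = rpy.r.summary(aov_null)

rss_full = s_full['Sum Sq'][-1]  # residual sum of squares of the full model
rss_null = s_null['Sum Sq']      # a single float when only the residual row exists
if type(rss_null) != float:
    rss_null = rss_null[-1]

# F = (RSS_null - RSS_full) / (RSS_full / df_residual); the full model has one extra parameter
df_residual = len(phen) - 2
f_stat = (rss_null - rss_full) / (rss_full / df_residual)
pval = rpy.r.pf(f_stat, 1, df_residual, lower_tail=False)
print "F =", f_stat, "p =", pval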
Ejemplo n.º 42
0
def calc_directly_std_rates(summset,
                            popset,
                            stdpopset=None,
                            conflev=0.95,
                            basepop=100000,
                            timeinterval='years',
                            ci_method='dobson',
                            popset_popcol='_freq_',
                            stdpopset_popcol='_stdpop_',
                            axis=0,
                            debug=False):
    """
    Calculate Directly Standardised Population Rates

    summset     is a summary dataset of counts of events for the
                population-of-interest being compared to the standard
                population.  
    popset      is the stratified population counts for the
                population-of-interest
    stdpopset   is the stratified population counts for the standard
                population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) methods '
                    'for confidence intervals currently implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))
    if stdpopset is not None and not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' %
                    (stdpopset.label or stdpopset.name, stdpopset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)

        if stdpopset is not None:
            # Then attempt to do the same to the stdpop data, summing any
            # axes not required and replicating any missing ones until we have
            # an array the same shape as the summtab array.
            stdtab = CrossTab.from_summset(stdpopset, shaped_like=summtab)
            stdtab.collapse_axes_not_in(summtab)
            stdtab.replicate_axes(summtab)
            stdpop = stdtab[stdpopset_popcol].data.astype(Numeric.Float64)

        # The population dataset must have at least as many dimensions as the
        # summary dataset. Any additional axes are eliminated by summing;
        # any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)

        # Manufacture a CrossTab for the result, with one less axis (the first)
        result = summtab.empty_copy()
        del result.axes[axis]

        if stdpopset is not None:
            sum_stdpop = sumaxis(stdpop)
            stdwgts = stdpop / sum_stdpop
            stdpop_sq = stdpop**2
            sum_stdpop_sq = sum_stdpop**2
            ffwi = stdwgts / popfreq
            ffwm = MA.maximum(MA.ravel(ffwi))

        basepop = float(basepop)

        for table, name, n_add, l_add in just_freq_tables(summtab):

            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)
            strata_rate = summfreq / popfreq

            result.add_table('summfreq' + n_add,
                             data=sumaxis(summfreq, axis),
                             label='Total events' + l_add)
            result.add_table('popfreq' + n_add,
                             data=sumaxis(popfreq, axis),
                             label='Total person-' + timeinterval +
                             ' at risk' + l_add)

            if stdpopset is not None:
                std_strata_summfreq = summfreq * Numeric.where(
                    MA.getmask(stdwgts), 0., 1.)
                wgtrate = strata_rate * stdwgts
                result.add_table('std_strata_summfreq' + n_add,
                                 data=sumaxis(std_strata_summfreq, axis),
                                 label="Total events in standard strata" +
                                 l_add)

            # Crude rate
            cr = sumaxis(summfreq, axis) / sumaxis(popfreq, axis) * basepop
            result.add_table('cr' + n_add,
                             data=cr,
                             label='Crude Rate per ' + '%d' % basepop +
                             ' person-' + timeinterval + l_add)

            if alpha is not None:
                # CIs for crude rate
                count = sumaxis(summfreq, axis)
                count_shape = count.shape
                count_flat = MA.ravel(count)
                totpop = sumaxis(popfreq, axis)
                assert totpop.shape == count.shape
                totpop_flat = MA.ravel(totpop)

                cr_ll = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ul = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ll_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)
                cr_ul_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)

                for i, v in enumerate(count_flat):
                    try:
                        if v == 0:
                            cr_ll[i] = 0.0
                        else:
                            cr_ll[i] = (
                                (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) /
                                totpop_flat[i]) * basepop
                        cr_ul[i] = (
                            (r.qchisq(1. - alpha / 2., df=2.0 *
                                      (v + 1)) / 2.0) /
                            totpop_flat[i]) * basepop
                    except:
                        cr_ll[i] = 0.0
                        cr_ul[i] = 0.0
                        cr_ll_mask[i] = 1
                        cr_ul_mask[i] = 1

                cr_ll = MA.array(cr_ll, mask=cr_ll_mask, typecode=MA.Float64)
                cr_ul = MA.array(cr_ul, mask=cr_ul_mask, typecode=MA.Float64)
                cr_ll.shape = count_shape
                cr_ul.shape = count_shape

                cr_base = 'Crude rate %d%%' % (100.0 * conflev)
                result.add_table('cr_ll' + n_add,
                                 data=cr_ll,
                                 label=cr_base + ' lower confidence limit ' +
                                 l_add)
                result.add_table('cr_ul' + n_add,
                                 data=cr_ul,
                                 label=cr_base + ' upper confidence limit ' +
                                 l_add)

            if stdpopset is not None:

                # Directly Standardised Rate
                dsr = sumaxis(wgtrate, axis)
                result.add_table('dsr' + n_add,
                                 data=dsr * basepop,
                                 label='Directly Standardised Rate per ' +
                                 '%d' % basepop + ' person-' + timeinterval +
                                 l_add)

                # Confidence Intervals
                if alpha is None or name != '_freq_':
                    # Can only calculate confidence intervals on freq cols
                    continue

                if ci_method == 'dobson':
                    # Dobson et al method
                    # see: Dobson A, Kuulasmaa K, Eberle E, Scherer J. Confidence intervals for weighted sums
                    # of Poisson parameters. Statistics in Medicine, Vol. 10, 1991, pp. 457-62.
                    # se_wgtrate = summfreq*((stdwgts/(popfreq/basepop))**2)
                    se_wgtrate = summfreq * ((stdwgts / (popfreq))**2)
                    stderr = stdpop_sq * strata_rate * (1.0 - strata_rate)
                    se_rate = sumaxis(se_wgtrate, axis)
                    sumsei = sumaxis(stderr, axis)
                    total_freq = sumaxis(std_strata_summfreq, axis)
                    # get shape of total_freq
                    total_freq_shape = total_freq.shape

                    total_freq_flat = MA.ravel(total_freq)

                    # flat arrays to hold results and associated masks
                    l_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    u_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    l_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)
                    u_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)

                    conflev_l = (1 - conflev) / 2.0
                    conflev_u = (1 + conflev) / 2.0

                    for i, v in enumerate(total_freq_flat):
                        try:
                            if v == 0.:
                                u_lam[i] = -math.log(1 - conflev)
                                l_lam[i] = 0.0
                            else:
                                l_lam[i] = r.qgamma(conflev_l, v, scale=1.)
                                u_lam[i] = r.qgamma(conflev_u,
                                                    v + 1.,
                                                    scale=1.)
                        except:
                            l_lam[i] = 0.0
                            u_lam[i] = 0.0
                            l_lam_mask[i] = 1
                            u_lam_mask[i] = 1

                    l_lam = MA.array(l_lam,
                                     mask=l_lam_mask,
                                     typecode=MA.Float64)
                    u_lam = MA.array(u_lam,
                                     mask=u_lam_mask,
                                     typecode=MA.Float64)
                    l_lam.shape = total_freq_shape
                    u_lam.shape = total_freq_shape
                    dsr_ll = dsr + (((se_rate / total_freq)**0.5) *
                                    (l_lam - total_freq))
                    dsr_ul = dsr + (((se_rate / total_freq)**0.5) *
                                    (u_lam - total_freq))

                elif ci_method == 'ff':
                    # Fay and Feuer method
                    # see: Fay MP, Feuer EJ. Confidence intervals for directly standardized rates:
                    # a method based on the gamma distribution. Statistics in Medicine 1997 Apr 15;16(7):791-801.

                    ffvari = summfreq * ffwi**2.0
                    ffvar = sumaxis(ffvari, axis)

                    dsr_flat = Numeric.ravel(MA.filled(dsr, 0))
                    dsr_shape = dsr.shape

                    ffvar_flat = Numeric.ravel(MA.filled(ffvar, 0))

                    # flat arrays to hold results and associated masks
                    dsr_ll = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ul = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ll_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)
                    dsr_ul_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)

                    for i, y in enumerate(dsr_flat):
                        try:
                            dsr_ll[i] = (ffvar_flat[i] / (2.0 * y)) * r.qchisq(
                                alpha / 2., df=(2.0 * (y**2.) / ffvar_flat[i]))
                            dsr_ul[i] = ((ffvar_flat[i] + (ffwm**2.0)) /
                                         (2.0 * (y + ffwm))) * r.qchisq(
                                             1. - alpha / 2.,
                                             df=((2.0 * ((y + ffwm)**2.0)) /
                                                 (ffvar_flat[i] + ffwm**2.0)))
                        except:
                            dsr_ll[i] = 0.0
                            dsr_ul[i] = 0.0
                            dsr_ll_mask[i] = 1
                            dsr_ul_mask[i] = 1
                    dsr_ll = MA.array(dsr_ll,
                                      mask=dsr_ll_mask,
                                      typecode=MA.Float64)
                    dsr_ul = MA.array(dsr_ul,
                                      mask=dsr_ul_mask,
                                      typecode=MA.Float64)
                    dsr_ll.shape = dsr_shape
                    dsr_ul.shape = dsr_shape

                result.add_table('dsr_ll' + n_add,
                                 data=dsr_ll * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                 '% lower confidence limit' + l_add)
                result.add_table('dsr_ul' + n_add,
                                 data=dsr_ul * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                 '% upper confidence limit' + l_add)

    finally:
        set_default_mode(r_mode)
    soom.info('calc_directly_std_rates took %.03f' % (time.time() - st))
    if stdpopset is not None:
        name = 'dir_std_rates_' + summset.name
        label = 'Directly Standardised Rates for ' + (summset.label
                                                      or summset.name)
    else:
        name = 'crude_rates_' + summset.name
        label = 'Crude Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
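
# For a single table, the Dobson limits computed above reduce to gamma-based limits on
# the total observed count, rescaled by the standard error of the weighted rate.  A
# minimal sketch with made-up stratum counts and weights, assuming rpy is available
# (the numbers are illustrative only):

import math
from rpy import r, set_default_mode, BASIC_CONVERSION

set_default_mode(BASIC_CONVERSION)

# hypothetical strata: events, person-time at risk, standard-population weights
summfreq = [12.0, 30.0, 9.0]
popfreq = [1500.0, 2200.0, 800.0]
stdwgts = [0.3, 0.5, 0.2]
conflev = 0.95
basepop = 100000.0

strata_rate = [s / p for s, p in zip(summfreq, popfreq)]
dsr = sum([w * rt for w, rt in zip(stdwgts, strata_rate)])  # directly standardised rate
se_rate = sum([s * (w / p) ** 2 for s, w, p in zip(summfreq, stdwgts, popfreq)])
total_freq = sum(summfreq)

# gamma-based limits on the total count (Dobson et al. 1991)
l_lam = r.qgamma((1 - conflev) / 2.0, total_freq, scale=1.)
u_lam = r.qgamma((1 + conflev) / 2.0, total_freq + 1., scale=1.)

dsr_ll = dsr + math.sqrt(se_rate / total_freq) * (l_lam - total_freq)
dsr_ul = dsr + math.sqrt(se_rate / total_freq) * (u_lam - total_freq)
print "DSR per", int(basepop), "person-years:", dsr * basepop, dsr_ll * basepop, dsr_ul * basepop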
Ejemplo n.º 43
0
	def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
		"""
		02-28-05
			linear model fitting here
		
		03-08-05
			grouping and accumulating before do linear model fitting, see log of 2005, 
			section 'linear model overfitting' for detail.
		03-27-05
			Use glm of R to do logistic regression
		06-30-05
			add cluster_size
			add bit_string to control which parameter should be enabled.
		07-04-05
			add connectivity_2nd
		07-06-05
			add logistic
		11-09-05 extend coeff_list and coeff_p_value_list
			restructure the list, go_no2lm_results[go_no]
			
			--data_prepare
			--submit
		"""
		sys.stderr.write("Linear Model Fitting...\n")
		go_no2lm_results = {}
		
		#06-30-05	setup the formula_list based on bit_string
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
		formula_list = []
		for i in range(len(bit_string)):
			if bit_string[i] == '1':
				formula_list.append(coeff_name_list[i])
		
		for (go_no,data) in go_no2prediction_space.iteritems():
			sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no))
			#11-09-05 extend coeff_list and coeff_p_value_list
			coeff_list = [0]*7	#intercept, p_value, recurrence, connectivity, cluster_size
			coeff_p_value_list = [1]*7
			index = 0	#06-30-05	the pointer for summary_stat
			
			if len(data)<=50:
				#too few data points
				continue
			#convert it to a 2d array
			data = array(data)
			"""
			data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))"%(repr(list(data[:,0]))[1:-1], \
				repr(list(data[:,1]))[1:-1], repr(list(data[:,2]))[1:-1], repr(list(data[:,3]))[1:-1]))
			lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)")
			significance_dict = r("summary(lm_result)")
			print significance_dict['coefficients']
			"""
			set_default_mode(NO_CONVERSION) #04-07-05
			data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \
				"cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]})	#06-30-05	-1 denotes is_correct
			if self.logistic:
				lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, family=r("binomial"))
			else:
				lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame)	#06-30-05 use formula_list
			set_default_mode(BASIC_CONVERSION) #04-07-05
			#04-07-05 r.summary() requires lm_result in NO_CONVERSION state
			summary_stat = r.summary(lm_result)
			if self.debug:
				print "everything about coefficients from function", go_no, "is"
				print summary_stat['coefficients']	#p-values of coefficients
			"""
			#04-07-05 convert to python dictionary form
			lm_result = lm_result.as_py()
			coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"], \
				lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"], \
				lm_result["coefficients"]["cluster_size"], \
				summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],\
				summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],\
				summary_stat['coefficients'][4][-1], 1]
				#the last entry is score_cut_off, replaced later in get_score_cut_off()
				#06-30-05	add corresponding p-values
			"""
			#06-30-05	0 in summary_stat['coefficients'] is intercept
			coeff_list[0] = summary_stat['coefficients'][0][0]	#0 is the coefficient
			coeff_p_value_list[0] = summary_stat['coefficients'][0][-1]	#-1 is the corresponding p-value
			#06-30-05	fill in other efficients based on bit_string, NOTE i+1
			for i in range(len(bit_string)):
				if bit_string[i] == '1':
					index+=1
					coeff_list[i+1] = summary_stat['coefficients'][index][0]	#0 is the coefficient
					coeff_p_value_list[i+1] = summary_stat['coefficients'][index][-1]	#-1 is the corresponding p-value
			#11-09-05 restructure the following list
			go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]	#the last entry is score_cut_off, replaced later in get_score_cut_off()
		sys.stderr.write("done.\n")
		return go_no2lm_results
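
# A stripped-down, standalone sketch of the glm call used in lm_fit above, assuming rpy
# and numarray are installed; the rows below are made up purely for illustration:

from numarray import array
from rpy import r, set_default_mode, NO_CONVERSION, BASIC_CONVERSION

# hypothetical rows: p_value, recurrence, connectivity, is_correct
data = array([[0.01, 0.8, 0.5, 1],
              [0.20, 0.3, 0.2, 0],
              [0.05, 0.6, 0.4, 0],
              [0.40, 0.2, 0.1, 0],
              [0.02, 0.7, 0.6, 1],
              [0.15, 0.5, 0.3, 1],
              [0.30, 0.4, 0.3, 1],
              [0.25, 0.1, 0.2, 0]])

set_default_mode(NO_CONVERSION)
data_frame = r.as_data_frame({"p_value": data[:, 0], "recurrence": data[:, 1],
                              "connectivity": data[:, 2], "is_correct": data[:, -1]})
# logistic regression of is_correct on the three cluster statistics
lm_result = r.glm(r("is_correct~p_value+recurrence+connectivity"),
                  data=data_frame, family=r("binomial"))
set_default_mode(BASIC_CONVERSION)
summary_stat = r.summary(lm_result)
print summary_stat['coefficients']  # one row per term: estimate, std. error, z value, p-value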
Ejemplo n.º 44
0
def make_L(
    data,
    direction='S',
    z=None,
):
    """ Define the along track distance from one reference

        direction define the cardinal direction priority (N,S,W or E).
         S means that the reference will be the southern most point

        z define the bathymetry, if defined, the closest point to that
         bathymetry will be the reference. In case of cross this bathymetry
         more than once, the direction criteria is used to distinguish.
    """
    from fluid.common.distance import distance
    all_cycles_data = join_cycles(data)

    if z is None:
        import rpy
        #for t in topex.invert_keys(data):
        for t in all_cycles_data:
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"),
                                    data=rpy.r.data_frame(
                                        x=all_cycles_data[t]['Longitude'],
                                        y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef = rpy.r.coef(linear_model)
            if direction == 'S':
                lat0 = all_cycles_data[t]['Latitude'].min() - 1
                lon0 = (lat0 - coef['(Intercept)']) / coef['x']
                L_correction = distance(all_cycles_data[t]['Latitude'],
                                        all_cycles_data[t]['Longitude'], lat0,
                                        lon0).min()
            for c in invert_keys(data)[t]:
                data[c][t]['L'] = distance(data[c][t]['Latitude'],
                                           data[c][t]['Longitude'], lat0,
                                           lon0) - L_correction
    # This bathymetric method was only copied from old code. It should at least
    #  be revised, if not removed.
    else:  # z was given, so use the closest point to the -z isobath as the reference
        import rpy
        for t in all_cycles_data:
            # First define the near coast values.
            idSouth = numpy.argmin(all_cycles_data[t]['Latitude'])
            L_tmp = distance(all_cycles_data[t]['Latitude'],
                             all_cycles_data[t]['Longitude'],
                             all_cycles_data[t]['Latitude'][idSouth],
                             all_cycles_data[t]['Longitude'][idSouth])
            idNearCoast = L_tmp.data < 400e3
            if min(all_cycles_data[t]['Bathy'][idNearCoast]) > -z:
                idNearCoast = L_tmp.data < 600e3
            # Then calculate the distance to a reference
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"),
                                    data=rpy.r.data_frame(
                                        x=all_cycles_data[t]['Longitude'],
                                        y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef = rpy.r.coef(linear_model)
            lat0 = all_cycles_data[t]['Latitude'].min() - 1
            lon0 = (lat0 - coef['(Intercept)']) / coef['x']
            #L = distance(,lon,lat0,lon0)
            #
            #id0 = numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast]))
            idref = numpy.argmin(
                numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast] + z))
            #L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref],all_cycles_data[t]['Latitude'][idNearCoast][idref],all_cycles_data[t]['Longitude'][idNearCoast][idref])
            L_correction = distance(
                all_cycles_data[t]['Latitude'][idNearCoast][idref],
                all_cycles_data[t]['Longitude'][idNearCoast][idref], lat0,
                lon0)
            for c in topex.invert_keys(data)[t]:
                #data[c][t]['L'] = distance(data[c][t]['Latitude'],data[c][t]['Longitude'],all_cycles_data[t]['Latitude'][idNearCoast][id0],all_cycles_data[t]['Longitude'][idNearCoast][id0]) - L_correction
                data[c][t]['L'] = distance(data[c][t]['Latitude'],
                                           data[c][t]['Longitude'], lat0,
                                           lon0) - L_correction
    #
    return
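
# The along-track reference used above is the point one degree south of the southernmost
# observation, extrapolated along the track's fitted line.  A minimal sketch of that step
# with made-up coordinates, assuming rpy is installed:

import rpy

# hypothetical along-track coordinates
lon = [-40.0, -39.5, -39.0, -38.5, -38.0]
lat = [-25.0, -24.0, -23.0, -22.0, -21.0]

rpy.set_default_mode(rpy.NO_CONVERSION)
linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=lon, y=lat))
rpy.set_default_mode(rpy.BASIC_CONVERSION)
coef = rpy.r.coef(linear_model)

lat0 = min(lat) - 1                              # one degree south of the track
lon0 = (lat0 - coef['(Intercept)']) / coef['x']  # longitude of the fitted line at lat0
print "reference point:", lat0, lon0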
Ejemplo n.º 45
0
	is_correct_list = []
	for row in reader:
		p_value, recurrence, connectivity, cluster_size, gradient, gene_no, go_no, is_correct = row
		data.append([float(p_value), float(recurrence), float(connectivity), float(cluster_size), float(gradient), int(gene_no), int(go_no), int(is_correct)])
	del reader
	return data, is_correct_list

known_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.known'
unknown_fname = '/tmp/hs_fim_92m5x25bfsdfl10q0_7gf1.unknown'

known_data, known_is_correct_list = read_data(known_fname)
unknown_data, unknown_is_correct_list = read_data(unknown_fname)

from numarray import array
from rpy import r, set_default_mode,NO_CONVERSION,BASIC_CONVERSION
set_default_mode(NO_CONVERSION)
#pack data into data_frame
known_data = array(known_data)
known_data_frame = r.as_data_frame({"p_value":known_data[:,0], "recurrence":known_data[:,1], "connectivity":known_data[:,2], \
	"cluster_size":known_data[:,3], "gradient":known_data[:,4]})
unknown_data = array(unknown_data)
unknown_data_frame = r.as_data_frame({"p_value":unknown_data[:,0], "recurrence":unknown_data[:,1], "connectivity":unknown_data[:,2], \
	"cluster_size":unknown_data[:,3], "gradient":unknown_data[:,4]})
#start to call randomF.r to run randomForest
r.library('randomForest')
r.source('randomF.r')
#rf_model still needs to be in pure R object
rf_model = r.randomF(known_data_frame, known_data[:,-1])

set_default_mode(BASIC_CONVERSION)
unknown_pred = r.predictRandomF(rf_model, unknown_data_frame)
Ejemplo n.º 46
0
	# if you have rpy installed, use it to test the results
	have_rpy = False
	try:
	    print "\n"
	    print "="*30
	    print "Validating OLS results in R"
	    print "="*30

	    import rpy
	    have_rpy = True
	except ImportError:
	    print "\n"
	    print "="*30
	    print "Validating OLS-class results in R"
	    print "="*30
	    print "rpy is not installed"
	    print "="*30

	if have_rpy:
	    y = data[:,0]
	    x1 = data[:,1]
	    x2 = data[:,2]
	    x3 = data[:,3]
	    x4 = data[:,4]
	    rpy.set_default_mode(rpy.NO_CONVERSION)
	    linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y))
	    rpy.set_default_mode(rpy.BASIC_CONVERSION)
	    print linear_model.as_py()['coefficients']
	    summary = rpy.r.summary(linear_model)
	    print summary
Ejemplo n.º 47
0
def krige_to_grid(grid_fname, obs_x, obs_y, obs_data, vgm_par):
    """Interpolate point data onto a grid using Kriging.

    Interpolate point data onto a regular rectangular grid of square cells using
    Kriging with a predefined semi-variogram.  The observed data locations must
    be specified in the same projection and coordinate system as the grid, which
    is defined in an ArcGIS raster file.

    Parameters
    ----------
    grid_fname : string
        Filename of an ArcGIS float grid raster defining the required grid to
        Krige onto.  All cells are included regardless of their value.
    obs_x : array_like
        The x coordinates of the observation locations.
    obs_y : array_like
        The y coordinates of the observation locations.
    obs_data : array_like
        The data values at the observation locations.
    vgm_par : dict
        A dictionary describing the semi-variogram model.  Required keys are:
        'model' can be one of {'Lin', 'Exp', 'Sph', 'Gau'}
        'nugget' must be a scalar
        'range' must be a scalar
        'sill' must be a scalar

    Returns
    -------
    kriged_est : 2darray
        A 2D array containing the Kriged estimates at each point on the
        specified rectangular grid.

    Notes
    -----
    This function requires that R, RPy and the R gstat library are correctly
    installed.

    """
    grid, headers = arcfltgrid.read(grid_fname)
    cols = headers[0]
    rows = headers[1]
    x0 = headers[2]
    y0 = headers[3]
    cell_size = headers[4]
    # TO DO: adjust x0, y0 by 0.5*cell_size if llcorner..

    # define the grid (pixel centres)
    xt, yt = np.meshgrid(
        np.linspace(x0, x0 + (cols - 1) * cell_size, num=cols),
        np.linspace(y0 + (rows - 1) * cell_size, y0, num=rows))

    xt = xt.flatten()
    yt = yt.flatten()

    # Krige using gstat via RPy
    r.library('gstat')
    rpy.set_default_mode(rpy.NO_CONVERSION)

    obs_frame = r.data_frame(x=obs_x, y=obs_y, data=obs_data)
    target_grid = r.data_frame(x=xt, y=yt)

    v = r.vgm(vgm_par['sill'], vgm_par['model'], vgm_par['range'],
              vgm_par['nugget'])

    result = r.krige(r('data ~ 1'),
                     r('~ x + y'),
                     obs_frame,
                     target_grid,
                     model=v)

    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    result = result.as_py()

    kriged_est = np.array(result['var1.pred'])
    kriged_est = kriged_est.reshape(rows, cols)

    return kriged_est
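
# A hypothetical call of krige_to_grid, assuming R, RPy and the gstat library are installed
# and that an ArcGIS float grid raster exists on disk; the filename and variogram parameters
# below are placeholders, not values from any real dataset:

import numpy as np

# hypothetical observation locations (same projection as the grid) and values
obs_x = np.array([1000.0, 2500.0, 4000.0, 3200.0])
obs_y = np.array([2000.0, 3500.0, 1500.0, 2800.0])
obs_data = np.array([12.1, 14.8, 11.3, 13.5])

# semi-variogram description, keyed as described in the docstring above
vgm_par = {'model': 'Sph', 'nugget': 0.1, 'range': 5000.0, 'sill': 2.5}

kriged = krige_to_grid('target_grid.flt', obs_x, obs_y, obs_data, vgm_par)  # placeholder filename
print kriged.shape  # (rows, cols) of the target grid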