def __init__(self, formula_str, df, factors=None,
             resid_formula_str=None, **lmer_opts):
    """Prepare an R data.frame and formula for lme4 fitting."""
    # get the pred_var
    pred_var = formula_str.split('~')[0].strip()

    # convert df to a recarray if it's a dataframe
    if isinstance(df, pd.DataFrame):
        df = df.to_records()

    # add column if necessary
    if pred_var not in df.dtype.names:
        # must add it
        df = append_fields(df, pred_var, [0.0] * len(df), usemask=False)

    # make factor list if necessary
    if factors is None:
        factors = {}
    # add in MissingArg for any potential factor not provided
    for k in df.dtype.names:
        if isinstance(df[k][0], str) and k not in factors:
            factors[k] = MissingArg

    for f in factors:
        if factors[f] is None:
            factors[f] = MissingArg
        # checking for both types of R Vectors for rpy2 variations
        elif (not isinstance(factors[f], Vector) and
              not factors[f] == MissingArg):
            factors[f] = Vector(factors[f])

    # convert the recarray to an R DataFrame (releveling if desired)
    self._rdf = DataFrame({
        k: (FactorVector(df[k], levels=factors[k])
            if (k in factors) or isinstance(df[k][0], str)
            else df[k])
        for k in df.dtype.names})

    # get the column index of the predicted variable
    self._col_ind = list(self._rdf.colnames).index(pred_var)

    # make a formula obj
    self._rformula = Formula(formula_str)

    # make one for resid if necessary
    if resid_formula_str:
        self._rformula_resid = Formula(resid_formula_str)
    else:
        self._rformula_resid = None

    # save the args
    self._lmer_opts = lmer_opts

    # model is null to start
    self._ms = None
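# A minimal, standalone sketch of the recarray -> R data.frame conversion
# pattern this constructor relies on, assuming rpy2 3.x with R available.
# The column names and levels below are hypothetical.
import numpy as np
from rpy2.robjects import DataFrame, FactorVector, FloatVector, StrVector

dat = np.rec.fromarrays(
    [np.array(['a', 'b', 'a', 'b']), np.array([1.2, 0.9, 1.1, 1.4])],
    names='cond,rt')
# relevel 'cond' so 'b' is the reference level, mirroring what the
# `factors` argument above allows via levels=...
rdf = DataFrame({
    'cond': FactorVector(StrVector(dat['cond']),
                         levels=StrVector(['b', 'a'])),
    'rt': FloatVector(dat['rt'])})
print(list(rdf.colnames))  # ['cond', 'rt']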
def qcrop2(xlist, ylist, labels=None, nq=4.):
    """Crop paired x/y data to quantile-based limits, one facet per pair."""
    if labels is None:
        # list() is required in Python 3, where map() returns an iterator
        labels = list(map(str, range(len(xlist))))

    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        # replace out-of-range points with NaN so they drop from plots
        cropx, cropy = zip(*[(nan, nan)
                             if (vy > ymax or vy < ymin or
                                 vx < xmin or vx > xmax)
                             else (vx, vy)
                             for vx, vy in zip(onex, oney)])
        xcrop += cropx
        ycrop += cropy
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)

    df = DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet),
                              levels=StrVector(labels))})
    return df
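# Hypothetical usage of qcrop2, assuming qlim1() (the quantile-based
# limit helper) is defined alongside it; the data and labels are made up.
import numpy as np

xs = [list(np.random.randn(100)), list(np.random.randn(100) + 2.)]
ys = [list(np.random.randn(100)), list(np.random.randn(100) * 3.)]
df = qcrop2(xs, ys, labels=['groupA', 'groupB'])
# df carries both the full ('x', 'y') and cropped ('xcrop', 'ycrop')
# columns plus a 'facet' factor, ready for a faceted ggplot2 scatter.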
def __init__(self, count_matrix, design_matrix, conditions,
             gene_column='id'):
    self.dds = None
    self.deseq_result = None
    self.resLFC = None
    self.comparison = None
    self.normalized_count_matrix = None
    self.gene_column = gene_column
    self.gene_id = count_matrix[self.gene_column]
    self.count_matrix = pandas2ri.py2rpy(
        count_matrix.drop(gene_column, axis=1))

    design_formula = "~ "
    for col in conditions:
        # build an R character vector of the factor levels
        levels = StrVector(design_matrix[col].unique())
        as_factor = r["as.factor"]
        design_matrix[col] = FactorVector(design_matrix[col],
                                          levels=levels)
        design_matrix[col] = as_factor(design_matrix[col])
        design_formula += col + " +"
    # drop the trailing " +"
    design_formula = design_formula[:-2]

    self.design_matrix = pandas2ri.py2rpy(design_matrix)
    self.design_formula = Formula(design_formula)
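# Hypothetical usage, assuming this __init__ belongs to a DESeq2 wrapper
# class (called DESeq2Wrapper here purely for illustration) and that the
# DESeq2 package is installed in R. Depending on the rpy2 version, the
# py2rpy calls above may need to run inside a localconverter block.
import pandas as pd

counts = pd.DataFrame({
    'id': ['gene1', 'gene2'],
    's1': [10, 200], 's2': [12, 180],
    's3': [50, 20], 's4': [45, 30]})
design = pd.DataFrame(
    {'condition': ['ctrl', 'ctrl', 'treated', 'treated']},
    index=['s1', 's2', 's3', 's4'])
dds = DESeq2Wrapper(counts, design, conditions=['condition'],
                    gene_column='id')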
def tupls2RDataframe(data, titles):
    """Build an R data.frame of factors from a list of tuples."""
    # transpose the rows into one list per column
    cols = [[] for _ in titles]
    for datum in data:
        for i, e in enumerate(datum):
            cols[i].append(e)
    # every column becomes a factor
    col_d = {}
    for i, t in enumerate(titles):
        col_d[t] = FactorVector(StrVector(tuple(cols[i])))
    dataf = DataFrame(col_d)
    return dataf
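# Quick usage sketch: since every column comes back as an R factor, this
# helper suits purely categorical data; the rows here are made up.
rows = [('a', 'x'), ('b', 'y'), ('a', 'y')]
df = tupls2RDataframe(rows, titles=['cond', 'resp'])
print(list(df.colnames))  # ['cond', 'resp']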
def FitChiralCont_1Slope_Cross(self, a_r0_sqr, Categorical_Var):
    import rpy2.robjects as robjects
    from rpy2.robjects import FloatVector
    from rpy2.robjects import FactorVector
    from rpy2.robjects.packages import importr

    self.NumSlope = 1
    stats = importr('stats')
    base = importr('base')

    # copy so that overwriting entries below does not mutate the original
    Temp_Jack_Mean_List = list(self.EnsembleJAvgList)
    self.CoeffList = []

    robjects.globalenv["a_r0_sqr"] = FloatVector(a_r0_sqr)
    robjects.globalenv["Spacing"] = FactorVector(Categorical_Var)
    robjects.globalenv["M_pi_sqr"] = FloatVector(self.Indep_Var_List)
    r_weight = [pow(x, 2) for x in self.EnsembleWeight]
    robjects.globalenv["PointValueList"] = FloatVector(self.PointValueList)

    # jackknife loop: refit with each ensemble mean replaced in turn
    for j in range(len(self.EnsembleNameList)):
        for x in self.EnsembleJackList[j]:
            Temp_Jack_Mean_List[j] = x
            robjects.globalenv["TempJackMean"] = FloatVector(
                Temp_Jack_Mean_List)
            Coeff = np.asarray(
                stats.lm("TempJackMean ~ M_pi_sqr*a_r0_sqr",
                         weights=FloatVector(r_weight))[0])
            self.CoeffList.append(Coeff)
        # restore the jackknife means for the next ensemble
        Temp_Jack_Mean_List = list(self.EnsembleJAvgList)

    self.StdErr = self.ComputeStdError(self.CoeffList)
    JackAvg = sum(self.CoeffList) / len(self.CoeffList)
    self.FitAvg = np.asarray(
        stats.lm("PointValueList ~ M_pi_sqr*a_r0_sqr",
                 weights=FloatVector(r_weight))[0])

    self.InterceptError = self.StdErr[1]
    self.SlopeError = self.StdErr[0]
    self.FitCoeff = self.FitAvg
    # TODO: correct for jackknife bias, e.g.
    # self.FitCoeff = self.FitAvg - \
    #     (len(self.CoeffList) - 1) * (JackAvg - self.FitAvg)
    self.Intercept = self.FitCoeff[1]
    self.Slope = self.FitCoeff[0]
    self.PrintResults()
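# Standalone sketch of the weighted-lm-through-rpy2 pattern used above;
# the variable names mirror the globalenv bindings but the data are made
# up. R resolves the formula terms from .GlobalEnv here.
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import FloatVector
from rpy2.robjects.packages import importr

stats = importr('stats')
robjects.globalenv["M_pi_sqr"] = FloatVector([0.1, 0.2, 0.3, 0.4])
robjects.globalenv["a_r0_sqr"] = FloatVector([0.05, 0.05, 0.1, 0.1])
robjects.globalenv["y"] = FloatVector([1.0, 1.1, 1.3, 1.6])
fit = stats.lm("y ~ M_pi_sqr*a_r0_sqr",
               weights=FloatVector([1., 1., 2., 2.]))
# fit[0] is the coefficients component of the lm result:
# (Intercept), M_pi_sqr, a_r0_sqr, M_pi_sqr:a_r0_sqr
print(np.asarray(fit[0]))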
def __init__(self, fe_formula, re_formula, re_group, dep_data, ind_data,
             factors=None, row_mask=None, use_ranks=False, use_norm=True,
             memmap=False, memmap_dir=None, resid_formula=None,
             null_formula=None, num_null_boot=0, svd_terms=None,
             use_ssvd=False,
             # nperms=500,
             nboot=100, n_jobs=1, verbose=10, lmer_opts=None):
    """Prepare the per-group design (A), data (M, D), and observation (O)
    matrices and evaluate the model on the actual data."""
    if verbose > 0:
        sys.stdout.write('Initializing...')
        sys.stdout.flush()
        start_time = time.time()

    # save the formula
    self._formula_str = fe_formula + ' + ' + re_formula

    # see if there's a resid formula
    if resid_formula:
        # the random effects are the same
        self._resid_formula_str = resid_formula + ' + ' + re_formula
    else:
        self._resid_formula_str = None

    # see if there's a null formula
    if null_formula:
        # the random effects are the same
        self._null_formula_str = null_formula + ' + ' + re_formula
    else:
        self._null_formula_str = None
    self._num_null_boot = num_null_boot

    # save whether using ranks
    self._use_ranks = use_ranks

    # see whether to use sparse svd
    self._use_ssvd = use_ssvd

    # see if memmapping
    self._memmap = memmap

    # save job info
    self._n_jobs = n_jobs
    self._verbose = verbose

    # eventually fill the feature shape
    self._feat_shape = None

    # make factor list if necessary
    if factors is None:
        factors = {}

    # fill A, M, O, D
    self._A = {}
    self._M = {}
    self._O = {}
    self._D = {}
    O = []

    # loop over unique grouping var
    self._re_group = re_group
    if isinstance(ind_data, dict):
        # groups are the keys
        self._groups = np.array(list(ind_data.keys()))
    else:
        # groups need to be extracted from the recarray
        self._groups = np.unique(ind_data[re_group])

    for g in self._groups:
        # get that subj's inds
        if isinstance(ind_data, dict):
            # the index is just the group into that dict
            ind_ind = g
        else:
            # select the rows based on the group
            ind_ind = ind_data[re_group] == g

        # process the row mask
        if row_mask is None:
            # no mask, so all good
            row_ind = np.ones(len(ind_data[ind_ind]), dtype=bool)
        elif isinstance(row_mask, dict):
            # pull the row_mask from the dict
            row_ind = row_mask[g]
        else:
            # index into it with ind_ind
            row_ind = row_mask[ind_ind]

        # extract that group's A, M, O
        # first save the observations (rows of A)
        self._O[g] = ind_data[ind_ind][row_ind]
        if use_ranks:
            # loop over non-factors and rank them
            for n in self._O[g].dtype.names:
                if (n in factors) or isinstance(self._O[g][n][0], str):
                    continue
                self._O[g][n] = rankdata(self._O[g][n])
        O.append(self._O[g])

        # eventually allow for dict of data files for dep_data
        if isinstance(dep_data, dict):
            # the index is just the group into that dict
            dep_ind = g
        else:
            # select the rows based on the group
            dep_ind = ind_ind

        # save feature shape if necessary
        if self._feat_shape is None:
            self._feat_shape = dep_data[dep_ind].shape[1:]

        # save D index into data
        self._D[g] = dep_data[dep_ind][row_ind]
        # reshape it to 2D (observations x features)
        self._D[g] = self._D[g].reshape((self._D[g].shape[0], -1))
        if use_ranks:
            if verbose > 0:
                sys.stdout.write('Ranking %s...' % (str(g)))
                sys.stdout.flush()
            for i in range(self._D[g].shape[1]):
                self._D[g][:, i] = rankdata(self._D[g][:, i])

        # copy D to M so we don't have to reshape repeatedly
        self._M[g] = self._D[g].copy()

        # normalize M
        if use_norm:
            self._M[g] -= self._M[g].mean(0)
            self._M[g] /= np.sqrt((self._M[g] ** 2).sum(0))

        # determine A from the model.matrix
        rdf = DataFrame({
            k: (FactorVector(self._O[g][k])
                if k in factors else self._O[g][k])
            for k in self._O[g].dtype.names})

        # model spec as data frame
        ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))
        cols = list(r['names'](ms))
        if svd_terms is None:
            self._svd_terms = [c for c in cols if 'Intercept' not in c]
        else:
            self._svd_terms = svd_terms
        self._A[g] = np.concatenate(
            [np.array(ms.rx(c)) for c in self._svd_terms]).T

        if use_ranks:
            for i in range(self._A[g].shape[1]):
                self._A[g][:, i] = rankdata(self._A[g][:, i])

        # always normalize A
        self._A[g] -= self._A[g].mean(0)
        self._A[g] /= np.sqrt((self._A[g] ** 2).sum(0))

        # memmap if desired
        if self._memmap:
            self._M[g] = _memmap_array(self._M[g], memmap_dir)
            self._D[g] = _memmap_array(self._D[g], memmap_dir)

    # save the list of per-group observations
    self._O = O
    if lmer_opts is None:
        lmer_opts = {}
    self._lmer_opts = lmer_opts
    self._factors = factors

    # prepare for the perms and boots
    self._perms = []
    self._boots = []
    self._tp = []
    self._tb = []

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.write('Processing actual data...')
        sys.stdout.flush()
        start_time = time.time()

    global _global_meld
    _global_meld[id(self)] = self

    # run for actual data (returns both perm and boot vals)
    self._R = None
    self._ss = None
    self._mer = None
    self._mer_null = None
    tp, tb, R, feat_mask, ss, mer, mer_null = _eval_model(
        id(self), None, None)
    self._R = R
    self._tp.append(tp)
    self._tb.append(tb)
    self._feat_mask = feat_mask
    self._ss = ss
    self._mer = mer
    self._mer_null = mer_null

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.flush()
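# Minimal sketch of the model.matrix step used to build A above, with
# made-up column names; it assumes r_model_matrix = r['model.matrix'],
# as in the surrounding module.
import numpy as np
from rpy2.robjects import r, DataFrame, Formula, FactorVector, FloatVector

r_model_matrix = r['model.matrix']
rdf = DataFrame({'cond': FactorVector(['a', 'b', 'a', 'b']),
                 'x': FloatVector([0.1, 0.5, 0.2, 0.9])})
ms = r['data.frame'](r_model_matrix(Formula('~ cond * x'), data=rdf))
cols = list(r['names'](ms))
terms = [c for c in cols if 'Intercept' not in c]
A = np.concatenate([np.array(ms.rx(c)) for c in terms]).T
print(cols, A.shape)  # one column of A per non-intercept term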
def lmer_feature(formula_str, dat, perms=None, val=None,
                 factors=None, **kwargs):
    """
    Run LMER on a number of permutations of the predicted data.
    """
    # get the perm_var
    perm_var = formula_str.split('~')[0].strip()

    # set the val if necessary
    if val is not None:
        dat[perm_var] = val

    # make factor list if necessary
    if factors is None:
        factors = []

    # convert the recarray to an R DataFrame
    rdf = DataFrame({
        k: (FactorVector(dat[k])
            if (k in factors) or isinstance(dat[k][0], str)
            else dat[k])
        for k in dat.dtype.names})

    # get the column index
    col_ind = list(rdf.colnames).index(perm_var)

    # make a formula obj
    rformula = Formula(formula_str)

    # just apply to actual data if no perms
    if perms is None:
        perms = [None]

    # run on each permutation
    tvals = None
    for i, perm in enumerate(perms):
        if perm is not None:
            # set the perm (R indices are 1-based)
            rdf[col_ind] = rdf[col_ind].rx(perm + 1)

        # inside try block to catch convergence errors
        try:
            ms = lme4.lmer(rformula, data=rdf, **kwargs)
        except Exception:
            # leave this permutation as NaN
            continue

        # extract the result
        df = r['data.frame'](r_coef(r['summary'](ms)))
        if tvals is None:
            # init the result recarray from the row names
            rows = list(r['row.names'](df))
            tvals = np.rec.fromarrays(
                [np.ones(len(perms)) * np.nan for ro in range(len(rows))],
                names=','.join(rows))
        tvals[i] = tuple(df.rx2('t.value'))
    return tvals
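# Hypothetical call, assuming lme4 is importable in R, that the module's
# numpy/R conversions are active (as the DataFrame construction above
# requires), and that `dat` has the columns named in the formula. Perms
# are 0-based index arrays; the function shifts them to R's 1-based
# indexing internally.
import numpy as np

dat = np.rec.fromarrays(
    [np.random.randn(40),
     np.repeat(['s1', 's2', 's3', 's4'], 10)],
    names='val,subj')
perms = [np.random.permutation(len(dat)) for _ in range(10)]
tvals = lmer_feature('val ~ (1|subj)', dat, perms=perms)
# tvals is a recarray of t-values, one field per fixed effect and one
# entry per permutation (NaN where lmer failed to converge).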
def __init__(self, fe_formula, re_formula, re_group, dep_data, ind_data,
             factors=None, row_mask=None, dep_mask=None, use_ranks=False,
             use_norm=True, memmap=False, memmap_dir=None,
             resid_formula=None, svd_terms=None, feat_thresh=0.05,
             feat_nboot=1000, do_tfce=False, connectivity=None, shape=None,
             dt=.01, E=2 / 3., H=2.0, n_jobs=1, verbose=10, lmer_opts=None):
    """
    dep_data can be an array or a dict of arrays (possibly memmapped),
    one for each group.

    ind_data can be a rec_array for each group or one large rec_array
    with a grouping variable.
    """
    if verbose > 0:
        sys.stdout.write('Initializing...')
        sys.stdout.flush()
        start_time = time.time()

    # save the formula
    self._formula_str = fe_formula + ' + ' + re_formula

    # see if there's a resid formula
    if resid_formula:
        # the random effects are the same
        self._resid_formula_str = resid_formula + ' + ' + re_formula
    else:
        self._resid_formula_str = None

    # save whether using ranks
    self._use_ranks = use_ranks

    # save the thresh for keeping a feature
    self._feat_thresh = feat_thresh
    self._feat_nboot = feat_nboot
    self._do_tfce = do_tfce
    self._connectivity = connectivity
    self._dt = dt
    self._E = E
    self._H = H

    # see if memmapping
    self._memmap = memmap

    # save job info
    self._n_jobs = n_jobs
    self._verbose = verbose

    # eventually fill the feature shape
    self._feat_shape = None

    # handle the dep_mask
    self._dep_mask = dep_mask

    # make factor list if necessary
    if factors is None:
        factors = {}

    # fill A, M, O, D
    self._A = {}
    self._M = {}
    self._O = {}
    self._D = {}
    O = []

    # loop over unique grouping var
    self._re_group = re_group
    if isinstance(ind_data, dict):
        # groups are the keys
        self._groups = np.array(list(ind_data.keys()))
    else:
        # groups need to be extracted from the recarray
        self._groups = np.unique(ind_data[re_group])

    for g in self._groups:
        # get that subj's inds
        if isinstance(ind_data, dict):
            # the index is just the group into that dict
            ind_ind = g
        else:
            # select the rows based on the group
            ind_ind = ind_data[re_group] == g

        # process the row mask
        if row_mask is None:
            # no mask, so all good
            row_ind = np.ones(len(ind_data[ind_ind]), dtype=bool)
        elif isinstance(row_mask, dict):
            # pull the row_mask from the dict
            row_ind = row_mask[g]
        else:
            # index into it with ind_ind
            row_ind = row_mask[ind_ind]

        # extract that group's A, M, O
        # first save the observations (rows of A)
        self._O[g] = ind_data[ind_ind][row_ind]
        if use_ranks:
            # loop over non-factors and rank them
            for n in self._O[g].dtype.names:
                if (n in factors) or isinstance(self._O[g][n][0], str):
                    continue
                self._O[g][n] = rankdata(self._O[g][n])
        O.append(self._O[g])

        # eventually allow for dict of data files for dep_data
        if isinstance(dep_data, dict):
            # the index is just the group into that dict
            dep_ind = g
        else:
            # select the rows based on the group
            dep_ind = ind_ind

        # save feature shape if necessary
        if self._feat_shape is None:
            self._feat_shape = dep_data[dep_ind].shape[1:]

        # handle the mask
        if self._dep_mask is None:
            self._dep_mask = np.ones(self._feat_shape, dtype=bool)

        # create the connectivity (will mask later)
        if self._do_tfce and self._connectivity is None and \
                (len(self._dep_mask.flatten()) > self._dep_mask.sum()):
            # create the connectivity
            self._connectivity = cluster.sparse_dim_connectivity(
                [cluster.simple_neighbors_1d(n)
                 for n in self._feat_shape])

        # save D index into data (applying row and feature masks;
        # boolean feature indexing also flattens D to 2D)
        self._D[g] = dep_data[dep_ind][row_ind][:, self._dep_mask].copy()
        if use_ranks:
            if verbose > 0:
                sys.stdout.write('Ranking %s...' % (str(g)))
                sys.stdout.flush()
            for i in range(self._D[g].shape[1]):
                # rank it
                self._D[g][:, i] = rankdata(self._D[g][:, i])
                # normalize it to [0, 1]
                self._D[g][:, i] = ((self._D[g][:, i] - 1) /
                                    (len(self._D[g][:, i]) - 1))

        # save M from D so we can have a normalized version
        self._M[g] = self._D[g].copy()

        # remove any NaNs in dep_data
        self._D[g][np.isnan(self._D[g])] = 0.0

        # normalize M
        if use_norm:
            self._M[g] -= self._M[g].mean(0)
            self._M[g] /= np.sqrt((self._M[g] ** 2).sum(0))

        # determine A from the model.matrix
        rdf = DataFrame({
            k: (FactorVector(self._O[g][k])
                if k in factors else self._O[g][k])
            for k in self._O[g].dtype.names})

        # model spec as data frame
        ms = r['data.frame'](r_model_matrix(Formula(fe_formula), data=rdf))
        cols = list(r['names'](ms))
        if svd_terms is None:
            self._svd_terms = [c for c in cols if 'Intercept' not in c]
        else:
            self._svd_terms = svd_terms
        self._A[g] = np.concatenate(
            [np.array(ms.rx(c)) for c in self._svd_terms]).T

        if use_ranks:
            for i in range(self._A[g].shape[1]):
                # rank it
                self._A[g][:, i] = rankdata(self._A[g][:, i])
                # normalize it to [0, 1]
                self._A[g][:, i] = ((self._A[g][:, i] - 1) /
                                    (len(self._A[g][:, i]) - 1))

        # always normalize A
        self._A[g] -= self._A[g].mean(0)
        self._A[g] /= np.sqrt((self._A[g] ** 2).sum(0))

        # memmap if desired
        if self._memmap:
            self._M[g] = _memmap_array(self._M[g], memmap_dir,
                                       unique_id=str(g))
            self._D[g] = _memmap_array(self._D[g], memmap_dir,
                                       unique_id=str(g))

    # save the new O
    self._O = O
    if lmer_opts is None:
        lmer_opts = {}
    self._lmer_opts = lmer_opts
    self._factors = factors

    # mask the connectivity
    if self._do_tfce and (len(self._dep_mask.flatten()) >
                          self._dep_mask.sum()):
        self._connectivity = self._connectivity.tolil()[
            self._dep_mask.flatten()][:, self._dep_mask.flatten()].tocoo()

    # prepare for the perms and boots and jackknife
    self._perms = []
    self._tp = []
    self._tb = []
    self._tj = []
    self._pfmask = []

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.write('Processing actual data...')
        sys.stdout.flush()
        start_time = time.time()

    global _global_meld
    _global_meld[id(self)] = self

    # run for actual data (returns both perm and boot vals)
    self._R = None
    self._ss = None
    self._mer = None
    tp, tb, R, feat_mask, ss, mer = _eval_model(id(self), None)
    self._R = R
    self._tp.append(tp)
    self._tb.append(tb)
    self._feat_mask = feat_mask
    self._fmask = ~feat_mask[0]
    self._pfmask.append(~feat_mask[0])
    self._ss = ss
    self._mer = mer

    if verbose > 0:
        sys.stdout.write('Done (%.2g sec)\n' % (time.time() - start_time))
        sys.stdout.flush()
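# Small numpy-only sketch of the dep_mask step above: a boolean mask over
# the feature dimensions both selects features and flattens
# (n, *feat_shape) data to (n, n_kept) in one indexing step. Shapes here
# are made up.
import numpy as np

dep = np.random.randn(5, 4, 4)   # (observations, *feat_shape)
mask = np.zeros((4, 4), dtype=bool)
mask[1:3, 1:3] = True            # keep a 2x2 patch of features
D = dep[:, mask]                 # shape (5, 4)
print(D.shape)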
def FitChiralCont_1Slope_Cross_wSubtraction_NoIntercept(
        self, a_r0_sqr, Categorical_Var, filepath):
    import rpy2.robjects as robjects
    from rpy2.robjects import FloatVector
    from rpy2.robjects import FactorVector
    from rpy2.robjects.packages import importr

    self.NumSlope = 1
    stats = importr('stats')
    base = importr('base')

    # copy so that overwriting entries below does not mutate the original
    Temp_Jack_Mean_List = list(self.EnsembleJAvgList)
    SubtractionJackList = []
    self.CoeffList = []

    robjects.globalenv["a_r0_sqr"] = FloatVector(a_r0_sqr)
    robjects.globalenv["Spacing"] = FactorVector(Categorical_Var)
    robjects.globalenv["M_pi_sqr"] = FloatVector(self.Indep_Var_List)
    r_weight = [pow(x, 2) for x in self.EnsembleWeight]
    robjects.globalenv["PointValueList"] = FloatVector(self.PointValueList)

    # jackknife loop: refit (no intercept) with each ensemble mean
    # replaced in turn
    for j in range(len(self.EnsembleNameList)):
        for x in self.EnsembleJackList[j]:
            Temp_Jack_Mean_List[j] = x
            robjects.globalenv["TempJackMean"] = FloatVector(
                Temp_Jack_Mean_List)
            Coeff = np.asarray(
                stats.lm("TempJackMean ~ 0.0 + M_pi_sqr*a_r0_sqr",
                         weights=FloatVector(r_weight))[0])
            self.CoeffList.append(Coeff)

            # subtract the lattice-spacing terms from each point
            # (FIX: check the coefficient indexing here)
            SubtractionList = []
            for k in range(len(a_r0_sqr)):
                SubtractionList.append(
                    Temp_Jack_Mean_List[k] -
                    a_r0_sqr[k] * Coeff[1] -
                    a_r0_sqr[k] * self.Indep_Var_List[k] * Coeff[2])
            SubtractionJackList.append(SubtractionList)
        # restore the jackknife means for the next ensemble
        Temp_Jack_Mean_List = list(self.EnsembleJAvgList)

    self.StdErr = self.ComputeStdError(self.CoeffList)
    JackAvg = sum(self.CoeffList) / len(self.CoeffList)
    self.FitAvg = np.asarray(
        stats.lm("PointValueList ~ 0 + M_pi_sqr*a_r0_sqr",
                 weights=FloatVector(r_weight))[0])

    # transpose the jackknife subtraction samples so each point has its
    # own list of samples, then compute per-point errors
    Rev_SubtractionJackList = []
    SubtractJackAvg = []
    SubtractJackErr = []
    for i in range(len(a_r0_sqr)):
        Rev_SubtractionJackList.append(
            [SubtractionJackList[j][i]
             for j in range(len(SubtractionJackList))])
    for x in Rev_SubtractionJackList:
        SubtractJackErr.append(self.ComputeStdError(x))

    # central values of the subtracted points
    # (FIX: check the coefficient indexing here as well)
    for k in range(len(a_r0_sqr)):
        SubtractJackAvg.append(
            self.PointValueList[k] -
            a_r0_sqr[k] * self.FitAvg[1] -
            a_r0_sqr[k] * self.Indep_Var_List[k] * self.FitAvg[2])

    self.InterceptError = self.StdErr[1]
    self.SlopeError = self.StdErr[0]
    self.FitCoeff = self.FitAvg
    # TODO: correct for jackknife bias, e.g.
    # self.FitCoeff = self.FitAvg - \
    #     (len(self.CoeffList) - 1) * (JackAvg - self.FitAvg)
    self.Intercept = self.FitCoeff[1]
    self.Slope = self.FitCoeff[0]
    self.PrintResults()

    # write the subtracted points to a csv
    with open(filepath, "w") as f:
        f.write('PointName,TopSus,TopSus_Error,r0_Mpi_sqr,'
                'PlotName,PointId\n')
        for i in range(len(self.EnsembleNameList)):
            f.write(','.join([
                self.EnsembleNameList[i],
                str(SubtractJackAvg[i]),
                str(SubtractJackErr[i]),
                str(self.Indep_Var_List[i]),
                str(self.Name),
                str(self.PointIdList[i])]) + '\n')