def pcor(var1, var2, covariate, method='spearman'):
    '''Run R ppcor's partial correlation.

    Key arguments:
        var1, var2, covariate: float or int numpy array
        method: str, 'spearman' or 'pearson'
    '''
    # import ppcor library in R
    ppcor = importr('ppcor')

    # define variables in R
    x = FloatVector(var1)
    y = FloatVector(var2)
    c = FloatVector(covariate)

    # assign values
    r.assign('x', x)
    r.assign('y', y)
    r.assign('c', c)

    # run partial correlation in R and return outputs to python
    r(f'pcorOut <- pcor.test(x, y, c, method = "{method}")')
    pcor_out = r('pcorOut')
    pcor_out_df = pandas2ri.rpy2py(pcor_out)

    return pcor_out_df
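# Illustrative usage sketch for pcor(), not part of the original source.
# It assumes numpy is available and that the R 'ppcor' package is installed;
# the data below are synthetic and purely for demonstration.
def _example_pcor():
    import numpy as np
    rng = np.random.default_rng(0)
    x = rng.normal(size=100)
    y = 0.5 * x + rng.normal(size=100)
    z = rng.normal(size=100)  # covariate to partial out
    # returns a data frame with estimate, p.value, statistic, etc.
    return pcor(x, y, z, method='pearson')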
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    x = []
    y = []
    ya = []
    for triad in stats:
        for rec in stats[triad]:
            paralinear_dists = get_paralinear_distances(rec[0]['gene'], **kw)
            ns_EN = sum(rec[0]['EN'][t] for t in pair)
            s_EN = sum(rec[1]['EN'][t] for t in pair)
            para = paralinear_dists[pair]
            if para:
                x.append(ns_EN)
                y.append(para)
                ya.append(s_EN)
    print('paralinear stats')
    print_stats(x, y)
    print('GTR stats')
    print_stats(x, ya)
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    globalenv['df'] = df
    cmd = 'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ' + \
          'geom_abline(intercept=0, slope=1, color="white") + ' + \
          'xlab(bquote(.("' + ' to '.join(pair) + '") ~ d[ENS])) + ' + \
          'ylab(bquote(.("' + ' to '.join(pair) + '") ~ d[para])) + ' + \
          'coord_cartesian(xlim=c(0,1), ylim=c(0,1))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print(R['gg'])
        input('Press Enter to continue...')
    return
def tcplFit(conc, resp, bmad=False):
    kwargs = {
        'logc': FloatVector(conc),
        'resp': FloatVector(resp),
    }
    if bmad:
        kwargs['bmad'] = 1.0 * bmad
    else:
        kwargs['force.fit'] = True

    Y0 = rtcpl.tcplFit(**kwargs)
    Y1 = pd.Series(as_dict(Y0))

    F0 = []
    for m in ['cnst', 'hill', 'gnls']:
        fit = Y1.pop(m)
        K = [i for i in Y1.index if i.startswith(m)]
        Y2 = Y1[K]
        Y2.index = [re.sub('%s_?' % m, '', i) for i in K]
        Y2['model'] = m
        if fit == fit:  # NaN check: fit != fit only when fit is NaN
            F0.append(Y2.to_dict())
        Y1 = Y1.drop(K)

    Y1['bmad'] = bmad

    R0 = {}
    R0['fits'] = F0
    R0['cr_info'] = Y1.to_dict()
    R0['cr_data'] = dict(conc=list(conc), resp=list(resp))

    # Figure out best fit
    F1 = pd.DataFrame(F0)
    R0['best_fit'] = F1.sort_values('aic').iloc[0].to_dict()

    return R0
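# Illustrative usage sketch for tcplFit(), not part of the original source.
# It assumes the R 'tcpl' package is importable (as rtcpl, per this module)
# and that conc is already log10-transformed, matching tcpl's 'logc' argument;
# the concentrations and responses below are made up.
def _example_tcplFit():
    import numpy as np
    logc = list(np.log10([0.1, 0.3, 1.0, 3.0, 10.0, 30.0]))
    resp = [0.0, 0.1, 0.4, 0.9, 1.2, 1.3]
    out = tcplFit(logc, resp, bmad=0.2)
    return out['best_fit']['model']  # one of 'cnst', 'hill', 'gnls'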
def qcrop2(xlist, ylist, labels=None, nq=4.):
    if labels is None:
        labels = [str(i) for i in range(len(xlist))]
    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        cropx, cropy = zip(*[(nan, nan)
                             if vy > ymax or vy < ymin or vx < xmin or vx > xmax
                             else (vx, vy)
                             for vx, vy in zip(onex, oney)])
        xcrop += cropx
        ycrop += cropy
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)
    df = DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels))
    })
    return df
def to_robjects(covariates, observed_outcomes=None, treatment_assignment=None):
    """ Transform the given data into `rpy2.robjects` objects that can be fed to `R`.
    Returns transformed data as a dictionary.

    Parameters
    ----------
    covariates : np.ndarray
        Covariate data as a 2-d array of shape `(num_units, num_covariates)`.
    observed_outcomes : np.ndarray, optional
        Observed outcome data as a 1-d array of shape `(num_units,)`.
    treatment_assignment : np.ndarray, optional
        (Binary) treatment assignment as a 1-d array of shape `(num_units,)`.

    Returns
    -------
    robject_dictionary : dict
        Dictionary mapping names of given data (e.g. `covariates`) to
        corresponding `rpy2.robjects` objects.
    """
    num_units, *_ = covariates.shape
    robjects = {"covariates": to_Rmatrix(covariates)}
    if observed_outcomes is not None:
        robjects.update({
            "observed_outcomes": FloatVector(observed_outcomes.tolist())
        })
    if treatment_assignment is not None:
        robjects.update({
            "treatment_assignment": FloatVector(treatment_assignment.tolist())
        })
    return robjects
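# Illustrative usage sketch for to_robjects(), not part of the original source.
# It assumes numpy is available; to_Rmatrix is defined elsewhere in this module.
def _example_to_robjects():
    import numpy as np
    rng = np.random.default_rng(0)
    covariates = rng.normal(size=(50, 3))
    outcomes = rng.normal(size=50)
    treatment = rng.binomial(1, 0.5, size=50)
    robjs = to_robjects(covariates, observed_outcomes=outcomes,
                        treatment_assignment=treatment)
    # keys: 'covariates', 'observed_outcomes', 'treatment_assignment'
    return robjs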
def qlim(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    rq = quantreg.rq('y ~ x', df, tau=FloatVector((0.25, 0.5, 0.75)))
    print(rq.rx2('coefficients'))
    fv = array(rq.rx2('fitted.values'))
    # pad the median fit by twice the largest inter-quantile spread
    return min(fv[:, 1]) - 2 * max(fv[:, 1] - fv[:, 0]), \
           2 * max(fv[:, 2] - fv[:, 1]) + max(fv[:, 1])
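# Illustrative usage sketch for qlim(), not part of the original source.
# It assumes the R 'quantreg' package was loaded via importr (as quantreg)
# and that numpy's array is in scope, as qlim itself already requires.
def _example_qlim():
    import random
    random.seed(0)
    x = [float(i) for i in range(100)]
    y = [2.0 * xi + 5.0 + random.gauss(0, 1) for xi in x]
    ymin, ymax = qlim(x, y)  # padded y-axis limits from quantile fits
    return ymin, ymax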
def main():
    # filename = 'dados_jun2018.csv_win_48_incr_3_smooth_6_FI.csv'
    # filename = 'reservoir-shift_0.1-ts-0.csv_win_48_incr_4_smooth_6_FI.csv'
    filename = 'rs1_ts1_bp.csv_win_48_incr_4_smooth_6_FI.csv'
    time, FI = read_csv(filename)
    __, __, perc_FI = min_max_perc(FI, 0.25)
    time_arr = FloatVector(time)
    fish_arr = FloatVector(FI)
    xs = np.arange(0, len(time), 0.02)
    for i in range(5, 16):
        # for i in [25.85]:
        df = i
        spline, derivative = fit_data(time_arr, fish_arr, df)
        breaks = find_breaks(list(xs), list(derivative(xs)), list(spline(xs)),
                             perc_FI)
        make_plot(time, FI, spline, derivative, breaks, perc_FI)
        plt.title('DF = %.2f' % df)
        # _file = 'dados_df_%s.png' % df
        _file = 'rs1_ts1_bp_df_%s.png' % df
        plt.savefig(_file)
        plt.gcf().clear()  # clear old plot
        print(i)
def permTS(dataDict=None, dataLabel='data', mode='exact.ce'):
    """
    permTS performs a two-sample permutation test using the 'perm' package
    in R. This routine is a wrapper for permTS in R; the Monte Carlo mode is
    not necessarily the fastest, but is "exact".

    Parameters
    ----------
    dataDict : dict
        Data format {'group1': [dataset], 'group2': [dataset]}.
    dataLabel : string
        Title to use for printout of data.
    mode : string
        Test mode (see manual). Usually 'exact.ce' for "complete enumeration",
        or 'exact.mc' for Monte Carlo.

    Returns
    -------
    (p, n) : tuple
        p value for the test (against the null hypothesis), and n, the number
        of mc replications, or 0 for other tests.
    """
    # test calling values
    if mode not in ['exact.ce', 'exact.mc']:
        raise ValueError('RStats.permTS: Mode must be either'
                         ' "exact.ce" or "exact.mc"; got %s' % mode)
    if dataDict is None or not isinstance(dataDict, dict) or len(dataDict) != 2:
        raise ValueError('RStats.permTS: dataDict must be'
                         ' a dictionary with exactly 2 keys')
    k = list(dataDict.keys())
    g1 = dataDict[k[0]]
    g2 = dataDict[k[1]]
    u = perm.permTS(FloatVector(g1), FloatVector(g2),
                    alternative='two.sided', method=mode)
    pvalue = float(u[3][0])
    if mode == 'exact.mc':
        nmc = int(u[10][0])
    else:
        nmc = 0
    d = u[1].items()  # stored as a generator (interesting...)
    estdiff = next(d)  # gets the tuple with what was measured, and the value
    if dataLabel is not None:
        print('\nPermutation Test (R permTS). Dataset = %s' % (dataLabel))
        print(u'  Test statistic ({:s}): {:8.4f}'.format(estdiff[0], estdiff[1]))
        print(u'  p={:8.6f}, Nperm={:8d} [mode={:s}]'.format(
            float(pvalue), int(nmc), mode))
    return (pvalue, nmc)  # return the p value and the number of mc replicates
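# Illustrative usage sketch for permTS(), not part of the original source.
# It assumes the R 'perm' package is installed and imported via importr
# (as perm, per this module); the two small groups below are made up.
def _example_permTS():
    data = {'control': [1.2, 0.9, 1.1, 1.4, 0.8],
            'treated': [1.8, 2.1, 1.6, 2.4, 1.9]}
    p, nmc = permTS(data, dataLabel='toy data', mode='exact.ce')
    return p, nmc  # nmc is 0 unless mode='exact.mc'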
def surv_median_from_r(isdead, nbdays):
    """Return the median survival time computed by R's survival package."""
    isdead = FloatVector(isdead)
    nbdays = FloatVector(nbdays)
    surv = rob.r.summary(survival.Surv(nbdays, isdead))
    # the summary's third element holds a 'Median :<value>' string
    return float(surv[2].split(':')[1])
def through_the_origin(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    s = r.summary(r.lm('y ~ 0 + x', df))
    # the coefficients matrix is column-major: estimate first, then std. error
    return {
        'coefficient': s.rx2('coefficients')[0],
        'stderr': s.rx2('coefficients')[1],
        'r.squared': s.rx2('r.squared')[0]
    }
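# Illustrative usage sketch for through_the_origin(), not part of the original
# source; with data close to y = 2x, the fitted slope should be near 2.
def _example_through_the_origin():
    x = [1.0, 2.0, 3.0, 4.0, 5.0]
    y = [2.1, 3.9, 6.2, 8.0, 10.1]
    fit = through_the_origin(x, y)
    return fit['coefficient'], fit['stderr'], fit['r.squared']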
def get_constrained_weights(constraints):
    ## A is (N_particles, N_constraints)
    constraintsR = r['matrix'](FloatVector(constraints.flatten()),
                               nrow=constraints.shape[0],
                               ncol=constraints.shape[1],
                               byrow=True)
    means = r['numeric'](constraints.shape[1])
    w = r['matrix'](FloatVector(np.ones(constraints.shape[0])),
                    nrow=constraints.shape[0])
    result = r['el.test.wt2'](x=constraintsR, wt=w, mu=means, itertrace=False)
    weights = np.array(result[0])
    return weights
def callRWilcoxTest(beta, beta_se):
    '''Call R metafor's rma to run the meta-analysis (despite the function
    name, this fits a meta-analytic model, not wilcox.test).'''
    k = metafor.rma(yi=FloatVector(beta), sei=FloatVector(beta_se),
                    method=MODEL)
    # beta, se, zval, pval, 95ci.l, 95ci.u
    r = (k[1][0], k[2][0], k[3][0], k[4][0], k[5][0], k[6][0])
    return r
def create_basis(pmin, pmax, nbas=20):
    prange = FloatVector([pmin, pmax])
    # tangent-spaced interior breaks concentrate knots near pmin
    breaks = FloatVector(prange[0] + (prange[1] - prange[0]) *
                         np.tan(np.linspace(0., 1., nbas - 2)) / np.tan(1))
    breaks[0] = prange[0]
    breaks[-1] = prange[-1]
    basis = fda.create_bspline_basis(rangeval=prange, nbasis=nbas,
                                     norder=4, breaks=breaks)
    return basis
def c_index_multiple_from_r(matrix, isdead, nbdays, matrix_test, isdead_test,
                            nbdays_test, lambda_val=None):
    """Concordance index of a ridge Cox model (glmnet, alpha=0) fitted on the
    training set and evaluated on the test set."""
    rob.r('set.seed(2016)')

    if matrix.shape[1] < 2:
        return np.nan

    nbdays[nbdays == 0] = 1
    nbdays_test[nbdays_test == 0] = 1

    isdead = FloatVector(isdead)
    isdead_test = FloatVector(isdead_test)

    nbdays = FloatVector(nbdays)
    nbdays_test = FloatVector(nbdays_test)

    matrix = convert_to_rmatrix(matrix)
    matrix_test = convert_to_rmatrix(matrix_test)

    surv = survival.Surv(nbdays, isdead)

    cv_glmnet = rob.r('cv.glmnet')
    glmnet = rob.r('glmnet')

    arg = {'lambda': lambda_val}
    if not lambda_val:
        # pick the smallest cross-validated lambda
        cv_fit = cv_glmnet(matrix, surv, family='cox', alpha=0)
        arg = {'lambda': min(cv_fit[0])}

    fit = glmnet(matrix, surv, family='cox', alpha=0, **arg)
    predict = rob.r.predict(fit, matrix_test)

    concordance_index = rob.r('concordance.index')

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            c_index = concordance_index(predict, nbdays_test, isdead_test,
                                        method='noether')
    except Exception as e:
        print("exception found for c index multiple!: {0}".format(e))
        return None

    return c_index[0][0]
def callMetaPSUMZ(p, signed_weights):
    '''Call R function sumz for the meta analysis.
    https://www.rdocumentation.org/packages/metap/versions/1.1/topics/sumz
    '''
    k = metap.sumz(FloatVector(p), weights=FloatVector(signed_weights))
    # sumz returns (z, p); keep the p value
    r = k[1][0]
    return r
def wilcoxon(vect1, vect2):
    # TODO: use closure as same pattern as above
    if len(vect1) == 0 or len(vect2) == 0:
        return float('nan')
    try:
        results = r_stats.wilcox_test(FloatVector(vect1), FloatVector(vect2),
                                      paired=True, exact=True)
        # p_val = stats.wilcoxon(vect1, vect2)[1]
    except ValueError as err:
        raise ValueError("vect1: {} ({} elements), vect2: {} ({} elements); {}"
                         .format(vect1, len(vect1), vect2, len(vect2), err))
    wilcox_stat = results[results.names.index('statistic')][0]
    p_val = results[results.names.index('p.value')][0]
    return p_val
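# Illustrative usage sketch for wilcoxon(), not part of the original source.
# The two vectors are paired measurements of equal length; values are made up.
def _example_wilcoxon():
    before = [5.1, 4.8, 6.0, 5.5, 4.9, 5.7]
    after = [5.6, 5.2, 6.3, 5.9, 5.4, 6.1]
    return wilcoxon(before, after)  # two-sided paired p-value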
def zipoisson(N, lambda_par, psi):
    """Zero inflated Poisson sampler."""
    # load R package
    r('library(VGAM)')
    # get R functions
    zipoissonR = r['rzipois']
    res = zipoissonR(N, FloatVector(lambda_par), pstr0=FloatVector(psi))
    return np.array([int(item) for item in res])
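# Illustrative usage sketch for zipoisson(), not part of the original source.
# It assumes the R 'VGAM' package is installed; scalar parameters are passed
# as length-1 sequences, which R recycles across the N draws.
def _example_zipoisson():
    draws = zipoisson(1000, [3.0], [0.2])  # lambda=3, 20% structural zeros
    return draws.mean()  # roughly (1 - 0.2) * 3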
def gen_negbin(N, mu1, theta1):
    """Negative binomial distribution."""
    # load R package
    r('library(MASS)')
    # get R functions
    nbinomR = r['rnegbin']
    res = nbinomR(n=N, mu=FloatVector(mu1), theta=FloatVector(theta1))
    return res
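# Illustrative usage sketch for gen_negbin(), not part of the original source.
# It assumes the R 'MASS' package is installed; mu1 and theta1 are wrapped
# into FloatVectors, so pass sequences (length-1 vectors are recycled by R).
def _example_gen_negbin():
    draws = gen_negbin(500, [10.0], [2.0])  # mean 10, dispersion theta 2
    return list(draws)[:5]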
def cube_method(A, point, N_KEEP):
    ### A is (N_constraints, N_particles) and point is (N_particles)
    signs = np.sign(point)
    A_signs = A.T.copy()
    A_signs[1:, :] = A_signs[1:, :] * signs
    point_signs = np.abs(point) * N_KEEP / np.sum(np.abs(point))
    A_signs *= point_signs
    A_signs_R = r['matrix'](FloatVector(A_signs.flatten()),
                            nrow=A_signs.T.shape[0])
    print("Flight phase:")
    res = r["flightphase"](FloatVector(point_signs), A_signs_R)
    print("Landing phase:")
    land_point = r["landingphase"](FloatVector(point_signs), res, A_signs_R)
    # avoid shadowing the rpy2 `r` interface in the comprehension
    return np.array([v for v in land_point]), signs
def compute_NetEMD(u1, u2):
    u1 = np.array(list(u1))
    u1 = u1 * (u1 > 1e-12)  # zero out numerical noise
    h1 = np.histogram(u1, bins='auto', density=False)
    h1_loc = (h1[1][:-1] + h1[1][1:]) / 2  # bin centres

    u2 = np.array(list(u2))
    u2 = u2 * (u2 > 1e-12)
    h2 = np.histogram(u2, bins='auto', density=False)
    h2_loc = (h2[1][:-1] + h2[1][1:]) / 2

    dhist1 = netdist.dhist(FloatVector(h1_loc), FloatVector(h1[0]))
    dhist2 = netdist.dhist(FloatVector(h2_loc), FloatVector(h2[0]))
    dist = netdist.net_emd(dhist1, dhist2)[0]
    return dist
def form_data(self):
    '''Form time series data used in the lm model.'''
    assert self.t is not None, "t is not defined."
    assert self.y is not None, "y is not defined."
    globalenv['t'] = FloatVector(self.t)
    globalenv['y'] = FloatVector(self.y)
    if self.ysd is None:
        globalenv['ysd'] = FloatVector(ones_like(self.y))
    else:
        globalenv['ysd'] = FloatVector(self.ysd)
    _r('the_data <- data.frame(t=t, y=y, ysd=ysd)')
def gen_zinegbinom(N, mu1, mu2, alpha):
    """Zero inflated negative binomial distribution."""
    # load R package
    r('require(VGAM)')
    # get R functions
    zinbinomR = r['rzinegbin']
    res = zinbinomR(n=N, munb=FloatVector(mu1), size=1.0 / alpha,
                    pstr0=FloatVector(mu2))
    return np.array([int(item) for item in res])
def kde(x_vector, y_vector, weights):
    """Do weighted 2d KDE in R KS package, returning python results.
    Returns two lists: P estimates and estimate locations as x,y tuples."""

    # Another possible way of doing this is with
    # http://pysal.readthedocs.io/en/latest/users/tutorials/smoothing.html#non-parametric-smoothing
    # or with
    # http://scikit-learn.org/stable/modules/density.html

    # check the inputs
    assert len(x_vector) == len(y_vector)
    assert len(weights) == len(x_vector)

    # normalize the weights to the sample size
    if sum(weights) != len(weights):
        adjust_factor = len(weights) / float(sum(weights))
        weights = [w * adjust_factor for w in weights]

    # get the ks package with kde function
    from rpy2.robjects.packages import importr
    ks = importr('ks')

    # get basic R functions into Python
    from rpy2.robjects import r
    cbind = r['cbind']
    diag = r['diag']

    # R data type conversion
    from rpy2.robjects import FloatVector

    # do the KDE
    print('\tRunning KDE on', len(x_vector), 'points')
    point_matrix = cbind(FloatVector(x_vector), FloatVector(y_vector))
    bandwidth = config.kernel_bandwidth
    surface = ks.kde(
        # points and evaluation points are the same
        x=point_matrix,
        eval_points=point_matrix,
        # weights
        w=FloatVector(weights),
        # bandwidth / covariance matrix
        H=diag(FloatVector([bandwidth**2, bandwidth**2])))
    eval_points = surface.rx2('eval.points')
    estimates = surface.rx2('estimate')

    # turn these into more pythonish objects so that the rpy2 syntax doesn't
    # have to leave this function
    eva, est = [], []
    for i in range(1, len(weights) + 1):
        # insert estimate values
        est.append(estimates.rx(i)[0])
        # insert location tuples
        eva.append((eval_points.rx(i, True)[0], eval_points.rx(i, True)[1]))

    # these are now vectors (python lists) giving estimated probabilities
    # and locations as x,y tuples
    return est, eva
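# Illustrative usage sketch for kde(), not part of the original source.
# It assumes the R 'ks' package is installed and that config.kernel_bandwidth
# is set in this module; uniform weights pass the normalization unchanged.
def _example_kde():
    import random
    random.seed(0)
    xs = [random.gauss(0, 1) for _ in range(200)]
    ys = [random.gauss(0, 1) for _ in range(200)]
    weights = [1.0] * 200
    estimates, locations = kde(xs, ys, weights)
    return estimates[0], locations[0]  # density estimate at the first point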
def wrapped(self, covariates, observed_outcomes, treatment_assignment, **kwargs):
    treatment = treatment_assignment.astype(bool)
    treated_covariates = to_Rmatrix(covariates[treatment, ...])
    treated_outcomes = FloatVector(observed_outcomes[treatment, ...])
    control_covariates = to_Rmatrix(covariates[~treatment, ...])
    control_outcomes = FloatVector(observed_outcomes[~treatment, ...])
    return method(
        self,
        treated_covariates=treated_covariates,
        treated_outcomes=treated_outcomes,
        control_covariates=control_covariates,
        control_outcomes=control_outcomes,
        **kwargs
    )
def run_hydrology(init_gwstorage, init_C, init_Nash, init_Qq, init_Qs,
                  climate_type):
    if "hydrological" in CONFIG.paths:
        path = CONFIG.paths['hydrological']
    else:
        path = os.path.dirname(__file__)

    r_path = os.path.join(path, 'WrappableRunIhacresGw.R')

    # import the .R file and expose its functions as an anonymous package
    with open(r_path) as r_file:
        string = r_file.read()
    IhacresGW = SignatureTranslatedAnonymousPackage(string, "IhacresGW")

    workingdir = CONFIG.paths["hydrological"] if "hydrological" in CONFIG.paths \
        else os.path.dirname(__file__) + "/"
    # workingdir = "~/Dropbox/integrated/Mike/hydrological"
    # datadir = workingdir + "/Maules_19690101_20100302"
    datadir = workingdir + "data"
    # remove trailing slash, as the R function below expects none
    workingdir = workingdir[:-1]

    # sim, tdat = IhacresGW.RunIhacresGw(workingdir, datadir)
    return IhacresGW.RunIhacresGw(workingdir, datadir, init_gwstorage, init_C,
                                  FloatVector(init_Nash), init_Qq, init_Qs,
                                  climate_type)
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by
    J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''
    np.random.seed(random_seed)
    #np.random.shuffle(labels)

    dataf = DataFrame({
        'label': IntVector(tuple(labels)),
        'distance': FloatVector(tuple(values))
    })
    fit = R(
        'function(x) ppml(dependent_variable="label", distance="distance", '
        'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)

    # Deviance is -2.*log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print('Non-parametric method:', '\n\t',
              #' Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
              #'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
              #'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
              'p-value:\t', '%.1e' % p_value, '\n')
    return p_value
def fit(self, x, y):
    ordinal = importr('ordinal')
    rx = matrix_to_r_dataframe(x)
    self.levels = list(range(int(round(min(y))), int(round(max(y))) + 1))
    # pass levels as an explicit R vector; plain Python sequences are not
    # auto-converted by newer rpy2 releases
    ry = base.factor(FloatVector(y), levels=IntVector(self.levels),
                     ordered=True)
    robjects.globalenv["y"] = ry
    self.clmfit = ordinal.clm("y ~ .", data=rx)
def Run(self):
    self.transit_message("Starting Corrplot")
    start_time = time.time()

    # assume first non-comment line is header; sample names are the headers
    headers = None
    data, means = [], []

    if self.filetype == "gene_means":
        for line in open(self.gene_means):
            w = line.rstrip().split('\t')
            if line[0] == '#':
                headers = w[3:]
                continue  # last comment line has names of samples
            data.append(w)
            cnts = [float(x) for x in w[3:]]
            means.append(cnts)
    elif self.filetype == "anova" or self.filetype == "zinb":
        n = -1  # number of conditions
        for line in open(self.gene_means):
            w = line.rstrip().split('\t')
            if line[0] == '#' or ('pval' in line and 'padj' in line):
                # check for 'pval' for backwards compatibility
                headers = w
                continue  # keep last comment line as headers
            if n == -1:
                # ANOVA header line has names of conditions, organized as
                # 3+2*n+3 (2 groups (means, LFCs) X n conditions);
                # ZINB header line has names of conditions, organized as
                # 3+4*n+3 (4 groups X n conditions)
                if self.filetype == "anova":
                    n = int((len(w) - 6) / 2)
                elif self.filetype == "zinb":
                    n = int((len(headers) - 6) / 4)
                headers = headers[3:3 + n]
                headers = [x.replace("Mean_", "") for x in headers]
            vals = [float(x) for x in w[3:3 + n]]  # take just the columns of means
            qval = float(w[-2])
            if qval < 0.05:
                data.append(w)
                means.append(vals)
    else:
        print("filetype not recognized: %s" % self.filetype)
        sys.exit(-1)

    print("correlations based on %s genes" % len(means))
    genenames = ["%s/%s" % (w[0], w[1]) for w in data]
    hash = {}  # maps each column name to its vector of means
    headers = [h.replace("Mean_", "") for h in headers]
    for i, col in enumerate(headers):
        hash[col] = FloatVector([x[i] for x in means])
    df = DataFrame(hash)  # can't figure out how to set rownames

    corrplotFunc = self.make_corrplotFunc()
    # pass headers to put cols in order, since df comes from dict
    corrplotFunc(df, StrVector(headers), StrVector(genenames), self.outfile)

    self.finish()
    self.transit_message("Finished Corrplot")
def ess_mcse_repy2(numpy_matrix):
    nrow = numpy_matrix.shape[0]
    # note: R fills matrices column-major, while numpy's flatten() is
    # row-major; pass byrow=True if rows of numpy_matrix are iterations
    ctl = robjects.r.matrix(FloatVector(numpy_matrix.flatten()), nrow=nrow)
    out = mcmcse.ess(ctl)
    print(out)
    out = numpy.asarray(out)
    return out
def effective_size(theta_mcmc):
    from rpy2.robjects.packages import importr
    from rpy2.robjects import FloatVector
    coda = importr('coda')
    es = coda.effectiveSize(FloatVector(theta_mcmc))
    return es[0]
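# Illustrative usage sketch for effective_size(), not part of the original
# source. It assumes the R 'coda' package is installed; an AR(1) chain with
# strong positive autocorrelation should give an effective size well below N.
def _example_effective_size():
    import numpy as np
    rng = np.random.default_rng(1)
    chain = np.zeros(5000)
    for t in range(1, 5000):
        chain[t] = 0.9 * chain[t - 1] + rng.normal()
    return effective_size(chain)  # far smaller than 5000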