def get_federov_data(self, factors):
    low_level_limits = IntVector([self.parameter_ranges[f][0] for f in factors])
    high_level_limits = IntVector([self.parameter_ranges[f][1] - 1 for f in factors])
    factor_centers = IntVector([0 for f in factors])
    factor_levels = IntVector([self.parameter_ranges[f][1] for f in factors])
    factor_round = IntVector([0 for f in factors])
    is_factor = BoolVector([False for f in factors])
    mix = BoolVector([False for f in factors])

    opt_federov_data = {
        "var": StrVector(factors),
        "low": low_level_limits,
        "high": high_level_limits,
        "center": factor_centers,
        "nLevels": factor_levels,
        "round": factor_round,
        "factor": is_factor,
        "mix": mix
    }

    opt_federov_dataframe = DataFrame(opt_federov_data)
    opt_federov_dataframe = opt_federov_dataframe.rx(
        StrVector(["var", "low", "high", "center", "nLevels",
                   "round", "factor", "mix"]))
    return opt_federov_dataframe
def generate_valid_sample(self, sample_size):
    search_space_dataframe = {}

    for n in self.axis_names:
        search_space_dataframe[n] = []

    search_space = {}
    evaluated = 0

    info("Generating valid search space of size {0} (does not spend evaluations)"
         .format(sample_size))

    while len(search_space) < sample_size:
        candidate_point = self.getRandomCoord()
        candidate_point_key = str(candidate_point)
        evaluated += 1

        if candidate_point_key not in search_space:
            perf_params = self.coordToPerfParams(candidate_point)
            is_valid = eval(self.constraint, copy.copy(perf_params),
                            dict(self.input_params))

            if is_valid:
                search_space[candidate_point_key] = candidate_point

                for n in perf_params:
                    candidate_value = self.parameter_values[n].index(perf_params[n])
                    search_space_dataframe[n].append(candidate_value)

                # max(1, ...) guards against a modulo-by-zero when sample_size < 10.
                if len(search_space) % max(1, sample_size // 10) == 0:
                    info("Valid coordinates: " + str(len(search_space)) +
                         "/" + str(sample_size))
                    info("Tested coordinates: " + str(evaluated))

        if evaluated % 1000000 == 0:
            info("Tested coordinates: " + str(evaluated))

    info("Valid/Tested configurations: " + str(len(search_space)) +
         "/" + str(evaluated))

    for k in search_space_dataframe:
        search_space_dataframe[k] = IntVector(search_space_dataframe[k])

    search_space_dataframe_r = DataFrame(search_space_dataframe)
    search_space_dataframe_r = search_space_dataframe_r.rx(StrVector(self.axis_names))

    info("Generated Search Space:")
    info(str(self.base.summary_default(search_space_dataframe_r)))

    coded_search_space_dataframe_r = self.encode_data(search_space_dataframe_r)
    return coded_search_space_dataframe_r
def measure_design(self, design, encoded_design, step_number):
    info("Measuring design of size " + str(len(design[0])))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))

    for line in range(1, len(design[0]) + 1):
        if type(design.rx(line, True)[0]) is int:
            design_line = [v for v in design.rx(line, True)]
        else:
            design_line = [int(round(float(v[0]))) for v in design.rx(line, True)]

        candidate = [0] * len(initial_factors)

        for k, v in self.model["fixed_factors"].items():
            candidate[initial_factors.index(k)] = int(round(float(v)))

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(robjects.NA_Real)

    design = self.base.cbind(
        design, DataFrame({self.model["response"]: FloatVector(measurements)}))
    encoded_design = self.base.cbind(
        encoded_design, DataFrame({self.model["response"]: FloatVector(measurements)}))

    info("Complete design, with measurements:")
    info(str(design))

    design = design.rx(self.stats.complete_cases(design), True)
    design = design.rx(self.base.is_finite(self.base.rowSums(design)), True)

    encoded_design = encoded_design.rx(self.stats.complete_cases(encoded_design), True)
    encoded_design = encoded_design.rx(
        self.base.is_finite(self.base.rowSums(encoded_design)), True)

    info("Clean design, with measurements:")
    info(str(design))
    info("Clean encoded design, with measurements:")
    info(str(encoded_design))

    self.utils.write_csv(encoded_design, "design_step_{0}.csv".format(step_number))
    self.utils.write_csv(design, "decoded_design_step_{0}.csv".format(step_number))

    # "is None" instead of "== None": equality on an rpy2 DataFrame triggers
    # R's element-wise comparison, which is not a valid truth value here.
    if self.complete_design_data is None:
        self.complete_design_data = design
    else:
        self.complete_design_data = self.dplyr.bind_rows(
            self.complete_design_data, design)

    return design
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by
    J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''
    np.random.seed(random_seed)
    #np.random.shuffle(labels)

    dataf = DataFrame({'label': IntVector(tuple(labels)),
                       'distance': FloatVector(tuple(values))})
    fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
            'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)

    # Deviance is -2.*log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print('Non-parametric method:', '\n\t',
              #' Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
              #'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
              #'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
              'p-value:\t', '%.1e' % p_value, '\n')

    return p_value
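# A minimal usage sketch (not part of the original source; assumes numpy as np
# and the R "gravity" package installed, as the function above requires).
# Binary labels and synthetic distances stand in for real marker data.
labels = [0] * 50 + [1] * 50
values = list(np.random.normal(5.0, 1.0, 50)) + list(np.random.normal(6.0, 1.0, 50))
p = getNonParametricPValue(labels, values, random_seed=42)
# The test compares null vs. alternative PPML deviance on 1 degree of freedom.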
def get_hourly_ffmc_on_diurnal_curve(ffmc_solar_noon: float, target_hour: float,
                                     temperature: float, relative_humidity: float,
                                     wind_speed: float, precip: float):
    """ Computes hourly FFMC based on noon FFMC using the diurnal curve for
    approximation. Delegates the calculation to the cffdrs R package.

    ffmc_solar_noon is the forecasted or actual FFMC value for solar noon of
    the date in question.
    target_hour is the hour of the day (on a 24 hour clock) for which hourly
    FFMC should be calculated.
    The weather variables (temperature, rh, wind_speed, precip) are the
    forecasted or actual weather values for solar noon.

    # Args: weatherstream:   Input weather stream data.frame which includes
    #                        temperature, relative humidity, wind speed,
    #                        precipitation, hourly value, and bui. More specific
    #                        info can be found in the hffmc.Rd help file.
    #       ffmc_old:        ffmc from previous timestep
    #       time.step:       The time (hours) between previous FFMC and current
    #                        time.
    #       calc.step:       Whether time step between 2 obs is calculated
    #                        (optional)
    #       batch:           Single step or iterative (default=TRUE)
    #       hourlyFWI:       Can calculate hourly ISI & FWI as well
    #                        (TRUE/FALSE, default=FALSE)
    #
    # Returns: A single or multiple hourly ffmc value(s)
    #
    # From hffmc.Rd:
    # {weatherstream}{
    # A dataframe containing input variables of hourly weather observations.
    # It is important that variable names have to be the same as in the
    # following list, but they are case insensitive. The order in which the
    # input variables are entered is not important.
    #
    #     temp (required)  Temperature (centigrade)
    #     rh   (required)  Relative humidity (%)
    #     ws   (required)  10-m height wind speed (km/h)
    #     prec (required)  1-hour rainfall (mm)
    #     hr   (optional)  Hourly value to calculate sub-hourly ffmc
    #     bui  (optional)  Daily BUI value for the computation of hourly FWI.
    #                      It is required when hourlyFWI=TRUE
    """
    time_offset = target_hour - 13  # solar noon

    # Build the weather_data dictionary to be passed as weatherstream.
    weather_data = {
        'hr': 13.0,
        'temp': temperature,
        'rh': relative_humidity,
        'ws': wind_speed,
        # The precip received is based on the previous 24 hours, but the R
        # function requires 1-hour rainfall. We don't have hourly data, so the
        # best we can do is take the mean amount of precip for the past 24
        # hours. This is a liberal approximation with a lot of hand-waving.
        'prec': precip / 24
    }
    weather_data = DataFrame(weather_data)
    # pylint: disable=protected-access, no-member
    result = CFFDRS.instance().cffdrs.hffmc(weatherstream=weather_data,
                                            ffmc_old=ffmc_solar_noon,
                                            time_step=time_offset)
    return result[0]
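# A minimal usage sketch (not from the original source; assumes the cffdrs R
# package is installed and the CFFDRS singleton wrapper above is available).
# Values are illustrative: noon FFMC of 85, asking for the 16:00 estimate.
hourly_ffmc = get_hourly_ffmc_on_diurnal_curve(
    ffmc_solar_noon=85.0,
    target_hour=16.0,
    temperature=22.5,        # degrees Celsius at solar noon
    relative_humidity=40.0,  # percent
    wind_speed=15.0,         # km/h
    precip=2.4)              # mm over the previous 24 hours; divided by 24 internally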
def qcrop2(xlist, ylist, labels=None, nq=4.):
    if labels is None:
        # list() so that labels[i] below also works on Python 3, where map()
        # returns an iterator.
        labels = list(map(str, range(len(xlist))))
    x = []
    y = []
    xcrop = []
    ycrop = []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        cropx, cropy = zip(*[(nan, nan)
                             if vy > ymax or vy < ymin or vx < xmin or vx > xmax
                             else (vx, vy)
                             for vx, vy in zip(onex, oney)])
        xcrop += cropx
        ycrop += cropy
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)
    df = DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels))
    })
    return df
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    x = []
    y = []
    ya = []
    for triad in stats:
        for r in stats[triad]:
            paralinear_dists = get_paralinear_distances(r[0]['gene'], **kw)
            ns_EN = sum(r[0]['EN'][t] for t in pair)
            s_EN = sum(r[1]['EN'][t] for t in pair)
            para = paralinear_dists[pair]
            if para:
                x.append(ns_EN)
                y.append(para)
                ya.append(s_EN)

    print 'paralinear stats'
    print_stats(x, y)
    print 'GTR stats'
    print_stats(x, ya)

    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    globalenv['df'] = df
    cmd = 'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ' + \
          'geom_abline(intercept=0, slope=1, color="white") + ' + \
          'xlab(bquote(.("' + ' to '.join(pair) + '") ~ d[ENS])) + ' + \
          'ylab(bquote(.("' + ' to '.join(pair) + '") ~ d[para])) + ' + \
          'coord_cartesian(xlim=c(0,1), ylim=c(0,1))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
    return
def call_r(df):
    '''
    Arguments:
        df: A string replicating a CSV file. The observations for the
            dependent variable MUST be in the FIRST COLUMN.

    Returns: an rpy2 R object (FloatVector) which stores the coefficients
    of the linear regression
    '''
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    from io import StringIO
    from rpy2.robjects import DataFrame
    from rpy2.robjects import FloatVector
    import rpy2.rinterface as ri

    ri.initr()
    file_like_obj = StringIO(df)
    constructor_dict = parser(file_like_obj)
    rpy2_dataframe = DataFrame(constructor_dict)
    # Raw string so the backslash in the Windows path is not treated as an
    # escape sequence; also avoid shadowing the builtin name `str`.
    with open(r'regression_app\linear_modeler_function.R') as f:
        r_source = f.read()
    mod = SignatureTranslatedAnonymousPackage(r_source, 'mod')
    a = mod.linear_modeler(rpy2_dataframe)
    del mod
    return a
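# A minimal usage sketch (not from the original source). call_r() leans on two
# project-specific pieces assumed here: the parser() helper that turns the CSV
# text into a dict of column vectors, and the R script
# regression_app\linear_modeler_function.R exposing linear_modeler().
csv_text = "y,x1\n1.0,2.0\n2.1,3.9\n2.9,6.1\n4.2,8.0\n"
coefficients = call_r(csv_text)  # dependent variable in the first column
print(list(coefficients))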
def Run(self): self.transit_message("Starting Corrplot") start_time = time.time() # assume first non-comment line is header; samples are headers = None data, means = [], [] if self.filetype == "gene_means": for line in open(self.gene_means): w = line.rstrip().split('\t') if line[0] == '#': headers = w[3:] continue # last comment line has names of samples data.append(w) cnts = [float(x) for x in w[3:]] means.append(cnts) elif self.filetype == "anova" or self.filetype == "zinb": n = -1 # number of conditions for line in open(self.gene_means): w = line.rstrip().split('\t') if line[0] == '#' or ( 'pval' in line and 'padj' in line ): # check for 'pval' for backwards compatibility headers = w continue # keep last comment line as headers if n == -1: # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions) # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions) if self.filetype == "anova": n = int((len(w) - 6) / 2) elif self.filetype == "zinb": n = int((len(headers) - 6) / 4) headers = headers[3:3 + n] headers = [x.replace("Mean_", "") for x in headers] vals = [float(x) for x in w[3:3 + n]] # take just the columns of means qval = float(w[-2]) if qval < 0.05: data.append(w) means.append(vals) else: print("filetype not recognized: %s" % self.filetype) sys.exit(-1) print("correlations based on %s genes" % len(means)) genenames = ["%s/%s" % (w[0], w[1]) for w in data] hash = {} headers = [h.replace("Mean_", "") for h in headers] for i, col in enumerate(headers): hash[col] = FloatVector([x[i] for x in means]) df = DataFrame(hash) # can't figure out how to set rownames corrplotFunc = self.make_corrplotFunc() corrplotFunc( df, StrVector(headers), StrVector(genenames), self.outfile ) # pass headers to put cols in order, since df comes from dict self.finish() self.transit_message("Finished Corrplot")
def _align_var(breaks_r, pop_col, n, verbose=False):
    prev_b = -1
    i = 1
    align = dict()
    _vector = list()
    align_t = [1]
    for e, b in enumerate(breaks_r):
        if prev_b + 1 == b and b < n + 1:
            try:
                assert (min(align_t) != max(align_t))
                # align[pop_col + '.' + str(i)] = IntVector(
                align[pop_col] = IntVector((min(align_t), max(align_t)))
                # _vector.extend([min(align_t), max(align_t)])
            except AssertionError:
                if verbose:
                    print("can't align {} at {} for {}".format(align_t, e, b))
            i += 1
            align_t = [e + 2]
        else:
            align_t.append(e + 2)
        prev_b = b
    if len(align) == 0:
        align[pop_col] = IntVector((1, len(breaks_r)))
    # else:
    #     align[pop_col] = IntVector(_vector)
    align_r = DataFrame(align)
    return align_r
def qlim(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    rq = quantreg.rq('y ~ x', df, tau=FloatVector((0.25, 0.5, 0.75)))
    print rq.rx2('coefficients')
    fv = array(rq.rx2('fitted.values'))
    return min(fv[:, 1]) - 2*max(fv[:, 1] - fv[:, 0]), \
           2*max(fv[:, 2] - fv[:, 1]) + max(fv[:, 1])
def through_the_origin(x, y):
    df = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    s = r.summary(r.lm('y ~ 0 + x', df))
    return {
        'coefficient': s.rx2('coefficients')[0],
        'stderr': s.rx2('coefficients')[1],
        'r.squared': s.rx2('r.squared')[0]
    }
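# A minimal usage sketch (not from the original source; assumes `r` is
# rpy2.robjects.r, as in the function above). Fits y = b*x with no intercept;
# the true slope here is about 2.
fit = through_the_origin([1.0, 2.0, 3.0, 4.0], [2.1, 3.9, 6.2, 7.8])
print(fit['coefficient'], fit['stderr'], fit['r.squared'])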
def plot_bar(stats, output_file=None, **kw):
    names = [r['name'] for r in stats.values()[0][0]]
    with_rates = [r['with_rate'] for r in stats.values()[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]
    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)
    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])
    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print dataset
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            print names[j], sum(1 for _g in g if _g > 0.05) / len(g)
            alpha = max(alpha, get_alpha(g))
        print 'Samples', len(g)
    labels = 'expression(' + ','.join(names) + ')'
    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #     'geom_jitter(alpha=0.2, size=1) + ' + \
    #     'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #     'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #     'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #     'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
          'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
          'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",' + str(alpha) + ')) + ' + \
          'scale_x_discrete(labels=' + labels + ') + ' + \
          'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
          'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
def python_to_r_object(cls, item):
    """
    Convert a Python object to an R object (class method).
    :param item: Python object; convertible types are list, tuple, pd.Series,
                 np.ndarray, and pd.DataFrame
    :return: R object
    """
    numpy2ri.activate()
    if isinstance(item, (list, tuple, pd.Series)):
        return np.array(item)
    elif isinstance(item, pd.DataFrame):
        data_dict = {col_names: np.array(item[col_names])
                     for col_names in item.columns}
        rdataframe = DataFrame(data_dict)
        rdataframe.rownames = np.array(item.index)
        return rdataframe
    elif isinstance(item, (np.ndarray, bool, int, float, str)):
        return item
    else:
        print('Unsupported type: ', type(item))
        raise Exception
def tupls2RDataframe(data, titles):
    cols = [[] for _ in titles]
    for datum in data:
        for i, e in enumerate(datum):
            cols[i].append(e)
    col_d = {}
    for i, t in enumerate(titles):
        col_d[t] = StrVector(tuple(cols[i]))
        col_d[t] = FactorVector(col_d[t])
    dataf = DataFrame(col_d)
    return dataf
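# A minimal usage sketch (not from the original source): each tuple is a row,
# each title a column, and every column is coerced to an R factor.
rows = [('a', 'x'), ('b', 'y'), ('a', 'y')]
rdf = tupls2RDataframe(rows, titles=['group', 'cls'])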
def _convert_python_to_R(data: typing.Union[dict, pd.DataFrame]):
    """
    Converts a python object to an R object brms can handle:

        * python dict      -> R list
        * python dataframe -> R dataframe
    """
    with localconverter(default_converter + pandas2ri.converter +
                        numpy2ri.converter) as cv:
        if isinstance(data, pd.DataFrame):
            return DataFrame(data)
        elif isinstance(data, dict):
            return ListVector(data)
        else:
            raise ValueError("Data should be either a pandas dataframe or a dictionary")
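# A minimal usage sketch (not from the original source; assumes pandas
# imported as pd, as above): a dict becomes an R named list, a DataFrame an
# R data.frame that brms can consume.
r_list = _convert_python_to_R({'mu': 0.0, 'sigma': 1.0})
r_df = _convert_python_to_R(pd.DataFrame({'y': [1.2, 0.7, 1.9], 'x': [0, 1, 2]}))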
def create_variable_in_R(self, variable_name, value):
    if isinstance(value, (list, np.ndarray, pd.Series)):
        self._R.globalenv[variable_name] = RActor.python_type_to_R_type(value)
    elif isinstance(value, pd.DataFrame):
        indexes = value.index
        rownames = ''.join(["c('", "','".join(list(indexes)), "')"])
        value_dict = value.to_dict('list')
        for key in value_dict:
            value_dict[key] = RActor.python_type_to_R_type(value_dict[key])
        self._R.globalenv[variable_name] = DataFrame(value_dict)
        self._R.r(''.join(['rownames(', variable_name, ') <- ', rownames]))
    else:
        self._R.globalenv[variable_name] = value
def _comparisons_dataframe(self):
    # col = ('Label.1', 'Label.2', 'win1', 'win2')
    # data = zip(col, [*self.comparison_items, *self.comparison_wins])
    # return DataFrame(OrdDict([data]))
    column_comp1 = ('Label.1', FactorVector(self.comparison_items[0],
                                            levels=StrVector(self.items)))
    column_comp2 = ('Label.2', FactorVector(self.comparison_items[1],
                                            levels=StrVector(self.items)))
    column_win1 = ('win1', FloatVector(self.comparison_wins[0]))
    column_win2 = ('win2', FloatVector(self.comparison_wins[1]))
    return DataFrame(OrdDict([column_comp1, column_comp2,
                              column_win1, column_win2]))
def Run(self):
    if self.filetype != "anova" and self.filetype != "zinb":
        print("filetype not recognized: %s" % self.filetype)
        sys.exit(-1)

    headers = None
    data, hits = [], []
    n = -1  # number of conditions

    for line in open(self.infile):
        w = line.rstrip().split('\t')
        if line[0] == '#' or ('pval' in line and 'padj' in line):
            # check for 'pval' for backwards compatibility
            headers = w
            continue  # keep last comment line as headers
        # assume first non-comment line is header
        if n == -1:
            # ANOVA header line has names of conditions, organized as
            # 3+2*n+3 (2 groups (means, LFCs) X n conditions).
            # ZINB header line has names of conditions, organized as
            # 3+4*n+3 (4 groups X n conditions).
            if self.filetype == "anova":
                n = int((len(w) - 6) / 2)
            elif self.filetype == "zinb":
                n = int((len(headers) - 6) / 4)
            headers = headers[3:3 + n]
            headers = [x.replace("Mean_", "") for x in headers]
        else:
            lfcs = [float(x) for x in w[3 + n:3 + n + n]]  # take just the columns of LFCs
            qval = float(w[-2])
            data.append((w, lfcs, qval))

    data.sort(key=lambda x: x[-1])
    hits, LFCs = [], []
    for k, (w, lfcs, qval) in enumerate(data):
        if (self.topk == -1 and qval < self.qval) or \
           (self.topk != -1 and k < self.topk):
            hits.append(w)
            LFCs.append(lfcs)

    print("heatmap based on %s genes" % len(hits))
    genenames = ["%s/%s" % (w[0], w[1]) for w in hits]
    hash = {}
    headers = [h.replace("Mean_", "") for h in headers]
    for i, col in enumerate(headers):
        hash[col] = FloatVector([x[i] for x in LFCs])
    df = DataFrame(hash)
    heatmapFunc = self.make_heatmapFunc()
    heatmapFunc(df, StrVector(genenames), self.outfile)
def loadfiles(self):
    """ Load files into R environment """
    rcount = 0
    asmatrix = robjects.r['as.matrix']
    diag = robjects.r['diag']
    names = robjects.r['names']

    ## Set the default parameters for reading from csv
    param = {'header': True, 'as_is': True, 'row.names': ri.NULL}

    ## Override the defaults with any user-supplied parameters
    for p in param.keys():
        if p in self.param:
            if self.param[p] is not None:
                param[p] = self.param[p]

    for f, s in zip(self.filelist, self.seplist):
        try:
            dataf = DataFrame.from_csvfile(f, sep=str(s),
                                           header=param['header'],
                                           as_is=param['as_is'],
                                           row_names=param['row.names'])
            dataf = asmatrix(dataf)

            # Should the diagonal be set to 0?
            # Do it for all the inputs, just to be sure.
            zcount = 0
            for i in xrange(dataf.ncol):
                if (dataf.rx(i + 1, i + 1)[0] - 0.0 >= 1e-8):
                    zcount += 1
                    dataf.rx[i + 1, i + 1] = 0
            if zcount:
                self.e += f

            self.mylist.append(dataf)
            rcount += 1
        except IOError, e:
            self.error += e
        except RRuntimeError, e:
            self.error += e
def measure_design(self, design, response, fixed_factors):
    info("Measuring design of size " + str(len(design)))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))
    info("Initial Factors: " + str(initial_factors))

    for line in range(len(design[0])):
        design_line = [int(v[0]) for v in design.rx(line + 1, True)]

        candidate = [0] * len(initial_factors)

        for k, v in fixed_factors.items():
            candidate[initial_factors.index(k)] = int(v)

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        info("Initial Design Line: " + str(design_line))
        info("Fixed Factors: " + str(fixed_factors))
        info("Testing candidate " + str(line + 1) + ": " + str(candidate))

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(float('inf'))

    info("Measurements: " + str(measurements))

    design = self.base.cbind(
        design, DataFrame({response[0]: FloatVector(measurements)}))
    design = design.rx(self.base.is_finite(design.rx2(response[0])), True)

    info(str(design))
    return design
def plot_lrt_histograms(stats, output_file, **kw):
    gtr = []
    general = []
    gtrplusgamma = []
    for triad in stats:
        for r in stats[triad]:
            for o, l in zip((0, 2, 4), (general, gtr, gtrplusgamma)):
                p = lrt_p(r[1 + o]['ll'], r[o]['ll'], r[1 + o]['df'], r[o]['df'])
                l.append(p)

    intercepts = []
    for n, v in zip(('General', 'GTR+Gamma', 'GTR'), (general, gtrplusgamma, gtr)):
        i = sum(1 for p in v if p <= 0.05) / len(v)
        intercepts.append(str(np.round(i, 2)))
        print n, min(v), max(v), i, len(v)
    intercepts = 'c(' + ','.join(intercepts) + ')'

    n = len(gtr)
    globalenv['df'] = DataFrame({
        'Model': StrVector(['general'] * n + ['gtrplusgamma'] * n + ['gtr'] * n),
        'pvalue': FloatVector(general + gtrplusgamma + gtr)
    })
    cmd = 'gg <- ggplot(df, aes(pvalue, group=Model, linetype=Model)) + ' + \
          'stat_ecdf(geom="line") + ' + \
          'xlab("LRT p-value") + ylab("Empirical CDF") + ' + \
          'theme(legend.position = c(0.85, 0.15)) + ' + \
          'scale_x_continuous(breaks=c(0.05,seq(0.25,1,by=0.25)), limits=c(0,1)) + ' + \
          'scale_y_continuous(breaks=c(' + intercepts + \
          ', seq(0,1,by=0.25)), limits=c(0,1)) + ' + \
          'scale_linetype_discrete(labels=expression(General, GTR, GTR+Gamma))'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
def is_strong_iv(df, idx):
    df = df.loc[df['index'].isin(idx)]
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})
    #print(dataf)
    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)
    # Rule of thumb: a first-stage F statistic of at least 10 indicates a
    # strong instrument.
    return py_frame.iloc[1, 4] >= 10
def measure_design(self, design):
    info("Measuring design of size " + str(len(design[0])))

    design_names = [str(n) for n in self.base.names(design)]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))

    for line in range(1, len(design[0]) + 1):
        if type(design.rx(line, True)[0]) is int:
            design_line = [v for v in design.rx(line, True)]
        else:
            design_line = [int(v[0]) for v in design.rx(line, True)]

        candidate = [0] * len(initial_factors)

        for k, v in self.model["fixed_factors"].items():
            candidate[initial_factors.index(k)] = int(v)

        for i in range(len(design_names)):
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        measurement = self.getPerfCosts([candidate])

        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(float('inf'))

    design = self.base.cbind(
        design, DataFrame({self.model["response"]: FloatVector(measurements)}))
    design = design.rx(
        self.base.is_finite(design.rx2(self.model["response"])), True)

    info("Complete design, with measurements:")
    info(str(design))
    return design
def total_concentration(df):
    Y = df["treated"]
    X0 = df["0"]
    X1 = df["1"]
    X2 = df["2"]
    X3 = df["3"]
    X4 = df["4"]
    X5 = df["5"]
    X6 = df["6"]
    X7 = df["7"]
    X8 = df["8"]
    X9 = df["9"]
    IV = df["iv"]
    formula = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + iv')
    formula2 = Formula(
        'treated ~ x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9')
    dataf = DataFrame({'treated': robjects.IntVector(Y),
                       'x0': robjects.IntVector(X0),
                       'x1': robjects.IntVector(X1),
                       'x2': robjects.IntVector(X2),
                       'x3': robjects.IntVector(X3),
                       'x4': robjects.IntVector(X4),
                       'x5': robjects.IntVector(X5),
                       'x6': robjects.IntVector(X6),
                       'x7': robjects.IntVector(X7),
                       'x8': robjects.IntVector(X8),
                       'x9': robjects.IntVector(X9),
                       'iv': robjects.IntVector(IV)})
    fit = robjects.r.lm(formula=formula, data=dataf)
    fit2 = robjects.r.lm(formula=formula2, data=dataf)
    r_frame = robjects.r.anova(fit, fit2)
    py_frame = pandas2ri.ri2py_dataframe(r_frame)
    print("total concentration parameter: " + str(py_frame.iloc[1, 4]))
def fit(self, training_data, target):
    """
    :param training_data: a pandas dataframe.
    :param target: a string referring to the target variable to be predicted
    :return: an rpart model
    """
    self.outcome = target

    # Convert to the proper format for the R functions.
    train_data = DataFrame(training_data)
    formula = target + " ~ ."

    # TODO: hyperparameter incorporation
    # train the model
    self.model = rparty.rpart(
        formula=formula,
        data=train_data,
        method="class"
        # control = rparty.rpart_control(minsplit = ?, cp = ?)
    )
    return self.model
search_space_database = dataset.connect(
    "sqlite:///search_space_{0}.db".format(self.seed_space_size))

for experiment in search_space_database['experiments']:
    search_space.append(eval(experiment["value"]))

info("Starting DOPT-anova")

r_search_space = {}
for i in range(len(search_space[0])):
    r_row = [self.dim_uplimits[i] - 1, 0]
    for col in search_space:
        r_row.append(col[i])
    r_search_space[initial_factors[i]] = IntVector(r_row)

data = DataFrame(r_search_space)
data = data.rx(StrVector(initial_factors))

self.dopt_anova(initial_factors, initial_inverse_factors, data)

sys.exit()

# Unreachable after sys.exit(); kept from the original random-search path.
perf_cost, mean_perf_cost = self.MAXFLOAT, self.MAXFLOAT
params = self.coordToPerfParams(coord)

end_time = time.time()
# Elapsed time; the original subtracted in the wrong order (start - end).
search_time = end_time - start_time

speedup = float(eval_cost[0]) / float(best_perf_cost)
search_time = time.time() - start_time

info('----- end random search -----')
def run_zinb(self, data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap,
             conditions, covariates, interactions):
    """
        Runs Zinb for each gene across conditions and returns p and q values
        ([[Wigdata]], [Gene], [Number], [Number], {Rv: [SiteIndex]}, [Condition], [Covar], [Interaction]) -> Tuple([Number], [Number], [Status])
        Wigdata :: [Number]
        Gene :: {start, end, rv, gene, strand}
        SiteIndex: Integer
        Condition :: String
        Covar :: String
        Interaction :: String
        Status :: String
    """
    count = 0
    self.progress_range(len(genes))
    pvals, Rvs, status = [], [], []
    r_zinb_signif = self.def_r_zinb_signif()
    if (self.winz):
        self.transit_message("Winsorizing and running analysis...")

    self.transit_message("Condition: %s" % self.condition)

    comp1a = "1+cond"
    comp1b = "1+cond"

    # include cond in mod0 only if testing interactions
    comp0a = "1" if len(self.interactions) == 0 else "1+cond"
    comp0b = "1" if len(self.interactions) == 0 else "1+cond"
    for I in self.interactions:
        comp1a += "*" + I
        comp1b += "*" + I
        comp0a += "+" + I
        comp0b += "+" + I
    for C in self.covars:
        comp1a += "+" + C
        comp1b += "+" + C
        comp0a += "+" + C
        comp0b += "+" + C

    zinbMod1 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp1a, comp1b)
    zinbMod0 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp0a, comp0b)

    nbMod1 = "cnt~%s" % (comp1a)
    nbMod0 = "cnt~%s" % (comp0a)

    toRFloatOrStrVec = lambda xs: FloatVector(
        [float(x) for x in xs]) if self.is_number(xs[0]) else StrVector(xs)

    for gene in genes:
        count += 1
        Rv = gene["rv"]

        ## Single gene case for debugging
        if (GENE):
            Rv = None
            if GENE in RvSiteindexesMap:
                Rv = GENE
            else:
                for g in genes:
                    if (g['gene'] == GENE):
                        Rv = g["rv"]
                        break
            if not Rv:
                self.transit_error("Cannot find gene: {0}".format(GENE))
                sys.exit(0)

        if (DEBUG):
            self.transit_message(
                "======================================================================")
            self.transit_message(gene["rv"] + " " + gene["gene"])

        if (len(RvSiteindexesMap[Rv]) <= 1):
            status.append("TA sites <= 1, not analyzed")
            pvals.append(1)
        else:
            # For winsorization
            # norm_data = self.winsorize((map(
            #     lambda wigData: wigData[RvSiteindexesMap[Rv]], data))) if self.winz else list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            norm_data = list(
                map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            ([readCounts, condition, covarsData, interactionsData,
              NZmean, logitZPerc]) = self.melt_data(
                  norm_data, conditions, covariates, interactions,
                  NZMeanByRep, LogZPercByRep)

            if (numpy.sum(readCounts) == 0):
                status.append(
                    "pan-essential (no counts in all conditions) - not analyzed")
                pvals.append(1)
            else:
                df_args = {
                    'cnt': IntVector(readCounts),
                    'cond': toRFloatOrStrVec(condition),
                    'NZmean': FloatVector(NZmean),
                    'logitZperc': FloatVector(logitZPerc)
                }
                ## Add columns for covariates and interactions if they exist.
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(covarsData[t_ic[0]])),
                    enumerate(self.covars))))
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(interactionsData[t_ic[0]])),
                    enumerate(self.interactions))))

                melted = DataFrame(df_args)
                # r_args = [IntVector(readCounts), StrVector(condition), melted, map(lambda x: StrVector(x), covars), FloatVector(NZmean), FloatVector(logitZPerc)] + [True]
                debugFlag = True if DEBUG or GENE else False
                pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0,
                                          nbMod1, nbMod0, debugFlag)
                status.append(msg)
                pvals.append(float(pval))

        if (DEBUG or GENE):
            self.transit_message(
                "Pval for Gene {0}: {1}, status: {2}".format(
                    Rv, pvals[-1], status[-1]))
        if (GENE):
            self.transit_message("Ran for single gene. Exiting...")
            sys.exit(0)

        Rvs.append(Rv)

        # Update progress
        text = "Running ZINB Method... %5.1f%%" % (100.0 * count / len(genes))
        self.progress_update(text, count)

    pvals = numpy.array(pvals)
    mask = numpy.isfinite(pvals)
    qvals = numpy.full(pvals.shape, numpy.nan)
    qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals)[1]  # BH, alpha=0.05

    p, q, statusMap = {}, {}, {}
    for i, rv in enumerate(Rvs):
        p[rv], q[rv], statusMap[rv] = pvals[i], qvals[i], status[i]
    return (p, q, statusMap)
def fit(
    latlon: NDArray[(2, Any), float],
    z: NDArray[(Any, ), float],
    nx: int,
    ny: int,
    extrap: bool,
) -> Tuple[NDArray[(Any, Any), float],
           NDArray[(Any, ), float],
           NDArray[(Any, ), float]]:
    """Encapsulates the functionality of R's spatialProcess in a Python function.
    Args:
        latlon: grid of pairwise coordinates of observations
        z: observations
        nx: number of grid cells on interpolated grid x
        ny: number of grid cells on interpolated grid y
        extrap: whether to extrapolate the fitted surface beyond the observations
        distance: distance metric to use (note: only 'geo' supported currently)
        variogram_model: choice of variogram model (note: only 'exponential'
            supported)
    Returns:
        z: kriged field
        x, y: locations of kriged data
    """
    if not isinstance(latlon, NDArray[(2, Any), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(2, Any), float]}"
        )
    if not isinstance(z, NDArray[(Any, ), float]):
        raise TypeError(
            f"Incorrect grid shape, size, or dtype. Must be {NDArray[(Any, ), float]}"
        )
    if not isinstance(nx, int) or not isinstance(ny, int):
        raise TypeError("Provide integer grid size")
    if latlon.shape[1] != z.size:
        raise ValueError("Different number of grid coordinates than observations")

    # convert regular numeric data
    latlon, z = latlon.tolist(), z.tolist()

    # convert latlon list into two R FloatVectors:
    # list of FloatVector -> OrderedDict -> R DataFrame -> numeric R data matrix
    r_lists = list(map(FloatVector, latlon))
    coords = OrderedDict(zip(map(str, range(len(r_lists))), r_lists))
    r_dataFrame = DataFrame(coords)
    r_latlon = robjects.r["data.matrix"](r_dataFrame)

    # convert observations
    r_z = FloatVector(z)

    # use separate simple r-script in path below
    rstring = resource_string("climpyrical",
                              "tests/data/spatial_process_r.R").decode("utf-8")
    rfunc = robjects.r(rstring)
    r_surface = rfunc(r_latlon, r_z, nx, ny, extrap)

    # extract data from R's interpolation
    surface_dict = dict(zip(r_surface.names, list(r_surface)))
    # z = np.array(list(r_surface[1]))
    z = np.array(surface_dict["z"]).reshape(nx, ny)
    x = np.array(surface_dict["x"])
    y = np.array(surface_dict["y"])
    # cov = dict(zip(surface_dict["cov"].names, list(surface_dict["cov"])))
    # cov = surface_dict["cov"]

    return z, x, y
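# A minimal usage sketch (not from the original source; assumes the nptyping
# NDArray types used in the signature and the spatial_process_r.R resource
# shipped with climpyrical). Five observation points kriged onto a 10x10 grid,
# no extrapolation.
latlon_obs = np.array([[49.1, 49.5, 50.0, 50.2, 49.8],
                       [-123.1, -122.8, -123.3, -122.5, -123.0]])
z_obs = np.array([1.0, 1.4, 0.9, 1.2, 1.1])
z_grid, x_grid, y_grid = fit(latlon_obs, z_obs, nx=10, ny=10, extrap=False)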
try:
    utils = importr('dplyr')
    utils = importr('gravity')
except Exception as exception:
    print(exception)
    import rpy2.robjects.packages as rpackages
    # Install the missing packages first; importing 'gravity' at this point
    # would fail again before installation, so only CRAN's 'utils' helpers
    # are loaded here.
    utils = rpackages.importr('utils')
    utils.chooseCRANmirror(ind=1)
    utils.install_packages('dplyr')
    utils.install_packages('gravity')
finally:
    from rpy2.robjects.packages import importr
    utils = importr('dplyr')
    utils = importr('gravity')

dataf = DataFrame({'label': IntVector(tuple(labels)),
                   'distance': FloatVector(tuple(data.T[0]))})
fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
        'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)
#print(fit)

# Deviance is -2.*log_likelihood
altDeviance = list(fit[9].items())[0][1]
nullDeviance = list(fit[11].items())[0][1]
p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

print('Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.')
print('Implemented in R in "gravity: Estimation Methods for Gravity Models" at:')
print('https://rdrr.io/cran/gravity/man/ppml.html')
print('Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
      'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
      'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
      'p-value:\t', '%.1e' % p_value, '\n')
# R fitdistr for Beta distribution: which starting parameters?
from rpy2.robjects import DataFrame
from rpy2.robjects.vectors import FloatVector

# Wrap the scalars in length-1 vectors so the DataFrame constructor accepts
# them; the stray closing parenthesis in the original call has been dropped.
starter = DataFrame({'shape1': FloatVector([0.5]), 'shape2': FloatVector([0.5])})
x = MASS.fitdistr(myValues, "beta", start=starter)
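# A hedged alternative sketch: R's fitdistr() conventionally takes `start` as
# a named list rather than a one-row data.frame, so an rpy2 ListVector may
# match the R API more closely. MASS and myValues are assumed from the snippet
# above.
from rpy2.robjects.vectors import ListVector

starter_list = ListVector({'shape1': 0.5, 'shape2': 0.5})
fit_beta = MASS.fitdistr(myValues, "beta", start=starter_list)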