def get_federov_data(self, factors):
    low_level_limits = IntVector([self.parameter_ranges[f][0] for f in factors])
    high_level_limits = IntVector([self.parameter_ranges[f][1] - 1 for f in factors])
    factor_centers = IntVector([0 for f in factors])
    factor_levels = IntVector([self.parameter_ranges[f][1] for f in factors])
    factor_round = IntVector([0 for f in factors])
    is_factor = BoolVector([False for f in factors])
    mix = BoolVector([False for f in factors])

    opt_federov_data = {
        "var": StrVector(factors),
        "low": low_level_limits,
        "high": high_level_limits,
        "center": factor_centers,
        "nLevels": factor_levels,
        "round": factor_round,
        "factor": is_factor,
        "mix": mix
    }

    opt_federov_dataframe = DataFrame(opt_federov_data)
    opt_federov_dataframe = opt_federov_dataframe.rx(
        StrVector(["var", "low", "high", "center", "nLevels", "round", "factor", "mix"]))
    return opt_federov_dataframe
def draw_hist(length, pdfname='hist.pdf', b=25, m=700, wd=8, hd=6):
    # length = d5
    # Count the lengths in 5-unit bins:
    # for z1, z2 in groupby(sorted(length), key=lambda x: x//5):
    #     print('{}-{}: {}'.format(z1*5, (z1+1)*5-1, len(list(z2))))
    # Alternative plot with matplotlib:
    # lenths = array(length)
    # pyplot.hist(x=lenths, bins=50)
    # pyplot.xlabel('Sequence Length')
    # pyplot.xlim(400, 500)
    # pyplot.ylabel('Sequence Number')
    # pyplot.title('Sequence Length Distribution')
    # pyplot.show()
    robjects.globalenv["dd"] = IntVector(length)
    robjects.globalenv["nm"] = StrVector([pdfname])
    robjects.globalenv["b"] = IntVector([b])
    robjects.globalenv["m"] = IntVector([m])
    robjects.globalenv["wd"] = IntVector([wd])
    robjects.globalenv["hd"] = IntVector([hd])
    r_script = '''
    library(ape)
    pdf(nm, width=wd, height=hd)
    xcol = seq(0, m, b)
    hist(dd, freq=TRUE, breaks=xcol, col='#228B22',
         xlab='Sequence Length', ylab='Sequence number',
         main='Distribution of Sequence Length')
    dev.off()
    '''
    robjects.r(r_script)
def _mark_timestamp(self, blSegsL):
    """ mark segs in final sample """
    # Use R (GenomicRanges/IRanges) to solve this.
    # First, build the set of baseline segments (or the differences between
    # adjacent baselines).
    # Then, map each non-baseline segment onto the baselines to determine
    # which baseline (if any) it falls into.
    # Finally, the earliest baseline index a data point falls into is used
    # as its target timestamp.
    from rpy2.robjects.packages import importr
    from rpy2.robjects import IntVector, StrVector, globalenv
    import rpy2.robjects as robjects

    GR = importr('GenomicRanges')
    IR = importr('IRanges')

    GRL = GR.GRangesList()
    globalenv["GRL"] = GRL
    for blSegs, idx in zip(blSegsL, range(len(blSegsL))):
        chromNames = StrVector([seg.chromName for seg in blSegs])
        starts = IntVector([seg.start for seg in blSegs])
        ends = IntVector([seg.end for seg in blSegs])
        tempGR = GR.GRanges(seqnames=chromNames, ranges=IR.IRanges(starts, ends))
        globalenv["tempGR"] = tempGR
        robjects.r("GRL[[{0}]]=tempGR".format(str(idx + 1)))
    GRL = robjects.r["GRL"]

    # The list stores references to the target Seg objects, so updating
    # nonBlSegs in place is sufficient.
    nonBlSegs = list(set(self._segPoolL[-1].segments) - set(blSegsL[-1]))
    chromNames = StrVector([seg.chromName for seg in nonBlSegs])
    starts = IntVector([seg.start for seg in nonBlSegs])
    ends = IntVector([seg.end for seg in nonBlSegs])
    nonBlGR = GR.GRanges(seqnames=chromNames, ranges=IR.IRanges(starts, ends))

    # fo = IR.findOverlaps(nonBlGR, GRL)
    # For large SCNA
    fo = IR.findOverlaps(nonBlGR, GRL, minoverlap=5000)
    globalenv["fo"] = fo
    robjects.reval("fom <- as.matrix(fo)")
    overlapIdx = np.array(list(robjects.r.fom)).reshape(
        tuple(reversed(robjects.r.fom.dim))) - 1
    # e.g. [[2, 2, 3, 3],
    #       [1, 2, 1, 2]]
    # print overlapIdx
    for index in set(overlapIdx[0, ]):
        yIdxes = np.where(overlapIdx[0, ] == index)[0]
        ts = np.max(overlapIdx[1, yIdxes] + 1)
        nonBlSegs[index].tag = str(ts)
def cox(hidden, survival, epoch, method='MG'):
    def clustering(hidden, method):
        if method == 'KNN':
            clf = cluster.KMeans(n_clusters=3)
            clf.fit(hidden)
            return clf.predict(hidden)
        if method == 'MG':
            clf = mixture.BayesianGaussianMixture(n_components=10, n_init=10,
                                                  max_iter=500, covariance_type='full')
            clf.fit(hidden)
            return clf.predict(hidden)

    predicts = clustering(hidden, method)
    T1, E1, G1 = [], [], []
    # print(predicts)
    unique, counts = np.unique(predicts, return_counts=True)
    # print(np.asarray((unique, counts/predicts.shape[0])).T)  # easier to grep
    clusters = np.asarray((unique, counts / predicts.shape[0])).T
    for i in clusters:
        print('Epoch {} # CLUSTER: {} '.format(epoch, i))
    sscore = silhouette_score(hidden, predicts)
    print('Epoch {} # SILHOUETTE: {}'.format(epoch, sscore))
    for i in range(len(predicts)):
        T1.append(survival[i, -2])
        E1.append(survival[i, -1])
    # temp = np.array(T1)
    # print(temp)
    # print(temp.astype(int))
    # print(T1)
    # print(E1)
    # print(predicts)
    info = pd.DataFrame({'status': T1, 'survive': E1, 'clusters': predicts})
    ratio = 0
    formula = Formula('x~y')
    env = formula.environment
    env['y'] = IntVector(predicts)
    env['x'] = surv.Surv(IntVector(np.array(T1).astype(int)),
                         IntVector(np.array(E1).astype(int)))
    result = surv.survdiff(formula)
    p_value = 1 - np.array(stats.pchisq(result[4], len(set(predicts)) - 1))  # R p-value
    # result = multivariate_logrank_test(np.array(T1), np.array(predicts).astype(int), np.array(E1))
    # p_value = result.p_value
    return p_value, ratio, info
def bargraph_language(results):
    r = robjects.r
    for language in languages:
        varis = []
        probs = []
        locs = []
        for (lang, prob, var) in results.keys():
            if lang == language:
                loc = results[(lang, prob, var)]
                varis.append(pretty_varis[var])
                probs.append(prob)
                locs.append(loc)

        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })
        # print(df)
        gp = ggplot2.ggplot(df)

        pp = gp + \
            ggplot2.aes_string(x='Problem', y='Lines', fill='Variation') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            robjects.r('ylab("Lines of Code")')
        pp.plot()
        r['dev.off']()
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by
    J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''
    np.random.seed(random_seed)
    # np.random.shuffle(labels)

    dataf = DataFrame({'label': IntVector(tuple(labels)),
                       'distance': FloatVector(tuple(values))})
    fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
            'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)

    # Deviance is -2. * log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print('Non-parametric method:', '\n\t',
              # ' Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
              # 'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
              # 'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
              'p-value:\t', '%.1e' % p_value, '\n')

    return p_value
def plot_bar(stats, output_file=None, **kw):
    names = [r['name'] for r in list(stats.values())[0][0]]
    with_rates = [r['with_rate'] for r in list(stats.values())[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]
    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)
    for d in by_dir:
        by_dir[d] = list(zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]]))
    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print(dataset)
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            print(names[j], sum(1 for _g in g if _g > 0.05) / len(g))
            alpha = max(alpha, get_alpha(g))
            print('Samples', len(g))
    labels = 'expression(' + ','.join(names) + ')'
    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #     'geom_jitter(alpha=0.2, size=1) + ' + \
    #     'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #     'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #     'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #     'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
          'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
          'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",' + str(alpha) + ')) + ' + \
          'scale_x_discrete(labels=' + labels + ') + ' + \
          'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
          'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print(R['gg'])
        input('Press Enter to continue...')
def generate_valid_sample(self, sample_size):
    search_space_dataframe = {}

    for n in self.axis_names:
        search_space_dataframe[n] = []

    search_space = {}
    evaluated = 0

    info("Generating valid search space of size {0} (does not spend evaluations)".format(sample_size))

    while len(search_space) < sample_size:
        candidate_point = self.getRandomCoord()
        candidate_point_key = str(candidate_point)
        evaluated += 1

        if candidate_point_key not in search_space:
            perf_params = self.coordToPerfParams(candidate_point)

            is_valid = eval(self.constraint, copy.copy(perf_params),
                            dict(self.input_params))

            if is_valid:
                search_space[candidate_point_key] = candidate_point

                for n in perf_params:
                    candidate_value = self.parameter_values[n].index(perf_params[n])
                    search_space_dataframe[n].append(candidate_value)

                if len(search_space) % int(sample_size / 10) == 0:
                    info("Valid coordinates: " + str(len(search_space)) + "/" + str(sample_size))
                    info("Tested coordinates: " + str(evaluated))

        if evaluated % 1000000 == 0:
            info("Tested coordinates: " + str(evaluated))

    info("Valid/Tested configurations: " + str(len(search_space)) + "/" + str(evaluated))

    for k in search_space_dataframe:
        search_space_dataframe[k] = IntVector(search_space_dataframe[k])

    search_space_dataframe_r = DataFrame(search_space_dataframe)
    search_space_dataframe_r = search_space_dataframe_r.rx(StrVector(self.axis_names))

    info("Generated Search Space:")
    info(str(self.base.summary_default(search_space_dataframe_r)))

    coded_search_space_dataframe_r = self.encode_data(search_space_dataframe_r)
    return coded_search_space_dataframe_r
def draw_hist(length, pdfname='hist.pdf', b=5, m=700, wd=8, hd=6):
    # length = d5
    robjects.globalenv["dd"] = IntVector(length)
    robjects.globalenv["nm"] = StrVector([pdfname])
    robjects.globalenv["b"] = IntVector([b])
    robjects.globalenv["m"] = IntVector([m])
    robjects.globalenv["wd"] = IntVector([wd])
    robjects.globalenv["hd"] = IntVector([hd])
    r_script = '''
    library(ape)
    pdf(nm, width=wd, height=hd)
    xcol = seq(0, m, b)
    hist(dd, freq=TRUE, breaks=xcol, col='#228B22',
         xlab='Sequence Length', ylab='Sequence number',
         main='Distribution of Sequence Length')
    dev.off()
    '''
    robjects.r(r_script)
def rank_abundance_data(counter):
    n = len(counter)
    ranks = IntVector(range(1, n + 1))
    counts = [c for (i, c) in counter.most_common()]
    counts_sum = sum(counts)
    fracs_arr = [(c / counts_sum) for c in counts]
    fracs = FloatVector(fracs_arr)
    return ranks, fracs
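# Hypothetical usage sketch (not from the source) for rank_abundance_data above,
# assuming a collections.Counter of species counts; names and numbers are illustrative.
from collections import Counter

species_counts = Counter({'speciesA': 120, 'speciesB': 45, 'speciesC': 12})
ranks, fracs = rank_abundance_data(species_counts)
# ranks is an R IntVector 1..n; fracs is an R FloatVector of relative abundances
print(list(ranks), list(fracs))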
def loess_fromR(x, y, f, d=2):
    x_vector = IntVector(x)
    y_vector = FloatVector(y)
    globalenv["x_vector"] = x_vector
    globalenv["y_vector"] = y_vector
    globalenv["f"] = f
    a = round(f, 2) if round(f, 2) > 0.0 else f
    model = stats.loess('y_vector~x_vector', span=a, degree=d)
    return model.rx2('fitted')
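# Minimal usage sketch (illustrative, not from the source) for loess_fromR above;
# it assumes the same module-level imports as the function (stats = importr('stats'),
# globalenv, IntVector, FloatVector). The data below are made up.
x = list(range(1, 21))
y = [xi ** 0.5 + 0.1 * (xi % 3) for xi in x]
fitted = loess_fromR(x, y, f=0.5, d=2)
print(list(fitted))   # smoothed values returned from the R loess fit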
def ro(self):
    """Expose a view as RObject, to be manipulated in R environment"""
    # Convert to an R vector of the matching data type
    if isinstance(self.iloc, dict):
        out = ListVector([(None, PyR(v).ro) for v in self.iloc])
    if types.is_float_dtype(self.iloc):
        out = FloatVector(self.iloc.reshape(-1, order='F'))
    elif types.is_integer_dtype(self.iloc):
        out = IntVector(self.iloc.reshape(-1, order='F'))
    else:
        out = StrVector(self.iloc.reshape(-1, order='F'))

    if len(self.dim) > 1:
        # reshape to an R array if the view has a non-trivial dim
        out = ro.r.array(out, dim=IntVector(self.dim))

    # Collect R object name attributes
    if hasattr(self, 'rownames'):
        out.rownames = StrVector(self.rownames)
    if hasattr(self, 'colnames'):
        out.colnames = StrVector(self.colnames)
    if hasattr(self, 'names'):
        out.names = ListVector(self.names) if isinstance(self.names, ListVector) \
            else StrVector(self.names)
    return out
def estimate_cpu_writes(ops_observed):
    stats = importr('stats')
    ops = IntVector([5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 1500,
                     2000, 2500, 3000, 3500, 4000, 4500, 5000])
    # the measured CPU values are fractional, so store them in a FloatVector
    cpu_used = FloatVector([1.5733, 1.5944, 2.09, 2.346, 2.488, 2.596, 4.925,
                            6.956, 9.02, 10.75, 16.06, 20.74, 25.2100192678,
                            30.2011560694, 35.0838150289, 38.0040540541,
                            42.8837545126, 47.6525096525])
    TYPE = StrVector(["monoH.FC"])
    ro.globalenv["ops"] = ops
    ro.globalenv["cpu_used"] = cpu_used
    ro.globalenv["TYPE"] = TYPE
    splinefun = ro.r['splinefun']
    sp_w = splinefun(ops, cpu_used, TYPE)
    res = float(sp_w(ops_observed).r_repr())
    print('\ncpu_writes: ' + str(res))
    return res
def python_type_to_R_type(cls, pobject=None):
    if isinstance(pobject, (list, np.ndarray, pd.Series)):
        if isinstance(pobject, (list, pd.Series)):
            pobject = np.array(pobject)
        if re.match('^int', pobject.dtype.name) is not None:
            return IntVector(pobject)
        elif re.match('^float', pobject.dtype.name) is not None:
            return FloatVector(pobject)
        elif re.match('^str', pobject.dtype.name) is not None:
            return StrVector(pobject)
        elif re.match('^bool', pobject.dtype.name) is not None:
            return StrVector(pobject)
        else:
            return pobject
    else:
        return pobject
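# Illustrative usage sketch (not from the source), assuming python_type_to_R_type is
# a classmethod on some converter class called Converter here (the name is hypothetical).
import numpy as np

print(Converter.python_type_to_R_type(np.array([1, 2, 3])))    # -> IntVector
print(Converter.python_type_to_R_type(np.array([0.5, 1.5])))   # -> FloatVector
print(Converter.python_type_to_R_type(['a', 'b']))             # -> StrVector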
def bargraph_language():
    r = robjects.r
    for language in languages:
        varis = []
        probs = []
        times = []
        for prob in problems:
            for var in variations:
                try:
                    time = result[language][prob][var]
                except KeyError:
                    time = 0

                # for the expert times, add expert and non-expert times together
                if var.startswith('expert'):
                    try:
                        time = time + result[language][prob][var.replace('expert', '')]
                    except KeyError:
                        pass

                varis.append(pretty_varis[var])
                probs.append(prob)
                times.append(time)

        r.pdf('bargraph-codingtime-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Time': IntVector(times),
        })
        # print(df)
        gp = ggplot2.ggplot(df)

        pp = gp + \
            ggplot2.aes_string(x='Problem', y='Time', fill='Variation') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            robjects.r('ylab("Coding time (in minutes)")')
        pp.plot()
        r['dev.off']()
def create_variables(self, variables=None):
    """
    Create variables in the R global environment.

    :param dict, pd.DataFrame variables: variable data
    :return: None
    """
    # Collect the variable names into variable_names
    if isinstance(variables, dict):
        variable_names = variables.keys()
    elif isinstance(variables, pd.DataFrame):
        variable_names = list(variables.columns)
    else:
        print('vars is not dict or dataframe!')
        variable_names = []

    for var_name in variable_names:
        if isinstance(variables[var_name], (list, pd.Series)):
            if isinstance(variables[var_name][0], (int, np.int8)):
                self.R.globalenv[var_name] = IntVector(variables[var_name])
            elif isinstance(variables[var_name][0], (float, np.float32)):
                self.R.globalenv[var_name] = FloatVector(variables[var_name])
        else:
            self.R.globalenv[var_name] = variables[var_name]
import rpy2

exact = importr("exact2x2")
INF = float('inf')
fstring = '%.' + str(pdecimal) + 'f'
estring = '%.' + str(pdecimal) + 'e'
# print(fstring)
for line in sys.stdin:
    line = line.strip()
    if line:
        ss = line.split()
        try:
            c1 = [int(ss[x]) for x in t1Index]
            c2 = [int(ss[x]) for x in t2Index]
            # print(IntVector(c1+c2))
            m = rpy2.robjects.r.matrix(IntVector(c1 + c2), nrow=2, byrow="T")
            # print(ci_alpha)
            r = exact.exact2x2(m, tsmethod="minlike",
                               alternative=alternative, conf_level=ci_alpha)
            out = []
            out.append(r[r.names.index('p.value')][0])
            out.append(r[r.names.index('estimate')][0])
            out.append(r[r.names.index('conf.int')][0])
            out.append(r[r.names.index('conf.int')][1])
            if out[1] == 0 or out[1] == INF:
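# The block above is truncated; this is a hedged, self-contained sketch of the same
# exact2x2 call pattern on a single hard-coded 2x2 table (the counts are illustrative).
import rpy2.robjects as robjects
from rpy2.robjects import IntVector
from rpy2.robjects.packages import importr

exact = importr("exact2x2")
# rows: condition A (12 successes, 5 failures), condition B (7 successes, 14 failures)
m = robjects.r.matrix(IntVector([12, 5, 7, 14]), nrow=2, byrow=True)
res = exact.exact2x2(m, tsmethod="minlike", alternative="two.sided", conf_level=0.95)
print(res.rx2('p.value')[0], res.rx2('estimate')[0],
      res.rx2('conf.int')[0], res.rx2('conf.int')[1])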
import rpy2.robjects as R
from rpy2.robjects import IntVector

# Ten guinea pigs were given a fattening treatment with a certain feed.
# Their weights in grams, before and after the test, are given below
# (assumed to come from normal distributions).
# At the 1% significance level, can we conclude that the feed increased the
# animals' mean weight?
# Before:
# 635, 704, 662, 560, 603, 745, 698, 575, 633, 669
# After:
# 640, 712, 681, 558, 610, 740, 707, 585, 635, 682

# paired t test
antes = IntVector([635, 704, 662, 560, 603, 745, 698, 575, 633, 669])
depois = IntVector([640, 712, 681, 558, 610, 740, 707, 585, 635, 682])

result = R.r['t.test'](antes, depois, alternative='two.sided', alpha=0.01, paired=True)

import ipdb
ipdb.set_trace()

print(result.rx('p.value')[0][0])
print(result.rx('statistic')[0][0])
print(result.rx('parameter')[0][0])
print(result.rx('estimate')[0][0])

final_result = 1 if result.rx('p.value')[0][0] < 0.01 else 0
base = importr('base')
print(dir(base))
print(base.pi)

# Get a variable from R; this calls the __getitem__() method
pi = robjects.r['pi']
print(pi, type(pi), len(pi), pi[0], pi.r_repr(), type(pi.r_repr()), float(pi.r_repr()))

# R vector representation
res = robjects.StrVector(['abc', 'def'])

# R functions: arguments that are vectors, data frames or matrices must be converted first
# rsort = robjects.r['sort']
rsort = importr('base').sort
# Wrong: rsort([3, 2, 4])
print(rsort(IntVector([3, 2, 4])), type(rsort(IntVector([3, 2, 4]))))

# When calling R functions, the '.' in R argument names becomes '_' in Python
print(base.rank(0, na_last=True))

# An OLS example
# Import the R stats package
stats = importr('stats')
# The argument list of an R function can be inspected
print(tuple(stats.rnorm.formals().names))
# Create the variables on the R side
ctl = FloatVector([4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53, 5.33, 5.14])
trt = FloatVector([4.81, 4.17, 4.41, 3.59, 5.87, 3.83, 6.03, 4.89, 4.32, 4.69])
# Call R's gl() function to build the grouping factor
group = base.gl(2, 10, 20, labels=["Ctl", "Trt"])
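# A minimal sketch (assumed continuation, not part of the original) finishing the OLS
# example above: fit weight ~ group with R's lm() through the imported stats package.
from rpy2.robjects import Formula

fmla = Formula('weight ~ group')
env = fmla.environment
env['weight'] = ctl + trt          # rpy2 vectors concatenate with '+'
env['group'] = group
fit = stats.lm(fmla)
print(base.summary(fit))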
def line_plot(cfg, var, control, change_name, changing, selector, base_selector, basis):
    speedups = []
    thrds = []
    changes = []
    lowers = []
    uppers = []

    for n in cfg.threads:
        probs.append('ideal')
        langs.append('ideal')
        speedups.append(n)
        thrds.append(n)
        changes.append('ideal')
        lowers.append(n)
        uppers.append(n)

    for c in changing:
        sel = selector(c)

        # sequential base
        base = FloatVector(base_selector(c))
        # base with p = 1
        base_p1 = FloatVector(sel(1))

        # use fastest sequential program
        if basis == 'fastest' and mean(base_p1) < mean(base):
            base = base_p1
        elif basis == 'seq':
            pass
        elif basis == 'p1':
            base = base_p1

        for n in cfg.threads:
            ntimes = FloatVector(sel(n))

            # ratio confidence interval
            labels = ['Base'] * r.length(base)[0] + ['N'] * r.length(ntimes)[0]
            df = DataFrame({'Times': base + ntimes, 'Type': StrVector(labels)})
            ratio_test = r['pairwiseCI'](r('Times ~ Type'), data=df,
                                         control='N', method='Param.ratio',
                                         **{'var.equal': False, 'conf.level': 0.999})[0][0]

            lowers.append(ratio_test[1][0])
            uppers.append(ratio_test[2][0])

            mn = mean(ntimes)
            speedups.append(mean(base) / mn)
            # plot slowdowns
            # speedups.append (-mn/base)  # (base / mn)
            thrds.append(n)
            if change_name == 'Language':
                changes.append(pretty_langs[c])
            else:
                changes.append(c)

    df = DataFrame({'Speedup': FloatVector(speedups),
                    'Threads': IntVector(thrds),
                    change_name: StrVector(changes),
                    'Lower': FloatVector(lowers),
                    'Upper': FloatVector(uppers)
                    })

    ideal_changing = ['ideal']
    if change_name == 'Language':
        ideal_changing.extend([pretty_langs[c] for c in changing])
    else:
        ideal_changing.extend(changing)

    legendVec = IntVector(range(len(ideal_changing)))
    legendVec.names = StrVector(ideal_changing)

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='Upper', ymin='Lower')
    dodge = ggplot2.position_dodge(width=0.9)

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=3) + \
        ggplot2.aes_string(x='Threads', y='Speedup', group=change_name,
                           color=change_name, shape=change_name) + \
        ggplot2.scale_shape_manual(values=legendVec) + \
        ggplot2.geom_errorbar(limits, width=0.25) + \
        ggplot2_options() + \
        ggplot2_colors() + \
        ggplot2.opts(**{'axis.title.x': ggplot2.theme_text(family='serif', face='bold', size=15, vjust=-0.2)}) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Cores")')
    # ggplot2.xlim (min(threads), max(threads)) + ggplot2.ylim(min(threads), max(threads)) +\

    pp.plot()
    r['dev.off']()
def as_dataframe(cfg, results, basis):
    r = robjects.r
    varis = []
    langs = []
    probs = []
    times = []
    threads = []

    # speedups, with upper and lower bounds below
    speedups = []
    speedup_lowers = []
    speedup_uppers = []

    ses = []   # standard errors
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')

    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        for lang in langs_ideal:  # cfg.languages:
            for prob in probs_ideal:  # cfg.problems:
                for thread in cfg.threads:

                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    elif lang == 'ideal' or prob == 'ideal':
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)  # pretty_varis [var])
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    if var.find('seq') >= 0:
                        thread = cfg.threads[-1]

                    vals = FloatVector(results[thread][prob][var][lang][0])
                    time = mean(vals)
                    times.append(time)

                    #
                    # time confidence interval
                    #
                    t_result = r['t.test'](FloatVector(vals),
                                           **{"conf.level": 0.999}).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    #
                    # memory usage
                    #
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                        mems.append(float(mem))

                    # we include dummy data for the sequential case to avoid the
                    # speedup calculation below
                    if var.find('seq') >= 0:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    #
                    # speedup values and confidence intervals
                    #
                    seq_vals = results[cfg.threads[-1]][prob][var.replace('par', 'seq')][lang][0]

                    # sequential base
                    base = FloatVector(seq_vals)
                    # base with p = 1
                    base_p1 = FloatVector(results[1][prob][var][lang][0])

                    # use fastest sequential program
                    if basis == 'fastest' and mean(base_p1) < mean(base):
                        base = base_p1
                    elif basis == 'seq':
                        pass
                    elif basis == 'p1':
                        base = base_p1

                    labels = ['Base'] * r.length(base)[0] + ['N'] * r.length(vals)[0]
                    df = DataFrame({'Times': base + vals, 'Type': StrVector(labels)})
                    ratio_test = r['pairwiseCI'](r('Times ~ Type'), data=df,
                                                 control='N', method='Param.ratio',
                                                 **{'var.equal': False})[0][0]

                    speedups.append(mean(base) / time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({'Language': StrVector(langs),
                             'Problem': StrVector(probs),
                             'Variation': StrVector(varis),
                             'Threads': IntVector(threads),
                             'Time': FloatVector(times),
                             'SE': FloatVector(ses),
                             'Speedup': FloatVector(speedups),
                             'SpeedupLower': FloatVector(speedup_lowers),
                             'SpeedupUpper': FloatVector(speedup_uppers),
                             'Mem': FloatVector(mems)
                             })

    r.assign('df', df)
    r('save (df, file="performance.Rda")')

    # reshape the data so that Variation is no longer a column of its own, but part
    # of the names of the other columns (Time, Speedup, etc.).
    #
    # also, remove the 'ideal' problem as we don't want it in this plot.
    df = r('''
    redf = reshape (df,
                    timevar="Variation",
                    idvar = c("Language","Problem","Threads"),
                    direction="wide")
    redf$Problem <- factor(redf$Problem,
                           levels = c("randmat","thresh","winnow","outer","product","chain"))
    redf[which(redf$Problem != "ideal"),]
    ''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)

    change_name = 'Language'

    legendVec = IntVector(range(len(langs_ideal)))
    legendVec.names = StrVector(langs_ideal)

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='SpeedupUpper.expertpar', ymin='SpeedupLower.expertpar')
    dodge = ggplot2.position_dodge(width=0.9)

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=2.5) + \
        robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') + \
        ggplot2.aes_string(x='Threads', y='Speedup.expertpar', group=change_name,
                           color=change_name, shape=change_name) + \
        ggplot2.geom_errorbar(limits, width=0.25) + \
        ggplot2.opts(**{'axis.title.x': ggplot2.theme_text(family='serif', face='bold', size=10, vjust=-0.2),
                        'axis.title.y': ggplot2.theme_text(family='serif', face='bold', size=10, angle=90, vjust=0.2),
                        'axis.text.x': ggplot2.theme_text(family='serif', size=10),
                        'axis.text.y': ggplot2.theme_text(family='serif', size=10),
                        'legend.title': ggplot2.theme_text(family='serif', face='bold', size=10),
                        'legend.text': ggplot2.theme_text(family='serif', size=10),
                        'strip.text.x': ggplot2.theme_text(family='serif', size=10),
                        'aspect.ratio': 1,
                        }) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Number of cores")') + \
        ggplot2.facet_wrap('Problem', nrow=2)

    pp.plot()
    r['dev.off']()
try:
    utils = importr('dplyr')
    utils = importr('gravity')
except Exception as exception:
    print(exception)
    import rpy2.robjects.packages as rpackages
    utils = rpackages.importr('utils')
    utils = rpackages.importr('gravity')
    utils.chooseCRANmirror(ind=1)
    utils.install_packages('dplyr')
    utils.install_packages('gravity')
finally:
    from rpy2.robjects.packages import importr
    utils = importr('dplyr')
    utils = importr('gravity')

dataf = DataFrame({'label': IntVector(tuple(labels)),
                   'distance': FloatVector(tuple(data.T[0]))})
fit = R('function(x) ppml(dependent_variable="label", distance="distance", '
        'additional_regressors=NULL, robust=TRUE, data=x)')(dataf)
# print(fit)

# Deviance is -2. * log_likelihood
altDeviance = list(fit[9].items())[0][1]
nullDeviance = list(fit[11].items())[0][1]
p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

print('Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.')
print('Implemented in R in "gravity: Estimation Methods for Gravity Models" at:')
print('https://rdrr.io/cran/gravity/man/ppml.html')
print('Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
      'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
      'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
      'p-value:\t', '%.1e' % p_value, '\n')
            # Mod
            soundType.append(1)
            # Saline
            muscimol.append(0)

import rpy2
from rpy2.robjects.packages import importr
lme4 = importr('lme4')
from rpy2.robjects import IntVector, Formula

allnCorr = IntVector(allnCorr)
allnVal = IntVector(allnVal)
allfracCorr = IntVector(allfracCorr)
sessionInds = IntVector(sessionInds)
animalInds = IntVector(animalInds)
soundType = IntVector(soundType)
muscimol = IntVector(muscimol)

model = Formula('allnCorr/allnVal ~ sessionInds + (1 | animalInds), weights=allnVal, family=binomial')
env = model.environment
env['allnCorr'] = allnCorr
env['allnVal'] = allnVal
env['sessionInds'] = sessionInds
env['animalInds'] = animalInds
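# The fragment above builds a formula string that standard R formula syntax does not
# accept (weights/family are function arguments, not formula terms). A hedged sketch of
# one way the intended binomial mixed model could be fit with lme4 via rpy2; variable
# names follow the fragment and the data are assumed to be the IntVectors created above.
import rpy2.robjects as robjects
from rpy2.robjects import DataFrame

df = DataFrame({'allnCorr': allnCorr, 'allnVal': allnVal,
                'sessionInds': sessionInds, 'animalInds': animalInds})
glmer_formula = Formula('cbind(allnCorr, allnVal - allnCorr) ~ sessionInds + (1 | animalInds)')
fit = lme4.glmer(glmer_formula, data=df, family='binomial')
print(robjects.r['summary'](fit))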
def run_all(site_type: str, gene_sets: Mapping[str, Path] = GENE_SETS,
            gene_set_filter: Tuple[int] = (5, 1000), correct=False, **kwargs):
    """Runs all active_pathways combinations for given site_type.

    Uses pan_cancer/clinvar Active Driver analyses results and all provided GMT gene sets.

    Args:
        site_type: site filter which will be passed to ActiveDriver analysis
        gene_sets: gene sets to be considered
        gene_set_filter: a two-tuple: (min, max) number of genes required
            to be in a gene set. If not set, the default of (5, 1000) is used

    Results are saved in `output_dir`.

    Returns:
        Mapping of directories with newly computed ActivePathways results
    """
    data_table = importr('data.table')
    paths = {}

    kwargs['geneset.filter'] = IntVector(gene_set_filter)

    for analysis in [active_driver.pan_cancer_analysis, active_driver.clinvar_analysis]:
        for gene_set in gene_sets:
            path = output_dir / analysis.name / gene_set / site_type

            # remove the old results (if any)
            rmtree(path, ignore_errors=True)
            # recreate dir
            path.mkdir(parents=True)

            path = path.absolute()

            ad_result = analysis(site_type)
            print(f'Preparing active pathways: {analysis.name} for {len(ad_result["all_gene_based_fdr"])} genes')
            print(f'Gene sets/background: {gene_set}')

            gene_sets_path = gene_sets[gene_set]
            if callable(gene_sets_path):
                gene_sets_path = gene_sets_path()

            result = run_active_pathways(ad_result, str(gene_sets_path),
                                         cytoscape_dir=path, correct=correct, **kwargs)

            data_table.fwrite(result, str(path / 'pathways.tsv'),
                              sep='\t', sep2=r.c('', ',', ''))

            paths[(analysis, gene_set)] = path

    return paths
def sequence_logo(pwm_or_seq, path: Path = None, width=369, height=149, dpi=72,
                  legend=False, renumerate=True, title: str = None, **kwargs):
    """Generate a sequence logo from a Position Weight Matrix (pwm) or a list of
    aligned sequences, and save it into a file if a path was provided.

    The logo will be generated with ggseqlogo (R).

    Args:
        pwm_or_seq: list of sequences or PWM matrix or dict where
            keys are names of facets and values are lists or PWMs
        path: where the file should be saved
        renumerate:
            change the labels of x axis to reflect relative position
            to the modified (central) residue (15-aa sequence is assumed)
        width: width in pixels
        height: height in pixels
        dpi: the DPI of the plotting device
        legend: whether and where the legend should be shown
        title: the title of the plot
    """
    gglogo = importr("ggseqlogo")
    ggplot2 = importr("ggplot2")

    if isinstance(pwm_or_seq, list):
        pwm_or_seq = StrVector(pwm_or_seq)
    elif isinstance(pwm_or_seq, dict):
        pwm_or_seq = TaggedList(pwm_or_seq.values(), pwm_or_seq.keys())

    theme_options = {
        'legend.position': legend or 'none',
        'legend.title': ggplot2.element_blank(),
        'legend.text': ggplot2.element_text(size=14),
        'legend.key.size': r.unit(0.2, 'in'),
        'plot.title': ggplot2.element_text(hjust=0.5, size=16),
        'axis.title.y': ggplot2.element_text(size=16),
        'text': ggplot2.element_text(size=20),
        'plot.margin': r.unit([0.03, 0.045, -0.2, 0.06], 'in'),
    }

    plot = GG(gglogo.ggseqlogo(pwm_or_seq, **kwargs)) + ggplot2.theme(**theme_options) + ggplot2.labs(y='bits')

    if renumerate:
        plot += ggplot2.scale_x_continuous(breaks=IntVector(range(1, 14 + 2)),
                                           labels=IntVector(range(-7, 7 + 1)))
    if title:
        plot += ggplot2.ggtitle(title)

    if path:
        ggplot2.ggsave(str(path), width=width / dpi, height=height / dpi,
                       dpi=dpi, units='in', bg='transparent')

    return plot
def __init__(self, params):
    self.base = importr("base")
    self.utils = importr("utils")
    self.stats = importr("stats")
    self.algdesign = importr("AlgDesign")
    self.car = importr("car")
    self.rsm = importr("rsm")
    self.dplyr = importr("dplyr")
    self.quantreg = importr("quantreg")
    self.dicekrig = importr("DiceKriging")
    self.diced = importr("DiceDesign")

    # numpy.random.seed(11221)
    # self.base.set_seed(11221)

    self.complete_design_data = None
    self.complete_search_space = None

    self.total_runs = 20
    orio.main.tuner.search.search.Search.__init__(self, params)

    self.name = "GPR"

    self.parameter_ranges = {}

    for i in range(len(self.params["axis_val_ranges"])):
        self.parameter_ranges[self.params["axis_names"][i]] = [
            0, len(self.params["axis_val_ranges"][i])
        ]

    info("Parameters: " + str(self.parameter_ranges))

    self.parameter_values = {}

    for i in range(len(self.params["axis_val_ranges"])):
        self.parameter_values[self.params["axis_names"][i]] = self.params["axis_val_ranges"][i]

    info("Parameter Real Ranges: " + str(self.axis_val_ranges))
    info("Parameter Range Values: " + str(self.parameter_values))

    self.range_matrix = {}

    for i in range(len(self.axis_names)):
        self.range_matrix[self.axis_names[i]] = IntVector(self.axis_val_ranges[i])

    self.range_matrix = ListVector(self.range_matrix)
    info("DataFrame Ranges: " + str(self.base.summary_default(self.range_matrix)))

    self.starting_sample = int(round(len(self.params["axis_names"]) + 2))
    self.steps = 22
    self.extra_experiments = int(round(len(self.params["axis_names"]) * 1))
    self.testing_set_size = 300000
    self.failure_multiplier = 100

    self.__readAlgoArgs()

    self.experiment_data = None
    self.best_points_complete = None

    if self.time_limit <= 0 and self.total_runs <= 0:
        err(('%s search requires search time limit or '
             'total number of search runs to be defined') % self.__class__.__name__)

    self.run_summary_database = dataset.connect("sqlite:///" + 'run_summary.db')
    self.summary = self.run_summary_database["dlmt_run_summary"]

    info("Starting sample: " + str(self.starting_sample))
    info("GPR steps: " + str(self.steps))
    info("Experiments added per step: " + str(self.extra_experiments))
    info("Initial Testing Set Size: " + str(self.testing_set_size))
    info("Constraints: " + str(self.constraint))
def factory(self, date, data, raw=False):
    if not raw:
        tdate = self.dateconvert
        date = IntVector([tdate(dt) for dt in date])
        # data = FloatVector(data)
    return self.r['zoo'](data, date)
def measure_design(self, encoded_design, step_number):
    design = self.rsm.decode_data(encoded_design)

    info("Measuring design of size " + str(len(design[0])))

    design_names = [str(n) for n in self.base.names(design)
                    if n not in ["cost_mean", "predicted_mean",
                                 "predicted_sd", "predicted_mean_2s"]]
    initial_factors = self.params["axis_names"]
    measurements = []

    info("Current Design Names: " + str(design_names))
    info("Complete decoded design:")
    info(str(design))
    info("Complete original design:")
    info(str(encoded_design))

    for line in range(1, len(design[0]) + 1):
        if type(design.rx(line, True)[0]) is int:
            design_line = [v for v in design.rx(line, True)]
        else:
            design_line = [int(round(float(v[0]))) for v in design.rx(line, True)]

        candidate = [0] * len(initial_factors)

        for i in range(len(design_names)):
            # if should_redecode:
            #     candidate[initial_factors.index(design_names[i])] = self.parameter_values[design_names[i]].index(design_line[i])
            # else:
            candidate[initial_factors.index(design_names[i])] = design_line[i]

        info("Evaluating candidate:")
        info(str(candidate))

        measurement = self.getPerfCosts([candidate])
        if measurement != {}:
            measurements.append(float(numpy.mean(measurement[str(candidate)][0])))
        else:
            measurements.append(robjects.NA_Real)

    encoded_design = encoded_design.rx(True, IntVector(tuple(range(1, len(initial_factors) + 1))))

    info("Encoded design")
    info(str(encoded_design))
    info("Dims design")
    info(str(self.base.dim(encoded_design)))
    info("Measurements")
    info(str(measurements))
    info("FloatVector Measurements")
    info(str(FloatVector(measurements)))
    info("Attempting DF")
    info(str(self.base.dim(DataFrame({"cost_mean": FloatVector(measurements)}))))
    info("Dims DF")
    info(str(self.base.dim(DataFrame({"cost_mean": FloatVector(measurements)}))))

    encoded_design = self.dplyr.bind_cols(encoded_design,
                                          DataFrame({"cost_mean": FloatVector(measurements)}))

    info("Complete design, with measurements:")
    info(str(self.base.summary_default(encoded_design)))

    encoded_design = encoded_design.rx(self.stats.complete_cases(encoded_design), True)
    encoded_design = encoded_design.rx(self.base.is_finite(self.base.rowSums(encoded_design)), True)

    info("Clean encoded design, with measurements:")
    info(str(self.base.summary_default(encoded_design)))

    self.utils.write_csv(encoded_design, "design_step_{0}.csv".format(step_number))

    if self.complete_design_data is None:
        self.complete_design_data = encoded_design
    else:
        info(str(self.complete_design_data))
        info(str(encoded_design))
        self.complete_design_data = self.base.rbind(self.complete_design_data, encoded_design)

    return encoded_design
else: info("Using pre-generated space for this size") search_space_database = dataset.connect( "sqlite:///search_space_{0}.db".format(self.seed_space_size)) for experiment in search_space_database['experiments']: search_space.append(eval(experiment["value"])) info("Starting DOPT-anova") r_search_space = {} for i in range(len(search_space[0])): r_row = [self.dim_uplimits[i] - 1, 0] for col in search_space: r_row.append(col[i]) r_search_space[initial_factors[i]] = IntVector(r_row) data = DataFrame(r_search_space) data = data.rx(StrVector(initial_factors)) self.dopt_anova(initial_factors, initial_inverse_factors, data) sys.exit() perf_cost, mean_perf_cost = self.MAXFLOAT, self.MAXFLOAT params = self.coordToPerfParams(coord) end_time = time.time() search_time = start_time - end_time speedup = float(eval_cost[0]) / float(best_perf_cost) search_time = time.time() - start_time
def run_zinb(self, data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap,
             conditions, covariates, interactions):
    """
        Runs Zinb for each gene across conditions and returns p and q values
        ([[Wigdata]], [Gene], [Number], [Number], {Rv: [SiteIndex]}, [Condition], [Covar], [Interaction]) -> Tuple([Number], [Number], [Status])
        Wigdata :: [Number]
        Gene :: {start, end, rv, gene, strand}
        SiteIndex: Integer
        Condition :: String
        Covar :: String
        Interaction :: String
        Status :: String
    """
    count = 0
    self.progress_range(len(genes))
    pvals, Rvs, status = [], [], []
    r_zinb_signif = self.def_r_zinb_signif()
    if (self.winz):
        self.transit_message("Winsorizing and running analysis...")

    self.transit_message("Condition: %s" % self.condition)

    comp1a = "1+cond"
    comp1b = "1+cond"

    # include cond in mod0 only if testing interactions
    comp0a = "1" if len(self.interactions) == 0 else "1+cond"
    comp0b = "1" if len(self.interactions) == 0 else "1+cond"
    for I in self.interactions:
        comp1a += "*" + I
        comp1b += "*" + I
        comp0a += "+" + I
        comp0b += "+" + I
    for C in self.covars:
        comp1a += "+" + C
        comp1b += "+" + C
        comp0a += "+" + C
        comp0b += "+" + C

    zinbMod1 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp1a, comp1b)
    zinbMod0 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (comp0a, comp0b)

    nbMod1 = "cnt~%s" % (comp1a)
    nbMod0 = "cnt~%s" % (comp0a)
    toRFloatOrStrVec = lambda xs: FloatVector([float(x) for x in xs]) if self.is_number(xs[0]) else StrVector(xs)

    for gene in genes:
        count += 1
        Rv = gene["rv"]
        ## Single gene case for debugging
        if (GENE):
            Rv = None
            if GENE in RvSiteindexesMap:
                Rv = GENE
            else:
                for g in genes:
                    if (g['gene'] == GENE):
                        Rv = g["rv"]
                        break
            if not Rv:
                self.transit_error("Cannot find gene: {0}".format(GENE))
                sys.exit(0)

        if (DEBUG):
            self.transit_message("======================================================================")
            self.transit_message(gene["rv"] + " " + gene["gene"])

        if (len(RvSiteindexesMap[Rv]) <= 1):
            status.append("TA sites <= 1, not analyzed")
            pvals.append(1)
        else:
            # For winsorization
            # norm_data = self.winsorize((map(
            #     lambda wigData: wigData[RvSiteindexesMap[Rv]], data))) if self.winz else list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            norm_data = list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
            ([readCounts,
              condition,
              covarsData,
              interactionsData,
              NZmean,
              logitZPerc]) = self.melt_data(norm_data, conditions, covariates,
                                            interactions, NZMeanByRep, LogZPercByRep)
            if (numpy.sum(readCounts) == 0):
                status.append("pan-essential (no counts in all conditions) - not analyzed")
                pvals.append(1)
            else:
                df_args = {
                    'cnt': IntVector(readCounts),
                    'cond': toRFloatOrStrVec(condition),
                    'NZmean': FloatVector(NZmean),
                    'logitZperc': FloatVector(logitZPerc)
                }
                ## Add columns for covariates and interactions if they exist.
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(covarsData[t_ic[0]])),
                    enumerate(self.covars))))
                df_args.update(list(map(
                    lambda t_ic: (t_ic[1], toRFloatOrStrVec(interactionsData[t_ic[0]])),
                    enumerate(self.interactions))))

                melted = DataFrame(df_args)
                # r_args = [IntVector(readCounts), StrVector(condition), melted, map(lambda x: StrVector(x), covars), FloatVector(NZmean), FloatVector(logitZPerc)] + [True]
                debugFlag = True if DEBUG or GENE else False
                pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0, nbMod1, nbMod0, debugFlag)
                status.append(msg)
                pvals.append(float(pval))

        if (DEBUG or GENE):
            self.transit_message("Pval for Gene {0}: {1}, status: {2}".format(Rv, pvals[-1], status[-1]))
        if (GENE):
            self.transit_message("Ran for single gene. Exiting...")
            sys.exit(0)
        Rvs.append(Rv)

        # Update progress
        text = "Running ZINB Method... %5.1f%%" % (100.0 * count / len(genes))
        self.progress_update(text, count)

    pvals = numpy.array(pvals)
    mask = numpy.isfinite(pvals)
    qvals = numpy.full(pvals.shape, numpy.nan)
    qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals)[1]  # BH, alpha=0.05

    p, q, statusMap = {}, {}, {}
    for i, rv in enumerate(Rvs):
        p[rv], q[rv], statusMap[rv] = pvals[i], qvals[i], status[i]
    return (p, q, statusMap)