def computeMNaseNucOneMusPhis(bamFile, nucleosomeFile, fragRange):
    """Fit one negative binomial to fragment-midpoint counts around nucleosomes.

    For every position listed in ``nucleosomeFile`` a window running from
    73 bases before to 74 bases after the position is scanned. The midpoint
    of each fragment in ``bamFile`` whose length lies inside ``fragRange``
    is counted per base of that window. The per-base counts of all windows
    are pooled and handed to R's ``fitdistrplus::fitdist`` with the
    ``nbinom`` family and maximum-likelihood estimation.

    Args:
        bamFile: Path of the alignment file, opened with ``pysam``.
        nucleosomeFile: Whitespace-separated text file. Column 0 is the
            chromosome name and column 1 an integer position (presumably
            the nucleosome centre -- confirm against the file producer).
            Lines starting with ``#``, blank lines, a header row whose
            first field is ``chr`` and every ``chrXII`` row are skipped.
        fragRange: Two-element sequence ``(min, max)`` with the inclusive
            bounds on the accepted fragment length.

    Returns:
        Tuple ``(mu, size)`` holding the first element of the fitted
        ``mu`` and ``size`` estimates.
    """
    empiricalCounts = []
    # Both handles are context-managed so they are closed even when parsing
    # or fetching raises part-way through the file.
    with pysam.AlignmentFile(bamFile) as samfile, open(nucleosomeFile) as infile:
        for line in infile:
            if line[0] == "#":
                continue
            fields = line.strip().split()
            # Blank lines yield no fields; skip them instead of crashing.
            if not fields:
                continue
            chrom = fields[0]
            # "chr" marks the header row; chrXII is deliberately ignored.
            if chrom == "chr" or chrom == "chrXII":
                continue
            start = int(fields[1]) - 73
            stop = int(fields[1]) + 74
            counts = [0] * (stop - start + 1)
            # Fetch a padded region (clamped at 0) so fragments starting
            # outside the window but centred inside it are still seen.
            for read in samfile.fetch(chrom, max(0, start - 200), stop + 200):
                tlen = read.template_length
                # Only positive template lengths are used -- presumably so
                # that each pair is counted once.
                if tlen <= 0:
                    continue
                if tlen < fragRange[0] or tlen > fragRange[1]:
                    continue
                rStart = read.reference_start + 1
                rEnd = read.reference_start + tlen
                midpoint = (rStart + rEnd)/2
                if midpoint < start or midpoint > stop:
                    continue
                counts[int(midpoint - start)] += 1
            empiricalCounts.extend(counts)
    empiricalCounts = np.array(empiricalCounts)
    fitdist = importr("fitdistrplus")
    fit = fitdist.fitdist(vectors.IntVector(empiricalCounts), 'nbinom', method = 'mle')
    estimate = fit.rx2("estimate")
    mus = estimate.rx2("mu")[0]
    phis = estimate.rx2("size")[0]
    return mus, phis
def web_logo_creator(sequence_list, sequence_name, output):
    """
    Draw a sequence logo for ``sequence_list`` with R's ggseqlogo.

    The figure is written by R to ``<output><sequence_name>_weblogo.png``;
    ``output`` and ``sequence_name`` are concatenated as given. The logo
    width is taken from the length of the first sequence.

    :param sequence_list: (tuple of strings) - list of sequences
    :param sequence_name: (string) name of the sequence, also used as title
    :param output: (string) the folder where the results will be created
    """
    # R is chatty while loading ggplot2/ggseqlogo; silence its warnings.
    warnings.filterwarnings("ignore", category=RRuntimeWarning)
    r_source = """
    library("ggplot2")
    library("ggseqlogo")

    function(mys_seq, name_file, mytitle, size){
        s1 = 15
        cs1 = make_col_scheme(chars=c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M'),
                              groups=c('g1','g2','g3','g4','g5', 'g6', 'g7', 'g8', 'g9', 'g10'),cols=c('limegreen','brown1','gold','dodgerblue3','darkorange', "brown1", "limegreen", "dodgerblue3", "darkorchid3", "dodgerblue3"),
                              name='custom1')

        p1 = ggseqlogo(mys_seq, method = "bit", col_scheme=cs1, namespace = c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M')) + theme_logo() + scale_x_discrete(limits = as.character(seq(1,size, by=1)), labels = as.character(seq(1,size, by=2)), breaks = as.character(seq(1, size, by=2))) + theme(axis.title.y=element_text(size=s1+25), legend.position="none")
        p1 = p1 + ggtitle(mytitle) + theme(plot.title = element_text(hjust = 0.5))
        p1 = p1 + theme(axis.text=element_text(size=s1 + 25), plot.title = element_text(size=s1 + 30))
        p1 = p1 + scale_y_discrete(limits = c(0, 0.5, 1), labels = as.character(seq(0,1, length=3)), breaks = as.character(seq(0,1, length=3)), expand = c(0,0.05))
        #p1 = p1 + ylim(0,1)
        png(file=paste(name_file,"_weblogo.png", sep=""),height=149 * 2,width=52 * size * 2 )
        print(p1)
        dev.off()
    }
    """
    draw_logo = robj.r(r_source)

    # Every argument crosses into R as a vector, scalars included.
    r_sequences = v.StrVector(sequence_list)
    r_file_stem = v.StrVector([output + sequence_name])
    r_title = v.StrVector([sequence_name])
    r_width = v.IntVector([len(sequence_list[0])])
    draw_logo(r_sequences, r_file_stem, r_title, r_width)
def test_spread(self):
    """Spreading on the ``labels`` column yields one column per label."""
    labels = ('a', 'b', 'c', 'd', 'e')
    values = vectors.IntVector((1, 2, 3, 4, 5))
    keys = vectors.StrVector(labels)
    long_frame = tidyr.DataFrame({'x': values, 'labels': keys})

    wide_frame = long_frame.spread('labels', 'x')

    # Column order is not part of the contract, so compare sorted names.
    expected_names = sorted(list(labels))
    observed_names = sorted(list(wide_frame.colnames))
    assert expected_names == observed_names
def test_dataframe(self):
    """A ``tidyr.DataFrame`` built from a dict keeps its type and columns."""
    values = vectors.IntVector((1, 2, 3, 4, 5))
    keys = vectors.StrVector(('a', 'b', 'b', 'b', 'a'))
    frame = tidyr.DataFrame({'x': values, 'labels': keys})

    assert isinstance(frame, tidyr.DataFrame)
    # Column order is not part of the contract, so compare sorted names.
    observed_names = sorted(list(frame.colnames))
    assert sorted(['x', 'labels']) == observed_names
def _check_auto_predictability(x, embedding_dim, tau, predsteplist=None):
    """
    Check data validity for MCCM via auto-predictability

    Runs ``mccm.SSR_check_signal`` on ``x`` and inspects how the prediction
    strength (rho) behaves as the prediction distance grows. The primary
    concerns are non-linearity and local periodicity. A warning is issued
    for each feature that is detected, and the raw test results are
    returned for offline analysis.

    Args:
        x (rvec.FloatVector): Vector to be tested
        embedding_dim (int): Embedding dimension
        tau (float): Time-delay for attractor reconstruction
        predsteplist (list): List of temporal distances for evaluating
            prediction. Defaults to ``1..10`` when ``None``.

    Returns:
        (dict) Results of validity testing, with keys ``"rho predicted"``
        (the ``predatout`` result as a numpy array) and ``"rho test"``
        (the ``rho_pre_slope`` result as a numpy array).
    """
    # Set default prediction step list if not provided
    if predsteplist is None:
        predsteplist = list(range(1, 11))

    # Auto-predictability test from multispatialCCM.R
    signal_out_r = mccm.SSR_check_signal(A=x, E=embedding_dim, tau=tau,
                                         predsteplist=rvec.IntVector(predsteplist))

    # Prediction strength (rho) over increasing temporal distance (predstep)
    rho_pred = np.array(signal_out_r.rx2("predatout"))
    # Row 1 carries rho, one value per prediction step. Row 0 is presumably
    # the prediction steps themselves (TODO confirm against the R package),
    # so all reductions below are limited to the rho row. Reducing over the
    # whole array would let the step values hide a low maximum rho.
    rho_values = rho_pred[1]

    # Slope and p-value (linear regression) of rho over temporal distance
    rho_slope = np.array(signal_out_r.rx2("rho_pre_slope"))

    if rho_slope[0] >= 0:
        # If prediction strength (rho) remains the same or increases with temporal distance,
        # a basic tenet of non-linearity has been violated
        warnings.warn("Prediction increases with historical distance. Data may not be non-linear...")

    if np.max(rho_values) < 0.2:
        # If the highest prediction strength is fairly low, there may be too much noise
        # in the data for MCCM to provide a valid result
        warnings.warn(
            "Corrlation coefficient for short time steps (predictive validity) is below 0.2. "
            "Excessive stochasitic noise may be present...")

    lowest_rho = np.min(rho_values)
    if lowest_rho < (rho_values[0] - 0.2) and lowest_rho < (rho_values[-1] - 0.2):
        # If prediction strength (rho) dips and then rises again, the data is likely locally periodic.
        # This is highly problematic for accurate MCCM
        warnings.warn("Possible periodicity detected...")

    # Return results of validity testing
    signal_out = {"rho predicted": rho_pred, "rho test": rho_slope}
    return signal_out
def computeMNaseTFPhisMus(bamFile, csvFile, fragRange, filename, offset = 0):
    """
    Negative binomial distribution for short fragments at TF binding sites.

    For every ABF1 or REB1 row of ``csvFile`` the midpoints of the fragments
    in ``bamFile`` whose (offset-trimmed) length lies inside ``fragRange``
    are counted per base of the site. The pooled counts are saved with
    ``np.save`` and then fitted with R's ``fitdistrplus::fitdist`` using
    the ``nbinom`` family.

    Args:
        bamFile: Path of the BAM file, opened with ``pysam``.
        csvFile: Whitespace-separated text file with chromosome, start,
            end and TF name in columns 0-3. Rows with fewer than four
            fields, or naming any other TF, are skipped.
        fragRange: Two-element sequence ``(min, max)`` with the inclusive
            bounds on the accepted fragment width.
        filename: Unused; kept so existing callers keep working.
        offset: Number of bases trimmed from each fragment end before its
            width and midpoint are computed.

    Returns:
        dict with keys ``'mu'`` and ``'phi'``. When the R fit fails with a
        message starting ``"Error in (func"`` the hard-coded fallback
        ``{'mu': 0.002, 'phi': 100}`` is returned.

    Raises:
        Exception: any other failure of the fit is re-raised unchanged.
    """
    TFs = ("ABF1", "REB1")
    tfCounts = []
    # Context managers close both handles even if parsing or fetching fails.
    with pysam.AlignmentFile(bamFile, "rb") as samfile, open(csvFile) as infile:
        for line in infile:
            fields = line.strip().split()
            # Skip blank/short rows instead of crashing on the TF column.
            if len(fields) < 4 or fields[3] not in TFs:
                continue
            chrm = fields[0]
            minStart = int(fields[1])
            maxEnd = int(fields[2])
            countMid = [0] * (maxEnd - minStart + 1)
            # Pad the fetch by the largest accepted fragment; clamp at 0 so
            # sites near the chromosome start do not request a negative
            # coordinate.
            fetchStart = max(0, minStart - fragRange[1] - 1)
            fetchEnd = maxEnd + fragRange[1] - 1
            for read in samfile.fetch(chrm, fetchStart, fetchEnd):
                width = read.template_length - 2*offset
                if width < 0:
                    continue
                start = read.reference_start + 1 + offset
                end = read.reference_start + offset + width
                midpoint = (start + end)/2
                if fragRange[0] <= width <= fragRange[1] and minStart <= midpoint <= maxEnd:
                    countMid[int(midpoint - minStart)] += 1
            tfCounts.extend(countMid)
    # NOTE(review): hard-coded, user-specific dump location; `filename` may
    # have been intended here -- confirm before changing.
    np.save("/usr/xtmp/sneha/tmpDir/mnaseCountstf", tfCounts)
    try:
        fitdist = importr('fitdistrplus')
        fit = fitdist.fitdist(vectors.IntVector(tfCounts), 'nbinom', method = "mle")
        estimate = fit.rx2("estimate")
        size = estimate.rx2("size")[0]
        mu = estimate.rx2("mu")[0]
        return {'mu': mu, 'phi': size}
    except Exception as e:
        message = e.args[0] if e.args else ""
        # hard code values when R itself reports a failed fit
        if isinstance(message, str) and message.startswith("Error in (func"):
            return {'mu': 0.002, 'phi': 100}
        # Anything else is a genuine error: surface it rather than falling
        # through to an unbound result.
        raise
def pval_getter(val, cell, reg):
    """
    Compare two logistic regressions in R and return the test's p-value.

    R fits ``val ~ cell`` and ``val ~ reg + cell`` as binomial (logit)
    GLMs after turning all three columns into factors, prints both model
    summaries and the chi-square ``anova`` table, and returns the second
    entry of its ``Pr(>Chi)`` column.

    :param val: values converted to an R integer vector (the response)
    :param cell: values converted to an R character vector
    :param reg: values converted to an R character vector
    :return: the R numeric vector produced by the R function
    """
    r_source = """
    function(val, cell, reg){
        data <- as.data.frame(cbind(val, cell, reg))
        data$val <- as.factor(data$val)
        data$reg <- as.factor(data$reg)
        data$cell <- as.factor(data$cell)
        md0 <-glm(val ~ cell, family=binomial("logit"),data=data)
        md1 <-glm(val ~ reg+cell, family=binomial("logit"),data=data)
        print(summary(md0))
        print(summary(md1))
        a <- anova(md1, md0, test="Chisq")
        print(a)
        return(as.numeric(a$"Pr(>Chi)"[2]))
    }
    """
    compare_models = rpy2.robjects.r(r_source)
    r_val = v.IntVector(val)
    r_cell = v.StrVector(cell)
    r_reg = v.StrVector(reg)
    return compare_models(r_val, r_cell, r_reg)
def computeMNaseBackground(bamFile, segments, fragRange, offset = 0):
    """
    Compute MNase-seq midpoint of background distribution.

    For every segment the midpoints of the fragments in ``bamFile`` whose
    (offset-trimmed) width lies inside ``fragRange`` are counted per base.
    The pooled counts are fitted with R's ``fitdistrplus::fitdist`` using
    the ``nbinom`` family.

    Args:
        bamFile: Path of the BAM file, opened with ``pysam``.
        segments: Iterable of mappings with keys ``'chrm'``, ``'start'``
            and ``'stop'``; ``stop`` is exclusive for the counting window.
        fragRange: Two-element sequence ``(min, max)`` with the inclusive
            bounds on the accepted fragment width.
        offset: Number of bases trimmed from each fragment end before its
            width and midpoint are computed.

    Returns:
        dict with keys ``'mu'`` and ``'phi'``. When R raises an
        ``RRuntimeError`` the hard-coded fallback
        ``{'mu': 0.002, 'phi': 100}`` is returned.
    """
    counts = []
    # The context manager closes the BAM handle even if a fetch fails.
    with pysam.AlignmentFile(bamFile, "rb") as samfile:
        for s in segments:
            segStart = s['start']
            segStop = s['stop']
            count = [0] * (segStop - segStart)
            # Pad the fetch on both sides; clamp at 0 so segments near the
            # chromosome start do not request a negative coordinate.
            fetchStart = max(0, segStart - 200)
            fetchEnd = segStop + fragRange[1] + 200
            for read in samfile.fetch(s['chrm'], fetchStart, fetchEnd):
                width = read.template_length - 2*offset
                # Non-positive widths are dropped.
                if width <= 0:
                    continue
                start = read.reference_start + offset
                end = read.reference_start + offset + read.template_length - 1 - 2*offset
                midpoint = (start + end)/2
                if fragRange[0] <= width <= fragRange[1] and segStart <= midpoint < segStop:
                    count[int(midpoint - segStart)] += 1
            counts.extend(count)
    counts = np.array(counts).astype(int)
    try:
        fitdist = importr('fitdistrplus')
        fit = fitdist.fitdist(vectors.IntVector(counts), 'nbinom', method = "mle")
        estimate = fit.rx2("estimate")
        size = estimate.rx2("size")[0]
        mu = estimate.rx2("mu")[0]
        params = {'mu': mu, 'phi': size}
    except RRuntimeError:
        # The fit failed inside R: fall back to hard-coded values.
        params = {'mu': 0.002, 'phi': 100}
    return params
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

base = importr('base')


@pytest.mark.parametrize(
    'o,func',
    [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
     (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
     # Three separate elements: without the comma after 'b' the adjacent
     # literals would be concatenated into the single string 'bc'.
     (vectors.StrVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.FactorVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.ListVector({
         'a': 1,
         'b': 2
     }), html.html_rlist),
     (vectors.DataFrame({
         'a': 1,
         'b': 'z'
     }), html.html_rdataframe),
     ('x <- c(1, 2, 3)', html.html_sourcecode),
     (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Each HTML helper returns a ``str`` for the object kind it is paired with."""
    res = func(o)
    assert isinstance(res, str)
# Assign x inside the R session by evaluating a string of R code.
robj.r('x = 3')
print(robj.r('x'))

# <codecell>

# Interact with R using attributes
print(robj.r.x)
print("why are the two subsequent calls to ls producing different results?")
# The first line prints the attribute `ls` itself (no call is made);
# the second line calls it and prints what the call returns.
print(robj.r.ls)
print(robj.r.ls())

# <codecell>

# create a python vector and compute R stats on it
x = rv.IntVector(range(0,11))
print("why are the two subsequent calls to mean producing different results?")
# The string 'mean(x)' is evaluated by R, which resolves `x` on the R side
# (set to 3 above); the attribute call is handed the Python variable `x`
# (the vector 0..10) as its argument.
print(robj.r('mean(x)'))
print(robj.r.mean(x))
print("Here are some other stats")
print("Sum")
print(robj.r.sum(x))
print("Variance")
print(robj.r.var(x))

# <headingcell level=3>

# Part 3: Create and interact with multi-dimensional R objects

# <codecell>