Esempio n. 1
0
def computeMNaseNucOneMusPhis(bamFile, nucleosomeFile, fragRange):
    """
    Fit a negative binomial to pooled MNase-seq midpoint counts around
    the positions listed in a nucleosome file.

    For each listed position a window from 73 bp before to 74 bp after
    it is scanned. Fragments with a positive template length inside
    ``fragRange`` whose midpoint lies in the window are counted per base.
    The per-base counts of all windows are pooled and fitted with R's
    ``fitdistrplus::fitdist`` (``'nbinom'``, maximum likelihood).

    Args:
        bamFile: Path handed to ``pysam.AlignmentFile``.
        nucleosomeFile: Whitespace-delimited text file; field 0 is the
            chromosome name and field 1 an integer position (presumably
            the nucleosome centre -- confirm with the file producer).
            Lines starting with ``#``, blank lines, a header line whose
            first field is ``chr``, and all ``chrXII`` entries are skipped.
        fragRange: Indexable ``(min, max)`` of inclusive fragment lengths.

    Returns:
        tuple: ``(mu, size)`` estimates of the fitted negative binomial.
    """
    empiricalCounts = []
    samfile = pysam.AlignmentFile(bamFile)
    try:
        with open(nucleosomeFile) as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                fields = line.split()
                # Blank lines and the "chr ..." header carry no position.
                if not fields or fields[0] == "chr":
                    continue
                chrom = fields[0]
                # ignore chrXII
                if chrom == "chrXII":
                    continue
                start = int(fields[1]) - 73
                stop = int(fields[1]) + 74
                counts = [0] * (stop - start + 1)
                # Fetch with a 200 bp margin so fragments that start outside
                # the window but have their midpoint inside it are seen.
                for r in samfile.fetch(chrom, max(0, start - 200), stop + 200):
                    width = r.template_length
                    # Only the mate with a positive template length is used,
                    # so each fragment is counted once.
                    if width <= 0:
                        continue
                    if width < fragRange[0] or width > fragRange[1]:
                        continue
                    rStart = r.reference_start + 1
                    rEnd = r.reference_start + width
                    m = (rStart + rEnd)/2
                    if m < start or m > stop:
                        continue
                    counts[int(m - start)] += 1
                empiricalCounts.extend(counts)
    finally:
        # Release the BAM handle even when parsing or fetching fails.
        samfile.close()
    empiricalCounts = np.array(empiricalCounts)
    fitdist = importr("fitdistrplus")
    params = fitdist.fitdist(vectors.IntVector(empiricalCounts), 'nbinom', method = 'mle')
    params = params.rx2("estimate")
    mus = params.rx2("mu")[0]
    phis = params.rx2("size")[0]
    return mus, phis
def web_logo_creator(sequence_list, sequence_name, output):
    """
    Draw a sequence logo in R (ggseqlogo) and save it as a PNG file.

    The image is written to ``output + sequence_name + "_weblogo.png"``.

    :param sequence_list: (tuple of strings) sequences to plot; the length
        of the first one is passed to R as the number of logo positions
    :param sequence_name: (string) name of the sequence, used as the plot
        title and as part of the output file name
    :param output: (string) folder prefix prepended to ``sequence_name``
        to build the output path
    """
    # Installs a warnings filter for RRuntimeWarning before calling into R.
    warnings.filterwarnings("ignore", category=RRuntimeWarning)
    r_source = """
    library("ggplot2")
    library("ggseqlogo")

    function(mys_seq, name_file, mytitle, size){
        s1 = 15
        cs1 = make_col_scheme(chars=c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M'), groups=c('g1','g2','g3','g4','g5', 'g6', 'g7', 'g8', 'g9', 'g10'),cols=c('limegreen','brown1','gold','dodgerblue3','darkorange', "brown1", "limegreen", "dodgerblue3", "darkorchid3", "dodgerblue3"), name='custom1')

        p1 = ggseqlogo(mys_seq,  method = "bit", col_scheme=cs1, namespace = c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M')) + theme_logo() + scale_x_discrete(limits = as.character(seq(1,size, by=1)), labels = as.character(seq(1,size, by=2)), breaks = as.character(seq(1, size, by=2))) + theme(axis.title.y=element_text(size=s1+25), legend.position="none")
        p1 = p1 + ggtitle(mytitle) +  theme(plot.title = element_text(hjust = 0.5))


        p1 = p1 + theme(axis.text=element_text(size=s1 + 25), plot.title = element_text(size=s1 + 30))
        p1 = p1 + scale_y_discrete(limits = c(0, 0.5, 1), labels = as.character(seq(0,1, length=3)), breaks = as.character(seq(0,1, length=3)), expand = c(0,0.05))
        #p1 = p1 + ylim(0,1)
        png(file=paste(name_file,"_weblogo.png", sep=""),height=149 * 2,width=52 * size * 2 )
        print(p1)
        dev.off()
    }
    """
    # Evaluating the source yields the anonymous R function as a callable.
    draw_logo = robj.r(r_source)

    sequences = v.StrVector(sequence_list)
    file_prefix = v.StrVector([output + sequence_name])
    title = v.StrVector([sequence_name])
    n_positions = v.IntVector([len(sequence_list[0])])
    draw_logo(sequences, file_prefix, title, n_positions)
Esempio n. 3
0
 def test_spread(self):
     """Spreading 'labels' over 'x' yields one column per label value."""
     expected_names = ('a', 'b', 'c', 'd', 'e')
     frame = tidyr.DataFrame({
         'x': vectors.IntVector((1, 2, 3, 4, 5)),
         'labels': vectors.StrVector(expected_names),
     })
     wide = frame.spread('labels', 'x')
     # Column order is not part of the contract, so compare sorted names.
     assert sorted(expected_names) == sorted(wide.colnames)
Esempio n. 4
0
 def test_dataframe(self):
     """A tidyr.DataFrame built from a dict keeps its type and column names."""
     columns = {
         'x': vectors.IntVector((1, 2, 3, 4, 5)),
         'labels': vectors.StrVector(('a', 'b', 'b', 'b', 'a')),
     }
     frame = tidyr.DataFrame(columns)
     assert isinstance(frame, tidyr.DataFrame)
     # Compare sorted names so the check does not depend on column order.
     assert sorted(columns) == sorted(frame.colnames)
Esempio n. 5
0
def _check_auto_predictability(x, embedding_dim, tau, predsteplist=None):
    """
    Check data validity for MCCM via auto-predictability

    This function evaluates whether or not the data used is valid to be tested
    by MCCM. The primary concerns are non-linearity and local periodicity.
    Issues warnings if these features are detected, but also returns the
    validity test results for analysis offline.

    Args:
        x (rvec.FloatVector): Vector to be tested
        embedding_dim (int): Embedding dimension
        tau (float): Time-delay for attractor reconstruction
        predsteplist (list): List of temporal distances for evaluating
            prediction. Defaults to the steps 1 through 10.

    Returns:
        (dict) Results of validity testing: ``"rho predicted"`` holds the
        full ``predatout`` array and ``"rho test"`` the ``rho_pre_slope``
        array, both as returned by ``SSR_check_signal``.
    """
    # Set default prediction step list if not provided
    if predsteplist is None:
        predsteplist = list(range(1, 11))

    # Auto-predictability test from multispatialCCM.R
    signal_out_r = mccm.SSR_check_signal(A=x, E=embedding_dim, tau=tau, predsteplist=rvec.IntVector(predsteplist))

    # Prediction strength (rho) over increasing temporal distance (predstep)
    rho_pred = np.array(signal_out_r.rx2("predatout"))

    # Slope and p-value (linear regression) of rho over temporal distance
    rho_slope = np.array(signal_out_r.rx2("rho_pre_slope"))

    # Row 1 is the rho series (the periodicity check below reads its first
    # and last entries). Row 0 presumably holds the prediction steps --
    # confirm against SSR_check_signal. Thresholds use the rho row only, so
    # the other row cannot mask a low or dipping rho.
    rho_values = rho_pred[1]
    rho_max = np.max(rho_values)
    rho_min = np.min(rho_values)

    if rho_slope[0] >= 0:
        # If prediction strength (rho) remains the same or increases with temporal distance,
        # a basic tenet of non-linearity has been violated
        warnings.warn("Prediction increases with historical distance. Data may not be non-linear...")

    if rho_max < 0.2:
        # If the highest prediction strength is fairly low, there may be too much noise
        # in the data for MCCM to provide a valid result
        warnings.warn(
            "Corrlation coefficient for short time steps (predictive validity) is below 0.2. "
            "Excessive stochasitic noise may be present...")

    if rho_min < (rho_values[0] - 0.2) and rho_min < (rho_values[-1] - 0.2):
        # If prediction strength (rho) dips and then rises again, the data is likely locally periodic.
        # This is highly problematic for accurate MCCM
        warnings.warn("Possible periodicity detected...")

    # Return results of validity testing
    signal_out = {"rho predicted": rho_pred,
                  "rho test": rho_slope}

    return signal_out
Esempio n. 6
0
def computeMNaseTFPhisMus(bamFile, csvFile, fragRange, filename, offset = 0):
    """
    Negative binomial distribution for short fragments at TF
    binding sites.

    Per-base fragment-midpoint counts are collected over every ABF1 and
    REB1 site listed in ``csvFile``, saved with ``np.save`` and fitted with
    R's ``fitdistrplus::fitdist`` (``'nbinom'``, maximum likelihood).

    Args:
        bamFile: Path handed to ``pysam.AlignmentFile`` (opened ``"rb"``).
        csvFile: Whitespace-delimited site file; fields 0-3 are chromosome,
            start, end and TF name. Blank lines are skipped.
        fragRange: Indexable ``(min, max)`` of inclusive fragment widths,
            compared after ``2 * offset`` is subtracted.
        filename: Unused by this function; kept for caller compatibility.
        offset: Number of bases trimmed from each fragment end before the
            width and midpoint are computed.

    Returns:
        dict: ``{'mu': ..., 'phi': ...}``. If R fails with an error whose
        message starts with ``"Error in (func"``, the hard-coded fallback
        ``{'mu': 0.002, 'phi': 100}`` is returned.

    Raises:
        Exception: Any other failure of the fit is re-raised unchanged.
    """
    TFs = ("ABF1", "REB1")
    tfCounts = []
    samfile = pysam.AlignmentFile(bamFile, "rb")
    try:
        with open(csvFile) as infile:
            for line in infile:
                l = line.split()
                if not l:
                    continue
                if l[3] not in TFs:
                    continue
                minStart = int(l[1])
                maxEnd = int(l[2])
                chrm = l[0]
                countMid = [0] * (maxEnd - minStart + 1)
                # Clamp at 0: sites near a chromosome start would otherwise
                # request a negative coordinate.
                fetchStart = max(0, minStart - fragRange[1] - 1)
                for read in samfile.fetch(chrm, fetchStart, maxEnd + fragRange[1] - 1):
                    width = read.template_length - 2*offset
                    # Negative widths are skipped, so each fragment is
                    # counted from one mate only.
                    if width < 0:
                        continue
                    if width < fragRange[0] or width > fragRange[1]:
                        continue
                    start = read.reference_start + 1 + offset
                    end = read.reference_start + offset + width
                    mid = (start + end)/2
                    if mid < minStart or mid > maxEnd:
                        continue
                    countMid[int(mid - minStart)] += 1
                tfCounts.extend(countMid)
    finally:
        # Release the BAM handle even when parsing or fetching fails.
        samfile.close()
    # NOTE(review): hard-coded, machine-specific output path; `filename` may
    # have been meant for this -- confirm before changing.
    np.save("/usr/xtmp/sneha/tmpDir/mnaseCountstf", tfCounts)
    try:
        fitdist = importr('fitdistrplus')
        p = fitdist.fitdist(vectors.IntVector(tfCounts), 'nbinom', method = "mle")
        p = p.rx2("estimate")
        size = p.rx2("size")[0]
        mu = p.rx2("mu")[0]
        params = {'mu': mu, 'phi': size}
    except Exception as e:
        message = e.args[0] if e.args else ""
        # Only the known R fitting failure falls back to defaults; anything
        # else is re-raised so the real cause is not hidden.
        if not (isinstance(message, str) and message.startswith("Error in (func")):
            raise
        # hard code values
        params = {'mu': 0.002, 'phi': 100}
    return params
def pval_getter(val, cell, reg):
    """
    Compare two logistic regressions in R and return the test p-value.

    The R code fits ``val ~ cell`` and ``val ~ reg + cell`` as binomial
    (logit) GLMs with every column converted to a factor. It prints both
    model summaries and the chi-square ANOVA table comparing them, and
    returns the second ``Pr(>Chi)`` entry of that table.

    :param val: values passed to R as an integer vector (the response)
    :param cell: values passed to R as a string vector
    :param reg: values passed to R as a string vector
    :return: the R result of ``as.numeric(a$"Pr(>Chi)"[2])``
    """
    r_source = """


    function(val, cell, reg){

    data <- as.data.frame(cbind(val, cell, reg))
    data$val <- as.factor(data$val)
    data$reg <- as.factor(data$reg)
    data$cell <- as.factor(data$cell)
    md0 <-glm(val ~ cell, family=binomial("logit"),data=data)
    md1 <-glm(val ~ reg+cell, family=binomial("logit"),data=data)
    print(summary(md0))
    print(summary(md1))
    a <- anova(md1, md0, test="Chisq")
    print(a)
    return(as.numeric(a$"Pr(>Chi)"[2]))
    }
    """
    # Evaluating the source yields the anonymous R function as a callable.
    compare_models = rpy2.robjects.r(r_source)

    response = v.IntVector(val)
    cell_labels = v.StrVector(cell)
    reg_labels = v.StrVector(reg)
    return compare_models(response, cell_labels, reg_labels)
Esempio n. 8
0
def computeMNaseBackground(bamFile, segments, fragRange, offset = 0):
    """
    Compute MNase-seq midpoint of background distribution.

    Per-base fragment-midpoint counts are collected over every segment,
    pooled, and fitted with R's ``fitdistrplus::fitdist`` (``'nbinom'``,
    maximum likelihood).

    Args:
        bamFile: Path handed to ``pysam.AlignmentFile`` (opened ``"rb"``).
        segments: Iterable of mappings with keys ``'chrm'``, ``'start'``
            and ``'stop'``; midpoints are counted on ``[start, stop)``.
        fragRange: Indexable ``(min, max)`` of inclusive fragment widths,
            compared after ``2 * offset`` is subtracted.
        offset: Number of bases trimmed from each fragment end before the
            width and midpoint are computed.

    Returns:
        dict: ``{'mu': ..., 'phi': ...}``. If the R fit raises
        ``RRuntimeError``, the hard-coded fallback
        ``{'mu': 0.002, 'phi': 100}`` is returned.
    """
    counts = []
    samfile = pysam.AlignmentFile(bamFile, "rb")
    try:
        for s in segments:
            segStart = s['start']
            segStop = s['stop']
            maxEnd = segStop + fragRange[1]
            count = [0] * (segStop - segStart)
            # Clamp at 0: segments near a chromosome start would otherwise
            # request a negative coordinate.
            region = samfile.fetch(s['chrm'], max(0, segStart - 200), maxEnd + 200)
            for read in region:
                width = read.template_length - 2*offset
                # Non-positive widths are skipped, so each fragment is
                # counted from one mate only.
                if width <= 0:
                    continue
                if width < fragRange[0] or width > fragRange[1]:
                    continue
                start = read.reference_start + offset
                end = read.reference_start + offset + read.template_length - 1 - 2*offset
                mid = (start + end)/2
                if mid < segStart or mid >= segStop:
                    continue
                count[int(mid - segStart)] += 1
            counts.extend(count)
    finally:
        # Release the BAM handle even when fetching fails.
        samfile.close()
    counts = np.array(counts)
    counts = counts.astype(int)
    try:
        fitdist = importr('fitdistrplus')
        params = fitdist.fitdist(vectors.IntVector(counts), 'nbinom', method = "mle")
        params = params.rx2("estimate")
        size = params.rx2("size")[0]
        mu = params.rx2("mu")[0]
        params = {'mu': mu, 'phi': size}
    except RRuntimeError:
        # The fit failed in R; fall back to hard-coded parameters.
        mu = 0.002
        phi = 100
        params = {'mu': mu, 'phi': phi}
    return params
Esempio n. 9
0
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

# Handle to R's "base" package, imported once at module load; its `c`
# function is used as one of the parametrized inputs of test_html_func.
base = importr('base')


@pytest.mark.parametrize(
    'o,func', [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
               (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
               # The original lists read ['a', 'b' 'c']: the missing comma
               # silently concatenated the last two items into 'bc'.
               (vectors.StrVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.FactorVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.ListVector({
                   'a': 1,
                   'b': 2
               }), html.html_rlist),
               (vectors.DataFrame({
                   'a': 1,
                   'b': 'z'
               }), html.html_rdataframe),
               ('x <- c(1, 2, 3)', html.html_sourcecode),
               (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Each HTML helper must render its matching input to a ``str``."""
    res = func(o)
    assert isinstance(res, str)
# Define x in the R global environment, then read it back.
# print(...) with a single argument behaves the same on Python 2 and 3.
robj.r('x = 3')
print(robj.r('x'))

# <codecell>

# Interact with R using attributes
print(robj.r.x)
print("why are the two subsequent calls to ls producing different results?")
# The first line prints the result of the attribute lookup alone; the second
# additionally calls it and prints what the call returns.
print(robj.r.ls)
print(robj.r.ls())

# <codecell>

# create a python vector and compute R stats on it
x = rv.IntVector(range(0,11))
print("why are the two subsequent calls to mean producing different results?")
# robj.r('mean(x)') evaluates R source, so `x` there is the R variable set by
# 'x = 3' above; robj.r.mean(x) is passed the Python vector built here.
print(robj.r('mean(x)'))
print(robj.r.mean(x))

print("Here are some other stats")
print("Sum")
print(robj.r.sum(x))
print("Variance")
print(robj.r.var(x))

# <headingcell level=3>

# Part 3: Create and interact with multi-dimensional R objects

# <codecell>