Beispiel #1
0
def getBayesModel(G, p, mixPrior=None):
    """
    Build a randomly initialised PWM CSI BayesMixtureModel.

    Each of the G components is a product of p discrete distributions over
    the DNA alphabet. Position parameters and mixture weights are drawn with
    mixture.random_vector.

    @param G: number of components
    @param p: number of positions of the binding site
    @param mixPrior: optional prior for the model; when omitted (or falsy) a
      default MixtureModelPrior is constructed
    @return: BayesMixtureModel object
    """
    if not mixPrior:
        weight_prior = mixture.DirichletPrior(G, [1.0] * G)

        # One Dirichlet prior per binding site position.
        position_priors = [
            mixture.DirichletPrior(4, [1.02, 1.02, 1.02, 1.02])
            for _pos in range(p)
        ]

        # The struct and comp parameters (0.05, 0.05) are arbitrary. Users
        # are expected to reset them via the structPriorHeuristic method.
        mixPrior = mixture.MixtureModelPrior(
            0.05, 0.05, weight_prior, position_priors)

    nucleotides = mixture.Alphabet(['A', 'C', 'G', 'T'])

    components = []
    for _comp in range(G):
        columns = [
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), nucleotides)
            for _pos in range(p)
        ]
        components.append(mixture.ProductDistribution(columns))

    weights = mixture.random_vector(G)
    return mixture.BayesMixtureModel(
        G, weights, components, mixPrior, struct=1)
Beispiel #2
0
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format (http://www.soe.ucsc.edu/compbio/dirichlets/)
    and converts them into PyMix DirichletMixturePrior objects.

    Note that the alphabet in the UCSC priors does not contain the gap symbol. For the output DirichletMixturePrior
    the gap symbol is introduced with a parameter value of 0.01 in all components.

    @param filename: file in UCSC DMP format

    @return: tuple (alphabet, prior) of the Alphabet built from the file's
      'Order' line plus the gap symbol, and the DirichletMixturePrior object

    @raise ValueError: if the file contains no 'Order' line
    """
    # Raw strings so that \s and \d reach the regex engine untouched.
    mixture_re = re.compile(r'Mixture=\s(\d+.\d+)')
    order_re = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    alpha_re = re.compile(r'Alpha=\s+([\d+.\d+\s+,\d+e-]+)')

    pi = []
    sigma = None
    alpha_mat = []

    # The context manager guarantees the file is closed, even on errors.
    with open(filename, 'r') as f:
        for line in f:
            line = mixture.chomp(line)

            m1 = mixture_re.match(line)
            if m1:
                pi.append(float(m1.group(1)))

            m2 = order_re.match(line)
            if m2:
                # split() without argument tolerates repeated/trailing blanks
                sigma = m2.group(1).split()

            m3 = alpha_re.match(line)
            if m3:
                # Build a real list (not a lazy map object) so it can be
                # sliced and extended later on.
                alpha = [float(tok) for tok in m3.group(1).split()]
                # first entry is the sum of the others -> remove
                alpha_mat.append(alpha[1:])

    if sigma is None:
        raise ValueError(
            "no 'Order' line found in UCSC prior file %r" % (filename,))

    # integrate gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)

    # NOTE(review): the dimension 21 is hard-coded; presumably 20 amino
    # acids plus the gap symbol -- confirm against the input files.
    dComp = []
    for alpha in alpha_mat:
        alpha.append(0.01)  # add hyper parameter for '-'
        dComp.append(mixture.DirichletPrior(21, alpha))

    prior = mixture.DirichletMixturePrior(len(dComp), 21, pi, dComp)
    return alphabet, prior
Beispiel #3
0
def getModel(G, p):
    """
    Build a randomly initialised PWM MixtureModel.

    Each of the G components is a product of p discrete distributions over
    the DNA alphabet; all parameters come from mixture.random_vector.

    @param G: number of components
    @param p: number of positions of the binding site
    @return: MixtureModel object
    """
    alphabet = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # One ProductDistribution per component, one DiscreteDistribution per
    # binding site position inside it.
    components = [
        mixture.ProductDistribution([
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), alphabet)
            for _pos in range(p)
        ])
        for _comp in range(G)
    ]

    weights = mixture.random_vector(G)
    return mixture.MixtureModel(G, weights, components)
Beispiel #4
0
def scanSequence(mix, bg, seq, scoring='mix'):
    """
    Scores all positions of a sequence with the given model and background.

    A window of width mix.components[0].dist_nr is slid over the sequence one
    position at a time; for each window the score is the model value minus
    the background value as returned by the respective pdf() methods.

    @param mix: MixtureModel object
    @param bg: background MixtureModel object
    @param seq: sequence as list of nucleotides
    @param scoring: flag to determine the scoring scheme used for the mixtures.
      'compmax' means maximum density over the components, 'mix' means true mixture density

    @return: list of position-wise log-odd scores
    @raise ValueError: if scoring is neither 'mix' nor 'compmax'
    """
    if scoring not in ('mix', 'compmax'):
        raise ValueError(
            "unknown scoring scheme %r, expected 'mix' or 'compmax'"
            % (scoring,))

    # convert sequence to internal representation, alphabet of seq must be DNA
    alph = mixture.Alphabet(['A', 'C', 'G', 'T'])
    # Build a real list: the result is sliced and indexed below.
    seq = [alph.internal(x) for x in seq]

    dnr = mix.components[0].dist_nr

    # init with dummy value at first position; the dummy is shifted out
    # by the first loop iteration
    s = np.array([[-1] + seq[0:dnr - 1]])

    score = []
    for pos in range(dnr - 1, len(seq)):
        # shift query sequence by one position
        s[0] = np.concatenate([s[0][1:], np.array([seq[pos]])], 0)

        if scoring == 'compmax':
            # score as maximum over components
            c_m_l = np.zeros(mix.G, dtype=np.float64)
            for k in range(mix.G):
                c_m_l[k] = mix.components[k].pdf(s)[0]
            m_l = c_m_l.max()
        else:  # scoring == 'mix'
            m_l = mix.pdf(s)[0]

        bg_l = bg.pdf(s)[0]

        score.append(m_l - bg_l)

    return score
Beispiel #5
0
def getBackgroundModel(p, dist=None):
    """
    Construct background model

    The model is a single-component MixtureModel whose component is a
    product of p discrete distributions that all use the same nucleotide
    frequencies.

    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default

    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # Identity test: "dist == None" would compare elementwise (and make the
    # condition ambiguous) when dist is a numpy array.
    phi = [0.25] * 4 if dist is None else dist

    dlist = [mixture.DiscreteDistribution(4, phi, DNA) for _pos in range(p)]
    comps = [mixture.ProductDistribution(dlist)]

    return mixture.MixtureModel(1, [1.0], comps)
Beispiel #6
0
from pymix import mixture
import random
from numpy import numarray

# Alphabet of VNTR symbols; presumably DRD4 repeat-count genotypes matching
# drd4_len.txt, with '.' as the missing-value symbol -- TODO confirm.
VNTR = mixture.Alphabet([
    '.', '2/4', '2/7', '3/4', '3/7', '4/4', '4/6', '4/7', '4/8', '4/9', '7/7'
])
# Alphabet of diagnosis codes; meaning of the individual symbols is not
# visible here -- TODO confirm.
DIAG = mixture.Alphabet(['.', '0', '8', '1'])

# Data set that is filled from the input files below.
data = mixture.DataSet()

# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"])

# NOTE(review): COMOR is not used in the visible part of the script;
# presumably the number of comorbidity features -- confirm.
COMOR = 11
# Number of mixture components built by the loop below.
G = 8
components = []
for i in range(G):

    # intelligence and achievement tests as univariate normal distributions. (TEST)
    bd_mu = float(random.randint(3, 16))
    bd_sigma = random.uniform(1.0, 8.0)
    missing_bd = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma)
    mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd],
                                  compFix=[0, 2])

    voc_mu = float(random.randint(3, 16))
    voc_sigma = random.uniform(1.0, 8.0)
    missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)