def getBayesModel(G, p, mixPrior=None):
    """
    Builds a PWM CSI BayesMixtureModel over the DNA alphabet with randomly
    drawn mixture weights and per-position nucleotide distributions.

    @param G: number of components
    @param p: number of positions of the binding site
    @param mixPrior: prior to attach to the model. If omitted (or falsy), a
    default MixtureModelPrior is assembled from a DirichletPrior over the G
    mixture weights and one DirichletPrior per binding-site position.

    @return: BayesMixtureModel object
    """
    if not mixPrior:
        weight_prior = mixture.DirichletPrior(G, [1.0] * G)
        position_priors = [
            mixture.DirichletPrior(4, [1.02, 1.02, 1.02, 1.02])
            for _pos in range(p)
        ]

        # The two leading 0.05 values (struct and comp parameters) are
        # arbitrary placeholders. Users are expected to reset them with the
        # structPriorHeuristic method.
        mixPrior = mixture.MixtureModelPrior(
            0.05, 0.05, weight_prior, position_priors)

    nucleotides = mixture.Alphabet(['A', 'C', 'G', 'T'])

    components = []
    for _comp in range(G):
        # One randomly parameterised discrete distribution per site position.
        columns = [
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), nucleotides)
            for _pos in range(p)
        ]
        components.append(mixture.ProductDistribution(columns))

    weights = mixture.random_vector(G)
    return mixture.BayesMixtureModel(
        G, weights, components, mixPrior, struct=1)
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format
    (http://www.soe.ucsc.edu/compbio/dirichlets/) and converts them into
    PyMix DirichletMixturePrior objects.

    Note that the alphabet in the UCSC priors does not contain the gap symbol.
    For the output DirichletMixturePrior the gap symbol '-' is appended to the
    alphabet with a parameter value of 0.01 in all components.

    @param filename: file in UCSC DMP format

    @return: tuple (alphabet, prior) holding the mixture.Alphabet (including
    the gap symbol) and the DirichletMixturePrior object
    @raise ValueError: if the file contains no 'Order' line, so that no
    alphabet can be built
    """
    # Raw strings keep the patterns identical while avoiding invalid-escape
    # warnings for '\s' and '\d'.
    ex1 = re.compile(r'Mixture=\s(\d+.\d+)')
    ex2 = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    ex3 = re.compile(r'Alpha=\s+([\d+.\d+\s+,\d+e-]+)')

    pi = []
    sigma = None
    alpha_mat = []

    # The context manager guarantees the file is closed, even on parse errors.
    with open(filename, 'r') as f:
        for l in f:
            l = mixture.chomp(l)

            m1 = ex1.match(l)
            if m1:
                pi.append(float(m1.group(1)))

            m2 = ex2.match(l)
            if m2:
                # Split on single spaces, exactly as before, so the resulting
                # alphabet symbols are unchanged.
                sigma = m2.group(1).split(' ')

            m3 = ex3.match(l)
            if m3:
                # Build a real list (not a lazy map object) so that pop()
                # works on Python 3 as well; split() tolerates repeated
                # whitespace between values.
                alpha = [float(a) for a in m3.group(1).split()]
                alpha.pop(0)  # first entry is the sum of the others -> remove
                alpha_mat.append(alpha)

    if sigma is None:
        raise ValueError(
            "no 'Order' line found in UCSC prior file: %s" % (filename,))

    # integrate gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)

    dComp = []
    for alpha in alpha_mat:
        alpha.append(0.01)  # add hyperparameter for '-'
        dComp.append(mixture.DirichletPrior(21, alpha))

    prior = mixture.DirichletMixturePrior(len(dComp), 21, pi, dComp)
    return alphabet, prior
def getModel(G, p):
    """
    Builds a PWM MixtureModel over the DNA alphabet with randomly drawn
    mixture weights and per-position nucleotide distributions.

    @param G: number of components
    @param p: number of positions of the binding site

    @return: MixtureModel object
    """
    nucleotides = mixture.Alphabet(['A', 'C', 'G', 'T'])

    components = []
    for _comp in range(G):
        # One randomly parameterised discrete distribution per site position.
        columns = [
            mixture.DiscreteDistribution(
                4, mixture.random_vector(4), nucleotides)
            for _pos in range(p)
        ]
        components.append(mixture.ProductDistribution(columns))

    weights = mixture.random_vector(G)
    return mixture.MixtureModel(G, weights, components)
def scanSequence(mix, bg, seq, scoring='mix'):
    """
    Scores all positions of a sequence with the given model and background.

    A window of mix.components[0].dist_nr symbols is slid over the sequence
    one position at a time; each window contributes one score, computed as
    the model value minus the background value.

    @param mix: MixtureModel object
    @param bg: background MixtureModel object
    @param seq: sequence as list of nucleotides
    @param scoring: flag to determine the scoring scheme used for the
    mixtures. 'compmax' means maximum density over the components, 'mix'
    means true mixture density

    @return: list of position-wise log-odd scores (empty if the sequence is
    shorter than the window)
    @raise ValueError: if a window has to be scored and scoring is neither
    'mix' nor 'compmax'
    """
    # convert sequence to internal representation, alphabet of seq must be DNA
    alph = mixture.Alphabet(['A', 'C', 'G', 'T'])
    # A real list is needed (not a lazy map object) because it is sliced and
    # indexed below.
    seq = [alph.internal(x) for x in seq]

    dnr = mix.components[0].dist_nr

    # init with dummy value at first position; the dummy is shifted out by
    # the first loop iteration before anything is scored
    window = np.array([[-1] + seq[0:dnr - 1]])

    score = []
    for pos in range(dnr - 1, len(seq)):
        # shift query sequence by one position
        window[0] = np.concatenate(
            [window[0][1:], np.array([seq[pos]])], 0)

        if scoring == 'compmax':
            # score as maximum over components
            c_m_l = np.zeros(mix.G, dtype=float)
            for k in range(mix.G):
                c_m_l[k] = mix.components[k].pdf(window)[0]
            m_l = c_m_l.max()
        elif scoring == 'mix':
            m_l = mix.pdf(window)[0]
        else:
            # Fail explicitly instead of hitting an unbound local variable.
            raise ValueError("unknown scoring scheme: %r" % (scoring,))

        # presumably pdf() returns log-densities, making this a log-odds
        # score -- TODO confirm against the mixture module
        bg_l = bg.pdf(window)[0]
        score.append(m_l - bg_l)

    return score
def getBackgroundModel(p, dist=None):
    """
    Construct background model: a single-component MixtureModel whose p
    positions all use the same nucleotide distribution.

    @param p: number of positions of the binding site
    @param dist: background nucleotide frequencies, uniform is default

    @return: MixtureModel representing the background
    """
    DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])

    # Identity test rather than '== None': an equality comparison is
    # evaluated elementwise when dist is a numpy array.
    phi = [0.25] * 4 if dist is None else dist

    # The same phi object is handed to every position, as before.
    dlist = [mixture.DiscreteDistribution(4, phi, DNA) for _pos in range(p)]

    comps = [mixture.ProductDistribution(dlist)]
    return mixture.MixtureModel(1, [1.0], comps)
from pymix import mixture import random from numpy import numarray VNTR = mixture.Alphabet([ '.', '2/4', '2/7', '3/4', '3/7', '4/4', '4/6', '4/7', '4/8', '4/9', '7/7' ]) DIAG = mixture.Alphabet(['.', '0', '8', '1']) data = mixture.DataSet() # iq.txt = iq and achievement test fields from pheno.txt # drd4_len.txt = drd4 vntr types, only number of repeats data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"]) COMOR = 11 G = 8 components = [] for i in range(G): # intelligence and achivement tests as univariate normal distributions. (TEST) bd_mu = float(random.randint(3, 16)) bd_sigma = random.uniform(1.0, 8.0) missing_bd = mixture.NormalDistribution(-9999.9, 0.00001) dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma) mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd], compFix=[0, 2]) voc_mu = float(random.randint(3, 16)) voc_sigma = random.uniform(1.0, 8.0) missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)