Example #1
import numpy as np
from pymix import mixture


def testLymphData():

    k = 5   # number of mixture components
    d = 11  # dimensionality of the dependence trees

    aux = [0] * d  # unused in this test

    models = []

    for i in range(k):
        aux1 = [0] * d
        aux2 = [0] * d
        aux3 = [0] * d
        models.append(mixture.ProductDistribution([mixture.DependenceTreeDistribution(d, aux1, aux2, aux3)]))

    # uniform component weights
    pi = np.array([1.0] * k) / k

    train = mixture.MixtureModel(k, pi, models)

    data = mixture.DataSet()
    data.fromFiles(['data/ltree2_2fold.txt'])

    train.modelInitialization(data)

    train.EM(data, 100, 0.01, silent=1)
Example #2
def plotUnivariateNormalMixtureDensity(m, axis, title= None, format= '-b'):
    """

    """

    # -5, 10.0, -5.0, 10.0

    x = pylab.arange(axis[0],axis[1],0.02)
    dat_x = mixture.DataSet()
    dat_x.fromList(x)
    dat_x.internalInit(m)

    #print len(x)
    #print len(y)


    y = np.exp( m.pdf(dat_x) )


    #pylab.figure()
    pylab.plot(x, y, format)
    pylab.axis(axis)


    if title:
        pylab.title(title)
    else:
        pylab.title('Normal Mixture Density Plot')
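
A minimal usage sketch for the function above; the component parameters, weights, and axis range are illustrative assumptions, not values from the source.

from pymix import mixture
import pylab

# Hypothetical two-component univariate normal mixture.
comps = [mixture.NormalDistribution(0.0, 1.0), mixture.NormalDistribution(4.0, 1.5)]
m = mixture.MixtureModel(2, [0.6, 0.4], comps)

# Axis format as in the function: [x_start, x_end, y_start, y_end].
plotUnivariateNormalMixtureDensity(m, [-5.0, 10.0, 0.0, 0.5], title='Two-component example')
pylab.show()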
Example #3
def readSites(fileName):
    """
    Flat file parser for the JASPAR .sites format. The files are essentially FASTA,
    but there is a count matrix at the end of the file.

    @param fileName: File name of .sites file
    @return: DataSet object
    """
    f = open(fileName, "r")

    seq_head = re.compile("^\>(.*)")
    end = re.compile("^[A,C,G,T,a,c,g,t]\s*\[")
    seq = []

    ids = []
    count = 1
    for line in f:
        line = mixture.chomp(line)
        #line = line.upper()
        s = seq_head.search(line)
        if s:
            #print s.groups(1)[0]
            tl = s.groups(1)[0].split('\t')

            ids.append(str(tl[1]) + '_' + str(tl[2]))

            #ids.append('seq'+str(count))
            #count +=1
            #print s.group(1)

        elif end.search(line):
            break
        else:
            if len(line) > 0:
                line = list(line)

                # remove lower case letters, only upper case letters are part of the
                # binding site
                site = []
                for i, s in enumerate(line):
                    if s.isupper():
                        site.append(s)

                seq.append(site)
                #print len(site)

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data
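
A short usage sketch, assuming a JASPAR .sites file is available locally; the file name below is a placeholder, not from the source.

# 'MA0001.sites' is a hypothetical local file in JASPAR .sites format.
data = readSites('MA0001.sites')
print data  # DataSet with one row per binding site, IDs built from the sequence headers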
Example #4
def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in FASTA format and returns the sequences in a DataSet object.

    @param fileName: name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet or ConstrainedDataSet object
    """
    f = open(fileName, "r")
    index = -1
    seqM = []
    nameList = []
    partSeq = ""
    nameReg = re.compile("^\>(.*)")

    try:
        while True:
            line = f.next()
            s = nameReg.search(line)
            if s:
                if index != -1:
                    if partSeq[len(partSeq) - 2:len(partSeq)] == "//":
                        partSeq = partSeq[0:len(partSeq) - 2]

                    partSeq = partSeq.upper(
                    )  # upper case letters by convention
                    seqM.append(list(partSeq))
                partSeq = ""
                index += 1
                nameList.append(mixture.chomp(s.group(1)))
            else:
                partSeq += mixture.chomp(line)

    except StopIteration:
        if partSeq[len(partSeq) - 2:len(partSeq)] == "//":
            partSeq = partSeq[0:len(partSeq) - 2]
        partSeq = partSeq.upper()
        seqM.append(list(partSeq))

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError, 'Invalid output type ' + str(out_type)

    data.fromList(seqM, IDs=nameList)

    return data
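
A brief usage sketch with a placeholder file name; out_type selects which of the two supported container types is returned.

# 'promoters.fa' is a hypothetical FASTA file.
data = readFastaSequences('promoters.fa')                                   # DataSet
cdata = readFastaSequences('promoters.fa', out_type='ConstrainedDataSet')  # ConstrainedDataSet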
Example #5
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a mixture.DataSet object.

    @param fn: file name of the .aln alignment file
    @param reg_str: regular expression for sequence parsing
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet or ConstrainedDataSet object
    """

    f = open(fn, 'r')
    if reg_str:
        parse = re.compile(reg_str)
    else:
        parse = re.compile("(\w+\|\w+)\s+([\w,\-,.]+)")

    d = {}
    f.readline()  # remove first line

    for l in f:
        l = mixture.chomp(l)
        pat = parse.search(l)
        if pat:
            k = pat.group(1)
            seq = pat.group(2)
            if k in d.keys():
                d[k] += seq
            else:
                d[k] = seq

        else:
            continue

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError, 'Invalid output type ' + str(out_type)

    sIDs = d.keys()
    dMatrix = []
    for z in d.keys():
        dMatrix.append(list(d[z]))

    data.fromList(dMatrix, IDs=sIDs)

    return data
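
Usage sketch, assuming a CLUSTALW .aln file is present; the second call shows a user-supplied parsing pattern, an illustrative alternative to the default regular expression.

# 'family.aln' is a hypothetical alignment file.
data = readAlnData('family.aln')

# Same file, parsed with a custom identifier/sequence pattern (two groups required).
data2 = readAlnData('family.aln', reg_str=r"(\S+)\s+([\w\-.]+)")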
Example #6
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding site matrices. JASPAR files are
    essentially FASTA, but only upper case letters are part of the binding site proper;
    lower case letters are discarded.

    @param fileName: file name of the JASPAR flat file
    @return: DataSet object
    """
    f = open(fileName, "r")

    seq_head = re.compile("^\>(.*)")
    end = re.compile("^[A,C,G,T,a,c,g,t]\s*\[")
    seq = []

    ids = []
    count = 1
    for line in f:
        line = mixture.chomp(line)
        #line = line.upper()
        s = seq_head.search(line)
        if s:
            ids.append('seq' + str(count))
            count += 1
            #print s.group(1)

        elif end.search(line):
            break
        else:
            if len(line) > 0:
                line = list(line)

                # remove lower case letters, only upper case letters are part of the
                # binding site
                site = []
                for i, s in enumerate(line):
                    if s.isupper():
                        site.append(s)

                seq.append(site)
                #print len(site)

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)

    return data
Example #7
def mixture_model(allele_freq,
                  max_components,
                  p_mean=np.nan,
                  p_std=np.nan,
                  quiet=False):
    data = mixture.DataSet()
    data.fromList(allele_freq)

    distributions = []
    for i in xrange(max_components):
        if np.isnan(p_mean):
            mean = random()
        else:
            mean = p_mean
        if np.isnan(p_std):
            std = random()
        else:
            std = p_std
        distributions.append(mixture.NormalDistribution(mean, std))

    total_models = []
    for i in xrange(max_components):
        weights = list(np.repeat(1.0 / (i + 1), i + 1))
        components = distributions[0:i + 1]
        model = mixture.MixtureModel(i + 1, weights, components)

        model.EM(data, 1000, 0.001, silent=quiet)
        if not quiet:
            print
            print model
            print '------------------------------'
        total_models.append(model)

    model_selections = mixture.modelSelection(data, total_models, silent=quiet)
    best_model = total_models[model_selections[1].index(
        min(model_selections[1]))]
    best_model_bic = min(model_selections[1])
    labels = best_model.classify(data, silent=1)

    return best_model, labels, best_model_bic
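
A self-contained usage sketch; the allele frequencies below are synthetic random values, purely for illustration.

from random import random

# 200 synthetic allele-frequency values in [0, 1).
allele_freq = [random() for _ in xrange(200)]

best_model, labels, bic = mixture_model(allele_freq, max_components=3, quiet=True)
print 'BIC of selected model:', bic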
Example #8
from pymix import mixture
import random
from numpy import numarray

VNTR = mixture.Alphabet([
    '.', '2/4', '2/7', '3/4', '3/7', '4/4', '4/6', '4/7', '4/8', '4/9', '7/7'
])
DIAG = mixture.Alphabet(['.', '0', '8', '1'])

data = mixture.DataSet()

# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"])

COMOR = 11
G = 8
components = []
for i in range(G):

    # intelligence and achievement tests as univariate normal distributions. (TEST)
    bd_mu = float(random.randint(3, 16))
    bd_sigma = random.uniform(1.0, 8.0)
    missing_bd = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma)
    mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd],
                                  compFix=[0, 2])

    voc_mu = float(random.randint(3, 16))
    voc_sigma = random.uniform(1.0, 8.0)
    missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)
Example #9
def plotMixtureEntropy(mix, axis):
    """
    @param axis: matlab-like axis coordinates: [x_start, x_end, y_start, y_end]

    """

    # -5, 10.0, -5.0, 10.0

    x = pylab.arange(axis[0],axis[1]+0.1,0.1)
    y = pylab.arange(axis[2],axis[3]+0.1,0.1)

    #print x
    #print y

    #print len(x)
    #print len(y)


    #X,Y = pylab.meshgrid(x,y)
    #z = pylab.exp(-(X*X + Y*Y)) + 0.6*pylab.exp(-((X+1.8)**2 + Y**2))
    #pylab.contour(x,y,z)

    z = np.zeros( (len(y),len(x)),dtype='Float64' )
    for i in range(len(y)):
        dat = np.zeros((len(x),2),dtype='Float64' )
        dat[:,1] = y[i]
        dat[:,0] = x
        #print np.exp(mix.pdf(dat))


        #print "---------------------------\n",dat
        data = mixture.DataSet()
        data.fromList(dat)
        data.internalInit(mix)

        l = mixture.get_posterior(mix,data,logreturn=False)

        #print l


        #print np.exp(mix.pdf(dat)).tolist()
        for j in range(len(x)):

            z[i,j] = mixture.entropy(l[:,j])
            #print dat[j,:] ,":",l[:,j], "=",z[i,j]
        #print "---------------------------\n"

    #print "z", len(z),'x', len(z[0]) ,'=', len(z) * len(z[0])

    print "max",z.max()
    #max_val = z.max()

    max_val = np.log(mix.G) # maximum entropy for a vector of length mix.G
    print "theor. max", max_val

    step = max_val / 10.0
    print "step",step

    #pylab.figure(1)
    #pylab.contour(x,y,z)

    #pylab.figure(2)
    #pylab.contour(x,y,z,pylab.arange(0,max_val,step))
    #pylab.legend()

#    pylab.colorbar()
    pylab.contourf(x,y,z,) # pylab.arange(0,max_val,step)



    pylab.title('Posterior Entropy Plot')
Example #10
def plotNormalMixtureDensity(mix, axis, title= None, newfigure=False, fill=True, alpha=1.0):
    """
    @param axis: matlab-like axis coordinates: [x_start, x_end, y_start, y_end]

    """

    if newfigure:
        pylab.figure()

    # -5, 10.0, -5.0, 10.0

    x = pylab.arange(axis[0],axis[1]+0.1,0.1)
    y = pylab.arange(axis[2],axis[3]+0.1,0.1)

    #print len(x)
    #print len(y)


    #X,Y = pylab.meshgrid(x,y)
    #z = pylab.exp(-(X*X + Y*Y)) + 0.6*pylab.exp(-((X+1.8)**2 + Y**2))
    #pylab.contour(x,y,z)

    z = np.zeros( (len(y),len(x)),dtype='Float64' )
    for i in range(len(y)):
        ndat = np.zeros((len(x),2),dtype='Float64' )
        ndat[:,1] = y[i]
        ndat[:,0] = x
        #print np.exp(mix.pdf(dat))

        dat = mixture.DataSet()
        dat.fromList(ndat)
        dat.internalInit(mix)

        # XXX pdf is log valued, we want the true value XXX

        #print np.exp(mix.pdf(dat)).tolist()

        z[i,:] = np.exp(mix.pdf(dat))


    #print "z", len(z),'x', len(z[0]) ,'=', len(z) * len(z[0])

    #print "max",z.max()
    max_val = z.max()

    step = max_val / 40.0

    #step = max_val / 200.0

    #print "step",step

    #pylab.figure(1)
    #pylab.contour(x,y,z)

    if fill:
        pylab.contourf(x,y,z,pylab.arange(0,max_val,step),alpha=alpha)
    else:
        pylab.contour(x,y,z,pylab.arange(0,max_val,step),alpha=alpha)

    if title:
        pylab.title(title)
    else:
        pylab.title('Normal Mixture Density Plot')
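
A usage sketch for the bivariate case, building each mixture component as a ProductDistribution of two univariate normals; all parameter values below are illustrative assumptions, not from the source.

from pymix import mixture
import pylab

# Two hypothetical bivariate components, each a product of independent normals.
c1 = mixture.ProductDistribution([mixture.NormalDistribution(0.0, 1.0),
                                  mixture.NormalDistribution(0.0, 1.0)])
c2 = mixture.ProductDistribution([mixture.NormalDistribution(5.0, 1.5),
                                  mixture.NormalDistribution(5.0, 1.5)])
mix = mixture.MixtureModel(2, [0.5, 0.5], [c1, c2])

# Axis format: [x_start, x_end, y_start, y_end].
plotNormalMixtureDensity(mix, [-5.0, 10.0, -5.0, 10.0], title='Two-component example', newfigure=True)
pylab.show()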
Example #11
#k = 2
#xd = pm.NormalDistribution(datamu[0], k*datasigma[0])
#yd = pm.NormalDistribution(datamu[1], k*datasigma[1])
xmin, ymin = data.min(axis=0)
xmax, ymax = data.max(axis=0)
#xmean, ymean = np.mean([xmin, xmax]), np.mean([ymin, ymax])
width, height = xmax-xmin, ymax-ymin
xd = pm.UniformDistribution(xmin, xmax)
yd = pm.UniformDistribution(ymin, ymax)
distrib = pm.ProductDistribution([xd, yd])
distribs.append(distrib)
compFix = [0] * ndistribs
compFix[-1] = 1 # flag to make last distrib have fixed params
'''

pmdata = pm.DataSet()
pmdata.fromArray(data)

m = pm.MixtureModel(ndistribs,
                    np.ones(ndistribs) / ndistribs,
                    distribs,
                    compFix=None)
#m.modelInitialization(pmdata) # this hangs? only for multivariate distribs, works fine for productdistribs
posterior, loglikelihood = m.EM(pmdata, 50, 0.1)
#posterior, loglikelihood = m.randMaxEM(pmdata, 20, 100, 0.5, silent=False)

cids = m.classify(pmdata, entropy_cutoff=0.5, silent=True)

ncolours = len(COLOURS)
colouris = cids % ncolours
colours = np.asarray(COLOURS)[colouris]