def testLymphData():
    """Train a 5-component dependence-tree mixture on the lymphoma data set.

    Builds k ProductDistribution components, each wrapping one
    DependenceTreeDistribution over d features (tree parameters start as
    zeros; EM re-estimates them), assigns uniform component weights, loads
    'data/ltree2_2fold.txt' and runs EM.
    """
    k = 5   # number of mixture components
    d = 11  # number of features per sample
    models = []
    for i in range(k):
        # zero-initialised tree structure / parameter vectors
        aux1 = [0] * d
        aux2 = [0] * d
        aux3 = [0] * d
        models.append(mixture.ProductDistribution(
            [mixture.DependenceTreeDistribution(d, aux1, aux2, aux3)]))
    pi = np.array([1.0] * k) / k  # uniform mixture weights
    train = mixture.MixtureModel(k, pi, models)
    data = mixture.DataSet()
    data.fromFiles(['data/ltree2_2fold.txt'])
    train.modelInitialization(data)
    train.EM(data, 100, 0.01, silent=1)
def plotUnivariateNormalMixtureDensity(m, axis, title= None, format= '-b'):
    """Plot the univariate density of mixture `m` on the current pylab figure.

    @param m: mixture model; pdf() is log-valued
    @param axis: matlab-like axis coordinates; only axis[0] and axis[1] are used
    @param title: optional plot title (defaults to a generic one)
    @param format: pylab line-format string
    """
    xs = pylab.arange(axis[0], axis[1], 0.02)
    ds = mixture.DataSet()
    ds.fromList(xs)
    ds.internalInit(m)
    # pdf() returns log-densities; exponentiate for the true density values
    ys = np.exp(m.pdf(ds))
    pylab.plot(xs, ys, format)
    pylab.title(title if title else 'Normal Mixture Density Plot')
def readSites(fileName):
    """
    Flat file parser for the JASPAR .sites format.
    The files are essentially fasta but there is a count matrix at the end of the file.

    @param fileName: File name of .sites file
    @return: DataSet object
    """
    seq_head = re.compile(r"^\>(.*)")
    # a line such as 'A [ ... ]' marks the start of the trailing count matrix
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")
    seq = []
    ids = []
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            s = seq_head.search(line)
            if s:
                # header fields are tab separated; fields 1 and 2 form the ID
                tl = s.groups(1)[0].split('\t')
                ids.append(str(tl[1]) + '_' + str(tl[2]))
            elif end.search(line):
                break  # count matrix reached; no more sequences follow
            elif len(line) > 0:
                # only upper case letters are part of the binding site proper
                seq.append([c for c in line if c.isupper()])
    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)
    return data
def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in fasta format and returns the sequences in a DataSet object.

    @param fileName: Name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet or ConstrainedDataSet holding one character list per sequence
    @raise TypeError: for an invalid out_type
    """
    nameReg = re.compile(r"^\>(.*)")
    seqM = []
    nameList = []
    partSeq = ""
    seen_header = False
    with open(fileName, "r") as f:  # closed even on errors (original leaked it)
        for line in f:
            s = nameReg.search(line)
            if s:
                if seen_header:
                    # a new header flushes the sequence accumulated so far
                    seqM.append(_cleanFastaSeq(partSeq))
                    partSeq = ""
                seen_header = True
                nameList.append(mixture.chomp(s.group(1)))
            else:
                partSeq += mixture.chomp(line)
    # flush the last (or only) sequence at end of file
    seqM.append(_cleanFastaSeq(partSeq))
    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))
    data.fromList(seqM, IDs=nameList)
    return data


def _cleanFastaSeq(partSeq):
    """Strip a trailing '//' terminator, upper-case (fasta convention), listify."""
    if partSeq[-2:] == "//":
        partSeq = partSeq[:-2]
    return list(partSeq.upper())
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a
    mixture.DataSet object.

    @param fn: input file name
    @param reg_str: regular expression for sequence parsing; group 1 must be
        the sequence ID, group 2 the aligned sequence chunk
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet or ConstrainedDataSet object
    @raise TypeError: for an invalid out_type
    """
    if reg_str:
        parse = re.compile(reg_str)
    else:
        parse = re.compile(r"(\w+\|\w+)\s+([\w,\-,.]+)")
    d = {}
    with open(fn, 'r') as f:  # closed even on errors (original leaked it)
        f.readline()  # skip the CLUSTALW header line
        for l in f:
            pat = parse.search(mixture.chomp(l))
            if pat:
                # concatenate alignment chunks belonging to the same ID
                k = pat.group(1)
                d[k] = d.get(k, '') + pat.group(2)
    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))
    # build IDs and data rows from the same key list so they stay aligned
    sIDs = list(d.keys())
    dMatrix = [list(d[z]) for z in sIDs]
    data.fromList(dMatrix, IDs=sIDs)
    return data
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding sites matrices.

    JASPAR files are essentially fasta, but only upper case letters are part
    of the binding site proper. Lower case letters are discarded.

    @param fileName: input file name
    @return: DataSet object with generated IDs 'seq1', 'seq2', ...
    """
    seq_head = re.compile(r"^\>(.*)")
    # a line such as 'A [ ... ]' marks the start of the trailing count matrix
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")
    seq = []
    ids = []
    count = 1
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)
            if seq_head.search(line):
                ids.append('seq' + str(count))
                count += 1
            elif end.search(line):
                break  # count matrix reached; no more sequences follow
            elif len(line) > 0:
                # only upper case letters are part of the binding site
                seq.append([c for c in line if c.isupper()])
    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)
    return data
def mixture_model(allele_freq, max_components, p_mean=np.nan, p_std=np.nan, quiet=False):
    """Fit normal mixtures with 1..max_components components to allele_freq.

    Mean/std of each component default to uniform random draws unless p_mean /
    p_std are given. The best model is chosen by minimal BIC via
    mixture.modelSelection.

    @return: (best_model, labels, best_model_bic)
    """
    data = mixture.DataSet()
    data.fromList(allele_freq)
    # one shared pool of normal components; model i reuses the first i+1 of them
    pool = []
    for _ in range(max_components):
        mu = random() if np.isnan(p_mean) else p_mean
        sigma = random() if np.isnan(p_std) else p_std
        pool.append(mixture.NormalDistribution(mu, sigma))
    fitted = []
    for i in range(max_components):
        n = i + 1
        model = mixture.MixtureModel(n, list(np.repeat(1.0 / n, n)), pool[0:n])
        model.EM(data, 1000, 0.001, silent=quiet)
        if not quiet:
            print('')
            print(model)
            print('------------------------------')
        fitted.append(model)
    selection = mixture.modelSelection(data, fitted, silent=quiet)
    bics = selection[1]
    best_model = fitted[bics.index(min(bics))]
    best_model_bic = min(bics)
    labels = best_model.classify(data, silent=1)
    return best_model, labels, best_model_bic
# Script fragment: build mixture components over IQ/achievement test scores,
# physical measures and DRD4 VNTR genotype data.
# NOTE(review): the for-loop below appears truncated in this chunk -- the
# remainder of the component construction is not visible here.
from pymix import mixture
import random
from numpy import numarray

# alphabet of observed DRD4 VNTR repeat genotypes ('.' = missing value)
VNTR = mixture.Alphabet([
    '.', '2/4', '2/7', '3/4', '3/7', '4/4', '4/6', '4/7', '4/8', '4/9', '7/7'
])
# diagnosis codes ('.' = missing) -- NOTE(review): meaning of 0/8/1 not shown
# in this chunk; verify against the data files
DIAG = mixture.Alphabet(['.', '0', '8', '1'])

data = mixture.DataSet()

# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"])

COMOR = 11  # presumably the number of comorbidity features -- TODO confirm
G = 8  # number of mixture components
components = []
for i in range(G):
    # intelligence and achivement tests as univariate normal distributions. (TEST)
    bd_mu = float(random.randint(3, 16))
    bd_sigma = random.uniform(1.0, 8.0)
    # narrow component centred on the -9999.9 missing-value sentinel
    missing_bd = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma)
    # compFix=[0, 2]: second (missing-value) component flagged as fixed --
    # NOTE(review): exact semantics of flag value 2 not visible here
    mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd],
                                  compFix=[0, 2])
    voc_mu = float(random.randint(3, 16))
    voc_sigma = random.uniform(1.0, 8.0)
    missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)
def plotMixtureEntropy(mix, axis): """ @param axis: matlab-like axis coordinates: [x_start, x_end, y_start, y_end] """ # -5, 10.0, -5.0, 10.0 x = pylab.arange(axis[0],axis[1]+0.1,0.1) y = pylab.arange(axis[2],axis[3]+0.1,0.1) #print x #print y #print len(x) #print len(y) #X,Y = pylab.meshgrid(x,y) #z = pylab.exp(-(X*X + Y*Y)) + 0.6*pylab.exp(-((X+1.8)**2 + Y**2)) #pylab.contour(x,y,z) z = np.zeros( (len(y),len(x)),dtype='Float64' ) for i in range(len(y)): dat = np.zeros((len(x),2),dtype='Float64' ) dat[:,1] = y[i] dat[:,0] = x #print np.exp(mix.pdf(dat)) #print "---------------------------\n",dat data = mixture.DataSet() data.fromList(dat) data.internalInit(mix) l = mixture.get_posterior(mix,data,logreturn=False) #print l #print np.exp(mix.pdf(dat)).tolist() for j in range(len(x)): z[i,j] = mixture.entropy(l[:,j]) #print dat[j,:] ,":",l[:,j], "=",z[i,j] #print "---------------------------\n" #print "z", len(z),'x', len(z[0]) ,'=', len(z) * len(z[0]) print "max",z.max() #max_val = z.max() max_val = np.log(mix.G) # maximum entropy for a vector of length mix.G print "theor. max", max_val step = max_val / 10.0 print "step",step #pylab.figure(1) #pylab.contour(x,y,z) #pylab.figure(2) #pylab.contour(x,y,z,pylab.arange(0,max_val,step)) #pylab.legend() # pylab.colorbar() pylab.contourf(x,y,z,) # pylab.arange(0,max_val,step) pylab.title('Posterior Entropy Plot')
def plotNormalMixtureDensity(mix, axis, title= None, newfigure=False, fill=True, alpha=1.0):
    """Contour plot of the 2D density of mixture `mix` over a regular grid.

    @param axis: matlab-like axis coordinates: [x_start, x_end, y_start, y_end]
    @param title: optional plot title
    @param newfigure: open a fresh pylab figure before plotting
    @param fill: filled contours (contourf) vs. line contours (contour)
    @param alpha: contour transparency
    """
    if newfigure == True:
        pylab.figure()
    xs = pylab.arange(axis[0], axis[1] + 0.1, 0.1)
    ys = pylab.arange(axis[2], axis[3] + 0.1, 0.1)
    z = np.zeros((len(ys), len(xs)), dtype='Float64')
    # evaluate the density row by row (one fixed y per row)
    for row in range(len(ys)):
        grid = np.zeros((len(xs), 2), dtype='Float64')
        grid[:, 0] = xs
        grid[:, 1] = ys[row]
        ds = mixture.DataSet()
        ds.fromList(grid)
        ds.internalInit(mix)
        # pdf is log-valued; exponentiate for the true density
        z[row, :] = np.exp(mix.pdf(ds))
    max_val = z.max()
    step = max_val / 40.0
    levels = pylab.arange(0, max_val, step)
    if fill == True:
        pylab.contourf(xs, ys, z, levels, alpha=alpha)
    else:
        pylab.contour(xs, ys, z, levels, alpha=alpha)
    pylab.title(title if title else 'Normal Mixture Density Plot')
#k = 2 #xd = pm.NormalDistribution(datamu[0], k*datasigma[0]) #yd = pm.NormalDistribution(datamu[1], k*datasigma[1]) xmin, ymin = data.min(axis=0) xmax, ymax = data.max(axis=0) #xmean, ymean = np.mean([xmin, xmax]), np.mean([ymin, ymax]) width, height = xmax-xmin, ymax-ymin xd = pm.UniformDistribution(xmin, xmax) yd = pm.UniformDistribution(ymin, ymax) distrib = pm.ProductDistribution([xd, yd]) distribs.append(distrib) compFix = [0] * ndistribs compFix[-1] = 1 # flag to make last distrib have fixed params ''' pmdata = pm.DataSet() pmdata.fromArray(data) m = pm.MixtureModel(ndistribs, np.ones(ndistribs) / ndistribs, distribs, compFix=None) #m.modelInitialization(pmdata) # this hangs? only for multivariate distribs, works fine for productdistribs posterior, loglikelihood = m.EM(pmdata, 50, 0.1) #posterior, loglikelihood = m.randMaxEM(pmdata, 20, 100, 0.5, silent=False) cids = m.classify(pmdata, entropy_cutoff=0.5, silent=True) ncolours = len(COLOURS) colouris = cids % ncolours colours = np.asarray(COLOURS)[colouris]