def testem(self):
    # complex DataSet with HMM sequences and scalar data
    dat = self.gen.sampleSet(100)

    # sampling hmm data
    seq1 = self.h1.hmm.sample(40, 10)
    seq2 = self.h2.hmm.sample(60, 10)
    seq1.merge(seq2)

    data = mixtureHMM.SequenceDataSet()
    data.fromGHMM(dat, [seq1])
    data.internalInit(self.m)

    tA = [[0.5, 0.2, 0.3], [0.2, 0.3, 0.5], [0.1, 0.5, 0.4]]
    tB = [[0.2, 0.4, 0.1, 0.3], [0.5, 0.1, 0.2, 0.2], [0.4, 0.3, 0.15, 0.15]]
    tpi = [0.3, 0.3, 0.4]
    th1 = mixtureHMM.getHMM(
        mixtureHMM.ghmm.IntegerRange(0, 4),
        mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
        tA, tB, tpi)

    tA2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
    tB2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.4], [0.2, 0.1, 0.6, 0.1]]
    tpi2 = [0.3, 0.4, 0.3]
    th2 = mixtureHMM.getHMM(
        mixtureHMM.ghmm.IntegerRange(0, 4),
        mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
        tA2, tB2, tpi2)

    tn1 = mixture.NormalDistribution(-1.5, 1.5)
    tn2 = mixture.NormalDistribution(9.0, 1.2)
    tmult1 = mixture.MultinomialDistribution(3, 4, [0.1, 0.1, 0.55, 0.25], alphabet=self.DIAG)
    tmult2 = mixture.MultinomialDistribution(3, 4, [0.4, 0.3, 0.1, 0.2], alphabet=self.DIAG)

    tc1 = mixture.ProductDistribution([tn1, tmult1, th1])
    tc2 = mixture.ProductDistribution([tn2, tmult2, th2])

    tmpi = [0.7, 0.3]
    tm = mixture.MixtureModel(2, tmpi, [tc1, tc2])

    tm.EM(data, 80, 0.1, silent=1)
def setUp(self):
    # building generating models
    self.DIAG = mixture.Alphabet(['.', '0', '8', '1'])

    A = [[0.3, 0.6, 0.1], [0.0, 0.5, 0.5], [0.4, 0.2, 0.4]]
    B = [[0.5, 0.2, 0.1, 0.2], [0.5, 0.4, 0.05, 0.05], [0.8, 0.1, 0.05, 0.05]]
    pi = [1.0, 0.0, 0.0]
    self.h1 = mixtureHMM.getHMM(
        mixtureHMM.ghmm.IntegerRange(0, 4),
        mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
        A, B, pi)

    A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
    B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.5], [0.2, 0.2, 0.3, 0.3]]
    pi2 = [0.6, 0.4, 0.0]
    self.h2 = mixtureHMM.getHMM(
        mixtureHMM.ghmm.IntegerRange(0, 4),
        mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
        A2, B2, pi2)

    n1 = mixture.NormalDistribution(2.5, 0.5)
    n2 = mixture.NormalDistribution(6.0, 0.8)
    mult1 = mixture.MultinomialDistribution(3, 4, [0.23, 0.26, 0.26, 0.25], alphabet=self.DIAG)
    mult2 = mixture.MultinomialDistribution(3, 4, [0.7, 0.1, 0.1, 0.1], alphabet=self.DIAG)

    c1 = mixture.ProductDistribution([n1, mult1, self.h1])
    c2 = mixture.ProductDistribution([n2, mult2, self.h2])

    mpi = [0.4, 0.6]
    self.m = mixture.MixtureModel(2, mpi, [c1, c2])

    # mixture for sampling
    gc1 = mixture.ProductDistribution([n1, mult1])
    gc2 = mixture.ProductDistribution([n2, mult2])
    self.gen = mixture.MixtureModel(2, mpi, [gc1, gc2])
def sample(self, returnType='tuple'):
    assert returnType in ['tuple', 'object']

    # draw the precision from the Gamma prior and invert it to get a variance
    grand = random.gammavariate(self.shape, self.scale)
    sigma = 1.0 / grand

    # draw the mean from a Normal centered at self.mu with variance tau * sigma
    mu = random.normalvariate(self.mu, math.sqrt(self.tau * sigma))

    if returnType == 'tuple':
        return (mu, sigma)
    elif returnType == 'object':
        return mixture.NormalDistribution(mu, math.sqrt(sigma))
def createDistribution(data, distribution):
    # creating a component
    p = data.p

    # type of distribution
    dist = None
    if distribution == 'normal':
        # independent standard Normal for each feature
        dists = []
        for i in range(data.p):
            dists.append(mixture.NormalDistribution(0, 1))
        dist = mixture.ProductDistribution(dists)
    else:
        # single linear Gaussian distribution over all features
        sigma = [1]
        beta = []
        for i in range(data.p):
            beta.append(random.normalvariate(0, 1))
        dist = mixture.ProductDistribution(
            [mixtureLinearGaussian.LinearGaussianDistribution(p, beta, sigma)])
    return dist
def getRandomMixture(G, p, KL_lower, KL_upper, dtypes='discgauss', M=4, seed=None):
#     if seed:
#         random.seed(seed)
#         mixture._C_mixextend.set_gsl_rng_seed(seed)
#     else:  # XXX debug
#         seed = random.randint(1, 9000000)
#         mixture._C_mixextend.set_gsl_rng_seed(seed)
#         random.seed(seed)

    # M = 4  # Alphabet size for discrete distributions

    min_sigma = 0.1   # minimal std for Normal
    max_sigma = 1.0   # maximal std for Normal
    min_mu = -5.0     # minimal mean
    max_mu = 8.0      # maximal mean

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    C = []
    for j in range(p):
        c_j = []
        for i in range(G):
            if featureTypes[j] == 0:
                # rejection sampling: candidate must have symmetric KL distance
                # within [KL_lower, KL_upper] to all previously accepted distributions
                acc = 0
                while acc == 0:
                    cand = mixture.DiscreteDistribution(M, mixture.random_vector(M))
                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            break
                c_j.append(cand)
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    cand = mixture.NormalDistribution(mu, sigma)
                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                c_j.append(cand)
            else:
                raise RuntimeError
        C.append(c_j)

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.1)

    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()

    return m
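# get_random_pi is used above (and in the CSI variant below) but not defined in this
# excerpt. A minimal sketch of a compatible helper, assuming the second argument is a
# lower bound on the individual mixture weights (consistent with the calls
# get_random_pi(G, 0.1) and get_random_pi(G, 0.3 / G)); the rejection-sampling
# approach is an assumption, not the original implementation.
def get_random_pi(G, min_pi):
    pi = mixture.random_vector(G)
    while min(pi) < min_pi:     # resample until every weight clears the lower bound
        pi = mixture.random_vector(G)
    return pi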
def getRandomCSIMixture_conditionalDists(G, p, KL_lower, KL_upper, M=8, dtypes='discgauss',
                                         seed=None, fullstruct=False, disc_sampling_dist=None):
#     if seed:
#         random.seed(seed)
#         mixture._C_mixextend.set_gsl_rng_seed(seed)
#     else:  # XXX debug
#         seed = random.randint(1, 9999999)
#         mixture._C_mixextend.set_gsl_rng_seed(seed)
#         random.seed(seed)

    if disc_sampling_dist is None:
        discSamp = mixture.DirichletPrior(M, [1.0] * M)  # uniform sampling
    else:
        discSamp = disc_sampling_dist

    min_sigma = 0.3   # minimal std for Normal
    max_sigma = 5.0   # maximal std for Normal
    min_mu = -25.0    # minimal mean
    max_mu = 25.0     # maximal mean

    assert dtypes in ['disc', 'gauss', 'discgauss']
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    # generate random CSI structures
    if G < 15:
        P = setPartitions.generate_all_partitions(G)  # XXX too slow for large G

    C = []
    leaders = []
    groups = []
    for j in range(p):
        c_j = {}
        leaders_j = []
        groups_j = {}

        if fullstruct == True:
            struct_j = [(i,) for i in range(G)]
        elif G < 15:
            struct_j = random.choice(P)
        else:
            print 'WARNING: improper structure sampling !'
            struct_j = setPartitions.get_random_partition(G)

        for i, grp in enumerate(struct_j):
            lg = list(grp)
            lgj = lg.pop(0)
            leaders_j.append(lgj)
            groups_j[lgj] = lg

            max_tries = 100000
            tries = 0

            if featureTypes[j] == 0:
                acc = 0
                while acc == 0:
                    cand = discSamp.sample()
                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break
                    if tries >= max_tries:
                        raise RuntimeError, 'Failed to find separated parameters !'
                for cind in grp:
                    c_j[cind] = cand
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    cand = mixture.NormalDistribution(mu, sigma)
                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break
                    if tries >= max_tries:
                        raise RuntimeError
                for cind in grp:
                    c_j[cind] = cand
            else:
                raise RuntimeError

        leaders.append(leaders_j)
        groups.append(groups_j)
        C.append(c_j)

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.3 / G)

    # create prior
    piprior = mixture.DirichletPrior(G, [2.0] * G)
    cprior = []
    for j in range(p):
        if featureTypes[j] == 0:
            cprior.append(mixture.DirichletPrior(M, [1.02] * M))
        elif featureTypes[j] == 1:
            cprior.append(mixture.NormalGammaPrior(0, 0, 0, 0))  # dummy parameters, to be set later
        else:
            raise RuntimeError
    mprior = mixture.MixtureModelPrior(0.1, 0.1, piprior, cprior)

    m = mixture.BayesMixtureModel(G, pi, comps, mprior, struct=1)
    m.leaders = leaders
    m.groups = groups
    m.identifiable()
    m.updateFreeParams()

    return m
import labeledBayesMixture
import mixture
import copy

# Setting up a three component Bayesian mixture over four features.
# Two features are Normal distributions, two discrete.

# initializing atomic distributions for the first component
n11 = mixture.NormalDistribution(1.0, 1.5)
n12 = mixture.NormalDistribution(2.0, 0.5)
d13 = mixture.DiscreteDistribution(4, [0.1, 0.4, 0.4, 0.1])
d14 = mixture.DiscreteDistribution(4, [0.25, 0.25, 0.25, 0.25])

# initializing atomic distributions for the second component
n21 = mixture.NormalDistribution(4.0, 0.5)
n22 = mixture.NormalDistribution(-6.0, 0.5)
d23 = mixture.DiscreteDistribution(4, [0.7, 0.1, 0.1, 0.1])
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.2, 0.6])

# initializing atomic distributions for the third component
n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
d33 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.1, 0.7])
d34 = mixture.DiscreteDistribution(4, [0.6, 0.1, 0.2, 0.1])

# creating component distributions
c1 = mixture.ProductDistribution([n11, n12, d13, d14])
c2 = mixture.ProductDistribution([n21, n22, d23, d24])
c3 = mixture.ProductDistribution([n31, n32, d33, d34])

# setting up the mixture prior
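# The example breaks off at the comment above. A minimal sketch of how the mixture
# prior could be assembled from the PyMix prior classes that appear elsewhere in this
# document (DirichletPrior, NormalGammaPrior, MixtureModelPrior, BayesMixtureModel);
# all hyperparameter values and the mixture weights below are illustrative
# assumptions, not the original author's choices.
piPrior = mixture.DirichletPrior(3, [1.0] * 3)                # prior over the three mixture weights
compPrior = [mixture.NormalGammaPrior(0.0, 0.1, 3.0, 1.0),    # Normal-Gamma priors for the two Normal features
             mixture.NormalGammaPrior(0.0, 0.1, 3.0, 1.0),
             mixture.DirichletPrior(4, [1.02] * 4),           # Dirichlet priors for the two discrete features
             mixture.DirichletPrior(4, [1.02] * 4)]
prior = mixture.MixtureModelPrior(0.03, 0.03, piPrior, compPrior)

# with the prior in place, a Bayesian mixture over the three components could be built as
m = mixture.BayesMixtureModel(3, [0.4, 0.3, 0.3], [c1, c2, c3], prior, struct=1)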
stdev = numpy.std(data[label][call])
histmaxes = getHistMaxes(hist)
print(patient, sample, label, call)

### THIS IS WHERE YOU FIND THE HISTOGRAM PEAKS ###
## Data:          data[label][call]
## Peaks:         histmaxes
## Peak heights:  hist[histmaxes[n]]
## Stdev:         stdev
emdata = mixture.DataSet()
emdata.fromList(data[label][call])

numpeaks = len(histmaxes)
gaussian_objects = []
weights = []
for i in xrange(numpeaks):
    n = mixture.NormalDistribution(histmaxes[i], stdev)
    gaussian_objects.append(n)
    weights.append(hist[histmaxes[i]])
totweight = float(sum(weights))
weights = [x / totweight for x in weights]

mymix = mixture.MixtureModel(numpeaks, weights, gaussian_objects)
# print "Before", mymix
mymix.EM(emdata, 40, 0.1)
# print "After", mymix

print("Number of peaks=", mymix.G)
for i in range(mymix.G):
    print(mymix.pi[i], mymix.components[i])

summary.write(patient)
summary.write("\t" + sample)
summary.write("\t" + str(len(data[label][call])))
h1 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A, B, pi)

# seq = h1.hmm.sample(10, 50)
# print seq

A2 = [[0.5, 0.4, 0.1], [0.3, 0.2, 0.5], [0.3, 0.2, 0.5]]
B2 = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.4, 0.5], [0.2, 0.2, 0.3, 0.3]]
pi2 = [0.6, 0.4, 0.0]
h2 = mixtureHMM.getHMM(
    mixtureHMM.ghmm.IntegerRange(0, 4),
    mixtureHMM.ghmm.DiscreteDistribution(mixtureHMM.ghmm.IntegerRange(0, 4)),
    A2, B2, pi2)

n1 = mixture.NormalDistribution(2.5, 0.5)
n2 = mixture.NormalDistribution(6.0, 0.8)
mult1 = mixture.MultinomialDistribution(3, 4, [0.23, 0.26, 0.26, 0.25], alphabet=DIAG)
mult2 = mixture.MultinomialDistribution(3, 4, [0.7, 0.1, 0.1, 0.1], alphabet=DIAG)

c1 = mixture.ProductDistribution([n1, mult1, h1])
c2 = mixture.ProductDistribution([n2, mult2, h2])

mpi = [0.4, 0.6]
m = mixture.MixtureModel(2, mpi, [c1, c2])
data = mixture.DataSet()

# iq.txt = iq and achievement test fields from pheno.txt
# drd4_len.txt = drd4 vntr types, only number of repeats
data.fromFiles(["iq.txt", "phys.txt", "drd4_len.txt"])

COMOR = 11
G = 8
components = []
for i in range(G):

    # intelligence and achievement tests as univariate normal distributions. (TEST)
    bd_mu = float(random.randint(3, 16))
    bd_sigma = random.uniform(1.0, 8.0)
    missing_bd = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_bd = mixture.NormalDistribution(bd_mu, bd_sigma)
    mix_bd = mixture.MixtureModel(2, [0.999, 0.001], [dist_bd, missing_bd], compFix=[0, 2])

    voc_mu = float(random.randint(3, 16))
    voc_sigma = random.uniform(1.0, 8.0)
    missing_voc = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_voc = mixture.NormalDistribution(voc_mu, voc_sigma)
    mix_voc = mixture.MixtureModel(2, [0.999, 0.001], [dist_voc, missing_voc], compFix=[0, 2])

    read_mu = float(random.randint(80, 120))
    read_sigma = random.uniform(1.0, 28.0)
    missing_read = mixture.NormalDistribution(-9999.9, 0.00001)
    dist_read = mixture.NormalDistribution(read_mu, read_sigma)
def find_threshold(self, user_params):
    """Finds the thresholds for errors given the data using a Gaussian mixture model.

    Args:
        data: The data to fit.

    Kwargs:
        method: Whether to use [min, median, mean] of the data in each bin.
        thresh: Threshold for find_alpha.
        bins: Number of pieces of the data we look at.
        plot: Whether to plot the cdf and the two alpha cutoffs.

    Returns:
        A soft threshold (alpha0) and a strong threshold (alpha1).
    """
    max_gauss_mixtures = user_params.get("max_gauss_mixtures")
    data = self.prob_smoothed

    # http://www.pymix.org/pymix/index.php?n=PyMix.Tutorial
    # make two gaussians
    gaussian_one = mixture.NormalDistribution(numpy.mean(data), numpy.std(data))
    gaussian_two = mixture.NormalDistribution(10.0 * numpy.mean(data), numpy.std(data))
    mixture_model = mixture.MixtureModel(2, [0.99, 0.01], [gaussian_one, gaussian_two])

    EM_tuned = False
    while not EM_tuned:
        # make mix_data from a random 10% of the original data
        index_array = numpy.arange(data.size)
        numpy.random.shuffle(index_array)
        mix_data = mixture.DataSet()
        data_size = numpy.min((int(numpy.floor(data.size / 10.0)), 50000))
        mix_data.fromArray(data[index_array[:data_size]])

        try:
            mixture_model.randMaxEM(mix_data, max_gauss_mixtures, 40, 0.001, silent=True)
            EM_tuned = True
        except AssertionError:
            # pymix likes to throw assertion errors when it has small machine precision errors...
            print "Caught an assertion error in pymix, randomizing input and trying again"
        except:
            print "pymix failed to find mixture model, using single gaussian"
            gaussian_two = mixture.NormalDistribution(numpy.mean(data), numpy.std(data))
            EM_tuned = True

    # hacky, no good api access to the model components
    gauss_one_mean = float(str(mixture_model.components[0][0]).split('[')[1].split(',')[0])
    gauss_one_std = float(str(mixture_model.components[0][0]).split(', ')[1].split(']')[0])
    gauss_two_mean = float(str(mixture_model.components[1][0]).split('[')[1].split(',')[0])
    gauss_two_std = float(str(mixture_model.components[1][0]).split(', ')[1].split(']')[0])
    print "Gauss1: mu: %f, std: %f" % (gauss_one_mean, gauss_one_std)
    print "Gauss2: mu: %f, std: %f" % (gauss_two_mean, gauss_two_std)

    # keep the dominant, lower-mean component as the main threshold distribution
    if gauss_one_mean > gauss_two_mean or mixture_model.pi[1] < 0.60:
        self.thresh_main_mean = gauss_one_mean
        self.thresh_main_std = gauss_one_std
    else:
        self.thresh_main_mean = gauss_two_mean
        self.thresh_main_std = gauss_two_std
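# Note on the string parsing above ("no good api access to the model components"):
# if the NormalDistribution objects inside each ProductDistribution expose their
# parameters as attributes (the constructors used throughout these examples take
# (mu, sigma), which suggests .mu / .sigma fields), the values could be read
# directly instead. This is an assumption about PyMix internals, not part of the
# original code:
#
#     comp_one = mixture_model.components[0][0]
#     gauss_one_mean, gauss_one_std = comp_one.mu, comp_one.sigma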
for i in range(2):
    compPrior.append(mixture.NormalGammaDistribution(1.0, 2.0, 3.0, 4.0))

mixPrior = mixture.MixturePrior(0.7, 0.7, piPrior, compPrior)

DNA = mixture.Alphabet(['A', 'C', 'G', 'T'])
comps = []
for i in range(G):
    dlist = []
    for j in range(2):
        phi = mixture.random_vector(4)
        dlist.append(mixture.DiscreteDistribution(4, phi, DNA))
    for j in range(2):
        mu = j + 1.0
        sigma = j + 0.5
        dlist.append(mixture.NormalDistribution(mu, sigma))
    comps.append(mixture.ProductDistribution(dlist))

pi = mixture.random_vector(G)
m = mixture.BayesMixtureModel(G, pi, comps, mixPrior, struct=1)

mixture.writeMixture(m, 'test.bmix')
m2 = mixture.readMixture('test.bmix')
print m2
print m2.prior
import mixture

# Example for context-specific independence (CSI) structure learning.
# First we generate a data set from a three component mixture with a CSI-like structure
# in the distribution parameters. Then a five component CSI mixture is trained.
# The training should recover the true number of components (three),
# the CSI structure of the generating model as well as the distribution parameters.

# Setting up the generating model. This is a benign case in the
# sense that the components are reasonably well separated and we
# allow ourselves plenty of training data.

# Component distributions
n11 = mixture.NormalDistribution(1.0, 0.5)
n12 = mixture.NormalDistribution(2.0, 1.5)
n13 = mixture.NormalDistribution(3.0, 0.7)
d14 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])
c1 = mixture.ProductDistribution([n11, n12, n13, d14])

n21 = mixture.NormalDistribution(1.0, 0.5)
n22 = mixture.NormalDistribution(-6.0, 0.5)
n23 = mixture.NormalDistribution(3.0, 0.7)
d24 = mixture.DiscreteDistribution(4, [0.1, 0.1, 0.4, 0.4])
c2 = mixture.ProductDistribution([n21, n22, n23, d24])

n31 = mixture.NormalDistribution(2.0, 0.5)
n32 = mixture.NormalDistribution(-3.0, 0.5)
n33 = mixture.NormalDistribution(3.0, 0.7)
d34 = mixture.DiscreteDistribution(4, [0.4, 0.3, 0.1, 0.2])
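# The example breaks off after the third component's distributions. A minimal sketch
# of how the generating model described in the comments above could be completed and
# sampled; the mixture weights and the sample size are illustrative assumptions, not
# values from the original example.
c3 = mixture.ProductDistribution([n31, n32, n33, d34])
gen = mixture.MixtureModel(3, [0.4, 0.3, 0.3], [c1, c2, c3])
data = gen.sampleSet(1000)   # "plenty of training data", as the comments put it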
def clustering(k, feature_cols, feature_domains, header, table, seeds, result_file):
    best_loglike = None
    best_model = None

    # Giant random seeding loop
    data = mx.DataSet()
    data.fromArray(table)
    for r in range(1):
        # weights = np.random.random_sample(k)
        # weights_norm = weights / sum(weights)
        weights_norm = [1.0 / k] * k
        components = []

        for i in range(k):
            products = []
            for j in range(table.shape[1]):
                col_type = prep.get_col_type(feature_cols[j], header)
                col_id = feature_cols[j]

                if col_type == 'cat':
                    vals = feature_domains[col_id].keys()
                    cnt_vals = len(vals)
                    rand_dist = np.random.random_sample(cnt_vals)
                    dist = mx.DiscreteDistribution(cnt_vals, rand_dist / sum(rand_dist),
                                                   mx.Alphabet(vals))
                elif col_type == 'num':
                    min_val = feature_domains[col_id]['min']
                    max_val = feature_domains[col_id]['max']
                    # mean = random.uniform(min_val, max_val)
                    mean = seeds[header[col_id][0]][i]
                    stdev = (max_val - min_val) / 2.0 / k
                    dist = mx.NormalDistribution(mean, stdev)
                else:
                    sys.exit(1)

                products.append(dist)

            comp = mx.ProductDistribution(products)
            components.append(comp)

        mix_table = mx.MixtureModel(k, weights_norm, components)
        print mix_table

        loglike = mix_table.randMaxEM(data, 1, 50, 50)

        if not best_loglike or loglike > best_loglike:
            best_loglike = loglike
            best_model = copy.copy(mix_table)

        # data.internalInit(mix)
        # mix_table.modelInitialization(data)

    labels = best_model.classify(data, None, None, 1)

    ## output clustering results

    # count cluster sizes on sampled data
    f = open(result_file + '.stats', 'w')
    cnt = {}
    for l in labels:
        cnt[l] = 1 if l not in cnt else cnt[l] + 1
    for l in cnt:
        f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / sum(cnt.values())))
    f.close()

    mx.writeMixture(best_model, result_file + '.model')
    return best_model