def normalize(self, probe_int, probe_seq, probe_type, outFilePath, outputExtraModelData=False):
    """Fit the EM mixture model to probe intensities and write normalized scores.

    Only probe ids present in both ``probe_seq`` and ``probe_int`` are used,
    processed in sorted id order.  ``probe_type`` is looked up for each of
    those ids, so it must contain all of them (``KeyError`` otherwise).

    Args:
        probe_int: mapping of probe id -> intensity; ``log2`` is applied to
            the values.
        probe_seq: mapping of probe id -> probe sequence, passed through
            ``self._encode`` and ``self._design_matrix``.
        probe_type: mapping of probe id -> type code; only probes whose type
            is ``"M"`` take part in fitting the model.
        outFilePath: path of the tab-separated output file (overwritten).
        outputExtraModelData: when true, three extra columns are appended to
            every line: log2 intensity and the two model predictions.

    Output lines have the form ``<id>\\t<normalized score>\\t<gam[i, 1]>``
    followed by the optional extra columns.
    """
    probe_ids = sorted(set(probe_seq.keys()) & set(probe_int.keys()))
    pseq = [probe_seq[p] for p in probe_ids]
    pint = [probe_int[p] for p in probe_ids]
    ptype = [probe_type[p] for p in probe_ids]

    mx = self._design_matrix(array([self._encode(v) for v in pseq]))

    # Fixed seed keeps the jitter (and therefore the output) reproducible.
    random.seed(1)
    randomNoise = array([random.random() / 10000000 for _ in range(len(pint))])
    # The jitter avoids a problem where too many identical values results in
    # some bins having no values.
    my = log2(array(pint)) + randomNoise

    model = EM()
    # Fit only on the sampled probes that are also of type "M".
    mainProbeIndices = [i for i, t in enumerate(ptype) if t == "M"]
    samplingProbeIndices = list(set(self._sample(len(probe_ids))) & set(mainProbeIndices))
    nbins = 25
    b1, b2 = model.EM_vMix(my[samplingProbeIndices,], mx[samplingProbeIndices,], bins=nbins)
    m1 = dot(mx, b1)
    m2 = dot(mx, b2)

    # Standardize in groups of `binsize` probes ordered by the m1 prediction.
    binsize = 5000
    # float() forces true division: with Python 2 integer division the ceil()
    # was a no-op, the trailing partial group was dropped and its probes kept
    # the initial 0 in y_norm.
    nGroups = int(ceil(size(my) / float(binsize)))
    index = argsort(m1)
    y_norm = zeros(size(my), 'f')
    for i in arange(nGroups):
        tmp = index[(binsize * i):min([binsize * i + binsize, size(my)])]
        tmpSd = self._sig(my[tmp], m1[tmp])
        y_norm[tmp] = ((my[tmp] - m1[tmp]) / tmpSd).tolist()

    model.assign_bin(m1, bins=nbins)
    gam = model.vresp(my, mx)

    print("Outputting to %s" % outFilePath)
    # `with` guarantees the file is closed even if formatting/writing fails.
    with open(outFilePath, 'w') as outFile:
        outLines = []
        for i, pid in enumerate(probe_ids):
            out = pid + "\t%.9f\t%.9f" % (y_norm[i], gam[i, 1])
            if outputExtraModelData:
                out += "\t%.9f\t%.9f\t%.9f" % (my[i], m1[i], m2[i])
            outLines.append(out)
            # Flush in batches so the pending buffer stays bounded.
            if len(outLines) % 100000 == 0:
                outFile.write("\n".join(outLines) + "\n")
                outLines = []
        if outLines:
            outFile.write("\n".join(outLines) + "\n")
def run_mnist():
    """Run one EM iteration on a small MNIST subset (digits 1, 4 and 5).

    The first ``data_per_class`` training images of each digit are flattened
    into row vectors and handed to ``EM`` with three classes and three
    nuisances; the prior, variance and mean image of every fitted component
    are then printed / shown.
    """
    # FIXME: running EM on MNIST has the problem that all data collapses to
    # one class. This is because the likelihood for that class is slightly
    # higher than for all the others. It probably has to do with the variance
    # (from k-means) being lower for one class, and that mattering more than
    # closeness to the mean for such high-dimensional data?
    # Running it with 0 iterations (i.e. on k-means only) works fine; then it
    # finds different orientations of the digits.
    data_per_class = 20
    training_data = list(mnist.read("training"))
    dim_x, dim_y = np.shape(training_data[0][1])

    def first_images(digit):
        # Entries are indexed as (label, image).
        matching = [entry[1] for entry in training_data if entry[0] == digit]
        return matching[:data_per_class]

    selected = first_images(1) + first_images(4) + first_images(5)
    data = np.array(selected).reshape((-1, dim_x * dim_y))

    solver = EM(data=data, num_classes=3, num_nuisances=3)
    _, thetas = solver.fit(max_iter=1)

    for c, class_thetas in enumerate(thetas):
        for n, theta in enumerate(class_thetas):
            print(f"Prior: {theta.prior}, Var: {theta.variance}")
            mnist.show(thetas[c][n].mean.reshape(28, 28))
def f2():
    """Run EM on one Gaussian-smoothed grayscale image from the 'Train' set."""
    image_path = get_filename('0d648f99c.jpg', 'Train')
    grayscale = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    smoothed = gaussian(grayscale, 0.5)
    # Resizing is currently disabled:
    # img = cv2.resize(img, dsize=tile_size)
    EM(smoothed).run_it()
def __init__(self):
    """Initialise the Thread and EM bases and flag the object as a live daemon.

    ``alive`` and ``first`` both start out as ``True``.
    """
    # Both bases are initialised explicitly, Thread first.
    for base in (Thread, EM):
        base.__init__(self)
    self.daemon = True
    self.alive = self.first = True
@return [1,3,5,7,8, ....] """ interval = total/self.N if interval <= 1: interval = 1 return [i for i in range(start,total,interval)] def _quantile_normalize(self,x): xr = sort(x,axis=0) xm = mean(x,axis=1) return xm[argsort(x,axis=0)] def _design_matrix(self,PMProbe): """DesignMatrix(PMProbe): make design matrix from PMProbe PMProbe: probe sequence Int8 array, Constructs a 80 pars X-matrix for the model 3 * 25 'ACG' postions + 4 * 2 'ACGT' count and count square """ x = zeros((PMProbe.shape[0],80), 'f') x[:,0] = sum(PMProbe == self.code['T'],1).astype('float32') j = 1 for ibase in 'ACG': x[:, j:j+25] = (PMProbe == self.code[ibase]) j += 25
def model_dict(X, Z, K):
    """Fit every clustering model on ``X`` and gather the results for plotting.

    Args:
        - X : (N, d) data
        - Z : (N) labels
        - K : Number of clusters

    Returns:
        - models : Dictionary keyed by model name ("ground truth",
          "diagonal EM", "general EM", "K-means"). Each entry holds the
          cluster means ("mean"), the covariance matrices when the model has
          them ("cov", otherwise None) and the per-sample labels ("labels").
    """
    # Ground truth: one mean per label value 0 .. (number of distinct labels - 1).
    n_true = len(np.unique(Z))
    true_means = np.array([X[Z == label].mean(0) for label in range(n_true)])
    models = {"ground truth": {"mean": true_means, "cov": None, "labels": Z}}

    # EM with diagonal covariances; the diagonals are expanded to full matrices.
    diag_em = EM(K)
    diag_em.fit(X)
    models["diagonal EM"] = {
        "mean": diag_em.mus,
        "cov": np.array([np.diag(diag_em.Ds[k]) for k in range(K)]),
        "labels": diag_em.labels_,
    }

    # EM with general covariances.
    general_em = GaussianMixture(K)
    general_em.fit(X)
    # Responsibilities: weighted component densities normalised per sample.
    weighted_pdfs = np.array(
        [
            multivariate_normal.pdf(
                X, general_em.means_[k], general_em.covariances_[k]
            )
            * general_em.weights_[k]
            for k in range(K)
        ]
    )
    responsibilities = weighted_pdfs / weighted_pdfs.sum(0)
    models["general EM"] = {
        "mean": general_em.means_,
        "cov": general_em.covariances_,
        "labels": responsibilities.argmax(0),
    }

    # K-means.
    kmeans = KMeans(K)
    kmeans.fit(X)
    models["K-means"] = {
        "mean": kmeans.cluster_centers_,
        "cov": None,
        "labels": kmeans.labels_,
    }

    return models
def run_gaussian():
    """Fit EM (with plotting) on generated Gaussian data and print the result.

    Two clusters of 15 points each are generated; EM is run with as many
    classes as clusters and a single nuisance, and the mean and variance of
    every fitted component are printed.
    """
    data_per_cluster = 15
    num_clusters = 2
    data = generate_gaussian_data(data_per_cluster, num_clusters)

    # Arguments are (data, num_classes, num_nuisances).
    solver = EM(data, num_clusters, 1)
    _, thetas = solver.fit_and_plot(5)

    all_thetas = (theta for class_thetas in thetas for theta in class_thetas)
    for theta in all_thetas:
        print(f"Mean: {theta.mean} - Var: {theta.variance}")
def __init__(self, host, port, rcon_password):
    """Initialise the Thread and EM bases and store the connection settings.

    A TCP (IPv4, stream) socket is created here and kept in ``self.conn``;
    this constructor does not connect it.

    Args:
        host: stored as ``self.host``.
        port: stored as ``self.port``.
        rcon_password: stored as ``self.rcon_password``.
    """
    Thread.__init__(self)
    EM.__init__(self)
    self.daemon, self.alive = True, True
    self.conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.host, self.port, self.rcon_password = host, port, rcon_password
    self.first = True
def _create_model_and_run_em(self, d, k, mode, nframes):
    """Sample from a generated mixture model and train a fresh model on it with EM.

    Args:
        d: number of dimensions.
        k: number of components.
        mode: passed to ``GM.gen_param`` and ``GM``.
        nframes: number of frames to sample from the generated model.
    """
    # --- Reference model with k components in d dimensions ---
    weights, means, variances = GM.gen_param(d, k, mode, spread=1.5)
    reference = GM.fromvalues(weights, means, variances)
    frames = reference.sample(nframes)

    # --- Approximate it with classical EM, initialised with 'kmean' ---
    learned = GMM(GM(d, k, mode), 'kmean')
    EM().train(frames, learned)
def _create_model_and_run_em(self, d, k, mode, nframes):
    """Generate a mixture model, draw samples from it and run classical EM.

    Args:
        d: number of dimensions of the model.
        k: number of mixture components.
        mode: forwarded to ``GM.gen_param`` and to the ``GM`` constructor.
        nframes: how many frames are sampled for training.
    """
    # Parameters of the model the data is drawn from.
    true_w, true_mu, true_va = GM.gen_param(d, k, mode, spread=1.5)
    training_frames = GM.fromvalues(true_w, true_mu, true_va).sample(nframes)

    # Untrained model of the same size, initialised with 'kmean'.
    blank_model = GM(d, k, mode)
    trainee = GMM(blank_model, 'kmean')

    # The value returned by train() is not used.
    trainer = EM()
    trainer.train(training_frames, trainee)
def _test(self, dataset, log):
    """Train on a stored dataset and compare the result with its reference values.

    The dataset provides the initial parameters (``w0``, ``mu0``, ``va0``),
    the training ``data`` and the expected parameters (``w``, ``mu``, ``va``)
    after training.

    Args:
        dataset: identifier handed to ``load_dataset``.
        log: forwarded to ``EM.train`` as its ``log`` keyword.
    """
    reference = load_dataset(dataset)
    initial = GM.fromvalues(reference['w0'], reference['mu0'], reference['va0'])
    model = GMM(initial, 'test')
    EM().train(reference['data'], model, log=log)

    # Weights, means and variances must match to DEF_DEC decimals.
    for name in ('w', 'mu', 'va'):
        assert_array_almost_equal(getattr(model.gm, name), reference[name], DEF_DEC)
def process_image(image_id, thread_number, locale):
    """Run EM on one image and write its prediction matrix to ``../EM_Result``.

    On success the prediction matrix is written to
    ``../EM_Result/<image_id without extension>`` (one row per line, values
    formatted with ``%d`` and separated by spaces) and ``image_id`` is
    appended to ``../EM_Result/_Thread_<thread_number>``.  On any failure the
    exception is printed and ``image_id`` is appended to
    ``../EM_Result/_Thread_Error_<thread_number>`` instead.

    Args:
        image_id: image file name, resolved through ``get_filename``.
        thread_number: integer id of the worker; used in log messages and in
            the per-thread bookkeeping file names.
        locale: not used by this function.
    """
    try:
        print("%d start" % thread_number)
        img = cv2.imread(get_filename(image_id, "final"), cv2.IMREAD_GRAYSCALE)
        img = gaussian(img, 0.5)  # TODO maybe change to 1
        # Keep the smoothed full-size image: EM runs on the resized copy,
        # the prediction is made on the full-size one.
        final_img = img.copy()
        img = cv2.resize(img, dsize=tile_size)

        print("%d go EM" % thread_number)
        em = EM(img, thread_number)
        em.run_it()
        print("%d done EM" % thread_number)

        st = timer()
        prediction_matrix = create_prediction_image(
            final_img, em.miu, em.sigma, em.pgk, em.clusters
        )
        fs = timer()
        print('Prediciton matrix %d' % (fs - st))

        # `with` closes the files even if a write fails part-way through.
        with open("../EM_Result/%s" % image_id.split(".")[0], "w") as out_file:
            for line in prediction_matrix:
                out_file.write("".join("%d " % el for el in line) + "\n")

        with open("../EM_Result/_Thread_%d" % thread_number, "a") as thread_file:
            thread_file.write(image_id + "\n")
    except Exception as e:
        # Best-effort worker: report the failure and record the image so it
        # can be retried, rather than letting one image stop the thread.
        print(e)
        with open("../EM_Result/_Thread_Error_%d" % thread_number, "a") as th_err:
            th_err.write(image_id + "\n")
def train(self, word_root_procs):
    """Estimate the model probabilities from words and their candidate analyses.

    Args:
        word_root_procs: iterable of ``(word, root_procs)`` pairs, where
            ``root_procs`` is an iterable of ``(root, proc)`` candidates for
            that word.

    Each candidate is turned into features, the ``(word, features)`` pairs
    are passed to ``EM().estimate`` and the three returned values are stored
    on the instance, which is then marked as trained.
    """
    word_cand_feats = [
        (word, [self.__features(root, proc) for root, proc in root_procs])
        for word, root_procs in word_root_procs
    ]
    estimates = EM().estimate(word_cand_feats)
    self.__prob_roots, self.__prob_pats, self.__prob_trans_on_feat = estimates
    self.__trained = True
def set_up_experiment(args):
    """Prepare an experiment run and build its ``Patches`` and ``EM`` objects.

    Creates the output directory if it does not exist, records the arguments
    in ``<output>/args.txt`` (boolean arguments that are ``False`` are left
    out), seeds NumPy's random generator and constructs the objects.

    Args:
        args: namespace holding the experiment arguments.

    Returns:
        Tuple ``(patches, em)``.
    """
    out_dir = args.output
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # One "-name=value \" line per argument.
    with open(os.path.join(out_dir, "args.txt"), "w") as args_file:
        for name, value in vars(args).items():
            if not isinstance(value, bool) or value:
                args_file.write("-{}={} \\\n".format(name, value))

    np.random.seed(args.random_seed)

    patches = Patches(
        input_path=args.input,
        source_path=args.source,
        patch_size=args.patch_size,
        patch_overlap=args.patch_overlap,
        pca_k=args.pca_k,
        color=(args.color == "color"),
    )

    # Computed but not used further in this function.
    initial_posteriors_path = os.path.join(out_dir, "initial_posteriors.npy")

    lbp_params = {
        "output_dir": out_dir,
        "two_sigma2": args.lbp_two_sigma2,
        "iterations": args.lbp_iterations,
        "seed": args.random_seed,
    }

    em = EM(
        patches=patches,
        num_candidates=args.num_candidates,
        num_transformations=args.num_transformations,
        lbp_params=lbp_params,
        lambdas_init_type=args.init_transformations,
    )
    return patches, em
"""Ad-hoc test script.

Draws a sample from a two-component ``ModelGM`` and prints the estimates
returned by ``DMM.estimate`` and ``DMM.estimate_online``.
"""
import time
from model_gm import ModelGM, sample_gm
from em import EM
from mm import MM
from dmm import DMM
from discrete_rv import wass

if __name__ == '__main__':
    k = 2
    # mm and em are constructed here but only referenced by the
    # commented-out experiments below.
    mm = MM(k, sigma=1)
    em = EM(k, sigma=1)
    dmm = DMM(k, sigma=None)

    # Model the sample is drawn from: equal weights, components at -0.5 and 0.5.
    model = ModelGM(w=[0.5, 0.5], x=[-.5, .5], std=1)
    sample = sample_gm(model, 10000)

    # Disabled: EM estimate compared with the true model via `wass`.
    # esti_em = em.estimate(sample)
    # print(wass(esti_em.mean_rv(), model.mean_rv()))

    print(dmm.estimate(sample))
    print(dmm.estimate_online(sample))

    # Disabled: further comparisons.
    # print(wass(esti_dmm, model.mean_rv()))
    # print(mm.estimate(sample))
    # dmm = DMM(k=2)
    # print(dmm.estimate(sample))
from scipy.stats import multivariate_normal import math from sklearn.datasets import make_blobs import matplotlib.pyplot as plt import time import os if __name__ == "__main__": if not os.path.exists('plots'): os.makedirs('plots') plt.rcParams['figure.figsize'] = [15, 15] X,y = make_blobs(n_samples=200, centers=[[2.5,3.5],[3.5,5.5],[4,10]]) em = EM(X,num_clusters=3) means,sigmas = em.cluster_means,em.cluster_cov_mtrxs iters = 16 color_vals = ['red','blue','green'] for i in np.arange(iters): print(f'Performing iteration {i}/{iters-1}..') plt.clf() plt.axis([-5, 30, -5, 30]) plt.scatter(X[:,0],X[:,1],marker='o', c=y, s=35, edgecolor='k') j = 0 for mu,cov in zip(means,sigmas): x_plt, y_plt = np.mgrid[mu[0] - 3*cov[0,0]:mu[0] +3*cov[0,0]:.01, mu[1] - 3*cov[1,1]:mu[1] + 3*cov[1,1]:.01] pos = np.empty(x_plt.shape + (2,)) pos[:, :, 0] = x_plt
from em import EM
from dataset import Dataset
from reader import Reader
from nltk import word_tokenize

# Alternative input path, currently disabled: build the dataset from a
# tokenised text file instead of the reader.
# d = []
# with open('in2.txt', 'r') as file:
#     lines = file.readlines()
#     for l in lines:
#         d.append(word_tokenize(l[:-1]))
#
# dataset = Dataset(d)
# em = EM(dataset, 2, 0.0000000000000001)
# em.do()

if __name__ == "__main__":
    # Run EM once per value in range(5, 6) and print the likelihood that
    # each run reports.
    d, d_names = Reader.read('../ROBOT/64')
    dataset = Dataset(d, d_names)
    lh = {}
    for i in range(5, 6):
        em = EM(dataset, i, 0.0000000000000001)
        em.do()
        # TODO: save the results
        lh[i] = em.likelyhood
    for key, value in lh.items():
        print(str(key) + ': ' + str(value))
turn_off_the_lights_validation_samples, turn_off_the_lights_samples, turn_off_the_lights_hmm, 11 ]) if not os.path.exists(what_time_is_it_hmm): training_list.append([ what_time_is_it_validation_samples, what_time_is_it_samples, what_time_is_it_hmm, 10 ]) csv.write([ "Trainee", "# of States", "Iteration", "Test Set", "Max", "Min", "Mean", "Std", "5%/95% Cutoff", "10%/90% Cutoff", "15%/85% Cutoff", "20%/80% Cutoff" ]) em = EM() for item in training_list: for q in range(6, 40, 4): for iteration in range(0, 2): folder_name = os.path.split(item[1])[-1] hmm_path = "%s_%d_%d.hmm" % (os.path.splitext( item[2])[0], q, iteration) if os.path.exists(hmm_path): print("Skipping already trained %s for q=%d and i=%d..." % (folder_name, q, iteration)) continue print("Training %s for q=%d and i=%d..." % (folder_name, q, iteration)) speech_hmm = em.build_hmm_from_folder(item[1], q,
import glob, os, sys, time, posix