# Prepare ground-truth GFs (bars): generative field i lights up row i of a
# D2 x D2 image, generative field D2 + i lights up column i.
W_gt = np.zeros((H, D2, D2))
for i in xrange(D2):
    W_gt[i, i, :] = 10.0
    W_gt[D2 + i, :, i] = 10.0
W_gt = W_gt.reshape((H, D))

# Prepare model...
model = MCA_ET(D, H, Hprime, gamma)
gt_params = {"W": W_gt, "pi": 2.0 / H, "sigma": 1.00}

# Generate training data (each rank draws its own share of the N datapoints)
my_N = N // comm.size
my_data = model.generate_data(gt_params, my_N)
dlog.append("y", my_data["y"][0:25, :])

# Initialize model parameters (to be learned)
params = {
    # 'W' : W_gt,
    "W": np.abs(5 + np.random.normal(size=W_gt.shape)),
    # Float literal on purpose: with an integer H, ``2 / H`` floors to 0
    # under Python 2 integer division, which is not a usable prior.
    "pi": 2.0 / H,
    "sigma": 5.00,
}

# params = model.noisify_params(params, anneal)
params = comm.bcast(params)

# Create and start EM annealing
em = EM(model=model, anneal=anneal)
em.data = my_data
em.lparams = params
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ MCA M_step

    Accumulates the per-datapoint sufficient statistics on this rank,
    all-reduces them over ``self.comm`` and returns updated parameters.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints, one row per datapoint

    Annealing variables used:

        anneal['T']            Temperature for det. annealing AND softmax
        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model

    Returns a dict with the keys 'W', 'pi', 'sigma' (new parameter values;
    a parameter not listed in ``self.to_learn`` is passed through) and 'Q'
    (the ET-approximated log-likelihood).
    """
    comm = self.comm
    H = self.H
    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']

    # Read in data:
    my_y = my_data['y']
    my_cand = my_data['candidates']
    my_logpj = my_suff_stat['logpj']
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    state_mtx = self.state_matrix  # shape: (no_states, Hprime)
    state_abs = self.state_abs     # shape: (no_states,)

    # To compute et_loglike:
    my_ldenom_sum = 0.0

    # Precompute
    T = anneal['T']
    T_rho = np.maximum(T, self.rho_temp_bound)
    rho = 1. / (1. - 1. / T_rho)
    beta = 1. / T
    pil_bar = np.log(pies / (1. - pies))
    Wl = np.log(W)
    Wrho = np.exp(rho * Wl)
    Wsquared = W * W

    # Some asserts
    assert np.isfinite(pil_bar).all()
    assert np.isfinite(Wl).all()
    assert np.isfinite(Wrho).all()
    assert (Wrho > 1e-86).all()

    # Subtract the per-datapoint maximum before exponentiating.
    my_corr = beta * my_logpj.max(axis=1)                 # shape: (my_N,)
    my_pjb = np.exp(beta * my_logpj - my_corr[:, None])   # shape: (my_N, no_states)

    # Precompute factor for pi/gamma update
    A_pi_gamma = 0.
    B_pi_gamma = 0.
    for gp in xrange(0, self.gamma + 1):
        a = comb(H, gp, exact=1) * pies**gp * (1. - pies)**(H - gp)
        A_pi_gamma += a
        B_pi_gamma += gp * a

    # Truncate data
    if anneal['Ncut_factor'] > 0.0:
        tracing.tracepoint("M_step:truncating")
        my_denoms = np.log(my_pjb.sum(axis=1)) + my_corr
        N_use = int(N * (1 - (1 - A_pi_gamma) * anneal['Ncut_factor']))
        cut_denom = parallel.allsort(my_denoms)[-N_use]
        which = np.array(my_denoms >= cut_denom)
        my_y = my_y[which]
        my_cand = my_cand[which]
        my_logpj = my_logpj[which]
        my_pjb = my_pjb[which]
        my_corr = my_corr[which]
        my_N, D = my_y.shape
        N_use = comm.allreduce(my_N)
    else:
        N_use = N
    dlog.append('N_use', N_use)

    # Allocate suff-stat arrays
    my_Wp = np.zeros_like(W)  # shape (H, D)
    my_Wq = np.zeros_like(W)  # shape (H, D)
    my_pi = 0.0
    my_sigma = 0.0

    # Iterate over all datapoints
    for n in xrange(my_N):
        tracing.tracepoint("M_step:iterating")
        y = my_y[n, :]          # shape (D,)
        cand = my_cand[n, :]    # shape (Hprime,)
        logpj = my_logpj[n, :]  # shape (no_states,)
        pjb = my_pjb[n, :]      # shape (no_states,)
        corr = my_corr[n]       # scalar

        this_Wp = np.zeros_like(W)  # numerator for W (current datapoint)   (H, D)
        this_Wq = np.zeros_like(W)  # denominator for W (current datapoint) (H, D)
        this_pi = 0.0               # numerator for pi update (current datapoint)
        this_sigma = 0.0            # numerator for sigma update (current datapoint)

        # Zero active hidden causes (contributes to sigma only)
        this_sigma += pjb[0] * (y**2).sum()

        # One active hidden cause
        this_Wp += (pjb[1:(H + 1), None] * Wsquared[:, :]) * y[None, :]
        this_Wq += (pjb[1:(H + 1), None] * Wsquared[:, :])
        this_pi += pjb[1:(H + 1)].sum()
        this_sigma += (pjb[1:(H + 1)] * ((W - y)**2).sum(axis=1)).sum()

        # Handle hidden states with more than 1 active cause
        Wl_ = Wl[cand]                                        # is (Hprime, D)
        Wrho_ = Wrho[cand]                                    # is (Hprime, D)
        Wlrhom1 = (rho - 1) * Wl_                             # is (Hprime, D)
        Wlbar = np.log(np.dot(state_mtx, Wrho_)) / rho        # is (no_states, D)
        Wbar = np.exp(Wlbar)                                  # is (no_states, D)
        blpj = beta * logpj[1 + H:] - corr                    # is (no_states,)
        Aid = (state_mtx[:, :, None] * np.exp(
            blpj[:, None, None]
            + (1 - rho) * Wlbar[:, None, :]
            + Wlrhom1[None, :, :])).sum(axis=0)

        assert np.isfinite(Wlbar).all()
        assert np.isfinite(Wbar).all()
        assert np.isfinite(pjb).all()
        assert np.isfinite(Aid).all()

        this_Wp[cand] += Aid * y[None, :]
        this_Wq[cand] += Aid
        this_pi += (pjb[1 + H:] * state_abs).sum()
        this_sigma += (pjb[1 + H:] * ((Wbar - y)**2).sum(axis=1)).sum()

        denom = pjb.sum()
        my_Wp += this_Wp / denom
        my_Wq += this_Wq / denom
        my_pi += this_pi / denom
        my_sigma += this_sigma / denom

        # For loglike computation: log(sum(exp(logpj))), evaluated with the
        # maximum factored out so strongly negative log-joints do not
        # underflow to log(0) = -inf.
        lmax = logpj.max()
        my_ldenom_sum += lmax + np.log(np.sum(np.exp(logpj - lmax)))

    # Calculate updated W
    if 'W' in self.to_learn:
        tracing.tracepoint("M_step:update W")
        Wp = np.empty_like(my_Wp)
        Wq = np.empty_like(my_Wq)

        assert np.isfinite(my_Wp).all()
        assert np.isfinite(my_Wq).all()

        comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
        comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])

        # Make sure we do not divide by zero
        tiny = np.finfo(Wq.dtype).tiny
        Wp[Wq < tiny] = 0.
        Wq[Wq < tiny] = tiny

        W_new = Wp / Wq
    else:
        W_new = W

    # Calculate updated pi
    if 'pi' in self.to_learn:
        tracing.tracepoint("M_step:update pi")
        assert np.isfinite(my_pi).all()
        pi_new = A_pi_gamma / B_pi_gamma * pies * comm.allreduce(my_pi) / N_use
    else:
        pi_new = pies

    # Calculate updated sigma
    if 'sigma' in self.to_learn:  # TODO: XXX see LinCA XXX (merge!)
        tracing.tracepoint("M_step:update sigma")
        assert np.isfinite(my_sigma).all()
        sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
    else:
        sigma_new = sigma

    # Put all together and compute (always) et_approx_likelihood
    ldenom_sum = comm.allreduce(my_ldenom_sum)
    # 0.5 * D instead of D / 2: the latter floors for odd D under Python 2
    # integer division.
    lAi = (H * np.log(1. - pi_new)) \
        - (0.5 * D * np.log(2 * pi)) \
        - (D * np.log(sigma_new))

    # For practical and et approx reasons we use: sum of restricted respons=1
    loglike_et = (lAi * N_use) + ldenom_sum

    return {'W': W_new, 'pi': pi_new, 'sigma': sigma_new, 'Q': loglike_et}
def E_step(self, anneal, model_params, my_data):
    """ BSC E_step

    Fills one row of log-joint terms per local datapoint: column 0 for the
    state without active causes, columns 1..H for the single-cause states
    and the remaining columns for the states in ``self.state_matrix``
    applied to the datapoint's candidates.

    my_data variables used:

        my_data['y']            Datapoints
        my_data['candidates']   Candidate H's according to selection func.

    my_data variables written:

        my_data['data_noise']   Noise drawn for this step
                                (only if anneal['data_noise'] > 0)

    Annealing variables used:

        anneal['T']             Temperature for det. annealing
        anneal['data_noise']    Scale of the Gaussian noise added to the data
        anneal['anneal_prior']  If true, the prior term is annealed as well

    If model_params has no 'mu' entry, a zero vector is stored there.

    Returns {'logpj': F} with F of shape (my_N, 1 + H + no_states).
    """
    my_y = my_data['y'].copy()
    my_cand = my_data['candidates']
    my_N, D = my_y.shape
    H = self.H

    SM = self.state_matrix      # shape: (no_states, Hprime)
    state_abs = self.state_abs  # shape: (no_states,)

    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']
    try:
        mu = model_params['mu']
    except KeyError:
        # No offset given: fall back to zero and remember it for later steps.
        mu = np.zeros(D)
        model_params['mu'] = mu

    # Precompute
    beta = 1. / anneal['T']
    pre1 = -1. / 2. / sigma / sigma
    pil_bar = np.log(pies / (1. - pies))

    # Allocate return structures
    F = np.empty([my_N, 1 + H + self.no_states])
    pre_F = np.empty([my_N, 1 + H + self.no_states])

    # Joerg's data noise idea
    data_noise_scale = anneal['data_noise']
    dlog.append('Data Noise', data_noise_scale)
    if data_noise_scale > 0.:
        my_data['data_noise'] = np.random.normal(scale=data_noise_scale,
                                                 size=my_y.shape)
        my_y += my_data['data_noise']

    # Pre-fill pre_F:
    pre_F[:, 0] = 0.
    pre_F[:, 1:H + 1] = pil_bar
    pre_F[:, 1 + H:] = pil_bar * state_abs  # is (no_states,)

    # Iterate over all datapoints
    tracing.tracepoint("E_step:iterating")
    for n in xrange(my_N):
        # Read from the (possibly noisy) working copy; indexing
        # my_data['y'] here would silently drop the noise added above.
        y = my_y[n, :] - mu
        cand = my_cand[n, :]

        # Zero active hidden causes
        F[n, 0] = pre1 * (y**2).sum()

        # Hidden states with one active cause
        F[n, 1:H + 1] = pre1 * ((W - y)**2).sum(axis=1)

        # Handle hidden states with more than 1 active cause
        W_ = W[cand]            # is (Hprime x D)
        Wbar = np.dot(SM, W_)   # is (no_states x D)
        F[n, 1 + H:] = pre1 * ((Wbar - y)**2).sum(axis=1)

    if anneal['anneal_prior']:
        F = beta * (pre_F + F)
    else:
        F = pre_F + beta * F

    return {'logpj': F}
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ MCA M_step

    Accumulates the per-datapoint sufficient statistics on this rank,
    all-reduces them over ``self.comm`` and returns updated parameters.
    NumPy divide/underflow warnings are suppressed for the duration of
    the computation.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.
        my_data['y_rc']        Data used for reverse correlation
                               (only read if self.rev_corr is set)

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints, one row per datapoint

    Annealing variables used:

        anneal['T']            Temperature for det. annealing AND softmax
        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model

    Returns a dict with the keys 'W', 'pi', 'sigma' (new parameter values;
    a parameter not listed in ``self.to_learn`` is passed through),
    'rev_corr' (zeros of shape (H, D) unless self.rev_corr is set) and 'Q'
    (the ET-approximated log-likelihood).
    """
    comm = self.comm
    H = self.H
    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']

    # Read in data:
    my_y = my_data['y']
    my_cand = my_data['candidates']
    my_logpj = my_suff_stat['logpj']
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    state_mtx = self.state_matrix  # shape: (no_states, Hprime)
    state_abs = self.state_abs     # shape: (no_states,)

    # Disable some warnings. The context manager restores the previous
    # settings even if one of the asserts below fires.
    with np.errstate(divide='ignore', under='ignore'):
        # To compute et_loglike:
        my_ldenom_sum = 0.0

        # Precompute
        T = anneal['T']
        T_rho = np.maximum(T, self.rho_T_bound)
        rho = 1. / (1. - 1. / T_rho)
        rho = np.maximum(np.minimum(rho, self.rho_ubound), self.rho_lbound)
        beta = 1. / T
        pil_bar = np.log(pies / (1. - pies))
        Wl = accel.log(np.abs(W))
        Wrho = accel.exp(rho * Wl)
        Wrhos = np.sign(W) * Wrho

        # Some asserts
        assert np.isfinite(pil_bar).all()
        assert np.isfinite(Wl).all()
        assert np.isfinite(Wrho).all()
        assert (Wrho > 1e-86).all()

        # Subtract the per-datapoint maximum before exponentiating.
        my_corr = beta * my_logpj.max(axis=1)            # shape: (my_N,)
        my_logpjb = beta * my_logpj - my_corr[:, None]   # shape: (my_N, no_states)
        my_pjb = accel.exp(my_logpjb)                    # shape: (my_N, no_states)

        # Precompute factor for pi update and ET cutting
        A_pi_gamma = 0.
        B_pi_gamma = 0.
        for gp in xrange(0, self.gamma + 1):
            a = comb(H, gp) * pies**gp * (1. - pies)**(H - gp)
            A_pi_gamma += a
            B_pi_gamma += gp * a

        # Truncate data
        if anneal['Ncut_factor'] > 0.0:
            tracing.tracepoint("M_step:truncating")
            my_logdenoms = accel.log(my_pjb.sum(axis=1)) + my_corr
            N_use = int(N * (1 - (1 - A_pi_gamma) * anneal['Ncut_factor']))
            cut_denom = parallel.allsort(my_logdenoms)[-N_use]
            my_sel, = np.where(my_logdenoms >= cut_denom)
            my_N, = my_sel.shape
            N_use = comm.allreduce(my_N)
        else:
            my_N, _ = my_y.shape
            my_sel = np.arange(my_N)
            N_use = N

        # Allocate suff-stat arrays
        my_Wp = np.zeros_like(W)  # shape (H, D)
        my_Wq = np.zeros_like(W)  # shape (H, D)
        my_pi = 0.0
        my_sigma = 0.0

        # Do reverse correlation if requested
        if self.rev_corr:
            my_y_rc = my_data['y_rc']
            D_rev_corr = my_y_rc.shape[1]
            my_rev_corr = np.zeros((H, D_rev_corr))
            my_rev_corr_count = np.zeros(H)

        # Iterate over all (selected) datapoints
        tracing.tracepoint("M_step:iterating...")
        dlog.append('N_use', N_use)
        for n in my_sel:
            y = my_y[n, :]            # shape (D,)
            cand = my_cand[n, :]      # shape (Hprime,)
            logpj = my_logpj[n, :]    # shape (no_states,)
            logpjb = my_logpjb[n, :]  # shape (no_states,)
            pjb = my_pjb[n, :]        # shape (no_states,)

            this_Wp = np.zeros_like(W)  # numerator for W (current datapoint)   (H, D)
            this_Wq = np.zeros_like(W)  # denominator for W (current datapoint) (H, D)
            this_pi = 0.0               # numerator for pi update (current datapoint)
            this_sigma = 0.0            # numerator for sigma update (current datapoint)

            # Zero active hidden causes (contributes to sigma only)
            this_sigma += pjb[0] * (y**2).sum()

            # One active hidden cause
            this_Wp += (pjb[1:(H + 1), None]) * y[None, :]
            this_Wq += (pjb[1:(H + 1), None])
            this_pi += pjb[1:(H + 1)].sum()
            this_sigma += (pjb[1:(H + 1)] * ((W - y)**2).sum(axis=1)).sum()

            # Handle hidden states with more than 1 active cause
            Wl_ = Wl[cand]          # is (Hprime, D)
            Wrhos_ = Wrhos[cand]    # is (Hprime, D)

            t0 = np.dot(state_mtx, Wrhos_)
            Wlbar = accel.log(np.abs(t0)) / rho     # is (no_states, D)
            Wbar = np.sign(t0) * accel.exp(Wlbar)   # is (no_states, D)

            t = Wlbar[:, None, :] - Wl_[None, :, :]
            t = np.maximum(t, 0.)
            Aid = state_mtx[:, :, None] * accel.exp(
                logpjb[H + 1:, None, None] - (rho - 1) * t)
            Aid = Aid.sum(axis=0)

            this_Wp[cand] += Aid * y[None, :]
            this_Wq[cand] += Aid
            this_pi += (pjb[1 + H:] * state_abs).sum()
            this_sigma += (pjb[1 + H:] * ((Wbar - y)**2).sum(axis=1)).sum()

            denom = pjb.sum()
            my_Wp += this_Wp / denom
            my_Wq += this_Wq / denom
            my_pi += this_pi / denom
            my_sigma += this_sigma / denom

            # For loglike computation: log(sum(exp(logpj))), evaluated with
            # the maximum factored out so strongly negative log-joints do
            # not underflow to log(0) = -inf.
            lmax = logpj.max()
            my_ldenom_sum += lmax + accel.log(np.sum(accel.exp(logpj - lmax)))

            # Estimate reverse correlation
            if self.rev_corr:
                pys = pjb / denom
                if np.isfinite(pys).all():
                    my_rev_corr += pys[1:H + 1, None] * my_y_rc[n, None, :]
                    my_rev_corr_count += pys[1:H + 1]
                    my_rev_corr[cand] += np.sum(
                        state_mtx[:, :, None]
                        * pys[H + 1:, None, None]
                        * my_y_rc[n, None, :], axis=0)
                    # Slice (H + 1:) so every multi-cause state contributes
                    # its own weight, matching the numerator above; a plain
                    # index would weight all states by one single entry.
                    my_rev_corr_count[cand] += np.sum(
                        state_mtx[:, :] * pys[H + 1:, None], axis=0)
                else:
                    print("Not all finite rev_corr %d" % n)

        # Calculate updated W
        if 'W' in self.to_learn:
            tracing.tracepoint("M_step:update W")

            Wp = np.empty_like(my_Wp)
            Wq = np.empty_like(my_Wq)

            assert np.isfinite(my_Wp).all()
            assert np.isfinite(my_Wq).all()

            comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
            comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])

            # Make sure we do not divide by zero
            tiny = self.tol
            Wq[Wq < tiny] = tiny

            # Calculate updated W
            W_new = Wp / Wq

            # Add inertia depending on Wq: entries with a small denominator
            # stay closer to the previous W.
            alpha = 2.5
            inertia = np.maximum(1. - accel.exp(-Wq / alpha), 0.2)
            W_new = inertia * W_new + (1 - inertia) * W
        else:
            W_new = W

        # Calculate updated pi
        if 'pi' in self.to_learn:
            tracing.tracepoint("M_step:update pi")
            assert np.isfinite(my_pi).all()
            pi_new = A_pi_gamma / B_pi_gamma * pies * comm.allreduce(my_pi) / N_use
        else:
            pi_new = pies

        # Calculate updated sigma
        if 'sigma' in self.to_learn:  # TODO: XXX see LinCA XXX (merge!)
            tracing.tracepoint("M_step:update sigma")
            assert np.isfinite(my_sigma).all()
            sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
        else:
            sigma_new = sigma

        # Put all together and compute (always) et_approx_likelihood
        ldenom_sum = comm.allreduce(my_ldenom_sum)
        # 0.5 * D instead of D / 2: the latter floors for odd D under
        # Python 2 integer division.
        lAi = (H * np.log(1. - pi_new)) \
            - (0.5 * D * np.log(2 * pi)) \
            - (D * np.log(sigma_new))

        # For practical and et approx reasons we use: sum of restricted respons=1
        loglike_et = (lAi * N_use) + ldenom_sum

        if self.rev_corr:
            rev_corr = np.empty_like(my_rev_corr)
            rev_corr_count = np.empty_like(my_rev_corr_count)
            comm.Allreduce([my_rev_corr, MPI.DOUBLE], [rev_corr, MPI.DOUBLE])
            comm.Allreduce([my_rev_corr_count, MPI.DOUBLE],
                           [rev_corr_count, MPI.DOUBLE])
            rev_corr /= (1e-16 + rev_corr_count[:, None])
        else:
            rev_corr = np.zeros((H, D))

    return {
        'W': W_new,
        'pi': pi_new,
        'sigma': sigma_new,
        'rev_corr': rev_corr,
        'Q': loglike_et
    }
def E_step(self, anneal, model_params, my_data):
    """ BSC E_step

    Fills one row of log-joint terms per local datapoint: column 0 for the
    state without active causes, columns 1..H for the single-cause states
    and the remaining columns for the states in ``self.state_matrix``
    applied to the datapoint's candidates.

    my_data variables used:

        my_data['y']            Datapoints
        my_data['candidates']   Candidate H's according to selection func.

    my_data variables written:

        my_data['data_noise']   Noise drawn for this step
                                (only if anneal['data_noise'] > 0)

    Annealing variables used:

        anneal['T']             Temperature for det. annealing
        anneal['data_noise']    Scale of the Gaussian noise added to the data
        anneal['anneal_prior']  If true, the prior term is annealed as well

    If model_params has no 'mu' entry, a zero vector is stored there.

    Returns {'logpj': F} with F of shape (my_N, 1 + H + no_states).
    """
    my_y = my_data['y'].copy()
    my_cand = my_data['candidates']
    my_N, D = my_y.shape
    H = self.H

    SM = self.state_matrix      # shape: (no_states, Hprime)
    state_abs = self.state_abs  # shape: (no_states,)

    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']
    try:
        mu = model_params['mu']
    except KeyError:
        # No offset given: fall back to zero and remember it for later steps.
        mu = np.zeros(D)
        model_params['mu'] = mu

    # Precompute
    beta = 1./anneal['T']
    pre1 = -1./2./sigma/sigma
    pil_bar = np.log(pies/(1.-pies))

    # Allocate return structures
    n_columns = 1 + H + self.no_states
    F = np.empty([my_N, n_columns])
    pre_F = np.empty([my_N, n_columns])

    # Joerg's data noise idea
    data_noise_scale = anneal['data_noise']
    dlog.append('Data Noise', data_noise_scale)
    if data_noise_scale > 0.:
        my_data['data_noise'] = np.random.normal(scale=data_noise_scale,
                                                 size=my_y.shape)
        my_y += my_data['data_noise']

    # Pre-fill pre_F:
    pre_F[:, 0] = 0.
    pre_F[:, 1:H+1] = pil_bar
    pre_F[:, 1+H:] = pil_bar * state_abs  # is (no_states,)

    # Iterate over all datapoints
    tracing.tracepoint("E_step:iterating")
    for n in xrange(my_N):
        # Read from the (possibly noisy) working copy; indexing
        # my_data['y'] here would silently drop the noise added above.
        y = my_y[n, :] - mu
        cand = my_cand[n, :]

        # Zero active hidden causes
        F[n, 0] = pre1 * (y**2).sum()

        # Hidden states with one active cause
        F[n, 1:H+1] = pre1 * ((W-y)**2).sum(axis=1)

        # Handle hidden states with more than 1 active cause
        W_ = W[cand]            # is (Hprime x D)
        Wbar = np.dot(SM, W_)   # is (no_states x D)
        F[n, 1+H:] = pre1 * ((Wbar-y)**2).sum(axis=1)

    if anneal['anneal_prior']:
        F = beta * (pre_F + F)
    else:
        F = pre_F + beta * F

    return {'logpj': F}
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ BSC M_step

    Accumulates the per-datapoint sufficient statistics on this rank,
    all-reduces them over ``self.comm`` and returns updated parameters.
    Also logs the truncated likelihood ('L') and the number of datapoints
    used ('N', 'N_use') to ``dlog``.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.
        my_data['data_noise']  Noise added to the data
                               (only read if anneal['data_noise'] > 0)

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints, one row per datapoint

    Annealing variables used:

        anneal['data_noise']   Scale of the data noise
        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model
        anneal.crit_params     Expressions evaluated in this method's scope
                               and handed to anneal.dyn_param

    Returns a dict with the keys 'W', 'pi', 'sigma' and 'mu'; a parameter
    not listed in ``self.to_learn`` is passed through unchanged.
    """
    # NOTE: the entries of anneal.crit_params are evaluated against the
    # local scope at the end of this method, so local names (even ones that
    # look unused) are kept on purpose.
    comm = self.comm
    H, Hprime = self.H, self.Hprime
    gamma = self.gamma
    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']
    mu = model_params['mu']

    # Read in data:
    my_y = my_data['y'].copy()
    candidates = my_data['candidates']
    logpj_all = my_suff_stat['logpj']
    all_denoms = np.exp(logpj_all).sum(axis=1)
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    # Joerg's data noise idea
    data_noise_scale = anneal['data_noise']
    if data_noise_scale > 0:
        my_y += my_data['data_noise']

    SM = self.state_matrix  # shape: (no_states, Hprime)

    # To compute et_loglike:
    my_ldenom_sum = 0.0
    ldenom_sum = 0.0

    # Precompute factor for pi update
    A_pi_gamma = 0
    B_pi_gamma = 0
    for gamma_p in range(gamma + 1):
        a = comb(H, gamma_p) * (pies**gamma_p) * ((1 - pies)**(H - gamma_p))
        A_pi_gamma += a
        B_pi_gamma += gamma_p * a
    E_pi_gamma = pies * H * A_pi_gamma / B_pi_gamma

    # Truncate data
    if anneal['Ncut_factor'] > 0.0:
        tracing.tracepoint("M_step:truncating")
        N_use = int(N * (1 - (1 - A_pi_gamma) * anneal['Ncut_factor']))
        cut_denom = parallel.allsort(all_denoms)[-N_use]
        which = np.array(all_denoms >= cut_denom)
        candidates = candidates[which]
        logpj_all = logpj_all[which]
        my_y = my_y[which]
        my_N, D = my_y.shape
        N_use = comm.allreduce(my_N)
    else:
        N_use = N
    dlog.append('N', N_use)

    # Calculate truncated Likelihood
    L = H * np.log(1 - pies) - 0.5 * D * np.log(2 * pi * sigma**2) \
        - np.log(A_pi_gamma)
    Fs = np.log(np.exp(logpj_all).sum(axis=1)).sum()
    L += comm.allreduce(Fs) / N_use
    dlog.append('L', L)

    # Precompute
    pil_bar = np.log(pies / (1. - pies))
    corr_all = logpj_all.max(axis=1)                  # shape: (my_N,)
    pjb_all = np.exp(logpj_all - corr_all[:, None])   # shape: (my_N, no_states)

    # Allocate
    my_Wp = np.zeros_like(W)      # shape (H, D)
    my_Wq = np.zeros((H, H))      # shape (H, H)
    my_pi = 0.0
    my_sigma = 0.0
    my_mus = np.zeros(H)          # shape (H,)
    data_sum = my_y.sum(axis=0)   # sum over all data points for mu update

    # Iterate over all datapoints
    tracing.tracepoint("M_step:iterating")
    for n in xrange(my_N):
        y = my_y[n, :] - mu       # length D
        cand = candidates[n, :]   # length Hprime
        pjb = pjb_all[n, :]

        # Zero active hidden cause: nothing to do for W and pi.

        # One active hidden cause
        this_Wp = np.outer(pjb[1:(H + 1)], y)         # numerator   (H, D)
        this_Wq = pjb[1:(H + 1)] * np.identity(H)     # denominator (H, H)
        this_pi = pjb[1:(H + 1)].sum()
        this_mus = pjb[1:(H + 1)].copy()

        # Handle hidden states with more than 1 active cause
        this_Wp[cand] += np.dot(np.outer(y, pjb[(1 + H):]), SM).T
        this_Wq_tmp = np.zeros_like(my_Wq[cand])
        this_Wq_tmp[:, cand] = np.dot(pjb[(1 + H):] * SM.T, SM)
        this_Wq[cand] += this_Wq_tmp
        this_pi += np.inner(pjb[(1 + H):], SM.sum(axis=1))
        this_mus[cand] += np.inner(SM.T, pjb[(1 + H):])

        denom = pjb.sum()
        my_Wp += this_Wp / denom
        my_Wq += this_Wq / denom
        my_pi += this_pi / denom
        my_mus += this_mus / denom

    # Calculate updated W
    if 'W' in self.to_learn:
        tracing.tracepoint("M_step:update W")
        Wp = np.empty_like(my_Wp)
        Wq = np.empty_like(my_Wq)
        comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
        comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])
        # Solve Wq * W_new = Wp directly instead of forming the explicit
        # inverse of Wq.
        W_new = np.linalg.solve(Wq, Wp)
    else:
        W_new = W

    # Calculate updated pi
    if 'pi' in self.to_learn:
        tracing.tracepoint("M_step:update pi")
        pi_new = E_pi_gamma * comm.allreduce(my_pi) / H / N_use
    else:
        pi_new = pies

    # Calculate updated sigma
    if 'sigma' in self.to_learn:
        tracing.tracepoint("M_step:update sigma")
        # Loop for sigma update:
        for n in xrange(my_N):
            y = my_y[n, :] - mu       # length D
            cand = candidates[n, :]   # length Hprime
            logpj = logpj_all[n, :]   # length no_states
            corr = logpj.max()        # scalar
            pjb = np.exp(logpj - corr)

            # Zero active hidden causes
            this_sigma = pjb[0] * (y**2).sum()

            # Hidden states with one active cause
            this_sigma += (pjb[1:(H + 1)] * ((W - y)**2).sum(axis=1)).sum()

            # Handle hidden states with more than 1 active cause
            W_ = W[cand]            # is (Hprime x D)
            Wbar = np.dot(SM, W_)   # is (no_states x D)
            this_sigma += (pjb[(H + 1):] * ((Wbar - y)**2).sum(axis=1)).sum()

            denom = pjb.sum()
            my_sigma += this_sigma / denom

        sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
    else:
        sigma_new = sigma

    # Calculate updated mu:
    if 'mu' in self.to_learn:
        tracing.tracepoint("M_step:update mu")
        mus = np.empty_like(my_mus)
        all_data_sum = np.empty_like(data_sum)
        comm.Allreduce([my_mus, MPI.DOUBLE], [mus, MPI.DOUBLE])
        comm.Allreduce([data_sum, MPI.DOUBLE], [all_data_sum, MPI.DOUBLE])
        # Both sums run over the datapoints of ALL ranks, so normalize by
        # the global count N_use rather than this rank's my_N.
        mu_new = all_data_sum / N_use - np.inner(W_new.T / N_use, mus)
    else:
        mu_new = mu

    # NOTE(review): crit_params entries are evaluated as Python expressions
    # in this scope; they must come from trusted configuration only.
    for param in anneal.crit_params:
        anneal.dyn_param(param, eval(param))

    dlog.append('N_use', N_use)

    return {'W': W_new, 'pi': pi_new, 'sigma': sigma_new, 'mu': mu_new}
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ MCA M_step

    Accumulates the per-datapoint sufficient statistics on this rank,
    all-reduces them over ``self.comm`` and returns updated parameters.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints, one row per datapoint

    Annealing variables used:

        anneal['T']            Temperature for det. annealing AND softmax
        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model

    Returns a dict with the keys 'W', 'pi', 'sigma' (new parameter values;
    a parameter not listed in ``self.to_learn`` is passed through) and 'Q'
    (the ET-approximated log-likelihood).
    """
    comm = self.comm
    H = self.H
    W = model_params["W"]
    pies = model_params["pi"]
    sigma = model_params["sigma"]

    # Read in data:
    my_y = my_data["y"]
    my_cand = my_data["candidates"]
    my_logpj = my_suff_stat["logpj"]
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    state_mtx = self.state_matrix  # shape: (no_states, Hprime)
    state_abs = self.state_abs  # shape: (no_states,)

    # To compute et_loglike:
    my_ldenom_sum = 0.0

    # Precompute
    T = anneal["T"]
    T_rho = np.maximum(T, self.rho_temp_bound)
    rho = 1.0 / (1.0 - 1.0 / T_rho)
    beta = 1.0 / T
    pil_bar = np.log(pies / (1.0 - pies))
    Wl = np.log(W)
    Wrho = np.exp(rho * Wl)
    Wsquared = W * W

    # Some asserts
    assert np.isfinite(pil_bar).all()
    assert np.isfinite(Wl).all()
    assert np.isfinite(Wrho).all()
    assert (Wrho > 1e-86).all()

    # Subtract the per-datapoint maximum before exponentiating.
    my_corr = beta * my_logpj.max(axis=1)  # shape: (my_N,)
    my_pjb = np.exp(beta * my_logpj - my_corr[:, None])  # shape: (my_N, no_states)

    # Precompute factor for pi/gamma update
    A_pi_gamma = 0.0
    B_pi_gamma = 0.0
    for gp in xrange(0, self.gamma + 1):
        a = comb(H, gp, exact=1) * pies ** gp * (1.0 - pies) ** (H - gp)
        A_pi_gamma += a
        B_pi_gamma += gp * a

    # Truncate data
    if anneal["Ncut_factor"] > 0.0:
        tracing.tracepoint("M_step:truncating")
        my_denoms = np.log(my_pjb.sum(axis=1)) + my_corr
        N_use = int(N * (1 - (1 - A_pi_gamma) * anneal["Ncut_factor"]))
        cut_denom = parallel.allsort(my_denoms)[-N_use]
        which = np.array(my_denoms >= cut_denom)
        my_y = my_y[which]
        my_cand = my_cand[which]
        my_logpj = my_logpj[which]
        my_pjb = my_pjb[which]
        my_corr = my_corr[which]
        my_N, D = my_y.shape
        N_use = comm.allreduce(my_N)
    else:
        N_use = N
    dlog.append("N_use", N_use)

    # Allocate suff-stat arrays
    my_Wp = np.zeros_like(W)  # shape (H, D)
    my_Wq = np.zeros_like(W)  # shape (H, D)
    my_pi = 0.0
    my_sigma = 0.0

    # Iterate over all datapoints
    for n in xrange(my_N):
        tracing.tracepoint("M_step:iterating")
        y = my_y[n, :]  # shape (D,)
        cand = my_cand[n, :]  # shape (Hprime,)
        logpj = my_logpj[n, :]  # shape (no_states,)
        pjb = my_pjb[n, :]  # shape (no_states,)
        corr = my_corr[n]  # scalar

        this_Wp = np.zeros_like(W)  # numerator for W (current datapoint)   (H, D)
        this_Wq = np.zeros_like(W)  # denominator for W (current datapoint) (H, D)
        this_pi = 0.0  # numerator for pi update (current datapoint)
        this_sigma = 0.0  # numerator for sigma update (current datapoint)

        # Zero active hidden causes (contributes to sigma only)
        this_sigma += pjb[0] * (y ** 2).sum()

        # One active hidden cause
        this_Wp += (pjb[1 : (H + 1), None] * Wsquared[:, :]) * y[None, :]
        this_Wq += pjb[1 : (H + 1), None] * Wsquared[:, :]
        this_pi += pjb[1 : (H + 1)].sum()
        this_sigma += (pjb[1 : (H + 1)] * ((W - y) ** 2).sum(axis=1)).sum()

        # Handle hidden states with more than 1 active cause
        Wl_ = Wl[cand]  # is (Hprime, D)
        Wrho_ = Wrho[cand]  # is (Hprime, D)
        Wlrhom1 = (rho - 1) * Wl_  # is (Hprime, D)
        Wlbar = np.log(np.dot(state_mtx, Wrho_)) / rho  # is (no_states, D)
        Wbar = np.exp(Wlbar)  # is (no_states, D)
        blpj = beta * logpj[1 + H :] - corr  # is (no_states,)
        Aid = (
            state_mtx[:, :, None]
            * np.exp(
                blpj[:, None, None]
                + (1 - rho) * Wlbar[:, None, :]
                + Wlrhom1[None, :, :]
            )
        ).sum(axis=0)

        assert np.isfinite(Wlbar).all()
        assert np.isfinite(Wbar).all()
        assert np.isfinite(pjb).all()
        assert np.isfinite(Aid).all()

        this_Wp[cand] += Aid * y[None, :]
        this_Wq[cand] += Aid
        this_pi += (pjb[1 + H :] * state_abs).sum()
        this_sigma += (pjb[1 + H :] * ((Wbar - y) ** 2).sum(axis=1)).sum()

        denom = pjb.sum()
        my_Wp += this_Wp / denom
        my_Wq += this_Wq / denom
        my_pi += this_pi / denom
        my_sigma += this_sigma / denom

        # For loglike computation: log(sum(exp(logpj))), evaluated with the
        # maximum factored out so strongly negative log-joints do not
        # underflow to log(0) = -inf.
        lmax = logpj.max()
        my_ldenom_sum += lmax + np.log(np.sum(np.exp(logpj - lmax)))

    # Calculate updated W
    if "W" in self.to_learn:
        tracing.tracepoint("M_step:update W")
        Wp = np.empty_like(my_Wp)
        Wq = np.empty_like(my_Wq)

        assert np.isfinite(my_Wp).all()
        assert np.isfinite(my_Wq).all()

        comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
        comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])

        # Make sure we do not divide by zero
        tiny = np.finfo(Wq.dtype).tiny
        Wp[Wq < tiny] = 0.0
        Wq[Wq < tiny] = tiny

        W_new = Wp / Wq
    else:
        W_new = W

    # Calculate updated pi
    if "pi" in self.to_learn:
        tracing.tracepoint("M_step:update pi")
        assert np.isfinite(my_pi).all()
        pi_new = A_pi_gamma / B_pi_gamma * pies * comm.allreduce(my_pi) / N_use
    else:
        pi_new = pies

    # Calculate updated sigma
    if "sigma" in self.to_learn:  # TODO: XXX see LinCA XXX (merge!)
        tracing.tracepoint("M_step:update sigma")
        assert np.isfinite(my_sigma).all()
        sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
    else:
        sigma_new = sigma

    # Put all together and compute (always) et_approx_likelihood
    ldenom_sum = comm.allreduce(my_ldenom_sum)
    # 0.5 * D instead of D / 2: the latter floors for odd D under Python 2
    # integer division.
    lAi = (
        (H * np.log(1.0 - pi_new))
        - (0.5 * D * np.log(2 * pi))
        - (D * np.log(sigma_new))
    )

    # For practical and et approx reasons we use: sum of restricted respons=1
    loglike_et = (lAi * N_use) + ldenom_sum

    return {"W": W_new, "pi": pi_new, "sigma": sigma_new, "Q": loglike_et}
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ BSC M_step

    Combines the per-datapoint joints of the E-step (from all ranks of
    self.comm) into updated values for W, pi, sigma and mu.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.
        my_data['data_noise']  Added to the datapoints when
                               anneal['data_noise'] > 0

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints; column 0 is the state without
                               active causes, columns 1..H the states with
                               one active cause, the remaining columns
                               follow the rows of self.state_matrix

    Annealing variables used:

        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model
        anneal['data_noise']   Data noise is applied if this is > 0
        anneal.crit_params     Expressions reported via anneal.dyn_param

    Returns a dict with the keys 'W', 'pi', 'sigma' and 'mu'.  Parameters
    that are not listed in self.to_learn are passed through unchanged.
    """
    comm = self.comm
    H = self.H
    gamma = self.gamma
    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']
    mu = model_params['mu']

    # Read in data (copied, because data noise may be added in place)
    my_y = my_data['y'].copy()
    candidates = my_data['candidates']
    logpj_all = my_suff_stat['logpj']
    all_denoms = np.exp(logpj_all).sum(axis=1)
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    # Joerg's data noise idea
    if anneal['data_noise'] > 0:
        my_y += my_data['data_noise']

    SM = self.state_matrix                    # shape: (no_states, Hprime)

    # Precompute factor for pi update
    A_pi_gamma = 0
    B_pi_gamma = 0
    for gamma_p in range(gamma + 1):
        a = comb(H, gamma_p) * (pies**gamma_p) * ((1 - pies)**(H - gamma_p))
        A_pi_gamma += a
        B_pi_gamma += gamma_p * a
    E_pi_gamma = pies * H * A_pi_gamma / B_pi_gamma

    # Truncate data: keep the N_use datapoints with the largest denominators
    if anneal['Ncut_factor'] > 0.0:
        tracing.tracepoint("M_step:truncating")
        N_use = int(N * (1 - (1 - A_pi_gamma) * anneal['Ncut_factor']))
        cut_denom = parallel.allsort(all_denoms)[-N_use]
        which = np.array(all_denoms >= cut_denom)
        candidates = candidates[which]
        logpj_all = logpj_all[which]
        my_y = my_y[which]
        my_N, D = my_y.shape
        # Recount: ties at the cut value may keep more than requested
        N_use = comm.allreduce(my_N)
    else:
        N_use = N
    dlog.append('N', N_use)

    # Calculate truncated Likelihood
    L = H * np.log(1 - pies) - 0.5 * D * np.log(2 * pi * sigma**2) - np.log(A_pi_gamma)
    Fs = np.log(np.exp(logpj_all).sum(axis=1)).sum()
    L += comm.allreduce(Fs) / N_use
    dlog.append('L', L)

    # Precompute; subtracting the row maximum keeps the exponentials bounded
    corr_all = logpj_all.max(axis=1)                  # shape: (my_N,)
    pjb_all = np.exp(logpj_all - corr_all[:, None])   # shape: (my_N, no_states)
    SM_abs = SM.sum(axis=1)                           # active causes per state

    # Allocate
    my_Wp = np.zeros_like(W)        # shape (H, D)
    my_Wq = np.zeros((H, H))        # shape (H, H)
    my_pi = 0.0
    my_sigma = 0.0
    my_mus = np.zeros(H)            # shape (H,)
    data_sum = my_y.sum(axis=0)     # sum over all data points for mu update

    # Iterate over all datapoints
    tracing.tracepoint("M_step:iterating")
    for n in xrange(my_N):
        y = my_y[n, :] - mu                   # length D
        cand = candidates[n, :]               # length Hprime
        pjb = pjb_all[n, :]
        pjb_single = pjb[1:(H + 1)]           # states with one active cause
        pjb_multi = pjb[(1 + H):]             # states with more than one

        # Zero active hidden causes contribute nothing to W, pi and mu

        # One active hidden cause
        this_Wp = np.outer(pjb_single, y)             # (H, D)
        this_Wq = pjb_single * np.identity(H)         # (H, H)
        this_pi = pjb_single.sum()
        this_mus = pjb_single.copy()

        # Handle hidden states with more than 1 active cause
        this_Wp[cand] += np.dot(np.outer(y, pjb_multi), SM).T
        this_Wq_tmp = np.zeros_like(my_Wq[cand])
        this_Wq_tmp[:, cand] = np.dot(pjb_multi * SM.T, SM)
        this_Wq[cand] += this_Wq_tmp
        this_pi += np.inner(pjb_multi, SM_abs)
        this_mus[cand] += np.inner(SM.T, pjb_multi)

        denom = pjb.sum()
        my_Wp += this_Wp / denom
        my_Wq += this_Wq / denom
        my_pi += this_pi / denom
        my_mus += this_mus / denom

    # Calculate updated W
    if 'W' in self.to_learn:
        tracing.tracepoint("M_step:update W")
        Wp = np.empty_like(my_Wp)
        Wq = np.empty_like(my_Wq)
        comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
        comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])
        # Solve Wq * W_new = Wp directly rather than forming inv(Wq)
        W_new = np.linalg.solve(Wq, Wp)
    else:
        W_new = W

    # Calculate updated pi
    if 'pi' in self.to_learn:
        tracing.tracepoint("M_step:update pi")
        pi_new = E_pi_gamma * comm.allreduce(my_pi) / H / N_use
    else:
        pi_new = pies

    # Calculate updated sigma (uses the old W and mu)
    if 'sigma' in self.to_learn:
        tracing.tracepoint("M_step:update sigma")
        for n in xrange(my_N):
            y = my_y[n, :] - mu               # length D
            cand = candidates[n, :]           # length Hprime
            pjb = pjb_all[n, :]

            # Zero active hidden causes
            this_sigma = pjb[0] * (y**2).sum()

            # Hidden states with one active cause
            this_sigma += (pjb[1:(H + 1)] * ((W - y)**2).sum(axis=1)).sum()

            # Handle hidden states with more than 1 active cause
            Wbar = np.dot(SM, W[cand])        # is (no_states, D)
            this_sigma += (pjb[(H + 1):] * ((Wbar - y)**2).sum(axis=1)).sum()

            my_sigma += this_sigma / pjb.sum()
        sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
    else:
        sigma_new = sigma

    # Calculate updated mu
    if 'mu' in self.to_learn:
        tracing.tracepoint("M_step:update mu")
        mus = np.empty_like(my_mus)
        all_data_sum = np.empty_like(data_sum)
        comm.Allreduce([my_mus, MPI.DOUBLE], [mus, MPI.DOUBLE])
        comm.Allreduce([data_sum, MPI.DOUBLE], [all_data_sum, MPI.DOUBLE])
        # Both sums run over the datapoints of ALL ranks, so they have to be
        # normalised by the global count N_use and not by the local my_N.
        mu_new = all_data_sum / N_use - np.inner(W_new.T / N_use, mus)
    else:
        mu_new = mu

    # Report the critical parameters to the annealing schedule.
    # NOTE(review): each entry is evaluated as a Python expression in this
    # scope; crit_params must therefore only come from trusted configuration.
    for param in anneal.crit_params:
        anneal.dyn_param(param, eval(param))

    dlog.append('N_use', N_use)

    return {'W': W_new, 'pi': pi_new, 'sigma': sigma_new, 'mu': mu_new}
W_gt[sample(range(H), np.int(H * neg_bars))] *= -1 W_gt = W_gt.reshape((H, D)) W_gt += np.random.normal(size=(H, D), scale=0.5) # Prepare model... model = BSC_ET(D, H, Hprime, gamma, to_learn) mparams = {"W": W_gt, "pi": pi_gt, "sigma": sigma_gt, "mu": mu_gt} mparams = comm.bcast(mparams) pprint("Generating Model Parameters:") pprint("pi = " + np.str(mparams["pi"]) + "; sigma = " + np.str(mparams["sigma"])) # Generate trainig data my_N = N // comm.size my_data = model.generate_data(mparams, my_N) dlog.append("y", my_data["y"][0:20]) # Choose annealing schedule anneal = LinearAnnealing(anneal_steps) anneal["T"] = [(15, start_temp), (-10, end_temp)] anneal["Ncut_factor"] = [(0, 0.0), (2.0 / 3, 1.0)] anneal["anneal_prior"] = anneal_prior anneal["W_noise"] = [(0.0, W_noise_intensity), (0.9, W_noise_intensity), (1.0, 0.0)] anneal["pi_noise"] = [(0.0, pi_noise_intensity), (0.9, pi_noise_intensity), (1.0, 0.0)] anneal["sigma_noise"] = [(0.0, sigma_noise_intensity), (0.9, sigma_noise_intensity), (1.0, 0.0)] mean_W = np.zeros((H, D)) pics_per_H = my_N // H for indH in xrange(H): mean_W_tmp = np.sum(comm.allreduce(my_data["y"][indH * pics_per_H : (indH + 1) * pics_per_H, :]), axis=0) / ( my_N // H * comm.size
old_fname = sys.argv[2] + "/result.h5"
old_h5 = openFile(old_fname, 'r')

# NOTE(review): nesting reconstructed from a whitespace-collapsed source.
# Every rank opens the file, so parameter extraction and close() are placed
# outside the rank-0 guard -- confirm against the original layout.
if comm.rank == 0:
    # Replay the stored rows of the old run into the current data logger
    for node in old_h5.listNodes("/"):
        name = node.name
        rows = node.shape[0]
        # Nodes with these names are not copied
        if name not in ['RF', 'gabor_params', 'gabor_errors',
                        'dog_params', 'dog_errors', 'dog_sigmas']:
            for r in xrange(rows):
                dlog.append(name, node[r])

# Continue from the last stored parameter set; mu starts at zero
steps_done = old_h5.root.W.shape[0]
lparams = {
    'mu': np.zeros((D,)),
    'W': old_h5.root.W[-1],
    'pi': old_h5.root.pi[-1],
    'sigma': old_h5.root.sigma[-1],
}
old_h5.close()

# Move the annealing schedule forward by steps_done - 1 steps
dlog.progress("Skipping %d EM iterations" % steps_done)
for i in xrange(steps_done - 1):
    anneal.next()
W_gt = W_gt.reshape((H, D))
W_gt += np.random.normal(size=(H, D), scale=0.5)

# Prepare model...
model = MMCA_ET(D, H, Hprime, gamma)
gt_params = {
    'W': W_gt,
    'pi': 2. / H,
    'sigma': 0.10,
}
# W_gt contains random noise drawn on each rank; broadcast the root's copy
# so that every rank generates its data from the same ground truth.
gt_params = comm.bcast(gt_params)

# Generate training data
my_N = N // comm.size
my_data = model.generate_data(gt_params, my_N)
dlog.append('y', my_data['y'][0:25, :])

# Initialize model parameters (to be learned)
params = {
    'W': np.random.normal(size=W_gt.shape),
    # Float literal: 1/H would be 0 under Python 2 integer division
    'pi': 1. / H,
    'sigma': 5.00,
}
# All ranks start from the parameters drawn on the root rank
params = comm.bcast(params)

# Create and start EM annealing
em = EM(model=model, anneal=anneal)
em.data = my_data
em.lparams = params
#=============== Pick up and continue previous computation ============
dlog.progress("Picking up computation from %s" % sys.argv[2])
tracing.tracepoint("Copy old reults")

old_fname = sys.argv[2] + "/result.h5"
old_h5 = openFile(old_fname, 'r')

# NOTE(review): nesting reconstructed from a whitespace-collapsed source.
# Every rank opens the file, so parameter extraction and close() are placed
# outside the rank-0 guard -- confirm against the original layout.
if comm.rank == 0:
    # Replay the stored rows of the old run into the current data logger
    for node in old_h5.listNodes("/"):
        name = node.name
        rows = node.shape[0]
        # Nodes with these names are not copied
        if name not in ['RF', 'gabor_params', 'gabor_errors',
                        'dog_params', 'dog_errors', 'dog_sigmas']:
            for r in xrange(rows):
                dlog.append(name, node[r])

# Continue from the last stored parameter set; mu starts at zero
steps_done = old_h5.root.W.shape[0]
lparams = {
    'mu': np.zeros((D,)),
    'W': old_h5.root.W[-1],
    'pi': old_h5.root.pi[-1],
    'sigma': old_h5.root.sigma[-1],
}
old_h5.close()

# Move the annealing schedule forward by steps_done - 1 steps
dlog.progress("Skipping %d EM iterations" % steps_done)
for i in xrange(steps_done - 1):
    anneal.next()
def M_step(self, anneal, model_params, my_suff_stat, my_data):
    """ MCA M_step

    Combines the per-datapoint joints of the E-step (from all ranks of
    self.comm) into updated values for W, pi and sigma and computes the
    ET-approximated log-likelihood.

    my_data variables used:

        my_data['y']           Datapoints
        my_data['candidates']  Candidate H's according to selection func.
        my_data['y_rc']        Data for reverse correlation (only read if
                               self.rev_corr is set)

    my_suff_stat variables used:

        my_suff_stat['logpj']  Log-joints; column 0 is the state without
                               active causes, columns 1..H the states with
                               one active cause, the remaining columns
                               follow the rows of self.state_matrix

    Annealing variables used:

        anneal['T']            Temperature for det. annealing AND softmax
        anneal['Ncut_factor']  0.: no truncation; 1. trunc. according to model

    Returns a dict with the keys 'W', 'pi', 'sigma', 'rev_corr' and 'Q'
    (the log-likelihood estimate).  Parameters not listed in self.to_learn
    are passed through unchanged; 'rev_corr' is all zeros when
    self.rev_corr is not set.
    """
    comm = self.comm
    H = self.H
    W = model_params['W']
    pies = model_params['pi']
    sigma = model_params['sigma']

    # Read in data:
    my_y = my_data['y']
    my_cand = my_data['candidates']
    my_logpj = my_suff_stat['logpj']
    my_N, D = my_y.shape
    N = comm.allreduce(my_N)

    state_mtx = self.state_matrix         # shape: (no_states, Hprime)
    state_abs = self.state_abs            # shape: (no_states,)

    # Divide/underflow warnings are silenced for the whole computation; the
    # context manager restores the previous settings even if an assert fails.
    with np.errstate(divide='ignore', under='ignore'):
        # To compute et_loglike:
        my_ldenom_sum = 0.0

        # Precompute
        T = anneal['T']
        T_rho = np.maximum(T, self.rho_T_bound)
        rho = 1. / (1. - 1. / T_rho)
        rho = np.maximum(np.minimum(rho, self.rho_ubound), self.rho_lbound)
        beta = 1. / T
        pil_bar = np.log(pies / (1. - pies))
        Wl = accel.log(np.abs(W))
        Wrho = accel.exp(rho * Wl)
        Wrhos = np.sign(W) * Wrho         # signed |W|**rho

        # Some asserts
        assert np.isfinite(pil_bar).all()
        assert np.isfinite(Wl).all()
        assert np.isfinite(Wrho).all()
        assert (Wrho > 1e-86).all()

        # Subtracting the row maximum keeps the exponentials bounded
        my_corr = beta * ((my_logpj).max(axis=1))       # shape: (my_N,)
        my_logpjb = beta * my_logpj - my_corr[:, None]  # shape: (my_N, no_states)
        my_pjb = accel.exp(my_logpjb)                   # shape: (my_N, no_states)

        # Precompute factor for pi update and ET cutting
        A_pi_gamma = 0.
        B_pi_gamma = 0.
        for gp in xrange(0, self.gamma + 1):
            a = comb(H, gp) * pies**gp * (1. - pies)**(H - gp)
            A_pi_gamma += a
            B_pi_gamma += gp * a

        # Truncate data: keep the N_use datapoints with the largest denominators
        if anneal['Ncut_factor'] > 0.0:
            tracing.tracepoint("M_step:truncating")
            my_logdenoms = accel.log(my_pjb.sum(axis=1)) + my_corr
            N_use = int(N * (1 - (1 - A_pi_gamma) * anneal['Ncut_factor']))
            cut_denom = parallel.allsort(my_logdenoms)[-N_use]
            my_sel, = np.where(my_logdenoms >= cut_denom)
            my_N, = my_sel.shape
            # Recount: ties at the cut value may keep more than requested
            N_use = comm.allreduce(my_N)
        else:
            my_sel = np.arange(my_N)
            N_use = N

        # Allocate suff-stat arrays
        my_Wp = np.zeros_like(W)          # shape (H, D)
        my_Wq = np.zeros_like(W)          # shape (H, D)
        my_pi = 0.0
        my_sigma = 0.0

        # Do reverse correlation if requested
        if self.rev_corr:
            my_y_rc = my_data['y_rc']
            D_rev_corr = my_y_rc.shape[1]
            my_rev_corr = np.zeros((H, D_rev_corr))
            my_rev_corr_count = np.zeros(H)

        # Iterate over all datapoints
        tracing.tracepoint("M_step:iterating...")
        dlog.append('N_use', N_use)
        for n in my_sel:
            y = my_y[n, :]                # shape (D,)
            cand = my_cand[n, :]          # shape (Hprime,)
            logpj = my_logpj[n, :]        # shape (no_states,)
            logpjb = my_logpjb[n, :]      # shape (no_states,)
            pjb = my_pjb[n, :]            # shape (no_states,)

            this_Wp = np.zeros_like(W)    # numerator for W (current datapoint)   (H, D)
            this_Wq = np.zeros_like(W)    # denominator for W (current datapoint) (H, D)
            this_pi = 0.0                 # numerator for pi update (current datapoint)
            this_sigma = 0.0              # numerator for sigma update (current datapoint)

            # Zero active hidden causes (no contribution to W and pi)
            this_sigma += pjb[0] * (y**2).sum()

            # One active hidden cause
            this_Wp += (pjb[1:(H + 1), None]) * y[None, :]
            this_Wq += (pjb[1:(H + 1), None])
            this_pi += pjb[1:(H + 1)].sum()
            this_sigma += (pjb[1:(H + 1)] * ((W - y)**2).sum(axis=1)).sum()

            # Handle hidden states with more than 1 active cause
            Wl_ = Wl[cand]                # is (Hprime, D)
            Wrhos_ = Wrhos[cand]          # is (   "    )

            t0 = np.dot(state_mtx, Wrhos_)
            Wlbar = accel.log(np.abs(t0)) / rho       # is (no_states, D)
            Wbar = np.sign(t0) * accel.exp(Wlbar)     # is (no_states, D)

            t = Wlbar[:, None, :] - Wl_[None, :, :]
            t = np.maximum(t, 0.)
            Aid = state_mtx[:, :, None] * accel.exp(logpjb[H + 1:, None, None] - (rho - 1) * t)
            Aid = Aid.sum(axis=0)

            this_Wp[cand] += Aid * y[None, :]
            this_Wq[cand] += Aid
            this_pi += (pjb[1 + H:] * state_abs).sum()
            this_sigma += (pjb[1 + H:] * ((Wbar - y)**2).sum(axis=1)).sum()

            denom = pjb.sum()
            my_Wp += this_Wp / denom
            my_Wq += this_Wq / denom
            my_pi += this_pi / denom
            my_sigma += this_sigma / denom

            my_ldenom_sum += accel.log(np.sum(accel.exp(logpj)))  # For loglike computation

            # Estimate reverse correlation
            if self.rev_corr:
                pys = pjb / denom
                if np.isfinite(pys).all():
                    my_rev_corr += pys[1:H + 1, None] * my_y_rc[n, None, :]
                    my_rev_corr_count += pys[1:H + 1]
                    my_rev_corr[cand] += np.sum(
                        state_mtx[:, :, None] * pys[H + 1:, None, None] * my_y_rc[n, None, :],
                        axis=0)
                    # Weight every multi-cause state by its own responsibility
                    # (slice pys[H+1:], matching the accumulation above)
                    my_rev_corr_count[cand] += np.sum(state_mtx[:, :] * pys[H + 1:, None], axis=0)
                else:
                    print("Not all finite rev_corr %d" % n)

        # Calculate updated W
        if 'W' in self.to_learn:
            tracing.tracepoint("M_step:update W")

            Wp = np.empty_like(my_Wp)
            Wq = np.empty_like(my_Wq)

            assert np.isfinite(my_Wp).all()
            assert np.isfinite(my_Wq).all()

            comm.Allreduce([my_Wp, MPI.DOUBLE], [Wp, MPI.DOUBLE])
            comm.Allreduce([my_Wq, MPI.DOUBLE], [Wq, MPI.DOUBLE])

            # Make sure we do not divide by zero
            tiny = self.tol
            Wq[Wq < tiny] = tiny

            # Calculate updated W
            W_new = Wp / Wq

            # Add inertia depending on Wq: entries with little evidence
            # (small Wq) stay closer to their old value
            alpha = 2.5
            inertia = np.maximum(1. - accel.exp(-Wq / alpha), 0.2)
            W_new = inertia * W_new + (1 - inertia) * W
        else:
            W_new = W

        # Calculate updated pi
        if 'pi' in self.to_learn:
            tracing.tracepoint("M_step:update pi")

            assert np.isfinite(my_pi).all()
            pi_new = A_pi_gamma / B_pi_gamma * pies * comm.allreduce(my_pi) / N_use
        else:
            pi_new = pies

        # Calculate updated sigma
        if 'sigma' in self.to_learn:      # TODO: XXX see LinCA XXX (merge!)
            tracing.tracepoint("M_step:update sigma")

            assert np.isfinite(my_sigma).all()
            sigma_new = np.sqrt(comm.allreduce(my_sigma) / D / N_use)
        else:
            sigma_new = sigma

        # Put all together and compute (always) et_approx_likelihood
        ldenom_sum = comm.allreduce(my_ldenom_sum)
        # D / 2. keeps the half-integer factor for odd D under Python 2
        lAi = (H * np.log(1. - pi_new)) - ((D / 2.) * np.log(2 * pi)) - (D * np.log(sigma_new))

        # For practical and et approx reasons we use: sum of restricted respons=1
        loglike_et = (lAi * N_use) + ldenom_sum

        if self.rev_corr:
            rev_corr = np.empty_like(my_rev_corr)
            rev_corr_count = np.empty_like(my_rev_corr_count)
            comm.Allreduce([my_rev_corr, MPI.DOUBLE], [rev_corr, MPI.DOUBLE])
            comm.Allreduce([my_rev_corr_count, MPI.DOUBLE], [rev_corr_count, MPI.DOUBLE])
            # Small offset avoids a division by zero for never-active causes
            rev_corr /= (1e-16 + rev_corr_count[:, None])
        else:
            rev_corr = np.zeros((H, D))

        return {'W': W_new, 'pi': pi_new, 'sigma': sigma_new,
                'rev_corr': rev_corr, 'Q': loglike_et}
W_gt = W_gt.reshape((H, D))
W_gt += np.random.normal(size=(H, D), scale=0.5)

# Prepare model...
model = BSC_ET(D, H, Hprime, gamma, to_learn)
mparams = {'W': W_gt, 'pi': pi_gt, 'sigma': sigma_gt, 'mu': mu_gt}
# All ranks generate data from the root rank's ground-truth parameters
mparams = comm.bcast(mparams)

pprint("Generating Model Parameters:")
# Builtin str() instead of the np.str alias (removed in NumPy 1.24)
pprint("pi = " + str(mparams['pi']) + "; sigma = " + str(mparams['sigma']))

# Generate training data
my_N = N // comm.size
my_data = model.generate_data(mparams, my_N)
dlog.append('y', my_data['y'][0:20])

# Choose annealing schedule
anneal = LinearAnnealing(anneal_steps)
anneal['T'] = [(15, start_temp), (-10, end_temp)]
anneal['Ncut_factor'] = [(0, 0.), (2. / 3, 1.)]
anneal['anneal_prior'] = anneal_prior
# The noise entries hold their intensity up to 0.9 and reach zero at 1.0
anneal['W_noise'] = [(0., W_noise_intensity), (0.9, W_noise_intensity), (1., 0.)]
anneal['pi_noise'] = [(0., pi_noise_intensity), (0.9, pi_noise_intensity), (1., 0.)]
anneal['sigma_noise'] = [(0., sigma_noise_intensity), (0.9, sigma_noise_intensity), (1., 0.)]

mean_W = np.zeros((H, D))
pics_per_H = my_N // H
# Prepare ground-truth GFs (bars): rows get negative, columns positive weights
W_gt = np.zeros((H, D2, D2))
for i in xrange(D2):
    W_gt[i, i, :] = -10.
    W_gt[D2 + i, :, i] = +10.
W_gt = W_gt.reshape((H, D))
W_gt += np.random.normal(size=(H, D), scale=0.5)

# Prepare model...
model = MMCA_ET(D, H, Hprime, gamma)
gt_params = {'W': W_gt, 'pi': 2. / H, 'sigma': 0.10}
# W_gt contains random noise drawn on each rank; broadcast the root's copy
# so that every rank generates its data from the same ground truth.
gt_params = comm.bcast(gt_params)

# Generate training data
my_N = N // comm.size
my_data = model.generate_data(gt_params, my_N)
dlog.append('y', my_data['y'][0:25, :])

# Initialize model parameters (to be learned)
params = {
    'W': np.random.normal(size=W_gt.shape),
    # Float literal: 1/H would be 0 under Python 2 integer division
    'pi': 1. / H,
    'sigma': 5.00,
}
# All ranks start from the parameters drawn on the root rank
params = comm.bcast(params)

# Create and start EM annealing
em = EM(model=model, anneal=anneal)
em.data = my_data
em.lparams = params