def pssdsstransformerr(sdss, cliperr=0.2, mode="dpf"):
    # NOTE(review): an identical definition of pssdsstransformerr appears again
    # later in this file; Python binds the later one, so this copy is shadowed
    # (dead at runtime). Consider deleting one of the two.
    """ Get PS magnitudes from SDSS magnitudes, with errors."""
    # One output column per PS band; sdss is indexed by band-name strings
    # (e.g. sdss["g"], sdss["gErr"]) — presumably a numpy structured array or
    # dict-like of arrays; confirm against callers.
    psmag = numpy.zeros((len(sdss), 5), dtype="f4")
    psmagerr = numpy.zeros_like(psmag)
    if len(psmag) == 0:
        # Empty input: nothing to transform.
        return psmag, psmagerr
    # Per output band: the SDSS base band, the PS/SDSS band pair naming the
    # correction function, and the SDSS color used as its argument.
    if mode == "dpf":
        basecolor, corfun, corcolor = (
            ("g", "r", "i", "z", "z"),
            ("gg", "rr", "ii", "zz", "yz"),
            ("gi", "gi", "gi", "gi", "gi"),
        )
    else:
        basecolor, corfun, corcolor = (
            ("g", "r", "i", "z", "z"),
            ("gg", "rr", "ii", "zz", "yz"),
            ("gr", "ri", "ri", "iz", "iz"),
        )
    for i, dat in enumerate(zip(basecolor, corfun, corcolor)):
        f, cc, col = dat
        # "g-g" style names for the transform lookup; col gives the color term.
        cn1, cn2 = (cc[0] + "-" + cc[1], col[0] + "-" + col[1])
        tcol = sdss[col[0]] - sdss[col[1]]
        cor, poly = pssdsstransform(cn1, cn2, tcol, return_poly=True, mode=mode)
        # Propagate non-finite corrections into the color so the derivative
        # below also goes NaN for those rows.
        tcol[~numpy.isfinite(cor)] = numpy.nan
        psmag[:, i] = sdss[f] + cor
        # Derivative of the correction polynomial w.r.t. color: multiply each
        # coefficient by its power and drop the constant term.
        dpoly = (numpy.arange(len(poly)) * poly)[1:]
        dfpoly = numpy.polyval(numpy.flipud(dpoly), tcol)
        # must compute df, C
        # Jacobian of psmag w.r.t. (base mag, color band 1, color band 2):
        # d(psmag)/d(f) = 1, d/d(col[0]) = poly', d/d(col[1]) = -poly'.
        df = numpy.zeros((len(cor), 3), dtype="f4")
        df[:, 0] = 1.0
        df[:, 1] = dfpoly
        df[:, 2] = -dfpoly
        # Covariance of the three inputs. Off-diagonal entries are non-zero
        # when the same filter appears twice in filt (f1 == f2 compares the
        # filter NAMES, so repeated bands get their full covariance).
        c = numpy.zeros((len(df), 3, 3))
        olderr = numpy.seterr(invalid="ignore")
        filt = [f, col[0], col[1]]
        for ind1, f1 in enumerate(filt):
            err = sdss[f1 + "Err"]
            # NOTE(review): if sdss[...] returns a view (structured array),
            # this clips the CALLER's error column in place — confirm intended.
            err[err > cliperr] = numpy.inf
            for ind2, f2 in enumerate(filt):
                c[:, ind1, ind2] = err ** 2 * (f1 == f2)
        # NOTE(review): tdot is assigned but never used.
        tdot = numpy.tensordot
        # numpy.core.umath_tests is deprecated (removed in modern NumPy).
        from numpy.core.umath_tests import matrix_multiply as mm
        # Per-row quadratic form sqrt(df^T C df) -> propagated magnitude error.
        err = numpy.sqrt(mm(mm(df.reshape(df.shape[0], 1, 3), c), df.reshape(df.shape + (1,))))
        numpy.seterr(**olderr)
        err = err.reshape(len(err))
        psmagerr[:, i] = err
    return psmag, psmagerr
def pssdsstransformerr(sdss, cliperr=.2, mode='dpf'):
    """Get PS magnitudes from SDSS magnitudes, with errors.

    Parameters
    ----------
    sdss : structured array / dict-like of arrays
        Must provide per-band magnitudes (e.g. sdss['g']) and their errors
        (e.g. sdss['gErr']) — presumably an SDSS catalog row set; confirm
        field names against callers.
    cliperr : float
        Magnitude errors above this value are treated as infinite, which
        propagates a NaN into the corresponding output error.
    mode : str
        Selects which color term feeds each band's correction polynomial.

    Returns
    -------
    psmag, psmagerr : (len(sdss), 5) float32 arrays
        Transformed PS magnitudes and their propagated 1-sigma errors.
    """
    psmag = numpy.zeros((len(sdss), 5), dtype='f4')
    psmagerr = numpy.zeros_like(psmag)
    if len(psmag) == 0:
        # Empty catalog: nothing to do.
        return psmag, psmagerr
    # Per output band: SDSS base band, transform-function band pair, and the
    # SDSS color used as the polynomial argument.
    if mode == 'dpf':
        basecolor, corfun, corcolor = (('g', 'r', 'i', 'z', 'z'),
                                       ('gg', 'rr', 'ii', 'zz', 'yz'),
                                       ('gi', 'gi', 'gi', 'gi', 'gi'))
    else:
        basecolor, corfun, corcolor = (('g', 'r', 'i', 'z', 'z'),
                                       ('gg', 'rr', 'ii', 'zz', 'yz'),
                                       ('gr', 'ri', 'ri', 'iz', 'iz'))
    for i, dat in enumerate(zip(basecolor, corfun, corcolor)):
        f, cc, col = dat
        cn1, cn2 = (cc[0] + '-' + cc[1], col[0] + '-' + col[1])
        tcol = sdss[col[0]] - sdss[col[1]]
        cor, poly = pssdsstransform(cn1, cn2, tcol, return_poly=True,
                                    mode=mode)
        # Propagate non-finite corrections into the color so the polynomial
        # derivative below is NaN for those rows too.
        tcol[~numpy.isfinite(cor)] = numpy.nan
        psmag[:, i] = sdss[f] + cor
        # d(correction)/d(color): scale each coefficient by its power, drop
        # the constant term.
        dpoly = (numpy.arange(len(poly)) * poly)[1:]
        dfpoly = numpy.polyval(numpy.flipud(dpoly), tcol)
        # Jacobian of psmag w.r.t. (base mag, color band 1, color band 2).
        df = numpy.zeros((len(cor), 3), dtype='f4')
        df[:, 0] = 1.
        df[:, 1] = dfpoly
        df[:, 2] = -dfpoly
        # Input covariance matrix per row. (f1 == f2) compares filter NAMES,
        # so a band that appears twice in filt (e.g. f == col[0]) correctly
        # gets its full covariance on the off-diagonal as well.
        c = numpy.zeros((len(df), 3, 3))
        filt = [f, col[0], col[1]]
        # inf**2 * 0 = nan on the off-diagonal is expected; suppress the
        # invalid-operation warning for this whole computation.
        with numpy.errstate(invalid='ignore'):
            for ind1, f1 in enumerate(filt):
                # Copy before clipping: sdss[...] may be a view into the
                # caller's data, and the original code clobbered it in place.
                err = numpy.array(sdss[f1 + 'Err'])
                err[err > cliperr] = numpy.inf
                for ind2, f2 in enumerate(filt):
                    c[:, ind1, ind2] = err ** 2 * (f1 == f2)
            # Per-row quadratic form sqrt(df^T C df); einsum replaces the
            # deprecated numpy.core.umath_tests.matrix_multiply.
            err = numpy.sqrt(numpy.einsum('ni,nij,nj->n', df, c, df))
        psmagerr[:, i] = err
    return psmag, psmagerr
def em(df, k, eps):
    """Fit a k-component Gaussian mixture to columns 0..3 of df via EM.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with (at least) integer-labelled columns 0..3 — presumably the
        iris features; confirm against em_init and callers.
    k : int
        Number of mixture components.
    eps : float
        Convergence threshold on the change in log-likelihood.

    Returns
    -------
    (diff, ll_new, probs, mus, sigmas, i, ws) :
        final log-likelihood delta, log-likelihood, mixing weights, means,
        covariances, iteration count, and (k, n) responsibility matrix.
    """
    # Initial means / covariances / mixing weights come from a sibling helper.
    (mus, sigmas, probs) = em_init(df, k)
    n = len(df)
    ll_old = 0
    i = 0
    diff = 1
    # Iterate until the log-likelihood stabilizes (hard cap on iterations).
    while diff > eps and i < 1000000:
        # E-step: responsibility of each cluster for each point.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = probs[j] * mvn(mus[j], sigmas[j]).pdf(df.loc[:, 0:3])
        ws /= ws.sum(0)
        # M-step: update mixing weights, means, covariances.
        probs = ws.sum(axis=1)
        probs /= n
        mus = np.dot(ws, df.loc[:, 0:3])
        mus /= ws.sum(1)[:, None]
        sigmas = np.zeros((k, 4, 4))
        for j in range(k):
            # Deviations from the component mean, as a plain ndarray.
            ys = (df.loc[:, 0:3] - mus[j, :]).to_numpy()
            # Weighted sum of outer products; einsum replaces the deprecated
            # numpy.core.umath_tests.matrix_multiply the original used.
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]
        # Log-likelihood of the data under the updated mixture.
        ll_new = 0
        for p, mu, sigma in zip(probs, mus, sigmas):
            ll_new += p * mvn(mu, sigma).pdf(df.loc[:, 0:3].to_numpy())
        ll_new = np.log(ll_new).sum()
        diff = np.abs(ll_new - ll_old)
        ll_old = ll_new
        i += 1
    return diff, ll_new, probs, mus, sigmas, i, ws
def em_gmm_vect(xs, pis, mus, sigmas, tol=0.01, max_iter=100):
    """Fit a Gaussian mixture to xs via (vectorized) EM.

    Parameters
    ----------
    xs : (n, p) ndarray
        Data points.
    pis : (k,) array-like
        Initial mixing weights.
    mus : (k, p) ndarray
        Initial component means.
    sigmas : (k, p, p) ndarray
        Initial component covariances.
    tol : float
        Stop when the log-likelihood changes by less than this.
    max_iter : int
        Maximum number of EM iterations.

    Returns
    -------
    (ll_new, pis, mus, sigmas) :
        final log-likelihood, mixing weights, means, covariances.
    """
    n, p = xs.shape
    k = len(pis)
    ll_old = 0
    for i in range(max_iter):
        # E-step: (k, n) responsibilities, normalized per point.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = pis[j] * mvn(mus[j], sigmas[j]).pdf(xs)
        ws /= ws.sum(0)

        # M-step: re-estimate weights, means, covariances.
        pis = ws.sum(axis=1)
        pis /= n

        mus = np.dot(ws, xs)
        mus /= ws.sum(1)[:, None]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = xs - mus[j, :]
            # Responsibility-weighted sum of outer products; einsum replaces
            # the deprecated numpy.core.umath_tests.matrix_multiply.
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # Complete-data log-likelihood; stop once it stabilizes.
        ll_new = 0
        for pi, mu, sigma in zip(pis, mus, sigmas):
            ll_new += pi * mvn(mu, sigma).pdf(xs)
        ll_new = np.log(ll_new).sum()
        if np.abs(ll_new - ll_old) < tol:
            break
        ll_old = ll_new
    return ll_new, pis, mus, sigmas
def MLE_gaussian_mix(data, weights, means, sigmas, tol=0.01, max_iter=100):
    """Maximum-likelihood fit of a Gaussian mixture to data via EM.

    Parameters
    ----------
    data : (n, p) ndarray
        Observations.
    weights : (k,) array-like
        Initial mixing weights.
    means : (k, p) ndarray
        Initial component means.
    sigmas : (k, p, p) ndarray
        Initial component covariances.
    tol : float
        Convergence threshold on the log-likelihood change.
    max_iter : int
        Maximum number of EM iterations.

    Returns
    -------
    (like_new, weights, means, sigmas) :
        final log-likelihood, mixing weights, means, covariances.
    """
    n, p = data.shape
    k = len(weights)
    ll_old = 0
    for i in range(max_iter):
        # E-step: normalized (k, n) responsibilities.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = weights[j] * mvn(means[j], sigmas[j]).pdf(data)
        ws /= ws.sum(0)

        # M-step.
        weights = ws.sum(axis=1)
        weights /= n

        means = np.dot(ws, data)
        # Broadcasting divide (per-row) — equivalent to the transpose trick
        # the original used, just the direct form.
        means /= ws.sum(1)[:, None]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = data - means[j, :]
            # Responsibility-weighted outer products; einsum replaces the
            # deprecated numpy.core.umath_tests.matrix_multiply.
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # Compute the log-likelihood and test for convergence.
        like_new = 0
        for pi, mu, sigma in zip(weights, means, sigmas):
            like_new += pi * mvn(mu, sigma).pdf(data)
        like_new = np.log(like_new).sum()
        if np.abs(like_new - ll_old) < tol:
            break
        ll_old = like_new
    return like_new, weights, means, sigmas
def fit(self, X, mu, sigma, pi, max_iter=100, tolerance=0.01):
    """Fit a Gaussian mixture to X via EM, recording the mean trajectory.

    Parameters
    ----------
    X : (n, p) ndarray
        Data points.
    mu : (k, p) ndarray
        Initial component means.
    sigma : (k, p, p) ndarray
        Initial component covariances.
    pi : (k,) array-like
        Initial mixing weights.
    max_iter : int
        Maximum number of EM iterations.
    tolerance : float
        Stop when the log-likelihood changes by less than this.

    Returns
    -------
    [mu, sigma, pi] : final means, covariances, and mixing weights.
    """
    # Dimensions
    n, p = X.shape
    # Number of components; assumes self.k is set elsewhere (e.g. __init__)
    # — confirm against the enclosing class.
    k = self.k

    # Keep track of log likelihood for convergence purposes
    log_likelihood_old, log_likelihood_new = 0, 0

    for i in range(max_iter):
        # E-Step: normalized (k, n) responsibilities.
        resp = np.zeros((k, n))
        for mode in range(k):
            resp[mode] = pi[mode] * mvn(mu[mode], sigma[mode]).pdf(X)
        resp /= resp.sum(0)

        # M-Step
        pi = resp.sum(axis=1) / n
        mu = np.asarray([np.dot(r, X) / r.sum() for r in resp])

        sigma = np.zeros((k, p, p))
        for j in range(k):
            Y = X - mu[j, :]
            # Responsibility-weighted sum of outer products; einsum replaces
            # the deprecated numpy.core.umath_tests.matrix_multiply used
            # before (which warned on every call).
            sigma[j] = np.einsum('n,ni,nj->ij', resp[j], Y, Y)
        sigma /= resp.sum(axis=1)[:, None, None]

        # Track trajectory of means against iteration.
        # self.trajectory is assumed to be a list created elsewhere — confirm.
        self.trajectory.append(mu)

        # Update log likelihood and check for convergence
        log_likelihood_new = np.sum(
            [P * mvn(M, S).pdf(X) for P, M, S in zip(pi, mu, sigma)], axis=0)
        log_likelihood_new = np.log(log_likelihood_new).sum()
        if np.abs(log_likelihood_new - log_likelihood_old) < tolerance:
            break
        # Otherwise, keep updated value for next iteration
        log_likelihood_old = log_likelihood_new

    return [mu, sigma, pi]
def Apply_EM(iris, k, e):
    """Run EM clustering on the iris DataFrame and print cluster stats and purity.

    Assumes iris has feature columns labelled 0..3 and the class label in
    column 4 — confirm against the loader. Uses the sibling helper Random_val
    for mean initialization. Prints results; returns None.
    """
    # lenth of data
    n = len(iris)
    # Initialization: identity covariances, random per-feature means from
    # Random_val, uniform mixing weights 1/k.
    sigma = np.array([np.eye(4)] * k)
    cluster_mu = []
    cluster_p = []
    for i in range(k):
        atr_mu = []
        for column in iris[[0, 1, 2, 3]]:
            atr_mu.append(Random_val(iris[column]))
        cluster_mu.append(atr_mu)
        cluster_p.append(1 / k)
    cluster_mus = np.array(cluster_mu)
    like_old = 0
    i = 0
    diff = 1
    # Iterate until the log-likelihood change drops below e (hard cap).
    while diff > e and i < 1000000:
        # E-step: responsibility of each cluster for each point.
        ws = np.zeros((k, n))
        # calculating probability of each cluster
        for j in range(k):
            ws[j, :] = cluster_p[j] * mvn(cluster_mus[j], sigma[j]).pdf(
                iris.loc[:, 0:3])
        ws /= ws.sum(0)
        # M Step
        # update probabilities (mixing weights)
        cluster_p = ws.sum(axis=1)
        cluster_p /= n
        # update means (responsibility-weighted averages)
        cluster_mus = np.dot(ws, iris.loc[:, 0:3])
        cluster_mus /= ws.sum(1)[:, None]
        # update sigmas
        sigma = np.zeros((k, 4, 4))
        for j in range(k):
            # get values from data frame, subtract mean values and convert
            # to numpy array
            ys = (iris.loc[:, 0:3] - cluster_mus[j, :]).to_numpy()
            # Weighted sum of per-point outer products.
            # NOTE(review): mm is numpy.core.umath_tests.matrix_multiply
            # elsewhere in this file — deprecated in NumPy; consider einsum.
            sigma[j] = (ws[j, :, None, None] * mm(ys[:, :, None],
                                                  ys[:, None, :])).sum(axis=0)
        sigma /= ws.sum(axis=1)[:, None, None]
        # init temporary log likelihood variable
        like_new = 0
        # calculate probability for each
        for p, mu, sig in zip(cluster_p, cluster_mus, sigma):
            like_new += p * mvn(mu, sig).pdf(iris.loc[:, 0:3].to_numpy())
        like_new = np.log(like_new).sum()
        diff = np.abs(like_new - like_old)
        like_old = like_new
        # incrementing by 1
        i += 1
    print("\nNumber of iterations for the convergence is = ", i)
    # Hard-assign each point to its highest-responsibility cluster.
    new_nodes = pd.DataFrame()
    for node, point in enumerate(ws):
        new_nodes[node] = point
    new_nodes['tag'] = new_nodes.idxmax(axis=1)
    print("Node of clusters=", new_nodes.groupby(['tag']).agg('count')[0])
    print("Mean of mattrix for 3 cluster=\n", cluster_mus)
    print("Covariance=\n", sigma)
    # calculating purity
    # Add flower(label) data type in the dataset
    new_nodes['Type'] = iris.iloc[:, 4]
    # Grouping to get max count: per (true label, cluster) counts, then the
    # dominant cluster count per label.
    groupd = new_nodes.groupby(['Type', 'tag']).agg({'tag': ['count']})
    groupd.columns = ['tag_count']
    groupd = groupd.groupby(['Type']).agg({'tag_count': ['max']})
    groupd.columns = ['tag_count_max']
    groupd = groupd.reset_index()
    # Purity = fraction of points in their label's dominant cluster.
    print('Purity of clustering=', round(sum(groupd['tag_count_max']) /
                                         len(iris), 2))
    return