Beispiel #1
0
def pssdsstransformerr(sdss, cliperr=0.2, mode="dpf"):
    """Get PS magnitudes from SDSS magnitudes, with propagated errors.

    Parameters
    ----------
    sdss : structured array / dict-like
        Must be indexable by band name ('g', 'r', 'i', 'z') and by
        band + 'Err' (e.g. 'gErr') for the per-band uncertainties.
    cliperr : float
        Input magnitude errors larger than this are treated as infinite,
        so the corresponding output error becomes inf.
    mode : str
        Transformation flavor passed through to pssdsstransform; selects
        which SDSS color each band's correction polynomial uses.

    Returns
    -------
    psmag, psmagerr : (N, 5) float32 arrays
        Transformed magnitudes and their 1-sigma errors, one column per
        PS band.
    """
    psmag = numpy.zeros((len(sdss), 5), dtype="f4")
    psmagerr = numpy.zeros_like(psmag)
    if len(psmag) == 0:  # nothing to transform
        return psmag, psmagerr
    # basecolor: SDSS band the correction is added to; corfun: transform
    # name; corcolor: SDSS color the correction polynomial is evaluated at.
    if mode == "dpf":
        basecolor, corfun, corcolor = (
            ("g", "r", "i", "z", "z"),
            ("gg", "rr", "ii", "zz", "yz"),
            ("gi", "gi", "gi", "gi", "gi"),
        )
    else:
        basecolor, corfun, corcolor = (
            ("g", "r", "i", "z", "z"),
            ("gg", "rr", "ii", "zz", "yz"),
            ("gr", "ri", "ri", "iz", "iz"),
        )
    for i, (f, cc, col) in enumerate(zip(basecolor, corfun, corcolor)):
        cn1, cn2 = (cc[0] + "-" + cc[1], col[0] + "-" + col[1])
        tcol = sdss[col[0]] - sdss[col[1]]
        cor, poly = pssdsstransform(cn1, cn2, tcol, return_poly=True, mode=mode)
        tcol[~numpy.isfinite(cor)] = numpy.nan
        psmag[:, i] = sdss[f] + cor
        # Derivative of the correction polynomial w.r.t. the color
        # (poly is in ascending-power order; polyval wants descending).
        dpoly = (numpy.arange(len(poly)) * poly)[1:]
        dfpoly = numpy.polyval(numpy.flipud(dpoly), tcol)
        # Jacobian of the transformed magnitude w.r.t. (f, col[0], col[1]).
        df = numpy.zeros((len(cor), 3), dtype="f4")
        df[:, 0] = 1.0
        df[:, 1] = dfpoly
        df[:, 2] = -dfpoly
        # Input covariance: diagonal per band; duplicated bands in `filt`
        # (f may equal col[0] or col[1]) correctly produce fully correlated
        # off-diagonal terms through the (f1 == f2) factor.
        c = numpy.zeros((len(df), 3, 3))
        olderr = numpy.seterr(invalid="ignore")
        filt = [f, col[0], col[1]]
        for ind1, f1 in enumerate(filt):
            # Copy before clipping so the caller's error columns are not
            # modified in place (sdss[...] may be a view into the input).
            err = numpy.array(sdss[f1 + "Err"], copy=True)
            err[err > cliperr] = numpy.inf
            for ind2, f2 in enumerate(filt):
                c[:, ind1, ind2] = err ** 2 * (f1 == f2)
        # Error propagation: sqrt(df^T C df), batched over rows.  einsum
        # replaces numpy.core.umath_tests.matrix_multiply, which has been
        # removed from numpy.
        err = numpy.sqrt(numpy.einsum("ni,nij,nj->n", df, c, df))
        numpy.seterr(**olderr)
        psmagerr[:, i] = err
    return psmag, psmagerr
Beispiel #2
0
def pssdsstransformerr(sdss, cliperr=.2, mode='dpf'):
    """ Get PS magnitudes from SDSS magnitudes, with errors.

    sdss must be indexable by band name ('g', 'r', 'i', 'z') and by
    band+'Err' for the per-band uncertainties.  Errors larger than
    cliperr are clipped to infinity before propagation.  Returns
    (psmag, psmagerr), two (N, 5) float32 arrays, one column per PS band.
    """
    psmag = numpy.zeros((len(sdss), 5), dtype='f4')
    psmagerr = numpy.zeros_like(psmag)
    if len(psmag) == 0:
        return psmag, psmagerr
    # base band / transform name / color used by the correction polynomial
    if mode == 'dpf':
        basecolor, corfun, corcolor = (('g',  'r',  'i',  'z',  'z'),
                                       ('gg', 'rr', 'ii', 'zz', 'yz'),
                                       ('gi', 'gi', 'gi', 'gi', 'gi'))
    else:
        basecolor, corfun, corcolor = (('g',  'r',  'i',  'z',  'z'),
                                       ('gg', 'rr', 'ii', 'zz', 'yz'),
                                       ('gr', 'ri', 'ri', 'iz', 'iz'))
    for i, (f, cc, col) in enumerate(zip(basecolor, corfun, corcolor)):
        cn1, cn2 = (cc[0]+'-'+cc[1], col[0]+'-'+col[1])
        tcol = sdss[col[0]]-sdss[col[1]]
        cor, poly = pssdsstransform(cn1, cn2, tcol, return_poly=True,
                                    mode=mode)
        tcol[~numpy.isfinite(cor)] = numpy.nan
        psmag[:, i] = sdss[f]+cor
        # derivative of the correction polynomial w.r.t. the color
        # (poly is ascending-power; polyval expects descending)
        dpoly = (numpy.arange(len(poly))*poly)[1:]
        dfpoly = numpy.polyval(numpy.flipud(dpoly), tcol)
        # Jacobian of the transformed magnitude w.r.t. (f, col[0], col[1])
        df = numpy.zeros((len(cor), 3), dtype='f4')
        df[:, 0] = 1.
        df[:, 1] = dfpoly
        df[:, 2] = -dfpoly
        c = numpy.zeros((len(df), 3, 3))
        olderr = numpy.seterr(invalid='ignore')
        filt = [f, col[0], col[1]]
        for ind1, f1 in enumerate(filt):
            # copy before clipping so the caller's error columns are not
            # clobbered in place (sdss[...] may be a view)
            err = numpy.array(sdss[f1+'Err'], copy=True)
            err[err > cliperr] = numpy.inf
            for ind2, f2 in enumerate(filt):
                # duplicated bands in filt yield correlated off-diagonals
                c[:, ind1, ind2] = err**2*(f1 == f2)
        # error propagation sqrt(df^T C df); einsum replaces the removed
        # numpy.core.umath_tests.matrix_multiply
        err = numpy.sqrt(numpy.einsum('ni,nij,nj->n', df, c, df))
        numpy.seterr(**olderr)
        psmagerr[:, i] = err
    return psmag, psmagerr
def em(df, k, eps):
    """Fit a k-component Gaussian mixture to df via EM.

    Parameters
    ----------
    df : pandas.DataFrame
        Data; columns 0..3 (integer labels) hold the four features
        — assumes exactly four feature columns, TODO confirm with caller.
    k : int
        Number of mixture components.
    eps : float
        Convergence threshold on the change in log likelihood.

    Returns
    -------
    (diff, ll_new, probs, mus, sigmas, i, ws)
        Final log-likelihood change, log likelihood, mixing weights,
        means, covariances, iteration count, and responsibilities.
    """
    # Initial means, covariances, and mixing weights.
    (mus, sigmas, probs) = em_init(df, k)

    n = len(df)
    ll_old = 0

    i = 0
    diff = 1
    # Iterate until the log likelihood stabilizes (hard iteration cap).
    while diff > eps and i < 1000000:
        # E-step: responsibilities, one row per cluster.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = probs[j] * mvn(mus[j], sigmas[j]).pdf(df.loc[:, 0:3])
        ws /= ws.sum(0)

        # M-step: update mixing weights ...
        probs = ws.sum(axis=1)
        probs /= n

        # ... means ...
        mus = np.dot(ws, df.loc[:, 0:3])
        mus /= ws.sum(1)[:, None]

        # ... and covariances as responsibility-weighted outer products.
        sigmas = np.zeros((k, 4, 4))
        for j in range(k):
            # subtract the component mean and drop to a plain ndarray
            ys = (df.loc[:, 0:3] - mus[j, :]).to_numpy()
            # einsum sums the weighted outer products in one pass and
            # replaces the deprecated numpy.core.umath_tests
            # matrix_multiply (the source of the old deprecation warning)
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j, :], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # Log likelihood of the updated mixture, summed over rows.
        ll_new = 0
        for p, mu, sigma in zip(probs, mus, sigmas):
            ll_new += p * mvn(mu, sigma).pdf(df.loc[:, 0:3].to_numpy())
        ll_new = np.log(ll_new).sum()

        diff = np.abs(ll_new - ll_old)
        ll_old = ll_new

        # increment iteration counter
        i += 1

    return diff, ll_new, probs, mus, sigmas, i, ws
Beispiel #4
0
def em_gmm_vect(xs, pis, mus, sigmas, tol=0.01, max_iter=100):
    """Vectorized EM for a Gaussian mixture model.

    Parameters
    ----------
    xs : (n, p) array of data points.
    pis : (k,) initial mixing weights.
    mus : (k, p) initial component means.
    sigmas : (k, p, p) initial component covariances.
    tol : float
        Stop when the log likelihood changes by less than this.
    max_iter : int
        Iteration cap.

    Returns
    -------
    (ll_new, pis, mus, sigmas) at convergence.
    """
    n, p = xs.shape
    k = len(pis)

    ll_old = 0
    for i in range(max_iter):
        # E-step: responsibilities ws[j, i] ∝ pi_j * N(x_i | mu_j, S_j).
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = pis[j] * mvn(mus[j], sigmas[j]).pdf(xs)
        ws /= ws.sum(0)

        # M-step: mixing weights ...
        pis = ws.sum(axis=1)
        pis /= n

        # ... means ...
        mus = np.dot(ws, xs)
        mus /= ws.sum(1)[:, None]

        # ... covariances: responsibility-weighted outer products.
        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = xs - mus[j, :]
            # einsum replaces the removed numpy.core.umath_tests
            # matrix_multiply and sums the outer products in one pass
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j, :], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # Log likelihood of the updated mixture; stop when it stabilizes.
        ll_new = 0
        for pi, mu, sigma in zip(pis, mus, sigmas):
            ll_new += pi * mvn(mu, sigma).pdf(xs)
        ll_new = np.log(ll_new).sum()

        if np.abs(ll_new - ll_old) < tol:
            break
        ll_old = ll_new

    return ll_new, pis, mus, sigmas
Beispiel #5
0
def MLE_gaussian_mix(data, weights, means, sigmas, tol=0.01, max_iter=100):
    """Maximum-likelihood fit of a Gaussian mixture via EM.

    Parameters
    ----------
    data : (n, p) array of observations.
    weights : (k,) initial mixing weights.
    means : (k, p) initial component means.
    sigmas : (k, p, p) initial component covariances.
    tol : float
        Convergence threshold on the log-likelihood change.
    max_iter : int
        Iteration cap.

    Returns
    -------
    (like_new, weights, means, sigmas) at convergence.
    """
    n, p = data.shape
    k = len(weights)

    ll_old = 0
    for i in range(max_iter):
        # E-step: responsibility of each component for each point.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = weights[j] * mvn(means[j], sigmas[j]).pdf(data)
        ws /= ws.sum(0)

        # M-step: mixing weights ...
        weights = ws.sum(axis=1)
        weights /= n

        # ... means (the transpose trick broadcasts the per-component
        # divisor across feature columns) ...
        means = np.dot(ws, data)
        means = (means.T / ws.sum(1)).T

        # ... covariances: responsibility-weighted outer products.
        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = data - means[j, :]
            # einsum replaces the removed numpy.core.umath_tests
            # matrix_multiply
            sigmas[j] = np.einsum('n,ni,nj->ij', ws[j, :], ys, ys)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # Compute the new log likelihood and compare to the previous one.
        like_new = 0
        for pi, mu, sigma in zip(weights, means, sigmas):
            like_new += pi * mvn(mu, sigma).pdf(data)
        like_new = np.log(like_new).sum()

        if np.abs(like_new - ll_old) < tol:
            break
        ll_old = like_new

    return like_new, weights, means, sigmas
Beispiel #6
0
 def fit(self, X, mu, sigma, pi, max_iter=100, tolerance=0.01):
     """Fit a Gaussian mixture to X via EM.

     X : (n, p) data array.
     mu, sigma, pi : initial means (k, p), covariances (k, p, p), and
         mixing weights (k,); k is taken from self.k.
     Returns [mu, sigma, pi] once the log likelihood changes by less
     than `tolerance`, or after max_iter iterations.  Each iteration's
     mean estimate is appended to self.trajectory.
     """
     # Dimensions
     n, p = X.shape
     k = self.k

     # Keep track of log likelihood for convergence purposes
     log_likelihood_old, log_likelihood_new = 0, 0

     for i in range(max_iter):
         # E-Step: posterior responsibility of each mode for each point
         resp = np.zeros((k, n))
         for mode in range(k):
             resp[mode] = pi[mode] * mvn(mu[mode], sigma[mode]).pdf(X)
         resp /= resp.sum(0)

         # M-Step: mixing weights and means
         pi = resp.sum(axis=1) / n
         mu = np.asarray([np.dot(r, X) / r.sum() for r in resp])

         # Covariances: responsibility-weighted outer products.  einsum
         # replaces the removed numpy.core.umath_tests.matrix_multiply.
         sigma = np.zeros((k, p, p))
         for j in range(k):
             Y = X - mu[j, :]
             sigma[j] = np.einsum('n,ni,nj->ij', resp[j, :], Y, Y)
         sigma /= resp.sum(axis=1)[:, None, None]

         # Track trajectory of means against iteration
         self.trajectory.append(mu)

         # Update log likelihood and check for convergence
         log_likelihood_new = np.sum(
             [P * mvn(M, S).pdf(X) for P, M, S in zip(pi, mu, sigma)],
             axis=0)
         log_likelihood_new = np.log(log_likelihood_new).sum()
         if np.abs(log_likelihood_new - log_likelihood_old) < tolerance:
             break

         # Otherwise, keep updated value for next iteration
         log_likelihood_old = log_likelihood_new

     return [mu, sigma, pi]
Beispiel #7
0
def Apply_EM(iris, k, e):
    """Cluster the iris data into k groups with EM and report purity.

    iris : pandas.DataFrame with feature columns labeled 0..3 and the
        class label in column 4 — assumed from the indexing below;
        confirm against the caller.
    k : number of clusters.
    e : convergence threshold on the change in log likelihood.

    Prints the iteration count, cluster sizes, means, covariances, and
    clustering purity; returns None.
    """
    # length of data
    n = len(iris)

    # Initialization (not the E-step): identity covariances, random means
    # drawn per feature via Random_val, uniform cluster priors.
    sigma = np.array([np.eye(4)] * k)

    cluster_mu = []
    cluster_p = []
    for i in range(k):
        atr_mu = []
        for column in iris[[0, 1, 2, 3]]:
            atr_mu.append(Random_val(iris[column]))
        cluster_mu.append(atr_mu)
        cluster_p.append(1 / k)

    cluster_mus = np.array(cluster_mu)

    like_old = 0
    i = 0
    diff = 1
    # Iterate until the log likelihood stabilizes (hard iteration cap).
    while diff > e and i < 1000000:
        # E-step: posterior probability of each cluster for each row.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = cluster_p[j] * mvn(cluster_mus[j], sigma[j]).pdf(
                iris.loc[:, 0:3])
        ws /= ws.sum(0)

        # M-step

        # update cluster priors
        cluster_p = ws.sum(axis=1)
        cluster_p /= n

        # update means
        cluster_mus = np.dot(ws, iris.loc[:, 0:3])
        cluster_mus /= ws.sum(1)[:, None]

        # update covariances from responsibility-weighted outer products
        sigma = np.zeros((k, 4, 4))
        for j in range(k):
            # subtract the mean values and convert to a numpy array
            ys = (iris.loc[:, 0:3] - cluster_mus[j, :]).to_numpy()
            # einsum replaces the removed numpy.core.umath_tests
            # matrix_multiply
            sigma[j] = np.einsum('n,ni,nj->ij', ws[j, :], ys, ys)
        sigma /= ws.sum(axis=1)[:, None, None]

        # log likelihood of the updated mixture
        like_new = 0
        for p, mu, sig in zip(cluster_p, cluster_mus, sigma):
            like_new += p * mvn(mu, sig).pdf(iris.loc[:, 0:3].to_numpy())
        like_new = np.log(like_new).sum()

        diff = np.abs(like_new - like_old)
        like_old = like_new

        # incrementing by 1
        i += 1

    print("\nNumber of iterations for the convergence is = ", i)
    # Hard assignment: each row is tagged with its max-responsibility cluster.
    new_nodes = pd.DataFrame()
    for node, point in enumerate(ws):
        new_nodes[node] = point

    new_nodes['tag'] = new_nodes.idxmax(axis=1)

    print("Node of clusters=", new_nodes.groupby(['tag']).agg('count')[0])

    print("Mean of mattrix for 3 cluster=\n", cluster_mus)

    print("Covariance=\n", sigma)

    # Purity: for each true class, count its best-matching cluster.

    # Add flower(label) data type from column 4 of the dataset
    new_nodes['Type'] = iris.iloc[:, 4]

    # Grouping to get max count per (class, cluster) pair
    groupd = new_nodes.groupby(['Type', 'tag']).agg({'tag': ['count']})
    groupd.columns = ['tag_count']
    groupd = groupd.groupby(['Type']).agg({'tag_count': ['max']})
    groupd.columns = ['tag_count_max']
    groupd = groupd.reset_index()

    print('Purity of clustering=',
          round(sum(groupd['tag_count_max']) / len(iris), 2))

    return