def select_best_bic(X):
    """Pick the number of mixture components K in {1, 2, 3, 4} by BIC.

    For each K, naive EM is run from 5 initializations (seeds 0-4) and the
    best BIC over seeds is kept; the K with the highest BIC is printed.

    :param X: (n, d) data array.
    :return: the string "Done" (kept for backward compatibility).
    """
    bic_ls = []
    for K in [1, 2, 3, 4]:
        # Best BIC over the 5 seeds for this K.  EM is deterministic given
        # (X, K, seed), so re-initializing and re-running for the best seed
        # (as the original did) only duplicated work.
        bic_ls_seed = []
        for seed in range(5):
            mixture, post = common.init(X, K, seed)
            mixture, post, LL = naive_em.run(X, mixture, post)
            bic_ls_seed.append(common.bic(X, mixture, LL))
        bic_ls.append(max(bic_ls_seed))
    print("The best K is {} with bic {}".format(np.argmax(bic_ls) + 1, max(bic_ls)))
    return "Done"
Beispiel #2
0
def select_k_em():
    """
    Select the best K based on BIC.

    For each K in 1..4, runs naive EM from seeds 0-4, keeps the run with
    the highest log-likelihood, and prints its BIC.  Relies on the
    module-level data matrix ``X``.

    :return: None (results are printed).
    """
    for k in [1, 2, 3, 4]:
        # Track only the best run instead of materializing all five
        # (mixture, post, cost) tuples as the original did.
        best_mixture, best_cost = None, float("-inf")
        for seed in [0, 1, 2, 3, 4]:
            gm, post = common.init(X, k, seed)
            mixture, p, cost = naive_em.run(X, gm, post)
            if cost > best_cost:
                best_mixture, best_cost = mixture, cost
        print(common.bic(X, best_mixture, best_cost))
Beispiel #3
0
def run_naive_em_with_bic():
    """Run naive EM for K=1..4 over seeds 0-4; report and plot the best run per K.

    For each K the seed with the highest log-likelihood is kept, its BIC is
    printed, and the corresponding solution is plotted.  Relies on the
    module-level data matrix ``X``.
    """
    max_bic = None
    for K in range(1, 5):
        best = None  # (ll, seed, mixture, post) of the best run so far
        for seed in range(0, 5):
            mixture, post = common.init(X, K, seed)
            mixture, post, ll = naive_em.run(X, mixture, post)
            if best is None or ll > best[0]:
                best = (ll, seed, mixture, post)

        # EM is deterministic for a given seed, so reuse the stored best
        # run instead of re-initializing and re-running it (as before).
        ll, best_seed, mixture, post = best
        bic = common.bic(X, mixture, ll)
        if max_bic is None or bic > max_bic:
            max_bic = bic
        title = "EM for K={}, seed={}, ll={}, bic={}".format(K, best_seed, ll, bic)
        print(title)
        common.plot(X, mixture, post, title)
def run_em(X, plot=False):
    """Run EM over all (K, seed) combinations from the module-level lists.

    For each K the best seed (highest log-likelihood) is selected and its
    BIC computed; one summary line per K is printed.  Note that the printed
    "Max BIC" is a running maximum across all K values seen so far,
    matching the original report format.

    :param X: (n, d) data array.
    :param plot: when True, plot every (K, seed) solution.
    """
    max_bic = None
    for i in range(len(K)):
        max_ln_like = None
        best_seed = None
        best_mixture = None
        for j in range(len(seed)):
            mixture, post = common.init(X, K[i], seed[j])
            mixture, post, ln_like = em.run(X, mixture, post)
            if max_ln_like is None or ln_like > max_ln_like:
                max_ln_like = ln_like
                best_seed = seed[j]
                best_mixture = mixture
            if plot:
                common.plot(X, mixture, post,
                            "K={}, seed={}".format(K[i], seed[j]))

        # EM is deterministic given (K, seed); reuse the stored best run
        # instead of re-initializing and re-running it (as before).
        bic = common.bic(X, best_mixture, max_ln_like)
        if max_bic is None or bic > max_bic:
            max_bic = bic
        print("K = {}, Max ln(likelihood) = {}, Best seed = {}, Max BIC = {}".
              format(K[i], max_ln_like, best_seed, max_bic))
Beispiel #5
0
# TODO: Your code here
K = [1, 2, 3, 4]  # candidate numbers of mixture components
seed = [0, 1, 2, 3, 4]  # RNG seeds tried for each K (original comment said "k")


def k_means_function(X, K, seed):
    """Initialize with (K, seed) and run K-means; return (mixture, post, cost)."""
    mixture, post = common.init(X, K, seed)
    return kmeans.run(X, mixture, post)


def naive_em_function(X, K, seed):
    """Initialize with (K, seed) and run naive EM; return (mixture, post, log-likelihood)."""
    mixture, post = common.init(X, K, seed)
    return naive_em.run(X, mixture, post)


# Sweep every (K, seed) pair and report the naive-EM log-likelihood and BIC.
for i in range(len(K)):
    print("K=", K[i])
    for j in range(len(seed)):
        print("seed=", seed[j])
        # mixture1, post1, cost1 =k_means_function(X, K[i], seed[j])
        # print("K-mean :",cost1)

        mixture2, post2, cost2 = naive_em_function(X, K[i], seed[j])
        print("Naive EM :", cost2)
        # Bug fix: common.bic() is a pure function — its return value was
        # previously discarded, so the BIC sweep reported nothing.
        print("BIC :", common.bic(X, mixture2, cost2))
        # common.plot(X, mixture1, post1, "K-mean")
        # common.plot(X, mixture2, post2, "Naive EM")
Beispiel #6
0
# Log-likelihood from a naive-EM run — presumably computed earlier in the
# file; its definition is not visible in this chunk.
print("naive EM log likelihood : " + str(naive_em_estimate))

print("############## Some Tests ######################")
# Run naive EM on the toy data with seed 0 for K = 1..4, keeping each
# trained mixture and its final log-likelihood for the BIC report below.
initialMixture, initialPost = common.init(toy_X, 1, 0)
mixtureEM1, postEM1, ll1 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 2, 0)
mixtureEM2, postEM2, ll2 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 3, 0)
mixtureEM3, postEM3, ll3 = naive_em.run(toy_X, initialMixture, initialPost)

initialMixture, initialPost = common.init(toy_X, 4, 0)
mixtureEM4, postEM4, ll4 = naive_em.run(toy_X, initialMixture, initialPost)

# BIC per K; the best K is the one with the highest BIC.
print("BIC K1 : " + str(common.bic(toy_X, mixtureEM1, ll1)))
print("BIC K2 : " + str(common.bic(toy_X, mixtureEM2, ll2)))
print("BIC K3 : " + str(common.bic(toy_X, mixtureEM3, ll3)))
print("BIC K4 : " + str(common.bic(toy_X, mixtureEM4, ll4)))

# Netflix data: seed sweep on the incomplete ratings matrix.
X_netflix = np.loadtxt("netflix_incomplete.txt")
test_em_seeds(X_netflix, 1)
#test_em_seeds(X_netflix, 12)

# Matrix completion with K = 12 (seed 1), scored against the fully observed
# gold matrix via RMSE.
X_gold = np.loadtxt('netflix_complete.txt')
mixture4, post4 = common.init(X_netflix, 12, 1)
mixture, post, cost4 = em.run(X_netflix, mixture4, post4)
X_pred = em.fill_matrix(X_netflix, mixture)

rmse_result = common.rmse(X_gold, X_pred)
print("RMSE between prediction and GOLD is : " + str(rmse_result))
Beispiel #7
0
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")
K = 4  # fixed number of components for this sweep
seeds = [0, 1, 2, 3, 4]
# For fixed K = 4, compare naive-EM runs across five seeds: print each run's
# final log-likelihood (labeled "em_cost") and BIC, and plot the solution.
for seed in seeds:
    mixture, post = common.init(X, K, seed)
    # kmixture, kpost, kcost = kmeans.run(X, mixture, post)
    # title = f"K is {K}, seed is {seed}, cost is {kcost}"
    em_mixture, em_post, em_cost = naive_em.run(X, mixture, post)
    with_bic = common.bic(X, em_mixture, em_cost)
    title = f"K is {K}, seed is {seed}, em_cost is {em_cost}, with_bic is {with_bic}"
    print(title)
    common.plot(X, em_mixture, em_post, title)

# TODO: Your code here
Beispiel #8
0
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")
seeds = [0, 1, 2, 3, 4]
K = [1, 2, 3, 4]

kbest = 1
bestbic = -100000000

# For each K, keep the seed whose EM run attains the HIGHEST log-likelihood,
# then compare the resulting models across K by BIC.
for k in K:
    # Bug fix: the original initialized `best` to a large POSITIVE value and
    # kept the seed with the LOWEST log-likelihood; EM seeds must be compared
    # by maximum log-likelihood.
    best = -100000000
    seed_best = 0
    for seed in seeds:
        mixtures, post = common.init(X, k, seed)
        # Bug fix: pass the initial posterior from common.init instead of None.
        tupl = naive_em.run(X, mixtures, post)
        if (tupl[2] > best):
            best = tupl[2]
            seed_best = seed
    mixtures, post = common.init(X, k, seed_best)
    # Bug fix: score BIC with the TRAINED mixture returned by EM, not the
    # freshly re-initialized one.
    trained, _, ll = naive_em.run(X, mixtures, post)
    bi = common.bic(X, trained, ll)
    if (bi > bestbic):
        bestbic = bi
        kbest = k

print(kbest)
print(bestbic)
Beispiel #9
0
# Sweep naive EM over all (K, seed) pairs, plotting each solution.
# NOTE(review): in this chunk `Ks`, `seeds`, and `X` are only assigned
# further down (after their first use here).  If they are not defined
# earlier in the full file, these loops raise NameError — confirm the
# statement ordering against the original file.
for K in Ks:
    for seed in seeds:
        mixture, post = common.init(X, K=K, seed=seed)  # Initialize K-means
        mixture, post, log_likelihood = naive_em.run(X, mixture, post)
        common.plot(X, mixture, post, [K, seed])
        print(K, seed, log_likelihood)

# =============================================================================
# 5. Bayesian Information Criterion
# Picking the best K (highest BIC)
# =============================================================================

for K in Ks:
    mixture, post = common.init(X, K=K)  # Initialize K-means (default seed)
    mixture, post, log_likelihood = naive_em.run(X, mixture, post)
    BIC = common.bic(X, mixture, log_likelihood)
    print(K, BIC)

# =============================================================================
# 7. Implementing EM for matrix completion
# Test for complete case: on fully observed data, the naive E-step and the
# matrix-completion E-step should report matching log-likelihoods.
# =============================================================================

X = np.loadtxt("toy_data.txt")
Ks = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]
mixture, post = common.init(X, 3, seed=0)
post1, log_likelihood1 = naive_em.estep(X, mixture)
post2, log_likelihood2 = em.estep(X, mixture)
print(log_likelihood1, log_likelihood2)
Beispiel #10
0
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("netflix_complete.txt")
K = 12
# TODO: Your code here
# Evaluate naive EM with K = 12 over five seeds, printing the BIC of each run.
for seed in range(5):
    mixtures, post = common.init(X, K, seed)

    # m, p, cost = kmeans.run(X, mixtures, post)
    # print(cost)
    # common.plot(X, mixtures, post, "Title")

    m, p, cost = naive_em.run(X, mixtures, post)
    print(common.bic(X, m, cost))
    # common.plot(X, mixtures, post, "Title")

Beispiel #11
0
import numpy as np
import kmeans
import common
import naive_em
import em

X = np.loadtxt("toy_data.txt")

# Run naive EM for every K in 1..4 and every seed in 0..4, plotting each
# solution and reporting its log-likelihood and BIC.
for i in range(4):
    for j in range(5):
        initial_mixture, post = common.init(X, i + 1, j)
        #M, L, cost_final = kmeans.run(X, initial_mixture, post)
        #title = "K means for K "+str(i+1)+" seed " +str(j)
        #common.plot(X, M, L, title)
        #print("For K "+ str(i+1) + " seed " + str(j) +" cost is " + str(cost_final))

        M, L, likelihood = naive_em.run(X, initial_mixture, post)
        bic = common.bic(X, M, likelihood)

        # f-strings produce byte-identical text to the original str() concatenation
        title = f"EM for K {i + 1} seed {j}"
        common.plot(X, M, L, title)
        print(f"For K {i + 1} seed {j} likelihood is {likelihood} bic is {bic}")
Beispiel #12
0
        naive_em.run(X, *common.init(X, K[k], seeds[i]))

    # Print lowest cost
    print("=============== Clusters:", k + 1, "======================")
    print("Lowest cost using kMeans is:", np.min(costs_kMeans))
    print("Highest log likelihood using EM is:", np.max(costs_EM))

    # Save best seed for plotting
    best_seed_kMeans[k] = np.argmin(costs_kMeans)
    best_seed_EM[k] = np.argmax(costs_EM)

    # Plot kMeans and EM results
    common.plot(X,
                mixtures_kMeans[best_seed_kMeans[k]],
                posts_kMeans[best_seed_kMeans[k]],
                title="kMeans")

    common.plot(X,
                mixtures_EM[best_seed_EM[k]],
                posts_EM[best_seed_EM[k]],
                title="EM")

    #BIC score for EM
    bic[k] = common.bic(X, mixtures_EM[best_seed_EM[k]], np.max(costs_EM))

# Print the best K based on BIC (BIC is maximized; argmax index is 0-based).
print("================= BIC ====================")
best_k_index = np.argmax(bic)
print("Best K is:", best_k_index + 1)
print("BIC for the best K is:", np.max(bic))

########## End: kMeans vs EM (and BIC) #############
    def run_bic_test_input(self, test):
        """Check common.bic() against the expected value of one test case.

        :param test: test-case object whose ``data()`` returns
            ``(X, mixture, log_likelihood, expected_bic)``.
        """
        X, mixture, ll, expected_bic = test.data()

        bic = common.bic(X, mixture, ll)
        # assertTrue is the idiomatic form of assertEqual(<bool>, True).
        self.assertTrue(np.isclose(bic, expected_bic),
                        f'BIC: got {bic}, expected {expected_bic}')
               [0.3927848,  0.83607876],
               [0.33739616, 0.64817187],
               [0.36824154, 0.95715516],
               [0.14035078, 0.87008726],
               [0.47360805, 0.80091075],
               [0.52047748, 0.67887953],
               [0.72063265, 0.58201979],
               [0.53737323, 0.75861562],
               [0.10590761, 0.47360042],
               [0.18633234, 0.73691818]])
# Hand-built GMM parameters (K = 6 components in d = 2 dimensions) for
# checking common.bic() against a known expected value.
K = 6
Mu = np.array([[0.6235637,  0.38438171],
               [0.3927848,  0.83607876],
               [0.81216873, 0.47997717],
               [0.14035078, 0.87008726],
               [0.36824154, 0.95715516],
               [0.10590761, 0.47360042]])
Var = np.array([0.10038354, 0.07227467, 0.13240693, 0.12411825, 0.10497521, 0.12220856])
P = np.array([0.1680912,  0.15835331, 0.21384187, 0.14223565, 0.14295074, 0.17452722])
ll = -1655.170056  # log-likelihood the BIC is evaluated at
print(X.shape, Mu.shape, Var.shape, P.shape)
print(common.bic(X, mixture= (Mu, Var, P), log_likelihood = ll))


### Result should be -1686.312633

# Back out the free-parameter count p from BIC = ll - (p/2) * log(n):
# p = -2 * (BIC - ll) / log(n).  With n = 15 this gives p = 23, matching
# K = 6, d = 2: (K-1) mixing weights + K*d means + K variances = 5 + 12 + 6.
# NOTE(review): n = 15 and n = 12 are assumed sample sizes — confirm against
# the (truncated) definition of X above.
x = (- 1686.312633 + 1655.170056) * -2
print(x / np.log(15))

y = (- 1453.539804 + 1398.871858) * -2
print(y / np.log(12))
Beispiel #15
0
# Question 4
# X = np.loadtxt('toy_data.txt')
# for K in range(1,5):
#         logs = []
#         for seed in range(5):
#                 mixture, post = common.init(X,K, seed)
#                 mu_s, var_s, p_s = mixture.mu, mixture.var, mixture.p
#                 mixture, post, LL = naive_em.run(X , mixture, post)
#                 common.plot(X, mixture, post, f"k:{K}, seed: {seed}")
#                 logs.append(LL)
#         print('############## K = ',K)
#         print('Log likelihood: ', np.max(logs))

# Question 5

X = np.loadtxt('toy_data.txt')
# Sweep (K, seed) over the toy data, recording the BIC of every naive-EM run,
# then report the row with the highest BIC.
results = []
for K in range(1, 5):
    for seed in range(5):
        mixture, post = common.init(X, K, seed)
        # Removed the dead `mu_s, var_s, p_s = mixture.mu, ...` unpacking of
        # the initial mixture: none of those names were ever used.
        mixture, post, LL = naive_em.run(X, mixture, post)
        BIC = common.bic(X, mixture, LL)
        results.append([K, seed, BIC])

output = pd.DataFrame(results, columns=['K', 'seed', 'BIC'])
print(output)
max_bic_row = output['BIC'].idxmax()
print('Answer')
print(output.iloc[[max_bic_row]])
Beispiel #16
0
print(all_cost)
print(np.min(all_cost, axis=1))  # K-means: cost, lower is better — min is correct here

# mixture, post = common.init(X, K, seed)

## for EM algorithm
print('\n\nEM algorithm')
all_cost = np.zeros((len(K_list), len(seed_list)))
best_mixtures = [None] * len(K_list)  # best trained mixture per K
for i in np.arange(len(K_list)):
    best_ll = -np.inf
    for j in np.arange(len(seed_list)):
        ans1 = 'EM algorithm: K = {}, seed = {}'.format(
            K_list[i], seed_list[j])
        print(ans1)
        mixture, post = common.init(X, K_list[i], seed_list[j])
        mixture, post, likelihood = em.naive_run(X, mixture, post)
        ans2 = 'Cost = {}'.format(likelihood)
        all_cost[i][j] = likelihood
        # Keep the trained mixture of the best (highest log-likelihood) run.
        if likelihood > best_ll:
            best_ll = likelihood
            best_mixtures[i] = mixture
        print(ans2)
    # common.plot(X, mixture, post, ans1)

print(all_cost)
# Bug fix: EM reports log-likelihoods, where HIGHER is better.  The original
# used np.min here (copied from the K-means cost section above), which
# selected the WORST run per K.
print(np.max(all_cost, axis=1))

## calculate bic
best_likelihood = np.max(all_cost, axis=1)
bic_list = np.zeros(len(best_likelihood))
for i in np.arange(len(best_likelihood)):
    # Bug fix: score BIC with the best TRAINED mixture, not a freshly
    # initialized (untrained) one as the original did.
    bic_list[i] = common.bic(X, best_mixtures[i], best_likelihood[i])
print(bic_list)
Beispiel #17
0

X = np.loadtxt("toy_data.txt")
K = [1, 2, 3, 4]
# TODO: Your code here
# For each K, run K-means and naive EM from each seed, collecting the K-means
# cost, the EM log-likelihood, and the BIC per (K, seed) pair.
costs = []
loglikelihoods = []
bics = []
for k in K:
    cost_seeds_ = []
    log_likelihood_ = []
    bic_ = []
    # Bug fix: the seed sweep used range(4), silently skipping seed 4;
    # every other sweep in this project uses the five seeds 0-4.
    for seed in range(5):
        gauss_mixture, post = common.init(X=X, K=k, seed=seed)
        gauss_mixture_kmeans, post_kmeans, cost = kmeans.run(
            X=X, mixture=gauss_mixture, post=post)
        #print('for k =',k, "and seed=",seed, end=" ")
        #print("cost=",cost)
        gauss_mixture_em, post_em, loglikelihood = naive_em.run(
            X, gauss_mixture, post)
        bic_.append(common.bic(X, gauss_mixture_em, loglikelihood))
        log_likelihood_.append(loglikelihood)
        cost_seeds_.append(cost)
#        plot_points(X,post_kmeans,
#                    title="kmeans with k:"+str(k)+" seed:"+str(seed))
#        plot_points(X,post_em,
#                    title="em with k:"+str(k)+" seed:"+str(seed))
    bics.append(bic_)
    costs.append(cost_seeds_)
    loglikelihoods.append(log_likelihood_)
Beispiel #18
0
    print("Best seed: ", np.argmin(costs_kMeans))

print("******* End of section 2 *******\n ")

######### Section 4: Comparing K-means and EM ############
print("******* Section 4 *******\n ")
costs_EM = [0, 0, 0, 0, 0]     # log-likelihood per seed for the current K
mixtures_EM = [0, 0, 0, 0, 0]  # Mixtures for best seed
bic = [0., 0., 0., 0.]         # BIC for best cluster

# For each K, run EM from every seed, keep the highest-likelihood run, and
# record its BIC.
for k, k_val in enumerate(K):
    for i, s in enumerate(seeds):
        mixtures_EM[i], _, costs_EM[i] = naive_em.run(X, *common.init(X, k_val, s))

    best = int(np.argmax(costs_EM))
    bic[k] = common.bic(X, mixtures_EM[best], np.max(costs_EM))

    print("----- Mixture ", k + 1, " -----")
    print("Highest log: ", np.max(costs_EM))
    print("Best seed: ", np.argmax(costs_EM))

print("******* End of section 4 *******\n ")

######### Section 5: Bayesian Information Criterion ############
print("******* Section 5 *******\n ")
print("Best K: ", np.argmax(bic) + 1)
print("BIC for the best K: ", np.max(bic))

print("******* End of section 5 *******\n ")

######### Section 8: Using the mixture model for collaborative filtering ############
Beispiel #19
0
Ks = [1, 2, 3, 4]
seeds = [0, 1, 2, 3, 4]
BICs = np.empty(len(Ks))

# For each K: the best K-means run is the one with the LOWEST cost, the best
# EM run the one with the HIGHEST log-likelihood.  Record the EM BIC and
# plot both winning solutions.
for i, K in enumerate(Ks):
    best_kmeans = (np.inf, None, None)   # (cost, mixture, post)
    best_em = (-np.inf, None, None)      # (log-likelihood, mixture, post)
    for seed in seeds:
        init_mix, init_post = common.init(X, K, seed)
        k_mix, k_post, k_cost = kmeans.run(X, init_mix, init_post)
        em_mix, em_post, em_ll = naive_em.run(X, init_mix, init_post)
        if k_cost < best_kmeans[0]:
            best_kmeans = (k_cost, k_mix, k_post)
        if em_ll > best_em[0]:
            best_em = (em_ll, em_mix, em_post)
    BICs[i] = common.bic(X, best_em[1], best_em[0])
    common.plot(X, best_kmeans[1], best_kmeans[2], "K-means K={}".format(K))
    common.plot(X, best_em[1], best_em[2], "EM K={}".format(K))

print("BICs: ", BICs)
print("Best BIC: ", np.max(BICs))
print("Best K: ", Ks[np.argmax(BICs)])

X = np.loadtxt("netflix_incomplete.txt")

K = 12
seeds = [0, 1, 2, 3, 4]

# Seed sweep for the matrix-completion EM model (K = 12), tracking the run
# with the highest log-likelihood.
# NOTE(review): only the init call is visible in this loop body — the rest
# appears to be truncated at the edge of this chunk; confirm in the full file.
em_best_mix, em_best_post, em_best_ll = None, None, -np.inf
for seed in seeds:
    init_mix, init_post = common.init(X, K, seed)
Beispiel #20
0
X = np.loadtxt("toy_data.txt")

# TODO: Your code here
# for K in [1,2,3,4]:
#     for seed in [0,1,2,3,4]:
#         mixture,post=common.init(X, K, seed)
#         mixture, post, cost=kmeans.run(X,mixture,post)
#         common.plot(X,mixture,post,title='K=%s seed=%s cost=%s'%(K,seed,cost))
#         print('K=%s seed=%s cost=%s'%(K,seed,cost))

# Print the naive-EM BIC for each K (seed 0 only).
# Removed the dead `maxcost` variable: it was assigned but everything that
# updated or printed it was commented out.
for K in [1, 2, 3, 4]:
    for seed in [0]:
        mixture, post = common.init(X, K, seed)
        mixture, post, cost = naive_em.run(X, mixture, post)
        print(common.bic(X, mixture, cost))
        # common.plot(X,mixture,post,title='EM K=%s seed=%s cost=%s'%(K,seed,cost))
# K=3
# seed=0
# mixture,post=common.init(X, K, seed)
# # print(mixture)
# # posts,ll = naive_em.estep(X,mixture)
# # print('ll = %s'%ll)
# # print(posts)
# # for K in [1,2,3,4]:
Beispiel #21
0
        mixture, post = common.init(X, K=k, seed=seed)

        # run EM-algorithm
        mixture, post, LL = naive_em.run(X, mixture=mixture, post=post)

        mixtures.append(mixture)
        posts.append(post)
        logloss[i] = LL
        #print('K=', k, 'seed=', seed, 'logloss=', LL)

    best_seed = np.argmax(logloss)
    logloss = logloss[best_seed]
    mixture = mixtures[best_seed]
    post = posts[best_seed]

    current_bic = common.bic(X, mixture, logloss)
    bic[j] = current_bic

    print(f'K={k}', f'Best seed={best_seed}', f'logloss={logloss}', f'BIC={current_bic}')
    #common.plot(X, mixture, post, title=f"Naive-EM, K={k}")

# Report the K with the highest BIC from the sweep above.
best_K_ix = np.argmax(bic)
best_K, best_bic = K[best_K_ix], bic[best_K_ix]
print(f"Best K={best_K}", f"BIC={best_bic}")


# -----------------------------------
# EM Algorithm for Matrix Completion
# -----------------------------------
Beispiel #22
0
import numpy as np
import em
import common

X = np.loadtxt("test_incomplete.txt")
X_gold = np.loadtxt("test_complete.txt")

# Evaluate on the fully observed matrix.
X = X_gold

K = 4
n, d = X.shape
seed = 0

# TODO: Your code here
# Run matrix-completion EM once and report the BIC.
mixture, post = common.init(X, K, seed)
# Renamed the ambiguous single-letter variable `l` (PEP 8 / E741).
mixture, post, log_likelihood = em.run(X, mixture, post)
bic = common.bic(X, mixture, log_likelihood)
print("bic = ", bic)
# title = "Incomplete - > K = {}, seed = {},  log likelyhood = {}, bic = {} plot.png".format(K, seed, int(log_likelihood), int(bic))
title = "test log plot"
common.plot(X, mixture, post, title)