def predict(self, X, DistStr):
    N, D = X.shape
    if DistStr == "Gauss":
        P_hat = np.zeros((N, len(self.K)))
        for k, l in self.likelihoods.items():
            P_hat[:, k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(self.priors[k])
        return P_hat.argmax(axis=1)
    if DistStr == "Multinomial":
        P_hat = np.zeros((N, len(self.K)))
        for k, l in self.likelihoods.items():
            P_hat[:, k] = mlvn.logpmf(X, l["N"], l["P"]) + np.log(self.priors[k])
        return P_hat.argmax(axis=1)
    if DistStr == "Bernoulli":
        P_hat = np.zeros((N, len(self.K)))
        for k, l in self.likelihoods.items():
            # Bernoulli log-likelihood: the two matrix products reduce the
            # N x D feature contributions to one log-score per sample for class k
            P_hat[:, k] = (np.log(self.priors[k])
                           + np.matmul(X, np.log(l["mean"]))
                           + np.matmul(1 - X, np.log(abs(1 - l["mean"]))))
        return P_hat.argmax(axis=1)
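# A minimal standalone sketch (not part of the classifier above) of the
# Bernoulli reduction used in the last branch: X @ log(theta) sums the
# "feature present" terms and (1 - X) @ log(1 - theta) the "feature absent"
# terms, giving one log-score per sample for a single class. All names below
# (X_demo, theta, prior) are illustrative placeholders.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.integers(0, 2, size=(5, 4)).astype(float)  # 5 samples, 4 binary features
theta = np.clip(rng.random(4), 1e-6, 1 - 1e-6)          # P(x_d = 1 | class), per feature
prior = 0.3                                              # class prior

scores = np.log(prior) + X_demo @ np.log(theta) + (1 - X_demo) @ np.log(1 - theta)
print(scores.shape)  # (5,): one log-score per sample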
def AssignClustersSingleTopic(M, omega, X):
    """
    Assign a corpus of documents X to the most likely topic, via the MAP
    assignment of Remark 2.1.

    @param M: the conditional-expectations matrix
    @param omega: the mixing weights
    @param X: a bag-of-words corpus distributed as a Single Topic Model, with
        N rows and n columns; entry (i, j) is the number of times word j
        appears in document i
    """
    # Guarantee that the columns of M lie in the simplex
    M = M / np.sign(M.sum(0))
    M[M <= 0] = 0.000001
    M[M >= 1] = 0.999999
    M = M / M.sum(0)

    # Guarantee that omega lies in the simplex
    omega = projsplx(omega)

    N, n = X.shape
    n, k = M.shape
    wmu = np.zeros((N, k))
    nn = X.sum(1)

    # Log-probability that each document was generated by each topic
    for i in range(k):
        mu = M[:, i].reshape(n)
        wmu[:, i] = multinomial.logpmf(X, n=nn, p=mu) + np.log(omega[i])

    # Perform the MAP assignment
    CL = np.argmax(wmu, 1)
    return CL
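# A toy sketch of the MAP step performed above, with made-up values
# (M_toy, omega_toy, X_toy are illustrative placeholders): 3 words, 2 topics,
# 4 documents.
import numpy as np
from scipy.stats import multinomial

M_toy = np.array([[0.7, 0.1],
                  [0.2, 0.3],
                  [0.1, 0.6]])      # columns are topic-word distributions
omega_toy = np.array([0.5, 0.5])    # mixing weights
X_toy = np.array([[5, 1, 0],
                  [0, 2, 6],
                  [3, 3, 1],
                  [1, 0, 7]])       # word counts per document

nn = X_toy.sum(1)
wmu = np.column_stack([
    multinomial.logpmf(X_toy, n=nn, p=M_toy[:, i]) + np.log(omega_toy[i])
    for i in range(M_toy.shape[1])
])
print(wmu.argmax(1))  # MAP topic per document: [0 1 0 1]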
def calc_independent_loglikelihood_var_disc(variable):
    # Empirical counts and relative frequencies of each category
    x = train_df.groupby([variable]).size()
    n = len(train_df[variable])
    p = x / n
    # Multinomial log-likelihood of the observed counts under the MLE probabilities
    loglike_array = multinomial.logpmf(x.tolist(), n, p.tolist())
    loglikelihood = np.sum(loglike_array)
    return loglikelihood
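# A small sketch of the quantity computed above, with the category counts
# written out explicitly instead of taken from train_df (the counts below are
# made up). With MLE probabilities p_hat = counts / n, multinomial.logpmf
# gives the maximised log-likelihood of the observed counts.
import numpy as np
from scipy.stats import multinomial

counts = [30, 50, 20]              # e.g. counts of a three-level discrete column
n = sum(counts)
p_hat = [c / n for c in counts]
print(np.sum(multinomial.logpmf(counts, n, p_hat)))  # a single scalar log-likelihood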
def run(ARGS, data=None, model=None, is_test=False):
    data = data or get_classification_data(ARGS.dataset, split=ARGS.split)
    model = model or get_classification_model(ARGS.model)(
        data.K, is_test=is_test, seed=ARGS.seed)

    def onehot(Y, K):
        return np.eye(K)[Y.flatten().astype(int)].reshape(Y.shape[:-1] + (K, ))

    Y_oh = onehot(data.Y_test, data.K)[None, :, :]  # 1, N_test, K

    model.fit(data.X_train, data.Y_train)
    p = model.predict(data.X_test)  # N_test, K

    # clip very large and small probs
    eps = 1e-12
    p = np.clip(p, eps, 1 - eps)
    p = p / np.expand_dims(np.sum(p, -1), -1)

    # evaluation metrics
    res = {}

    logp = multinomial.logpmf(Y_oh, n=1, p=p)
    res['test_loglik'] = np.average(logp)

    pred = np.argmax(p, axis=-1)
    res['test_acc'] = np.average(
        np.array(pred == data.Y_test.flatten()).astype(float))

    res['Y_test'] = data.Y_test
    res['p_test'] = p

    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('classification', res)

    return res
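# Side note (illustrative values, not part of the run above): with n=1 and a
# one-hot target, multinomial.logpmf reduces to the categorical log-likelihood
# log p[true class], which is what test_loglik averages.
import numpy as np
from scipy.stats import multinomial

p_row = np.array([0.1, 0.7, 0.2])
y_onehot = np.array([0, 1, 0])
print(multinomial.logpmf(y_onehot, n=1, p=p_row))  # == np.log(0.7)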
def run(ARGS, is_test=False):
    data = get_classification_data(ARGS.dataset, split=ARGS.split, prop=1.)

    ind = np.zeros(data.X_train.shape[0]).astype(bool)
    ind[:ARGS.num_initial_points] = True

    X, Y = data.X_train, data.Y_train

    def onehot(Y, K):
        return np.eye(K)[Y.flatten().astype(int)].reshape(Y.shape[:-1] + (K, ))

    Y_oh = onehot(Y, data.K)

    Model = get_classification_model(ARGS.model)
    model = Model(data.K, is_test=is_test, seed=ARGS.seed)

    test_ll = []
    train_ll = []
    all_ll = []
    test_acc = []
    train_acc = []
    all_acc = []

    for _ in range(min(ARGS.iterations, X.shape[0] - ARGS.num_initial_points)):
        model.fit(X[ind], Y[ind])
        p = model.predict(X)  # N, K

        # clip very large and small probs
        eps = 1e-12
        p = np.clip(p, eps, 1 - eps)
        p = p / np.expand_dims(np.sum(p, -1), -1)

        # entropy of predictions at all points
        ent = multinomial.entropy(n=1, p=p)

        # set the seen ones to -inf so we don't choose them
        ent[ind] = -np.inf

        # choose the highest entropy point to see next
        i = np.argmax(ent)
        ind[i] = True

        logp = multinomial.logpmf(Y_oh, n=1, p=p)  # N
        is_correct = (np.argmax(p, 1) == Y.flatten())  # N

        test_ll.append(np.average(logp[np.invert(ind)]))
        train_ll.append(np.average(logp[ind]))
        all_ll.append(np.average(logp))
        test_acc.append(np.average(is_correct[np.invert(ind)]))
        train_acc.append(np.average(is_correct[ind]))
        all_acc.append(np.average(is_correct))

    res = {
        'test_loglik': np.array(test_ll),
        'train_loglik': np.array(train_ll),
        'total_loglik': np.array(all_ll),
        'test_acc': np.array(test_acc),
        'train_acc': np.array(train_acc),
        'total_acc': np.array(all_acc),
    }

    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('active_learning_discrete', res)
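# Sketch of the acquisition step above with made-up numbers (p_demo and seen
# are illustrative): with n=1, multinomial.entropy gives the categorical
# entropy of each predictive distribution; already-labelled rows are masked
# with -inf before taking the argmax.
import numpy as np
from scipy.stats import multinomial

p_demo = np.array([[0.98, 0.01, 0.01],   # confident -> low entropy
                   [0.34, 0.33, 0.33],   # uncertain -> high entropy
                   [0.60, 0.30, 0.10]])
seen = np.array([False, False, True])

ent = multinomial.entropy(n=1, p=p_demo)
ent[seen] = -np.inf
print(np.argmax(ent))  # 1: the most uncertain unlabelled point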
def calc_cond_loglikelihood(variables):
    y = variables[0]
    parents = variables[1:]

    continuous_vars = [
        'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsiv', 'SS'
    ]

    # Partition the parent set into continuous and discrete variables
    parents_d = []
    parents_c = []
    for parent in parents:
        if parent in continuous_vars:
            parents_c.append(parent)
        else:
            parents_d.append(parent)

    # Check whether y is continuous or discrete
    y_continuous = y in continuous_vars

    loglikelihood = 0

    # Case 1: all variables are discrete
    if len(parents_c) == 0 and not y_continuous:
        X = train_df.groupby(variables).size()
        N = len(train_df)
        P = X / N
        loglike_array = multinomial.logpmf(X.tolist(), N, P.tolist())
        loglikelihood = np.sum(loglike_array)

    # Case 2: all variables are continuous
    elif len(parents_d) == 0 and y_continuous:
        X = train_df[parents_c + [y]]
        n, k = X.shape

        # MLE parameter estimation for each continuous variable;
        # scipy's norm.fit returns (mean, standard deviation), so square the
        # scale to obtain the variance used in the log-likelihood formula
        sigma_vec = []
        for var in X.columns:
            mean, std = norm.fit(X[var])  # MLE
            sigma_vec.append(std ** 2)
        sigma_array = np.array(sigma_vec)

        # Maximised Gaussian log-likelihood
        loglike_c = -(n / 2) * (np.log(abs(sigma_array)) + k * np.log(2 * math.pi) + 1)
        loglikelihood = loglike_c.sum()

    # Case 3: mixed discrete and continuous variables
    else:
        # Partition the continuous columns by the discrete configurations
        if y_continuous:
            X = train_df.set_index(parents_d)[parents_c + [y]]
        else:
            X = train_df.set_index(parents_d + [y])[parents_c]
        X = X.sort_index()
        pi_i = X.index.unique().tolist()

        # Iterate over partitions
        for p in pi_i:
            # Design matrix for partition p
            X_p = X.loc[p]
            n, k = X_p.shape

            # MLE parameter estimation for each continuous variable
            sigma_vec = []
            for var in X_p.columns:
                mean, std = norm.fit(X_p[var])  # MLE
                # if the partition is too small to estimate a spread,
                # fall back to an estimate from the full train_df
                if std == 0:
                    mean, std = norm.fit(train_df[var])
                sigma_vec.append(std ** 2)
            sigma_array = np.array(sigma_vec)

            # Log-likelihood formula from Andrews et al.
            loglike_c = -(n / 2) * (np.log(abs(sigma_array)) + k * np.log(2 * math.pi) + 1)
            logprob_d = n * np.log(n / len(train_df))
            loglikelihood += loglike_c + logprob_d

    # Subtract the independent log-likelihoods of the parents
    loglike_parents = 0
    for parent in parents:
        if parent in parents_c:
            loglike_parents += calc_independent_loglikelihood_var_cont(parent)
        else:
            loglike_parents += calc_independent_loglikelihood_var_disc(parent)

    loglikelihood = np.sum(loglikelihood) - np.sum(loglike_parents)
    return loglikelihood
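# A quick check of the MLE step used above (x_demo is made-up data):
# scipy.stats.norm.fit returns the MLE location and *scale* (standard
# deviation), so the variance entering the Gaussian log-likelihood is the
# squared scale.
import numpy as np
from scipy.stats import norm

x_demo = np.array([1.0, 2.0, 2.5, 3.0, 4.5])
mean, std = norm.fit(x_demo)
print(mean, std ** 2)                          # MLE mean and variance
print(np.isclose(std ** 2, np.var(x_demo)))    # True: the MLE variance uses ddof=0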