import numpy as np
from math import log
from scipy.special import logsumexp

import knobs  # project configuration; provides knobs.cluster_penalty


def max_likelihood_qa(data, nvals, qa, approximate=False, smooth=0.1):
    # Score a hard cluster assignment qa by the unsmoothed maximum
    # likelihood of the induced naive-Bayes mixture.
    # (nvals and smooth are currently unused here.)
    nc = qa.max() + 1
    sil = np.zeros(nc)
    for i in range(nc):
        di = data[qa == i].astype(int)  # np.int is removed in modern NumPy
        total = float(len(di))
        si = 0.0
        for j in range(di.shape[1]):
            bc = np.bincount(di[:, j])
            for c in bc:
                if c == 0:
                    continue  # zero counts contribute nothing: c*log(c) -> 0
                si += c * log(c / total)
        si += log(total / len(data))  # cluster prior
        sil[i] = si
    return sil.sum() if approximate else logsumexp(sil)
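# A hedged usage sketch (the data shapes, seed, and helper name below are
# illustrative, not from the original code): score a random hard assignment
# of 100 instances over 4 clusters.
def _demo_max_likelihood_qa():
    rng = np.random.RandomState(0)
    data = rng.randint(0, 3, size=(100, 5))  # 100 instances, 5 ternary features
    nvals = [3] * 5                          # cardinality of each feature
    qa = rng.randint(0, 4, size=100)         # hard assignment into 4 clusters
    return max_likelihood_qa(data, nvals, qa)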
def penalized_ll(nbcs, data):
    # Mixture log-likelihood of data under the naive-Bayes clusters nbcs,
    # minus a per-cluster, per-feature complexity penalty.
    ll = 0.0
    cluster_priors = [log(1.0 * c.size / len(data)) for c in nbcs]
    for inst in data:
        vals = [cluster_priors[i] + nbcs[i].fast_ll(inst, float('-inf'))
                for i in range(len(nbcs))]
        ll += logsumexp(vals)
    ll -= knobs.cluster_penalty * len(nbcs) * data.shape[1]
    return ll
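# For reference, penalized_ll computes
#
#     sum_x log( sum_i P(c_i) * P(x | c_i) )  -  penalty * K * n
#
# where K = len(nbcs) clusters, n = data.shape[1] features, and
# penalty = knobs.cluster_penalty, a complexity cost that discourages
# introducing clusters that do not pay for themselves in likelihood.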
def ll(self, inst, best_cll=float('-inf')):
    # Smoothed naive-Bayes log-likelihood of one instance under this
    # cluster; a method of the cluster class (penalized_ll above invokes
    # it as fast_ll). The best_cll threshold lets a caller bail out early
    # on clusters that can no longer beat the current best, since l only
    # decreases as features are folded in.
    sstats = self.sstats
    l = 0.0
    for i in range(len(self.nvals)):  # xrange is Python 2 only
        v = inst[i]
        w = sstats[i][v]
        l += log((w + self.smoo) / (self.size + self.nvals[i] * self.smoo))
        if l < best_cll:
            return l
    return l
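# Hedged sketch of the cluster object that ll() above expects; the real
# class is not shown in this file, so everything beyond the attributes
# ll() reads (sstats, nvals, size, smoo) is an assumption.
class NBCluster(object):
    def __init__(self, nvals, smoo=0.1):
        self.nvals = nvals
        self.smoo = smoo
        self.size = 0
        self.sstats = [np.zeros(v) for v in nvals]  # per-feature value counts

    def add(self, inst):
        # Fold one instance into the sufficient statistics.
        for i, v in enumerate(inst):
            self.sstats[i][v] += 1
        self.size += 1

NBCluster.ll = ll       # attach the method defined above
NBCluster.fast_ll = ll  # assumption: penalized_ll's fast_ll is this scorer,
                        # given the matching (inst, best_cll) signature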
def nbmix_model(data, nvals, qa, smooth=0.1):
    # Estimate smoothed naive-Bayes mixture parameters from a hard
    # assignment qa: per-feature conditional log-probability tables and
    # cluster log-priors.
    data = data.astype(int, copy=False)
    nc = qa.max() + 1
    n = data.shape[1]
    m = data.shape[0]
    # lprobs[j, i, k] = log P(feature j = k | cluster i); slots beyond
    # nvals[j] stay at -inf as padding.
    lprobs = np.full((n, nc, max(nvals)), float('-inf'))
    priors = np.zeros(nc)
    for i in range(nc):
        di = data[qa == i]
        di_size = float(len(di))
        priors[i] = log(di_size / m)
        for j in range(n):
            bc = np.bincount(di[:, j], minlength=nvals[j])
            for k in range(nvals[j]):
                # With a pseudo-count, unseen values get nonzero mass too;
                # the original skipped c == 0 and left those entries at
                # -inf, which contradicted the smoothing.
                lprobs[j, i, k] = log((bc[k] + smooth)
                                      / (di_size + smooth * nvals[j]))
    return lprobs, priors
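# Hedged helper (the name instance_ll is mine, not from the original):
# score a single instance against the parameters nbmix_model returns, by
# summing per-feature log-probabilities for each cluster, adding the
# cluster log-prior, and marginalizing over clusters with logsumexp.
def instance_ll(lprobs, priors, inst):
    n, nc, _ = lprobs.shape
    clls = priors.copy()
    for j in range(n):
        clls += lprobs[j, :, inst[j]]  # shape (nc,): this feature, all clusters
    return logsumexp(clls)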
def new_ll(nvals, smooth=0.1):
    # Log-likelihood of an instance under a fresh singleton cluster: each
    # of its feature values then has count 1 out of a cluster of size 1,
    # so the score depends only on the feature cardinalities.
    return sum(log((1. + smooth) / (1. + v * smooth)) for v in nvals)
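# Hedged consistency check (relies on the NBCluster sketch above, itself an
# assumption): a cluster seeded with exactly one instance scores that
# instance at new_ll(), since every feature value then has count 1 out of 1.
def _check_new_ll():
    nvals = [3, 4, 2]
    c = NBCluster(nvals, smoo=0.1)
    inst = [0, 2, 1]
    c.add(inst)
    assert abs(c.ll(inst) - new_ll(nvals, smooth=0.1)) < 1e-12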