def buildMixtureTreeMaxKL(data, K, iterations, maxKL, branchesPerNode = 2, multinomial = None):
    """Recursively build a mixture tree, splitting until branches fit within maxKL.

    Args:
        data: sequence of count vectors (one per row).
        K: length of the second uniform hyperparameter vector handed to
            MME.MultinomialMixtureModelHyperparams (presumably the number of
            categories per count vector -- TODO confirm).
        iterations: forwarded unchanged to MME.computeDirichletMixture.
        maxKL: a branch is not split further when the worst KL reported by
            MME.worstFitForSingleMultinomial for its parent multinomial is
            below this value.
        branchesPerNode: number of mixture components (children) per node.
        multinomial: the parent component's multinomial for this branch, or
            None for the root call (no KL check is done in that case).

    Returns:
        A MultinomialMixtureTree whose mixtureNodes[c] holds the subtree built
        from the rows assigned to component c, or None when this branch is not
        split (good enough fit, or fewer than branchesPerNode rows).
    """
    logging.info("Dataset Size: %s", len(data))

    # Compare against None explicitly: truth-testing a probability vector is
    # ambiguous for array-like objects and wrongly skips an empty sequence.
    if multinomial is not None:
        (worstKL, _) = MME.worstFitForSingleMultinomial(data, multinomial)
        logging.info("Worst KL found in branch: %s", worstKL)
        if worstKL < maxKL:
            return None

    # Too few rows to populate every child component.
    if len(data) < branchesPerNode:
        return None

    # hyperparameters are fixed here: both vectors are uniform.
    hyperP = MME.MultinomialMixtureModelHyperparams(
        branchesPerNode,
        K,
        [1.0 / branchesPerNode] * branchesPerNode,
        [1.0 / K] * K)
    mixtureModel = MME.computeDirichletMixture(data, hyperP, iterations)

    # Partition the rows by the component the fitted mixture assigns them to.
    smallerDatasets = [[] for _ in range(hyperP.C)]
    for counts in data:
        c = MME.assignComponentToCounts(counts, mixtureModel)
        smallerDatasets[c].append(counts)

    # Recurse into each partition, passing the component's own multinomial so
    # the child call can decide whether a further split is needed.
    treeModel = MultinomialMixtureTree(mixtureModel)
    for c, smallerDataset in enumerate(smallerDatasets):
        treeModel.mixtureNodes[c] = buildMixtureTreeMaxKL(
            smallerDataset, K, iterations, maxKL, branchesPerNode,
            mixtureModel.multinomials[c])
    return treeModel
def buildSimpleMixtureTree(data, K, iterations, height, branchesPerNode = 2):
    """Recursively build a mixture tree of a fixed height.

    Args:
        data: sequence of count vectors (one per row).
        K: length of the second uniform hyperparameter vector handed to
            MME.MultinomialMixtureModelHyperparams (presumably the number of
            categories per count vector -- TODO confirm).
        iterations: forwarded unchanged to MME.computeDirichletMixture.
        height: number of levels still to build; the recursion passes
            height - 1 to each child.
        branchesPerNode: number of mixture components (children) per node.

    Returns:
        A MultinomialMixtureTree whose mixtureNodes[c] holds the subtree built
        from the rows assigned to component c, or None when height is
        exhausted.
    """
    # "<= 0" rather than "== 0": a negative height would otherwise never hit
    # the base case and recurse without bound.
    if height <= 0:
        return None

    # hyperparameters are fixed here: both vectors are uniform.
    hyperP = MME.MultinomialMixtureModelHyperparams(
        branchesPerNode,
        K,
        [1.0 / branchesPerNode] * branchesPerNode,
        [1.0 / K] * K)
    mixtureModel = MME.computeDirichletMixture(data, hyperP, iterations)

    # Partition the rows by the component the fitted mixture assigns them to.
    # NOTE(review): nothing here guards against an empty partition being
    # passed on to computeDirichletMixture in the child call -- confirm that
    # it tolerates empty data.
    smallerDatasets = [[] for _ in range(hyperP.C)]
    for counts in data:
        c = MME.assignComponentToCounts(counts, mixtureModel)
        smallerDatasets[c].append(counts)

    treeModel = MultinomialMixtureTree(mixtureModel)
    for c, smallerDataset in enumerate(smallerDatasets):
        treeModel.mixtureNodes[c] = buildSimpleMixtureTree(
            smallerDataset, K, iterations, height - 1, branchesPerNode)
    return treeModel
# Tail of a row-reading step whose loop header is outside this fragment: the
# split fields are converted with int and appended to dataset. The statements
# after the commented-out debug prints assign each row to a component with
# MME.assignComponentToCounts, score it with MME.klTest(row, model, c), and
# print "row / model / klDivergence" as tab-separated lines, ordered by
# descending klTest value (the sort key is -x[2]).
# NOTE(review): this fragment appears to have lost its line breaks; as it
# stands on one physical line, everything after the first '#' is a comment.
# Confirm against the original file before editing the code itself.
dataset.append(map(int, splitrow)) #for n in range(0, len(dataset)): # counts = dataset[n] # print str(n) + "\t" + str(MME.assignComponentToCounts(counts, model)) # print file for google docs #print "component\t", #for i in range(0, C): print str(i) + "\t", #print "" #print "prior\t" + "\t".join(map(str, finalModel.mixture)) # #for k in range(0, 168): # print str(k) + "\t", # for i in range(0, C): # print str(finalModel.multinomials[i][k]) + "\t", # print "" rowInfoList = [] N = 0 for row in dataset: c = MME.assignComponentToCounts(row, model) klDiv = MME.klTest(row, model, c) rowInfoList.append([N, c, klDiv]) N += 1 sortedRows = sorted(rowInfoList, key=(lambda x: -x[2])) print "row\tmodel\tklDivergence" for row in sortedRows: print "\t".join(map(str, row))
# Read tab-separated integer count vectors from stdin, one row per line.
for row in sys.stdin:
    # Skip blank lines (e.g. a trailing newline at end of input); int("")
    # would otherwise raise ValueError.
    if not row.strip():
        continue
    splitrow = row.split("\t")
    # Build a real list rather than map(): on Python 3 map() is a one-shot
    # iterator, and each row is traversed more than once below.
    dataset.append([int(field) for field in splitrow])

# For every row record: its index, the mixture component it is assigned to,
# the klTest value against that component's multinomial, and the row total.
rowInfoList = []
for rowIndex, row in enumerate(dataset):
    c = MME.assignComponentToCounts(row, model)
    klDiv = MME.klTest(row, model.multinomials[c])
    rowInfoList.append([rowIndex, c, klDiv, sum(row)])

# Emit the report as TSV. Single-argument parenthesised print behaves the
# same under the Python 2 print statement and the Python 3 print function.
print("row\tmodel\tklDivergence\tNumber of Data Points")
for rowInfo in rowInfoList:
    print("\t".join(map(str, rowInfo)))