def test1(data, features, dsname="test1"):
    """10-fold HSPN evaluation on archetype mixtures of `data`.

    Keeps columns 1..19 of `data`, computes 3 archetypes via
    `getArchetypes`, then per fold learns an SPN on the mixture weights
    and records mean test log-likelihood and learning time.

    Args:
        data: 2-D numpy array of continuous features.
        features: feature names; truncated to match the retained columns.
        dsname: label for the Stats object and the output file
            ``stats_<dsname>.json``.  New keyword with a default — the
            original body referenced an undefined ``dsname`` and raised
            NameError at the ``Stats(name=dsname)`` call.
    """
    # Column 0 is dropped — presumably an id/label column; TODO confirm.
    data = data[:, 1:20]
    features = features[0:data.shape[1]]
    arcs, mixt = getArchetypes(data, 3)
    nrfolds = 10
    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100)
        c.end()
        spn.root.validate()
        # Mean per-instance test log-likelihood for this fold.
        ll = numpy.mean(spn.root.eval(test))
        print(ll)
        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
    stats.save("stats_" + dsname + ".json")
    print(arcs)
def computeNIPS(dimensions):
    """Compare a Dirichlet fit against an SPN on LDA topic mixtures of NIPS.

    Projects the NIPS data onto `dimensions` LDA topics, pushes the topic
    mixtures onto the open simplex, then 10-fold cross-validates
    (a) a maximum-likelihood Dirichlet and (b) a histogram-leaf SPN,
    recording (mean) log-likelihoods in a Stats object saved under
    ``results/nips/``.
    """
    dsname, data, features = getNips()
    # NOTE(review): `n_topics` was renamed `n_components` in scikit-learn
    # 0.19 and removed in 0.21 — this call only works on older sklearn;
    # verify the pinned version.
    lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # Nudge exact 0/1 entries into the open interval so the Dirichlet
        # log-pdf stays finite, then renormalize each row to sum to 1.
        # NOTE(review): mutates its argument in place; `+ 0.0` only copies
        # the final, already re-bound array.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]  # topic proportions lie in [0, 1]
    print(domains)

    @memory.cache  # cache structure learning across runs (joblib)
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=100)
        return spn

    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, 10):
        print(i)
        # Maximum-likelihood Dirichlet baseline fitted on the training fold.
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision',
                                         maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))
    stats.save("results/nips/" + dsname + "-" + str(dimensions) + ".json")
def getHydrochemLL():
    """5-fold comparison of a Dirichlet fit vs. a piecewise-leaf SPN on Hydrochem.

    Fits a maximum-likelihood Dirichlet and a piecewise SPN per fold,
    records (mean) log-likelihoods in a Stats object, prints the average
    per-fold mean SPN log-likelihood, and saves results under
    ``results/hydrochems/``.
    """
    dsname, data, features = getHydrochem()
    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]
    # Domains fixed to [0, 1] — presumably compositional fractions; TODO confirm.
    domains = [[0, 1]] * data.shape[1]
    print(domains)

    families = ['piecewise'] * data.shape[1]
    # families = ['histogram'] * data.shape[1]

    # @memory.cache
    def learn(data, families, mininst, alpha, th):
        # Structure-learn an SPN; `th` is the RDC independence threshold,
        # `mininst` the minimum instances per slice.
        spn = SPN.LearnStructure(
            data, featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=th),
            domains=domains, alpha=alpha, families=families,
            min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)
    alll = []  # per-fold mean SPN log-likelihoods
    for train, test, i in kfolded(data, 5):
        # Maximum-likelihood Dirichlet baseline on the training fold.
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision',
                                         maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
# @memory.cache def llpdn(pdn, test): return pdn.getLogLikelihood(test) for dsname, data, featureNames in [datasets.getCommunitiesAndCrimes()]: #for dsname, data, featureNames in [datasets.getNips(), datasets.getSynthetic(), datasets.getMSNBCclicks(), datasets.getCommunitiesAndCrimes()]: printlocal(dsname) printlocal(featureNames) printlocal(len(featureNames)) printlocal(data.shape) stats = Stats(name=dsname) for train, test, i in kfolded(data, 5): spn = LearnSPN(alpha=0.001, min_instances_slice=80, cluster_prep_method="sqrt", cache=memory).fit_structure(train) printlocal("done") stats.addConfig("PSPN", spn.config) # stats.add("SPN Pois", Stats.LOG_LIKELIHOOD, llspn(spn, test)) printlocal("LL") stats.add("PSPN", Stats.MODEL_SIZE, spn.size()) printlocal("model size") prediction = spnComputeLambdas(spn, test) printlocal("model spnComputeLambdas") #prediction2 = spnComputeLambdasCuda(spn, test)
def getAirPollution(dimensions):
    """Dirichlet vs. SPN comparison on archetype mixtures of UCI Air Quality.

    Filters out columns/rows dominated by the -200 missing-value sentinel,
    projects the cleaned data onto `dimensions` archetypes, normalizes the
    mixture weights onto the open simplex, then 10-fold cross-validates a
    maximum-likelihood Dirichlet and a histogram-leaf SPN, saving results
    under ``results/airpollution/``.  Returns early (None) if archetype
    computation fails.
    """
    dsname, data, features = getAirQualityUCITimeless()
    # -200 appears to be the dataset's missing-value marker — TODO confirm
    # against the loader.
    idxmissing = data == -200
    # Drop columns with 2000 or more missing entries...
    data = data[:, numpy.sum(idxmissing, 0) < 2000]
    idxmissing = data == -200
    # ...then drop any rows that still contain a missing value.
    data = data[numpy.sum(idxmissing, 1) == 0, :]
    idxmissing = data == -200
    print(data.shape)

    _, mixt = getArchetypes(data, dimensions)
    if mixt is None:
        # Archetype computation can fail for this dimensionality; bail out.
        print("no archetypes", dimensions)
        # 0/0
        return

    def normalize(data):
        # Nudge exact 0/1 entries into the open interval so the Dirichlet
        # log-pdf stays finite, then renormalize each row to sum to 1.
        # NOTE(review): mutates its argument in place; `+ 0.0` only copies
        # the final, already re-bound array.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]  # archetype weights lie in [0, 1]
    print(domains)

    @memory.cache  # cache structure learning across runs (joblib)
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # min_instances_slice=int(data.shape[0]*0.01))
                                 min_instances_slice=200)
        return spn

    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, 10):
        print(i)
        # dirichlet_alphas = getDirichlet(train)
        # Maximum-likelihood Dirichlet baseline on the training fold.
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision',
                                         maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))
    stats.save("results/airpollution/" + dsname + "-" + str(dimensions) + ".json")
ax.xaxis.get_major_ticks()], []): tick.label.set_fontsize(16) for lh in leg.legendHandles: lh.set_dashes((None, None)) ltext = leg.get_texts() setp(ltext[0], fontsize=14, color='b') setp(ltext[1], fontsize=14, color='g') setp(ltext[2], fontsize=14, color='r') setp(ltext[3], fontsize=14, color='k') for line, txt in zip(leg.get_lines(), leg.get_texts()): line.set_linewidth(10) line.set_color(txt.get_color()) savefig(fname, bbox_inches='tight', dpi=600) data = {} for fname in glob("*.json"): stats = Stats(fname=fname) data[stats.name] = {} for method in stats.getMethods(Stats.SQUARED_ERROR): data[stats.name][method] = stats.getValues(method, Stats.SQUARED_ERROR) print(data) plotBoxes(data, "Prediction Error", 0.1, "ploterr.pdf", figsize=(6, 4))
dictionary.compactify() data = data[:, newWId] return data, words numpy.random.seed(1237) for dsname, data, featureNames in [ datasets.getNips(), datasets.getSynthetic(), datasets.getMSNBCclicks(), datasets.getCommunitiesAndCrimes() ]: data, words = filterDS(data, featureNames) stats = Stats(name=dsname) nrfolds = 5 for train, test, i in kfolded(data, nrfolds): print(dsname, train.shape, test.shape, i) for topics in [5, 10, 20, 50, 100]: stats.addConfig( "LDA" + str(topics), { "topics": topics, "train documents": train.shape[0], "test documents": test.shape[0], "words": train.shape[1] }) perplexity, tt = ldaperplexity(train, test, topics) stats.add("LDA" + str(topics), Stats.PERPLEXITY, perplexity)