def test_cartesian():
    """Check simplex.cartesian on the three unit barycentric vectors and two edge midpoints."""
    root3 = np.sqrt(3)
    # (barycentric input, expected 2-D cartesian output)
    expectations = (
        ([1, 0, 0], [0, 0]),
        ([0, 1, 0], [1, 0]),
        ([0, 0, 1], [0.5, 0.5 * root3]),
        ([0.5, 0.5, 0], [0.5, 0]),
        ([0, 0.5, 0.5], [0.75, root3 * 0.25]),
    )
    for barycentric_point, expected_xy in expectations:
        actual_xy = simplex.cartesian(np.array(barycentric_point))
        assert_almost(actual_xy, np.array(expected_xy))
Created on 29 May 2017

@author: alejomc
'''
from _functools import reduce

from dirichlet.simplex import barycentric, cartesian
import numpy

import matplotlib.pyplot as plt
import matplotlib.tri as tri
import numpy as np

# Adapted from
# http://blog.bogatron.net/blog/2014/02/02/visualizing-dirichlet-distributions/

# Cartesian positions of the three barycentric unit vectors
# (1,0,0), (0,1,0) and (0,0,1).
corners = np.array([cartesian((1,0,0)), cartesian((0,1,0)), cartesian((0,0,1))])
# Triangulation built from the x and y columns of `corners`.
triangle = tri.Triangulation(corners[:, 0], corners[:, 1])

# Uniformly refined version of `triangle` (subdiv=4).
refiner = tri.UniformTriRefiner(triangle)
trimesh = refiner.refine_triangulation(subdiv=4)

# Disabled preview that would draw the coarse and the refined mesh side by side:
# plt.figure(figsize=(8, 4))
# for (i, mesh) in enumerate((triangle, trimesh)):
#     plt.subplot(1, 2, i+ 1)
#     plt.triplot(mesh)
#     plt.axis('off')
#     plt.axis('equal')
def test6(data):
    """Fit a Dirichlet and an SPN to archetype mixtures of *data* and plot both.

    The mixture weights come from ``getArchetypes(data, 3)``. A Dirichlet is
    fitted to them with ``dirichlet.mle`` and an SPN is learned from them with
    ``SPN.LearnStructure``. Four contour figures are written to
    ``plots/dirichlet_mle.pdf`` (any existing file is removed first): the
    Dirichlet density with the original and with Dirichlet-sampled points,
    and the SPN density with the original and with SPN-sampled points.

    Args:
        data: 2-D array; ``data.shape[0]`` sets the number of samples drawn.

    Returns:
        None. The function only prints diagnostics and writes the PDF.
    """
    import os

    print(data.shape)
    _, mixt = getArchetypes(data, 3)

    def normalize(data):
        """Return a row-normalised copy of *data* with exact 0s/1s nudged inward.

        Works on a copy so that the caller's array (including the grid arrays
        handed in by the plotting helper) is never modified.
        """
        mixtd = numpy.array(data, copy=True)
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        return numpy.divide(mixtd, mixtnorm[:, None])

    mixt = normalize(mixt)

    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)

    featureTypes = ["continuous"] * mixt.shape[1]

    # One sorted domain array per mixture column.
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        domains.append(domain)

    print(domains)

    @memory.cache
    def learn(data):
        """Learn an SPN from *data* using the feature types and domains above."""
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=1,
                                 min_instances_slice=50)
        return spn

    spn = learn(mixt)
    print(spn)

    # All-NaN buffer handed to the sampler; built directly instead of via
    # zeros/0, which produced the same values but emitted a RuntimeWarning.
    spn_samples = numpy.full((data.shape[0], 3), numpy.nan)
    _, spn_samples = spn.root.sample(spn_samples)
    spn_samples = normalize(spn_samples)

    def plotDirichlet(data):
        """Log-density of the fitted Dirichlet at the (normalised) points."""
        data = data.reshape(-1, mixt.shape[1])
        return scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)),
                                            alpha=dirichlet_alphas)

    def spnpdf(data):
        """First element of the SPN root evaluation at the (normalised) points."""
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    xy_all = cartesian(mixt)

    filename = 'plots/dirichlet_mle.pdf'
    try:
        os.remove(filename)
    except OSError:
        # Nothing to remove (or not removable); PdfPages below will report
        # any real problem with the path.
        pass
    pp = PdfPages(filename)

    def save_figure(density, title, points):
        """Draw one contour figure with *points* overlaid and add it to the PDF."""
        fig = plt.figure()
        draw_pdf_contours_func(density)
        plt.title(title)
        # NOTE(review): `markersize` is not defined in this function; it is
        # presumably a module-level name - confirm, otherwise this raises.
        plt.plot(points[:, 0], points[:, 1], 'ro', markersize=markersize)
        plt.colorbar()
        pp.savefig(fig)

    # Close the PDF even if one of the plots fails.
    try:
        save_figure(plotDirichlet, "dirichlet trained on all, original points", xy_all)

        numpy.random.seed(17)
        mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
        print(dirichlet_alphas)
        xy_samples = cartesian(mixt_samples)
        save_figure(plotDirichlet, "dirichlet trained on all, sampled points", xy_samples)

        xy_spn_samples = cartesian(spn_samples)
        save_figure(spnpdf, "spn trained on all, original points", xy_all)
        save_figure(spnpdf, "spn trained on all, sampled points", xy_spn_samples)
    finally:
        pp.close()
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    """Compare a Dirichlet fit and an SPN fit on simplex-valued mixtures.

    Mixture weights are derived from *data* according to *mixttype*, split
    70/30 into train and test parts (both written to ``mixt_train.csv`` and
    ``mixt_test.csv``), and then modelled twice: with a Dirichlet fitted by
    ``dirichlet.mle`` and with an SPN learned by ``SPN.LearnStructure``.
    When ``dimensions == 3`` four contour figures are also written to
    ``plots/<dsname>_<mixttype>.pdf``.

    Args:
        dsname: Dataset name, used in the PDF file name and in the result.
        data: 2-D array of input rows.
        dimensions: Number of archetypes / topics, i.e. mixture columns.
        mixttype: ``"Archetype"``, ``"LDA"`` or ``"RandomSample"``.
        min_instances_slice: Forwarded to ``SPN.LearnStructure``.

    Returns:
        An empty tuple when archetype extraction yields no mixture, otherwise
        a flat tuple alternating labels and values: name, size, type, dims,
        mean train/test log-likelihoods of both models and the SPN's sum-node,
        product-node and layer counts.

    Raises:
        ValueError: If *mixttype* is not one of the supported names.
        ZeroDivisionError: Always, for ``mixttype == "RandomSample"``.
    """
    import os

    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        # NOTE(review): newer scikit-learn releases name this argument
        # `n_components` - confirm against the installed version.
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1,1,1), 20).transpose()
        print(mixt)
        # Deliberate debugging halt kept from the original: this branch stops
        # here with ZeroDivisionError.
        # NOTE(review): placement inside this branch is reconstructed from a
        # whitespace-mangled source - confirm.
        0/0
    else:
        # Previously an unknown type fell through and failed later on an
        # unbound `mixt`; fail early with a clear message instead.
        raise ValueError("unknown mixttype: %r" % (mixttype,))

    print(mixt.shape)

    def normalize(data):
        """Return a row-normalised copy of *data* with exact 0s/1s nudged inward.

        Works on a copy so that the caller's array (including the grid arrays
        handed in by the plotting helper) is never modified.
        """
        mixtd = numpy.array(data, copy=True)
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0

    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)

    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)

    featureTypes = ["continuous"] * mixt.shape[1]

    # One sorted domain array per mixture column (computed on the full set).
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])
        domains.append(domain)

    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    def learn(data):
        """Learn a histogram-leaf SPN from *data* (result caching is disabled)."""
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=min_instances_slice)
        return spn

    # Original author's note: "for the good pdf it was 700".
    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        """First element of the SPN root evaluation at the (normalised) points."""
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    print(dirichlet_alphas)

    def plotDirichlet(data):
        """Log-density of the fitted Dirichlet at the (normalised) points."""
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)),
                                                  alpha=dirichlet_alphas)
        except Exception:
            # Show the offending input, then surface the real error instead
            # of masking it behind a forced ZeroDivisionError.
            print(normalize(data))
            raise
        return result

    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()

    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit

    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    # The two frames are filled but not used further in this function.
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit

    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)

        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            os.remove(filename)
        except OSError:
            # Nothing to remove (or not removable); PdfPages below will
            # report any real problem with the path.
            pass
        pp = PdfPages(filename)

        markersize = 1.0

        # (title, density callback, overlaid points) for each page, in order.
        figures = (
            ("Dirichlet, train points", plotDirichlet, xy_train),
            ("Dirichlet, test points", plotDirichlet, xy_test),
            ("SPN, train points", spnpdf, xy_train),
            ("SPN, test points", spnpdf, xy_test),
        )
        # Close the PDF even if one of the plots fails.
        try:
            for title, density, points in figures:
                fig = plt.figure()
                plt.title(title)
                draw_pdf_contours_func2(density, vmin=-2, vmax=12)
                plt.plot(points[:, 0], points[:, 1], 'ro', markersize=markersize)
                fig.tight_layout()
                pp.savefig(fig)
        finally:
            pp.close()

    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit),
            "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit),
            "dir_test_LL", numpy.mean(dtest_fit),
            "spn_#_sum_nodes", spn.n_sum_nodes(),
            "spn_#_prod_nodes", spn.n_prod_nodes(),
            "spn_#_layers", spn.n_layers()
            )