Ejemplo n.º 1
0
def test_cartesian():
    """Check ``simplex.cartesian`` on the three vertices and two edge midpoints.

    Each case pairs a barycentric input with the 2-D point the conversion is
    expected to produce; the comparison is delegated to ``assert_almost``.
    """
    root3 = np.sqrt(3)
    cases = (
        # (barycentric input, expected 2-D output)
        ([1, 0, 0], [0, 0]),
        ([0, 1, 0], [1, 0]),
        ([0, 0, 1], [0.5, 0.5*root3]),
        ([0.5, 0.5, 0], [0.5, 0]),
        ([0, 0.5, 0.5], [0.75, root3*0.25]),
    )
    for bary_point, expected_xy in cases:
        assert_almost(simplex.cartesian(np.array(bary_point)),
                      np.array(expected_xy))
Ejemplo n.º 2
0
Created on 29 May 2017

@author: alejomc
'''
from _functools import reduce

from dirichlet.simplex import barycentric, cartesian
import numpy

import matplotlib.pyplot as plt
import matplotlib.tri as tri
import numpy as np


# Adapted from http://blog.bogatron.net/blog/2014/02/02/visualizing-dirichlet-distributions/
# Cartesian image of each of the three unit barycentric points, stacked row-wise.
corners = np.array(
    [cartesian(vertex) for vertex in ((1, 0, 0), (0, 1, 0), (0, 0, 1))]
)

# Triangulation built from the first (x) and second (y) column of the corners.
triangle = tri.Triangulation(corners[:, 0], corners[:, 1])

# Uniformly refined version of that triangulation (subdiv=4).
refiner = tri.UniformTriRefiner(triangle)
trimesh = refiner.refine_triangulation(subdiv=4)

# plt.figure(figsize=(8, 4))
# for (i, mesh) in enumerate((triangle, trimesh)):
#     plt.subplot(1, 2, i+ 1)
#     plt.triplot(mesh)
#     plt.axis('off')
#     plt.axis('equal')
    

Ejemplo n.º 3
0
def test6(data):
    """Fit a Dirichlet and an SPN to 3-component mixtures of ``data`` and plot both.

    The mixtures come from ``getArchetypes(data, 3)``.  A Dirichlet is fitted
    with ``dirichlet.mle`` and an SPN with ``SPN.LearnStructure``; samples are
    drawn from both models and four contour pages are written to
    ``plots/dirichlet_mle.pdf`` (any existing file of that name is removed
    first).

    Args:
        data: 2-D array; only ``data.shape`` is used directly here, the
            values are passed to ``getArchetypes``.

    Side effects:
        Prints diagnostics, seeds the global NumPy RNG with 17 and writes the
        PDF file.  Nothing is returned.
    """
    import os

    print(data.shape)
    _, mixt = getArchetypes(data, 3)

    def normalize(data):
        """Return a rescaled copy of ``data`` whose rows sum to 1.

        Entries exactly equal to 1 or 0 are nudged by 1e-7 first.  The input
        array is left untouched (a copy is modified), so arrays handed in by
        callers such as the plotting helper are not altered.
        """
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        return numpy.divide(mixtd, mixtnorm[:, None])

    mixt = normalize(mixt)

    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)

    featureTypes = ["continuous"] * mixt.shape[1]

    # Every feature is declared continuous above, so each domain is the sorted
    # key set returned by estimate_continuous_domain for that column.
    domains = []
    for i in range(mixt.shape[1]):
        fd = estimate_continuous_domain(mixt, i, range=(0.0, 1.0), binning_method=400)
        domains.append(numpy.array(sorted(fd.keys())))
    print(domains)

    @memory.cache
    def learn(data):
        """Learn the SPN structure for ``data`` (result cached by ``memory``)."""
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=1,
                                 min_instances_slice=50)
        return spn

    spn = learn(mixt)
    print(spn)

    # NaN-filled template handed to the sampler; built directly instead of via
    # a 0/0 division so no RuntimeWarning is emitted.
    # presumably NaN marks the entries to be sampled -- TODO confirm
    spn_samples = numpy.full((data.shape[0], 3), numpy.nan)
    _, spn_samples = spn.root.sample(spn_samples)
    spn_samples = normalize(spn_samples)

    def plotDirichlet(data):
        """Dirichlet log-density of the (normalized) rows of ``data``."""
        data = data.reshape(-1, mixt.shape[1])
        return scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)),
                                            alpha=dirichlet_alphas)

    def spnpdf(data):
        """First element of the SPN evaluation on the (normalized) rows of ``data``."""
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    def save_page(pdf, density, title, points):
        """Draw ``density`` contours with ``points`` on top and append the page to ``pdf``."""
        fig = plt.figure()
        draw_pdf_contours_func(density)
        plt.title(title)
        # NOTE(review): ``markersize`` is not defined inside this function; it
        # is expected to exist at module level -- confirm.
        plt.plot(points[:, 0], points[:, 1], 'ro', markersize=markersize)
        plt.colorbar()
        pdf.savefig(fig)

    xy_all = cartesian(mixt)

    filename = 'plots/dirichlet_mle.pdf'
    try:
        os.remove(filename)
    except OSError:
        # Best effort: the file usually just does not exist yet.
        pass

    # The context manager closes the PDF even if one of the plots fails.
    with PdfPages(filename) as pp:
        save_page(pp, plotDirichlet, "dirichlet trained on all, original points", xy_all)

        numpy.random.seed(17)
        mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
        print(dirichlet_alphas)
        xy_samples = cartesian(mixt_samples)

        save_page(pp, plotDirichlet, "dirichlet trained on all, sampled points", xy_samples)

        xy_spn_samples = cartesian(spn_samples)
        save_page(pp, spnpdf, "spn trained on all, original points", xy_all)
        save_page(pp, spnpdf, "spn trained on all, sampled points", xy_spn_samples)
Ejemplo n.º 4
0
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    """Compare a Dirichlet fit and an SPN fit on mixture weights derived from ``data``.

    The mixture weights are obtained according to ``mixttype``, normalized so
    each row sums to 1, and split 70/30 into train and test parts
    (``random_state=42``).  A Dirichlet is fitted with ``dirichlet.mle`` and an
    SPN with ``SPN.LearnStructure`` on the training part, and both are
    evaluated on train and test.

    Args:
        dsname: Dataset label; used in the plot file name and in the result.
        data: Input matrix passed to the mixture extraction step.
        dimensions: Number of mixture components; plots are only produced
            when this equals 3.
        mixttype: ``"Archetype"``, ``"LDA"`` or ``"RandomSample"``.
        min_instances_slice: Forwarded to ``SPN.LearnStructure``
            (original author's note: the good PDF was produced with 700).

    Returns:
        An empty tuple when archetype extraction yields ``None``; otherwise a
        flat tuple alternating string labels and values (name, size, type,
        dims, mean train/test scores of both models, SPN node/layer counts).

    Raises:
        ValueError: If ``mixttype`` is not one of the supported values.
        ZeroDivisionError: Always for ``"RandomSample"``, which is a debugging
            stub that prints a sample and stops.

    Side effects:
        Writes ``mixt_train.csv`` and ``mixt_test.csv`` to the working
        directory, prints diagnostics, and for ``dimensions == 3`` rewrites
        ``plots/<dsname>_<mixttype>.pdf``.
    """
    import os

    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        # NOTE(review): recent scikit-learn releases name this parameter
        # ``n_components`` -- confirm against the installed version.
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1, 1, 1), 20).transpose()
        print(mixt)
        # The original aborted here with a literal 0/0; the exception type is
        # kept so existing callers see the same failure, now with a message.
        raise ZeroDivisionError("RandomSample is a debugging stub and stops here")
    else:
        # Previously an unknown type surfaced later as an UnboundLocalError.
        raise ValueError("unknown mixttype: %r" % (mixttype,))

    print(mixt.shape)

    def normalize(data):
        """Return a rescaled copy of ``data`` whose rows sum to 1.

        Entries exactly equal to 1 or 0 are nudged by 1e-7 first.  A copy is
        modified, so the caller's array is not altered.
        """
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)

    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)

    featureTypes = ["continuous"] * mixt.shape[1]

    # Every feature is declared continuous above, so each domain is the sorted
    # key set returned by estimate_continuous_domain for that column.
    domains = []
    for i in range(mixt.shape[1]):
        fd = estimate_continuous_domain(mixt, i, range=(0.0, 1.0), binning_method=400)
        domains.append(numpy.array(sorted(fd.keys())))

    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    def learn(data):
        """Learn the SPN structure for ``data`` with histogram leaves."""
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=min_instances_slice)
        return spn

    spn = learn(mixt_train)
    print(spn)

    def spnpdf(data):
        """First element of the SPN evaluation on the (normalized) rows of ``data``."""
        data = data.reshape(-1, mixt.shape[1])
        return spn.root.eval(normalize(data))[0]

    print(dirichlet_alphas)

    def plotDirichlet(data):
        """Dirichlet log-density of the (normalized) rows of ``data``."""
        points = normalize(data.reshape(-1, mixt.shape[1]))
        try:
            return scipy.stats.dirichlet.logpdf(numpy.transpose(points),
                                                alpha=dirichlet_alphas)
        except Exception:
            # Show the offending input, then let the real error propagate
            # instead of masking it with an unrelated ZeroDivisionError.
            print(points)
            raise

    # NOTE(review): these frames are filled but never returned or persisted.
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()

    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit

    # NOTE(review): spnpdf indexes the eval result with [0] while these calls
    # use it whole -- confirm which form is intended.
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit

    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)

        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            os.remove(filename)
        except OSError:
            # Best effort: the file usually just does not exist yet.
            pass

        markersize = 1.0

        def save_page(pdf, title, density, points):
            """Draw ``density`` contours with ``points`` on top and append the page to ``pdf``."""
            fig = plt.figure()
            plt.title(title)
            draw_pdf_contours_func2(density, vmin=-2, vmax=12)
            plt.plot(points[:, 0], points[:, 1], 'ro', markersize=markersize)
            fig.tight_layout()
            pdf.savefig(fig)

        # The context manager closes the PDF even if one of the plots fails.
        with PdfPages(filename) as pp:
            save_page(pp, "Dirichlet, train points", plotDirichlet, xy_train)
            save_page(pp, "Dirichlet, test points", plotDirichlet, xy_test)
            save_page(pp, "SPN, train points", spnpdf, xy_train)
            save_page(pp, "SPN, test points", spnpdf, xy_test)

    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit),
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )