# Example 1
# 0
def test1(data, features):
    """Run a 10-fold HSPN evaluation over archetype mixture weights.

    NOTE(review): ``dsname`` is read from an enclosing/module scope that is
    not visible here — confirm it is defined before this runs.
    """
    # Keep only columns 1..19 and trim the feature list to the new width.
    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)

    stats = Stats(name=dsname)

    for train, test, fold_idx in kfolded(mixt, 10):
        timer = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100,
        )
        timer.end()

        spn.root.validate()
        fold_ll = numpy.mean(spn.root.eval(test))
        print(fold_ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, fold_ll)
        stats.add("HSPN", Stats.TIME, timer.elapsed())

        # Persist after every fold so partial results survive a crash.
        stats.save("stats_" + dsname + ".json")

    print(arcs)
# Example 2
# 0
def test2(data, features):
    """Learn an SPN over the 3-dimensional archetype mixture weights of *data*.

    Fix: removed a leftover ``0 / 0`` debug statement that raised
    ZeroDivisionError unconditionally, so ``SPN.LearnStructure`` was
    unreachable dead code.

    NOTE(review): ``features`` is accepted but unused — kept for signature
    compatibility with the sibling test functions.
    """
    arc, mixt = getArchetypes(data, 3)

    print(mixt)

    spn = SPN.LearnStructure(mixt,
                             featureTypes=["continuous"] * mixt.shape[1],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=100)
# Example 3
# 0
def test6(data):
    """Fit a Dirichlet and an SPN to archetype mixture weights and write
    contour plots comparing both densities to 'plots/dirichlet_mle.pdf'.

    NOTE(review): relies on module-level names not visible here
    (getArchetypes, dirichlet, estimate_continuous_domain, memory, SPN,
    Splitting, cartesian, draw_pdf_contours_func, PdfPages, plt,
    markersize) — confirm they are in scope.
    """
    print(data.shape)
    _, mixt = getArchetypes(data, 3)
    
    def normalize(data):
        # Nudge exact 0/1 entries off the simplex boundary so the Dirichlet
        # log-pdf stays finite, then renormalize each row to sum to 1.
        # NOTE(review): mutates its argument in place (mixtd aliases data).
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    
    mixt = normalize(mixt)
    
    # Maximum-likelihood Dirichlet fit on the full (normalized) mixture.
    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)
    
    featureTypes = ["continuous"] * mixt.shape[1]
    
    # Per-feature domains: binned continuous support on [0, 1] (all features
    # are continuous here, so the else branch is currently dead code).
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    print(domains)

    # Cached on disk via joblib-style memory; re-runs with the same input
    # skip structure learning.
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=1,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=50)
        return spn
    
    spn = learn(mixt)
    print(spn)
    
    # 0/0 turns the zero matrix into all-NaN (with a RuntimeWarning): an
    # "everything unobserved" placeholder for SPN sampling.
    spn_samples = numpy.zeros((data.shape[0], 3))/0
    a,spn_samples = spn.root.sample(spn_samples)  # NOTE(review): `a` unused
    
    spn_samples = normalize(spn_samples)
    
    
    
    #dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    def plotDirichlet(data):
        # Dirichlet log-pdf callback for the contour plotter; reshapes the
        # flat query points back into simplex coordinates.
        data = data.reshape(-1, mixt.shape[1])
        result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        return result
    
    def spnpdf(data):
        # SPN log-likelihood callback; returns only the first point's value.
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    # Project simplex points to 2-D cartesian coordinates for plotting.
    xy_all = cartesian(mixt)
    
    
    filename = 'plots/dirichlet_mle.pdf'
    # Remove a stale PDF first; ignore "file not found".
    try:
        import os
        os.remove(filename)
    except OSError:
        pass
    pp = PdfPages(filename)
    
    # Page 1: Dirichlet density with the original data points.
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    # Page 2: Dirichlet density with points sampled from the fitted Dirichlet
    # (fixed seed for reproducibility).
    numpy.random.seed(17)
    mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
    print(dirichlet_alphas)
    xy_samples = cartesian(mixt_samples)
    
    
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, sampled points")
    plt.plot(xy_samples[:, 0], xy_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    # Page 3: SPN density with the original points.
    # NOTE(review): this xy_spn_samples is unused (recomputed below).
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    # Page 4: SPN density with points sampled from the SPN.
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, sampled points")
    plt.plot(xy_spn_samples[:, 0], xy_spn_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    
    
    pp.close()
# Example 4
# 0
def getAirPollution(dimensions):
    """Evaluate Dirichlet vs. SPN log-likelihood, 10-fold, on archetype
    mixture weights of the UCI air-quality data; saves stats to JSON.

    :param dimensions: number of archetypes to extract.
    Returns None early if archetype extraction fails.
    """
    dsname, data, features = getAirQualityUCITimeless()
    
    # -200 is the dataset's missing-value sentinel: drop columns with >= 2000
    # missing entries, then drop any row that still has a missing value.
    idxmissing = data == -200
    
    data = data[:, numpy.sum(idxmissing,0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing,1) == 0, :]
    idxmissing = data == -200
    print(data.shape)
    
    _, mixt = getArchetypes(data, dimensions)
    
    if mixt is None:
        print( "no archetypes", dimensions)
        #0/0
        return
    
    def normalize(data):
        # Nudge exact 0/1 entries off the simplex boundary, renormalize rows
        # to sum to 1. NOTE(review): mutates its argument in place; the +0.0
        # only forces a copy of the final result.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    # Simple [0, 1] domain per mixture dimension.
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    # Disk-cached structure learning; identical folds skip relearning.
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 #min_instances_slice=int(data.shape[0]*0.01))
                                 min_instances_slice=200)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        # Dirichlet baseline: MLE on the training fold, log-pdf on the test fold.
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        # SPN competitor on the same fold.
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/airpollution/"+ dsname + "-" + str(dimensions) + ".json")   
# Example 5
# 0
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    """Compare Dirichlet MLE vs. SPN density estimation on simplex-valued
    mixture weights obtained from *data*; optionally plot (dimensions == 3)
    and return a flat tuple of experiment statistics.

    :param mixttype: "Archetype", "LDA", or "RandomSample".
    NOTE(review): the "RandomSample" branch ends in a deliberate ``0/0``
    crash (debug leftover), and an unrecognized mixttype leaves ``mixt``
    unbound (NameError below) — confirm intended.
    """
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        # NOTE(review): `n_topics` is the old sklearn parameter name
        # (renamed `n_components` in later versions) — verify the pinned
        # sklearn version.
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1,1,1), 20).transpose()
        print(mixt)
        0/0
        
    print(mixt.shape)
    
    def normalize(data):
        # Nudge exact 0/1 entries off the simplex boundary, renormalize rows
        # to sum to 1. NOTE(review): mutates its argument in place; +0.0
        # forces a copy of the returned array.
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    
    mixt = normalize(mixt)
    # Fixed random_state so the 70/30 split is reproducible.
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)


    # Dump splits for external inspection/debugging.
    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)
    #0/0

    featureTypes = ["continuous"] * mixt.shape[1]
    
    # Binned continuous domain per feature (all continuous, so the else
    # branch is currently dead code).
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    
    # Dirichlet baseline fitted on the training split only.
    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn
    #for the good pdf it was 700
    
    
    spn = learn(mixt_train)
    print(spn)
    def spnpdf(data):
        # SPN log-likelihood callback for the contour plotter; returns only
        # the first query point's value.
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    print(dirichlet_alphas)
    
    def plotDirichlet(data):
        # Dirichlet log-pdf callback for the contour plotter.
        # NOTE(review): the bare `except:` + `0/0` is a debug trap — any
        # logpdf failure is converted into a ZeroDivisionError after printing
        # diagnostics; `result` would also be unbound if execution continued.
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except:
            print(normalize(data))
            print(normalize(data)*1.0)
            print(normalize(data)+1)
            print(normalize(data)+0)
            0/0
        return result
    
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()
    
    # Per-sample log-likelihoods under the Dirichlet baseline.
    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit
    
    # Per-sample log-likelihoods under the learned SPN.
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit
    

    
    # 2-simplex case only: write contour plots of both densities to PDF.
    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        
        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        # Remove a stale PDF first; ignore "file not found".
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        
        markersize = 1.0
        # all
#         fig = plt.figure()
#         plt.title("dirichlet, original points")
#         draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
    
        # all
#         fig = plt.figure()
#         plt.title("spn, original points")
#         draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
# 
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        
        # train
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        pp.close()
    
    # Flat key/value tuple of summary statistics for the experiment driver.
    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit) ,
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )