Code example #1
# Assumed context from the enclosing project file (not shown in this
# excerpt): numpy, SPN, Splitting, Stats, Chrono, kfolded, getArchetypes,
# and the dataset name dsname are defined at module level.
def test1(data, features):

    # Drop the first column, keep columns 1..19, and trim the feature
    # list to the remaining width.
    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)
    
    nrfolds = 10
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(train,
                                 featureTypes=["continuous"] * train.shape[1],
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 min_instances_slice=100)
        c.end()
        
        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        
        print(ll)
        
        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
        
        stats.save("stats_" + dsname + ".json")
    
    print(arcs)
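All of these examples draw folds from a kfolded helper that yields (train, test, fold_index); its implementation is not part of this listing. A minimal sketch of what such a helper could look like (hypothetical, not the project's actual code):

import numpy

def kfolded(data, nrfolds, seed=17):
    # Hypothetical stand-in: shuffle the row indices once, then yield
    # (train, test, fold_index) for each of the nrfolds partitions.
    idx = numpy.arange(data.shape[0])
    numpy.random.RandomState(seed).shuffle(idx)
    for i, fold in enumerate(numpy.array_split(idx, nrfolds)):
        mask = numpy.zeros(data.shape[0], dtype=bool)
        mask[fold] = True
        yield data[~mask], data[mask], i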
Code example #2
# Assumed context: numpy, scipy.stats, dirichlet, SPN, Splitting, Stats,
# kfolded, getNips, and memory (likely a joblib.Memory cache) come from
# the enclosing file; LatentDirichletAllocation is scikit-learn's.
def computeNIPS(dimensions):
    dsname, data, features = getNips()

    # Note: n_topics was renamed to n_components in newer scikit-learn.
    lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # Copy so the caller's array is not mutated, then nudge exact 0s
        # and 1s off the boundary so dirichlet.logpdf stays finite.
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        # Renormalize each row to sum to 1 (a point on the simplex).
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=100)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/nips/"+ dsname + "-" + str(dimensions) + ".json")   
Code example #3
def getHydrochemLL():
    dsname, data, features = getHydrochem()

    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]

    domains = [[0, 1]] * data.shape[1]

    print(domains)
    families = ['piecewise'] * data.shape[1]

    #families = ['histogram'] * data.shape[1]
    #@memory.cache
    def learn(data, families, mininst, alpha, th):
        spn = SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=th),
            domains=domains,
            alpha=alpha,
            families=families,
            min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)

    alll = []
    for train, test, i in kfolded(data, 5):
        dirichlet_alphas = dirichlet.mle(train,
                                         method='meanprecision',
                                         maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
Code example #4
File: marginal.py  Project: willis-hu/spyn
# Assumed context: datasets, Stats, kfolded, LearnSPN, memory, printlocal,
# and spnComputeLambdas come from the enclosing marginal.py.
# @memory.cache
def llpdn(pdn, test):
    return pdn.getLogLikelihood(test)


for dsname, data, featureNames in [datasets.getCommunitiesAndCrimes()]:

    #for dsname, data, featureNames in [datasets.getNips(), datasets.getSynthetic(), datasets.getMSNBCclicks(), datasets.getCommunitiesAndCrimes()]:

    printlocal(dsname)
    printlocal(featureNames)
    printlocal(len(featureNames))
    printlocal(data.shape)

    stats = Stats(name=dsname)
    for train, test, i in kfolded(data, 5):
        spn = LearnSPN(alpha=0.001,
                       min_instances_slice=80,
                       cluster_prep_method="sqrt",
                       cache=memory).fit_structure(train)

        printlocal("done")
        stats.addConfig("PSPN", spn.config)
        # stats.add("SPN Pois", Stats.LOG_LIKELIHOOD, llspn(spn, test))
        printlocal("LL")
        stats.add("PSPN", Stats.MODEL_SIZE, spn.size())
        printlocal("model size")
        prediction = spnComputeLambdas(spn, test)
        printlocal("model spnComputeLambdas")
        #prediction2 = spnComputeLambdasCuda(spn, test)
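Note that llpdn is defined but never called in this excerpt; presumably the truncated remainder records it alongside the SPN results, e.g. stats.add("PDN", Stats.LOG_LIKELIHOOD, llpdn(pdn, test)) for a fitted PDN (hypothetical usage).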
Code example #5
def getAirPollution(dimensions):
    dsname, data, features = getAirQualityUCITimeless()

    # In the UCI Air Quality data, -200 marks a missing value.
    idxmissing = data == -200

    # Drop columns with 2000 or more missing entries, then drop any row
    # that still contains a missing entry.
    data = data[:, numpy.sum(idxmissing, 0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing, 1) == 0, :]
    idxmissing = data == -200
    print(data.shape)
    
    _, mixt = getArchetypes(data, dimensions)
    
    if mixt is None:
        print("no archetypes", dimensions)
        return
    
    def normalize(data):
        # Copy so the caller's array is not mutated, then nudge exact 0s
        # and 1s off the boundary so dirichlet.logpdf stays finite.
        mixtd = data.copy()
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        # Renormalize each row to sum to 1 (a point on the simplex).
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # alternative: min_instances_slice=int(data.shape[0] * 0.01)
                                 min_instances_slice=200)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/airpollution/"+ dsname + "-" + str(dimensions) + ".json")   
Code example #6
File: plotPreds.py  Project: willis-hu/spyn
    # Snippet begins mid-function; the loop header below is reconstructed
    # from the visible fragment. Assumed imports: setp and savefig from
    # matplotlib.pyplot, glob from the glob module, Stats and plotBoxes
    # from the project.
    for tick in sum([ax.yaxis.get_major_ticks(),
                     ax.xaxis.get_major_ticks()], []):
        tick.label.set_fontsize(16)

    for lh in leg.legendHandles:
        lh.set_dashes((None, None))

    ltext = leg.get_texts()
    setp(ltext[0], fontsize=14, color='b')
    setp(ltext[1], fontsize=14, color='g')
    setp(ltext[2], fontsize=14, color='r')
    setp(ltext[3], fontsize=14, color='k')

    for line, txt in zip(leg.get_lines(), leg.get_texts()):
        line.set_linewidth(10)
        line.set_color(txt.get_color())

    savefig(fname, bbox_inches='tight', dpi=600)


data = {}
for fname in glob("*.json"):

    stats = Stats(fname=fname)
    data[stats.name] = {}

    for method in stats.getMethods(Stats.SQUARED_ERROR):
        data[stats.name][method] = stats.getValues(method, Stats.SQUARED_ERROR)

print(data)
plotBoxes(data, "Prediction Error", 0.1, "ploterr.pdf", figsize=(6, 4))
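Compatibility note: on recent matplotlib releases, Legend.legendHandles has been renamed to legend_handles (3.7+) and the tick label is accessed as tick.label1, so this snippet targets the older API.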
Code example #7
File: computeperplexity.py  Project: willis-hu/spyn
    # (tail of filterDS, whose definition is truncated in this excerpt:
    # it compacts the gensim-style dictionary and keeps only the
    # surviving word columns)
    dictionary.compactify()
    data = data[:, newWId]
    return data, words


numpy.random.seed(1237)

for dsname, data, featureNames in [
        datasets.getNips(),
        datasets.getSynthetic(),
        datasets.getMSNBCclicks(),
        datasets.getCommunitiesAndCrimes()
]:

    data, words = filterDS(data, featureNames)
    stats = Stats(name=dsname)
    nrfolds = 5
    for train, test, i in kfolded(data, nrfolds):

        print(dsname, train.shape, test.shape, i)

        for topics in [5, 10, 20, 50, 100]:
            stats.addConfig(
                "LDA" + str(topics), {
                    "topics": topics,
                    "train documents": train.shape[0],
                    "test documents": test.shape[0],
                    "words": train.shape[1]
                })
            perplexity, tt = ldaperplexity(train, test, topics)
            stats.add("LDA" + str(topics), Stats.PERPLEXITY, perplexity)