Example #1
import numpy

def test1(data, features, dsname):
    # getArchetypes, kfolded, Chrono, Stats, SPN and Splitting come from the
    # surrounding project codebase; dsname names the dataset in the saved stats
    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)
    
    nrfolds = 10
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(train,
                                 featureTypes=["continuous"] * train.shape[1],
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 min_instances_slice=100)
        c.end()
        
        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        
        print(ll)
        
        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
        
        stats.save("stats_" + dsname + ".json")
    
    print(arcs)
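All five snippets iterate over folds produced by a kfolded helper that is not shown. A minimal sketch consistent with how it is called here (yielding train-split, test-split, fold-index triples) is given below; this is an assumption about its contract, not the project's actual implementation:

import numpy

def kfolded(data, nrfolds):
    # Split the row indices into nrfolds contiguous folds; each fold serves
    # once as the test set while the remaining rows form the training set.
    idx = numpy.arange(data.shape[0])
    for i, test_idx in enumerate(numpy.array_split(idx, nrfolds)):
        train_idx = numpy.setdiff1d(idx, test_idx)
        yield data[train_idx], data[test_idx], i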
Example #2
def rbo(l1, l2, p=0.98):
    """
    Compute the Rank-Biased Overlap (RBO) score of two ranked lists.
    l1 -- first ranked list
    l2 -- second ranked list
    p  -- persistence parameter (the weight placed on deeper ranks)
    """
    if not l1:
        l1 = []
    if not l2:
        l2 = []

    sl, ll = sorted([(len(l1), l1), (len(l2), l2)])
    s, S = sl
    l, L = ll
    if s == 0:
        return 0

    # Calculate the overlaps at ranks 1 through l
    # (the longer of the two lists)
    ss = set()  # elements from the shorter list seen up to depth i
    ls = set()  # elements from the longer list seen up to depth i
    x_d = {0: 0}  # x_d[d]: number of shared elements (overlap) at depth d
    sum1 = 0.0
    for i in range(l):
        x = L[i]
        y = S[i] if i < s else None
        d = i + 1

        # if two elements are identical then
        # we don't need to add to either of the set
        if x == y:
            x_d[d] = x_d[d - 1] + 1.0
        # else add items to respective list
        # and calculate overlap
        else:
            ls.add(x)
            if y is not None:
                ss.add(y)
            x_d[d] = (x_d[d - 1]
                      + (1.0 if x in ss else 0.0)
                      + (1.0 if y in ls else 0.0))

        # accumulate the agreement at depth d, weighted by p**d
        sum1 += x_d[d] / d * pow(p, d)

    sum2 = 0.0
    for i in range(l - s):
        d = s + i + 1
        sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)

    sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)

    # Equation 32 in Webber, Moffat & Zobel (2010): extrapolated RBO
    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext
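A quick sanity check with illustrative inputs: identical rankings score approximately 1.0, and rankings with no elements in common score 0.0:

print(rbo(["a", "b", "c"], ["a", "b", "c"]))  # ~1.0: identical rankings
print(rbo(["a", "b", "c"], ["x", "y", "z"]))  # 0.0: no shared elements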
Example #3
import numpy
import scipy.stats
import dirichlet  # PyPI 'dirichlet' package (Dirichlet maximum-likelihood fitting)

def getHydrochemLL():
    # getHydrochem, SPN, Splitting, Stats and kfolded come from the
    # surrounding project codebase
    dsname, data, features = getHydrochem()

    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]

    domains = [[0, 1]] * data.shape[1]

    print(domains)
    families = ['piecewise'] * data.shape[1]

    # families = ['histogram'] * data.shape[1]  # alternative leaf family
    # @memory.cache
    def learn(data, families, mininst, alpha, th):
        spn = SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=th),
            domains=domains,
            alpha=alpha,
            families=families,
            min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)

    alll = []
    for train, test, i in kfolded(data, 5):
        # Dirichlet maximum-likelihood baseline on the same fold
        dirichlet_alphas = dirichlet.mle(train,
                                         method='meanprecision',
                                         maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
Example #4
import numpy
import scipy.stats
import dirichlet
from sklearn.decomposition import LatentDirichletAllocation

def computeNIPS(dimensions):
    # getNips, SPN, Splitting, Stats, kfolded and memory (a joblib.Memory
    # instance) come from the surrounding project codebase
    dsname, data, features = getNips()

    lda = LatentDirichletAllocation(n_components=dimensions,  # 'n_topics' before scikit-learn 0.19
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # nudge exact 0/1 entries off the boundary (mutates its argument),
        # then renormalize each row so it lies strictly inside the simplex
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 min_instances_slice=100)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/nips/"+ dsname + "-" + str(dimensions) + ".json")   
Example #5
import numpy
import scipy.stats
import dirichlet

def getAirPollution(dimensions):
    # getAirQualityUCITimeless, getArchetypes, SPN, Splitting, Stats, kfolded
    # and memory come from the surrounding project codebase
    dsname, data, features = getAirQualityUCITimeless()

    # the UCI Air Quality dataset marks missing values as -200: drop columns
    # with 2000 or more missing entries, then drop rows that still contain any
    idxmissing = data == -200
    data = data[:, numpy.sum(idxmissing, 0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing, 1) == 0, :]
    print(data.shape)
    
    _, mixt = getArchetypes(data, dimensions)
    
    if mixt is None:
        print("no archetypes", dimensions)
        return
    
    def normalize(data):
        # nudge exact 0/1 entries off the boundary (mutates its argument),
        # then renormalize each row so it lies strictly inside the simplex
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data,
                                 featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # min_instances_slice=int(data.shape[0] * 0.01)
                                 min_instances_slice=200)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/airpollution/"+ dsname + "-" + str(dimensions) + ".json")