# Shared imports for the functions below; helper routines such as printModel,
# printStructure, get_random_pi, checkComponentRedundancy, mixtureKLdistance,
# mixtureMaxKLdistance and the Simplex2D plotting classes are assumed to be
# defined elsewhere in the same package.
import copy
import random
import time

import numpy
import pylab

import mixture
import fullEnumerationExhaustive
import setPartitions


def product_distribution_sym_kl_dist(p1, p2):
    """
    Returns the symmetric KL distance of two ProductDistribution objects,
    defined as the sum of the component-wise KL distances.
    """
    d = 0.0
    for j in range(p1.dist_nr):
        d += mixture.sym_kl_dist(p1[j], p2[j])
    return d
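
# Minimal usage sketch: builds two ProductDistribution objects over the same
# feature types and evaluates their component-wise symmetric KL distance.
# The concrete parameter values are illustrative assumptions.
def _example_product_distribution_sym_kl_dist():
    p1 = mixture.ProductDistribution([mixture.NormalDistribution(0.0, 1.0),
                                      mixture.DiscreteDistribution(4, [0.25] * 4)])
    p2 = mixture.ProductDistribution([mixture.NormalDistribution(1.0, 1.0),
                                      mixture.DiscreteDistribution(4, [0.1, 0.2, 0.3, 0.4])])
    print 'sym. KL distance:', product_distribution_sym_kl_dist(p1, p2)
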
def scoreStructureLearning(N,
                           gen,
                           delta,
                           seed=None,
                           silent=False,
                           skipAfterRNGcalls=False):
    """
    If skipAfterRNGcalls is True, the function terminates after all calls to RNGs have been done.
    """

    #print 'start scoring'

    #    if seed != None:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        print '*** given seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,999999999)
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        print '*** random seed=',seed

    data = gen.sampleDataSet(N)

    # XXX update NormalGammaPrior hyperparameters
    for j in range(gen.dist_nr):
        if isinstance(gen.prior.compPrior[j], mixture.NormalGammaPrior):
            gen.prior.compPrior[j].setParams(data.getInternalFeature(j), gen.G)

    gen.prior.structPriorHeuristic(delta, data.N)

    print '\nupdating generating model structure:'
    print 'before:'
    print gen.leaders
    print gen.groups

    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(gen,
                                                                     data,
                                                                     silent=1)

    print '\nafter:'
    print gen.leaders
    print gen.groups

    if silent == False:
        printModel(gen, 'generating model')

    m = copy.copy(gen)
    # reset structure
    m.initStructure()

    # training parameters
    nr_rep = 40  # XXX
    #nr_rep = 4 # XXX

    nr_steps = 400
    em_delta = 0.6

    print 'start training'
    print 'EM repeats:', nr_rep

    m.randMaxTraining(data, nr_rep, nr_steps, em_delta, silent=1, rtype=0)
    print 'finished training'

    if skipAfterRNGcalls == True:
        print '*** Skipping !'
        return numpy.zeros(4)

#    # check for consistency of component indices (identifiability issues)
#    bad = 0
    if silent == False:
        cmap = {}

        for j in range(gen.dist_nr):
            print '\nfeature:', j

            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                      gen.components[i2][j])
                print i1, '->', kldists.argmin(), map(
                    lambda x: '%.2f' % float(x), kldists)  # kldists.min()

#        for i1 in range(m.G):
#            print
#            cdists = numpy.zeros(m.G)
#            for i2 in range(m.G):
#                cdists[i2] = product_distribution_sym_kl_dist(m.components[i1], gen.components[i2])
#                #print i1,i2,product_distribution_sym_kl_dist(m.components[i1], gen.components[i2])
#
#            print i1,'maps to', numpy.argmin(cdists), cdists.tolist()
#            amin = numpy.argmin(cdists)
#            if not amin == i1:     # minimal KL distance should occur at equal indices in gen and m
#                bad = 1
#                cmap[i1] = amin

#    if bad:
#
#
#
#        # XXX check whether cmap defines new unambiguous ordering
#
#        # check whether components have switched positions
#        reorder = 0
#        order = range(m.G)
#        try:
#
#            #print cmap
#
#            for i1 in cmap.keys():
#                order[i1] = cmap[i1]
#
#            #print order
#            #print set(order)
#            #print  list(set(order))
#
#            if len(set(order)) == m.G:
#                reorder = 1
#        except KeyError:
#            pass
#        except AssertionError:
#            pass
#
#        if reorder:
#            print '** new order', order
#
#            m.reorderComponents(order)
#
#        else:
#
#
#            #print cdists
#            print i1,'maps to', numpy.argmin(cdists)
#
#            print 'Failed matching.'
#
#            print 'generating model gen:'
#            print gen
#
#            print 'trained model m:'
#            print m
#
#            raise ValueError

#    mtest =copy.copy(gen)
#    ch = mtest.updateStructureBayesian(data,silent=1)
#    print '\nTEST:',ch
#    for j in range(m.dist_nr):
#        print j,mtest.leaders[j], mtest.groups[j]

#print m.prior

    print '-----------------------------------------------------------------------'
    print '\n True structure:'
    print 'True model post:', mixture.get_loglikelihood(
        gen, data) + gen.prior.pdf(gen)
    #for j in range(m.dist_nr):
    #    print j,gen.leaders[j], gen.groups[j]
    print gen.leaders
    print gen.groups

    if silent == False:
        printModel(m, 'trained model')

    m1 = copy.copy(m)
    t0 = time.time()
    #print '\n\n################# TOPDOWN #####################'
    m1.updateStructureBayesian(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m1.mapEM(data,40,0.1)
    print '\nTop down (', str(time2), 's ):'
    print m1.leaders
    print m1.groups
    print 'Top down model post:', mixture.get_loglikelihood(
        m1, data) + m1.prior.pdf(m1)
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m1)  # structureEditDistance(gen,m1)

    if silent == False:
        printModel(m1, 'top down model')

    #print '#############################'

    #print '\n\n################# FULL FixedOrder #####################'
    m2 = copy.copy(m)
    t0 = time.time()
    m2.updateStructureBayesianFullEnumerationFixedOrder(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m2.mapEM(data,40,0.1)

    #    print
    #    for j in range(m2.dist_nr):
    #        print j,m2.leaders[j], m2.groups[j]
    print '\nFull enumeration Fixed Order  (', str(time2), 's ):'
    print m2.leaders
    print m2.groups
    print 'Full fixed order model post:', mixture.get_loglikelihood(
        m2, data) + m2.prior.pdf(m2)
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m2) # structureEditDistance(gen,m1)

    if silent == False:
        printModel(m2, 'full fixed model')

    #print '\n\n################# BOTTOM UP #####################'
    m3 = copy.copy(m)
    t0 = time.time()
    m3.updateStructureBayesianBottomUp(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m3.mapEM(data,40,0.1)
    #    print
    #    for j in range(m3.dist_nr):
    #        print j,m3.leaders[j], m3.groups[j]
    print '\nBottom up (', str(time2), 's ):'
    print m3.leaders
    print m3.groups
    print 'Bottom up model post:', mixture.get_loglikelihood(
        m3, data) + m3.prior.pdf(m3)
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m3) # structureEditDistance(gen,m1)

    if silent == False:
        printModel(m3, 'bottom up model')

    #print '\n\n################# FULL enumeration #####################'
    m4 = copy.copy(m)
    t0 = time.time()
    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(m4,
                                                                     data,
                                                                     silent=0)
    t1 = time.time()
    time2 = t1 - t0
    # m4.mapEM(data,40,0.1)
    #    print
    #    for j in range(m4.dist_nr):
    #        print j,m4.leaders[j], m4.groups[j]
    print '\nFull enumeration: (', str(time2), 's )'
    print m4.leaders
    print m4.groups
    print 'Full enumeration model post:', mixture.get_loglikelihood(
        m4, data) + m4.prior.pdf(m4)
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m4)

    if silent == False:
        printModel(m4, 'full enumeration model')

    print '-----------------------------------------------------------------------'

    #    dtop = mixture.structureAccuracy(gen,m1)
    #    dfull_fixed = mixture.structureAccuracy(gen,m2)
    #    dfull = mixture.structureAccuracy(gen,m4)
    #    dbottom = mixture.structureAccuracy(gen,m3)

    logp_top = mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
    logp_full_fixed = mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
    logp_full = mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
    logp_bottom = mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)

    if (not (round(logp_top, 3) <= round(logp_full, 3))
            or not (round(logp_full_fixed, 3) <= round(logp_full, 3))
            or not (round(logp_bottom, 3) <= round(logp_full, 3))):
        raise ValueError

    return numpy.array([logp_top, logp_full_fixed, logp_full, logp_bottom])
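
# Illustrative driver, a sketch under assumed parameter values: samples a
# random CSI generating model with getRandomCSIMixture_conditionalDists
# (defined below) and scores the four structure learning variants on data
# drawn from it.
def _example_scoreStructureLearning():
    gen = getRandomCSIMixture_conditionalDists(3, 4, 0.1, 10.0, M=4)
    post = scoreStructureLearning(500, gen, delta=0.1, silent=True)
    print 'posteriors [top, full_fixed, full, bottom]:', post
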
def matchModelStructures(gen, m):
    """
    Checks whether m1 and m2 are consistent in the sense that for each
    leader in m1, there is a number of distributions in m2 which take minimum
    distance to the leader as the number of distributions in the group in m1.
    
    Used to check whether a the parameteric EM has obviously captured the CSI structure of
    the generating model.
    """
    #print '**** matchModelStructures'

    gen_csi = []
    for j in range(gen.dist_nr):
        gen_csi.append({})
        for l in gen.leaders[j]:
            gen_csi[j][tuple([l] + gen.groups[j][l])] = []

    #print gen_csi

    for j in range(gen.dist_nr):
        #print 'feature:',j
        for i1 in range(m.G):
            kldists = numpy.zeros(m.G)
            for i2 in range(m.G):
                kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                  gen.components[i2][j])
            cg = numpy.where(kldists == kldists.min())[0]

            gen_csi[j][tuple(cg)].append(i1)

    #print gen_csi
    # check easy case: all components match in gen and m
    match = 1
    for j in range(gen.dist_nr):
        for cg in gen_csi[j]:
            if cg != tuple(gen_csi[j][cg]):
                match = 0
    if match:
        #print 'Simple match !'
        return 1

    # check whether component indices have changed but the structures are consistent otherwise
    cmaps = []
    for j in range(gen.dist_nr):
        cmaps.append({})
        for i1 in range(m.G):
            kldists = numpy.zeros(m.G)
            for i2 in range(m.G):
                kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                  gen.components[i2][j])
            cg = numpy.where(kldists == kldists.min())[0]
            cmaps[j][i1] = cg

    #print cmaps

    gen_compred = checkComponentRedundancy(gen.leaders, gen.groups)

    #print 'gen_compred', gen_compred

    if len(gen_compred) > 1:
        return 0  # XXX case not covered yet

    match = 1
    m_to_gen = {}
    for i in range(m.G):
        m_to_gen[i] = -1

    for j in range(gen.dist_nr):
        for i in cmaps[j]:
            if len(cmaps[j][i]) == 1:
                if m_to_gen[i] == -1:
                    m_to_gen[i] = cmaps[j][i][0]
                else:
                    if m_to_gen[i] == cmaps[j][i][0]:
                        continue
                    else:
                        match = 0
                        break
    #print m_to_gen

    if len(gen_compred) == 0:
        for k in m_to_gen:
            #print m_to_gen[k]
            if m_to_gen[k] == -1:
                return 0
        return 1

    for k in m_to_gen:
        if m_to_gen[k] == -1:  # no assignment so far
            for j in range(gen.dist_nr):
                #print gen_compred
                #print k, cmaps[j][k].tolist(),gen_compred[0]
                if not cmaps[j][k].tolist() == gen_compred[0]:

                    match = 0

    #print '*** match=', match

    return match
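
# Sanity-check sketch under assumed parameter values: a CSI model compared
# against an identical copy of itself should trivially match its own
# structure.
def _example_matchModelStructures():
    gen = getRandomCSIMixture_conditionalDists(3, 2, 0.1, 10.0, M=4)
    m = copy.copy(gen)
    print 'structures match:', matchModelStructures(gen, m)
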
def getRandomMixture(G,
                     p,
                     KL_lower,
                     KL_upper,
                     dtypes='discgauss',
                     M=4,
                     seed=None):
    """
    Samples a random MixtureModel with G components over p features. For each
    feature, candidate component distributions are rejection-sampled until all
    pairwise symmetric KL distances lie in [KL_lower, KL_upper].
    """

    #    if seed:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        #print '*** seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,9000000)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        random.seed(seed)
    #        #print '*** seed=',seed

    #M = 4  # Alphabet size for discrete distributions

    min_sigma = 0.1  # minimal std for Normal
    max_sigma = 1.0  # maximal std for Normal
    min_mu = -5.0  # minimal mean
    max_mu = 8.0  # maximal mean

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    #print featureTypes

    C = []
    for j in range(p):
        c_j = []
        for i in range(G):
            #print i,j
            if featureTypes[j] == 0:
                acc = 0
                while acc == 0:
                    cand = mixture.DiscreteDistribution(
                        M, mixture.random_vector(M))

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                            break

                c_j.append(cand)
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)

                    cand = mixture.NormalDistribution(mu, sigma)

                    #print 'cand:',cand

                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d, cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0

                c_j.append(cand)

            else:
                raise RuntimeError

        C.append(c_j)


#    print '\n'
#    for cc in C:
#        print cc

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.1)

    m = mixture.MixtureModel(G, pi, comps, struct=1)
    m.updateFreeParams()

    return m
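
# Usage sketch with assumed parameters: draws a random 3-component mixture
# over 5 features whose per-feature component distributions are pairwise
# separated by a symmetric KL distance in [0.1, 10.0], then samples data.
def _example_getRandomMixture():
    m = getRandomMixture(3, 5, 0.1, 10.0)
    data = m.sampleDataSet(200)
    print 'sampled', data.N, 'cases from a', m.G, 'component mixture'
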
def getRandomCSIMixture_conditionalDists(G,
                                         p,
                                         KL_lower,
                                         KL_upper,
                                         M=8,
                                         dtypes='discgauss',
                                         seed=None,
                                         fullstruct=False,
                                         disc_sampling_dist=None):
    """
    Samples a random BayesMixtureModel with G components over p features and a
    randomly drawn CSI structure: parameters are shared within each structure
    group, and parameters of different groups are rejection-sampled until
    their pairwise symmetric KL distances lie in [KL_lower, KL_upper].
    """

    #    if seed:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        #print '*** seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,9999999)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        random.seed(seed)
    #        #print '*** seed=',seed

    if disc_sampling_dist == None:
        discSamp = mixture.DirichletPrior(M, [1.0] * M)  # uniform sampling
    else:
        discSamp = disc_sampling_dist

    min_sigma = 0.3  # minimal std for Normal
    max_sigma = 5.0  # maximal std for Normal
    min_mu = -25.0  # minimal mean
    max_mu = 25.0  # maximal mean

    assert dtypes in ['disc', 'gauss', 'discgauss']

    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p
    elif dtypes == 'discgauss':
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [random.choice((0, 1)) for i in range(p)]
    else:
        raise TypeError

    #print featureTypes

    # generate random CSI structures

    if G < 15:
        P = setPartitions.generate_all_partitions(
            G)  # XXX too slow for large G
    #print P

    C = []

    leaders = []
    groups = []
    for j in range(p):
        c_j = {}

        leaders_j = []
        groups_j = {}

        if fullstruct == True:
            struct_j = [(i, ) for i in range(G)]

        elif G < 15:
            struct_j = random.choice(P)
        else:
            print 'WARNING: improper structure sampling !'
            struct_j = setPartitions.get_random_partition(G)

        #print '\nstruct',j,struct_j

        for i, grp in enumerate(struct_j):

            lg = list(grp)

            #print lg

            lgj = lg.pop(0)

            #print lgj

            leaders_j.append(lgj)
            groups_j[lgj] = lg

            max_tries = 100000
            tries = 0

            if featureTypes[j] == 0:
                acc = 0

                while acc == 0:
                    cand = discSamp.sample()

                    #print 'Cand:', cand

                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)

                        #print c_j[d],cand, KL_dist

                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError, 'Failed to find separated parameters !'

                for cind in grp:
                    c_j[cind] = cand

            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    cand = mixture.NormalDistribution(mu, sigma)
                    acc = 1

                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d], cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError

                #    print '.',
                #print

                for cind in grp:
                    c_j[cind] = cand

            else:
                raise RuntimeError

        leaders.append(leaders_j)
        groups.append(groups_j)

        C.append(c_j)

    comps = []
    for i in range(G):
        comps.append(mixture.ProductDistribution([C[j][i] for j in range(p)]))

    pi = get_random_pi(G, 0.3 / G)
    #print '** pi =',pi

    # create prior
    piprior = mixture.DirichletPrior(G, [2.0] * G)

    cprior = []
    for j in range(p):
        if featureTypes[j] == 0:
            cprior.append(mixture.DirichletPrior(M, [1.02] * M))

        elif featureTypes[j] == 1:
            cprior.append(mixture.NormalGammaPrior(
                0, 0, 0, 0))  # dummy parameters, to be set later

        else:
            raise RuntimeError

    mprior = mixture.MixtureModelPrior(0.1, 0.1, piprior, cprior)

    m = mixture.BayesMixtureModel(G, pi, comps, mprior, struct=1)
    m.leaders = leaders
    m.groups = groups

    m.identifiable()
    m.updateFreeParams()
    #print m

    return m
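
# Usage sketch with assumed parameters: samples a Bayesian CSI mixture and
# prints the randomly drawn structure; leaders[j] holds the group
# representatives for feature j, and groups[j] maps each leader to the
# remaining members of its group.
def _example_getRandomCSIMixture():
    m = getRandomCSIMixture_conditionalDists(4, 3, 0.1, 10.0, M=4)
    for j in range(m.dist_nr):
        print 'feature', j, ':', m.leaders[j], m.groups[j]
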
def scoreStructureLearning_diffFullVsTopdown(N,
                                             gen,
                                             delta,
                                             seed=None,
                                             silent=False,
                                             skipAfterRNGcalls=False):
    """
    If skipAfterRNGcalls is True, the function terminates after all calls to RNGs have been done.
    """

    #print 'start scoring'

    #    if seed != None:
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        print '*** given seed=',seed
    #
    #    else: # XXX debug
    #        seed = random.randint(1,999999999)
    #        random.seed(seed)
    #        mixture._C_mixextend.set_gsl_rng_seed(seed)
    #        print '*** random seed=',seed

    data = gen.sampleDataSet(N)

    # XXX update NormalGammaPrior hyperparameters
    for j in range(gen.dist_nr):
        if isinstance(gen.prior.compPrior[j], mixture.NormalGammaPrior):
            gen.prior.compPrior[j].setParams(data.getInternalFeature(j), gen.G)

    gen.prior.structPriorHeuristic(delta, data.N)

    #    print '\nupdating generating model structure:'
    #    print 'before:'
    #    print gen.leaders
    #    print gen.groups

    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(gen,
                                                                     data,
                                                                     silent=1)

    #    print '\nafter:'
    #    print gen.leaders
    #    print gen.groups

    m = copy.copy(gen)
    # reset structure
    m.initStructure()

    # training parameters
    nr_rep = 40  # XXX
    #nr_rep = 4 # XXX

    nr_steps = 400
    em_delta = 0.6

    #    print 'start training'
    #    print 'EM repeats:',nr_rep

    m.randMaxTraining(data, nr_rep, nr_steps, em_delta, silent=1, rtype=0)
    #    print 'finished training'

    if skipAfterRNGcalls == True:
        print '*** Skipping !'
        return numpy.zeros(4)

    m1 = copy.copy(m)
    t0 = time.time()
    #print '\n\n################# TOPDOWN #####################'
    m1.updateStructureBayesian(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m1.mapEM(data,40,0.1)
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m1)  # structureEditDistance(gen,m1)

    #print '#############################'

    #print '\n\n################# FULL FixedOrder #####################'
    m2 = copy.copy(m)
    t0 = time.time()
    m2.updateStructureBayesianFullEnumerationFixedOrder(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m2.mapEM(data,40,0.1)

    #    print
    #    for j in range(m2.dist_nr):
    #        print j,m2.leaders[j], m2.groups[j]
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m2) # structureEditDistance(gen,m1)

    #print '\n\n################# BOTTOM UP #####################'
    m3 = copy.copy(m)
    t0 = time.time()
    m3.updateStructureBayesianBottomUp(data, silent=1)
    t1 = time.time()
    time2 = t1 - t0
    #m3.mapEM(data,40,0.1)
    #    print
    #    for j in range(m3.dist_nr):
    #        print j,m3.leaders[j], m3.groups[j]
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m3) # structureEditDistance(gen,m1)

    #print '\n\n################# FULL enumeration #####################'
    m4 = copy.copy(m)
    t0 = time.time()
    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(m4,
                                                                     data,
                                                                     silent=1)
    t1 = time.time()
    time2 = t1 - t0
    # m4.mapEM(data,40,0.1)
    #    print
    #    for j in range(m4.dist_nr):
    #        print j,m4.leaders[j], m4.groups[j]
    #    print 'Accuracy:',mixture.structureAccuracy(gen,m4)

    logp_top = mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
    logp_full_fixed = mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
    logp_full = mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
    logp_bottom = mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)

    if (not (round(logp_top, 3) <= round(logp_full, 3))
            or not (round(logp_full_fixed, 3) <= round(logp_full, 3))
            or not (round(logp_bottom, 3) <= round(logp_full, 3))):
        print 'ERROR:'
        print 'top:', logp_top
        print 'full fixed:', logp_full_fixed
        print 'full:', logp_full
        print 'bottom:', logp_bottom, '\n'

        printModel(gen, 'generating model')
        printStructure(gen)
        print
        printModel(m4, 'full enumeration model')
        printStructure(m4)
        print
        printModel(m2, 'fixed full model')
        printStructure(m2)

        raise ValueError

#    # as a measure of separation of the component in the trained model, sum up
#    # sym. KL divergence of all components and features
#    train_diff = 0
#    for j in range(gen.dist_nr):
#        for i1 in range(m.G):
#            for i2 in range(m.G):
#                train_diff += mixture.sym_kl_dist(m.components[i1][j], m.components[i2][j])

    mix_dist1 = mixtureKLdistance(gen, m)
    mix_dist2 = mixtureKLdistance(m, gen)

    max_dist1 = mixtureMaxKLdistance(gen, m)
    max_dist2 = mixtureMaxKLdistance(m, gen)

    # number of leaders in the full enumeration model
    nr_full_lead = 0
    for ll in m4.leaders:
        nr_full_lead += len(ll)

    match = matchModelStructures(gen, m)

    compred = checkComponentRedundancy(gen.leaders, gen.groups)
    if not (str(logp_top) == str(logp_full_fixed) == str(logp_full)):

        print '-----------------------------------------------------------------------'

        print 'Different:'
        print 'top:', logp_top
        print 'full fixed:', logp_full_fixed
        print 'full:', logp_full
        print 'bottom:', logp_bottom, '\n'

        explain = 0
        if str(compred) != '[]':
            print '*** redundant components', compred
            explain = 1
        if gen.pi.min() < 0.05:
            print '*** vanishing component in generating model'
            explain = 1
        if m.pi.min() < 0.05:
            print '*** vanishing component in trained model'
            explain = 1

        if explain == 0:
            print '*** UNEXPLAINED !'

        printModel(gen, 'generating model')
        printModel(m, 'trained model')
        #print 'Trained model diff (simplistic):',train_diff
        print 'D: Mixture distance gen/trained:', mix_dist1
        print 'D: Mixture distance trained/gen:', mix_dist2

        print 'D: Mixture Max-distance gen/trained:', max_dist1
        print 'D: Mixture Max-distance trained/gen:', max_dist2

        print '\nGenerating distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(gen.components[i1][j],
                                                      gen.components[i2][j])
                print map(lambda x: '%.2f' % float(x),
                          kldists)  # kldists.min()

        print '\nTrained distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                      m.components[i2][j])
                print map(lambda x: '%.2f' % float(x),
                          kldists)  # kldists.min()

        print '\nTrained distances to generating:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                      gen.components[i2][j])
                print i1, '->', kldists.argmin(), map(
                    lambda x: '%.2f' % float(x), kldists)  # kldists.min()

        print '\n True structure:'
        print 'True model post:', mixture.get_loglikelihood(
            gen, data) + gen.prior.pdf(gen)
        #for j in range(m.dist_nr):
        #    print j,gen.leaders[j], gen.groups[j]
        printStructure(gen)

        print '\nTop down:'
        printStructure(m1)
        print 'Top down model post:', mixture.get_loglikelihood(
            m1, data) + m1.prior.pdf(m1)
        printModel(m1, 'top down model')

        print '\nFull enumeration Fixed Order:'
        printStructure(m2)
        print 'Full fixed order model post:', mixture.get_loglikelihood(
            m2, data) + m2.prior.pdf(m2)
        printModel(m2, 'full fixed model')

        print '\nBottom up:'
        printStructure(m3)
        print 'Bottom up model post:', mixture.get_loglikelihood(
            m3, data) + m3.prior.pdf(m3)
        printModel(m3, 'bottom up model')

        print '\nFull enumeration:'
        printStructure(m4)
        print 'Full enumeration model post:', mixture.get_loglikelihood(
            m4, data) + m4.prior.pdf(m4)
        printModel(m4, 'full enumeration model')

        print '-----------------------------------------------------------------------'

    # redundant components and not fully merged
    elif str(compred) != '[]' and nr_full_lead > m4.p and match != 1:
        print '-----------------------------------------------------------------------'
        print 'Same but redundant components:', compred

        printModel(gen, 'generating model')
        printModel(m, 'trained model')
        #print 'Trained model diff:',train_diff
        print 'S: Mixture distance gen/trained:', mix_dist1
        print 'S: Mixture distance trained/gen:', mix_dist2

        print 'S: Mixture Max-distance gen/trained:', max_dist1
        print 'S: Mixture Max-distance trained/gen:', max_dist2

        print '\nGenerating distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(gen.components[i1][j],
                                                      gen.components[i2][j])
                print i1, ':', map(lambda x: '%.2f' % float(x),
                                   kldists)  # kldists.min()

        print '\nTrained distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                      m.components[i2][j])
                print i1, ':', map(lambda x: '%.2f' % float(x),
                                   kldists)  # kldists.min()

        print '\nTrained distances to generating:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:', j

            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j],
                                                      gen.components[i2][j])
                print i1, '->', kldists.argmin(), map(
                    lambda x: '%.2f' % float(x), kldists)  # kldists.min()

        print '\n True structure:'
        print 'True model post:', mixture.get_loglikelihood(
            gen, data) + gen.prior.pdf(gen)
        #for j in range(m.dist_nr):
        #    print j,gen.leaders[j], gen.groups[j]
        printStructure(gen)

        print '\nTop down:'
        printStructure(m1)
        print 'Top down model post:', mixture.get_loglikelihood(
            m1, data) + m1.prior.pdf(m1)

        print '\nFull enumeration Fixed Order:'
        printStructure(m2)
        print 'Full fixed order model post:', mixture.get_loglikelihood(
            m2, data) + m2.prior.pdf(m2)

        print '\nBottom up:'
        printStructure(m3)
        print 'Bottom up model post:', mixture.get_loglikelihood(
            m3, data) + m3.prior.pdf(m3)

        print '\nFull enumeration:'
        printStructure(m4)
        print 'Full enumeration model post:', mixture.get_loglikelihood(
            m4, data) + m4.prior.pdf(m4)

        print '-----------------------------------------------------------------------'

#    else:
#        print '-----------------------------------------------------------------------'
#        print 'S: Mixture distance gen/trained:',mix_dist1
#        print 'S: Mixture distance trained/gen:',mix_dist2
#        print '-----------------------------------------------------------------------'

#    else:
#        print '** all equal.'

#    dtop = mixture.structureAccuracy(gen,m1)
#    dfull_fixed = mixture.structureAccuracy(gen,m2)
#    dfull = mixture.structureAccuracy(gen,m4)
#    dbottom = mixture.structureAccuracy(gen,m3)

    return numpy.array([logp_top, logp_full_fixed, logp_full, logp_bottom])
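
# Illustrative driver, analogous to the scoreStructureLearning sketch above:
# this variant stays quiet unless the structure learning methods disagree,
# in which case it prints detailed diagnostics (parameter values assumed).
def _example_scoreStructureLearning_diff():
    gen = getRandomCSIMixture_conditionalDists(3, 4, 0.1, 10.0, M=4)
    post = scoreStructureLearning_diffFullVsTopdown(500, gen, delta=0.1)
    print 'posteriors [top, full_fixed, full, bottom]:', post
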
def plotKLDistance(ref_dist, objf='sym', title='KL Distance', show=True):
    """
    Plots the KL distance landscape around a reference DiscreteDistribution
    as a contour plot on the 2-simplex.
    """
    assert ref_dist.M == 3, 'Only 3 dimensions for now.'

    # KL distance to be used: either symmetric or one of the two directions
    # with respect to ref_dist
    assert objf in ['sym', 'leftToRight', 'rightToLeft']

    # A 2-simplex lives in 3-space.
    dimension = 3  # XXX dimension fixed to 3 for now

    # These are the vertex labels, converted to strings.
    labels = numpy.eye(dimension, dtype=int)
    labels = map(str, map(tuple, labels))

    # Let's create the simplex.
    simplex = Simplex2D(labels, modify_labels=False)

    # construct grid
    dist = []
    x = numpy.arange(0.001, 1.0, 0.01)
    y = numpy.arange(0.001, 1.0, 0.01)
    for p1 in x:
        d_row = []
        for p2 in y:
            #for p3 in z:
            p3 = 1.0 - p1 - p2
            d_row.append([p1, p2, p3])
        dist.append(d_row)

    sample_dist = dict(zip(labels, ref_dist.phi))
    proj_ref = simplex.project_distribution(sample_dist, use_logs=False)

    proj_x = []
    proj_y = []
    distance = []
    f = lambda x: round(x, 3)
    for drow in dist:
        x_row = []
        y_row = []
        d_row = []
        for d in drow:
            # only evaluate grid points that lie on the simplex (coordinates
            # sum to 1 and are strictly positive); all others get distance 0
            if (1.0 - numpy.sum(map(f, d))) < 1e-15 and d[0] > 0 and d[1] > 0 and d[2] > 0.0:
                if objf == 'sym':
                    d_row.append(mixture.sym_kl_dist(ref_dist, mixture.DiscreteDistribution(3, d)))
                elif objf == 'leftToRight':
                    d_row.append(mixture.kl_dist(ref_dist, mixture.DiscreteDistribution(3, d)))
                elif objf == 'rightToLeft':
                    d_row.append(mixture.kl_dist(mixture.DiscreteDistribution(3, d), ref_dist))
                else:
                    raise TypeError

            else:
                d_row.append(0.0)

            sample_dist = dict(zip(labels, d))
            pp = simplex.project_distribution(sample_dist, use_logs=False)
            x_row.append(pp[0])
            y_row.append(pp[1])

        proj_x.append(x_row)
        proj_y.append(y_row)
        distance.append(d_row)

    proj_x = numpy.array(proj_x)
    proj_y = numpy.array(proj_y)
    distance = numpy.array(distance)

    # Create the figure
    fig = pylab.figure()
    fig.set_facecolor('w')
    fig.add_axes([.15, .15, .70, .70], axisbg='w', aspect='equal')
    axis = pylab.gca()

    # Plot the simplex
    simplex_plotter = Simplex2DPlotter(simplex, axis)
    simplex_plotter.prepare_axes()
    simplex_plotter.plot_simplex()

    #axis.set_title(title)
    axis.text(-0.5, 0.55, title, fontsize=12)

    # Contour plot of the distance landscape; the reference distribution is
    # marked by a red dot.
    max_val = distance.max()
    step = max_val / 50.0

    axis.contourf(proj_x, proj_y, distance, pylab.arange(0, max_val, step))
    axis.plot([proj_ref[0]], [proj_ref[1]], 'or')
    if show:
        pylab.show()
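
# Usage sketch: contour-plots the symmetric KL distance landscape around an
# assumed reference distribution on the 2-simplex (requires pylab and the
# Simplex2D helpers used above).
def _example_plotKLDistance():
    ref = mixture.DiscreteDistribution(3, [0.5, 0.3, 0.2])
    plotKLDistance(ref, objf='sym', title='sym. KL distance', show=True)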
Beispiel #9
0
def plotKLDistance(ref_dist, objf='sym' ,title='KL Distance', show=True):
    
    assert ref_dist.M == 3, 'Only 3 dimensions for now.'
    
    # KL distance to be used, either symmetric or the two directions
    # with respect to ref_dist
    assert objf in ['sym', 'leftToRight', 'rightToLeft']  
    
    # A 2-simplex lives in 3-space.
    dimension = 3 # XXX dimension fixed to 3 for now

    # These are the vertex labels, converted to strings.
    labels = numpy.eye(dimension, dtype=int)
    labels = map(str, map(tuple, labels))

    # Let's create the simplex.
    simplex = Simplex2D(labels, modify_labels=False)

    # construct grid
    dist = []
    x = numpy.arange(0.001,1.0,0.01)
    y = numpy.arange(0.001,1.0,0.01)
    for p1 in x:
        d_row = []
        for p2 in y:
            #for p3 in z:
            p3 = 1.0-p1-p2
            d_row.append([p1,p2,p3])
        dist.append(d_row)
    
    sample_dist = dict(zip(labels, ref_dist.phi))
    proj_ref = simplex.project_distribution(sample_dist, use_logs=False)
    
    
    proj_x = []
    proj_y = []
    distance = []
    f = lambda x: round(x,3)
    for drow in dist:
        x_row = []
        y_row = []
        d_row = []
        for d in drow:
            #print d, 1.0 - numpy.sum(map(f,d))
            
            
            if (1.0 - numpy.sum(map(f,d))) < 1e-15 and d[0] > 0 and d[1] > 0 and d[2] > 0.0:
                #print ref_dist,mixture.DiscreteDistribution(3, d), mixture.sym_kl_dist(  ref_dist, mixture.DiscreteDistribution(3, d))
                if objf == 'sym':
                    d_row.append( mixture.sym_kl_dist(  ref_dist, mixture.DiscreteDistribution(3, d)))
                elif objf == 'leftToRight':    
                    d_row.append( mixture.kl_dist(  ref_dist, mixture.DiscreteDistribution(3, d)))
                elif objf == 'rightToLeft':    
                    d_row.append( mixture.kl_dist( mixture.DiscreteDistribution(3, d), ref_dist ))                
                else:
                    raise TypeError
                    
            else:
                d_row.append( 0.0)
            
            sample_dist = dict(zip(labels, d))  
            pp = simplex.project_distribution(sample_dist, use_logs=False)
            x_row.append(pp[0])
            y_row.append(pp[1])

        proj_x.append(x_row)
        proj_y.append(y_row)
        distance.append(d_row)

    proj_x = numpy.array(proj_x)
    proj_y = numpy.array(proj_y)
    distance = numpy.array(distance)

    # Create the figure
    fig = pylab.figure()
    fig.set_facecolor('w')
    fig.add_axes([.15,.15,.70,.70], axisbg='w', aspect='equal')
    axis = pylab.gca()
    
    # Plot the simplex
    simplex_plotter = Simplex2DPlotter(simplex, axis)
    simplex_plotter.prepare_axes()
    simplex_plotter.plot_simplex()
    
    #axis.set_title(title)
    axis.text(-0.5, 0.55, title, fontsize=12)
    
    # Plot the samples
    #x = [sample[0] for sample in samples]
    #y = [sample[1] for sample in samples]

    max_val = distance.max()
    step = max_val / 50.0
    
    axis.contourf(proj_x, proj_y,distance,pylab.arange(0,max_val,step)) 
    axis.plot([proj_ref[0]], [proj_ref[1]], 'or') 

    if show:
        pylab.show()
Beispiel #10
0
def scoreStructureLearning(N, gen, delta, seed=None, silent=False, skipAfterRNGcalls = False):
    """
    If skipAfterRNGcalls is True, the function terminates after all calls to RNGs have been done.
    """


    #print 'start scoring'

#    if seed != None:
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        print '*** given seed=',seed
#        
#    else: # XXX debug
#        seed = random.randint(1,999999999)
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        print '*** random seed=',seed


        
    data = gen.sampleDataSet(N)                



    # XXX update NormalGammaPrior hyperparameters
    for j in range(gen.dist_nr):
        if isinstance(gen.prior.compPrior[j], mixture.NormalGammaPrior):
            gen.prior.compPrior[j].setParams(data.getInternalFeature(j), gen.G)
    
    gen.prior.structPriorHeuristic(delta, data.N)
    
    print '\nupdating generating model structure:'
    print 'vorher:'
    print gen.leaders
    print gen.groups


    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(gen, data, silent=1)

    print '\nnachher:'
    print gen.leaders
    print gen.groups


    if silent == False:
        printModel(gen,'generating model')


    m = copy.copy(gen)
    # reset structure
    m.initStructure()
    
    # training parameters
    nr_rep = 40 # XXX
    #nr_rep = 4 # XXX
    
    
    nr_steps = 400
    em_delta = 0.6
    
    print 'start training'
    print 'EM repeats:',nr_rep

    m.randMaxTraining(data,nr_rep, nr_steps,em_delta,silent=1,rtype=0)
    print 'finished training'

    if skipAfterRNGcalls == True:
        print '*** Skipping !'
        return numpy.zeros(4)



#    # check for consistency of component indices (identifiability issues)
#    bad = 0
    if silent == False:
        cmap = {}
        
        for j in range(gen.dist_nr):
            print '\nfeature:',j
    
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], gen.components[i2][j])
                print i1,'->', kldists.argmin(), map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()
            
        
#        for i1 in range(m.G):
#            print
#            cdists = numpy.zeros(m.G)
#            for i2 in range(m.G):
#                cdists[i2] = product_distribution_sym_kl_dist(m.components[i1], gen.components[i2])
#                #print i1,i2,product_distribution_sym_kl_dist(m.components[i1], gen.components[i2])
#
#            print i1,'maps to', numpy.argmin(cdists), cdists.tolist()
#            amin = numpy.argmin(cdists)
#            if not amin == i1:     # minimal KL distance should occur at equal indices in gen and m
#                bad = 1
#                cmap[i1] = amin

#    if bad:            
#        
#       
#        
#        # XXX check whether cmap defines new unambiguous ordering
#        
#        # check whether components have switched positions
#        reorder = 0
#        order = range(m.G)
#        try:
#            
#            #print cmap
#            
#            for i1 in cmap.keys():
#                order[i1] = cmap[i1]
#            
#            #print order
#            #print set(order)
#            #print  list(set(order))
#            
#            if len(set(order)) == m.G:
#                reorder = 1        
#        except KeyError:
#            pass
#        except AssertionError:    
#            pass
#            
#        if reorder:
#            print '** new order', order
#            
#            m.reorderComponents(order)
#
#        else:
#                    
#        
#            #print cdists
#            print i1,'maps to', numpy.argmin(cdists)
#
#            print 'Failed matching.'
#
#            print 'generating model gen:'
#            print gen
#
#            print 'trained model m:'
#            print m 
#
#            raise ValueError     


#    mtest =copy.copy(gen)
#    ch = mtest.updateStructureBayesian(data,silent=1)
#    print '\nTEST:',ch
#    for j in range(m.dist_nr):
#        print j,mtest.leaders[j], mtest.groups[j]


    #print m.prior

    print '-----------------------------------------------------------------------'
    print '\n True structure:'
    print 'True model post:',mixture.get_loglikelihood(gen, data) + gen.prior.pdf(gen)
    #for j in range(m.dist_nr):
    #    print j,gen.leaders[j], gen.groups[j]
    print gen.leaders
    print gen.groups

    if silent == False:
        printModel(m,'trained model')

    m1 = copy.copy(m)
    t0 = time.time()
    #print '\n\n################# TOPDOWN #####################'
    m1.updateStructureBayesian(data,silent=1)
    t1 = time.time()
    time2 = t1-t0    
    #m1.mapEM(data,40,0.1)
    print '\nTop down (',str(time2),'s ):'
    print m1.leaders
    print m1.groups
    print 'Top down model post:',mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
#    print 'Accuracy:',mixture.structureAccuracy(gen,m1)  # structureEditDistance(gen,m1)

    if silent == False:
        printModel(m1,'top down model')


    #print '#############################'
    

    #print '\n\n################# FULL FixedOrder #####################'
    m2 = copy.copy(m)
    t0 = time.time()
    m2.updateStructureBayesianFullEnumerationFixedOrder(data,silent=1)
    t1 = time.time()
    time2 = t1-t0    
    #m2.mapEM(data,40,0.1)

#    print
#    for j in range(m2.dist_nr):
#        print j,m2.leaders[j], m2.groups[j]
    print '\nFull enumeration Fixed Order  (',str(time2),'s ):'
    print m2.leaders
    print m2.groups
    print 'Full fixed order model post:',mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
#    print 'Accuracy:',mixture.structureAccuracy(gen,m2) # structureEditDistance(gen,m1)


    if silent == False:
        printModel(m2,'full fixed model')


        

    #print '\n\n################# BOTTUMUP #####################'    
    m3 = copy.copy(m)
    t0 = time.time()
    m3.updateStructureBayesianBottomUp(data,silent=1)
    t1 = time.time()
    time2 = t1-t0    
    #m3.mapEM(data,40,0.1)
#    print 
#    for j in range(m3.dist_nr):
#        print j,m3.leaders[j], m3.groups[j]
    print '\nBottom up: (',str(time2),'s ):'
    print m3.leaders
    print m3.groups
    print 'Bottom up model post:',mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)
#    print 'Accuracy:',mixture.structureAccuracy(gen,m3) # structureEditDistance(gen,m1)


    if silent == False:
        printModel(m3,'bottom up model')

    
    #print '\n\n################# FULL enumeration #####################'
    m4 = copy.copy(m)
    t0 = time.time()
    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(m4, data, silent=0)
    t1 = time.time()
    time2 = t1-t0    
   # m4.mapEM(data,40,0.1)
#    print 
#    for j in range(m4.dist_nr):
#        print j,m4.leaders[j], m4.groups[j]
    print '\nFull enumeration: (',str(time2),'s )'
    print m4.leaders
    print m4.groups
    print 'Full enumeration model post:',mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
#    print 'Accuracy:',mixture.structureAccuracy(gen,m4)

    if silent == False:
        printModel(m4,'full enumeration model')


    print '-----------------------------------------------------------------------'



#    dtop = mixture.structureAccuracy(gen,m1)
#    dfull_fixed = mixture.structureAccuracy(gen,m2) 
#    dfull = mixture.structureAccuracy(gen,m4) 
#    dbottom = mixture.structureAccuracy(gen,m3)

    logp_top = mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
    logp_full_fixed = mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
    logp_full = mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
    logp_bottom = mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)


    if (not (round(logp_top,3) <= round(logp_full,3) ) or not (round(logp_full_fixed,3) <= round(logp_full,3))
        or not (round(logp_bottom,3) <= round(logp_full,3)) ):
        raise ValueError


    return numpy.array([ logp_top, logp_full_fixed, logp_full, logp_bottom ])
Beispiel #11
0
def matchModelStructures(gen, m):
    """
    Checks whether m1 and m2 are consistent in the sense that for each
    leader in m1, there is a number of distributions in m2 which take minimum
    distance to the leader as the number of distributions in the group in m1.
    
    Used to check whether a the parameteric EM has obviously captured the CSI structure of
    the generating model.
    """
    #print '**** matchModelStructures'
    
    gen_csi = []
    for j in range(gen.dist_nr):    
        gen_csi.append({})
        for l in gen.leaders[j]:
            gen_csi[j][ tuple( [l] + gen.groups[j][l] ) ] = []
    
    #print gen_csi
    
    for j in range(gen.dist_nr):
        #print 'feature:',j
        for i1 in range(m.G):
            kldists = numpy.zeros(m.G)
            for i2 in range(m.G):
                kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], gen.components[i2][j])
            cg = numpy.where( kldists == kldists.min() )[0]

            gen_csi[j][tuple(cg)].append(i1)
                

    #print gen_csi       
    # check easy case: all components match in gen and m
    match = 1
    for j in range(gen.dist_nr):
        for cg in gen_csi[j]:
            if cg != tuple(gen_csi[j][cg]):
                match = 0
    if match:
        #print 'Simple match !'             
        return 1

    # check whether component indices have changed but the structures are consistent otherwise
    cmaps = []
    for j in range(gen.dist_nr):   
        cmaps.append({})
        for i1 in range(m.G):
            kldists = numpy.zeros(m.G)
            for i2 in range(m.G):
                kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], gen.components[i2][j])
            cg = numpy.where( kldists == kldists.min() )[0]
            cmaps[j][i1]=cg
    
    #print cmaps   

    gen_compred = checkComponentRedundancy(gen.leaders, gen.groups)    
    
    #print 'gen_compred', gen_compred 
    
    if len(gen_compred) > 1:
        return 0  # XXX case not covered yet 
    
    match = 1
    m_to_gen = {}
    for i in range(m.G):
        m_to_gen[i] = -1
    
    for j in range(gen.dist_nr):    
        for i in cmaps[j]:
            if len(cmaps[j][i]) == 1:
                if m_to_gen[i] == -1:
                    m_to_gen[i] = cmaps[j][i][0]
                else:
                    if m_to_gen[i] == cmaps[j][i][0]:
                        continue
                    else:
                        match = 0
                        break
    #print m_to_gen   

    if len(gen_compred) == 0:
        for k in m_to_gen:
            #print m_to_gen[k]
            if m_to_gen[k] == -1:
                return 0
        return 1

    for k in m_to_gen:
        if m_to_gen[k] == -1:  # no assignment so far
            for j in range(gen.dist_nr):
                #print gen_compred
                #print k, cmaps[j][k].tolist(),gen_compred[0]
                if not cmaps[j][k].tolist() == gen_compred[0]:

                    match = 0                    
            

    #print '*** match=', match

    return match
Beispiel #12
0
def getRandomMixture(G, p, KL_lower, KL_upper, dtypes='discgauss', M=4,seed = None):
    
#    if seed:
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        #print '*** seed=',seed
#        
#    else: # XXX debug
#        seed = random.randint(1,9000000)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        random.seed(seed)
#        #print '*** seed=',seed
            
    
    #M = 4  # Alphabet size for discrete distributions
    
    min_sigma = 0.1    # minimal std for Normal
    max_sigma = 1.0   # maximal std for Normal
    min_mu = -5.0      # minimal mean
    max_mu = 8.0       # maximal mean
    
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p    
    elif dtypes == 'discgauss':    
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [ random.choice( (0, 1) )  for i in range(p) ]
    else:
        raise TypeError
    
        
    #print featureTypes

    C = []
    for j in range(p):
        c_j = []
        for i in range(G):
            #print i,j
            if featureTypes[j] == 0:
                acc = 0
                while acc == 0:
                    cand = mixture.DiscreteDistribution(M, mixture.random_vector(M) )
                    
                    #print 'cand:',cand
                    
                    acc = 1
                    
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d,cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                            break
                
                c_j.append(cand)
            elif featureTypes[j] == 1:
                acc = 0
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    
                    cand = mixture.NormalDistribution(mu, sigma )
                    
                    #print 'cand:',cand
                    
                    acc = 1
                    
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(d,cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            #print '  *', cand, 'rejected:', d , KL_dist
                            acc = 0
                
                c_j.append(cand)

            else:
                RuntimeError
                
        C.append(c_j)                

#    print '\n'
#    for cc in C:
#        print cc
    
                
    comps = []
    for i in range(G):
        comps.append( mixture.ProductDistribution( [ C[j][i] for j in range(p) ] ) )

    pi = get_random_pi(G,0.1)

    m = mixture.MixtureModel(G,pi, comps,struct=1)            
    m.updateFreeParams()

    return m                
Beispiel #13
0
def getRandomCSIMixture_conditionalDists(G, p, KL_lower, KL_upper, M=8, dtypes='discgauss', seed = None, fullstruct=False, disc_sampling_dist=None):
    
#    if seed:
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        #print '*** seed=',seed
#        
#    else: # XXX debug
#        seed = random.randint(1,9999999)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        random.seed(seed)
#        #print '*** seed=',seed

    

    if disc_sampling_dist == None:
        discSamp = mixture.DirichletPrior(M,[1.0] * M ) # uniform sampling
    else:
        discSamp = disc_sampling_dist   
        

    
    min_sigma = 0.3    # minimal std for Normal
    max_sigma = 5.0   # maximal std for Normal
    min_mu = -25.0      # minimal mean
    max_mu = 25.0       # maximal mean
    
    assert dtypes in ['disc','gauss','discgauss']    
        
    if dtypes == 'disc':
        featureTypes = [0] * p
    elif dtypes == 'gauss':
        featureTypes = [1] * p    
    elif dtypes == 'discgauss':    
        # discrete or Normal features for now, chosen uniformly
        # 0 discrete, 1 Normal
        featureTypes = [ random.choice( (0, 1) )  for i in range(p) ]
    else:
        raise TypeError

    #print featureTypes


    # generate random CSI structures

    if G < 15:
        P = setPartitions.generate_all_partitions(G) # XXX too slow for large G
    #print P

    C = []
    
    leaders = []
    groups = []
    for j in range(p):
        c_j = {}
        
        leaders_j = []
        groups_j = {}
    
    
        if fullstruct == True:
            struct_j = [(i,) for i in range(G)]
            
        elif G < 15:
            struct_j = random.choice(P)
        else:
            print 'WARNING: improper structure sampling !'
            struct_j = setPartitions.get_random_partition(G)
        
        #print '\nstruct',j,struct_j
        
        for i,grp in enumerate(struct_j):
            
            lg = list(grp)        
            
            #print lg
            
            lgj = lg.pop(0)
            
            #print lgj
            
            leaders_j.append(lgj)
            groups_j[lgj] = lg

            max_tries = 100000
            tries = 0


            if featureTypes[j] == 0:
                acc = 0
                
                while acc == 0:
                    cand = discSamp.sample() 
                    
                    #print 'Cand:', cand
                    
                    acc = 1
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d],cand)
                        
                        #print c_j[d],cand, KL_dist
                        
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError('Failed to find separated parameters!')
                                                
                    
                for cind in grp:
                    c_j[cind] = cand


            elif featureTypes[j] == 1:
                acc = 0
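                # same rejection sampling for a random Normal component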
                while acc == 0:
                    mu = random.uniform(min_mu, max_mu)
                    sigma = random.uniform(min_sigma, max_sigma)
                    cand = mixture.NormalDistribution(mu, sigma )
                    acc = 1
                    
                    for d in c_j:
                        KL_dist = mixture.sym_kl_dist(c_j[d],cand)
                        if KL_dist > KL_upper or KL_dist < KL_lower:
                            acc = 0
                            tries += 1
                            break

                    if tries >= max_tries:
                        raise RuntimeError('Failed to find separated parameters!')
                            
                    
                #    print '.',
                #print
                
                for cind in grp:
                    c_j[cind] = cand

            else:
                raise RuntimeError('unknown feature type')
                
        leaders.append(leaders_j)
        groups.append(groups_j)
        
        C.append(c_j)                
                
    comps = []
    for i in range(G):
        comps.append( mixture.ProductDistribution( [ C[j][i] for j in range(p) ] ) )

    pi = get_random_pi(G, 0.3 / G)
    #print '** pi =',pi 
    
    
    # create prior
    piprior = mixture.DirichletPrior(G,[2.0]*G)
    
    cprior = []
    for j in range(p):
        if featureTypes[j] == 0:
            cprior.append( mixture.DirichletPrior(M,[1.02]*M)) 

        elif featureTypes[j] == 1:
            cprior.append( mixture.NormalGammaPrior(0,0,0,0))   # dummy parameters, to be set later

        else:
            raise RuntimeError('unknown feature type')
        
    mprior = mixture.MixtureModelPrior(0.1,0.1, piprior, cprior)
    

    m = mixture.BayesMixtureModel(G, pi, comps, mprior, struct=1)
    m.leaders = leaders
    m.groups = groups
    
    m.identifiable()
    m.updateFreeParams()
    #print m

    return m          
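
# A minimal usage sketch (illustrative argument values, not from the original
# source):
#
#     gen = getRandomCSIMixture_conditionalDists(3, 4, 0.1, 25.0, M=8,
#                                                dtypes='discgauss')
#     data = gen.sampleDataSet(500)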
Example #14
def scoreStructureLearning_diffFullVsTopdown(N, gen, delta, seed=None, silent=False, skipAfterRNGcalls = False):
    """
    If skipAfterRNGcalls is True, the function terminates after all calls to RNGs have been done.
    """


    #print 'start scoring'

#    if seed != None:
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        print '*** given seed=',seed
#        
#    else: # XXX debug
#        seed = random.randint(1,999999999)
#        random.seed(seed)
#        mixture._C_mixextend.set_gsl_rng_seed(seed)
#        print '*** random seed=',seed

    data = gen.sampleDataSet(N)

    # XXX update NormalGammaPrior hyperparameters
    for j in range(gen.dist_nr):
        if isinstance(gen.prior.compPrior[j], mixture.NormalGammaPrior):
            gen.prior.compPrior[j].setParams(data.getInternalFeature(j), gen.G)
    
    gen.prior.structPriorHeuristic(delta, data.N)
    
#    print '\nupdating generating model structure:'
#    print 'before:'
#    print gen.leaders
#    print gen.groups


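    # determine the structure of the generating model by exhaustive Bayesian enumeration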
    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(gen, data, silent=1)

#    print '\nafter:'
#    print gen.leaders
#    print gen.groups
    m = copy.copy(gen)
    # reset structure
    m.initStructure()
    
    # training parameters
    nr_rep = 40 # XXX
    #nr_rep = 4 # XXX
    
    
    nr_steps = 400
    em_delta = 0.6
    
#    print 'start training'
#    print 'EM repeats:',nr_rep

    m.randMaxTraining(data,nr_rep, nr_steps,em_delta,silent=1,rtype=0)
#    print 'finished training'

    if skipAfterRNGcalls:
        print '*** Skipping !'
        return numpy.zeros(4)


    m1 = copy.copy(m)
    t0 = time.time()
    #print '\n\n################# TOPDOWN #####################'
    m1.updateStructureBayesian(data,silent=1)
    t1 = time.time()
    time_top = t1 - t0
    #m1.mapEM(data,40,0.1)
#    print 'Accuracy:',mixture.structureAccuracy(gen,m1)  # structureEditDistance(gen,m1)



    #print '#############################'
    

    #print '\n\n################# FULL FixedOrder #####################'
    m2 = copy.copy(m)
    t0 = time.time()
    m2.updateStructureBayesianFullEnumerationFixedOrder(data,silent=1)
    t1 = time.time()
    time_full_fixed = t1 - t0
    #m2.mapEM(data,40,0.1)

#    print
#    for j in range(m2.dist_nr):
#        print j,m2.leaders[j], m2.groups[j]
#    print 'Accuracy:',mixture.structureAccuracy(gen,m2) # structureEditDistance(gen,m1)


    #print '\n\n################# BOTTOM UP #####################'
    m3 = copy.copy(m)
    t0 = time.time()
    m3.updateStructureBayesianBottomUp(data,silent=1)
    t1 = time.time()
    time_bottom = t1 - t0
    #m3.mapEM(data,40,0.1)
#    print 
#    for j in range(m3.dist_nr):
#        print j,m3.leaders[j], m3.groups[j]
#    print 'Accuracy:',mixture.structureAccuracy(gen,m3) # structureEditDistance(gen,m1)
    #print '\n\n################# FULL enumeration #####################'
    m4 = copy.copy(m)
    t0 = time.time()
    fullEnumerationExhaustive.updateStructureBayesianFullEnumeration(m4, data, silent=1)
    t1 = time.time()
    time_full = t1 - t0
    #m4.mapEM(data,40,0.1)
#    print 
#    for j in range(m4.dist_nr):
#        print j,m4.leaders[j], m4.groups[j]
#    print 'Accuracy:',mixture.structureAccuracy(gen,m4)


    logp_top = mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
    logp_full_fixed = mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
    logp_full = mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
    logp_bottom = mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)


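    # sanity check: full enumeration scores every admissible structure, so its
    # posterior must bound the greedy variants from above (up to rounding)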
    if (not (round(logp_top,3) <= round(logp_full,3) ) or not (round(logp_full_fixed,3) <= round(logp_full,3))
        or not (round(logp_bottom,3) <= round(logp_full,3)) ):
        print 'ERROR:'
        print 'top:',logp_top
        print 'full fixed:',logp_full_fixed
        print 'full:',logp_full
        print 'bottom:',logp_bottom,'\n'
        
        printModel(gen,'generating model')
        printStructure(gen)
        print
        printModel(m4,'full enumeration model')
        printStructure(m4)
        print
        printModel(m2,'fixed full model')
        printStructure(m2)
        
        raise ValueError('full enumeration posterior is not maximal')

#    # as a measure of separation of the component in the trained model, sum up 
#    # sym. KL divergence of all components and features
#    train_diff = 0
#    for j in range(gen.dist_nr):
#        for i1 in range(m.G):
#            for i2 in range(m.G):
#                train_diff += mixture.sym_kl_dist(m.components[i1][j], m.components[i2][j])

    mix_dist1 = mixtureKLdistance(gen, m)
    mix_dist2 = mixtureKLdistance(m, gen)

    max_dist1 = mixtureMaxKLdistance(gen, m)
    max_dist2 = mixtureMaxKLdistance(m, gen)

    # number of leaders in the full enumeration model
    nr_full_lead = 0
    for ll in m4.leaders:
        nr_full_lead += len(ll)

    match = matchModelStructures(gen, m)

    compred = checkComponentRedundancy(gen.leaders, gen.groups)
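    # compred lists (presumably) redundant component groups in the generating
    # structure; the string comparison below is a crude exact-equality test of
    # the three log posteriors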
    if not(str(logp_top) == str(logp_full_fixed) == str(logp_full) ):

        print '-----------------------------------------------------------------------'

        print 'Different:'
        print 'top:',logp_top
        print 'full fixed:',logp_full_fixed
        print 'full:',logp_full
        print 'bottom:',logp_bottom,'\n'

        explain = 0
        if str(compred) != '[]':
            print '*** redundant components',compred
            explain = 1
        if gen.pi.min() < 0.05:
            print '*** vanishing component in generating model'
            explain = 1
        if m.pi.min() < 0.05:
            print '*** vanishing component in trained model'
            explain = 1

        if explain == 0:
            print '*** UNEXPLAINED !'


        printModel(gen,'generating model')
        printModel(m,'trained model')
        #print 'Trained model diff (simplistic):',train_diff
        print 'D: Mixture distance gen/trained:',mix_dist1
        print 'D: Mixture distance trained/gen:',mix_dist2

        print 'D: Mixture Max-distance gen/trained:',max_dist1
        print 'D: Mixture Max-distance trained/gen:',max_dist2


        print '\nGenerating distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(gen.components[i1][j], gen.components[i2][j])
                print map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()

        print '\nTrained distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], m.components[i2][j])
                print map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()


        print '\nTrained distances to generating:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], gen.components[i2][j])
                print i1,'->', kldists.argmin(), map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()



        print '\n True structure:'
        print 'True model post:',mixture.get_loglikelihood(gen, data) + gen.prior.pdf(gen)
        #for j in range(m.dist_nr):
        #    print j,gen.leaders[j], gen.groups[j]
        printStructure(gen)
        
        print '\nTop down:'
        printStructure(m1)
        print 'Top down model post:',mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)
        printModel(m1,'top down model')

        print '\nFull enumeration Fixed Order:'
        printStructure(m2)
        print 'Full fixed order model post:',mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)
        printModel(m2,'full fixed model')

        print '\nBottom up:'
        printStructure(m3)
        print 'Bottom up model post:',mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)
        printModel(m3,'bottom up model')

        print '\nFull enumeration:' 
        printStructure(m4)
        print 'Full enumeration model post:',mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)
        printModel(m4,'full enumeration model')


        print '-----------------------------------------------------------------------'

    elif str(compred) != '[]' and nr_full_lead > m4.p and match != 1:  # redundant components and not fully merged
        print '-----------------------------------------------------------------------'
        print 'Same but redundant components:', compred



        printModel(gen,'generating model')
        printModel(m,'trained model')
        #print 'Trained model diff:',train_diff        
        print 'S: Mixture distance gen/trained:',mix_dist1
        print 'S: Mixture distance trained/gen:',mix_dist2

        print 'S: Mixture Max-distance gen/trained:',max_dist1
        print 'S: Mixture Max-distance trained/gen:',max_dist2

        
        print '\nGenerating distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(gen.components[i1][j], gen.components[i2][j])
                print i1,':', map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()

        print '\nTrained distances to self:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j
            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], m.components[i2][j])
                print i1,':', map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()


        print '\nTrained distances to generating:'
        cmap = {}
        for j in range(gen.dist_nr):
            print 'feature:',j

            for i1 in range(m.G):
                kldists = numpy.zeros(m.G)
                for i2 in range(m.G):
                    kldists[i2] = mixture.sym_kl_dist(m.components[i1][j], gen.components[i2][j])
                print i1,'->', kldists.argmin(), map(lambda x:'%.2f' % float(x),kldists)     # kldists.min()



        print '\n True structure:'
        print 'True model post:',mixture.get_loglikelihood(gen, data) + gen.prior.pdf(gen)
        #for j in range(m.dist_nr):
        #    print j,gen.leaders[j], gen.groups[j]
        printStructure(gen)
        

        print '\nTop down:'
        printStructure(m1)
        print 'Top down model post:',mixture.get_loglikelihood(m1, data) + m1.prior.pdf(m1)

        print '\nFull enumeration Fixed Order:'
        printStructure(m2)
        print 'Full fixed order model post:',mixture.get_loglikelihood(m2, data) + m2.prior.pdf(m2)

        print '\nBottom up:'
        printStructure(m3)
        print 'Bottom up model post:',mixture.get_loglikelihood(m3, data) + m3.prior.pdf(m3)

        print '\nFull enumeration:' 
        printStructure(m4)
        print 'Full enumeration model post:',mixture.get_loglikelihood(m4, data) + m4.prior.pdf(m4)

        print '-----------------------------------------------------------------------'
    
#    else:
#        print '-----------------------------------------------------------------------'
#        print 'S: Mixture distance gen/trained:',mix_dist1
#        print 'S: Mixture distance trained/gen:',mix_dist2
#        print '-----------------------------------------------------------------------'


#    else:
#        print '** all equal.'


#    dtop = mixture.structureAccuracy(gen,m1)
#    dfull_fixed = mixture.structureAccuracy(gen,m2) 
#    dfull = mixture.structureAccuracy(gen,m4) 
#    dbottom = mixture.structureAccuracy(gen,m3)



    return numpy.array([ logp_top, logp_full_fixed, logp_full, logp_bottom ])
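
# A minimal usage sketch (assumed workflow with illustrative arguments, not
# part of the original source): draw a random generating model, then compare
# the four structure learning variants on data sampled from it.
#
#     gen = getRandomCSIMixture_conditionalDists(3, 4, 0.1, 25.0)
#     logp_top, logp_full_fixed, logp_full, logp_bottom = \
#         scoreStructureLearning_diffFullVsTopdown(500, gen, delta=0.1)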