Example #1
0
def generate_data(n_documents, n_topics, n_vocabulary, n_words):
    """
    Draw a synthetic corpus from the generative model.

    Each word is assigned to a document with equal probability, a topic is
    then drawn from that document's topic distribution, and finally the word
    is drawn from that topic's word distribution.  The sampled topic and word
    distributions are displayed as Hinton diagrams in two new figures.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    n_words : int
        The number of words to generate for the corpus

    Returns
    -------

    (corpus, word_documents) : tuple
        The sampled vocabulary index of each word and the sampled document
        index of each word.
    """

    def sample_per_word(probabilities):
        # One categorical draw for every word in the corpus
        node = nodes.Categorical(probabilities, plates=(n_words, ))
        return node.random()

    def sample_distributions(dimension, count):
        # `count` probability vectors of length `dimension`, each drawn from
        # a symmetric Dirichlet whose parameters all equal 0.1
        node = nodes.Dirichlet(1e-1 * np.ones(dimension), plates=(count, ))
        return node.random()

    def show_hinton(values, title, xlabel, ylabel):
        # Hinton diagram of `values` in a figure of its own
        plt = bpplt.pyplot
        plt.figure()
        bpplt.hinton(values)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)

    # NOTE: the sampling order below is kept fixed so that a seeded random
    # number generator reproduces the same data.

    # Document of each word
    word_documents = sample_per_word(np.ones(n_documents) / n_documents)

    # Topic distribution of each document
    document_topics = sample_distributions(n_topics, n_documents)

    # Word distribution of each topic
    topic_words = sample_distributions(n_vocabulary, n_topics)

    # Topic of each word, drawn from the distribution of the word's document
    word_topics = sample_per_word(document_topics[word_documents])

    # The words themselves, drawn from the distribution of the word's topic
    corpus = sample_per_word(topic_words[word_topics])

    show_hinton(document_topics,
                "True topic distribution for each document",
                "Topics",
                "Documents")
    show_hinton(topic_words,
                "True word distributions for each topic",
                "Words",
                "Topics")

    return (corpus, word_documents)
Example #2
0
def run(M=30, D=5):
    """
    Fit a Dirichlet-categorical model to partially observed random labels.

    Parameters
    ----------

    M : int
        The number of categorical variables (length of the data vector)

    D : int
        The number of categories

    The fitted nodes are displayed with their ``show`` methods; nothing is
    returned.
    """

    # Random category labels in range(D), one per variable
    labels = np.random.randint(D, size=(M, ))

    # Model: M categorical variables sharing one Dirichlet probability vector
    probabilities = nodes.Dirichlet(np.ones(D), name='p')
    categories = nodes.Categorical(probabilities, plates=(M, ), name='z')

    # Only the entries selected by a random mask are treated as observed
    observed = random.mask(M, p=0.5)
    categories.observe(labels, mask=observed)

    # One round of variational Bayesian updates
    inference = VB(probabilities, categories)
    inference.update()

    # Display the results
    categories.show()
    probabilities.show()
Example #3
0
# Script: fit a Dirichlet-categorical model to N synthetic categorical samples
# with variational Bayes and print a density value and a posterior sample.
print("++++++++++++++++++++++++++")

# Synthetic data: N draws from {0, 1, 2} with probabilities 0.3 / 0.6 / 0.1
N = 10000
y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1])


# Parameter vector handed to the Dirichlet prior below
a0 = [0.5, 0.1, 0.1]

# Only referenced by the commented-out Gaussian experiment below
mu0 = -1
lambda0 = 5



#MU = bayes.Gaussian(mu=mu0, Lambda=0.9)
#X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, ))
# Model: N categorical variables sharing one Dirichlet-distributed
# probability vector
P = bayes.Dirichlet(a0)
X = bayes.Categorical(P, plates=(N, ))

#P.initialize_from_random()

Q = VB(X, P)

# Attach the data, then run the VB updates (repeat=1000)
X.observe(y)
Q.update(repeat=1000)


# NOTE(review): the argument has 10 entries whereas X was created with
# plates=(N, ) -- confirm this is the intended call.
print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
# Draw from P after the update
print(P.random())
#print(np.sum(y==2))
Example #4
0
    def create_model(self, model_type=None):
        """
        Build the location model, fit it with VB and return learned parameters.

        One ``nodes.Categorical`` node is created per hour ("timezone"); its
        plates are ``(self.N_OBSERVATIONS[hour], 1)`` and it is observed with
        the ``'location'`` column of the rows of ``self._observed_locations``
        whose ``'time'`` equals that hour (hours without rows stay
        unobserved).

        Parameters
        ----------
        model_type : str
            ``'all'``   -- a single Dirichlet node with one plate per hour
            (``self.N_TIMEZONES`` plates) sharing one concentration node.
            ``'2fold'`` -- two Dirichlet nodes with separate concentration
            nodes: one for hours 6-18 ("morning") and one for hours 0-5 and
            19-23 ("night").
            ``'cross'`` -- not implemented.

        Returns
        -------
        numpy.ndarray
            For ``'all'``: ``p_theta.get_parameters()`` reshaped to
            ``(self.N_TIMEZONES, self.N_LOCATIONS)``.
            For ``'2fold'``: the night and morning parameters, each reshaped
            to ``(n_hours, self.N_LOCATIONS)`` and stacked row-wise in hour
            order (night 0-5, morning 6-18, night 19-23).

        Raises
        ------
        NotImplementedError
            If ``model_type`` is ``'cross'``.
        ValueError
            If ``model_type`` is none of the values above (including the
            default ``None``).
        """

        # Reject unsupported types up front.  (Raising a plain string, as was
        # done previously, is itself a TypeError in Python 3.)
        if model_type == 'cross':
            raise NotImplementedError('Not Implemented')
        if model_type not in ('all', '2fold'):
            raise ValueError('no model_type selected')

        def hourly_models(theta, hours):
            # One Categorical node per entry of `hours`; the i-th node uses
            # the i-th plate of `theta` and is named after its hour.
            built = []
            for row, hour in enumerate(hours):
                n_obs = self.N_OBSERVATIONS[hour]
                node = nodes.Categorical(theta[row],
                                         plates=(n_obs, 1),
                                         name=str(hour))

                # Observe the data recorded for this hour, if there is any
                in_hour = self._observed_locations['time'] == hour
                observations = self._observed_locations[in_hour]
                if not observations.empty:
                    # `.values` replaces `as_matrix()`, which was removed in
                    # pandas 1.0
                    data = observations['location'].values.reshape((n_obs, 1))
                    node.observe(data)

                built.append(node)
            return built

        if model_type == 'all':
            p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_theta = nodes.Dirichlet(p_conc,
                                      plates=(self.N_TIMEZONES,),
                                      name='p_theta')

            location_model = hourly_models(p_theta,
                                           np.arange(self.N_TIMEZONES))

            # Pass every hourly node, however many there are, instead of
            # spelling out indices 0..23 by hand
            Q = VB(*(location_model + [p_theta, p_conc]))

        else:  # '2fold'
            p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)

            p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

            early_night = np.arange(0, 6)
            morning_time = np.arange(6, 19)
            late_night = np.arange(19, 24)
            night_time = np.append(early_night, late_night)

            p_theta_morning = nodes.Dirichlet(p_conc_morning,
                                              plates=(morning_time.size,),
                                              name='p_theta_morning')
            p_theta_night = nodes.Dirichlet(p_conc_night,
                                            plates=(night_time.size,),
                                            name='p_theta_night')

            # Morning nodes first, then night nodes
            location_model = (hourly_models(p_theta_morning, morning_time)
                              + hourly_models(p_theta_night, night_time))

            Q = VB(*(location_model + [p_theta_morning, p_theta_night,
                                       p_conc_morning, p_conc_night]))

        print ("models created")

        ####################################################################################
        #Learning parameters
        Q.update(repeat=1000)
        print ('learned params')
        ####################################################################################

        if model_type == 'all':
            return np.array(p_theta.get_parameters()).reshape(
                (self.N_TIMEZONES, self.N_LOCATIONS))

        learned_night = np.array(p_theta_night.get_parameters()).reshape(
            (night_time.size, self.N_LOCATIONS))
        learned_morn = np.array(p_theta_morning.get_parameters()).reshape(
            (morning_time.size, self.N_LOCATIONS))

        # Re-interleave the rows into hour order: the first `early_night.size`
        # night rows precede the morning rows, the remaining ones follow.
        # (np.vstack is the non-deprecated equivalent of np.row_stack.)
        split = early_night.size
        return np.vstack((learned_night[:split, :],
                          learned_morn,
                          learned_night[split:, :]))
Example #5
0
def model(n_documents,
          n_topics,
          n_vocabulary,
          corpus,
          word_documents,
          plates_multiplier=1):
    """
    Construct Latent Dirichlet Allocation model.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus

    plates_multiplier : number, optional
        Handed to the ``topics`` node as ``plates_multiplier=(value, )``.
        Defaults to 1.

    Returns
    -------

    VB
        Inference engine built from the ``words``, ``topics``, ``p_word``,
        ``p_topic`` and ``word_documents`` nodes, with the corpus already
        observed and both Dirichlet nodes randomly initialized.
    """

    # Per-document topic distributions
    document_topics = nodes.Dirichlet(np.ones(n_topics),
                                      plates=(n_documents, ),
                                      name='p_topic')

    # Per-topic word distributions
    topic_words = nodes.Dirichlet(np.ones(n_vocabulary),
                                  plates=(n_topics, ),
                                  name='p_word')

    # The document indices are wrapped in a constant node so that their value
    # can be replaced later, e.g., when using stochastic variational inference
    document_index = Constant(CategoricalMoments(n_documents),
                              word_documents,
                              name='word_documents')

    # Topic of each word: gated by the document the word belongs to
    topic_prior = nodes.Gate(document_index, document_topics)
    n_words = len(corpus)
    topics = nodes.Categorical(topic_prior,
                               plates=(n_words, ),
                               plates_multiplier=(plates_multiplier, ),
                               name='topics')

    # Each word: gated by the topic chosen for it
    word_prior = nodes.Gate(topics, topic_words)
    words = nodes.Categorical(word_prior, name='words')

    # The corpus is the observed data
    words.observe(corpus)

    # Random initialization breaks the symmetry between topics
    for dirichlet in (document_topics, topic_words):
        dirichlet.initialize_from_random()

    return VB(words, topics, topic_words, document_topics, document_index)
Example #6
0
    # print()
    # print(subsets[subset])
    Q['X'].observe([y[inx] for inx in subset])
    # Learn intermediate variables
    Q.update('Z')
    #  Set step length
    step = (iter + delay) ** (-forgetting_rate)
    # Stochastic gradient for the global variables
    Q.gradient_step('p', 'T', 'E', scale=step)

'''

# NOTE(review): Q, p, T and E are defined before the visible part of this
# script -- presumably a VB engine and three of its nodes; confirm.

# Draw one sample from the node registered as 'E' in Q
likelihood = Q['E'].random()

# Draw one sample from each of the nodes p, T and E
qp = p.random()
qT = T.random()
qE = E.random()

#print(qT)
#print(qE)

# Small stand-alone check: a categorical variable with a Dirichlet parent
d = bayes.Dirichlet([0.3, 0.7])
n = bayes.Categorical(d)
# Inspect the first parent of n and its moments
print(n.parents[0])
print(n.parents[0].get_moments())
# Exponential of the first moment
# (presumably the expected log-probabilities -- TODO confirm)
f = n.parents[0].get_moments()[0]
print(np.exp(f))
print(n)
print(n.pdf([0, 1]))
print(E)