Example #1
import numpy as np
from bayespy import nodes
import bayespy.plot as bpplt

def generate_data(n_documents, n_topics, n_vocabulary, n_words):

    # Generate random data from the generative model

    # Generate document assignments for the words
    word_documents = nodes.Categorical(np.ones(n_documents) / n_documents,
                                       plates=(n_words, )).random()

    # Topic distribution for each document
    p_topic = nodes.Dirichlet(1e-1 * np.ones(n_topics),
                              plates=(n_documents, )).random()

    # Word distribution for each topic
    p_word = nodes.Dirichlet(1e-1 * np.ones(n_vocabulary),
                             plates=(n_topics, )).random()

    # Topic for each word in each document
    topic = nodes.Categorical(p_topic[word_documents],
                              plates=(n_words, )).random()

    # Each word in each document
    corpus = nodes.Categorical(p_word[topic], plates=(n_words, )).random()

    bpplt.pyplot.figure()
    bpplt.hinton(p_topic)
    bpplt.pyplot.title("True topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(p_word)
    bpplt.pyplot.title("True word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return (corpus, word_documents)
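A minimal usage sketch; the sizes below are illustrative assumptions, not from the original:

corpus, word_documents = generate_data(n_documents=10, n_topics=4,
                                       n_vocabulary=100, n_words=1000)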
Example #2
import numpy as np
from bayespy import nodes
from bayespy.inference import VB
from bayespy.utils import random

def run(M=30, D=5):

    # Generate data
    y = np.random.randint(D, size=(M, ))

    # Construct model
    p = nodes.Dirichlet(1 * np.ones(D), name='p')
    z = nodes.Categorical(p, plates=(M, ), name='z')

    # Observe the data with randomly missing values
    mask = random.mask(M, p=0.5)
    z.observe(y, mask=mask)

    # Run VB-EM
    Q = VB(p, z)
    Q.update()

    # Show results
    z.show()
    p.show()
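A quick way to exercise this example with its defaults:

run(M=30, D=5)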
print("++++++++++++++++++++++++++")

N = 10000
y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1])


a0 = [0.5, 0.1, 0.1]

mu0 = -1
lambda0 = 5



#MU = bayes.Gaussian(mu=mu0, Lambda=0.9)
#X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, ))
P = bayes.Dirichlet(a0)
X = bayes.Categorical(P, plates=(N, ))

#P.initialize_from_random()

Q = VB(X, P)

X.observe(y)
Q.update(repeat=1000)


print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
print(P.random())
#print(np.sum(y==2))
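As a quick sanity check (a sketch, not in the original), the empirical class frequencies of the generated data should be close to the true mixing proportions [0.3, 0.6, 0.1] that the posterior over P is recovering:

# Compare empirical frequencies against the true mixing proportions
print(np.bincount(y, minlength=3) / N)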
Example #4
    def create_model(self, model_type=None):

        #Create a location model for each timezone
        location_model = []

        if ('all' == model_type):
            p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_theta = nodes.Dirichlet(p_conc,
                                      plates = (self.N_TIMEZONES,),
                                      name = 'p_theta')
            for time in np.arange(self.N_TIMEZONES):
                model = nodes.Categorical(p_theta[time],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))
                    model.observe(data)

                location_model.append(model)


            Q = VB(*location_model, p_theta, p_conc)

        elif ('cross' == model_type):
            raise NotImplementedError("'cross' model type is not implemented")
        elif ('2fold' == model_type):
            p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
            p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)

            p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
            p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

            morning_time = np.arange(6,19)
            night_time = np.append(np.arange(0,6) , np.arange(19,24))

            p_theta_morning = nodes.Dirichlet(p_conc_morning,
                                      plates = (morning_time.size,),
                                      name = 'p_theta_morning')
            p_theta_night = nodes.Dirichlet(p_conc_night,
                                      plates = (night_time.size,),
                                      name = 'p_theta_night')


            #Combining the morning-time models
            for count, time in enumerate(morning_time):
                model = nodes.Categorical(p_theta_morning[count],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]
                #print(timezone_observations)

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))
                    model.observe(data)

                location_model.append(model)

            #Combining the night-time models
            for count, time in enumerate(night_time):
                model = nodes.Categorical(p_theta_night[count],
                                        plates=(self.N_OBSERVATIONS[time],1),
                                        name=str(time))

                #observe data
                timezone_observations = self._observed_locations[self._observed_locations['time'] == time]

                if not timezone_observations.empty:
                    data = timezone_observations['location'].to_numpy().reshape((self.N_OBSERVATIONS[time],1))
                    model.observe(data)

                location_model.append(model)

            Q = VB(*location_model, p_theta_morning, p_theta_night,
                    p_conc_morning, p_conc_night)
        else:
            raise ValueError('no model_type selected')

        print ("models created")

        ####################################################################################
        #Learning parameters
        Q.update(repeat=1000)
        print('learned params')
        ####################################################################################
        
        if ('all' == model_type):
            return np.array(p_theta.get_parameters()).reshape((self.N_TIMEZONES,self.N_LOCATIONS))
        elif ('2fold' == model_type):
            learned_night = np.array(p_theta_night.get_parameters()).reshape((night_time.size, self.N_LOCATIONS))
            learned_morn = np.array(p_theta_morning.get_parameters()).reshape((morning_time.size, self.N_LOCATIONS))
            return np.vstack((learned_night[:6,:], learned_morn, learned_night[6:,:]))
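A hedged usage sketch; the owner class is not shown in this example, so the constructor and attribute layout below are assumptions:

# learner = LocationModel(...)  # hypothetical owner class providing N_LOCATIONS,
#                               # N_TIMEZONES, N_OBSERVATIONS and _observed_locations
#                               # (a DataFrame with 'time' and 'location' columns)
# theta_all = learner.create_model(model_type='all')      # one Dirichlet per timezone
# theta_2fold = learner.create_model(model_type='2fold')  # shared morning/night priors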
Example #5
import os
import time

import numpy as np
from hmmlearn import hmm
from smart_open import smart_open
from bayespy import nodes as bayes
from bayespy.inference import VB

# GIBBSLDA_PATH and TEMP_FOLDER are module-level constants in the original
# project: the path of the GibbsLDA++ binary and a scratch directory.

def get_node_distr_over_comm(g, walks, method=None, params={}):

    if method == "HMM_param":

        seqs = []
        lens = []
        for walk in walks:
            s = [[int(w)] for w in walk]
            seqs.extend(s)
            lens.append(len(s))

        model = hmm.MultinomialHMM(n_components=params['number_of_topics'],
                                   tol=0.001,
                                   n_iter=5000)
        model.fit(seqs, lens)

        #posteriors = model.predict_proba(np.asarray([[i] for i in range(self.g.number_of_nodes())]))
        #comms = np.argmax(posteriors, 1)

        likelihood = model.emissionprob_
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """

    elif method == "Nonparam_HMM":

        seqs = []
        lens = []
        for walk in walks:
            s = [int(w) for w in walk]
            seqs.append(s)
            lens.append(len(s))

        seqs = np.vstack(seqs)

        K = params['number_of_topics']  # the number of hidden states
        O = g.number_of_nodes()  # the size of observation set
        L = len(seqs[0])  # the length of each sequence
        N = len(seqs)  # the number of sequences

        p0 = params['prior_p0']  # scalar concentration of the initial-state prior
        t0 = params['prior_t0']  # scalar concentration of the transition prior
        e0 = params['prior_e0']  # scalar concentration of the emission prior

        p = bayes.Dirichlet(p0 * np.ones(K), name='p')

        T = bayes.Dirichlet(t0 * np.ones(K), plates=(K, ), name='T')

        E = bayes.Dirichlet(e0 * np.ones(O), plates=(K, ), name='E')

        Z = bayes.CategoricalMarkovChain(p,
                                         T,
                                         states=L,
                                         name='Z',
                                         plates=(N, ))

        # Emission/observation distribution
        X = bayes.Mixture(Z, bayes.Categorical, E, name='X')

        p.initialize_from_random()
        T.initialize_from_random()
        E.initialize_from_random()

        Q = VB(X, Z, p, T, E)

        Q['X'].observe(seqs)
        Q.update(repeat=1000)

        likelihood = Q['E'].random()
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """

        return likelihood

    elif method == "LDA":

        # Run GibbsLDA++
        if not os.path.exists(GIBBSLDA_PATH):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = os.path.join(TEMP_FOLDER, "lda_temp")
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")
        # Save the walks into the dfile
        n = len(walks)
        with open(temp_dfile_path, 'w') as f:
            f.write("{}\n".format(n))
            for walk in walks:
                f.write("{}\n".format(" ".join(str(w) for w in walk)))

        initial_time = time.time()
        cmd = "{} -est ".format(GIBBSLDA_PATH)
        cmd += "-alpha {} ".format(params['lda_alpha'])
        cmd += "-beta {} ".format(params['lda_beta'])
        cmd += "-ntopics {} ".format(params['number_of_topics'])
        cmd += "-niters {} ".format(params['lda_number_of_iters'])
        cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)
        os.system(cmd)

        print("-> The LDA algorithm run in {:.2f} secs".format(time.time() -
                                                               initial_time))

        # Read wordmap file
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f.readlines():
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read phi file
        num_of_nodes = len(id2node)
        phi = np.zeros(shape=(params['number_of_topics'], num_of_nodes),
                       dtype=float)
        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for comm, line in enumerate(f.readlines()):
                for id, value in enumerate(line.strip().split()):
                    phi[comm, int(id2node[id])] = value

        # Read the tassign file, generate topic corpus
        temp_tassing_path = os.path.join(temp_lda_folder,
                                         "model-final.tassign")
        comm_corpus = []
        with smart_open(temp_tassing_path, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                comm_corpus.append([token.split(':')[1] for token in tokens])
        """
        max_topics = np.argmax(phi, axis=0)

        node2comm = {}
        for nodeId in id2node:
            node2comm[id2node[nodeId]] = max_topics[int(nodeId)]

        return node2comm
        """

        return phi, comm_corpus
    else:
        raise ValueError("Unknown method: {}".format(method))
Example #6
import numpy as np
from bayespy import nodes
from bayespy.nodes import Constant
from bayespy.inference import VB
from bayespy.inference.vmp.nodes.categorical import CategoricalMoments

def model(n_documents,
          n_topics,
          n_vocabulary,
          corpus,
          word_documents,
          plates_multiplier=1):
    '''
    Construct Latent Dirichlet Allocation model.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus
    '''

    # Topic distributions for each document
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents, ),
                              name='p_topic')

    # Word distributions for each topic
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics, ),
                             name='p_word')

    # Use a simple wrapper node so that its value can be changed if one
    # uses stochastic variational inference
    word_documents = Constant(CategoricalMoments(n_documents),
                              word_documents,
                              name='word_documents')

    # Choose a topic for each word in the corpus
    topics = nodes.Categorical(nodes.Gate(word_documents, p_topic),
                               plates=(len(corpus), ),
                               plates_multiplier=(plates_multiplier, ),
                               name='topics')

    # Choose each word in the corpus from the vocabulary
    words = nodes.Categorical(nodes.Gate(topics, p_word), name='words')

    # Observe the corpus
    words.observe(corpus)

    # Break symmetry by random initialization
    p_topic.initialize_from_random()
    p_word.initialize_from_random()

    return VB(words, topics, p_word, p_topic, word_documents)
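A hedged end-to-end sketch, reusing generate_data from Example #1 (the sizes are illustrative):

# corpus, word_documents = generate_data(10, 4, 100, 1000)  # see Example #1
# Q = model(10, 4, 100, corpus, word_documents)
# Q.update(repeat=1000)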
Example #7
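This snippet begins mid-script; a minimal setup sketch for the definitions it assumes (the sizes and the ground-truth matrices A and O below are illustrative assumptions, not from the original):

import numpy as np
from bayespy import nodes as bayes
from bayespy.inference import VB

# Assumed setup, not part of the original snippet:
N = 100  # number of sequences
L = 50   # length of each sequence
K = 3    # number of hidden states
E = 10   # size of the observation set
A = np.random.dirichlet(np.ones(K), size=K)  # true transition matrix (K x K)
O = np.random.dirichlet(np.ones(E), size=K)  # true emission matrix (K x E)
y = [[] for _ in range(N)]
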
for n in range(N):
    s = np.random.choice(a=K, size=1)[0]
    for l in range(L):
        o = np.random.choice(a=E, size=1, p=O[s, :])[0]
        s = np.random.choice(a=K, size=1, p=A[s, :])[0]
        y[n].append(o)

#L = len(y)

p0 = 0.3  # scalar concentration of the initial-state prior
t0 = 0.2  # scalar concentration of the transition prior
e0 = 0.1  # scalar concentration of the emission prior

p_param = p0 * np.ones(K, dtype=float)
p = bayes.Dirichlet(p_param, name='p')

t_param = t0 * np.ones(K, dtype=float)
T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

e_param = e0 * np.ones(E, dtype=float)
E = bayes.Dirichlet(e_param, plates=(K, ), name='E')  # note: rebinds E from an int to a node

z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(N, ), name='Z')
x = bayes.Mixture(z, bayes.Categorical, E, plates=(N, L), name='X')

p.initialize_from_random()
T.initialize_from_random()
E.initialize_from_random()

Q = VB(x, z, E, T, p)
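The snippet ends before the data are observed; a sketch of the remaining steps, following the pattern in Example #5:

# x.observe(np.asarray(y))
# Q.update(repeat=1000)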