Example no. 1
                         outputs=[s, P, ll],
                         mode=theano.Mode(optimizer="unsafe"))

    s, P, ll = kf(Y, 2 * np.ones(m))

    import pymc3 as pm

    with pm.Model() as model:
        # Phi, Q, L, c, H, Sv, d, s0, P0, n, m, g

        phi = pm.Normal("phi", shape=(1, 1))
        q = pm.HalfStudentT("q", nu=1.0, sd=2.0, shape=(1, 1))
        K = KalmanFilter("kf",
                         phi,
                         q,
                         np.array([[1.]]),
                         np.array([0.]),
                         np.array([[1.]]),
                         np.array([[0.0]]),
                         np.array([0.]),
                         np.array([0.]),
                         np.array([[10.]]),
                         1,
                         1,
                         1,
                         observed=y)

    with model:
        approx = pm.fit(n=100, method="advi")
        trace = pm.sample_approx(approx, draws=500)
    def train_pymc3(docs_te, docs_tr, n_samples_te, n_samples_tr, n_words,
                    n_topics, n_tokens):
        """
        Fit LDA to the processed documents with PyMC3 (ADVI with an encoder, i.e. AEVB).

        Parameters
        ----------
        docs_te: testing documents (processed)
        docs_tr: training documents (processed)
        n_samples_te: number of testing docs
        n_samples_tr: number of training docs
        n_words: size of the vocabulary
        n_topics: number of topics to learn
        n_tokens: number of non-zero entries in the processed training term-frequency matrix

        Returns
        -------
        PyMC3 LDA results
        """

        # Log-likelihood of documents for LDA
        def logp_lda_doc(beta, theta):
            """
            Returns the log-likelihood function for given documents.

            K : number of topics in the model
            V : number of words (size of vocabulary)
            D : number of documents (in a mini-batch)

            Parameters
            ----------
            beta : tensor (K x V)
              Word distribution.
            theta : tensor (D x K)
              Topic distributions for the documents.
            """
            def ll_docs_f(docs):
                dixs, vixs = docs.nonzero()
                vfreqs = docs[dixs, vixs]
                ll_docs = vfreqs * pmmath.logsumexp(
                    tt.log(theta[dixs]) + tt.log(beta.T[vixs]),
                    axis=1).ravel()

                # Per-word log-likelihood times no. of tokens in the whole dataset
                return tt.sum(ll_docs) / (tt.sum(vfreqs) + 1e-9) * n_tokens

            return ll_docs_f
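
        # A note on the likelihood above (restating the code, not new functionality):
        # for each non-zero entry (d, v) of the term-frequency matrix, the LDA token
        # likelihood is
        #   log p(w = v | d) = log sum_k theta[d, k] * beta[k, v]
        #                    = logsumexp_k(log theta[d, k] + log beta[k, v]),
        # which is what the logsumexp over axis=1 computes. Weighting by the term
        # frequencies and dividing by their sum gives a per-token average, which is
        # then rescaled to the whole dataset through n_tokens.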

        # Fit the PyMC3 LDA model.

        # The dataset is sparse; it's better to use a dense minibatch so that all words occur in it.
        minibatch_size = 128

        # defining minibatch
        doc_t_minibatch = pm.Minibatch(docs_tr.toarray(), minibatch_size)
        doc_t = shared(docs_tr.toarray()[:minibatch_size])

        with pm.Model() as model:
            theta = Dirichlet(
                'theta',
                a=pm.floatX((1.0 / n_topics) * np.ones(
                    (minibatch_size, n_topics))),
                shape=(minibatch_size, n_topics),
                transform=t_stick_breaking(1e-9),
                # do not forget scaling
                total_size=n_samples_tr)
            beta = Dirichlet('beta',
                             a=pm.floatX((1.0 / n_topics) * np.ones(
                                 (n_topics, n_words))),
                             shape=(n_topics, n_words),
                             transform=t_stick_breaking(1e-9))
            # Note that the likelihood is already defined with scaling (via n_tokens),
            # so no additional `total_size` kwarg is needed here
            doc = pm.DensityDist('doc',
                                 logp_lda_doc(beta, theta),
                                 observed=doc_t)

        # Encoder
        class LDAEncoder:
            """Encode (term-frequency) document vectors to variational means and (log-transformed) stds.
            """
            def __init__(self,
                         n_words,
                         n_hidden,
                         n_topics,
                         p_corruption=0,
                         random_seed=1):
                rng = np.random.RandomState(random_seed)
                self.n_words = n_words
                self.n_hidden = n_hidden
                self.n_topics = n_topics
                self.w0 = shared(0.01 * rng.randn(n_words, n_hidden).ravel(),
                                 name='w0')
                self.b0 = shared(0.01 * rng.randn(n_hidden), name='b0')
                self.w1 = shared(0.01 * rng.randn(n_hidden, 2 *
                                                  (n_topics - 1)).ravel(),
                                 name='w1')
                self.b1 = shared(0.01 * rng.randn(2 * (n_topics - 1)),
                                 name='b1')
                self.rng = MRG_RandomStreams(seed=random_seed)
                self.p_corruption = p_corruption

            def encode(self, xs):
                if 0 < self.p_corruption:
                    dixs, vixs = xs.nonzero()
                    mask = tt.set_subtensor(
                        tt.zeros_like(xs)[dixs, vixs],
                        self.rng.binomial(size=dixs.shape,
                                          n=1,
                                          p=1 - self.p_corruption))
                    xs_ = xs * mask
                else:
                    xs_ = xs

                w0 = self.w0.reshape((self.n_words, self.n_hidden))
                w1 = self.w1.reshape((self.n_hidden, 2 * (self.n_topics - 1)))
                hs = tt.tanh(xs_.dot(w0) + self.b0)
                zs = hs.dot(w1) + self.b1
                zs_mean = zs[:, :(self.n_topics - 1)]
                zs_rho = zs[:, (self.n_topics - 1):]
                return {'mu': zs_mean, 'rho': zs_rho}

            def get_params(self):
                return [self.w0, self.b0, self.w1, self.b1]

        # Instantiate the encoder

        encoder = LDAEncoder(n_words=n_words,
                             n_hidden=100,
                             n_topics=n_topics,
                             p_corruption=0.0)
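
        # Because theta uses the stick-breaking transform, its unconstrained
        # representation has n_topics - 1 free dimensions per document, so the
        # encoder emits 2 * (n_topics - 1) values per document: a variational
        # mean ('mu') and a 'rho' parameter that determines the standard deviation.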
        local_RVs = OrderedDict([(theta, encoder.encode(doc_t))])

        # get parameters
        encoder_params = encoder.get_params()

        # Train the PyMC3 model
        η = .1
        s = shared(η)

        def reduce_rate(a, h, i):
            s.set_value(η / ((i / minibatch_size) + 1)**.7)
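
        # The callback above decays the shared learning rate at every iteration i as
        #   s = eta * ((i / minibatch_size) + 1) ** (-0.7),
        # a Robbins-Monro style schedule (decay exponent in (0.5, 1]), so the SGD
        # step size shrinks as training proceeds.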

        with model:
            approx = pm.MeanField(local_rv=local_RVs)
            approx.scale_cost_to_minibatch = False
            inference = pm.KLqp(approx)
        inference.fit(10000,
                      callbacks=[reduce_rate],
                      obj_optimizer=pm.sgd(learning_rate=s),
                      more_obj_params=encoder_params,
                      total_grad_norm_constraint=200,
                      more_replacements={doc_t: doc_t_minibatch})

        # Extracting characteristic words
        doc_t.set_value(docs_tr.toarray())
        samples = pm.sample_approx(approx, draws=100)
        beta_pymc3 = samples['beta'].mean(axis=0)

        # Predictive distribution
        def calc_pp(ws, thetas, beta, wix):
            """
            Parameters
            ----------
            ws: ndarray (N,)
                Number of times the held-out word appeared in N documents.
            thetas: ndarray, shape=(N, K)
                Topic distributions for N documents.
            beta: ndarray, shape=(K, V)
                Word distributions for K topics.
            wix: int
                Index of the held-out word

            Returns
            -------
            Log probability of held-out words.
            """
            return ws * np.log(thetas.dot(beta[:, wix]))
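
        # calc_pp restates the mixture predictive probability: for document d,
        #   p(w = wix | d) = sum_k thetas[d, k] * beta[k, wix],
        # so ws * np.log(thetas.dot(beta[:, wix])) is the log probability weighted
        # by the number of held-out occurrences of word wix in each document.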

        def eval_lda(transform, beta, docs_te, wixs):
            """Evaluate LDA model by log predictive probability.

            Parameters
            ----------
            transform: Python function
                Transform document vectors to posterior mean of topic proportions.
            wixs: iterable of int
                Word indices to be held-out.
            """
            lpss = []
            docs_ = deepcopy(docs_te)
            thetass = []
            wss = []
            total_words = 0
            for wix in wixs:
                ws = docs_te[:, wix].ravel()
                if 0 < ws.sum():
                    # Hold-out
                    docs_[:, wix] = 0

                    # Topic distributions
                    thetas = transform(docs_)

                    # Predictive log probability
                    lpss.append(calc_pp(ws, thetas, beta, wix))

                    docs_[:, wix] = ws
                    thetass.append(thetas)
                    wss.append(ws)
                    total_words += ws.sum()
                else:
                    thetass.append(None)
                    wss.append(None)

            # Log-probability
            lp = np.sum(np.hstack(lpss)) / total_words

            return {'lp': lp, 'thetass': thetass, 'beta': beta, 'wss': wss}

        inp = tt.matrix(dtype='int64')
        sample_vi_theta = theano.function([inp],
                                          approx.sample_node(
                                              approx.model.theta,
                                              100,
                                              more_replacements={
                                                  doc_t: inp
                                              }).mean(0))

        def transform_pymc3(docs):
            return sample_vi_theta(docs)

        result_pymc3 = eval_lda(transform_pymc3, beta_pymc3, docs_te.toarray(),
                                np.arange(100))
        print('Predictive log prob (pm3) = {}'.format(result_pymc3['lp']))

        return result_pymc3
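
# Hypothetical usage sketch for train_pymc3 (not part of the original source).
# Assumptions: `train_pymc3` is available at module scope, and the documents are
# vectorized with scikit-learn's CountVectorizer into sparse term-frequency
# matrices; the corpus, split sizes, and n_topics below are illustrative choices.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

texts = fetch_20newsgroups(remove=("headers", "footers", "quotes")).data
vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                             max_features=1000, stop_words="english")
docs = vectorizer.fit_transform(texts)
docs_tr, docs_te = docs[:10000], docs[10000:11000]          # train/test split
result = train_pymc3(docs_te, docs_tr,
                     n_samples_te=docs_te.shape[0],
                     n_samples_tr=docs_tr.shape[0],
                     n_words=docs_tr.shape[1],
                     n_topics=10,
                     n_tokens=docs_tr.nnz)                   # non-zeros in the tf matrix
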
def variational_inference(X_train, Y_train, X_test, Y_test, m, k):
    import numpy as np
    import pymc3 as pm
    from sklearn.preprocessing import MinMaxScaler
    import theano
    import matplotlib.pyplot as plt
    import random

    # Prepare the data and rescale it
    n, p = np.shape(X_train)
    Y_train = np.reshape(Y_train, (len(Y_train), 1))
    Y_test = np.reshape(Y_test, (len(Y_test), 1))
    scaler_x = MinMaxScaler(feature_range=(-1, 1))
    X_train = scaler_x.fit_transform(X_train)
    X_test = scaler_x.transform(X_test)
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    Y_train = scaler_y.fit_transform(Y_train)
    Y_test = scaler_y.transform(Y_test)
    X_train = theano.shared(X_train)
    # Add noise (left commented out)
    #sigma=0.1
    #rd_num=int(sigma*len(Y_train))
    #rd=random.sample(range(len(Y_train)),rd_num)
    #sm=np.random.uniform(-0.1,0,size=rd_num)
    #Y_train=np.ravel(Y_train)
    #Y_train[rd]=sm

    # Define the model
    basic_model = pm.Model()
    with basic_model:
        b = pm.Normal('b', mu=0, tau=1)
        A = pm.Normal('A', mu=0, tau=1, shape=(p, m))
        gamma_0 = pm.Gamma('gamma_0', alpha=10**(-5), beta=10**(-5))
        gamma_1 = pm.Gamma('gamma_1', alpha=10**(-5), beta=10**(-5))
        beta = pm.Normal('beta', mu=0, tau=gamma_0, shape=(m, 1))
        Y_obs = pm.Normal('Y_obs',
                          mu=sigmoid_kernel(X_train, beta, A, b),
                          tau=gamma_1,
                          observed=Y_train)
        start = pm.find_MAP()
        #approx=pm.fit(k,start=start,obj_optimizer=pm.adam(),callbacks=[tracker])
        approx = pm.fit(k, start=start, obj_optimizer=pm.adam())
        # Sample the parameters z = {beta, A, b, gamma_0, gamma_1} from the fitted approximation
        trace = pm.sample_approx(approx=approx, draws=5000)
        #pm.traceplot(trace)
        #pm.summary(trace)
        # Take the mean of the 5000 posterior predictive draws as the final result.
        post_pred = pm.sample_ppc(trace, samples=5000, model=basic_model)
        y_train_pred = np.mean(post_pred['Y_obs'], axis=0)
        # Compare the predictions with the actual values.
        mse_train = (((y_train_pred - Y_train)**2).sum()) / np.size(Y_train, 0)
        X_train.set_value(X_test)
        post_pred = pm.sample_ppc(trace, samples=5000, model=basic_model)
        y_test_pred = np.mean(post_pred['Y_obs'], axis=0)
        mse_test = (((y_test_pred - Y_test)**2).sum()) / np.size(Y_test, 0)
    Y_mean = np.ones_like(Y_test) * np.mean(Y_test)
    r2 = 1 - (((y_test_pred - Y_test)**2).sum()) / (
        ((Y_test - Y_mean)**2).sum())
    n = len(Y_test)
    err = Y_test - y_test_pred
    err_mean = np.ones_like(err) * np.mean(err)
    err_var = (((err - err_mean)**2).sum()) / (n - 1)
    y_var = (((Y_test - Y_mean)**2).sum()) / (n - 1)
    Evar = 1 - err_var / y_var
    #print('mse_train=',mse_train,'\n mse_test=',mse_test,'\n r2=',r2,'\n Evar=',Evar,'\n m=',m)
    return mse_train, mse_test, r2, Evar, m
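
# Hypothetical usage sketch for variational_inference (not part of the original
# source). `sigmoid_kernel` is referenced but not defined in the snippet above;
# the single-hidden-layer form below is an assumption (it must live in the same
# module as variational_inference), as are the synthetic data and the choices
# m=10, k=20000.
import numpy as np
import pymc3 as pm

def sigmoid_kernel(X, beta, A, b):
    # assumed form: project the inputs through A, squash with a sigmoid,
    # then combine the hidden units linearly with beta
    return pm.math.sigmoid(pm.math.dot(X, A) + b).dot(beta)

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = np.tanh(X.sum(axis=1)) + 0.1 * rng.randn(200)
mse_train, mse_test, r2, evar, m = variational_inference(
    X[:150], y[:150], X[150:], y[150:], m=10, k=20000)
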
Example no. 4
def run_lda(args):
    tf_vectorizer, docs_tr, docs_te = prepare_sparse_matrix_nonlabel(args.n_tr, args.n_te, args.n_word)
    feature_names = tf_vectorizer.get_feature_names()
    doc_tr_minibatch = pm.Minibatch(docs_tr.toarray(), args.bsz)
    doc_tr = shared(docs_tr.toarray()[:args.bsz])

    def log_prob(beta, theta):
        """Returns the log-likelihood function for given documents.

        K : number of topics in the model
        V : number of words (size of vocabulary)
        D : number of documents (in a mini-batch)

        Parameters
        ----------
        beta : tensor (K x V)
            Word distributions.
        theta : tensor (D x K)
            Topic distributions for documents.
        """

        def ll_docs_f(docs):
            dixs, vixs = docs.nonzero()
            vfreqs = docs[dixs, vixs]
            ll_docs = (vfreqs * pmmath.logsumexp(tt.log(theta[dixs]) + tt.log(beta.T[vixs]),
                                                 axis=1).ravel())

            return tt.sum(ll_docs) / (tt.sum(vfreqs) + 1e-9)

        return ll_docs_f

    with pm.Model() as model:
        beta = Dirichlet("beta",
                         a=pm.floatX((1. / args.n_topic) * np.ones((args.n_topic, args.n_word))),
                         shape=(args.n_topic, args.n_word), )

        theta = Dirichlet("theta",
                          a=pm.floatX((10. / args.n_topic) * np.ones((args.bsz, args.n_topic))),
                          shape=(args.bsz, args.n_topic), total_size=args.n_tr, )

        doc = pm.DensityDist("doc", log_prob(beta, theta), observed=doc_tr)

    encoder = ThetaEncoder(n_words=args.n_word, n_hidden=100, n_topics=args.n_topic)
    local_RVs = OrderedDict([(theta, encoder.encode(doc_tr))])
    encoder_params = encoder.get_params()

    s = shared(args.lr)

    def reduce_rate(a, h, i):
        s.set_value(args.lr / ((i / args.bsz) + 1) ** 0.7)

    with model:
        approx = pm.MeanField(local_rv=local_RVs)
        approx.scale_cost_to_minibatch = False
        inference = pm.KLqp(approx)

    inference.fit(args.n_iter,
                  callbacks=[reduce_rate, pm.callbacks.CheckParametersConvergence(diff="absolute")],
                  obj_optimizer=pm.adam(learning_rate=s),
                  more_obj_params=encoder_params,
                  total_grad_norm_constraint=200,
                  more_replacements={ doc_tr: doc_tr_minibatch }, )

    doc_tr.set_value(docs_tr.toarray())
    inp = tt.matrix(dtype="int64")
    sample_vi_theta = theano.function([inp],
        approx.sample_node(approx.model.theta, args.n_sample, more_replacements={doc_tr: inp}), )

    test = docs_te.toarray()
    test_n = test.sum(1)

    beta_pymc3 = pm.sample_approx(approx, draws=args.n_sample)['beta']
    theta_pymc3 = sample_vi_theta(test)

    assert beta_pymc3.shape == (args.n_sample, args.n_topic, args.n_word)
    assert theta_pymc3.shape == (args.n_sample, args.n_te, args.n_topic)

    beta_mean = beta_pymc3.mean(0)
    theta_mean = theta_pymc3.mean(0)

    pred_rate = theta_mean.dot(beta_mean)
    pp_test = (test * np.log(pred_rate)).sum(1) / test_n

    posteriors = { 'theta': theta_pymc3, 'beta': beta_pymc3,}

    log_top_words(beta_pymc3.mean(0), feature_names, n_top_words=args.n_top_word)
    save_elbo(approx.hist)
    save_pp(pp_test)
    save_draws(posteriors)
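
# Hypothetical argument sketch (not part of the original source): run_lda expects
# an `args` namespace carrying the attributes used above; the defaults here are
# illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--n_tr", type=int, default=10000)      # number of training docs
parser.add_argument("--n_te", type=int, default=1000)       # number of test docs
parser.add_argument("--n_word", type=int, default=1000)     # vocabulary size
parser.add_argument("--n_topic", type=int, default=10)      # number of topics
parser.add_argument("--bsz", type=int, default=128)         # minibatch size
parser.add_argument("--lr", type=float, default=0.1)        # initial learning rate
parser.add_argument("--n_iter", type=int, default=10000)    # ADVI iterations
parser.add_argument("--n_sample", type=int, default=100)    # posterior draws
parser.add_argument("--n_top_word", type=int, default=10)   # top words logged per topic
args = parser.parse_args()
run_lda(args)
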
Example no. 5
local_RVs = OrderedDict([(theta, encoder.encode(counts_share))])

encoder_params = encoder.get_params()

with lda_model:
    approx1 = pm.fit(
        6000,
        method='advi',
        local_rv=local_RVs,
        more_obj_params=encoder_params,
        # https://arxiv.org/pdf/1705.08292.pdf
        # SGD (with or without momentum) seems to be a good choice for high-dimensional problems,
        obj_optimizer=pm.sgd,
        # but gradients can explode here, hence the norm constraint below
        total_grad_norm_constraint=1000)
    samples = pm.sample_approx(approx1, draws=100)
    beta_pymc3 = samples['beta'].mean(axis=0)
    theta_pymc3 = samples['theta'].mean(axis=0)

plt.plot(approx1.hist[10:])
plt.show()

## get label for each cell
z_pymc3 = theta_pymc3.argmax(axis=1)

pd.DataFrame({
    "celda": z_celda,
    "pymc3": z_pymc3
}).groupby(['celda', 'pymc3']).size()

## It takes about 3 minutes 21 seconds to run AEVB with ADVI and obtain the posterior distributions
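
## A hypothetical follow-up sketch (not part of the original source): print the
## top-weighted words of each topic from beta_pymc3, assuming `vocab` holds the
## vectorizer's vocabulary in column order (not shown above).
n_top = 10
for k, topic in enumerate(beta_pymc3):
    top_words = [vocab[i] for i in topic.argsort()[::-1][:n_top]]
    print("topic {}: {}".format(k, " ".join(top_words)))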