Example #1
0
    def inference(self, iter_=5000, burn=1000):
        """Build the LDA model over ``self.bw`` and sample it with MCMC.

        Parameters
        ----------
        iter_ : int
            Total number of MCMC iterations.
        burn : int
            Number of initial samples discarded as burn-in.

        Side effect: stores the fitted sampler in ``self.mcmc``.
        Assumes ``self.alpha``, ``self.beta``, ``self.D``, ``self.K`` and
        the bag-of-words ``self.bw`` were set in ``__init__``.
        """
        # Per-document topic proportions theta_d ~ Dirichlet(alpha).
        theta = pm.Container([
            pm.CompletedDirichlet(
                "theta_%s" % d, pm.Dirichlet("ptheta_%s" % d,
                                             theta=self.alpha))
            for d in range(self.D)
        ])
        # Per-topic word distributions phi_k ~ Dirichlet(beta).
        phi = pm.Container([
            pm.CompletedDirichlet("phi_%s" % k,
                                  pm.Dirichlet("pphi_%s" % k, theta=self.beta))
            for k in range(self.K)
        ])
        # Topic assignment for every word slot of every document,
        # randomly initialised.
        z_d = pm.Container([
            pm.Categorical("z_%s" % d,
                           p=theta[d],
                           value=np.random.randint(self.K,
                                                   size=len(self.bw[d])),
                           size=len(self.bw[d])) for d in range(self.D)
        ])
        # Observed words.  BUG FIX: the original used
        # p=phi[z_d[d][w].get_value()], which evaluates the topic assignment
        # once at model-construction time, so p never tracked z during
        # sampling.  A pm.Lambda deterministic (the pattern the sibling
        # examples in this file use) keeps p tied to the current value of z.
        w_z = pm.Container([
            pm.Categorical("w_%s_%s" % (d, w),
                           p=pm.Lambda("phi_z_%s_%s" % (d, w),
                                       lambda z=z_d[d][w], phi=phi: phi[z]),
                           value=self.bw[d][w],
                           observed=True) for d in range(self.D)
            for w in range(len(self.bw[d]))
        ])

        model = pm.Model([theta, phi, z_d, w_z])
        self.mcmc = pm.MCMC(model)
        self.mcmc.sample(iter=iter_, burn=burn)
Example #2
0
    def get_z_data(self, p, p_pos, q):
        """Build a 2-topic LDA model over p documents and q unique words
        and return one WordCountVecRV wrapper per document.

        Args:
            p: Number of documents.
            p_pos: Unused in this method -- TODO confirm caller intent.
            q: Total number of unique words across all documents.

        Returns:
            list: One WordCountVecRV per document, each carrying the
            document index and vocabulary size as metadata.
        """
        K = 2  # Num topics
        M = p  # Num documents
        N = q  # Total num of unique words across all documents

        # NOTE(review): the two hyperparameter comments were swapped in the
        # original.  As used below, alpha parameterizes the per-document
        # topic distributions (theta) and beta the per-topic word
        # distributions (phi).
        alpha = 1.0  # Concentration parameter for distributions over
        # topics (one for each document)
        beta = 1.0  # Concentration parameter for distributions over
        # words (one for each topic)

        # Per-topic word distributions phi_k ~ Dirichlet(beta * ones(N)).
        phi = pymc.Container([
            pymc.CompletedDirichlet(
                name="phi_" + str(k),
                D=pymc.Dirichlet(name="phi_temp_" + str(k),
                                 theta=beta * numpy.ones(N)),
            ) for k in range(K)
        ])

        # Per-document topic mixtures theta_m ~ Dirichlet(alpha * ones(K)).
        theta = pymc.Container([
            pymc.CompletedDirichlet(
                name="theta_" + str(m),
                D=pymc.Dirichlet(name="theta_temp_" + str(m),
                                 theta=alpha * numpy.ones(K)),
            ) for m in range(M)
        ])

        # One topic assignment per word slot of each document.
        z = pymc.Container([
            pymc.Categorical(name="z_" + str(m), p=theta[m], size=N)
            for m in range(M)
        ])

        # Word draws: p is deferred through pymc.Lambda so it tracks the
        # current topic assignment z[m][n] during sampling (the lambda's
        # default arguments bind the pymc parents at creation time).
        w = pymc.Container([
            pymc.Categorical(
                name="w_" + str(m) + "_" + str(n),
                p=pymc.Lambda(
                    "phi_z_" + str(m) + str(n),
                    lambda z_in=z[m][n], phi_in=phi: phi_in[z_in],
                ),
            ) for m in range(M) for n in range(N)
        ])
        lda = pymc.Model([w, z, theta, phi])

        # Wrap each document in a WordCountVecRV for downstream use.
        z_rvs = []
        for m in range(M):
            metadata = {"doc_idx": m, "num_unique_words": N}
            rv = WordCountVecRV(
                model=lda, name="w_0_0",
                metadata=metadata)  # Note: w_0_0 is just a dummy
            # argument that must be present in
            # the pymc.Model
            z_rvs += [rv]
        return z_rvs
Example #3
0
    def __init__(self, corpus, K=10, iterations=1000, burn=100):
        """Assemble an LDA model over `corpus` and immediately sample it.

        K: number of topics; iterations/burn: MCMC schedule.
        Stores the fitted sampler in ``self.mcmc``.
        """
        print("Building model ...")
        self.K = K
        self.V = corpus.wordCount + 1
        self.M = corpus.documentCount
        self.alpha = np.ones(self.K)
        self.beta = np.ones(self.V)
        self.corpus = corpus
        self.observations = np.array(corpus.observations)

        # Topic-word distributions: one CompletedDirichlet per topic.
        phi_nodes = np.empty(self.K, dtype=object)
        for topic in range(self.K):
            phi_nodes[topic] = pm.CompletedDirichlet(
                "Phi[%i]" % topic,
                pm.Dirichlet("phi[%i]" % topic, theta=self.beta))
        self.phi = pm.Container(phi_nodes)

        # Document-topic distributions: one CompletedDirichlet per document.
        theta_nodes = np.empty(self.M, dtype=object)
        for doc in range(self.M):
            theta_nodes[doc] = pm.CompletedDirichlet(
                "Theta[%i]" % doc,
                pm.Dirichlet("theta[%i]" % doc, theta=self.alpha))
        self.theta = pm.Container(theta_nodes)

        # Per-word topic assignments, one Categorical vector per document,
        # randomly initialised.
        z_nodes = np.empty(self.observations.shape, dtype=object)
        for doc in range(self.M):
            n_words = len(self.observations[doc])
            z_nodes[doc] = pm.Categorical("z[%i]" % doc,
                                          size=n_words,
                                          p=self.theta[doc],
                                          value=np.random.randint(
                                              self.K, size=n_words))
        self.z = pm.Container(z_nodes)

        # Observed words: each word is Categorical over phi[z] for its
        # sampled topic; lambda defaults bind the pymc parents per (doc, pos).
        w_nodes = []
        for doc in range(self.M):
            row = []
            for pos in range(len(self.observations[doc])):
                row.append(
                    pm.Categorical(
                        "w[%i][%i]" % (doc, pos),
                        p=pm.Lambda(
                            "phi[z[%i][%i]]" % (doc, pos),
                            lambda z=self.z[doc][pos], phi=self.phi: phi[z]),
                        value=self.observations[doc][pos],
                        observed=True))
            w_nodes.append(row)
        self.w = pm.Container(w_nodes)

        self.mcmc = pm.MCMC(pm.Model([self.theta, self.phi, self.z, self.w]))

        print("Fitting model ...")
        self.mcmc.sample(iter=iterations, burn=burn)
    def initialize_variables(self):
        """Initializes MCMC variables.

        Builds the population simplex (Dirichlet -> CompletedDirichlet ->
        flat Index), the deterministic prediction ``mu`` and the chi-square
        potential ``logp``.  Assumes ``self.prior_pops``,
        ``self.predictions``, ``self.measurements`` and
        ``self.uncertainties`` were set elsewhere -- TODO confirm.
        """
        self.dirichlet = pymc.Dirichlet(
            "dirichlet", self.prior_pops
        )  # This has size (n-1), so it is missing the final component.
        self.matrix_populations = pymc.CompletedDirichlet(
            "matrix_populations", self.dirichlet
        )  # This RV fills in the missing value of the population vector, but has shape (1, n) rather than (n)
        self.populations = pymc.CommonDeterministics.Index(
            "populations", self.matrix_populations,
            0)  # Finally, we get a flat array of the populations.

        # Don't record the raw (incomplete) Dirichlet in the trace; the
        # completed vector carries the same information.
        self.dirichlet.keep_trace = False

        @pymc.dtrm
        def mu(populations=self.populations):
            # Predicted observables: population-weighted average of the
            # per-state predictions.
            return populations.dot(self.predictions)

        self.mu = mu

        @pymc.potential
        def logp(populations=self.populations, mu=self.mu):
            # Gaussian log-likelihood up to a constant: -chi^2 / 2.
            return -0.5 * get_chi2(populations,
                                   self.predictions,
                                   self.measurements,
                                   self.uncertainties,
                                   mu=mu)

        self.logp = logp
Example #5
0
def create_mk_model(tree, chars, Qtype, pi):
    """
    Create model objects to be passed to pymc.MCMC

    Creates Qparams and likelihood function

    Args:
        tree: Root node of the tree.
        chars: Dict mapping tip labels to character states, or a list of
          tip states in preorder sequence.
        Qtype (str): One of "ER", "Sym", or "ARD".
        pi: Root-state weighting.  NOTE(review): the likelihood below is
          built with pi="Equal" regardless -- confirm intent.

    Returns:
        dict: locals() containing the stochastics and the likelihood
        potential, suitable for pymc.MCMC.
    """
    if type(chars) == dict:
        chars = [chars[l] for l in [n.label for n in tree.leaves()]]
    nchar = len(set(chars))
    if Qtype=="ER":
        N = 1
    elif Qtype=="Sym":
        N = int(binom(nchar, 2))
    elif Qtype=="ARD":
        N = int((nchar ** 2 - nchar))
    else:
        # BUG FIX: the exception was created but never raised, which let
        # execution fall through to a NameError on N below.
        raise ValueError("Qtype must be one of: ER, Sym, ARD")

    # Setting a Dirichlet prior with Jeffrey's hyperprior of 1/2
    if N != 1:
        theta = [1.0/2.0]*N
        # BUG FIX: a pymc.Dirichlet over N categories takes an initial value
        # of length N-1; the hard-coded [0.5] only worked for N == 2.  The
        # uniform start below is identical to the original when N == 2.
        Qparams_init = pymc.Dirichlet("Qparams_init", theta,
                                      value=[1.0/N]*(N-1))
        Qparams_init_full = pymc.CompletedDirichlet("Qparams_init_full", Qparams_init)
    else:
        # A one-category Dirichlet is degenerate; fix the value by hand.
        Qparams_init_full = [[1.0]]

    # Exponential scaling factor for Qparams
    scaling_factor = pymc.Exponential(name="scaling_factor", beta=1.0, value=1.0)

    # Scaled Qparams; we would not expect them to necessarily add
    # to 1 as would be the case in a Dirichlet distribution
    @pymc.deterministic(plot=False)
    def Qparams(q=Qparams_init_full, s=scaling_factor):
        Qs = np.empty(N)
        for i in range(N):
            Qs[i] = q[0][i]*s
        return Qs

    l = mk.create_likelihood_function_mk(tree=tree, chars=chars, Qtype=Qtype,
                                  pi="Equal", findmin=False)
    @pymc.potential
    def mklik(q = Qparams, name="mklik"):
        return l(q)
    return locals()
Example #6
0
# SUPPORT: [0,1]
# DISTRIBUTION: None
# NOTE(review): B, num_people and alpha_vector are defined earlier in the
# file (outside this excerpt) -- confirm before reuse.
B_matrix = B

#---------------------------- Prior Parameters ---------------------------#
# Actual group membership probabilities for each person
# DIMENSIONS: 1 x (num_people * num_groups)
# SUPPORT: (0,1], Elements of each vector should sum to 1 for each person
# DISTRIBUTION: Dirichlet(alpha)
pi_list = np.empty(num_people, dtype=object)
for person in range(num_people):
    person_pi = pymc.Dirichlet('pi_%i' % person, theta=alpha_vector)
    pi_list[person] = person_pi

# Completed Dirichlets append the implicit final simplex component that
# pymc.Dirichlet leaves out.
completed_pi_list = [
    pymc.CompletedDirichlet('completed_pi_%d' % i, dist)
    for i, dist in enumerate(pi_list)
]

# Indicator variables of whether the pth person is in a group or not
# DIMENSIONS: 1 x (num_people^2) for each list, where each element is Kx1
# DOMAIN : {0,1}, only one element of vector is 1, all else 0
# DISTRIBUTION: Categorical (using Multinomial with 1 observation)
z_pTq_matrix = np.empty([num_people, num_people], dtype=object)
z_pFq_matrix = np.empty([num_people, num_people], dtype=object)
for p_person in range(num_people):
    for q_person in range(num_people):
        z_pTq_matrix[p_person, q_person] = pymc.Multinomial(
            'z_%dT%d_vector' % (p_person, q_person),
            n=1,
            p=pi_list[p_person],
Example #7
0
    def _create_parameter_model(self, database, initial_parameters):
        """
        Creates set of stochastics representing the set of all parameters for all models

        Arguments
        ---------
        database : dict
            FreeSolv database
        initial_parameters : dict
            The set of initial values of the parameters

        Returns
        -------
        parameters : dict
            PyMC dictionary containing the parameters to sample.
        """
        parameters = dict()  # just the parameters
        # Discrete prior over which GB model is used, via a completed
        # Dirichlet simplex.
        parameters['gbmodel_dir'] = pymc.Dirichlet('gbmodel_dir',
                                                   np.ones([self.ngbmodels]))
        parameters['gbmodel_prior'] = pymc.CompletedDirichlet(
            'gbmodel_prior', parameters['gbmodel_dir'])
        if self.ngbmodels == 5:
            # Start sampling from the last model when all five are available.
            parameters['gbmodel'] = pymc.Categorical(
                'gbmodel', value=4, p=parameters['gbmodel_prior'])
        else:
            parameters['gbmodel'] = pymc.Categorical(
                'gbmodel', p=parameters['gbmodel_prior'])
        uninformative_tau = 0.0001
        for (key, value) in initial_parameters.iteritems():
            (atomtype, parameter_name) = key.split('_')
            if parameter_name == 'scalingFactor':
                stochastic = pymc.Uniform(key,
                                          value=value,
                                          lower=-0.8,
                                          upper=+1.5)
            elif parameter_name == 'radius':
                stochastic = pymc.Uniform(key,
                                          value=value,
                                          lower=0.5,
                                          upper=2.5)
            elif parameter_name in ('alpha', 'beta', 'gamma'):
                # The three GB parameters share the same broad normal prior
                # centered on the initial value; the original had three
                # byte-identical branches, consolidated here.
                stochastic = pymc.Normal(key,
                                         value=value,
                                         mu=value,
                                         tau=uninformative_tau)
            else:
                raise Exception("Unrecognized parameter name: %s" %
                                parameter_name)
            parameters[key] = stochastic
            self.stochastics_joint_proposal.append(stochastic)
        return parameters
#
import spacepy.plot as spp  # for the styles
import numpy as np
import pymc as pm

K = 2  # number of topics
V = 4  # number of words
D = 3  # number of documents

# Toy corpus: three documents of four word tokens each.
data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])

# Symmetric Dirichlet hyperparameters for topics (alpha) and words (beta).
alpha = np.ones(K)
beta = np.ones(V)

# Per-document topic mixtures theta_d ~ Dirichlet(alpha).
theta = pm.Container([
    pm.CompletedDirichlet("theta_%s" % i,
                          pm.Dirichlet("ptheta_%s" % i, theta=alpha))
    for i in range(D)
])
# Per-topic word distributions phi_k ~ Dirichlet(beta).
phi = pm.Container([
    pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k,
                                                     theta=beta))
    for k in range(K)
])
# Number of word slots per document.
Wd = [len(doc) for doc in data]

# Random initial topic assignment for every word slot of every document.
z = pm.Container([
    pm.Categorical('z_%i' % d,
                   p=theta[d],
                   size=Wd[d],
                   value=np.random.randint(K, size=Wd[d])) for d in range(D)
])
Example #9
0
#for igene in xrange(ds.shape[1]):
for igene in xrange(500):
    if igene % 25 == 0:
        print(igene)

    dir_1 = pymc.Dirichlet("dir_1",
                           theta=numpy.repeat(1.0, d.shape[2]),
                           trace=True)
    dir_2 = pymc.Dirichlet("dir_2",
                           theta=numpy.repeat(1.0, d.shape[2]),
                           trace=True)

    ddiff = pymc.Lambda(
        "ddiff",
        lambda dir_1=pymc.CompletedDirichlet(
            "cdir_1", dir_1, trace=False), dir_2=pymc.CompletedDirichlet(
                "cdir_2", dir_2, trace=False): dir_2[0] - dir_1[0],
        trace=True)

    vals_1 = ds[anno[ctrl], igene]
    vals_2 = ds[anno[cond], igene]

    mn_1 = pymc.Multinomial("mn_1",
                            value=vals_1,
                            n=vals_1.sum(axis=1),
                            p=dir_1,
                            observed=True,
                            trace=False)
    mn_2 = pymc.Multinomial("mn_2",
                            value=vals_2,
                            n=vals_2.sum(axis=1),
Example #10
0
def hrm_bayesian(tree, chars, Qtype, nregime, pi="Fitzjohn", constraint="Rate"):
    """
    Create a hidden rates model for pymc to be sampled from.

    Args:
        tree (Node): Root node of a tree. All branch lengths must be
          greater than 0 (except root)
        chars (dict): Dict mapping character states to tip labels.
          Character states should be coded 0,1,2...

          Can also be a list with tip states in preorder sequence
        pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight
          values at root node. Defaults to "Equal"
          Method "Fitzjohn" is not thoroughly tested, use with caution
        Qtype: Either a string specifying how to estimate values for Q or a
          numpy array of a pre-specified Q matrix.
            "Simple": Symmetric rates within observed states and between
              rates.
            "STD": State Transitions Different. Transitions between states
              within the same rate class are asymmetrical
        nregime (int): Number of hidden states. nstates = 0 is
          equivalent to a vanilla Mk model
        constraint (str): Constraints to apply to the parameters of the Q matrix.
          Can be one of the following:
            "Rate": The fastest rate in the fastest regime must be faster than
              the fastest rate in the slowest regime
            "Symmetry": For two-regime models only. The two regimes must
              have different symmetry (a>b in regime 1, b>a in regime 2)
            "None": No constraints

    Returns:
        dict: locals() containing the rate stochastics and the likelihood
        potential, for consumption by pymc.MCMC.
    """
    if type(chars) == dict:
        chars = [chars[l] for l in [n.label for n in tree.leaves()]]
    nobschar = len(set(chars))
    nchar = nobschar * nregime
    assert Qtype in ["Simple", "STD", "RTD", "ARD"], "Q type must be one of: simple, STD, RTD, ARD"
    ###########################################################################
    # Qparams:
    ###########################################################################
    # The simple model has # of parameters equal to nregime + 1 (One set of
    # rates for each regime, plus transition rate between regimes)
    # For now, we will have each be exponentially distributed

    # Simplest model: all transitions between states within a regime are equal.
    # Each regime has a rate associated with it.
    # There is one rate between regimes.
    # Number of parameters = nregime+1
    if Qtype == "Simple":
        # Within-regime transitions
        WR_Qparams = np.ndarray(nregime, dtype="object")
        for i in range(nregime):
            WR_Qparams[i] = pymc.Exponential(name="wr-par"+str(i), beta = 1.0, value = 1e-2+(i/100.0))
        # Between-regime transitions:
        BR_Qparams = pymc.Exponential(name="br-par", beta = 1.0, value = 1e-2)
    # State-transitions different. Transitions between states within a
    # rate regime can differ (ARD). Transitions between rates share one
    # rate parameter.
    # Number of parameters = nregime*(nobschar**2-nobschar) + 1
    if Qtype == "STD":
        theta = [1.0/2.0] * nobschar
        i_d = np.ndarray(nregime, dtype="object")
        c_d = np.ndarray(nregime, dtype="object")
        scale = np.ndarray(nregime, dtype="object")
        # Within-regime transitions
        WR_Qparams = np.ndarray(nregime, dtype="object")
        for i in range(nregime):
            # First, create a dirichlet distribution
            i_d[i] = pymc.Dirichlet("parInit_"+str(i), theta, value = [1.0/nobschar]*(nobschar-1))
            c_d[i] = pymc.CompletedDirichlet("parInit"+str(i), i_d[i])
            scale[i] = pymc.Exponential(name="scaling"+str(i), beta=1.0, value=1e-2+(i/100.0))
            # Then, scale dirichlet distribution by overall rate parameter
            # for that regime (lambda-style defaults bind this iteration's
            # nodes).
            @pymc.deterministic(plot=False,name="wr-par"+str(i))
            def d_scaled(d = c_d[i], s = scale[i]):
                return (d*s)[0]
            WR_Qparams[i] = d_scaled
        # Between-regime transitions
        BR_Qparams = pymc.Exponential(name="br-par", beta = 1.0, value = 1e-2)
    if Qtype == "RTD":
        WR_Qparams = np.ndarray(nregime, dtype="object")
        for i in range(nregime):
            WR_Qparams[i] = pymc.Exponential(name="wr-par"+str(i), beta = 1.0, value = 1e-2+(i/100.0))

        BR_Qparams = np.ndarray(nregime-1, dtype="object")
        for i in range(nregime-1):
            BR_Qparams[i] = pymc.Exponential(name="br-par"+str(i), beta=1.0, value=1e-2)
    if Qtype == "ARD":
        theta = [1.0/2.0] * nobschar
        i_d = np.ndarray(nregime, dtype="object")
        c_d = np.ndarray(nregime, dtype="object")
        scale = np.ndarray(nregime, dtype="object")
        # Within-regime transitions
        WR_Qparams = np.ndarray(nregime, dtype="object")

        for i in range(nregime):
            # First, create a dirichlet distribution
            i_d[i] = pymc.Dirichlet("parInit_"+str(i), theta, value = [1.0/nobschar]*(nobschar-1))
            c_d[i] = pymc.CompletedDirichlet("parInit"+str(i), i_d[i])

            scale[i] = pymc.Exponential(name="scaling"+str(i), beta=1.0, value=1e-2+(i/100.0))
            # Then, scale dirichlet distribution by overall rate parameter for that regime
            @pymc.deterministic(plot=False,name="wr-par"+str(i))
            def d_scaled(d = c_d[i], s = scale[i]):
                return (d*s)[0]
            WR_Qparams[i] = d_scaled
        BR_Qparams = np.ndarray((nregime-1)*2*nobschar, dtype="object")
        br_i = 0
        for i in list(range(nregime-1))*2:
            for n in range(nobschar):
                BR_Qparams[br_i] = pymc.Exponential(name="br-par"+str(br_i), beta=1.0, value=1e-2)
                br_i += 1
    ###########################################################################
    # Likelihood
    ###########################################################################
    l = hrm.create_likelihood_function_hrm_mk(tree=tree, chars=chars,
        nregime=nregime, Qtype="ARD", pi=pi, findmin=False)
    @pymc.potential
    def mklik(wr = WR_Qparams, br=BR_Qparams, name="mklik"):
        if Qtype == "Simple":
            # Getting the locations of each Q parameter to feed
            # to the likelihood function

            # Note that the likelihood function takes q parameters
            # in columnwise-order, not counting zero and negative values.
            # Within-regime shifts
            qinds = {}
            for i,q in enumerate(wr):
                qinds[i]=valid_indices(nobschar, nregime, i,i)
            rshift_pairs = list(zip(list(range(nregime))[1:], list(range(nregime))[:-1]))
            qinds[i+1] = [] # Between-regime shifts(all share 1 rate)
            for p in rshift_pairs:
                qinds[i+1].extend(valid_indices(nobschar, nregime, p[0],p[1]))
                qinds[i+1].extend(valid_indices(nobschar, nregime, p[1],p[0]))
            # These are the indices of the values we will give to
            # the likelihood function, in order
            param_indices = sorted([ i for v in list(qinds.values()) for i in v])
            qparam_list = list(wr)+[br] # Making a single list to get parameters from
            Qparams = [] # Empty list for values to feed to lik function
            # NOTE(review): the loop variable `pi` shadows the outer root
            # weighting argument, harmlessly (it is unused in this scope).
            for pi in param_indices:
                qi = [ k for k,v in qinds.items() if pi in v ][0]
                Qparams.append(qparam_list[qi]) # Pulling out the correct param
            # Qparams now contains the parameters needed in the
            # correct order for the likelihood function.
            if constraint == "Rate":
                if ((sorted(list(wr)) == list(wr)) and (br < wr[nregime-1])):
                    return l(np.array(Qparams))
                else:
                    return -np.inf
            else:
                return l(np.array(Qparams))
        if Qtype == "STD":
            qinds = {}
            n=0
            for i,q in enumerate(wr):
                for k in range(nregime):
                    qinds[n] = [valid_indices(nobschar, nregime, i, i)[k]]
                    n+=1
            rshift_pairs = list(zip(list(range(nregime))[1:], list(range(nregime))[:-1]))
            qinds[n] = [] # Between-regime shifts(all share 1 rate)
            for p in rshift_pairs:
                qinds[n].extend(valid_indices(nobschar, nregime, p[0],p[1]))
                qinds[n].extend(valid_indices(nobschar, nregime, p[1],p[0]))
            param_indices = sorted([ i for v in list(qinds.values()) for i in v])
            qparam_list = [i for s in [q for q in wr] for i in s]+[br] # Making a single list to get parameters from
            Qparams = [] # Empty list for values to feed to lik function
            for pi in param_indices:
                qi = [ k for k,v in qinds.items() if pi in v ][0]
                Qparams.append(qparam_list[qi]) # Pulling out the correct param
            # Potential constraints are "Rate" and "Symmetry"
            if constraint == "Rate":
                for i in range(nregime):
                    n = [q[i] for q in wr]
                    if not sorted(n) == n:
                        return -np.inf
            if constraint == "Symmetry":
                assert nchar == 4
                for i in range(nregime):
                    # BUG FIX: `if not A and B:` parses as `(not A) and B`,
                    # so the original never rejected regimes that share the
                    # same symmetry.  The intended constraint (per the
                    # docstring: a<=b in regime 0, a>=b in regime 1) needs
                    # the negation over the whole conjunction.
                    if not ((wr[0][0]/wr[0][1] <= 1) and (wr[1][0]/wr[1][1] >= 1)):
                        return -np.inf
            if br > max(wr[nregime-1]):
                return -np.inf

            return l(np.array(Qparams))
        if Qtype == "RTD":
            # RTD likelihood not implemented: unconditionally rejected.
            # The code below is intentionally unreachable scaffolding.
            raise AssertionError
            qinds = {}
            for i,q in enumerate(wr):
                qinds[i]=valid_indices(nobschar, nregime, i,i)
        if Qtype == "ARD":
            qinds = {}
            n=0
            for i,q in enumerate(wr):
                for k in range(nregime):
                    qinds[n] = [valid_indices(nobschar, nregime, i, i)[k]]
                    n+=1
            rshift_pairs = list(zip(list(range(nregime))[1:], list(range(nregime))[:-1]))
            for p in rshift_pairs:
                for i in  valid_indices(nobschar, nregime, p[0],p[1]):
                    qinds[n] = [i]
                    n+=1
                for i in  valid_indices(nobschar, nregime, p[1],p[0]):
                    qinds[n] = [i]
                    n+=1
            param_indices = sorted([ i for v in list(qinds.values()) for i in v])
            qparam_list = [i for s in [q for q in wr] for i in s]+[b for b in br] # Making a single list to get parameters from
            Qparams = [] # Empty list for values to feed to lik function
            for pi in param_indices:
                qi = [ k for k,v in qinds.items() if pi in v ][0]
                Qparams.append(qparam_list[qi]) # Pulling out the correct param
            for i in range(nregime):
                n = [q[i] for q in wr]
                if not sorted(n) == n:
                    return -np.inf
            if max(br) > max(wr[nregime-1]):
                return -np.inf
            return l(np.array(Qparams))
    return locals()
Example #11
0
def create_multi_mk_model(tree, chars, Qtype, pi, nregime=2):
    """
    Create an mk model with multiple regimes to be sampled from with MCMC.

    Regime number is fixed and the location of the regime shift is allowed
    to change

    Args:
        tree (Node): Root node of the tree.
        chars: Dict mapping tip labels to character states, or a list of
          tip states in preorder sequence.
        Qtype (str): One of "ER", "Sym", or "ARD".
        pi: Root-state weighting.  NOTE(review): the likelihood below is
          built with pi="Equal" regardless -- confirm intent.
        nregime (int): Number of rate regimes.  NOTE(review): the single
          switchpoint and the pre-allocated 2-slot locsarray structurally
          assume 2 regimes -- confirm before using nregime != 2.

    Returns:
        dict: locals() with the switchpoint, Q-parameter stochastics and
        the likelihood potential, for pymc.MCMC.
    """
    if type(chars) == dict:
        chars = [chars[l] for l in [n.label for n in tree.leaves()]]
    # Preparations
    nchar = len(set(chars))
    if Qtype=="ER":
        N = 1
    elif Qtype=="Sym":
        N = int(binom(nchar, 2))
    elif Qtype=="ARD":
        N = int((nchar ** 2 - nchar))
    else:
        # BUG FIX: the exception was created but never raised, which let
        # execution fall through to a NameError on N below.
        raise ValueError("Qtype must be one of: ER, Sym, ARD")
    # This model has 2 components: Q parameters and a switchpoint
    # They are combined in a custom likelihood function

    ###########################################################################
    # Switchpoint:
    ###########################################################################
    # Modeling the movement of the regime shift(s) is the tricky part
    # Regime shifts will only be allowed to happen at a node
    # Regime shift: Uniform categorical distribution
    valid_switches = [i.ni for i in tree if not (i.isleaf or i.isroot)]
    # Uniform
    switch_ind = pymc.DiscreteUniform("switch_ind",lower=0, upper=len(valid_switches)-1)
    @pymc.deterministic(dtype=int)
    def switch(name="switch",switch_ind=switch_ind):
        return valid_switches[switch_ind]
    ###########################################################################
    # Qparams:
    ###########################################################################
    # Unscaled Q param: Dirichlet distribution
    # Setting a Dirichlet prior with Jeffrey's hyperprior of 1/2
    theta = [1.0/2.0]*N

    # One set of Q-parameters per regime
    allQparams_init = np.empty(nregime, dtype=object)
    allQparams_init_full = np.empty(nregime, dtype=object)
    allScaling_factors = np.empty(nregime, dtype=object)
    for i in range(nregime):
        if N != 1:
            allQparams_init[i] = pymc.Dirichlet("allQparams_init"+str(i), theta)
            allQparams_init_full[i] = pymc.CompletedDirichlet("allQparams_init_full"+str(i), allQparams_init[i])
        else: # Dirichlet function does not like creating a distribution
              # with only 1 state. Set it to 1 by hand
            allQparams_init_full[i] = [[1.0]]
        # Exponential scaling factor for Qparams
        allScaling_factors[i] = pymc.Exponential(name="allScaling_factors"+str(i), beta=1.0)
        # Scaled Qparams; we would not expect them to necessarily add
        # to 1 as would be the case in a Dirichlet distribution

    # Regimes are grouped by rows. Each row is a regime.
    @pymc.deterministic(plot=False)
    def Qparams(q=allQparams_init_full, s=allScaling_factors):
        Qs = np.empty([nregime,N])
        for n in range(N):
            for i in range(nregime):
                Qs[i][n] = q[i][0][n]*s[i]
        return Qs
    ###########################################################################
    # Likelihood
    ###########################################################################
    # The likelihood function

    # Pre-allocating arrays
    qarray = np.zeros([nregime,N])
    locsarray = np.empty([2], dtype=object)
    # NOTE(review): nregime is hard-coded to 2 in this call; left as-is
    # because the single switchpoint above only ever produces 2 regimes.
    l = mk_mr.create_likelihood_function_multimk(tree=tree, chars=chars,
        Qtype=Qtype,
        pi="Equal", findmin=False, nregime=2)

    @pymc.potential
    def multi_mklik(q = Qparams, switch=switch, name="multi_mklik"):
        # Partition the tree at the current switchpoint, then evaluate the
        # multi-regime likelihood with the current (scaled) Q values.
        locs = mk_mr.locs_from_switchpoint(tree,tree[int(switch)],locsarray)

        np.copyto(qarray, q)
        return l(qarray, locs=locs)
    return locals()
# MCMC schedule: retained samples, burn-in, thinning interval.
NSAMPLES = 2500
NBURN = 20000
NTHIN = 10
# Per-gene trace buffers, two conditions each; "dr" holds the completed
# 4-component Dirichlet proportions.  Assumes d has its genes on axis 1
# -- TODO confirm against the loader.
traces = dict()
traces["y"] = numpy.empty((d.shape[1], 2, NSAMPLES))
traces["dr"] = numpy.empty((d.shape[1], 2, NSAMPLES, 4))
#ghat = g.sum(axis=2)

for i in xrange(d.shape[1]):
    print(i)
    dr = [
        pymc.Dirichlet("dir_%d" % j, theta=numpy.repeat(1.0, 4), trace=True)
        for j in xrange(2)
    ]
    cdr = [pymc.CompletedDirichlet("cdir_%d" % j, dr[j]) for j in xrange(2)]
    y = [pymc.Exponential("g_%d" % j, 0.0001, trace=True) for j in xrange(2)]
    #y = ghat[:, i]
    mu = [
        pymc.Lambda("mu_%d" % j,
                    lambda y=y[j], d=cdr[j], L=libsizes_hat[libidxs == j]:
                    ((y * d.T) * L).T,
                    trace=False) for j in xrange(2)
    ]

    alpha = [(mu[j] / (mu[j] * t1 + t0)) for j in xrange(2)]
    dnb = [
        pymc.NegativeBinomial("d_%d" % j,
                              mu[j],
                              alpha[j],
                              observed=True,
Example #13
0
# DIMENSIONS: num_groups x num_groups
# SUPPORT: [0,1]
# DISTRIBUTION: None
# NOTE(review): B, num_people and alpha_vector are defined earlier in the
# file (outside this excerpt) -- confirm before reuse.
B_matrix = B

#---------------------------- Prior Parameters ---------------------------#
# Actual group membership probabilities for each person
# DIMENSIONS: 1 x (num_people * num_groups)
# SUPPORT: (0,1], Elements of each vector should sum to 1 for each person
# DISTRIBUTION: Dirichlet(alpha)
pi_list = np.empty(num_people, dtype=object)
for person in range(num_people):
    person_pi = pymc.Dirichlet('pi_%i' % person, theta=alpha_vector)
    pi_list[person] = person_pi

# Completed Dirichlets append the implicit final simplex component.
completed_pi_list = [pymc.CompletedDirichlet('completed_pi_%d' % i, dist) for i, dist in enumerate(pi_list)]

# Indicator variables of whether the pth person is in a group or not
# DIMENSIONS: 1 x (num_people^2) for each list, where each element is Kx1
# DOMAIN : {0,1}, only one element of vector is 1, all else 0
# DISTRIBUTION: Categorical (using Multinomial with 1 observation)
# One single-draw Multinomial (i.e. a one-hot indicator) per ordered pair:
# the "T" matrix draws from the sender's pi, the "F" matrix from the
# receiver's; trace=False keeps them out of the trace database.
z_pTq_matrix = np.empty([num_people,num_people], dtype=object)
z_pFq_matrix = np.empty([num_people,num_people], dtype=object)
for p_person in range(num_people):
    for q_person in range(num_people):
        z_pTq_matrix[p_person,q_person] = pymc.Multinomial('z_%dT%d_vector' % (p_person,q_person), n=1, p=pi_list[p_person], trace=False)
        z_pFq_matrix[p_person,q_person] = pymc.Multinomial('z_%dF%d_vector' % (p_person,q_person), n=1, p=pi_list[q_person], trace=False)
#---------------------------- Data Level ---------------------------------#
# Combination of Priors to build the scalar parameter for y~Bernoulli
@pymc.deterministic
Example #14
0
# A simple demo of Beta-Binomial conjugacy: uniform Beta(1,1) prior on the
# success probability of a Binomial with 19 trials and 5 observed successes.
p = pm.Beta("p",alpha=1,beta=1)
n = pm.Binomial("Bino",n=19,p=p,value=5,observed=True)
mcmc = pm.MCMC([n,p])
mcmc.sample(25000)

# NOTE(review): IPython magic -- this is notebook-style code and will not
# run as a plain Python script.
%matplotlib inline
from pymc.Matplot import plot as mcplot
mcplot(mcmc.trace("p"),common_scale=False)

# A simple demo of Dirichlet-Multinomial conjugacy.
N = 5 # dimension
beta = np.ones(N)
mu=pm.Dirichlet("mu", theta=beta)
cmu = pm.CompletedDirichlet("cmu", D=mu)

# NOTE(review): rebinding `n` shadows the Binomial above; D, n_class and
# data are presumably defined elsewhere in the notebook -- confirm.
n = pm.Multinomial('n', n=D, p=cmu, value=n_class, observed=True)

alpha = np.ones(N)

# One Dirichlet (and its completed version) per class, with an observed
# Multinomial per class.
theta = pm.Container([pm.Dirichlet("theta_%s" % i,theta=alpha) \
                      for i in range(N)])
ctheta = pm.Container([pm.CompletedDirichlet("ctheta_%s" % i, D=theta[i]) for i in range(N)])
c = pm.Container([pm.Multinomial("c_%s" % i, n=n_class[i], p=theta[i]\
                                ,value = data[i], observed=True)\
                 for i in range(N)])

@pm.deterministic
def precision(mu=cmu, theta=ctheta):
    # Mixes the completed Dirichlet values; indexing pattern kept as-is.
    return np.sum([mu[0][i]*theta[i][0][i] for i in range(N)])
Example #15
0
    return np.array(doc_nr)


# Map each document's words to dictionary indices and echo the result.
# numpy_array, word_in_dict and list_lists come from earlier in the file.
nr_assoc_word = numpy_array(word_in_dict, list_lists)
print(nr_assoc_word)

# Corpus dimensions.
nr_doc = len(nr_assoc_word)
nr_words_doc = [len(doc) for doc in nr_assoc_word]  # words per document (not used in the visible code)
nr_words = len(word_in_dict)
nr_topics = 3

# Symmetric Dirichlet hyperparameters.
alpha = np.ones(nr_topics)
beta = np.ones(nr_words)

# Per-document topic mixtures theta_d ~ Dirichlet(alpha).
theta = pm.Container([
    pm.CompletedDirichlet("theta_%s" % i,
                          pm.Dirichlet("theta1_%s" % i, theta=alpha))
    for i in range(nr_doc)
])

for d in range(nr_doc):
    print(theta[d].value)

# Per-topic word distributions phi_k ~ Dirichlet(beta).
phi = pm.Container([
    pm.CompletedDirichlet("phi_%s" % j, pm.Dirichlet("phi1_%s" % j,
                                                     theta=beta))
    for j in range(nr_topics)
])

for i in range(nr_topics):
    print(phi[i].value)