Python CategoricalAndConcentration.copy_sampleの例

プログラミング言語: Python

名前空間/パッケージ名: distributions

メソッド/関数: copy_sample

hotexamples.comのコード掲載数: 4

Python CategoricalAndConcentration.copy_sample - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのdistributions.CategoricalAndConcentration.copy_sampleの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

copy_sample(2)

get_vlb(2)

log_likelihood(2)

max_likelihood(2)

meanfieldupdate(2)

resample(2)

CategoricalAndConcentration(1)

meanfield_sgdstep(1)

num_parameters(1)

コード例 #1

ファイルを表示

ファイル: models.py プロジェクト: daStrauss/pybasicbayes

class Mixture(ModelGibbsSampling, ModelMeanField, ModelEM):
    '''
    This class is for mixtures of other distributions.
    '''
    def __init__(self,components,alpha_0=None,a_0=None,b_0=None,weights=None):
        assert len(components) > 0
        assert (alpha_0 is not None) ^ (a_0 is not None and b_0 is not None)

        self.components = components

        if alpha_0 is not None:
            self.weights = Categorical(alpha_0=alpha_0,K=len(components),weights=weights)
        else:
            self.weights = CategoricalAndConcentration(
                    a_0=a_0,b_0=b_0,K=len(components),weights=weights)

        self.labels_list = []

    def add_data(self,data,**kwargs):
        self.labels_list.append(Labels(data=np.asarray(data),
            components=self.components,weights=self.weights,
            **kwargs))

    def generate(self,N,keep=True):
        templabels = Labels(components=self.components,weights=self.weights,N=N)

        out = np.empty(self.components[0].rvs(N).shape)
        counts = np.bincount(templabels.z,minlength=len(self.components))
        for idx,(c,count) in enumerate(zip(self.components,counts)):
            out[templabels.z == idx,...] = c.rvs(count)

        perm = np.random.permutation(N)
        out = out[perm]
        templabels.z = templabels.z[perm]

        if keep:
            templabels.data = out
            self.labels_list.append(templabels)

        return out, templabels.z

    def _log_likelihoods(self,x):
        x = np.asarray(x)
        K = len(self.components)
        vals = np.empty((x.shape[0],K))
        for idx, c in enumerate(self.components):
            vals[:,idx] = c.log_likelihood(x)
        vals += self.weights.log_likelihood(np.arange(K))
        assert not np.isnan(vals).any()
        return np.logaddexp.reduce(vals,axis=1)

    def log_likelihood(self,x):
        return self._log_likelihoods(x).sum()

    ### Gibbs sampling

    def resample_model(self,temp=None):
        assert all(isinstance(c,GibbsSampling) for c in self.components), \
                'Components must implement GibbsSampling'
        for idx, c in enumerate(self.components):
            c.resample(data=[l.data[l.z == idx] for l in self.labels_list])

        self.weights.resample([l.z for l in self.labels_list])

        for l in self.labels_list:
            l.resample(temp=temp)

    def copy_sample(self):
        new = copy.copy(self)
        new.components = [c.copy_sample() for c in self.components]
        new.weights = self.weights.copy_sample()
        new.labels_list = [l.copy_sample() for l in self.labels_list]
        return new

    ### Mean Field

    def meanfield_coordinate_descent_step(self):
        assert all(isinstance(c,MeanField) for c in self.components), \
                'Components must implement MeanField'
        assert len(self.labels_list) > 0, 'Must have data to run MeanField'

        # NOTE: to interleave mean field steps with Gibbs sampling steps, label
        # updates needs to come first, otherwise the sampled updates will be
        # ignored and the model will essentially stay where it was the last time
        # mean field updates were run

        ### update sweep!
        # update the label responsibilities
        for l in self.labels_list:
            l.meanfieldupdate()

        # pass the weights to pi
        K = len(self.components)
        self.weights.meanfieldupdate(np.arange(K),[l.r for l in self.labels_list])

        # pass the weights to the components
        for idx, c in enumerate(self.components):
            c.meanfieldupdate([l.data for l in self.labels_list],
                    [l.r[:,idx] for l in self.labels_list])

        ### get vlb!
        vlb = 0.

        # get labels terms
        vlb += sum(l.get_vlb() for l in self.labels_list)

        # get pi term
        vlb += self.weights.get_vlb()

        # get components terms
        vlb += sum(c.get_vlb() for c in self.components)

        # finally, need the evidence term in the vlb
        for l in self.labels_list:
            vlb += np.sum([r.dot(c.expected_log_likelihood(l.data))
                                for c,r in zip(self.components, l.r.T)])

        # add in symmetry factor (if we're actually symmetric)
        if len(set(self.weights.weights)) == 1 and \
                len(set(type(c) for c in self.components)) == 1:
            vlb += special.gammaln(len(self.components)+1)

        return vlb

    ### EM

    def EM_step(self):
        assert all(isinstance(c,MaxLikelihood) for c in self.components), \
                'Components must implement MaxLikelihood'
        assert len(self.labels_list) > 0, 'Must have data to run EM'

        ## E step
        for l in self.labels_list:
            l.E_step()

        ## M step
        # component parameters
        for idx, c in enumerate(self.components):
            c.max_likelihood([l.data for l in self.labels_list],
                    [l.expectations[:,idx] for l in self.labels_list])

        # mixture weights
        self.weights.max_likelihood(np.arange(len(self.components)),
                [l.expectations for l in self.labels_list])

    @property
    def num_parameters(self):
        # NOTE: scikit.learn's gmm.py doesn't count the weights in the number of
        # parameters, but I don't know why they wouldn't. Some convention?
        return sum(c.num_parameters for c in self.components) + self.weights.num_parameters

    def BIC(self,data=None):
        '''
        BIC on the passed data.
        If passed data is None (default), calculates BIC on the model's assigned data.
        '''
        # NOTE: in principle this method computes the BIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        if data is None:
            assert len(self.labels_list) > 0, \
                    "If not passing in data, the class must already have it. Use the method add_data()"
            return -2*sum(self.log_likelihood(l.data) for l in self.labels_list) + \
                        self.num_parameters * np.log(sum(l.data.shape[0] for l in self.labels_list))
        else:
            return -2*self.log_likelihood(data) + self.num_parameters * np.log(data.shape[0])

    def AIC(self):
        # NOTE: in principle this method computes the AIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        assert len(self.labels_list) > 0, 'Must have data to get AIC'
        return 2*self.num_parameters - 2*sum(self.log_likelihood(l.data) for l in self.labels_list)

    ### Misc.

    def plot(self,color=None,legend=True):
        cmap = cm.get_cmap()

        if len(self.labels_list) > 0:
            label_colors = {}

            used_labels = reduce(set.union,[set(l.z) for l in self.labels_list],set([]))
            num_labels = len(used_labels)
            num_subfig_rows = len(self.labels_list)

            for idx,label in enumerate(used_labels):
                label_colors[label] = idx/(num_labels-1 if num_labels > 1 else 1) \
                        if color is None else color

            for subfigidx,l in enumerate(self.labels_list):
                # plot the current observation distributions (and obs. if given)
                plt.subplot(num_subfig_rows,1,1+subfigidx)
                for label, o in enumerate(self.components):
                    if label in l.z:
                        o.plot(color=cmap(label_colors[label]),
                                data=(l.data[l.z == label] if l.data is not None else None),
                                label='%d' % label)

            if legend:
                plt.legend(
                        [plt.Rectangle((0,0),1,1,fc=cmap(c))
                            for i,c in label_colors.iteritems() if i in used_labels],
                        [i for i in label_colors if i in used_labels],
                        loc='best'
                        )

        else:
            top10 = np.array(self.components)[np.argsort(self.weights.weights)][-1:-11:-1]
            colors = [cmap(x) for x in np.linspace(0,1,len(top10))] if color is None \
                    else [color]*len(top10)
            for i,(o,c) in enumerate(zip(top10,colors)):
                o.plot(color=c,label='%d' % i)

    def to_json_dict(self):
        assert len(self.labels_list) == 1
        data = self.labels_list[0].data
        z = self.labels_list[0].z
        assert data.ndim == 2 and data.shape[1] == 2

        return  {
                    'points':[{'x':x,'y':y,'label':int(label)} for x,y,label in zip(data[:,0],data[:,1],z)],
                    'ellipses':[dict(c.to_json_dict().items() + [('label',i)])
                        for i,c in enumerate(self.components) if i in z]
                }

    def predictive_likelihoods(self,test_data,forecast_horizons):
        likes = self._log_likelihoods(test_data)
        return [likes[k:] for k in forecast_horizons]

    def block_predictive_likelihoods(self,test_data,blocklens):
        csums = np.cumsum(self._log_likelihoods(test_data))
        outs = []
        for k in blocklens:
            outs.append(csums[k:] - csums[:-k])
        return outs

コード例 #2

ファイルを表示

ファイル: models.py プロジェクト: ariddell/pybasicbayes

class Mixture(ModelGibbsSampling, ModelMeanField, ModelEM, ModelParallelTempering):
    '''
    This class is for mixtures of other distributions.
    '''
    _labels_class = Labels

    def __init__(self,components,alpha_0=None,a_0=None,b_0=None,weights=None,weights_obj=None):
        assert len(components) > 0
        assert (alpha_0 is not None) ^ (a_0 is not None and b_0 is not None) \
                ^ (weights_obj is not None)

        self.components = components

        if alpha_0 is not None:
            self.weights = Categorical(alpha_0=alpha_0,K=len(components),weights=weights)
        elif weights_obj is not None:
            self.weights = weights_obj
        else:
            self.weights = CategoricalAndConcentration(
                    a_0=a_0,b_0=b_0,K=len(components),weights=weights)

        self.labels_list = []

    def add_data(self,data,**kwargs):
        self.labels_list.append(self._labels_class(data=np.asarray(data),model=self,**kwargs))

    @property
    def N(self):
        return len(self.components)

    def generate(self,N,keep=True):
        templabels = self._labels_class(model=self,N=N)

        out = np.empty(self.components[0].rvs(N).shape)
        counts = np.bincount(templabels.z,minlength=self.N)
        for idx,(c,count) in enumerate(zip(self.components,counts)):
            out[templabels.z == idx,...] = c.rvs(count)

        perm = np.random.permutation(N)
        out = out[perm]
        templabels.z = templabels.z[perm]

        if keep:
            templabels.data = out
            self.labels_list.append(templabels)

        return out, templabels.z

    def _clear_caches(self):
        for l in self.labels_list:
            l.clear_caches()

    def _log_likelihoods(self,x):
        # NOTE: nans propagate as nans
        x = np.asarray(x)
        K = len(self.components)
        vals = np.empty((x.shape[0],K))
        for idx, c in enumerate(self.components):
            vals[:,idx] = c.log_likelihood(x)
        vals += self.weights.log_likelihood(np.arange(K))
        return logsumexp(vals,axis=1)

    def log_likelihood(self,x=None):
        if x is None:
            return sum(l.log_likelihood() for l in self.labels_list)
        else:
            assert isinstance(x,(np.ndarray,list))
            if isinstance(x,list):
                return sum(self.log_likelihood(d) for d in x)
            else:
                self.add_data(x)
                return self.labels_list.pop().log_likelihood()

    ### parallel tempering

    @property
    def temperature(self):
        return self._temperature if hasattr(self,'_temperature') else 1.

    @temperature.setter
    def temperature(self,T):
        self._temperature = T

    @property
    def energy(self):
        energy = 0.
        for l in self.labels_list:
            for label, datum in zip(l.z,l.data):
                energy += self.components[label].energy(datum)
        return energy

    def swap_sample_with(self,other):
        self.components, other.components = other.components, self.components
        self.weights, other.weights = other.weights, self.weights

        for l1, l2 in zip(self.labels_list,other.labels_list):
            l1.z, l2.z = l2.z, l1.z

    ### Gibbs sampling

    def resample_model(self,num_procs=0,components_jobs=0):
        self.resample_components(num_procs=components_jobs)
        self.resample_weights()
        self.resample_labels(num_procs=num_procs)

    def resample_weights(self):
        self.weights.resample([l.z for l in self.labels_list])
        self._clear_caches()

    def resample_components(self,num_procs=0):
        if num_procs == 0:
            for idx, c in enumerate(self.components):
                c.resample(data=[l.data[l.z == idx] for l in self.labels_list])
        else:
            self._resample_components_joblib(num_procs)
        self._clear_caches()

    def resample_labels(self,num_procs=0):
        if num_procs == 0:
            for l in self.labels_list:
                l.resample()
        else:
            self._resample_labels_joblib(num_procs)

    def copy_sample(self):
        new = copy.copy(self)
        new.components = [c.copy_sample() for c in self.components]
        new.weights = self.weights.copy_sample()
        new.labels_list = [l.copy_sample() for l in self.labels_list]
        for l in new.labels_list:
            l.model = new
        return new

    def _resample_components_joblib(self,num_procs):
        from joblib import Parallel, delayed
        import parallel

        parallel.model = self
        parallel.labels_list = self.labels_list

        if len(self.components) > 0:
            params = Parallel(n_jobs=num_procs,backend='multiprocessing')\
                    (delayed(parallel._get_sampled_component_params)(idx)
                            for idx in range(len(self.components)))

        for c, p in zip(self.components,params):
            c.parameters = p

    def _resample_labels_joblib(self,num_procs):
        from joblib import Parallel, delayed
        import parallel

        if len(self.labels_list) > 0:
            parallel.model = self

            raw = Parallel(n_jobs=num_procs,backend='multiprocessing')\
                    (delayed(parallel._get_sampled_labels)(idx)
                            for idx in range(len(self.labels_list)))

            for l, (z,normalizer) in zip(self.labels_list,raw):
                l.z, l._normalizer = z, normalizer


    ### Mean Field

    def meanfield_coordinate_descent_step(self):
        assert all(isinstance(c,MeanField) for c in self.components), \
                'Components must implement MeanField'
        assert len(self.labels_list) > 0, 'Must have data to run MeanField'

        self._meanfield_update_sweep()
        return self._vlb()

    def _meanfield_update_sweep(self):
        # NOTE: to interleave mean field steps with Gibbs sampling steps, label
        # updates need to come first, otherwise the sampled updates will be
        # ignored and the model will essentially stay where it was the last time
        # mean field updates were run
        # TODO fix that, seed with sample from variational distribution
        self.meanfield_update_labels()
        self.meanfield_update_parameters()

    def meanfield_update_labels(self):
        for l in self.labels_list:
            l.meanfieldupdate()

    def meanfield_update_parameters(self):
        self.meanfield_update_components()
        self.meanfield_update_weights()

    def meanfield_update_weights(self):
        self.weights.meanfieldupdate(None,[l.r for l in self.labels_list])
        self._clear_caches()

    def meanfield_update_components(self):
        for idx, c in enumerate(self.components):
            c.meanfieldupdate([l.data for l in self.labels_list],
                    [l.r[:,idx] for l in self.labels_list])
        self._clear_caches()

    def _vlb(self):
        vlb = 0.
        vlb += sum(l.get_vlb() for l in self.labels_list)
        vlb += self.weights.get_vlb()
        vlb += sum(c.get_vlb() for c in self.components)
        for l in self.labels_list:
            vlb += np.sum([r.dot(c.expected_log_likelihood(l.data))
                                for c,r in zip(self.components, l.r.T)])

        # add in symmetry factor (if we're actually symmetric)
        if len(set(type(c) for c in self.components)) == 1:
            vlb += special.gammaln(len(self.components)+1)

        return vlb

    ### SVI

    def meanfield_sgdstep(self,minibatch,minibatchfrac,stepsize,**kwargs):
        minibatch = minibatch if isinstance(minibatch,list) else [minibatch]
        mb_labels_list = []
        for data in minibatch:
            self.add_data(data,z=np.empty(data.shape[0]),**kwargs) # NOTE: dummy
            mb_labels_list.append(self.labels_list.pop())

        for l in mb_labels_list:
            l.meanfieldupdate()

        self._meanfield_sgdstep_parameters(mb_labels_list,minibatchfrac,stepsize)

    def _meanfield_sgdstep_parameters(self,mb_labels_list,minibatchfrac,stepsize):
        self._meanfield_sgdstep_components(mb_labels_list,minibatchfrac,stepsize)
        self._meanfield_sgdstep_weights(mb_labels_list,minibatchfrac,stepsize)

    def _meanfield_sgdstep_components(self,mb_labels_list,minibatchfrac,stepsize):
        for idx, c in enumerate(self.components):
            c.meanfield_sgdstep(
                    [l.data for l in mb_labels_list],
                    [l.r[:,idx] for l in mb_labels_list],
                    minibatchfrac,stepsize)

    def _meanfield_sgdstep_weights(self,mb_labels_list,minibatchfrac,stepsize):
        self.weights.meanfield_sgdstep(
                None,[l.r for l in mb_labels_list],
                minibatchfrac,stepsize)

    ### EM

    def EM_step(self):
        # assert all(isinstance(c,MaxLikelihood) for c in self.components), \
        #         'Components must implement MaxLikelihood'
        assert len(self.labels_list) > 0, 'Must have data to run EM'

        ## E step
        for l in self.labels_list:
            l.E_step()

        ## M step
        # component parameters
        for idx, c in enumerate(self.components):
            c.max_likelihood([l.data for l in self.labels_list],
                    [l.expectations[:,idx] for l in self.labels_list])

        # mixture weights
        self.weights.max_likelihood(np.arange(len(self.components)),
                [l.expectations for l in self.labels_list])

    @property
    def num_parameters(self):
        # NOTE: scikit.learn's gmm.py doesn't count the weights in the number of
        # parameters, but I don't know why they wouldn't. Some convention?
        return sum(c.num_parameters for c in self.components) + self.weights.num_parameters

    def BIC(self,data=None):
        '''
        BIC on the passed data.
        If passed data is None (default), calculates BIC on the model's assigned data.
        '''
        # NOTE: in principle this method computes the BIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        if data is None:
            assert len(self.labels_list) > 0, \
                    "If not passing in data, the class must already have it. Use the method add_data()"
            return -2*sum(self.log_likelihood(l.data) for l in self.labels_list) + \
                        self.num_parameters * np.log(sum(l.data.shape[0] for l in self.labels_list))
        else:
            return -2*self.log_likelihood(data) + self.num_parameters * np.log(data.shape[0])

    def AIC(self):
        # NOTE: in principle this method computes the AIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        assert len(self.labels_list) > 0, 'Must have data to get AIC'
        return 2*self.num_parameters - 2*sum(self.log_likelihood(l.data) for l in self.labels_list)

    ### Misc.

    @property
    def used_labels(self):
        if len(self.labels_list) > 0:
            label_usages = sum(np.bincount(l.z,minlength=self.N) for l in self.labels_list)
            used_labels, = np.where(label_usages > 0)
        else:
            used_labels = np.argsort(self.weights.weights)[-1:-11:-1]
        return used_labels

    def plot(self,color=None,legend=False,alpha=None,update=False,draw=True):
        import matplotlib.pyplot as plt
        from matplotlib import cm
        artists = []

        ### get colors
        cmap = cm.get_cmap()
        if color is None:
            label_colors = dict((idx,cmap(v))
                for idx, v in enumerate(np.linspace(0,1,self.N,endpoint=True)))
        else:
            label_colors = dict((idx,color) for idx in range(self.N))

        ### plot data scatter
        for l in self.labels_list:
            colorseq = [label_colors[label] for label in l.z]
            if update and hasattr(l,'_data_scatter'):
                l._data_scatter.set_offsets(l.data[:,:2])
                l._data_scatter.set_color(colorseq)
            else:
                l._data_scatter = plt.scatter(l.data[:,0],l.data[:,1],c=colorseq,s=5)
            artists.append(l._data_scatter)

        ### plot parameters
        axis = plt.axis()
        for label, (c, w) in enumerate(zip(self.components,self.weights.weights)):
            artists.extend(
                c.plot(
                    color=label_colors[label],
                    label='%d' % label,
                    alpha=min(0.25,1.-(1.-w)**2)/0.25 if alpha is None else alpha,
                    update=update,draw=False))
        plt.axis(axis)

        ### add legend
        if legend and color is None:
            plt.legend(
                [plt.Rectangle((0,0),1,1,fc=c)
                    for i,c in label_colors.iteritems() if i in used_labels],
                [i for i in label_colors if i in used_labels],
                loc='best', ncol=2)

        if draw: plt.draw()
        return artists


    def to_json_dict(self):
        assert len(self.labels_list) == 1
        data = self.labels_list[0].data
        z = self.labels_list[0].z
        assert data.ndim == 2 and data.shape[1] == 2

        return  {
                    'points':[{'x':x,'y':y,'label':int(label)} for x,y,label in zip(data[:,0],data[:,1],z)],
                    'ellipses':[dict(c.to_json_dict().items() + [('label',i)])
                        for i,c in enumerate(self.components) if i in z]
                }

    def predictive_likelihoods(self,test_data,forecast_horizons):
        likes = self._log_likelihoods(test_data)
        return [likes[k:] for k in forecast_horizons]

    def block_predictive_likelihoods(self,test_data,blocklens):
        csums = np.cumsum(self._log_likelihoods(test_data))
        outs = []
        for k in blocklens:
            outs.append(csums[k:] - csums[:-k])
        return outs

コード例 #3

ファイルを表示

class Mixture(ModelGibbsSampling, ModelMeanField, ModelEM,
              ModelParallelTempering):
    '''
    This class is for mixtures of other distributions.
    '''
    def __init__(self,
                 components,
                 alpha_0=None,
                 a_0=None,
                 b_0=None,
                 weights=None,
                 weights_obj=None):
        assert len(components) > 0
        assert (alpha_0 is not None) ^ (a_0 is not None and b_0 is not None) \
                ^ (weights_obj is not None)

        self.components = components

        if alpha_0 is not None:
            self.weights = Categorical(alpha_0=alpha_0,
                                       K=len(components),
                                       weights=weights)
        elif weights_obj is not None:
            self.weights = weights_obj
        else:
            self.weights = CategoricalAndConcentration(a_0=a_0,
                                                       b_0=b_0,
                                                       K=len(components),
                                                       weights=weights)

        self.labels_list = []

    def add_data(self, data, **kwargs):
        self.labels_list.append(
            Labels(data=np.asarray(data), model=self, **kwargs))

    def generate(self, N, keep=True):
        templabels = Labels(model=self, N=N)

        out = np.empty(self.components[0].rvs(N).shape)
        counts = np.bincount(templabels.z, minlength=len(self.components))
        for idx, (c, count) in enumerate(zip(self.components, counts)):
            out[templabels.z == idx, ...] = c.rvs(count)

        perm = np.random.permutation(N)
        out = out[perm]
        templabels.z = templabels.z[perm]

        if keep:
            templabels.data = out
            self.labels_list.append(templabels)

        return out, templabels.z

    def _clear_caches(self):
        for l in self.labels_list:
            l.clear_caches()

    def _log_likelihoods(self, x):
        # NOTE: nans propagate as nans
        x = np.asarray(x)
        K = len(self.components)
        vals = np.empty((x.shape[0], K))
        for idx, c in enumerate(self.components):
            vals[:, idx] = c.log_likelihood(x)
        vals += self.weights.log_likelihood(np.arange(K))
        return np.logaddexp.reduce(vals, axis=1)

    def log_likelihood(self, x=None):
        if x is None:
            return sum(l.log_likelihood() for l in self.labels_list)
        else:
            assert isinstance(x, (np.ndarray, list))
            if isinstance(x, list):
                return sum(self.log_likelihood(d) for d in x)
            else:
                self.add_data(x)
                return self.labels_list.pop().log_likelihood()

    ### parallel tempering

    @property
    def temperature(self):
        return self._temperature if hasattr(self, '_temperature') else 1.

    @temperature.setter
    def temperature(self, T):
        self._temperature = T

    @property
    def energy(self):
        energy = 0.
        for l in self.labels_list:
            for label, datum in zip(l.z, l.data):
                energy += self.components[label].energy(datum)
        return energy

    def swap_sample_with(self, other):
        self.components, other.components = other.components, self.components
        self.weights, other.weights = other.weights, self.weights

        for l1, l2 in zip(self.labels_list, other.labels_list):
            l1.z, l2.z = l2.z, l1.z

    ### Gibbs sampling

    def resample_model(self, labels_jobs=0, components_jobs=0):
        self.resample_components(joblib_jobs=components_jobs)
        self.resample_weights()
        self.resample_labels(joblib_jobs=labels_jobs)

    def resample_weights(self):
        self.weights.resample([l.z for l in self.labels_list])
        self._clear_caches()

    def resample_components(self, joblib_jobs=0):
        if joblib_jobs == 0:
            for idx, c in enumerate(self.components):
                c.resample(data=[l.data[l.z == idx] for l in self.labels_list])
        else:
            self._resample_components_joblib(joblib_jobs)
        self._clear_caches()

    def resample_labels(self, joblib_jobs=0):
        if joblib_jobs == 0:
            for l in self.labels_list:
                l.resample()
        else:
            self._resample_labels_joblib(joblib_jobs)

    def copy_sample(self):
        new = copy.copy(self)
        new.components = [c.copy_sample() for c in self.components]
        new.weights = self.weights.copy_sample()
        new.labels_list = [l.copy_sample() for l in self.labels_list]
        for l in new.labels_list:
            l.model = new
        return new

    def _resample_components_joblib(self, joblib_jobs):
        from joblib import Parallel, delayed
        import parallel

        parallel.model = self
        parallel.labels_list = self.labels_list

        if len(self.components) > 0:
            params = Parallel(n_jobs=joblib_jobs,backend='multiprocessing')\
                    (delayed(parallel._get_sampled_component_params)(idx)
                            for idx in range(len(self.components)))

        for c, p in zip(self.components, params):
            c.parameters = p

    def _resample_labels_joblib(self, joblib_jobs):
        from joblib import Parallel, delayed
        import parallel

        if len(self.labels_list) > 0:
            parallel.model = self

            raw = Parallel(n_jobs=joblib_jobs,backend='multiprocessing')\
                    (delayed(parallel._get_sampled_labels)(idx)
                            for idx in range(len(self.labels_list)))

            for l, (z, normalizer) in zip(self.labels_list, raw):
                l.z, l._normalizer = z, normalizer

    ### Mean Field

    def meanfield_coordinate_descent_step(self):
        assert all(isinstance(c,MeanField) for c in self.components), \
                'Components must implement MeanField'
        assert len(self.labels_list) > 0, 'Must have data to run MeanField'

        self._meanfield_update_sweep()
        return self._vlb()

    def _meanfield_update_sweep(self):
        # NOTE: to interleave mean field steps with Gibbs sampling steps, label
        # updates need to come first, otherwise the sampled updates will be
        # ignored and the model will essentially stay where it was the last time
        # mean field updates were run
        # TODO fix that, seed with sample from variational distribution
        self.meanfield_update_labels()
        self.meanfield_update_parameters()

    def meanfield_update_labels(self):
        for l in self.labels_list:
            l.meanfieldupdate()

    def meanfield_update_parameters(self):
        self.meanfield_update_components()
        self.meanfield_update_weights()

    def meanfield_update_weights(self):
        self.weights.meanfieldupdate(None, [l.r for l in self.labels_list])
        self._clear_caches()

    def meanfield_update_components(self):
        for idx, c in enumerate(self.components):
            c.meanfieldupdate([l.data for l in self.labels_list],
                              [l.r[:, idx] for l in self.labels_list])
        self._clear_caches()

    def _vlb(self):
        vlb = 0.
        vlb += sum(l.get_vlb() for l in self.labels_list)
        vlb += self.weights.get_vlb()
        vlb += sum(c.get_vlb() for c in self.components)
        for l in self.labels_list:
            vlb += np.sum([
                r.dot(c.expected_log_likelihood(l.data))
                for c, r in zip(self.components, l.r.T)
            ])

        # add in symmetry factor (if we're actually symmetric)
        if len(set(type(c) for c in self.components)) == 1:
            vlb += special.gammaln(len(self.components) + 1)

        return vlb

    ### SVI

    def meanfield_sgdstep(self, minibatch, minibatchfrac, stepsize, **kwargs):
        minibatch = minibatch if isinstance(minibatch, list) else [minibatch]
        mb_labels_list = []
        for data in minibatch:
            self.add_data(data, z=np.empty(data.shape[0]),
                          **kwargs)  # NOTE: dummy
            mb_labels_list.append(self.labels_list.pop())

        for l in mb_labels_list:
            l.meanfieldupdate()

        self._meanfield_sgdstep_parameters(mb_labels_list, minibatchfrac,
                                           stepsize)

    def _meanfield_sgdstep_parameters(self, mb_labels_list, minibatchfrac,
                                      stepsize):
        self._meanfield_sgdstep_components(mb_labels_list, minibatchfrac,
                                           stepsize)
        self._meanfield_sgdstep_weights(mb_labels_list, minibatchfrac,
                                        stepsize)

    def _meanfield_sgdstep_components(self, mb_labels_list, minibatchfrac,
                                      stepsize):
        for idx, c in enumerate(self.components):
            c.meanfield_sgdstep([l.data for l in mb_labels_list],
                                [l.r[:, idx] for l in mb_labels_list],
                                minibatchfrac, stepsize)

    def _meanfield_sgdstep_weights(self, mb_labels_list, minibatchfrac,
                                   stepsize):
        self.weights.meanfield_sgdstep(None, [l.r for l in mb_labels_list],
                                       minibatchfrac, stepsize)

    ### EM

    def EM_step(self):
        # assert all(isinstance(c,MaxLikelihood) for c in self.components), \
        #         'Components must implement MaxLikelihood'
        assert len(self.labels_list) > 0, 'Must have data to run EM'

        ## E step
        for l in self.labels_list:
            l.E_step()

        ## M step
        # component parameters
        for idx, c in enumerate(self.components):
            c.max_likelihood(
                [l.data for l in self.labels_list],
                [l.expectations[:, idx] for l in self.labels_list])

        # mixture weights
        self.weights.max_likelihood(np.arange(len(self.components)),
                                    [l.expectations for l in self.labels_list])

    @property
    def num_parameters(self):
        # NOTE: scikit.learn's gmm.py doesn't count the weights in the number of
        # parameters, but I don't know why they wouldn't. Some convention?
        return sum(c.num_parameters
                   for c in self.components) + self.weights.num_parameters

    def BIC(self, data=None):
        '''
        BIC on the passed data.
        If passed data is None (default), calculates BIC on the model's assigned data.
        '''
        # NOTE: in principle this method computes the BIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        if data is None:
            assert len(self.labels_list) > 0, \
                    "If not passing in data, the class must already have it. Use the method add_data()"
            return -2*sum(self.log_likelihood(l.data) for l in self.labels_list) + \
                        self.num_parameters * np.log(sum(l.data.shape[0] for l in self.labels_list))
        else:
            return -2 * self.log_likelihood(
                data) + self.num_parameters * np.log(data.shape[0])

    def AIC(self):
        # NOTE: in principle this method computes the AIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        assert len(self.labels_list) > 0, 'Must have data to get AIC'
        return 2 * self.num_parameters - 2 * sum(
            self.log_likelihood(l.data) for l in self.labels_list)

    ### Misc.

    def plot(self,
             color=None,
             legend=True,
             alpha=None,
             plot_all_components=True):
        cmap = cm.get_cmap()

        if len(self.labels_list) > 0:
            label_colors = {}

            if plot_all_components:
                assigned_labels = reduce(set.union,
                                         [set(l.z) for l in self.labels_list],
                                         set([]))
                used_labels = np.arange(len(self.components))
            else:
                used_labels = assigned_labels = \
                        reduce(set.union,[set(l.z) for l in self.labels_list],set([]))
            num_labels = len(used_labels)
            num_subfig_rows = len(self.labels_list)

            for idx, label in enumerate(used_labels):
                label_colors[label] = idx/(num_labels-1 if num_labels > 1 else 1) \
                        if color is None else color

            for subfigidx, l in enumerate(self.labels_list):
                # plot the current observation distributions (and obs. if given)
                plt.subplot(num_subfig_rows, 1, 1 + subfigidx)
                for label, (weight, o) in enumerate(
                        zip(self.weights.weights, self.components)):
                    o.plot(
                        color=cmap(label_colors[label])
                        if color is None else color,
                        data=(l.data[l.z == label]
                              if l.data is not None else None),
                        label='%d' % label,
                        alpha=0.1 * (label in assigned_labels) +
                        0.9 * self.weights.weights[label] /
                        self.weights.weights.max() if alpha is None else alpha)

            if legend and color is None:
                plt.legend([
                    plt.Rectangle((0, 0), 1, 1, fc=cmap(c))
                    for i, c in label_colors.iteritems() if i in used_labels
                ], [i for i in label_colors if i in used_labels],
                           loc='best',
                           ncol=2)

        else:
            top10indices = np.argsort(self.weights.weights)[-1:-11:-1]
            top10 = np.array(self.components)[top10indices]
            colors = [cmap(x) for x in np.linspace(0,1,len(top10))] if color is None \
                    else [color]*len(top10)
            for i, (o, c) in enumerate(zip(top10, colors)):
                o.plot(color=c,
                       label='%d' % i,
                       alpha=0.04 +
                       0.95 * self.weights.weights[top10indices[i]] /
                       self.weights.weights[top10indices].max()
                       if alpha is None else alpha)

    def to_json_dict(self):
        assert len(self.labels_list) == 1
        data = self.labels_list[0].data
        z = self.labels_list[0].z
        assert data.ndim == 2 and data.shape[1] == 2

        return {
            'points': [{
                'x': x,
                'y': y,
                'label': int(label)
            } for x, y, label in zip(data[:, 0], data[:, 1], z)],
            'ellipses': [
                dict(c.to_json_dict().items() + [('label', i)])
                for i, c in enumerate(self.components) if i in z
            ]
        }

    def predictive_likelihoods(self, test_data, forecast_horizons):
        likes = self._log_likelihoods(test_data)
        return [likes[k:] for k in forecast_horizons]

    def block_predictive_likelihoods(self, test_data, blocklens):
        csums = np.cumsum(self._log_likelihoods(test_data))
        outs = []
        for k in blocklens:
            outs.append(csums[k:] - csums[:-k])
        return outs

コード例 #4

ファイルを表示

class Mixture(ModelGibbsSampling, ModelMeanField, ModelEM):
    '''
    This class is for mixtures of other distributions.
    '''
    def __init__(self,
                 components,
                 alpha_0=None,
                 a_0=None,
                 b_0=None,
                 weights=None,
                 weights_obj=None):
        assert len(components) > 0
        assert (alpha_0 is not None) ^ (a_0 is not None and b_0 is not None) \
                ^ (weights_obj is not None)

        self.components = components

        if alpha_0 is not None:
            self.weights = Categorical(alpha_0=alpha_0,
                                       K=len(components),
                                       weights=weights)
        elif weights_obj is not None:
            self.weights = weights_obj
        else:
            self.weights = CategoricalAndConcentration(a_0=a_0,
                                                       b_0=b_0,
                                                       K=len(components),
                                                       weights=weights)

        self.labels_list = []

    def add_data(self, data, **kwargs):
        self.labels_list.append(
            Labels(data=np.asarray(data),
                   components=self.components,
                   weights=self.weights,
                   **kwargs))

    def generate(self, N, keep=True):
        templabels = Labels(components=self.components,
                            weights=self.weights,
                            N=N)

        out = np.empty(self.components[0].rvs(N).shape)
        counts = np.bincount(templabels.z, minlength=len(self.components))
        for idx, (c, count) in enumerate(zip(self.components, counts)):
            out[templabels.z == idx, ...] = c.rvs(count)

        perm = np.random.permutation(N)
        out = out[perm]
        templabels.z = templabels.z[perm]

        if keep:
            templabels.data = out
            self.labels_list.append(templabels)

        return out, templabels.z

    def _log_likelihoods(self, x):
        x = np.asarray(x)
        K = len(self.components)
        vals = np.empty((x.shape[0], K))
        for idx, c in enumerate(self.components):
            vals[:, idx] = c.log_likelihood(x)
        vals += self.weights.log_likelihood(np.arange(K))
        assert not np.isnan(vals).any()
        return np.logaddexp.reduce(vals, axis=1)

    def log_likelihood(self, x):
        return self._log_likelihoods(x).sum()

    ### Gibbs sampling

    def resample_model(self, temp=None):
        assert all(isinstance(c,GibbsSampling) for c in self.components), \
                'Components must implement GibbsSampling'
        for idx, c in enumerate(self.components):
            c.resample(data=[l.data[l.z == idx] for l in self.labels_list])

        self.weights.resample([l.z for l in self.labels_list])

        for l in self.labels_list:
            l.resample(temp=temp)

    def copy_sample(self):
        new = copy.copy(self)
        new.components = [c.copy_sample() for c in self.components]
        new.weights = self.weights.copy_sample()
        new.labels_list = [l.copy_sample() for l in self.labels_list]
        return new

    ### Mean Field

    def meanfield_coordinate_descent_step(self):
        assert all(isinstance(c,MeanField) for c in self.components), \
                'Components must implement MeanField'
        assert len(self.labels_list) > 0, 'Must have data to run MeanField'

        # NOTE: to interleave mean field steps with Gibbs sampling steps, label
        # updates needs to come first, otherwise the sampled updates will be
        # ignored and the model will essentially stay where it was the last time
        # mean field updates were run

        ### update sweep!
        # update the label responsibilities
        for l in self.labels_list:
            l.meanfieldupdate()

        # pass the weights to pi
        K = len(self.components)
        self.weights.meanfieldupdate(np.arange(K),
                                     [l.r for l in self.labels_list])

        # pass the weights to the components
        for idx, c in enumerate(self.components):
            c.meanfieldupdate([l.data for l in self.labels_list],
                              [l.r[:, idx] for l in self.labels_list])

        ### get vlb!
        vlb = 0.

        # get labels terms
        vlb += sum(l.get_vlb() for l in self.labels_list)

        # get pi term
        vlb += self.weights.get_vlb()

        # get components terms
        vlb += sum(c.get_vlb() for c in self.components)

        # finally, need the evidence term in the vlb
        for l in self.labels_list:
            vlb += np.sum([
                r.dot(c.expected_log_likelihood(l.data))
                for c, r in zip(self.components, l.r.T)
            ])

        # add in symmetry factor (if we're actually symmetric)
        if len(set(type(c) for c in self.components)) == 1:
            vlb += special.gammaln(len(self.components) + 1)

        return vlb

    ### EM

    def EM_step(self):
        assert all(isinstance(c,MaxLikelihood) for c in self.components), \
                'Components must implement MaxLikelihood'
        assert len(self.labels_list) > 0, 'Must have data to run EM'

        ## E step
        for l in self.labels_list:
            l.E_step()

        ## M step
        # component parameters
        for idx, c in enumerate(self.components):
            c.max_likelihood(
                [l.data for l in self.labels_list],
                [l.expectations[:, idx] for l in self.labels_list])

        # mixture weights
        self.weights.max_likelihood(np.arange(len(self.components)),
                                    [l.expectations for l in self.labels_list])

    @property
    def num_parameters(self):
        # NOTE: scikit.learn's gmm.py doesn't count the weights in the number of
        # parameters, but I don't know why they wouldn't. Some convention?
        return sum(c.num_parameters
                   for c in self.components) + self.weights.num_parameters

    def BIC(self, data=None):
        '''
        BIC on the passed data.
        If passed data is None (default), calculates BIC on the model's assigned data.
        '''
        # NOTE: in principle this method computes the BIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        if data is None:
            assert len(self.labels_list) > 0, \
                    "If not passing in data, the class must already have it. Use the method add_data()"
            return -2*sum(self.log_likelihood(l.data) for l in self.labels_list) + \
                        self.num_parameters * np.log(sum(l.data.shape[0] for l in self.labels_list))
        else:
            return -2 * self.log_likelihood(
                data) + self.num_parameters * np.log(data.shape[0])

    def AIC(self):
        # NOTE: in principle this method computes the AIC only after finding the
        # maximum likelihood parameters (or, of course, an EM fixed-point as an
        # approximation!)
        assert len(self.labels_list) > 0, 'Must have data to get AIC'
        return 2 * self.num_parameters - 2 * sum(
            self.log_likelihood(l.data) for l in self.labels_list)

    ### Misc.

    def plot(self, color=None, legend=True):
        cmap = cm.get_cmap()

        if len(self.labels_list) > 0:
            label_colors = {}

            used_labels = reduce(set.union,
                                 [set(l.z) for l in self.labels_list], set([]))
            num_labels = len(used_labels)
            num_subfig_rows = len(self.labels_list)

            for idx, label in enumerate(used_labels):
                label_colors[label] = idx/(num_labels-1 if num_labels > 1 else 1) \
                        if color is None else color

            for subfigidx, l in enumerate(self.labels_list):
                # plot the current observation distributions (and obs. if given)
                plt.subplot(num_subfig_rows, 1, 1 + subfigidx)
                for label, o in enumerate(self.components):
                    if label in l.z:
                        o.plot(color=cmap(label_colors[label]),
                               data=(l.data[l.z == label]
                                     if l.data is not None else None),
                               label='%d' % label)

            if legend:
                plt.legend([
                    plt.Rectangle((0, 0), 1, 1, fc=cmap(c))
                    for i, c in label_colors.iteritems() if i in used_labels
                ], [i for i in label_colors if i in used_labels],
                           loc='best')

        else:
            top10 = np.array(self.components)[np.argsort(
                self.weights.weights)][-1:-11:-1]
            colors = [cmap(x) for x in np.linspace(0,1,len(top10))] if color is None \
                    else [color]*len(top10)
            for i, (o, c) in enumerate(zip(top10, colors)):
                o.plot(color=c, label='%d' % i)

    def to_json_dict(self):
        assert len(self.labels_list) == 1
        data = self.labels_list[0].data
        z = self.labels_list[0].z
        assert data.ndim == 2 and data.shape[1] == 2

        return {
            'points': [{
                'x': x,
                'y': y,
                'label': int(label)
            } for x, y, label in zip(data[:, 0], data[:, 1], z)],
            'ellipses': [
                dict(c.to_json_dict().items() + [('label', i)])
                for i, c in enumerate(self.components) if i in z
            ]
        }

    def predictive_likelihoods(self, test_data, forecast_horizons):
        likes = self._log_likelihoods(test_data)
        return [likes[k:] for k in forecast_horizons]

    def block_predictive_likelihoods(self, test_data, blocklens):
        csums = np.cumsum(self._log_likelihoods(test_data))
        outs = []
        for k in blocklens:
            outs.append(csums[k:] - csums[:-k])
        return outs