def test_logseries_convergence(self):
    # Test for ticket #923
    N = 1000
    random.seed(0)
    rvsn = random.logseries(0.8, size=N)
    # these two frequency counts should be close to theoretical
    # numbers with this large sample
    # theoretical large N result is 0.49706795
    freq = np.sum(rvsn == 1) / float(N)
    msg = "Frequency was %f, should be > 0.45" % freq
    assert_(freq > 0.45, msg)
    # theoretical large N result is 0.19882718
    freq = np.sum(rvsn == 2) / float(N)
    msg = "Frequency was %f, should be < 0.23" % freq
    assert_(freq < 0.23, msg)
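# The theoretical constants cited in the test follow from the log-series
# PMF, P(k) = -p**k / (k * log(1 - p)). A minimal, self-contained sketch
# (the helper name logseries_pmf is illustrative, not from the test suite):
import numpy as np

def logseries_pmf(k, p):
    # probability of drawing k from a log-series distribution with parameter p
    return -p**k / (k * np.log(1.0 - p))

print(logseries_pmf(1, 0.8))  # ~0.49706795, matches the k == 1 bound above
print(logseries_pmf(2, 0.8))  # ~0.19882718, matches the k == 2 bound above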
def logarithmic(self, p):
    '''
    Parameters:
        p: float, must be in range (0, 1).
    '''
    return r.logseries(p, self.size)
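# numpy enforces the (0, 1) range documented above by raising ValueError.
# A hedged usage sketch with the modern Generator API (rng and the sizes
# are illustrative; the wrapper above uses its own r and self.size):
import numpy as np

rng = np.random.default_rng(0)
samples = rng.logseries(0.5, size=4)   # valid: p inside (0, 1)
try:
    rng.logseries(1.0, size=4)         # invalid: p must be strictly < 1
except ValueError as err:
    print(err)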
def _rvs(self, p):
    # looks wrong for p>0.5, too few k=1
    # trying to use generic is worse, no k=1 at all
    return mtrand.logseries(p, size=self._size)
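# A diagnostic sketch for the suspicion voiced in the comments: compare the
# empirical frequency of k == 1 against the theoretical PMF for a large p.
# This samples numpy directly; p and the sample size are illustrative.
import numpy as np

p = 0.9
rng = np.random.default_rng(0)
draws = rng.logseries(p, size=100_000)
empirical = np.mean(draws == 1)
theoretical = -p / np.log(1.0 - p)   # P(k = 1) for the log-series distribution
print(empirical, theoretical)        # should agree closely if sampling is correct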
def loglikelihood(self, X, num_samples=10, method='biased',
                  sampling_method=('ais', {'num_steps': 10}), **kwargs):
    """
    Computes the log-likelihood (in nats) for a set of data samples. If the
    model is overcomplete, the log-likelihood is estimated using one of two
    importance sampling methods. The biased method tends to underestimate the
    log-likelihood; to reduce the bias, use more samples. The unbiased method
    often suffers from extremely high variance and should be used with caution.

    @type  X: array_like
    @param X: a number of visible states stored in columns

    @type  method: string
    @param method: whether to use the 'biased' or 'unbiased' method

    @type  num_samples: integer
    @param num_samples: number of generated importance weights

    @type  sampling_method: tuple
    @param sampling_method: method and parameters used to generate importance weights

    @type  return_all: boolean
    @param return_all: if true, return all importance weights and don't average
        (default: False)

    @rtype: ndarray
    @return: the log-probability of each data point
    """
    return_all = kwargs.get('return_all', False)

    if self.num_hiddens == self.num_visibles:
        # complete model: evaluate the log-likelihood in closed form
        return self.prior_loglikelihood(dot(inv(self.A), X)) - slogdet(self.A)[1]

    else:
        if method == 'biased':
            # sample importance weights
            log_is_weights = asshmarray(empty([num_samples, X.shape[1]]))

            def parfor(i):
                log_is_weights[i] = self.sample_posterior_ais(X, **sampling_method[1])[1]
            mapp(parfor, range(num_samples))

            if return_all:
                return asarray(log_is_weights)
            else:
                # average importance weights to get log-likelihoods
                return logmeanexp(log_is_weights, 0)

        elif method == 'unbiased':
            loglik = empty(X.shape[1])

            # sample importance weights
            log_is_weights = asshmarray(empty([num_samples, X.shape[1]]))

            def parfor(i):
                log_is_weights[i] = self.sample_posterior_ais(X, **sampling_method[1])[1]
            mapp(parfor, range(num_samples))

            # obtain an initial first guess using the biased method
            is_weights = exp(log_is_weights)
            is_mean = mean(is_weights, 0)
            is_var = var(is_weights, 0, ddof=1)

            # Taylor series expansion points
            c = (is_var + square(is_mean)) / is_mean

            # logarithmic series distribution parameters
            p = sqrt(is_var / (is_var + square(is_mean)))

            # sample "number of importance samples" for each data point
            num_samples = array([logseries(p_) for p_ in p], dtype='uint32')

            for k in range(1, max(num_samples) + 1):
                # data points for which to generate k importance weights
                indices = where(num_samples == k)[0]

                if len(indices) > 0:
                    # sample importance weights
                    log_is_weights = asshmarray(empty([k, len(indices)]))

                    def parfor(i):
                        log_is_weights[i] = self.sample_posterior_ais(
                            X[:, indices], **sampling_method[1])[1]
                    mapp(parfor, range(k))

                    # hyperparameters used for the selected data points
                    c_ = c[indices]
                    p_ = p[indices]

                    # unbiased estimate of the log-likelihood
                    loglik[indices] = log(c_) + log(1. - p_) * \
                        prod((c_ - exp(log_is_weights)) / (c_ * p_), 0)

            if return_all:
                return loglik
            else:
                return mean(loglik, 0).reshape(1, -1)

        else:
            raise NotImplementedError('Unknown method \'{0}\'.'.format(method))
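# Minimal sketch of the Russian-roulette step used by the 'unbiased' branch:
# the number of importance samples per data point is drawn from a log-series
# distribution whose parameter p is matched to the estimated weight variance.
# is_mean and is_var below are illustrative stand-ins for the per-data-point
# moments computed inside the method, not values from the original module.
import numpy as np

rng = np.random.default_rng(0)
is_mean = np.array([1.0, 2.0, 0.5])
is_var = np.array([0.5, 4.0, 0.25])
p = np.sqrt(is_var / (is_var + np.square(is_mean)))
num_samples = np.array([rng.logseries(p_) for p_ in p], dtype='uint32')
print(p, num_samples)   # higher-variance weights tend to draw more follow-up samples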
def logseries(size, params):
    try:
        return random.logseries(params['p'], size)
    except ValueError as e:
        # abort with the error message if p lies outside (0, 1)
        exit(e)
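# Usage sketch for the wrapper above (assumes it and numpy's random module
# are in scope); the parameter values are illustrative:
samples = logseries(size=10, params={'p': 0.3})   # valid: p in (0, 1)
# logseries(size=10, params={'p': 1.5})           # would exit() with the ValueError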