def nl_estimate(data, chosen, numalts, nestinfo, availability, GPU=False,
                coeffrange=(-2.0, 2.0)):
    atype = 'numpy' if not GPU else 'cuda'

    data = np.transpose(data)
    chosen = np.transpose(chosen)
    if availability is not None:
        availability = np.transpose(availability)

    numvars = data.shape[0]
    numobs = data.shape[1] / numalts

    data, chosen = PMAT(data, atype), PMAT(chosen, atype)
    if availability is not None:
        availability = PMAT(availability, atype)

    beta = np.ones(nestinfo.numnests() + numvars)
    beta[:nestinfo.numnests()] = 4.0
    bounds = np.array(
        [coeffrange for i in range(nestinfo.numnests() + numvars)])
    bounds[:nestinfo.numnests()] = (1.0, 5.0)
    print "WARNING: setting bounds manually"

    t1 = time.time()
    args = (data, chosen, numalts, nestinfo, availability, GPU)
    bfgs_result = scipy.optimize.fmin_l_bfgs_b(nl_loglik,
                                               beta,
                                               args=args,
                                               approx_grad=True,
                                               bounds=bounds,
                                               epsilon=.001,
                                               pgtol=.01)
    # bfgs_result = scipy.optimize.fmin_bfgs(nl_loglik,
    #                                        beta,
    #                                        full_output=1,
    #                                        args=(data, chosen, numalts,
    #                                              nestinfo, GPU))
    print "Optimized in %f seconds" % (time.time() - t1)

    beta = bfgs_result[0]
    inv_hessian = 1.0 / \
        approximate_second_derivative(nl_loglik, beta, args=args)
    stderr = np.sqrt(inv_hessian)  # get_standard_error(inv_hessian)

    tscore = beta / stderr

    l_0beta = np.zeros(nestinfo.numnests() + numvars)
    l_0beta[:nestinfo.numnests()] = 1.0
    l_0 = -1 * nl_loglik(l_0beta, *args)
    l_1 = -1 * nl_loglik(beta, *args)

    ll_ratio = 1 - (l_1 / l_0)
    # print "Null Log-likelihood: %f" % l_0
    # print "Log-likelihood at convergence: %f" % l_1
    # print "Log-likelihood ratio: %f" % ll_ratio

    return (l_0, l_1, ll_ratio), zip(beta, stderr, tscore)

def mcfaddencorrectionvec(self, atype):
    if self._mcfaddencorrectionvec:
        return self._mcfaddencorrectionvec

    totaltspernest = self.totaltspernest()
    if not self.nestsizevaries():
        mcfaddencorrection = totaltspernest / float(self._samplepernest)
        mcfaddencorrectionvec = PMAT(
            np.reshape(np.repeat(mcfaddencorrection, self._samplepernest),
                       (-1, 1)),
            atype).log(inplace=True)
    else:
        # can choose between taking the mean nest size or the actual nest
        # size, which varies per choice in this case
        # mcfaddencorrection = np.mean(
        #     totaltspernest, axis=0) / float(self._samplepernest)
        mcfaddencorrection = np.repeat(
            totaltspernest / float(self._samplepernest),
            self._samplepernest)
        mcfaddencorrection = np.transpose(
            np.reshape(mcfaddencorrection,
                       (-1, self.samplepernest() * self.numnests())))
        mcfaddencorrectionvec = PMAT(mcfaddencorrection,
                                     atype).log(inplace=True)

    self._mcfaddencorrectionvec = mcfaddencorrectionvec
    return self._mcfaddencorrectionvec

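# Worked example (added for clarification; the numbers are assumed, not
# from the original code): with totaltspernest = 10 alternatives in a nest
# and samplepernest = 5 sampled, the correction above is
# log(10 / 5) ~= 0.693 for every sampled alternative in that nest. Adding
# it to a utility multiplies exp(utility) by 10 / 5 = 2, so the sum over
# the 5 sampled alternatives approximates the sum over the full nest of 10.
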
def nl_probs(data, beta, mu, numalts, nestinfo, availability, GPU=0):
    atype = 'numpy' if not GPU else 'cuda'
    nestsize = nestinfo.samplepernest()

    utilities = beta.multiply(data)
    utilities.reshape(numalts, utilities.size() / numalts)
    if DEBUG:
        print "beta", beta, "mu", mu

    rate_panel = nestinfo.ratepanel(atype)
    assert rate_panel.shape() == utilities.shape()

    muvec = PMAT(np.reshape(np.repeat(mu, nestsize), (-1, 1)), atype)
    exponentiated_utility = utilities.multiply_by_col(
        muvec, inplace=False).exp(inplace=True).element_multiply(
            rate_panel, inplace=True)
    if availability is not None:
        exponentiated_utility.element_multiply(availability, inplace=True)
    exponentiated_utility.reshape(nestsize, -1)
    sum_exponentiated_utility = exponentiated_utility.sum(axis=0).reshape(
        mu.size, -1)
    logGnest = sum_exponentiated_utility.log(inplace=True) \
        .multiply_by_col(PMAT(np.reshape(1.0 / mu - 1.0, (-1, 1)), atype))

    muvec = PMAT(np.reshape(np.repeat(mu - 1.0, nestsize), (-1, 1)), atype)
    logG = (utilities.multiply_by_col(muvec, inplace=False)
            .reshape(nestsize, -1)
            .add_row_vec(logGnest.reshape(1, -1), inplace=True)
            .reshape(numalts, -1))

    if not nestinfo.nestsizevaries():
        exponentiated_utility = \
            (utilities.element_add(logG, inplace=True)
             .add_col_vec(nestinfo.mcfaddencorrectionvec(atype),
                          inplace=True)
             .exp(inplace=True))
    else:
        exponentiated_utility = \
            (utilities.element_add(logG, inplace=True)
             .element_add(nestinfo.mcfaddencorrectionvec(atype),
                          inplace=True)
             .exp(inplace=True))
    if availability is not None:
        exponentiated_utility.element_multiply(availability, inplace=True)

    sum_exponentiated_utility = exponentiated_utility.sum(axis=0)
    probs = exponentiated_utility.divide_by_row(sum_exponentiated_utility,
                                                inplace=True)
    return probs

def mnl_estimate(data, chosen, numalts, GPU=0, coeffrange=(-3, 3),
                 weights=None, lcgrad=False, beta=None):
    atype = 'numpy' if not GPU else 'cuda'

    numvars = data.shape[1]
    numobs = data.shape[0] / numalts

    if chosen is None:
        chosen = np.ones((numobs, numalts))  # used for latent classes

    data = np.transpose(data)
    chosen = np.transpose(chosen)
    data, chosen = PMAT(data, atype), PMAT(chosen, atype)

    if weights is not None:
        weights = PMAT(np.transpose(weights), atype)

    if beta is None:
        beta = np.zeros(numvars)
    bounds = np.array([coeffrange for i in range(numvars)])

    args = (data, chosen, numalts, weights, lcgrad)
    bfgs_result = scipy.optimize.fmin_l_bfgs_b(mnl_loglik,
                                               beta,
                                               args=args,
                                               fprime=None,
                                               factr=1e5,
                                               approx_grad=False,
                                               bounds=bounds)
    beta = bfgs_result[0]
    stderr = mnl_loglik(beta, data, chosen, numalts, weights, stderr=1,
                        lcgrad=lcgrad)

    tscore = beta / stderr

    l_0beta = np.zeros(numvars)
    l_0 = -1 * mnl_loglik(l_0beta, *args)[0]
    l_1 = -1 * mnl_loglik(beta, *args)[0]

    ll_ratio = 1 - (l_1 / l_0)
    print "Null Log-likelihood: %f" % l_0
    print "Log-likelihood at convergence: %f" % l_1
    print "Log-likelihood ratio: %f" % ll_ratio

    return (l_0, l_1, ll_ratio), zip(beta, stderr, tscore)

def mnl_simulate(data, coeff, numalts, GPU=False, returnprobs=True):
    """
    Get the probabilities for each chooser choosing between `numalts`
    alternatives.

    Parameters
    ----------
    data : 2D array
        The data are expected to be in "long" form where each row is for
        one alternative. Alternatives are in groups of `numalts` rows per
        chooser. Alternatives must be in the same order for each chooser.
    coeff : 1D array
        The model coefficients corresponding to each column in `data`.
    numalts : int
        The number of alternatives available to each chooser.
    GPU : bool, optional
    returnprobs : bool, optional
        If True, return the probabilities for each chooser/alternative
        instead of actual choices.

    Returns
    -------
    probs or choices : 2D array
        If `returnprobs` is True the probabilities are a 2D array with a
        row for each chooser and columns for each alternative. Otherwise
        the chosen alternative index for each chooser is returned.

    """
    logger.debug(
        'start: MNL simulation with len(data)={} and numalts={}'.format(
            len(data), numalts))
    atype = 'numpy' if not GPU else 'cuda'

    data = np.transpose(data)
    coeff = np.reshape(np.array(coeff), (1, len(coeff)))

    data, coeff = PMAT(data, atype), PMAT(coeff, atype)

    probs = mnl_probs(data, coeff, numalts)

    if returnprobs:
        return np.transpose(probs.get_mat())

    # convert to cpu from here on - gpu doesn't currently support these ops
    if probs.typ == 'cuda':
        probs = PMAT(probs.get_mat())

    probs = probs.cumsum(axis=0)
    r = pmat.random(probs.size() / numalts)
    choices = probs.subtract(r, inplace=True).firstpositive(axis=0)

    logger.debug('finish: MNL simulation')
    return choices.get_mat()

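# A minimal usage sketch for mnl_simulate (added for illustration; the
# shapes and coefficient values are assumptions, not part of the original
# module). It shows the "long" data layout the docstring describes:
# numalts consecutive rows per chooser, one column per variable.
def _demo_mnl_simulate():
    numalts, numchoosers, numvars = 4, 3, 2
    data = np.random.random((numchoosers * numalts, numvars))
    coeff = np.array([0.5, -1.0])
    probs = mnl_simulate(data, coeff, numalts, returnprobs=True)
    # one row per chooser, one column per alternative; rows sum to one
    assert probs.shape == (numchoosers, numalts)
    assert np.allclose(probs.sum(axis=1), 1.0)
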
def ratepanel(self, atype):
    if self._ratepanel:
        return self._ratepanel

    numnests = self.numnests()
    nestsize = self.samplepernest()
    totaltspernest = self.totaltspernest()
    chosennest = self.chosennest()

    if totaltspernest.ndim == 1:
        rate_notchosen_panel = np.tile(totaltspernest / float(nestsize),
                                       (chosennest.size, 1))
        rate_chosen_panel = np.tile(
            (totaltspernest - 1) / float(nestsize - 1),
            (chosennest.size, 1))
    else:
        # in this case, the number of alternatives varies by decision
        rate_notchosen_panel = totaltspernest / float(nestsize)
        rate_chosen_panel = (totaltspernest - 1) / float(nestsize - 1)

    # in the next two lines, if there are fewer alts than the sample size,
    # they count fully - this is because of availability, which adds no
    # utility for alts that aren't available
    rate_notchosen_panel[np.where(rate_notchosen_panel < 1.0)] = 1.0
    rate_chosen_panel[np.where(rate_chosen_panel < 1.0)] = 1.0

    chosen = np.zeros((chosennest.size, numnests), dtype='bool')
    chosen[np.arange(chosennest.size), chosennest] = True

    rate_panel = rate_chosen_panel * chosen + \
        rate_notchosen_panel * np.invert(chosen)
    rate_panel = np.repeat(rate_panel, nestsize, axis=1)
    rate_panel[np.arange(chosennest.size), chosennest * nestsize] = 1

    self._ratepanel = PMAT(np.transpose(rate_panel), atype)
    return self._ratepanel

def nl_loglik(beta, data, chosen, numalts, nestinfo, availability, GPU=0,
              stderr=0):
    numvars = beta.size - nestinfo.numnests()
    numobs = data.size() / numvars / numalts

    mu, beta = beta[:nestinfo.numnests()], beta[nestinfo.numnests():]

    beta = np.reshape(beta, (1, beta.size))
    beta = PMAT(beta, data.typ)

    probs = nl_probs(data, beta, mu, numalts, nestinfo, availability, GPU)

    if stderr:
        # standard errors are not implemented for the nested logit
        assert 0
        # return get_standard_error(get_hessian(gradmat.get_mat()))

    loglik = probs.element_multiply(
        chosen, inplace=True).sum(axis=0).log(inplace=True).sum(axis=1)

    if loglik.typ == 'numpy':
        loglik = loglik.get_mat()
    else:
        loglik = loglik.get_mat()[0, 0]

    if DEBUG:
        print "loglik", loglik

    return -1 * loglik

def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
               stderr=0):
    logger.debug('start: calculate MNL log-likelihood')
    numvars = beta.size
    numobs = data.size() / numvars / numalts

    beta = np.reshape(beta, (1, beta.size))
    beta = PMAT(beta, data.typ)

    probs = mnl_probs(data, beta, numalts)

    # lcgrad is the special gradient for the latent class membership model
    if lcgrad:
        assert weights
        gradmat = weights.subtract(probs).reshape(probs.size(), 1)
        gradarr = data.multiply(gradmat)
    else:
        if not weights:
            gradmat = chosen.subtract(probs).reshape(probs.size(), 1)
        else:
            gradmat = chosen.subtract(probs).multiply_by_row(
                weights).reshape(probs.size(), 1)
        gradarr = data.multiply(gradmat)

    if stderr:
        gradmat = data.multiply_by_row(gradmat.reshape(1, gradmat.size()))
        gradmat.reshape(numvars, numalts * numobs)
        return get_standard_error(get_hessian(gradmat.get_mat()))

    chosen.reshape(numalts, numobs)
    if weights is not None:
        if probs.shape() == weights.shape():
            loglik = ((probs.log(inplace=True)
                       .element_multiply(weights, inplace=True)
                       .element_multiply(chosen, inplace=True))
                      .sum(axis=1).sum(axis=0))
        else:
            loglik = ((probs.log(inplace=True)
                       .multiply_by_row(weights, inplace=True)
                       .element_multiply(chosen, inplace=True))
                      .sum(axis=1).sum(axis=0))
    else:
        loglik = ((probs.log(inplace=True)
                   .element_multiply(chosen, inplace=True))
                  .sum(axis=1).sum(axis=0))

    if loglik.typ == 'numpy':
        loglik, gradarr = loglik.get_mat(), gradarr.get_mat().flatten()
    else:
        loglik = loglik.get_mat()[0, 0]
        gradarr = np.reshape(gradarr.get_mat(), (1, gradarr.size()))[0]

    logger.debug('finish: calculate MNL log-likelihood')
    return -1 * loglik, -1 * gradarr

def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False,
               stderr=0):
    numvars = beta.size
    numobs = data.size() / numvars / numalts

    beta = np.reshape(beta, (1, beta.size))
    beta = PMAT(beta, data.typ)

    probs = mnl_probs(data, beta, numalts)

    if lcgrad:
        assert weights
        gradmat = weights.subtract(probs).reshape(1, probs.size())
    else:
        gradmat = chosen.subtract(probs).reshape(1, probs.size())
    gradmat = data.multiply_by_row(gradmat)

    # this line is a bit hackish - you can't do the whole sum at once on a
    # gpu - need to shorten the length of the axis over which to sum
    gradarr = gradmat.reshape(numvars * numalts, numobs)
    if weights is not None and not lcgrad:
        gradarr = gradarr.element_multiply(weights, inplace=True)
    gradarr = gradarr.sum(axis=1).reshape(numvars, numalts).sum(axis=1)
    gradmat.reshape(numvars, numalts * numobs)

    if stderr:
        if not lcgrad:
            return get_standard_error(get_hessian(gradmat.get_mat()))
        else:
            return np.zeros(beta.size())

    chosen.reshape(numalts, numobs)
    if weights is not None:
        loglik = ((probs.log(inplace=True)
                   .element_multiply(weights, inplace=True)
                   .element_multiply(chosen, inplace=True))
                  .sum(axis=1).sum(axis=0))
    else:
        loglik = ((probs.log(inplace=True)
                   .element_multiply(chosen, inplace=True))
                  .sum(axis=1).sum(axis=0))

    if loglik.typ == 'numpy':
        loglik, gradarr = loglik.get_mat(), gradarr.get_mat()
    else:
        loglik = loglik.get_mat()[0, 0]
        gradarr = np.reshape(gradarr.get_mat(), (1, gradarr.size()))[0]

    return -1 * loglik, -1 * gradarr

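# A minimal sketch (added for illustration; the shapes and synthetic data
# are assumptions) of validating mnl_loglik's analytic gradient against
# finite differences with scipy.optimize.check_grad. The PMAT inputs are
# built the same way mnl_estimate builds them: data transposed to
# (numvars, numobs * numalts) and chosen transposed to (numalts, numobs).
def _check_mnl_gradient():
    numalts, numobs, numvars = 4, 50, 3
    data = PMAT(np.transpose(np.random.random((numobs * numalts, numvars))))
    chosen = np.zeros((numalts, numobs))
    chosen[np.random.randint(numalts, size=numobs), np.arange(numobs)] = 1
    chosen = PMAT(chosen)
    args = (data, chosen, numalts, None, False)
    # mnl_loglik returns (-loglik, -gradient); check_grad wants a scalar
    # objective and a 1D gradient
    err = scipy.optimize.check_grad(
        lambda b: float(mnl_loglik(b, *args)[0]),
        lambda b: np.asarray(mnl_loglik(b, *args)[1]).flatten(),
        np.zeros(numvars))
    print "gradient check error: %f" % err
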
def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-3, 3),
                 weights=None, lcgrad=False, beta=None):
    """
    Calculate coefficients of the MNL model.

    Parameters
    ----------
    data : 2D array
        The data are expected to be in "long" form where each row is for
        one alternative. Alternatives are in groups of `numalts` rows per
        chooser. Alternatives must be in the same order for each chooser.
    chosen : 2D array
        This boolean array has a row for each chooser and a column for each
        alternative. The column ordering for alternatives is expected to be
        the same as their row ordering in the `data` array. A one (True)
        indicates which alternative each chooser has chosen.
    numalts : int
        The number of alternatives.
    GPU : bool, optional
    coeffrange : tuple of floats, optional
        Limits of (min, max) to which coefficients are clipped.
    weights : ndarray, optional
    lcgrad : bool, optional
    beta : 1D array, optional
        Any initial guess for the coefficients.

    Returns
    -------
    log_likelihood : dict
        Dictionary of log-likelihood values describing the quality of
        the model fit.
    fit_parameters : pandas.DataFrame
        Table of fit parameters with columns 'Coefficient', 'Std. Error',
        'T-Score'. Each row corresponds to a column in `data` and they are
        given in the same order as in `data`.

    See Also
    --------
    scipy.optimize.fmin_l_bfgs_b : The optimization routine used.

    """
    logger.debug(
        'start: MNL fit with len(data)={} and numalts={}'.format(
            len(data), numalts))
    atype = 'numpy' if not GPU else 'cuda'

    numvars = data.shape[1]
    numobs = data.shape[0] / numalts

    if chosen is None:
        chosen = np.ones((numobs, numalts))  # used for latent classes

    data = np.transpose(data)
    chosen = np.transpose(chosen)
    data, chosen = PMAT(data, atype), PMAT(chosen, atype)

    if weights is not None:
        weights = PMAT(np.transpose(weights), atype)

    if beta is None:
        beta = np.zeros(numvars)
    bounds = [coeffrange] * numvars

    with log_start_finish('scipy optimization for MNL fit', logger):
        args = (data, chosen, numalts, weights, lcgrad)
        bfgs_result = scipy.optimize.fmin_l_bfgs_b(mnl_loglik,
                                                   beta,
                                                   args=args,
                                                   fprime=None,
                                                   factr=10,
                                                   approx_grad=False,
                                                   bounds=bounds)
    beta = bfgs_result[0]
    stderr = mnl_loglik(
        beta, data, chosen, numalts, weights, stderr=1, lcgrad=lcgrad)

    l0beta = np.zeros(numvars)
    l0 = -1 * mnl_loglik(l0beta, *args)[0]
    l1 = -1 * mnl_loglik(beta, *args)[0]

    log_likelihood = {
        'null': float(l0[0][0]),
        'convergence': float(l1[0][0]),
        'ratio': float((1 - (l1 / l0))[0][0])
    }

    fit_parameters = pd.DataFrame({
        'Coefficient': beta,
        'Std. Error': stderr,
        'T-Score': beta / stderr})

    logger.debug('finish: MNL fit')
    return log_likelihood, fit_parameters

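# A minimal usage sketch for mnl_estimate (added for illustration; the
# synthetic data below is an assumption, not part of the original module).
# `chosen` marks exactly one alternative per chooser, matching the layout
# the docstring describes.
def _demo_mnl_estimate():
    numalts, numobs, numvars = 4, 100, 2
    data = np.random.random((numobs * numalts, numvars))
    chosen = np.zeros((numobs, numalts))
    chosen[np.arange(numobs), np.random.randint(numalts, size=numobs)] = 1
    log_likelihood, fit_parameters = mnl_estimate(data, chosen, numalts)
    print "log-likelihood ratio: %f" % log_likelihood['ratio']
    print fit_parameters  # one row per column of data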