def csv2st(csvfile, headers=False, stubs=False, title=None):
    """Build a SimpleTable from a comma-separated-values file.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    headers : bool or sequence of str, optional
        ``True``: the first row of the file holds the column headers.
        ``False`` (default): the table has no headers.
        Any other value is handed to ``SimpleTable`` unchanged.
    stubs : bool or sequence of str, optional
        ``True``: the first cell of every data row is that row's stub.
        ``False`` (default): the table has no stubs.
        Any other value is handed to ``SimpleTable`` unchanged.
    title : str, optional
        Table title.  Forwarded to ``SimpleTable`` only when not None, so
        the table's own default title is kept otherwise.

    Returns
    -------
    SimpleTable
        Table built from the non-blank rows of the file.

    Raises
    ------
    IOError
        If the file holds no data rows, or if the data rows do not all
        have the same number of cells.
    """
    rows = []
    with open(csvfile, 'r') as fh:
        reader = csv.reader(fh)
        if headers is True:
            # An empty file has no header row either; that case is reported
            # below as "no data rows" instead of leaking StopIteration.
            headers = next(reader, ())
        elif headers is False:
            headers = ()
        if stubs is True:
            stubs = []
            for row in reader:
                if row:  # blank lines are skipped
                    stubs.append(row[0])
                    rows.append(row[1:])
        else:  # no stubs, or stubs supplied by the caller
            rows.extend(row for row in reader if row)
            if stubs is False:
                stubs = ()
    if not rows:
        raise IOError('CSV file contains no data rows.')
    ncols = len(rows[0])
    if any(len(row) != ncols for row in rows):
        raise IOError('All rows of CSV file must have same length.')
    extra = {} if title is None else {'title': title}
    return SimpleTable(data=rows, headers=headers, stubs=stubs, **extra)
def csv2st(csvfile, headers=False, stubs=False, title=None):
    """Build a SimpleTable from a comma-separated-values file.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    headers : bool or sequence of str, optional
        ``True``: the first row of the file holds the column headers.
        ``False`` (default): the table has no headers.
        Any other value is handed to ``SimpleTable`` unchanged.
    stubs : bool or sequence of str, optional
        ``True``: the first cell of every data row is that row's stub.
        ``False`` (default): the table has no stubs.
        Any other value is handed to ``SimpleTable`` unchanged.
    title : str, optional
        Table title.  Forwarded to ``SimpleTable`` only when not None, so
        the table's own default title is kept otherwise.

    Returns
    -------
    SimpleTable
        Table built from the non-blank rows of the file.

    Raises
    ------
    IOError
        If the file holds no data rows, or if the data rows do not all
        have the same number of cells.
    """
    rows = []
    with open(csvfile, 'r') as fh:
        reader = csv.reader(fh)
        if headers is True:
            # An empty file has no header row either; that case is reported
            # below as "no data rows" instead of leaking StopIteration.
            headers = next(reader, ())
        elif headers is False:
            headers = ()
        if stubs is True:
            stubs = []
            for row in reader:
                if row:  # blank lines are skipped
                    stubs.append(row[0])
                    rows.append(row[1:])
        else:  # no stubs, or stubs supplied by the caller
            rows.extend(row for row in reader if row)
            if stubs is False:
                stubs = ()
    if not rows:
        raise IOError('CSV file contains no data rows.')
    ncols = len(rows[0])
    if any(len(row) != ncols for row in rows):
        raise IOError('All rows of CSV file must have same length.')
    extra = {} if title is None else {'title': title}
    return SimpleTable(data=rows, headers=headers, stubs=stubs, **extra)
def cv_loo(self, bw, func):
    r"""
    Evaluate the leave-one-out least-squares cross-validation criterion.

    Parameters
    ----------
    bw : array_like
        Vector of bandwidth values.
    func : callable
        Estimator of g(x); either ``_est_loc_constant`` (local constant)
        or ``_est_loc_linear`` (local linear).  It is called with `bw` and
        the keywords ``endog``, ``exog``, ``data_predict`` and ``W``, and
        the first element of its result is used as the fitted value.

    Returns
    -------
    L : float
        The value of the CV function.

    Notes
    -----
    This is the objective minimized by compute_bw to find the optimal
    bandwidth.  For details see p.35 in [2]

    .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

    where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
    and :math:`h` is the vector of bandwidths.
    """
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    weight_folds = iter(LeaveOneOut(self.W_in))
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        # The three leave-one-out sequences are advanced in lockstep.
        endog_rest = next(endog_folds)
        weight_rest = next(weight_folds)
        fitted = func(
            bw,
            endog=endog_rest,
            exog=-exog_rest,
            data_predict=-self.exog[idx, :],
            W=weight_rest,
        )[0]
        residual = self.endog[idx] - fitted
        total += residual ** 2
        # Note: There might be a way to vectorize this. See p.72 in [1]
    return total / self.nobs
def _compute_test_stat(self, u):
    """
    Compute the test statistic from the vector `u`.

    For each observation i, kernel values between ``self.exog[i]`` and
    the remaining rows of ``self.exog`` are obtained from ``gpke`` (with
    ``tosum=False``) and multiplied by the cross products ``u[i] * u[j]``
    of the left-in entries of `u`.  The sum of these terms and the sum of
    their squares are then scaled and combined into the statistic.

    Parameters
    ----------
    u : array_like
        One-dimensional vector with one entry per row of ``self.exog``
        (presumably residuals -- confirm against the caller).

    Returns
    -------
    T : float
        The value of the test statistic.
    """
    n = np.shape(u)[0]
    exog_folds = LeaveOneOut(self.exog)
    u_folds = iter(LeaveOneOut(u[:, None]))
    stat_sum = 0
    sq_sum = 0
    for i, exog_rest in enumerate(exog_folds):
        u_rest = np.squeeze(next(u_folds))
        # See Bootstrapping procedure on p. 357 in [1]
        kern = gpke(
            self.bw,
            data=-exog_rest,
            data_predict=-self.exog[i, :],
            var_type=self.var_type,
            tosum=False,
        )
        terms = u[i] * u_rest * kern
        assert u_rest.shape == kern.shape
        stat_sum += terms.sum()  # See eq. 12.7 on p. 355 in [1]
        sq_sum += (terms ** 2).sum()  # See Theorem 12.1 on p.356 in [1]
    assert np.size(stat_sum) == 1
    assert np.size(sq_sum) == 1
    stat_sum *= 1. / (n * (n - 1))
    cont_idx = _get_type_pos(self.var_type)[0]
    bw_prod = self.bw[cont_idx].prod()
    sq_sum *= 2 * bw_prod / (n * (n - 1))
    return n * stat_sum * np.sqrt(bw_prod / sq_sum)
def _compute_test_stat(self, u):
    """
    Compute the test statistic from the vector `u`.

    For each observation i, kernel values between ``self.exog[i]`` and
    the remaining rows of ``self.exog`` are obtained from ``gpke`` (with
    ``tosum=False``) and multiplied by the cross products ``u[i] * u[j]``
    of the left-in entries of `u`.  The sum of these terms and the sum of
    their squares are then scaled and combined into the statistic.

    Parameters
    ----------
    u : array_like
        One-dimensional vector with one entry per row of ``self.exog``
        (presumably residuals -- confirm against the caller).

    Returns
    -------
    T : float
        The value of the test statistic.
    """
    n = np.shape(u)[0]
    exog_folds = LeaveOneOut(self.exog)
    u_folds = iter(LeaveOneOut(u[:, None]))
    stat_sum = 0
    sq_sum = 0
    for i, exog_rest in enumerate(exog_folds):
        u_rest = np.squeeze(next(u_folds))
        # See Bootstrapping procedure on p. 357 in [1]
        kern = gpke(
            self.bw,
            data=-exog_rest,
            data_predict=-self.exog[i, :],
            var_type=self.var_type,
            tosum=False,
        )
        terms = u[i] * u_rest * kern
        assert u_rest.shape == kern.shape
        stat_sum += terms.sum()  # See eq. 12.7 on p. 355 in [1]
        sq_sum += (terms ** 2).sum()  # See Theorem 12.1 on p.356 in [1]
    assert np.size(stat_sum) == 1
    assert np.size(sq_sum) == 1
    stat_sum *= 1. / (n * (n - 1))
    cont_idx = _get_type_pos(self.var_type)[0]
    bw_prod = self.bw[cont_idx].prod()
    sq_sum *= 2 * bw_prod / (n * (n - 1))
    return n * stat_sum * np.sqrt(bw_prod / sq_sum)
def cv_loo(self, bw, func):
    r"""
    Evaluate the leave-one-out least-squares cross-validation criterion.

    Parameters
    ----------
    bw : array_like
        Vector of bandwidth values.
    func : callable
        Estimator of g(x); either ``_est_loc_constant`` (local constant)
        or ``_est_loc_linear`` (local linear).  It is called with `bw` and
        the keywords ``endog``, ``exog``, ``data_predict`` and ``W``, and
        the first element of its result is used as the fitted value.

    Returns
    -------
    L : float
        The value of the CV function.

    Notes
    -----
    This is the objective minimized by compute_bw to find the optimal
    bandwidth.  For details see p.35 in [2]

    .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

    where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
    and :math:`h` is the vector of bandwidths.
    """
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    weight_folds = iter(LeaveOneOut(self.W_in))
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        # The three leave-one-out sequences are advanced in lockstep.
        endog_rest = next(endog_folds)
        weight_rest = next(weight_folds)
        fitted = func(
            bw,
            endog=endog_rest,
            exog=-exog_rest,
            data_predict=-self.exog[idx, :],
            W=weight_rest,
        )[0]
        residual = self.endog[idx] - fitted
        total += residual ** 2
        # Note: There might be a way to vectorize this. See p.72 in [1]
    return total / self.nobs
def insert_stubs(self, loc, stubs):
    """Insert a column of stubs at column `loc`.  Return None.

    Header rows receive an empty cell instead of a stub, so
    ``len(stubs)`` should equal the number of non-header rows.

    Parameters
    ----------
    loc : int
        Column position at which the new cells are inserted.
    stubs : iterable
        One stub per non-header row, in table order.

    Raises
    ------
    ValueError
        If `stubs` is exhausted before every non-header row got a stub.
    """
    _Cell = self._Cell
    stubs = iter(stubs)
    for row in self:
        if row.datatype == 'header':
            row.insert(loc, _Cell('', datatype='empty'))
            continue
        # Only the draw from `stubs` is guarded: an error raised inside
        # ``insert_stub`` must surface as-is rather than trigger a retry
        # that would consume a second stub and misalign the column.
        try:
            stub = next(stubs)
        except StopIteration:
            raise ValueError('length of stubs must match table length')
        row.insert_stub(loc, stub)
def insert_stubs(self, loc, stubs):
    """Insert a column of stubs at column `loc`.  Return None.

    Header rows receive an empty cell instead of a stub, so
    ``len(stubs)`` should equal the number of non-header rows.

    Parameters
    ----------
    loc : int
        Column position at which the new cells are inserted.
    stubs : iterable
        One stub per non-header row, in table order.

    Raises
    ------
    ValueError
        If `stubs` is exhausted before every non-header row got a stub.
    """
    _Cell = self._Cell
    stubs = iter(stubs)
    for row in self:
        if row.datatype == 'header':
            row.insert(loc, _Cell('', datatype='empty'))
            continue
        # Only the draw from `stubs` is guarded: an error raised inside
        # ``insert_stub`` must surface as-is rather than trigger a retry
        # that would consume a second stub and misalign the column.
        try:
            stub = next(stubs)
        except StopIteration:
            raise ValueError('length of stubs must match table length')
        row.insert_stub(loc, stub)
def cv_loo(self, params):
    """
    Leave-one-out cross-validation objective with a linear component.

    Works like the leave-one-out cross-validation estimator, modified to
    account for the linear terms.

    Parameters
    ----------
    params : array_like
        Vector consisting of the coefficients (b) and the bandwidths (bw).
        The first ``k_linear`` elements are the coefficients.

    Returns
    -------
    L : float
        The value of the objective function.  The accumulated squared
        errors are returned as a sum (no division by the sample size).

    References
    ----------
    See p.254 in [1]
    """
    params = np.asarray(params)
    n_linear = self.k_linear
    coefs = params[:n_linear]
    bw = params[n_linear:]
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    nonpar_folds = iter(LeaveOneOut(self.exog_nonparametric))
    linear_fit = np.dot(self.exog, coefs)[:, None]
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        endog_rest = next(endog_folds)
        nonpar_rest = next(nonpar_folds)
        # Remove the linear part from the left-in responses before smoothing.
        partial_resid = endog_rest - np.dot(exog_rest, coefs)[:, None]
        smooth = self.func(
            bw,
            endog=partial_resid,
            exog=-nonpar_rest,
            data_predict=-self.exog_nonparametric[idx, :],
        )[0]
        linear_term = linear_fit[idx, :]
        total += (self.endog[idx] - linear_term - smooth) ** 2
    return total
def _data2rows(self, raw_data):
    """Return list of Row, the raw data as rows of cells.

    Each item of `raw_data` is wrapped in ``self._Row`` (with
    ``datatype='data'``, ``table=self`` and ``celltype=self._Cell``).
    Every cell of the new row then gets its ``datatype`` by cycling
    through ``self._datatypes`` from the start, and a ``row``
    back-reference to the row that contains it.

    Parameters
    ----------
    raw_data : iterable
        The data rows; each item is handed to ``self._Row``.

    Returns
    -------
    list
        One row object per item of `raw_data`.
    """
    logging.debug('Enter SimpleTable.data2rows.')
    _Cell = self._Cell
    _Row = self._Row
    rows = []
    for datarow in raw_data:
        dtypes = cycle(self._datatypes)
        newrow = _Row(datarow, datatype='data', table=self, celltype=_Cell)
        for cell in newrow:
            # No NameError fallback: retrying would draw a second value
            # from `dtypes` and shift every later datatype in the row.
            cell.datatype = next(dtypes)
            cell.row = newrow  # a cell knows its row
        rows.append(newrow)
    logging.debug('Exit SimpleTable.data2rows.')
    return rows
def _data2rows(self, raw_data):
    """Return list of Row, the raw data as rows of cells.

    Each item of `raw_data` is wrapped in ``self._Row`` (with
    ``datatype='data'``, ``table=self`` and ``celltype=self._Cell``).
    Every cell of the new row then gets its ``datatype`` by cycling
    through ``self._datatypes`` from the start, and a ``row``
    back-reference to the row that contains it.

    Parameters
    ----------
    raw_data : iterable
        The data rows; each item is handed to ``self._Row``.

    Returns
    -------
    list
        One row object per item of `raw_data`.
    """
    logging.debug('Enter SimpleTable.data2rows.')
    _Cell = self._Cell
    _Row = self._Row
    rows = []
    for datarow in raw_data:
        dtypes = cycle(self._datatypes)
        newrow = _Row(datarow, datatype='data', table=self, celltype=_Cell)
        for cell in newrow:
            # No NameError fallback: retrying would draw a second value
            # from `dtypes` and shift every later datatype in the row.
            cell.datatype = next(dtypes)
            cell.row = newrow  # a cell knows its row
        rows.append(newrow)
    logging.debug('Exit SimpleTable.data2rows.')
    return rows
def cv_loo(self, params):
    """
    Leave-one-out cross-validation objective with a linear component.

    Works like the leave-one-out cross-validation estimator, modified to
    account for the linear terms.

    Parameters
    ----------
    params : array_like
        Vector consisting of the coefficients (b) and the bandwidths (bw).
        The first ``k_linear`` elements are the coefficients.

    Returns
    -------
    L : float
        The value of the objective function.  The accumulated squared
        errors are returned as a sum (no division by the sample size).

    References
    ----------
    See p.254 in [1]
    """
    params = np.asarray(params)
    n_linear = self.k_linear
    coefs = params[:n_linear]
    bw = params[n_linear:]
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    nonpar_folds = iter(LeaveOneOut(self.exog_nonparametric))
    linear_fit = np.dot(self.exog, coefs)[:, None]
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        endog_rest = next(endog_folds)
        nonpar_rest = next(nonpar_folds)
        # Remove the linear part from the left-in responses before smoothing.
        partial_resid = endog_rest - np.dot(exog_rest, coefs)[:, None]
        smooth = self.func(
            bw,
            endog=partial_resid,
            exog=-nonpar_rest,
            data_predict=-self.exog_nonparametric[idx, :],
        )[0]
        linear_term = linear_fit[idx, :]
        total += (self.endog[idx] - linear_term - smooth) ** 2
    return total
def _data2rows(self, raw_data):
    """Convert `raw_data` into a list of data rows made of cells.

    Each item of `raw_data` is wrapped in ``self._Row`` (with
    ``datatype='data'``, ``table=self`` and ``celltype=self._Cell``).
    Every cell of the new row then gets its ``datatype`` by cycling
    through ``self._datatypes`` from the start, and a ``row``
    back-reference to the row that contains it.
    """
    cell_cls = self._Cell
    row_cls = self._Row
    result = []
    for record in raw_data:
        kinds = cycle(self._datatypes)  # restart the cycle for every row
        built = row_cls(record, datatype='data', table=self, celltype=cell_cls)
        for item in built:
            item.datatype = next(kinds)
            item.row = built  # a cell knows its row
        result.append(built)
    return result
def _data2rows(self, raw_data):
    """Convert `raw_data` into a list of data rows made of cells.

    Each item of `raw_data` is wrapped in ``self._Row`` (with
    ``datatype='data'``, ``table=self`` and ``celltype=self._Cell``).
    Every cell of the new row then gets its ``datatype`` by cycling
    through ``self._datatypes`` from the start, and a ``row``
    back-reference to the row that contains it.
    """
    cell_cls = self._Cell
    row_cls = self._Row
    result = []
    for record in raw_data:
        kinds = cycle(self._datatypes)  # restart the cycle for every row
        built = row_cls(record, datatype='data', table=self, celltype=cell_cls)
        for item in built:
            item.datatype = next(kinds)
            item.row = built  # a cell knows its row
        result.append(built)
    return result
def _data2rows(self, raw_data):
    """Convert `raw_data` into a list of data rows made of cells.

    Each item of `raw_data` is wrapped in ``self._Row`` (with
    ``datatype='data'``, ``table=self`` and ``celltype=self._Cell``).
    Every cell of the new row then gets its ``datatype`` by cycling
    through ``self._datatypes`` from the start, and a ``row``
    back-reference to the row that contains it.  Entry and exit are
    reported through ``logging.debug``.
    """
    logging.debug('Enter SimpleTable.data2rows.')
    cell_cls = self._Cell
    row_cls = self._Row
    result = []
    for record in raw_data:
        kinds = cycle(self._datatypes)  # restart the cycle for every row
        built = row_cls(record, datatype='data', table=self, celltype=cell_cls)
        for item in built:
            item.datatype = next(kinds)
            item.row = built  # a cell knows its row
        result.append(built)
    logging.debug('Exit SimpleTable.data2rows.')
    return result
def cv_loo(self, params):
    """
    Leave-one-out cross-validation objective on a projected regressor.

    `params` is split into coefficients ``b`` (the first ``self.K``
    elements) and bandwidths ``bw`` (the remainder).  For every
    observation, ``self.func`` is evaluated with the left-in rows of
    ``self.exog`` projected onto ``b`` as the regressor and the projection
    of the left-out row as the prediction point; the squared difference
    between the left-out ``self.endog`` value and the first element of the
    result is accumulated.

    Parameters
    ----------
    params : array_like
        Coefficients followed by bandwidths.

    Returns
    -------
    L : float
        Accumulated squared error divided by ``self.nobs``.

    References
    ----------
    See p. 254 in Textbook
    """
    params = np.asarray(params)
    n_coef = self.K
    coefs = params[:n_coef]
    bw = params[n_coef:]
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        endog_rest = next(endog_folds)
        index_rest = np.dot(exog_rest, coefs)[:, None]
        index_here = np.dot(self.exog[idx:idx + 1, :], coefs)
        fitted = self.func(
            bw,
            endog=endog_rest,
            exog=-index_rest,
            data_predict=-index_here,
        )[0]
        total += (self.endog[idx] - fitted) ** 2
        # Note: There might be a way to vectorize this. See p.72 in [1]
    return total / self.nobs
def cv_loo(self, params):
    """
    Leave-one-out cross-validation objective on a projected regressor.

    `params` is split into coefficients ``b`` (the first ``self.K``
    elements) and bandwidths ``bw`` (the remainder).  For every
    observation, ``self.func`` is evaluated with the left-in rows of
    ``self.exog`` projected onto ``b`` as the regressor and the projection
    of the left-out row as the prediction point; the squared difference
    between the left-out ``self.endog`` value and the first element of the
    result is accumulated.

    Parameters
    ----------
    params : array_like
        Coefficients followed by bandwidths.

    Returns
    -------
    L : float
        Accumulated squared error divided by ``self.nobs``.

    References
    ----------
    See p. 254 in Textbook
    """
    params = np.asarray(params)
    n_coef = self.K
    coefs = params[:n_coef]
    bw = params[n_coef:]
    exog_folds = LeaveOneOut(self.exog)
    endog_folds = iter(LeaveOneOut(self.endog))
    total = 0
    for idx, exog_rest in enumerate(exog_folds):
        endog_rest = next(endog_folds)
        index_rest = np.dot(exog_rest, coefs)[:, None]
        index_here = np.dot(self.exog[idx:idx + 1, :], coefs)
        fitted = self.func(
            bw,
            endog=endog_rest,
            exog=-index_rest,
            data_predict=-index_here,
        )[0]
        total += (self.endog[idx] - fitted) ** 2
        # Note: There might be a way to vectorize this. See p.72 in [1]
    return total / self.nobs
def loo_likelihood(self, bw, func=lambda x: x):
    """
    Returns the leave-one-out conditional likelihood of the data.

    If `func` is not equal to the default, what's calculated is a function
    of the leave-one-out conditional likelihood.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s).
    func : callable, optional
        Function to transform the likelihood values (before summing); for
        the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

    Returns
    -------
    L : float
        The negative of the summed (transformed) leave-one-out
        conditional likelihood values.

    Notes
    -----
    Similar to ``KDE.loo_likelihood``, but substitute
    ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
    """
    joint_folds = LeaveOneOut(self.data)
    exog_folds = iter(LeaveOneOut(self.exog))
    total = 0
    for i, joint_rest in enumerate(joint_folds):
        exog_rest = next(exog_folds)
        # Joint density f(x, y) at observation i from the left-in rows.
        joint_density = gpke(
            bw,
            data=-joint_rest,
            data_predict=-self.data[i, :],
            var_type=(self.dep_type + self.indep_type),
        )
        # Marginal density f(x); uses the bandwidths after the first k_dep.
        marginal_density = gpke(
            bw[self.k_dep:],
            data=-exog_rest,
            data_predict=-self.exog[i, :],
            var_type=self.indep_type,
        )
        total += func(joint_density / marginal_density)
    return -total
def loo_likelihood(self, bw, func=lambda x: x):
    """
    Returns the leave-one-out conditional likelihood of the data.

    If `func` is not equal to the default, what's calculated is a function
    of the leave-one-out conditional likelihood.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s).
    func : callable, optional
        Function to transform the likelihood values (before summing); for
        the log likelihood, use ``func=np.log``.  Default is ``f(x) = x``.

    Returns
    -------
    L : float
        The negative of the summed (transformed) leave-one-out
        conditional likelihood values.

    Notes
    -----
    Similar to ``KDE.loo_likelihood``, but substitute
    ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
    """
    joint_folds = LeaveOneOut(self.data)
    exog_folds = iter(LeaveOneOut(self.exog))
    total = 0
    for i, joint_rest in enumerate(joint_folds):
        exog_rest = next(exog_folds)
        # Joint density f(x, y) at observation i from the left-in rows.
        joint_density = gpke(
            bw,
            data=-joint_rest,
            data_predict=-self.data[i, :],
            var_type=(self.dep_type + self.indep_type),
        )
        # Marginal density f(x); uses the bandwidths after the first k_dep.
        marginal_density = gpke(
            bw[self.k_dep:],
            data=-exog_rest,
            data_predict=-self.exog[i, :],
            var_type=self.indep_type,
        )
        total += func(joint_density / marginal_density)
    return -total