def test_implement(self):
    """Write each fitted model out as a Python module and check that the
    generated ``model`` function reproduces the in-memory fitted values."""
    # NOTE: machine-specific absolute path; the imports below expect the file
    # to be reachable as the module ``temp.modeltest``.
    target = '/home/will/PycharmProjects/modeling_tools/temp/modeltest.py'

    # --- normal family ('x 6' exercises a variable name with an internal space)
    normal_fit = glm(
        'ycts~c(yr,2000)+h(x1,(0,.1,.3),0) + l(x2,(0.1,.5,.6)) + x3 + x4 + x5 + x 6',
        self.df, 'normal')
    normal_fit.implement(target)
    import temp.modeltest as generated
    from temp.modeltest import model
    compare = pd.DataFrame(model(self.df))
    compare.columns = ['file']
    compare['internal'] = normal_fit.fitted
    worst = abs(compare['internal'] - compare['file']).max()
    self.assertAlmostEqual(worst, 0, 5, 'glm: Implementation')

    # --- binomial family ('x 6' exercises a variable name with an internal space)
    binomial_fit = glm(
        'y~c(yr,2000)+h(x1,(0,.1,.3),0) + x2 + x3 + x4 + x5 + x 6',
        self.df, 'binomial')
    binomial_fit.implement(target)
    # The file was overwritten, so the already-imported module must be reloaded
    # before ``model`` is pulled from it again.
    importlib.reload(generated)
    from temp.modeltest import model
    compare = pd.DataFrame(model(self.df))
    compare.columns = ['file']
    compare['internal'] = binomial_fit.fitted
    worst = abs(compare['internal'] - compare['file']).max()
    self.assertAlmostEqual(worst, 0, 5, 'glm: Implementation')
def test_tTable(self):
    """Check t-statistics for normal and logistic regression vs statsmodels"""
    # Normal family: compare against statsmodels' GLM t-values.
    normal_spec = 'ycts~x1 + x2 + x3 + x4 + x5'
    ours = glm(normal_spec, self.df, 'normal')
    reference = smf.glm(formula=normal_spec, data=self.df).fit()
    gap = np.asarray(reference.tvalues) - np.asarray(ours.t_table.t)
    self.assertAlmostEqual(
        np.abs(gap).max(), 0, 5, 'glm: t-statistic calculation')

    # Binomial family: compare against statsmodels' logit t-values.
    logit_spec = 'y~x1 + x2 + x3 + x4 + x5'
    ours = glm(logit_spec, self.df, 'binomial')
    reference = smf.logit(formula=logit_spec, data=self.df).fit()
    gap = np.asarray(reference.tvalues) - np.asarray(ours.t_table.t)
    # not as strict since totally different methods only asymptotically the same
    self.assertAlmostEqual(
        np.abs(gap).max(), 0.0, 3, 'glm: t-statistic calculation')
def test_totalSumOfSquares(self):
    """Check total_sum_of_squares against a direct computation around the mean of ycts"""
    # 'x 6' exercises a variable name with an internal space
    fit = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6', self.df, 'normal')
    centered = self.df.ycts - self.df.ycts.mean()
    expected = np.multiply(centered, centered).sum()
    self.assertAlmostEqual(
        abs(expected - fit.total_sum_of_squares), 0, 5,
        "glm: TotalSumOfSquares")
def test_fStatistic(self):
    """Check model_significance[0]: recomputed F for the normal family, and a
    chi-square built from the statsmodels summary for the binomial family."""
    # Normal family ('x 6' exercises a variable name with an internal space).
    fit = glm(
        'ycts~c(yr,2000)+h(x1,(0,.1,.3),0) + x2 + x3 + x4 + x5 + x 6',
        self.df, 'normal')
    explained = fit.total_sum_of_squares - fit.sse
    f_stat = (explained / (fit.p - 1)) / fit.mse
    self.assertAlmostEqual(
        abs(f_stat - fit.model_significance[0]), 0, 5,
        'glm: model_significance')

    # Binomial family. The statsmodels formula omits 'x 6', so only a relative
    # tolerance is applied below.
    fit = glm('y~x1 + x2 + x3 + x4 + x5 + x 6', self.df, 'binomial')
    reference = smf.logit(
        formula='y~x1 + x2 + x3 + x4 + x5', data=self.df).fit()
    header = reference.summary().tables[0]
    # presumably the log-likelihood and null log-likelihood cells of the
    # summary header -- TODO confirm against the statsmodels layout
    first_cell = header[4][3]
    second_cell = header[5][3]
    expected_chisq = 2 * (float(first_cell.data) - float(second_cell.data))
    actual_chisq = fit.model_significance[0]
    relative_gap = abs(expected_chisq - actual_chisq) / expected_chisq
    self.assertAlmostEqual(relative_gap, 0, delta=0.02)
def test_ks(self):
    """Check that the ``ks`` attribute of a binomial fit matches ks_calculate
    applied to the model's own fitted values and response."""
    from modeling.functions import ks_calculate
    g = glm('y~x1 + x2 + x3 + x4 + x5', self.df, 'binomial')
    # same function but KS_calculate has been tested
    ks1 = ks_calculate(g.fitted, g.y, plot=True)
    chk = abs(ks1 - g.ks)
    # The tolerance must be passed as ``delta``: the third positional argument
    # of assertAlmostEqual is ``places`` (an integer number of decimals), and a
    # float there raises TypeError whenever the two values are not exactly equal.
    self.assertAlmostEqual(chk, 0, delta=0.00001, msg="glm: KS")
def test_sse(self):
    """Check sum of squared error vs statsmodels (doesn't apply to logistic regression)"""
    spec = 'ycts~x1 + x2 + x3 + x4 + x5'
    ours = glm(spec, self.df, 'normal')
    reference = smf.glm(formula=spec, data=self.df).fit()
    # Residuals of the statsmodels fit on the training data.
    errors = reference.predict() - np.asarray(self.df.ycts)
    reference_sse = np.multiply(errors, errors).sum()
    self.assertAlmostEqual(
        abs(reference_sse - ours.sse), 0, 5, 'glm: sse calculation')
def test_rSquare(self):
    """Check R2 calculation by using pandas correlation between y and yhat"""
    # 'x 6' exercises a variable name with an internal space
    g = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6', self.df, 'normal')
    # Fitted values are appended first, so they are the 'yhat' column and the
    # response is 'y' (the labels were previously swapped; harmless only
    # because correlation is symmetric).
    df_test = pd.DataFrame(np.append(g.fitted, g.y, axis=1),
                           columns=['yhat', 'y'])
    corr = df_test.corr()
    # Label-based lookup of the off-diagonal entry, instead of an integer key
    # on a string-indexed Series (deprecated positional fallback).
    rho = corr.loc['y', 'yhat']
    chk = abs(rho * rho - g.r_square)
    self.assertAlmostEqual(chk, 0, 5, 'glm: Rsquare')
def test_predict(self):
    """Check predicted values for logistic and normal regression vs statsmodels"""

    def worst_gap(ours, reference):
        # Largest absolute difference between the two models' predictions on
        # the hold-out frame self.dfnew.
        expected = reference.predict(self.dfnew)
        actual = ours.predict(self.dfnew)
        return np.abs(np.squeeze(np.asarray(actual)) - expected).max()

    logit_spec = 'y~x1 + x2 + x3 + x4 + x5'
    logit_gap = worst_gap(
        glm(logit_spec, self.df, 'binomial'),
        smf.logit(formula=logit_spec, data=self.df).fit())

    normal_spec = 'ycts~x1 + x2 + x3 + x4 + x5'
    normal_gap = worst_gap(
        glm(normal_spec, self.df, 'normal'),
        smf.glm(formula=normal_spec, data=self.df).fit())

    # Both gaps are non-negative, so their sum is ~0 only if each one is.
    self.assertAlmostEqual(
        logit_gap + normal_gap, 0, 5, 'glm: predicted values')
def test_parameters(self):
    """Check parameters: the mean-only model vs the sample mean, then normal
    and logistic regression vs statsmodels"""
    # Mean-only model: the single parameter should equal the mean of ycts.
    intercept_only = glm('ycts~', self.df, 'normal')
    gap = float(abs(intercept_only.parameters[0] - self.df.ycts.mean()))
    self.assertAlmostEqual(gap, 0, 5, 'glm: mean-only model does not work')

    # Normal family vs statsmodels GLM.
    ours = glm('ycts~x1 + x2 + x3 + x4 + x5 ', self.df, 'normal')
    reference = smf.glm(
        formula='ycts~x1 + x2 + x3 + x4 + x5', data=self.df).fit()
    gap = np.abs(
        np.asarray(reference.params) - np.squeeze(ours.parameters)).max()
    self.assertAlmostEqual(gap, 0, 5, 'glm: parameters calculation')

    # Binomial family vs statsmodels logit.
    ours = glm('y~x1 + x2 + x3 + x4 + x5', self.df, 'binomial')
    reference = smf.logit(
        formula='y~x1 + x2 + x3 + x4 + x5', data=self.df).fit()
    gap = np.abs(
        np.asarray(reference.params) - np.squeeze(ours.parameters)).max()
    # NOTE: the summary is built but never inspected by this test.
    unused_summary = reference.summary()
    self.assertAlmostEqual(gap, 0, 5, 'glm: parameters calculation')
def test_restrictions(self):
    """
    Tests that linear restrictions on coefficients are honored, including
    restrictions that involve levels of a categorical variable
    """
    # Normal family: x1 pinned to 1 and three year levels forced equal.
    # ('x 6' exercises a variable name with an internal space)
    constraints = ['x1=1', 'yr:2001-yr:2002=0', 'yr:2002-yr:2003=0']
    fit = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6+c(yr,2000)', self.df,
              'normal', restrictions=constraints)
    beta = fit.t_table.beta
    violation = (abs(beta['yr:2001'] - beta['yr:2002'])
                 + abs(beta['yr:2001'] - beta['yr:2003'])
                 + abs(beta['x1'] - 1))
    self.assertAlmostEqual(violation, 0, 5, 'glm: restrictions')

    # Binomial family: same idea with non-unit multipliers.
    # ('x 6' exercises a variable name with an internal space)
    constraints = ['x1=1', '2*yr:2001-yr:2002=0', 'yr:2002-0.5*yr:2003=0']
    fit = glm('y~x1 + x2 + x3 + x4 + x5 + x 6+c(yr,2000)', self.df,
              'binomial', restrictions=constraints)
    beta = fit.t_table.beta
    violation = (abs(2 * beta['yr:2001'] - beta['yr:2002'])
                 + abs(beta['yr:2002'] - 0.5 * beta['yr:2003'])
                 + abs(beta['x1'] - 1))
    self.assertAlmostEqual(violation, 0, 5, 'glm: restrictions')
def test_smooth(self):
    """Check that smooth_linear_splines tracks a glm linear-spline fit on
    simulated data whose mean is a ramp followed by a flat segment."""
    from modeling.glm import glm

    def true_mean(x):
        # very not linear: 1 + 3x up to 0.5, constant 2.5 beyond it
        ramp = 1 + 3 * x
        ramp[x > 0.5] = 2.5
        return ramp

    def basis(values):
        # Linear-spline basis on a fixed grid of knots.
        return np.matrix(
            linear_splines_basis2(values, np.arange(.05, .95, .05)))

    n = 30000
    df = pd.DataFrame(np.random.uniform(0, 1, (n, 6)),
                      columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x 6'])
    y = true_mean(df.x1) + np.random.normal(0, .5, n)
    df['y'] = y

    train_basis = basis(df.x1)
    unit_weights = np.ones(y.shape[0])
    grid = pd.DataFrame(np.arange(0.01, .99, .01), columns=['x'])
    grid_basis = basis(grid.x)
    sm = smooth_linear_splines(train_basis, y, grid_basis, 15,
                               wts=unit_weights)

    # Computed but not asserted on: worst gap between the data and the smooth.
    worst_vs_data = np.abs(y - np.squeeze(np.array(sm['yhat']))).max()

    spline_fit = glm('y~h(x1,(0,.5,1),0)', df, 'normal')
    glm_fitted = np.squeeze(np.array(spline_fit.fitted))

    # Computed but not asserted on: worst gap between the smooth and the glm fit.
    worst_vs_glm = np.abs(
        np.squeeze(np.array(sm['yhat'])) - glm_fitted).max()

    # The actual check: the two sets of fitted values must be highly correlated.
    corr = np.corrcoef(glm_fitted, np.squeeze(np.array(sm['yhat'])))
    ok = (corr[0, 1] > 0.95)
    self.assertEqual(
        ok, True, 'smooth_linear_splines: difference exceeds tolerance')
def test_decile_plot(self):
    """Smoke-test decile_plot on simulated logistic data, once with pandas
    Series inputs and once with the model's own fitted/response attributes."""
    n = 30000
    df = pd.DataFrame(np.random.uniform(0, 1, (n, 6)),
                      columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x 6'])
    df['yr'] = np.random.randint(2000, 2017, n)

    # Linear predictor and the implied success probability.
    y = -1 + 2 * df['x1'] - 3 * df['x2'] + 1.5 * df['x3'] - .1 * df[
        'x4'] + 7.7 * df['x5']
    odds = np.exp(y)
    p = odds / (1 + odds)
    df['p'] = p
    df['p'].describe()

    # Numeric 0/1 outcome drawn with probability p.
    zeros = np.zeros(n)
    ones = np.ones(n)
    u = np.random.uniform(0, 1, n)
    df['y'] = np.where(u < p, ones, zeros)

    # Character 'n'/'y' version of the same outcome.
    no_flags = np.chararray(n)
    no_flags.fill('n')
    yes_flags = np.chararray(n)
    yes_flags.fill('y')
    df['yc'] = np.where(u < p, yes_flags, no_flags)

    # Continuous response: the linear predictor plus unit-variance noise.
    df['ycts'] = y + np.random.normal(0, 1, n)

    g = glm('y~x1+x2+x3+x4+x5', df, 'binomial')
    labels = ('The Score', 'The Actual', 'Test Title')
    options = dict(subtitle='Test Subtitle', correlation=0.01)

    score = pd.Series(np.squeeze(np.asarray(g.fitted)))
    binvar = pd.Series(np.squeeze(np.asarray(g.y)))
    decile_plot(score, binvar, *labels, **options)
    decile_plot(g.fitted, g.y, *labels, **options)
def incr_build(model: str, target_var: str, start_list: list, add_list: list,
               get_data_fn, sample_size: int,
               client: clickhouse_driver.Client,
               global_valid_df_in: pd.DataFrame, family='normal'):
    """
    Build a sequence of GLM models on a growing set of periods and score each
    one out-of-sample.

    The initial model is built on the data for start_list and evaluated on the
    data for the first element of add_list. At each following step the period
    just evaluated is added to the build periods, the model is refit, and the
    evaluation moves to the next element of add_list.

    get_data_fn is called as get_data_fn(periods, sample_size, client) and must
    return a pandas DataFrame subset to those periods.

    NOTE(review): the earlier docstring described this as "the GLM counterpart
    to incr_build", which is this function's own name -- presumably a sibling
    builder for another model type was meant; confirm.

    :param model: model specification for glm
    :param target_var: response variable we're modeling
    :param start_list: list of (general) time periods for the first model
                       build; it is copied, not modified
    :param add_list: list of out-of-time periods to evaluate
    :param get_data_fn: function to get a pandas DataFrame of data to work on
    :param sample_size: size of pandas DataFrames to get
    :param client: db connector
    :param global_valid_df_in: pandas DataFrame covering all the values of
                               add_list for validation; must have a 'vintage'
                               column that is compared against each element of
                               add_list. It is copied, not modified.
    :param family: family of the model ('normal' or 'binomial')
    :return: tuple of
             segs             the out-of-sample values, in evaluation order
             rmse_valid       root mean squared error for each value
             corr_valid       genu.r_square of prediction vs target for each value
             global_valid_df  copy of global_valid_df_in with an added
                              'model_glm_inc' column holding, for each row, the
                              prediction of the model that treated that row's
                              vintage as out-of-sample (0.0 where no vintage
                              matched)
    """
    # Work on a copy: extending the build periods must not grow the caller's
    # start_list as a side effect.
    build_list = list(start_list)
    global_valid_df = global_valid_df_in.copy()
    global_valid_df['model_glm_inc'] = np.full((global_valid_df.shape[0]), 0.0)
    rmse_valid = []
    corr_valid = []
    segs = []
    for valid in add_list:
        segs.append(valid)
        model_df = get_data_fn(build_list, sample_size, client)
        valid_df = get_data_fn([valid], sample_size, client)
        print(
            'Data sizes for out-of-sample value {0}: build {1}, validate {2}'.
            format(valid, model_df.shape[0], valid_df.shape[0]))
        glm_model = glm(model, model_df, family=family)
        # The period just used for validation joins the build set for the
        # next iteration.
        build_list.append(valid)

        # Score the whole validation frame, but keep only the rows that belong
        # to the current out-of-sample period.
        gyh = glm_model.predict(global_valid_df)
        i = global_valid_df['vintage'] == valid
        global_valid_df.loc[i, 'model_glm_inc'] = gyh[i]

        # Per-period fit statistics on the freshly pulled validation sample.
        yh = glm_model.predict(valid_df)
        res = valid_df[target_var] - np.array(yh).flatten()
        rmse_valid.append(math.sqrt(np.square(res).mean()))
        valid_df['yh'] = yh
        corr_valid.append(genu.r_square(valid_df['yh'], valid_df[target_var]))
    return segs, rmse_valid, corr_valid, global_valid_df