Ejemplo n.º 1
0
    def test_implement(self):
        """Check that the module produced via ``implement`` matches the fit.

        A normal and then a binomial model are fitted; for each one
        ``implement`` is called with a file path, ``temp.modeltest`` is
        imported (reloaded the second time) and its ``model`` function is
        applied to ``self.df``.  The largest absolute gap between that output
        and the model's ``fitted`` values must round to zero at 5 decimals.
        """
        # NOTE(review): machine-specific absolute path -- presumably the test
        # only works where this directory exists and ``temp`` is importable.
        export_path = '/home/will/PycharmProjects/modeling_tools/temp/modeltest.py'

        def worst_gap(fit, scorer):
            # Put the scorer's output next to the in-memory fitted values.
            frame = pd.DataFrame(scorer(self.df))
            frame.columns = ['file']
            frame['internal'] = fit.fitted
            return abs(frame.internal - frame.file).max()

        # test internal space in variable name
        fit = glm(
            'ycts~c(yr,2000)+h(x1,(0,.1,.3),0) + l(x2,(0.1,.5,.6)) + x3 + x4 + x5 + x 6',
            self.df, 'normal')
        fit.implement(export_path)
        import temp.modeltest as m
        from temp.modeltest import model
        self.assertAlmostEqual(worst_gap(fit, model), 0, 5,
                               'glm: Implementation')

        # test internal space in variable name
        fit = glm('y~c(yr,2000)+h(x1,(0,.1,.3),0) + x2 + x3 + x4 + x5 + x 6',
                  self.df, 'binomial')
        fit.implement(export_path)
        # The module object is already cached, so it has to be reloaded to
        # pick up the new contents before ``model`` is re-imported.
        importlib.reload(m)
        from temp.modeltest import model
        self.assertAlmostEqual(worst_gap(fit, model), 0, 5,
                               'glm: Implementation')
Ejemplo n.º 2
0
    def test_tTable(self):
        """Compare t-statistics with statsmodels for normal and binomial fits.

        For each family the largest absolute difference between the
        statsmodels ``tvalues`` and ``t_table.t`` must round to zero at the
        listed number of decimal places.
        """
        cases = (
            ('ycts~x1 + x2 + x3 + x4 + x5', 'normal', smf.glm, 5),
            # not as strict since totally different methods only
            # asymptotically the same
            ('y~x1 + x2 + x3 + x4 + x5', 'binomial', smf.logit, 3),
        )
        for formula, family, reference, places in cases:
            ours = glm(formula, self.df, family)
            theirs = reference(formula=formula, data=self.df).fit()
            gap = np.asarray(theirs.tvalues) - np.asarray(ours.t_table.t)
            self.assertAlmostEqual(np.abs(gap).max(), 0, places,
                                   'glm: t-statistic calculation')
Ejemplo n.º 3
0
 def test_totalSumOfSquares(self):
     """Check ``total_sum_of_squares`` against a direct computation.

     The reference value is the sum of squared deviations of ``ycts`` from
     its mean; the absolute difference must round to zero at 5 decimals.
     """
     # test internal space in variable name
     fit = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6', self.df, 'normal')
     centered = self.df.ycts - self.df.ycts.mean()
     expected = np.multiply(centered, centered).sum()
     self.assertAlmostEqual(abs(expected - fit.total_sum_of_squares), 0, 5,
                            "glm: TotalSumOfSquares")
Ejemplo n.º 4
0
    def test_fStatistic(self):
        """Check ``model_significance[0]`` for a normal and a binomial model.

        Normal: the statistic is recomputed from the model's own attributes
        as ``((total_sum_of_squares - sse) / (p - 1)) / mse``.
        Binomial: it is compared with twice the difference of two cells read
        from the first table of the statsmodels logit summary, and must agree
        within a relative error of 0.02.
        """
        # test internal space in variable name
        fit = glm('ycts~c(yr,2000)+h(x1,(0,.1,.3),0) + x2 + x3 + x4 + x5 + x 6',
                  self.df, 'normal')
        explained = fit.total_sum_of_squares - fit.sse
        expected_f = (explained / (fit.p - 1)) / fit.mse
        self.assertAlmostEqual(abs(expected_f - fit.model_significance[0]), 0,
                               5, 'glm: model_significance')

        fit = glm('y~x1 + x2 + x3 + x4 + x5 + x 6', self.df, 'binomial')
        # NOTE(review): the statsmodels reference leaves out 'x 6', so the two
        # models are not identical -- presumably why the tolerance is loose.
        reference = smf.logit(formula='y~x1 + x2 + x3 + x4 + x5',
                              data=self.df).fit()
        header = reference.summary().tables[0]
        # presumably these cells hold the fitted and null log-likelihoods --
        # confirm against the statsmodels summary layout
        upper = float(header[4][3].data)
        lower = float(header[5][3].data)
        reference_chisq = 2 * (upper - lower)
        relative_gap = (abs(reference_chisq - fit.model_significance[0]) /
                        reference_chisq)
        self.assertAlmostEqual(relative_gap, 0, delta=0.02)
Ejemplo n.º 5
0
 def test_ks(self):
     """Check that the model's ``ks`` attribute agrees with ``ks_calculate``.

     A binomial model is fitted on ``self.df`` and ``ks_calculate`` is run on
     its fitted values and response; the two KS values must agree to within
     an absolute tolerance of 0.00001.
     """
     from modeling.functions import ks_calculate
     g = glm('y~x1 + x2 + x3 + x4 + x5', self.df, 'binomial')
     # same function but KS_calculate has been tested
     ks1 = ks_calculate(g.fitted, g.y, plot=True)
     chk = abs(ks1 - g.ks)
     # The tolerance has to be passed as ``delta``: the third positional
     # argument of assertAlmostEqual is ``places`` (an int number of
     # decimals), and a float there makes round() raise TypeError whenever
     # the difference is not exactly zero.
     self.assertAlmostEqual(chk, 0, delta=0.00001, msg="glm: KS")
Ejemplo n.º 6
0
 def test_sse(self):
     """Check the sum of squared errors against statsmodels.

     Only the normal family is covered (doesn't apply to logistic
     regression).  The reference is the sum of squared differences between
     the statsmodels predictions and ``ycts``.
     """
     formula = 'ycts~x1 + x2 + x3 + x4 + x5'
     fit = glm(formula, self.df, 'normal')
     reference = smf.glm(formula=formula, data=self.df).fit()
     errors = reference.predict() - np.asarray(self.df.ycts)
     reference_sse = np.multiply(errors, errors).sum()
     self.assertAlmostEqual(abs(reference_sse - fit.sse), 0, 5,
                            'glm: sse calculation')
Ejemplo n.º 7
0
 def test_rSquare(self):
     """Check ``r_square`` against the squared correlation of y and yhat.

     The fitted values and the response are placed side by side in a
     DataFrame, their pandas correlation is squared, and the result must
     match ``r_square`` to 5 decimal places.
     """
     g = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6', self.df,
             'normal')  # test internal space in variable name
     # Labels follow the stacking order (fitted first, response second).
     # assumes g.fitted and g.y are both 2-D column arrays -- TODO confirm
     pairs = pd.DataFrame(np.append(g.fitted, g.y, axis=1),
                          columns=['yhat', 'y'])
     # Label-based lookup: plain [0] on a label-indexed Series relies on the
     # deprecated positional fallback of Series.__getitem__.
     r = pairs.corr().loc['y', 'yhat']
     chk = abs(r * r - g.r_square)
     self.assertAlmostEqual(chk, 0, 5, 'glm: Rsquare')
Ejemplo n.º 8
0
    def test_predict(self):
        """Compare predictions on ``self.dfnew`` with statsmodels.

        A binomial and a normal model are each fitted on ``self.df`` with
        both libraries; the two largest absolute prediction gaps are summed
        and the total must round to zero at 5 decimal places.
        """
        cases = (
            ('y~x1 + x2 + x3 + x4 + x5', 'binomial', smf.logit),
            ('ycts~x1 + x2 + x3 + x4 + x5', 'normal', smf.glm),
        )
        worst = []
        for formula, family, reference in cases:
            ours = glm(formula, self.df, family)
            theirs = reference(formula=formula, data=self.df).fit()
            expected = theirs.predict(self.dfnew)
            actual = np.squeeze(np.asarray(ours.predict(self.dfnew)))
            worst.append(np.abs(actual - expected).max())

        self.assertAlmostEqual(worst[0] + worst[1], 0, 5,
                               'glm: predicted values')
Ejemplo n.º 9
0
    def test_parameters(self):
        """Compare estimated coefficients with known references.

        Three checks, each to 5 decimal places: a model with an empty
        right-hand side against the mean of ``ycts``, then a normal and a
        binomial model against the statsmodels ``params``.
        """
        # Empty right-hand side: the first parameter is compared to the mean.
        fit = glm('ycts~', self.df, 'normal')
        mean_gap = float(abs(fit.parameters[0] - self.df.ycts.mean()))
        self.assertAlmostEqual(mean_gap, 0, 5,
                               'glm: mean-only model does not work')

        # The formula handed to glm carries a trailing space.
        fit = glm('ycts~x1 + x2 + x3 + x4 + x5 ', self.df, 'normal')
        reference = smf.glm(formula='ycts~x1 + x2 + x3 + x4 + x5',
                            data=self.df).fit()
        gap = np.asarray(reference.params) - np.squeeze(fit.parameters)
        self.assertAlmostEqual(np.abs(gap).max(), 0, 5,
                               'glm: parameters calculation')

        fit = glm('y~x1 + x2 + x3 + x4 + x5', self.df, 'binomial')
        reference = smf.logit(formula='y~x1 + x2 + x3 + x4 + x5',
                              data=self.df).fit()
        gap = np.asarray(reference.params) - np.squeeze(fit.parameters)
        worst = np.abs(gap).max()
        # The summary is built but its result is not used by any check.
        reference.summary()
        self.assertAlmostEqual(worst, 0, 5, 'glm: parameters calculation')
Ejemplo n.º 10
0
    def test_restrictions(self):
        """
        Check that linear restrictions hold, including on categorical levels.

        Each model is fitted with three restrictions; the absolute violations
        of those restrictions, read from ``t_table.beta``, are summed and the
        total must round to zero at 5 decimal places.
        """
        constraints = ['x1=1', 'yr:2001-yr:2002=0', 'yr:2002-yr:2003=0']
        # test internal space in variable name
        fit = glm('ycts~x1 + x2 + x3 + x4 + x5 + x 6+c(yr,2000)',
                  self.df,
                  'normal',
                  restrictions=constraints)
        beta = fit.t_table.beta
        violation = (abs(beta['yr:2001'] - beta['yr:2002']) +
                     abs(beta['yr:2001'] - beta['yr:2003']) +
                     abs(beta['x1'] - 1))
        self.assertAlmostEqual(violation, 0, 5, 'glm: restrictions')

        # Same idea with non-unit coefficients inside the restrictions.
        constraints = ['x1=1', '2*yr:2001-yr:2002=0', 'yr:2002-0.5*yr:2003=0']
        # test internal space in variable name
        fit = glm('y~x1 + x2 + x3 + x4 + x5 + x 6+c(yr,2000)',
                  self.df,
                  'binomial',
                  restrictions=constraints)
        beta = fit.t_table.beta
        violation = (abs(2 * beta['yr:2001'] - beta['yr:2002']) +
                     abs(beta['yr:2002'] - 0.5 * beta['yr:2003']) +
                     abs(beta['x1'] - 1))
        self.assertAlmostEqual(violation, 0, 5, 'glm: restrictions')
Ejemplo n.º 11
0
    def test_smooth(self):
        """Check ``smooth_linear_splines`` against a glm fit on simulated data.

        The response is a piecewise function of ``x1`` plus normal noise.  The
        smoother's ``yhat`` and the fitted values of the glm
        ``y~h(x1,(0,.5,1),0)`` must have a correlation above 0.95.
        """
        from modeling.glm import glm

        def fx(x):
            # Rises as 1 + 3x, then is set to 2.5 wherever x exceeds 0.5.
            y = 1 + 3 * x
            y[x > 0.5] = 2.5
            return y

        n = 30000
        df = pd.DataFrame(np.random.uniform(0, 1, (n, 6)),
                          columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x 6'])
        y = fx(df.x1) + np.random.normal(0, .5, n)
        df['y'] = y
        hts = np.matrix(linear_splines_basis2(df.x1, np.arange(.05, .95, .05)))
        wts = np.ones(y.shape[0])  # unit weight for every observation
        xtest = pd.DataFrame(np.arange(0.01, .99, .01), columns=['x'])
        xhts = np.matrix(
            linear_splines_basis2(xtest.x, np.arange(.05, .95, .05)))

        sm = smooth_linear_splines(hts, y, xhts, 15, wts=wts)
        smoothed = np.squeeze(np.array(sm['yhat']))

        tmp = glm('y~h(x1,(0,.5,1),0)', df, 'normal')
        yh = np.squeeze(np.array(tmp.fitted))
        corr = np.corrcoef(yh, smoothed)

        self.assertTrue(
            corr[0, 1] > 0.95,
            'smooth_linear_splines: difference exceeds tolerance')
Ejemplo n.º 12
0
    def test_decile_plot(self):
        """Run ``decile_plot`` on a simulated binomial model (no assertions).

        Data are simulated from a logistic model, a binomial glm is fitted,
        and ``decile_plot`` is called twice: once with pandas Series built
        from the fitted values and response, once with ``fitted`` and ``y``
        passed directly.
        """
        n = 30000
        names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x 6']
        df = pd.DataFrame(np.random.uniform(0, 1, (n, 6)), columns=names)
        df['yr'] = np.random.randint(2000, 2017, n)
        y = -1 + 2 * df['x1'] - 3 * df['x2'] + 1.5 * df['x3'] - .1 * df[
            'x4'] + 7.7 * df['x5']
        p = np.exp(y) / (1 + np.exp(y))
        df['p'] = p
        df.p.describe()

        # One uniform draw per row decides the binary outcome.
        u = np.random.uniform(0, 1, n)
        success = u < p
        df['y'] = np.where(success, np.ones(n), np.zeros(n))

        # Character version of the same outcome.
        no_flag = np.chararray(n)
        no_flag.fill('n')
        yes_flag = np.chararray(n)
        yes_flag.fill('y')
        df['yc'] = np.where(success, yes_flag, no_flag)
        df['ycts'] = y + np.random.normal(0, 1, n)

        fit = glm('y~x1+x2+x3+x4+x5', df, 'binomial')
        labels = ('The Score', 'The Actual', 'Test Title')
        options = dict(subtitle='Test Subtitle', correlation=0.01)

        score = pd.Series(np.squeeze(np.asarray(fit.fitted)))
        binvar = pd.Series(np.squeeze(np.asarray(fit.y)))
        decile_plot(score, binvar, *labels, **options)
        decile_plot(fit.fitted, fit.y, *labels, **options)
Ejemplo n.º 13
0
def incr_build(model: str,
               target_var: str,
               start_list: list,
               add_list: list,
               get_data_fn,
               sample_size: int,
               client: clickhouse_driver.Client,
               global_valid_df_in: pd.DataFrame,
               family='normal'):
    """
    Build a sequence of GLM models on a growing window and score each one
    out-of-sample.

    The get_data_fn takes a list of values as contained in start_list and
    add_list and returns data subset to those values. The initial model is
    built on the values of start_list and then evaluated on the data subset to
    the first value of add_list.

    At the next step, the data in the first element of add_list is added to the
    start_list data, the model is updated and the evaluation is conducted on
    the second element of add_list.

    NOTE(review): the previous docstring called this "the GLM counterpart to
    incr_build", i.e. itself; presumably a same-named builder for another
    model type lives elsewhere -- confirm.

    :param model: model specification for glm
    :param target_var: response variable we're modeling
    :param start_list: list of (general) time periods for model build for the
             first model build; it is copied, not modified
    :param add_list: list of out-of-time periods to evaluate
    :param get_data_fn: function to get a pandas DataFrame of data to work on;
             called as get_data_fn(values, sample_size, client)
    :param sample_size: size of pandas DataFrames to get
    :param client: db connector
    :param global_valid_df_in: pandas DataFrame covering all the values of
             add_list for validation; must have a 'vintage' column, and is
             copied, not modified
    :param family: family of the model ('normal' or 'binomial')
    :return: tuple of four out-of-sample results:
             segs             the values of add_list, in order
             rmse_valid       root mean squared error per value
             corr_valid       genu.r_square(prediction, actual) per value
             global_valid_df  copy of global_valid_df_in with a
                              'model_glm_inc' column: for rows whose
                              'vintage' equals a value of add_list, the
                              prediction of the model built just before
                              that value was added (0.0 elsewhere)
    """

    # Copy first: the build window grows below and must not leak back into
    # the caller's start_list.
    build_list = list(start_list)
    global_valid_df = global_valid_df_in.copy()
    global_valid_df['model_glm_inc'] = np.full((global_valid_df.shape[0]), 0.0)
    rmse_valid = []
    corr_valid = []
    segs = []
    for valid in add_list:
        segs.append(valid)
        model_df = get_data_fn(build_list, sample_size, client)
        valid_df = get_data_fn([valid], sample_size, client)
        print(
            'Data sizes for out-of-sample value {0}: build {1}, validate {2}'.
            format(valid, model_df.shape[0], valid_df.shape[0]))

        glm_model = glm(model, model_df, family=family)
        # The period just evaluated joins the build data for the next model.
        build_list.append(valid)

        # Score the whole validation frame, but keep only this period's rows.
        gyh = glm_model.predict(global_valid_df)
        i = global_valid_df['vintage'] == valid
        global_valid_df.loc[i, 'model_glm_inc'] = gyh[i]

        yh = glm_model.predict(valid_df)
        res = valid_df[target_var] - np.array(yh).flatten()
        rmse_valid.append(math.sqrt(np.square(res).mean()))
        valid_df['yh'] = yh
        corr_valid.append(genu.r_square(valid_df['yh'], valid_df[target_var]))

    return segs, rmse_valid, corr_valid, global_valid_df