Example #1
0
    def evalSocialOptimum(self):
        '''
        Execute a simulation run that calculates "socially optimal" objects.

        The agent is updated, solved and simulated with ``CalcSocialOptimum``
        switched on, and the resulting socially optimal histories are copied
        into the ``*_histX`` attributes.  A weighted least squares regression
        of the simulated optimal copay on a quadratic in health, interacted
        with a quadratic in age, is then used to build one copay function per
        period.  Each of those is the fitted quadratic in health, bounded
        above by 1.0 and below by 0.01.  If anything in that construction
        fails, every period falls back to a constant copay of 1.0.  The
        result is stored in ``self.OptimalCopayInvstFunc`` and the solution
        is deleted.

        Parameters
        ----------
        None

        Returns
        -------
        None
        '''
        self.update()
        self.solve()
        self.CalcSocialOptimum = True
        self.initializeSim()
        self.simulate()
        self.iLvlSocOpt_histX = deepcopy(self.iLvlSocOpt_hist)
        self.CopaySocOpt_histX = deepcopy(self.CopaySocOpt_hist)
        self.LifePriceSocOpt_histX = deepcopy(self.LifePriceSocOpt_hist)
        self.CumLivPrb_histX = deepcopy(self.CumLivPrb_hist)
        self.CalcSocialOptimum = False

        try:
            # Flatten the (T_cycle, AgentCount) histories into regression vectors
            Health = self.hLvlNow_hist.flatten()
            Weights = self.CumLivPrb_hist.flatten()
            Copays = self.CopaySocOpt_hist.flatten()
            Age = np.tile(
                np.reshape(np.arange(self.T_cycle), (self.T_cycle, 1)),
                (1, self.AgentCount)).flatten()
            AgeSq = Age**2.
            HealthSq = Health**2
            Ones = np.ones_like(Health)
            AgeHealth = Age * Health
            AgeHealthSq = Age * HealthSq
            AgeSqHealth = AgeSq * Health
            AgeSqHealthSq = AgeSq * HealthSq

            # Only observations with a defined optimal copay enter the fit
            these = np.logical_not(np.isnan(Copays))
            regressors = np.transpose(
                np.vstack((Ones, Health, HealthSq, Age, AgeHealth, AgeHealthSq,
                           AgeSq, AgeSqHealth, AgeSqHealthSq)))
            copay_model = WLS(Copays[these],
                              regressors[these, :],
                              weights=Weights[these])
            coeffs = (copay_model.fit()).params

            UpperCopayFunc = ConstantFunction(1.0)
            LowerCopayFunc = ConstantFunction(0.01)
            OptimalCopayInvstFunc = []
            for t in range(self.T_cycle):
                # Collapse the age terms at age t, leaving a quadratic in health
                c0 = coeffs[0] + t * coeffs[3] + t**2 * coeffs[6]
                c1 = coeffs[1] + t * coeffs[4] + t**2 * coeffs[7]
                c2 = coeffs[2] + t * coeffs[5] + t**2 * coeffs[8]
                TempFunc = QuadraticFunction(c0, c1, c2)
                OptimalCopayInvstFunc_t = UpperEnvelope(
                    LowerEnvelope(TempFunc, UpperCopayFunc), LowerCopayFunc)
                OptimalCopayInvstFunc.append(OptimalCopayInvstFunc_t)

        except Exception:
            # Rebuild the list from scratch: it may not exist yet (failure
            # before the loop) or may be partially filled (failure inside the
            # loop).  Either way the fallback is one constant copay per period.
            OptimalCopayInvstFunc = [
                ConstantFunction(1.0) for _ in range(self.T_cycle)
            ]

        self.OptimalCopayInvstFunc = OptimalCopayInvstFunc
        self.delSolution()
Example #2
0
    def __run_model(self):
        """Fit a weighted least squares model and store its first two parameters.

        ``self.__y`` is regressed on ``self.__x`` augmented by ``add_constant``,
        using ``self.__weights`` as observation weights; the model is built
        with ``missing="drop"`` and ``hasconst=True``.  The fitted parameter at
        index 1 is stored in ``self.__alpha`` and the one at index 0 in
        ``self.__Beta``.
        """
        design = add_constant(self.__x)
        fitted = WLS(
            self.__y,
            design,
            weights=self.__weights,
            missing="drop",
            hasconst=True,
        ).fit()
        estimates = fitted.params

        # NOTE(review): if add_constant puts the constant in the first column,
        # index 0 is the intercept and index 1 the slope, so alpha/Beta may be
        # swapped relative to the usual naming -- confirm against callers.
        self.__alpha = estimates[1]
        self.__Beta = estimates[0]
Example #3
0
def _compute_vif(exog, exog_idx, weights=None, model_config=None):
    """
    Compute the variance inflation factor (VIF) of one exogenous variable.

    Column ``exog_idx`` of ``exog`` is regressed on all remaining columns,
    using OLS when ``weights`` is None and WLS otherwise.  The VIF is
    ``1 / (1 - R^2)`` of that auxiliary regression.

    Parameters
    ----------
    exog : array_like, shape (n_obs, k_vars)
        Feature matrix [X_1, X_2, ..., X_n]; converted with ``np.asarray``.
    exog_idx : int
        Index of the column whose VIF is computed.
    weights : array_like, optional
        Observation weights; when given, WLS is used instead of OLS.
    model_config : dict, optional
        ``hasconst`` is passed to the model constructor and ``cov_type`` to
        the WLS ``fit`` call.  Keys that are missing fall back to the
        defaults ``{"hasconst": True, "cov_type": "HC3"}``.

    Returns
    -------
    vif : float
        Variance inflation factor of column ``exog_idx``.
    """
    # Merge caller settings over the defaults so a partial dict is accepted
    config = {"hasconst": True, "cov_type": "HC3"}
    if model_config is not None:
        config.update(model_config)

    # Positional slicing below needs an ndarray (DataFrames would fail)
    exog = np.asarray(exog)
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]

    if weights is None:
        r_squared_i = OLS(x_i,
                          x_noti,
                          hasconst=config["hasconst"]).fit().rsquared
    else:
        r_squared_i = WLS(x_i,
                          x_noti,
                          hasconst=config["hasconst"],
                          weights=weights).fit(
            cov_type=config["cov_type"]).rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
Example #4
0
    def _engine_factory(self, fy, X, check_integrity=True):
        """Build the WLS model of ``fy`` on ``X`` with the instance weights.

        The weights come from ``self._get_weights()``.  When
        ``self._check_integrity`` rejects the inputs, the only case still
        handled is two observations with a single weight: that weight is
        duplicated so the lengths agree.  Any other rejected input yields
        ``None``.

        ``check_integrity`` is accepted but not consulted by this method.
        """
        weights = self._get_weights()
        if self._check_integrity(fy, X, weights):
            return WLS(fy, X, weights=weights)

        single_weight_pair = (len(fy) == 2 and len(X) == 2
                              and len(weights) == 1)
        if not single_weight_pair:
            return None

        # Reuse the lone weight for the second observation
        weights = hstack((weights, weights[0]))
        return WLS(fy, X, weights=weights)
Example #5
0
def fit_gravity(data, deg=2, **kwargs):
    """Fit a polynomial in ``data.level`` to the gravity values ``data.g``.

    The design matrix holds the powers 1..``deg`` of the level (the constant
    column of the Vandermonde matrix is dropped), with columns named by the
    first ``deg`` lowercase letters.  Extra keyword arguments are forwarded
    to ``WLS``.

    Returns the fitted WLS results object.
    """
    observations = np.asarray(data.g)

    # Increasing powers of the level; column 0 (power zero) is discarded
    powers = np.vander(data.level.values, N=deg + 1, increasing=True)
    unknown_names = list(ascii_lowercase[:deg])
    design = pd.DataFrame(powers[:, 1:], columns=unknown_names)

    model = WLS(observations, design, **kwargs)
    return model.fit()
Example #6
0
    def calculate(self):
        """Fit a weighted least squares model to the stored data.

        Nothing is done when any of ``self.xs``, ``self.ys`` or
        ``self.yserr`` is empty, or when their lengths differ.  Otherwise
        ``self.ys`` is regressed on the design matrix from ``self._get_X()``
        with weights ``1 / yserr**2``; the model is kept in ``self._wls`` and
        its fit in ``self._result``.
        """
        if not (len(self.xs) and len(self.ys) and len(self.yserr)):
            return

        same_length = (len(self.xs) == len(self.ys)
                       and len(self.xs) == len(self.yserr))
        if not same_length:
            return

        # Array form of xs is not used below; the design matrix comes from
        # self._get_X()
        x_values = asarray(self.xs)
        errors = asarray(self.yserr)
        observed = self.ys

        design = self._get_X()
        inverse_variance = 1 / errors ** 2
        self._wls = WLS(observed, design, weights=inverse_variance)
        self._result = self._wls.fit()
Example #7
0
def fit_floating_gravity(data, deg=2, **kwargs):
    """Fit a floating gravity model to the observations.

    Each row of ``data`` contributes two observations: a zero at
    ``level_1`` and ``delta_g`` at ``level_2`` (levels divided by 1000),
    both tagged with the row's ``runn``.  Duplicates are removed and the
    rows are sorted by ``ci`` ascending and ``g`` descending.  The design
    matrix combines the columns produced by ``categorical`` for ``ci`` with
    the powers 1..``deg`` of the height.  Extra keyword arguments are
    forwarded to ``WLS``.

    Returns the transformed data frame and the fitted WLS results.
    """
    # Stack the reference (zero) and measured values into long form
    stacked = pd.DataFrame({
        'g': np.concatenate((np.zeros_like(data.delta_g), data.delta_g)),
        'h': np.concatenate((data['level_1'], data['level_2'])) / 1000,
        'ci': np.tile(data.runn, 2),
    })
    df = (stacked.drop_duplicates(['ci', 'h', 'g'])
          .sort_values(['ci', 'g'], ascending=[True, False])
          .reset_index(drop=True))

    endog = np.asarray(df.g)

    # Polynomial part without the constant column, preceded by the ci columns
    height_powers = np.vander(df.h.values, N=deg + 1, increasing=True)[:, 1:]
    run_columns = categorical(df.ci.values, drop=True)
    design = np.hstack((run_columns, height_powers))

    # Column labels: one per distinct (level_1, runn) pair, then a, b, ...
    first_levels = data.drop_duplicates(['level_1', 'runn']).level_1
    offset_names = [
        'h({:,.3f})'.format(level / 1000) for level in np.asarray(first_levels)
    ]
    column_names = np.append(offset_names, list(ascii_lowercase[:deg]))
    exog = pd.DataFrame(design, columns=column_names)

    return df, WLS(endog, exog, **kwargs).fit()
Example #8
0
    def params(self):
        """
        Fit the AFT model by weighted least squares and return the estimates.

        A copy of the model's censoring indicators is stored on the model as
        ``modif_censors`` with its last entry set to 1, weights are obtained
        from ``self.model._make_km`` and ``endog`` is regressed on ``exog``
        with those weights.

        Parameters
        ----------
        None

        Returns
        -------
        Fitted params

        Notes
        -----
        To avoid dividing by zero, max(endog) is assumed to be uncensored.
        """
        model = self.model
        model.modif_censors = np.copy(model.censors)
        # Force the last observation to count as uncensored (see Notes)
        model.modif_censors[-1] = 1
        km_weights = model._make_km(model.endog, model.modif_censors)
        fitted = WLS(model.endog, model.exog, km_weights).fit()
        return fitted.params
Example #9
0
def makeValidationFigures(params, use_cohorts):
    '''
    Make several figures that compare simulated outcomes from the estimated model
    to their data counterparts, for external validation.

    Every figure is shown with ``plt.show()``; all but the stand-alone serial
    correlation plot are also saved as PDFs under ``../Figures/``.  The
    function reads the module-level ``Data`` and ``Params`` objects in
    addition to its arguments, and indexes ``type_list[0..9]`` (the plot
    titles treat the first five as women and the last five as men).

    Parameters
    ----------
    params : np.array
        Size 33 array of model parameters, like that used for estimation.
    use_cohorts : bool
        Indicator for whether or not to model differences across cohorts.

    Returns
    -------
    None
    '''
    # Make, solve, and simulate the types
    param_dict = convertVecToDict(params)
    if use_cohorts:
        type_list = makeMultiTypeWithCohorts(param_dict)
    else:
        type_list = makeMultiTypeSimple(param_dict)
    for this_type in type_list:
        this_type.track_vars.append('MedLvlNow')
        this_type.track_vars.append('iLvlNow')
        this_type.track_vars.append('HitCfloor')
        this_type.CalcExpectationFuncs = True
        this_type.DeleteSolution = False
    multiThreadCommandsFake(type_list, ['estimationAction()'], num_jobs=5)

    # Combine simulated data across all types
    hLvlHist = np.concatenate(
        [this_type.hLvlNow_hist for this_type in type_list], axis=1)
    OOPhist = np.concatenate(
        [this_type.OOPmedNow_hist for this_type in type_list], axis=1)
    MortHist = np.concatenate(
        [this_type.DiePrbNow_hist for this_type in type_list], axis=1)
    WeightHist = np.concatenate(
        [this_type.CumLivPrb_hist for this_type in type_list], axis=1)

    # Combine data labels across types
    IncQuint = np.concatenate(
        [this_type.IncQuintLong for this_type in type_list])

    # Combine in-data-span masking array across all types
    Active = hLvlHist > 0.
    InDataSpan = np.concatenate(
        [this_type.InDataSpanArray for this_type in type_list], axis=1)
    WeightAdj = InDataSpan * WeightHist

    # Legend labels shared by all income-quintile figures
    quintile_labels = [
        'Bottom quintile', 'Second quintile', 'Third quintile',
        'Fourth quintile', 'Top quintile'
    ]

    # For each type, calculate the probability that no health investment is
    # purchased at each age and the probability that the consumption floor is hit
    iLvlZeroRate = np.zeros((10, 25))
    HitCfloorRate = np.zeros((10, 25))
    for j in range(10):
        this_type = type_list[j]
        iLvlZero = this_type.iLvlNow_hist == 0.
        HitCfloor = this_type.HitCfloor_hist == 1.
        iLvlZeroSum = np.sum(iLvlZero * this_type.CumLivPrb_hist, axis=1)
        HitCfloorSum = np.sum(HitCfloor * this_type.CumLivPrb_hist, axis=1)
        PopSum = np.sum(this_type.CumLivPrb_hist, axis=1)
        iLvlZeroRate[j, :] = iLvlZeroSum / PopSum
        HitCfloorRate[j, :] = HitCfloorSum / PopSum

    # Calculate median (pseudo) bank balances for each type
    bLvl_init_median = np.zeros(10)
    for n in range(10):
        bLvl_init_median[n] = np.median(
            type_list[n].aLvlInit) + type_list[n].IncomeNow[2]

    # Extract deciles of health by age from the simulated data
    pctiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    SimHealthPctiles = np.zeros((15, len(pctiles)))
    for t in range(15):
        SimHealthPctiles[t, :] = getPercentiles(hLvlHist[t, :],
                                                weights=WeightAdj[t, :],
                                                percentiles=pctiles)

    # Plot the probability of purchasing zero health investment by age, sex, and income
    colors = ['b', 'r', 'g', 'c', 'm']
    AgeVec = np.linspace(67., 95., num=15)
    for n in range(5):
        plt.plot(AgeVec, iLvlZeroRate[n, :15], '-' + colors[n])
    plt.xlabel('Age')
    plt.ylabel(r'Prob[$n_{it}=0$]')
    plt.title('Probability of Buying No Health Investment, Women')
    plt.legend(quintile_labels)
    plt.savefig('../Figures/ZeroInvstWomen.pdf')
    plt.show()
    for n in range(5):
        plt.plot(AgeVec, iLvlZeroRate[n + 5, :15], '-' + colors[n])
    plt.xlabel('Age')
    plt.ylabel(r'Prob[$n_{it}=0$]')
    plt.title('Probability of Buying No Health Investment, Men')
    plt.savefig('../Figures/ZeroInvstMen.pdf')
    plt.show()

    # Plot the probability of hitting the consumption floor by age, sex, and income
    colors = ['b', 'r', 'g', 'c', 'm']
    AgeVec = np.linspace(67., 95., num=15)
    for n in range(5):
        plt.plot(AgeVec, HitCfloorRate[n, :15], '-' + colors[n])
    plt.xlabel('Age')
    plt.ylabel(r'Prob[$c_{it}={c}$]')
    plt.title('Probability of Using Consumption Floor, Women')
    plt.legend(quintile_labels)
    plt.savefig('../Figures/cFloorWomen.pdf')
    plt.show()
    for n in range(5):
        plt.plot(AgeVec, HitCfloorRate[n + 5, :15], '-' + colors[n])
    plt.xlabel('Age')
    plt.ylabel(r'Prob[$c_{it}={c}$]')
    plt.title('Probability of Using Consumption Floor, Men')
    plt.savefig('../Figures/cFloorMen.pdf')
    plt.show()

    # Plot health investment as a function of market resources by type, holding h and Dev fixed
    B = np.linspace(1., 50., 201)
    some_ones = np.ones_like(B)
    hLvl = 0.6
    Dev = 0.0
    t = 0
    Age = str(65 + 2 * t)
    for i in range(5):
        this_type = type_list[i]
        # NOTE(review): MedShkMeanFunc is indexed by t but MedShkStdFunc is
        # called directly -- confirm this asymmetry is intended.
        MedShk = np.exp(this_type.MedShkMeanFunc[t](hLvl) +
                        Dev * this_type.MedShkStdFunc(hLvl))
        I = np.maximum(
            this_type.solution[t].PolicyFunc.iFunc(B, hLvl * some_ones,
                                                   MedShk * some_ones), 0.0)
        plt.plot(B, I, '-' + colors[i])
    plt.xlabel(r'Bank balances $b_{it}$, \$10,000 (y2000)')
    plt.ylabel(r'Health investment $n_{it}$, \$10,000 (y2000)')
    plt.xlim([1., 50.])
    plt.ylim([-0.01, 0.65])
    #plt.legend(quintile_labels)
    plt.title('Health Investment Function at Age ' + Age + ' by Income, Women')
    plt.savefig('../Figures/iFuncWomen.pdf')
    plt.show()
    for i in range(5):
        this_type = type_list[i + 5]
        MedShk = np.exp(this_type.MedShkMeanFunc[t](hLvl) +
                        Dev * this_type.MedShkStdFunc(hLvl))
        I = np.maximum(
            this_type.solution[t].PolicyFunc.iFunc(B, hLvl * some_ones,
                                                   MedShk * some_ones), 0.0)
        plt.plot(B, I, '-' + colors[i])
    plt.xlabel(r'Bank balances $b_{it}$, \$10,000 (y2000)')
    plt.ylabel(r'Health investment $n_{it}$, \$10,000 (y2000)')
    plt.xlim([1., 50.])
    plt.ylim([-0.01, 0.65])
    plt.legend(quintile_labels, loc=4)
    plt.title('Health Investment Function at Age ' + Age + ' by Income, Men')
    plt.savefig('../Figures/iFuncMen.pdf')
    plt.show()

    # Plot PDV of total medical expenses by health at median wealth at age 69-70 by income quintile and sex
    t = 2
    H = np.linspace(0.0, 1.0, 201)
    for n in range(5):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].TotalMedPDVfunc(B, H)
        plt.plot(H, M, color=colors[n])
    plt.xlim([0., 1.])
    plt.ylim([0., 17])
    plt.xlabel(r'Health capital $h_{it}$')
    plt.ylabel('PDV total medical care, $10,000 (y2000)')
    plt.legend(quintile_labels)
    plt.title('Total Medical Expenses by Health and Income, Women')
    plt.savefig('../Figures/TotalMedPDVbyIncomeWomen.pdf')
    plt.show()
    for n in range(5, 10):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].TotalMedPDVfunc(B, H)
        plt.plot(H, M, color=colors[n - 5])
    plt.xlim([0., 1.])
    plt.ylim([0., 17])
    plt.xlabel(r'Health capital $h_{it}$')
    plt.ylabel('PDV total medical care, $10,000 (y2000)')
    #plt.legend(quintile_labels)
    plt.title('Total Medical Expenses by Health and Income, Men')
    plt.savefig('../Figures/TotalMedPDVbyIncomeMen.pdf')
    plt.show()

    # Plot PDV of OOP medical expenses by health at median wealth at age 69-70 by income quintile and sex
    colors = ['b', 'r', 'g', 'c', 'm']
    t = 2
    H = np.linspace(0.0, 1.0, 201)
    for n in range(5):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].OOPmedPDVfunc(B, H)
        plt.plot(H, M, color=colors[n])
    plt.xlim([0., 1.])
    plt.ylim([0., 3.5])
    plt.xlabel(r'Health capital $h_{it}$')
    plt.ylabel('PDV OOP medical expenses, $10,000 (y2000)')
    #plt.legend(quintile_labels)
    plt.title('OOP Medical Expenses by Health and Income, Women')
    plt.savefig('../Figures/OOPmedPDVbyIncomeWomen.pdf')
    plt.show()
    for n in range(5, 10):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].OOPmedPDVfunc(B, H)
        plt.plot(H, M, color=colors[n - 5])
    plt.xlim([0., 1.])
    plt.ylim([0., 3.5])
    plt.xlabel(r'Health capital $h_{it}$')
    # This figure shows out-of-pocket expenses, so it carries the same axis
    # label as the women's OOP figure above
    plt.ylabel('PDV OOP medical expenses, $10,000 (y2000)')
    #plt.legend(quintile_labels)
    plt.title('OOP Medical Expenses by Health and Income, Men')
    plt.savefig('../Figures/OOPmedPDVbyIncomeMen.pdf')
    plt.show()

    # Plot life expectancy by health at median wealth at age 69-70 by income quintile and sex
    colors = ['b', 'r', 'g', 'c', 'm']
    t = 2
    H = np.linspace(0.0, 1.0, 201)
    for n in range(5):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].ExpectedLifeFunc(B, H)
        plt.plot(H, M, color=colors[n])
    plt.xlim([0., 1.])
    plt.ylim([0., 20.])
    plt.xlabel(r'Health capital $h_{it}$')
    plt.ylabel('Remaining years of life expectancy')
    plt.legend(quintile_labels)
    plt.title('Life Expectancy at Age 69 by Health and Income, Women')
    plt.savefig('../Figures/LifeExpectancybyIncomeWomen.pdf')
    plt.show()
    for n in range(5, 10):
        B = bLvl_init_median[n] * np.ones_like(H)
        M = type_list[n].solution[t].ExpectedLifeFunc(B, H)
        plt.plot(H, M, color=colors[n - 5])
    plt.xlim([0., 1.])
    plt.ylim([0., 20.])
    plt.xlabel(r'Health capital $h_{it}$')
    plt.ylabel('Remaining years of life expectancy')
    plt.legend(quintile_labels)
    plt.title('Life Expectancy at Age 69 by Health and Income, Men')
    plt.savefig('../Figures/LifeExpectancybyIncomeMen.pdf')
    plt.show()

    # Extract deciles of health from the HRS data
    DataHealthPctiles = np.zeros((15, len(pctiles)))
    for t in range(15):
        these = np.logical_and(Data.AgeBoolArray[:, :, t], Data.Alive)
        h_temp = Data.h_data[these]
        DataHealthPctiles[t, :] = getPercentiles(h_temp, percentiles=pctiles)

    # Plot deciles of health by age
    plt.plot(AgeVec, SimHealthPctiles, '-k')
    plt.plot(AgeVec, DataHealthPctiles, '--k')
    plt.ylim(0., 1.)
    plt.ylabel('Health capital $h_{it}$')
    plt.xlabel('Age')
    plt.title('Simulated vs Actual Distribution of Health by Age')
    plt.savefig('../Figures/HealthDistribution.pdf')
    plt.show()

    OOPmodFunc = lambda x: np.log(10000 * x)

    # Extract many percentiles of OOP spending from the simulated data
    OOP_sim = OOPhist.flatten()
    Weight_temp = WeightAdj.flatten()
    CDFvalsSim = np.linspace(0.0001, 0.999, 1000)
    OOPsimCDF_A0 = getPercentiles(OOP_sim * 10000,
                                  weights=Weight_temp,
                                  percentiles=CDFvalsSim)
    OOPsimCDF_B0 = getPercentiles(OOPmodFunc(OOP_sim),
                                  weights=Weight_temp,
                                  percentiles=CDFvalsSim)

    # Extract some percentiles of OOP spending from the HRS data
    these = np.logical_and(Data.Alive, np.logical_not(np.isnan(Data.m_data)))
    OOP_data = Data.m_data[these]
    CDFvalsData = np.linspace(0.0001, 0.999, 500)
    OOPdataCDF_A0 = getPercentiles(OOP_data * 10000,
                                   weights=None,
                                   percentiles=CDFvalsData)
    OOPdataCDF_B0 = getPercentiles(OOPmodFunc(OOP_data),
                                   weights=None,
                                   percentiles=CDFvalsData)

    # Plot the CDF of log out-of-pocket medical spending
    plt.subplot(211)
    plt.title('CDF of OOP Medical Spending')
    plt.plot(OOPdataCDF_B0, CDFvalsData, '-r')
    plt.plot(OOPsimCDF_B0, CDFvalsSim, '-b')
    plt.xlim(8., 11.5)
    plt.ylim(0.85, 1.0)
    plt.xticks([
        np.log(3000),
        np.log(6000),
        np.log(12000),
        np.log(24000),
        np.log(48000),
        np.log(96000)
    ], ['3000', '6000', '12000', '24000', '48000', '96000'])

    # Plot the CDF of out-of-pocket medical spending
    plt.subplot(212)
    plt.plot(OOPdataCDF_A0, CDFvalsData, '-r')
    plt.plot(OOPsimCDF_A0, CDFvalsSim, '-b')
    plt.xlim(0., 3000.)
    plt.ylim(0.0, 0.9)
    plt.xlabel('Out-of-pocket medical expenses, biannual')
    plt.ylabel('Cumulative distribution')
    plt.legend(['HRS data', 'Model'], loc=4)
    plt.savefig('../Figures/OOPdistribution.pdf')
    plt.show()

    # Calculate the serial correlation of log OOP medical spending in simulated data
    # (measured as the R^2 of a weighted regression of period t+1 on period t)
    Med_sim = np.log(10000 * OOPhist + 1.)
    serial_corr_sim = np.zeros(15)
    serial_corr_sim_inc = np.zeros((15, 5))
    for t in range(15):
        these = np.logical_and(WeightAdj[t + 1, :] > 0., WeightAdj[t + 1, :] <
                               1.)  # Alive but not the first simulated period
        Med_t = Med_sim[t + 1, these]
        Med_tm1 = Med_sim[t, these]
        weight_reg = WeightAdj[t + 1, these]
        const_reg = np.ones_like(Med_t)
        regressors = np.transpose(np.vstack([const_reg, Med_tm1]))
        temp_model = WLS(Med_t, regressors, weights=weight_reg)
        temp_results = temp_model.fit()
        serial_corr_sim[t] = temp_results.rsquared
        for i in range(5):
            those = np.logical_and(these, IncQuint == i + 1)
            Med_t = Med_sim[t + 1, those]
            Med_tm1 = Med_sim[t, those]
            weight_reg = WeightAdj[t + 1, those]
            const_reg = np.ones_like(Med_t)
            regressors = np.transpose(np.vstack([const_reg, Med_tm1]))
            temp_model = WLS(Med_t, regressors, weights=weight_reg)
            temp_results = temp_model.fit()
            serial_corr_sim_inc[t, i] = temp_results.rsquared

    # Calculate the serial correlation of log OOP medical spending in HRS data
    DataExists = np.logical_and(np.logical_not(np.isnan(Data.m_data[:-1, :])),
                                np.logical_not(np.isnan(Data.m_data[1:, :])))
    BothAlive = np.logical_and(Data.Alive[:-1, :], Data.Alive[1:, :])
    Usable = np.logical_and(DataExists, BothAlive)
    serial_corr_data = np.zeros(15)
    serial_corr_data_inc = np.zeros((15, 5))
    Med_data = np.log(10000 * Data.m_data + 1.)
    for t in range(15):
        these = np.logical_and(Usable, Data.AgeBoolArray[:-1, :, t])
        Med_t = Med_data[1:, :][these]
        Med_tm1 = Med_data[:-1, :][these]
        const_reg = np.ones_like(Med_t)
        regressors = np.transpose(np.vstack([const_reg, Med_tm1]))
        temp_model = OLS(Med_t, regressors)
        temp_results = temp_model.fit()
        serial_corr_data[t] = temp_results.rsquared
        for i in range(5):
            those = np.logical_and(these, Data.IncQuintBoolArray[:-1, :, i])
            Med_t = Med_data[1:, :][those]
            Med_tm1 = Med_data[:-1, :][those]
            const_reg = np.ones_like(Med_t)
            regressors = np.transpose(np.vstack([const_reg, Med_tm1]))
            temp_model = OLS(Med_t, regressors)
            temp_results = temp_model.fit()
            serial_corr_data_inc[t, i] = temp_results.rsquared

    # Make a plot of serial correlation of OOP medical expenses
    plt.subplot(3, 2, 1)
    plt.plot(AgeVec, serial_corr_data, '-r')
    plt.plot(AgeVec, serial_corr_sim, '-b')
    plt.ylim(0, 0.5)
    plt.xticks([])
    plt.text(75, 0.4, 'All individuals')

    plt.subplot(3, 2, 2)
    plt.plot(AgeVec, serial_corr_data_inc[:, 0], '-r')
    plt.plot(AgeVec, serial_corr_sim_inc[:, 0], '-b')
    plt.ylim(0, 0.5)
    plt.xticks([])
    plt.yticks([])
    plt.text(70, 0.4, 'Bottom income quintile')

    plt.subplot(3, 2, 3)
    plt.plot(AgeVec, serial_corr_data_inc[:, 1], '-r')
    plt.plot(AgeVec, serial_corr_sim_inc[:, 1], '-b')
    plt.ylim(0, 0.5)
    plt.xticks([])
    plt.text(67, 0.4, 'Second income quintile')
    # Raw string: "\l" is not a valid Python escape sequence
    plt.ylabel(r'$R^2$ of regression of $\log(OOP_{t})$ on $\log(OOP_{t-1})$')

    plt.subplot(3, 2, 4)
    plt.plot(AgeVec, serial_corr_data_inc[:, 2], '-r')
    plt.plot(AgeVec, serial_corr_sim_inc[:, 2], '-b')
    plt.ylim(0, 0.5)
    plt.xticks([])
    plt.yticks([])
    plt.text(70, 0.4, 'Third income quintile')

    plt.subplot(3, 2, 5)
    plt.plot(AgeVec, serial_corr_data_inc[:, 3], '-r')
    plt.plot(AgeVec, serial_corr_sim_inc[:, 3], '-b')
    plt.ylim(0, 0.5)
    plt.xlabel('Age')
    plt.text(70, 0.4, 'Fourth income quintile')

    plt.subplot(3, 2, 6)
    plt.plot(AgeVec, serial_corr_data_inc[:, 4], '-r')
    plt.plot(AgeVec, serial_corr_sim_inc[:, 4], '-b')
    plt.ylim(0, 0.5)
    plt.xlabel('Age')
    plt.yticks([])
    plt.text(70, 0.4, 'Top income quintile')
    plt.savefig('../Figures/SerialCorrOOP.pdf')
    plt.show()

    # Make a stand-alone plot of serial correlation of OOP medical expenses
    # (shown only, not saved)
    plt.plot(AgeVec + 2, serial_corr_data, '-r')
    plt.plot(AgeVec + 2, serial_corr_sim, '-b')
    plt.xlabel('Age')
    plt.ylabel(r'$R^2$ of regression of $\log(OOP_{t})$ on $\log(OOP_{t-1})$')
    plt.legend(['HRS data', 'Model'], loc=1)
    plt.show()

    # Calculate mortality probability by age and income quintile in simulated data
    MortByIncAge_data = Data.MortByIncAge
    MortByIncAge_sim = np.zeros((5, 15))
    MortByAge_sim = np.zeros(15)
    for t in range(15):
        THESE = np.logical_and(Active[t, :], InDataSpan[t, :])
        Weight = WeightHist[t + 1, THESE]
        WeightSum = np.sum(Weight)
        Mort = MortHist[t + 1, THESE]
        MortByAge_sim[t] = np.dot(Mort, Weight) / WeightSum
        for i in range(5):
            right_inc = IncQuint == i + 1
            these = np.logical_and(THESE, right_inc)
            Mort = MortHist[t + 1, these]
            Weight = WeightHist[t + 1, these]
            WeightSum = np.sum(Weight)
            MortByIncAge_sim[i, t] = np.dot(Mort, Weight) / WeightSum

    # Plot mortality probability by age and income quintile; both the
    # simulated and the data series are shown relative to the simulated
    # all-income mortality by age
    income_colors = ['b', 'r', 'g', 'm', 'c']
    for i in range(5):
        plt.plot(AgeVec, MortByIncAge_sim[i, :] - MortByAge_sim,
                 '-' + income_colors[i])
    for i in range(5):
        plt.plot(AgeVec, MortByIncAge_data[i, :] - MortByAge_sim,
                 '.' + income_colors[i])
    plt.xlabel('Age')
    plt.ylabel('Relative death probability (biannual)')
    plt.title('Death Probability by Income Quintile')
    plt.legend(quintile_labels, loc=2)
    plt.savefig('../Figures/MortByIncAge.pdf')
    plt.show()

    # Plot the 95% confidence band of the health production function
    # (2.5th and 97.5th percentiles of draws from the parameter distribution)
    mean = np.array([-2.13369276099, 1.71842956397])
    covar = np.array([[0.02248322, 0.01628292], [0.01628308, 0.01564192]])
    dstn = multivariate_normal(mean, covar)
    N = 10000
    M = 201
    draws = dstn.rvs(N)
    MedVec = np.linspace(0., 1.5, M)
    func_data = np.zeros((N, M))

    def makeHealthProdFunc(LogSlope, LogCurve):
        LogJerk = 15.6
        tempw = np.exp(LogJerk)
        HealthProd0 = 1. - tempw
        tempx = np.exp(
            LogSlope)  # Slope of health production function at iLvl=0
        HealthProd2 = np.exp(LogJerk - LogCurve)
        HealthProdFunc = lambda i: tempx / HealthProd0 * (
            (i * HealthProd2**(
                (1. - HealthProd0) / HealthProd0) + HealthProd2**
             (1. / HealthProd0))**HealthProd0 - HealthProd2)
        return HealthProdFunc

    # Evaluate the production function on the grid for every parameter draw
    for n in range(N):
        f = makeHealthProdFunc(draws[n, 0], draws[n, 1])
        func_data[n, :] = f(MedVec)

    f = makeHealthProdFunc(Params.test_param_vec[25],
                           Params.test_param_vec[26])
    CI_array = np.zeros((M, 2))
    for m in range(M):
        CI_array[m, :] = getPercentiles(func_data[:, m],
                                        percentiles=[0.025, 0.975])
    health_prod = f(MedVec)

    plt.plot(MedVec, health_prod, '-r')
    plt.plot(MedVec, CI_array[:, 0], '--k', linewidth=0.5)
    plt.plot(MedVec, CI_array[:, 1], '--k', linewidth=0.5)
    plt.xlim([-0.005, 1.5])
    plt.ylim([0., None])
    # Raw string: "\$" is not a valid Python escape sequence
    plt.xlabel(r'Health investment $n_{it}$, \$10,000 (y2000)')
    plt.ylabel('Health produced ')
    plt.title('Estimated Health Production Function')
    plt.legend([
        'Estimated health production function',
        'Pointwise 95% confidence bounds'
    ],
               loc=4)
    plt.savefig('../Figures/HealthProdFunc.pdf')
    plt.show()
Example #10
0
 def _engine_factory(self, fy, X):
     """Return a WLS model of ``fy`` on ``X`` using the instance weights.

     The weights come from ``self._get_weights()``.  ``None`` is returned
     when ``self._check_integrity`` rejects the inputs.
     """
     weights = self._get_weights()
     if not self._check_integrity(fy, X, weights):
         return None
     return WLS(fy, X, weights=weights)
Example #11
0
def wls(A, b, w):
    """Weighted least-squares estimation.

    Regresses ``b`` on the design matrix ``A`` with observation weights
    ``w`` and returns the fitted parameter vector.
    """
    from statsmodels.api import WLS

    # Note: statsmodel is a bit more accurate than directly calling lstsq
    model = WLS(b, A, weights=w)
    fitted = model.fit()
    return fitted.params
Example #12
0
def store_csv(df, name):
    """Write ``df`` to ``csv/<name>.csv`` without the index column."""
    target = "csv/" + name + ".csv"
    df.to_csv(target, index=False)


def store_pkl(result, name):
    """Save ``result`` to ``pkl/<name>.pkl`` with ``remove_data=True``."""
    target = "pkl/" + name + ".pkl"
    result.save(target, remove_data=True)


# Load the "Auto" dataset and print the dtype of each column.
auto_df = load_csv("Auto")
print(auto_df.dtypes)

# Split the frame in two with split_csv; presumably regressors and response
# (confirm against split_csv).
auto_X, auto_y = split_csv(auto_df)

# Formula used by the formula-based models below; cylinders, model_year and
# origin are wrapped in C(...).
auto_formula = "mpg ~ C(cylinders) + displacement + horsepower + weight + acceleration + C(model_year) + C(origin)"


def build_auto(model, name):
    """Fit ``model``, print its summary, and store the result and predictions.

    The fitted result is saved through ``store_pkl`` and its predictions for
    the module-level ``auto_X`` are written through ``store_csv`` as a
    single ``mpg`` column, both under ``name``.
    """
    fitted = model.fit()
    print(fitted.summary())
    store_pkl(fitted, name)

    predictions = fitted.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name)


# Fit and store four models on the Auto data: OLS and WLS, each built once
# from arrays and once from the formula. No weights are passed to either
# WLS model.
build_auto(OLS(auto_y, auto_X), "OLSAuto")
build_auto(ols(formula=auto_formula, data=auto_df), "OLSFormulaAuto")
build_auto(WLS(auto_y, auto_X), "WLSAuto")
build_auto(wls(formula=auto_formula, data=auto_df), "WLSFormulaAuto")
Example #13
0
# Filter out only for "Mother & Baby Products"
# The pattern matches any category string containing "/mother & baby/".
rx_cat = re.compile(r'.*/{}/.*'.format(r'mother & baby'))
prices = pi_all.loc[pi_all.item_category_detail.str.contains(rx_cat),
                    ['price_ori', 'price_actual']]

# Baseline fit: OLS of the actual price on the original price plus a constant.
# The fitted values are stored as a new column of `prices`.
Y = prices.price_actual
X = prices.price_ori
X = add_constant(X)
ols = OLS(Y, X).fit()
prices['price_actual_ols'] = ols.fittedvalues

# https://stats.stackexchange.com/questions/246085/how-to-determine-weights-for-wls-regression-in-r
# https://stats.stackexchange.com/questions/97832/how-do-you-find-weights-for-weighted-least-squares-regression
# Weights: regress |OLS residuals| on the OLS fitted values (no constant is
# added to this auxiliary regression) and take the inverse square of its
# fitted values.
w = 1 / (OLS(abs(ols.resid), ols.fittedvalues).fit().fittedvalues**2)

# Weighted fit with the same response and regressors; fitted values are stored
# next to the OLS ones.
wls = WLS(Y, X, weights=w).fit()
prices['price_actual_wls'] = wls.fittedvalues

# https://select-statistics.co.uk/calculators/sample-size-calculator-population-proportion/
# Sample size for a proportion: x is the size for an unbounded population and
# n adjusts it for the population size N.
me = 0.01  # margin of error
p = 0.05  # sample proportion
ci = 0.95  # confidence level
N = len(prices)  # population size
# z score for two tail norm dist
z = st.norm.ppf((1 + ci) / 2)
x = (z**2) * p * (1 - p) / (me**2)
n = N * x / (x + N - 1)
n = math.ceil(n)  # final sample size

# take sample for plotting chart only to save resources
# regression modelling still using the population data
Example #14
0
# Plot the fitted variance function together with the per-cluster variances.
plt.clf()
xx = np.linspace(x.min(), x.max(), 25)
plt.plot(xx, var_est(xx), 'b', label="Estimated variance (quadratic fit)")
plt.plot(cluster_stats[:, 0], cluster_stats[:, 1], 'ro',
                                        label="Cluster variances")
plt.xlabel('x')
plt.ylabel('$s^2$', fontsize=18)
plt.legend(loc='best', numpoints=1)

# The weights for weighted least squares are 1/(s^2).  If we use x_centers
# when doing the quadratic fit of var_est (see comment above), this array
# should match the third column of table 9.1 in Draper and Smith.
weights = 1.0 / var_est(x)

# Use the WLS class from statsmodels to do the weighted least squares fit:
wls_result = WLS(y, X, weights=weights).fit()
# Parenthesized so the line is valid under both Python 2 and Python 3.
print(wls_result.summary())

# The next set of plotting commands recreate Fig. 9.2.
plt.figure(3)
plt.clf()
plt.subplot(2, 1, 1)
plt.plot(np.sqrt(weights) * wls_result.fittedvalues, wls_result.wresid, 'bo')
plt.title("WLS Residuals versus weighted fitted values")
# Raw strings: "\s" is not a valid Python escape sequence.
plt.xlabel(r'$\sqrt{w} y$', fontsize=18)
plt.ylabel('e')
plt.grid()
plt.subplot(2, 1, 2)
plt.plot(np.sqrt(weights) * x, wls_result.wresid, 'bo')
plt.title("WLS Residuals versus weighted x")
plt.xlabel(r"$\sqrt{w} x$", fontsize=18)