Example #1
def get_apply_comparisons(f: Callable, data: pd.DataFrame):
    start = time.perf_counter()
    pandas_answer = data.groupby("first category")["y"].apply(f)
    pandas_1 = time.perf_counter() - start

    start = time.perf_counter()
    grouped = data.groupby("first category")["y"]
    if f is np.mean:  # let pandas dispatch to its optimized built-in mean
        for i in range(n_iters):
            grouped.mean()
    else:
        for i in range(n_iters):
            grouped.apply(f)

    pandas_100 = time.perf_counter() - start

    # Compute group means using the Groupby class
    start = time.perf_counter()
    first_category = data["first category"].values
    y = data["y"].values
    group_means = Groupby(first_category).apply(f, y, broadcast=False)
    groupby_1 = time.perf_counter() - start

    np.testing.assert_almost_equal(pandas_answer.values, group_means)

    start = time.perf_counter()
    grouped = Groupby(first_category)
    for _ in range(n_iters):
        grouped.apply(f, y, broadcast=False)

    groupby_100 = time.perf_counter() - start
    return pandas_1, pandas_100, groupby_1, groupby_100
Example #2
def estimate_factor_model(df, teacher, classroom, outcomes, covariates, school):
    grouped = Groupby(df[teacher])
    x = grouped.apply(lambda x: x - np.mean(x, 0), df[covariates].values,
                      width=len(covariates))
    alpha_hat = np.linalg.lstsq(x, df[outcomes].values, rcond=None)[0]
    residuals = df[outcomes].values - df[covariates].values.dot(alpha_hat)
    residuals -= np.mean(residuals, 0)

    residual_cols = ['residual' + str(h) for h in range(len(outcomes))]
    for h, col in enumerate(residual_cols):
        df[col] = residuals[:, h]
    class_data = df[[school, classroom, teacher] + residual_cols].groupby(classroom).mean()
    estimated_cov = get_covariance_matrix(class_data[teacher], class_data[residual_cols].values)
    return estimated_cov
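
Every example on this page leans on a custom Groupby class whose implementation is not shown. The following is a minimal sketch of what such a class could look like, reconstructed from how the examples use it (indices, first_occurrences, already_sorted, and apply with broadcast/width); the real class's return-shape conventions may differ.

import numpy as np

class Groupby:
    def __init__(self, keys):
        # Single key column assumed; np.unique gives first-occurrence indices
        # and an integer group id for every row
        keys = np.asarray(keys).ravel()
        _, self.first_occurrences, key_ids = np.unique(
            keys, return_index=True, return_inverse=True)
        self.already_sorted = bool(np.all(np.diff(key_ids) >= 0))
        order = (np.arange(len(keys)) if self.already_sorted
                 else np.argsort(key_ids, kind='mergesort'))
        # Boundaries between consecutive groups in (stably) sorted order
        bounds = np.concatenate(([0], np.nonzero(np.diff(key_ids[order]))[0] + 1,
                                 [len(keys)]))
        self.indices = [order[bounds[i]:bounds[i + 1]]
                        for i in range(len(bounds) - 1)]

    def apply(self, f, y, broadcast=True, width=1):
        y = np.asarray(y)
        if broadcast:  # one output row per observation
            out = np.empty((len(y), width))
            for idx in self.indices:
                out[idx, :] = np.reshape(f(y[idx]), (-1, width))
        else:          # one output row (or scalar) per group
            out = np.empty((len(self.indices), width))
            for i, idx in enumerate(self.indices):
                out[i, :] = f(y[idx])
            if width == 1:
                out = out[:, 0]
        return out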
Example #3
def main():
    # Build a random dataset; the base-36 string is just a fixed seed
    np.random.seed(int("hi", 36))
    n_obs = 10**4
    n_categories = 10**2

    df = pd.DataFrame({
        "first category": np.random.choice(n_categories, n_obs),
        "y": np.random.normal(0, 1, n_obs),
    })
    assert not Groupby(df["first category"]).already_sorted
    result_table = make_result_df(df)
    print(result_table)

    # Try again when already sorted
    df.sort_values("first category", inplace=True)
    assert Groupby(df["first category"]).already_sorted
    result_table = make_result_df(df)
    print(result_table)
    return
Example #4
def get_each_va(df, var_theta_hat, var_epsilon_hat, var_mu_hat, jackknife, teacher):
    grouped = Groupby(df[teacher].values)

    # Get unshrunk VA
    def f(data):
        return get_unshrunk_va(data, jackknife)

    df['unshrunk va'] = grouped.apply(f, df[['size', 'mean score']].values,
                                      broadcast=True, width=1)
    if var_mu_hat > 0:
        def f(data):
            return get_va(data, var_theta_hat, var_epsilon_hat, var_mu_hat,
                          jackknife)

        results = df.groupby(teacher)[['size', 'mean score']].apply(f).values

        if not jackknife:  # collapse to teacher level
            df = df.groupby(teacher).size().reset_index()

        df['va'], df['variance'] = zip(*results)

    return df
Example #5
def estimate_mu_variance(data, teacher):
    def f(vector):
        val = 0
        for i in range(1, len(vector)):
            val += vector[i:].T.dot(vector[:-i])

        return np.array([val, len(vector) * (len(vector) - 1) / 2])

    # First column is sum of all products, by teacher; second is number of products, by teacher
    mu_estimates = Groupby(data[teacher].values).apply(f, data['mean score'].values,
                                                       width=2)
    return np.sum(mu_estimates[:, 0]) / np.sum(mu_estimates[:, 1])
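
The Python loop inside f is O(n^2) per group. The same quantity has a closed form, since the sum over distinct pairs satisfies sum_{i<j} v_i v_j = ((sum_i v_i)^2 - sum_i v_i^2) / 2; a drop-in replacement with the same (sum, count) output convention:

import numpy as np

def f_vectorized(vector):
    # ((sum v)^2 - sum v^2) / 2 equals the sum of products over distinct pairs
    val = (np.sum(vector)**2 - np.sum(vector**2)) / 2
    return np.array([val, len(vector) * (len(vector) - 1) / 2])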
Example #6
def get_covariance_matrix(group_key, residuals):
    grouped = Groupby(group_key)
    h = residuals.shape[1]
    estimated_cov = np.zeros((h, h))
    mean_out = np.zeros(h)
    n = 0

    # Each group with at least two observations contributes the cross-product
    # of its first two residual rows, so no residual is multiplied by itself
    for idx in grouped.indices:
        if len(idx) > 1:
            estimated_cov += residuals[idx[0], :, None].dot(residuals[idx[1], :, None].T)
            mean_out += np.sum(residuals[idx[:2], :], 0)
            n += 1

    mean_out = mean_out[:, None]
    return (estimated_cov - mean_out.dot(mean_out.T) / (2 * n)) / n
Example #7
    def test_groupby(self):
        ids = np.array([1, 1, 1, 0, 0])
        y = np.array([1, 2, 3, 4, 7])
        grouped_1 = Groupby(ids)
        means_1 = grouped_1.apply(np.mean, y)
        self.assertFalse(grouped_1.already_sorted)
        grouped_2 = Groupby(ids[::-1])
        means_2 = grouped_2.apply(np.mean, y[::-1])
        self.assertTrue(grouped_2.already_sorted)
        self.assertTrue((means_1 == means_2[::-1]).all())
Example #8
df = pd.DataFrame({'first category': first_category, 'y': y})

start = time.perf_counter()
pandas_answer = df.groupby('first category')['y'].apply(get_group_mean)
print('time to compute group means once with Pandas: {0}'.format(
    round(time.perf_counter() - start, n_decimals)))

start = time.perf_counter()
grouped = df.groupby('first category')['y']
for i in range(n_iters):
    grouped.apply(get_group_mean)
print('time to compute group means {0} times with Pandas: {1}'.format(
    n_iters, round(time.perf_counter() - start, n_decimals)))

# Compute group means using the Groupby class
start = time.perf_counter()
group_means = Groupby(first_category).apply(np.mean, y)
print('time to compute group means once with Groupby: {0}'.format(
    round(time.perf_counter() - start, n_decimals)))

start = time.perf_counter()
grouped = Groupby(first_category)
for _ in range(n_iters):
    grouped.apply(np.mean, y)

print('time to compute group means {0} times with Groupby: {1}'.format(
    n_iters, round(time.perf_counter() - start, n_decimals)))

# Sanity check: the difference between the two answers should be ~zero
print(np.hstack(pandas_answer.values) - group_means)
Example #10
class MLE:
    def __init__(self, data: pd.DataFrame, outcome: str, teacher: str,
                 dense_controls: np.ndarray, categorical_controls: list,
                 jackknife: bool, class_level_vars: list, moments_only: bool):
        """

        :param data:
        :param outcome:
        :param teacher:
        :param dense_controls:
        :param categorical_controls:
        :param jackknife:
        :param class_level_vars:
        :param moments_only:
        """
        if not moments_only and jackknife:
            raise NotImplementedError('jackknife must be false')
        assert len(class_level_vars) == 1
        # Set x
        if categorical_controls is None:
            x = dense_controls
        elif len(categorical_controls) == 1:
            x = np.hstack((dense_controls,
                           make_dummies(data[categorical_controls[0]],
                                        True).A))
        else:
            x = sps.hstack([
                make_dummies(data[elt], True) for elt in categorical_controls
            ]).A
            if dense_controls is not None:
                x = np.hstack((dense_controls, x))
        # Make sure everything varies within teacher, so beta is identified
        x_with_teacher_dummies = np.hstack((make_dummies(data[teacher],
                                                         False).A, x))
        collinear_cols, not_collinear_cols = find_collinear_cols(
            x_with_teacher_dummies, .01)
        if len(collinear_cols) > 0:
            print('Found', len(collinear_cols), 'collinear columns in x')
            # Drop collinear columns, then strip off the leading teacher dummies
            n_teacher_dummies = len(set(data[teacher]))
            x = x_with_teacher_dummies[:, not_collinear_cols][:, n_teacher_dummies:]
        self.moments_only = moments_only

        y = data[outcome].values

        # Make Groupby objects
        class_grouped = Groupby(data[class_level_vars].values)
        assert class_grouped.already_sorted
        self.n_students_per_class = class_grouped.apply(len,
                                                        y,
                                                        broadcast=False)
        self.n_students = len(y)
        self.y_tilde = class_grouped.apply(lambda vec: vec - np.mean(vec), y)
        self.x_tilde = class_grouped.apply(lambda arr: arr - np.mean(arr, 0),
                                           x,
                                           width=x.shape[1])
        self.x_bar = class_grouped.apply(lambda arr: np.mean(arr, 0),
                                         x,
                                         broadcast=False,
                                         width=x.shape[1])
        del x
        self.y_bar = class_grouped.apply(np.mean, y, broadcast=False)

        teachers = data[teacher].values[class_grouped.first_occurrences]
        self.sigma_mu_squared, self.sigma_theta_squared, self.sigma_epsilon_squared = \
            np.ones(3) * np.var(data[outcome]) / 6
        del data
        self.teacher_grouped = Groupby(teachers)
        assert self.teacher_grouped.already_sorted
        self.n_teachers = len(self.teacher_grouped.first_occurrences)
        x_bar_bar = self.teacher_grouped.apply(lambda arr: np.mean(arr, 0),
                                               self.x_bar,
                                               width=self.x_bar.shape[1],
                                               broadcast=False)
        x_bar_bar = np.hstack((np.ones((self.n_teachers, 1)), x_bar_bar))
        collin_x_bar_bar, not_collin_x_bar_bar = find_collinear_cols(x_bar_bar)
        if len(collin_x_bar_bar) > 0:
            print('Found', len(collin_x_bar_bar),
                  'collinear columns in x bar bar')
            self.x_tilde = self.x_tilde[:, not_collin_x_bar_bar[1:] - 1]
            self.x_bar = self.x_bar[:, not_collin_x_bar_bar[1:] - 1]
        self.xx_tilde = self.x_tilde.T.dot(self.x_tilde)
        self.xy_tilde = self.x_tilde.T.dot(self.y_tilde)[:, 0]
        assert self.xy_tilde.ndim == 1
        self.n_classes = len(class_grouped.first_occurrences)
        self.n_students = self.x_tilde.shape[0]

        self.beta = np.zeros(self.x_bar.shape[1])
        self.lambda_ = np.zeros(self.x_bar.shape[1])
        self.alpha = 0
        self.h, self.h_sum = None, None
        self.y_bar_tilde, self.x_bar_tilde = None, None
        self.x_bar_bar_long, self.y_bar_bar_long = None, None
        self.predictable_var, self.total_var = None, None
        self.individual_scores = None
        n_params = 2 * self.x_bar.shape[1] + 4
        self.asymp_var = np.full((n_params, n_params), np.nan)
        self.hessian = self.asymp_var.copy()
        self.bias_correction, self.total_var, self.total_var_se = np.nan, np.nan, np.nan
        self.sigma_mu_squared_se = np.nan

    def get_ll_grad(self,
                    log_sigma_mu_squared: float = None,
                    log_sigma_theta_squared=None,
                    log_sigma_epsilon_squared=None,
                    beta=None,
                    lambda_=None,
                    alpha=None,
                    get_grad=False,
                    variances_only=False):

        if beta is None:
            beta = self.beta
        if lambda_ is None:
            lambda_ = self.lambda_
        if alpha is None:
            alpha = self.alpha

        change_variances = (log_sigma_mu_squared is not None
                            or log_sigma_theta_squared is not None
                            or log_sigma_epsilon_squared is not None)
        if log_sigma_mu_squared is None:
            log_sigma_mu_squared = np.log(self.sigma_mu_squared)
            sigma_mu_squared = self.sigma_mu_squared
        else:
            sigma_mu_squared = np.exp(log_sigma_mu_squared)

        if log_sigma_theta_squared is None:
            sigma_theta_squared = self.sigma_theta_squared
        else:
            sigma_theta_squared = np.exp(log_sigma_theta_squared)

        if log_sigma_epsilon_squared is None:
            log_sigma_epsilon_squared = np.log(self.sigma_epsilon_squared)
            sigma_epsilon_squared = self.sigma_epsilon_squared
        else:
            sigma_epsilon_squared = np.exp(log_sigma_epsilon_squared)

        if change_variances:
            h = 1 / (sigma_theta_squared +
                     sigma_epsilon_squared / self.n_students_per_class)
        else:
            h = self.h
        assert isinstance(h, np.ndarray)
        assert np.all(h > 0)
        if change_variances:
            h_sum_long = self.teacher_grouped.apply(np.sum, h)[:, 0]
            assert np.min(h_sum_long) >= np.min(h)
            precision_weights = h / h_sum_long
            y_bar_bar_long = self.teacher_grouped.apply(
                np.sum, precision_weights * self.y_bar)[:, 0]

            x_bar_bar_long = self.teacher_grouped.apply(
                lambda x: np.sum(x, 0),
                precision_weights[:, None] * self.x_bar,
                width=self.x_bar.shape[1])

            y_bar_tilde = self.y_bar - y_bar_bar_long
            x_bar_tilde = self.x_bar - x_bar_bar_long

            h_sum = h_sum_long[self.teacher_grouped.first_occurrences]
            assert np.min(h_sum) == np.min(h_sum_long)
        else:
            y_bar_tilde = self.y_bar_tilde
            x_bar_tilde = self.x_bar_tilde
            y_bar_bar_long = self.y_bar_bar_long
            x_bar_bar_long = self.x_bar_bar_long
            h_sum = self.h_sum

        assert isinstance(h_sum, np.ndarray)
        assert np.all(h_sum > 0)

        y_bar_bar = y_bar_bar_long[self.teacher_grouped.first_occurrences]
        x_bar_bar = x_bar_bar_long[self.teacher_grouped.first_occurrences, :]

        # Done with setup
        one_over_h_sum = 1 / h_sum
        bar_bar_err = y_bar_bar - x_bar_bar.dot(beta + lambda_) - alpha

        ll = (self.n_classes - self.n_students) * log_sigma_epsilon_squared \
            + np.sum(np.log(h)) - np.sum(np.log(h_sum)) \
            - np.sum(np.log(sigma_mu_squared + one_over_h_sum)) \
            - np.sum((self.y_tilde[:, 0] - self.x_tilde.dot(beta))**2) / sigma_epsilon_squared \
            - h.dot((y_bar_tilde - x_bar_tilde.dot(beta))**2) \
            - np.dot(bar_bar_err**2, 1 / (sigma_mu_squared + one_over_h_sum))
        ll /= -2
        assert np.isfinite(ll)
        if not get_grad:
            return ll

        gradient = [None, None, None]
        # Gradient for log sigma mu squared
        tmp = sigma_mu_squared + 1 / h_sum
        grad_s_mu = np.dot(bar_bar_err**2, 1 / tmp**2) - np.sum(1 / tmp)
        grad_s_mu *= -sigma_mu_squared / 2
        gradient[0] = grad_s_mu

        # Get gradient for log sigma theta squared
        first = 1 / h - (y_bar_tilde - x_bar_tilde.dot(beta))**2
        second = -one_over_h_sum + one_over_h_sum**2 / (sigma_mu_squared + one_over_h_sum) \
                 - (one_over_h_sum / (sigma_mu_squared + one_over_h_sum))**2 * bar_bar_err**2

        d_h_d_log_t = -h**2 * sigma_theta_squared
        d_h_sum = self.teacher_grouped.apply(np.sum,
                                             d_h_d_log_t,
                                             broadcast=False)

        grad_s_theta = d_h_d_log_t.dot(first)
        grad_s_theta += d_h_sum.dot(second)
        grad_s_theta /= -2
        gradient[1] = grad_s_theta

        # Get gradient for log sigma epsilon squared
        d_h_d_log_e = -h**2 * sigma_epsilon_squared / self.n_students_per_class
        d_e_sum = self.teacher_grouped.apply(np.sum,
                                             d_h_d_log_e,
                                             broadcast=False)

        grad_s_eps = self.n_classes - self.n_students + d_h_d_log_e.dot(first)
        grad_s_eps += d_e_sum.dot(second)
        grad_s_eps += np.sum(
            (self.y_tilde[:, 0] - self.x_tilde.dot(beta))**2) / sigma_epsilon_squared

        gradient[2] = grad_s_eps / -2

        if variances_only:
            return ll, np.array(gradient)
        grad_beta = -self.x_tilde.T.dot(self.y_tilde[:, 0] - self.x_tilde.dot(beta)) / sigma_epsilon_squared \
                    - (h[:, None] * x_bar_tilde).T.dot(y_bar_tilde - x_bar_tilde.dot(beta))

        w = 1 / (np.exp(log_sigma_mu_squared) + 1 / h_sum[:, None])
        assert isinstance(w, np.ndarray)
        assert isinstance(bar_bar_err, np.ndarray)
        grad_lambda = -(w * x_bar_bar).T.dot(bar_bar_err)
        grad_alpha = -bar_bar_err.dot(w)

        gradient = np.concatenate(
            (gradient, grad_beta, grad_lambda, grad_alpha))

        return ll, gradient

    def update_variances(self):
        def get_ll_helper(params: np.ndarray):
            return self.get_ll_grad(*params, get_grad=False)

        def get_grad_helper(params: np.ndarray):
            _, grad = self.get_ll_grad(*params,
                                       get_grad=True,
                                       variances_only=True)
            return grad

        bounds = [-np.inf, np.log(np.var(self.y_tilde))]
        # bounds = np.var(self.y_tilde) * 1e-7, np.var(self.y_tilde)

        result = minimize(get_ll_helper,
                          np.log(
                              np.array([
                                  self.sigma_mu_squared,
                                  self.sigma_theta_squared,
                                  self.sigma_epsilon_squared
                              ])),
                          jac=get_grad_helper,
                          method='L-BFGS-B',
                          bounds=[bounds, bounds, bounds],
                          options={
                              'disp': False,
                              'ftol': 1e-14,
                              'gtol': 1e-7
                          })
        self.sigma_mu_squared, self.sigma_theta_squared, self.sigma_epsilon_squared = np.exp(
            result['x'])
        return

    def update_coefficients(self):
        """
        Resets beta, lambda, alpha, precisions, and weighted means; keeps variances constant.
        :return:
        """
        self.h = 1 / (self.sigma_theta_squared +
                      self.sigma_epsilon_squared / self.n_students_per_class)
        h_sum_long = self.teacher_grouped.apply(np.sum, self.h)[:, 0]
        self.h_sum = h_sum_long[self.teacher_grouped.first_occurrences]
        # For beta
        precision_weights = self.h / h_sum_long
        self.y_bar_bar_long = self.teacher_grouped.apply(
            np.sum, precision_weights * self.y_bar)[:, 0]
        self.x_bar_bar_long = self.teacher_grouped.apply(
            lambda x: np.sum(x, 0),
            precision_weights[:, None] * self.x_bar,
            width=self.x_bar.shape[1])
        self.y_bar_tilde = self.y_bar - self.y_bar_bar_long
        self.x_bar_tilde = self.x_bar - self.x_bar_bar_long
        x_mat = self.xx_tilde / self.sigma_epsilon_squared + self.x_bar_tilde.T.dot(
            self.x_bar_tilde * self.h[:, None])
        y_mat = self.xy_tilde / self.sigma_epsilon_squared + self.x_bar_tilde.T.dot(
            self.y_bar_tilde * self.h)
        self.beta = np.linalg.solve(x_mat, y_mat)
        # Now get beta + lambda
        y_bar_bar = self.y_bar_bar_long[self.teacher_grouped.first_occurrences]
        teacher_precision_sums = h_sum_long[
            self.teacher_grouped.first_occurrences]
        sqrt_weights = 1 / np.sqrt(1 / teacher_precision_sums +
                                   self.sigma_mu_squared)
        assert np.all(np.isfinite(sqrt_weights))
        x_bar_bar = self.x_bar_bar_long[
            self.teacher_grouped.first_occurrences, :]
        y_w = (y_bar_bar - x_bar_bar.dot(self.beta)) * sqrt_weights
        stacked = np.hstack((np.ones((self.n_teachers, 1)), x_bar_bar))
        x_w = stacked * sqrt_weights[:, None]
        lambda_, _, rank, _ = np.linalg.lstsq(x_w, y_w, rcond=None)
        if rank != x_w.shape[1]:
            warnings.warn('x_w is rank deficient')
        self.alpha = lambda_[0]
        self.lambda_ = lambda_[1:]
        return

    def get_hess(self, epsilon: float):
        k = len(self.beta)
        # Rows 0-2: variance parameters; rows 3..k+2: beta; rows k+3..2k+2:
        # lambda; last row: alpha
        hessian = np.zeros((2 * k + 4, 2 * k + 4))

        ll_up, upper = self.get_ll_grad(np.log(self.sigma_mu_squared) +
                                        epsilon,
                                        get_grad=True)
        ll_lo, lower = self.get_ll_grad(np.log(self.sigma_mu_squared) -
                                        epsilon,
                                        get_grad=True)
        hessian[0, :] = (upper - lower) / (2 * epsilon)

        ll_up, upper = self.get_ll_grad(
            log_sigma_theta_squared=np.log(self.sigma_theta_squared) + epsilon,
            get_grad=True)
        ll_lo, lower = self.get_ll_grad(
            log_sigma_theta_squared=np.log(self.sigma_theta_squared) - epsilon,
            get_grad=True)
        hessian[1, :] = (upper - lower) / (2 * epsilon)

        ll_up, upper = self.get_ll_grad(
            log_sigma_epsilon_squared=np.log(self.sigma_epsilon_squared) +
            epsilon,
            get_grad=True)
        ll_lo, lower = self.get_ll_grad(
            log_sigma_epsilon_squared=np.log(self.sigma_epsilon_squared) -
            epsilon,
            get_grad=True)
        hessian[2, :] = (upper - lower) / (2 * epsilon)

        eye = np.eye(len(self.beta))

        for i in range(k):
            ll_up, upper = self.get_ll_grad(beta=self.beta + epsilon * eye[i],
                                            get_grad=True)
            ll_lo, lower = self.get_ll_grad(beta=self.beta - epsilon * eye[i],
                                            get_grad=True)
            # beta rows start at 3, after the three variance parameters;
            # writing row 2 + i would overwrite the epsilon row
            hessian[3 + i, :] = (upper - lower) / (2 * epsilon)

        for i in range(k):
            ll_up, upper = self.get_ll_grad(lambda_=self.lambda_ +
                                            epsilon * eye[i],
                                            get_grad=True)
            ll_lo, lower = self.get_ll_grad(lambda_=self.lambda_ -
                                            epsilon * eye[i],
                                            get_grad=True)
            hessian[3 + k + i, :] = (upper - lower) / (2 * epsilon)

        # alpha
        ll_up, upper = self.get_ll_grad(alpha=self.alpha + epsilon,
                                        get_grad=True)
        ll_lo, lower = self.get_ll_grad(alpha=self.alpha - epsilon,
                                        get_grad=True)
        hessian[-1, :] = (upper - lower) / (2 * epsilon)

        hessian += hessian.T
        hessian /= 2
        return hessian
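
get_hess above differentiates the analytic gradient by central differences, one parameter at a time, then symmetrizes. The same idea in generic form (a sketch; grad_fn and theta are placeholder names, not from the source):

import numpy as np

def numerical_hessian(grad_fn, theta, epsilon=1e-6):
    # Row i is the central difference of the gradient along coordinate i
    k = len(theta)
    hess = np.zeros((k, k))
    for i in range(k):
        step = np.zeros(k)
        step[i] = epsilon
        hess[i, :] = (grad_fn(theta + step) - grad_fn(theta - step)) / (2 * epsilon)
    return (hess + hess.T) / 2  # symmetrize, as get_hess does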

    def fit(self):
        max_diff = 10
        i = 0
        while abs(max_diff) > 1e-7 and i < 30:
            beta_old = self.beta.copy()
            lambda_old = self.lambda_.copy()
            sigma_mu_squared_old = self.sigma_mu_squared
            sigma_theta_squared_old = self.sigma_theta_squared
            sigma_epsilon_squared_old = self.sigma_epsilon_squared
            self.update_coefficients()
            self.update_variances()
            differences = np.array([
                np.max(np.abs(self.beta - beta_old)),
                np.max(np.abs(self.lambda_ - lambda_old)),
                abs(self.sigma_mu_squared - sigma_mu_squared_old),
                abs(self.sigma_theta_squared - sigma_theta_squared_old),
                abs(self.sigma_epsilon_squared - sigma_epsilon_squared_old)
            ])
            max_diff = np.max(differences)
            i += 1
        print('Number of iterations', i)
        print('variances', self.sigma_mu_squared, self.sigma_theta_squared,
              self.sigma_epsilon_squared)
        self.update_coefficients()
        self.hessian = self.get_hess(1e-6)
        print('Number of zeros in hessian', np.sum(self.hessian == 0))
        is_hessian_invertible = True
        try:
            self.asymp_var = np.linalg.inv(self.hessian)
            if np.any(np.diag(self.asymp_var) <= 0):
                warnings.warn(
                    'Some variables will have negative or zero variance')
        except np.linalg.LinAlgError:
            warnings.warn('Hessian was not invertible')
            is_hessian_invertible = False

        lambda_idx = slice(-1 - len(self.beta), -1)
        x_bar_bar = self.x_bar_bar_long[self.teacher_grouped.first_occurrences]
        self.predictable_var = np.var(x_bar_bar.dot(self.lambda_))
        x_bar_bar_demeaned = x_bar_bar - np.mean(x_bar_bar, 0)

        if is_hessian_invertible:
            var_lambda = self.asymp_var[lambda_idx, lambda_idx]
            try:
                self.bias_correction = np.sum(x_bar_bar_demeaned.dot(np.linalg.cholesky(var_lambda))**2)\
                    / (self.n_teachers - 1)
                print('bias correction', self.bias_correction)
            except np.linalg.LinAlgError:
                warnings.warn('Hessian was not positive definite')
                self.bias_correction = np.sum([
                    row.T.dot(var_lambda).dot(row)
                    for row in x_bar_bar_demeaned
                ]) / (self.n_teachers - 1)

        if is_hessian_invertible:
            self.total_var = self.predictable_var - self.bias_correction + self.sigma_mu_squared
            # Delta method
            grad = np.zeros(self.asymp_var.shape[0])
            grad[0] = self.sigma_mu_squared
            grad[lambda_idx] = 2 * np.mean(
                x_bar_bar.dot(self.lambda_)[:, None] * x_bar_bar_demeaned, 0)
            self.total_var_se = np.sqrt(grad.T.dot(self.asymp_var).dot(grad))
            self.sigma_mu_squared_se = np.sqrt(
                self.asymp_var[0, 0]) * self.sigma_mu_squared

        if not self.moments_only:
            rho = self.sigma_mu_squared / (self.sigma_mu_squared +
                                           1 / self.h_sum)
            residual = self.y_bar_bar_long[self.teacher_grouped.first_occurrences] - \
                x_bar_bar.dot(self.beta) - self.alpha
            predicted = x_bar_bar.dot(self.lambda_)
            self.individual_scores = (1 - rho) * residual + rho * predicted
        return self
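
For orientation, a hypothetical way to drive this class; the column names and controls here are invented for illustration and do not come from the source:

# df is assumed to be sorted so classes and teachers form contiguous blocks,
# otherwise the already_sorted asserts in __init__ fail
model = MLE(data=df, outcome='score', teacher='teacher_id',
            dense_controls=df[['age']].values, categorical_controls=None,
            jackknife=False, class_level_vars=['class_id'], moments_only=True)
model.fit()
print(model.sigma_mu_squared, model.total_var, model.total_var_se)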
Example #11
def moment_matching_alg(data, outcome, teacher, dense_controls, class_level_vars,
                        categorical_controls, jackknife, moments_only, method):

    # If method is 'ks', just ignore teachers when residualizing
    if method == 'ks':
        beta, x, residual = estimate(data, data[outcome].values, dense_controls, 
                                     categorical_controls, get_residual=True,
                                     check_rank=True)
    # Residualize with fixed effects
    else:
        n_teachers = len(set(data[teacher]))
        cat = ([teacher] if categorical_controls is None
               else [teacher] + categorical_controls)
        beta, x = estimate(data, data[outcome].values, dense_controls, cat,
                           check_rank=True)
        # Keep teacher effects in the residual by subtracting only the
        # non-teacher part of the fit below
        try:
            x = x.A  # densify if estimate returned a scipy sparse matrix
        except AttributeError:
            pass
        residual = data[outcome].values - x[:, n_teachers:].dot(beta[n_teachers:])
        residual -= np.mean(residual)
        
    assert np.all(np.isfinite(residual))
    assert len(residual) == len(data)
    data['residual'] = residual
    residual_var = np.var(residual)  # total residual variance, decomposed below

    # Collapse data to class level
    # Count number of students in class

    class_df = data.groupby(class_level_vars).size().reset_index()
    class_df.columns = class_level_vars + ['size']

    # Calculate mean and merge it back into class-level data
    class_df.loc[:, 'mean score'] = \
        data.groupby(class_level_vars)['residual'].mean().values
    class_df.loc[:, 'var'] = \
        data.groupby(class_level_vars)['residual'].var().values
    assert len(class_df) > 0
    
    if jackknife:  # Drop teachers teaching only one class
        keeps = Groupby(class_df[teacher]).apply(lambda elt: len(elt) > 1,
                                                 class_df[teacher]).astype(bool)
        class_df = class_df.loc[keeps, :].reset_index(drop=True)

    # Second, calculate a bunch of moments
    var_epsilon_hat = estimate_var_epsilon(class_df)
    var_mu_hat = estimate_mu_variance(class_df, teacher)

    # Estimate variance of class-level shocks
    var_theta_hat = residual_var - var_mu_hat - var_epsilon_hat
    if var_theta_hat < 0:
        warnings.warn('Var theta hat is negative. Measured to be ' +
                      str(var_theta_hat))
        var_theta_hat = 0
        
    if var_mu_hat <= 0:
        warnings.warn('Var mu hat is non-positive. Measured to be ' + str(var_mu_hat))
    if moments_only:
        return {'sigma mu squared': var_mu_hat, 
                'sigma theta squared': var_theta_hat, 
                'sigma epsilon squared': var_epsilon_hat}

    results = get_each_va(class_df, var_theta_hat, var_epsilon_hat,
                          var_mu_hat, jackknife, teacher)

    return {'individual effects': results, 'sigma mu squared': var_mu_hat, 
            'sigma theta squared': var_theta_hat, 
            'sigma epsilon squared': var_epsilon_hat}

def group_mode(x1, x2):
    if np.isscalar(x1) and np.isscalar(x2):
        return x2
    x1, x2 = shape_wrapper(x1, x2)
    return Groupby(x1).apply(lambda x: mode(x).mode[0], x2, broadcast=True)


def group_max(x1, x2):
    if np.isscalar(x1) and np.isscalar(x2):
        return x2
    x1, x2 = shape_wrapper(x1, x2)
    return Groupby(x1).apply(np.max, x2, broadcast=True)


def group_count(x1):
    if np.isscalar(x1):
        return x1
    return Groupby(x1).apply(len, x1, broadcast=True)
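
A quick illustration of the helpers above, with made-up inputs (exact output shapes depend on the Groupby implementation):

import numpy as np

keys = np.array([0, 0, 1])
vals = np.array([3.0, 5.0, 2.0])
print(group_max(keys, vals))   # each group's max, broadcast back to rows: 5, 5, 2
print(group_count(keys))       # group sizes broadcast back to rows: 2, 2, 1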