    def _check_optimality_conditions(self,
                                     model_params,
                                     lambdas,
                                     opt_thres=1e-2):
        # Sanity check that cvxpy solved to sufficient accuracy: the
        # (sub)gradient of the training objective at the solution should be
        # close to zero. Also useful for checking that the assumptions behind
        # the implicit derivatives hold.
        # `lambdas` must be an exploded lambda matrix.
        print("check_optimality_conditions!")

        alpha = model_params["alpha"]
        beta = model_params["beta"]
        gamma = model_params["gamma"]

        u_hat, sigma_hat, v_hat = self._get_svd_mini(gamma)

        d_square_loss = -1.0 / self.num_train * np.multiply(
            self.train_vec,
            make_column_major_flat(
                self.data.observed_matrix -
                get_matrix_completion_fitted_values(
                    self.data.row_features, self.data.col_features,
                    alpha, beta, gamma)))

        left_grad_at_opt_gamma = (u_hat.T * make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols)) +
                                  lambdas[0] * np.sign(sigma_hat) * v_hat.T)
        right_grad_at_opt_gamma = (make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols)) * v_hat +
                                   lambdas[0] * u_hat * np.sign(sigma_hat))
        left_grad_norm = np.linalg.norm(left_grad_at_opt_gamma)
        right_grad_norm = np.linalg.norm(right_grad_at_opt_gamma)
        print "grad_at_opt wrt gamma (should be zero)", left_grad_norm, right_grad_norm
        assert (left_grad_norm < opt_thres)
        assert (right_grad_norm < opt_thres)

        print "alpha", alpha
        grad_at_opt_alpha = []
        for i in range(alpha.size):
            if np.abs(alpha[i]) > self.zero_thres:
                alpha_sign = np.sign(alpha[i])
                grad_at_opt_alpha.append(
                    (d_square_loss.T * make_column_major_flat(
                        self.data.row_features[:, i] * self.onesT_row) +
                     lambdas[1] * alpha_sign + lambdas[2] * alpha[i])[0, 0])
        print "grad_at_opt wrt alpha (should be zero)", grad_at_opt_alpha
        assert (np.all(np.abs(grad_at_opt_alpha) < opt_thres))

        print "beta", beta
        grad_at_opt_beta = []
        for i in range(beta.size):
            if np.abs(beta[i]) > self.zero_thres:
                beta_sign = np.sign(beta[i])
                grad_at_opt_beta.append(
                    (d_square_loss.T * make_column_major_flat(
                        (self.data.col_features[:, i] * self.onesT_col).T) +
                     lambdas[3] * beta_sign + lambdas[4] * beta[i])[0, 0])
        print "grad_at_opt wrt beta (should be zero)", grad_at_opt_beta
        assert (np.all(np.abs(grad_at_opt_beta) < opt_thres))
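
# The checks above rely on two helpers that are not shown in this listing.
# A minimal sketch of what they are assumed to do: column-major (Fortran-order)
# flattening of a matrix into a single column, and the inverse reshape.

import numpy as np

def make_column_major_flat(m):
    # Stack the columns of m top-to-bottom into an (m.size, 1) column.
    return np.reshape(m, (m.size, 1), order="F")

def make_column_major_reshape(v, shape):
    # Rebuild the matrix column by column from the flattened vector.
    return np.reshape(v, shape, order="F")

# Round-trip self-check:
# m = np.matrix([[1., 2.], [3., 4.]])
# assert (make_column_major_reshape(make_column_major_flat(m), (2, 2)) == m).all()
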
    def _get_masked(self, obs_matrix):
        # Zero out every entry of the observed matrix that is not in the
        # training set, working in column-major-flattened form.
        masked_obs_vec = make_column_major_flat(obs_matrix)
        masked_obs_vec[self.non_train_mask_vec] = 0
        masked_obs_m = make_column_major_reshape(
            masked_obs_vec, (self.num_rows, self.num_cols))
        return masked_obs_m
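
# Usage sketch for the masking helper above (hypothetical values; the real
# `non_train_mask_vec` is a boolean vector indexing the column-major-flattened
# entries that are *not* in the training set):
#
#   obs = np.matrix([[1., 2.], [3., 4.]])
#   self.non_train_mask_vec = np.array([False, True, False, True])
#   self._get_masked(obs)   # -> matrix([[1., 2.], [0., 0.]])
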
    def _get_gamma_grad(self, d_square_loss):
        # Reshape the flattened square-loss gradient back into matrix form.
        return make_column_major_reshape(d_square_loss,
                                         (self.num_rows, self.num_cols))

    def _double_check_derivative_indepth_lambda0(self, model1, model2, model0,
                                                 eps):
        # Note: not everything will be zero if the objective is not
        # differentiable at this point.
        dalpha_dlambda = (model1["alpha"] - model2["alpha"]) / (eps * 2)
        dbeta_dlambda = (model1["beta"] - model2["beta"]) / (eps * 2)

        gamma1 = model1["gamma"]
        u1, s1, v1 = self._get_svd_mini(gamma1)
        gamma2 = model2["gamma"]
        u2, s2, v2 = self._get_svd_mini(gamma2)
        gamma0 = model0["gamma"]
        u_hat, sigma_hat, v_hat = self._get_svd_mini(gamma0)
        dU_dlambda = (u1 - u2) / (eps * 2)
        dV_dlambda = (v1 - v2) / (eps * 2)
        dSigma_dlambda = (s1 - s2) / (eps * 2)
        dgamma_dlambda = (gamma1 - gamma2) / (eps * 2)

        print "dalpha_dlambda0, %s" % (dalpha_dlambda)
        print "dBeta_dlambda0, %s" % (dbeta_dlambda)
        print "dU_dlambda0", dU_dlambda
        print "ds_dlambda0, %s" % (dSigma_dlambda)
        print "dgamma_dlambda0, %s" % (dgamma_dlambda)

        split_dgamma_dlambda = (dU_dlambda * sigma_hat * v_hat.T +
                                u_hat * dSigma_dlambda * v_hat.T +
                                u_hat * sigma_hat * dV_dlambda.T)

        # print "alpha1", model1["alpha"]
        # print 'alpha2', model2["alpha"]
        # print "eps", eps
        # print "u_hat", u_hat
        # print "u1", u1
        # print "u2", u2
        # print "v_hat", v_hat
        # print "v1", v1
        # print "v2", v2
        # print "sigma_hat", sigma_hat
        # print "s1", s1
        # print "s2", s2

        print "should be zero? dU_dlambda * u.T", u_hat.T * dU_dlambda + dU_dlambda.T * u_hat
        print "should be zero? dv_dlambda * v.T", dV_dlambda.T * v_hat + v_hat.T * dV_dlambda

        print "should be zero? dgamma_dlambda - dgamma_dlambda", split_dgamma_dlambda - dgamma_dlambda

        d_square_loss = 1.0 / self.num_train * self.train_vec_diag * make_column_major_flat(
            dgamma_dlambda +
            self.data.row_features * dalpha_dlambda * self.onesT_row +
            (self.data.col_features * dbeta_dlambda * self.onesT_col).T)

        dalpha_dlambda_imp = []
        for i in range(dalpha_dlambda.size):
            dalpha_dlambda_imp.append(
                (d_square_loss.T * make_column_major_flat(
                    self.data.row_features[:, i] * self.onesT_row) +
                 self.fmodel.current_lambdas[1] * dalpha_dlambda[i])[0, 0])
        print "should be zero? numerical plugin to the imp deriv eqn, dalpha_dlambda", dalpha_dlambda_imp

        db_dlambda_imp = []
        for i in range(dbeta_dlambda.size):
            db_dlambda_imp.append(
                (d_square_loss.T * make_column_major_flat(
                    (self.data.col_features[:, i] * self.onesT_col).T) +
                 self.fmodel.current_lambdas[1] * dbeta_dlambda[i])[0, 0])
        print "should be zero? numerical plugin to the imp deriv eqn, dbeta_dlambda_imp", db_dlambda_imp

        print "should be zero? numerical plugin to the imp deriv eqn, dgamma_dlambda", (
            u_hat.T * make_column_major_reshape(
                d_square_loss,
                (self.data.num_rows, self.data.num_cols)) * v_hat +
            np.sign(sigma_hat) + u_hat.T * self.fmodel.current_lambdas[0] *
            dU_dlambda * np.sign(sigma_hat) + self.fmodel.current_lambdas[0] *
            np.sign(sigma_hat) * dV_dlambda.T * v_hat)
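
# The method above validates the implicit derivatives numerically: fit the
# model at lambda + eps and lambda - eps (model1 and model2), then compare the
# central difference (theta1 - theta2) / (2 * eps) against the implicit
# derivative. A minimal standalone sketch of the same idea on a toy function
# (the function here is illustrative, not part of the original code):

import numpy as np

def central_difference(f, x, eps=1e-6):
    # O(eps^2)-accurate numerical estimate of df/dx at x.
    return (f(x + eps) - f(x - eps)) / (2 * eps)

# Example: d/dx sin(x) at x = 0.3 should be cos(0.3).
assert abs(central_difference(np.sin, 0.3) - np.cos(0.3)) < 1e-8
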
    def _get_dmodel_dlambda(
        self,
        lambda_idx,
        imp_derivs,
        alpha,
        beta,
        gamma,
        row_features,
        col_features,
        u_hat,
        sigma_hat,
        v_hat,
        lambdas,
    ):
        # TODO: make this the same as the versions below. This one is wrong
        # right now and will crash; solve with an objective function, not
        # hard constraints.
        d_square_loss = self._get_d_square_loss(alpha, beta, gamma,
                                                row_features, col_features)
        d_square_loss_reshape = make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols))
        dd_square_loss = self._get_dd_square_loss(imp_derivs, row_features,
                                                  col_features)
        dd_square_loss_reshape = reshape(
            dd_square_loss,
            self.data.num_rows,
            self.data.num_cols,
        )
        sigma_mask = self._create_sigma_mask(sigma_hat)

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt gamma
        constraints_dgamma = []
        if sigma_hat.size > 0:
            dgamma_imp_deriv_dlambda = (
                imp_derivs.dU_dlambda.T * d_square_loss_reshape * v_hat +
                u_hat.T * dd_square_loss_reshape * v_hat +
                u_hat.T * d_square_loss_reshape * imp_derivs.dV_dlambda)
            if lambda_idx == 0:
                dgamma_imp_deriv_dlambda += np.sign(sigma_hat)

            constraints_dgamma = [
                sigma_mask * vec(dgamma_imp_deriv_dlambda) == np.zeros(
                    (self.data.num_rows * self.data.num_cols, 1))
            ]

        def _make_alpha_constraint(i):
            dalpha_imp_deriv_dlambda = (
                dd_square_loss.T * vec(row_features[:, i] * self.onesT_row) +
                lambdas[1] * imp_derivs.dalpha_dlambda[i])
            if lambda_idx == 1:
                dalpha_imp_deriv_dlambda += np.sign(alpha[i]) + alpha[i]
            return dalpha_imp_deriv_dlambda == 0

        def _make_beta_constraint(i):
            dbeta_imp_deriv_dlambda = (
                dd_square_loss.T * vec(
                    (col_features[:, i] * self.onesT_col).T) +
                lambdas[1] * imp_derivs.dbeta_dlambda[i])
            if lambda_idx == 1:
                dbeta_imp_deriv_dlambda += np.sign(beta[i]) + beta[i]
            return dbeta_imp_deriv_dlambda == 0

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt
        # alpha and beta, respectively
        constraints_dalpha = [
            _make_alpha_constraint(i) for i in range(alpha.size)
        ]
        constraints_dbeta = [
            _make_beta_constraint(i) for i in range(beta.size)
        ]

        return imp_derivs.solve(constraints_dgamma + constraints_dalpha +
                                constraints_dbeta)
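
# The version above imposes the differentiated optimality conditions as hard
# equality constraints, which the solver may not be able to satisfy exactly in
# floating point (hence the TODO). The variants below instead minimize the sum
# of squared residuals of the same conditions. A minimal sketch of the two
# patterns in cvxpy (hypothetical toy residual, not the original code):

import cvxpy as cp
import numpy as np

x = cp.Variable(3)
residual = np.array([[1., 2., 3.]]) @ x - 4.0

# Fragile: infeasible unless the residual can be driven exactly to zero.
hard = cp.Problem(cp.Minimize(0), [residual == 0])

# Robust: always feasible, returns the least-squares solution.
soft = cp.Problem(cp.Minimize(cp.sum_squares(residual)))
soft.solve()
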
    def _get_dmodel_dlambda(
        self,
        lambda_idx,
        imp_derivs,
        alpha,
        beta,
        gamma,
        row_features,
        col_features,
        u_hat,
        sigma_hat,
        v_hat,
        lambdas,
    ):
        # This function accepts the mini-fied model parameters (alpha, beta,
        # and u/sigma/v) and returns the gradient of the model parameters
        # with respect to lambda.
        d_square_loss = self._get_d_square_loss(alpha, beta, gamma,
                                                row_features, col_features)
        d_square_loss_reshape = make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols))
        dd_square_loss = self._get_dd_square_loss(imp_derivs, row_features,
                                                  col_features)
        dd_square_loss_reshape = reshape(
            dd_square_loss,
            self.data.num_rows,
            self.data.num_cols,
        )
        sigma_mask = self._create_sigma_mask(sigma_hat)
        obj = 0

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt gamma
        constraints_dgamma = []
        if sigma_hat.size > 0:
            # left multiply U^T and implicit derivative
            dgamma_left_imp_deriv_dlambda = (
                imp_derivs.dU_dlambda.T * d_square_loss_reshape +
                u_hat.T * dd_square_loss_reshape +
                lambdas[0] * np.sign(sigma_hat) * imp_derivs.dV_dlambda.T)

            # right multiply V and implicit derivative
            dgamma_right_imp_deriv_dlambda = (
                d_square_loss_reshape * imp_derivs.dV_dlambda +
                dd_square_loss_reshape * v_hat +
                lambdas[0] * imp_derivs.dU_dlambda * np.sign(sigma_hat))
            if lambda_idx == 0:
                dgamma_left_imp_deriv_dlambda += np.sign(sigma_hat) * v_hat.T
                dgamma_right_imp_deriv_dlambda += u_hat * np.sign(sigma_hat)

            constraints_dgamma = [
                dgamma_left_imp_deriv_dlambda == 0,
                dgamma_right_imp_deriv_dlambda == 0
            ]
            obj += sum_squares(dgamma_left_imp_deriv_dlambda) + sum_squares(
                dgamma_right_imp_deriv_dlambda)

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt
        # alpha and beta, respectively

        constraints_dalpha = []
        for i in range(alpha.size):
            dalpha_imp_deriv_dlambda = (
                dd_square_loss.T * vec(row_features[:, i] * self.onesT_row) +
                lambdas[2] * imp_derivs.dalpha_dlambda[i])
            if lambda_idx == 1:
                dalpha_imp_deriv_dlambda += np.sign(alpha[i])
            elif lambda_idx == 2:
                dalpha_imp_deriv_dlambda += alpha[i]
            constraints_dalpha.append(dalpha_imp_deriv_dlambda == 0)
            obj += sum_squares(dalpha_imp_deriv_dlambda)

        constraints_dbeta = []
        for i in range(beta.size):
            dbeta_imp_deriv_dlambda = (
                dd_square_loss.T * vec(
                    (col_features[:, i] * self.onesT_col).T) +
                lambdas[4] * imp_derivs.dbeta_dlambda[i])
            if lambda_idx == 3:
                dbeta_imp_deriv_dlambda += np.sign(beta[i])
            elif lambda_idx == 4:
                dbeta_imp_deriv_dlambda += beta[i]
            constraints_dbeta.append(dbeta_imp_deriv_dlambda == 0)
            obj += sum_squares(dbeta_imp_deriv_dlambda)

        return imp_derivs.solve(
            constraints_dgamma + constraints_dalpha + constraints_dbeta, obj)
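
# Both objective-based versions rely on the fact that, at the reduced SVD
# Gamma = U * Sigma * V.T, the matrix U * sign(Sigma) * V.T is a (sub)gradient
# of the nuclear norm. A quick numerical sanity check of that fact (a sketch,
# not part of the original code):

import numpy as np

gamma = np.random.randn(5, 4)
u, s, vt = np.linalg.svd(gamma, full_matrices=False)
subgrad = u @ np.diag(np.sign(s)) @ vt

# For a full-rank gamma, the directional finite difference of the nuclear norm
# should match the inner product <subgrad, direction> in any direction.
direction = np.random.randn(5, 4)
eps = 1e-6
fd = (np.linalg.norm(gamma + eps * direction, ord="nuc") -
      np.linalg.norm(gamma - eps * direction, ord="nuc")) / (2 * eps)
assert abs(fd - np.sum(subgrad * direction)) < 1e-5
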
    def _get_dmodel_dlambda(
        self,
        lambda_idx,
        imp_derivs,
        alphas,
        betas,
        gamma,
        row_features,
        col_features,
        u_hat,
        sigma_hat,
        v_hat,
        lambdas,
    ):
        # This function accepts the mini-fied model parameters (alpha, beta,
        # and u/sigma/v) and returns the gradient of the model parameters
        # with respect to lambda.
        num_alphas = len(alphas)
        dd_square_loss_mini = self._get_dd_square_loss_mini(
            imp_derivs, row_features, col_features)
        sigma_mask = self._create_sigma_mask(sigma_hat)
        obj = 0
        lambda_offset = 1 if sigma_hat.size > 0 else 0

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt gamma
        if sigma_hat.size > 0:
            d_square_loss = self._get_d_square_loss(alphas, betas, gamma,
                                                    row_features, col_features)
            d_square_loss_reshape = make_column_major_reshape(
                d_square_loss, (self.data.num_rows, self.data.num_cols))

            dd_square_loss = self._get_dd_square_loss(imp_derivs, row_features,
                                                      col_features)
            dd_square_loss_reshape = reshape(
                dd_square_loss,
                self.data.num_rows,
                self.data.num_cols,
            )

            # left multiply U^T and implicit derivative
            dgamma_left_imp_deriv_dlambda = (
                imp_derivs.dU_dlambda.T * d_square_loss_reshape +
                u_hat.T * dd_square_loss_reshape +
                lambdas[0] * np.sign(sigma_hat) * imp_derivs.dV_dlambda.T)

            # right multiply V and implicit derivative
            dgamma_right_imp_deriv_dlambda = (
                d_square_loss_reshape * imp_derivs.dV_dlambda +
                dd_square_loss_reshape * v_hat +
                lambdas[0] * imp_derivs.dU_dlambda * np.sign(sigma_hat))
            if lambda_idx == 0:
                dgamma_left_imp_deriv_dlambda += np.sign(sigma_hat) * v_hat.T
                dgamma_right_imp_deriv_dlambda += u_hat * np.sign(sigma_hat)

            obj += sum_squares(dgamma_left_imp_deriv_dlambda) + sum_squares(
                dgamma_right_imp_deriv_dlambda)

        # Constraint from implicit differentiation of the optimality conditions
        # that were defined by taking the gradient of the training objective wrt
        # alpha and beta, respectively

        for i, a_tuple in enumerate(
                zip(row_features, alphas, imp_derivs.dalphas_dlambda)):
            row_f, alpha, da_dlambda = a_tuple
            for j in range(alpha.size):
                dalpha_imp_deriv_dlambda = (
                    dd_square_loss_mini.T *
                    vec(row_f[:, j] * self.onesT_row)[self.data.train_idx] +
                    lambdas[1] * (da_dlambda[j] / get_norm2(alpha, power=1) -
                                  alpha[j] / get_norm2(alpha, power=3) *
                                  (alpha.T * da_dlambda)))
                if lambda_idx == 1:
                    dalpha_imp_deriv_dlambda += alpha[j] / get_norm2(alpha,
                                                                     power=1)
                obj += sum_squares(dalpha_imp_deriv_dlambda)

        for i, b_tuple in enumerate(
                zip(col_features, betas, imp_derivs.dbetas_dlambda)):
            col_f, beta, db_dlambda = b_tuple
            for j in range(beta.size):
                dbeta_imp_deriv_dlambda = (
                    dd_square_loss_mini.T * vec(
                        (col_f[:, j] * self.onesT_col).T)[self.data.train_idx]
                    + lambdas[1] * (db_dlambda[j] / get_norm2(beta, power=1) -
                                    beta[j] / get_norm2(beta, power=3) *
                                    (beta.T * db_dlambda)))
                if lambda_idx == 1:
                    dbeta_imp_deriv_dlambda += beta[j] / get_norm2(beta,
                                                                   power=1)
                obj += sum_squares(dbeta_imp_deriv_dlambda)

        return imp_derivs.solve(obj)
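
# The group-lasso terms above use the chain-rule identity
#   d/dlambda [ alpha / ||alpha||_2 ]
#       = (dalpha/dlambda) / ||alpha||_2
#         - alpha * (alpha.T @ dalpha/dlambda) / ||alpha||_2 ** 3,
# where alpha / ||alpha||_2 is the gradient of ||alpha||_2 itself. Here
# `get_norm2(x, power=p)` is assumed to return ||x||_2 ** p. A quick numerical
# check of the identity (a sketch, not part of the original code):

import numpy as np

alpha = np.random.randn(4)
dalpha = np.random.randn(4)  # stand-in for dalpha/dlambda
eps = 1e-6

def unit(a):
    return a / np.linalg.norm(a)

fd = (unit(alpha + eps * dalpha) - unit(alpha - eps * dalpha)) / (2 * eps)
norm = np.linalg.norm(alpha)
analytic = dalpha / norm - alpha * (alpha @ dalpha) / norm ** 3
assert np.allclose(fd, analytic, atol=1e-5)
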
    def _check_optimality_conditions(self,
                                     model_params,
                                     lambdas,
                                     opt_thres=1e-2):
        # Sanity check that cvxpy solved to sufficient accuracy: the
        # (sub)gradient of the training objective at the solution should be
        # close to zero. Also useful for checking that the assumptions behind
        # the implicit derivatives hold.
        # `lambdas` must be an exploded lambda matrix.
        print("check_optimality_conditions!")

        alphas = model_params["alphas"]
        betas = model_params["betas"]
        gamma = model_params["gamma"]

        u_hat, sigma_hat, v_hat = self._get_svd_mini(gamma)
        a = self.data.observed_matrix - get_matrix_completion_groups_fitted_values(
            self.data.row_features, self.data.col_features, alphas, betas,
            gamma)

        d_square_loss = self._get_d_square_loss(
            alphas,
            betas,
            gamma,
            self.data.row_features,
            self.data.col_features,
        )

        left_grad_at_opt_gamma = (u_hat.T * make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols)) +
                                  lambdas[0] * np.sign(sigma_hat) * v_hat.T)
        right_grad_at_opt_gamma = (make_column_major_reshape(
            d_square_loss, (self.data.num_rows, self.data.num_cols)) * v_hat +
                                   lambdas[0] * u_hat * np.sign(sigma_hat))
        print "left grad_at_opt wrt gamma (should be zero)", get_norm2(
            left_grad_at_opt_gamma)
        print "right grad_at_opt wrt gamma (should be zero)", get_norm2(
            right_grad_at_opt_gamma)
        # assert(get_norm2(left_grad_at_opt_gamma) < opt_thres)
        # assert(get_norm2(right_grad_at_opt_gamma) < opt_thres)

        for i, a_f_tuple in enumerate(zip(alphas, self.data.row_features)):
            alpha, row_f = a_f_tuple
            if np.linalg.norm(alpha) > 1e-5:
                grad_at_opt_alpha = []
                for j in range(alpha.size):
                    grad_at_opt_alpha.append(
                        (d_square_loss.T *
                         make_column_major_flat(row_f[:, j] * self.onesT_row) +
                         lambdas[1 + i] * alpha[j] /
                         np.linalg.norm(alpha, ord=None))[0, 0])
                print "grad_at_opt wrt alpha (should be zero)", get_norm2(
                    grad_at_opt_alpha)
                # assert(np.linalg.norm(grad_at_opt_alpha) < opt_thres)

        for i, b_f_tuple in enumerate(zip(betas, self.data.col_features)):
            beta, col_f = b_f_tuple
            if np.linalg.norm(beta) > 1e-5:
                grad_at_opt_beta = []
                for j in range(beta.size):
                    grad_at_opt_beta.append(
                        (d_square_loss.T * make_column_major_flat(
                            (col_f[:, j] * self.onesT_col).T) +
                         lambdas[1 + self.settings.num_row_groups + i] *
                         beta[j] / np.linalg.norm(beta, ord=None))[0, 0])
                print "grad_at_opt wrt beta (should be zero)", get_norm2(
                    grad_at_opt_beta)
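
# What all of these checks verify, in miniature: at a minimizer of a penalized
# loss, the gradient of the smooth part must cancel the (sub)gradient of the
# penalty. A scalar lasso sketch of the same zero-gradient test (hypothetical
# toy problem, not part of the original code):

import numpy as np

lam = 0.5
x_obs = 2.0

# minimize 0.5 * (x - x_obs) ** 2 + lam * |x|; soft-thresholding is the optimum
x_opt = np.sign(x_obs) * max(abs(x_obs) - lam, 0.0)

# optimality condition at a nonzero optimum: (x - x_obs) + lam * sign(x) == 0
if abs(x_opt) > 1e-10:
    grad_at_opt = (x_opt - x_obs) + lam * np.sign(x_opt)
    assert abs(grad_at_opt) < 1e-12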