def _objective(self, x0, cov_mtx, loadings):
        """
        Evaluate the objective function for one candidate solution.

        Parameters
        ----------
        x0 : array-like
            The combined (X, 1) parameter array handed over by
            `minimize()`; it is split back into loadings, error
            variances, factor variances and factor covariances.
        cov_mtx : array-like
            The covariance matrix from the original data set.
        loadings : array-like
            The loadings matrix from the model parser. Every
            position where it equals zero is treated as fixed,
            and the matching estimated loading is reset to zero.

        Returns
        -------
        error : float
            The value of the objective function; a negative
            value is replaced by 0.0.
        """
        model = self.model
        (est_loadings,
         est_error_vars,
         est_factor_vars,
         est_factor_covs) = self._split(x0,
                                        model.n_factors,
                                        model.n_variables,
                                        model.n_lower_diag)

        # loadings that the specification fixes at zero must stay at zero
        fixed_idxs = np.where(loadings == 0)
        est_loadings[fixed_idxs] = 0

        # one matrix holding both the factor variances and covariances
        factor_varcov = merge_variance_covariance(est_factor_vars, est_factor_covs)

        # the error variances, expressed as a variance-covariance matrix
        error_varcov = merge_variance_covariance(est_error_vars)

        # rescale the factor variance-covariance matrix into correlations
        with np.errstate(all='ignore'):
            factor_corr = covariance_to_correlation(factor_varcov)

        # sigma-theta is the matrix the objective function is built on
        implied = est_loadings.dot(factor_corr)
        implied = implied.dot(est_loadings.T)
        sigma_theta = implied + error_varcov

        with np.errstate(all='ignore'):
            n_obs = self.n_obs
            constant = (-n_obs * model.n_variables / 2) * np.log(2 * np.pi)
            log_det = np.log(np.linalg.det(sigma_theta))
            trace = np.trace(cov_mtx.dot(np.linalg.inv(sigma_theta)))
            error = -(constant - (n_obs / 2) * (log_det + trace))

            # never hand a negative error back to the optimizer;
            # this is not done for the Bollen approach
            if error < 0.0:
                error = 0.0

        return error
    def __init__(self,
                 loadings,
                 n_factors,
                 n_variables,
                 factor_names=None,
                 variable_names=None):
        """
        Store the model description and derive the free-parameter indexes.

        Parameters
        ----------
        loadings : numpy array
            The loadings matrix; it must have `n_variables`
            rows and `n_factors` columns.
        n_factors : int
            The number of factors.
        n_variables : int
            The number of variables.
        factor_names : optional
            Stored as given. Defaults to None.
        variable_names : optional
            Stored as given. Defaults to None.

        Raises
        ------
        AssertionError
            If `loadings` is not a numpy array, or if its first
            two dimensions are not (n_variables, n_factors).
        """
        # validate the loadings matrix before anything is stored
        assert isinstance(loadings, np.ndarray)
        assert loadings.shape[0] == n_variables
        assert loadings.shape[1] == n_factors

        # the names are kept exactly as they were passed in
        self._variable_names = variable_names
        self._factor_names = factor_names

        # the dimensions and the loadings themselves
        self._n_variables = n_variables
        self._n_factors = n_factors
        self._loadings = loadings

        # how many symmetric lower indexes (called with diag=False) exist
        lower_idxs = get_symmetric_lower_idxs(n_factors, False)
        self._n_lower_diag = lower_idxs.shape[0]

        # NaN-filled placeholders for the error variances and factor covariances
        self._error_vars = np.full((n_variables, 1), np.nan)
        self._factor_covs = np.full((n_factors, n_factors), np.nan)

        # free-parameter indexes for the loadings, the error
        # variances (after merging them into a matrix), and
        # the factor covariances
        self._loadings_free = get_free_parameter_idxs(loadings, eq=1)
        error_varcov = merge_variance_covariance(self._error_vars)
        self._error_vars_free = get_free_parameter_idxs(error_varcov, eq=-1)
        self._factor_covs_free = get_symmetric_lower_idxs(n_factors, False)
def test_merge_variance_covariance_no_covariance():
    """Merging four unit variances alone must give the 4 x 4 identity."""
    variances = np.array([1, 1, 1, 1])

    merged = merge_variance_covariance(variances)

    assert_array_equal(merged, np.eye(4))
# Example #4
# 0
def test_merge_variance_covariance():
    """Unit variances plus three covariances must merge into a symmetric matrix."""
    variances = np.array([1, 1, 1])
    covariances = np.array([.25, .45, .35])

    # variances sit on the diagonal; the covariances are mirrored off it
    wanted = np.array([[1, .25, .45],
                       [.25, 1, .35],
                       [.45, .35, 1]])

    merged = merge_variance_covariance(variances, covariances)
    assert_array_equal(merged, wanted)
    def fit(self, X, y=None):
        """
        Fit the confirmatory factor analysis model.

        Parameters
        ----------
        X : array-like
            The data to analyze. If this is a covariance
            matrix rather than raw data, `is_cov_matrix`
            must have been set to True.
        y : ignored

        Returns
        -------
        self
            The estimator itself, with `cov_`, `loadings_`,
            `error_vars_`, `factor_varcovs_`, `log_likelihood_`
            and `aic_` set (plus `mean_` for raw data, and
            `bic_` when `n_obs` is available).

        Raises
        ------
        ValueError
            If the specification is neither None nor a
            ``ModelSpecification`` object.
        AssertionError
            If ``is_cov_matrix=True`` and the matrix is not
            square with one row per model variable.
        AssertionError
            If len(bounds) != len(x0)

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
        >>> cfa.fit(X.values)
        >>> cfa.loadings_
        array([[0.99131285, 0.        ],
               [0.46074919, 0.        ],
               [0.3502267 , 0.        ],
               [0.58331488, 0.        ],
               [0.        , 0.98621042],
               [0.        , 0.73389239],
               [0.        , 0.37602988],
               [0.        , 0.50049507]])
        """
        # resolve the model: derive one from the data when no
        # specification was given, otherwise work on a copy
        spec = self.specification
        if spec is None:
            self.model = ModelSpecificationParser.parse_model_specification_from_array(X)
        elif isinstance(spec, ModelSpecification):
            self.model = spec.copy()
        else:
            raise ValueError('The `specification` must be None or `ModelSpecification` '
                             'instance, not {}'.format(type(spec)))
        model = self.model

        # data frames are reduced to their underlying array
        if isinstance(X, pd.DataFrame):
            X = X.values

        # validate a copy of the input; NaN values are
        # tolerated here because they may be imputed below
        X = check_array(X,
                        force_all_finite='allow-nan',
                        estimator=self,
                        copy=True)

        has_missing = np.isnan(X).any()
        if self.is_cov_matrix:
            # the input already is the covariance matrix, so
            # only its dimensions need to be verified
            error_msg = ('If `is_cov_matrix=True`, then the rows and column in the data '
                         'set must be equal, and must equal the number of variables '
                         'in your model.')
            assert X.shape[0] == X.shape[1] == model.n_variables, error_msg
            cov_mtx = X.copy()
        else:
            # raw data: impute missing values first, then fall back
            # to the row count when `n_obs` was not supplied, and
            # finally compute the means and the covariance matrix
            if has_missing:
                X = impute_values(X, how=self.impute)
            if self.n_obs is None:
                self.n_obs = X.shape[0]
            self.mean_ = np.mean(X, axis=0)
            cov_mtx = cov(X)

        self.cov_ = cov_mtx.copy()

        # starting values: the specified loadings, error variances
        # of 0.5, factor variances of 1.0, and factor covariances
        # of 0.05 (kept lower than the variances)
        start_loadings = model.loadings
        start_error_vars = np.full((model.n_variables, 1), 0.5)
        start_factor_vars = np.full((model.n_factors, 1), 1.0)
        start_factor_covs = np.full((model.n_lower_diag, 1), 0.05)

        # everything is flattened into a single 1d vector
        x0 = self._combine(start_loadings,
                           start_error_vars,
                           start_factor_vars,
                           start_factor_covs,
                           model.n_factors,
                           model.n_variables,
                           model.n_lower_diag)

        # user-supplied bounds must provide exactly one entry per
        # element of `x0`; when `bounds` is None nothing is checked
        # and None is passed on to the optimizer
        bounds = self.bounds
        if bounds is not None:
            error_msg = ('The length of `bounds` must equal the length of your '
                         'input array `x0`: {} != {}.'.format(len(bounds), len(x0)))
            assert len(bounds) == len(x0), error_msg

        # run the L-BFGS-B optimizer; the zero constraints on the
        # loadings are applied inside the objective function itself
        solver_options = {'maxiter': self.max_iter, 'disp': self.disp}
        res = minimize(self._objective, x0,
                       args=(cov_mtx, model.loadings),
                       method='L-BFGS-B',
                       bounds=bounds,
                       options=solver_options)

        # a failed optimization only triggers a warning
        if not res.success:
            warnings.warn('The optimization routine failed '
                          'to converge: {}'.format(str(res.message)))

        # unpack the solution vector into its original pieces
        (loadings_res,
         error_vars_res,
         factor_vars_res,
         factor_covs_res) = self._split(res.x,
                                        model.n_factors,
                                        model.n_variables,
                                        model.n_lower_diag)

        # merge the factor variances and covariances into one
        # matrix and rescale it into a correlation matrix
        factor_varcovs_res = merge_variance_covariance(factor_vars_res, factor_covs_res)
        with np.errstate(all='ignore'):
            factor_varcovs_res = covariance_to_correlation(factor_varcovs_res)

        self.loadings_ = loadings_res
        self.error_vars_ = error_vars_res
        self.factor_varcovs_ = factor_varcovs_res

        # log-likelihood, AIC and BIC all derive from the final
        # objective value; BIC additionally needs `n_obs`
        n_params = x0.shape[0] + model.n_variables
        self.log_likelihood_ = -res.fun
        self.aic_ = 2 * res.fun + 2 * n_params
        if self.n_obs is not None:
            self.bic_ = 2 * res.fun + np.log(self.n_obs) * n_params
        return self