def test_getframe_smoke():
    """Smoke-test ``get_loadings_frame`` for the raw, strings and display styles.

    The test returns early (without failing) when pandas is older than 0.17
    or when jinja2 cannot be imported.
    """
    # mostly smoke tests for now
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    res = mod.fit()

    df = res.get_loadings_frame(style='raw')
    assert_(isinstance(df, pd.DataFrame))

    # Compare (major, minor) as integers: comparing version strings is
    # lexicographic, so e.g. '0.9.0' < '0.17' evaluates to False.
    import re
    version = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if version and (int(version.group(1)), int(version.group(2))) < (0, 17):
        return

    lds = res.get_loadings_frame(style='strings', decimals=3, threshold=0.3)
    lds.to_latex()

    # The Styler option requires jinja2, skip if not available
    try:
        from jinja2 import Template  # noqa: F401
    except ImportError:
        return

    # The formats package is imported from whichever location exists
    try:
        from pandas.io import formats as pd_formats
    except ImportError:
        from pandas import formats as pd_formats

    ldf = res.get_loadings_frame(style='display')
    assert_(isinstance(ldf, pd_formats.style.Styler))
    assert_(isinstance(ldf.data, pd.DataFrame))

    # Remaining calls only check that the option combinations run
    res.get_loadings_frame(style='display', decimals=3, threshold=0.2)
    res.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')
    res.get_loadings_frame(style='display', decimals=3, threshold=0.45,
                           highlight_max=False, sort_=False)
def fa_neighbors(vecs, d, num_neighbors=5, rotation=None, method='pa',
                 rotate_args=()):
    """Fit a ``d``-factor model to ``vecs.vectors`` and look up factor neighbors.

    Parameters
    ----------
    vecs : object
        Must expose a 2-d ``vectors`` attribute; it is also passed unchanged
        to ``compute_nn``.
    d : int
        Number of factors requested from ``Factor``; the returned loadings
        are zero-padded to this many columns.
    num_neighbors : int, optional
        Forwarded to ``compute_nn``.
    rotation : optional
        When not None, forwarded to ``rotate_factors`` as the rotation to
        apply to the loadings.
    method : str, optional
        Forwarded to ``Factor`` as ``method``.
    rotate_args : sequence, optional
        Extra positional arguments unpacked into ``rotate_factors``.

    Returns
    -------
    neighbors
        Whatever ``compute_nn`` returns for the transposed loadings.
    loadings : ndarray
        The (possibly padded and rotated) loadings matrix.
    """
    # find latent factors
    fa = Factor(vecs.vectors, d, method=method)
    loadings = fa.fit().loadings

    # pad with zero columns so the result always has exactly d columns
    padding = np.zeros((vecs.vectors.shape[1], d - loadings.shape[1]))
    loadings = np.hstack([loadings, padding])

    # rotate factors; the second return value of rotate_factors is not needed
    if rotation is not None:
        loadings, _ = rotate_factors(loadings, rotation, *rotate_args)

    # find neighbors, labelling each factor by its column index
    labels = np.array(list(range(d)))
    neighbors = compute_nn(vecs, loadings.T, labels, num_neighbors,
                           whole_matrix=True)
    return neighbors, loadings
def test_getframe_smoke():
    """Smoke-test ``get_loadings_frame`` for the raw, strings and display styles.

    Returns early (without failing) on pandas older than 0.17 and when
    jinja2 is not importable.
    """
    # mostly smoke tests for now
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    res = mod.fit()

    df = res.get_loadings_frame(style='raw')
    assert_(isinstance(df, pd.DataFrame))

    # Numeric (major, minor) comparison; a plain string comparison orders
    # '0.9.0' after '0.17' and would not skip on such a version.
    import re
    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed and tuple(int(part) for part in parsed.groups()) < (0, 17):
        return

    lds = res.get_loadings_frame(style='strings', decimals=3, threshold=0.3)
    lds.to_latex()

    # The Styler option requires jinja2, skip if not available
    try:
        from jinja2 import Template  # noqa: F401
    except ImportError:
        return

    # Resolve the formats package from either location instead of
    # hard-coding ``pd.formats``, which is only one of the two locations.
    try:
        from pandas.io import formats as pd_formats
    except ImportError:
        from pandas import formats as pd_formats

    ldf = res.get_loadings_frame(style='display')
    assert_(isinstance(ldf, pd_formats.style.Styler))
    assert_(isinstance(ldf.data, pd.DataFrame))

    # Exercise a few option combinations; only absence of errors is checked
    res.get_loadings_frame(style='display', decimals=3, threshold=0.2)
    res.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')
    res.get_loadings_frame(style='display', decimals=3, threshold=0.45,
                           highlight_max=False, sort_=False)
def test_plots():
    """Smoke-test the scree plot and loadings plots of a rotated 3-factor fit."""
    # Function-scope import so this block does not depend on a module-level
    # ``plt``; the plotting calls below need matplotlib anyway.
    import matplotlib.pyplot as plt

    mod = Factor(X.iloc[:, 1:], 3)
    results = mod.fit()
    results.rotate('oblimin')
    try:
        results.plot_scree()
        fig_loadings = results.plot_loadings()
        assert_equal(3, len(fig_loadings))
    finally:
        # Close every figure even if an assertion fails, so open figures do
        # not accumulate across the test session.
        plt.close('all')
def test_plots(close_figures):
    """Smoke-test the scree and loadings plots after an oblimin rotation.

    ``close_figures`` is requested as a fixture and not used directly in
    the body.
    """
    fitted = Factor(X.iloc[:, 1:], 3).fit()
    fitted.rotate('oblimin')

    # The scree figure is created but not inspected
    scree_figure = fitted.plot_scree()  # noqa: F841

    loading_figures = fitted.plot_loadings()
    assert_equal(3, len(loading_figures))
def test_factor_scoring():
    """Compare factor scores against the reference columns in factors_stata.csv.

    Regression ('reg') and default-method scores are checked after a varimax
    and after an oblimin rotation; 'ols' and 'gls' scores are only required
    to correlate highly with the regression scores.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    results_dir = os.path.join(here, 'results')
    data = pd.read_csv(os.path.join(results_dir, 'factor_data.csv'))
    reference = pd.read_csv(os.path.join(results_dir, 'factors_stata.csv'))

    # Note: Stata has second factor with flipped sign compared to statsmodels
    flip = [1, -1]

    # mostly smoke tests for now
    res = Factor(data, 2).fit(maxiter=1)

    # --- orthogonal rotation ---
    res.rotate('varimax')
    scores_reg = res.factor_scoring(method='reg')
    assert_allclose(scores_reg * flip, reference[["f1", 'f2']].values,
                    atol=1e-4, rtol=1e-3)
    scores_default = res.factor_scoring()
    assert_allclose(scores_default * flip, reference[["f1b", 'f2b']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    scores_ols = res.factor_scoring(method='ols')
    scores_gls = res.factor_scoring(method='gls')
    z_reg = _zscore(scores_reg)
    z_ols = _zscore(scores_ols)
    z_gls = _zscore(scores_gls)
    assert_array_less(0.98, (z_ols * z_reg).mean(0))
    assert_array_less(0.999, (z_gls * z_reg).mean(0))

    # --- oblique rotation ---
    res.rotate('oblimin')
    assert_allclose(res._corr_factors()[0, 1], -0.25651037, rtol=1e-3)
    scores_reg = res.factor_scoring(method='reg')
    assert_allclose(scores_reg * flip, reference[["f1o", 'f2o']].values,
                    atol=1e-4, rtol=1e-3)
    scores_default = res.factor_scoring()
    assert_allclose(scores_default * flip, reference[["f1ob", 'f2ob']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    scores_ols = res.factor_scoring(method='ols')
    scores_gls = res.factor_scoring(method='gls')
    z_reg = _zscore(scores_reg)
    z_ols = _zscore(scores_ols)
    z_gls = _zscore(scores_gls)
    assert_array_less(0.97, (z_ols * z_reg).mean(0))
    assert_array_less(0.999, (z_gls * z_reg).mean(0))

    # check provided endog: passing the model's own endog must reproduce
    # the scores computed without an explicit endog
    scores_ols_explicit = res.factor_scoring(method='ols',
                                             endog=res.model.endog)
    assert_allclose(scores_ols_explicit, scores_ols, rtol=1e-13)
def test_plots():
    """Smoke-test the scree and loadings plots, closing figures as it goes."""
    fitted = Factor(X.iloc[:, 1:], 3).fit()
    fitted.rotate('oblimin')

    scree_figure = fitted.plot_scree()
    plt.close(scree_figure)

    loading_figures = fitted.plot_loadings()
    assert_equal(3, len(loading_figures))

    # Close all but the last figure individually, then sweep up the rest
    for figure in loading_figures[:-1]:
        plt.close(figure)
    plt.close('all')
def test_em():
    """Check that the EM fit implies the same covariance as the default ML fit."""
    corr = np.asarray([[1, 0.5, 0.3],
                       [0.5, 1, 0],
                       [0.3, 0, 1]])
    model = Factor(corr=corr, n_factor=1, method='ml')
    ml_fit = model.fit(opt={'gtol': 1e-3})

    # Read from the fit but not used in any comparison below
    load_opt = ml_fit.loadings  # noqa: F841
    uniq_opt = ml_fit.uniqueness  # noqa: F841

    em_loadings, em_uniqueness = model._fit_ml_em(1000)

    # Implied covariance: loadings * loadings' with the uniquenesses added
    # on the diagonal (a stride of n + 1 over the flat view walks the diagonal)
    implied = em_loadings.dot(em_loadings.T)
    diagonal_step = implied.shape[0] + 1
    implied.flat[::diagonal_step] += em_uniqueness

    assert_allclose(implied, ml_fit.fitted_cov, rtol=1e-2, atol=1e-2)
def test_2factor():
    """
    Two-factor ML fit on an r^|i-j| correlation matrix, compared to R.

    # R code:
    r = 0.4
    p = 6
    ii = seq(0, p-1)
    ii = outer(ii, ii, "-")
    ii = abs(ii)
    cm = r^ii
    factanal(covmat=cm, factors=2)
    """
    r = 0.4
    p = 6
    idx = np.arange(p)
    corr = np.power(r, np.abs(idx[:, None] - idx[None, :]))

    result = Factor(corr=corr, n_factor=2, nobs=100, method='ml').fit()

    # Normalise signs in place so the first row of the loadings is non-negative
    for col in range(2):
        if result.loadings[0, col] < 0:
            result.loadings[:, col] *= -1

    expected_uniq = np.array([0.782, 0.367, 0.696, 0.696, 0.367, 0.782])
    assert_allclose(expected_uniq, result.uniqueness, rtol=1e-3, atol=1e-3)

    expected_loads = [
        np.array([0.323, 0.586, 0.519, 0.519, 0.586, 0.323]),
        np.array([0.337, 0.538, 0.187, -0.187, -0.538, -0.337]),
    ]
    for col, expected in enumerate(expected_loads):
        fitted = result.loadings[:, col]
        # Align the reference column's sign with the fitted column
        if np.dot(expected, fitted) < 0:
            expected = -expected
        assert_allclose(expected, fitted, rtol=1e-3, atol=1e-3)

    assert_equal(result.df, 4)

    # Smoke test for standard errors
    uniq_se = np.asarray([0.11056836, 0.05191071, 0.09836349,
                          0.09836349, 0.05191071, 0.11056836])
    assert_allclose(result.uniq_stderr, uniq_se, atol=1e-4)

    load_se = np.asarray([
        [0.08842151, 0.08842151],
        [0.06058582, 0.06058582],
        [0.08339874, 0.08339874],
        [0.08339874, 0.08339874],
        [0.06058582, 0.06058582],
        [0.08842151, 0.08842151],
    ])
    assert_allclose(result.load_stderr, load_se, atol=1e-4)
def test_direct_corr_matrix():
    """Fit ``Factor`` from a correlation matrix only and check ``endog_names``."""
    columns = X.iloc[:, 1:-1]

    # Test specifying the correlation matrix directly
    model = Factor(None, 2, corr=np.corrcoef(columns, rowvar=0), smc=False)
    fitted = model.fit(tol=1e-10)
    expected_loadings = np.array([
        [0.965392158864, 0.225880658666255],
        [0.967587154301, 0.212758741910989],
        [0.929891035996, -0.000603217967568],
        [0.486822656362, -0.869649573289374],
    ])
    assert_array_almost_equal(fitted.loadings, expected_loadings, decimal=8)

    # Test set and get endog_names
    model.endog_names = X.iloc[:, 1:-1].columns
    assert_array_equal(model.endog_names, ['Basal', 'Occ', 'Max', 'id'])

    # Test set endog_names with the wrong number of elements
    too_few_names = X.iloc[:, :1].columns
    assert_raises(ValueError, setattr, model, 'endog_names', too_few_names)
def test_2factor():
    """
    Check a 2-factor ML fit against values obtained from R's factanal.

    # R code:
    r = 0.4
    p = 6
    ii = seq(0, p-1)
    ii = outer(ii, ii, "-")
    ii = abs(ii)
    cm = r^ii
    factanal(covmat=cm, factors=2)
    """
    rho, n_var = 0.4, 6
    positions = np.arange(n_var)
    lags = np.abs(np.subtract.outer(positions, positions))
    model = Factor(corr=rho ** lags, n_factor=2, nobs=100, method='ml')
    fit = model.fit()

    # Flip any fitted column whose first entry is negative (done in place)
    for j in (0, 1):
        if not rslt_first_entry_nonnegative(fit.loadings, j):
            fit.loadings[:, j] *= -1

    assert_allclose(np.r_[0.782, 0.367, 0.696, 0.696, 0.367, 0.782],
                    fit.uniqueness, rtol=1e-3, atol=1e-3)

    reference_columns = (
        np.r_[0.323, 0.586, 0.519, 0.519, 0.586, 0.323],
        np.r_[0.337, 0.538, 0.187, -0.187, -0.538, -0.337],
    )
    for k, reference in zip((0, 1), reference_columns):
        # Match the reference sign to the fitted column before comparing
        sign = -1 if np.dot(reference, fit.loadings[:, k]) < 0 else 1
        assert_allclose(sign * reference, fit.loadings[:, k],
                        rtol=1e-3, atol=1e-3)

    assert_equal(fit.df, 4)

    # Smoke test for standard errors
    expected_uniq_se = np.asarray(
        [0.11056836, 0.05191071, 0.09836349,
         0.09836349, 0.05191071, 0.11056836])
    assert_allclose(fit.uniq_stderr, expected_uniq_se, atol=1e-4)

    expected_load_se = np.asarray(
        [[0.08842151, 0.08842151], [0.06058582, 0.06058582],
         [0.08339874, 0.08339874], [0.08339874, 0.08339874],
         [0.06058582, 0.06058582], [0.08842151, 0.08842151]])
    assert_allclose(fit.load_stderr, expected_load_se, atol=1e-4)


def rslt_first_entry_nonnegative(loadings, column):
    """Return True when the first-row entry of ``column`` is not negative."""
    return not loadings[0, column] < 0
def test_getframe_smoke():
    """Smoke-test ``get_loadings_frame`` styles and LaTeX export of the result.

    Everything after the 'strings' frame is skipped silently when jinja2
    cannot be imported.
    """
    # mostly smoke tests for now
    fit = Factor(X.iloc[:, 1:-1], 2, smc=True).fit()

    raw_frame = fit.get_loadings_frame(style='raw')
    assert_(isinstance(raw_frame, pd.DataFrame))

    string_frame = fit.get_loadings_frame(style='strings', decimals=3,
                                          threshold=0.3)

    # The Styler option requires jinja2, skip if not available
    try:
        from jinja2 import Template  # noqa:F401
    except ImportError:
        # TODO: separate this and do pytest.skip?
        return

    if not PD_LT_1_4:
        # Smoke test using new style to_latex
        string_frame.style.to_latex()
    else:
        # Old implementation that warns
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            string_frame.to_latex()

    # Import the formats package from whichever location is available
    try:
        from pandas.io import formats as pd_formats
    except ImportError:
        from pandas import formats as pd_formats

    styled = fit.get_loadings_frame(style='display')
    assert_(isinstance(styled, pd_formats.style.Styler))
    assert_(isinstance(styled.data, pd.DataFrame))

    # Option combinations below are only checked for running without error
    fit.get_loadings_frame(style='display', decimals=3, threshold=0.2)
    fit.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')
    fit.get_loadings_frame(style='display', decimals=3, threshold=0.45,
                           highlight_max=False, sort_=False)
def test_exact():
    """Recover exactly factor-structured matrices from default start values."""
    # Test if we can recover exact factor-structured matrices with
    # default starting values.
    np.random.seed(23324)

    # Works for larger k_var but slow for routine testing.
    sizes = (5, 10, 25)
    factor_counts = (1, 2, 3)

    for k_var in sizes:
        for n_factor in factor_counts:
            true_loadings = np.random.normal(size=(k_var, n_factor))
            true_uniq = np.linspace(1, 2, k_var)

            # Covariance = loadings * loadings' + diag(uniqueness); stepping
            # the flat view by k_var + 1 addresses the diagonal
            cov = np.dot(true_loadings, true_loadings.T)
            cov.flat[::cov.shape[0] + 1] += true_uniq

            # Rescale to a correlation matrix
            scale = np.sqrt(np.diag(cov))
            corr = cov / np.outer(scale, scale)

            fit = Factor(corr=corr, n_factor=n_factor, method='ml').fit()
            assert_allclose(fit.fitted_cov, corr, rtol=1e-4, atol=1e-4)
            fit.summary()  # smoke test
def test_1factor():
    """
    One-factor ML fit on an r^|i-j| correlation matrix (regression values).

    # R code:
    r = 0.4
    p = 4
    ii = seq(0, p-1)
    ii = outer(ii, ii, "-")
    ii = abs(ii)
    cm = r^ii
    fa = factanal(covmat=cm, factors=1)
    print(fa, digits=10)
    """
    rho, n_var = 0.4, 4
    positions = np.arange(n_var)
    corr = np.power(rho, np.abs(positions[:, None] - positions[None, :]))

    fit = Factor(corr=corr, n_factor=1, method='ml').fit()

    # Fix the sign in place so the first loading is non-negative
    if fit.loadings[0, 0] < 0:
        fit.loadings[:, 0] *= -1

    # R solution, but our likelihood is higher
    # uniq = np.r_[0.8392472054, 0.5820958187, 0.5820958187, 0.8392472054]
    # load = np.asarray([[0.4009399224, 0.6464550935, 0.6464550935,
    #                     0.4009399224]]).T
    # l1 = fa.loglike(fa._pack(load, uniq))
    # l2 = fa.loglike(fa._pack(rslt.loadings, rslt.uniqueness))

    # So use a smoke test
    expected_uniq = np.array([0.85290232, 0.60916033, 0.55382266, 0.82610666])
    expected_load = np.asarray([
        [0.38353316],
        [0.62517171],
        [0.66796508],
        [0.4170052],
    ])
    assert_allclose(expected_load, fit.loadings, rtol=1e-3, atol=1e-3)
    assert_allclose(expected_uniq, fit.uniqueness, rtol=1e-3, atol=1e-3)
    assert_equal(fit.df, 2)
def test_example_compare_to_R_output():
    """Compare unrotated and rotated loadings, and the summary text, to R."""
    # Testing basic functions and compare to R output

    # R code for producing the results:
    # library(psych)
    # library(GPArotation)
    # Basal = c(2.068, 2.068, 2.09, 2.097, 2.117, 2.14, 2.045, 2.076, 2.09, 2.111, 2.093, 2.1, 2.104)
    # Occ = c(2.07, 2.074, 2.09, 2.093, 2.125, 2.146, 2.054, 2.088, 2.093, 2.114, 2.098, 2.106, 2.101)
    # Max = c(1.58, 1.602, 1.613, 1.613, 1.663, 1.681, 1.58, 1.602, 1.643, 1.643, 1.653, 1.623, 1.653)
    # id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
    # Y <- cbind(Basal, Occ, Max, id)
    # a <- fa(Y, nfactors=2, fm="pa", rotate="none", SMC=FALSE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], -a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="Promax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="Varimax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="quartimax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], -a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="oblimin", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b

    # No rotation without squared multiple correlations prior
    # produce same results as in R `fa`
    results = Factor(X.iloc[:, 1:-1], 2, smc=False).fit(tol=1e-10)
    expected = np.array([
        [0.965392158864, 0.225880658666255],
        [0.967587154301, 0.212758741910989],
        [0.929891035996, -0.000603217967568],
        [0.486822656362, -0.869649573289374],
    ])
    assert_array_almost_equal(results.loadings, expected, decimal=8)

    # No rotation WITH squared multiple correlations prior
    # produce same results as in R `fa`
    results = Factor(X.iloc[:, 1:-1], 2, smc=True).fit()
    expected = np.array([
        [0.97541115, 0.20280987],
        [0.97113975, 0.17207499],
        [0.9618705, -0.2004196],
        [0.37570708, -0.45821379],
    ])
    assert_array_almost_equal(results.loadings, expected, decimal=8)

    # Same as R GPArotation
    results.rotate('varimax')
    expected = np.array([
        [0.98828898, -0.12587155],
        [0.97424206, -0.15354033],
        [0.84418097, -0.502714],
        [0.20601929, -0.55558235],
    ])
    assert_array_almost_equal(results.loadings, expected, decimal=8)

    # Same as R fa; the reference values are listed one factor per row
    results.rotate('quartimax')
    expected_by_factor = np.array([
        [0.98935598, 0.98242714, 0.94078972, 0.33442284],
        [0.117190049, 0.086943252, -0.283332952, -0.489159543],
    ])
    assert_array_almost_equal(results.loadings, expected_by_factor.T,
                              decimal=8)

    # Not the same as R fa, so these three rotations are only run
    results.rotate('equamax')
    results.rotate('promax')
    results.rotate('biquartimin')

    # Same as R fa
    results.rotate('oblimin')
    expected_by_factor = np.array([
        [1.02834170170, 1.00178840104, 0.71824931384, -0.00013510048],
        [0.06563421, 0.03096076, -0.39658839, -0.59261944],
    ])
    assert_array_almost_equal(results.loadings, expected_by_factor.T,
                              decimal=8)

    # Testing result summary string
    results.rotate('varimax')
    # NOTE(review): the expected text is reproduced exactly as it appears in
    # the reviewed source (single line, single spaces) — confirm its line
    # layout against the real summary output.
    desired = (""" Factor analysis results ============================= Eigenvalues ----------------------------- Basal Occ Max id ----------------------------- 2.9609 0.3209 0.0000 -0.0000 ----------------------------- ----------------------------- Communality ----------------------------- Basal Occ Max id ----------------------------- 0.9926 0.9727 0.9654 0.3511 ----------------------------- ----------------------------- Pre-rotated loadings ----------------------------------- factor 0 factor 1 ----------------------------------- Basal 0.9754 0.2028 Occ 0.9711 0.1721 Max 0.9619 -0.2004 id 0.3757 -0.4582 ----------------------------- ----------------------------- varimax rotated loadings ----------------------------------- factor 0 factor 1 ----------------------------------- Basal 0.9883 -0.1259 Occ 0.9742 -0.1535 Max 0.8442 -0.5027 id 0.2060 -0.5556 ============================= """)
    summary_text = results.summary().as_text()
    # Strip trailing whitespace from every line before comparing
    stripped_lines = [line.rstrip() for line in summary_text.splitlines()]
    actual = "\n".join(stripped_lines) + "\n"
    assert_equal(actual, desired)
def test_example_compare_to_R_output():
    """
    Check loadings for several rotations and the summary text against R.

    R code for producing the results:

    library(psych)
    library(GPArotation)
    Basal = c(2.068, 2.068, 2.09, 2.097, 2.117, 2.14, 2.045, 2.076, 2.09, 2.111, 2.093, 2.1, 2.104)
    Occ = c(2.07, 2.074, 2.09, 2.093, 2.125, 2.146, 2.054, 2.088, 2.093, 2.114, 2.098, 2.106, 2.101)
    Max = c(1.58, 1.602, 1.613, 1.613, 1.663, 1.681, 1.58, 1.602, 1.643, 1.643, 1.653, 1.623, 1.653)
    id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
    Y <- cbind(Basal, Occ, Max, id)
    a <- fa(Y, nfactors=2, fm="pa", rotate="none", SMC=FALSE, min.err=1e-10)
    b <- cbind(a$loadings[,1], -a$loadings[,2])
    b
    a <- fa(Y, nfactors=2, fm="pa", rotate="Promax", SMC=TRUE, min.err=1e-10)
    b <- cbind(a$loadings[,1], a$loadings[,2])
    b
    a <- fa(Y, nfactors=2, fm="pa", rotate="Varimax", SMC=TRUE, min.err=1e-10)
    b <- cbind(a$loadings[,1], a$loadings[,2])
    b
    a <- fa(Y, nfactors=2, fm="pa", rotate="quartimax", SMC=TRUE, min.err=1e-10)
    b <- cbind(a$loadings[,1], -a$loadings[,2])
    b
    a <- fa(Y, nfactors=2, fm="pa", rotate="oblimin", SMC=TRUE, min.err=1e-10)
    b <- cbind(a$loadings[,1], a$loadings[,2])
    b
    """
    # Testing basic functions and compare to R output

    # No rotation without squared multiple correlations prior
    # produce same results as in R `fa`
    model = Factor(X.iloc[:, 1:-1], 2, smc=False)
    results = model.fit(tol=1e-10)
    want = np.array([[0.965392158864, 0.225880658666255],
                     [0.967587154301, 0.212758741910989],
                     [0.929891035996, -0.000603217967568],
                     [0.486822656362, -0.869649573289374]])
    assert_array_almost_equal(results.loadings, want, decimal=8)

    # No rotation WITH squared multiple correlations prior
    # produce same results as in R `fa`
    model = Factor(X.iloc[:, 1:-1], 2, smc=True)
    results = model.fit()
    want = np.array([[0.97541115, 0.20280987],
                     [0.97113975, 0.17207499],
                     [0.9618705, -0.2004196],
                     [0.37570708, -0.45821379]])
    assert_array_almost_equal(results.loadings, want, decimal=8)

    # Same as R GPArotation
    results.rotate('varimax')
    want = np.array([[0.98828898, -0.12587155],
                     [0.97424206, -0.15354033],
                     [0.84418097, -0.502714],
                     [0.20601929, -0.55558235]])
    assert_array_almost_equal(results.loadings, want, decimal=8)

    # Same as R fa (reference stored factor-by-variable, hence the transpose)
    results.rotate('quartimax')
    want = np.transpose(np.array(
        [[0.98935598, 0.98242714, 0.94078972, 0.33442284],
         [0.117190049, 0.086943252, -0.283332952, -0.489159543]]))
    assert_array_almost_equal(results.loadings, want, decimal=8)

    results.rotate('equamax')  # Not the same as R fa
    results.rotate('promax')  # Not the same as R fa
    results.rotate('biquartimin')  # Not the same as R fa

    # Same as R fa
    results.rotate('oblimin')
    want = np.transpose(np.array(
        [[1.02834170170, 1.00178840104, 0.71824931384, -0.00013510048],
         [0.06563421, 0.03096076, -0.39658839, -0.59261944]]))
    assert_array_almost_equal(results.loadings, want, decimal=8)

    # Testing result summary string
    results.rotate('varimax')
    # NOTE(review): literal copied verbatim from the reviewed source, where
    # it appears on one line with single spaces — verify the intended line
    # breaks and column alignment.
    desired = (
        """ Factor analysis results ============================= Eigenvalues ----------------------------- Basal Occ Max id ----------------------------- 2.9609 0.3209 0.0000 -0.0000 ----------------------------- ----------------------------- Communality ----------------------------- Basal Occ Max id ----------------------------- 0.9926 0.9727 0.9654 0.3511 ----------------------------- ----------------------------- Pre-rotated loadings ----------------------------------- factor 0 factor 1 ----------------------------------- Basal 0.9754 0.2028 Occ 0.9711 0.1721 Max 0.9619 -0.2004 id 0.3757 -0.4582 ----------------------------- ----------------------------- varimax rotated loadings ----------------------------------- factor 0 factor 1 ----------------------------------- Basal 0.9883 -0.1259 Occ 0.9742 -0.1535 Max 0.8442 -0.5027 id 0.2060 -0.5556 ============================= """)
    rendered = results.summary().as_text()
    # Drop trailing whitespace per line and end with a single newline
    actual = "\n".join(map(str.rstrip, rendered.splitlines())) + "\n"
    assert_equal(actual, desired)