def get_fa_loads(d_phens, kmo_threshold=0.6, bartlett_threshold=0.05, n_shuffle=100, test_factorability=False):
    """
    Run factor analysis on phenotype data and return the factor loadings.

    :param d_phens: DataFrame of phenotypes (observations x variables)
    :param kmo_threshold: minimum acceptable KMO statistic for factorability
    :param bartlett_threshold: maximum acceptable Bartlett's test p-value
    :param n_shuffle: number of shuffles used by the parallel analysis
    :param test_factorability: if True, check factorability before fitting
    :return: DataFrame of loadings indexed by phenotype name, or None when
             the factorability tests fail
    """
    # Evaluation of the "factorability" of phenotypes
    if test_factorability:
        # calculate_bartlett_sphericity returns (statistic, p-value);
        # a large p-value means we cannot reject the identity-matrix null.
        _, bartlett_value = calculate_bartlett_sphericity(d_phens)
        _, kmo_model = calculate_kmo(d_phens)
        if (kmo_model < kmo_threshold) or (bartlett_value > bartlett_threshold):
            # Warn instead of raising so callers can fall through on None.
            warnings.warn('\nPhenotypic data does not contain factors')
            return None

    # Define the number of factors by parallel analysis
    n_factors = pa(d_phens, n_shuffle)

    # Factor analysis
    fa = FactorAnalyzer(n_factors=n_factors)
    fa.fit(d_phens)
    loads = pd.DataFrame(data=fa.loadings_, index=d_phens.columns)
    return loads
def test_calculate_bartlett_sphericity():
    """Bartlett sphericity statistic and p-value match the known fixture."""
    frame = pd.read_csv('tests/data/test01.csv')
    statistic, p_value = calculate_bartlett_sphericity(frame.values)
    assert_almost_equal(statistic, 14185)
    assert_almost_equal(p_value, 0)
def perform_fa(df):
    """
    Decide via Bartlett's sphericity test whether factor analysis is viable.

    :param df: data to test (observations x variables)
    :return: human-readable verdict string that includes the p-value
    """
    chi_square_value, p_value = calculate_bartlett_sphericity(df)
    if p_value > 0.05:
        # Cannot reject the identity-matrix null -> no shared factors.
        # BUGFIX: corrected misspellings in the user-facing message
        # ("insignifincant" -> "insignificant", "analaisis" -> "analysis").
        return (
            f"P-value=({p_value}). Statistically insignificant, factorial analysis can not be performed"
        )
    else:
        return (
            f"P-value=({p_value}). Statistically significant, factorial analysis can be performed"
        )
def stat_test(mat_features):
    """
    Print factorability diagnostics for a feature matrix.

    Prints the determinant of the feature correlation matrix, then runs
    Bartlett's sphericity test and the Kaiser-Meyer-Olkin test.

    :param mat_features: DataFrame of features (observations x variables)
    """
    m_corr = np.array(mat_features.corr())
    print('determinant feature matrix is: ', np.linalg.det(m_corr))
    # BUGFIX: both adequacy tests expect the raw data matrix, not a
    # precomputed correlation matrix — they derive correlations internally,
    # so passing m_corr computed the tests on corr-of-corr.
    chi_square_value, p_value = calculate_bartlett_sphericity(mat_features)
    print('Bartlett test')
    print('value of chi square: ', chi_square_value, 'p-value: ', p_value)
    kmo_all, kmo_model = calculate_kmo(mat_features)
    print('Kaiser-Meyer-Olkin test')
    print('value of kmo: ', kmo_model)
def bartlett_sphericity(self):
    """
    Run Bartlett's sphericity test on self.dataset and log the verdict.

    A p-value below self.p_alpha rejects the null hypothesis that the
    correlation matrix is an identity matrix (i.e. variables are related).

    :return: 0 on completion
    """
    chi_square_value, p_value = calculate_bartlett_sphericity(self.dataset)
    out = "chi^2 = %.3f" % chi_square_value
    if p_value < 0.001:
        # BUGFIX: the guard tests 0.001, so report "p<0.001"
        # (previously printed the inconsistent "p<0.0001").
        out += ", p<0.001"
    else:
        out += ", p=%.3f" % p_value
    self.logger.info(out)
    if p_value < self.p_alpha:
        # Significant: reject H0 of an identity correlation matrix.
        self.logger.info("It is not an identity matrix.")
    else:
        self.logger.info("It is an identity matrix.")
    return 0
def out_Ade(df2):
    """Run the adequacy tests (KMO and Bartlett) and return them as a table.

    KMO guideline: above 0.9 excellent; above 0.8 good; above 0.7 fair;
    above 0.6 poor; above 0.5 very poor; below 0.5 unacceptable. For
    Bartlett's sphericity test, a more significant result indicates the
    data are better suited to factor analysis.
    """
    # Kaiser-Meyer-Olkin test
    kmo_per_item, kmo_overall = factor_analyzer.calculate_kmo(df2)
    print('kmo:{}'.format(kmo_overall))

    # Bartlett's sphericity test
    bartlett_chi2, bartlett_p = factor_analyzer.calculate_bartlett_sphericity(df2)
    print('Bartlett_p:{}'.format(bartlett_p))

    # Assemble a small summary table: header row, label row, value row.
    summary_rows = [
        ['充分性测试(Adequacy Test)', None],
        ['kmo', 'Bartlett_p'],
        [kmo_overall, bartlett_p],
    ]
    return pd.DataFrame(summary_rows)
def factor_analysis(df, name, all_factors=True):
    """
    Print factorability diagnostics and, optionally, eigenvalues for df.

    :param df: DataFrame to analyse (observations x variables)
    :param name: label for the analysis (kept for signature compatibility;
                 not used in this code path — TODO confirm against callers)
    :param all_factors: when truthy, also print eigenvalues and plot the
                        correlation matrix
    """
    chi_square_value, p_value = calculate_bartlett_sphericity(df)
    kmo_all, kmo_model = calculate_kmo(df)
    pairwise_correlations = pairwiseCorr(df)
    print("\n" + "Chi_square_value and p_value:")
    print(chi_square_value, p_value)
    print("\n" + "Kmo model:")
    print(kmo_model)
    print("\n" + "Pairwise correlations:")
    print(pairwise_correlations)
    # Idiom fix: truthiness test instead of '== True'.
    if all_factors:
        eigenvalues = get_factor_eigenvalues(df)
        print("\n" + "FA Eigenvalues:")
        print(eigenvalues)
        plot_correlations(df)
def efa_model_tests(item_data):
    """
    Render EFA model adequacy tests (Bartlett + KMO) in two Streamlit columns.

    :param item_data: DataFrame of item responses (observations x variables)
    """
    st.header('Model Tests')
    # NOTE(review): st.beta_columns is the deprecated pre-1.0 API for
    # st.columns — consider migrating when the Streamlit version allows.
    col1, col2 = st.beta_columns(2)
    with col1:
        st.write('Bartlett Sphericity')
        # Removed commented-out "Run Analysis" button scaffolding.
        chi2, pValue = calculate_bartlett_sphericity(item_data)
        st.write(f'chi-squared = {chi2}, p = {pValue}')
    with col2:
        st.write('KMO Test')
        # Removed commented-out "Run KMO" button scaffolding.
        kmoAll, kmoModel = calculate_kmo(item_data)
        st.write(f'KMO Statistic: {kmoModel}')
def factor_analysis(df, name, not_test_set=True):
    """
    Print factorability diagnostics, significant correlations, and,
    optionally, eigenvalues and a correlation plot for df.

    :param df: DataFrame to analyse (observations x variables)
    :param name: label passed through to plot_correlations
    :param not_test_set: when truthy (i.e. not a held-out test set), also
                         print eigenvalues and plot the correlations
    """
    chi_square_value, p_value = calculate_bartlett_sphericity(df)
    kmo_all, kmo_model = calculate_kmo(df)
    p_values, pairwise_correlations = pairwiseCorr(df)
    print("\n" + "Chi_square_value and p_value:")
    print(chi_square_value, p_value)
    print("\n" + "Kmo model:")
    print(kmo_model)
    print("\n" + "Statistically significant correlations:")
    print(p_values)
    print("\n" + "Most correlated variables:")
    print(pairwise_correlations)
    # Idiom fix: truthiness test instead of '== True'.
    if not_test_set:
        eigenvalues = get_factor_eigenvalues(df)
        print("\n" + "FA Eigenvalues:")
        print(eigenvalues)
        plot_correlations(df, name)
def FA(observied_variables, name):
    """
    Run factor analysis on the given variables and return the loadings.

    :param observied_variables: DataFrame of observed variables
    :param name: dataset label; 'phone' -> 2 factors, 'QOL' -> 4 factors
    :return: DataFrame of factor loadings indexed by variable name
    :raises ValueError: when name is neither 'phone' nor 'QOL'
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    chi_square_value, p_value = calculate_bartlett_sphericity(observied_variables)
    print("chi_square_value", chi_square_value, "p-value:", p_value)

    from factor_analyzer.factor_analyzer import calculate_kmo
    kmo_all, kmo_model = calculate_kmo(observied_variables)
    print("KMO value", kmo_model)

    # Create factor analysis object and perform factor analysis
    if name == 'phone':
        fa = FactorAnalyzer(n_factors=2)
    elif name == 'QOL':
        fa = FactorAnalyzer(n_factors=4)
    else:
        # BUGFIX: previously 'fa' was left unbound for any other name,
        # producing an UnboundLocalError below; fail fast and clearly.
        raise ValueError(f"unknown dataset name: {name!r}")
    fa.fit_transform(observied_variables)

    # Check eigenvalues
    eigen_values, vectors = fa.get_eigenvalues()
    print(eigen_values)

    loadings = fa.loadings_
    print(pd.DataFrame(loadings, observied_variables.columns))
    return pd.DataFrame(loadings, observied_variables.columns)
    # Removed unreachable code after the return (a factor-variance print)
    # and a large commented-out scree-plot block; neither ever executed.
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
from factor_analyzer import FactorAnalyzer
import seaborn as sns

# Import dataset
df = pd.read_pickle('process_data_after_remove_variabel_remain_96.pkl')
X = df.drop('qc_salzrckhalt', axis=1)
y = df['qc_salzrckhalt']

# Adequacy test: evaluate the "factorability" of the dataset —
# i.e. "can we find factors in this dataset?"

# Bartlett's test
VarbList = df.columns
chi_square_value, p_value = calculate_bartlett_sphericity(X)
chi_square_value, p_value
# A p-value of 0 means the test is statistically significant: the observed
# correlation matrix is not an identity matrix.

# Kaiser-Meyer-Olkin test
kmo_all, kmo_model = calculate_kmo(X)
kmo_model
# A KMO value of 0.653 indicates moderate suitability for factor analysis.
# Source: Cureton, E. E. / D'Agostino, R. B. 1983: Factor analysis: an
# applied approach. Hillside, NJ: Lawrence Erlbaum Associates, p. 389 f.

# Choosing the number of factors:
# create a factor analysis object and perform factor analysis
fa = FactorAnalyzer(rotation=None, n_factors=30)
fa.fit(X)

# Check eigenvalues
ev, v = fa.get_eigenvalues()
ev
# Standardize data
df_standard = preprocessing.scale(df[vars_tot])

# --------------------------------------------
# Factor Analysis
# --------------------------------------------
# NOTE: from the previous PCA we know that the first three principal
# components explain 72% of the variance, hence the analysis is based on
# three components. The FA package's default method is minimum residual;
# MLE and PCA are also possible.

# Pre-tests
# Bartlett's test. H0: equal variance
bartlett_chi, bartlett_p = calculate_bartlett_sphericity(df[vars_tot])  # p = 0.0

# Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; should be
# between 0 and 1, but above 0.6
kmo_all, kmo_model = calculate_kmo(df[vars_tot])  # kmo_model = 0.7297

# --------------------------------------------
# Factor Analysis
fa = FactorAnalyzer(rotation=None, n_factors=4)
fa.fit(df[vars_tot])
ev, v = fa.get_eigenvalues()
# NOTE: the first four factors have an eigenvalue greater than 1 — use those.

# Perform a parallel analysis
list_ev_rand = []
np.random.seed(10)
def test(self):
    """Run Bartlett's sphericity and KMO tests on the selected columns.

    Stores all four statistics on the instance and returns them.

    :return: (chi_square_value, p_value, kmo_all, kmo_model)
    """
    # Select the working columns once instead of subscripting twice.
    subset = self.data[self.col]
    self.chi_square_value, self.p_value = calculate_bartlett_sphericity(subset)
    self.kmo_all, self.kmo_model = calculate_kmo(subset)
    return self.chi_square_value, self.p_value, self.kmo_all, self.kmo_model
# Normalizing the data

# In[35]:

card_df_norm = MinMaxScaler().fit_transform(card_df)
pd.DataFrame(card_df_norm).head()

# ## 3.1 Dimensionality Reduction - Factor Analysis

# Bartlett's test of sphericity checks whether the observed variables
# intercorrelate at all, comparing the observed correlation matrix against
# the identity matrix. If the test is statistically insignificant, factor
# analysis should not be employed.

# In[36]:

chi_square_value, p_value = calculate_bartlett_sphericity(card_df_norm)
print(chi_square_value, p_value)

# Here the p-value is 0: the test is statistically significant, indicating
# that the observed correlation matrix is not an identity matrix.

# The Kaiser-Meyer-Olkin (KMO) test measures the suitability of data for
# factor analysis, both per observed variable and for the complete model.
# KMO values range between 0 and 1; a value below 0.6 is considered
# inadequate.

# In[37]:

kmo_all, kmo_model = calculate_kmo(card_df_norm)
kmo_model

# Here kmo_model is 0.64, so the data are adequate.

# ### Choosing the Number of Factors
# NOTE: all tests reject H0 --> not normally distributed.
# Conclusion: use the principal factors method.

# --------------------------------------------
# Pre-tests
# --------------------------------------------
# We perform two pre-tests:
# 1) Bartlett's test of sphericity: tests whether the correlation matrix
#    equals an identity matrix (H0), which would mean the variables are
#    unrelated;
# 2) Kaiser-Meyer-Olkin test: a statistic indicating the proportion of
#    variance that might be caused by underlying factors. The statistic
#    should be above 0.5.

# Bartlett's test. H0: equal variance
bartlett_chi, bartlett_p = calculate_bartlett_sphericity(df_standard)  # p = 0.0

# Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; should be
# between 0 and 1, but above 0.5
kmo_all, kmo_model = calculate_kmo(df_standard)  # kmo_model = 0.85

# Note: looks good.

# --------------------------------------------
# Determine number of factors
# --------------------------------------------
# We use multiple selection criteria:
# 1) Scree plot (elbow plot)
# 2) Kaiser-Guttman rule
# 3) Parallel analysis

# Get factor estimates