def get_ev(self, X):
    """Return the eigenvalues obtained from an unrotated fit on ``X``.

    A ``FactorAnalyzer`` with one factor per column of ``X`` is fitted with
    ``self.method`` and no rotation. Only the first of the two arrays
    returned by ``get_eigenvalues()`` is handed back.
    """
    analyzer = FactorAnalyzer(len(X.columns), rotation=None, method=self.method)
    analyzer.fit(X)
    eigenvalues, _ = analyzer.get_eigenvalues()
    return eigenvalues
def find_number_of_Factors_1(eigenval_limit, dimensions, obs, kind, prnt):
    """Count the factors whose eigenvalue reaches ``eigenval_limit``.

    This is the variant without the ``trial_index`` parameter.

    :param eigenval_limit: threshold (float); 1.0 is the recommended value
    :param dimensions: number of dimensions before dimensionality reduction
        (``obs.shape[1]``); passed to the analyzer as ``n_factors``
    :param obs: 2-dimensional array holding the data
    :param kind: 0 if the data is averaged, 1 if it is single trial,
        2 if it is concatenated; only 0 and 2 produce printed output
    :param prnt: when truthy, print the result for ``kind`` 0 or 2
    :return: the number of eigenvalues greater than or equal to
        ``eigenval_limit``
    """
    analyzer = FactorAnalyzer(
        bounds=(0.005, 1),
        impute='median',
        is_corr_matrix=False,
        method='minres',
        n_factors=dimensions,
        rotation=None,
        rotation_kwargs={},
        use_smc=True,
    )
    analyzer.fit(obs)
    eigenvalues, _ = analyzer.get_eigenvalues()

    # Every eigenvalue at or above the limit counts as one relevant factor.
    num_FA_dim = sum(1 for value in eigenvalues if value >= eigenval_limit)

    headings = {0: 'averaged:', 2: 'concatenated:'}
    if prnt and kind in headings:
        print(headings[kind])
        print('Number of Factors: ', num_FA_dim)
    return num_FA_dim
def eigenvalues_plt(data):
    """Draw a scree plot for ``data`` and return it as a base64 PNG data URI.

    The plot is rendered with the ``Agg`` backend and the ``ggplot`` style,
    saved into an in-memory buffer, and the figure is closed before the
    ``data:image/png;base64,...`` string is returned.
    """
    plt.switch_backend('Agg')
    plt.style.use('ggplot')

    analyzer = FactorAnalyzer()
    analyzer.fit(data)
    eigen_values, _ = analyzer.get_eigenvalues()

    # One x position per column of the input, starting at 1.
    factor_positions = range(1, data.shape[1] + 1)

    plt.figure(figsize=(10, 10))
    plt.scatter(factor_positions, eigen_values)
    plt.plot(factor_positions, eigen_values)
    plt.title('Factor Importance by Eigenvalues')
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid()

    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    plt.close()
    return 'data:image/png;base64,' + encoded
def plotfig(cols):
    """Build a Plotly scree plot from columns 2-6 of the module-level ``df1``.

    :param cols: accepted for interface compatibility; not used by the body
    :return: the configured Plotly figure (eigenvalue against factor index)
    """
    xa = df1[df1.columns[2:7]]

    fa = FactorAnalyzer()
    # The former second positional argument (10) landed in fit()'s ``y``
    # slot, which the analyzer does not use, so it is no longer passed.
    fa.fit(xa)

    # Only the first eigenvalue array is plotted.
    ev, _ = fa.get_eigenvalues()

    fig = px.scatter(x=range(1, xa.shape[1] + 1), y=ev)
    fig.update_traces(mode='lines+markers')
    fig.update_layout(yaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(xaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(width=700, height=200, plot_bgcolor='rgb(255,255,255)')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#dddddd')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig['layout'].update(margin=dict(l=0, r=20, b=20, t=10))
    fig.update_traces(line=dict(color="#0863ae"))
    fig.update_layout(xaxis_title="X",
                      yaxis_title="Y",
                      legend_title="Factor Analysis",
                      font=dict(family="Courier New, monospace",
                                size=12,
                                color="black"))
    return fig
def factor_analysis(self, input_x, factor_number=9):
    """Fit an oblimin-rotated factor model and return the factor scores.

    The input is standardised with ``StandardScaler`` before fitting. For
    every factor, the variables whose loading exceeds 0.5 are printed in
    ascending order of loading, then a scree plot of the eigenvalues is shown.

    :param input_x: 2-D data; ``input_x.shape[1]`` sets the scree-plot x-axis
    :param factor_number: number of factors to extract (default 9, the value
        that used to be hard-coded)
    :return: the factor scores of the standardised input
    """
    ss_x = StandardScaler().fit_transform(input_x)

    # Alternatives to oblimin: promax (oblique), varimax (orthogonal).
    fa = FactorAnalyzer(n_factors=factor_number, rotation='oblimin')
    fa.fit(ss_x)
    ev, _ = fa.get_eigenvalues()
    factor_loading_matrix = fa.loadings_
    fa_score = fa.transform(ss_x)
    print('ev', ev)

    # Variable names: every column of the table except the first one.
    fa_name = list(self.table_data.columns[1:])
    for i in range(factor_number):
        column = factor_loading_matrix[:, i]
        print('factor_i', i)
        # A single argsort yields both the ascending order and the indices.
        for index in np.argsort(column):
            if column[index] > 0.5:
                print('coefficients_index', index, fa_name[index])

    positions = range(1, input_x.shape[1] + 1)
    plt.scatter(positions, ev)
    plt.plot(positions, ev)
    plt.title('scree figure')
    plt.ylabel('eigenvalues')
    plt.grid()
    plt.show()
    return fa_score
def make_loadings_matrix(rating_m):
    """Take a rating matrix and return its loading matrix.

    An initial 10-factor model with oblimin rotation is fitted, and the knee
    of the second eigenvalue array from ``get_eigenvalues()`` picks the
    number of factors. The returned loadings come from a second model with
    that many factors and varimax rotation, rounded to two decimals.

    :param rating_m: rating matrix to analyse
    :return: DataFrame indexed by construct name (index named ``Construct``)
        with one ``Factor k (p%)`` column per factor, where ``p`` is taken
        from the second row of ``get_factor_variance()`` times 100
    :raises ValueError: if no knee can be located in the eigenvalues
    """
    # Fit the initial factor analysis.
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)
    fa_eigens = fa.get_eigenvalues()[1]

    # One x position per eigenvalue; the axis used to be fixed at 15 points,
    # which only worked for matrices with exactly 15 variables.
    x = list(range(1, len(fa_eigens) + 1))
    fa_knee = KneeLocator(x,
                          fa_eigens,
                          S=1.0,
                          curve='convex',
                          direction='decreasing').knee
    if fa_knee is None:
        raise ValueError(
            'could not locate a knee in the eigenvalues; '
            'the number of factors cannot be chosen automatically')

    fa_kneed = FactorAnalyzer(n_factors=fa_knee, rotation='varimax').fit(rating_m)
    # Computed once instead of once per column label.
    proportion_var = fa_kneed.get_factor_variance()[1]

    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(i + 1, proportion_var[i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
def def_factor_analysis(X, k, rotation_=None):
    """Fit a ``k``-factor model on ``X`` and return its summary.

    :param X: data to fit
    :param k: number of factors
    :param rotation_: rotation passed to ``FactorAnalyzer`` (default ``None``)
    :return: tuple ``(eigen, loadings, factor_variance)`` where ``eigen`` is
        the full result of ``get_eigenvalues()``
    """
    fitted = FactorAnalyzer(n_factors=k, rotation=rotation_).fit(X)
    return (
        fitted.get_eigenvalues(),
        fitted.loadings_,
        fitted.get_factor_variance(),
    )
def FA(self):
    """Fit a one-factor principal/varimax model on ``self.df``.

    Prints the eigenvalues and the loadings, stores the loadings in
    ``self.coeff`` and returns 0.
    """
    analyzer = FactorAnalyzer(n_factors=1,
                              method="principal",
                              rotation="varimax")
    analyzer.fit(self.df)

    # Eigenvalues first, then the loadings.
    eigenvalues, _ = analyzer.get_eigenvalues()
    print(eigenvalues)
    print(analyzer.loadings_)

    self.coeff = analyzer.loadings_
    return 0
def best_num_factors(df):
    """Return how many eigenvalues exceed mean + 2 standard deviations.

    A 10-factor, unrotated ``minres`` model is fitted on ``df``; the count is
    taken over the first eigenvalue array returned by the analyzer.
    """
    analyzer = FactorAnalyzer(
        bounds=(0.005, 1),
        impute='median',
        is_corr_matrix=False,
        method='minres',
        n_factors=10,
        rotation=None,
        rotation_kwargs={},
        use_smc=True,
    )
    analyzer.fit(df)
    eigenvalues, _ = analyzer.get_eigenvalues()

    cutoff = eigenvalues.mean() + 2 * eigenvalues.std()
    return sum(1 for value in eigenvalues if value > cutoff)
def calculate_py_output(test_name, factors, method, rotation, top_dir=None):
    """
    Run `FactorAnalyzer()` on a test CSV and collect its outputs.

    Parameters
    ----------
    test_name : str
        The name of the test; ``<test_name>.csv`` is read from `top_dir`.
    factors : int
        The number of factors.
    method : str
        The fitting method; ``'uls'`` is mapped to ``'minres'``.
    rotation : str
        The type of rotation; ``'none'`` is mapped to ``None``.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        The outputs keyed by ``value``, ``evalues``, ``structure``,
        ``loading``, ``uniquenesses``, ``communalities`` and ``scores``.
    """
    base_dir = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(base_dir, test_name + '.csv'))

    if rotation == 'none':
        rotation = None
    if method == 'uls':
        method = 'minres'

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)
    evalues, values = fa.get_eigenvalues()

    output = dict(
        value=values,
        evalues=evalues,
        structure=fa.structure,
        loading=fa.loadings,
        uniquenesses=fa.get_uniqueness(),
        communalities=fa.get_communalities(),
        scores=fa.get_scores(data),
    )
    return output
def show_num_factors(df):
    """Describe the suggested number of factors for ``df`` as a sentence.

    A 10-factor ``minres`` model with varimax rotation is fitted. The "best"
    count is the number of eigenvalues above mean + 2 standard deviations.
    The "other possible" count is how many more eigenvalues exceed 1.
    """
    analyzer = FactorAnalyzer(
        bounds=(0.005, 1),
        impute='median',
        is_corr_matrix=False,
        method='minres',
        n_factors=10,
        rotation='varimax',
        rotation_kwargs={},
        use_smc=True,
    )
    analyzer.fit(df)
    eigenvalues, _ = analyzer.get_eigenvalues()

    cutoff = eigenvalues.mean() + 2 * eigenvalues.std()
    best = sum(1 for value in eigenvalues if value > cutoff)
    above_one = sum(1 for value in eigenvalues if value > 1)
    return f"Best number of factors: {best}. Other possible factors {above_one - best}"
def FA(observied_variables, name):
    """Run adequacy tests and a factor analysis, and return the loadings.

    Prints the Bartlett sphericity statistic and p-value, the overall KMO
    value, the eigenvalues, the loadings and the factor-variance table.

    :param observied_variables: DataFrame of observed variables; its columns
        label the rows of the returned loadings
    :param name: ``'phone'`` (2 factors) or ``'QOL'`` (4 factors)
    :return: DataFrame of loadings indexed by variable name
    :raises ValueError: if ``name`` is not one of the supported names
        (this used to surface later as an unbound-variable error)
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    from factor_analyzer.factor_analyzer import calculate_kmo

    n_factors_by_name = {'phone': 2, 'QOL': 4}
    if name not in n_factors_by_name:
        raise ValueError(
            "unknown name {!r}; expected 'phone' or 'QOL'".format(name))

    chi_square_value, p_value = calculate_bartlett_sphericity(
        observied_variables)
    print("chi_square_value", chi_square_value, "p-value:", p_value)

    kmo_all, kmo_model = calculate_kmo(observied_variables)
    print("KMO value", kmo_model)

    # Create the factor analysis object and perform the factor analysis.
    # Only the fitted state is used below, so fit() replaces fit_transform().
    fa = FactorAnalyzer(n_factors=n_factors_by_name[name])
    fa.fit(observied_variables)

    # Check eigenvalues.
    eigen_values, _ = fa.get_eigenvalues()
    print(eigen_values)

    # Scree plot, disabled in the original:
    # plt.scatter(range(1, observied_variables.shape[1] + 1), eigen_values)
    # plt.plot(range(1, observied_variables.shape[1] + 1), eigen_values)
    # if name == 'phone':
    #     plt.title('Scree Plot for phone features', fontsize=24)
    # if name == 'QOL':
    #     plt.title('Scree Plot for QOL features', fontsize=24)
    # plt.xlabel('Factors', fontsize=18)
    # plt.ylabel('Eigenvalue', fontsize=18)
    # plt.grid()
    # plt.show()

    loadings_frame = pd.DataFrame(fa.loadings_, observied_variables.columns)
    print(loadings_frame)

    # Variance of each factor. This print used to sit after the return
    # statement and therefore never ran.
    print(
        pd.DataFrame(fa.get_factor_variance(),
                     ['SS Loadings', 'Proportion Var', 'Cumulative Var']))
    return loadings_frame
def FA(self):
    """Print diagnostics and a one-factor principal/varimax fit of ``self.df``.

    Prints the column labels, ``self.mean`` and ``self.sigma``, then the
    eigenvalues and loadings of the fitted model. Always returns 0.
    """
    print(self.df.columns)
    print(self.mean)
    print(self.sigma)

    # Disabled diagnostics, kept for reference:
    # print(self.df)
    # chi_square_value, p_value = calculate_bartlett_sphericity(self.df)
    # print(chi_square_value, p_value)
    #   Bartlett's test: the p-value is 0. The test was statistically
    #   significant, indicating that the observed correlation matrix is not
    #   an identity matrix.
    # kmo_all, kmo_model = calculate_kmo(self.df)
    # print(kmo_model)
    #   The Kaiser-Meyer-Olkin (KMO) test measures the suitability of data
    #   for factor analysis.
    # fa = FactorAnalyzer(n_factors=self.df.shape[1], rotation=None)
    # fa.fit(self.df)
    # ev, v = fa.get_eigenvalues()
    # print(ev)

    analyzer = FactorAnalyzer(n_factors=1,
                              method="principal",
                              rotation="varimax")
    analyzer.fit(self.df)

    eigenvalues, _ = analyzer.get_eigenvalues()
    print(eigenvalues)
    print(analyzer.loadings_)
    # print(analyzer.transform(self.df))

    # Disabled scree plot, kept for reference:
    # plt.scatter(range(1, self.df.shape[1] + 1), eigenvalues)
    # plt.plot(range(1, self.df.shape[1] + 1), eigenvalues)
    # plt.title('Scree Plot')
    # plt.xlabel('Factors')
    # plt.ylabel('Eigenvalue')
    # plt.grid()
    # plt.savefig('{}.png'.format(self.indicatorName))
    # plt.close()
    return 0
class factor_analysis():
    """Exploratory factor analysis on selected columns of a data set.

    ``data[col]`` selects the analysed columns and ``n`` is passed to
    ``FactorAnalyzer`` as the number of factors.
    """

    def __init__(self, data, col, n):
        self.data = data
        self.col = col
        self.n = n

    def test(self):
        """Run Bartlett's sphericity test and the KMO test.

        The four results are stored on the instance and returned as
        ``(chi_square_value, p_value, kmo_all, kmo_model)``.
        """
        self.chi_square_value, self.p_value = calculate_bartlett_sphericity(
            self.data[self.col])
        self.kmo_all, self.kmo_model = calculate_kmo(self.data[self.col])
        return self.chi_square_value, self.p_value, self.kmo_all, self.kmo_model

    def analysis(self):
        """Fit an unrotated ``n``-factor model; returns ``self`` for chaining."""
        self.fa = FactorAnalyzer(self.n, rotation=None)
        self.fa.fit(self.data[self.col])
        return self

    def _plot(self):
        """Show a scree plot, then draw a heatmap of absolute loadings.

        Requires ``analysis()`` to have been called first.
        """
        ev, _ = self.fa.get_eigenvalues()
        positions = range(1, self.data[self.col].shape[1] + 1)
        plt.scatter(positions, ev)
        plt.plot(positions, ev)
        plt.title('Scree Plot')
        plt.xlabel('Factors')
        plt.ylabel('Eigenvalue')
        plt.grid()
        plt.show()

        df_cm = pd.DataFrame(np.abs(self.fa.loadings_), index=self.col)
        plt.figure(figsize=(14, 14))
        ax = sns.heatmap(df_cm, annot=True, cmap="BuPu")
        # Set the font size of the y-axis tick labels.
        ax.yaxis.set_tick_params(labelsize=15)
        plt.title('Factor Analysis', fontsize='xx-large')
        # Set y-axis label.
        # NOTE(review): 'Sepal Width' looks like a leftover from another
        # plot; confirm the intended label.
        plt.ylabel('Sepal Width', fontsize='xx-large')
        # plt.savefig('factorAnalysis.png', dpi=500)

    def _transform(self):
        """Return the factor scores of the selected columns.

        Requires ``analysis()`` to have been called first. The fitted
        analyzer is not callable, so its ``transform`` method is used.
        """
        return self.fa.transform(self.data[self.col])
def KaiserGuttman(self):
    """Return the number of eigenvalues >= 1 and save an eigenvalue plot.

    The model is fitted on ``self.dataset`` with ``self.N_adj`` factors and
    the configured rotation and method. The plot is written to
    ``<self.outd>/eigen_value.<self.fig_ext>`` at the DPI taken from the
    module-level ``args.fig_dpi``, and the path is logged.
    """
    analyzer = FactorAnalyzer(n_factors=self.N_adj,
                              rotation=self.kaiser_guttman_rotation,
                              method=self.kaiser_guttman_method)
    analyzer.fit(self.dataset)
    ev, _ = analyzer.get_eigenvalues()
    n_eigen = ev.shape[0]

    # Draw the eigenvalues together with the y = 1 reference line.
    plt.figure()
    plt.plot()
    plt.plot(range(1, n_eigen + 1), ev, label="eigen value", marker="o")
    plt.axhline(y=1, color='red', linestyle="dotted", label=r"$y=1$")
    plt.legend()
    plt.xlim(0.5, n_eigen + 0.6)
    plt.xlabel("factor")
    plt.ylabel("eigen value")

    outf = "%s/eigen_value.%s" % (self.outd, self.fig_ext)
    plt.savefig(outf, dpi=args.fig_dpi)
    self.logger.info("%s is saved." % outf)

    return int(np.count_nonzero(ev >= 1))
import sklearn.metrics as skm from sklearn.model_selection import train_test_split from factor_analyzer import FactorAnalyzer from factor_analyzer import (ConfirmatoryFactorAnalyzer, ModelSpecificationParser) from factor_analyzer.utils import (corr, impute_values, partial_correlations, smc) data1 = pd.read_csv("https://donatello-telesca.squarespace.com/s/Exposure-t4yx.csv") # Perform Factor Analysis fa = FactorAnalyzer() # fa.set_params(n_factors=6,rotation=None) fa.set_params(n_factors=6,rotation='varimax') fa.fit(data1) # Check factors factor_loadings = fa.loadings_ eigen_values, vectors = fa.get_eigenvalues() communalities = fa.get_communalities() # Create scree plot # plt.scatter(range(1,29),eigen_values) # plt.plot(range(1,29),eigen_values) # plt.title('Scree Plot') # plt.xlabel('Factors') # plt.ylabel('Eigenvalue') # plt.grid() # plt.show() def clump_factor_vars(factor_loadings,factor_num): observed_vars = [] for each in range(len(factor_loadings)): if factor_loadings[each].argmax() == factor_num: observed_vars.append(each)
plt.show() ################################################################################################# '''Week 3''' # Factor Analysis # from factor_analyzer import FactorAnalyzer fact = FactorAnalyzer(n_factors=2, rotation='promax') df_cor = pd.merge(df_cor, df_med) a_data = df_cor[[ 'hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt' ]] fact.fit_transform(a_data) ev, v = fact.get_eigenvalues() plt.plot(ev) plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5']) plt.show() plt.plot(fact.loadings_) plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5']) plt.show() fact.get_communalities() ################################################################################################## '''Week 4''' # Regression Analysis #
MLE, PCA. ''' # Pre-tests ## Bartlett's test. H0: equal variance bartlett_chi, bartlett_p = calculate_bartlett_sphericity( df[vars_tot]) # p = 0.0 ## Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; should be between 0 and 1, but above 0.6 kmo_all, kmo_model = calculate_kmo(df[vars_tot]) #kmo_model = 0.7297 #-------------------------------------------- # Factor Analysis fa = FactorAnalyzer(rotation=None, n_factors=4) fa.fit(df[vars_tot]) ev, v = fa.get_eigenvalues() '''NOTE: First four factors have an eigen value greater than 1. Use those.''' # Perform a parallel analysis list_ev_rand = [] np.random.seed(10) for i in range(100): df_rand = pd.DataFrame(np.random.rand(*df[vars_tot].shape)) fa_rand = FactorAnalyzer(rotation=None, n_factors=4).fit(df_rand) ev_rand, _ = fa_rand.get_eigenvalues() list_ev_rand.append(ev_rand) fig, ax = plt.subplots(figsize=(15, 9)) ax.set(ylabel='Eigen Value', xlabel='Factor') ax.plot(range(1, df_standard.shape[1] + 1), ev, marker='o', label='Factor')
def get_eigenvalues(item_data, items):
    """Fit an unrotated model with ``len(items)`` factors on ``item_data``.

    :return: whatever the fitted analyzer's ``get_eigenvalues()`` returns
    """
    analyzer = FactorAnalyzer(len(items), rotation=None)
    analyzer.fit(item_data)
    eigen_result = analyzer.get_eigenvalues()
    return eigen_result
class Factor_Analyse_select(Feature):
    """Factor analysis feature extraction."""

    data = None
    # NOTE(review): class-level list; it is shared by every instance unless
    # it is rebound elsewhere. Confirm this is intended.
    selected_column = list()
    method = 'minres'

    def set_data(self, data):
        """Store the input data and derive the helper views.

        :param data: the data to process
        :return: None
        """
        self.data = data
        self.full_data = self._check_missing_value()
        self.numeric_data = self.get_numeric_data()

    # Disabled in the original (the code sat inside a bare string literal):
    #
    # def __init__(self, data):
    #     self.set_data(data)
    #
    # def select_column(self, *colnames):
    #     for colname in colnames:
    #         if colname not in self.data.columns.values.tolist():
    #             raise ValueError("所选列不存在")
    #         elif colname not in self.full_data:
    #             raise ValueError("所选列存在缺失值")
    #         elif colname not in self.numeric_data:
    #             raise ValueError("所选列不为数值")
    #         elif colname in self.selected_column:
    #             raise ValueError("this column has been selected")
    #         else:
    #             self.selected_column.append(colname)

    def set_method(self, method=None):
        """Set the fitting method.

        :param method: str, one of "minres", "ml", "principal"; ``None``
            only emits a warning and leaves the current method unchanged
        :return: None
        :raises ValueError: if ``method`` is not one of the allowed values
        """
        if method is None:
            warnings.warn("参数未设置")
        elif method not in ['minres', 'ml', 'principal']:
            raise ValueError("invalid method")
        else:
            self.method = method

    def fit(self):
        """Fit the model on the selected columns with the chosen method.

        One factor per selected column is requested, without rotation. The
        fitted analyzer is stored in ``self.model``.

        :return: the transformed data (all factor vectors)
        """
        feed_data = self.data[self.selected_column]
        self.model = FactorAnalyzer(n_factors=feed_data.shape[1],
                                    method=self.method,
                                    rotation=None)
        self.model.fit(feed_data)
        return self.model.transform(feed_data)

    def _build_result(self, scores, count):
        """Combine every input column with the first ``count`` factor columns."""
        result = pd.DataFrame(scores[:, 0:count])
        result.columns = ["FA " + str(i + 1) for i in range(count)]
        # Insert in reverse at position 0 so the input columns keep their
        # original order and come before the factor columns.
        for name in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=name, value=self.data[[name]])
        return result

    def select_by_number(self, num):
        """Keep the first ``num`` factor vectors.

        :param num: int, number of factor vectors to keep
        :return: pandas.DataFrame holding every input column followed by the
            selected factor columns ``FA 1`` .. ``FA num``
        :raises ValueError: if ``num`` is negative or exceeds the number of
            selected columns
        """
        if num < 0 or num > len(self.selected_column):
            raise ValueError("too many or too less columns are selected")
        return self._build_result(self.fit(), num)

    def select_by_eig_GE_1(self):
        """Keep the leading factors whose eigenvalue is at least 1.

        Counting stops at the first eigenvalue below 1.

        :return: pandas.DataFrame holding every input column followed by the
            selected factor columns
        """
        # Fit first: the eigenvalues used to be read from ``self.model``
        # before fit() had created it (or from a stale earlier fit).
        scores = self.fit()
        eigenvalues = self.model.get_eigenvalues()[0]

        count = 0
        for value in eigenvalues:
            if value < 1:
                break
            count += 1
        return self._build_result(scores, count)

    def _select_by(self, **type_arg):
        """Run the factor analysis described by the control dictionary.

        :param type_arg: control dictionary
            "method" (optional): fitting method, "minres" (minimum residual,
                the default), "ml" (maximum likelihood) or "principal"
            "columns": names passed on to ``select_column``
            "type" == 0: select by count; "typearg" is the number of
                factors to keep
            "type" == 1: select the factors with eigenvalue of at least 1
        :return: pandas.DataFrame holding every input column followed by the
            selected factor columns
        :raises ValueError: if "type" is neither 0 nor 1
        """
        if "method" in type_arg:
            self.set_method(type_arg["method"])

        if type_arg["type"] == 0:
            self.select_column(*type_arg["columns"])
            return self.select_by_number(type_arg["typearg"])
        elif type_arg["type"] == 1:
            self.select_column(*type_arg["columns"])
            return self.select_by_eig_GE_1()
        else:
            raise ValueError("type error:不存在所选类")
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        svd_method='randomized',
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Run `FactorAnalyzer()` on a test CSV and collect its outputs.

    Parameters
    ----------
    test_name : str
        The name of the test; ``<test_name>.csv`` is read from `top_dir`.
    factors : int
        The number of factors.
    method : str
        The fitting method; ``'uls'`` is mapped to ``'minres'``.
    rotation : str
        The type of rotation; ``'none'`` is mapped to ``None``.
    svd_method : str, optional
        The SVD method to use.
        Defaults to 'randomized'.
    use_corr_matrix : bool, optional
        Whether to fit on the correlation matrix of the data instead of a
        copy of the data itself.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        The outputs keyed by ``value``, ``evalues``, ``structure``,
        ``loading``, ``uniquenesses``, ``communalities`` and ``scores``.
        The scores are always computed from the raw data.
    """
    base_dir = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(base_dir, test_name + '.csv'))

    X = data.corr() if use_corr_matrix else data.copy()

    if rotation == 'none':
        rotation = None
    if method == 'uls':
        method = 'minres'

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        svd_method=svd_method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)
    evalues, values = fa.get_eigenvalues()

    output = dict(
        value=values,
        evalues=evalues,
        structure=fa.structure_,
        loading=fa.loadings_,
        uniquenesses=fa.get_uniquenesses(),
        communalities=fa.get_communalities(),
        scores=fa.transform(data),
    )
    return output
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):
    """
    Fit a minres factor model on ``df`` and render its diagnostic plots.

    Rotation guidance:
    "varimax" gives orthogonal (highly differentiable) factors with very
    high and very low variable loadings; this is the common choice.
    "oblimin" gives non-orthogonal loadings; it increases eigenvalues but
    reduces interpretability.
    "promax" is the choice when you want oblimin on large datasets.
    See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for a
    fuller explanation.

    Returns the factor scores of ``df`` when ``transform`` is truthy,
    otherwise ``None``.
    """
    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"

    def data_suitable(df, kmo_value = False, ignore = False):
        # Bartlett's test checks that the data is not an identity matrix.
        chi_square_value, p_value = calculate_bartlett_sphericity(df)
        # KMO checks that the observed data is adequate for FA; must be > 0.6.
        kmo_all, kmo_model = calculate_kmo(df)
        unsuitable = p_value > 0.1 or kmo_model < 0.6
        if unsuitable and ignore != True:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}. KMO model Score: {}".format(p_value, kmo_model))
        return kmo_model if kmo_value else None

    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres", rotation = rotation, n_factors = n_factors)
    fa.fit(df)

    def eigenplot(df):
        frame = pd.DataFrame(df)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(x = frame.index.values, y = frame[0].values, mode = 'lines')
        )
        # Dashed red reference line at eigenvalue 1.
        fig.add_shape(
            type = "line",
            x0 = 0, y0 = 1,
            x1 = len(frame), y1 = 1,
            line = dict(color = 'red', dash = 'dash')
        )
        # The x-axis ends at the last index whose eigenvalue is positive.
        last_positive = frame[frame[0] > 0].index.values[-1]
        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title = "Eigenvalue",
            xaxis_title = "Factor",
            xaxis = dict(range = [0, last_positive])
        )
        fig.show()

    eigenplot(fa.get_eigenvalues()[1])

    column_names = list(df.columns)
    Plotting.LabeledHeatmap(fa.loadings_, y = column_names, title = "Factor Loading", expand = True, height = 2000, width = 2000)

    variance_table = pd.DataFrame(fa.get_factor_variance()[1:])
    variance_table.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(variance_table)

    if rotation == 'promax':
        # NOTE(review): both heatmaps are assumed to belong to the promax
        # branch; confirm against the original indentation.
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation", expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns), title = "Variable-Factor Correlation", expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T, title = "Varience Explained", x = list(df.columns), description = "The proportion of each variables varience that can be explained by the factors.", expand = True, height = 300, width = 2000)
    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T, title = "Variable Uniqueness", x = list(df.columns), expand = True, height = 300, width = 2000)

    if not transform:
        return
    return fa.transform(df)
# Teste da Esferacidade de Bartlett!! chi_square_value, p_value = calculate_bartlett_sphericity(data) print(chi_square_value, p_value) # p_value = 0, entao podemos proceder com a analise de fatores pois quanto menor esse valor, mais confianca temos nas predicoes # Teste de Kiaser-Meyer-Olkin(KMO) kmo_all, kmo_model = calculate_kmo(data) print(kmo_model) # Resultado 0.84, entao podemos proceder com a nossa analise de fatores # Criamos um objeto analise de fatores sem rotacao analisador_sem_rotacao = FactorAnalyzer(n_factors=20, rotation=None) analisador_sem_rotacao.fit(data) # Aqui estamos checando os nossos autovalores autovalores, v = analisador_sem_rotacao.get_eigenvalues() print(autovalores) # Criamos o Grafico Scree para observar quais autovalores sao maiores que 1, neste caso usaremos 6 fatores plt.scatter(range(1, data.shape[1] + 1), autovalores) plt.plot(range(1, data.shape[1] + 1), autovalores) plt.title('Scree Plot') plt.xlabel('Factors') plt.ylabel('Eigenvalue') plt.grid() plt.show() # Criamos um objeto analise de faotres com rotacao varimax analisador_varimax = FactorAnalyzer(n_factors=5, rotation="varimax") analisador_varimax.fit(data)
VarbList = df.columns chi_square_value, p_value = calculate_bartlett_sphericity(X) chi_square_value, p_value # --> p Value = 0 that mean the test was statistically significant, the obvserved correlation matrix is not an identy matrix # Kaiser_Meyer_Olkin Test kmo_all, kmo_model = calculate_kmo(X) kmo_model # --> KMO value of 0.653 indicates a moderate suitableity for factory analysis ' Source Cureton, E. E./ D'Agostino, R. B. 1983: Factor analysis: an applied approach. Hillside, NJ: Lawrence Erlbaum Associates, S. 389 f. # Choosing Number of Factors # Create factor analysis object and perform factor analysis fa = FactorAnalyzer(rotation=None, n_factors=30) fa.fit(X) # Check Eigenvalues ev, v = fa.get_eigenvalues() ev # --> only 30 Eigenvalues greater than 1 , so only choose them ? # Create scree plot g = plt.scatter(range(1, X.shape[1] + 1), ev) #g = plt.plot(range(1,X.shape[1]+1),ev) plt.title('Scree Plot') plt.xlabel('Factors') plt.ylabel('Eigenvalue') plt.grid() plt.show() figure = g.get_figure() figure.savefig('Scree_plot.pdf', dpi=400)
# I know the scores turn out in this order.... pca_names = ['STM', 'reasoning', 'verbal'] # Build and collect dataframes that will be used for figures and table # generation. First, the loadings. loadings = pd.DataFrame(Ypca.loadings_, index=cbs.test_names(), columns=pca_names) # Pairwise correlations between test scores var_corrs = pd.DataFrame(Ypca.corr_, index=cbs.test_names(), columns=cbs.test_names()) # Eigenvalues of the components eigen_values = pd.DataFrame(Ypca.get_eigenvalues()[0][0:3], index=pca_names, columns=['eigenvalues']).T # Percentage variabnce explained by each component pct_variance = pd.DataFrame(Ypca.get_factor_variance()[1] * 100, index=pca_names, columns=['% variance']).T # Generates and displays the chord plot to visualize the factors fig = chord_plot(loadings.copy(), var_corrs.copy(), cscale_name='Picnic', width=700, height=350, threshold=0.20)
def run_sampling_adequacy_app():
    """Streamlit page: sampling adequacy (KMO) and a 3-factor promax solution.

    Shows explanatory text, offers the bundled sample CSV for download and
    accepts an uploaded CSV (the sample data is used when nothing is
    uploaded). Rows with missing values and the ``student`` column are
    dropped. The page then displays the overall KMO value, the correlation
    table, a scree plot and the promax loadings. Any failure during this
    processing is reported on the page as a data-format error, and the
    underlying exception is printed to the console.
    """
    st.header('■Measure of Sampling Adequacy')
    st.write(
        'To investigate the adequay of the number of samples for questionnaire.Kaiser-Meyer-Olkin (KMO) Test is used. '
    )
    st.write('KMO values between 0.8 and 1 indicate the sampling is adequate.')
    st.write(
        'KMO values less than 0.6 indicate the sampling is not adequate and that remedial action should be taken. '
    )
    st.write(
        'Some authors put this value at 0.5, so use your own judgment for values between 0.5 and 0.6.'
    )

    st.sidebar.subheader('Data upload')
    df_edu = pd.read_csv("data/eng_sample_data_sampling.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        # DataFrames are serialised to CSV text before being embedded.
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
        b64 = base64.b64encode(object_to_download.encode()).decode()
        return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_sampling.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    try:
        uploaded_file = st.sidebar.file_uploader(
            "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
            type=["csv"])
        # uploaded_file = st.file_uploader(
        #     label = 'File Upload(Drag and drop csv/Excel file)',
        #     type = ['csv', 'xlsx']
        # )
        # NOTE(review): the branch nesting below was reconstructed from
        # flattened source; confirm it against the original layout.
        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')
            if display_data:
                st.dataframe(df_edu)
        else:
            # df_edu already holds the sample data loaded above, so the
            # file is not read a second time.
            show_df = st.sidebar.checkbox('Show DataFrame')
            if show_df == True:
                st.write(df_edu)

        df_edu = df_edu.dropna()
        df_edu = df_edu.drop(['student'], axis=1)

        from factor_analyzer.factor_analyzer import calculate_kmo
        kmo_all, kmo_model = calculate_kmo(df_edu)
        st.write('## KMO value:', kmo_model.round(2))

        st.subheader('Data overview (correlation coefficient)')
        st.write(df_edu.corr().style.background_gradient(cmap='coolwarm'))

        fa = FactorAnalyzer()
        fa.fit(df_edu)
        ev, _ = fa.get_eigenvalues()

        st.set_option('deprecation.showPyplotGlobalUse', False)
        positions = range(1, df_edu.shape[1] + 1)
        plt.figure(figsize=(7, 5))
        plt.scatter(positions, ev)
        plt.plot(positions, ev)
        plt.title('Scree Plot')
        plt.xlabel('Factors')
        plt.ylabel('Eigenvalue')
        plt.grid()
        st.pyplot()

        fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
        fa.fit(df_edu)
        df_result = pd.DataFrame(fa.loadings_, columns=['1st', '2nd', '3rd'])
        df_result.index = df_edu.columns
        cm = sns.light_palette('blue', as_cmap=True)
        df_factor = df_result.style.background_gradient(cmap=cm)
        st.write(df_factor)
    except Exception as e:
        # Page-level boundary: keep the app alive, but do not lose the cause.
        st.header(
            'ERROR: Data inconsistency. Check data format to be uploaded.')
        print('Data inconsistency error')
        print(repr(e))
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0) plt.style.use('ggplot') ####### Setup the dataset df = pd.read_csv("dims_new.csv", delimiter=',', header=0) #df.drop(['Unnamed: 0','gender', 'education', 'age'],axis=1,inplace=True) # Dropping missing values rows #df.dropna(inplace=True) df.head() ####### Find eigenvalues fa = FactorAnalyzer(rotation=None, n_factors=17) fa = fa.fit(df) # used instead of "fa.analyze(df, 17, rotation=None)" ev, v = fa.get_eigenvalues() # Check Eigenvalues ####### Plot eigenvalues plt.scatter(range(1, df.shape[1] + 1), ev) plt.plot(range(1, df.shape[1] + 1), ev) plt.title('Scree Plot') plt.xlabel('Factors') plt.ylabel('Eigenvalue') plt.axhline(y=1, c='k') ##### Eigenvalues suggest that 6 dimensions is a good fit def loadThem(rotation, factors): fa = FactorAnalyzer(rotation=rotation, n_factors=factors) fa = fa.fit(df.values) loadings = fa.loadings_