def factor_analysis_perdomain(self, write=False):
    '''
    Run a varimax-rotated factor analysis separately for every domain.

    For each domain in ``self.domains`` the domain's data (from
    ``self.load_domains()``) is fitted with a ``FactorAnalyzer`` using
    ``self.n`` factors. The communality of every column is paired with its
    feature name and the pairs are ranked in decreasing order.

    Args:
        write: When truthy, write the ranked (Feature, Importance) pairs of
               each domain to a CSV file under ``output/``. The file name
               starts with ``fa_decorrelated_`` when ``self.DC`` is truthy
               and with ``fa_`` otherwise.

    Returns:
        None
    '''
    # load domains into domain data
    domain_data = self.load_domains()

    # Column labels of the full data set, in file order. The position of a
    # domain column in this list is used to look up its feature name.
    true_num = list(pd.read_csv(self.data, index_col=0))

    for d in self.domains:
        inp = domain_data[d]

        # Positions of this domain's columns within the full data set.
        index_num = [true_num.index(c) for c in list(inp)]
        column_name = list(np.array(self.extract_features())[index_num])

        fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')

        # Fitting does not succeed for every domain; skip the ones that
        # fail. Catch Exception instead of using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            fa.fit(inp)
        except Exception:
            print(
                'Data from ' + str(d) +
                ' domain cannot be factorized as it results in a singular matrix.'
            )
            continue

        magnitude = fa.get_communalities()
        mag_dict = {
            name: magnitude[i] for i, name in enumerate(column_name)
        }
        sorted_mag = sorted(
            mag_dict.items(), key=lambda kv: kv[1], reverse=True)

        if write:
            factors = pd.DataFrame(
                sorted_mag, columns=['Feature', 'Importance'])
            prefix = 'output/fa_decorrelated_' if self.DC else 'output/fa_'
            factors.to_csv(
                prefix + str(d) + '_' + str(self.VER) + '.csv', index=False)
def FactorAnalyze(self, rotate="varimax"):
    """Fit a three-factor model to every top-level column group.

    Two attributes are (re)built from ``self.NormSubThresh``:

    * ``self.SharedVariance`` -- the first row of a deep copy, with its
      index renamed to "Shared Variance"; each column group is overwritten
      with the communalities of that group's fit.
    * ``self.FactorLoadings`` -- the first three rows of a deep copy, with
      its index renamed to "Factor Number"; each column group is
      overwritten with the transposed loadings of that group's fit.

    Each group's values are standardised with ``StandardScaler`` before
    fitting.

    rotate: rotation passed through to ``FactorAnalyzer``.
    """
    shared = copy.deepcopy(self.NormSubThresh).iloc[:1]
    shared.index.rename("Shared Variance", inplace=True)
    self.SharedVariance = shared

    loadings = copy.deepcopy(self.NormSubThresh).iloc[:3]
    loadings.index.rename("Factor Number", inplace=True)
    self.FactorLoadings = loadings

    for key in self.SharedVariance.columns.unique(level=0):
        analyzer = FactorAnalyzer(n_factors=3, rotation=rotate)
        scaler = sklearn.preprocessing.StandardScaler()
        analyzer.fit(scaler.fit_transform(self.NormSubThresh[key].values))

        # One row of communalities, one row per factor for the loadings.
        self.SharedVariance[key] = np.atleast_2d(
            analyzer.get_communalities())
        self.FactorLoadings[key] = analyzer.loadings_.T
def calculate_py_output(test_name, factors, method, rotation, top_dir=None):
    """
    Run `FactorAnalyzer()` on one test data set and collect its outputs.

    Parameters
    ----------
    test_name : str
        The name of the test; ``<test_name>.csv`` is read from `top_dir`.
    factors : int
        The number of factors.
    method : str
        The fitting method; 'uls' is translated to 'minres'.
    rotation : str
        The type of rotation; the string 'none' means no rotation.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        A dictionary with the keys 'value', 'evalues', 'structure',
        'loading', 'uniquenesses', 'communalities' and 'scores'.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    data = pd.read_csv(join(top_dir, test_name + '.csv'))

    if rotation == 'none':
        rotation = None
    method_aliases = {'uls': 'minres'}
    method = method_aliases.get(method, method)

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)

    evalues, values = fa.get_eigenvalues()

    output = {'value': values, 'evalues': evalues}
    output['structure'] = fa.structure
    output['loading'] = fa.loadings
    output['uniquenesses'] = fa.get_uniqueness()
    output['communalities'] = fa.get_communalities()
    output['scores'] = fa.get_scores(data)
    return output
def factor_analysis(self, *x_columns: str, n_factor: int = None) -> dict:
    """Run a principal-method factor analysis on the selected columns.

    :param x_columns: names of the columns that hold the x variables
    :param n_factor: number of common factors; when omitted the
        FactorAnalyzer default is used
    :return: dict with the communalities, the component matrix (loadings)
        and the factor variance, all converted to plain lists
    """
    X_data = pd.DataFrame(self.data, columns=list(x_columns))

    # Only pass n_factors when the caller asked for a specific count.
    options = {"method": "principal"}
    if n_factor is not None:
        options["n_factors"] = n_factor
    fa = FactorAnalyzer(**options)
    fa.fit(X_data)

    return {
        'communalities': fa.get_communalities().tolist(),
        'component_matrix': fa.loadings_.tolist(),
        'factor_variance': [
            arr.tolist() for arr in fa.get_factor_variance()
        ],
    }
def run(self, dfx, n_factors=3):
    """Run an unrotated factor analysis on the numeric columns of ``dfx``.

    ``ParseDFtypes`` supplies two column lists; the first is treated as the
    numeric columns and the second as the non-numeric ones, which are
    ignored with a warning.

    Returns a dict with:
      * 'result' -- loadings joined with the communalities (an empty
        DataFrame when there is no numeric column),
      * 'msg'    -- dict that may hold an 'error' or a 'warning' text,
      * 'factor' -- the factor scores (absent in the error case).
    """
    self.n_factors = n_factors
    msg = {}
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

    # Nothing to analyse: report the problem and stop early.
    if x_numer_cols == []:
        error_text = 'All input dfx are no numeric columns, Please check your input dfx data!'
        logging.error(error_text)
        msg['error'] = error_text
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        warning_text = 'input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols
        logging.warning(warning_text)
        msg['warning'] = warning_text

    dfu = dfx[x_numer_cols]
    fa = FactorAnalyzer()
    fa.analyze(dfu, n_factors, rotation=None)

    loadings = fa.loadings
    communalities = fa.get_communalities()
    scores = fa.get_scores(dfu)

    factor_numbers = range(1, n_factors + 1)
    loadings.columns = ['因子%s荷载系数' % k for k in factor_numbers]
    communalities.columns = ['共同度']
    scores.columns = ['因子%s' % k for k in factor_numbers]

    return {
        'result': loadings.join(communalities),
        'msg': msg,
        'factor': scores,
    }
def factor_analysis(self, write=False):
    '''
    Rank the features by their communality in a varimax factor analysis.

    Args:
        write: When truthy, write the ranked (Feature, Importance) pairs to
               ``self.fa_file`` as CSV.

    Returns:
        inp: pandas DataFrame that contains the original data
        sorted_mag: list of (feature, importance) tuples in decreasing
                    order of importance
    '''
    inp = pd.read_csv(self.data, index_col=0)

    # Fit the factor model on the raw input.
    fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')
    fa.fit(inp)

    # The communalities (not the raw loadings) serve as feature importance.
    magnitude = fa.get_communalities()

    # Pair every feature name with the communality at the same position.
    feat = self.extract_features()
    mag_dict = {f: magnitude[t] for t, f in enumerate(feat)}
    sorted_mag = sorted(
        mag_dict.items(), key=lambda kv: kv[1], reverse=True)

    # Writes the output of factor analysis to a file
    if write:
        factors = pd.DataFrame(sorted_mag, columns=['Feature', 'Importance'])
        factors.to_csv(self.fa_file, index=False)

    return inp, sorted_mag
for i in range(m): data_new.iloc[i, j] = (coef_var[j]/sum(coef_var))*data_normal.iloc[i, j] # data_new即是处理后的股票数据 # print(data_new) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) # 建立模型 fa = FactorAnalyzer(rotation='varimax', n_factors=12) # 固定公共因子个数为5 fa.fit(data_new) print("公因子方差:\n", fa.get_communalities()) # 公因子方差 matrix_orth = fa.loadings_ print("\n成分矩阵\n", matrix_orth) var = fa.get_factor_variance() # 给出贡献率 print("\n解释的总方差(即贡献率):\n", var) # 分别取两位小数 print("\n特征值:\n", list(map(lambda x: round(x, 4), var[0]))) print("\n因子贡献率:\n", list(map(lambda x: round(x, 4), var[1]))) print("\n累计贡献率:\n", list(map(lambda x: round(x, 4), var[2]))) # 设置数据框的最大行、最大列和不换行(针对数据框) pd.set_option('display.max_rows', 10) pd.set_option('display.max_columns', 10) pd.set_option('expand_frame_repr', False) # 将数据类型转换为数据框 data22 = pd.DataFrame(data)
from sklearn.model_selection import train_test_split
from factor_analyzer import FactorAnalyzer
from factor_analyzer import (ConfirmatoryFactorAnalyzer,
                             ModelSpecificationParser)
from factor_analyzer.utils import (corr, impute_values,
                                   partial_correlations, smc)

data1 = pd.read_csv("https://donatello-telesca.squarespace.com/s/Exposure-t4yx.csv")

# Fit a six-factor model with varimax rotation.
fa = FactorAnalyzer()
# fa.set_params(n_factors=6,rotation=None)
fa.set_params(n_factors=6, rotation='varimax')
fa.fit(data1)

# Quantities used to inspect the fitted factors.
# NOTE(review): FactorAnalyzer.get_eigenvalues is documented as returning
# two eigenvalue arrays (original and common-factor), so `vectors` is
# presumably not a set of eigenvectors -- confirm before relying on it.
factor_loadings = fa.loadings_
eigen_values, vectors = fa.get_eigenvalues()
communalities = fa.get_communalities()

# Scree plot (disabled)
# plt.scatter(range(1,29),eigen_values)
# plt.plot(range(1,29),eigen_values)
# plt.title('Scree Plot')
# plt.xlabel('Factors')
# plt.ylabel('Eigenvalue')
# plt.grid()
# plt.show()


def clump_factor_vars(factor_loadings,factor_num):
    """Return the positions whose largest loading is on ``factor_num``.

    Entry ``k`` of ``factor_loadings`` is selected when the ``argmax`` of
    ``factor_loadings[k]`` equals ``factor_num``.
    """
    return [
        position
        for position in range(len(factor_loadings))
        if factor_loadings[position].argmax() == factor_num
    ]
import datetime as dt
from dateutil import relativedelta
from factor_analyzer import FactorAnalyzer

# Input: ticker symbols and a look-back window of 365*7 days ending now.
symbols = ['AAPL', 'MSFT', 'AMD', 'NVDA']
start = dt.datetime.now() - dt.timedelta(days=365 * 7)
end = dt.datetime.now()

# One column of 'Adj Close' prices per symbol.
df = pd.DataFrame()
for s in symbols:
    history = yf.download(s, start, end)
    df[s] = history['Adj Close']

# Unrotated factor analysis of the price series.
fa = FactorAnalyzer(rotation=None)
fa.fit(df)
print(fa.get_communalities())

# Plot the first array returned by get_eigenvalues, one point per column.
ev, v = fa.get_eigenvalues()
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Factor Analysis')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# Bartlett's test of sphericity on the same data.
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(df)
print(chi_square_value, p_value)
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):
    """
    Fit a minres factor model to ``df`` and display diagnostic plots.

    Choosing a rotation:
      * "varimax" for orthogonal (highly differentiable) factors with very
        high and very low variable loadings. This is the common choice.
      * "oblimin" for non-orthogonal loadings. Increases eigenvalues but
        reduces interpretability.
      * "promax" if you want oblimin-like behaviour on large data sets.
    See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for a
    fuller explanation.

    Parameters:
        df: DataFrame of observations; must not contain NaN or inf and
            every column must have a non-zero standard deviation.
        rotation: rotation passed to ``FactorAnalyzer``.
        n_factors: number of factors passed to ``FactorAnalyzer``.
        transform: when truthy, return ``fa.transform(df)``.

    Returns:
        The transformed data when ``transform`` is truthy, otherwise None.

    Raises:
        AssertionError: if ``df`` contains NaN/inf or a constant column.
        Exception: if Bartlett's test p-value is above 0.1 or the KMO model
            score is below 0.6.
    """
    # Explicit raises (same exception type as the former asserts) so the
    # validation is not stripped when Python runs with -O.
    has_nan = df.isnull().values.any()
    has_inf = df.isin([float("inf"), float("-inf")]).values.any()
    if has_nan or has_inf:
        raise AssertionError("Data must not contain any nan or inf values")
    if not all(df.std().values > 0):
        raise AssertionError("Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)")

    def data_suitable(df, kmo_value = False, ignore = False):
        # Test to ensure the data is not an identity matrix.
        _, p_value = calculate_bartlett_sphericity(df)

        # Test to ensure the observed data is adequate for FA.
        # Must be > 0.6.
        _, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and ignore != True:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}. \nKMO model Score: {}".format(p_value, kmo_model))

        if kmo_value:
            return kmo_model
        return

    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres",
                        rotation = rotation,
                        n_factors = n_factors)
    fa.fit(df)

    def eigenplot(df):
        # Line plot of the eigenvalues with a dashed reference line at 1;
        # the x axis stops at the last positive eigenvalue.
        df = pd.DataFrame(df)

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x = df.index.values,
                y = df[0].values,
                mode = 'lines'
            )
        )
        fig.add_shape(
            type = "line",
            y0 = 1,
            x0 = 0,
            y1 = 1,
            x1 = len(df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )
        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis = dict(
                range = [0,df[df[0] > 0].index.values[-1]]
            )
        )
        fig.show()
        return

    eigenplot(fa.get_eigenvalues()[1])

    Plotting.LabeledHeatmap(fa.loadings_,
                            y = list(df.columns),
                            title = "Factor Loading",
                            expand = True,
                            height = 2000,
                            width = 2000)

    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(tmp)

    # phi_ and structure_ are only plotted for the promax rotation.
    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_,
                                title = "Factor Correlation",
                                expand = True,
                                height = 2000,
                                width = 2000)
        Plotting.LabeledHeatmap(fa.structure_,
                                y = list(df.columns),
                                title = "Variable-Factor Correlation",
                                expand = True,
                                height = 2000,
                                width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title = "Varience Explained",
                            x = list(df.columns),
                            description = "The proportion of each variables varience that can be explained by the factors.",
                            expand = True,
                            height = 300,
                            width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True,
                            height = 300,
                            width = 2000)

    if transform:
        return fa.transform(df)

    return
# Factor analysis on five selected columns of df_cor.
a_data = df_cor[[
    'hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'
]]
fact.fit_transform(a_data)

# One tick per selected column, labelled 1..n, derived from the column
# count so the labels stay in step with the selection above.
tick_positions = range(len(a_data.columns))
tick_labels = [str(position + 1) for position in tick_positions]

ev, v = fact.get_eigenvalues()
plt.plot(ev)
plt.xticks(tick_positions, labels=tick_labels)
plt.show()

plt.plot(fact.loadings_)
plt.xticks(tick_positions, labels=tick_labels)
plt.show()

fact.get_communalities()

##################################################################################################
# Week 4

# Regression Analysis #

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Take an explicit copy: df_reg gets a new column below, and assigning into
# a bare column selection of df_cor raises SettingWithCopyWarning.
df_reg = df_cor[['area_code', 'type', 'pop', 'family_mean', 'second_mortgage',
                 'bad_debt', 'pct_own', 'median_age', 'hc_mortgage_mean',
                 'rent_mean']].copy()
df_reg['density'] = df_cor['pop'] / df_cor['ALand']

le = LabelEncoder()
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Fit factor models with an increasing number of factors and collect
    summary statistics for each, to judge whether extraction is effective.

    For every factor count from 2 up to (but excluding) the limit, an
    unrotated ``FactorAnalyzer`` is fitted on a random sample of the rows.
    The search stops early once the cumulative variance exceeds 0.95 and
    the KMO total exceeds 0.6.

    :param factor_df: DataFrame holding the candidate factors as columns
    :param max_feature_count: exclusive upper bound on the factor count;
        defaults to the smaller of ``factor_df.shape[1] // 3`` and 50
    :param plot: when truthy, plot the collected statistics (skipped when
        nothing was collected)
    :return: dict keyed by factor count; each value holds the fitted
        "FactorAnalyzer", the final "Cumulative variance" and
        "KOM_Test_total"
    :raises LinAlgError: when every sampling attempt for a factor count
        fails to fit
    """
    ana_dic = {}
    if max_feature_count is None:
        max_feature_count = [factor_df.shape[1] // 3, 50]
    max_feature_count = int(np.min(max_feature_count))

    for n_features in range(2, max_feature_count):
        logger.info(f"{n_features} 个因子时:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)

        exception = None
        # Try successively smaller samples: 8/9 of the rows, then 7/8, ...
        # down to 1/2.
        # NOTE(review): the original had a branch for attempt == 0 that
        # would have used the full data set, but the range never yields 0,
        # so the fit has always run on a sample. Behaviour is kept;
        # confirm whether a full-data attempt was intended.
        for attempt in range(8, 0, -1):
            df = factor_df.sample(
                factor_df.shape[0] // (attempt + 1) * attempt)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("当前矩阵 %s 存在可逆矩阵,尝试进行 %d/(%d+1) 重新采样",
                                 df.shape, attempt, attempt)
        else:
            # Every attempt failed: surface the last error.
            raise exception

        communalities = fa.get_communalities()
        logger.info(f"\t共因子方差比(communality)({communalities.shape})")  # communalities
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        logger.info(f"\t成分矩阵,即:因子载荷(loading)({loadings.shape})")  # component matrix
        # logger.debug('\n%s', loadings)

        # get_factor_variance() is indexed as:
        #   0. Sum of squared loadings (variance)
        #   1. Proportional variance
        #   2. Cumulative variance
        var = fa.get_factor_variance()
        logger.info(f"\tCumulative variance {var[2]}")

        # NOTE(review): KMO is computed on the factor scores of the full
        # data set, not on the original variables -- confirm this is
        # intended.
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f} 变量间的相关性弱,不适合作因子分析')
        else:
            logger.info(
                f'\t√ -> kmo_total={kmo_total:.5f} 变量间的相关性强,变量越适合作因子分析')

        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break

    # Drop the analyzer objects from each entry so the summary frame is
    # purely numeric. The filter must act on the inner keys; the outer
    # keys are the integer factor counts.
    ana_data = pd.DataFrame({
        n_features: {
            name: value
            for name, value in stats.items()
            if name != 'FactorAnalyzer'
        }
        for n_features, stats in ana_dic.items()
    }).T

    # An empty summary (no factor count was tried) has nothing to plot.
    if plot and not ana_data.empty:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()

    return ana_dic
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        svd_method='randomized',
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Fit `FactorAnalyzer()` on one test data set and collect its outputs.

    Parameters
    ----------
    test_name : str
        The name of the test; ``<test_name>.csv`` is read from `top_dir`.
    factors : int
        The number of factors.
    method : str
        The fitting method; 'uls' is translated to 'minres'.
    rotation : str
        The type of rotation; the string 'none' means no rotation.
    svd_method : str, optional
        The SVD method to use.
        Defaults to 'randomized'.
    use_corr_matrix : bool, optional
        Whether to fit on the correlation matrix of the data instead of
        the data itself.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data.
        Defaults to `DATA_DIR`.

    Returns
    -------
    output : dict
        A dictionary with the keys 'value', 'evalues', 'structure',
        'loading', 'uniquenesses', 'communalities' and 'scores'.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    data = pd.read_csv(join(top_dir, test_name + '.csv'))

    # Fit either on the correlation matrix or on a copy of the raw data.
    X = data.corr() if use_corr_matrix else data.copy()

    if rotation == 'none':
        rotation = None
    method_aliases = {'uls': 'minres'}
    method = method_aliases.get(method, method)

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        svd_method=svd_method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    evalues, values = fa.get_eigenvalues()

    output = {'value': values, 'evalues': evalues}
    output['structure'] = fa.structure_
    output['loading'] = fa.loadings_
    output['uniquenesses'] = fa.get_uniquenesses()
    output['communalities'] = fa.get_communalities()
    # Scores are always computed from the raw data, even when the model
    # was fitted on the correlation matrix.
    output['scores'] = fa.transform(data)
    return output
# Finish and show the variance plot started before this fragment.
plt.xlabel('The number of factors')
plt.ylabel('Propotion of Variance')
leg = plt.legend(['Variance Of factor'], loc='best', borderpad=0.3,
                 shadow=False,
                 prop=matplotlib.font_manager.FontProperties(size='large'),
                 markerscale=0.4)
leg.get_frame().set_alpha(0.4)
plt.show()

# fa fit - least-squares approach: method set to minres
fa = FactorAnalyzer(n_factors=2, rotation=None, method='minres')
# fa.analyze(data_df, 2, rotation = None, method = "minres")
fa.fit(data_df)
minres_result_2_fa = fa.loadings_
minres_result_2_com = pd.DataFrame(fa.get_communalities())
# Loadings and communalities of the minres fit, kept together.
minres_result_2_list = [minres_result_2_fa, minres_result_2_com]
minres_result_2_list

# fa fit - maximum-likelihood approach: method set to ML, with 2 factors
n_factor = 2
faML = FactorAnalyzer(n_factors=2, rotation=None, method='ML')
faML.fit(data_df)
ML_result_2_fa = faML.loadings_
ML_result_2_com = pd.DataFrame(faML.get_communalities())
# Loadings and communalities of the ML fit, kept together.
ML_result_2_list = [ML_result_2_fa, ML_result_2_com]
ML_result_2_list

# define biplot function
def fn_biplot_rev(data, col_ind_1, col_ind_2, xlim_lb, xlim_ub, ylim_lb, ylim_ub, labels=None): # function