Example #1
    def factor_analysis_perdomain(self, write=False):
        '''
        Function to perform factor analysis per domain
        Args: 
            write: Boolean variable to choose whether to write the output to a file or not. 
        Returns:
            None
        '''
        # load domains into domain data
        domain_data = self.load_domains()

        # Get mapping of original numbers to shuffled numbers
        true_num = list(pd.read_csv(self.data, index_col=0))
        for d in self.domains:
            curr = domain_data[d]
            col = list(curr)
            ind = [true_num.index(c) for c in col]
            column_name = list(np.array(self.extract_features())[ind])

            inp = domain_data[d]
            fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')

            # In some cases, factor analysis does not succeed
            try:
                fa.fit(inp)
            except Exception:
                print(
                    'Data from ' + str(d) +
                    ' domain cannot be factorized as it results in a singular matrix.'
                )
                continue
            magnitude = fa.get_communalities()

            mag_dict = dict(zip(column_name, magnitude))

            sorted_mag = sorted(mag_dict.items(),
                                key=lambda kv: kv[1],
                                reverse=True)

            if write:
                factors = pd.DataFrame(sorted_mag,
                                       columns=['Feature', 'Importance'])
                if self.DC:
                    factors.to_csv('output/fa_decorrelated_' + str(d) + '_' +
                                   str(self.VER) + '.csv',
                                   index=False)
                else:
                    factors.to_csv('output/fa_' + str(d) + '_' +
                                   str(self.VER) + '.csv',
                                   index=False)
Example #2
    def FactorAnalyze(self, rotate="varimax"):
        # Template frames sized from the normalized data: one row for the
        # shared variance, three rows for the factor loadings.
        self.SharedVariance = copy.deepcopy(self.NormSubThresh)
        self.SharedVariance = self.SharedVariance.iloc[:1]
        self.SharedVariance.index.rename("Shared Variance", inplace=True)
        self.FactorLoadings = copy.deepcopy(self.NormSubThresh)
        self.FactorLoadings = self.FactorLoadings.iloc[:3]
        self.FactorLoadings.index.rename("Factor Number", inplace=True)
        # Fit a 3-factor model per top-level column group.
        for key in self.SharedVariance.columns.unique(level=0):
            factor = FactorAnalyzer(n_factors=3, rotation=rotate)
            factor.fit(sklearn.preprocessing.StandardScaler().fit_transform(
                self.NormSubThresh[key].values))
            self.SharedVariance[key] = np.atleast_2d(
                factor.get_communalities())
            self.FactorLoadings[key] = factor.loadings_.T
Example #3
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test
    factors : int
        The number of factors
    method : str
        The factor extraction (fitting) method
    rotation : str
        The type of rotation
    top_dir : str, optional
        The top directory for test data
        Defaults to `DATA_DIR`

    Returns
    -------
    output : dict
        A dictionary containing the outputs
        for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)

    evalues, values = fa.get_eigenvalues()

    return {'value': values,
            'evalues': evalues,
            'structure': fa.structure,
            'loading': fa.loadings,
            'uniquenesses': fa.get_uniqueness(),
            'communalities': fa.get_communalities(),
            'scores': fa.get_scores(data)}
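
This version targets the old factor_analyzer API (fa.analyze(), DataFrame-returning getters); Example #13 below is the same helper ported to the modern API. A rough sketch of the mapping, assuming factor_analyzer >= 0.3:

fa = FactorAnalyzer(n_factors=factors, method=method, rotation=rotation)
fa.fit(data)                          # was fa.analyze(data, factors, ...)
loadings = fa.loadings_               # was fa.loadings
uniquenesses = fa.get_uniquenesses()  # was fa.get_uniqueness()
scores = fa.transform(data)           # was fa.get_scores(data)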
Example #4
    def factor_analysis(self, *x_columns: str, n_factor: int = None) -> dict:
        """Factor analysis.

        :param x_columns: names of the columns holding the x factors
        :param n_factor: number of common factors (may be set manually; inferred automatically by default)
        :return: dict with the communalities, the component (loading) matrix, and the total variance explained
        """
        columns = list(x_columns)
        X_data = pd.DataFrame(self.data, columns=columns)
        if n_factor is not None:
            fa = FactorAnalyzer(method="principal", n_factors=n_factor)
        else:
            fa = FactorAnalyzer(method="principal")
        fa.fit(X_data)
        result_dict = dict()
        result_dict['communalities'] = fa.get_communalities().tolist()
        result_dict['component_matrix'] = fa.loadings_.tolist()
        result_dict['factor_variance'] = [arr.tolist() for arr in fa.get_factor_variance()]
        return result_dict
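
A hypothetical invocation sketch; `analyzer` is a placeholder for whatever object carries this method, and the column names are assumptions:

result = analyzer.factor_analysis('x1', 'x2', 'x3', n_factor=2)
print(result['communalities'])     # one value per selected column
print(result['component_matrix'])  # loadings: one row per column, one column per factor
print(result['factor_variance'])   # [SS loadings, proportional variance, cumulative variance]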
Example #5
    def run(self, dfx, n_factors=3):

        self.n_factors = n_factors

        msg = {}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if not x_numer_cols:
            logging.error(
                'Input dfx has no numeric columns; please check your input dfx data!'
            )
            msg['error'] = 'Input dfx has no numeric columns; please check your input dfx data!'
            return {'result': pd.DataFrame(), 'msg': msg}

        else:

            if x_cate_cols:
                logging.warning(
                    'input dfx has non-numeric columns: %s; these columns will be ignored!'
                    % x_cate_cols)

                msg['warning'] = 'input dfx has non-numeric columns: %s; these columns will be ignored!' % x_cate_cols

        dfu = dfx[x_numer_cols]

        fa = FactorAnalyzer()
        fa.analyze(dfu, n_factors, rotation=None)
        loadings = fa.loadings
        communalities = fa.get_communalities()
        scores = fa.get_scores(dfu)

        loadings.columns = ['Factor %s loading' % (i + 1) for i in range(n_factors)]
        communalities.columns = ['Communality']
        scores.columns = ['Factor %s' % (i + 1) for i in range(n_factors)]

        res = loadings.join(communalities)

        return {'result': res, 'msg': msg, 'factor': scores}
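
A hypothetical usage sketch; `FAModel` is a placeholder name for whichever class defines run(), and dfx any mixed-type DataFrame:

out = FAModel().run(dfx, n_factors=3)
print(out['result'])  # per-column loadings joined with communalities
print(out['factor'])  # per-row factor scores
print(out['msg'])     # warnings about ignored non-numeric columns, if any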
Example #6
    def factor_analysis(self, write=False):
        '''
        Function to get the features and their importance in factor analysis
        Args: 
            write: Boolean variable to choose whether to write the output to a file or not. 
        Returns: 
            inp: pandas DataFrame that contains the original data 
            sorted_mag: list of tuples storing the features and their importance in decreasing order
        '''
        inp = pd.read_csv(self.data, index_col=0)

        # fa stores the output of the factor_analyzer
        fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')

        # fits the input to get feature importances
        fa.fit(inp)

        # gets the communalities: the share of each feature's variance explained by the factors
        magnitude = fa.get_communalities()

        feat = self.extract_features()

        # Dictionary to hold the correct feature name to the number
        mag_dict = {}
        for t, f in enumerate(feat):
            mag_dict[f] = magnitude[t]

        sorted_mag = sorted(mag_dict.items(),
                            key=lambda kv: kv[1],
                            reverse=True)

        # Writes the output of factor analysis to a file
        if write:
            factors = pd.DataFrame(sorted_mag,
                                   columns=['Feature', 'Importance'])
            factors.to_csv(self.fa_file, index=False)

        return inp, sorted_mag
Example #7
    for i in range(m):
        data_new.iloc[i, j] = (coef_var[j]/sum(coef_var))*data_normal.iloc[i, j]

# data_new is the processed stock data
# print(data_new)


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Build the model
fa = FactorAnalyzer(rotation='varimax', n_factors=12)  # fix the number of common factors at 12
fa.fit(data_new)
print("Communalities:\n", fa.get_communalities())  # communalities
matrix_orth = fa.loadings_
print("\nComponent matrix\n", matrix_orth)
var = fa.get_factor_variance()  # contribution rates
print("\nTotal variance explained (contribution rates):\n", var)
# each rounded to four decimal places
print("\nEigenvalues:\n", list(map(lambda x: round(x, 4), var[0])))
print("\nFactor contribution rates:\n", list(map(lambda x: round(x, 4), var[1])))
print("\nCumulative contribution rates:\n", list(map(lambda x: round(x, 4), var[2])))

# Set the DataFrame display options: max rows, max columns, no line wrapping
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('expand_frame_repr', False)
# Convert the data to a DataFrame
data22 = pd.DataFrame(data)
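
The three lists printed above come from get_factor_variance(); a self-contained sketch on synthetic data (an assumption, not the stock data) showing what each one holds:

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer

demo = pd.DataFrame(np.random.default_rng(1).normal(size=(200, 8)))
fa_demo = FactorAnalyzer(rotation='varimax', n_factors=3)
fa_demo.fit(demo)
ssl, prop, cum = fa_demo.get_factor_variance()  # one entry per factor
print(np.round(ssl, 4))   # sums of squared loadings (eigenvalue-like)
print(np.round(prop, 4))  # proportional variance per factor
print(np.round(cum, 4))   # cumulative variance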
Example #8
import pandas as pd
from sklearn.model_selection import train_test_split
from factor_analyzer import FactorAnalyzer
from factor_analyzer import (ConfirmatoryFactorAnalyzer, ModelSpecificationParser)
from factor_analyzer.utils import (corr, impute_values, partial_correlations, smc)

data1 = pd.read_csv("https://donatello-telesca.squarespace.com/s/Exposure-t4yx.csv")

# Perform Factor Analysis
fa = FactorAnalyzer()
# fa.set_params(n_factors=6,rotation=None)
fa.set_params(n_factors=6, rotation='varimax')
fa.fit(data1)
# Check factors
factor_loadings = fa.loadings_
eigen_values, vectors = fa.get_eigenvalues()
communalities = fa.get_communalities()
# Create scree plot 
# plt.scatter(range(1,29),eigen_values)
# plt.plot(range(1,29),eigen_values)
# plt.title('Scree Plot')
# plt.xlabel('Factors')
# plt.ylabel('Eigenvalue')
# plt.grid()
# plt.show()

def clump_factor_vars(factor_loadings, factor_num):
    # Collect the indices of variables whose largest loading is on factor_num.
    observed_vars = []
    for i, row in enumerate(factor_loadings):
        if row.argmax() == factor_num:
            observed_vars.append(i)
    return observed_vars
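
A usage sketch for the helper above: list, for each extracted factor, the variables that load most heavily on it. Note that argmax compares signed loadings; apply np.abs(factor_loadings) first if magnitude is what matters:

for factor_num in range(6):
    print(factor_num, clump_factor_vars(factor_loadings, factor_num))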
Example #9
import datetime as dt
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from dateutil import relativedelta
from factor_analyzer import FactorAnalyzer

# input
symbols = ['AAPL','MSFT','AMD','NVDA']
start = dt.datetime.now() - dt.timedelta(days = 365*7)
end = dt.datetime.now()

df = pd.DataFrame()
for s in symbols:
    df[s] = yf.download(s, start, end)['Adj Close']

fa = FactorAnalyzer(rotation=None)
fa.fit(df)
print(fa.get_communalities())

ev, v = fa.get_eigenvalues()

plt.scatter(range(1,df.shape[1]+1),ev)
plt.plot(range(1,df.shape[1]+1),ev)
plt.title('Factor Analysis')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(df)
print(chi_square_value, p_value)
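
Bartlett's sphericity test is one of the two standard suitability checks; its companion, the KMO measure of sampling adequacy, comes from the same module and would apply to the same df:

from factor_analyzer.factor_analyzer import calculate_kmo
kmo_per_variable, kmo_total = calculate_kmo(df)
print(kmo_total)  # conventionally should exceed 0.6 for factor analysis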
Example #10
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):

    """ You want "varimax" rotation if you want orthogonal (highly differentiable) with very high and low variable loading. common
        You want "oblimin" for non-orthogonal loading. Increases eigenvalues, but reduced interpretability.
        You want "promax" if you want Oblimin on large datasets.
        
        See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for increased explination. 
    """   

    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"  

    def data_suitable(df, kmo_value = False, ignore = False):
        
        # Bartlett's test ensures the correlation matrix is not an identity matrix
        chi_square_value, p_value = calculate_bartlett_sphericity(df)

        # KMO tests that the observed data are adequate for FA; must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}.  KMO model Score: {}".format(p_value, kmo_model))
        
        if kmo_value:
            return kmo_model
        else:
            return
        
        
    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres", 
                        rotation = rotation,
                        n_factors = n_factors)

    fa.fit(df)

    def eigenplot(df):
        df = pd.DataFrame(df)
        
        fig = go.Figure()
        
        fig.add_trace(
            go.Scatter(
                x = df.index.values,
                y = df[0].values,
                mode = 'lines'
            )
        )
        
        
        fig.add_shape(
            type = "line",
            y0 = 1,
            x0 = 0,
            y1 = 1,
            x1 = len(df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )
        
        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis = dict(
                range = [0,df[df[0] > 0].index.values[-1]]
                )
        )
        
        fig.show()
        return

    eigenplot(fa.get_eigenvalues()[1])
    Plotting.LabeledHeatmap(fa.loadings_, y = list(df.columns), title = "Factor Loading", expand = True, height = 2000, width = 2000)

    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Variance","Cumulative Variance"]
    Plotting.dfTable(tmp)

    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation", expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns), title = "Variable-Factor Correlation", expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title = "Variance Explained",
                            x = list(df.columns),
                            description = "The proportion of each variable's variance that can be explained by the factors.",
                            expand = True,
                            height = 300,
                            width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T, 
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True, 
                            height = 300,
                            width = 2000)

    if transform:
        return fa.transform(df)

    return 
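
A minimal sketch of the rotation trade-off the docstring describes, fitting the same synthetic data (an assumption, for illustration only) under an orthogonal (varimax) and an oblique (oblimin) rotation:

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer

data = pd.DataFrame(np.random.default_rng(2).normal(size=(300, 9)))
for rot in ('varimax', 'oblimin'):
    fa_cmp = FactorAnalyzer(method='minres', rotation=rot, n_factors=3)
    fa_cmp.fit(data)
    print(rot, np.round(fa_cmp.loadings_[:3], 3))  # loadings of the first three variables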
Example #11
# `fact` is assumed to be a FactorAnalyzer instance defined earlier,
# e.g. fact = FactorAnalyzer(n_factors=5, rotation='varimax')
a_data = df_cor[[
    'hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'
]]

fact.fit_transform(a_data)
ev, v = fact.get_eigenvalues()

plt.plot(ev)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()

plt.plot(fact.loadings_)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()

fact.get_communalities()

##################################################################################################
'''Week 4'''
# Regression Analysis #

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

df_reg = df_cor[['area_code','type', 'pop', 'family_mean','second_mortgage',\
    'bad_debt','pct_own','median_age', 'hc_mortgage_mean', 'rent_mean']]

df_reg['density'] = df_cor['pop'] / df_cor['ALand']

le = LabelEncoder()
Example #12
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Factor analysis: extract N features and check whether they are effective.
    :param factor_df:
    :param max_feature_count:
    :param plot:
    :return:
    """
    ana_dic = {}
    if max_feature_count is None:
        max_feature_count = min(factor_df.shape[1] // 3, 50)
    for n_features in range(2, max_feature_count):
        logger.info(f"{n_features} 个因子时:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        # On a singular matrix, retry with progressively smaller random subsamples.
        for _ in range(8, 0, -1):
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("Matrix %s is singular; retrying on a %d/(%d+1) resample",
                                 df.shape, _, _)
        else:
            raise exception

        communalities = fa.get_communalities()
        logger.info(f"\tCommunalities ({communalities.shape})")
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        logger.info(f"\tComponent matrix, i.e. factor loadings ({loadings.shape})")
        # logger.debug('\n%s', loadings)
        var = fa.get_factor_variance()  # contribution rates
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        logger.info(f"\tCumulative variance {var[2]}")
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f}; weak correlations between variables, not suitable for factor analysis')
        else:
            logger.info(
                f'\t√ -> kmo_total={kmo_total:.5f}; strong correlations between variables, suitable for factor analysis')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break

    ana_data = pd.DataFrame(
        {n: {k: v for k, v in d.items() if k != 'FactorAnalyzer'}
         for n, d in ana_dic.items()}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()

    return ana_dic
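
A hypothetical usage sketch; factor_df is assumed to be a wide, numeric DataFrame:

ana_dic = factor_analysis(factor_df, max_feature_count=20, plot=False)
for n, stats in ana_dic.items():
    print(n, stats['Cumulative variance'], stats['KOM_Test_total'])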
Example #13
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        svd_method='randomized',
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test
    factors : int
        The number of factors
    method : str
        The factor extraction (fitting) method
    rotation : str
        The type of rotation
    svd_method : str, optional
        The SVD method to use
        Defaults to 'randomized'
    use_corr_matrix : bool, optional
        Whether to use the correlation matrix.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data
        Defaults to `DATA_DIR`

    Returns
    -------
    output : dict
        A dictionary containing the outputs
        for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    if use_corr_matrix:
        X = data.corr()
    else:
        X = data.copy()

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        svd_method=svd_method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    evalues, values = fa.get_eigenvalues()

    return {
        'value': values,
        'evalues': evalues,
        'structure': fa.structure_,
        'loading': fa.loadings_,
        'uniquenesses': fa.get_uniquenesses(),
        'communalities': fa.get_communalities(),
        'scores': fa.transform(data)
    }
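
A hypothetical invocation; 'test01', DATA_DIR, and the CSV layout are assumptions here:

output = calculate_py_output('test01', factors=3, method='uls', rotation='promax')
print(output['loading'].shape)  # (n_variables, 3)
print(output['scores'].shape)   # (n_samples, 3)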
Example #14
plt.xlabel('The number of factors')
plt.ylabel('Proportion of Variance')

leg = plt.legend(['Variance of factor'], loc='best', borderpad=0.3,
                 shadow=False, prop=matplotlib.font_manager.FontProperties(size='large'),
                 markerscale=0.4)
leg.get_frame().set_alpha(0.4)
plt.show()

# fa fit - least-squares extraction: method='minres'
fa = FactorAnalyzer(n_factors=2, rotation=None, method='minres')

# fa.analyze(data_df, 2, rotation = None, method = "minres")
fa.fit(data_df)
minres_result_2_fa = fa.loadings_
minres_result_2_com =  pd.DataFrame(fa.get_communalities())
minres_result_2_list = [minres_result_2_fa, minres_result_2_com]
minres_result_2_list

# fa fit - maximum-likelihood extraction: method='ml', two factors (n_factors=2)
faML = FactorAnalyzer(n_factors=2, rotation=None, method='ml')
faML.fit(data_df)
ML_result_2_fa = faML.loadings_
ML_result_2_com =  pd.DataFrame(faML.get_communalities())
ML_result_2_list = [ML_result_2_fa, ML_result_2_com]
ML_result_2_list


# define biplot function
def fn_biplot_rev(data, col_ind_1, col_ind_2, xlim_lb, xlim_ub, ylim_lb, ylim_ub, labels=None):
 # function