Esempio n. 1
0
    def get_ev(self, X):
        """Return the first eigenvalue array reported for ``X``.

        An unrotated ``FactorAnalyzer`` with one factor per column of ``X``
        is fitted using ``self.method``; only the first element of the pair
        returned by ``get_eigenvalues()`` is handed back.
        """
        analyzer = FactorAnalyzer(len(X.columns),
                                  rotation=None,
                                  method=self.method)
        analyzer.fit(X)

        eigen_pair = analyzer.get_eigenvalues()
        return eigen_pair[0]
Esempio n. 2
0
def find_number_of_Factors_1(eigenval_limit, dimensions, obs,  kind, prnt):
    """Count the factors whose eigenvalue reaches ``eigenval_limit``.

    This is the variant without the ``trial_index`` parameter.

    :param eigenval_limit: eigenvalue threshold (float); 1.0 is recommended
    :param dimensions: number of dimensions before reduction
        (``obs.shape[1]``); used as ``n_factors`` of the fitted model
    :param obs: 2-D array holding the data
    :param kind: 0 for averaged data, 1 for single-trial data, 2 for
        concatenated data (only 0 and 2 produce printed output)
    :param prnt: when truthy, print the result for ``kind`` 0 or 2
    :return: number of eigenvalues that are >= ``eigenval_limit``
    """
    analyzer = FactorAnalyzer(n_factors=dimensions,
                              method='minres',
                              rotation=None,
                              rotation_kwargs={},
                              use_smc=True,
                              impute='median',
                              bounds=(0.005, 1),
                              is_corr_matrix=False)
    analyzer.fit(obs)
    eigenvals = analyzer.get_eigenvalues()[0]

    # Every eigenvalue at or above the limit counts as one relevant factor.
    num_FA_dim = len(eigenvals[eigenvals >= eigenval_limit])

    headings = {0: 'averaged:', 2: 'concatenated:'}
    if prnt and kind in headings:
        print(headings[kind])
        print('Number of Factors:                           ', num_FA_dim)

    return num_FA_dim
Esempio n. 3
0
def eigenvalues_plt(data):
    """Render a scree plot for ``data`` and return it as a PNG data URI.

    A default ``FactorAnalyzer`` is fitted on ``data``; the first array from
    ``get_eigenvalues()`` is drawn as points joined by a line against the
    factor numbers ``1 .. data.shape[1]``. The figure is saved to an
    in-memory PNG, closed, and returned as a
    ``'data:image/png;base64,...'`` string.
    """
    # Use the non-interactive backend so no display is needed.
    plt.switch_backend('Agg')
    plt.style.use('ggplot')

    analyzer = FactorAnalyzer()
    analyzer.fit(data)
    eigen_values = analyzer.get_eigenvalues()[0]

    factor_numbers = range(1, data.shape[1] + 1)
    plt.figure(figsize=(10, 10))
    plt.scatter(factor_numbers, eigen_values)
    plt.plot(factor_numbers, eigen_values)
    plt.title('Factor Importance by Eigenvalues')
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid()

    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    plt.close()

    return 'data:image/png;base64,{}'.format(encoded)
Esempio n. 4
0
def plotfig(cols):
    """Build a Plotly scree plot for columns 2..6 of the module-level ``df1``.

    A default ``FactorAnalyzer`` is fitted on ``df1[df1.columns[2:7]]`` and
    the first eigenvalue array is plotted against the factor number.

    :param cols: not used by the body; kept so existing callers keep working
    :return: the styled Plotly figure
    """
    xa = df1[df1.columns[2:7]]
    fa = FactorAnalyzer()
    # ``fit`` ignores its second positional argument (``y``), so the stray
    # ``10`` that used to be passed here has been dropped.
    fa.fit(xa)
    ev, _ = fa.get_eigenvalues()

    fig = px.scatter(x=range(1, xa.shape[1] + 1), y=ev)
    fig.update_traces(mode='lines+markers')

    fig.update_layout(yaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(xaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(width=700, height=200, plot_bgcolor='rgb(255,255,255)')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#dddddd')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')

    fig['layout'].update(margin=dict(l=0, r=20, b=20, t=10))
    fig.update_traces(line=dict(color="#0863ae"))
    fig.update_layout(xaxis_title="X",
                      yaxis_title="Y",
                      legend_title="Factor Analysis",
                      font=dict(family="Courier New, monospace",
                                size=12,
                                color="black"))
    return fig
Esempio n. 5
0
    def factor_analysis(self, input_x):
        """Fit a 9-factor oblimin model on standardized ``input_x``.

        Prints the eigenvalues and, for every factor, the index and name of
        each variable whose loading exceeds 0.5 (in ascending loading order),
        then shows a scree plot.

        :param input_x: 2-D data; ``input_x.shape[1]`` is used for the scree
            plot x-axis
        :return: factor scores from ``transform`` on the standardized data
        """
        ss_x = StandardScaler().fit_transform(input_x)
        factor_number = 9
        fa = FactorAnalyzer(
            n_factors=factor_number,
            rotation='oblimin')  # oblimin/promax are oblique, varimax is orthogonal
        fa.fit(ss_x)
        ev, _ = fa.get_eigenvalues()
        factor_loading_matrix = fa.loadings_
        fa_score = fa.transform(ss_x)
        print('ev', ev)

        # NOTE(review): assumes the columns of input_x line up with
        # self.table_data.columns[1:] -- confirm against the caller.
        fa_name = list(self.table_data.columns[1:])
        for i in range(factor_number):
            print('factor_i', i)
            column = factor_loading_matrix[:, i]
            # One argsort gives both the ascending order and the indices.
            for variable_index in np.argsort(column):
                if column[variable_index] > 0.5:
                    print('coefficients_index', variable_index,
                          fa_name[variable_index])

        factor_axis = range(1, input_x.shape[1] + 1)
        plt.scatter(factor_axis, ev)
        plt.plot(factor_axis, ev)
        plt.title('scree figure')
        plt.ylabel('eigenvalues')
        plt.grid()
        plt.show()

        return fa_score
def make_loadings_matrix(rating_m):
    '''Take a rating matrix and return its factor loading matrix.

    The number of factors is chosen at the knee of the eigenvalue curve
    (second array of ``get_eigenvalues()``) of an initial 10-factor oblimin
    fit. The model that is actually returned is refitted with that number of
    factors and a varimax rotation.

    Rows are labelled with ``get_construct_names()`` (index name
    'Construct'); columns are labelled 'Factor k (p%)', where p comes from
    the second row of ``get_factor_variance()``.
    '''
    # Fit the initial factor analysis
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)
    fa_eigens = fa.get_eigenvalues()[1]
    # One x position per eigenvalue, so the knee search works for any number
    # of variables rather than exactly 15.
    x = list(range(1, len(fa_eigens) + 1))
    fa_matrix_knee = KneeLocator(x,
                                 fa_eigens,
                                 S=1.0,
                                 curve='convex',
                                 direction='decreasing')
    fa_knee = fa_matrix_knee.knee
    fa_kneed = FactorAnalyzer(n_factors=fa_knee,
                              rotation='varimax').fit(rating_m)
    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')
    # Compute the variance table once instead of once per column.
    variance_share = fa_kneed.get_factor_variance()[1]
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(i + 1, variance_share[i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
Esempio n. 7
0
def def_factor_analysis(X, k, rotation_=None):
    """Fit a ``k``-factor model on ``X`` and return its main outputs.

    :param X: data passed to ``FactorAnalyzer.fit``
    :param k: number of factors
    :param rotation_: rotation name forwarded to ``FactorAnalyzer``
    :return: tuple ``(get_eigenvalues(), loadings_, get_factor_variance())``
    """
    fitted = FactorAnalyzer(n_factors=k, rotation=rotation_).fit(X)
    return (fitted.get_eigenvalues(),
            fitted.loadings_,
            fitted.get_factor_variance())
Esempio n. 8
0
 def FA(self):
     """Fit a one-factor principal/varimax model on ``self.df``.

     Prints the first eigenvalue array and the loadings, stores the loadings
     in ``self.coeff`` and returns 0.
     """
     analyzer = FactorAnalyzer(rotation="varimax",
                               method="principal",
                               n_factors=1)
     analyzer.fit(self.df)

     eigenvalues = analyzer.get_eigenvalues()[0]
     print(eigenvalues)

     loadings = analyzer.loadings_
     print(loadings)
     self.coeff = loadings
     return 0
Esempio n. 9
0
def best_num_factors(df):
    """Count eigenvalues more than two standard deviations above their mean.

    An unrotated 10-factor minres model is fitted on ``df``; the count is
    taken over the first array returned by ``get_eigenvalues()``.
    """
    settings = dict(n_factors=10,
                    method='minres',
                    rotation=None,
                    rotation_kwargs={},
                    use_smc=True,
                    impute='median',
                    bounds=(0.005, 1),
                    is_corr_matrix=False)
    analyzer = FactorAnalyzer(**settings)
    analyzer.fit(df)
    ev = analyzer.get_eigenvalues()[0]

    cutoff = ev.mean() + 2 * ev.std()
    return sum(1 for value in ev if value > cutoff)
Esempio n. 10
0
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        top_dir=None):
    """
    Run ``FactorAnalyzer().analyze`` on a test CSV and collect its outputs.

    Parameters
    ----------
    test_name : str
        Base name of the CSV file (without ``.csv``) inside ``top_dir``.
    factors : int
        The number of factors.
    method : str
        The fitting method; ``'uls'`` is mapped to ``'minres'``.
    rotation : str
        The type of rotation; ``'none'`` is mapped to ``None``.
    top_dir : str, optional
        The top directory for test data.
        Defaults to ``DATA_DIR``.

    Returns
    -------
    output : dict
        Keys ``'value'``, ``'evalues'``, ``'structure'``, ``'loading'``,
        ``'uniquenesses'``, ``'communalities'`` and ``'scores'``.
    """
    data_dir = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(data_dir, test_name + '.csv'))

    if rotation == 'none':
        rotation = None
    if method == 'uls':
        method = 'minres'

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)

    # First element goes under 'evalues', second under 'value'.
    evalues, values = fa.get_eigenvalues()

    output = {'value': values, 'evalues': evalues}
    output['structure'] = fa.structure
    output['loading'] = fa.loadings
    output['uniquenesses'] = fa.get_uniqueness()
    output['communalities'] = fa.get_communalities()
    output['scores'] = fa.get_scores(data)
    return output
Esempio n. 11
0
def show_num_factors(df):
    """Describe how many factors two eigenvalue rules suggest for ``df``.

    A 10-factor minres/varimax model is fitted. ``num_f`` counts eigenvalues
    more than two standard deviations above their mean; ``res_f`` counts
    eigenvalues greater than 1. The returned sentence reports ``num_f`` and
    the difference ``res_f - num_f``.
    """
    analyzer = FactorAnalyzer(n_factors=10,
                              method='minres',
                              rotation='varimax',
                              rotation_kwargs={},
                              use_smc=True,
                              impute='median',
                              bounds=(0.005, 1),
                              is_corr_matrix=False)
    analyzer.fit(df)
    ev = analyzer.get_eigenvalues()[0]

    cutoff = ev.mean() + 2 * ev.std()
    num_f = sum(1 for value in ev if value > cutoff)
    res_f = sum(1 for value in ev if value > 1)
    return f"Best number of factors: {num_f}. Other possible factors {res_f-num_f}"
Esempio n. 12
0
def FA(observied_variables, name):
    """Run suitability tests and a factor analysis, printing the results.

    Prints the Bartlett sphericity statistic and p-value, the overall KMO
    value, the eigenvalues, the loading table and the factor variance table.

    :param observied_variables: DataFrame of observed variables; its columns
        label the rows of the loading table
    :param name: ``'phone'`` (2 factors) or ``'QOL'`` (4 factors)
    :return: DataFrame of loadings indexed by the input column names
    :raises ValueError: if ``name`` is not one of the supported data sets
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    from factor_analyzer.factor_analyzer import calculate_kmo

    factors_by_name = {'phone': 2, 'QOL': 4}
    if name not in factors_by_name:
        # Previously this fell through and failed later on an unbound `fa`.
        raise ValueError(
            "unknown data set name {!r}; expected 'phone' or 'QOL'".format(
                name))

    chi_square_value, p_value = calculate_bartlett_sphericity(
        observied_variables)
    print("chi_square_value", chi_square_value, "p-value:", p_value)
    kmo_all, kmo_model = calculate_kmo(observied_variables)
    print("KMO value", kmo_model)

    # Create factor analysis object and perform factor analysis. The scores
    # were never used, so a plain fit replaces fit_transform.
    fa = FactorAnalyzer(n_factors=factors_by_name[name])
    fa.fit(observied_variables)

    # Check Eigenvalues
    eigen_values, _ = fa.get_eigenvalues()
    print(eigen_values)

    loadings_table = pd.DataFrame(fa.loadings_, observied_variables.columns)
    print(loadings_table)

    # Get variance of each factor. This used to sit after the return
    # statement and therefore never ran.
    print(
        pd.DataFrame(fa.get_factor_variance(),
                     ['SS Loadings', 'Proportion Var', 'Cumulative Var']))

    return loadings_table
Esempio n. 13
0
    def FA(self):
        """Print diagnostics and a one-factor principal/varimax fit.

        Prints ``self.df.columns``, ``self.mean`` and ``self.sigma``, then
        fits the model on ``self.df`` and prints the first eigenvalue array
        and the loadings. Always returns 0.
        """
        print(self.df.columns)
        print(self.mean)
        print(self.sigma)

        # Disabled diagnostics (Bartlett sphericity test, KMO test and a
        # full-rank unrotated fit) used to live here as an inert string.

        analyzer = FactorAnalyzer(rotation="varimax",
                                  method="principal",
                                  n_factors=1)
        analyzer.fit(self.df)

        # Print eigenvalues
        eigenvalues = analyzer.get_eigenvalues()[0]
        print(eigenvalues)

        # Print loadings
        print(analyzer.loadings_)

        # A disabled scree plot (saved as '<indicatorName>.png') used to
        # follow here as an inert string.
        return 0
Esempio n. 14
0
class factor_analysis():
    """Convenience wrapper running ``FactorAnalyzer`` on selected columns.

    :param data: container indexed as ``data[col]`` to obtain the analysed
        columns
    :param col: column selection; also used as the heatmap row labels
    :param n: number of factors passed to ``FactorAnalyzer``
    """

    def __init__(self, data, col, n):
        self.data = data
        self.col = col
        self.n = n

    def test(self):
        """Run the Bartlett sphericity and KMO tests on the selected columns.

        The four results are stored on the instance and returned as
        ``(chi_square_value, p_value, kmo_all, kmo_model)``.
        """
        self.chi_square_value, self.p_value = calculate_bartlett_sphericity(
            self.data[self.col])
        self.kmo_all, self.kmo_model = calculate_kmo(self.data[self.col])
        return self.chi_square_value, self.p_value, self.kmo_all, self.kmo_model

    def analysis(self):
        """Fit an unrotated model, keep it in ``self.fa``, return ``self``."""
        self.fa = FactorAnalyzer(self.n, rotation=None)
        self.fa.fit(self.data[self.col])
        return self

    def _plot(self):
        """Show a scree plot, then draw a heatmap of absolute loadings.

        Requires ``analysis()`` to have been called so ``self.fa`` exists.
        """
        ev, v = self.fa.get_eigenvalues()
        plt.scatter(range(1, self.data[self.col].shape[1] + 1), ev)
        plt.plot(range(1, self.data[self.col].shape[1] + 1), ev)
        plt.title('Scree Plot')
        plt.xlabel('Factors')
        plt.ylabel('Eigenvalue')
        plt.grid()
        plt.show()
        df_cm = pd.DataFrame(np.abs(self.fa.loadings_), index=self.col)
        plt.figure(figsize=(14, 14))
        ax = sns.heatmap(df_cm, annot=True, cmap="BuPu")
        # Set the font size of the y-axis tick labels
        ax.yaxis.set_tick_params(labelsize=15)
        plt.title('Factor Analysis', fontsize='xx-large')
        # Set y-axis label
        # NOTE(review): 'Sepal Width' looks copied from another plot --
        # confirm the intended label.
        plt.ylabel('Sepal Width', fontsize='xx-large')
#        plt.savefig('factorAnalysis.png', dpi=500)

    def _transform(self):
        """Return the factor scores of the selected columns.

        The fitted analyzer is not callable, so the scores are obtained
        through its ``transform`` method.
        """
        return self.fa.transform(self.data[self.col])
Esempio n. 15
0
 def KaiserGuttman(self):
     """Apply the eigenvalue >= 1 rule and save an eigenvalue plot.

     The model is fitted on ``self.dataset`` with ``self.N_adj`` factors and
     the configured rotation and method. The plot is written to
     ``<self.outd>/eigen_value.<self.fig_ext>`` at ``args.fig_dpi`` and the
     path is logged.

     Returns the number of eigenvalues that are >= 1.
     """
     analyzer = FactorAnalyzer(n_factors=self.N_adj,
                               rotation=self.kaiser_guttman_rotation,
                               method=self.kaiser_guttman_method)
     analyzer.fit(self.dataset)
     ev = analyzer.get_eigenvalues()[0]
     N_factor = np.where(ev >= 1)[0].shape[0]

     # To draw eigen values
     n_values = ev.shape[0]
     plt.figure()
     plt.plot()
     plt.plot(range(1, n_values + 1), ev, label="eigen value", marker="o")
     plt.axhline(y=1, color='red', linestyle="dotted", label=r"$y=1$")
     plt.legend()
     plt.xlim(0.5, n_values + 0.6)
     plt.xlabel("factor")
     plt.ylabel("eigen value")

     outf = "%s/eigen_value.%s" % (self.outd, self.fig_ext)
     plt.savefig(outf, dpi=args.fig_dpi)
     self.logger.info("%s is saved." % outf)
     return N_factor
Esempio n. 16
0
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
from factor_analyzer import FactorAnalyzer
from factor_analyzer import (ConfirmatoryFactorAnalyzer, ModelSpecificationParser)
from factor_analyzer.utils import (corr, impute_values, partial_correlations, smc)

# Load the exposure data set directly from the remote CSV file.
data1 = pd.read_csv("https://donatello-telesca.squarespace.com/s/Exposure-t4yx.csv")

# Perform factor analysis: six factors with a varimax rotation.
fa = FactorAnalyzer()
# Unrotated alternative, kept for reference:
# fa.set_params(n_factors=6,rotation=None)
fa.set_params(n_factors=6,rotation='varimax')
fa.fit(data1)
# Collect the fitted outputs: loadings, both eigenvalue arrays and the
# communalities.
factor_loadings = fa.loadings_
eigen_values, vectors = fa.get_eigenvalues()
communalities = fa.get_communalities()
# Scree plot (disabled).
# NOTE(review): range(1,29) presumes 28 eigenvalues -- confirm against data1.
# plt.scatter(range(1,29),eigen_values)
# plt.plot(range(1,29),eigen_values)
# plt.title('Scree Plot')
# plt.xlabel('Factors')
# plt.ylabel('Eigenvalue')
# plt.grid()
# plt.show()

def clump_factor_vars(factor_loadings, factor_num):
    """Return the rows whose largest loading falls on factor ``factor_num``.

    :param factor_loadings: indexable sequence of rows, each supporting
        ``.argmax()`` (e.g. a 2-D NumPy array)
    :param factor_num: zero-based factor (column) index
    :return: list of row indices, in ascending order
    """
    return [
        row_index
        for row_index in range(len(factor_loadings))
        if factor_loadings[row_index].argmax() == factor_num
    ]
plt.show()

#################################################################################################
'''Week 3'''
# Factor Analysis #

from factor_analyzer import FactorAnalyzer
# Two-factor model with an oblique (promax) rotation.
fact = FactorAnalyzer(n_factors=2, rotation='promax')

# Merge the two frames (pandas joins on their shared columns by default) and
# keep the five variables to analyse.
df_cor = pd.merge(df_cor, df_med)
a_data = df_cor[[
    'hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'
]]

# Fit the model; the returned factor scores are not kept.
fact.fit_transform(a_data)
ev, v = fact.get_eigenvalues()

# Scree plot: one tick per variable, labelled 1..5.
plt.plot(ev)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()

# Loadings plot: one line per factor across the five variables.
plt.plot(fact.loadings_)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()

# Bare expression: only displays its value in a notebook/REPL.
fact.get_communalities()

##################################################################################################
'''Week 4'''
# Regression Analysis #
Esempio n. 18
0
        MLE, PCA. 
    '''

# Pre-tests
## Bartlett's test of sphericity. H0: the correlation matrix is an identity
## matrix (the variables are uncorrelated).
bartlett_chi, bartlett_p = calculate_bartlett_sphericity(
    df[vars_tot])  # p = 0.0

## Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; ranges between 0 and 1, and should be above 0.6
kmo_all, kmo_model = calculate_kmo(df[vars_tot])  #kmo_model = 0.7297

#--------------------------------------------
# Factor Analysis: unrotated four-factor fit used to read off the eigenvalues
fa = FactorAnalyzer(rotation=None, n_factors=4)
fa.fit(df[vars_tot])
ev, v = fa.get_eigenvalues()
'''NOTE: First four factors have an eigen value greater than 1. Use those.'''

# Perform a parallel analysis: collect the eigenvalues of 100 random data
# sets with the same shape as the real data.
list_ev_rand = []

# Fixed seed so the random data sets are reproducible.
np.random.seed(10)
for i in range(100):
    # Uniform random values in [0, 1) with the same shape as df[vars_tot].
    df_rand = pd.DataFrame(np.random.rand(*df[vars_tot].shape))
    fa_rand = FactorAnalyzer(rotation=None, n_factors=4).fit(df_rand)
    ev_rand, _ = fa_rand.get_eigenvalues()
    list_ev_rand.append(ev_rand)

fig, ax = plt.subplots(figsize=(15, 9))
ax.set(ylabel='Eigen Value', xlabel='Factor')
# NOTE(review): the x-axis uses df_standard while ev comes from df[vars_tot];
# presumably both have the same number of columns -- confirm.
ax.plot(range(1, df_standard.shape[1] + 1), ev, marker='o', label='Factor')
Esempio n. 19
0
def get_eigenvalues(item_data, items):
    """Return ``FactorAnalyzer.get_eigenvalues()`` for ``item_data``.

    The unrotated model uses one factor per entry of ``items``.
    """
    analyzer = FactorAnalyzer(len(items), rotation=None)
    analyzer.fit(item_data)
    eigen_pair = analyzer.get_eigenvalues()
    return eigen_pair
Esempio n. 20
0
class Factor_Analyse_select(Feature):
    """
    Factor analysis on selected columns of a data frame.
    """
    data = None
    # NOTE(review): this list lives on the class, so any code that appends to
    # it without rebinding shares it across instances -- confirm intent.
    selected_column = list()
    method = 'minres'

    def set_data(self, data):
        """
        Store the input data and derive the helper views.
        :param data: the data to process
        :return:
        """
        self.data = data
        self.full_data = self._check_missing_value()
        self.numeric_data = self.get_numeric_data()

    # A disabled draft of ``__init__`` and ``select_column`` used to sit here
    # as an inert string. The draft ``select_column`` rejected columns that do
    # not exist, contain missing values, are not numeric or were already
    # selected. ``_select_by`` still calls ``self.select_column``, which is
    # presumably provided elsewhere (e.g. by ``Feature``) -- confirm.

    def set_method(self, method=None):
        """
        Set the fitting method.
        :param method: one of "minres", "ml", "principal" (str)
        :return:
        """
        if method is None:
            warnings.warn("参数未设置")
        elif method not in ['minres', 'ml', 'principal']:
            raise ValueError("invalid method")
        else:
            self.method = method

    def fit(self):
        """
        Fit an unrotated model with one factor per selected column.
        :return: the transformed data (all factor score vectors)
        """
        feed_data = self.data[self.selected_column]
        self.model = FactorAnalyzer(n_factors=feed_data.shape[1],
                                    method=self.method,
                                    rotation=None)
        self.model.fit(feed_data)
        return self.model.transform(feed_data)

    def _build_result(self, scores, count):
        """Combine the first ``count`` score columns with every input column."""
        result = pd.DataFrame(scores[:, 0:count])
        result.columns = ["FA " + str(i + 1) for i in range(count)]
        # Insert in reverse so the input columns keep their original order.
        for name in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=name, value=self.data[[name]])
        return result

    def select_by_number(self, num):
        """
        Keep the first ``num`` factor score vectors.
        :param num: number of vectors to keep (int)
        :return: pandas.DataFrame with all input columns followed by the
                 selected factor columns "FA 1" .. "FA num"
        """
        if num < 0 or num > len(self.selected_column):
            raise ValueError("too many or too less columns are selected")
        return self._build_result(self.fit(), num)

    def select_by_eig_GE_1(self):
        """
        Keep the leading factors whose eigenvalue is at least 1.
        :return: pandas.DataFrame with all input columns followed by the
                 selected factor columns
        """
        # Fit first: self.model does not exist before fit() has run, so the
        # eigenvalues cannot be read earlier.
        scores = self.fit()
        eigenvalues = self.model.get_eigenvalues()[0]
        count = 0
        for value in eigenvalues:
            if value < 1:
                break
            count += 1
        return self._build_result(scores, count)

    def _select_by(self, **type_arg):
        """
        Run the selection described by the keyword arguments.
        :param type_arg: control dictionary
        "method": fitting method, "minres" (default), "ml" (maximum
                  likelihood) or "principal"
        "columns": column names passed to ``select_column``
        "type" == 0: select by count; "typearg" is the number of factors
        "type" == 1: select all factors with eigenvalue >= 1
        :return: pandas.DataFrame with all input columns followed by the
                 selected factor columns
        """
        if "method" in type_arg:
            self.set_method(type_arg["method"])
        selection_type = type_arg["type"]
        if selection_type == 0:
            self.select_column(*type_arg["columns"])
            return self.select_by_number(type_arg["typearg"])
        elif selection_type == 1:
            self.select_column(*type_arg["columns"])
            return self.select_by_eig_GE_1()
        else:
            raise ValueError("type error:不存在所选类")
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        svd_method='randomized',
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Fit a ``FactorAnalyzer`` on a test CSV and collect its outputs.

    Parameters
    ----------
    test_name : str
        Base name of the CSV file (without ``.csv``) inside ``top_dir``.
    factors : int
        The number of factors.
    method : str
        The fitting method; ``'uls'`` is mapped to ``'minres'``.
    rotation : str
        The type of rotation; ``'none'`` is mapped to ``None``.
    svd_method : str, optional
        The SVD method to use.
        Defaults to 'randomized'.
    use_corr_matrix : bool, optional
        Whether to fit on the correlation matrix of the data.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data.
        Defaults to ``DATA_DIR``.

    Returns
    -------
    output : dict
        Keys ``'value'``, ``'evalues'``, ``'structure'``, ``'loading'``,
        ``'uniquenesses'``, ``'communalities'`` and ``'scores'``. The scores
        are always computed from the raw data, not the correlation matrix.
    """
    data_dir = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(data_dir, test_name + '.csv'))

    X = data.corr() if use_corr_matrix else data.copy()

    if rotation == 'none':
        rotation = None
    if method == 'uls':
        method = 'minres'

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        svd_method=svd_method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    # First element goes under 'evalues', second under 'value'.
    evalues, values = fa.get_eigenvalues()

    output = {'value': values, 'evalues': evalues}
    output['structure'] = fa.structure_
    output['loading'] = fa.loadings_
    output['uniquenesses'] = fa.get_uniquenesses()
    output['communalities'] = fa.get_communalities()
    output['scores'] = fa.transform(data)
    return output
Esempio n. 22
0
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):

    """Run a minres factor analysis on ``df`` and display its diagnostics.

        Rotation guidance (from the original author):
        Use "varimax" for an orthogonal (highly differentiable) solution with very high and low variable loadings. Common choice.
        Use "oblimin" for non-orthogonal loading. Increases eigenvalues, but reduces interpretability.
        Use "promax" if you want an oblimin-like rotation on large datasets.

        See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for a fuller explanation.

        Steps: validate the input, check suitability (Bartlett and KMO), fit
        the model, then show the eigenvalue plot, loading heatmap, variance
        table, communalities and uniquenesses (plus factor correlation and
        structure heatmaps when ``rotation == 'promax'``).

        :param df: DataFrame without missing values and with non-constant columns
        :param rotation: rotation name passed to ``FactorAnalyzer``
        :param n_factors: number of factors passed to ``FactorAnalyzer``
        :param transform: when truthy, return ``fa.transform(df)``
        :return: the transformed data if ``transform`` is truthy, else ``None``
        :raises AssertionError: if ``df`` has missing values or a zero-variance column
        :raises Exception: if the suitability check fails
    """

    # NOTE(review): the message mentions inf, but isnull() only flags missing
    # values -- confirm whether an explicit inf check is wanted.
    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)"

    def data_suitable(df, kmo_value = False, ignore = False):
        # Raises unless ``ignore`` is True when the Bartlett p-value is above
        # 0.1 or the overall KMO score is below 0.6. Returns the KMO score
        # when ``kmo_value`` is truthy, otherwise None.

        # Test to ensure the correlation matrix is not an identity matrix
        chi_square_value, p_value = calculate_bartlett_sphericity(df)

        # Test to ensure that the observed data is adequate for FA. Must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and ignore != True:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}.  KMO model Score: {}".format(p_value, kmo_model))

        if kmo_value:
            return kmo_model
        else:
            return


    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres",
                        rotation = rotation,
                        n_factors = n_factors)

    fa.fit(df)

    def eigenplot(df):
        # Line plot of the given eigenvalues with a dashed red reference line
        # at y = 1. Here ``df`` is the eigenvalue array, not the outer frame.
        df = pd.DataFrame(df)

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x = df.index.values,
                y = df[0].values,
                mode = 'lines'
            )
        )


        fig.add_shape(
            type = "line",
            y0 = 1,
            x0 = 0,
            y1 = 1,
            x1 = len(df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )

        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            # Limit the x-axis to the last index with a positive eigenvalue.
            xaxis = dict(
                range = [0,df[df[0] > 0].index.values[-1]]
                )
        )

        fig.show()
        return

    # Plots the second array returned by get_eigenvalues().
    eigenplot(fa.get_eigenvalues()[1])
    Plotting.LabeledHeatmap(fa.loadings_, y = list(df.columns), title = "Factor Loading", expand = True, height = 2000, width = 2000)

    # Keep rows 1 and 2 of the variance table (the first row is dropped).
    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(tmp)

    # The factor correlation and structure matrices are only shown for promax.
    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation", expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns), title = "Variable-Factor Correlation", expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title = "Varience Explained",
                            x = list(df.columns),
                            description = "The proportion of each variables varience that can be explained by the factors.",
                            expand = True,
                            height = 300,
                            width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True,
                            height = 300,
                             width = 2000)

    if transform:
        return fa.transform(df)

    return
Esempio n. 23
0
# Bartlett's test of sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(data)
print(chi_square_value, p_value)
# Recorded result: p_value = 0, so we can proceed with the factor analysis
# (the smaller this value, the more confidence we have in the results).

# Kaiser-Meyer-Olkin (KMO) test
kmo_all, kmo_model = calculate_kmo(data)
print(kmo_model)
# Recorded result: 0.84, so we can proceed with the factor analysis.

# Create a factor analysis object without rotation
analisador_sem_rotacao = FactorAnalyzer(n_factors=20, rotation=None)
analisador_sem_rotacao.fit(data)
# Check the eigenvalues
autovalores, v = analisador_sem_rotacao.get_eigenvalues()
print(autovalores)

# Draw the scree plot to see which eigenvalues are greater than 1.
# NOTE(review): the original note says 6 factors will be used, but the
# varimax model below is created with n_factors=5 -- confirm.
plt.scatter(range(1, data.shape[1] + 1), autovalores)
plt.plot(range(1, data.shape[1] + 1), autovalores)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# Create a factor analysis object with varimax rotation
analisador_varimax = FactorAnalyzer(n_factors=5, rotation="varimax")
analisador_varimax.fit(data)
Esempio n. 24
0
VarbList = df.columns
# Bartlett's test of sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(X)
# Bare expression: only displays its value in a notebook/REPL.
chi_square_value, p_value
# --> Recorded result: p-value = 0, i.e. the test was statistically significant; the observed correlation matrix is not an identity matrix

# Kaiser-Meyer-Olkin test
kmo_all, kmo_model = calculate_kmo(X)
kmo_model
# --> Recorded result: a KMO value of 0.653 indicates moderate suitability for factor analysis. Source: Cureton, E. E. / D'Agostino, R. B. 1983: Factor analysis: an applied approach. Hillsdale, NJ: Lawrence Erlbaum Associates, p. 389 f.

# Choosing the number of factors
# Create factor analysis object and perform factor analysis
fa = FactorAnalyzer(rotation=None, n_factors=30)
fa.fit(X)
# Check eigenvalues
ev, v = fa.get_eigenvalues()
ev
# --> Author's open question: only 30 eigenvalues are greater than 1, so only choose those?

# Create scree plot (points only; the connecting line is disabled)
g = plt.scatter(range(1, X.shape[1] + 1), ev)
#g = plt.plot(range(1,X.shape[1]+1),ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# Save the figure that owns the scatter artist.
# NOTE(review): this runs after plt.show(); confirm the saved PDF is not
# blank with the backend in use.
figure = g.get_figure()
figure.savefig('Scree_plot.pdf', dpi=400)
Esempio n. 25
0
# The factor names are hard-coded in the order the author observed the
# scores to come out.
# NOTE(review): this order is not checked against Ypca -- confirm it still
# holds if the data or fit settings change.
pca_names = ['STM', 'reasoning', 'verbal']

# Build and collect dataframes that will be used for figures and table
# generation. First, the loadings (tests as rows, factors as columns).
loadings = pd.DataFrame(Ypca.loadings_,
                        index=cbs.test_names(),
                        columns=pca_names)

# Pairwise correlations between test scores
var_corrs = pd.DataFrame(Ypca.corr_,
                         index=cbs.test_names(),
                         columns=cbs.test_names())

# Eigenvalues of the components: the first three values of the first array
# returned by get_eigenvalues(), laid out as a single row.
eigen_values = pd.DataFrame(Ypca.get_eigenvalues()[0][0:3],
                            index=pca_names,
                            columns=['eigenvalues']).T

# Percentage variance explained by each component (second row of
# get_factor_variance(), scaled to percent), laid out as a single row.
pct_variance = pd.DataFrame(Ypca.get_factor_variance()[1] * 100,
                            index=pca_names,
                            columns=['% variance']).T

# Generates and displays the chord plot to visualize the factors. Copies are
# passed so the frames above are not modified by the plotting helper.
fig = chord_plot(loadings.copy(),
                 var_corrs.copy(),
                 cscale_name='Picnic',
                 width=700,
                 height=350,
                 threshold=0.20)
def run_sampling_adequacy_app():
    """Streamlit page: KMO sampling adequacy, scree plot and factor loadings.

    Uses the bundled sample CSV unless the user uploads a file. Rows with
    missing values and the 'student' column are dropped before the KMO value,
    the correlation table, a scree plot and a 3-factor promax loading table
    are shown. Any failure while processing the data is reported on the page
    and its details are printed to the console.
    """

    st.header('■Measure of Sampling Adequacy')
    st.write(
        'To investigate the adequay of the number of samples for questionnaire.Kaiser-Meyer-Olkin (KMO) Test is used. '
    )
    st.write('KMO values between 0.8 and 1 indicate the sampling is adequate.')
    st.write(
        'KMO values less than 0.6 indicate the sampling is not adequate and that remedial action should be taken. '
    )
    st.write(
        'Some authors put this value at 0.5, so use your own judgment for values between 0.5 and 0.6.'
    )

    st.sidebar.subheader('Data upload')
    df_edu = pd.read_csv("data/eng_sample_data_sampling.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        # Build an HTML download link for a DataFrame; any other input
        # yields None.
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
            b64 = base64.b64encode(object_to_download.encode()).decode()
            return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_sampling.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    try:

        uploaded_file = st.sidebar.file_uploader(
            "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
            type=["csv"])
        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')

            if display_data:
                st.dataframe(df_edu)

        else:
            # df_edu already holds the sample data loaded above, so it is
            # not read from disk a second time.
            show_df = st.sidebar.checkbox('Show DataFrame')

            if show_df:
                st.write(df_edu)

        df_edu = df_edu.dropna()
        df_edu = df_edu.drop(['student'], axis=1)
        from factor_analyzer.factor_analyzer import calculate_kmo
        kmo_all, kmo_model = calculate_kmo(df_edu)
        st.write('## KMO value:', kmo_model.round(2))

        st.subheader('Data overview (correlation coefficient)')
        st.write(df_edu.corr().style.background_gradient(cmap='coolwarm'))

        fa = FactorAnalyzer()
        fa.fit(df_edu)
        ev, _ = fa.get_eigenvalues()

        st.set_option('deprecation.showPyplotGlobalUse', False)
        factor_axis = range(1, df_edu.shape[1] + 1)
        plt.figure(figsize=(7, 5))
        plt.scatter(factor_axis, ev)
        plt.plot(factor_axis, ev)
        plt.title('Scree Plot')
        plt.xlabel('Factors')
        plt.ylabel('Eigenvalue')
        plt.grid()
        st.pyplot()

        fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
        fa.fit(df_edu)
        df_result = pd.DataFrame(fa.loadings_, columns=['1st', '2nd', '3rd'])
        df_result.index = df_edu.columns
        cm = sns.light_palette('blue', as_cmap=True)
        df_factor = df_result.style.background_gradient(cmap=cm)
        st.write(df_factor)

    except Exception as e:
        # Page-level boundary: keep the app alive, but do not lose the cause.
        st.header(
            'ERROR: Data inconsistency. Check data format to be uploaded.')
        print('Data inconsistency error')
        print(repr(e))
Esempio n. 27
0
# Default figure size for every plot in this script.
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

plt.style.use('ggplot')

####### Setup the dataset
df = pd.read_csv("dims_new.csv", delimiter=',', header=0)
# Optional clean-up steps, currently disabled:
#df.drop(['Unnamed: 0','gender', 'education', 'age'],axis=1,inplace=True)
# Dropping missing values rows
#df.dropna(inplace=True)
# Bare expression: only displays its value in a notebook/REPL.
df.head()

####### Find eigenvalues
# Unrotated fit with 17 factors.
fa = FactorAnalyzer(rotation=None, n_factors=17)
fa = fa.fit(df)  # used instead of "fa.analyze(df, 17, rotation=None)"
ev, v = fa.get_eigenvalues()  # Check Eigenvalues

####### Plot eigenvalues
plt.scatter(range(1, df.shape[1] + 1), ev)
plt.plot(range(1, df.shape[1] + 1), ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
# Horizontal reference line at eigenvalue = 1.
plt.axhline(y=1, c='k')


##### Eigenvalues suggest that 6 dimensions is a good fit
def loadThem(rotation, factors):
    fa = FactorAnalyzer(rotation=rotation, n_factors=factors)
    fa = fa.fit(df.values)
    loadings = fa.loadings_