Example #1
 def scatter_2d(self) -> go.Figure:
     """2D scatter plot of the clustered data projected onto two factors."""
     # Extract two varimax-rotated factors by maximum likelihood.
     fa = FactorAnalyzer(rotation='varimax', n_factors=2, method='ml')
     components = fa.fit_transform(self.df)
     # self.pro_var presumably holds the proportion of variance per factor.
     total_var = self.pro_var.sum() * 100
     return self._plot_scatter_2d(components, self.clustered_labels.cluster,
                                  total_var)
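The helper _plot_scatter_2d is not included in this excerpt; below is a minimal sketch of what it might look like with Plotly, written as a standalone function. The body, argument handling, and styling are assumptions, not the original implementation; it expects components as an (n, 2) array and a 1-D array of cluster labels.

import plotly.graph_objects as go

def _plot_scatter_2d(components, labels, total_var):
    # Color each point by its cluster label and report the explained variance.
    fig = go.Figure(
        go.Scatter(x=components[:, 0], y=components[:, 1],
                   mode='markers', marker=dict(color=labels)))
    fig.update_layout(
        title='Factor scores (%.1f%% of total variance explained)' % total_var,
        xaxis_title='Factor 1', yaxis_title='Factor 2')
    return fig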
Example #2
import pandas as pd
from factor_analyzer import FactorAnalyzer


def FA(observed_variables, name):
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    chi_square_value, p_value = calculate_bartlett_sphericity(
        observed_variables)
    print("chi-square value:", chi_square_value, "p-value:", p_value)
    from factor_analyzer.factor_analyzer import calculate_kmo
    kmo_all, kmo_model = calculate_kmo(observed_variables)
    print("KMO value:", kmo_model)

    # Create factor analysis object and perform factor analysis
    if name == 'phone':
        fa = FactorAnalyzer(n_factors=2)
    elif name == 'QOL':
        fa = FactorAnalyzer(n_factors=4)
    else:
        raise ValueError("name must be 'phone' or 'QOL'")
    fa.fit_transform(observed_variables)
    # Check eigenvalues
    eigen_values, vectors = fa.get_eigenvalues()
    print(eigen_values)
    """
    # Create scree plot using matplotlib
    plt.scatter(range(1, observed_variables.shape[1] + 1), eigen_values)
    plt.plot(range(1, observed_variables.shape[1] + 1), eigen_values)
    if name == 'phone':
        plt.title('Scree Plot for phone features', fontsize=24)
    if name == 'QOL':
        plt.title('Scree Plot for QOL features', fontsize=24)
    plt.xlabel('Factors', fontsize=18)
    plt.ylabel('Eigenvalue', fontsize=18)
    plt.grid()
    plt.show()
    """

    loadings = fa.loadings_
    print(pd.DataFrame(loadings, observed_variables.columns))
    #print(pd.DataFrame(fa.get_communalities()))

    # Get the variance of each factor.
    print(
        pd.DataFrame(fa.get_factor_variance(),
                     ['SS Loadings', 'Proportion Var', 'Cumulative Var']))
    return pd.DataFrame(loadings, observed_variables.columns)
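A minimal usage sketch for the function above; the toy DataFrame is illustrative only (real input would be the phone or QOL item responses):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy_items = pd.DataFrame(rng.normal(size=(200, 6)),
                         columns=['item_%d' % i for i in range(1, 7)])
# Prints Bartlett's test, KMO, eigenvalues and loadings, then returns the loadings.
loadings = FA(toy_items, 'phone')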
Example #3
 def factor_analysis(self):
     fa = FactorAnalyzer(n_factors=self.N_factor,
                         rotation=self.rotation,
                         method=self.method)
     score = fa.fit_transform(self.dataset)
     header = ["Factor_%s" % i for i in range(1, self.N_factor + 1)]
     ### Factor loadings
     self.loadings = fa.loadings_
     outf = "%s/factor_loadings.tsv" % self.outd
     df = pd.DataFrame(fa.loadings_, columns=header)
     df.to_csv(outf, sep="\t", index=False)
     self.logger.info("Factor loadings are saved as %s." % outf)
     ### Factor scores
     outf = "%s/factor_score.tsv" % self.outd
     df = pd.DataFrame(score, columns=header)
     df.to_csv(outf, sep="\t", index=False)
     self.logger.info("Factor scores are saved as %s." % outf)
     return 0
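The next excerpt (an exported Jupyter notebook) begins mid-cell: it finishes a cumulative explained-variance plot for a scikit-learn PCA named transformer, which is referenced again below via transformer.components_. A hedged reconstruction of the missing setup follows; the names exp_var_cumul and X are assumptions, and the truncated plt.xticks call at the top of the cell below is completed under the same assumptions.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# X is assumed to be the (standardized) data matrix used throughout the notebook.
transformer = PCA().fit(X)
exp_var_cumul = np.cumsum(transformer.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(range(1, len(exp_var_cumul) + 1), exp_var_cumul, marker='o')
plt.xlabel('Number of components')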
plt.xticks(np.arange(1, len(exp_var_cumul) + 1,
                     step=1))  # change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color='red', fontsize=16)

ax.grid(axis='x')
plt.show()

# In[15]:

# same factor analysis as the PCA above, but with another library
from factor_analyzer import FactorAnalyzer
transformer2 = FactorAnalyzer(n_factors=3, rotation='varimax')
X_transformed2 = transformer2.fit_transform(X)

# In[16]:

pc2 = pd.DataFrame(transformer2.loadings_, index=datamat)
pc2.to_excel('Downloads/pcvartotpais2021.xlsx', index=True, header=True)

# In[ ]:

pcaex = pd.DataFrame(transformer.components_, columns=datamat)
pcaex.to_excel('Downloads/pcaex.xlsx', index=True, header=True)

# In[57]:
import statsmodels.api as sm

# Signature reconstructed from the call to showCoefficientPValues() further
# down; the original def line is missing from this excerpt.
def showCoefficientPValues(y_train, X_train_transformed):
    X2 = sm.add_constant(X_train_transformed)
    model = sm.OLS(y_train, X2)
    fii = model.fit()
    p_values = fii.summary2().tables[1]['P>|t|']
    print("\nModel p-values: ")
    print(p_values)


# Split data before it is transformed.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    data['medv'],
                                                    test_size=0.3,
                                                    random_state=1)
# Transform data with factor components.
X_train_transformed = fa.fit_transform(X_train)
X_test_transformed = fa.transform(X_test)

#----------------------------------------------------------
# Build first model
#----------------------------------------------------------
# Train regression model on training data
model = LinearRegression()
model.fit(X_train_transformed, y_train)

# Show model statistics.
showModelSummary(model, y_test, X_test_transformed)

# Check coefficient significance.
showCoefficientPValues(y_train, X_train_transformed)
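showModelSummary is called above but its definition is not part of this excerpt; a minimal sketch of what such a helper could report (an assumption, not the author's implementation):

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def showModelSummary(model, y_test, X_test_transformed):
    # Evaluate the fitted model on the held-out, factor-transformed data.
    predictions = model.predict(X_test_transformed)
    print("RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
    print("R^2: ", r2_score(y_test, predictions))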
sns.heatmap(df_corr, annot=True)
plt.show()

#################################################################################################
'''Week 3'''
# Factor Analysis #

from factor_analyzer import FactorAnalyzer
fact = FactorAnalyzer(n_factors=2, rotation='promax')

df_cor = pd.merge(df_cor, df_med)
a_data = df_cor[[
    'hs_degree', 'median_age', 'second_mortgage', 'pct_own', 'bad_debt'
]]

fact.fit_transform(a_data)
ev, v = fact.get_eigenvalues()

plt.plot(ev)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()

plt.plot(fact.loadings_)
plt.xticks(range(len(a_data.columns)), labels=['1', '2', '3', '4', '5'])
plt.show()
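The line plot of fact.loadings_ above is hard to read once the factor lines overlap; a small alternative sketch (not in the original) shows the same loadings as an annotated seaborn heatmap, labelled with the variable names from a_data:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

load_df = pd.DataFrame(fact.loadings_,
                       index=a_data.columns,
                       columns=['Factor 1', 'Factor 2'])
sns.heatmap(load_df, annot=True, center=0)
plt.show()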

# Communalities: the share of each variable's variance explained by the two factors.
print(fact.get_communalities())

##################################################################################################
'''Week 4'''
# Regression Analysis #