def test_pandas(self, close_figures):
    """Compare PCA on DataFrame input with ndarray input, then smoke-test.

    The factors computed from ``pd.DataFrame(self.x)`` must be numerically
    close to those computed from ``self.x``.  The remaining calls are
    smoke tests only: their results are not inspected.

    ``close_figures`` is not referenced in the body; presumably it is a
    pytest fixture that cleans up figures -- confirm against conftest.
    """
    from_frame = PCA(pd.DataFrame(self.x))
    from_array = PCA(self.x)
    assert_allclose(from_frame.factors.values, from_array.factors)

    # Plotting helpers: default arguments first, then each option.
    for scree_options in ({}, {"ncomp": 10}, {"log_scale": False}):
        from_frame.plot_scree(**scree_options)
    for rsquare_options in ({}, {"ncomp": 5}):
        from_frame.plot_rsquare(**rsquare_options)

    from_frame.project(2)

    # Alternative constructor options with DataFrame input.
    for extra_options in ({"gls": True}, {"standardize": False}):
        PCA(pd.DataFrame(self.x), ncomp=4, **extra_options)
def test_pandas(self):
    """Compare PCA on DataFrame input with ndarray input, then smoke-test.

    The factors computed from ``pd.DataFrame(self.x)`` are compared with
    those computed from ``self.x`` using a floating-point tolerance rather
    than exact equality, since both are results of numerical computation.
    The remaining calls are smoke tests: they only need to run without
    raising, so their return values are discarded.
    """
    # Imported locally so the tolerance-based comparison is always in scope.
    from numpy.testing import assert_allclose

    pc = PCA(pd.DataFrame(self.x))
    pc1 = PCA(self.x)
    assert_allclose(pc.factors.values, pc1.factors)

    # NOTE(review): the figures created below are never closed in this
    # test; consider a figure-closing fixture if open figures accumulate.
    pc.plot_scree()
    pc.plot_scree(ncomp=10)
    pc.plot_scree(log_scale=False)
    pc.plot_rsquare()
    pc.plot_rsquare(ncomp=5)
    pc.project(2)

    # Alternative constructor options with DataFrame input.
    PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
    PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)
def test_smoke_plot_and_repr(self):
    """Smoke-test the plotting helpers and the repr of ``PCA``.

    Nothing is asserted about the plots or the repr strings; the calls
    only need to complete.  The final assertion checks that ``data`` on
    the last model built equals ``self.x``.
    """
    model = PCA(self.x)

    scree_variants = (
        {},
        {"ncomp": 10},
        {"log_scale": False},
        {"cumulative": True},
    )
    for scree_kwargs in scree_variants:
        model.plot_scree(**scree_kwargs)
    for rsquare_kwargs in ({}, {"ncomp": 5}):
        model.plot_rsquare(**rsquare_kwargs)

    # Additional smoke test: repr under each preprocessing configuration.
    repr(model)
    for options in (
        {"standardize": False},
        {"standardize": False, "demean": False},
    ):
        model = PCA(self.x, **options)
        repr(model)

    # Check data for no changes
    assert_equal(self.x, model.data)
def test_smoke_plot_and_repr(self, close_figures):
    """Smoke-test the plotting helpers and the repr of ``PCA``.

    Return values of the plotting and repr calls are discarded; the test
    only requires that they run.  It ends by asserting that ``data`` on
    the last model built equals ``self.x``.

    ``close_figures`` is not referenced in the body; presumably it is a
    pytest fixture that cleans up figures -- confirm against conftest.
    """
    estimator = PCA(self.x)

    # Scree plot: defaults, then each supported option in turn.
    estimator.plot_scree()
    estimator.plot_scree(ncomp=10)
    estimator.plot_scree(log_scale=False)
    estimator.plot_scree(cumulative=True)

    # R-square plot: defaults, then a restricted number of components.
    estimator.plot_rsquare()
    estimator.plot_rsquare(ncomp=5)

    # Additional smoke test
    repr(estimator)

    estimator = PCA(self.x, standardize=False)
    repr(estimator)

    estimator = PCA(self.x, standardize=False, demean=False)
    repr(estimator)

    # Check data for no changes
    assert_equal(self.x, estimator.data)
# The mean trend is removed by PCA, but it is still worth a look.  It shows
# that fertility declined steadily over the period covered by this dataset.
# Note that the mean is computed with countries as the unit of analysis,
# ignoring population size.  The same holds for the PCA performed below.  A
# more sophisticated analysis might weight the countries, say by their 1980
# population.
mean_by_year = dta.mean()
ax = mean_by_year.plot(grid=False)
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)

# Next, run the PCA:
pca_model = PCA(dta.T, standardize=False, demean=True)

# Based on the eigenvalues, the first principal component (PC) dominates,
# while the second and third PCs may capture a small amount of meaningful
# variation.
fig = pca_model.plot_scree(log_scale=False)

# Next we plot the PC factors.  The dominant factor is monotonically
# increasing.  Compared with the mean shown above, countries with a positive
# score on the first factor increase faster (or decrease more slowly).
# Countries with a negative score on the first factor decrease faster than
# the mean.  The second factor is U-shaped with a positive peak around 1985.
# Countries with a large positive score on the second factor have lower than
# average fertility at the beginning and end of the data range, but higher
# than average fertility in the middle of the range.
fig, ax = plt.subplots(figsize=(8, 4))
leading_factors = pca_model.factors.iloc[:, :3]
lines = ax.plot(leading_factors, linewidth=4, alpha=0.6)
# NOTE(review): tick labels are assigned without first fixing the tick
# positions; confirm the labels line up with the intended years.
decade_labels = dta.columns.values[::10]
ax.set_xticklabels(decade_labels)
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.85, top=0.9)
legend = fig.legend(lines, ["PC 1", "PC 2", "PC 3"], loc="center right")
legend.draw_frame(False)

# To better understand what is going on, we plot the fertility trajectories
# of groups of countries with similar PC factor scores.  The function below
# is a convenient way to draw such a plot.
# PCA in statsmodels
import numpy as np
from statsmodels.multivariate.pca import PCA

# A single random column is broadcast across all 100 columns and then
# perturbed with independent noise, giving a 100 x 100 data matrix.
common_column = np.random.randn(100)[:, None]
X = common_column + np.random.randn(100, 100)

pc = PCA(X)
print(pc.factors.shape)

# Scree plot restricted to the first five components.
scree_figure = pc.plot_scree(ncomp=5)
scree_figure.show()
cancorr(endog, exog) #####1st & 2nd & 3rd Canonical Correlations are 0.89133498, 0.41448753, 0.12295234 result_cancorr = CanCorr(endog, exog, tolerance=1e-8, missing='none') #Richard Johnson ch.10 test = result_cancorr.corr_test test().stats test().stats_mv #####PCA, result_pca = PCA(df_variables, standardize=False, demean=True, missing='drop-row') result_pca.plot_scree() #Question1: identify variables that correlate with GFP expression #####Multivariate Linear Regression Model, selection of predictor variables see p.385 R.Johnson #GFP intens mean can be seen as count variable, that can be modeled using possion/negbino link in GLM. ##[Linear mixed effects model] can be performed for the effects of High/Midium/Low LNP dose ######Here starts the question2: Does internuclear distance correlate with GFP expression?(nuc intens mean/compactness v.s. GFP intens) # import statsmodels.formula.api as smf # mixed_model = smf.mixedlm("'GFP intens Mean' ~ 'Nuc intens Mean'", df_mix, groups=df_mix["LNP dose"]) import statsmodels.regression.mixed_linear_model as smm import statsmodels.regression.mixed_linear_model as smm Mixed_model = smm.MixedLM(endog=df_mix['GFP intens Mean'].to_numpy(), exog=df_mix['Nuc intens Mean'].to_numpy(),
# Plot the mean across rows of ``dta`` for each column (labelled as years).
mean_by_year = dta.mean()
ax = mean_by_year.plot(grid=False)
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)

# Next we perform the PCA:
pca_model = PCA(dta.T, standardize=False, demean=True)

# Judging by the eigenvalues, the first PC dominates, with perhaps a small
# amount of meaningful variation captured by the second and third PCs.
fig = pca_model.plot_scree(log_scale=False)

# Next we plot the PC factors.  The dominant factor is monotonically
# increasing.  Countries with a positive score on the first factor will
# increase faster (or decrease slower) than the mean shown above, while
# countries with a negative score on the first factor will decrease faster
# than the mean.  The second factor is U-shaped with a positive peak at
# around 1985.  Countries with a large positive score on the second factor
# will have lower than average fertility at the beginning and end of the
# data range, but higher than average fertility in the middle of the range.
fig, ax = plt.subplots(figsize=(8, 4))
leading_factors = pca_model.factors.iloc[:, :3]
lines = ax.plot(leading_factors, linewidth=4, alpha=0.6)
# NOTE(review): tick labels are assigned without first fixing the tick
# positions; confirm the labels line up with the intended years.
decade_labels = dta.columns.values[::10]
ax.set_xticklabels(decade_labels)
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)