Beispiel #1
1
 def test_pandas(self, close_figures):
     """Check PCA on DataFrame input against ndarray input, then smoke-test it.

     The factors computed from ``pd.DataFrame(self.x)`` are compared with
     those computed from ``self.x`` directly. The remaining calls only
     verify that plotting, projection and alternative constructor options
     run without raising.

     NOTE(review): ``close_figures`` is not used in the body; presumably it
     is a fixture that cleans up the figures created here -- confirm.
     """
     pandas_pca = PCA(pd.DataFrame(self.x))
     array_pca = PCA(self.x)
     assert_allclose(pandas_pca.factors.values, array_pca.factors)

     # Scree plot: defaults, a limited number of components, linear scale.
     for scree_kwargs in ({}, {"ncomp": 10}, {"log_scale": False}):
         pandas_pca.plot_scree(**scree_kwargs)

     # R-square plot: defaults and a limited number of components.
     for rsquare_kwargs in ({}, {"ncomp": 5}):
         pandas_pca.plot_rsquare(**rsquare_kwargs)

     pandas_pca.project(2)

     # Constructing from a fresh DataFrame with non-default options.
     for extra_kwargs in ({"gls": True}, {"standardize": False}):
         PCA(pd.DataFrame(self.x), ncomp=4, **extra_kwargs)
Beispiel #2
0
 def test_pandas(self):
     """Check PCA on DataFrame input against ndarray input, then smoke-test it.

     The factors computed from ``pd.DataFrame(self.x)`` must agree with
     those computed from ``self.x`` directly. The remaining calls only
     verify that plotting, projection and alternative constructor options
     run without raising.
     """
     # Imported locally so the block does not depend on a module-level import.
     from numpy.testing import assert_allclose

     pc = PCA(pd.DataFrame(self.x))
     pc1 = PCA(self.x)
     # The factors are floating-point results, so compare them with a
     # tolerance rather than requiring bit-for-bit equality.
     assert_allclose(pc.factors.values, pc1.factors)

     # Plotting smoke checks; the returned figures are not inspected.
     pc.plot_scree()
     pc.plot_scree(ncomp=10)
     pc.plot_scree(log_scale=False)
     pc.plot_rsquare()
     pc.plot_rsquare(ncomp=5)

     # Projection smoke check; the result is not inspected.
     pc.project(2)

     # Constructor smoke checks with non-default options.
     PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
     PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)
Beispiel #3
0
 def test_smoke_plot_and_repr(self):
     """Smoke-test the PCA plots and ``__repr__``; verify the input is intact.

     Nothing is asserted about the plots or the repr strings; the calls
     only need to complete. The final assertion compares ``self.x`` with
     the ``data`` attribute of the last model built.
     """
     pc = PCA(self.x)

     # Scree plot under each supported option exercised by this test.
     scree_options = (
         {},
         {"ncomp": 10},
         {"log_scale": False},
         {"cumulative": True},
     )
     for scree_kwargs in scree_options:
         pc.plot_scree(**scree_kwargs)

     for rsquare_kwargs in ({}, {"ncomp": 5}):
         pc.plot_rsquare(**rsquare_kwargs)

     # repr of the default model, then of models with fewer transformations.
     pc.__repr__()
     for pca_kwargs in (
         {"standardize": False},
         {"standardize": False, "demean": False},
     ):
         pc = PCA(self.x, **pca_kwargs)
         pc.__repr__()

     # The data held by the last model must equal the original input.
     assert_equal(self.x, pc.data)
Beispiel #4
0
 def test_smoke_plot_and_repr(self, close_figures):
     """Smoke-test the PCA plots and ``__repr__``; verify the input is intact.

     Nothing is asserted about the plots or the repr strings; the calls
     only need to complete. The final assertion compares ``self.x`` with
     the ``data`` attribute of the last model built.

     NOTE(review): ``close_figures`` is not used in the body; presumably it
     is a fixture that cleans up the figures created here -- confirm.
     """
     default_model = PCA(self.x)

     # Plot smoke checks; the returned figures are intentionally discarded.
     default_model.plot_scree()
     default_model.plot_scree(ncomp=10)
     default_model.plot_scree(log_scale=False)
     default_model.plot_scree(cumulative=True)
     default_model.plot_rsquare()
     default_model.plot_rsquare(ncomp=5)

     # repr smoke checks for three configurations.
     default_model.__repr__()

     unstandardized_model = PCA(self.x, standardize=False)
     unstandardized_model.__repr__()

     untransformed_model = PCA(self.x, standardize=False, demean=False)
     untransformed_model.__repr__()

     # The data held by the last model must equal the original input.
     assert_equal(self.x, untransformed_model.data)
# It is worth looking at the mean trend that the PCA removes.  It shows that
# fertility declined steadily over the period covered by this dataset.  Note
# that the mean is computed using countries as the unit of analysis, ignoring
# population size.  The same holds for the PCA carried out below.  A more
# sophisticated analysis might weight the countries, for example by their
# 1980 population.

ax = dta.mean().plot(grid=False)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)
ax.set_xlim(0, 51)

# Next, run the PCA:

pca_model = PCA(dta.T, standardize=False, demean=True)

# Based on the eigenvalues, we see that the first principal component (PC)
# dominates, with perhaps a small amount of meaningful variation captured in
# the second and third PCs.

fig = pca_model.plot_scree(log_scale=False)

# Next we plot the PC factors.  The dominant factor is monotonically
# increasing.  Countries with a positive score on the first factor will
# increase faster (or decrease slower) compared to the mean shown above.
# Countries with a negative score on the first factor will decrease faster
# than the mean.  The second factor is U-shaped with a positive peak at
# around 1985.  Countries with a large positive score on the second factor
# will have lower than average fertility at the beginning and end of the
# data range, but higher than average fertility in the middle of the range.

fig, ax = plt.subplots(figsize=(8, 4))
lines = ax.plot(pca_model.factors.iloc[:, :3], lw=4, alpha=.6)
# Pin the tick positions to every 10th observation before labelling them.
# set_xticklabels() on its own attaches the labels to whatever ticks the
# current locator happens to produce, so labels and positions can disagree.
# NOTE(review): assumes the plotted points sit at x = 0, 1, 2, ... (one per
# column of dta) -- confirm.
ax.set_xticks(list(range(0, len(dta.columns), 10)))
ax.set_xticklabels(dta.columns.values[::10])
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)
fig.subplots_adjust(.1, .1, .85, .9)
legend = fig.legend(lines, ['PC 1', 'PC 2', 'PC 3'], loc='center right')
legend.draw_frame(False)

# To better understand what is going on, we will plot the fertility
# trajectories for sets of countries with similar PC factor scores.  The
# function below makes it convenient to draw these plots.
Beispiel #6
0
# pca in statsmodels
import numpy as np
from statsmodels.multivariate.pca import PCA
# Build a 100 x 100 matrix: one random value per row, broadcast across the
# columns, plus independent standard-normal noise in every cell.
row_effect = np.random.randn(100)[:, None]
X = row_effect + np.random.randn(100, 100)

# Fit the PCA with default settings.
pc = PCA(X)

# Report the shape of the estimated factors, then show a scree plot limited
# to the first 5 components.
print(pc.factors.shape)
pc.plot_scree(ncomp=5).show()
Beispiel #7
0
cancorr(endog, exog)
# Recorded result: the 1st, 2nd and 3rd canonical correlations are
# 0.89133498, 0.41448753, 0.12295234

result_cancorr = CanCorr(
    endog,
    exog,
    tolerance=1e-8,
    missing='none',
)

# Reference: Richard Johnson, ch. 10
test = result_cancorr.corr_test
test().stats
test().stats_mv

# PCA on the variables without standardizing, after demeaning
result_pca = PCA(
    df_variables,
    standardize=False,
    demean=True,
    missing='drop-row',
)
result_pca.plot_scree()

# Question 1: identify variables that correlate with GFP expression.
##### Multivariate linear regression model; for selection of predictor variables see p. 385, R. Johnson.
# "GFP intens Mean" can be seen as a count variable, which can be modeled using a Poisson/negative-binomial family in a GLM.

## A [linear mixed-effects model] can be fitted for the effects of the High/Medium/Low LNP dose.

###### Question 2 starts here: does internuclear distance correlate with GFP expression? (Nuc intens mean/compactness vs. GFP intens)
# import statsmodels.formula.api as smf
# mixed_model = smf.mixedlm("'GFP intens Mean' ~ 'Nuc intens Mean'", df_mix, groups=df_mix["LNP dose"])
import statsmodels.regression.mixed_linear_model as smm
import statsmodels.regression.mixed_linear_model as smm

Mixed_model = smm.MixedLM(endog=df_mix['GFP intens Mean'].to_numpy(),
                          exog=df_mix['Nuc intens Mean'].to_numpy(),
# Plot the mean across the rows of dta, one value per column.
ax = dta.mean().plot(grid=False)
ax.set_xlabel("Year", size=17)
ax.set_ylabel("Fertility rate", size=17)
ax.set_xlim(0, 51)

# Next we perform the PCA:

pca_model = PCA(dta.T, standardize=False, demean=True)

# Based on the eigenvalues, we see that the first PC dominates, with
# perhaps a small amount of meaningful variation captured in the second and
# third PC's.

fig = pca_model.plot_scree(log_scale=False)

# Next we will plot the PC factors.  The dominant factor is monotonically
# increasing.  Countries with a positive score on the first factor will
# increase faster (or decrease slower) compared to the mean shown above.
# Countries with a negative score on the first factor will decrease faster
# than the mean.  The second factor is U-shaped with a positive peak at
# around 1985.  Countries with a large positive score on the second factor
# will have lower than average fertilities at the beginning and end of the
# data range, but higher than average fertility in the middle of the range.

fig, ax = plt.subplots(figsize=(8, 4))
lines = ax.plot(pca_model.factors.iloc[:, :3], lw=4, alpha=.6)
# Pin the tick positions to every 10th observation before labelling them.
# set_xticklabels() on its own attaches the labels to whatever ticks the
# current locator happens to produce, so labels and positions can disagree.
# NOTE(review): assumes the plotted points sit at x = 0, 1, 2, ... (one per
# column of dta) -- confirm.
ax.set_xticks(list(range(0, len(dta.columns), 10)))
ax.set_xticklabels(dta.columns.values[::10])
ax.set_xlim(0, 51)
ax.set_xlabel("Year", size=17)