# Row-wise median of each column group. sc8 ... sc11 are column selectors
# defined elsewhere; presumably one group per length 8-11 -- TODO confirm.
prep8, prep9, prep10, prep11 = (
    prep_data_TLTLR_with_nan[group].median(axis=1)
    for group in (sc8, sc9, sc10, sc11)
)

# Put the four median series side by side, one labelled column per group.
prep_avg_length = pd.concat([prep8, prep9, prep10, prep11], axis=1)
prep_avg_length.columns = ['8', '9', '10', '11']

# Centre every row on its own mean across the four columns.
prep_avg_length_norm = prep_avg_length.sub(
    prep_avg_length.mean(axis=1),
    axis=0,
)
#z_pca = sch.linkage(prep_avg_length_norm,method='ward',optimal_ordering=True) 
#cg_pca = sns.clustermap(prep_avg_length_norm,row_linkage=z_pca, col_cluster=False
#                        ,row_cluster=True, vmin=-1.5,vmax=1.5,cmap='coolwarm')
#%%
# Number of leading principal components kept for the clustering below.
num_PCA = 2
pca, transformed, loadings = doPCA(prep_avg_length_norm)

# Bar chart (on a fresh figure) of the variance fraction of each component.
plt.figure()
pd.Series(pca.explained_variance_ratio_).plot.bar()
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()
print('Fraction explained by the first ', num_PCA, 'PCAs :',
      sum(pca.explained_variance_ratio_[:num_PCA]))

# Scores on the retained components; these drive the row clustering.
list_PCAs = list(transformed.columns[:num_PCA])
X = transformed.loc[:, list_PCAs]
z_pca = sch.linkage(X, method='ward', optimal_ordering=True)

# Clustermap of the normalized medians by length; rows follow the PCA-based
# linkage, columns keep their given order.
cg_pca = sns.clustermap(
    prep_avg_length_norm,
    row_linkage=z_pca,
    row_cluster=True,
    col_cluster=False,
    vmin=-1.5,
    vmax=1.5,
    cmap='coolwarm',
)

# With the pairwise distances supplied, the first value returned by cophenet
# is the cophenetic correlation coefficient of the linkage.
c, coph_dists = cophenet(z_pca, pdist(X))
print('cophenetic distance: ', c)
Esempio n. 2
0
# Normalize each tetraloop-receptor (row) by subtracting its mean across columns.
mean_per_row = prep_data.mean(axis=1)
prep_data_norm = prep_data.sub(mean_per_row, axis=0)

# Second copy in which the positions selected by original_nan are set back
# to NaN (original_nan presumably marks values that were missing before
# imputation -- TODO confirm).
prep_data_norm_with_nan = prep_data_norm.copy()
prep_data_norm_with_nan[original_nan] = np.nan
#%%
# Express every row relative to the canonical 11ntR row: the '11ntR' row is
# subtracted from each row, aligned on column labels.
WT_data = prep_data.loc['11ntR']
prep_data_rel11ntR = prep_data - WT_data

# Same table with the positions selected by original_nan reset to NaN.
prep_data_rel11ntR_with_nan = prep_data_rel11ntR.copy()
prep_data_rel11ntR_with_nan[original_nan] = np.nan
#%%
'''------------PCA Analysis of raw data-----------------'''
# NOTE(review): the banner says "raw data", but the table passed to doPCA
# here is prep_data_rel11ntR (values relative to 11ntR) -- confirm intent.
pca, transformed, loadings = doPCA(prep_data_rel11ntR)
num_PCA = 12

# Bar chart of the variance fraction explained by each principal component.
pd.Series(pca.explained_variance_ratio_).plot.bar()
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()

# Cumulative fraction captured by the first num_PCA components.
print('Fraction explained by the first ', num_PCA, 'PCAs :',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#%%
# Cut the z_pca linkage tree at a fixed distance. The same cut-off colours
# the dendrogram branches and defines the flat clusters.
distance_threshold = 20
max_d = distance_threshold
sch.dendrogram(z_pca, color_threshold=distance_threshold)

# One flat-cluster label per observation; the largest label is reported as
# the number of clusters.
clusters = fcluster(z_pca, t=max_d, criterion='distance')
number_clusters = max(clusters)
print('number of clusters based on distance of ', max_d, ':',
      number_clusters)
Esempio n. 3
0
]
# Keep only the columns listed in columns_reordered, in that order.
combined_data_rearr = combined_data_rearr[columns_reordered]

#(2) Eliminate rows with too much missing data, fill remaining missing data
# with nearest-neighbor information, and replace values above the threshold
# with the limit. The helper returns the prepared table together with
# original_nan, which is used below to select the cells to blank out again.
prep_data, original_nan = prep_data_for_clustering_ver2(
    combined_data_rearr, dG_threshold, dG_replace, nan_threshold,
    num_neighbors)

#(3) Subtract the mean per row
norm_data = prep_data.sub(prep_data.mean(axis=1), axis=0)

# Copy of the normalized table with the original_nan positions set back to
# NaN. Uses the canonical lowercase np.nan; the upper-case NAN spelling is a
# legacy alias and is not portable across NumPy versions.
norm_data_nan = norm_data.copy()
norm_data_nan[original_nan] = np.nan
#%%
#PCA analysis of the row-normalized data
pca, transformed, loadings = doPCA(norm_data)
num_PCA = 4

# Bar chart of the variance fraction explained by each principal component.
pd.Series(pca.explained_variance_ratio_).plot.bar()
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()

# Cumulative fraction captured by the first num_PCA components.
print('Fraction explained by the first ', num_PCA, 'PCAs :',
      sum(pca.explained_variance_ratio_[:num_PCA]))

# The figure is closed without being shown (show call left disabled).
#plt.show()
plt.close()
#%%
#Hierarchical (ward) clustering of the rows on their first num_PCA PC scores
list_PCAs = [*transformed.columns[:num_PCA]]
z_pca = sch.linkage(transformed[list_PCAs], 'ward')
cg_pca = sns.clustermap(norm_data_nan,
                        row_linkage=z_pca,
Esempio n. 4
0
# Row-centre the data: each tetraloop-receptor has its mean across columns
# subtracted from all of its values.
mean_per_row = prep_data.mean(axis=1)
prep_data_norm = prep_data.sub(mean_per_row, axis=0)

# Variant of the centred table where the cells selected by original_nan are
# NaN again (presumably the originally missing measurements -- TODO confirm).
prep_data_norm_with_nan = prep_data_norm.copy()
prep_data_norm_with_nan[original_nan] = np.nan
#%%
# Data relative to the canonical 11ntR: subtract the '11ntR' row from every
# row, matching values by column label.
WT_data = prep_data.loc['11ntR']
prep_data_rel11ntR = prep_data - WT_data

# Variant with the cells selected by original_nan set back to NaN.
prep_data_rel11ntR_with_nan = prep_data_rel11ntR.copy()
prep_data_rel11ntR_with_nan[original_nan] = np.nan
#%%
'''------------PCA Analysis of raw data-----------------'''
# NOTE(review): the banner says "raw data", but the table passed to doPCA
# here is prep_data_norm (row-mean-centred values) -- confirm intent.
pca, transformed, loadings = doPCA(prep_data_norm)
num_PCA = 25

# Bar chart of the variance fraction explained by each principal component.
pd.Series(pca.explained_variance_ratio_).plot.bar()
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()

# Cumulative fraction captured by the first num_PCA components.
print('Fraction explained by the first ', num_PCA, 'PCAs :',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#%%
# Ward linkage of the rows, computed on their first num_PCA PC scores.
list_PCAs = [*transformed.columns[:num_PCA]]
z_pca = sch.linkage(transformed[list_PCAs], 'ward')

# Heat map of prep_data_with_nan (defined elsewhere) with rows ordered by the
# PCA-based linkage; columns keep their given order and the colour scale is
# clipped to [-15, -7].
cg_pca = sns.clustermap(
    prep_data_with_nan,
    row_linkage=z_pca,
    col_cluster=False,
    vmin=-15,
    vmax=-7,
)