#Median value per variant within each length group (sc8-sc11 select the corresponding columns)
prep8 = prep_data_TLTLR_with_nan[sc8].median(axis=1)
prep9 = prep_data_TLTLR_with_nan[sc9].median(axis=1)
prep10 = prep_data_TLTLR_with_nan[sc10].median(axis=1)
prep11 = prep_data_TLTLR_with_nan[sc11].median(axis=1)
prep_avg_length = pd.concat([prep8, prep9, prep10, prep11], axis=1)
prep_avg_length.columns = ['8', '9', '10', '11']
#Center each row on its mean across the four length columns
prep_avg_length_norm = prep_avg_length.sub(prep_avg_length.mean(axis=1), axis=0)
#z_pca = sch.linkage(prep_avg_length_norm, method='ward', optimal_ordering=True)
#cg_pca = sns.clustermap(prep_avg_length_norm, row_linkage=z_pca, col_cluster=False,
#                        row_cluster=True, vmin=-1.5, vmax=1.5, cmap='coolwarm')
#%%
num_PCA = 2
pca, transformed, loadings = doPCA(prep_avg_length_norm)
#plot explained variance by PC
plt.figure()
pd.Series(pca.explained_variance_ratio_).plot(kind='bar')
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()
print('Fraction explained by the first', str(num_PCA), 'PCs:',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#Hierarchical clustering (Ward) on the first num_PCA principal components
list_PCAs = list(transformed.columns[:num_PCA])
z_pca = sch.linkage(transformed.loc[:, list_PCAs], method='ward', optimal_ordering=True)
#plot clustermap of medians by length, with rows ordered by the PC-space linkage
cg_pca = sns.clustermap(prep_avg_length_norm, row_linkage=z_pca, col_cluster=False,
                        row_cluster=True, vmin=-1.5, vmax=1.5, cmap='coolwarm')
#cophenetic correlation: how faithfully the linkage preserves pairwise distances in PC space
X = transformed.loc[:, list_PCAs]
c, coph_dists = cophenet(z_pca, pdist(X))
print('cophenetic correlation coefficient:', c)
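#%%
#--- Hedged sketch (not part of the original analysis) -----------------------------
#doPCA is a project helper defined elsewhere; the calls in this script assume it
#returns the fitted PCA object, the data projected onto the PCs, and the loadings.
#A minimal implementation consistent with that usage, based on scikit-learn, might
#look like the function below (the name doPCA_sketch is hypothetical).
import pandas as pd
from sklearn.decomposition import PCA

def doPCA_sketch(data):
    '''Assumed behavior of doPCA: fit a PCA on the rows of a DataFrame and return
    (pca, transformed, loadings) as PC-labeled pandas objects.'''
    pca = PCA()
    scores = pca.fit_transform(data.values)
    pc_names = ['PC' + str(i + 1) for i in range(scores.shape[1])]
    transformed = pd.DataFrame(scores, index=data.index, columns=pc_names)
    loadings = pd.DataFrame(pca.components_.T, index=data.columns, columns=pc_names)
    return pca, transformed, loadings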
#Normalize each tetraloop-receptor with respect to its mean across columns
mean_per_row = prep_data.mean(axis=1)
prep_data_norm = prep_data.copy()
prep_data_norm = prep_data_norm.sub(mean_per_row, axis=0)
prep_data_norm_with_nan = prep_data_norm.copy()
prep_data_norm_with_nan[original_nan] = np.nan
#%%
#Normalize data relative to the canonical 11ntR
WT_data = prep_data.loc['11ntR']
prep_data_rel11ntR = prep_data.copy()
prep_data_rel11ntR = prep_data_rel11ntR.subtract(WT_data)
prep_data_rel11ntR_with_nan = prep_data_rel11ntR.copy()
prep_data_rel11ntR_with_nan[original_nan] = np.nan
#%%
'''------------PCA analysis of data relative to 11ntR-----------------'''
pca, transformed, loadings = doPCA(prep_data_rel11ntR)
#plot explained variance by PC
pd.Series(pca.explained_variance_ratio_).plot(kind='bar')
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()
num_PCA = 12
print('Fraction explained by the first', str(num_PCA), 'PCs:',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#%%
#Cut the PC-space dendrogram at a fixed distance threshold to define flat clusters
distance_threshold = 20
sch.dendrogram(z_pca, color_threshold=distance_threshold)
max_d = distance_threshold
clusters = fcluster(z_pca, max_d, criterion='distance')
number_clusters = max(clusters)
print('number of clusters based on distance of', str(max_d), ':', str(number_clusters))
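#%%
#--- Hedged sketch (not part of the original analysis) -----------------------------
#After fcluster assigns a flat cluster label to every row, the labels can be attached
#back to the row index to see which variants fall in each cluster and how large the
#clusters are.  The helper below is only an illustration; its name is hypothetical.
import pandas as pd

def summarize_clusters(labels, index):
    '''Return cluster labels indexed by variant name, plus the size of each cluster.'''
    cluster_labels = pd.Series(labels, index=index, name='cluster')
    cluster_sizes = cluster_labels.value_counts().sort_index()
    return cluster_labels, cluster_sizes

#Example usage with the variables defined above (assuming the rows of z_pca follow
#the index of the clustered DataFrame):
#members, sizes = summarize_clusters(clusters, prep_data_rel11ntR.index)
#print(sizes)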
]
combined_data_rearr = combined_data_rearr[columns_reordered]
#(2) Eliminate rows with too much missing data, fill remaining missing data with
#    nearest-neighbor information, and replace values above the threshold with the limit
prep_data, original_nan = prep_data_for_clustering_ver2(
        combined_data_rearr, dG_threshold, dG_replace, nan_threshold, num_neighbors)
#(3) Subtract the mean per row
norm_data = prep_data.sub(prep_data.mean(axis=1), axis=0)
norm_data_nan = norm_data.copy()
norm_data_nan[original_nan] = np.nan
#%%
#PCA analysis
pca, transformed, loadings = doPCA(norm_data)
#plot explained variance by PC
pd.Series(pca.explained_variance_ratio_).plot(kind='bar')
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()
num_PCA = 4
print('Fraction explained by the first', str(num_PCA), 'PCs:',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#plt.show()
plt.close()
#%%
#Hierarchical clustering of the first 4 PCs
list_PCAs = list(transformed.columns[:num_PCA])
z_pca = sch.linkage(transformed.loc[:, list_PCAs], method='ward')
cg_pca = sns.clustermap(norm_data_nan, row_linkage=z_pca,
                        col_cluster=False)  #col_cluster=False assumed; remaining arguments were truncated in the source
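#%%
#--- Hedged sketch (not part of the original analysis) -----------------------------
#prep_data_for_clustering_ver2 is defined elsewhere; based on its arguments and the
#comment in step (2) above, it appears to (a) drop rows with too much missing data,
#(b) cap dG values above dG_threshold at dG_replace, and (c) fill the remaining gaps
#from nearest neighbors.  A sketch of that behavior using scikit-learn's KNNImputer;
#the function name and the interpretation of nan_threshold as a fraction are assumptions.
import pandas as pd
from sklearn.impute import KNNImputer

def prep_for_clustering_sketch(data, dG_threshold, dG_replace, nan_threshold, num_neighbors):
    '''Assumed preprocessing: drop sparse rows, cap weak binders, impute the rest.'''
    kept = data[data.isna().mean(axis=1) <= nan_threshold].copy()   #drop rows with too many NaNs
    original_nan = kept.isna()                                      #remember where the gaps were
    capped = kept.mask(kept > dG_threshold, dG_replace)             #cap values above the threshold
    imputer = KNNImputer(n_neighbors=num_neighbors)                 #nearest-neighbor imputation
    filled = pd.DataFrame(imputer.fit_transform(capped),
                          index=capped.index, columns=capped.columns)
    return filled, original_nan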
#Normalize each tetraloop-receptor with respect to its mean across columns
mean_per_row = prep_data.mean(axis=1)
prep_data_norm = prep_data.copy()
prep_data_norm = prep_data_norm.sub(mean_per_row, axis=0)
prep_data_norm_with_nan = prep_data_norm.copy()
prep_data_norm_with_nan[original_nan] = np.nan
#%%
#Normalize data relative to the canonical 11ntR
WT_data = prep_data.loc['11ntR']
prep_data_rel11ntR = prep_data.copy()
prep_data_rel11ntR = prep_data_rel11ntR.subtract(WT_data)
prep_data_rel11ntR_with_nan = prep_data_rel11ntR.copy()
prep_data_rel11ntR_with_nan[original_nan] = np.nan
#%%
'''------------PCA analysis of row-mean-normalized data-----------------'''
pca, transformed, loadings = doPCA(prep_data_norm)
#plot explained variance by PC
pd.Series(pca.explained_variance_ratio_).plot(kind='bar')
plt.ylabel('fraction of variance \nexplained by each PC', fontsize=14)
plt.tight_layout()
num_PCA = 25
print('Fraction explained by the first', str(num_PCA), 'PCs:',
      sum(pca.explained_variance_ratio_[:num_PCA]))
#%%
#Hierarchical clustering (Ward) on the first num_PCA PCs; display the dG values
#(with the original NaNs restored) ordered by the PC-space linkage
list_PCAs = list(transformed.columns[:num_PCA])
z_pca = sch.linkage(transformed.loc[:, list_PCAs], method='ward')
cg_pca = sns.clustermap(prep_data_with_nan, row_linkage=z_pca,
                        col_cluster=False, vmin=-15, vmax=-7)
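#%%
#--- Hedged sketch (not part of the original analysis) -----------------------------
#The clustermap above orders the rows by the PC-space linkage but displays the dG
#table; if the reordered table is needed downstream, the row order can be recovered
#from the seaborn ClusterGrid object.  Illustration only, using the variables defined
#above.
row_order = cg_pca.dendrogram_row.reordered_ind      #row positions after clustering
clustered_dG = prep_data_with_nan.iloc[row_order]    #dG table in dendrogram order
print(clustered_dG.head())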