# Disabled permutation test, kept for reference.
#tregion = region_scorer.set_params(n_clusters=4)
#ts, scores, pval = permutation_test_score(tregion, seq_data, y=known_subs.values,
#                                          scoring = SeqSklearn.normalized_mutual_info_score_linker,
#                                          n_permutations=100,
#                                          cv=Bootstrap(seq_region.shape[0], train_size=0.7, n_iter=1))
#print ts, pval

# <codecell>

# Drop missing values from both Series, then keep only the index labels
# present in both (inner join), so sequences and scores line up one-to-one.
seq_df, known_subs = aligned_seqs['Int'].dropna().align(
    nlanl_data['PSSMScore'].dropna(), join='inner')

# <codecell>

from SeqSklearn import BinBasedCluster

clust = BinBasedCluster(bins=pssm_bins)

# Window of alignment columns fed to the transform: 0-based indices
# 92..125 (34 columns), i.e. 17 either side of column 109.
region_center = 109
region_half_width = 17
getter = itemgetter(*range(region_center - region_half_width,
                           region_center + region_half_width))

# Split every sequence into a list of single characters so the getter
# can pick out individual columns.
seq_list_seqs = [list(seq) for seq in seq_df.values]

# Build the rows eagerly. On Python 3 `map` is lazy, so
# np.array(map(...)) would yield a 0-d object array and the
# `seq_region.shape[0]` lookup below would fail; a list gives the same
# 2-D character matrix on both Python 2 and Python 3.
seq_region = np.array([getter(chars) for chars in seq_list_seqs])

# The fit target is a column of ones.
# NOTE(review): presumably the transformer ignores y -- confirm.
region_transform.fit(seq_region, np.ones((seq_region.shape[0], 1)))
seq_data = region_transform.transform(seq_region)

pca_trans, biny, xx, yy, Z = clust.make_vern_points(seq_data, known_subs.values)

# <codecell>

from pylab import get_cmap

plt.figure(figsize=(10, 10))

# Add Gaussian noise (standard deviation 0.01) to every coordinate;
# presumably so that coincident points remain distinguishable.
jitter = 0.01 * np.random.randn(*pca_trans.shape) + pca_trans

# Colour each point by its score; vmax=0 caps the colour scale at 0.
plt.scatter(jitter[:, 0], jitter[:, 1], vmax=0, c=known_subs,
            cmap=get_cmap('copper_r'), alpha=0.5)
cbar = plt.colorbar()
n_clusts=range(2, 60)) t = trop_data.groupby('n_clusters')['score'].mean() e = trop_data.groupby('n_clusters')['score'].std() plt.errorbar(t.index, t.values, yerr=e.values) plt.title('Clustering of North American V3 sequnces') plt.xlabel('Cluster Size') plt.xlim([1.5, 60]) plt.ylim([0, 1]) plt.ylabel('Silhouette Score') plt.savefig('final_figures/long_NA_v3_clustering.png', dpi = 1000) # <codecell> from SeqSklearn import BinBasedCluster bin_clust = BinBasedCluster(bins = pssm_bins) pca_trans, biny, xx, yy, Z = bin_clust.make_vern_points(NA_blood_df.values, NA_wanted_lanl['PSSMScore']) # <codecell> from pylab import get_cmap plt.figure(figsize=(10,10)) jitter = 0.1*np.random.randn(*pca_trans.shape)+pca_trans plt.scatter(jitter[:,0], jitter[:,1], vmax = 0, c=NA_wanted_lanl['PSSMScore'], cmap=get_cmap('copper_r'), alpha=0.5) cbar = plt.colorbar() cbar.set_label('PSSMScore') plt.ylabel('PC-1') plt.xlabel('PC-2')