#tregion = region_scorer.set_params(n_clusters=4)
#ts, scores, pval = permutation_test_score(tregion, seq_data, y=known_subs.values,
#                                          scoring = SeqSklearn.normalized_mutual_info_score_linker,                       
#                                          n_permutations=100,
#                                          cv=Bootstrap(seq_region.shape[0], train_size=0.7, n_iter=1))
#print ts, pval

# <codecell>

# Drop missing entries from the 'Int' column of aligned_seqs and from the
# 'PSSMScore' column of nlanl_data, then inner-align the two so that both
# results carry only the index labels they have in common.
seq_df, known_subs = aligned_seqs['Int'].dropna().align(
    nlanl_data['PSSMScore'].dropna(),
    join='inner')

# <codecell>

from SeqSklearn import BinBasedCluster

# Clusterer configured with the PSSM bins defined elsewhere in the notebook.
clust = BinBasedCluster(bins=pssm_bins)

# Select positions 92..125 of every sequence: a 34-position window made of
# the 17 positions before index 109 and the 17 from 109 onwards.
getter = itemgetter(*range(109-17, 109+17))

# Turn each entry of seq_df.values into a list so it can be indexed by position.
seq_list_seqs = [list(l) for l in seq_df.values]

# Materialise the map before handing it to np.array: on Python 3 map() is
# lazy and np.array(map(...)) would produce a 0-d object array, breaking the
# seq_region.shape[0] lookup below. On Python 2 this is equivalent to the
# plain map() call.
seq_region = np.array(list(map(getter, seq_list_seqs)))

# Fit the transformer on the windowed region. The target is an all-ones
# column with one row per sequence.
# NOTE(review): presumably a placeholder because region_transform ignores y
# -- confirm against its definition.
region_transform.fit(seq_region, np.ones((seq_region.shape[0], 1)))
seq_data = region_transform.transform(seq_region)

# make_vern_points is handed the transformed data and the raw score values;
# its five return values are unpacked for use by later cells.
pca_trans, biny, xx, yy, Z = clust.make_vern_points(seq_data, known_subs.values)

# <codecell>

from pylab import get_cmap

# Square 10x10 inch canvas for the scatter plot.
plt.figure(figsize=(10, 10))

# Add Gaussian noise (scale 0.01) to every coordinate of pca_trans,
# presumably so that coincident points remain distinguishable.
jitter = pca_trans + 0.01 * np.random.randn(*pca_trans.shape)

# Column 0 on the x-axis, column 1 on the y-axis, coloured by known_subs with
# the reversed 'copper' colormap; vmax=0 pins the top of the colour scale at 0.
plt.scatter(
    jitter[:, 0],
    jitter[:, 1],
    vmax=0,
    c=known_subs,
    cmap=get_cmap('copper_r'),
    alpha=0.5)
cbar = plt.colorbar()
                                n_clusts=range(2, 60))
# Per-'n_clusters' mean and standard deviation of the 'score' column.
t = trop_data.groupby('n_clusters')['score'].mean()
e = trop_data.groupby('n_clusters')['score'].std()

# Mean score for each n_clusters value, with one standard deviation as the
# error bar.
plt.errorbar(t.index, t.values, yerr=e.values)

# Title spelling corrected ('sequnces' -> 'sequences'); this text ends up in
# the saved figure.
plt.title('Clustering of North American V3 sequences')

# NOTE(review): the x values are the 'n_clusters' groups, so 'Number of
# clusters' may be the intended label -- confirm before changing.
plt.xlabel('Cluster Size')
plt.xlim([1.5, 60])
plt.ylim([0, 1])
plt.ylabel('Silhouette Score')

# Written at 1000 dpi; the target directory must already exist.
plt.savefig('final_figures/long_NA_v3_clustering.png', dpi=1000)

# <codecell>

from SeqSklearn import BinBasedCluster

# Clusterer configured with the PSSM bins defined elsewhere in the notebook.
bin_clust = BinBasedCluster(bins=pssm_bins)

# Feed the raw NA_blood_df values and the matching 'PSSMScore' column to
# make_vern_points and unpack its five return values.
# NOTE(review): presumably a 2-D projection (pca_trans) plus a grid
# (xx, yy, Z) -- confirm against SeqSklearn.
pca_trans, biny, xx, yy, Z = bin_clust.make_vern_points(
    NA_blood_df.values,
    NA_wanted_lanl['PSSMScore'])


# <codecell>

from pylab import get_cmap

# Square 10x10 inch canvas for the scatter plot.
plt.figure(figsize=(10, 10))

# Add Gaussian noise (scale 0.1) to every coordinate of pca_trans,
# presumably so that coincident points remain distinguishable.
jitter = pca_trans + 0.1 * np.random.randn(*pca_trans.shape)

# Column 0 on the x-axis, column 1 on the y-axis, coloured by the
# 'PSSMScore' column with the reversed 'copper' colormap; vmax=0 pins the
# top of the colour scale at 0.
plt.scatter(
    jitter[:, 0],
    jitter[:, 1],
    vmax=0,
    c=NA_wanted_lanl['PSSMScore'],
    cmap=get_cmap('copper_r'),
    alpha=0.5)

cbar = plt.colorbar()
cbar.set_label('PSSMScore')

# NOTE(review): the x-axis plots column 0 but is labelled 'PC-2', and the
# y-axis plots column 1 but is labelled 'PC-1'. The labels look swapped --
# confirm the column order returned by make_vern_points before changing.
plt.ylabel('PC-1')
plt.xlabel('PC-2')