-
Notifications
You must be signed in to change notification settings - Fork 0
/
SeqSklearn.py
130 lines (90 loc) · 3.7 KB
/
SeqSklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
__author__ = 'will'
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from itertools import product
from scipy.sparse import csr_matrix, eye
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cross_validation import Bootstrap
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from types import IntType, ListType, TupleType
from sklearn.decomposition import PCA, RandomizedPCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
####Linker functions!
def silhouette_score_linker(predictor, X, y=None):
    """Score a fitted predictor by the silhouette of its predictions on X.

    ``y`` is accepted so the function matches the ``(estimator, X, y)``
    scorer call signature, but it is not used.

    When every sample is predicted into the same cluster, the label of the
    last sample is bumped by one (in place, on the array returned by
    ``predictor.predict``) so that ``silhouette_score`` is handed two
    distinct labels.
    """
    labels = predictor.predict(X)
    distinct_labels = set(labels)
    if len(distinct_labels) == 1:
        # Force a second label onto the final sample.
        labels[-1] = labels[-1] + 1
    return silhouette_score(X, labels)
def normalized_mutual_info_score_linker(predictor, X, y):
    """Return the normalized mutual information between ``y`` and the
    labels ``predictor`` predicts for ``X``.
    """
    return normalized_mutual_info_score(y, predictor.predict(X))
def normalized_mutual_info_score_scorefunc(X, y):
    """Per-column normalized mutual information between ``X`` and ``y``.

    Returns a ``(scores, pvals)`` pair of arrays with one entry per column
    of ``X``. No p-value is actually computed: every entry of ``pvals`` is
    the constant 1.
    """
    n_features = X.shape[1]
    scores = [normalized_mutual_info_score(X[:, idx], y)
              for idx in range(n_features)]
    # Placeholder p-values, one per column.
    pvals = [1] * n_features
    return np.array(scores), np.array(pvals)
### Transformers
class BioTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a 2-D array of single sequence characters.

    Parameters
    ----------
    typ : str
        ``'nuc'`` selects the alphabet ``'ACGT-'``; any other value selects
        ``'ARNDCEQGHILKMFPSTWYV-'``.
    """

    def __init__(self, typ='nuc'):
        self.typ = typ

    def fit(self, *args):
        """Do nothing and return ``self``; the encoding has no fitted state."""
        return self

    def transform(self, X):
        """Expand every column of ``X`` into one indicator column per letter.

        ``X`` must expose ``.shape`` as ``(nrows, ncols)`` and support
        ``X[row, col]`` indexing that yields an object with ``.upper()``.
        Comparison is done on the upper-cased cell value. A cell whose value
        is not in the alphabet leaves all of its indicator columns at zero.

        Returns
        -------
        A dense matrix (the result of ``csr_matrix(...).todense()``) of
        shape ``(nrows, ncols * len(alphabet))`` and float dtype. The
        indicator for letter ``k`` of input column ``c`` sits at output
        column ``c * len(alphabet) + k``.
        """
        if self.typ == 'nuc':
            letters = 'ACGT-'
        else:
            letters = 'ARNDCEQGHILKMFPSTWYV-'
        nrows, ncols = X.shape
        width = len(letters)

        # Map each letter to its offset inside a column's indicator group so
        # each cell costs one lookup instead of a scan over every letter.
        offset_of = dict((letter, pos) for pos, letter in enumerate(letters))

        data = []
        rows = []
        cols = []
        for row in range(nrows):
            for col in range(ncols):
                pos = offset_of.get(X[row, col].upper())
                if pos is not None:
                    data.append(1)
                    rows.append(row)
                    cols.append(col * width + pos)
        return csr_matrix((np.array(data), (np.array(rows), np.array(cols))),
                          shape=(nrows, ncols * width), dtype=float).todense()
### Estimators
class BinBasedCluster(BaseEstimator):
    """Nearest-centroid predictor trained on a binned version of the target.

    Target values are converted to bin indices with ``np.digitize`` and a
    ``NearestCentroid`` model is fitted against those indices.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges passed to ``np.digitize``. The default is
        ``[0, 0.5, 1, 5, 6, ..., 35]``.
    """

    # The default value is kept as the same list so the constructor
    # signature is unchanged. It is shared by all instances built without an
    # explicit ``bins``; nothing in this class mutates it. ``list(range())``
    # keeps the concatenation valid on Python 3 as well as Python 2.
    def __init__(self, bins=[0, 0.5, 1] + list(range(5, 36))):
        self.bins = bins

    def fit(self, X, y):
        """Bin ``y`` and fit a ``NearestCentroid`` on ``X`` against the bins.

        The fitted model is stored on ``self.pred``. Returns ``self``.
        """
        biny = self.bin_data(y)
        self.pred = NearestCentroid().fit(X, biny)
        return self

    def predict(self, X):
        """Return the bin index predicted for each row of ``X``.

        Requires ``fit`` to have been called (reads ``self.pred``).
        """
        return self.pred.predict(X)

    def score(self, X, y, is_raw=True):
        """Adjusted Rand score between the predictions for ``X`` and ``y``.

        When ``is_raw`` is true, ``y`` is binned with ``bin_data`` before
        the comparison; otherwise ``y`` is compared as given.
        """
        clusters = self.pred.predict(X)
        if is_raw:
            return adjusted_rand_score(self.bin_data(y), clusters)
        else:
            return adjusted_rand_score(y, clusters)

    def bin_data(self, y):
        """Return ``np.digitize(y, self.bins)``."""
        return np.digitize(y, self.bins)

    def make_vern_points(self, X, y):
        """Build a 2-D projection of the data plus a grid of predictions.

        Steps, all independent of any model stored by ``fit``:

        1. Select features with ``SelectKBest`` (its default ``k``), scored
           by ``normalized_mutual_info_score_scorefunc``.
        2. Project the selected features onto two PCA components.
        3. Fit a fresh ``NearestCentroid`` on the projection against the
           binned ``y``.
        4. Predict a bin for every node of a 50 x 50 grid spanning the
           projection's range padded by 1 on each side.

        The shapes of ``X`` and of the selected data are printed as a
        diagnostic.

        Returns
        -------
        (pca_trans, biny, xx, yy, Z) : the projected points, the binned
        target, the two mesh-grid coordinate arrays, and the predicted bin
        for each grid node reshaped to the grid's shape.
        """
        # NOTE(review): presumably intended as input for a decision-region
        # plot -- confirm against the caller.
        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        # Same text as the old ``print X.shape, sdata.shape`` statement,
        # written so it also parses on Python 3.
        print("%s %s" % (X.shape, sdata.shape))

        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)

        biny = self.bin_data(y)
        pred = NearestCentroid().fit(pca_trans, biny)

        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                             np.linspace(y_min, y_max, 50))

        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        return pca_trans, biny, xx, yy, Z