-
Notifications
You must be signed in to change notification settings - Fork 0
/
classify.py
107 lines (91 loc) · 3.52 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os, sys
import numpy as np
import pandas as pd
import error_metrics as em
from sklearn import ensemble
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold
import sklearn.metrics as metrics
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
from mile import get_attributes_map
from sklearn.externals import joblib
def get_balanced_data(X):
#sort into a promoter matrix and an enhancer matrix
X_p = X[X['Labels'].isin([1])]
X_p.index = range(len(X_p))
X_e = X[X['Labels'].isin([0])]
X_e.index = range(len(X_e))
#take a random sample of the larger promoter matrix
#such that it is the same length as the enhancers
X_p_sample = X_p.sample(len(X_e))
#X_p_short = X_p.iloc[0:len(X_e),:]
#combine the promoters and enhancers back into one matrix
X_balanced = X_p_sample.append(X_e)
# X_balanced = X_p_short.append(X_e)
X_balanced.index = range(len(X_balanced))
#Shuffle the data, such that it is not all promoters first
X_balanced_shuffled = X_balanced.reindex(np.random.permutation(X_balanced.index))
X_balanced_shuffled.index = range(len(X_balanced_shuffled))
return X_balanced_shuffled
def trainModel(X_train, Y_train, X_test, Y_test, model="NB"):
if model == "NB":
clf = GaussianNB()
elif model == "RF":
clf = ensemble.RandomForestClassifier()
elif model == "GB":
clf = ensemble.GradientBoostingClassifier(learning_rate=0.1,
n_estimators=100,verbose=1)
clf.fit(X_train, Y_train)
Y_score = clf.predict_proba(X_test)
auc = em.get_roc(Y_test,Y_score[:,1])
return clf, auc
def print_important_params(model_type, clf, k):
if model_type == "NB":
imp_features = pd.Series(clf.theta_[1,:])
else:
imp_features = pd.Series(clf.feature_importances_)
attributes_map = get_attributes_map(['A','C','G','T'],k)
reverse_attributes_map = pd.Series(attributes_map.index)
top_ten_weights = imp_features.sort_values(ascending=False).head(10)
top_ten_kmers = reverse_attributes_map.iloc[top_ten_weights.index]
print("Top ten important features:")
print(top_ten_weights)
print("Top 10 kmers:")
print(top_ten_kmers)
def main():
cell_types = []
cell_types.append("A549")
cell_types.append("GM12878")
cell_types.append("K562")
cell_types.append("combination")
for i in range(0, 1):
k=6
trained_cell_type = cell_types[3] # training on combination/all data (A549, GM12878, K562)
min_region_size = 1000
train_matrix = "/srv/scratch/zho/" + str(min_region_size) + "_" + trained_cell_type + "_train_matrix.csv"
output_vector = "/srv/scratch/zho/" + str(min_region_size) + "_" + trained_cell_type + "_output_vector.csv"
X = pd.DataFrame.from_csv(train_matrix)
X['Labels']= pd.Series.from_csv(output_vector)
X_balanced_shuffled = get_balanced_data(X)
kf = KFold(len(X_balanced_shuffled),n_folds=6)
cur_auc = 0
for train, test in kf:
df = X_balanced_shuffled
m,n = df.shape
new_clf, new_auc = trainModel(df.iloc[train,0:n-1], df.iloc[train,n-1],
df.iloc[test,0:n-1], df.iloc[test,n-1],sys.argv[1])
if new_auc > cur_auc:
cur_auc = new_auc
best_clf = new_clf
print("updating best auc to:")
print(cur_auc)
#save model to file
joblib.dump(best_clf, "/srv/scratch/zho/models/" + str(sys.argv[1]) + "_model.pkl")
print_important_params(sys.argv[1], best_clf, k)
return
if __name__ == '__main__':
main()