/
utils.py
122 lines (103 loc) · 4.05 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
# encoding: utf-8
import numpy as np, sys, math, os, h5py
from sklearn.metrics import roc_curve, precision_recall_curve, auc, r2_score
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
from multiprocessing import Pool
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Generate sample ids for k-fold cross-validation
def Id_k_folds(seqs_num, k_folds, ratio, train_fraction=0.7):
    """Build train/test/validation index arrays for (k-fold) cross-validation.

    Indices are used in their natural order ``0 .. seqs_num-1``; nothing is
    shuffled here.

    Args:
        seqs_num: total number of samples.
        k_folds: number of folds. With ``k_folds == 1`` a single hold-out
            split is produced instead of a cross-validation.
        ratio: fraction of the non-test samples set aside for validation.
        train_fraction: only used when ``k_folds == 1``; fraction of all
            samples that is NOT held out for testing (default 0.7).

    Returns:
        A tuple ``(train_ids, test_ids, valid_ids)`` of three lists, each
        holding one numpy index array per fold.
    """
    train_ids, test_ids, valid_ids = [], [], []
    all_ids = np.arange(seqs_num)

    if k_folds == 1:
        # Single hold-out split: [train | valid | test] in index order.
        non_test_num = int(seqs_num * train_fraction)
        valid_num = int(non_test_num * ratio)
        train_num = non_test_num - valid_num
        train_ids.append(all_ids[:train_num].copy())
        valid_ids.append(all_ids[train_num:non_test_num].copy())
        test_ids.append(all_ids[non_test_num:].copy())
        return (train_ids, test_ids, valid_ids)

    # Integer ceiling division: unlike math.ceil(a / b) it does not depend on
    # whether "/" is true division, so the last samples always land in a fold.
    each_fold_num = -(-seqs_num // k_folds)
    for fold in range(k_folds):
        start = fold * each_fold_num
        stop = start + each_fold_num
        test_ids.append(all_ids[start:stop].copy())
        # Keep the remaining ids in ascending order. Going through a set here
        # would make the train/valid cut depend on hash-table iteration order.
        remaining = np.concatenate((all_ids[:start], all_ids[stop:]))
        train_num = len(remaining) - int(len(remaining) * ratio)
        train_ids.append(remaining[:train_num])
        valid_ids.append(remaining[train_num:])
    return (train_ids, test_ids, valid_ids)
# Compute the roc AUC and the precision-recall AUC
def ComputeAUC(y_pred, y_real):
    """Return ``(roc_auc, pr_auc)`` for predicted scores against true labels.

    Both areas are obtained by passing the curve points to ``auc``: the ROC
    area from ``roc_curve`` (x = fpr, y = tpr) and the precision-recall area
    from ``precision_recall_curve`` (x = recall, y = precision).

    Args:
        y_pred: predicted scores.
        y_real: ground-truth labels.
    """
    # ROC curve; the returned thresholds are not needed.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_real, y_pred)
    area_under_roc = auc(false_pos_rate, true_pos_rate)

    # Precision-recall curve; thresholds are again discarded.
    precisions, recalls, _ = precision_recall_curve(y_real, y_pred)
    area_under_pr = auc(recalls, precisions)

    return (area_under_roc, area_under_pr)
# Compute the regression score (currently R^2 via r2_score, not the Pearson correlation coefficient)
def ComputePCC(y_pred, y_real):
    """Return the regression score of ``y_pred`` against ``y_real``.

    NOTE: despite the function name, the value returned is the coefficient
    of determination computed by ``r2_score(y_real, y_pred)``, not the
    Pearson correlation coefficient.
    """
    return r2_score(y_real, y_pred)
# Generate a random hyper-parameter setting
def RandomSample():
    """Draw one random hyper-parameter setting.

    Returns:
        The result of sampling a search space with the keys ``'DROPOUT'``,
        ``'DELTA'`` and ``'MOMENT'``, each picked from a fixed tuple of
        candidate values.
    """
    # (result key, hyperopt label, candidate values)
    candidates = (
        ('DROPOUT', 'drop', (0.2, 0.5)),
        ('DELTA', 'delta', (1e-04, 1e-06, 1e-08)),
        ('MOMENT', 'moment', (0.9, 0.99, 0.999)),
    )
    search_space = {}
    for key, label, options in candidates:
        search_space[key] = hp.choice(label, options)
    return sample(search_space)
# Select the best parameter setting
def SelectBest(history_all, file_path, fold, monitor='val_loss'):
    """Pick the best run of a fold and delete the saved models of the others.

    Args:
        history_all: mapping from run number to an object whose ``.history``
            dict holds the per-epoch ``'val_loss'`` / ``'val_acc'`` values.
            NOTE(review): the clean-up below assumes the run numbers are
            ``0 .. len(history_all) - 1`` -- confirm against the caller.
        file_path: prefix prepended verbatim to each model file name.
        fold: fold number used in the model file names.
        monitor: ``'val_loss'`` selects the run with the lowest validation
            loss; any other value selects the highest ``'val_acc'``.

    Returns:
        The run number (as ``int``) of the best run.

    Raises:
        ValueError: if no run can be selected (empty ``history_all`` or only
            NaN scores).
    """
    minimize = monitor == 'val_loss'
    metric = 'val_loss' if minimize else 'val_acc'

    best_num = None
    # +/- infinity instead of a magic bound, so any finite score qualifies.
    best_score = float('inf') if minimize else float('-inf')
    for num, History in history_all.items():
        values = History.history[metric]
        score = np.min(values) if minimize else np.max(values)
        improved = score < best_score if minimize else score > best_score
        if improved:
            best_num = int(num)
            best_score = score

    if best_num is None:
        raise ValueError('no run could be selected from history_all')

    # range() is not a list on Python 3, so materialise it before popping.
    del_num = list(range(len(history_all)))
    del_num.pop(best_num)
    # delete the model parameters of every run that was not selected
    for num in del_num:
        os.remove(file_path + 'params%d_bestmodel_%dfold.hdf5' % (num, fold))
    return best_num
# plot and save the training process
def PlotandSave(History, filepath, fold, monitor='val_loss'):
    """Plot the training and validation curves of one run and save a PNG.

    Args:
        History: object whose ``.history`` dict holds the per-epoch values
            (``'loss'`` / ``'val_loss'`` or ``'acc'`` / ``'val_acc'``).
        filepath: destination passed to ``plt.savefig``.
        fold: fold number, used as the matplotlib figure number.
        monitor: ``'val_loss'`` plots the loss curves; any other value plots
            the accuracy curves.
    """
    if monitor == 'val_loss':
        train_key, valid_key = 'loss', 'val_loss'
        title, ylabel = 'model loss', 'loss'
        legend = ['train_loss', 'valid_loss']
    else:
        train_key, valid_key = 'acc', 'val_acc'
        title, ylabel = 'model accuracy', 'accuracy'
        legend = ['train_acc', 'valid_acc']

    train_curve = History.history[train_key]
    valid_curve = History.history[valid_key]
    epochs = range(len(train_curve))

    fig = plt.figure(num=fold)
    try:
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.plot(epochs, train_curve, 'r-', epochs, valid_curve, 'g-')
        plt.legend(legend, loc='upper left')
        plt.savefig(filepath, format='png')
    finally:
        # Close the figure so that a later call with the same fold number
        # starts from an empty canvas and figures do not pile up in memory.
        plt.close(fig)