def SVM_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of an RBF SVM.

    Decodes (cost, gamma, window_size) from *chromosome*, runs
    self.fold-fold cross validation and returns the mean AUC.
    Previously evaluated parameter sets are answered from the log cache
    so a repeated chromosome is not re-trained.
    """
    cost, gamma, window_size = self.decode_chromosome(chromosome)
    if self.check_log(cost, gamma, window_size):
        # Cached result: skip the expensive cross validation entirely.
        return self.get_means_from_log(cost, gamma, window_size)[0]
    folded_dataset = self.create_folded_dataset(window_size)
    mean_AUC = 0
    mean_decision_value = 0
    mean_mcc = 0
    sample_size_over_thousand_flag = False
    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = \
            folded_dataset.get_test_and_training_dataset(test_fold)
        # Large datasets: evaluate a single fold only to bound runtime.
        if len(test_labels) + len(train_labels) > 1000:
            sample_size_over_thousand_flag = True
        # NOTE(review): class_weight='auto' is the pre-0.17 sklearn
        # spelling of 'balanced' — confirm the installed sklearn version.
        clf = svm.SVC(C=cost, gamma=gamma, class_weight='auto')
        clf.fit(train_dataset, train_labels)
        decision_values = clf.decision_function(test_dataset)
        # Some sklearn versions wrap each score in a one-element row;
        # flatten to a plain list of scalars.
        if isinstance(decision_values[0], (list, numpy.ndarray)):
            decision_values = [row[0] for row in decision_values]
        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(
            decision_values, test_labels)
        mean_AUC += AUC
        mean_decision_value += decision_value_and_max_mcc[0]
        mean_mcc += decision_value_and_max_mcc[1]
        if sample_size_over_thousand_flag:
            break
    if not sample_size_over_thousand_flag:
        # Only a completed cross validation is averaged; the single-fold
        # shortcut reports the raw fold values.
        mean_AUC /= self.fold
        mean_decision_value /= self.fold
        mean_mcc /= self.fold
    self.write_log(cost, gamma, window_size, mean_AUC,
                   mean_decision_value, mean_mcc)
    self.add_log(cost, gamma, window_size, mean_AUC,
                 mean_decision_value, mean_mcc)
    return mean_AUC
def neuralNetwork_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of a PyBrain feed-forward net.

    Decodes (node_num, learning_rate, window_size) from *chromosome*,
    trains one network per fold and returns the mean AUC over the folds.
    Previously evaluated parameter sets are answered from the log cache.
    """
    node_num, learning_rate, window_size = self.decode_chromosome(chromosome)
    if self.check_log(node_num, learning_rate, window_size):
        # Cached result: skip training entirely.
        return self.get_means_from_log(node_num, learning_rate, window_size)[0]
    folded_dataset = self.create_folded_dataset(window_size)
    # 21 values per sequence position (presumably an amino-acid encoding
    # — TODO confirm) over a window of 2*window_size + 1 positions.
    indim = 21 * (2 * window_size + 1)
    mean_AUC = 0
    mean_decision_value = 0
    mean_mcc = 0
    sample_size_over_thousand_flag = False
    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold)
        # Large datasets: evaluate a single fold only (see break below).
        if len(test_labels) + len(train_labels) > 1000:
            sample_size_over_thousand_flag = True
        ds = SupervisedDataSet(indim, 1)
        for i in xrange(len(train_labels)):
            ds.appendLinked(train_dataset[i], [train_labels[i]])
        # One hidden layer of node_num units, sigmoid output in [0, 1].
        net = buildNetwork(indim, node_num, 1, outclass=SigmoidLayer, bias=True)
        trainer = BackpropTrainer(net, ds, learningrate=learning_rate)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs_for_trainer)
        # net.activate returns a one-element output vector; unwrap it.
        decision_values = [net.activate(test_dataset[i]) for i in xrange(len(test_labels))]
        decision_values = map(lambda x: x[0], decision_values)
        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels)
        mean_AUC += AUC
        mean_decision_value += decision_value_and_max_mcc[0]
        mean_mcc += decision_value_and_max_mcc[1]
        if sample_size_over_thousand_flag:
            break
    if not sample_size_over_thousand_flag:
        # Averages only apply when the full cross validation ran.
        mean_AUC /= self.fold
        mean_decision_value /= self.fold
        mean_mcc /= self.fold
    self.write_log(node_num, learning_rate, window_size, mean_AUC, mean_decision_value, mean_mcc)
    self.add_log(node_num, learning_rate, window_size, mean_AUC, mean_decision_value, mean_mcc)
    return mean_AUC
def randomForest_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of a random forest.

    Decodes (n_estimators, max_features, window_size) from *chromosome*,
    runs self.fold-fold cross validation and returns the mean AUC.
    Previously evaluated parameter sets are answered from the log cache.
    """
    n_estimators, max_features, window_size = self.decode_chromosome(chromosome)
    if self.check_log(n_estimators, max_features, window_size):
        # Cached result: skip the expensive cross validation entirely.
        return self.get_means_from_log(n_estimators, max_features, window_size)[0]
    folded_dataset = self.create_folded_dataset(window_size)
    mean_AUC = 0
    mean_decision_value = 0
    mean_mcc = 0
    sample_size_over_thousand_flag = False
    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = \
            folded_dataset.get_test_and_training_dataset(test_fold)
        # Large datasets: evaluate a single fold only to bound runtime.
        if len(test_labels) + len(train_labels) > 1000:
            sample_size_over_thousand_flag = True
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     max_features=max_features)
        clf.fit(train_dataset, train_labels)
        # Column 1 of predict_proba is the probability of the positive
        # (binding residue) class; use it as the decision value.
        decision_values = [proba[1] for proba in clf.predict_proba(test_dataset)]
        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(
            decision_values, test_labels)
        mean_AUC += AUC
        mean_decision_value += decision_value_and_max_mcc[0]
        mean_mcc += decision_value_and_max_mcc[1]
        if sample_size_over_thousand_flag:
            break
    if not sample_size_over_thousand_flag:
        # Only a completed cross validation is averaged; the single-fold
        # shortcut reports the raw fold values.
        mean_AUC /= self.fold
        mean_decision_value /= self.fold
        mean_mcc /= self.fold
    self.write_log(n_estimators, max_features, window_size, mean_AUC,
                   mean_decision_value, mean_mcc)
    self.add_log(n_estimators, max_features, window_size, mean_AUC,
                 mean_decision_value, mean_mcc)
    return mean_AUC
def run_SVM(self, cost, gamma):
    """Cross-validate an RBF SVM with the given cost/gamma and log results.

    For every fold, performance is measured two ways: labels derived by
    thresholding the decision values at the MCC-optimal cutoff, and
    labels from the classifier's own predict(); the per-fold
    [SE, SP, ACC, MCC] vectors and AUC/cutoff/MCC are averaged over
    self.fold folds and written to the log.
    """
    auc_total = 0
    cutoff_total = 0
    mcc_total = 0
    # Accumulated [SE, SP, ACC, MCC] for the two labelling strategies.
    perf_by_cutoff = [0, 0, 0, 0]
    perf_by_predict = [0, 0, 0, 0]
    for fold_index in xrange(self.fold):
        test_y, test_x, train_y, train_x = \
            self.folded_dataset.get_test_and_training_dataset(fold_index)
        model = svm.SVC(C=cost, gamma=gamma, class_weight='auto')
        model.fit(train_x, train_y)
        scores = model.decision_function(test_x)
        # Older sklearn returns one-element rows; flatten to scalars.
        if type(scores[0]) in (list, numpy.ndarray):
            scores = [row[0] for row in scores]
        auc, cutoff_and_mcc = validate_performance.calculate_AUC(scores, test_y)
        cutoff = cutoff_and_mcc[0]
        labels_from_cutoff = [1 if score >= cutoff else 0 for score in scores]
        labels_from_predict = model.predict(test_x)
        perf_by_cutoff = self.add_performances(
            perf_by_cutoff,
            validate_performance.calculate_performance(test_y, labels_from_cutoff))
        perf_by_predict = self.add_performances(
            perf_by_predict,
            validate_performance.calculate_performance(test_y, labels_from_predict))
        auc_total += auc
        cutoff_total += cutoff
        mcc_total += cutoff_and_mcc[1]
    auc_total /= self.fold
    cutoff_total /= self.fold
    mcc_total /= self.fold
    fold_count = float(self.fold)
    perf_by_cutoff = [value / fold_count for value in perf_by_cutoff]
    perf_by_predict = [value / fold_count for value in perf_by_predict]
    self.write_log(cost, gamma, auc_total, cutoff_total, mcc_total,
                   perf_by_cutoff, perf_by_predict)
def neuralNetwork_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of a feed-forward net.

    The chromosome encodes (hidden node count, learning rate, window
    size).  One network is trained per fold; the mean AUC over the
    folds is logged and returned.  Repeated chromosomes are served
    from the log cache without retraining.
    """
    hidden_nodes, lr, win = self.decode_chromosome(chromosome)
    if self.check_log(hidden_nodes, lr, win):
        # Already evaluated: reuse the cached mean AUC.
        return self.get_means_from_log(hidden_nodes, lr, win)[0]
    dataset = self.create_folded_dataset(win)
    input_dim = 21 * (2 * win + 1)
    auc_sum = 0
    cutoff_sum = 0
    mcc_sum = 0
    big_sample = False
    for fold_index in xrange(self.fold):
        test_y, test_x, train_y, train_x = \
            dataset.get_test_and_training_dataset(fold_index)
        # Over 1000 samples: a single fold is treated as representative.
        if len(test_y) + len(train_y) > 1000:
            big_sample = True
        training_set = SupervisedDataSet(input_dim, 1)
        for features, label in zip(train_x, train_y):
            training_set.appendLinked(features, [label])
        net = buildNetwork(input_dim, hidden_nodes, 1,
                           outclass=SigmoidLayer, bias=True)
        trainer = BackpropTrainer(net, training_set, learningrate=lr)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs_for_trainer)
        # activate() yields a one-element output vector; take its scalar.
        scores = [net.activate(test_x[i])[0] for i in xrange(len(test_y))]
        auc, cutoff_and_mcc = validate_performance.calculate_AUC(scores, test_y)
        auc_sum += auc
        cutoff_sum += cutoff_and_mcc[0]
        mcc_sum += cutoff_and_mcc[1]
        if big_sample:
            break
    if not big_sample:
        # Averages only apply when the full cross validation ran.
        auc_sum /= self.fold
        cutoff_sum /= self.fold
        mcc_sum /= self.fold
    self.write_log(hidden_nodes, lr, win, auc_sum, cutoff_sum, mcc_sum)
    self.add_log(hidden_nodes, lr, win, auc_sum, cutoff_sum, mcc_sum)
    return auc_sum
def test_calculate_AUC(self):
    """calculate_AUC handles perfect, chance, tied and partial rankings."""
    # Cases where the AUC is exact: (decision values, labels, expected).
    exact_cases = [
        ([-1, -0.5, -0.1, 0.1, 0.5, 1], [0, 0, 0, 1, 1, 1], 1.0),
        ([-1, 0.5, -0.1, 0.1, -0.5, 1], [0, 1, 0, 1, 0, 1], 1.0),
        ([1.0] * 6, [0, 1, 0, 1, 0, 1], 0.5),
        ([-1, -0.5, 0.5, 1], [0, 1, 0, 1], 0.75),
    ]
    for scores, labels, expected in exact_cases:
        AUC, mcc = validate_performance.calculate_AUC(scores, labels)
        self.assertEqual(AUC, expected)
    # Larger cases compared to 5 decimal places: 10 negatives, 10 positives.
    ten_and_ten = [0] * 10 + [1] * 10
    rounded_cases = [
        ([1.0] * 2 + [-1.0] * 8 + [1.0] * 2 + [0.5] * 8, 0.82),
        ([-1.0] * 8 + [0.0] * 2 + [0.0] * 2 + [0.5] * 8, 0.98),
        ([-1.0] * 8 + [0.2] * 2 + [0.1] * 3 + [0.2] * 2 + [0.5] * 5, 0.92),
        ([-1.0] * 6 + [0.0] * 2 + [0.2] * 2 + [0.0] * 2 + [0.1] * 1
         + [0.2] * 2 + [0.5] * 5, 0.9),
    ]
    for scores, expected in rounded_cases:
        AUC, mcc = validate_performance.calculate_AUC(scores, ten_and_ten)
        self.assertEqual(round(AUC * (10 ** 5)) / (10 ** 5), expected)
def randomForest_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of a random forest.

    Decodes (n_estimators, max_features, window_size) from *chromosome*,
    runs self.fold-fold cross validation and returns the mean AUC.
    Previously evaluated parameter sets are answered from the log cache.
    """
    n_estimators, max_features, window_size = self.decode_chromosome(
        chromosome)
    if self.check_log(n_estimators, max_features, window_size):
        # Cached result: skip the cross validation entirely.
        return self.get_means_from_log(n_estimators, max_features,
                                       window_size)[0]
    folded_dataset = self.create_folded_dataset(window_size)
    # 21 values per sequence position (presumably an amino-acid encoding
    # — TODO confirm) over a window of 2*window_size + 1 positions.
    # NOTE(review): indim is computed but unused in this method.
    indim = 21 * (2 * window_size + 1)
    mean_AUC = 0
    mean_decision_value = 0
    mean_mcc = 0
    sample_size_over_thousand_flag = False
    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(
            test_fold)
        # Large datasets: evaluate a single fold only (see break below).
        if len(test_labels) + len(train_labels) > 1000:
            sample_size_over_thousand_flag = True
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     max_features=max_features)
        clf.fit(train_dataset, train_labels)
        probas = clf.predict_proba(test_dataset)
        decision_values = map(
            lambda x: x[1], probas)  # Probability of being binding residue
        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(
            decision_values, test_labels)
        mean_AUC += AUC
        mean_decision_value += decision_value_and_max_mcc[0]
        mean_mcc += decision_value_and_max_mcc[1]
        if sample_size_over_thousand_flag:
            break
    if not sample_size_over_thousand_flag:
        # Averages only apply when the full cross validation ran.
        mean_AUC /= self.fold
        mean_decision_value /= self.fold
        mean_mcc /= self.fold
    self.write_log(n_estimators, max_features, window_size, mean_AUC,
                   mean_decision_value, mean_mcc)
    self.add_log(n_estimators, max_features, window_size, mean_AUC,
                 mean_decision_value, mean_mcc)
    return mean_AUC
def SVM_eval_func(self, chromosome):
    """Fitness function: mean cross-validated AUC of an RBF SVM.

    Decodes (cost, gamma, window_size) from *chromosome*, runs
    self.fold-fold cross validation and returns the mean AUC.
    Previously evaluated parameter sets are answered from the log cache
    so a repeated chromosome is not re-trained.
    """
    cost, gamma, window_size = self.decode_chromosome(chromosome)
    if self.check_log(cost, gamma, window_size):
        # Cached result: skip the expensive cross validation entirely.
        return self.get_means_from_log(cost, gamma, window_size)[0]
    folded_dataset = self.create_folded_dataset(window_size)
    mean_AUC = 0
    mean_decision_value = 0
    mean_mcc = 0
    sample_size_over_thousand_flag = False
    for test_fold in xrange(self.fold):
        test_labels, test_dataset, train_labels, train_dataset = \
            folded_dataset.get_test_and_training_dataset(test_fold)
        # Large datasets: evaluate a single fold only to bound runtime.
        if len(test_labels) + len(train_labels) > 1000:
            sample_size_over_thousand_flag = True
        # NOTE(review): class_weight='auto' is the pre-0.17 sklearn
        # spelling of 'balanced' — confirm the installed sklearn version.
        clf = svm.SVC(C=cost, gamma=gamma, class_weight='auto')
        clf.fit(train_dataset, train_labels)
        decision_values = clf.decision_function(test_dataset)
        # Some sklearn versions wrap each score in a one-element row;
        # flatten to a plain list of scalars.
        if isinstance(decision_values[0], (list, numpy.ndarray)):
            decision_values = [row[0] for row in decision_values]
        AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(
            decision_values, test_labels)
        mean_AUC += AUC
        mean_decision_value += decision_value_and_max_mcc[0]
        mean_mcc += decision_value_and_max_mcc[1]
        if sample_size_over_thousand_flag:
            break
    if not sample_size_over_thousand_flag:
        # Only a completed cross validation is averaged; the single-fold
        # shortcut reports the raw fold values.
        mean_AUC /= self.fold
        mean_decision_value /= self.fold
        mean_mcc /= self.fold
    self.write_log(cost, gamma, window_size, mean_AUC,
                   mean_decision_value, mean_mcc)
    self.add_log(cost, gamma, window_size, mean_AUC,
                 mean_decision_value, mean_mcc)
    return mean_AUC