def main(): tr_data_arr, tr_label_arr = lds.load('./data_set/adult.data') pred_data_arr, pred_label_arr = lds.load('./data_set/adult.test') lr_st = time.time() lr_pred_data_len, lr_wrong, lr_accuracy = lr_predict.predict( tr_data_arr, tr_label_arr, pred_data_arr, pred_label_arr) lr_et = time.time() svm_st = time.time() svm_pred_data_len, svm_wrong, svm_accuracy = svm_predict.predict( tr_data_arr, tr_label_arr, pred_data_arr, pred_label_arr) svm_et = time.time() ada_st = time.time() ada_pred_data_len, ada_wrong, ada_accuracy = ada_predict.predict( tr_data_arr, tr_label_arr, pred_data_arr, pred_label_arr) ada_et = time.time() print '测试样本总数:', lr_pred_data_len print 'LR预测错误数:', lr_wrong print 'LR预测准确率:%s' % lr_accuracy, '%' print 'LR训练模型以及预测共耗时:%s秒' % (lr_et - lr_st) print '---------------------' print 'SVM预测错误数:', svm_wrong print 'SVM预测准确率:%s' % svm_accuracy, '%' print 'SVM训练模型以及预测共耗时:%s秒' % (svm_et - svm_st) print '---------------------' print 'AdaBoost预测错误数:', ada_wrong print 'AdaBoost预测准确率:%s' % ada_accuracy, '%' print 'AdaBoost训练模型以及预测共耗时:%s秒' % (ada_et - ada_st)
def get_cv_data_file():
    """Write the training samples to ``cv_data.smp`` for cross-validation.

    Each sample from ``./data_set/adult.data`` becomes one line of the form
    ``<label> 1:<f1> 2:<f2> ...`` with 1-based feature indices
    (NOTE(review): this looks like the libsvm sparse text format - confirm
    against the tool that consumes the file).
    """
    data_arr, label_arr = lds.load('./data_set/adult.data')
    with open('cv_data.smp', 'w+') as out_file:
        for row_no, sample in enumerate(data_arr):
            indexed_features = ' '.join(
                '%s:%s' % (col_no, value)
                for col_no, value in enumerate(sample, 1))
            out_file.write('%s %s\n' % (label_arr[row_no], indexed_features))
def test_adaboost_roc():
    """Compute the ROC and AUC of AdaBoost on the training set.

    Trains via ``ada_boost_train_ds`` and passes the aggregated class
    estimates to ``plotROC``; the curve/AUC work itself happens inside
    ``plotROC``, which is not visible from here.
    """
    from ada_boost.adaboost import ada_boost_train_ds, plotROC

    tr_data_arr, tr_label_arr = lds.load('./data_set/adult.data')
    sample_matrix = np.mat(tr_data_arr)
    # Labels equal to 1 are kept as they are; every other label becomes -1.
    signed_labels = [lbl if lbl == 1 else -1 for lbl in tr_label_arr]
    # The third argument (30) is presumably the number of boosting rounds -
    # TODO confirm against ada_boost_train_ds. The classifier list it
    # returns is not needed here.
    _, agg_class_est = ada_boost_train_ds(sample_matrix, signed_labels, 30)
    # plotROC is given the original, unconverted labels.
    plotROC(agg_class_est.T, tr_label_arr)
data_matrix.T * error就是目标函数 每一次迭代得出的weights都会使得目标函数增长 """ weights += alpha * np.dot(data_matrix.T, error) return weights def stoc_grad_ascent(data_matrix, class_labels, num_iter=150): """随机梯度上升""" data_matrix = np.array(data_matrix) class_labels = np.array(class_labels) m, n = data_matrix.shape """假如weights是作为参数传入的话,即可实现线上学习""" weights = np.ones(n) for j in xrange(num_iter): data_index = range(m) for i in xrange(m): alpha = 4 / (1.0 + j + i) + 0.01 rand_index = int(random.uniform(0, len(data_index))) h = sigmoid(sum(data_matrix[rand_index] * weights)) error = class_labels[rand_index] - h weights += alpha * error * data_matrix[rand_index] del(data_index[rand_index]) return weights if __name__ == '__main__': data_arr, label_mat = lds.load('./data_set/adult.data') w = grad_ascent(data_arr, label_mat) print('weights:', w)