def run_app(fTrainIn, fTestIn):
    """ Runs the algorithm on the data """
    linesInTest = [line.strip() for line in fTestIn.readlines()]
    attributes = linesInTest[0].split(" ")
    # reverse, pop() the attribute header line (pop removes from the end), then restore the order
    linesInTest.reverse()
    linesInTest.pop()
    linesInTest.reverse()
    attrList, attrDict = prepare_attributes(attributes)
    targetAttribute = attrList[-1]
    # prepare test data
    testData = []
    for line in linesInTest:
        testData.append(dict(zip(attrList, [datum.strip() for datum in line.split("\t")])))

    linesInTrain = [lineTrain.strip() for lineTrain in fTrainIn.readlines()]
    attributesTrain = linesInTrain[0].replace("\t", " ").split(" ")
    # once we have the attributes, remove the header line the same way
    linesInTrain.reverse()
    linesInTrain.pop()  # pops from the end of the list, hence the two reverses
    linesInTrain.reverse()
    attrListTrain, attrDictTrain = prepare_attributes(attributesTrain)
    targetAttrTrain = attrListTrain[-1]
    # prepare training data
    trainData = []
    for lineTrain in linesInTrain:
        trainData.append(dict(zip(attrListTrain, [datum.strip() for datum in lineTrain.split("\t")])))

    trainingTree = dtree.create_decision_tree(trainData, attrListTrain, targetAttrTrain, dtree.gain)
    trainingClassify = dtree.classify(trainingTree, trainData)
    testTree = dtree.create_decision_tree(testData, attrList, targetAttribute, dtree.gain)
    testClassify = dtree.classify(testTree, testData)
    # also return the labels given in both files
    givenTestClassify = [row[targetAttribute] for row in testData]
    givenTrainClassify = [row[targetAttrTrain] for row in trainData]
    return (trainingTree, trainingClassify, testClassify, givenTrainClassify, givenTestClassify)
import numpy as np

# dt (decision-tree helpers), st (stats), sampling, select_alpha, alpha_variation
# and the DEPTH constant are assumed to be defined in the surrounding module.

def leat_ai_raw(schema, train, test, s_option, nt, lift, z_beta):
    data = np.vstack((train, test))
    base_prob = dt.laplace_smoothing(data)
    obj_prob = lift * base_prob
    pred = np.zeros(len(data))
    alpha_list = [-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 1.0, 1.5, 1.75,
                  2.0, 2.25, 2.5, 2.75, 3.0]
    # grow one tree per alpha and keep its predictions only if they improve coverage
    output = []
    nt = 1  # tree counter (overrides the nt argument)
    for alpha in alpha_list:
        tree = dt.create_decision_tree(data, schema, alpha, -1, True, obj_prob, z_beta)
        pred_new = dt.apply_rules(data, schema, tree)
        pred_added = pred + pred_new
        cov_new = float(np.sum(pred_added > 0)) / len(data)
        cov_orig = float(np.sum(pred > 0)) / len(data)
        if cov_new > cov_orig:
            pred = pred_added
        output.append([nt, cov_new])
        nt = nt + 1
        print(nt, cov_new)
    return output
def one_fold(schema, train, test, s_option, nt):
    pred_a = np.zeros(len(test))
    pred_c = np.zeros(len(test))
    bag_cnt = 0
    alpha_idx = 0
    while True:
        # bagging
        newdata = sampling(train, s_option)
        # base tree
        tree = dt.create_decision_tree(newdata, schema, 1.0, DEPTH)
        pred = dt.apply_rules(test, schema, tree)
        pred_c = pred_c + pred
        # alpha variation
        pred_down, alpha_cnt_down = alpha_variation(schema, newdata, test, tree, False)
        pred_up, alpha_cnt_up = alpha_variation(schema, newdata, test, tree, True)
        pred_a = pred_a + ((pred_down + pred_up + pred) / (alpha_cnt_down + alpha_cnt_up + 1.0))
        alpha_idx = alpha_idx + alpha_cnt_down + alpha_cnt_up
        bag_cnt = bag_cnt + 1
        if bag_cnt > nt:
            break
    pred_a = pred_a / bag_cnt
    pred_c = pred_c / bag_cnt
    label = test[:, -1]
    roc_a = st.auc(pred_a, label)
    roc_c = st.auc(pred_c, label)
    return roc_a, roc_c, (roc_a / roc_c), (float(alpha_idx) / nt)
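# st.auc is not shown in this module. The sketch below is one plausible
# implementation, assuming it computes the ROC AUC of real-valued scores
# against binary labels via the rank-statistic (Mann-Whitney U) formulation;
# the function body and the labels > 0 convention are assumptions, not the
# actual st module.

def auc(scores, labels):
    """Rank-based ROC AUC: P(score of a random positive > random negative)."""
    order = np.argsort(scores)
    ranks = np.empty(len(scores))
    ranks[order] = np.arange(1, len(scores) + 1)  # 1-based ranks (ties ignored)
    pos = labels > 0
    n_pos = np.sum(pos)
    n_neg = len(labels) - n_pos
    return (np.sum(ranks[pos]) - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)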
def correlation(schema, train, test):
    corr_c45 = []
    corr_beat = []
    # base tree on an unsampled copy of the training data
    base_data = sampling(train, "None")
    tree = dt.create_decision_tree(base_data, schema, 1.0, DEPTH)
    base_pred = dt.apply_rules(test, schema, tree)
    base_pred = base_pred - np.mean(base_pred)
    base_cov = np.sqrt(np.sum(base_pred * base_pred))
    for ii in range(10):
        # bagged C4.5-style tree
        newdata = sampling(train, "Normal")
        tree = dt.create_decision_tree(newdata, schema, 1.0, DEPTH)
        pred = dt.apply_rules(test, schema, tree)
        pred_c = pred - np.mean(pred)
        cov_c = np.sqrt(np.sum(pred_c * pred_c))
        corr_c45.append(np.dot(base_pred, pred_c) / base_cov / cov_c)
        # alpha variation
        pred_down, alpha_cnt_down = alpha_variation(schema, newdata, test, tree, False)
        pred_up, alpha_cnt_up = alpha_variation(schema, newdata, test, tree, True)
        if alpha_cnt_down > 0:
            pred_a = pred_down - np.mean(pred_down)
            cov_a = np.sqrt(np.sum(pred_a * pred_a))
            corr_beat.append(np.dot(base_pred, pred_a) / base_cov / cov_a)
        if alpha_cnt_up > 0:
            pred_a = pred_up - np.mean(pred_up)
            cov_a = np.sqrt(np.sum(pred_a * pred_a))
            corr_beat.append(np.dot(base_pred, pred_a) / base_cov / cov_a)
        if alpha_cnt_up == 0 and alpha_cnt_down == 0:
            # no alpha-varied tree was found; fall back to the bagged tree's correlation
            corr_beat.append(np.dot(base_pred, pred_c) / base_cov / cov_c)
    return corr_beat, corr_c45
def leat(schema, train, test, s_option, nt, lift, z_beta):
    data = np.vstack((train, test))
    base_prob = dt.laplace_smoothing(data)
    obj_prob = lift * base_prob
    pred_a = np.zeros(len(data))
    pred_c = np.zeros(len(data))
    bag_cnt = 0
    # alpha = 1.0 is covered by the base tree, so it is left out of the sweep
    alpha_list = [-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 1.5, 1.75,
                  2.0, 2.25, 2.5, 2.75, 3.0]
    while True:
        # bagging
        newdata = sampling(data, s_option)
        # base tree contributes to both ensembles
        tree = dt.create_decision_tree(newdata, schema, 1.0, -1, True, obj_prob, z_beta)
        pred = dt.apply_rules(data, schema, tree)
        pred_c = pred_c + pred
        pred_a = pred_a + pred
        # alpha-varied trees contribute to the LEAT ensemble only
        for alpha in alpha_list:
            tree = dt.create_decision_tree(newdata, schema, alpha, -1, True, obj_prob, z_beta)
            pred = dt.apply_rules(data, schema, tree)
            pred_a = pred_a + pred
        bag_cnt = bag_cnt + 1
        if bag_cnt > nt:
            break
    cov_c45 = float(np.sum(pred_c > 0)) / len(data)
    cov_leat = float(np.sum(pred_a > 0)) / len(data)
    return cov_c45, cov_leat
def alpha_variation(schema, train, test, base_tree, direction):
    alpha_cnt = 0
    pred = np.zeros(len(test))
    alpha = 1.0
    alpha_tree = base_tree
    while True:
        # walk alpha up or down until the induced tree changes
        alpha = select_alpha(alpha_tree, train, schema, alpha, direction)
        if alpha != 1.0:
            alpha_tree = dt.create_decision_tree(train, schema, alpha, DEPTH)
            pred = pred + dt.apply_rules(test, schema, alpha_tree)
            alpha_cnt = alpha_cnt + 1
        else:
            # select_alpha returns 1.0 when it runs out of the [-1, 3] range
            break
    return pred, alpha_cnt
import numpy
from math import sqrt

def create_forest(data, attributes, target_attr, num_trees):
    forest = []
    for i in range(num_trees):
        # randomly select 30% of the data, with replacement
        train_data = numpy.random.choice(data, size=int(float(len(data)) * 0.3), replace=True)
        # use sqrt(|attributes|) features once there are at least 10 of them
        att_size = len(attributes) - 1 if len(attributes) < 10 else int(sqrt(len(attributes)))
        train_attr = list(numpy.random.choice(
            [a for a in attributes if not a == target_attr], size=att_size))
        train_attr.append(target_attr)
        tree = create_decision_tree(train_data, train_attr, target_attr)
        forest.append(tree)
    return forest
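# create_forest only builds the trees; classifying with the forest is not shown.
# A minimal sketch, assuming classify_decision_tree(tree, data, default) returns
# one predicted label per instance (as in the cross-validation driver below).
# classify_forest and its majority-vote body are hypothetical additions.
from collections import Counter

def classify_forest(forest, data, default):
    # one prediction list per tree, then a per-instance majority vote
    per_tree = [classify_decision_tree(tree, data, default) for tree in forest]
    return [Counter(votes).most_common(1)[0][0] for votes in zip(*per_tree)]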
def __init__(self, file):
    """Open the file with the training data."""
    fin = open(file, "r")

    # prepare the training data:
    # list of all lines from the training-data file
    lines = [line.strip() for line in fin.readlines()]
    # remove the attribute line from the list of lines and keep it as the attribute list
    lines.reverse()
    attributes = [attr.strip() for attr in lines.pop().split(",")]
    target_attr = attributes[-1]
    lines.reverse()
    # build the data dictionaries
    data = []
    for line in lines:
        data.append(dict(zip(attributes, [datum.strip() for datum in line.split(",")])))

    # build the decision tree from the training data
    self.tree = dtree.create_decision_tree(data, attributes, target_attr, id3.gain)
def select_alpha(tree, data, schema, alpha, up):
    """Select an alpha that generates a different tree."""
    new_alpha = alpha
    while True:
        if up:
            new_alpha = new_alpha + 0.5
        else:
            new_alpha = new_alpha - 0.5
        if new_alpha < -1.0 or new_alpha > 3.0:
            # out of range: signal "no new tree" by returning the base alpha
            new_alpha = 1.0
            break
        new_tree = dt.create_decision_tree(data, schema, new_alpha, 1)
        if type(tree) != type(new_tree):
            break
        elif type(tree) == dict:
            # compare root split attributes; list() keeps this working on Python 3
            old_key = list(tree.keys())[0]
            new_key = list(new_tree.keys())[0]
            if old_key != new_key:
                break
        elif tree != new_tree:
            break
    return new_alpha
# -*- coding:utf-8 -*-
import sys

import dtree

"""
The main entry point for the ID3 algorithm is run_app.

Usage (both Python 2.7 and 3.x are supported):
    python2 run.py train.dat test.dat
    python3 run.py train.dat test.dat
or, using the "ori" data set:
    python2 run.py train-ori.dat test-ori.dat
    python3 run.py train-ori.dat test-ori.dat

Data format:
    dtree.create_decision_tree(examples, attributes, target_attribute, heuristic_funtion)
takes the following inputs:
    examples (training or test data set): list of dicts (Python dictionaries)
    attributes: list
    target_attribute: string
    heuristic_funtion: a function reference pointing to dtree.gain

The last column of a data file is the target (decision) attribute.
"""
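# A minimal sketch of the call described above. The attribute names and rows
# are made-up placeholders; only dtree.create_decision_tree and dtree.gain come
# from this module.
attributes = ["outlook", "windy", "play"]  # last column is the target
examples = [
    {"outlook": "sunny",    "windy": "false", "play": "no"},
    {"outlook": "rainy",    "windy": "true",  "play": "no"},
    {"outlook": "overcast", "windy": "false", "play": "yes"},
]
tree = dtree.create_decision_tree(examples, attributes, "play", dtree.gain)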
for line in testlines:
    test.append(dict(zip(attributes, [datum.strip() for datum in line.split(",")])))

print('Calling create_decision_tree()')
#print('Values : ', c.values)
#print('Data : ', c.data[:100])
#print('Attributes : ', attributes)
#print('Target-Attr : ', target_attr)
print('Number of Entries in Training Set after Cleanup', len(data))
print('Number of Entries in Test Set after Cleanup', len(test))

d = int(input('No. of Training Instances to be taken: '))
s1 = int(input('Start from Index: '))
tree = dtree.create_decision_tree(c.values, data[s1:s1 + d], attributes, target_attr, id3.gain, None)
print('-----------------Decision Tree Created------------------')
h = preprocess.Helper()
h.print_tree(tree, "")

t = int(input('No. of Test Instances to be taken: '))
s2 = int(input('Start from Index: '))
classification = dtree.classify(tree, test[s2:s2 + t])
#print(classification)
#print(test[s2:s2 + t])

correct = 0
i = 0
for item in classification:
    #print(item)
    # labels in this data set carry a trailing period
    if test[i][target_attr] == (item + '.'):
import sys

from dtree import create_decision_tree, classify_decision_tree
from util import print_tree, vote, get_data, std_dev

if __name__ == "__main__":
    filename = sys.argv[1]
    data, attributes, target_attr = get_data(filename)
    n = len(data)
    accs = []
    for i in range(5):
        # 5-fold cross-validation: hold out the i-th fifth for validation
        valid_data = data[int(float(n) / 5 * i):int(float(n) / 5 * (i + 1))]
        train_data = [d for d in data if d not in valid_data]
        tree = create_decision_tree(train_data, attributes, target_attr)
        labels = [d[target_attr] for d in valid_data]
        classification = classify_decision_tree(tree, valid_data, vote(labels))
        count = 0
        for x, y in zip(classification, labels):
            if x == y:
                count += 1
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))
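# util is not shown; below are minimal sketches of the two helpers this script
# relies on, assuming vote() returns the majority label (used as the
# classifier's default) and std_dev() is the population standard deviation.
# Both bodies are assumptions, not the actual util module.
from collections import Counter
from math import sqrt

def vote(labels):
    # most common label wins
    return Counter(labels).most_common(1)[0][0]

def std_dev(values):
    mean = sum(values) / len(values)
    return sqrt(sum((v - mean) ** 2 for v in values) / len(values))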