Example #1
0
File: run.py Project: whatot/id3-py
def run_app(fTrainIn, fTestIn):
    """
    Build and apply ID3 decision trees for a training file and a test file.

    Each file is read completely.  Its first line names the attributes and
    every following line is one tab-separated record.  A separate tree is
    built from each file with ``dtree.create_decision_tree`` (using
    ``dtree.gain``), and each file's records are classified with the tree
    built from that same file.

    Returns a 5-tuple:
    (training tree, training classification, test classification,
     target values found in the training file,
     target values found in the test file).
    """
    # ---- test file -------------------------------------------------------
    test_lines = [raw.strip() for raw in fTestIn.readlines()]
    # The test header is split on single spaces only.
    # NOTE(review): the training header below also converts tabs to spaces
    # first; confirm whether this difference is intentional.
    test_header = test_lines[0].split(" ")
    # Drop the header line, keeping the records in their original order.
    test_records = test_lines[1:]

    test_attrs, _ = prepare_attributes(test_header)
    test_target = test_attrs[-1]   # the last attribute is the target

    testData = []
    for record in test_records:
        fields = [field.strip() for field in record.split("\t")]
        testData.append(dict(zip(test_attrs, fields)))

    # ---- training file ---------------------------------------------------
    train_lines = [raw.strip() for raw in fTrainIn.readlines()]
    train_header = train_lines[0].replace("\t", " ").split(" ")
    train_records = train_lines[1:]

    train_attrs, _ = prepare_attributes(train_header)
    train_target = train_attrs[-1]

    trainData = []
    for record in train_records:
        fields = [field.strip() for field in record.split("\t")]
        trainData.append(dict(zip(train_attrs, fields)))

    # ---- build and apply the trees ---------------------------------------
    trainingTree = dtree.create_decision_tree(trainData, train_attrs,
                                              train_target, dtree.gain)
    trainingClassify = dtree.classify(trainingTree, trainData)

    testTree = dtree.create_decision_tree(testData, test_attrs, test_target,
                                          dtree.gain)
    testClassify = dtree.classify(testTree, testData)

    # Also return the target values that were given in both files.
    givenTestClassify = [record[test_target] for record in testData]
    givenTrainClassify = [record[train_target] for record in trainData]

    return (trainingTree, trainingClassify, testClassify, givenTrainClassify,
            givenTestClassify)
Example #2
0
def leat_ai_raw(schema, train, test, s_option, nt, lift, z_beta):
    """Greedily keep the alpha-trees that increase coverage of the data.

    ``train`` and ``test`` are stacked into one data set.  For each alpha in
    a fixed list a tree is built on that data set and applied to it; the
    tree's predictions are kept only if adding them raises the fraction of
    rows whose accumulated prediction is positive.

    Parameters:
        schema: passed through to ``dt.create_decision_tree`` /
            ``dt.apply_rules``.
        train, test: arrays stacked row-wise with ``np.vstack``.
        s_option: not used by this function.
        nt: not used by this function; kept trees are numbered from 1 with
            a local counter regardless of the value passed in.
        lift: multiplier applied to ``dt.laplace_smoothing(data)`` to get
            the objective probability handed to the tree builder.
        z_beta: passed through to ``dt.create_decision_tree``.

    Returns:
        A list of ``[tree_number, coverage]`` pairs, one per kept tree.
    """
    data = np.vstack((train, test))
    obj_prob = lift * dt.laplace_smoothing(data)
    n_rows = len(data)

    alpha_list = [-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 1.0, 1.5, 1.75,
                  2.0, 2.25, 2.5, 2.75, 3.0]

    pred = np.zeros(n_rows)
    # Coverage of the accumulated predictions; all zeros means nothing is
    # covered yet.  Tracking it avoids recomputing it on every iteration.
    coverage = 0.0

    output = []
    tree_no = 1
    for alpha in alpha_list:
        tree = dt.create_decision_tree(data, schema, alpha, -1,
                                       True, obj_prob, z_beta)
        pred_added = pred + dt.apply_rules(data, schema, tree)
        cov_new = float(np.sum(pred_added > 0)) / n_rows

        if cov_new > coverage:
            pred = pred_added
            coverage = cov_new
            output.append([tree_no, cov_new])
            tree_no = tree_no + 1
            # Progress line.  As before, the number printed is the counter
            # after incrementing, i.e. one more than the number just
            # recorded in ``output``.  The %-format gives the same text on
            # Python 2 and Python 3.
            print("%s %s" % (tree_no, cov_new))

    return output
Example #3
0
def one_fold(schema, train, test, s_option, nt):
    """Compare a bagged base tree with its alpha-varied ensemble on one fold.

    Repeatedly samples ``train`` (via ``sampling`` with ``s_option``), builds
    a base tree with alpha 1.0, and collects alpha-varied trees in both
    directions with ``alpha_variation``.  Sampling repeats until the number
    of completed bags exceeds ``nt``.

    Returns ``(roc_a, roc_c, roc_a / roc_c, alpha_trees / nt)`` where
    ``roc_a`` is ``st.auc`` of the averaged ensemble predictions, ``roc_c``
    is ``st.auc`` of the averaged base-tree predictions, and ``alpha_trees``
    is the total number of alpha-varied trees over all bags.
    """
    ensemble_sum = np.zeros(len(test))
    base_sum = np.zeros(len(test))
    bags_done = 0
    alpha_trees = 0

    while True:
        # Draw one bag and fit the base tree on it.
        bag = sampling(train, s_option)
        base_tree = dt.create_decision_tree(bag, schema, 1.0, DEPTH)
        base_pred = dt.apply_rules(test, schema, base_tree)
        base_sum = base_sum + base_pred

        # Alpha variation below (False) and above (True) the base alpha.
        down_pred, n_down = alpha_variation(schema, bag, test,
                                            base_tree, False)
        up_pred, n_up = alpha_variation(schema, bag, test,
                                        base_tree, True)

        # Average over every tree grown for this bag (base tree included).
        trees_in_bag = n_down + n_up + 1.0
        ensemble_sum = ensemble_sum + ((down_pred + up_pred + base_pred) /
                                       trees_in_bag)

        alpha_trees = alpha_trees + n_down + n_up
        bags_done += 1
        if bags_done > nt:
            break

    mean_ensemble = ensemble_sum / bags_done
    mean_base = base_sum / bags_done

    labels = test[:, -1]
    roc_a = st.auc(mean_ensemble, labels)
    roc_c = st.auc(mean_base, labels)

    # NOTE(review): predictions are averaged over ``bags_done`` while the
    # alpha-tree count is divided by ``nt``; confirm this is intended.
    return roc_a, roc_c, (roc_a / roc_c), (float(alpha_trees) / nt)
Example #4
0
def _center_and_norm(values):
    """Return ``values`` minus its mean, and the Euclidean norm of the result."""
    centered = values - np.mean(values)
    return centered, np.sqrt(np.sum(centered * centered))


def correlation(schema, train, test):
    """Correlate resampled-tree predictions with a reference tree's.

    A reference tree is built on ``sampling(train, "None")`` and applied to
    ``test``.  Then, ten times, a tree is built on
    ``sampling(train, "Normal")`` and the correlation between its centred
    predictions and the reference tree's centred predictions is recorded.
    The same is done for the predictions returned by ``alpha_variation`` in
    each direction, whenever that direction produced at least one tree.

    Returns:
        ``(corr_beat, corr_c45)``: the list of correlations for the
        alpha-varied predictions and the list for the plain resampled trees.
    """
    # Reference predictions every other tree is compared against.
    base_tree = dt.create_decision_tree(sampling(train, "None"),
                                        schema, 1.0, DEPTH)
    base_pred, base_cov = _center_and_norm(
        dt.apply_rules(test, schema, base_tree))

    corr_c45 = []
    corr_beat = []

    for _ in range(10):
        newdata = sampling(train, "Normal")
        tree = dt.create_decision_tree(newdata, schema, 1.0, DEPTH)
        pred_c, cov_c = _center_and_norm(dt.apply_rules(test, schema, tree))
        corr_c45.append(np.dot(base_pred, pred_c) / base_cov / cov_c)

        # Alpha variation: downward (False) first, then upward (True).
        pred_down, alpha_cnt_down = alpha_variation(schema, newdata, test,
                                                    tree, False)
        pred_up, alpha_cnt_up = alpha_variation(schema, newdata, test,
                                                tree, True)

        for alpha_pred, alpha_cnt in ((pred_down, alpha_cnt_down),
                                      (pred_up, alpha_cnt_up)):
            if alpha_cnt > 0:
                pred_a, cov_a = _center_and_norm(alpha_pred)
                corr_beat.append(
                    np.dot(base_pred, pred_a) / base_cov / cov_a)

        # Neither direction produced a tree: record the plain resampled
        # tree's correlation so each iteration contributes to corr_beat.
        if alpha_cnt_up == 0 and alpha_cnt_down == 0:
            corr_beat.append(np.dot(base_pred, pred_c / base_cov / cov_c))

    return corr_beat, corr_c45
Example #5
0
def leat(schema, train, test, s_option, nt, lift, z_beta):
    """Measure coverage of bagged base trees versus base plus alpha trees.

    ``train`` and ``test`` are stacked into one data set.  Each round samples
    that data set (``sampling`` with ``s_option``), builds a base tree with
    alpha 1.0 and one further tree per entry of a fixed alpha list, and
    applies all of them to the full data set.  Rounds repeat until the
    number of completed rounds exceeds ``nt``.

    Returns ``(cov_c45, cov_leat)``: the fraction of rows with a positive
    accumulated prediction using the base trees only, and using the base
    trees together with the alpha trees.
    """
    data = np.vstack((train, test))
    obj_prob = lift * dt.laplace_smoothing(data)

    with_alpha_sum = np.zeros(len(data))
    base_only_sum = np.zeros(len(data))

    alphas = [-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 1.5, 1.75, 2.0,
              2.25, 2.5, 2.75, 3.0]

    rounds_done = 0
    while True:
        # Draw one bag for this round.
        bag = sampling(data, s_option)

        # The base tree counts towards both accumulators.
        base_tree = dt.create_decision_tree(bag, schema, 1.0, -1,
                                            True, obj_prob, z_beta)
        base_pred = dt.apply_rules(data, schema, base_tree)
        base_only_sum = base_only_sum + base_pred
        with_alpha_sum = with_alpha_sum + base_pred

        # Alpha trees count towards the combined accumulator only.
        for alpha in alphas:
            alpha_tree = dt.create_decision_tree(bag, schema, alpha, -1,
                                                 True, obj_prob, z_beta)
            with_alpha_sum = with_alpha_sum + dt.apply_rules(data, schema,
                                                             alpha_tree)

        rounds_done += 1
        if rounds_done > nt:
            break

    cov_c45 = float(np.sum(base_only_sum > 0)) / len(data)
    cov_leat = float(np.sum(with_alpha_sum > 0)) / len(data)

    return cov_c45, cov_leat
Example #6
0
def alpha_variation(schema, train, test, base_tree, direction):
    """Sum the predictions of successive alpha-varied trees in one direction.

    Starting from alpha 1.0 and ``base_tree``, ``select_alpha`` is asked for
    the next alpha (``direction`` is forwarded as its ``up`` flag).  For
    every alpha other than 1.0 a tree is built on ``train`` and its
    predictions on ``test`` are added to the running total; that tree and
    alpha then become the starting point for the next request.  The walk
    ends when ``select_alpha`` returns 1.0.

    Returns ``(summed predictions, number of trees built)``.
    """
    total = np.zeros(len(test))
    trees_built = 0
    current_tree = base_tree

    alpha = select_alpha(current_tree, train, schema, 1.0, direction)
    while alpha != 1.0:
        current_tree = dt.create_decision_tree(train, schema, alpha, DEPTH)
        total = total + dt.apply_rules(test, schema, current_tree)
        trees_built += 1
        alpha = select_alpha(current_tree, train, schema, alpha, direction)

    return total, trees_built
Example #7
0
def create_forest(data, attributes, target_attr, num_trees):
    """Build ``num_trees`` decision trees on random row and attribute subsets.

    For every tree, 30% of ``len(data)`` rows are drawn from ``data`` with
    replacement, and a random selection of the non-target attributes is
    drawn: all-but-one of ``attributes`` in number when there are fewer than
    ten attributes, otherwise ``int(sqrt(len(attributes)))``.  The target
    attribute is always appended to the selection.

    Parameters:
        data: one-dimensional sequence of records.
        attributes: attribute names, including ``target_attr``.
        target_attr: name of the attribute to predict.
        num_trees: number of trees to build.

    Returns:
        A list with the ``num_trees`` trees from ``create_decision_tree``.
    """
    # The sample size used to reference a name ``n`` that is not defined in
    # this function; derive it from the data actually passed in.
    sample_size = int(float(len(data)) * 0.3)

    # Loop-invariant: candidate attributes and how many to pick per tree.
    candidates = [a for a in attributes if a != target_attr]
    if len(attributes) < 10:
        att_size = len(attributes) - 1
    else:
        att_size = int(sqrt(len(attributes)))

    forest = []
    for _ in range(num_trees):
        # Randomly select 30% of the data with replacement.
        train_data = numpy.random.choice(data, size=sample_size,
                                         replace=True)
        # NOTE(review): ``replace`` is left at its default here, so the same
        # attribute can be picked more than once; confirm this is intended.
        train_attr = list(numpy.random.choice(candidates, size=att_size))
        train_attr.append(target_attr)
        forest.append(create_decision_tree(train_data, train_attr,
                                           target_attr))
    return forest
    def __init__(self, file):
        """Read comma-separated training data and build the decision tree.

        The first line of ``file`` lists the attribute names; the last name
        is the target attribute.  Every following line is one record.  The
        resulting tree is stored in ``self.tree``.

        Parameters:
            file: path of the training data file.

        Raises:
            IndexError: if the file contains no lines.
        """
        # Read all lines up front.  The context manager closes the file even
        # if reading fails; the handle was previously never closed.
        with open(file, "r") as fin:
            lines = [line.strip() for line in fin.readlines()]

        # The header line holds the attribute names; the rest are records.
        attributes = [attr.strip() for attr in lines[0].split(",")]
        target_attr = attributes[-1]

        # Build one dict per record, mapping attribute name -> value.
        data = [
            dict(zip(attributes, [datum.strip() for datum in line.split(",")]))
            for line in lines[1:]
        ]

        # Build the decision tree from the training data.
        self.tree = dtree.create_decision_tree(data, attributes, target_attr, id3.gain)
Example #9
0
def select_alpha(tree, data, schema, alpha, up):
    """Select the next alpha that generates a different tree.

    Starting from ``alpha``, steps by 0.5 (upwards when ``up`` is true,
    downwards otherwise) and builds a depth-1 tree for each candidate with
    ``dt.create_decision_tree``.  The search stops at the first candidate
    whose tree differs from ``tree``:

    * the two trees have different types, or
    * both are dicts and their first keys differ, or
    * neither is a dict and they compare unequal.

    Returns:
        The selected alpha, or 1.0 if the candidate leaves the range
        [-1.0, 3.0] before a different tree is found.
    """
    step = 0.5 if up else -0.5
    new_alpha = alpha

    while True:
        new_alpha = new_alpha + step
        if new_alpha < -1.0 or new_alpha > 3.0:
            # Ran out of range without finding a different tree.
            return 1.0

        new_tree = dt.create_decision_tree(data, schema, new_alpha, 1)
        if type(tree) != type(new_tree):
            return new_alpha
        if isinstance(tree, dict):
            # Compare the first key of each tree.  ``next(iter(...))`` works
            # on Python 2 and 3, whereas ``keys()[0]`` fails on Python 3
            # because dict views cannot be indexed.
            if next(iter(tree)) != next(iter(new_tree)):
                return new_alpha
        elif tree != new_tree:
            return new_alpha
Example #10
0
File: run.py Project: whatot/id3-py
# -*- coding:utf-8 -*-

import sys
import dtree

"""
调用ID3算法的主函数是run_app。

运行(支持python2.7, 3.x)
python2 run.py train.dat test.dat
python3 run.py train.dat test.dat
或者使用ori数据集
python2 run.py train-ori.dat test-ori.dat
python3 run.py train-ori.dat test-ori.dat


数据格式

dtree.create_decision_tree(examples, attributes, target_attribute,
                           heuristic_funtion)
接受如下输入:

examples (训练or测试数据集) : list of dicts (python字典)
attributes : list
target_attribute: string
heuristic_funtion:  指向"dtree.gain"函数的函数指针

数据集文件最后一列为最终决定属性
"""

Example #11
0
# Build one dict per test line, mapping each attribute name to the
# corresponding stripped comma-separated field.
for line in testlines:    
    test.append(dict(zip(attributes,
                [datum.strip() for datum in line.split(",")])))

print 'Calling create_decision_tree()' 
#print 'Values : ', c.values
#print 'Data : ',c.data[:100]
#print 'Attributes : ',attributes
#print 'Target-Attr : ',target_attr
print 'Number of Entries in Training Set after Cleanup', len(data)
print 'Number of Entries in Test Set after Cleanup', len(test)

# Ask how many training rows to use and where to start; both values are
# used as slice bounds below.
# NOTE(review): this script uses Python 2 print statements, and under
# Python 2 input() evaluates the typed text as a Python expression.
d = input('No. of Training Instances to be taken: ')
s1 = input('Start from Index: ')

# Build the tree from the selected training slice data[s1:s1+d].
tree = dtree.create_decision_tree(c.values, data[s1:s1+d], attributes, target_attr, id3.gain, None)

print '-----------------Decision Tree Created------------------'
h = preprocess.Helper()

h.print_tree(tree, "")
# Ask for the test slice the same way, then classify test[s2:s2+t].
t = input('No. of Test Instances to be taken: ')
s2 = input('Start from Index: ')
classification = dtree.classify(tree, test[s2:s2+t])
#print classification
#print test[s2:s2+t]
# Counters initialised for the loop over `classification` that follows.
correct = 0
i = 0
for item in classification:
    #print item
    if test[i][target_attr] == (item+'.'):
Example #12
0
import sys
from dtree import create_decision_tree, classify_decision_tree
from util import print_tree, vote, get_data, std_dev

if __name__ == "__main__":
    # The first command-line argument is the path of the data file.
    filename = sys.argv[1]

    data, attributes, target_attr = get_data(filename)
    n = len(data)

    # 5-fold cross-validation: each fold in turn is the validation set and
    # the remaining records are the training set.
    accs = []
    for i in range(5):
        fold_start = int(float(n) / 5 * i)
        fold_end = int(float(n) / 5 * (i + 1))
        valid_data = data[fold_start:fold_end]  # validation data

        # Split by position rather than by membership: a membership test is
        # quadratic and also drops every training record that merely equals
        # some validation record.
        train_data = list(data[:fold_start]) + list(data[fold_end:])

        tree = create_decision_tree(train_data, attributes, target_attr)
        labels = [d[target_attr] for d in valid_data]
        classification = classify_decision_tree(tree, valid_data, vote(labels))

        # Fraction of validation records whose prediction equals the label.
        count = sum(1 for x, y in zip(classification, labels) if x == y)
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")
    print("standard deviation: " + str(std_dev(accs)))