Ejemplo n.º 1
0
def cross_vad(examples, num_folds = 10):
    data = ut.dataCrossSplit(examples, num_folds, False)
    errorRates = []
    for i in range(num_folds):
        egs = data[i]
        dt = DTree(SelectAtt)
        dt.training(egs[0], 1)
        # calculate error rate
        error = [0.] * 2
        for j in range(2):
            for x in egs[j]:
                if dt.predict(x) != x[0]:
                    error[j] = error[j] + 1
            error[j] = error[j] / len(egs[j])
        print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1]
        errorRates.append(error)
    arr = np.array(errorRates)
    print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1])
    print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1]))
Ejemplo n.º 2
0
def getWeakLearner(w, egs):
    """ 
    given: 
        a set of examples egs
        a distribution on the examples w
    return:
        a decision stump y that minimize
            J = \sum w_n I(y(x_n) \neq t_n)
    """
    record = 0
    error = 3171713.0
    Nfeatures = len(egs[0]) - 1
    for i in range(Nfeatures):
        dt = DTree(lambda examples: i + 1)
        dt.training(egs, 1)
        tmp = weightedError(w, egs, dt)
        if tmp < error:
            error = tmp
            record = dt
    return record
Ejemplo n.º 3
0
def cross_vad(examples, num_folds = 10):
    data = ut.dataCrossSplit(examples, num_folds, False)
    errorRates = []
    for i in range(num_folds):
        egs = data[i]
        dt = DTree(SelectAtt)
        dt.training(egs[0], 1)
        # calculate error rate
        error = [0.] * 2
        for j in range(2):
            for x in egs[j]:
                if dt.predict(x) != x[0]:
                    error[j] = error[j] + 1
            error[j] = error[j] / len(egs[j])
        print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1]
        errorRates.append(error)
    arr = np.array(errorRates)
    print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1])
    print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1]))

if __name__ == '__main__':
    filename = sys.argv[1]
    egs = ut.importRawData(filename)
    egs = ut.preprocess(egs)
    cross_vad(egs)
    dt = DTree(SelectAtt)
    dt.training(egs)
    print "========= the decision tree ============"
    dt.printTree()