def main():
    """Fit the class/feature probability model and print its error rates.

    Loads the hockey and baseball newsgroup files, asks the dataset for a
    train/test split (0.8 with a fixed seed), estimates the class
    probability and the per-word feature probabilities for the labels -1
    and 1 from the training split, and prints the error rate on both splits.
    The helpers (Dataset, getClassProb, getFeatureProb, computeError) are
    defined elsewhere; this looks like a Naive Bayes setup -- confirm there.
    """
    dataset = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    Xtrain, Ytrain, Xtest, Ytest = dataset.getTrainAndTestSets(0.8, seed=100)

    # Class probabilities for the two labels used throughout: -1 and 1.
    priorNeg = getClassProb(Ytrain, -1)
    priorPos = getClassProb(Ytrain, 1)

    # One feature probability per word index, computed per class.
    vocabSize = len(dataset.getWordList())
    featProbNeg = np.asarray(
        [getFeatureProb(Xtrain, Ytrain, -1, idx) for idx in range(vocabSize)])
    featProbPos = np.asarray(
        [getFeatureProb(Xtrain, Ytrain, 1, idx) for idx in range(vocabSize)])

    trainError = computeError(Xtrain, Ytrain, priorNeg, priorPos, featProbNeg, featProbPos)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest, Ytest, priorNeg, priorPos, featProbNeg, featProbPos)
    print('Test error rate is ' + str(testError))
# Example no. 2
# 0
def main():
    """Greedy forward feature selection with ridge regression.

    Starting from no features, each round tries every column that has not
    been selected yet, trains a ridge model on the selected columns plus
    that candidate, and keeps the candidate with the lowest training error.
    Selection stops after 40 rounds or as soon as the best candidate no
    longer lowers the training error. The train and test error rates for
    the final column set are printed, followed by the (up to) ten selected
    words whose addition reduced the training error the most.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100
    maxFeatures = 40
    cols = []
    currentError = 1
    n = Xtrain.shape[1]
    # Maps a selected column index to the error reduction it brought.
    dic = {}

    for _ in range(maxFeatures):
        bestJ = None
        bestErrorRate = 1
        selected = set(cols)
        for j in range(n):
            # A column that is already selected must not be a candidate
            # again: picking it twice would duplicate the column in the
            # design matrix and overwrite its recorded gain in dic.
            if j in selected:
                continue
            trial = cols + [j]
            w = trainRidge(Xtrain[:, trial], Ytrain, lam)
            errorRate = computeError(Xtrain[:, trial], Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate

        # Also covers "no candidate left / none scored below 1":
        # bestErrorRate is then still 1, which is never below currentError.
        if bestErrorRate >= currentError:
            break

        cols.append(bestJ)
        dic[bestJ] = currentError - bestErrorRate
        currentError = bestErrorRate
        print('Current error rate is ' + str(currentError))

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Top 10 features: largest error reduction first, ties broken by the
    # larger column index (same ordering as sorting on (value, key) reversed).
    wordList = d.getWordList()
    topCols = sorted(dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:10]
    topFeatures = [wordList[index] for (index, value) in topCols]
    for f in topFeatures:
        print(f)
def main():
    """Train a perceptron on the newsgroup data and print its error rates.

    The weight vector w starts at zero and is updated with the perceptron
    rule (w += learningRate * y * x on every misclassified training row).
    After each full pass the average of w over all rows of that pass is
    stored in wAvg; training stops once makeError reports no error for wAvg
    or after numTrial passes. Finally the train and test error rates are
    printed.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)
    w = np.asmatrix([0] * Xtrain.shape[1])

    learningRate = 1

    # numTrial is the maximum number of passes before giving up (in case
    # training does not converge); k counts the passes done so far.
    numTrial = 5
    k = 0

    # wSum accumulates w over the rows of one pass; wAvg is its average.
    wAvg = w
    while makeError(Xtrain, Ytrain, wAvg):

        if k >= numTrial:
            print("No perfect hyperplane found!")
            print("Stop after " + str(numTrial) + " iterations.")
            break
        k += 1

        # Fresh float accumulator for every pass. The accumulator must never
        # alias w: an in-place "wSum += w" on an alias doubles w itself. A
        # float accumulator also keeps the average from being truncated when
        # w is still integer-typed.
        wSum = np.asmatrix(np.zeros(w.shape))
        for i in range(Xtrain.shape[0]):
            expected = -1
            xtrain = np.asmatrix(Xtrain[i]).T
            if w * xtrain > 0:
                expected = 1
            if expected != Ytrain[i]:
                w = w + learningRate * Ytrain[i] * Xtrain[i]
            wSum = wSum + w
        wAvg = wSum / Xtrain.shape[0]

    # NOTE(review): the stopping test above uses wAvg, but the reported
    # error rates use the last w -- confirm which one is intended.
    trainError = computeError(Xtrain, Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest, Ytest, w)
    print('Test error rate is ' + str(testError))
def main():
    """Single-pass greedy feature selection with ridge regression.

    Walks over the columns once, in index order. A column is kept only if
    a ridge model trained on the kept columns plus that column has a
    strictly lower training error than the current best. Prints the train
    and test error rates for the kept columns, followed by the (up to) ten
    kept words whose addition reduced the training error the most.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    Xtrain, Ytrain, Xtest, Ytest = d.getTrainAndTestSets(0.8, seed=100)

    ridgeLambda = 100
    chosen = []
    bestError = 1
    numFeatures = Xtrain.shape[1]
    # Maps a kept column index to the error reduction it brought.
    gains = {}

    for featureIdx in range(numFeatures):
        candidate = chosen + [featureIdx]
        weights = trainRidge(Xtrain[:, candidate], Ytrain, ridgeLambda)
        candidateError = computeError(Xtrain[:, candidate], Ytrain, weights)
        if candidateError >= bestError:
            # No improvement: leave the kept set unchanged.
            continue

        chosen = candidate
        gains[featureIdx] = bestError - candidateError
        bestError = candidateError
        # Progress output, only for accepted columns whose index is a
        # multiple of 10.
        if featureIdx % 10 == 0:
            print(bestError)

    weights = trainRidge(Xtrain[:, chosen], Ytrain, ridgeLambda)
    trainError = computeError(Xtrain[:, chosen], Ytrain, weights)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, chosen], Ytest, weights)
    print('Test error rate is ' + str(testError))

    # Top 10 features: largest error reduction first, ties broken by the
    # larger column index.
    wordList = d.getWordList()
    ranked = sorted(gains.items(), key=lambda item: (item[1], item[0]), reverse=True)
    topWords = [wordList[idx] for idx, _gain in ranked[:10]]
    for word in topWords:
        print(word)