def main():
    """Train and evaluate a two-class word-probability classifier.

    Loads the hockey/baseball newsgroup files into a ``Dataset`` (cutoff=2000),
    splits them with ``getTrainAndTestSets(0.8, seed=100)``, estimates the
    class probabilities and per-word feature probabilities for labels -1 and 1
    from the training split, and prints the train and test error rates.
    """
    data = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    Xtrain, Ytrain, Xtest, Ytest = data.getTrainAndTestSets(0.8, seed=100)

    # Class probabilities for the two labels, estimated from the training labels.
    probNeg = getClassProb(Ytrain, -1)
    probPos = getClassProb(Ytrain, 1)

    vocabSize = len(data.getWordList())

    def featureProbs(label):
        # One probability per word index for the given label, as a numpy array.
        perWord = [getFeatureProb(Xtrain, Ytrain, label, idx)
                   for idx in range(vocabSize)]
        return np.asarray(perWord)

    wordProbsNeg = featureProbs(-1)
    wordProbsPos = featureProbs(1)

    # Score both splits with the same fitted parameters.
    trainError = computeError(Xtrain, Ytrain, probNeg, probPos,
                              wordProbsNeg, wordProbsPos)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest, Ytest, probNeg, probPos,
                             wordProbsNeg, wordProbsPos)
    print('Test error rate is ' + str(testError))
def main():
    """Greedy forward feature selection for a ridge classifier.

    Loads the hockey/baseball newsgroup files into a ``Dataset`` (cutoff=200)
    and splits them with ``getTrainAndTestSets(0.8, seed=100)``. In each of at
    most 40 rounds, every not-yet-selected column is tried as an addition to
    the current column set; the column giving the lowest training error is
    kept, provided it strictly lowers the current error. Selection stops as
    soon as no candidate improves the error.

    Afterwards a final model is trained on the selected columns, train and
    test error rates are printed, and the (up to) 10 selected words with the
    largest error reduction at the time they were added are printed.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100          # regularisation value handed to trainRidge
    maxFeatures = 40   # upper bound on the number of selected columns
    topCount = 10      # how many of the selected words to report

    cols = []          # selected column indices, in selection order
    selected = set()   # same indices, for O(1) membership tests
    currentError = 1
    n = Xtrain.shape[1]
    gains = {}         # column index -> error reduction when it was added

    for _ in range(maxFeatures):
        bestJ = None
        bestErrorRate = 1
        for j in range(n):
            # Skip columns that are already part of the model: re-adding one
            # would only duplicate it in the design matrix and could cause the
            # same feature to be "selected" twice.
            if j in selected:
                continue
            trial = cols + [j]
            w = trainRidge(Xtrain[:, trial], Ytrain, lam)
            errorRate = computeError(Xtrain[:, trial], Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate

        # Stop when no remaining candidate strictly improves the error
        # (this also covers the case where no candidate was found at all).
        if bestJ is None or bestErrorRate >= currentError:
            break

        cols.append(bestJ)
        selected.add(bestJ)
        gains[bestJ] = currentError - bestErrorRate
        currentError = bestErrorRate
        print('Current error rate is ' + str(currentError))

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Rank selected columns by error reduction (ties broken by larger index)
    # and report the corresponding words.
    wordList = d.getWordList()
    ranked = sorted(gains.items(),
                    key=lambda item: (item[1], item[0]),
                    reverse=True)
    topFeatures = [wordList[index] for (index, _gain) in ranked[:topCount]]
    for f in topFeatures:
        print(f)
def main():
    """Single-pass greedy feature selection for a ridge classifier.

    Loads the hockey/baseball newsgroup files into a ``Dataset`` (cutoff=1000)
    and splits them with ``getTrainAndTestSets(0.8, seed=100)``. Columns are
    visited once, in index order; a column is kept only if adding it to the
    columns kept so far strictly lowers the training error. A final model is
    then trained on the kept columns, train and test error rates are printed,
    and the (up to) 10 kept words with the largest error reduction are printed.
    """
    data = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    Xtrain, Ytrain, Xtest, Ytest = data.getTrainAndTestSets(0.8, seed=100)

    ridgeLambda = 100
    kept = []             # column indices accepted so far
    errorSoFar = 1
    gainByColumn = {}     # column index -> error reduction when accepted

    for column in range(Xtrain.shape[1]):
        # Tentatively include the column and measure the training error.
        kept.append(column)
        weights = trainRidge(Xtrain[:, kept], Ytrain, ridgeLambda)
        trialError = computeError(Xtrain[:, kept], Ytrain, weights)

        if trialError >= errorSoFar:
            # No strict improvement: discard the column again.
            kept.pop()
            continue

        gainByColumn[column] = errorSoFar - trialError
        errorSoFar = trialError
        # Progress output, only for accepted columns whose index is a
        # multiple of 10.
        if column % 10 == 0:
            print(errorSoFar)

    weights = trainRidge(Xtrain[:, kept], Ytrain, ridgeLambda)
    trainError = computeError(Xtrain[:, kept], Ytrain, weights)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, kept], Ytest, weights)
    print('Test error rate is ' + str(testError))

    # Rank accepted columns by error reduction (ties broken by larger index)
    # and print the words for the first 10.
    vocabulary = data.getWordList()
    ranking = sorted(gainByColumn.items(),
                     key=lambda pair: (pair[1], pair[0]),
                     reverse=True)
    bestWords = [vocabulary[idx] for (idx, _gain) in ranking[:10]]
    for word in bestWords:
        print(word)