def main():
    """Fit per-class word probabilities and print train/test error rates.

    Loads the hockey/baseball data set, estimates a class probability and
    one feature probability per word for each of the two labels (-1 and 1),
    then evaluates the resulting classifier on both splits.
    """
    dataset = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    # presumably 0.8 is the training fraction -- confirm against Dataset
    Xtrain, Ytrain, Xtest, Ytest = dataset.getTrainAndTestSets(0.8, seed=100)

    # Class probabilities for the two labels.
    pC1 = getClassProb(Ytrain, -1)
    pC2 = getClassProb(Ytrain, 1)

    numWords = len(dataset.getWordList())

    def featureProbs(label):
        # One probability per word index for the given label, as an array.
        probs = [getFeatureProb(Xtrain, Ytrain, label, idx) for idx in range(numWords)]
        return np.asarray(probs)

    aw1 = featureProbs(-1)
    aw2 = featureProbs(1)

    trainError = computeError(Xtrain, Ytrain, pC1, pC2, aw1, aw2)
    print('Train error rate is ' + str(trainError))

    testError = computeError(Xtest, Ytest, pC1, pC2, aw1, aw2)
    print('Test error rate is ' + str(testError))
def main():
    """Forward stepwise feature selection with ridge regression.

    Runs at most 40 selection rounds.  In each round every column that has
    not been selected yet is tentatively appended to the selected columns,
    a ridge model is trained on that subset, and the column giving the
    lowest training error is kept.  Selection stops early as soon as no
    candidate strictly lowers the current training error.

    Afterwards the model is retrained on the selected columns, train and
    test error rates are printed, and the (up to) ten selected words with
    the largest recorded error reduction are printed, one per line.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    Xtrain, Ytrain, Xtest, Ytest = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100
    maxFeatures = 40
    cols = []            # selected column indices, in selection order
    selected = set()     # same indices, for O(1) membership tests
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}             # column index -> error reduction when it was added

    for _ in range(maxFeatures):
        bestJ = None
        bestErrorRate = 1
        for j in range(n):
            # Only consider columns that are not part of the model yet;
            # re-adding a column would put duplicates in cols and overwrite
            # its recorded contribution in dic.
            if j in selected:
                continue
            trial = cols + [j]
            w = trainRidge(Xtrain[:, trial], Ytrain, lam)
            errorRate = computeError(Xtrain[:, trial], Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate

        # Stop when no remaining column improves on the current error.
        if bestJ is None or bestErrorRate >= currentError:
            break

        cols.append(bestJ)
        selected.add(bestJ)
        dic[bestJ] = currentError - bestErrorRate
        currentError = bestErrorRate
        print('Current error rate is ' + str(currentError))

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Find the top 10 features: largest error reduction first, ties broken
    # by the larger column index (same ordering as sorting (value, key)
    # pairs in reverse).
    wordList = d.getWordList()
    topCols = sorted(dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:10]
    topFeatures = [wordList[index] for (index, value) in topCols]
    for f in topFeatures:
        print(f)
def main():
    """Train a perceptron with per-round weight averaging and print error rates.

    Each round visits every training row once.  A row is predicted as 1 when
    ``w * x > 0`` and as -1 otherwise; on a mismatch with the label the
    weights are moved by ``learningRate * label * row``.  The weight vector
    after every row is summed, and the round's average ``wAvg`` is used for
    the stopping test ``makeError(Xtrain, Ytrain, wAvg)``.  Training stops
    when that test is falsy or after ``numTrial`` rounds.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    Xtrain, Ytrain, Xtest, Ytest = d.getTrainAndTestSets(0.8, seed=100)

    w = np.asmatrix([0] * Xtrain.shape[1])
    learningRate = 1

    # numTrial is the total number of rounds we are willing to run before
    # stopping (in case training does not converge).
    # k keeps track of how many rounds have been run so far.
    numTrial = 5
    k = 0

    # wAvg is the average of w over the most recent round.
    wAvg = w
    while makeError(Xtrain, Ytrain, wAvg):
        if k >= numTrial:
            print("No perfect hyperplane found!")
            print("Stop after " + str(numTrial) + " iterations.")
            break
        k += 1

        # wSum is the sum of w over the current round.  It is a separate
        # float matrix so that accumulating into it can never alias (and
        # thereby modify) w, and never hits an int/float in-place cast.
        wSum = np.asmatrix(np.zeros(w.shape))
        for i in range(Xtrain.shape[0]):
            xtrain = np.asmatrix(Xtrain[i]).T
            expected = 1 if w * xtrain > 0 else -1
            if expected != Ytrain[i]:
                w = w + learningRate * Ytrain[i] * Xtrain[i]
            wSum = wSum + w

        # float() forces true division even for integer data on Python 2.
        wAvg = wSum / float(Xtrain.shape[0])

    # NOTE(review): the stopping test uses wAvg but the reported errors use
    # the last w, as in the original code -- confirm which one is intended.
    trainError = computeError(Xtrain, Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest, Ytest, w)
    print('Test error rate is ' + str(testError))
def main():
    """Single-pass greedy feature selection with ridge regression.

    Walks the columns once in index order.  Each column is tentatively added
    to the kept set and a ridge model is trained on it; the column stays
    only if the training error drops strictly below the best error so far.
    The best error is printed every tenth column.  Finally the model is
    retrained on the kept columns, train and test error rates are printed,
    and the (up to) ten kept words with the largest recorded error reduction
    are printed, one per line.
    """
    dataset = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    Xtrain, Ytrain, Xtest, Ytest = dataset.getTrainAndTestSets(0.8, seed=100)

    ridgePenalty = 100
    kept = []          # column indices retained so far
    gain = {}          # column index -> error reduction when it was kept
    lowestError = 1
    numColumns = Xtrain.shape[1]

    for col in range(numColumns):
        kept.append(col)
        weights = trainRidge(Xtrain[:, kept], Ytrain, ridgePenalty)
        candidateError = computeError(Xtrain[:, kept], Ytrain, weights)
        if candidateError >= lowestError:
            # No improvement: drop the column that was just tried.
            kept.pop()
        else:
            gain[col] = lowestError - candidateError
            lowestError = candidateError
        # Progress output every tenth column.
        if col % 10 == 0:
            print(lowestError)

    weights = trainRidge(Xtrain[:, kept], Ytrain, ridgePenalty)
    trainError = computeError(Xtrain[:, kept], Ytrain, weights)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, kept], Ytest, weights)
    print('Test error rate is ' + str(testError))

    # Top 10 features: largest error reduction first, ties broken by the
    # larger column index.
    wordList = dataset.getWordList()
    ranked = sorted(gain.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    topFeatures = [wordList[index] for index, _ in ranked[:10]]
    for word in topFeatures:
        print(word)