Python Dataset.getWordList Examples

Programming Language: Python

Class/Type: Dataset

Method/Function: getWordList

Examples at hotexamples.com: 3

Python Dataset.getWordList - 3 examples found. These are the top rated real world Python examples of Dataset.getWordList from package faiss extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

getTrainAndTestSets(4)

getWordList(3)

AKTDataset(1)

Example #1

Show file

File: NaiveBayes.py Project: myliu/document-classification

def main():
    """Fit per-class word probabilities and report train/test error rates.

    Builds a ``Dataset`` from the hockey and baseball text files, splits it
    with ``getTrainAndTestSets(0.8, seed=100)``, computes a class probability
    for each of the two labels (-1 and 1) and one feature probability per
    word-list entry for each label, then prints the error rate returned by
    ``computeError`` on the training and the test split.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    # NOTE(review): 0.8 is presumably the training fraction -- confirm
    # against Dataset.getTrainAndTestSets.
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    # Class probabilities for the two labels used throughout this script.
    pC1 = getClassProb(Ytrain, -1)
    pC2 = getClassProb(Ytrain, 1)

    # One feature probability per word index, computed separately per label.
    numWords = len(d.getWordList())
    aw1 = np.asarray(
        [getFeatureProb(Xtrain, Ytrain, -1, wordIndex) for wordIndex in range(numWords)]
    )
    aw2 = np.asarray(
        [getFeatureProb(Xtrain, Ytrain, 1, wordIndex) for wordIndex in range(numWords)]
    )

    # Single-argument print(...) produces the same output on Python 2 and 3.
    trainError = computeError(Xtrain, Ytrain, pC1, pC2, aw1, aw2)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest, Ytest, pC1, pC2, aw1, aw2)
    print('Test error rate is ' + str(testError))

Example #2

Show file

File: Stepwise.py Project: myliu/document-classification

def main():
    """Greedily select feature columns one at a time, then report results.

    For up to 40 rounds, every column index is tentatively appended to the
    selected set, a model is trained with ``trainRidge`` on those columns and
    scored with ``computeError`` on the training data. The column giving the
    lowest error is kept if it improves on the current error; otherwise the
    search stops. Afterwards the train/test error rates for the final column
    set are printed, followed by the words of the (at most) ten selected
    columns with the largest recorded error reduction.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    # NOTE(review): 0.8 is presumably the training fraction -- confirm
    # against Dataset.getTrainAndTestSets.
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100  # passed to trainRidge; presumably the regularization strength
    cols = []  # column indices selected so far, in selection order
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}  # selected column index -> error reduction when it was added

    # Each round adds at most one feature to cols.
    for _ in range(40):
        bestJ = 0
        bestErrorRate = 1
        for j in range(n):
            # Tentatively add column j, score it, then remove it again.
            cols.append(j)
            w = trainRidge(Xtrain[:, cols], Ytrain, lam)
            errorRate = computeError(Xtrain[:, cols], Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate
            cols.pop()

        # Stop as soon as the best candidate no longer lowers the error.
        if bestErrorRate >= currentError:
            break

        cols.append(bestJ)
        dic[bestJ] = currentError - bestErrorRate
        currentError = bestErrorRate
        print('Current error rate is ' + str(currentError))

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Find the top 10 features: largest error reduction first, ties broken
    # by the larger column index (same ordering as sorting on (value, key)).
    wordList = d.getWordList()
    topCols = sorted(dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:10]
    for index, _ in topCols:
        print(wordList[index])

Example #3

Show file

File: Streamwise.py Project: myliu/document-classification

def main():
    """Select feature columns in a single pass, then report results.

    Visits each column index once, in order: the column is tentatively added
    to the selected set, a model is trained with ``trainRidge`` and scored
    with ``computeError`` on the training data, and the column is kept only
    if the error drops strictly below the current error. Afterwards the
    train/test error rates for the final column set are printed, followed by
    the words of the (at most) ten kept columns with the largest recorded
    error reduction.
    """
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    # NOTE(review): 0.8 is presumably the training fraction -- confirm
    # against Dataset.getTrainAndTestSets.
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100  # passed to trainRidge; presumably the regularization strength
    cols = []  # column indices kept so far, in visiting order
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}  # kept column index -> error reduction when it was added

    for j in range(n):
        cols.append(j)
        w = trainRidge(Xtrain[:, cols], Ytrain, lam)
        errorRate = computeError(Xtrain[:, cols], Ytrain, w)
        if errorRate >= currentError:
            # No improvement: discard the column that was just tried.
            cols.pop()
        else:
            dic[j] = currentError - errorRate
            currentError = errorRate
            # Print currentError once in a while (only when an accepted
            # column index is a multiple of 10).
            if j % 10 == 0:
                print(currentError)

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print('Train error rate is ' + str(trainError))
    testError = computeError(Xtest[:, cols], Ytest, w)
    print('Test error rate is ' + str(testError))

    # Find the top 10 features: largest error reduction first, ties broken
    # by the larger column index (same ordering as sorting on (value, key)).
    wordList = d.getWordList()
    topCols = sorted(dic.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[:10]
    for index, _ in topCols:
        print(wordList[index])