Example #1
def fullConcWeight(weight, numSent):
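    # Evaluate conclusionWeight (defined elsewhere in this module) over the whole
    # corpus: a truthy result counts as a positive prediction, so positive reviews
    # should pass and negative reviews should fail. Returns overall accuracy.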
    goodWords, badWords = utils.getUniqueGoodandBadWords()
    
#    posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
#    negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')
    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]

    correct = 0
    
    for review in posReviews:
        if conclusionWeight(review, weight, goodWords, badWords, numSent):
            correct += 1
            
    for review in negReviews:
        if not conclusionWeight(review, weight, goodWords, badWords, numSent):
            correct += 1
            
    accuracy = correct / (len(posReviews) + len(negReviews))

    return accuracy
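A minimal driver sketch (hypothetical, not from the source): sweep a few candidate (weight, numSent) pairs through fullConcWeight and keep the best-scoring combination.

def sweepConcWeights(weights=(1.0, 1.5, 2.0, 3.0), numSents=(3, 5)):
    # Try every (weight, numSent) pair and report the most accurate one.
    acc, w, n = max((fullConcWeight(w, n), w, n)
                    for w in weights for n in numSents)
    print("best accuracy {:.4f} at weight={}, numSent={}".format(acc, w, n))
    return w, n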
Example #2
def pipeline_test(num):
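    # Monte Carlo evaluation: repeatedly shuffle the corpus into a random 80/20
    # train/test split, fit a bag-of-words -> tf-idf -> classifier pipeline,
    # and report the mean test accuracy over num runs.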
    #posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
    #negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')
    posReviews = posReviews[:950]
    negReviews = negReviews[:950]
    
    
    import random
    import numpy as np
    # Imports needed by the pipeline below; presumably module-level in the
    # original file (nltkstopwords is assumed to be nltk.corpus.stopwords).
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from nltk.corpus import stopwords as nltkstopwords
    #random.seed()

    accuracy = 0.0    
    percenttrain = 0.8
    posTrainCount = int(percenttrain*len(posReviews))
    negTrainCount = int(percenttrain*len(negReviews))
    
    count = 0
    for test in range(num): 
        count += 1
        random.shuffle(posReviews)
        random.shuffle(negReviews)                

        train_tups = [(r, 1) for r in posReviews[:posTrainCount]] + [(r, 0) for r in negReviews[:negTrainCount]]
        random.shuffle(train_tups)
        train_data = [tup[0] for tup in train_tups]
        Y_train = np.array([tup[1] for tup in train_tups])
        
        test_tups = [(r, 1) for r in posReviews[posTrainCount:]] + [(r, 0) for r in negReviews[negTrainCount:]]
        random.shuffle(test_tups)
        test_data = [tup[0] for tup in test_tups]
        actual = np.array([tup[1] for tup in test_tups])
    
    
        #pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
        #pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))),('tfidf', TfidfTransformer()),('clf', BernoulliNB())])
        pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))),('tfidf', TfidfTransformer()),('clf', LogisticRegression())])
        #pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))),('tfidf', TfidfTransformer()),('clf', SVC())])
        
        pipe = pipe.fit(train_data, Y_train)
        prediction = pipe.predict(test_data)
        
        # Count test items where the prediction matches the true label.
        correct = int((prediction == actual).sum())
        acc = correct / len(actual)
        accuracy += acc
        print("{:.3f} {:.3f}".format(acc, accuracy / count))
    
    
    print("final accuracy: " + "{:.4f}".format(accuracy / num))
Example #3
def fullPosNegTest():
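    # Evaluate the posminusneg heuristic (defined elsewhere in this module) on
    # the full corpus, printing a running accuracy after every review.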
    goodWords, badWords = utils.getUniqueGoodandBadWords()
    
#    posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
#    negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')
    
    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]

    correct = 0
    count = 0    
    s = ""
    for review in posReviews:
        count += 1
        if posminusneg(review, goodWords, badWords):
            correct += 1
            s = "correct!"
        else:
            s = "wrong :("
        #if count % 10 == 0:
        print("{}  {:.2f}%  {}".format(s, correct / count * 100, count))
            
    print("halfway there!")
    for review in negReviews:
        count += 1
        if not posminusneg(review, goodWords, badWords):
            correct += 1
            s = "correct!"
        else:
            s = "wrong :("
        #if count % 10 == 0:
        print("{}  {:.2f}%  {}".format(s, correct / count * 100, count))
            
    return correct / count
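# posminusneg is defined elsewhere in the repository; a plausible minimal sketch
# of such a word-count heuristic (an assumption, not the author's actual code):
def posminusneg_sketch(review, goodWords, badWords):
    tokens = review.lower().split()
    # Predict positive when good-word hits are at least as frequent as bad-word hits.
    return sum(t in goodWords for t in tokens) >= sum(t in badWords for t in tokens)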
def getSuperGoodBadAvg(iterations, topNum):
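    # Average the accuracy of getSuperGoodBad over several iterations, pairing
    # each review with its part-of-speech list and keeping only the tags in
    # posList (adjectives, nouns, adverbs in Penn Treebank notation).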

    posList = ['JJ','NN','RB']    
    inclusion = True
    
#    posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
#    negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
#    posposList = utils.loadPosList('dataset/txt_sentoken/negposlist.txt', posList, inclusion)
#    negposList = utils.loadPosList('dataset/txt_sentoken/posposlist.txt', posList, inclusion)

    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')

    posposList = utils.loadPosList('dataset/ebert_reviews/pos4-0.txt', posList, inclusion)
    posposList += utils.loadPosList('dataset/ebert_reviews/pos3-5.txt', posList, inclusion)
    negposList = utils.loadPosList('dataset/ebert_reviews/pos0-0.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos0-5.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos1-0.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos1-5.txt', posList, inclusion)
    
    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]
    posposList = posposList[:numberOfReviews]
    negposList = negposList[:numberOfReviews]
    
    posTuples = list(zip(posReviews, posposList))
    negTuples = list(zip(negReviews, negposList))    
    
    dataSetGoodWords, dataSetBadWords = utils.getUniqueGoodandBadWords()
    
    totalacc = 0.0
    for i in range(iterations):
        totalacc += getSuperGoodBad(topNum, posTuples, negTuples, dataSetGoodWords, dataSetBadWords)
        print("accuracy : " + "{:.4f}".format(totalacc / (i + 1)))
    # Return the mean accuracy across all iterations.
    return totalacc / iterations