def fullConcWeight(weight, numSent):
    """Evaluate the conclusion-weighting classifier over the Ebert reviews and return its accuracy."""
    goodWords, badWords = utils.getUniqueGoodandBadWords()
    # posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
    # negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')

    # Keep the two classes balanced at 950 reviews each.
    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]

    # A review counts as correct when conclusionWeight agrees with its known label.
    correct = 0
    for review in posReviews:
        if conclusionWeight(review, weight, goodWords, badWords, numSent):
            correct += 1
    for review in negReviews:
        if not conclusionWeight(review, weight, goodWords, badWords, numSent):
            correct += 1

    accuracy = correct / (len(posReviews) + len(negReviews))
    return accuracy
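
# Usage sketch (hypothetical helper, not part of the original file): sweep a few
# candidate weight values through fullConcWeight to see which conclusion weighting
# scores best. The specific weight and numSent values are illustrative assumptions,
# and the dataset/ebert_reviews/ folders are assumed to exist locally.
def sweepConcWeight(weights=(1.0, 1.5, 2.0, 3.0), numSent=3):
    best = (0.0, None)
    for w in weights:
        acc = fullConcWeight(w, numSent)
        print("weight {:.1f} numSent {} -> accuracy {:.4f}".format(w, numSent, acc))
        if acc > best[0]:
            best = (acc, w)
    return best
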
def pipeline_test(num):
    """Run num random 80/20 train/test splits through a bag-of-words sklearn pipeline."""
    # posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
    # negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')
    posReviews = posReviews[:950]
    negReviews = negReviews[:950]

    import random
    import numpy as np
    # Pipeline imports (may already be available at module level in the full file).
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from nltk.corpus import stopwords as nltkstopwords
    # random.seed()

    accuracy = 0.0
    percenttrain = 0.8
    posTrainCount = int(percenttrain * len(posReviews))
    negTrainCount = int(percenttrain * len(negReviews))
    count = 0
    for test in range(num):
        count += 1
        # Reshuffle and resplit the data for every run.
        random.shuffle(posReviews)
        random.shuffle(negReviews)
        train_tups = [(r, 1) for r in posReviews[:posTrainCount]] + [(r, 0) for r in negReviews[:negTrainCount]]
        random.shuffle(train_tups)
        train_data = [tup[0] for tup in train_tups]
        Y_train = np.array([tup[1] for tup in train_tups])
        test_tups = [(r, 1) for r in posReviews[posTrainCount:]] + [(r, 0) for r in negReviews[negTrainCount:]]
        random.shuffle(test_tups)
        test_data = [tup[0] for tup in test_tups]
        actual = np.array([tup[1] for tup in test_tups])

        # Alternative classifiers (import MultinomialNB, BernoulliNB, or SVC if re-enabled):
        # pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
        # pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))), ('tfidf', TfidfTransformer()), ('clf', BernoulliNB())])
        pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])
        # pipe = Pipeline([('vect', CountVectorizer(stop_words=nltkstopwords.words('english'))), ('tfidf', TfidfTransformer()), ('clf', SVC())])
        pipe = pipe.fit(train_data, Y_train)
        prediction = pipe.predict(test_data)

        correct = 0
        for i in range(len(actual)):
            if actual[i] == prediction[i]:
                correct += 1
        acc = correct / len(actual)
        accuracy += acc
        # Per-run accuracy followed by the running average.
        print("{:.3f} {:.3f}".format(acc, accuracy / count))

    print("final accuracy: " + "{:.4f}".format(accuracy / num))
def fullPosNegTest():
    """Baseline: label each review with posminusneg (good-word vs. bad-word counts) and return accuracy."""
    goodWords, badWords = utils.getUniqueGoodandBadWords()
    # posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
    # negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')

    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]

    correct = 0
    count = 0
    s = ""
    for review in posReviews:
        count += 1
        if posminusneg(review, goodWords, badWords):
            correct += 1
            s = "correct!"
        else:
            s = "wrong :("
        # if count % 10 == 0:
        print(s + " {:.2f}% ".format(correct / count * 100) + str(count))

    print("halfway there!")

    for review in negReviews:
        count += 1
        if not posminusneg(review, goodWords, badWords):
            correct += 1
            s = "correct!"
        else:
            s = "wrong :("
        # if count % 10 == 0:
        print(s + " {:.2f}% ".format(correct / count * 100) + str(count))

    return correct / count
def getSuperGoodBadAvg(iterations, topNum):
    """Average getSuperGoodBad accuracy over several iterations, using POS-tagged review lists."""
    # Penn Treebank tags to filter on: adjectives (JJ), nouns (NN), adverbs (RB).
    posList = ['JJ', 'NN', 'RB']
    inclusion = True
    # posReviews = utils.loadAllTextFiles('dataset/txt_sentoken/pos/')
    # negReviews = utils.loadAllTextFiles('dataset/txt_sentoken/neg/')
    # posposList = utils.loadPosList('dataset/txt_sentoken/negposlist.txt', posList, inclusion)
    # negposList = utils.loadPosList('dataset/txt_sentoken/posposlist.txt', posList, inclusion)
    posReviews = utils.loadAllTextFiles('dataset/ebert_reviews/4-0/')
    posReviews += utils.loadAllTextFiles('dataset/ebert_reviews/3-5/')
    negReviews = utils.loadAllTextFiles('dataset/ebert_reviews/0-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/0-5/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-0/')
    negReviews += utils.loadAllTextFiles('dataset/ebert_reviews/1-5/')
    posposList = utils.loadPosList('dataset/ebert_reviews/pos4-0.txt', posList, inclusion)
    posposList += utils.loadPosList('dataset/ebert_reviews/pos3-5.txt', posList, inclusion)
    negposList = utils.loadPosList('dataset/ebert_reviews/pos0-0.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos0-5.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos1-0.txt', posList, inclusion)
    negposList += utils.loadPosList('dataset/ebert_reviews/pos1-5.txt', posList, inclusion)

    numberOfReviews = 950
    posReviews = posReviews[:numberOfReviews]
    negReviews = negReviews[:numberOfReviews]
    posposList = posposList[:numberOfReviews]
    negposList = negposList[:numberOfReviews]

    # Pair each review with its part-of-speech list.
    posTuples = list(zip(posReviews, posposList))
    negTuples = list(zip(negReviews, negposList))

    dataSetGoodWords, dataSetBadWords = utils.getUniqueGoodandBadWords()

    totalacc = 0.0
    for i in range(iterations):
        totalacc += getSuperGoodBad(topNum, posTuples, negTuples, dataSetGoodWords, dataSetBadWords)
        # Running average across the iterations completed so far.
        print("accuracy : " + "{:.4f}".format(totalacc / (i + 1)))
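
# Usage sketch (hypothetical driver, not part of the original file): run each of the
# evaluation entry points above. Assumes utils, conclusionWeight, posminusneg, and
# getSuperGoodBad are defined or imported elsewhere in this module, and that the
# dataset/ebert_reviews/ folders exist locally. The argument values are illustrative.
if __name__ == "__main__":
    print("pos-minus-neg baseline: {:.4f}".format(fullPosNegTest()))
    print("conclusion weighting:   {:.4f}".format(fullConcWeight(2.0, 3)))
    pipeline_test(5)             # sklearn bag-of-words pipeline, 5 random splits
    getSuperGoodBadAvg(5, 100)   # averaged "super good/bad" word-list accuracy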