def shuffleReviews(input_file, output_file):
    '''
    Read reviews from input_file, shuffle them in place, and serialize the
    shuffled list to output_file.

    input_file  -- path to an XML file of reviews
    output_file -- path the shuffled XML is written to
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if not reviewList:
        # Bug fix: the original printed the warning but fell through, so a
        # None result crashed in random.shuffle(). Bail out instead of
        # shuffling/serializing an empty or missing review list.
        print("No reviews in input file")
        return
    random.shuffle(reviewList)
    Review.serializeToXML(reviewList, output_file)
def countLabeledReviews(file):
    '''Print how many reviews in *file* carry a non-blank polarity label.'''
    labeled_total = sum(
        1 for review in Review.readReviewsFromXML(file)
        if review.getReviewPolarity().strip() != ''
    )
    print(labeled_total)
def seperateByRating(input_file, output_dir):
    '''
    Split reviews into buckets by star rating and write one XML file per
    bucket under output_dir. 5.0-star reviews are auto-labeled positive
    ('1'), 1.0- and 2.0-star reviews negative ('-1'), each with confidence
    '1'; every other rating lands in the 'medium' bucket unlabeled.
    '''
    # rating (as string) -> (polarity to assign, bucket key)
    rating_to_bucket = {
        '5.0': ('1', 'high'),
        '1.0': ('-1', 'low1'),
        '2.0': ('-1', 'low2'),
    }
    buckets = {'high': [], 'low1': [], 'low2': [], 'medium': []}
    for review in Review.readReviewsFromXML(input_file):
        entry = rating_to_bucket.get(str(review.getReviewRating()))
        if entry is None:
            buckets['medium'].append(review)
        else:
            polarity, key = entry
            review.setPolarity(polarity)
            review.setConfidence('1')
            buckets[key].append(review)
    Review.serializeToXML(buckets['high'], output_dir + "/high.xml")
    Review.serializeToXML(buckets['low1'], output_dir + "/low1.xml")
    Review.serializeToXML(buckets['low2'], output_dir + "/low2.xml")
    Review.serializeToXML(buckets['medium'], output_dir + "/medium.xml")
    print("5: " + str(len(buckets['high'])))
    print("1: " + str(len(buckets['low1'])))
    print("2: " + str(len(buckets['low2'])))
def separateLabeledAndUnlabeled(file, output_dir):
    '''
    Partition the reviews in *file* into labeled (non-blank polarity) and
    unlabeled sets, serializing each to its own XML file under output_dir.
    '''
    labeled, unlabeled = [], []
    for review in Review.readReviewsFromXML(file):
        # A non-empty polarity after stripping whitespace marks a labeled review.
        target = labeled if review.getReviewPolarity().strip() else unlabeled
        target.append(review)
    Review.serializeToXML(labeled, output_dir + "/labeled-neu.xml")
    Review.serializeToXML(unlabeled, output_dir + "/unlabeled-neu.xml")
def siftReviewsByPolarity(input_file, output_file, polarity):
    '''
    Write to output_file every review from input_file EXCEPT those whose
    polarity equals *polarity* (compared as strings).

    input_file  -- path to an XML file of reviews
    output_file -- path the filtered XML is written to
    polarity    -- polarity label to drop (e.g. '1', '-1', '0')
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if not reviewList:
        # Bug fix: the original printed the warning but fell through and
        # crashed iterating over None. Return early instead.
        print("No reviews in input file")
        return
    outList = [review for review in reviewList
               if str(review.getReviewPolarity()) != str(polarity)]
    Review.serializeToXML(outList, output_file)
def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
    '''
    Apply polarity and confidence from a Weka CSV results file to the
    reviews read from xml_test_file, then serialize the labeled reviews
    to output_file. The i-th CSV row is assumed to correspond to the
    i-th review; a count mismatch aborts with a message.
    '''
    reviewList = Review.readReviewsFromXML(xml_test_file)
    # Fix: use a context manager so the results file is always closed
    # (the original opened it and never closed the handle).
    with open(weka_csv_results_file, "r") as results_file:
        resultsList = results_file.readlines()
    if len(reviewList) != len(resultsList):
        print('Different number of reviews and results')
        return
    for review, line in zip(reviewList, resultsList):
        result = line.strip().split(',')
        # Column 2 appears to hold "index:label"; keep the label part.
        # NOTE(review): column layout assumed from this parsing code only —
        # confirm against the actual Weka prediction-CSV format.
        review.setPolarity(Util.getNumericLabel(result[2].split(':')[1]))
        # Column 4 presumably is the confidence; a reported '1' is capped
        # at '0.9', any other value is passed through unchanged.
        review.setConfidence('0.9' if result[4] == '1' else result[4])
    print('writing labelled test data to ' + output_file)
    Review.serializeToXML(reviewList, output_file)
from Review import Review

if __name__ == '__main__':
    # Round-trip smoke test: parse the low-rating reviews file and
    # immediately re-serialize it, so ../test.xml can be diffed against
    # the input to check the XML read/write cycle.
    Review.serializeToXML(
        Review.readReviewsFromXML('../low-rating-reviews.xml'),
        '../test.xml')
def printCount(file):
    '''Print the number of reviews contained in *file*.'''
    print(str(len(Review.readReviewsFromXML(file))))
return trainingData = {} validationData = {} self.generateKFolds(outdir, trainingData, validationData) for i in range(1,self.k+1): print "generating features for fold " + str(i) trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging) '''this dictionary will be used for both training and validation data''' dictionary = Dictionary(trainCorpus) generator = FeatureGenerator(trainCorpus, dictionary, outdir + '/train' + str(i) + '.csv', weightScheme, includeRating, includeDocLength) generator.generateFeatures() validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging); generator = FeatureGenerator(validCorpus, dictionary, outdir + '/valid' + str(i) + '.csv', weightScheme, includeRating, includeDocLength) generator.generateFeatures() if __name__ == '__main__': reviews = Review.readReviewsFromXML("../old-training-shuffled.xml") lemmatizer = nltk.WordNetLemmatizer() print 'reviews: ' + str(len(reviews)) kfg = KFoldGenerator(reviews, 10) kfg.generateFolds("../kfolds/linearSVM/unigrams-lemma-POS-tf-no-stop", lemmatizer, POS_tagging = True, weightScheme = FeatureWeight.TF, includeRating=False, includeDocLength=False)
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    lemmatizer = nltk.WordNetLemmatizer()
    train_reviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    test_reviews = Review.readReviewsFromXML("../old-test-data.xml")

    train_corpus = Corpus(train_reviews, lemmatizer, POS_tagging=True)
    # A single dictionary built from the training corpus is reused for the
    # test corpus so both CSVs share the same feature columns.
    shared_dictionary = Dictionary(train_corpus)
    train_generator = FeatureGenerator(train_corpus, shared_dictionary,
                                       '../train.csv',
                                       weightScheme=FeatureWeight.TFIDF)
    train_generator.generateFeatures()

    test_corpus = Corpus(test_reviews, lemmatizer, POS_tagging=True)
    test_generator = FeatureGenerator(test_corpus, shared_dictionary,
                                      '../test.csv',
                                      weightScheme=FeatureWeight.TFIDF)
    test_generator.generateFeatures()
#output files unlabeled_file='../test-data.xml' labeled_file='../traning-data.xml' #lists for labeled and unlabeled reviews unlabeled=[] labeled=[] labeled_high=[] labeled_low=[] labeled_mid=[] for each_file in review_files: #call the readReviewsFromXML reviews = Review.readReviewsFromXML(each_file) for each_review in reviews: #convert reviewId into int, which help in sorting before saving in disk. each_review.reviewId=int(each_review.getReviewId()) #check and append if polarity is empty if (each_review.getReviewPolarity() == ""): unlabeled.append(each_review) elif (each_review.getReviewPolarity() == "-1"): labeled_low.append(each_review) elif(each_review.getReviewPolarity() == "0"): labeled_mid.append(each_review)