Beispiel #1
0
 def shuffleReviews(input_file, output_file):
     '''
     Read the reviews from input_file, shuffle them in place with
     random.shuffle and serialize the shuffled list to output_file.

     If no reviews could be read (None or an empty list), a message is
     printed and nothing is written.
     '''
     reviewList = Review.readReviewsFromXML(input_file)
     if not reviewList:
         print("No reviews in input file")
         # Stop here: random.shuffle(None) would raise a TypeError.
         return

     random.shuffle(reviewList)
     Review.serializeToXML(reviewList, output_file)
Beispiel #2
0
 def separateLabeledAndUnlabeled(file, output_dir):
     '''
     Split the reviews stored in file into two XML files inside output_dir.

     Reviews whose polarity is non-empty after stripping whitespace go to
     labeled-neu.xml; all remaining reviews go to unlabeled-neu.xml.
     '''
     with_polarity = []
     without_polarity = []

     for entry in Review.readReviewsFromXML(file):
         has_label = entry.getReviewPolarity().strip() != ''
         bucket = with_polarity if has_label else without_polarity
         bucket.append(entry)

     Review.serializeToXML(with_polarity, output_dir + "/labeled-neu.xml")
     Review.serializeToXML(without_polarity, output_dir + "/unlabeled-neu.xml")
Beispiel #3
0
 def seperateByRating(input_file, output_dir):
     '''
     Partition the reviews of input_file by their rating and write one
     XML file per group into output_dir.

     Ratings whose string form is '5.0' are labeled with polarity '1',
     ratings '1.0' and '2.0' with polarity '-1'; all three get
     confidence '1'. Every other review is written unmodified to
     medium.xml. The sizes of the three labeled groups are printed.
     '''
     high5 = []
     low1 = []
     low2 = []
     medium = []

     # rating string -> (destination list, polarity assigned to the review)
     labeled_groups = {
         '5.0': (high5, '1'),
         '1.0': (low1, '-1'),
         '2.0': (low2, '-1'),
     }

     for review in Review.readReviewsFromXML(input_file):
         group = labeled_groups.get(str(review.getReviewRating()))
         if group is None:
             medium.append(review)
             continue
         destination, polarity = group
         review.setPolarity(polarity)
         review.setConfidence('1')
         destination.append(review)

     outputs = (
         (high5, "/high.xml"),
         (low1, "/low1.xml"),
         (low2, "/low2.xml"),
         (medium, "/medium.xml"),
     )
     for reviews, file_name in outputs:
         Review.serializeToXML(reviews, output_dir + file_name)

     print("5: " + str(len(high5)))
     print("1: " + str(len(low1)))
     print("2: " + str(len(low2)))
Beispiel #4
0
 def siftReviewsByPolarity(input_file, output_file, polarity):
     '''
     Write to output_file all reviews from input_file other than the
     ones labeled as polarity.

     Polarities are compared by their string form, so 1 and '1' match.
     If no reviews could be read (None or an empty list), a message is
     printed and nothing is written.
     '''
     reviewList = Review.readReviewsFromXML(input_file)
     if not reviewList:
         print("No reviews in input file")
         # Stop here: iterating over None below would raise a TypeError.
         return

     # Convert once instead of on every loop iteration.
     excluded = str(polarity)
     outList = [review for review in reviewList
                if str(review.getReviewPolarity()) != excluded]
     Review.serializeToXML(outList, output_file)
Beispiel #5
0
 def generateKFolds(self, location = "./", trainingData = None, validationData = None):
     '''
     Write every fold produced by self.k_fold_cross_validation() to disk
     and record the folds in the supplied dictionaries.

     location       -- directory for the train<i>.xml / valid<i>.xml files;
                       falls back to "./" when it is not an existing directory
     trainingData   -- optional dict filled with {str(i): training reviews}
     validationData -- optional dict filled with {str(i): validation reviews}

     Folds are numbered starting at 1. Prints a message and returns
     without writing anything when self.reviews is None or empty.
     '''
     if not self.reviews:
         print('No data to work on')
         return

     # Create fresh dicts per call: a mutable default ({}) would be shared
     # between calls and keep accumulating folds from earlier invocations.
     if trainingData is None:
         trainingData = {}
     if validationData is None:
         validationData = {}

     import os
     if not os.path.isdir(location):
         location = "./"

     folds = self.k_fold_cross_validation()
     for i, (training, validation) in enumerate(folds, 1):
         key = str(i)
         Review.serializeToXML(training, os.path.join(location, "train" + key + ".xml"))
         Review.serializeToXML(validation, os.path.join(location, "valid" + key + ".xml"))
         trainingData[key] = training
         validationData[key] = validation
Beispiel #6
0
 def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
     '''
     Apply classifier results to the reviews of xml_test_file and write
     the labeled reviews to output_file.

     weka_csv_results_file must contain exactly one comma-separated line
     per review, in the same order as the reviews. For each line:
       - column 3 has the form "<prefix>:<label>"; the part after the
         colon is converted with Util.getNumericLabel and set as polarity
       - column 5 is set as confidence, except that the value '1' is
         replaced by '0.9'

     Prints a message and returns without writing anything when the
     number of reviews differs from the number of result lines.
     '''
     reviewList = Review.readReviewsFromXML(xml_test_file)

     # The context manager closes the results file even if reading fails.
     with open(weka_csv_results_file, "r") as results_file:
         resultsList = results_file.readlines()

     if len(reviewList) != len(resultsList):
         print('Different number of reviews and results')
         return

     # Lengths are equal here, so zip pairs every review with its result line.
     for review, line in zip(reviewList, resultsList):
         result = line.strip().split(',')
         review.setPolarity(Util.getNumericLabel(result[2].split(':')[1]))
         # NOTE(review): a confidence of exactly '1' is capped to '0.9' --
         # presumably to avoid absolute certainty; confirm the intent.
         review.setConfidence('0.9' if result[4] == '1' else result[4])

     print('writing labelled test data to ' + output_file)
     Review.serializeToXML(reviewList, output_file)
Beispiel #7
0
from Review import Review

if __name__ == '__main__':
	# Read the low-rating reviews and write them straight back out to a test file.
	reviews = Review.readReviewsFromXML('../low-rating-reviews.xml')
	Review.serializeToXML(reviews, '../test.xml')
Beispiel #8
0
	reviewObj.setReviewRating(rating)

# Global variables
# Path of the XML file that receives the crawled reviews.
file_location = "../reviews.xml"

if __name__ == '__main__':
	hotel_url = ['http://www.yelp.com/biz/morimoto-new-york']

	# Doc id that will be assigned to the next parsed review.
	objCount = 1
	# Reviews are collected here temporarily before they are written to file.
	buffer = []

	# Crawl page by page; i is the "?start=" offset (0, 40, ..., 1000).
	for i in range(0, 1001, 40):
		page_address = hotel_url[0] + '?start=' + str(i)
		web_page = parse(page_address).getroot()
		for review in web_page.cssselect('#bizReviews .externalReview'):
			obj = Review(objCount)
			myparser(obj, review)
			buffer.append(obj)
			objCount += 1
		print(objCount)
		# If we crawl too fast, the site comes up with a captcha.
		time.sleep(10)

	Review.serializeToXML(buffer, file_location)

Beispiel #9
0
			elif (each_review.getReviewPolarity() == "-1"):
				labeled_low.append(each_review)
			elif(each_review.getReviewPolarity() == "0"):
				labeled_mid.append(each_review)
			elif(each_review.getReviewPolarity() == "1"):
				labeled_high.append(each_review)

	#reviews from 3 files are appended to lists, but they are unsorted. Hence sorting them here.
		
	unlabeled.sort(key = operator.attrgetter('reviewId'))
	labeled_low.sort(key = operator.attrgetter('reviewId'))
	labeled_mid.sort(key = operator.attrgetter('reviewId'))
	labeled_high.sort(key = operator.attrgetter('reviewId'))

	labeled.extend(labeled_low)
	labeled.extend(labeled_mid)
	labeled.extend(labeled_high)
	
	#Saving to disk
	
	Review.serializeToXML(unlabeled,unlabeled_file)
	Review.serializeToXML(labeled,labeled_file)

	#Comment if not required.
	print "Labeled-low: " +str(len(labeled_low))
	print "Labeled-mid: " +str(len(labeled_mid))
	print "Labeled-high: " +str(len(labeled_high))
	print "Total Labeled :"+str(len(labeled))
	print "Unlabeled :"+str(len(unlabeled))