def processReviewXls(self, sheet, row):
    '''Build a Review object from one row of an xls sheet.

    Column layout read from ``sheet`` (via ``sheet.cell_value(row, col)``):
        0 -> reviewId   (stored as-is)
        1 -> review     (stored as-is)
        2..8 -> Food, Drinks, Ambiance, Service, Location, Deals, Price
                (each value is passed through ``self.XlsCheckValue`` first)

    Columns past index 8 are not read.

    Returns the populated Review.
    '''
    parsed = Review()

    # The first two columns are copied verbatim.
    parsed.reviewId = sheet.cell_value(row, 0)
    parsed.review = sheet.cell_value(row, 1)

    # Remaining columns hold the per-aspect values, in sheet order.
    aspect_names = (
        "Food",
        "Drinks",
        "Ambiance",
        "Service",
        "Location",
        "Deals",
        "Price",
    )
    for col, aspect in enumerate(aspect_names, start=2):
        raw_value = sheet.cell_value(row, col)
        setattr(parsed, aspect, self.XlsCheckValue(raw_value))

    return parsed
def stemmingStopWRemoval(self, review, vocab):
    '''Convert one raw review record into a Review of stemmed sentences.

    Steps performed:
    1. Copy every entry of review["Ratings"] and the review["ReviewID"]
       onto a new Review object.
    2. Split review["Content"] into sentences, and each sentence into
       word tokens, using nltk.
    3. Skip tokens made up only of digits and string.punctuation
       characters; lowercase the rest, skip those found in
       self.stopWords, and stem what remains with PorterStemmer.
    4. Append every kept stem to ``vocab`` (mutated in place) and to the
       current sentence's word list.
    5. Wrap each non-empty word list in a Sentence and attach it to the
       Review; the Review is appended to self.allReviews only when it
       ends up with at least one sentence.

    Nothing is returned; results are delivered through ``vocab`` and
    ``self.allReviews``.
    '''
    parsedReview = Review()

    # Carry the ratings and the id over from the raw record.
    for aspect, score in review["Ratings"].items():
        parsedReview.ratings[aspect] = score
    parsedReview.reviewId = review["ReviewID"]

    stemmer = PorterStemmer()
    content = review["Content"]
    #TODO: Append title too!
    punctuation = set(string.punctuation)

    for rawSentence in nltk.sent_tokenize(content):
        stems = []
        for token in nltk.word_tokenize(rawSentence):
            # Tokens consisting solely of digits/punctuation are dropped.
            if all(ch.isdigit() or ch in punctuation for ch in token):
                continue
            lowered = token.lower()
            if lowered in self.stopWords:
                continue
            stem = stemmer.stem(lowered)
            vocab.append(stem)
            stems.append(stem)
        # Sentences that lost all their tokens are not recorded.
        if stems:
            parsedReview.sentences.append(Sentence(stems))

    # Reviews with no usable sentence are discarded.
    if parsedReview.sentences:
        self.allReviews.append(parsedReview)
def processReviewXls(self, sheet, row):
    '''Read one xls row into a new Review and return it.

    ``sheet.cell_value(row, col)`` is queried for columns 0 through 8, in
    that order:
        col 0 -> reviewId, col 1 -> review (both stored unchanged);
        col 2 -> Food, 3 -> Drinks, 4 -> Ambiance, 5 -> Service,
        6 -> Location, 7 -> Deals, 8 -> Price (each stored after being
        passed through ``self.XlsCheckValue``).

    Any further columns in the sheet are left untouched.
    '''
    cell = sheet.cell_value
    check = self.XlsCheckValue

    result = Review()

    # Identifier and free text are taken verbatim.
    result.reviewId = cell(row, 0)
    result.review = cell(row, 1)

    # Per-aspect columns go through XlsCheckValue before being stored.
    result.Food = check(cell(row, 2))
    result.Drinks = check(cell(row, 3))
    result.Ambiance = check(cell(row, 4))
    result.Service = check(cell(row, 5))
    result.Location = check(cell(row, 6))
    result.Deals = check(cell(row, 7))
    result.Price = check(cell(row, 8))

    return result