def processReviewXls(self, sheet, row):
    review = Review()
    start_col = 0
    end_col = 11
    for col in range(start_col, end_col):
        if col == 0:
            review.reviewId = sheet.cell_value(row, col)
        elif col == 1:
            review.review = sheet.cell_value(row, col)
        elif col == 2:
            review.Food = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 3:
            review.Drinks = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 4:
            review.Ambiance = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 5:
            review.Service = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 6:
            review.Location = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 7:
            review.Deals = self.XlsCheckValue(sheet.cell_value(row, col))
        elif col == 8:
            review.Price = self.XlsCheckValue(sheet.cell_value(row, col))
        else:
            pass  # columns 9 and 10 of the xls are not mapped to Review fields
    return review
def shuffleReviews(input_file, output_file):
    reviewList = Review.readReviewsFromXML(input_file)
    if reviewList is None or len(reviewList) == 0:
        print "No reviews in input file"
        return
    random.shuffle(reviewList)
    Review.serializeToXML(reviewList, output_file)
def stemmingStopWRemoval(self, review, vocab):
    '''
    Does the following things:
    1. Tokenize review into sentences, and then into words
    2. Remove stopwords and punctuation, and stem each word
    3. Add words into vocab
    4. Make Sentence objects and the corresponding Review object
    '''
    reviewObj = Review()
    # copy ratings into reviewObj
    for ratingType, rating in review["Ratings"].items():
        reviewObj.ratings[ratingType] = rating
    reviewObj.reviewId = review["ReviewID"]

    stemmer = PorterStemmer()
    reviewContent = review["Content"]  # TODO: Append title too!
    sentencesInReview = nltk.sent_tokenize(reviewContent)
    puncs = set(string.punctuation)  # punctuation marks

    for sentence in sentencesInReview:
        wordList = []
        words = nltk.word_tokenize(sentence)
        for word in words:
            # skip tokens made up entirely of digits and punctuation
            if not all(c.isdigit() or c in puncs for c in word):
                word = word.lower()
                if word not in self.stopWords:
                    word = stemmer.stem(word)
                    vocab.append(word)
                    wordList.append(word)
        if wordList:
            sentenceObj = Sentence(wordList)
            reviewObj.sentences.append(sentenceObj)
    if reviewObj.sentences:
        self.allReviews.append(reviewObj)
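# --- Hedged example (not part of the original source): the core of the
# stemmingStopWRemoval pipeline above, isolated with plain NLTK calls.
# Assumes the nltk 'punkt' tokenizer models and the 'english' stopword list
# are available; the Review/Sentence classes are deliberately left out.
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

def tokenize_and_stem(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    puncs = set(string.punctuation)
    sentences = []
    for sentence in nltk.sent_tokenize(text):
        words = []
        for word in nltk.word_tokenize(sentence):
            if all(c.isdigit() or c in puncs for c in word):
                continue  # drop pure digit/punctuation tokens
            word = word.lower()
            if word not in stop_words:
                words.append(stemmer.stem(word))
        if words:
            sentences.append(words)
    return sentences

# e.g. tokenize_and_stem("The food was great!") -> [['food', 'great']]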
def seperateByRating(input_file, output_dir):
    reviewList = Review.readReviewsFromXML(input_file)
    high5 = []
    low1 = []
    low2 = []
    medium = []
    for review in reviewList:
        if str(review.getReviewRating()) == '5.0':
            review.setPolarity('1')
            review.setConfidence('1')
            high5.append(review)
        elif str(review.getReviewRating()) == '1.0':
            review.setPolarity('-1')
            review.setConfidence('1')
            low1.append(review)
        elif str(review.getReviewRating()) == '2.0':
            review.setPolarity('-1')
            review.setConfidence('1')
            low2.append(review)
        else:
            medium.append(review)
    Review.serializeToXML(high5, output_dir + "/high.xml")
    Review.serializeToXML(low1, output_dir + "/low1.xml")
    Review.serializeToXML(low2, output_dir + "/low2.xml")
    Review.serializeToXML(medium, output_dir + "/medium.xml")
    print "5: " + str(len(high5))
    print "1: " + str(len(low1))
    print "2: " + str(len(low2))
def separateLabeledAndUnlabeled(file, output_dir):
    reviewList = Review.readReviewsFromXML(file)
    labeled = []
    unlabeled = []
    for review in reviewList:
        if review.getReviewPolarity().strip() != '':
            labeled.append(review)
        else:
            unlabeled.append(review)
    Review.serializeToXML(labeled, output_dir + "/labeled-neu.xml")
    Review.serializeToXML(unlabeled, output_dir + "/unlabeled-neu.xml")
def process(self):
    with codecs.open(SOURCE_TRAIN_FILE, encoding='utf-8') as r:
        lines = r.readlines()
    count = 0
    for line in lines:
        print count
        count += 1
        review = Review()
        self.stage_initial(review, line)
        self.stage_add_jj(review)
        self.stage_add_key_word(review)
        self.stage_post_process(review)
        if review.is_valid():
            self.review_list.append(review)
def siftReviewsByPolarity(input_file, output_file, polarity):
    '''
    output_file will contain all reviews from input_file other than
    the ones labeled with the given polarity
    '''
    reviewList = Review.readReviewsFromXML(input_file)
    if reviewList is None or len(reviewList) == 0:
        print "No reviews in input file"
        return
    outList = []
    for review in reviewList:
        if str(review.getReviewPolarity()) == str(polarity):
            continue
        outList.append(review)
    Review.serializeToXML(outList, output_file)
def render_review():
    form = reviewForm(request.form)
    if request.method == 'POST' and form.validate():
        review = form.review.data
        s1 = Review(review)
        mag_db = root.child("review")
        mag_db.push({
            # "username": session["username"],
            'review': s1.get_review(),
            # "rating": s1.get_rating(),
            # "companyname": s1.get_rating()
        })
        flash("Thank You !! We Appreciate your Review :) ", "success")
    return render_template('Review.html', form=form)
def generateKFolds(self, location="./", trainingData={}, validationData={}):
    if self.reviews is None or len(self.reviews) == 0:
        print 'No data to work on'
        return
    import os
    if not os.path.isdir(location):
        location = "./"
    i = 0
    for training, validation in self.k_fold_cross_validation():
        i = i + 1
        Review.serializeToXML(training, location + "/train" + str(i) + ".xml")
        Review.serializeToXML(validation, location + "/valid" + str(i) + ".xml")
        trainingData[str(i)] = training
        validationData[str(i)] = validation
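# --- Hedged sketch (assumption, not from the original source): the
# k_fold_cross_validation generator used by generateKFolds above is not shown
# here. A minimal implementation could partition self.reviews into self.k
# folds and yield (training, validation) pairs, roughly like this.
def k_fold_cross_validation(self):
    k = self.k
    reviews = self.reviews
    for fold in range(k):
        validation = [r for i, r in enumerate(reviews) if i % k == fold]
        training = [r for i, r in enumerate(reviews) if i % k != fold]
        yield training, validation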
def dataset_from_contents_labels(self, contents, labels):
    arr_dataset = []
    for i in xrange(len(contents)):
        dr = Review(contents[i], labels[i])
        arr_dataset.append(dr)
    return self.dataset_from_array(arr_dataset)
def countLabeledReviews(file):
    reviewList = Review.readReviewsFromXML(file)
    count = 0
    for review in reviewList:
        if review.getReviewPolarity().strip() != '':
            count += 1
    print count
def scrape_user_comment_list(self, raw_page=None):
    if not raw_page:
        raw_page = self.fetch_beer_page()
    self.reviews = []
    try:
        self.total_ratings
    except AttributeError:
        self.parse_metadata(raw_page)
    page = 1
    while len(self.reviews) < self.total_ratings:
        if page != 1:
            raw_page = self.fetch_beer_page(page=page)
        self.reviews += [Review(beer_uid=self.uid,
                                user_uid=int(user_id),
                                brewery_uid=self.brewery_id,
                                topline_score=float(topline_score),
                                aroma_score=int(aroma),
                                apperance_score=int(apperance),
                                taste_score=int(taste),
                                palete_score=int(palete),
                                overall_score=int(overall),
                                user_loc=user_loc,
                                date=datetime.datetime.strptime(date_str, '%b %d, %Y').date(),
                                comment=comment)
                         for (topline_score, aroma, apperance, taste, palete, overall,
                              user_id, user_name, user_loc, date_str, comment)
                         in Beer.reviews_regex.findall(raw_page)]
        page += 1
        if page - 1 > self.total_ratings / 8.0:
            logging.error('parsing should have completed, but did not, forcing.')
            break
def convertReview(self, serialized_dict):
    review = Review()
    for key in serialized_dict:
        if "review_" in key:
            value = serialized_dict[key]
            setattr(review, key, value)
    return review
def getAllReviews(self):
    self._c.execute("SELECT * FROM reviews")
    for row in self._c.fetchall():
        review = Review(row[0], row[1], row[2], row[3], row[4], row[5], row[6])
        if review.artist not in self.artists:
            self.artists[review.artist] = Artist(review.artist)
        if review.bnm == 1:
            self.artists[review.artist].bnms.append(review)
        self.artists[review.artist].reviews.append(review)
def test_review(self):
    review = Review("Review title", "This is the review content", "5",
                    "December 20, 2020", "Deco Oliveira")
    self.assertEqual(review.title, "Review title")
    self.assertEqual(review.content, "This is the review content")
    self.assertEqual(review.rating, "5")
    self.assertEqual(review.date, "December 20, 2020")
    self.assertEqual(review.author, "Deco Oliveira")
def delete(self):
    req_data = request.get_json()
    username = req_data['username']
    restaurant_id = req_data['restaurant_id']
    if Review.delete_rating(username, restaurant_id):
        return {'message': "Review deleted"}, 200
    else:
        return {'message': "No review to delete"}, 200
def from_json(json_filename, from_annotated=False):
    paper = Paper('', '', None, [])
    datas = []
    with io.open(json_filename, mode='rt', encoding='utf8') as json_file:
        for line in json_file:
            try:
                data = json.loads(line.strip())
                datas.append(data)
            except Exception as e:
                print(line)
                continue
    if len(datas) == 0:
        return None
    data = datas[-1]

    # Read required fields.
    assert 'title' in data
    assert 'abstract' in data
    paper.TITLE = data['title']
    paper.ABSTRACT = data['abstract']
    if 'id' in data:
        if data['id'] == "":
            paper.ID = json_filename.split("/")[-1].split(".")[0]
        else:
            paper.ID = data['id']
    else:
        paper.ID = json_filename.split("/")[-1].split(".")[0]

    # Read optional fields.
    paper.AUTHORS = data['authors'] if 'authors' in data else None
    paper.CONFERENCE = data['conference'] if 'conference' in data else None
    paper.ACCEPTED = data['accepted'] if 'accepted' in data else None
    paper.SCORE = data['score'] if 'score' in data else None
    paper.PUBLICATION_TYPE = data['publication_type'] if 'publication_type' in data else None
    paper.SCIENCEPARSE = data['scienceparse'] if 'scienceparse' in data else None
    paper.KEYWORDS = data['keywords'] if 'keywords' in data else None
    paper.AUTHOR_EMAILS = data['author_emails'] if 'author_emails' in data else None
    paper.DATE_OF_SUBMISSION = data['DATE_OF_SUBMISSION'] if 'DATE_OF_SUBMISSION' in data else None
    paper.SUBJECTS = data['SUBJECTS'] if 'SUBJECTS' in data else None
    paper.COMMENTS = data['COMMENTS'] if 'COMMENTS' in data else None
    paper.VERSION = data['VERSION'] if 'VERSION' in data else None
    paper.HISTORIES = data['histories'] if 'histories' in data else None

    # Read reviews (mandatory).
    assert 'reviews' in data
    for review_data in data['reviews']:
        review = Review.from_json_object(review_data)
        paper.REVIEWS.append(review)
    return paper
def __init__(self, filename, empty_user=set()):
    '''
    filename: inits the UBRR data from the input file
    empty_user: skip the review text by this user (keeps the ratings)
    '''
    self.empty_user = empty_user
    ur_map = dict()
    br_map = dict()
    cnt = 0
    skipped = 0
    # read the file
    if filename.endswith('.gz'):
        f = gzip.open(filename, 'r')
    else:
        f = open(filename, 'r')
    for line in f:
        vals = line.split("\t")
        if len(vals) == 0:
            continue
        u = vals[0]
        b = vals[1]
        r = float(vals[2])
        d = vals[3].strip()
        if u in self.empty_user:
            # we are skipping this review text
            d = ''
            skipped += 1
        rev = Review(u, b, r, d)  # review obj
        # store biz -> list of reviews
        if not br_map.get(b):
            br_map[b] = []
        br_map[b].append(rev)
        # store user -> list of reviews
        if not ur_map.get(u):
            ur_map[u] = []
        ur_map[u].append(rev)
        cnt += 1
    self.biz_map = br_map
    self.user_map = ur_map
    f.close()
    print('Review Data Manager Initialized with ', cnt, ' reviews')
    print('Number of skipped users = ', len(self.empty_user))
    print('Number of skipped reviews = ', skipped)
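# --- Hedged usage sketch (assumption, not from the original source): the class
# name is not shown above, so "ReviewDataManager" is a placeholder. The input
# is a tab-separated file with one review per line:
#   <user_id>\t<business_id>\t<rating>\t<review text>
# After construction, reviews can be looked up per business or per user.
#
# mgr = ReviewDataManager('reviews.tsv.gz', empty_user={'anonymous'})
# for rev in mgr.biz_map.get('some_business_id', []):
#     print(rev)                   # Review(u, b, r, d) objects
# print(len(mgr.user_map))         # number of distinct users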
def getReview(self, reviewlink):
    review = urlopen(Request(reviewlink), context=ssl._create_unverified_context())
    review_soup = soup(review, 'lxml')
    root_container = review_soup.find("img", attrs={"src": re.compile('album')})
    score_container = root_container.find_next_sibling('div').find('span')
    name_container = root_container.find_next_sibling('h1').find('a')
    album_container = root_container.find_next_sibling('h1').find('span')
    return Review(float(score_container.text), reviewlink,
                  name_container.text, album_container.text)
def findMaterials(link):
    # Parse the given link into some Beautiful Soup
    req = requests.get(link).text
    reviews = BeautifulSoup(req, 'html.parser')

    # Set up list string variables.
    reviewAuthor = []
    reviewPosition = []
    reviewCompany = []
    reviewRating = []
    sectionHeading = []
    sectionText = []
    sectionDate = ''

    # Find the author's name (if there is one)
    for review in reviews.find_all('span', {'itemprop': 'author'}):
        reviewAuthor.append(review.contents[0].text)

    # Find the author's position and company (if applicable)
    for review in reviews.find_all('span', {'class': 'user-info'}):
        reviewPosition.append(review.contents[0].text)
        reviewCompany.append(review.contents[1].text)

    # Find what the user rated Sitefinity
    reviewRating = reviews.find_all('span', class_='number')[0].text

    # Walk the nested contents[] for all of the headings and text
    # and append them to this function's variables
    for review in reviews.find_all('div', {'class': 'description'}):
        # Receive review section headings
        for head in range(6):
            sectionHeading.append(review.contents[0].contents[0].contents[1]
                                  .contents[head].contents[0].contents[0]
                                  .contents[0].contents[0].text)
        # Receive review section bodies
        for body in range(6):
            sectionText.append(" %s" % review.contents[0].contents[0].contents[1]
                               .contents[body].contents[1].contents[0]
                               .contents[0].contents[0].text)

    # Wrap up the review information into a dictionary for easy handling
    reviewDict = dict(zip(sectionHeading, sectionText))

    # Get the date of the review from the review's URL
    sectionDate = link[56:-9]
    days = date(int(sectionDate[:-6]), int(sectionDate[5:-3]), int(sectionDate[8:]))

    # Create a new review using our Review class, and return that review
    rev = Review(reviewAuthor, reviewPosition, reviewCompany, reviewRating, reviewDict, days)
    print "Review created for %s..." % rev.name[0]
    sys.stdout.flush()
    return rev
def post(self):
    req_data = request.get_json()
    username = req_data['username']
    restaurant_id = req_data['restaurant_id']
    restaurant_name = req_data['restaurant_name']
    comment = req_data['comment']
    rating = req_data['rating']
    if Review.post_rating(username, restaurant_name, restaurant_id, comment, rating):
        return {'message': "Review created"}, 200
    else:
        return {'message': "Already reviewed restaurant"}, 200
def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
    '''
    This method takes the reviews XML file and the Weka results in CSV format,
    applies polarity and confidence to the reviews, and writes the resulting
    XML to output_file.
    '''
    reviewList = Review.readReviewsFromXML(xml_test_file)
    results_file = open(weka_csv_results_file, "r")
    resultsList = results_file.readlines()
    if len(reviewList) != len(resultsList):
        print 'Different number of reviews and results'
        return
    counter = 0
    for review in reviewList:
        result = resultsList[counter].strip().split(',')
        counter += 1
        review.setPolarity(Util.getNumericLabel(result[2].split(':')[1]))
        review.setConfidence('0.9' if result[4] == '1' else result[4])
    print 'writing labelled test data to ' + output_file
    Review.serializeToXML(reviewList, output_file)
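# --- Hedged note (assumption, not from the original source): the parsing in
# labelTestFile above implies each Weka prediction CSV row looks roughly like
#   inst#,actual,predicted,error,prediction
#   1,1:positive,2:negative,+,0.87
# result[2] is the predicted "index:label" pair (split on ':' to get the label)
# and result[4] is the prediction confidence, clamped to '0.9' when it is '1'.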
def get_page(self, page):
    myUrl = ('http://android.myapp.com/myapp/app/comment.htm?apkName=com.msxf.loan'
             '&apkCode=15701&p=' + page + '&contextData=' + self.contextData)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib.request.Request(myUrl, headers=headers)
    myResponse = urllib.request.urlopen(req)
    myPage = myResponse.read()
    # encode converts a unicode string into bytes in some other encoding;
    # decode converts such bytes back into a unicode string
    unicodePage = myPage.decode("utf-8")
    jsondata = json.loads(unicodePage)
    if jsondata is not None and 'obj' in jsondata:
        obj = jsondata['obj']
        if obj is not None:
            if self.total == 0:
                if 'total' in obj:
                    self.total = obj['total']
            commentDetailes = []
            if 'commentDetails' in obj:
                commentDetailes = obj['commentDetails']
            if 'contextData' in obj:
                self.contextData = obj['contextData']
            self.crawlCount += len(commentDetailes)
            reviews = []
            for comment in commentDetailes:
                review = Review()
                review.appStore = 'myapp'
                review.packageName = 'com.msxf.loan'
                if 'content' in comment:
                    review.content = comment['content']
                if 'nickName' in comment:
                    review.nickName = comment['nickName']
                if 'score' in comment:
                    review.score = comment['score']
                if 'versionCode' in comment:
                    review.versionCode = comment['versionCode']
                if 'createdTime' in comment:
                    review.reviewTime = datetime.datetime.fromtimestamp(
                        int(comment['createdTime'])).strftime('%Y-%m-%d %H:%M:%S')
                reviews.append(review)
            ReviewsDataSource.insert(reviews)
    self.enable = self.crawlCount < self.total
def load_review_from_csv(self, infile):
    with open(infile, "rb") as csvfile:
        reader = csv.DictReader(csvfile)
        # init field names & label column
        self.field_names = reader.fieldnames
        self.column_label = self.field_names[-1]
        for rows in reader:
            review = Review(rows[self.field_names[0]], rows[self.field_names[1]])
            self.dataset.append(review)
            if self.label_values.count(rows[self.column_label]) == 0:
                self.label_values.append(rows[self.column_label])
    return infile
def from_softconf_dump(json_file, conference=None):
    with io.open(json_file, "r", encoding="utf8") as ifh:
        json_str = ifh.read()
    json_data = json.loads(json_str)["submissions"]
    papers = []
    for i in range(len(json_data)):
        reviews = []
        for k in range(len(json_data[i]["reviews"])):
            review = Review.from_json_object(json_data[i]["reviews"][k], k == i == 0)
            reviews.append(review)
        authors = json_data[i]["authors"] if "authors" in json_data[i] else None
        score = json_data[i]["score"] if "score" in json_data[i] else None
        accepted = json_data[i]["accepted"] if "accepted" in json_data[i] else None
        publication_type = json_data[i]["publication_type"] if "publication_type" in json_data[i] else None
        keywords = json_data[i]["KEYWORDS"] if "KEYWORDS" in json_data[i] else None
        author_emails = json_data[i]["AUTHOR_EMAILS"] if "AUTHOR_EMAILS" in json_data[i] else None
        date_of_submission = json_data[i]["DATE_OF_SUBMISSION"] if "DATE_OF_SUBMISSION" in json_data[i] else None
        paper = Paper(json_data[i]["title"], json_data[i]["abstract"], json_data[i]["id"],
                      reviews, authors, conference, accepted, score, publication_type,
                      None, keywords, author_emails, date_of_submission)
        papers.append(paper)
    return papers
def queryTodayReviews(reviews):
    conn = sqlite3.connect('reviews.db')
    try:
        c = conn.cursor()
        for row in c.execute(
                "select nickName,content,reviewTime,appStore,versionCode,packageName,score "
                "from reviews WHERE date(reviews.reviewTime) = date('now')"):
            review = Review()
            review.nickName = row[0]
            review.content = row[1]
            review.reviewTime = row[2]
            review.appStore = row[3]
            review.versionCode = row[4]
            review.packageName = row[5]
            review.score = row[6]
            reviews.append(review)
    except BaseException as e:
        print('sql error : ' + str(e))
    c.close()
    print('queryTodayReviews ' + str(len(reviews)) + ' row.')
def collection(self, reviews):
    all_reviews = []
    factory = ElementFactory()
    for r in reviews:
        content = factory.content(r)
        five_stars = factory.five_stars(r)
        date = factory.date(r).text
        author = factory.author(r).text
        title = factory.title(r).text
        rating = 5 if five_stars is not None else 0
        if rating == 5:
            review = Review(title, content, rating, date, author)
            all_reviews.append(review)
    return all_reviews
def test_add_review(self):
    expected = "Thank your for giving a review."
    is_first = True
    is_second = True
    count = 3
    for k in self.drivers:
        if is_first and is_second:
            order_id = self.order1
            is_first = False
        elif is_second:
            order_id = self.order2
            is_second = False
        else:
            order_id = self.order3
        result = Review().start(self.drivers[k], self.user, order_id,
                                str(count), "Unit testing")
        self.assertEqual(expected, result)
        count = count + 1
def load(self):
    for line in codecs.open(Train_File).readlines():
        curr_r = Review()
        curr_r.parsefromstring(line)
        self.data.append(curr_r)
    print "training data loaded, ", len(self.data)
from Review import Review

if __name__ == '__main__':
    Review.serializeToXML(Review.readReviewsFromXML('../low-rating-reviews.xml'), '../test.xml')
def reviewList(self, movie_code):
    d = json.loads(urllib.urlopen(
        "http://api.allocine.fr/rest/v3/reviewlist?partner=%s&format=json&code=%s"
        % (PARTNER_CODE, movie_code)).read())
    return [Review(**i) for i in d["feed"]["review"]]
def get(self, username):
    reviews = Review.get_reviews_for_user(username)
    if reviews is not None:
        return {'reviews': reviews}, 200
    else:
        return {'message': 'Error getting reviews'}, 404
from Rating import Rating
from Experience import Experience
from Review import Review
from User import User
from Recomendation import Recomendation

print(0, "->", Rating(0, 0, 2))
print(1, "->", Rating(1, 0, 2))
print(2, "->", Rating(2, 0, 2))
# Rating(9, 0, 2)
# Rating("g")

experiencia1 = Experience("Buenas migas", "Restaurante", 1)
experiencia2 = Experience("Telepizza", "Restaurante")
experiencia2.setId(2)
valoracion1 = Rating(1)
resenya1 = Review(experiencia1, valoracion1)
recomendacion1 = Recomendation(experiencia2, Rating(2))

user1 = User("nombre", "contraseña")
user1.setId(1)
user1.addRecomendation(Recomendation(Experience("Dominus", "Restaurante", 1), Rating(3)))
user1.addReview(Review(Experience("Dominus", "Restaurante", 3), Rating(4)))
user1.getRecomendations()[0].setId(
    (user1.getRecomendations()[0].getExperience().getId(), user1.getId()))
user1.getReviews()[0].setId(
    (user1.getReviews()[0].getExperience().getId(), user1.getId()))

user2 = User("otroUser", "otraPassword", id=3)
user2.setRecomendations(user1.getRecomendations())

user3 = User("copion", "copionpassword", user1.getReviews(), user2.getRecomendations(), 3)
    reviewObj.setReviewRating(rating)

# global variables
file_location = "../reviews.xml"

if __name__ == '__main__':
    hotel_url = ['http://www.yelp.com/biz/morimoto-new-york']
    # variable to loop through pages
    i = 0
    # variable to assign doc id to reviews
    objCount = 1
    # we store our reviews temporarily in this before we write to file
    buffer = []
    # crawl in a loop
    while i <= 1000:
        web_page = parse(hotel_url[0] + '?start=' + str(i)).getroot()
        for review in web_page.cssselect('#bizReviews .externalReview'):
            obj = Review(objCount)
            myparser(obj, review)
            buffer.append(obj)
            objCount += 1
        i = i + 40
        print objCount
        # if we crawl too fast, the site comes up with a captcha
        time.sleep(10)
    Review.serializeToXML(buffer, file_location)
def printCount(file):
    reviewList = Review.readReviewsFromXML(file)
    print str(len(reviewList))
def get(self, restaurant_id):
    reviews = Review.get_reviews_for_restaurant(restaurant_id)
    if reviews is not None:
        return {'reviews': reviews}, 200
    else:
        return {'message': 'Error getting reviews'}, 404
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    trainingreviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    testReviews = Review.readReviewsFromXML("../old-test-data.xml")

    trainCorpus = Corpus(trainingreviews, lemmatizer, POS_tagging=True)
    '''this dictionary will be used for both training and test data'''
    dictionary = Dictionary(trainCorpus)
    generator = FeatureGenerator(trainCorpus, dictionary, '../train.csv',
                                 weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()

    testCorpus = Corpus(testReviews, lemmatizer, POS_tagging=True)
    generator = FeatureGenerator(testCorpus, dictionary, '../test.csv',
                                 weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()
def __init__(self, url, cnx):
    # temporary for testing
    # url = '/movie/bond-23'

    # skip this. metacritic's fault
    if url == '/movie/who-the-%-is-jackson-pollock':
        return

    # values that go into the database
    values = {}
    values['title'] = ''
    values['url'] = ''
    values['cScore'] = ''
    values['uScore'] = ''
    values['date'] = ''

    # get all of those single values, then put them in the movie table;
    # then find all of the reviews and put them in the reviews table with the movie id

    # get the movie page
    response = requests.get('http://www.metacritic.com' + url, allow_redirects=True)
    if response.status_code == 400:
        return
    # reset the url to the one that was redirected to
    url = re.sub('http:\/\/www.metacritic.com', '', response.url)

    # convert html to string and make the soup
    mainPageHtml = response.content
    mainPageSoup = BeautifulSoup(mainPageHtml)

    # save the url
    values['url'] = url

    # get the title
    results = mainPageSoup.find_all('span', {'itemprop': 'name'})
    values['title'] = results[0].string
    values['title'] = str(values['title'].lstrip().rstrip())  # get rid of weird whitespace

    # get the critic score
    results = mainPageSoup.find_all('span', {'itemprop': 'ratingValue'})
    values['cScore'] = str(results[0].string)

    # get the user score
    results = mainPageSoup.find_all('a', {'class': 'metascore_anchor',
                                          'href': url + '/user-reviews'})
    # if for some reason it can't find the user score. it happens even though it shouldn't
    if len(results) > 0:
        values['uScore'] = str(results[0].div.string)
        if values['uScore'] == 'tbd':
            values['uScore'] = str('-1')
    else:
        values['uScore'] = str('-1')

    # get the release date
    results = mainPageSoup.find_all('span', {'class': 'data', 'itemprop': 'datePublished'})
    date = str(results[0].string.lstrip().rstrip())
    matches = re.match(r'([a-zA-Z]{3})\s(\d+),\s(\d{4})', date)
    if matches:
        month = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}[matches.group(1)]
        day = matches.group(2)
        year = matches.group(3)
        values['date'] = year + '-' + month + '-' + day
    else:
        values['date'] = None

    # save to the database
    cursor = cnx.cursor()
    query = "select movie_id from movies where movie_url = %s"
    inDB = False
    mid = 0
    cursor.execute(query, (str(values['url']),))
    for (movie_id,) in cursor:
        inDB = True
        mid = movie_id
    if not inDB:
        # make a new row for this movie
        if values['date'] is not None:
            add_movie = ("INSERT INTO movies"
                         "(title, movie_url, uScore, cScore, release_date)"
                         "VALUES (%s, %s, %s, %s, %s)")
            movie_data = (values['title'], values['url'], values['uScore'],
                          values['cScore'], values['date'])
        else:
            add_movie = ("INSERT INTO movies"
                         "(title, movie_url, uScore, cScore)"
                         "VALUES (%s, %s, %s, %s)")
            movie_data = (values['title'], values['url'], values['uScore'],
                          values['cScore'])
        cursor.execute(add_movie, movie_data)
        mid = cursor.lastrowid
        cnx.commit()
    cursor.close()

    # get the critic reviews
    criticPage = openUrl(url)
    criticSoup = BeautifulSoup(criticPage)
    criticReviews = criticSoup.find_all(
        'div', {'class': 'module reviews_module critic_reviews_module'})
    if len(criticReviews) > 0:
        reviews = criticReviews[0].find_all('div', {'class': 'review_content'})
    else:
        print('ERROR:' + url)
        reviews = []
    for r in reviews:
        Rev = Review(mid, values['url'], r, cnx)
def reviewDetail(self, review_container, poi_id):
    # -------------uid-------------
    uid = None
    try:
        try:
            original_uid = review_container.find_elements_by_css_selector(
                ".memberOverlayLink")[0].get_attribute("id")
            long_uid = original_uid.split("_")[1]
            long_uid_split = long_uid.split("-")
            if len(long_uid_split) > 0:
                uid = long_uid_split[0]
            else:
                uid = long_uid
        except:
            original_uid = review_container.find_elements_by_css_selector(
                ".member_info div")[0].get_attribute("class")
            try:
                long_uid = original_uid.split("_")[1]
                uid = long_uid
            except:
                uid = review_container.find_element_by_css_selector(
                    ".username.mo span").text
    except:
        uid = None

    # -------------review_title-------------
    review_title = None
    try:
        review_title = review_container.find_element_by_css_selector(
            "span.noQuotes").text
    except:
        review_title = None

    # -------------review_rating-------------
    review_rating = None
    try:
        review_rating_string = review_container.find_element_by_css_selector(
            ".rating span.ui_bubble_rating").get_attribute("class")
        review_rating = int(review_rating_string.split(" ")[1].split("_")[1]) / 10
    except:
        review_rating = None

    # -------------ratingDate-------------
    ratingDate = None
    try:
        ratingDate = review_container.find_element_by_css_selector(
            ".ratingDate.relativeDate").get_attribute("title")
    except:
        ratingDate = None

    # -------------review-------------
    review = None
    try:
        review = review_container.find_element_by_css_selector(
            ".entry .partial_entry").text
    except:
        review = None

    # print("uid:", uid, "review_title:", review_title, "review_rating:", review_rating,
    #       "review:", review, "ratingDate:", ratingDate)
    review = Review(poi_id, uid, review_title, review_rating, review, ratingDate)
    print(review)
    if self.insertToDB_gate:
        self.db.insert(review, "review")
        print("insert ", review.review_title)
# output files
unlabeled_file = '../test-data.xml'
labeled_file = '../traning-data.xml'

# lists for labeled and unlabeled reviews
unlabeled = []
labeled = []
labeled_high = []
labeled_low = []
labeled_mid = []

for each_file in review_files:
    # call readReviewsFromXML
    reviews = Review.readReviewsFromXML(each_file)
    for each_review in reviews:
        # convert reviewId into int, which helps in sorting before saving to disk
        each_review.reviewId = int(each_review.getReviewId())
        # check the polarity and append to the matching list
        if each_review.getReviewPolarity() == "":
            unlabeled.append(each_review)
        elif each_review.getReviewPolarity() == "-1":
            labeled_low.append(each_review)
        elif each_review.getReviewPolarity() == "0":
            labeled_mid.append(each_review)
def parse_album_review(text, site):
    """Return date, artist, album, and body of review for page"""
    soup = BeautifulSoup(text, "html.parser")
    if site == "exclaim":
        date = dateparser.parse(
            soup.find("div", {"class": "article-published"}).get_text()[10:])
        author = soup.find("div", {"class": "article-author"}).get_text()[3:]
        try:
            # Some reviews don't have ratings
            rating = soup.find("div", {"class": "article-rating"}).get_text()
        except AttributeError as err:
            rating = ''
        artist = soup.find("span", {"class": "article-title"}).get_text()
        try:
            album = soup.find("span", {"class": "article-subtitle"}).get_text()
        except AttributeError as err:
            album = ''
        review = soup.find("div", {"class": "article"}).get_text()
        if rating != '':
            try:
                review = re.split('(\n\d{1,2}\n)', review)[2]
            except IndexError as err:
                pass
            review = re.split('(\([^()]+\)\n\n)', review)[0]
    elif site == "rollingstone":
        # date will need to be further processed
        date = dateparser.parse(
            soup.find("time", {"class": "content-published-date"}).get_text())
        author = soup.find("a", {"class": "content-author"}).get_text()
        # title does not hold artist and album in a structured way
        title = soup.find("h1", {"class": "content-title"}).get_text()
        # Work in progress -- use URL instead?
        # from urllib.parse import urlparse
        # url = soup.find('link', {'rel': 'canonical'}).get('href')
        # parsed_url = urlparse(url)
        # # get last part of URL, split it into words, and remove the last word which is some id
        # url_title = parsed_url.path.split("/")[-1].split("-")[:-1]
        if title.startswith("Review:"):
            title = title.lstrip("Review:")
        # if ":" in title:
        #     artist, album = title.strip().split(": ")
        # else:
        artist, album = title.strip(), ""
        # Reviews are nested <p> in the article-content <div>:
        # join the contents of all <p>s, unescape the HTML, and remove newlines and tabs
        review = " ".join([
            p.get_text()
            for p in soup.find("div", {"class": "article-content"}).find_all("p")
        ])
        rating = len(soup.select("span.percentage.full"))
        if len(soup.select("span.percentage.half")) == 1:
            rating += 0.5
    if not review:
        review = ""
    return Review(date=date, author=author, rating=rating,
                  artist=artist, album=album, review=review)
        return
    trainingData = {}
    validationData = {}
    self.generateKFolds(outdir, trainingData, validationData)
    for i in range(1, self.k + 1):
        print "generating features for fold " + str(i)
        trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging)
        '''this dictionary will be used for both training and validation data'''
        dictionary = Dictionary(trainCorpus)
        generator = FeatureGenerator(trainCorpus, dictionary,
                                     outdir + '/train' + str(i) + '.csv',
                                     weightScheme, includeRating, includeDocLength)
        generator.generateFeatures()
        validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging)
        generator = FeatureGenerator(validCorpus, dictionary,
                                     outdir + '/valid' + str(i) + '.csv',
                                     weightScheme, includeRating, includeDocLength)
        generator.generateFeatures()

if __name__ == '__main__':
    reviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    print 'reviews: ' + str(len(reviews))
    kfg = KFoldGenerator(reviews, 10)
    kfg.generateFolds("../kfolds/linearSVM/unigrams-lemma-POS-tf-no-stop",
                      lemmatizer, POS_tagging=True, weightScheme=FeatureWeight.TF,
                      includeRating=False, includeDocLength=False)