def dataset_from_contents_labels(self, contents, labels):
    """Build a dataset from parallel lists of review contents and labels.

    Pairs each content with its label (assumes equal-length lists),
    wraps each pair in a Review, and delegates to dataset_from_array.
    """
    # zip() pairs the two lists directly and works on both Python 2
    # and 3, unlike the original xrange-based index loop.
    arr_dataset = [Review(content, label)
                   for content, label in zip(contents, labels)]
    return self.dataset_from_array(arr_dataset)
def convertReview(self, serialized_dict):
    """Rehydrate a Review from a serialized dict.

    Only entries whose key contains 'review_' are copied onto the new
    object; everything else in the dict is ignored.
    """
    result = Review()
    for field, value in serialized_dict.items():
        if "review_" in field:
            setattr(result, field, value)
    return result
def scrape_user_comment_list(self, raw_page=None):
    """Scrape all user reviews for this beer into self.reviews.

    raw_page: pre-fetched HTML of the beer page; fetched on demand when
    omitted.  Paginates until self.total_ratings reviews are collected,
    with a safety cap so a parse failure cannot loop forever.
    """
    if not raw_page:
        raw_page = self.fetch_beer_page()
    self.reviews = []
    # parse_metadata() populates total_ratings (among other fields);
    # only run it if a previous call hasn't already done so.
    try:
        self.total_ratings
    except AttributeError:
        self.parse_metadata(raw_page)
    page = 1
    while len(self.reviews) < self.total_ratings:
        if page != 1:
            raw_page = self.fetch_beer_page(page=page)
        # Each regex match yields one review tuple; numeric fields are
        # converted and the date string parsed as e.g. 'Jan 02, 2015'.
        self.reviews += [
            Review(beer_uid=self.uid,
                   user_uid=int(user_id),
                   brewery_uid=self.brewery_id,
                   topline_score=float(topline_score),
                   aroma_score=int(aroma),
                   apperance_score=int(apperance),
                   taste_score=int(taste),
                   palete_score=int(palete),
                   overall_score=int(overall),
                   user_loc=user_loc,
                   date=datetime.datetime.strptime(date_str, '%b %d, %Y').date(),
                   comment=comment)
            for (topline_score, aroma, apperance, taste, palete, overall,
                 user_id, user_name, user_loc, date_str, comment)
            in Beer.reviews_regex.findall(raw_page)]
        page += 1
        # NOTE(review): the divisor 8.0 presumably reflects reviews per
        # page — TODO confirm.  If we've paged past that estimate,
        # bail out instead of spinning.
        if page - 1 > self.total_ratings / 8.0:
            logging.error(
                'parsing should have completed, but did not, forcing.')
            break
def processReviewXls(self, sheet, row):
    """Read one spreadsheet row into a Review object.

    Column layout: 0 = review id, 1 = review text, 2-8 = per-aspect
    values (Food, Drinks, Ambiance, Service, Location, Deals, Price),
    each normalized through XlsCheckValue.
    """
    review = Review()
    review.reviewId = sheet.cell_value(row, 0)
    review.review = sheet.cell_value(row, 1)
    # The aspect columns share identical handling, so a dispatch table
    # replaces the original 7-way elif chain.  (The original loop also
    # visited columns 9-10 only to hit a no-op else branch; they are
    # simply not iterated here — behavior is unchanged.)
    aspect_columns = {
        2: 'Food',
        3: 'Drinks',
        4: 'Ambiance',
        5: 'Service',
        6: 'Location',
        7: 'Deals',
        8: 'Price',
    }
    for col, attr in aspect_columns.items():
        setattr(review, attr, self.XlsCheckValue(sheet.cell_value(row, col)))
    return review
def stemmingStopWRemoval(self, review, vocab):
    '''
    Does Following things:
    1. Tokenize review into sentences, and then into words
    2. Remove stopwords, punctuation and stem each word
    3. Add words into vocab
    4. Make Sentence objects and corresponding Review object
    '''
    review_obj = Review()
    # Carry the per-aspect ratings and the review id over verbatim.
    for rating_type, rating in review["Ratings"].items():
        review_obj.ratings[rating_type] = rating
    review_obj.reviewId = review["ReviewID"]

    stemmer = PorterStemmer()
    punctuation = set(string.punctuation)
    #TODO: Append title too!
    for sentence in nltk.sent_tokenize(review["Content"]):
        kept = []
        for token in nltk.word_tokenize(sentence):
            # Drop tokens made up entirely of digits and punctuation.
            if all(ch.isdigit() or ch in punctuation for ch in token):
                continue
            token = token.lower()
            if token in self.stopWords:
                continue
            stemmed = stemmer.stem(token)
            vocab.append(stemmed)
            kept.append(stemmed)
        # Only keep sentences that still have words after filtering.
        if kept:
            review_obj.sentences.append(Sentence(kept))
    # Only register reviews that retained at least one sentence.
    if review_obj.sentences:
        self.allReviews.append(review_obj)
def getAllReviews(self):
    """Load every row of the reviews table, grouping the resulting
    Review objects (and their best-new-music picks) by artist."""
    self._c.execute("SELECT * FROM reviews")
    for record in self._c.fetchall():
        review = Review(*record[:7])
        name = review.artist
        # Lazily create the Artist bucket on first sighting.
        if name not in self.artists:
            self.artists[name] = Artist(name)
        entry = self.artists[name]
        if review.bnm == 1:
            entry.bnms.append(review)
        entry.reviews.append(review)
def test_review(self):
    """Review's constructor should store title, content, rating, date
    and author on the matching attributes."""
    review = Review("Review title", "This is the review content", "5",
                    "December 20, 2020", "Deco Oliveira")
    # assertEquals is a deprecated alias; assertEqual is the canonical
    # spelling (removed in newer unittest versions).
    self.assertEqual(review.title, "Review title")
    self.assertEqual(review.content, "This is the review content")
    self.assertEqual(review.rating, "5")
    self.assertEqual(review.date, "December 20, 2020")
    self.assertEqual(review.author, "Deco Oliveira")
def __init__(self, filename, empty_user=None):
    '''
    filename: inits the UBRR data from the input file
    empty_user: skip the reviews by this user (keeps the ratings)
    '''
    # A mutable default argument (set()) is shared across all calls;
    # use None as the sentinel and build a fresh set per instance.
    self.empty_user = empty_user if empty_user is not None else set()
    ur_map = dict()
    br_map = dict()
    cnt = 0
    skipped = 0
    # read the file; the context manager guarantees the handle is
    # closed even if a malformed line raises mid-parse.
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'r') as f:
        for line in f:
            # tab-separated: user, business, rating, review text
            vals = line.split("\t")
            if len(vals) == 0:
                continue
            u = vals[0]
            b = vals[1]
            r = float(vals[2])
            d = vals[3].strip()
            if u in self.empty_user:
                # we are skipping this review text, keeping the rating
                d = ''
                skipped += 1
            rev = Review(u, b, r, d)  # review obj
            # store biz -> list of reviews
            br_map.setdefault(b, []).append(rev)
            # store user -> list of reviews
            ur_map.setdefault(u, []).append(rev)
            cnt += 1
    self.biz_map = br_map
    self.user_map = ur_map
    print('Review Data Manager Initialized with ', cnt, ' reviews')
    print('Number of skipped users = ', len(self.empty_user))
    print('Number of skipped reviews = ', skipped)
def getReview(self, reviewlink):
    """Fetch a review page and scrape its score, artist name and album
    title into a Review object."""
    # NOTE(review): certificate verification is disabled here — confirm
    # this is intentional before shipping.
    page = urlopen(Request(reviewlink),
                   context=ssl._create_unverified_context())
    parsed = soup(page, 'lxml')
    # The album-art <img> anchors the layout; the score, artist and
    # album title live in its following sibling elements.
    anchor = parsed.find("img", attrs={"src": re.compile('album')})
    score = anchor.find_next_sibling('div').find('span')
    heading = anchor.find_next_sibling('h1')
    artist = heading.find('a')
    album = heading.find('span')
    return Review(float(score.text), reviewlink, artist.text, album.text)
def findMaterials(link):
    """Scrape a single review page into a Review object.

    link: full URL of the review page; the review date is recovered
    from a fixed character slice of this URL.
    """
    # Parse the given link into some Beautiful Soup
    req = requests.get(link).text
    reviews = BeautifulSoup(req, 'html.parser')
    # Set up list string variables.
    reviewAuthor = []
    reviewPosition = []
    reviewCompany = []
    reviewRating = []
    sectionHeading = []
    sectionText = []
    sectionDate = ''
    # Find the authors name (if there is one)
    for review in reviews.find_all('span', {'itemprop': 'author'}):
        reviewAuthor.append(review.contents[0].text)
    # Find the author's position and company (if applicable)
    for review in reviews.find_all('span', {'class': 'user-info'}):
        reviewPosition.append(review.contents[0].text)
        reviewCompany.append(review.contents[1].text)
    # Find what the user rated Sitefinity
    reviewRating = reviews.find_all('span', class_='number')[0].text
    # Perform find.contents[] for all of the headings and text
    # and append them to our functions variables
    # NOTE(review): the deep .contents chains below assume an exact DOM
    # layout with exactly 6 sections — brittle; confirm against the page.
    for review in reviews.find_all('div', {'class': 'description'}):
        # Receive review section headings
        for head in range(6):
            sectionHeading.append(review.contents[0].contents[0].contents[1].contents[head].contents[0].contents[0].contents[0].contents[0].text)
        # Receive review section bodies
        for body in range(6):
            sectionText.append(" %s" % review.contents[0].contents[0].contents[1].contents[body].contents[1].contents[0].contents[0].contents[0].text)
    # Wrap up the review information into a dictionary, this is for easy handling
    reviewDict = dict(zip(sectionHeading, sectionText))
    # Get's the date of the review from the review's URL
    # NOTE(review): assumes the date begins at character 56 of the URL
    # and that the slice offsets below line up with 'YYYY-MM-DD' —
    # TODO confirm against real links.
    sectionDate = link[56:-9]
    days = date(int(sectionDate[:-6]), int(sectionDate[5:-3]), int(sectionDate[8:]))
    # Create a new review using our Review class, and return that review
    rev = Review(reviewAuthor, reviewPosition, reviewCompany, reviewRating, reviewDict, days)
    print "Review created for %s..." % rev.name[0]
    sys.stdout.flush()
    return rev
def render_review():
    """Render the review form; on a valid POST, persist the submitted
    review under the 'review' node and flash a confirmation."""
    form = reviewForm(request.form)
    if request.method == 'POST' and form.validate():
        submitted = Review(form.review.data)
        # Only the review text is stored for now; the username, rating
        # and company-name fields were disabled in the original.
        root.child("review").push({
            'review': submitted.get_review(),
        })
        flash("Thank You !! We Appreciate your Review :) ", "success")
    return render_template('Review.html', form=form)
def load_review_from_csv(self, infile):
    """Load labelled reviews from a CSV file into self.dataset.

    The first two columns feed the Review constructor; the last column
    is the label, and each distinct label value is collected into
    self.label_values.  Returns the input path unchanged.
    """
    # NOTE(review): "rb" is the Python 2 csv convention; on Python 3
    # this should be text mode with newline='' — confirm target version.
    with open(infile, "rb") as csvfile:
        reader = csv.DictReader(csvfile)
        # init field names & label column (label is the last column)
        self.field_names = reader.fieldnames
        self.column_label = self.field_names[-1]
        for row in reader:
            self.dataset.append(
                Review(row[self.field_names[0]], row[self.field_names[1]]))
            # Membership test instead of count() == 0: stops at the
            # first hit and states the intent directly.
            label = row[self.column_label]
            if label not in self.label_values:
                self.label_values.append(label)
    return infile
def collection(self, reviews):
    """Convert raw review elements into Review objects, keeping only
    the five-star ones.

    reviews: iterable of raw elements understood by ElementFactory.
    Returns a list of Review(title, content, 5, date, author).
    """
    all_reviews = []
    factory = ElementFactory()
    for r in reviews:
        # Extract every field up front (same call order as before).
        content = factory.content(r)
        five_stars = factory.five_stars(r)
        date = factory.date(r).text
        author = factory.author(r).text
        title = factory.title(r).text
        # Identity comparison is the correct None test (was '!= None');
        # the intermediate rating-then-compare round-trip was redundant.
        if five_stars is not None:
            all_reviews.append(Review(title, content, 5, date, author))
    return all_reviews
def test_add_review(self):
    """Every driver's review submission should be acknowledged."""
    expected = "Thank your for giving a review."
    rating = 3
    for position, driver_key in enumerate(self.drivers):
        # First driver reviews order1, second order2, all others order3.
        if position == 0:
            order_id = self.order1
        elif position == 1:
            order_id = self.order2
        else:
            order_id = self.order3
        result = Review().start(self.drivers[driver_key], self.user,
                                order_id, str(rating), "Unit testing")
        self.assertEqual(expected, result)
        rating += 1
def parse_album_review(text, site):
    """Return date, artist, album, and body of review for page"""
    # NOTE(review): formatting reconstructed from a collapsed source —
    # the exact nesting of the rating/guard statements near the end of
    # each branch should be confirmed against the original file.
    soup = BeautifulSoup(text, "html.parser")
    if site == "exclaim":
        # Strip the 10-character "Published " style prefix before parsing.
        date = dateparser.parse(
            soup.find("div", {
                "class": "article-published"
            }).get_text()[10:])
        # Drop the leading "By " from the author line.
        author = soup.find("div", {"class": "article-author"}).get_text()[3:]
        try:  # Some reviews don't have ratings
            rating = soup.find("div", {"class": "article-rating"}).get_text()
        except AttributeError as err:
            rating = ''
        artist = soup.find("span", {"class": "article-title"}).get_text()
        try:
            album = soup.find("span", {"class": "article-subtitle"}).get_text()
        except AttributeError as err:
            album = ''
        review = soup.find("div", {"class": "article"}).get_text()
        if rating != '':
            # Cut everything up to the standalone numeric rating line.
            try:
                review = re.split('(\n\d{1,2}\n)', review)[2]
            except IndexError as err:
                pass
        # Trim the trailing "(label)" credit block.
        review = re.split('(\([^()]+\)\n\n)', review)[0]
    elif site == "rollingstone":
        # date will need to be further processed
        date = dateparser.parse(
            soup.find("time", {
                "class": "content-published-date"
            }).get_text())
        author = soup.find("a", {"class": "content-author"}).get_text()
        # title does not hold artist and album in structured way
        title = soup.find("h1", {"class": "content-title"}).get_text()
        # Work in progress -- use URL instead?
        # from urllib.parse imprt urlparse
        # url = soup.find('link', {'rel': 'canonical'}).get('href')
        # parsed_url = urlparse(url)
        # # get last part of URL, split it into words, and remove the last word which is some id
        # # should be left with
        # url_title = parsed_url.path.split("/")[-1].split("-")[:-1]
        # url_title = urltitle.split("-")
        # NOTE(review): lstrip strips a character *set*, not the literal
        # prefix — a title like "Review: R.E.M...." would also lose the
        # leading "R"; str.removeprefix is likely what was intended.
        if title.startswith("Review:"):
            title = title.lstrip("Review:")
        # if ":" in title:
        #     artist, album = title.strip().split(": ")
        # else:
        artist, album = title.strip(), ""
        # Reviews are nested <p> in the article-content <div>
        # I want to join contents of all <p>s, unescape the HTML, and remove newlines and tabs
        review = " ".join([
            p.get_text() for p in soup.find("div", {
                "class": "article-content"
            }).find_all("p")
        ])
        # Rating is rendered as filled/half "percentage" star spans —
        # presumably Rolling Stone markup; TODO confirm placement.
        rating = len(soup.select("span.percentage.full"))
        if len(soup.select("span.percentage.half")) == 1:
            rating += 0.5
    # NOTE(review): for any other `site` value the locals below are
    # unbound and this raises NameError — confirm callers only pass
    # "exclaim" or "rollingstone".
    if not review:
        review = ""
    return Review(date=date,
                  author=author,
                  rating=rating,
                  artist=artist,
                  album=album,
                  review=review)
# Demo / smoke-test script exercising the Rating, Experience, Review,
# Recomendation and User classes.
from Rating import Rating
from Experience import Experience
from Review import Review
from User import User
from Recomendation import Recomendation

# Ratings constructed with the (value, min, max) form print fine for
# values inside the [0, 2] range.
print(0, "->", Rating(0, 0, 2))
print(1, "->", Rating(1, 0, 2))
print(2, "->", Rating(2, 0, 2))
# Commented out — presumably invalid examples (out of range / non-numeric):
#Rating(9,0,2)
#Rating("g")

# Experiences: one created with an explicit id, one assigned afterwards.
experiencia1 = Experience("Buenas migas", "Restaurante", 1)
experiencia2 = Experience("Telepizza", "Restaurante")
experiencia2.setId(2)

valoracion1 = Rating(1)
resenya1 = Review(experiencia1, valoracion1)
recomendacion1 = Recomendation(experiencia2, Rating(2))

# A user collects recommendations and reviews; each entry then gets a
# composite id of (experience id, user id).
user1 = User("nombre", "contraseña")
user1.setId(1)
user1.addRecomendation(
    Recomendation(Experience("Dominus", "Restaurante", 1), Rating(3)))
user1.addReview(Review(Experience("Dominus", "Restaurante", 3), Rating(4)))
user1.getRecomendations()[0].setId(
    (user1.getRecomendations()[0].getExperience().getId(), user1.getId()))
user1.getReviews()[0].setId(
    (user1.getReviews()[0].getExperience().getId(), user1.getId()))

# Users sharing (aliasing) another user's collections.
user2 = User("otroUser", "otraPassword", id=3)
user2.setRecomendations(user1.getRecomendations())
user3 = User("copion", "copionpassword", user1.getReviews(),
             user2.getRecomendations(), 3)
def reviewDetail(self, review_container, poi_id):
    """Extract one review (uid, title, rating, text, date) from a
    review container element, build a Review, and insert it into the
    DB when insertToDB_gate is set.

    Every field falls back to None when its markup variant is absent.
    Bare `except:` clauses were narrowed to `except Exception:` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    '''-------------uid-------------'''
    uid = None
    try:
        try:
            # Preferred markup: the member overlay link id encodes the
            # uid as "<prefix>_<uid>[-suffix]".
            original_uid = review_container.find_elements_by_css_selector(
                ".memberOverlayLink")[0].get_attribute("id")
            long_uid = original_uid.split("_")[1]
            long_uid_split = long_uid.split("-")
            if len(long_uid_split) > 0:
                uid = long_uid_split[0]
            else:
                uid = long_uid
        except Exception:
            # Fallback markup: uid hidden in the member_info class
            # attribute, or as plain username text.
            original_uid = review_container.find_elements_by_css_selector(
                ".member_info div")[0].get_attribute("class")
            try:
                long_uid = original_uid.split("_")[1]
                uid = long_uid
            except Exception:
                uid = review_container.find_element_by_css_selector(
                    ".username.mo span").text
    except Exception:
        uid = None
    '''-------------review_title-------------'''
    review_title = None
    try:
        review_title = review_container.find_element_by_css_selector(
            "span.noQuotes").text
    except Exception:
        review_title = None
    '''-------------review_rating-------------'''
    review_rating = None
    try:
        # Rating is encoded in a CSS class like "ui_bubble_rating
        # bubble_45" -> 45 / 10 = 4.5 bubbles.
        review_rating_string = review_container.find_element_by_css_selector(
            ".rating span.ui_bubble_rating").get_attribute("class")
        review_rating = int(
            review_rating_string.split(" ")[1].split("_")[1]) / 10
    except Exception:
        review_rating = None
    '''-------------ratingDate-------------'''
    ratingDate = None
    try:
        ratingDate = review_container.find_element_by_css_selector(
            ".ratingDate.relativeDate").get_attribute("title")
    except Exception:
        ratingDate = None
    '''-------------review-------------'''
    # Renamed from `review` to avoid shadowing the Review object below.
    review_text = None
    try:
        review_text = review_container.find_element_by_css_selector(
            ".entry .partial_entry").text
    except Exception:
        review_text = None
    '''-------------print all data-------------'''
    # print("uid:",uid,"review_title:",review_title,"review_rating:",review_rating,"review:",review_text,"ratingDate:",ratingDate)
    review = Review(poi_id, uid, review_title, review_rating, review_text,
                    ratingDate)
    print(review)
    if self.insertToDB_gate:
        self.db.insert(review, "review")
        print("insert ", review.review_title)
def reviewList(self, movie_code):
    """Fetch the Allocine review list for a movie and return it as a
    list of Review objects built from the JSON payload.

    The HTTP response is now closed explicitly — the original leaked
    the connection.
    """
    response = urllib.urlopen(
        "http://api.allocine.fr/rest/v3/reviewlist?partner=%s&format=json&code=%s"
        % (PARTNER_CODE, movie_code))
    try:
        d = json.loads(response.read())
    finally:
        response.close()
    return [Review(**i) for i in d["feed"]["review"]]
def __init__(self, url, cnx):
    """Scrape one Metacritic movie page, upsert the movie row, then
    scrape and store its critic reviews.

    url: movie path on metacritic.com (e.g. '/movie/...').
    cnx: open MySQL-style connection used for all inserts.
    """
    #temporary for testing
    #url = '/movie/bond-23'
    #skip this. metacritic's fault
    if (url == '/movie/who-the-%-is-jackson-pollock'):
        return
    #values that go into database
    values = {}
    values['title'] = ''
    values['url'] = ''
    values['cScore'] = ''
    values['uScore'] = ''
    values['date'] = ''
    #get all of those single values then put them in the movie table
    #then find all of the reviews and put them in the reviews table with the movie id
    #time to get the stuff from the movie page
    #get movie page
    response = requests.get('http://www.metacritic.com' + url,
                            allow_redirects=True)
    if (response.status_code == 400):
        return
    # resets the url to the one that was redirected to
    url = re.sub('http:\/\/www.metacritic.com', '', response.url)
    #convert html to string
    mainPageHtml = response.content
    #make the soup
    mainPageSoup = BeautifulSoup(mainPageHtml)
    #save the url
    values['url'] = url
    #get the title
    results = mainPageSoup.find_all('span', {'itemprop': 'name'})
    values['title'] = results[0].string
    #get rid of weird whitespace
    values['title'] = str(values['title'].lstrip().rstrip())
    #get the critic score
    results = mainPageSoup.find_all('span', {'itemprop': 'ratingValue'})
    values['cScore'] = str(results[0].string)
    #get the user score
    results = mainPageSoup.find_all('a', {
        'class': 'metascore_anchor',
        'href': url + '/user-reviews'
    })
    #if for some reason it can't find the user score. it happens even
    #though it shouldn't
    if (len(results) > 0):
        values['uScore'] = str(results[0].div.string)
        # 'tbd' means no score yet; -1 is the sentinel stored instead.
        if (values['uScore'] == 'tbd'):
            values['uScore'] = str('-1')
    else:
        values['uScore'] = str('-1')
    #get the year
    results = mainPageSoup.find_all('span', {
        'class': 'data',
        'itemprop': 'datePublished'
    })
    date = str(results[0].string.lstrip().rstrip())
    # 'Mon D, YYYY' -> 'YYYY-MM-DD'
    matches = re.match(r'([a-zA-Z]{3})\s(\d+),\s(\d{4})', date)
    if (matches):
        month = {
            'Jan': '01',
            'Feb': '02',
            'Mar': '03',
            'Apr': '04',
            'May': '05',
            'Jun': '06',
            'Jul': '07',
            'Aug': '08',
            'Sep': '09',
            'Oct': '10',
            'Nov': '11',
            'Dec': '12'
        }[matches.group(1)]
        day = matches.group(2)
        year = matches.group(3)
        values['date'] = year + '-' + month + '-' + day
    else:
        values['date'] = None
    #save to the database
    cursor = cnx.cursor()
    query = ("select movie_id from movies where movie_url = %s")
    inDB = False
    mid = 0
    cursor.execute(query, (str(values['url']), ))
    # NOTE(review): this loop assigns `id` (shadowing the builtin) but
    # never `mid` — when the movie is already in the DB, `mid` stays 0
    # and the Review rows below get movie id 0.  `mid = movie_id` looks
    # like the intent; confirm before changing.
    for (movie_id, ) in cursor:
        inDB = True
        id = movie_id
    if (not inDB):
        #make a new row for this critic
        if (values['date'] is not None):
            add_movie = ("INSERT INTO movies"
                         "(title, movie_url, uScore, cScore, release_date)"
                         "VALUES (%s, %s, %s, %s, %s)")
            movie_data = (values['title'], values['url'], values['uScore'],
                          values['cScore'], values['date'])
        else:
            add_movie = ("INSERT INTO movies"
                         "(title, movie_url, uScore, cScore)"
                         "VALUES (%s, %s, %s, %s)")
            movie_data = (values['title'], values['url'], values['uScore'],
                          values['cScore'])
        cursor.execute(add_movie, movie_data)
        mid = cursor.lastrowid
        cnx.commit()
    cursor.close()
    #get the critic reviews
    #get html
    criticPage = openUrl(url)
    criticSoup = BeautifulSoup(criticPage)
    criticReviews = criticSoup.find_all(
        'div', {'class': 'module reviews_module critic_reviews_module'})
    if (len(criticReviews) > 0):
        reviews = criticReviews[0].find_all('div', {'class': 'review_content'})
    else:
        print('ERROR:' + url)
        reviews = []
    # Each Review persists itself via the shared connection.
    for r in reviews:
        Rev = Review(mid, values['url'], r, cnx)