def dataset_from_contents_labels(self, contents, labels):
    # pair each content with its label and wrap the pair in a Review
    # (xrange was Python 2 only)
    arr_dataset = []
    for content, label in zip(contents, labels):
        arr_dataset.append(Review(content, label))

    return self.dataset_from_array(arr_dataset)
Example #2
def convertReview(self, serialized_dict):
    # copy every "review_"-prefixed entry onto a fresh Review object
    review = Review()
    for key in serialized_dict:
        if "review_" in key:
            setattr(review, key, serialized_dict[key])
    return review
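
A quick usage sketch with hypothetical input: only entries whose key contains "review_" are copied onto the fresh object, everything else is ignored (here 'converter' stands in for the enclosing object):

data = {"review_title": "Great stay", "review_rating": 4, "meta": "ignored"}
review = converter.convertReview(data)
print(review.review_title)  # -> Great stay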
Example #3
def scrape_user_comment_list(self, raw_page=None):
    if not raw_page:
        raw_page = self.fetch_beer_page()
    self.reviews = []
    if not hasattr(self, 'total_ratings'):
        self.parse_metadata(raw_page)
    page = 1
    while len(self.reviews) < self.total_ratings:
        if page != 1:
            raw_page = self.fetch_beer_page(page=page)
        self.reviews += [
            Review(beer_uid=self.uid, user_uid=int(user_id),
                   brewery_uid=self.brewery_id,
                   topline_score=float(topline_score),
                   aroma_score=int(aroma), apperance_score=int(apperance),
                   taste_score=int(taste), palete_score=int(palete),
                   overall_score=int(overall), user_loc=user_loc,
                   date=datetime.datetime.strptime(date_str,
                                                   '%b %d, %Y').date(),
                   comment=comment)
            for (topline_score, aroma, apperance, taste, palete, overall,
                 user_id, user_name, user_loc, date_str, comment)
            in Beer.reviews_regex.findall(raw_page)]
        page += 1
        # Safety valve: stop if far more pages were fetched than the
        # rating count implies, so a parsing failure cannot loop forever.
        if page - 1 > self.total_ratings / 8.0:
            logging.error(
                'parsing should have completed, but did not; forcing exit.')
            break
Example #4
def processReviewXls(self, sheet, row):
    review = Review()
    # Columns 0 and 1 hold the id and text; columns 2-8 hold category
    # values that go through XlsCheckValue. Columns 9-10 are unused.
    review.reviewId = sheet.cell_value(row, 0)
    review.review = sheet.cell_value(row, 1)
    category_columns = {
        2: 'Food', 3: 'Drinks', 4: 'Ambiance', 5: 'Service',
        6: 'Location', 7: 'Deals', 8: 'Price',
    }
    for col, attr in category_columns.items():
        setattr(review, attr, self.XlsCheckValue(sheet.cell_value(row, col)))
    return review
Example #5
    def stemmingStopWRemoval(self, review, vocab):
        '''Does the following:
        1. Tokenize the review into sentences, then into words
        2. Remove stopwords and punctuation, and stem each word
        3. Add the words to vocab
        4. Build Sentence objects and the corresponding Review object
        '''
        reviewObj = Review()
        # copy ratings into reviewObj
        for ratingType, rating in review["Ratings"].items():
            reviewObj.ratings[ratingType] = rating
        reviewObj.reviewId = review["ReviewID"]

        stemmer = PorterStemmer()
        reviewContent = review["Content"]
        # TODO: Append the title too!
        sentencesInReview = nltk.sent_tokenize(reviewContent)
        puncs = set(string.punctuation)  # punctuation marks
        for sentence in sentencesInReview:
            wordList = []
            words = nltk.word_tokenize(sentence)
            for word in words:
                # skip tokens made up entirely of digits and punctuation
                if not all(c.isdigit() or c in puncs for c in word):
                    word = word.lower()
                    if word not in self.stopWords:
                        word = stemmer.stem(word)
                        vocab.append(word)
                        wordList.append(word)
            if wordList:
                reviewObj.sentences.append(Sentence(wordList))
        if reviewObj.sentences:
            self.allReviews.append(reviewObj)
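
The same pipeline can be exercised outside the class. A minimal standalone sketch, assuming NLTK and its punkt tokenizer data are installed and the stopword set is supplied by the caller:

import string

import nltk
from nltk.stem.porter import PorterStemmer

def preprocess(text, stop_words):
    # Sentence-split, word-tokenize, drop digit/punctuation-only tokens,
    # remove stopwords, and stem whatever remains.
    stemmer = PorterStemmer()
    puncs = set(string.punctuation)
    sentences = []
    for sent in nltk.sent_tokenize(text):
        words = [stemmer.stem(w.lower())
                 for w in nltk.word_tokenize(sent)
                 if not all(c.isdigit() or c in puncs for c in w)
                 and w.lower() not in stop_words]
        if words:
            sentences.append(words)
    return sentences

# preprocess("The food was great. 10/10!", {"the", "was"})
# -> [['food', 'great']]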
Example #6
    def getAllReviews(self):
        self._c.execute("SELECT * FROM reviews")
        for row in self._c.fetchall():
            review = Review(*row[:7])

            if review.artist not in self.artists:
                self.artists[review.artist] = Artist(review.artist)
            if review.bnm == 1:
                self.artists[review.artist].bnms.append(review)
            self.artists[review.artist].reviews.append(review)
Example #7
def test_review(self):
    # assertEquals is a deprecated alias of assertEqual
    review = Review("Review title", "This is the review content", "5",
                    "December 20, 2020", "Deco Oliveira")
    self.assertEqual(review.title, "Review title")
    self.assertEqual(review.content, "This is the review content")
    self.assertEqual(review.rating, "5")
    self.assertEqual(review.date, "December 20, 2020")
    self.assertEqual(review.author, "Deco Oliveira")
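
For reference, a minimal Review class consistent with the constructor and attributes exercised by this test might look like the following (a sketch inferred from the test; the real class under test is not shown here):

class Review:
    # Hypothetical reconstruction; field order matches the test's call.
    def __init__(self, title, content, rating, date, author):
        self.title = title
        self.content = content
        self.rating = rating
        self.date = date
        self.author = author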
Example #8
    def __init__(self, filename, empty_user=None):
        '''
        filename: inits the UBRR data from the input file
        empty_user: skip the reviews by these users (keeps the ratings)
        '''
        self.empty_user = empty_user if empty_user is not None else set()

        ur_map = dict()
        br_map = dict()

        cnt = 0
        skipped = 0

        # read the file ('rt' so gzip yields text, not bytes)
        if filename.endswith('.gz'):
            f = gzip.open(filename, 'rt')
        else:
            f = open(filename, 'r')

        for line in f:
            vals = line.split("\t")
            if len(vals) < 4:  # split() never returns an empty list
                continue

            u = vals[0]
            b = vals[1]
            r = float(vals[2])
            d = vals[3].strip()
            if u in self.empty_user:
                # we are skipping this review text (the rating is kept)
                d = ''
                skipped += 1

            rev = Review(u, b, r, d)  # review obj

            # store biz -> list of reviews
            br_map.setdefault(b, []).append(rev)

            # store user -> list of reviews
            ur_map.setdefault(u, []).append(rev)

            cnt += 1

        self.biz_map = br_map
        self.user_map = ur_map

        f.close()
        print('Review Data Manager Initialized with', cnt, 'reviews')
        print('Number of skipped users =', len(self.empty_user))
        print('Number of skipped reviews =', skipped)
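
A usage sketch, assuming the constructor above belongs to a class named, say, ReviewData (the enclosing class name is not shown) and a tab-separated file of user, business, rating, and review-text columns:

# Hypothetical driver; 'ReviewData' and the filename are assumptions.
rd = ReviewData('reviews.tsv.gz', empty_user={'anonymous'})
print(len(rd.user_map), 'users and', len(rd.biz_map), 'businesses loaded')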
Example #9
def getReview(self, reviewlink):
    # Assumes `from urllib.request import urlopen, Request`,
    # `from bs4 import BeautifulSoup as soup`, and the `ssl`/`re` modules.
    review = urlopen(Request(reviewlink),
                     context=ssl._create_unverified_context())
    review_soup = soup(review, 'lxml')
    root_container = review_soup.find("img",
                                      attrs={"src": re.compile('album')})
    score_container = root_container.find_next_sibling('div').find('span')
    name_container = root_container.find_next_sibling('h1').find('a')
    album_container = root_container.find_next_sibling('h1').find('span')
    return Review(float(score_container.text), reviewlink,
                  name_container.text, album_container.text)
Example #10
def findMaterials(link):
    # Parse the given link into some Beautiful Soup
    req = requests.get(link).text
    reviews = BeautifulSoup(req, 'html.parser')

    # Set up the result variables
    reviewAuthor = []
    reviewPosition = []
    reviewCompany = []
    reviewRating = []
    sectionHeading = []
    sectionText = []
    sectionDate = ''

    # Find the author's name (if there is one)
    for review in reviews.find_all('span', {'itemprop': 'author'}):
        reviewAuthor.append(review.contents[0].text)

    # Find the author's position and company (if applicable)
    for review in reviews.find_all('span', {'class': 'user-info'}):
        reviewPosition.append(review.contents[0].text)
        reviewCompany.append(review.contents[1].text)

    # Find what the user rated Sitefinity
    reviewRating = reviews.find_all('span', class_='number')[0].text

    # Walk the nested contents[] chains for all of the headings and text
    # and append them to this function's lists (fragile: this depends on
    # the exact markup of the page)
    for review in reviews.find_all('div', {'class': 'description'}):

        # Collect the six review section headings
        for head in range(6):
            sectionHeading.append(review.contents[0].contents[0].contents[1].contents[head].contents[0].contents[0].contents[0].contents[0].text)

        # Collect the six review section bodies
        for body in range(6):
            sectionText.append(" %s" % review.contents[0].contents[0].contents[1].contents[body].contents[1].contents[0].contents[0].contents[0].text)

    # Wrap the review information into a dictionary for easy handling
    reviewDict = dict(zip(sectionHeading, sectionText))

    # Get the date of the review from the review's URL; the positional
    # slice assumes a fixed URL layout
    sectionDate = link[56:-9]
    days = date(int(sectionDate[:-6]), int(sectionDate[5:-3]), int(sectionDate[8:]))

    # Create a new review using our Review class, and return that review
    rev = Review(reviewAuthor, reviewPosition, reviewCompany, reviewRating, reviewDict, days)
    print("Review created for %s..." % rev.name[0])
    sys.stdout.flush()
    return rev
Example #11
def render_review():
    form = reviewForm(request.form)
    if request.method == 'POST' and form.validate():
        review = form.review.data
        s1 = Review(review)
        mag_db = root.child("review")
        mag_db.push({
            # "username": session["username"],
            'review': s1.get_review(),
            # "rating":s1.get_rating(),
            # "companyname":s1.get_rating()
        })
        flash("Thank You !! We Appreciate your Review :) ", "success")

    return render_template('Review.html', form=form)
Example #12
    def load_review_from_csv(self, infile):
        # "rb" breaks csv.DictReader on Python 3; open in text mode instead
        with open(infile, "r", newline="") as csvfile:
            reader = csv.DictReader(csvfile)

            # init field names & label column
            self.field_names = reader.fieldnames
            self.column_label = self.field_names[-1]

            for row in reader:
                review = Review(row[self.field_names[0]],
                                row[self.field_names[1]])
                self.dataset.append(review)
                if row[self.column_label] not in self.label_values:
                    self.label_values.append(row[self.column_label])

        return infile
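
For reference, a sketch of an input file this loader accepts (hypothetical column names): the first two columns feed the Review constructor and the last column is collected as a label value.

# sample.csv (hypothetical):
#   id,text,label
#   1,Great food,pos
#   2,Too noisy,neg
loader.load_review_from_csv("sample.csv")  # 'loader' stands in for the enclosing object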
Example #13
    def collection(self, reviews):
        all_reviews = []
        factory = ElementFactory()

        for r in reviews:
            content = factory.content(r)
            five_stars = factory.five_stars(r)
            date = factory.date(r).text
            author = factory.author(r).text
            title = factory.title(r).text
            rating = 5 if five_stars is not None else 0

            if rating == 5:
                all_reviews.append(Review(title, content, rating, date, author))
        return all_reviews
Example #14
    def test_add_review(self):
        # NB: the expected string must match the app's response exactly
        expected = "Thank your for giving a review."
        orders = [self.order1, self.order2, self.order3]
        count = 3
        for i, k in enumerate(self.drivers):
            # first driver reviews order1, second order2, the rest order3
            order_id = orders[min(i, 2)]
            result = Review().start(self.drivers[k], self.user, order_id,
                                    str(count), "Unit testing")
            self.assertEqual(expected, result)
            count += 1
Example #15
def parse_album_review(text, site):
    """Return date, artist, album, and body of review for page"""
    soup = BeautifulSoup(text, "html.parser")

    if site == "exclaim":
        date = dateparser.parse(
            soup.find("div", {
                "class": "article-published"
            }).get_text()[10:])
        author = soup.find("div", {"class": "article-author"}).get_text()[3:]
        try:  # Some reviews don't have ratings
            rating = soup.find("div", {"class": "article-rating"}).get_text()
        except AttributeError:
            rating = ''
        artist = soup.find("span", {"class": "article-title"}).get_text()
        try:
            album = soup.find("span", {"class": "article-subtitle"}).get_text()
        except AttributeError:
            album = ''
        review = soup.find("div", {"class": "article"}).get_text()
        if rating != '':
            try:
                review = re.split(r'(\n\d{1,2}\n)', review)[2]
            except IndexError:
                pass
        review = re.split(r'(\([^()]+\)\n\n)', review)[0]

    elif site == "rollingstone":

        # date will need to be further processed
        date = dateparser.parse(
            soup.find("time", {
                "class": "content-published-date"
            }).get_text())

        author = soup.find("a", {"class": "content-author"}).get_text()

        # title does not hold artist and album in structured way
        title = soup.find("h1", {"class": "content-title"}).get_text()

        # Work in progress -- use URL instead?
        # from urllib.parse import urlparse
        # url = soup.find('link', {'rel': 'canonical'}).get('href')
        # parsed_url = urlparse(url)
        # # get last part of URL, split it into words, and remove the last word which is some id
        # # should be left with
        # url_title = parsed_url.path.split("/")[-1].split("-")[:-1]
        # url_title = url_title.split("-")

        if title.startswith("Review:"):
            # str.lstrip strips characters, not a prefix, so slice instead
            title = title[len("Review:"):]
        # if ":" in title:
        #     artist, album = title.strip().split(": ")
        # else:
        artist, album = title.strip(), ""

        # Reviews are nested <p> in the article-content <div>
        # I want to join contents of all <p>s, unescape the HTML, and remove newlines and tabs
        review = " ".join([
            p.get_text() for p in soup.find("div", {
                "class": "article-content"
            }).find_all("p")
        ])

        rating = len(soup.select("span.percentage.full"))
        if len(soup.select("span.percentage.half")) == 1:
            rating += 0.5

        if not review:
            review = ""

    return Review(date=date,
                  author=author,
                  rating=rating,
                  artist=artist,
                  album=album,
                  review=review)
Example #16
from Rating import Rating
from Experience import Experience
from Review import Review
from User import User
from Recomendation import Recomendation

print(0, "->", Rating(0, 0, 2))
print(1, "->", Rating(1, 0, 2))
print(2, "->", Rating(2, 0, 2))
#Rating(9,0,2)
#Rating("g")
experiencia1 = Experience("Buenas migas", "Restaurante", 1)
experiencia2 = Experience("Telepizza", "Restaurante")
experiencia2.setId(2)
valoracion1 = Rating(1)
resenya1 = Review(experiencia1, valoracion1)
recomendacion1 = Recomendation(experiencia2, Rating(2))
user1 = User("nombre", "contraseña")
user1.setId(1)
user1.addRecomendation(
    Recomendation(Experience("Dominus", "Restaurante", 1), Rating(3)))
user1.addReview(Review(Experience("Dominus", "Restaurante", 3), Rating(4)))
user1.getRecomendations()[0].setId(
    (user1.getRecomendations()[0].getExperience().getId(), user1.getId()))
user1.getReviews()[0].setId(
    (user1.getReviews()[0].getExperience().getId(), user1.getId()))

user2 = User("otroUser", "otraPassword", id=3)
user2.setRecomendations(user1.getRecomendations())
user3 = User("copion", "copionpassword", user1.getReviews(),
             user2.getRecomendations(), 3)
Example #17
    def reviewDetail(self, review_container, poi_id):
        """Extract one review's fields from its container and store it."""
        # --- uid ---
        uid = None
        try:
            try:
                original_uid = review_container.find_elements_by_css_selector(
                    ".memberOverlayLink")[0].get_attribute("id")
                long_uid = original_uid.split("_")[1]
                # keep only the part before the first dash, if any
                uid = long_uid.split("-")[0]
            except Exception:
                original_uid = review_container.find_elements_by_css_selector(
                    ".member_info div")[0].get_attribute("class")
                try:
                    uid = original_uid.split("_")[1]
                except Exception:
                    uid = review_container.find_element_by_css_selector(
                        ".username.mo span").text
        except Exception:
            uid = None
        # --- review_title ---
        review_title = None
        try:
            review_title = review_container.find_element_by_css_selector(
                "span.noQuotes").text
        except Exception:
            review_title = None
        # --- review_rating ---
        review_rating = None
        try:
            review_rating_string = review_container.find_element_by_css_selector(
                ".rating span.ui_bubble_rating").get_attribute("class")
            review_rating = int(
                review_rating_string.split(" ")[1].split("_")[1]) / 10
        except Exception:
            review_rating = None
        # --- ratingDate ---
        ratingDate = None
        try:
            ratingDate = review_container.find_element_by_css_selector(
                ".ratingDate.relativeDate").get_attribute("title")
        except Exception:
            ratingDate = None
        # --- review text ---
        review = None
        try:
            review = review_container.find_element_by_css_selector(
                ".entry .partial_entry").text
        except Exception:
            review = None
        review = Review(poi_id, uid, review_title, review_rating, review,
                        ratingDate)
        print(review)
        if self.insertToDB_gate:
            self.db.insert(review, "review")
            print("insert ", review.review_title)
Example #18
def reviewList(self, movie_code):
    url = ("http://api.allocine.fr/rest/v3/reviewlist"
           "?partner=%s&format=json&code=%s" % (PARTNER_CODE, movie_code))
    # urllib.urlopen is Python 2 only; Python 3 needs urllib.request.urlopen
    d = json.loads(urllib.request.urlopen(url).read())
    return [Review(**i) for i in d["feed"]["review"]]
Example #19
    def __init__(self, url, cnx):

        #temporary for testing
        #url = '/movie/bond-23'

        #skip this. metacritic's fault
        if (url == '/movie/who-the-%-is-jackson-pollock'):
            return
        #values that go into database
        values = {}
        values['title'] = ''
        values['url'] = ''
        values['cScore'] = ''
        values['uScore'] = ''
        values['date'] = ''

        #get all of those single values then put them in the movie table
        #then find all of the reviews and put them in the reviews table with the movie id

        #time to get the stuff from the movie page

        #get movie page
        response = requests.get('http://www.metacritic.com' + url,
                                allow_redirects=True)

        if response.status_code == 400:
            return
        # reset the url to the one that was redirected to
        url = re.sub(r'http://www\.metacritic\.com', '', response.url)

        #convert html to string
        mainPageHtml = response.content
        #make the soup (an explicit parser avoids a bs4 warning)
        mainPageSoup = BeautifulSoup(mainPageHtml, 'html.parser')

        #save the url
        values['url'] = url

        #get the title
        results = mainPageSoup.find_all('span', {'itemprop': 'name'})
        values['title'] = results[0].string
        values['title'] = str(values['title'].strip())  #get rid of weird whitespace
        #get the critic score
        results = mainPageSoup.find_all('span', {'itemprop': 'ratingValue'})
        values['cScore'] = str(results[0].string)

        #get the user score
        results = mainPageSoup.find_all('a', {
            'class': 'metascore_anchor',
            'href': url + '/user-reviews'
        })

        #if for some reason it can't find the user score. it happens even though it shouldn't
        if (len(results) > 0):
            values['uScore'] = str(results[0].div.string)
            if (values['uScore'] == 'tbd'):
                values['uScore'] = str('-1')
        else:
            values['uScore'] = str('-1')

        #get the year
        results = mainPageSoup.find_all('span', {
            'class': 'data',
            'itemprop': 'datePublished'
        })
        date = str(results[0].string.lstrip().rstrip())
        matches = re.match(r'([a-zA-Z]{3})\s(\d+),\s(\d{4})', date)
        if (matches):
            month = {
                'Jan': '01',
                'Feb': '02',
                'Mar': '03',
                'Apr': '04',
                'May': '05',
                'Jun': '06',
                'Jul': '07',
                'Aug': '08',
                'Sep': '09',
                'Oct': '10',
                'Nov': '11',
                'Dec': '12'
            }[matches.group(1)]
            day = matches.group(2)
            year = matches.group(3)
            values['date'] = year + '-' + month + '-' + day
        else:
            values['date'] = None
        #save to the database
        cursor = cnx.cursor()
        query = ("select movie_id from movies where movie_url = %s")

        inDB = False
        mid = 0
        cursor.execute(query, (str(values['url']), ))
        for (movie_id, ) in cursor:
            inDB = True
            mid = movie_id  # was assigned to an unused 'id' variable
        if not inDB:
            #make a new row for this critic
            if (values['date'] is not None):
                add_movie = ("INSERT INTO movies"
                             "(title, movie_url, uScore, cScore, release_date)"
                             "VALUES (%s, %s, %s, %s, %s)")
                movie_data = (values['title'], values['url'], values['uScore'],
                              values['cScore'], values['date'])
            else:
                add_movie = ("INSERT INTO movies"
                             "(title, movie_url, uScore, cScore)"
                             "VALUES (%s, %s, %s, %s)")
                movie_data = (values['title'], values['url'], values['uScore'],
                              values['cScore'])
            cursor.execute(add_movie, movie_data)
            mid = cursor.lastrowid
            cnx.commit()
        cursor.close()

        #get the critic reviews
        #get html
        criticPage = openUrl(url)
        criticSoup = BeautifulSoup(criticPage, 'html.parser')

        criticReviews = criticSoup.find_all(
            'div', {'class': 'module reviews_module critic_reviews_module'})
        if (len(criticReviews) > 0):
            reviews = criticReviews[0].find_all('div',
                                                {'class': 'review_content'})
        else:
            print('ERROR:' + url)
            reviews = []

        for r in reviews:
            Rev = Review(mid, values['url'], r, cnx)