Example #1
File: Xls2mongo.py Project: kusum18/yelp
 def processReviewXls(self, sheet, row):
     review = Review()
     start_col = 0
     end_col = 11
     for col in range(start_col, end_col):
         if col == 0:
             review.reviewId = sheet.cell_value(row, col)
         elif col == 1:
             review.review = sheet.cell_value(row, col)
         elif col == 2:
             review.Food = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 3:
             review.Drinks = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 4:
             review.Ambiance = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 5:
             review.Service = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 6:
             review.Location = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 7:
             review.Deals = self.XlsCheckValue(sheet.cell_value(row, col))
         elif col == 8:
             review.Price = self.XlsCheckValue(sheet.cell_value(row, col))
         else:
             pass  # columns 9 and 10 exist in the xls but are intentionally not mapped onto the Review
     return review
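A minimal driving sketch, assuming the workbook is read with xlrd (the usual source of the sheet.cell_value(row, col) API); `loader` is a hypothetical instance of the class that defines processReviewXls:

    import xlrd  # assumption: the .xls file is opened with xlrd
    book = xlrd.open_workbook("reviews.xls")  # hypothetical file name
    sheet = book.sheet_by_index(0)
    # skip the header row and convert every remaining row into a Review
    reviews = [loader.processReviewXls(sheet, row) for row in range(1, sheet.nrows)]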
Example #2
File: Util.py Project: hseran/IAR
 def shuffleReviews(input_file, output_file):
     reviewList = Review.readReviewsFromXML(input_file)
     if reviewList is None or len(reviewList) == 0:
         print "No reviews in input file"
         return
     
     random.shuffle(reviewList)
     Review.serializeToXML(reviewList, output_file)
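Note that random.shuffle reorders the list in place and returns None, which is why the shuffled list is serialized afterwards rather than returned from the call:

    import random
    items = [1, 2, 3]
    random.shuffle(items)  # in-place; the return value is None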
Example #3
    def stemmingStopWRemoval(self, review, vocab):
        ''' Does Following things:
        1. Tokenize review into sentences, and then into words
        2. Remove stopwords, punctuation and stem each word
        3. Add words into vocab 
        4. Make Sentence objects and corresponding Review object
        '''
        reviewObj = Review()
        #copying ratings into reviewObj
        for ratingType, rating in review["Ratings"].items():
            reviewObj.ratings[ratingType] = rating
        reviewObj.reviewId = review["ReviewID"]

        stemmer = PorterStemmer()
        reviewContent = review["Content"]
        #TODO: Append title too!
        sentencesInReview = nltk.sent_tokenize(reviewContent)
        puncs = set(string.punctuation)  #punctuation marks
        for sentence in sentencesInReview:
            wordList = []
            words = nltk.word_tokenize(sentence)
            for word in words:
                if not all(c.isdigit() or c in puncs for c in word):
                    word = word.lower()
                    if word not in self.stopWords:
                        word = stemmer.stem(word)  #word is already lowercased above
                        vocab.append(word)
                        wordList.append(word)
            if wordList:
                sentenceObj = Sentence(wordList)
                reviewObj.sentences.append(sentenceObj)
        if reviewObj.sentences:
            self.allReviews.append(reviewObj)
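For reference, a small standalone check of what the stemming step does, assuming nltk is installed:

    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    print(stemmer.stem("running"))  # -> "run"
    print(stemmer.stem("reviews"))  # -> "review"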
Example #4
File: Util.py Project: hseran/IAR
 def seperateByRating(input_file, output_dir):
     reviewList = Review.readReviewsFromXML(input_file)
     high5 = []
     low1 = []
     medium = []
     low2 = []
     for review in reviewList:
         if str(review.getReviewRating()) == '5.0':
             review.setPolarity('1')
             review.setConfidence('1')
             high5.append(review)
         elif str(review.getReviewRating()) == '1.0':
             review.setPolarity('-1')
             review.setConfidence('1')
             low1.append(review)
         elif str(review.getReviewRating()) == '2.0':
             review.setPolarity('-1')
             review.setConfidence('1')
             low2.append(review)
         else:
             medium.append(review)
     
     Review.serializeToXML(high5, output_dir + "/high.xml")
     Review.serializeToXML(low1, output_dir + "/low1.xml")
     Review.serializeToXML(low2, output_dir + "/low2.xml")
     Review.serializeToXML(medium, output_dir + "/medium.xml")
     print "5: " + str(len(high5))
     print "1: " + str(len(low1))
     print "2: " + str(len(low2))       
Example #5
File: Util.py Project: hseran/IAR
 def separateLabeledAndUnlabeled(file, output_dir):
     reviewList = Review.readReviewsFromXML(file)
     labeled = []
     unlabeled = []
     
     for review in reviewList:
         if review.getReviewPolarity().strip() != '':
             labeled.append(review)
         else:
             unlabeled.append(review)
     Review.serializeToXML(labeled, output_dir + "/labeled-neu.xml")
     Review.serializeToXML(unlabeled, output_dir + "/unlabeled-neu.xml")
Example #6
 def process(self):
     with codecs.open(SOURCE_TRAIN_FILE, encoding='utf-8') as r:
         lines = r.readlines()
         count = 0
         for line in lines:
             print count
             count += 1
             review = Review()
             self.stage_initial(review, line)
             self.stage_add_jj(review)
             self.stage_add_key_word(review)
             self.stage_post_process(review)
             if review.is_valid():
                 self.review_list.append(review)
Example #7
File: Util.py Project: hseran/IAR
 def siftReviewsByPolarity(input_file, output_file, polarity):
     '''
     output_file will contain all reviews from input_file
     other than the ones labeled as polarity
     '''
     reviewList = Review.readReviewsFromXML(input_file)
     if reviewList is None or len(reviewList) == 0:
         print "No reviews in input file"
         return
     
     outList = []
     for review in reviewList:
         if str(review.getReviewPolarity()) == str(polarity):
             continue
         outList.append(review)
     Review.serializeToXML(outList, output_file)
Example #8
def render_review():
    form = reviewForm(request.form)
    if request.method == 'POST' and form.validate():
        review = form.review.data
        s1 = Review(review)
        mag_db = root.child("review")
        mag_db.push({
            # "username": session["username"],
            'review': s1.get_review(),
            # "rating":s1.get_rating(),
            # "companyname":s1.get_rating()
        })
        flash("Thank You !! We Appreciate your Review :) ", "success")

    return render_template('Review.html', form=form)
Example #9
 def generateKFolds(self, location = "./", trainingData = {}, validationData = {}):
     if self.reviews is None or len(self.reviews) == 0:
         print 'No data to work on'
         return
     i = 0
     
     import os
     if not os.path.isdir(location):
         location = "./"
     
     for training, validation in self.k_fold_cross_validation():
         i = i + 1
         Review.serializeToXML(training, location + "/train" + str(i) + ".xml")
         Review.serializeToXML(validation, location + "/valid" + str(i) + ".xml")
         trainingData[str(i)] = training
         validationData[str(i)] = validation
Example #10
    def dataset_from_contents_labels(self, contents, labels):
        arr_dataset = []
        for i in xrange(len(contents)):
            dr = Review(contents[i], labels[i])
            arr_dataset.append(dr)

        return self.dataset_from_array(arr_dataset)
Example #11
File: Util.py Project: hseran/IAR
 def countLabeledReviews(file):
     reviewList = Review.readReviewsFromXML(file)
     count = 0
     for review in reviewList:
         if review.getReviewPolarity().strip() != '':
             count += 1
     print count
Example #12
 def scrape_user_comment_list(self, raw_page=None):
     if not raw_page:
         raw_page = self.fetch_beer_page()
     self.reviews = []
     try:
         self.total_ratings
     except AttributeError:
         self.parse_metadata(raw_page)
     page = 1
     while len(self.reviews) < self.total_ratings:
         if page != 1:
             raw_page = self.fetch_beer_page(page=page)
         self.reviews += [Review(beer_uid=self.uid, user_uid=int(user_id),
                                 brewery_uid =self.brewery_id, topline_score=float(topline_score),
                                 aroma_score=int(aroma), apperance_score=int(apperance),
                                 taste_score=int(taste), palete_score=int(palete),
                                 overall_score=int(overall), user_loc=user_loc,
                                 date = datetime.datetime.strptime(date_str, '%b %d, %Y').date(),
                                 comment = comment) for (topline_score, aroma, apperance,
                                                     taste, palete, overall, user_id, user_name, user_loc,
                                                     date_str, comment) in \
                                                             Beer.reviews_regex.findall(raw_page)]
         page += 1
         if page - 1 > self.total_ratings / 8.0:
             logging.error(
                 'parsing should have completed, but did not, forcing.')
             break
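The date parsing above relies on strptime with the '%b %d, %Y' format; a quick standalone check of that format string:

    import datetime

    d = datetime.datetime.strptime('Jan 05, 2014', '%b %d, %Y').date()
    print(d)  # 2014-01-05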
Example #13
 def convertReview(self, serialized_dict):
     review = Review()
     for key in serialized_dict:
         if "review_" in key:
             value = serialized_dict[key]
             setattr(review, key, value)
     return review
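The loop copies only the review_-prefixed keys onto the object; a self-contained sketch of the same setattr pattern (SimpleNamespace stands in for the Review class):

    from types import SimpleNamespace

    serialized = {"review_id": 42, "review_text": "Great food", "_meta": "ignored"}
    review = SimpleNamespace()
    for key, value in serialized.items():
        if "review_" in key:
            setattr(review, key, value)
    print(review.review_id, review.review_text)  # 42 Great food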
Example #14
    def getAllReviews(self):
        self._c.execute("SELECT * FROM reviews")
        for row in self._c.fetchall():
            review = Review(row[0],row[1],row[2],row[3],row[4],row[5],row[6])

            if review.artist not in self.artists:
                self.artists[review.artist] = Artist(review.artist)
            if review.bnm == 1:
                self.artists[review.artist].bnms.append(review)
            self.artists[review.artist].reviews.append(review)
Example #15
 def test_review(self):
     review = Review("Review title", "This is the review content", "5",
                     "December 20, 2020", "Deco Oliveira")
     self.assertEqual(review.title, "Review title")
     self.assertEqual(review.content, "This is the review content")
     self.assertEqual(review.rating, "5")
     self.assertEqual(review.date, "December 20, 2020")
     self.assertEqual(review.author, "Deco Oliveira")
Example #16
    def delete(self):
        req_data = request.get_json()
        username = req_data['username']
        restaurant_id = req_data['restaurant_id']

        if Review.delete_rating(username, restaurant_id):
            return {'message': "Review deleted"}, 200
        else:
            return {'message': "No review to delete"}, 200
Example #17
    def from_json(json_filename, from_annotated=False):
        paper = Paper('', '', None, [])

        datas = []
        with io.open(json_filename, mode='rt', encoding='utf8') as json_file:
            for line in json_file:
                try:
                    data = json.loads(line.strip())
                    datas.append(data)
                except Exception as e:
                    print(line)
                    continue
        if len(datas) == 0: return None
        data = datas[-1]
        #print data
        # Read required fields.
        assert 'title' in data
        assert 'abstract' in data
        paper.TITLE = data['title']
        paper.ABSTRACT = data['abstract']

        if 'id' in data:
            if data['id'] == "":
                paper.ID = json_filename.split("/")[-1].split(".")[0]
            else:
                paper.ID = data['id']
        else:
            paper.ID = json_filename.split("/")[-1].split(".")[0]

        # Read optional fields.
        paper.AUTHORS = data['authors'] if 'authors' in data else None
        paper.CONFERENCE = data['conference'] if 'conference' in data else None
        paper.ACCEPTED = data['accepted'] if 'accepted' in data else None
        paper.SCORE = data['score'] if 'score' in data else None
        paper.PUBLICATION_TYPE = data[
            'publication_type'] if 'publication_type' in data else None
        paper.SCIENCEPARSE = data[
            'scienceparse'] if 'scienceparse' in data else None
        paper.KEYWORDS = data['keywords'] if 'keywords' in data else None
        paper.AUTHOR_EMAILS = data[
            'author_emails'] if 'author_emails' in data else None

        paper.DATE_OF_SUBMISSION = data[
            'DATE_OF_SUBMISSION'] if 'DATE_OF_SUBMISSION' in data else None

        paper.SUBJECTS = data['SUBJECTS'] if 'SUBJECTS' in data else None
        paper.COMMENTS = data['COMMENTS'] if 'COMMENTS' in data else None
        paper.VERSION = data['VERSION'] if 'VERSION' in data else None
        paper.HISTORIES = data['histories'] if 'histories' in data else None

        # Read reviews (mandatory).
        assert 'reviews' in data
        for review_data in data['reviews']:
            review = Review.from_json_object(review_data)
            paper.REVIEWS.append(review)
        return paper
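As an aside, each `data['key'] if 'key' in data else None` expression is equivalent to dict.get, which would shorten the optional-field block considerably:

    paper.AUTHORS = data.get('authors')  # None when the key is absent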
Example #18
File: DataMgr.py Project: crh5914/RevRec
    def __init__(self, filename, empty_user=set()):
        '''
        filename: inits the UBRR data from the input file
        empty_user: skip the reviews by this user (keeps the ratings)
        '''
        self.empty_user = empty_user

        ur_map = dict()
        br_map = dict()

        cnt = 0
        skipped = 0

        #read the file
        if filename.endswith('.gz'):
            f = gzip.open(filename, 'rt')  #text mode so each line is a str, not bytes
        else:
            f = open(filename, 'r')

        for line in f:
            vals = line.split("\t")
            if len(vals) < 4:
                continue

            u = vals[0]
            b = vals[1]
            r = float(vals[2])
            d = vals[3].strip()
            if u in self.empty_user:
                #we are skipping this review
                d = ''
                skipped += 1

            rev = Review(u, b, r, d)  #review obj

            #store biz -> list of reviews
            if not br_map.get(b):
                br_map[b] = []

            br_map[b].append(rev)

            #store user -> list of reviews
            if not ur_map.get(u):
                ur_map[u] = []

            ur_map[u].append(rev)

            cnt += 1

        self.biz_map = br_map
        self.user_map = ur_map

        f.close()
        print('Review Data Manager Initialized with ', cnt, ' reviews')
        print('Number of skipped users = ', len(self.empty_user))
        print('Number of skipped reviews = ', skipped)
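A hypothetical usage sketch (the class name DataMgr is inferred from the file name; the file name and user id are illustrative only):

    # skip one user's review text while keeping their ratings
    mgr = DataMgr('reviews.tsv.gz', empty_user={'user_123'})
    print(len(mgr.user_map), 'users,', len(mgr.biz_map), 'businesses')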
Example #19
 def getReview(self, reviewlink):
     review = urlopen(Request(reviewlink),
                      context=ssl._create_unverified_context())
     review_soup = soup(review, 'lxml')
     root_container = review_soup.find("img",
                                       attrs={"src": re.compile('album')})
     score_container = root_container.find_next_sibling('div').find('span')
     name_container = root_container.find_next_sibling('h1').find('a')
     album_container = root_container.find_next_sibling('h1').find('span')
     return Review(float(score_container.text), reviewlink,
                   name_container.text, album_container.text)
Example #20
def findMaterials(link):
    # Parse the given link into some Beautiful Soup
    req = requests.get(link).text
    reviews = BeautifulSoup(req, 'html.parser')

    # Set up the variables that will hold the scraped fields.
    reviewAuthor = []
    reviewPosition = []
    reviewCompany = []
    reviewRating = ''
    sectionHeading = []
    sectionText = []
    sectionDate = ''


    # Find the author's name (if there is one)
    for review in reviews.find_all('span', {'itemprop': 'author'}):
        reviewAuthor.append(review.contents[0].text)
    
    # Find the author's position and company (if applicable)
    for review in reviews.find_all('span', {'class': 'user-info'}):
        reviewPosition.append(review.contents[0].text)
        reviewCompany.append(review.contents[1].text)

    # Find what the user rated Sitefinity
    reviewRating = reviews.find_all('span', class_='number')[0].text

    # Walk the nested .contents[] chains for all of the headings and text
    # and append them to the lists above
    for review in reviews.find_all('div', {'class': 'description'}):
        
        # Receive review section headings
        for head in range(6):
            sectionHeading.append(review.contents[0].contents[0].contents[1].contents[head].contents[0].contents[0].contents[0].contents[0].text)

        # Receive review section bodies
        for body in range(6):
            sectionText.append(" %s" % review.contents[0].contents[0].contents[1].contents[body].contents[1].contents[0].contents[0].contents[0].text)


    # Wrap up the review information into a dictionary for easy handling
    reviewDict = dict(zip(sectionHeading, sectionText))

    # Gets the date of the review from the review's URL
    sectionDate = link[56:-9]
    days = date(int(sectionDate[:-6]), int(sectionDate[5:-3]), int(sectionDate[8:]))

    # Create a new review using our Review class, and return that review
    rev = Review(reviewAuthor, reviewPosition, reviewCompany, reviewRating, reviewDict, days)
    print "Review created for %s..." % rev.name[0]
    sys.stdout.flush()
    return rev
Example #21
    def post(self):
        req_data = request.get_json()
        username = req_data['username']
        restaurant_id = req_data['restaurant_id']
        restaurant_name = req_data['restaurant_name']
        comment = req_data['comment']
        rating = req_data['rating']

        if Review.post_rating(username, restaurant_name, restaurant_id,
                              comment, rating):
            return {'message': "Review created"}, 200
        else:
            return {'message': "Already reviewed restaurant"}, 200
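A client-side sketch of exercising this handler with requests; the mount point (/reviews) and host are assumptions, since the routing is not shown in the excerpt:

    import requests

    resp = requests.post('http://localhost:5000/reviews', json={
        'username': 'alice',            # illustrative values
        'restaurant_id': 7,
        'restaurant_name': 'Morimoto',
        'comment': 'Great sushi',
        'rating': 5,
    })
    print(resp.json()['message'])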
Example #22
File: Util.py Project: hseran/IAR
 def labelTestFile(xml_test_file, weka_csv_results_file, output_file):
     '''
     this method takes the reviews xml file and weka results in CSV format,
     applies polarity and confidence to each review, and writes the resulting xml to output_file
     '''
     reviewList = Review.readReviewsFromXML(xml_test_file)
     
     results_file = open(weka_csv_results_file, "r")
     
     resultsList = results_file.readlines()
     
     if len(reviewList) != len(resultsList):
         print 'Different number of reviews and results'
         return
     
     counter = 0
     for review in reviewList:
         result = resultsList[counter].strip().split(',')
         counter += 1
         review.setPolarity( Util.getNumericLabel(result[2].split(':')[1]))
         review.setConfidence('0.9' if result[4] == '1' else result[4])
     
     print 'writing labelled test data to ' + output_file    
     Review.serializeToXML(reviewList, output_file)
Example #23
    def get_page(self, page):
        myUrl = 'http://android.myapp.com/myapp/app/comment.htm?apkName=com.msxf.loan&apkCode=15701&p=' + page + '&contextData=' + self.contextData
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(myUrl, headers=headers)
        myResponse = urllib.request.urlopen(req)
        myPage = myResponse.read()
        # encode converts a unicode string into a byte string in some encoding
        # decode converts a byte string in some encoding back into unicode
        unicodePage = myPage.decode("utf-8")

        jsondata = json.loads(unicodePage)
        if jsondata is not None and 'obj' in jsondata:
            obj = jsondata['obj']
            if obj is not None:
                if self.total == 0:
                    if 'total' in obj:
                        self.total = obj['total']
                commentDetailes = []  #initialize so the count below cannot hit an unbound name
                if 'commentDetails' in obj:
                    commentDetailes = obj['commentDetails']
                if 'contextData' in obj:
                    self.contextData = obj['contextData']

                self.crawlCount += len(commentDetailes)
                reviews = []
                for comment in commentDetailes:
                    review = Review()
                    review.appStore = 'myapp'
                    review.packageName = 'com.msxf.loan'

                    if 'content' in comment:
                        review.content = comment['content']
                    if 'nickName' in comment:
                        review.nickName = comment['nickName']
                    if 'score' in comment:
                        review.score = comment['score']
                    if 'versionCode' in comment:
                        review.versionCode = comment['versionCode']
                    if 'createdTime' in comment:
                        review.reviewTime = datetime.datetime.fromtimestamp(int(comment['createdTime'])).strftime(
                            '%Y-%m-%d %H:%M:%S')
                    reviews.append(review)
                ReviewsDataSource.insert(reviews)

        self.enable = self.crawlCount < self.total
Example #24
    def load_review_from_csv(self, infile):
        with open(infile, "rb") as csvfile:
            reader = csv.DictReader(csvfile)

            # init field names & label column
            self.field_names = reader.fieldnames
            self.column_label = self.field_names[-1]

            for rows in reader:
                review = Review(rows[self.field_names[0]],
                                rows[self.field_names[1]])
                self.dataset.append(review)
                if self.label_values.count(rows[self.column_label]) == 0:
                    self.label_values.append(rows[self.column_label])

        return infile
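The "rb" mode is Python 2 csv usage; a Python 3 port of the open call would use text mode with newline translation disabled, roughly:

    with open(infile, 'r', newline='') as csvfile:  # Python 3: csv wants text mode
        reader = csv.DictReader(csvfile)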
Example #25
    def from_softconf_dump(json_file, conference=None):
        with io.open(json_file, "r", encoding="utf8") as ifh:
            json_str = ifh.read()

        # print (json_str)
        json_data = json.loads(json_str)["submissions"]

        papers = []
        for i in range(len(json_data)):
            reviews = []
            for k in range(len(json_data[i]["reviews"])):
                review = Review.from_json_object(
                    json_data[i]["reviews"][k], k == i == 0)
                reviews.append(review)

            authors = json_data[i]["authors"] if "authors" in json_data[i] else None
            score = json_data[i]["score"] if "score" in json_data[i] else None
            accepted = json_data[i]["accepted"] if "accepted" in json_data[i] else None
            publication_type = json_data[i]["publication_type"] if "publication_type" in json_data[i] else None
            keywords = json_data[i]["KEYWORDS"] if "KEYWORDS" in json_data[i] else None
            author_emails = json_data[i]["AUTHOR_EMAILS"] if "AUTHOR_EMAILS" in json_data[i] else None
            date_of_submission = json_data[i]["DATE_OF_SUBMISSION"] if "DATE_OF_SUBMISSION" in json_data[i] else None

            paper = Paper(
                json_data[i]["title"],
                json_data[i]["abstract"],
                json_data[i]["id"],
                reviews,
                authors,
                conference,
                accepted,
                score,
                publication_type,
                None,
                keywords,
                author_emails,
                date_of_submission)

            papers.append(paper)
            # break

        return papers
Example #26
def queryTodayReviews(reviews):
    conn = sqlite3.connect('reviews.db')

    try:
        c = conn.cursor()
        for row in c.execute(
                "select nickName,content,reviewTime,appStore,versionCode,packageName,score from reviews WHERE date(reviews.reviewTime) = date('now')"):
            review = Review()
            review.nickName = row[0]
            review.content = row[1]
            review.reviewTime = row[2]
            review.appStore = row[3]
            review.versionCode = row[4]
            review.packageName = row[5]
            review.score = row[6]
            reviews.append(review)
    except BaseException as e:
        print('sql error : ' + str(e))

    c.close()
    conn.close()
    print('queryTodayReviews ' + str(len(reviews)) + ' row.')
Example #27
    def collection(self, reviews):
        all_reviews = []
        factory = ElementFactory()

        for r in reviews:

            content = factory.content(r)
            five_stars = factory.five_stars(r)
            date = factory.date(r).text
            author = factory.author(r).text
            title = factory.title(r).text
            rating = 5 if five_stars is not None else 0

            if rating == 5:
                review = Review(title, content, rating, date, author)
                all_reviews.append(review)
        return all_reviews
Example #28
    def test_add_review(self):
        expected = "Thank your for giving a review."
        is_first = True
        is_second = True
        count = 3
        for k in self.drivers:
            if is_first and is_second:
                order_id = self.order1
                is_first = False
            elif is_second:
                order_id = self.order2
                is_second = False
            else: 
                order_id = self.order3

            result = Review().start(self.drivers[k], self.user, order_id, str(count), "Unit testing")
            self.assertEqual(expected, result)
            count = count + 1
Example #29
 def load(self):
     for line in codecs.open(Train_File).readlines():
         curr_r = Review()
         curr_r.parsefromstring(line)
         self.data.append(curr_r)
     print "training data loaded, ", len(self.data)
Example #30
File: TestReview.py Project: hseran/IAR
from Review import Review

if __name__ == '__main__':
	Review.serializeToXML(Review.readReviewsFromXML('../low-rating-reviews.xml'), '../test.xml')
Example #31
File: Allocine.py Project: Angelaben/test
 def reviewList(self, movie_code):
     d = json.loads(
         urllib.urlopen(
             "http://api.allocine.fr/rest/v3/reviewlist?partner=%s&format=json&code=%s"
             % (PARTNER_CODE, movie_code)).read())
     return [Review(**i) for i in d["feed"]["review"]]
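Review(**i) unpacks each feed entry as keyword arguments, assuming the Review constructor accepts exactly the fields the API returns; the same mechanism in miniature (make_review is illustrative):

    def make_review(title, rating):
        return {'title': title, 'rating': rating}

    entry = {'title': 'Great movie', 'rating': 4}
    print(make_review(**entry))  # same as make_review(title='Great movie', rating=4)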
Example #33
 def get(self, username):
     reviews = Review.get_reviews_for_user(username)
     if reviews is not None:
         return {'reviews': reviews}, 200
     else:
         return {'message': 'Error getting reviews'}, 404
Example #34
from Rating import Rating
from Experience import Experience
from Review import Review
from User import User
from Recomendation import Recomendation

print(0, "->", Rating(0, 0, 2))
print(1, "->", Rating(1, 0, 2))
print(2, "->", Rating(2, 0, 2))
#Rating(9,0,2)
#Rating("g")
experiencia1 = Experience("Buenas migas", "Restaurante", 1)
experiencia2 = Experience("Telepizza", "Restaurante")
experiencia2.setId(2)
valoracion1 = Rating(1)
resenya1 = Review(experiencia1, valoracion1)
recomendacion1 = Recomendation(experiencia2, Rating(2))
user1 = User("nombre", "contraseña")
user1.setId(1)
user1.addRecomendation(
    Recomendation(Experience("Dominus", "Restaurante", 1), Rating(3)))
user1.addReview(Review(Experience("Dominus", "Restaurante", 3), Rating(4)))
user1.getRecomendations()[0].setId(
    (user1.getRecomendations()[0].getExperience().getId(), user1.getId()))
user1.getReviews()[0].setId(
    (user1.getReviews()[0].getExperience().getId(), user1.getId()))

user2 = User("otroUser", "otraPassword", id=3)
user2.setRecomendations(user1.getRecomendations())
user3 = User("copion", "copionpassword", user1.getReviews(),
             user2.getRecomendations(), 3)
Example #35
File: scrapyelp.py Project: hseran/IAR
	reviewObj.setReviewRating(rating)

#global variables
file_location = "../reviews.xml"

if __name__ == '__main__':
	hotel_url= ['http://www.yelp.com/biz/morimoto-new-york']   
	
	#variable to loop through pages
	i=0
	#variable to assign doc id to reviews
	objCount = 1
	#we store our reviews temporarily in this before we write to file
	buffer = []

	#crawl in a loop
	while i <= 1000:
		web_page = parse(hotel_url[0]+'?start='+str(i)).getroot()
		for review in web_page.cssselect('#bizReviews .externalReview'):
			obj = Review(objCount)
			myparser(obj, review)
			buffer.append(obj)
			objCount += 1
		i += 40
		print objCount
		#if we crawl too fast, site comes up with captcha
		time.sleep(10)
	
	Review.serializeToXML(buffer, file_location)

Example #36
File: Util.py Project: hseran/IAR
 def printCount(file):
     reviewList = Review.readReviewsFromXML(file)
     print str(len(reviewList))
Example #37
 def get(self, restaurant_id):
     reviews = Review.get_reviews_for_restaurant(restaurant_id)
     if reviews is not None:
         return {'reviews': reviews}, 200
     else:
         return {'message': 'Error getting reviews'}, 404
Example #38
File: Generate.py Project: hseran/IAR
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    trainingreviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    testReviews = Review.readReviewsFromXML("../old-test-data.xml")
    
    trainCorpus = Corpus(trainingreviews, lemmatizer, POS_tagging = True)
    '''this dictionary will be used for both training and validation data'''
    dictionary = Dictionary(trainCorpus)
    generator = FeatureGenerator(trainCorpus, dictionary, '../train.csv', weightScheme= FeatureWeight.TFIDF)
    generator.generateFeatures()
    
    testCorpus = Corpus(testReviews, lemmatizer, POS_tagging = True)
    generator = FeatureGenerator(testCorpus, dictionary, '../test.csv',weightScheme= FeatureWeight.TFIDF)
    generator.generateFeatures()
Example #39
    def __init__(self, url, cnx):

        #temporary for testing
        #url = '/movie/bond-23'

        #skip this. metacritic's fault
        if (url == '/movie/who-the-%-is-jackson-pollock'):
            return
        #values that go into database
        values = {}
        values['title'] = ''
        values['url'] = ''
        values['cScore'] = ''
        values['uScore'] = ''
        values['date'] = ''

        #get all of those single values then put them in the movie table
        #then find all of the reviews and put them in the reviews table with the movie id

        #time to get the stuff from the movie page

        #get movie page
        response = requests.get('http://www.metacritic.com' + url,
                                allow_redirects=True)

        if (response.status_code == 400):
            return
        url = re.sub(
            r'http://www\.metacritic\.com', '',
            response.url)  #resets the url to the one that was redirected to

        #convert html to string
        mainPageHtml = response.content
        #make the soup
        mainPageSoup = BeautifulSoup(mainPageHtml)

        #save the url
        values['url'] = url

        #get the title
        results = mainPageSoup.find_all('span', {'itemprop': 'name'})
        values['title'] = results[0].string
        values['title'] = str(
            values['title'].lstrip().rstrip())  #get rid of weird whitespace
        #get the critic score
        results = mainPageSoup.find_all('span', {'itemprop': 'ratingValue'})
        values['cScore'] = str(results[0].string)

        #get the user score
        results = mainPageSoup.find_all('a', {
            'class': 'metascore_anchor',
            'href': url + '/user-reviews'
        })

        #if for some reason it can't find the user score. it happens even though it shouldn't
        if (len(results) > 0):
            values['uScore'] = str(results[0].div.string)
            if (values['uScore'] == 'tbd'):
                values['uScore'] = str('-1')
        else:
            values['uScore'] = str('-1')

        #get the year
        results = mainPageSoup.find_all('span', {
            'class': 'data',
            'itemprop': 'datePublished'
        })
        date = str(results[0].string.lstrip().rstrip())
        matches = re.match(r'([a-zA-Z]{3})\s(\d+),\s(\d{4})', date)
        if (matches):
            month = {
                'Jan': '01',
                'Feb': '02',
                'Mar': '03',
                'Apr': '04',
                'May': '05',
                'Jun': '06',
                'Jul': '07',
                'Aug': '08',
                'Sep': '09',
                'Oct': '10',
                'Nov': '11',
                'Dec': '12'
            }[matches.group(1)]
            day = matches.group(2)
            year = matches.group(3)
            values['date'] = year + '-' + month + '-' + day
        else:
            values['date'] = None
        #save to the database
        cursor = cnx.cursor()
        query = ("select movie_id from movies where movie_url = %s")

        inDB = False
        mid = 0
        cursor.execute(query, (str(values['url']), ))
        for (movie_id, ) in cursor:
            inDB = True
            mid = movie_id  #reuse the existing row id; assigning to a fresh 'id' left mid at 0
        if (not inDB):
            #make a new row for this critic
            if (values['date'] is not None):
                add_movie = ("INSERT INTO movies"
                             "(title, movie_url, uScore, cScore, release_date)"
                             "VALUES (%s, %s, %s, %s, %s)")
                movie_data = (values['title'], values['url'], values['uScore'],
                              values['cScore'], values['date'])
            else:
                add_movie = ("INSERT INTO movies"
                             "(title, movie_url, uScore, cScore)"
                             "VALUES (%s, %s, %s, %s)")
                movie_data = (values['title'], values['url'], values['uScore'],
                              values['cScore'])
            cursor.execute(add_movie, movie_data)
            mid = cursor.lastrowid
            cnx.commit()
        cursor.close()

        #get the critic reviews
        #get html
        criticPage = openUrl(url)
        criticSoup = BeautifulSoup(criticPage)

        criticReviews = criticSoup.find_all(
            'div', {'class': 'module reviews_module critic_reviews_module'})
        if (len(criticReviews) > 0):
            reviews = criticReviews[0].find_all('div',
                                                {'class': 'review_content'})
        else:
            print('ERROR:' + url)
            reviews = []

        for r in reviews:
            Rev = Review(mid, values['url'], r, cnx)
Example #40
    def reviewDetail(self, review_container, poi_id):
        """--------------------------"""
        '''-------------uid-------------'''
        uid = None
        try:
            try:
                original_uid = review_container.find_elements_by_css_selector(
                    ".memberOverlayLink")[0].get_attribute("id")
                #print(original_uid)
                long_uid = original_uid.split("_")[1]
                long_uid_split = long_uid.split("-")
                if len(long_uid_split) > 0:
                    uid = long_uid_split[0]
                else:
                    uid = long_uid
            except:
                original_uid = review_container.find_elements_by_css_selector(
                    ".member_info div")[0].get_attribute("class")
                try:
                    long_uid = original_uid.split("_")[1]
                    uid = long_uid
                except:
                    uid = review_container.find_element_by_css_selector(
                        ".username.mo span").text
        except:
            uid = None
        '''-------------review_title-------------'''
        review_title = None
        try:
            review_title = review_container.find_element_by_css_selector(
                "span.noQuotes").text
        except:
            review_title = None
        '''-------------review_rating-------------'''

        review_rating = None
        try:
            review_rating_string = review_container.find_element_by_css_selector(
                ".rating span.ui_bubble_rating").get_attribute("class")
            review_rating = int(
                review_rating_string.split(" ")[1].split("_")[1]) / 10
        except:
            review_rating = None
        '''-------------ratingDate-------------'''
        ratingDate = None
        try:
            ratingDate = review_container.find_element_by_css_selector(
                ".ratingDate.relativeDate").get_attribute("title")
        except:
            ratingDate = None
        '''-------------review-------------'''
        review = None
        try:
            review = review_container.find_element_by_css_selector(
                ".entry .partial_entry").text
        except:
            review = None
        '''-------------print all data-------------'''
        # print("uid:",uid,"review_title:",review_title,"review_rating:",review_rating,"review:",review,"ratingDate:",ratingDate)
        review = Review(poi_id, uid, review_title, review_rating, review,
                        ratingDate)
        print(review)
        if self.insertToDB_gate:
            self.db.insert(review, "review")
            print("insert ", review.review_title)
Example #41
File: SplitReviews.py Project: hseran/IAR
	
	#output files
	unlabeled_file='../test-data.xml'
	labeled_file='../traning-data.xml'
	
	#lists for labeled and unlabeled reviews
	unlabeled=[]
	labeled=[]
	labeled_high=[]
	labeled_low=[]
	labeled_mid=[]

	for each_file in review_files:	
		
		#call the readReviewsFromXML
		reviews = Review.readReviewsFromXML(each_file)

		for each_review in reviews:

			#convert reviewId into int, which helps in sorting before saving to disk

			each_review.reviewId=int(each_review.getReviewId())

			#check and append if polarity is empty
		
			if (each_review.getReviewPolarity() == ""):
				unlabeled.append(each_review)
			elif (each_review.getReviewPolarity() == "-1"):
				labeled_low.append(each_review)
			elif(each_review.getReviewPolarity() == "0"):
				labeled_mid.append(each_review)
Example #42
def parse_album_review(text, site):
    """Return date, artist, album, and body of review for page"""
    soup = BeautifulSoup(text, "html.parser")

    if site == "exclaim":
        date = dateparser.parse(
            soup.find("div", {
                "class": "article-published"
            }).get_text()[10:])
        author = soup.find("div", {"class": "article-author"}).get_text()[3:]
        try:  # Some reviews don't have ratings
            rating = soup.find("div", {"class": "article-rating"}).get_text()
        except AttributeError as err:
            rating = ''
        artist = soup.find("span", {"class": "article-title"}).get_text()
        try:
            album = soup.find("span", {"class": "article-subtitle"}).get_text()
        except AttributeError as err:
            album = ''
        review = soup.find("div", {"class": "article"}).get_text()
        if rating != '':
            try:
                review = re.split(r'(\n\d{1,2}\n)', review)[2]
            except IndexError as err:
                pass
        review = re.split(r'(\([^()]+\)\n\n)', review)[0]

    elif site == "rollingstone":

        # date will need to be further processed
        date = dateparser.parse(
            soup.find("time", {
                "class": "content-published-date"
            }).get_text())

        author = soup.find("a", {"class": "content-author"}).get_text()

        # title does not hold artist and album in structured way
        title = soup.find("h1", {"class": "content-title"}).get_text()

        # Work in progress -- use URL instead?
        # from urllib.parse import urlparse
        # url = soup.find('link', {'rel': 'canonical'}).get('href')
        # parsed_url = urlparse(url)
        # # get last part of URL, split it into words, and remove the last word which is some id
        # # should be left with
        # url_title = parsed_url.path.split("/")[-1].split("-")[:-1]
        # url_title = urltitle.split("-")

        if title.startswith("Review:"):
            title = title[len("Review:"):]  #str.lstrip("Review:") would strip characters, not the prefix
        # if ":" in title:
        #     artist, album = title.strip().split(": ")
        # else:
        artist, album = title.strip(), ""

        # Reviews are nested <p> in the article-content <div>
        # I want to join contents of all <p>s, unescape the HTML, and remove newlines and tabs
        review = " ".join([
            p.get_text() for p in soup.find("div", {
                "class": "article-content"
            }).find_all("p")
        ])

        rating = len(soup.select("span.percentage.full"))
        if len(soup.select("span.percentage.half")) == 1:
            rating += 0.5

        if not review:
            review = ""

    return Review(date=date,
                  author=author,
                  rating=rating,
                  artist=artist,
                  album=album,
                  review=review)
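The re.split calls above use a capturing group, which makes re.split keep the separators in the result list; that is why index 2 is the text following the rating marker:

    import re

    parts = re.split(r'(\n\d{1,2}\n)', 'body\n8\ntail')
    print(parts)  # ['body', '\n8\n', 'tail'] -- parts[2] is the text after the rating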
Example #43
            return
        
        trainingData = {}
        validationData = {}
        self.generateKFolds(outdir, trainingData, validationData)        
        
        for i in range(1,self.k+1):
            print "generating features for fold " + str(i)          
            
            trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging)
            '''this dictionary will be used for both training and validation data'''
            dictionary = Dictionary(trainCorpus)
            generator = FeatureGenerator(trainCorpus, dictionary, outdir + '/train' + str(i) + '.csv', 
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()
            
            validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging)
            generator = FeatureGenerator(validCorpus, dictionary, outdir + '/valid' + str(i) + '.csv', 
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()

            
if __name__ == '__main__':
    reviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    print 'reviews: ' + str(len(reviews))
    kfg = KFoldGenerator(reviews, 10)
    kfg.generateFolds("../kfolds/linearSVM/unigrams-lemma-POS-tf-no-stop", lemmatizer, 
                      POS_tagging = True, weightScheme = FeatureWeight.TF,
                      includeRating=False, includeDocLength=False)