Esempio n. 1
0
    def solveProfileUp(self):
        """Parse the upper part of a reviewer profile page into attributes.

        Looks in ``self.html`` for the first ``div`` whose class attribute is
        exactly ``a-row profile-details``. If there is none, the method
        returns immediately without setting anything. Otherwise it sets:

        * ``rName``    - reviewer name, ``"N/A"`` when missing or empty
        * ``rRank``    - result of ``CommonTool.strToInt`` on the rank text,
          or ``'N/A'`` when no rank could be located
        * ``tRev1``, ``tRev10``, ``tRev50``, ``tRev100``, ``tRev500``,
          ``tRev1000``, ``tRevHall`` - 0/1 badge flags (at most one is 1)
        * ``rReal``    - always reset to 0 here
        * ``interest`` - interests text with newlines as ``<br>``, or ``'N/A'``
        * ``aboutMe``  - all "About" paragraphs concatenated (newlines as
          ``<br />``), or ``'N/A'`` when there are none
        * ``email``, ``webPage`` - 1 if the corresponding link exists, else 0
        * ``rNum``     - review count, 0 when it cannot be found
        * ``helpRate`` - helpfulness as a fraction (percentage / 100.0)
        * ``hVote``, ``tVote`` - helpful / total vote counts

        Returns:
            None.
        """
        containers = self.html.xpath("//div[@class='a-row profile-details']")
        if not containers:
            return
        profile = containers[0]

        def node_text(node):
            # ``.text`` can be None (the original code already tested for it
            # in one place); treat that the same as an empty string.
            text = node.text
            return text.strip() if text is not None else ''

        # Reviewer Name
        headings = self.html.xpath(".//div[@class='a-section']/h1")
        self.rName = (node_text(headings[0]) if headings else '') or "N/A"

        # Reviewer Ranking
        # Default first so the attribute exists even without a profile-info
        # div. Each div is evaluated on its own; when several exist the last
        # one decides.
        self.rRank = 'N/A'
        for info in profile.xpath(".//div[@class='profile-info']"):
            bold = info.xpath(
                ".//div[@class='a-row']/span[@class='a-size-large a-text-bold']")
            if bold:
                rank_text = node_text(bold[0])
                if rank_text:
                    self.rRank = CommonTool.strToInt(
                        rank_text.replace('#', ''))
                else:
                    self.rRank = 'N/A'
                continue

            # Fallback: the rank is not in the large bold span, so search the
            # small secondary spans of the whole profile for a "#<rank>" text.
            print("entered else in ranking")
            spans = profile.xpath(
                ".//span[@class='a-size-small a-color-secondary']")
            print(spans)
            labelled = None
            for span in spans:
                if span.text is not None and "ranking" in span.text:
                    labelled = span  # the last matching span wins
            print("aranking  %s" % (
                labelled if labelled is not None else spans,))

            if labelled is not None:
                # The first child is inspected first (the only place the
                # previous code looked), then the span's own text.
                candidates = []
                if len(labelled) > 0:
                    candidates.append(node_text(labelled[0]))
                candidates.append(node_text(labelled))
            elif spans:
                candidates = [node_text(spans[0])]
            else:
                candidates = []

            self.rRank = 'N/A'
            for candidate in candidates:
                if "#" in candidate:
                    parts = candidate.split('#')
                    print("rank %s" % (parts,))
                    self.rRank = CommonTool.strToInt(parts[1])
                    break

        # Top Reviewer && Vine Voice
        self.tRev1 = 0
        self.tRev10 = 0
        self.tRev50 = 0
        self.tRev100 = 0
        self.tRev500 = 0
        self.tRev1000 = 0
        self.tRevHall = 0
        self.rReal = 0

        badge_flags = (
            ('#1 REVIEWER', 'tRev1'),
            ('TOP 10 REVIEWER', 'tRev10'),
            ('TOP 50 REVIEWER', 'tRev50'),
            ('TOP 100 REVIEWER', 'tRev100'),
            ('TOP 500 REVIEWER', 'tRev500'),
            ('TOP 1000 REVIEWER', 'tRev1000'),
            ('HALL OF FAME', 'tRevHall'),
        )
        badges = profile.xpath(
            ".//span[@class='a-color-link pr-c7y-badge a-text-bold']")
        if badges:
            badge_text = node_text(badges[0])
            # Only the first label found in the badge text sets its flag.
            for label, attr in badge_flags:
                if label in badge_text:
                    setattr(self, attr, 1)
                    break

        # INTEREST
        self.interest = 'N/A'
        interest_rows = profile.xpath(
            ".//div[@class='a-row a-spacing-medium profile-interests']")
        if interest_rows:
            interest_spans = interest_rows[0].xpath(
                "./div/span[@class='a-size-small']")
            if interest_spans:
                self.interest = node_text(interest_spans[0]) \
                    .replace('\r', '').replace('\n', '<br>')

        # ABOUT ME
        paragraphs = profile.xpath(
            ".//span/text()[normalize-space(.)='About']/parent::*/\
            following-sibling::div/div[1]/span/p")
        print("%s abtme" % (paragraphs,))
        if paragraphs:
            # Join every paragraph; resetting the accumulator inside the loop
            # used to discard all but the last one.
            self.aboutMe = ''.join(
                node_text(p).replace('\r', '').replace('\n', '<br />')
                for p in paragraphs)
        else:
            self.aboutMe = 'N/A'

        # Email && webpage: only presence is recorded, not the link itself.
        email_links = profile.xpath(
            ".//div[@class='a-row break-word pr-link']/a")
        self.email = 1 if email_links else 0

        web_links = profile.xpath(
            ".//div[@class='a-row customer-website pr-link']/a/span")
        self.webPage = 1 if web_links else 0

        # rNum
        self.rNum = 0
        count_spans = profile.xpath(
            ".//div[@class='a-column a-span7 pr-link']/a/span")
        if count_spans:
            label = node_text(count_spans[0])
            count = '0'
            if "Reviews" in label:
                print(label)
                # The count is the text between the first '(' and the next
                # ')'; a label without parentheses keeps the '0' default.
                if '(' in label:
                    count = label.split('(')[1].split(')')[0]
            self.rNum = CommonTool.strToInt(count)

        # helpRate
        self.helpRate = 0.0
        helpful_rows = profile.xpath(
            ".//div[@class='a-row customer-helpfulness']")
        if helpful_rows:
            percent_spans = helpful_rows[0].xpath(
                ".//span[@class='a-size-large a-text-bold']")
            if percent_spans:
                # The last character is dropped before conversion
                # (presumably a trailing '%' - confirm against the page).
                try:
                    self.helpRate = \
                        int(node_text(percent_spans[0])[:-1]) / 100.0
                except ValueError:
                    # Unparseable text is treated like a missing value.
                    self.helpRate = 0.0

        # hVote && tVote
        self.hVote = 0
        self.tVote = 0
        vote_spans = profile.xpath(
            "./span/div/div/div/span[@class='a-size-small a-color-secondary']")
        if vote_spans:
            parts = node_text(vote_spans[0]).split(' of ')
            if len(parts) >= 2:
                # The first character of the left part and the last character
                # of the right part are dropped (presumably the surrounding
                # parentheses of "(H of T)" - confirm against the page).
                self.hVote = CommonTool.strToInt(parts[0][1:].strip())
                self.tVote = CommonTool.strToInt(parts[1][:-1].strip())
Esempio n. 2
0
def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Scrape every review on one review-list page and save each of them.

    The page at ``url`` is fetched with ``MyHtml.getHtml``. When the review
    count shown on the page is greater than zero, one ``Review`` object is
    filled per ``div`` with an ``id`` under ``div#cm_cr-review_list``. Its
    comments are loaded with ``getComments()`` and it is passed to
    ``saveReview``.

    Args:
        asin: Product identifier stored on every review as ``asin``.
        rank: Running counter; incremented once per review and stored as
            the review's ``helpfulRank``.
        url: Address of the review-list page.
        fetchDate: Date-like value; ``fetchDate - review date`` must yield
            an object with a ``days`` attribute.
        bookPublishDate: Either the string ``'N/A'`` or a date-like value
            that can be subtracted from the review date.

    Returns:
        The updated ``rank`` (unchanged when the page reports no reviews).

    Raises:
        IndexError: When one of the mandatory page elements (review count,
            review list, rating, title, date, review text, comment count)
            is missing.
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    html = MyHtml.getHtml(url)
    print("solving Review Page")
    countOfReviews = int(html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span'
    )[0].text.strip())
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)
    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line
        # The "N of M people found ..." sentence is read from the vote span
        # or, failing that, from the span nested one level deeper. The
        # nested lookup used to run only when the outer span was absent, a
        # case in which the nested span cannot exist either.
        voteNodes = divReview.xpath(
            './/span[@class="a-size-base cr-vote"]/span[1]/span[1]')
        print("helpful Match %s" % (voteNodes,))
        nestedNodes = divReview.xpath(
            './/span[@class="a-size-base cr-vote"]/span[1]/span[1]/span[1]')
        helpfulMatch = None
        for node in voteNodes[:1] + nestedNodes[:1]:
            if node.text is not None:
                helpfulMatch = hlre.match(node.text.strip())
                if helpfulMatch:
                    break

        print(helpfulMatch)
        if helpfulMatch:
            aReview.helpful = int(helpfulMatch.group(1))
            aReview.total = int(helpfulMatch.group(2))
            if aReview.total:
                # Floor division first: the ratio is truncated to two
                # decimals, which is what the Python 2 expression produced.
                aReview.helpfulness = \
                    aReview.helpful * 100 // aReview.total / 100.0
            else:
                # "0 of 0" would otherwise divide by zero.
                aReview.helpfulness = 0
        else:
            aReview.helpful = 0
            aReview.total = 0
            aReview.helpfulness = 0

        print("getting rate data")

        rateData = divReview.xpath(
            './/span[@class="a-icon-alt"]')[0].text.strip()
        # The rating is kept as the first space-separated token (a string).
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath(
            './/a[@class="a-size-base a-link-normal review-title '
            'a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")

        reviewer = divReview.xpath(
            './/span[@class="a-size-base a-color-secondary review-byline"]/a')
        if reviewer:
            # Fifth '/'-separated piece of the profile link, query removed.
            aReview.reviewerID = \
                reviewer[0].attrib['href'].split('/')[4].split('?')[0]
        # reviewerID is assigned only when a byline link exists; getattr
        # keeps this trace line from raising if Review() has no default.
        print("reviewerId %s" % (getattr(aReview, 'reviewerID', None),))

        aReview.date = CommonTool.strToDate(divReview.xpath(
            './div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - bookPublishDate).days

        # format line
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except IndexError:
            aReview.fromFormat = ''

        # NOTE(review): ``verified`` is left untouched when the badge span is
        # absent - confirm that Review() supplies a default.
        spanVerifiedPurchase = divReview.xpath(
            './/span[@class="a-size-mini a-color-state a-text-bold"]')
        if spanVerifiedPurchase:
            badgeText = spanVerifiedPurchase[0].text.strip()
            aReview.verified = 1 if badgeText == "Verified Purchase" else 0

        # review text line
        divReviewText = divReview.xpath(
            './/div[@class="a-row review-data"]/span')[0]
        aReview.description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = aReview.description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(divReview.xpath(
            './/div[@class="a-row a-spacing-top-small review-comments"]'
            '/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        saveReview(review=aReview)

    return rank