def solveProfileUp(self):
    """Fill reviewer attributes from the profile block of ``self.html``.

    Looks up the first ``div`` with class ``a-row profile-details`` in
    ``self.html`` and returns immediately, assigning nothing, when it is
    absent.  Otherwise the following attributes are assigned:

    * ``rName``    -- text of the ``a-section`` heading, or ``"N/A"``.
    * ``rRank``    -- ``CommonTool.strToInt`` of the ranking text, or
      ``'N/A'`` when no ranking can be located.
    * ``tRev1`` .. ``tRevHall`` -- 0/1 flags, at most one set, chosen from
      the badge text; ``rReal`` is always reset to 0.
    * ``interest`` -- interests text with ``\\r`` removed and ``\\n``
      turned into ``<br>``, or ``'N/A'``.
    * ``aboutMe``  -- all "About" paragraphs concatenated (``\\n`` turned
      into ``<br />``), or ``'N/A'`` when there are none.
    * ``email`` / ``webPage`` -- 0/1 flags for the presence of the links.
    * ``rNum``     -- number taken from the parenthesised part of the
      "Reviews (...)" label, else 0.
    * ``helpRate`` -- percentage text converted to a fraction, else 0.0.
    * ``hVote`` / ``tVote`` -- the two numbers of an "(x of y)" label,
      else 0.

    Missing nodes, nodes without text and unexpectedly formatted text fall
    back to the defaults above instead of raising.
    """

    def firstText(nodes):
        # Stripped text of nodes[0]; '' when there is no such node or it
        # has no text (reported as None).
        if len(nodes) == 0 or nodes[0].text is None:
            return ''
        return nodes[0].text.strip()

    found = self.html.xpath("//div[@class='a-row profile-details']")
    if len(found) == 0:
        return
    profile = found[0]

    # Reviewer Name
    self.rName = firstText(
        self.html.xpath(".//div[@class='a-section']/h1")) or "N/A"

    # Reviewer Ranking.  Default first so the attribute always exists, even
    # when the page has no profile-info block at all.
    self.rRank = 'N/A'
    for infoDiv in profile.xpath(".//div[@class='profile-info']"):
        boldSpans = infoDiv.xpath(
            ".//div[@class='a-row']/span[@class='a-size-large a-text-bold']")
        if len(boldSpans) != 0:
            rankText = firstText(boldSpans).replace('#', '')
            if rankText:
                self.rRank = CommonTool.strToInt(rankText)
            else:
                self.rRank = 'N/A'
            continue

        # Fallback layout: ranking shown in a small secondary span.
        print("entered else in ranking")
        smallSpans = profile.xpath(
            ".//span[@class='a-size-small a-color-secondary']")
        print(smallSpans)
        holder = smallSpans
        for span in smallSpans:
            # span.text may be None; guard before the substring test.
            if span.text is not None and "ranking" in span.text:
                holder = span
                print("aranking  %s" % (holder,))
        # NOTE(review): when a "ranking" span matched, ``holder`` is that
        # element, so holder[0] is its first *child*; otherwise holder[0]
        # is the first secondary span.  Kept as in the original -- confirm
        # against a real page.
        rankText = firstText(holder)
        if "#" in rankText:
            rankParts = rankText.split('#')
            print("rank %s" % (rankParts,))
            self.rRank = CommonTool.strToInt(rankParts[1])
        else:
            self.rRank = 'N/A'

    # Top Reviewer && Vine Voice.  Order matters: the first label found in
    # the badge text wins.
    badgeFlags = (
        ('#1 REVIEWER', 'tRev1'),
        ('TOP 10 REVIEWER', 'tRev10'),
        ('TOP 50 REVIEWER', 'tRev50'),
        ('TOP 100 REVIEWER', 'tRev100'),
        ('TOP 500 REVIEWER', 'tRev500'),
        ('TOP 1000 REVIEWER', 'tRev1000'),
        ('HALL OF FAME', 'tRevHall'),
    )
    for _label, flagName in badgeFlags:
        setattr(self, flagName, 0)
    self.rReal = 0
    badgeText = firstText(profile.xpath(
        ".//span[@class='a-color-link pr-c7y-badge a-text-bold']"))
    for label, flagName in badgeFlags:
        if label in badgeText:
            setattr(self, flagName, 1)
            break

    # INTEREST
    self.interest = 'N/A'
    interestRows = profile.xpath(
        ".//div[@class='a-row a-spacing-medium profile-interests']")
    if len(interestRows) != 0:
        interestSpans = interestRows[0].xpath(
            "./div/span[@class='a-size-small']")
        if len(interestSpans) != 0:
            self.interest = firstText(interestSpans) \
                .replace('\r', '').replace('\n', '<br>')

    # ABOUT ME: keep every paragraph, not just the last one.
    aboutNodes = profile.xpath(
        ".//span/text()[normalize-space(.)='About']/parent::*/"
        "following-sibling::div/div[1]/span/p")
    print("%s abtme" % (aboutNodes,))
    aboutParts = [
        p.text.strip().replace('\r', '').replace('\n', '<br />')
        for p in aboutNodes if p.text is not None
    ]
    self.aboutMe = ''.join(aboutParts) if aboutParts else 'N/A'

    # Email && webpage: only presence is recorded.
    emailLinks = profile.xpath(
        ".//div[@class='a-row break-word pr-link']/a")
    self.email = 1 if len(emailLinks) != 0 else 0
    webLinks = profile.xpath(
        ".//div[@class='a-row customer-website pr-link']/a/span")
    self.webPage = 1 if len(webLinks) != 0 else 0

    # rNum
    countSpans = profile.xpath(
        ".//div[@class='a-column a-span7 pr-link']/a/span")
    if len(countSpans) == 0:
        self.rNum = 0
    else:
        countLabel = firstText(countSpans)
        countDigits = '0'
        if "Reviews" in countLabel:
            print(countLabel)
            labelParts = countLabel.split('(')
            if len(labelParts) > 1:
                # Text between the first '(' and the next ')'.
                countDigits = labelParts[1].split(')')[0] or '0'
        self.rNum = CommonTool.strToInt(countDigits)

    # helpRate: drop the trailing character of the text (presumably '%')
    # and scale to a fraction.
    self.helpRate = 0.0
    helpRows = profile.xpath(".//div[@class='a-row customer-helpfulness']")
    if len(helpRows) != 0:
        rateText = firstText(helpRows[0].xpath(
            ".//span[@class='a-size-large a-text-bold']"))
        try:
            self.helpRate = int(rateText[:-1]) / 100.0
        except ValueError:
            self.helpRate = 0.0

    # hVote && tVote: label is split on ' of '; the first character of the
    # left part and the last character of the right part are discarded.
    self.hVote = 0
    self.tVote = 0
    voteText = firstText(profile.xpath(
        "./span/div/div/div/span[@class='a-size-small a-color-secondary']"))
    voteParts = voteText.split(' of ')
    if len(voteParts) > 1:
        self.hVote = CommonTool.strToInt(voteParts[0][1:].strip())
        self.tVote = CommonTool.strToInt(voteParts[1][:-1].strip())
def _parseHelpfulVotes(divReview, pattern):
    """Return ``(helpful, total)`` vote counts for one review node.

    The text is taken from the first ``cr-vote`` inner span; when that span
    has no text of its own, the span nested one level deeper is tried.
    ``pattern`` must capture the two counts as groups 1 and 2; thousands
    separators in the captured text are removed before conversion.
    ``(0, 0)`` is returned when no span, no text or no match is found.
    """
    voteSpans = divReview.xpath(
        './/span[@class="a-size-base cr-vote"]/span[1]/span[1]')
    print("helpful Match %s" % (voteSpans,))
    if len(voteSpans) == 0:
        return 0, 0
    voteText = voteSpans[0].text
    if voteText is None:
        nestedSpans = divReview.xpath(
            './/span[@class="a-size-base cr-vote"]/span[1]/span[1]/span[1]')
        if len(nestedSpans) == 0 or nestedSpans[0].text is None:
            return 0, 0
        voteText = nestedSpans[0].text
    found = pattern.match(voteText.strip())
    print(found)
    if not found:
        return 0, 0
    return (int(found.group(1).replace(',', '')),
            int(found.group(2).replace(',', '')))


def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse every review on one review-list page and save each of them.

    Args:
        asin: value stored on each review as ``asin``.
        rank: running helpful-rank counter; it is incremented once per
            review and the new value is stored as ``helpfulRank``.
        url: page address handed to ``MyHtml.getHtml``.
        fetchDate: date-like value; ``(fetchDate - review date).days`` is
            stored as ``elapsedDate``.
        bookPublishDate: either the string ``'N/A'`` or a date-like value
            subtracted from the review date to get ``reviewBookDate``.

    Returns:
        The counter ``rank`` after all reviews on the page were processed
        (unchanged when the page reports zero reviews).

    Raises:
        IndexError / AttributeError: when a mandatory node (review count,
            review list, rating, title, date, review text, comment count)
            is missing from the page, as in the original implementation.
    """
    # Counts may carry thousands separators ("1,234 of 1,500 people ...").
    hlre = re.compile(
        r'^(\d[\d,]*) of (\d[\d,]*) people found the following review helpful')
    html = MyHtml.getHtml(url)
    print("solving Review Page")
    countText = html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span'
    )[0].text.strip()
    countOfReviews = int(countText.replace(',', ''))
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)
    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line
        helpful, total = _parseHelpfulVotes(divReview, hlre)
        aReview.helpful = helpful
        aReview.total = total
        if total:
            # Explicit floor division keeps the two-decimal truncation the
            # Python 2 integer division produced (2 of 3 -> 0.66).
            aReview.helpfulness = helpful * 100 // total / 100.0
        else:
            aReview.helpfulness = 0

        print("getting rate data")
        rateData = divReview.xpath(
            './/span[@class="a-icon-alt"]')[0].text.strip()
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath(
            './/a[@class="a-size-base a-link-normal review-title '
            'a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")
        reviewer = divReview.xpath(
            './/span[@class="a-size-base a-color-secondary review-byline"]/a')
        if len(reviewer) > 0:
            # Fifth '/'-separated piece of the href, without query string.
            aReview.reviewerID = \
                reviewer[0].attrib['href'].split('/')[4].split('?')[0]
            print("reviewerId %s" % (aReview.reviewerID,))

        aReview.date = CommonTool.strToDate(
            divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - bookPublishDate).days

        # format line: second space-separated word of the first link in the
        # review's fourth child; empty when that structure is absent.
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except (IndexError, AttributeError):
            aReview.fromFormat = ''

        # verified flag is always assigned, defaulting to 0.
        aReview.verified = 0
        verifiedSpans = divReview.xpath(
            './/span[@class="a-size-mini a-color-state a-text-bold"]')
        if verifiedSpans and verifiedSpans[0].text is not None:
            if verifiedSpans[0].text.strip() == "Verified Purchase":
                aReview.verified = 1

        # review text line
        divReviewText = divReview.xpath(
            './/div[@class="a-row review-data"]/span')[0]
        aReview.description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = aReview.description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(
            divReview.xpath(
                './/div[@class="a-row a-spacing-top-small review-comments"]'
                '/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        saveReview(review=aReview)
        # NOTE: the lastReviewRank lookup through Reviewer.loadReviewer was
        # disabled in the original code and remains disabled here.
    return rank