def getProductLinksUsingThreads(self, productList):
    """Scrape the review-listing pages in *productList* and record each product.

    Every URL in ``productList`` is fetched with ``MyHtml.getHtml`` (the
    second argument is ``self.rID + "_AllProductLinks_" + str(self.page)``).
    For each ``tr[@valign='top']`` row that contains the product table, a
    ``[link, rate, reviewID, label]`` list is appended to
    ``self.allProductLinks``.

    Side effects on ``self``:
      * ``fRevTime`` is set from the last row of the page equal to
        ``self.firstLink``; ``lRevTime`` from the first row of the page equal
        to ``self.lastLink`` (``'N/A'`` when no time stamp is found).
      * ``sum`` / ``counter`` accumulate every rating that parses as a float.
      * ``reviewedBookList`` receives the last 10 characters of the product
        link for Paperback/Hardcover items.
      * ``page`` is incremented once per processed URL.

    Args:
        productList: iterable of listing-page URLs.

    NOTE(review): the original indentation of this block was lost, so the
    nesting below is reconstructed from the data dependencies between
    statements -- confirm against the intended behaviour.
    """

    def _review_time(row):
        # Time stamp of the review that follows *row*: the text of the first
        # <nobr> under the row's next sibling, converted by
        # CommonTool.strToDate; 'N/A' when either element is missing.
        siblings = row.xpath("./following-sibling::*")
        if not siblings:
            return 'N/A'
        stamps = siblings[0].xpath(".//nobr")
        if not stamps:
            return 'N/A'
        return CommonTool.strToDate(stamps[0].text.strip())

    print("getting products using thread method")
    for link in productList:
        linkPage = MyHtml.getHtml(
            link, self.rID + "_AllProductLinks_" + str(self.page))
        trObjs = linkPage.xpath(
            ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")

        # Compare by value: `is` only matched when the very same string
        # object was passed in, which is not guaranteed for equal URLs.
        if link == self.firstLink:
            try:
                firstTimeObj = trObjs[-1]
            except LookupError:
                print(link)
                # NOTE(review): exit status 0 is kept from the original even
                # though this is a failure path -- confirm it is intended.
                sys.exit(0)
            self.fRevTime = _review_time(firstTimeObj)

        if link == self.lastLink:
            self.lRevTime = _review_time(trObjs[0])

        for trObj in trObjs:
            tableObj = trObj.xpath(
                "./td[@class='small'][3]/table[@class='small']")
            if not tableObj:
                # Not a product row; nothing to record.
                continue

            # aLink
            anchors = tableObj[0].xpath(".//a")
            aLink = anchors[0].attrib['href'] if anchors else ''

            # Placeholders are reset for every row so a row without review
            # details never inherits values from the previous row.
            rate = 'N/A'
            reviewID = ''
            label = ''

            rateObj = trObj.xpath("./following-sibling::*")
            if rateObj:
                reviewRow = rateObj[0]

                # rate
                try:
                    title = reviewRow.xpath(".//img")[0].attrib['title']
                    rate = title.split("out")[0].strip()
                    self.sum = self.sum + float(rate)
                    self.counter = self.counter + 1
                except Exception as e:
                    # Best effort: a missing/unparsable rating is reported
                    # and the row is still recorded.
                    sys.stderr.write(str(e) + ' rate Exception\n')

                # reviewID
                rIDObj = reviewRow.xpath(".//a")
                if rIDObj:
                    reviewID = rIDObj[0].attrib['name']

                # label ("The review is from" line); the last 'tiny' div is
                # used because a "verified purchase" div may precede it.
                labelObj = reviewRow.xpath(".//div[@class='tiny']")
                if labelObj:
                    aObj = labelObj[-1].xpath(".//a")
                    if aObj:
                        label = filtTag.filter_tags(
                            etree.tostring(aObj[0]).strip())

            # 1---book 0---product
            if '(' not in label and ')' not in label:
                label = '0'
            elif label[-1] == ')':
                # Keep only the text inside the trailing parentheses.
                label = label.split('(')[-1][:-1]
                if 'Paperback' in label or 'Hardcover' in label:
                    label = '1'
                    bookID = aLink.replace('/ref=cm_cr-mr-title', '')[-10:]
                    self.reviewedBookList.append(bookID)

            self.allProductLinks.append([aLink, rate, reviewID, label])

        self.page = self.page + 1
def getPreviousReviewedBook(self, reviewerId):
    """Walk a reviewer's review pages and derive per-reviewer statistics.

    The member-reviews pages for ``reviewerId`` are fetched (sorted by
    ``MostRecentReview``) through ``MyHtml.getHtml``.  While iterating the
    ``tr[@valign="top"]`` rows the method:

      * records the first row whose edition cell contains ``"Edition"`` as
        the previously reviewed book (``previousBookReviewDate`` and
        ``previousBookPublishDate``, the latter loaded via
        ``Book.loadBookByAsin``);
      * fills ``lRevTime`` from the first row of page 1 and ``fRevTime`` from
        the last row of the last page when they are still ``''``;
      * adds every row's rating to ``self.sum``.

    Afterwards ``avgRate`` (when ``self.rNum`` is non-zero) and ``duration``
    (days between ``fRevTime`` and ``lRevTime``, 0 when either is unset) are
    stored on ``self``.  ``self.allRevLink`` and ``self.counter`` are also
    (re)initialised.

    Args:
        reviewerId: reviewer id string appended to the member-reviews URL.

    NOTE(review): the original indentation of this block was lost; nesting
    is reconstructed from the data dependencies -- confirm.
    """

    def _nobr_after(row):
        # <nobr> elements under the element that follows *row*.
        return row.xpath('./following-sibling::*')[0].xpath('.//nobr')

    baseUrl = "http://www.amazon.com/gp/cdp/member-reviews/" + reviewerId
    self.allRevLink = baseUrl
    html = MyHtml.getHtml(baseUrl)
    ftable = html.xpath('.//body/table[2]')[0]
    pages = ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
    if pages:
        totalPages = pages[0].text.strip()
    else:
        totalPages = 1
    strPages = str(totalPages)
    print(strPages)
    if "-" in strPages:
        # The pager text is a range ("a-b"); the part after '-' is the count.
        totalPages = totalPages.split('-')
        print("%s %s" % ("totalPages", totalPages))
        totalPages = totalPages[1]
    print("%s %s" % (totalPages, "totalPages"))

    sortBy = 'MostRecentReview'
    self.counter = 0
    foundPrevious = False
    lastPage = int(totalPages)

    # The loop variable is owned by range(); the original's manual
    # `j = j + 1` had no effect and was dropped.
    for j in range(1, lastPage + 1):
        pageUrl = baseUrl + '?pageNumber={}&sortBy={}'.format(str(j), sortBy)
        print("%s %s %s" % (pageUrl, j, totalPages))
        html = MyHtml.getHtml(pageUrl)
        ftable = html.xpath('.//body/table[2]')[0]
        mainTable = ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')

        for row in mainTable:
            isBook = row.xpath('./td[5]/table/tr[2]/td/b')
            if isBook and isBook[0].text is not None and not foundPrevious:
                edition = isBook[0].text.strip()
                if "Edition" in edition:
                    print("got the previous book")
                    foundPrevious = True
                    reviewdate = _nobr_after(row)
                    if reviewdate:
                        reviewdate = reviewdate[0].text.strip()
                        print("%s %s" % ("got the reviewDate", reviewdate))
                        self.previousBookReviewDate = \
                            CommonTool.strToDate(reviewdate)
                    # to get link of the previous reviewed book
                    url = row.xpath(
                        './td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                    asin = Book.getAsinFromUrl(url)
                    previousBook = Book.loadBookByAsin(asin)
                    print("%s %s" % ("asinofPrevious", asin))
                    print("%s %s" % ("previousBook", previousBook))
                    self.previousBookPublishDate = previousBook.publishDate

            # Latest review: first row of the first page.
            if j == 1 and self.lRevTime == '':
                reviewdate = _nobr_after(mainTable[0])
                print("%s %s" % (reviewdate, "lRevtime"))
                if reviewdate:
                    self.lRevTime = CommonTool.strToDate(
                        reviewdate[0].text.strip())
            print("%s %s" % ("value of j", j))

            # Earliest review: last row of the last page.
            if j == lastPage and self.fRevTime == '':
                print("inside frevtime loop")
                reviewdate = _nobr_after(mainTable[-1])
                print("%s %s" % (reviewdate, "fRevtime"))
                if reviewdate:
                    self.fRevTime = CommonTool.strToDate(
                        reviewdate[0].text.strip())

            # rate
            rateObj = row.xpath("./following-sibling::*")
            if rateObj:
                title = rateObj[0].xpath(".//img")[0].attrib['title']
                rate = title.split("out")[0].strip()
                self.sum = self.sum + float(rate)

    print("%s %s" % ("sum", self.sum))
    # Value comparison; `is not 0` tested object identity with a literal.
    if self.rNum != 0:
        self.avgRate = round(self.sum / self.rNum, 2)

    # '' means "never set"; 'N/A' is the placeholder that
    # getProductLinksUsingThreads stores when no time stamp was found.
    # Neither can be subtracted, so both yield a zero duration.
    unset = ('', 'N/A')
    if self.lRevTime in unset or self.fRevTime in unset:
        duration = 0
    else:
        duration = (self.lRevTime - self.fRevTime).days
    self.duration = int(duration)
def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one product-review page and save every review found on it.

    The page at ``url`` is fetched with ``MyHtml.getHtml``.  For each
    ``div[@id]`` under ``div#cm_cr-review_list`` a ``Review`` is created,
    filled from the markup (helpful votes, rate, title, reviewer id, date,
    format, verified flag, description, number of comments), its comments
    are loaded with ``getComments()`` and it is passed to ``saveReview``.

    Args:
        asin: value stored on each review as ``asin``.
        rank: helpful rank of the last review already processed; each review
            on this page gets the next consecutive rank.
        url: address of the review page.
        fetchDate: date the page is being fetched; ``elapsedDate`` is the
            number of days between it and the review date.
        bookPublishDate: publish date of the book, or the string ``'N/A'``.

    Returns:
        The rank assigned to the last review on the page (``rank`` unchanged
        when the page reports no reviews).

    NOTE(review): the original indentation of this block was lost; nesting
    is reconstructed from the data dependencies -- confirm.
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    voteXPath = './/span[@class="a-size-base cr-vote"]/span[1]/span[1]'

    html = MyHtml.getHtml(url)
    print("solving Review Page")
    countSpan = html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span')[0]
    countOfReviews = int(countSpan.text.strip())
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)

    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line: the sentence is either the vote span's own text or,
        # when that is empty, the text of its first nested span.
        voteSpans = divReview.xpath(voteXPath)
        print("%s %s" % ("helpful Match", voteSpans))
        voteText = None
        if voteSpans:
            voteText = voteSpans[0].text
            if voteText is None:
                nestedSpans = divReview.xpath(voteXPath + '/span[1]')
                if nestedSpans:
                    voteText = nestedSpans[0].text
        helpfulMatch = None
        if voteText is not None:
            helpfulMatch = hlre.match(voteText.strip())
        print(helpfulMatch)

        # A "0 of 0" line is treated like a missing one to avoid dividing
        # by zero.
        if helpfulMatch and int(helpfulMatch.group(2)) > 0:
            aReview.helpful = int(helpfulMatch.group(1))
            aReview.total = int(helpfulMatch.group(2))
            # Floor-divide first: the ratio is truncated to two decimals.
            aReview.helpfulness = \
                aReview.helpful * 100 // aReview.total / 100.0
        else:
            aReview.helpful = 0
            aReview.total = 0
            aReview.helpfulness = 0

        print("getting rate data")
        rateData = divReview.xpath(
            './/span[@class="a-icon-alt"]')[0].text.strip()
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath(
            './/a[@class="a-size-base a-link-normal review-title '
            'a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")
        reviewer = divReview.xpath(
            './/span[@class="a-size-base a-color-secondary review-byline"]/a')
        if reviewer:
            # Fifth '/'-separated piece of the profile href, minus any query.
            href = reviewer[0].attrib['href']
            aReview.reviewerID = href.split('/')[4].split('?')[0]
            print("%s %s" % ("reviewerId", aReview.reviewerID))

        aReview.date = CommonTool.strToDate(
            divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - bookPublishDate).days

        # format line
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except IndexError:
            aReview.fromFormat = ''

        # Always set the flag so a review without the badge is stored as 0.
        verifiedSpans = divReview.xpath(
            './/span[@class="a-size-mini a-color-state a-text-bold"]')
        isVerified = bool(verifiedSpans) and (
            (verifiedSpans[0].text or '').strip() == "Verified Purchase")
        aReview.verified = 1 if isVerified else 0

        # review text line
        divReviewText = divReview.xpath(
            './/div[@class="a-row review-data"]/span')[0]
        description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(
            divReview.xpath(
                './/div[@class="a-row a-spacing-top-small review-comments"]'
                '/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        saveReview(review=aReview)
        # NOTE: aReview.lastReviewRank is not populated here; the earlier
        # lookup through Reviewer.getPreBookReviewRanking was disabled.

    return rank