def getBookAsinList(self):
    """Rebuild ``self.bookAsinList`` from the product list page at ``self.url``.

    The page is fetched through ``MyHtml.getHtml`` (keyed by ``self.tag`` and
    ``self.fetchDate``).  For every kept table row the ASIN is extracted from
    the link in the second cell, the book is loaded with
    ``Book.loadBookByAsin`` and the ASIN is appended when ``self.checkBook``
    accepts the book.  Nothing is returned; the result lives on the instance.

    Raises:
        IndexError: if the ``productList`` div, or the link in a kept row,
            is missing from the page.
    """
    self.bookAsinList = []
    page = MyHtml.getHtml(
        self.url, name=self.tag, crawlDate=self.fetchDate)
    productDiv = page.xpath(".//div[@class='productList']")[0]
    productRows = productDiv.xpath("./table/tr[@class='small']")

    for position, row in enumerate(productRows):
        # Odd-indexed rows are skipped (presumably a secondary/detail row
        # following each product row -- confirm against the page markup).
        if position % 2 == 1:
            continue

        link = row.xpath("./td[2]/a")[0]
        asin = Book.getAsinFromUrl(link.attrib['href'].strip())

        # Ignore links without an ASIN and ASINs starting with 'B'
        # (presumably non-print items -- TODO confirm).
        if asin == '' or asin[0] == 'B':
            continue

        candidate = Book.loadBookByAsin(asin, self.fetchDate)
        if self.checkBook(candidate):
            self.bookAsinList.append(asin)
def getBookAsinList(self):
    """Rebuild ``self.bookAsinList`` from the new-releases page at ``self.url``.

    The page is fetched through ``MyHtml.getHtml`` under the name
    ``"NewReleasedBooks"`` for ``self.fetchDate``.  Every
    ``zg_itemImmersion`` entry inside the ``zg_centerListWrapper`` div
    contributes the ASIN parsed from its product link; entries whose link
    yields no ASIN are skipped.  The collected list is printed at the end
    and nothing is returned.

    Raises:
        IndexError: if the list wrapper div, or an entry's product link,
            is missing from the page.
    """
    self.bookAsinList = []
    html = MyHtml.getHtml(
        self.url, name="NewReleasedBooks", crawlDate=self.fetchDate)
    listWrapper = html.xpath(".//div[@id='zg_centerListWrapper']")[0]

    for item in listWrapper.xpath("./div[@class='zg_itemImmersion']"):
        aUrl = item.xpath("./div[2]/div[2]/a")[0].attrib['href'].strip()
        asin = Book.getAsinFromUrl(aUrl)
        if asin == '':
            continue

        # The rank-based checkBook filter is disabled, so the rank is no
        # longer extracted: it was unused, and its unguarded lookup could
        # abort the whole crawl.  If the filter is reinstated, the rank text
        # lives at "./div[@class='zg_rankDiv']/span" of the item.
        #
        # The book is still loaded although the result is not inspected here
        # (presumably loadBookByAsin fetches/caches the book as a side
        # effect -- confirm before removing the call).
        Book.loadBookByAsin(asin, self.fetchDate)
        self.bookAsinList.append(asin)

    # Single-argument print(...) prints the same under Python 2's print
    # statement and Python 3's print function.
    print(self.bookAsinList)
def getPreviousReviewedBook(self, reviewerId):
    """Walk a reviewer's member-review pages and record review statistics.

    Pages are fetched with ``MyHtml.getHtml`` from
    ``http://www.amazon.com/gp/cdp/member-reviews/<reviewerId>``, sorted by
    ``MostRecentReview``.  The number of pages is taken from the last pager
    link on the first page (for a ``"a-b"`` range label the part after the
    dash is used); without a pager a single page is assumed.

    Reads from the instance (must be initialised by the caller):
        ``self.sum``, ``self.rNum``, ``self.lRevTime``, ``self.fRevTime``
        -- the two times are treated as unset while they equal ``''``.

    Writes to the instance:
        ``allRevLink``   base URL of the reviewer's review listing.
        ``counter``      reset to 0.
        ``previousBookReviewDate`` / ``previousBookPublishDate``
                         taken from the first row whose second bold line in
                         column 5 contains ``"Edition"`` (presumably the
                         product-format line of a book -- confirm).
        ``lRevTime``     date found after the first row of page 1.
        ``fRevTime``     date found after the last row of the last page.
        ``sum``          increased by each rating parsed from the star
                         image title of the element following a row.
        ``avgRate``      ``sum / rNum`` rounded to 2 places, only when
                         ``rNum`` is non-zero.
        ``duration``     days between ``fRevTime`` and ``lRevTime``, or 0
                         when either is still unset.

    Args:
        reviewerId: reviewer identifier appended to the member-reviews URL.

    Raises:
        IndexError: if an expected table, sibling or link is missing.
        ValueError: if the page count or a rating title is not numeric.
    """
    # NOTE: every print below passes one pre-formatted string, which prints
    # the same under Python 2's print statement and Python 3's function.

    def followingDateNodes(row):
        # The review date sits in a <nobr> inside the element right after
        # the row.
        return row.xpath('./following-sibling::*')[0].xpath('.//nobr')

    memberUrl = "http://www.amazon.com/gp/cdp/member-reviews/" + reviewerId
    self.allRevLink = memberUrl
    html = MyHtml.getHtml(memberUrl)
    ftable = html.xpath('.//body/table[2]')[0]
    pages = ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
    if pages:
        totalPages = pages[0].text.strip()
    else:
        totalPages = 1

    strPages = str(totalPages)
    print(strPages)
    if "-" in strPages:
        # Pager label is a range such as "11-20": keep the upper bound.
        pageRange = totalPages.split('-')
        print("totalPages %s" % (pageRange,))
        totalPages = pageRange[1]
    print("%s totalPages" % (totalPages,))
    pageCount = int(totalPages)

    sortBy = 'MostRecentReview'
    self.counter = 0
    foundPrevious = False

    for j in range(1, pageCount + 1):
        pageUrl = memberUrl + \
            '?pageNumber={}&sortBy={}'.format(str(j), sortBy)
        print("%s %s %s" % (pageUrl, j, totalPages))
        html = MyHtml.getHtml(pageUrl)
        ftable = html.xpath('.//body/table[2]')[0]
        mainTable = ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')

        # The newest/oldest review dates depend only on the page, not on
        # the individual row, so they are resolved once per page instead of
        # once per row.  Skipped for empty pages, as before.
        if mainTable:
            if j == 1 and self.lRevTime == '':
                dateNodes = followingDateNodes(mainTable[0])
                print("%s lRevtime" % (dateNodes,))
                if dateNodes:
                    self.lRevTime = CommonTool.strToDate(
                        dateNodes[0].text.strip())
            print("value of j %s" % (j,))
            if j == pageCount and self.fRevTime == '':
                print("inside frevtime loop")
                dateNodes = followingDateNodes(mainTable[-1])
                print("%s fRevtime" % (dateNodes,))
                if dateNodes:
                    self.fRevTime = CommonTool.strToDate(
                        dateNodes[0].text.strip())

        for row in mainTable:
            # Only the first matching row is recorded as the previously
            # reviewed book.
            editionCells = row.xpath('./td[5]/table/tr[2]/td/b')
            if (not foundPrevious and editionCells
                    and editionCells[0].text is not None
                    and "Edition" in editionCells[0].text.strip()):
                print("got the previous book")
                foundPrevious = True
                dateNodes = followingDateNodes(row)
                if dateNodes:
                    reviewDate = dateNodes[0].text.strip()
                    print("got the reviewDate %s" % (reviewDate,))
                    self.previousBookReviewDate = \
                        CommonTool.strToDate(reviewDate)
                # Follow the product link to load the reviewed book.
                url = row.xpath(
                    './td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                asin = Book.getAsinFromUrl(url)
                previousBook = Book.loadBookByAsin(asin)
                print("asinofPrevious %s" % (asin,))
                print("previousBook %s" % (previousBook,))
                self.previousBookPublishDate = previousBook.publishDate

            # Rating: the star image title starts with the score, followed
            # by "out ..."; rows without a following element or without an
            # image contribute nothing.
            siblings = row.xpath("./following-sibling::*")
            if siblings:
                images = siblings[0].xpath(".//img")
                if images:
                    rate = images[0].attrib['title'].split("out")[0].strip()
                    self.sum = self.sum + float(rate)

    print("sum %s" % (self.sum,))
    # Value comparison: an identity test against the literal 0 does not
    # reliably catch every zero (e.g. 0.0) and would let the division fail.
    if self.rNum != 0:
        self.avgRate = round(self.sum / self.rNum, 2)

    if self.lRevTime == '' or self.fRevTime == '':
        self.duration = 0
    else:
        self.duration = int((self.lRevTime - self.fRevTime).days)