Esempio n. 1
0
    def getProductLinksUsingThreads(self,productList):
        """Collect one record per review row from every page URL in productList.

        Each URL is fetched through ``MyHtml.getHtml`` and every
        ``tr[@valign='top']`` row of the review table becomes a
        ``[aLink, rate, reviewID, label]`` list appended to
        ``self.allProductLinks``.  While walking the pages the method also:

        * adds every numeric star rating to ``self.sum`` and increments
          ``self.counter``;
        * stores the date found after the last row of ``self.firstLink`` in
          ``self.fRevTime`` and the date found after the first row of
          ``self.lastLink`` in ``self.lRevTime`` (``'N/A'`` when the page
          shows no date there);
        * appends the trailing 10 characters of the product link to
          ``self.reviewedBookList`` for Paperback/Hardcover reviews;
        * increments ``self.page`` once per URL.

        ``label`` in each record is ``'0'`` when the label text contains no
        parentheses, ``'1'`` for Paperback/Hardcover, the text inside the
        last parentheses for any other label ending in ``)``, and the raw
        label text otherwise.

        productList -- iterable of review-list page URLs.
        Returns None.
        """
        def reviewTimeAfter(trObj):
            # The date lives in a <nobr> inside the element that immediately
            # follows the review row; report 'N/A' when either is missing.
            siblings = trObj.xpath("./following-sibling::*")
            if not siblings:
                return 'N/A'
            nobrs = siblings[0].xpath(".//nobr")
            if not nobrs:
                return 'N/A'
            return CommonTool.strToDate(nobrs[0].text.strip())

        print("getting products using thread method")
        for link in productList:
            linkPage = MyHtml.getHtml(
                link, self.rID + "_AllProductLinks_" + str(self.page))
            trObjs = linkPage.xpath(
                ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")

            # Compare by value: two equal URL strings are not guaranteed to
            # be the same object, so an identity test can silently miss.
            if link == self.firstLink:
                try:
                    firstTimeObj = trObjs[-1]
                except LookupError:
                    # No review rows on this page: report the URL and stop.
                    print(link)
                    exit(0)
                self.fRevTime = reviewTimeAfter(firstTimeObj)

            if link == self.lastLink:
                self.lRevTime = reviewTimeAfter(trObjs[0])

            for trObj in trObjs:
                # Reset per row so a row without a product table cannot
                # inherit the previous row's link (or hit an unbound name).
                aLink = ''
                tableObj = trObj.xpath(
                    "./td[@class='small'][3]/table[@class='small']")
                if tableObj:
                    anchors = tableObj[0].xpath(".//a")
                    if anchors:
                        aLink = anchors[0].attrib['href']

                rate = 'N/A'
                reviewID = ''
                label = ''
                # Rating, review id and label all live in the element that
                # follows the row; without it the defaults above are kept.
                rateObj = trObj.xpath("./following-sibling::*")
                if rateObj:
                    detailObj = rateObj[0]

                    # rate
                    try:
                        rateObj1 = detailObj.xpath(".//img")
                        title = rateObj1[0].attrib['title']
                        rate = title.split("out")[0].strip()
                        self.sum = self.sum + float(rate)
                        self.counter = self.counter + 1
                    except Exception as e:
                        # Best effort: a row without a parsable rating is
                        # reported on stderr and still recorded.
                        sys.stderr.write(str(e) + ' rate Exception\n')

                    # reviewID
                    rIDObj = detailObj.xpath(".//a")
                    if rIDObj:
                        reviewID = rIDObj[0].attrib['name']

                    # label "The review is from"
                    labelObj = detailObj.xpath(".//div[@class='tiny']")
                    if labelObj:
                        # verified purchase + the review is from
                        aObj = labelObj[-1].xpath(".//a")
                        if aObj:
                            label = filtTag.filter_tags(
                                etree.tostring(aObj[0]).strip())

                # 1---book 0---product
                if '(' not in label and ')' not in label:
                    label = '0'
                elif label.endswith(')'):
                    # Keep only the text inside the last pair of parentheses.
                    label = label.split('(')[-1][:-1]
                    if 'Paperback' in label or 'Hardcover' in label:
                        label = '1'
                        bookID = aLink.replace('/ref=cm_cr-mr-title', '')
                        self.reviewedBookList.append(bookID[-10:])

                self.allProductLinks.append([aLink, rate, reviewID, label])

            self.page = self.page + 1
Esempio n. 2
0
 def getPreviousReviewedBook(self,reviewerId):
     """Walk a reviewer's member-review pages and derive summary attributes.

     Pages are requested most-recent-first from
     ``http://www.amazon.com/gp/cdp/member-reviews/<reviewerId>``.

     Attributes written on ``self``:

     * ``allRevLink`` -- the first-page URL;
     * ``counter`` -- reset to 0;
     * ``previousBookReviewDate`` / ``previousBookPublishDate`` -- taken
       from the first row whose edition cell contains "Edition";
     * ``lRevTime`` -- date after the first row of page 1, only when it is
       still ``''``;
     * ``fRevTime`` -- date after the last row of the final page, only when
       it is still ``''``;
     * ``sum`` -- increased by every parsable star rating;
     * ``avgRate`` -- ``sum / rNum`` rounded to 2 places, when ``rNum`` is
       non-zero;
     * ``duration`` -- whole days between ``fRevTime`` and ``lRevTime``, or
       0 when either is missing.

     ``self.sum``, ``self.rNum``, ``self.lRevTime`` and ``self.fRevTime``
     are read before being written, so they must already exist.

     reviewerId -- reviewer id string appended to the member-reviews URL.
     Returns None.
     """
     def nobrAfter(row):
         # <nobr> elements of the element right after `row`; an empty list
         # when the row has no following sibling at all.
         siblings=row.xpath('./following-sibling::*')
         if not siblings:
             return []
         return siblings[0].xpath('.//nobr')

     initUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId
     self.allRevLink=initUrl
     html=MyHtml.getHtml(initUrl)
     ftable=html.xpath('.//body/table[2]')[0]
     pages=ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
     if pages:
         totalPages=pages[0].text.strip()
     else:
         totalPages=1
     print(str(totalPages))
     # The last pager link may be a range such as "11-20"; its upper bound
     # is the page count.
     if "-" in str(totalPages):
         totalPages=totalPages.split('-')
         print("totalPages %s" % (totalPages,))
         totalPages=totalPages[1]
     print("%s totalPages" % (totalPages,))
     sortBy='MostRecentReview'
     self.counter=0
     lastPage=int(totalPages)
     foundPrevious=False
     for j in range(1,lastPage+1):
         baseUrl=initUrl+'?pageNumber={}&sortBy={}'.format(str(j), sortBy)
         print("%s %s %s" % (baseUrl, j, totalPages))
         html=MyHtml.getHtml(baseUrl)
         ftable=html.xpath('.//body/table[2]')[0]
         mainTable=ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')

         # First/last review dates depend only on the page, not on the
         # individual row, so they are resolved once per page.
         if mainTable:
             if j==1 and self.lRevTime=='':
                 reviewdate=nobrAfter(mainTable[0])
                 print("%s lRevtime" % (reviewdate,))
                 if reviewdate:
                     reviewdate=reviewdate[0].text.strip()
                     self.lRevTime=CommonTool.strToDate(reviewdate)
             print("value of j %s" % (j,))
             if j==lastPage and self.fRevTime=='':
                 print("inside frevtime loop")
                 reviewdate=nobrAfter(mainTable[-1])
                 print("%s fRevtime" % (reviewdate,))
                 if reviewdate:
                     reviewdate=reviewdate[0].text.strip()
                     self.fRevTime=CommonTool.strToDate(reviewdate)

         for row in mainTable:
             # Only the first (most recent) book review is of interest.
             if not foundPrevious:
                 isBook=row.xpath('./td[5]/table/tr[2]/td/b')
                 if isBook and isBook[0].text is not None \
                         and "Edition" in isBook[0].text.strip():
                     print("got the previous book")
                     foundPrevious=True
                     reviewdate=nobrAfter(row)
                     if reviewdate:
                         reviewdate=reviewdate[0].text.strip()
                         print("got the reviewDate %s" % (reviewdate,))
                         self.previousBookReviewDate=CommonTool.strToDate(reviewdate)

                     # to get link of the previous reviewed book
                     url=row.xpath('./td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                     asin=Book.getAsinFromUrl(url)
                     previousBook=Book.loadBookByAsin(asin)
                     print("asinofPrevious %s" % (asin,))
                     print("previousBook %s" % (previousBook,))
                     self.previousBookPublishDate=previousBook.publishDate

             # rate
             rateObj=row.xpath("./following-sibling::*")
             if rateObj:
                 try:
                     rateObj1=rateObj[0].xpath(".//img")
                     title=rateObj1[0].attrib['title']
                     rate=title.split("out")[0].strip()
                     self.sum=self.sum+float(rate)
                 except (IndexError, KeyError, ValueError) as e:
                     # A row without a parsable star image is skipped
                     # instead of aborting the whole crawl.
                     print("%s rate Exception" % (e,))
     print("sum %s" % (self.sum,))
     # Value comparison: an identity test against the literal 0 lets 0L or
     # 0.0 through and then divides by zero.
     if self.rNum != 0:
         self.avgRate=self.sum/self.rNum
         self.avgRate=round(self.avgRate,2)
     # NOTE(review): 'N/A' appears to be used elsewhere as a missing-date
     # placeholder; it is treated as missing here because subtracting it
     # would raise TypeError -- confirm.
     missing=('', 'N/A')
     if self.lRevTime in missing or self.fRevTime in missing:
         duration=0
     else:
         duration=(self.lRevTime-self.fRevTime).days
     self.duration=int(duration)
Esempio n. 3
0
def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one product-review page and save every review found on it.

    The page is fetched with ``MyHtml.getHtml(url)``.  For each
    ``div[@id]`` under ``#cm_cr-review_list`` a ``Review`` is filled in
    (helpful votes, rate, title, reviewer id, dates, format, verified
    flag, description, comment count), ``getComments()`` is called on it
    and it is passed to ``saveReview``.

    asin            -- stored on every review as ``asin``.
    rank            -- rank of the last review handled before this page; it
                       is incremented before each review and stored as
                       ``helpfulRank``.
    url             -- review page URL.
    fetchDate       -- date the review date is subtracted from to obtain
                       ``elapsedDate`` (in days).
    bookPublishDate -- the string ``'N/A'``, or a date subtracted from the
                       review date to obtain ``reviewBookDate`` (in days).

    Returns ``rank`` after the last review on the page (unchanged when the
    page reports zero reviews).
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    voteXpath = './/span[@class="a-size-base cr-vote"]/span[1]/span[1]'

    html = MyHtml.getHtml(url)
    print("solving Review Page")
    countText = html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span')[0].text.strip()
    # NOTE(review): presumably large counts are rendered with thousands
    # separators ("1,234"); dropping commas keeps int() from failing.
    countOfReviews = int(countText.replace(',', ''))
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)
    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line
        parentNode = divReview.xpath(voteXpath)
        print("helpful Match %s" % (parentNode,))
        helpfulMatch = None
        # The sentence is either the vote span's own text or sits one span
        # deeper, so the nested span is tried whenever the outer one gives
        # no match (it can only exist when the outer span exists).
        candidates = parentNode[:1] + divReview.xpath(voteXpath + '/span[1]')[:1]
        for span in candidates:
            if span.text is not None:
                helpfulMatch = hlre.match(span.text.strip())
                if helpfulMatch:
                    break
        print(helpfulMatch)
        if helpfulMatch:
            aReview.helpful = int(helpfulMatch.group(1))
            aReview.total = int(helpfulMatch.group(2))
            if aReview.total:
                # Floor to whole percent first, i.e. a ratio truncated to
                # two decimals.
                aReview.helpfulness = \
                    (aReview.helpful * 100 // aReview.total) / 100.0
            else:
                # "0 of 0" would otherwise divide by zero.
                aReview.helpfulness = 0
        else:
            aReview.helpful = 0
            aReview.total = 0
            aReview.helpfulness = 0

        print("getting rate data")
        rateData = divReview.xpath('.//span[@class="a-icon-alt"]')[0].text.strip()
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath(
            './/a[@class="a-size-base a-link-normal review-title a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")
        reviewer = divReview.xpath(
            './/span[@class="a-size-base a-color-secondary review-byline"]/a')
        if reviewer:
            # The reviewer id is the 5th '/'-separated piece of the profile
            # href, without its query string.
            aReview.reviewerID = reviewer[0].attrib['href'].split('/')[4].split('?')[0]
        print("reviewerId %s" % (aReview.reviewerID,))

        aReview.date = CommonTool.strToDate(
            divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - (bookPublishDate)).days

        # format line
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except (IndexError, AttributeError):
            # Missing child, missing link, empty link text or a one-word
            # format string all mean "format unknown".
            aReview.fromFormat = ''

        # verified purchase line: default to 0 so the attribute is always
        # assigned, including when the badge is absent.
        aReview.verified = 0
        spanVerifiedPurchase = divReview.xpath(
            './/span[@class="a-size-mini a-color-state a-text-bold"]')
        if spanVerifiedPurchase:
            if spanVerifiedPurchase[0].text.strip() == "Verified Purchase":
                aReview.verified = 1

        # review text line
        divReviewText = divReview.xpath('.//div[@class="a-row review-data"]/span')[0]
        aReview.description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = aReview.description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(
            divReview.xpath(
                './/div[@class="a-row a-spacing-top-small review-comments"]/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        # lastReviewRank is not populated here.
        saveReview(review=aReview)

    return rank