Python filter_tags Exemples, filtTag.filter_tags Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : Book.py Projet : inderjot29/AmazonBook

 def solveEditorialReview(self):
     """Fill in ``hasEditorialReview`` and ``editorialReview`` from ``self.html``.

     When the page holds an ``<h2>`` whose text is 'Editorial Reviews', the
     first element following that heading is serialised, passed through
     ``filtTag.filter_tags``, stripped, and stored with newlines turned
     into ``<br />``.  Otherwise the flag is 0 and the text is empty.
     """
     headings = self.html.xpath(".//h2[text()='Editorial Reviews']")
     if not headings:
         # No editorial section on this page.
         self.hasEditorialReview = 0
         self.editorialReview = ""
         return

     self.hasEditorialReview = 1
     followingNodes = self.html.xpath(
         ".//h2[text()='Editorial Reviews'][1]/following-sibling::*")
     rawMarkup = etree.tostring(followingNodes[0])
     reviewText = filtTag.filter_tags(rawMarkup).strip()
     self.editorialReview = reviewText.replace('\n', '<br />')

Exemple #2

0

Afficher le fichier

Fichier : Reviewer.py Projet : inderjot29/AmazonBook

    def getProductLinksUsingThreads(self, productList):
        """Scrape every listing page in *productList* into ``self.allProductLinks``.

        For each link the page is fetched with ``MyHtml.getHtml`` and every
        ``tr[@valign='top']`` row of the listing table is turned into a
        ``[aLink, rate, reviewID, label]`` entry appended to
        ``self.allProductLinks``.  Along the way the method also updates:

        * ``self.fRevTime`` / ``self.lRevTime`` -- date taken from the row
          after the last / first listing row of the page whose link is
          ``self.firstLink`` / ``self.lastLink`` (``'N/A'`` when absent);
        * ``self.sum`` and ``self.counter`` -- running total and count of
          the ratings that could be parsed;
        * ``self.reviewedBookList`` -- ids of items labelled Paperback or
          Hardcover;
        * ``self.page`` -- incremented once per link.

        Args:
            productList: iterable of page links to fetch.
        """

        def reviewTimeAfter(rowObj):
            # Date shown in the <nobr> of the row following rowObj, or 'N/A'.
            siblings = rowObj.xpath("./following-sibling::*")
            if not siblings:
                return 'N/A'
            nobrs = siblings[0].xpath(".//nobr")
            if not nobrs:
                return 'N/A'
            return CommonTool.strToDate(nobrs[0].text.strip())

        # Single-argument parenthesised print behaves the same on Python 2.
        print("getting products using thread method")
        for link in productList:
            linkPage = MyHtml.getHtml(
                link, self.rID + "_AllProductLinks_" + str(self.page))
            trObjs = linkPage.xpath(
                ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")

            # NOTE(review): identity comparison is kept from the original;
            # it only matches when firstLink/lastLink are the very same
            # objects as the entries of productList -- confirm with caller.
            if link is self.firstLink:
                try:
                    firstTimeObj = trObjs[-1]
                except LookupError:
                    # No listing rows at all: report the link and stop.
                    # NOTE(review): exits with status 0 -- confirm intended.
                    print(link)
                    exit(0)
                self.fRevTime = reviewTimeAfter(firstTimeObj)

            if link is self.lastLink:
                self.lRevTime = reviewTimeAfter(trObjs[0])

            for trObj in trObjs:
                # Reset per row so a row without a link table never reuses
                # the previous row's link (or hits an unbound name).
                aLink = ''
                tableObj = trObj.xpath(
                    "./td[@class='small'][3]/table[@class='small']")
                if tableObj:
                    anchors = tableObj[0].xpath(".//a")
                    if anchors:
                        aLink = anchors[0].attrib['href']

                rate = 'N/A'
                reviewID = ''
                label = ''

                # The row after the listing row carries rating, review id
                # and format label; all three stay at their defaults when
                # that row is missing.
                rateObj = trObj.xpath("./following-sibling::*")
                if rateObj:
                    detailRow = rateObj[0]

                    # rate
                    try:
                        title = detailRow.xpath(".//img")[0].attrib['title']
                        rate = title.split("out")[0].strip()
                        self.sum = self.sum + float(rate)
                        self.counter = self.counter + 1
                    except Exception as e:
                        # Best effort: a missing/odd rating must not abort
                        # the whole page.
                        sys.stderr.write(str(e) + ' rate Exception\n')

                    # reviewID
                    rIDObj = detailRow.xpath(".//a")
                    if rIDObj:
                        reviewID = rIDObj[0].attrib['name']

                    # label "The review is from"
                    labelObj = detailRow.xpath(".//div[@class='tiny']")
                    if labelObj:
                        # verified purchase + the review is from
                        aObj = labelObj[-1].xpath(".//a")
                        if aObj:
                            label = filtTag.filter_tags(
                                etree.tostring(aObj[0]).strip())

                # 1---book 0---product
                if label.find('(') == -1 and label.find(')') == -1:
                    label = '0'
                elif label[-1] == ')':
                    # Keep only the text inside the trailing parentheses.
                    label = label.split('(')[-1][:-1]
                    if (label.find('Paperback') != -1) or (label.find('Hardcover') != -1):
                        label = '1'
                        bookID = aLink.replace('/ref=cm_cr-mr-title', '')
                        bookID = bookID[-10:]
                        self.reviewedBookList.append(bookID)

                self.allProductLinks.append([aLink, rate, reviewID, label])

            self.page = self.page + 1

Exemple #3

0

Afficher le fichier

Fichier : Review.py Projet : inderjot29/AmazonBook

def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one review page, save every review on it, and return the new rank.

    The page at *url* is fetched with ``MyHtml.getHtml``.  For each review
    ``<div>`` a ``Review`` object is filled in (ids, helpful votes, rating,
    title, reviewer, dates, format, verified flag, text, comment count),
    its comments are loaded with ``getComments()`` and it is passed to
    ``saveReview``.

    Args:
        asin: product identifier stored on every review.
        rank: helpful-rank reached before this page; incremented once per
            review found here.
        url: address of the review page.
        fetchDate: date-like value; ``fetchDate - review.date`` must
            yield an object with ``.days``.
        bookPublishDate: publication date, or the string ``'N/A'``.

    Returns:
        The rank after the last review of this page (unchanged when the
        page reports no reviews).
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    outerVotePath = './/span[@class="a-size-base cr-vote"]/span[1]/span[1]'
    nestedVotePath = outerVotePath + '/span[1]'

    html = MyHtml.getHtml(url)
    # Single-argument parenthesised print behaves the same on Python 2.
    print("solving Review Page")
    countOfReviews = int(html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span')[0].text.strip())
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)
    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line
        voteNodes = divReview.xpath(outerVotePath)
        print("helpful Match %s" % (voteNodes,))
        helpfulMatch = None
        if voteNodes:
            voteText = voteNodes[0].text
            if voteText is None:
                # The sentence may sit one span deeper; the nested path can
                # only match when the outer span exists, so try it here.
                nestedNodes = divReview.xpath(nestedVotePath)
                if nestedNodes:
                    voteText = nestedNodes[0].text
            if voteText is not None:
                helpfulMatch = hlre.match(voteText.strip())
        print(helpfulMatch)

        aReview.helpful = 0
        aReview.total = 0
        aReview.helpfulness = 0
        if helpfulMatch:
            aReview.helpful = int(helpfulMatch.group(1))
            aReview.total = int(helpfulMatch.group(2))
            if aReview.total:
                # Ratio truncated to two decimals; `//` keeps the integer
                # division explicit and avoids dividing by a zero total.
                aReview.helpfulness = aReview.helpful * \
                    100 // aReview.total / 100.0

        print("getting rate data")
        rateData = divReview.xpath('.//span[@class="a-icon-alt"]')[0].text.strip()
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath('.//a[@class="a-size-base a-link-normal review-title a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")
        reviewer = divReview.xpath('.//span[@class="a-size-base a-color-secondary review-byline"]/a')
        if reviewer:
            # Fifth '/'-separated piece of the profile href, minus any query.
            aReview.reviewerID = reviewer[0].attrib['href'].split('/')[4].split('?')[0]
        print("reviewerId %s" % (aReview.reviewerID,))

        aReview.date = CommonTool.strToDate(divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - bookPublishDate).days

        # format line
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except IndexError:
            aReview.fromFormat = ''

        # verified purchase badge; defaults to 0 when the badge is absent
        aReview.verified = 0
        spanVerifiedPurchase = divReview.xpath('.//span[@class="a-size-mini a-color-state a-text-bold"]')
        if spanVerifiedPurchase:
            badgeText = (spanVerifiedPurchase[0].text or '').strip()
            if badgeText == "Verified Purchase":
                aReview.verified = 1

        # review text line
        divReviewText = divReview.xpath('.//div[@class="a-row review-data"]/span')[0]
        aReview.description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = aReview.description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(
            divReview.xpath('.//div[@class="a-row a-spacing-top-small review-comments"]/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        saveReview(review=aReview)
        # NOTE: the lastReviewRank lookup via Reviewer.loadReviewer is
        # currently disabled.

    return rank

Exemple #4

0

Afficher le fichier

Fichier : Book.py Projet : inderjot29/AmazonBook

                    priceValue = priceValue.text.strip()
                if cmp(priceType, 'Kindle') == 0:
                    self.kindlePrice = CommonTool.strToFloat(priceValue)
                elif cmp(priceType, 'Hardcover') == 0:
                    self.hardcoverPrice = CommonTool.strToFloat(priceValue)
                elif cmp(priceType, 'Paperback') == 0:
                    self.paperbackPrice = CommonTool.strToFloat(priceValue)
                del spans
                del priceType
                del priceValue
            del priceList

        try:
            strBookDesc = etree.tostring(divCenterCol.xpath(
                "./div[@id='bookDescription_feature_div']/noscript")[0])
            self.bookDsc = filtTag.filter_tags(strBookDesc).strip()
            self.bookDsc = self.bookDsc.replace('\n', '<br />')
            del strBookDesc
        except Exception, e:
            print 'self.bookDsc error: %s' % e
            self.bookDsc = ""

        del divCenterCol
    # end of solveCenterCol

    def initCenterCol(self):
        self.title = 'N/A'
        self.binding = 'N/A'
        self.publishDate = 'N/A'
        self.elapsedDate = 'N/A'
        self.author = 'N/A'