Python strip_html_tags Beispiele, amazon_scraper.strip_html_tags Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: product.py Projekt: TJJonesy16/amazon_scraper

    def supplemental_text(self):
        # get all the known text blobs
        # remove any found in editorial reviews
        result = []

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find('div', id='postBodyPS')
        if tag:
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script', text=re.compile(ur'bookDescEncodedData', flags=re.I))

Beispiel #2

0

Datei anzeigen

    def supplemental_text(self):
        # get all the known text blobs
        # remove any found in editorial reviews
        result = []

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find('div', id='postBodyPS')
        if tag:
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script',
                             text=re.compile(ur'bookDescEncodedData',
                                             flags=re.I))

Beispiel #3

0

Datei anzeigen

 def author_bio(self):
     """Return the author biography text, or None when unavailable.

     Looks up the first ``div`` with class ``mainContent`` in
     ``self.soup``, converts it to unicode markup and passes it through
     ``strip_html_tags``. None is returned both when the element is
     absent and when the stripped text is empty.
     """
     container = self.soup.find('div', class_='mainContent')
     if not container:
         return None
     # an empty result is reported as None rather than an empty string
     return strip_html_tags(unicode(container)) or None

Beispiel #4

0

Datei anzeigen

 def total_reviews(self):
     """Return the text of the total-review-count element, or None.

     Looks up the first ``span`` with class ``totalReviewCount`` in
     ``self.soup`` and passes its unicode markup through
     ``strip_html_tags``. The value is returned as text (it is not
     converted to a number here). None is returned when the element is
     missing or the stripped text is empty.
     """
     counter = self.soup.find('span', class_='totalReviewCount')
     if not counter:
         return None
     stripped = strip_html_tags(unicode(counter))
     return stripped if stripped else None

Beispiel #5

0

Datei anzeigen

Datei: product.py Projekt: TJJonesy16/amazon_scraper

 def author_bio(self):
     """Return the author biography as plain text, or None.

     The biography is taken from the first ``div`` whose class is
     ``mainContent`` in ``self.soup``; its unicode markup is run through
     ``strip_html_tags``. A missing element or an empty stripped result
     both yield None.
     """
     bio_div = self.soup.find('div', class_='mainContent')
     bio = strip_html_tags(unicode(bio_div)) if bio_div else None
     # collapse an empty string to None so callers see one "no bio" value
     return bio if bio else None

Beispiel #6

0

Datei anzeigen

            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script',
                             text=re.compile(ur'bookDescEncodedData',
                                             flags=re.I))
        if tag:
            match = re.search(
                ur'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                text = match.group('description')
                text = urllib.unquote(text)
                text = strip_html_tags(text)
                if text:
                    result.append(text)

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div',
                                      class_='productDescriptionWrapper'):
            text = unicode(tag)
            text = strip_html_tags(text)
            if text:
                result.append(text)

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:

Beispiel #7

0

Datei anzeigen

Datei: review.py Projekt: rob0tca/amazon_scraper

 def text(self):
     """Return the review body with HTML tags stripped, or None.

     Looks up the first ``span`` with class ``description`` in
     ``self.soup`` and passes its unicode markup through
     ``strip_html_tags``.

     Returns None when the element is not present. Without this guard
     ``unicode(None)`` would produce the literal string ``u'None'``, which
     would then be returned as if it were the review text.
     """
     tag = self.soup.find('span', class_='description')
     if tag is None:
         return None
     return strip_html_tags(unicode(tag))

Beispiel #8

0

Datei anzeigen

    def supplemental_text(self):
        """Collect the known description text blobs from the product page.

        Each known location in ``self.soup`` is probed in a fixed order.
        Whatever is found is passed through ``strip_html_tags`` and, when
        the stripped text is non-empty, appended to the result.

        Returns:
            A list of text blobs in probe order; empty when nothing matched.

        NOTE(review): the original comment mentions removing blobs that
        also appear in the editorial reviews; nothing in this method does
        that -- presumably a caller handles it, verify.
        """
        result = []

        def add_markup(markup):
            # keep the blob only if something is left after stripping tags
            text = strip_html_tags(markup)
            if text:
                result.append(text)

        def add_tag(tag):
            # a failed lookup yields None; unicode(None) would be the
            # literal string u'None' and end up in the result, so skip it
            if tag:
                add_markup(unicode(tag))

        # kindle
        # http://www.amazon.com/dp/1593080050
        add_tag(self.soup.find('div', id='postBodyPS'))

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            # the class-less inner div may be missing; add_tag guards it
            add_tag(tag.find('div', class_=None))

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script',
                             text=re.compile(r'bookDescEncodedData',
                                             flags=re.I))
        if tag:
            match = re.search(
                r'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                # the captured value is URL-quoted markup
                add_markup(urllib.unquote(match.group('description')))

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div',
                                      class_='productDescriptionWrapper'):
            add_markup(unicode(tag))

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:
            # prefer the inner content div, fall back to the wrapper itself
            add_tag(tag.find('div', class_='content') or tag)

        # amazon instant video
        # http://www.amazon.com/dp/B004C0YS5C
        # older method first, then the newer one
        for css_class in ('prod-synopsis', 'dv-simple-synopsis'):
            add_tag(self.soup.find('div', class_=css_class))

        # http://www.amazon.com/dp/B0006FUAD6
        tag = self.soup.find('div',
                             id=re.compile('feature-bullets', flags=re.I))
        if tag:
            # all bullet spans are concatenated into a single blob
            add_markup(u''.join(map(unicode, tag.find_all('span'))))

        # http://www.amazon.com/dp/B00DHF39KS
        add_tag(self.soup.find('div', class_='aplus'))

        return result

Beispiel #9

0

Datei anzeigen

Datei: product.py Projekt: TJJonesy16/amazon_scraper

        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script', text=re.compile(ur'bookDescEncodedData', flags=re.I))
        if tag:
            match = re.search(ur'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                text = match.group('description')
                text = urllib.unquote(text)
                text = strip_html_tags(text)
                if text:
                    result.append(text)

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div', class_='productDescriptionWrapper'):
            text = unicode(tag)
            text = strip_html_tags(text)
            if text:
                result.append(text)

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:
            sub_tag = tag.find('div', class_='content')

Beispiel #10

0

Datei anzeigen

Datei: product.py Projekt: christineli/Lucid

    def supplemental_text(self):
        """Gather the known description text blobs from the product page.

        Every known location in ``self.soup`` is checked in a fixed order.
        Markup that is found goes through ``strip_html_tags``; non-empty
        results are appended to the returned list.

        Returns:
            A list of text blobs in lookup order; empty when nothing matched.

        NOTE(review): the original comment mentions removing blobs that
        also appear in the editorial reviews; this method does not do so --
        presumably handled by a caller, verify.
        """
        result = []

        def keep(markup):
            # store the stripped text only when it is non-empty
            text = strip_html_tags(markup)
            if text:
                result.append(text)

        def keep_tag(tag):
            # lookups return None on a miss; converting None with unicode()
            # would give the literal u"None" and pollute the result
            if tag:
                keep(unicode(tag))

        # kindle
        # http://www.amazon.com/dp/1593080050
        keep_tag(self.soup.find("div", id="postBodyPS"))

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find("div", id="bookDescription_feature_div")
        if tag:
            # the class-less inner div is optional; keep_tag skips a miss
            keep_tag(tag.find("div", class_=None))

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find("script", text=re.compile(r"bookDescEncodedData", flags=re.I))
        if tag:
            match = re.search(r'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                # the captured value is URL-quoted markup
                keep(urllib.unquote(match.group("description")))

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all("div", class_="productDescriptionWrapper"):
            keep(unicode(tag))

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find("div", class_="mas-product-description-wrapper")
        if tag:
            # use the inner content div when present, else the wrapper
            keep_tag(tag.find("div", class_="content") or tag)

        # amazon instant video
        # http://www.amazon.com/dp/B004C0YS5C
        # older method first, then the newer one
        for css_class in ("prod-synopsis", "dv-simple-synopsis"):
            keep_tag(self.soup.find("div", class_=css_class))

        # http://www.amazon.com/dp/B0006FUAD6
        tag = self.soup.find("div", id=re.compile("feature-bullets", flags=re.I))
        if tag:
            # the bullet spans are joined into one blob
            keep(u"".join(map(unicode, tag.find_all("span"))))

        # http://www.amazon.com/dp/B00DHF39KS
        keep_tag(self.soup.find("div", class_="aplus"))

        return result