Beispiel #1
0
    def supplemental_text(self):
        """Gather description text blobs from known sections of the page.

        NOTE(review): this excerpt stops right after the ``<script>`` lookup,
        so how that tag is used and what the method returns is not visible
        here.
        """
        # Accumulates the text blobs found below.
        # NOTE(review): the original note said blobs also found in editorial
        # reviews are removed; nothing in the visible code does that.
        result = []

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find('div', id='postBodyPS')
        if tag:
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            # Narrow to an inner div; class_=None presumably selects one
            # without a class attribute.
            # NOTE(review): if this find() returns None, unicode(None) would
            # produce the text u'None' -- confirm whether that can happen.
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        # Case-insensitive search for the script whose text mentions
        # bookDescEncodedData.
        tag = self.soup.find('script', text=re.compile(ur'bookDescEncodedData', flags=re.I))
Beispiel #2
0
    def supplemental_text(self):
        """Gather description text blobs from known sections of the page.

        NOTE(review): this excerpt stops right after the ``<script>`` lookup,
        so how that tag is used and what the method returns is not visible
        here.
        """
        # Accumulates the text blobs found below.
        # NOTE(review): the original note said blobs also found in editorial
        # reviews are removed; nothing in the visible code does that.
        result = []

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find('div', id='postBodyPS')
        if tag:
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            # Narrow to an inner div; class_=None presumably selects one
            # without a class attribute.
            # NOTE(review): if this find() returns None, unicode(None) would
            # produce the text u'None' -- confirm whether that can happen.
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        # Case-insensitive search for the script whose text mentions
        # bookDescEncodedData.
        tag = self.soup.find('script',
                             text=re.compile(ur'bookDescEncodedData',
                                             flags=re.I))
Beispiel #3
0
 def author_bio(self):
     """Return the text of the ``mainContent`` div, or None.

     The div's markup is passed through ``strip_html_tags``; None is
     returned when the div is missing or the stripped text is empty.
     """
     container = self.soup.find('div', class_='mainContent')
     if not container:
         return None
     bio = strip_html_tags(unicode(container))
     # Normalise any falsy result to None.
     return bio if bio else None
Beispiel #4
0
 def total_reviews(self):
     """Return the text of the ``totalReviewCount`` span, or None.

     The value is the stripped text as produced by ``strip_html_tags``
     (no numeric conversion is done here); None is returned when the span
     is missing or the stripped text is empty.
     """
     counter = self.soup.find('span', class_='totalReviewCount')
     if not counter:
         return None
     count_text = strip_html_tags(unicode(counter))
     # Normalise any falsy result to None.
     return count_text if count_text else None
Beispiel #5
0
 def author_bio(self):
     """Return the stripped text of the ``mainContent`` div, if any.

     Gives None when the div cannot be found or when nothing is left after
     ``strip_html_tags`` has processed its markup.
     """
     main_content = self.soup.find('div', class_='mainContent')
     if main_content:
         stripped = strip_html_tags(unicode(main_content))
         if stripped:
             return stripped
     # Either no div or no text survived the stripping.
     return None
Beispiel #6
0
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script',
                             text=re.compile(ur'bookDescEncodedData',
                                             flags=re.I))
        if tag:
            match = re.search(
                ur'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                text = match.group('description')
                text = urllib.unquote(text)
                text = strip_html_tags(text)
                if text:
                    result.append(text)

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div',
                                      class_='productDescriptionWrapper'):
            text = unicode(tag)
            text = strip_html_tags(text)
            if text:
                result.append(text)

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:
Beispiel #7
0
 def text(self):
     """Return the stripped text of the ``description`` span.

     The span's markup is passed through ``strip_html_tags``.  When the page
     has no such span an empty unicode string is returned.
     """
     tag = self.soup.find('span', class_='description')
     if tag is None:
         # find() gives None when nothing matches; stringifying that would
         # hand the literal text u'None' to the caller.
         return u''
     return strip_html_tags(unicode(tag))
Beispiel #8
0
    def supplemental_text(self):
        """Collect descriptive text blobs from the known page sections.

        Each section below is looked up in ``self.soup``; its markup is run
        through ``strip_html_tags`` and the result is appended when it is
        non-empty.  Sections are visited in a fixed order and nothing is
        de-duplicated here.

        NOTE(review): an earlier comment said blobs also found in editorial
        reviews are removed; this method does not do that -- presumably a
        caller does.

        Returns:
            list: the extracted text blobs, possibly empty.
        """
        result = []

        def collect(markup):
            # Strip the markup and keep the text only when something is left.
            text = strip_html_tags(markup)
            if text:
                result.append(text)

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find('div', id='postBodyPS')
        if tag:
            collect(unicode(tag))

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            # find() gives None when there is no matching inner div; skip it
            # instead of stringifying None, which would add the literal
            # text "None" to the result.
            inner = tag.find('div', class_=None)
            if inner is not None:
                collect(unicode(inner))

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script',
                             text=re.compile(r'bookDescEncodedData',
                                             flags=re.I))
        if tag:
            match = re.search(
                r'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                # The captured value is percent-decoded before stripping.
                collect(urllib.unquote(match.group('description')))

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div',
                                      class_='productDescriptionWrapper'):
            collect(unicode(tag))

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:
            # Prefer the inner "content" div, falling back to the wrapper.
            sub_tag = tag.find('div', class_='content')
            collect(unicode(sub_tag if sub_tag else tag))

        # amazon instant video
        # http://www.amazon.com/dp/B004C0YS5C
        # older method
        tag = self.soup.find('div', class_='prod-synopsis')
        if tag:
            collect(unicode(tag))
        # newer method
        tag = self.soup.find('div', class_='dv-simple-synopsis')
        if tag:
            collect(unicode(tag))

        # http://www.amazon.com/dp/B0006FUAD6
        tag = self.soup.find('div',
                             id=re.compile('feature-bullets', flags=re.I))
        if tag:
            # All span elements are concatenated and stripped as one blob.
            collect(u''.join(unicode(span) for span in tag.find_all('span')))

        # http://www.amazon.com/dp/B00DHF39KS
        tag = self.soup.find('div', class_='aplus')
        if tag:
            collect(unicode(tag))

        return result
Beispiel #9
0
        tag = self.soup.find('div', id='bookDescription_feature_div')
        if tag:
            tag = tag.find('div', class_=None)
            text = strip_html_tags(unicode(tag))
            if text:
                result.append(text)

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find('script', text=re.compile(ur'bookDescEncodedData', flags=re.I))
        if tag:
            match = re.search(ur'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                text = match.group('description')
                text = urllib.unquote(text)
                text = strip_html_tags(text)
                if text:
                    result.append(text)

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all('div', class_='productDescriptionWrapper'):
            text = unicode(tag)
            text = strip_html_tags(text)
            if text:
                result.append(text)

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find('div', class_='mas-product-description-wrapper')
        if tag:
            sub_tag = tag.find('div', class_='content')
Beispiel #10
0
    def supplemental_text(self):
        """Collect descriptive text blobs from the known page sections.

        Each section below is looked up in ``self.soup``; its markup is run
        through ``strip_html_tags`` and the result is appended when it is
        non-empty.  Sections are visited in a fixed order and nothing is
        de-duplicated here.

        NOTE(review): an earlier comment said blobs also found in editorial
        reviews are removed; this method does not do that -- presumably a
        caller does.

        Returns:
            list: the extracted text blobs, possibly empty.
        """
        result = []

        def collect(markup):
            # Strip the markup and keep the text only when something is left.
            text = strip_html_tags(markup)
            if text:
                result.append(text)

        # kindle
        # http://www.amazon.com/dp/1593080050
        tag = self.soup.find("div", id="postBodyPS")
        if tag:
            collect(unicode(tag))

        # paperbacks
        # http://www.amazon.com/dp/1568822812
        tag = self.soup.find("div", id="bookDescription_feature_div")
        if tag:
            # find() gives None when there is no matching inner div; skip it
            # instead of stringifying None, which would add the literal
            # text "None" to the result.
            inner = tag.find("div", class_=None)
            if inner is not None:
                collect(unicode(inner))

        # extract from the javascript code that updates the iframe
        # http://www.amazon.com/dp/1491268727
        tag = self.soup.find("script", text=re.compile(r"bookDescEncodedData", flags=re.I))
        if tag:
            match = re.search(r'bookDescEncodedData\s=\s"(?P<description>[^",]+)', tag.text)
            if match:
                # The captured value is percent-decoded before stripping.
                collect(urllib.unquote(match.group("description")))

        # http://www.amazon.com/dp/1616611359
        for tag in self.soup.find_all("div", class_="productDescriptionWrapper"):
            collect(unicode(tag))

        # android apps
        # http://www.amazon.com/dp/B008A1I0SU
        tag = self.soup.find("div", class_="mas-product-description-wrapper")
        if tag:
            # Prefer the inner "content" div, falling back to the wrapper.
            sub_tag = tag.find("div", class_="content")
            collect(unicode(sub_tag if sub_tag else tag))

        # amazon instant video
        # http://www.amazon.com/dp/B004C0YS5C
        # older method
        tag = self.soup.find("div", class_="prod-synopsis")
        if tag:
            collect(unicode(tag))
        # newer method
        tag = self.soup.find("div", class_="dv-simple-synopsis")
        if tag:
            collect(unicode(tag))

        # http://www.amazon.com/dp/B0006FUAD6
        tag = self.soup.find("div", id=re.compile("feature-bullets", flags=re.I))
        if tag:
            # All span elements are concatenated and stripped as one blob.
            collect(u"".join(unicode(span) for span in tag.find_all("span")))

        # http://www.amazon.com/dp/B00DHF39KS
        tag = self.soup.find("div", class_="aplus")
        if tag:
            collect(unicode(tag))

        return result