Example #1
0
 def parse_author(self):
     """Return the review author text, or None when it cannot be read.

     Looks up ``ReviewItem.REVIEW_AUTHOR_SELECTOR`` inside ``self.item_soup``
     and returns the matched tag's text with surrounding whitespace removed.
     """
     try:
         author_tag = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_AUTHOR_SELECTOR)
         return author_tag.text.strip()
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Catching
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate.
         return None
Example #2
0
 def parse_helpful(self):
     """Return the helpful-votes text with its trailing phrase removed.

     Reads the text of the tag matched by ``ReviewItem.REVIEW_VOTES_SELECTOR``
     in ``self.item_soup``, removes the literal
     ``"people found this helpful."`` and strips surrounding whitespace.

     Returns:
         The remaining text as a ``str`` on success, or the integer ``0``
         when the tag cannot be read.  NOTE(review): the success value is
         not converted to a number, so callers receive mixed types --
         confirm whether that is intended.
     """
     try:
         vote_text = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_VOTES_SELECTOR).text
         return vote_text.replace("people found this helpful.", "").strip()
     except Exception:
         # Best-effort scrape: fall back to 0 on any lookup failure, but do
         # not swallow KeyboardInterrupt/SystemExit as a bare except would.
         return 0
Example #3
0
 def parse_description(self):
     """Return the description markup as an HTML string, or None.

     Finds the tag matched by ``self.DESCRIPTION_SELECTOR`` in ``self.soup``
     and returns its inner contents rendered with the ``"html"`` formatter,
     with surrounding whitespace stripped.
     """
     try:
         description_tag = soup_utils.find_tag(
             self.soup, self.DESCRIPTION_SELECTOR)
         inner_html = description_tag.decode_contents(formatter="html")
         return inner_html.strip()
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Exception is
         # caught instead of a bare except so interpreter-exit signals
         # (KeyboardInterrupt, SystemExit) are not suppressed.
         return None
Example #4
0
 def parse_feature_list(self):
     """Return the feature-bullets markup as an HTML string, or None.

     Finds the tag matched by ``self.FEATURE_BULLETS_SELECTOR`` in
     ``self.soup`` and returns its inner contents rendered with the
     ``"html"`` formatter, with surrounding whitespace stripped.
     """
     try:
         bullets_tag = soup_utils.find_tag(
             self.soup, self.FEATURE_BULLETS_SELECTOR)
         inner_html = bullets_tag.decode_contents(formatter="html")
         return inner_html.strip()
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Exception is
         # caught instead of a bare except so KeyboardInterrupt/SystemExit
         # still propagate.
         return None
Example #5
0
 def parse_content(self):
     """Return the review body markup as an HTML string, or None.

     Finds the tag matched by ``ReviewItem.REVIEW_CONTENT_SELECTOR`` in
     ``self.item_soup`` and returns its inner contents rendered with the
     ``"html"`` formatter, with surrounding whitespace stripped.
     """
     try:
         content_tag = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_CONTENT_SELECTOR)
         inner_html = content_tag.decode_contents(formatter="html")
         return inner_html.strip()
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Exception is
         # caught instead of a bare except so KeyboardInterrupt/SystemExit
         # still propagate.
         return None
Example #6
0
 def parse_date(self):
     """Return the review date as a ``datetime.datetime``, or None.

     Takes the text of the tag matched by ``ReviewItem.REVIEW_DATE_SELECTOR``
     in ``self.item_soup``, keeps whatever follows the last occurrence of
     ``'on'`` and parses it with the format ``'%B %d, %Y'``
     (for example ``"January 5, 2020"``).

     Returns:
         The parsed ``datetime.datetime``, or ``None`` when the tag cannot
         be read or the text does not match the expected format.
     """
     try:
         date_text = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_DATE_SELECTOR).text
         # Drop everything up to and including the last "on".
         date_text = date_text.split('on')[-1].strip()
         return datetime.datetime.strptime(date_text, '%B %d, %Y')
     except Exception:
         # Best-effort scrape: lookup or parse failures yield None. Exception
         # is caught instead of a bare except so KeyboardInterrupt/SystemExit
         # still propagate.
         return None
Example #7
0
 def parse_breadcrumbs(self):
     """Return the breadcrumb trail as a single ``" > "``-joined string.

     Reads the text of the tag matched by ``self.BREADCRUMBS_SELECTOR`` in
     ``self.soup``, splits it on the ``'›'`` separator, strips each piece
     and joins the pieces with ``" > "``.

     Returns:
         The joined string, or ``None`` when the tag cannot be read.
     """
     try:
         raw_trail = soup_utils.find_tag(
             self.soup, self.BREADCRUMBS_SELECTOR).text.strip()
         return " > ".join(part.strip() for part in raw_trail.split('›'))
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Exception is
         # caught instead of a bare except so KeyboardInterrupt/SystemExit
         # still propagate.
         return None
Example #8
0
 def parse_price(self):
     """Return the price as a float rounded to two decimals, or None.

     Reads the text of the tag matched by ``self.PRICE_SELECTOR`` in
     ``self.soup``, drops every character other than digits, ``.``, ``,``
     and ``-``, normalises the separators and converts the result with
     ``float``.

     Separator handling (``float`` itself rejects commas, so without this
     step any price containing a comma would be reported as None):

     * ``"1,299.99"`` / ``"1,299"`` / ``"1,299,999"`` -- commas are treated
       as thousands separators and removed.
     * ``"1.299,99"`` / ``"12,99"`` -- the comma is treated as the decimal
       mark; any dots before it are treated as thousands separators.

     Text without a comma is converted exactly as before.

     Returns:
         The rounded ``float``, or ``None`` when the tag cannot be read or
         the cleaned text is not a single number (e.g. a price range).
     """
     try:
         price_text = soup_utils.find_tag(self.soup,
                                          self.PRICE_SELECTOR).text.strip()
         price_text = re.sub(r'[^0-9.,\-]', '', price_text)
         if ',' in price_text:
             head, _, tail = price_text.rpartition(',')
             if '.' in tail or ',' in head or len(tail) == 3:
                 # A dot after the last comma, several commas, or exactly
                 # three trailing digits: commas are grouping separators.
                 price_text = price_text.replace(',', '')
             else:
                 # The last comma is the decimal mark; dots before it can
                 # only be grouping separators.
                 price_text = head.replace('.', '') + '.' + tail
         return round(float(price_text), 2)
     except Exception:
         # Best-effort scrape: lookup or conversion failures yield None.
         # Exception is caught instead of a bare except so
         # KeyboardInterrupt/SystemExit still propagate.
         return None
Example #9
0
    def parse(self, response):
        """Save the ASIN of every listed item, then move to the next page.

        Builds a ``BeautifulSoup`` tree from *response* using ``HTML_PARSER``,
        finds every element matching ``ITEM_LINK_SELECTOR``, extracts an ASIN
        from the ``href`` of the first ``a`` tag inside each one and passes
        it to ``self.save_asin``.  Finally hands the tree to
        ``self.process_next_page``.

        Items that contain no ``a`` tag, or whose ``a`` tag has no ``href``,
        are skipped so that a single malformed entry does not abort the
        page or prevent pagination.

        Args:
            response: Markup accepted by ``BeautifulSoup``.
        """
        soup = BeautifulSoup(response, HTML_PARSER)
        for item in soup_utils.find_tags(soup, ITEM_LINK_SELECTOR):
            item_link = soup_utils.find_tag(item, 'a')
            if item_link is None:
                # No anchor inside this item: nothing to extract.
                continue
            try:
                href = item_link['href']
            except KeyError:
                # Anchor without an href attribute.
                continue
            self.save_asin(extract_asin_from_url(href))

        self.process_next_page(soup)
Example #10
0
 def parse_rating(self):
     """Return the star rating as an int, or 0 when it cannot be read.

     Reads the text of the tag matched by ``ReviewItem.REVIEW_RATING_SELECTOR``
     in ``self.item_soup``, removes the literal ``"out of 5 stars"``, strips
     whitespace and converts the remainder via ``int(float(...))``, which
     truncates any fractional part.
     """
     try:
         rating_tag = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_RATING_SELECTOR)
         stars_text = rating_tag.text.replace("out of 5 stars", "").strip()
         return int(float(stars_text))
     except Exception:
         # Best-effort scrape: lookup or conversion failures yield 0.
         # Exception is caught instead of a bare except so
         # KeyboardInterrupt/SystemExit still propagate.
         return 0
Example #11
0
    def process_next_page(self, soup):
        """Follow the next-page link found in *soup*, within the page limit.

        Increments ``self.page_processed`` first and stops once it exceeds
        ``self.max_pages``.  Otherwise looks up ``NEXT_PAGE_SELECTOR`` in
        *soup*; when a tag is found, its ``href`` is combined with the review
        URL for ``self.asin`` / ``self.country`` via ``soup_utils.format_url``
        and handed to ``self.process``.

        NOTE(review): because the counter is compared with ``>`` after being
        incremented, the number of pages fetched depends on the initial value
        of ``page_processed`` -- confirm the limit is not off by one.
        """
        self.page_processed += 1
        if self.page_processed > self.max_pages:
            return

        next_link = soup_utils.find_tag(soup, NEXT_PAGE_SELECTOR)
        if next_link is None:
            # No pagination link on this page: nothing further to fetch.
            return

        href = next_link['href']
        base_url = get_review_url(self.asin, self.country)
        self.process(soup_utils.format_url(href, base_url))
Example #12
0
 def process_next_page(self, soup):
     """Follow the next-page link found in *soup*, if there is one.

     Looks up ``NEXT_PAGE_SELECTOR`` in *soup*; when a tag is found, its
     ``href`` is combined with ``self.url`` via ``soup_utils.format_url``
     and the result is handed to ``self.process``.
     """
     next_link = soup_utils.find_tag(soup, NEXT_PAGE_SELECTOR)
     if next_link is None:
         # No pagination link on this page: nothing further to fetch.
         return

     target_url = soup_utils.format_url(next_link['href'], self.url)
     self.process(target_url)
Example #13
0
 def parse_title(self):
     """Return the review title text, or None when it cannot be read.

     Looks up ``ReviewItem.REVIEW_TITLE_SELECTOR`` inside ``self.item_soup``
     and returns the matched tag's text with surrounding whitespace removed.
     """
     try:
         title_tag = soup_utils.find_tag(
             self.item_soup, ReviewItem.REVIEW_TITLE_SELECTOR)
         return title_tag.text.strip()
     except Exception:
         # Best-effort scrape: any lookup failure yields None. Exception is
         # caught instead of a bare except so KeyboardInterrupt/SystemExit
         # still propagate.
         return None
Example #14
0
 def parse_brand(self):
     """Return the stripped text of the tag matched by ``self.BRAND_SELECTOR``.

     The lookup is performed against ``self.soup``.  No error handling is
     done here: if the lookup result has no ``text`` attribute, the
     resulting exception propagates to the caller.
     """
     brand_tag = soup_utils.find_tag(self.soup, self.BRAND_SELECTOR)
     brand_text = brand_tag.text
     return brand_text.strip()
Example #15
0
 def parse_name(self):
     """Return the stripped text of the tag matched by ``self.TITLE_SELECTOR``.

     The lookup is performed against ``self.soup``.  No error handling is
     done here: if the lookup result has no ``text`` attribute, the
     resulting exception propagates to the caller.
     """
     title_tag = soup_utils.find_tag(self.soup, self.TITLE_SELECTOR)
     title_text = title_tag.text
     return title_text.strip()