def collect(self):
    """Connect to the home page and scrape its top-level information.

    Fetches ``self.site_url`` with ``FileIO.connect_with_bs4`` and caches the
    parsed document in ``self._soup``, then populates ``num_books``, ``links``
    and ``categories`` from it via the private ``__scrap_*`` helpers.
    NOTE(review): the helpers are defined elsewhere in this class — exact
    return shapes not visible here; presumably a count, link list and
    category list. Verify against their definitions.
    """
    # Must run first: the __scrap_* helpers read self._soup.
    self._soup = FileIO.connect_with_bs4(self.site_url)
    self.num_books = self.__scrap_num_books()
    self.links = self.__scrap_links()
    self.categories = self.__scrap_categories()
def collect(self):
    """Connect to the category page and scrape its information.

    Fetches ``self.category_url`` with ``FileIO.connect_with_bs4`` and caches
    the parsed document in ``self._soup``, then populates ``name``,
    ``num_books``, ``links`` and ``books`` via the private ``__scrap_*``
    helpers. NOTE(review): helper return shapes are not visible from here;
    confirm against their definitions.
    """
    # Must run first: the __scrap_* helpers read self._soup.
    self._soup = FileIO.connect_with_bs4(self.category_url)
    self.name = self.__scrap_name()
    self.num_books = self.__scrap_num_books()
    self.links = self.__scrap_links()
    self.books = self.__scrap_books()
def collect(self):
    """Connect to the product page and scrape every product field.

    Fetches ``self.product_page_url`` with ``FileIO.connect_with_bs4`` and
    caches the parsed document in ``self._soup``, then populates all product
    attributes (UPC, title, prices, availability, description, category,
    rating, image URL) via the private ``__scrap_*`` helpers.
    NOTE(review): helper return types (str vs numeric for prices/counts)
    are not visible from here; confirm against their definitions.
    """
    # Must run first: the __scrap_* helpers read self._soup.
    self._soup = FileIO.connect_with_bs4(self.product_page_url)
    self.universal_product_code = self.__scrap_upc()
    self.title = self.__scrap_title()
    self.price_including_tax = self.__scrap_price_inc_tax()
    self.price_excluding_tax = self.__scrap_price_exc_tax()
    self.number_available = self.__scrap_number_available()
    self.product_description = self.__scrap_product_description()
    self.category = self.__scrap_category()
    self.review_rating = self.__scrap_review_rating()
    self.image_url = self.__scrap_image_url()
def __scrap_links(self):
    """Collect an ``(absolute_url, title)`` pair for every book in the category.

    Starts from the already-fetched first page in ``self._soup`` and follows
    the ``page-N.html`` pagination scheme until at least ``self.num_books``
    anchors have been gathered.

    Returns:
        list[tuple[str, str]]: absolute book URL and book title, one pair
        per ``<a title=...>`` anchor found under a ``<section>``.

    Raises:
        Exception: if the book links cannot be scraped; the original
        failure is chained as ``__cause__``.
    """
    def get_links(soup):
        # Book anchors are the only <a> tags carrying a title attribute.
        return soup.select('section a[title]')

    try:
        links = get_links(self._soup)
        page = 2
        while len(links) < self.num_books:
            page_url = urljoin(self.category_url, 'page-{}.html'.format(page))
            page_soup = FileIO.connect_with_bs4(page_url)
            found = get_links(page_soup)
            if not found:
                # Guard: a link-less page would otherwise loop forever.
                break
            links.extend(found)
            page += 1
        return [(urljoin(self.category_url, a.attrs['href']), a.attrs['title'])
                for a in links]
    except Exception as err:
        # BUG FIX: the original formatted self.product_page_url, which does
        # not exist on this (category) object — the error path itself raised
        # AttributeError and masked the real failure. Use category_url and
        # chain the original exception for diagnosis.
        raise Exception(
            f"Can't find the Book links ::\n{self.category_url}") from err
def test_connect_with_bs4_ERROR():
    """connect_with_bs4 must raise when the host cannot be reached."""
    unreachable_url = 'http://www.xxxfakexxx.xxx'
    with pytest.raises(Exception):
        FileIO.connect_with_bs4(unreachable_url)
def test_connect_with_bs4_TYPE():
    """connect_with_bs4 must return a parsed BeautifulSoup document."""
    url = 'http://books.toscrape.com'
    # isinstance instead of `type(x) == T`: idiomatic, and still passes
    # if the helper ever returns a BeautifulSoup subclass.
    assert isinstance(FileIO.connect_with_bs4(url), BeautifulSoup)