import logging
import time

import requests

from pages.all_books_page import AllBooksPage

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
    datefmt='%d-%m-%Y %H:%M:%S',
    level=logging.INFO,
    filename='logs.txt',
    filemode='w')
logger = logging.getLogger('scraping')

print('Loading books list...')
logger.info('Loading books list...')

book_website = 'http://books.toscrape.com'  # main website link
logger.info(f'Requesting `{book_website}`')
page_content = requests.get(book_website).content
logger.debug('Creating AllBooksPage from page content.')
page = AllBooksPage(page_content)

books_ = []
start_time = time.time()  # start of task, used to measure total task time
logger.info(f'Going through all `{page.page_count}` pages of books...')
for page_num in range(page.page_count):
    page_start = time.time()
    url = f'http://books.toscrape.com/catalogue/page-{page_num + 1}.html'
    logger.info(f'Requesting `{url}`')
    page_content = requests.get(url).content
    logger.debug('Creating AllBooksPage from page content.')
    page = AllBooksPage(page_content)
    print(f'`{url}` took `{time.time() - page_start}` seconds.')  # time taken by each url
    books_.extend(page.books)

print(f'All pages took `{time.time() - start_time}` seconds.')  # report the total; start_time was otherwise unused

import asyncio
import logging
import time

import aiohttp
import async_timeout
import requests

from pages.all_books_page import AllBooksPage

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
    datefmt='%d-%m-%Y %H:%M:%S',
    level=logging.DEBUG,
    filename='logs.txt')
logger = logging.getLogger('scraping')

logger.info('Loading books list...')
page_content = requests.get('http://books.toscrape.com').content
page = AllBooksPage(page_content)  # builds the page object
loop = asyncio.get_event_loop()
books = page.books  # returns the books via the `books` property of pages.all_books_page


async def fetch_page(session, url):
    page_start = time.time()
    # async with aiohttp.ClientSession() as session:
    async with async_timeout.timeout(10):  # terminate the request if it takes too long
        async with session.get(url) as response:
            print(f'page loaded in {time.time() - page_start}')
            # return response.status
            return await response.text()
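
The `fetch_page` coroutine above is defined but never awaited. A minimal sketch of a runner for it, assuming the site's 50 catalogue pages and that `AllBooksPage` accepts an HTML string; `fetch_all_pages` and `page_urls` are names introduced here, not the original author's:

async def fetch_all_pages(urls):
    # Share one session across all requests and fetch the pages concurrently.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_page(session, url) for url in urls))


page_urls = [f'http://books.toscrape.com/catalogue/page-{n}.html' for n in range(1, 51)]
contents = loop.run_until_complete(fetch_all_pages(page_urls))
all_books = [book for content in contents for book in AllBooksPage(content).books]
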
import requests

from pages.all_books_page import AllBooksPage

page_content = requests.get('http://books.toscrape.com').content
books = AllBooksPage(page_content).books
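
Every snippet here imports `AllBooksPage` from a `pages/all_books_page.py` module that isn't shown. A minimal, self-contained sketch of what it might look like, with CSS selectors guessed from books.toscrape.com's markup (the real module presumably keeps its selectors in `locators/all_books_page.py`); note that some snippets expose the pager as `page_count` and one as `page_num`:

import re

from bs4 import BeautifulSoup


class AllBooksPage:
    def __init__(self, page_content):
        self.soup = BeautifulSoup(page_content, 'html.parser')

    @property
    def books(self):
        # Each book is an <article class="product_pod">; the full title
        # lives in the `title` attribute of the <h3><a> link.
        return [article.h3.a['title']
                for article in self.soup.select('article.product_pod')]

    @property
    def page_count(self):
        # The pager text reads e.g. "Page 1 of 50".
        pager = self.soup.select_one('li.current').string
        return int(re.search(r'of (\d+)', pager).group(1))
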
import logging

import requests

from pages.all_books_page import AllBooksPage

logger = logging.getLogger('scraping')
# Assumed URL template; the original BOOKS_URL constant isn't shown.
BOOKS_URL = 'http://books.toscrape.com/catalogue/page-{}.html'


def scrape_books_page(page_number=1) -> AllBooksPage:
    logger.info(f'Scraping book page {page_number}')
    url = BOOKS_URL.format(page_number)
    page_content = requests.get(url).text
    page = AllBooksPage(page_content)
    return page
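
A possible caller for `scrape_books_page`, assuming the site's 50 catalogue pages and the `books` property used by the other snippets:

books = []
for page_number in range(1, 51):
    books.extend(scrape_books_page(page_number).books)
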
import requests

from pages.all_books_page import AllBooksPage

book_content = requests.get('http://books.toscrape.com/').content
book_page = AllBooksPage(book_content)  # a page object, not a single tag
for book in book_page.books:
    print(book)

import logging

import requests

from pages.all_books_page import AllBooksPage

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
    datefmt='%d-%m-%Y %H:%M:%S',
    level=logging.DEBUG,
    filename='logs.txt')
logger = logging.getLogger('scraping')

logger.info('Loading book lists...')
page_content = requests.get('http://books.toscrape.com').content
page = AllBooksPage(page_content)
books = page.books

logger.info('Adding books from pages 2 to 50 to the list')
for page_number in range(1, page.page_num):  # renamed loop variable so it no longer shadows `page`
    url = f'http://books.toscrape.com/catalogue/page-{page_number + 1}.html'
    page_content = requests.get(url).content
    new_page = AllBooksPage(page_content)
    books.extend(new_page.books)