class Books:
    """Scrapes books ids from GoodReads lists/shelves and writes them to files.

    Relies on external managers: Browser (page navigation) and Writer (file
    output), plus helpers read_books, get_digits and the id_from_url regex.
    """

    def __init__(self, path=None, arabic=True):
        # When arabic is True, books are kept regardless of ratings count
        self.arabic = arabic
        # Browsing and writing managers
        self.br = Browser()
        self.wr = Writer(path) if path else Writer()
        # An array for scraped books ids
        self._books_ids = []

    def append_books(self, books_ids):
        """Append an external books ids array to the local array, skipping duplicates."""
        for book_id in books_ids:
            # Only append id if it's not stored already
            if book_id not in self._books_ids:
                self._books_ids.append(book_id)

    def output_books(self, keyword=None, browse="list", file_name="books"):
        """Scrape books and write them to a file (browse is: list, lists, author or shelf)."""
        self.wr.open(file_name, "w+")
        # Get books if keyword is provided, otherwise output stored books
        books_ids = self.get_books(keyword, browse) if keyword else self._books_ids
        # Loop through book ids and write them
        for book_id in books_ids:
            self.wr.write(book_id)
        self.wr.close()

    def output_books_editions(self, books_ids=None, file_name="editions"):
        """Write the editions id of each book; returns True on full success.

        Returns the (falsy) result of Writer.close() early when a page
        refuses to load, so callers can retry.
        """
        # Resume after lines already written in a previous run
        skip = len(read_books(file_name))
        self.wr.open(file_name, "a+")
        # FIX: original `books_ids[skip:] or self._books_ids[skip:]` raised
        # TypeError when books_ids was None (its default value) because
        # slicing binds tighter than `or`; fall back BEFORE slicing.
        for book_id in (books_ids or self._books_ids)[skip:]:
            editions_id = self.get_book_editions_id(book_id)
            # Editions id is None when page refuses to load
            if editions_id is None:
                return self.wr.close()
            # Write editions id to file if it loads correctly ("-"*7 marks none)
            self.wr.write(editions_id or "-" * 7)
            # Display book id and editions id
            print(f"Book ID:\t{book_id:<15}Book Editions ID:\t{editions_id or ''}")
        self.wr.close()
        return True

    def output_books_edition_by_language(self, editions_ids, lang="Arabic", file_name="ara_books"):
        """Write books ids for each editions id in the given language; True on success."""
        # Resume after lines already written in a previous run
        skip = len(read_books(file_name))
        self.wr.open(file_name, "a+")
        # Loop through editions ids and write their books ids
        for editions_id in editions_ids[skip:]:
            # Non-numeric entries (placeholders) resolve to '' and are written as "-"*7
            books_ids = (
                self.get_book_edition_by_language(editions_id, lang)
                if editions_id.isdigit() else ''
            )
            # Books ids is None when page refuses to load
            if books_ids is None:
                return self.wr.close()
            # Write books ids to file if it loads correctly
            self.wr.write(books_ids or "-" * 7)
            # Display editions id and books ids
            print(f"Book Editions ID:\t{editions_id:<15}Books IDs:\t{books_ids or ''}")
        self.wr.close()
        # Open a new file to move done list to it
        self.wr.open(file_name + "_list")
        # Loop through previously scraped editions ids
        for line in read_books(file_name):
            # Skip the "no editions" placeholder lines
            if line != "-" * 7:
                # Write each book edition id in a separate line
                for id_ in line.split(','):
                    self.wr.write(id_)
        self.wr.close()
        return True

    def get_books(self, keyword, browse="list"):
        """Main method to scrape books ids; returns the accumulated ids array."""
        # Get lists in search list if searching
        if browse == "lists":
            keywords = self._get_lists(keyword.replace(' ', '+'))
            browse = "list"
        # Otherwise, it's a single "list" or "shelf" (or an array of them)
        else:
            keywords = [
                str(key)
                for key in (keyword if isinstance(keyword, list) else [keyword])
            ]
        try:
            # Loop through all lists
            for keyword in keywords:
                # Open each list url
                self.br.open_page(keyword, browse)
                # Scrape pages until there's no next page
                while True:
                    self._scrape_list("book", self._books_ids)
                    if not self.br.goto_next_page():
                        break
        except Exception as e:
            print("Couldn't go to next page:", e)
        finally:
            # NOTE: return in finally deliberately swallows any in-flight
            # exception so partial results are still returned
            return self._books_ids

    def get_book_editions_id(self, book_id):
        """Open a book page and return its editions id (None if page fails)."""
        self.br.open("/book/show/", book_id)
        return self.br.editions_id()

    def get_book_edition_by_language(self, editions_id, lang):
        """Return a comma-joined string of edition ids matching `lang`.

        Only editions with more than 50 raters are kept. Returns None when
        the editions page refuses to load.
        """
        self.br.open_book_editions(editions_id)
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="workEditions")
        if not soup:
            return None
        editions = []
        for details in soup.find_all(class_="editionData"):
            # The 3rd- and 2nd-to-last data rows hold language and rating
            language, rating = [
                row.find(class_="dataValue")
                for row in details.find_all(class_="dataRow")[-3:-1]
            ]
            if language.text.strip() == lang:
                reviewers = get_digits(rating.find("span").text)
                # Keep only editions with enough reviewers
                if reviewers > 50:
                    editions.append(
                        id_from_url.match(details.find(class_="bookTitle")["href"]).group(1)
                    )
        return ','.join(editions)

    def _get_lists(self, keyword):
        """Scrape and return lists ids from GoodReads' lists search results."""
        lists = []
        # Open GoodReads' lists search url
        self.br.open_list_search(keyword)
        # Scrape all result pages
        while True:
            self._scrape_list("list", lists)
            # Go to next page if there's one, otherwise break
            if not self.br.goto_next_page():
                break
        return lists

    def _scrape_list(self, title, array):
        """Scrape a single search results page of `title` rows ("book"/"list") into array."""
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="tableList")
        if not soup:
            return None
        for book in soup.find_all("tr"):
            # Non-Arabic mode requires more than 1000 ratings to keep a book
            if self.arabic or get_digits(book.find(class_="minirating").text.split("—")[1]) > 1000:
                try:
                    # Get id from url
                    id_ = id_from_url.match(book.find(class_=title + "Title")["href"]).group(1)
                except Exception:
                    print("Couldn't extract Book Id from URL")
                    continue
                # Extract and store unique id from link
                if id_ not in array:
                    array.append(id_)
                    print(f"{title.capitalize()} {id_:<10}count:\t{len(array)}")
class Reviews:
    """Scrapes books' reviews from GoodReads and writes each book to its own file.

    Relies on external managers: Browser (page navigation), Writer (file
    output), SafeThread (worker threads) and the langdetect `detect` helper.
    """

    def __init__(self, path=None, lang="ar", edition_reviews=False):
        # Language of reviews to be scraped
        self._lang = lang
        # Instantiate browsing and writing managers
        self.wr = Writer(path) if path else Writer()
        self.br = Browser(edition_reviews)
        # Initialize an empty threads list
        self._threads = []
        # Counter for reviews from different languages (set per book)
        self._invalid = None

    def start(self):
        """Start the underlying browser."""
        self.br.start()

    def output_books_reviews(self, books_ids, consider_previous=True):
        """Scrape and write books' reviews to separate files."""
        if consider_previous:
            # Don't loop through already scraped books
            self.wr.consider_written_files(books_ids)
        # Show how many books are going to be scraped
        print(f"Scraping {len(books_ids)} Books")
        # Loop through book ids in array and scrape books
        for book_id in books_ids:
            self.output_book_reviews(book_id)

    def output_book_reviews(self, book_id):
        """Scrape and write one book's reviews to a file."""
        self._threads.clear()
        # Open book file and page by its Id
        self.br.open_book_page(book_id)
        self.wr.open_book_file(book_id)
        # Reset invalid reviews counter
        self._invalid = 0
        # Scrape book meta data in first line
        self.run(self._scrape_book_meta, [book_id])
        # Scrape first page of the book anyway
        self.run(self._scrape_book_reviews)
        no_next_page = False
        try:
            # Keep scraping until 60 consecutive strikes accumulate
            while self._invalid < 60:
                # Go to next page if there's one
                in_next_page = self.br.goto_next_page()
                if no_next_page or not in_next_page:
                    no_next_page = False
                    # Switch to a different reviews mode
                    if not self.br.switch_reviews_mode(book_id, in_next_page is None):
                        # Break after switching to all modes
                        break
                # Wait until requested book reviews are loaded
                if self.br.are_reviews_loaded():
                    # Scrape loaded book reviews
                    self.run(self._scrape_book_reviews)
                else:
                    no_next_page = True
        finally:
            # Wait until all threads are done (plain loop, not a
            # side-effect list comprehension)
            for thread in self._threads:
                thread.join()
            # Finalize file name and close it
            self.wr.close_book_file()

    def _scrape_book_meta(self, html, book_id):
        """Scrape and write book and author meta data from a book page.

        Raises FileNotFoundError (after closing the book file) when the
        book page has no meta section.
        """
        # Create soup object and store book meta section of the page in it
        soup = BeautifulSoup(html, "lxml").find(id="metacol")
        # If book is not found
        if not soup:
            print(f"*Book ID:\t{book_id:<15}Not Found!")
            # Close file and raise an error
            self.wr.close_book_file()
            raise FileNotFoundError
        # Get book title and remove spaces from it
        title = soup.find(id="bookTitle").get_text(". ", strip=True)
        # Get average rating of the book out of five
        rating = soup.find(class_="average").get_text()
        # Store author data section
        author = soup.find(class_="authorName")
        # Get author id from url
        id_ = author.get("href")[38:].split(".")[0]
        # Get author name
        name = author.find().get_text()
        # Write scraped meta data to file's first line
        self.wr.write_book_meta(book_id, title, rating, id_, name)
        # Display book id and title
        print(f"*Book ID:\t{book_id:<15}Title:\t{title}")

    def _scrape_book_reviews(self, html):
        """Scrape a single page's reviews, writing valid ones to the file."""
        # Store reviews section of the page in soup
        soup = BeautifulSoup(html, "lxml").find(id="bookReviews")
        # Loop through reviews individually
        for review in soup.find_all(class_="review"):
            try:
                # Get user / reviewer id
                user_id = review.find(class_="user").get("href")[11:].split("-")[0]
                # Get rating out of five stars
                stars = len(review.find(class_="staticStars").find_all(class_="p10"))
                # Get full review text even the hidden parts, and remove spaces and newlines
                comment = review.find(class_="readable").find_all("span")[-1].get_text(". ", strip=True)
                # Detect which language the review is in
                if detect(comment) != self._lang:
                    # Count it as a different language review (one strike)
                    self._invalid += 1
                    continue
                # Get review date
                date = review.find(class_="reviewDate").get_text()
            # Skip the rest if one of the above is missing
            except Exception:
                # Count a malformed review as a double strike
                self._invalid += 2
                continue
            # If it's not a strike, reset the counter
            self._invalid = 0
            # Get review ID
            review_id = review.get("id")[7:]
            # Write the scraped review to the file
            self.wr.write_review(review_id, user_id, date, stars, comment)
            # Add review id to ids
            print(f"Added ID:\t{review_id}")
        return True

    def run(self, method, args=None):
        """Start `method` on a new thread, prepending the current page source to args."""
        # FIX: mutable default argument `args=[]` replaced with a None sentinel
        args = [] if args is None else args
        # Create a thread and add it to threads list then start it
        self._threads.append(SafeThread(target=method, args=[self.br.page_source] + args))
        self._threads[-1].start()

    def reset(self):
        """Stop then restart the scraper, discarding the current file."""
        self.stop()
        self.start()
        print("Restarted Reviews")

    def stop(self):
        """Close the browser and delete the in-progress file."""
        self.br.close()
        self.wr.delete_file()

    def close(self):
        """Shut down the browser and writer and clear worker threads."""
        self.br.quit()
        self.wr.close()
        self._threads.clear()
        print("Closed Reviews")