def test_scrape_invalid_book_url_wrong_format(self):
    """
    Test that a url without the "https://www.goodreads.com/book/show/"
    prefix is rejected: scrape_book should return None.
    """
    scraper = BookScraper()
    # Bad shape: not a goodreads book url at all.
    dic = scraper.scrape_book("https://www.google.com/")
    # assertIsNone reports the actual value on failure, unlike
    # assertTrue(dic is None) which only reports False.
    self.assertIsNone(dic)
def test_scrape_invalid_book_url_non_exist(self):
    """
    Test a url with the correct prefix that points to a book that does
    not actually exist: a dict is returned but its book_id is None.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/373a91"  # good shape but non-exist
    dic = scraper.scrape_book(book_url)
    # Dedicated assertions give clearer failure output than
    # assertTrue(... is None) / assertTrue(... is not None).
    self.assertIsNotNone(dic)
    self.assertIsNone(dic["book_id"])
def scrape():
    """Flask endpoint: scrape books or authors starting from a given id.

    Query parameters:
        type: either 'book' or 'author'
        start_id: id used to build the starting goodreads url
        number: how many books/authors to scrape (integer)

    Returns:
        A plain-text status string. Previously the endpoint answered
        "success" even when parameters were missing or 'type' was
        unknown, and raised an unhandled ValueError on a non-integer
        'number'; those cases now return explicit error messages.
    """
    args = request.args
    # Validate presence of every required parameter up front.
    if not all(key in args for key in ('type', 'start_id', 'number')):
        return "error: missing parameter(s); need 'type', 'start_id' and 'number'"
    try:
        target_number = int(args['number'])
    except ValueError:
        return "error: 'number' must be an integer"
    if args['type'] == 'book':
        book_scraper = BookScraper(book_data_collection)
        book_scraper.scrapeBooks(
            build_start_url(args['type'], args['start_id']), target_number)
    elif args['type'] == 'author':
        author_scraper = AuthorScraper(author_data_collection)
        author_scraper.scrapeAuthors(
            build_start_url(args['type'], args['start_id']), target_number)
    else:
        return "error: 'type' must be 'book' or 'author'"
    return "success"
def test_scrape_valid_book_url(self):
    """
    Test that the book scraper extracts rating, cover url, author url
    and similar books correctly from a valid book url.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/108986.Introduction_to_Algorithms"
    dic = scraper.scrape_book(book_url)
    self.assertEqual(dic["rating_value"], 4.34)
    img_url = "https://i.gr-assets.com/images/S/compressed.photo." \
              "goodreads.com/books/1387741681l/108986.jpg"
    self.assertEqual(dic["cover_url"], img_url)
    author_url = "https://www.goodreads.com/author/show/60841.Thomas_H_Cormen"
    self.assertEqual(dic["author_url"], author_url)
    similar_book_url = "https://www.goodreads.com/book/show/515601.The_C_Programming_Language"
    # assertIn reports the container on failure, unlike assertTrue(x in y).
    self.assertIn(similar_book_url, dic["similar_book_urls"])
def test_scrape_valid_book_no_isbn(self):
    """
    Given a url of a book without an ISBN, the scraper should still
    retrieve the remaining fields correctly and leave ISBN as None.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/25008661-the-rust-programming-language"
    dic = scraper.scrape_book(book_url)
    # Dedicated assertions give clearer failure output than assertTrue.
    self.assertIsNotNone(dic)
    self.assertEqual(dic["rating_value"], 4.43)
    self.assertIsNone(dic["ISBN"])
    img_url = "https://i.gr-assets.com/images/S/compressed.photo." \
              "goodreads.com/books/1518920310l/25008661._SX318_.jpg"
    self.assertEqual(dic["cover_url"], img_url)
    author_url = "https://www.goodreads.com/author/show/7048888.Steve_Klabnik"
    self.assertEqual(dic["author_url"], author_url)
    similar_book_url = "https://www.goodreads.com/book/show/25550614-programming-rust"
    self.assertIn(similar_book_url, dic["similar_book_urls"])
class TestScraper(unittest.TestCase):
    """Integration tests: each scraper should persist exactly one document
    into a freshly emptied test collection."""

    def setUp(self):
        """Create the Mongo-backed test collection and both scrapers."""
        self.testDB = DataCollection(os.getenv('MONGO_CONNECTION_STRING'),
                                     "testDatabase", "testCollection")
        self.bookScraper = BookScraper(self.testDB)
        # Fixed attribute-name typo: was 'authroScraper'.
        self.authorScraper = AuthorScraper(self.testDB)

    def testBookScraper(self):
        """Scraping one book url stores exactly one document."""
        self.testDB.empty_data_collection()
        testurl = "https://www.goodreads.com/book/show/6185.Wuthering_Heights"
        self.bookScraper.scrape_one_book(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())

    def testAuthorScraper(self):
        """Scraping one author url stores exactly one document."""
        self.testDB.empty_data_collection()
        testurl = "https://www.goodreads.com/author/show/6485178.Fredrik_Backman"
        self.authorScraper.scrape_one_author(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())
def scrape(data_collection_type, start_url, target_number):
    """Scrape data from goodreads starting with the starting url.

    Args:
        data_collection_type (str): Name of data collection, either 'book' or 'author'
        start_url (str): The url to start scraping from
        target_number (int): Number of books/authors to scrape

    Exits the process with status 1 on an invalid url or an excessive
    target_number; prints an error and returns None for an unknown
    collection type.
    """
    if data_collection_type == "book":
        # Bug fix: '([https://]?)' was a character class (one optional char
        # from {h,t,p,s,:,/}), not an optional scheme, and the unescaped
        # dots matched any character. Anchor and escape properly.
        if not re.search(r'^(https://)?www\.goodreads\.com/book/show/.+', start_url):
            print("Please provide a valid url pointing to a book in goodReads")
            sys.exit(1)
        if target_number > 200:
            print("Cannot scrape more than 200 books at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", "book")
        book_scraper = BookScraper(data_collection)
        book_scraper.scrapeBooks(start_url, target_number)
    elif data_collection_type == "author":
        if not re.search(r'^(https://)?www\.goodreads\.com/author/show/.+', start_url):
            print(
                "Please provide a valid url pointing to an author in goodReads"
            )
            sys.exit(1)
        if target_number > 50:
            print("Cannot scrape more than 50 authors at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads", "author")
        author_scraper = AuthorScraper(data_collection)
        author_scraper.scrapeAuthors(start_url, target_number)
    else:
        print("Error: no collection named " + data_collection_type +
              ", please enter 'book' or 'author' ")
        return
"""Execute the scrape sub command from main""" import pickle as pk import os import sys from time import sleep from book_scraper import BookScraper from author_scraper import AuthorScraper from mongo_manipulator import connect_to_mongo SEP = "=" * 120 # log separator book_scraper = BookScraper() # scraper wrapper for books author_scraper = AuthorScraper() # scraper wrapper for authors def save_progress(bfs_queue, visited_books, visited_authors, progress_dir=None): """ Save scraping progress to local. """ if not os.path.isdir(progress_dir): os.mkdir(progress_dir) with open(progress_dir + "bfs_queue.pkl", "wb+") as file: pk.dump(bfs_queue, file) with open(progress_dir + "visited_books.pkl", "wb+") as file: pk.dump(visited_books, file) with open(progress_dir + "visited_authors.pkl", "wb+") as file: pk.dump(visited_authors, file)
def setUp(self):
    """Build the shared fixture: one Mongo-backed test collection and a
    scraper of each kind pointing at it."""
    connection_string = os.getenv('MONGO_CONNECTION_STRING')
    collection = DataCollection(connection_string,
                                "testDatabase", "testCollection")
    self.testDB = collection
    self.bookScraper = BookScraper(collection)
    # NOTE(review): attribute keeps the original 'authro' typo so sibling
    # test methods that reference self.authroScraper keep working.
    self.authroScraper = AuthorScraper(collection)