def test_scrape_invalid_book_url_wrong_format(self):
    """
    Test that an input URL lacking the prefix "https://www.goodreads.com/book/show/"
    is rejected.
    """
    scraper = BookScraper()
    dic = scraper.scrape_book("https://www.google.com/")  # bad shape
    self.assertIsNone(dic)
def test_scrape_invalid_book_url_non_exist(self):
    """
    Test an input URL that has the prefix but points to a non-existent book.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/373a91"  # good shape but non-existent
    dic = scraper.scrape_book(book_url)
    self.assertIsNotNone(dic)
    self.assertIsNone(dic["book_id"])
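The two tests above pin down the contract: scrape_book returns None when the URL lacks the Goodreads book prefix, and a dict whose book_id is None when the URL is well-formed but the page does not exist. A minimal sketch of the prefix check (the helper name is hypothetical, not from the source):

BOOK_URL_PREFIX = "https://www.goodreads.com/book/show/"

def has_book_url_prefix(url):
    # Hypothetical guard mirroring what the tests imply scrape_book checks first.
    return isinstance(url, str) and url.startswith(BOOK_URL_PREFIX)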
Example #3
def scrape():
    args = request.args
    if 'type' in args and 'start_id' in args and 'number' in args:
        if args['type'] == 'book':
            book_scraper = BookScraper(book_data_collection)
            book_scraper.scrapeBooks(
                build_start_url(args['type'], args['start_id']),
                int(args['number']))
        elif args['type'] == 'author':
            author_scraper = AuthorScraper(author_data_collection)
            author_scraper.scrapeAuthors(
                build_start_url(args['type'], args['start_id']),
                int(args['number']))
    return "success"
def test_scrape_valid_book_url(self):
    """
    Test that the book scraper can scrape information
    from a valid book URL correctly.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/108986.Introduction_to_Algorithms"
    dic = scraper.scrape_book(book_url)
    self.assertEqual(dic["rating_value"], 4.34)
    img_url = "https://i.gr-assets.com/images/S/compressed.photo." \
              "goodreads.com/books/1387741681l/108986.jpg"
    self.assertEqual(dic["cover_url"], img_url)
    author_url = "https://www.goodreads.com/author/show/60841.Thomas_H_Cormen"
    self.assertEqual(dic["author_url"], author_url)
    similar_book_url = "https://www.goodreads.com/book/show/515601.The_C_Programming_Language"
    self.assertIn(similar_book_url, dic["similar_book_urls"])
def test_scrape_valid_book_no_isbn(self):
    """
    Given a URL of a book without an ISBN,
    the scraper should still retrieve the other information correctly.
    """
    scraper = BookScraper()
    book_url = "https://www.goodreads.com/book/show/25008661-the-rust-programming-language"
    dic = scraper.scrape_book(book_url)
    self.assertIsNotNone(dic)
    self.assertEqual(dic["rating_value"], 4.43)
    self.assertIsNone(dic["ISBN"])
    img_url = "https://i.gr-assets.com/images/S/compressed.photo." \
              "goodreads.com/books/1518920310l/25008661._SX318_.jpg"
    self.assertEqual(dic["cover_url"], img_url)
    author_url = "https://www.goodreads.com/author/show/7048888.Steve_Klabnik"
    self.assertEqual(dic["author_url"], author_url)
    similar_book_url = "https://www.goodreads.com/book/show/25550614-programming-rust"
    self.assertIn(similar_book_url, dic["similar_book_urls"])
class TestScraper(unittest.TestCase):
    def setUp(self):
        self.testDB = DataCollection(os.getenv('MONGO_CONNECTION_STRING'),
                                     "testDatabase", "testCollection")
        self.bookScraper = BookScraper(self.testDB)
        self.authorScraper = AuthorScraper(self.testDB)

    def testBookScraper(self):
        self.testDB.empty_data_collection()
        testurl = "https://www.goodreads.com/book/show/6185.Wuthering_Heights"
        self.bookScraper.scrape_one_book(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())

    def testAuthorScraper(self):
        self.testDB.empty_data_collection()
        testurl = "https://www.goodreads.com/author/show/6485178.Fredrik_Backman"
        self.authorScraper.scrape_one_author(testurl)
        self.assertEqual(1, self.testDB.get_collection_size())
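These integration tests read MONGO_CONNECTION_STRING from the environment; the usual unittest entry point (assumed here, since the snippet omits it) runs them directly:

if __name__ == "__main__":
    unittest.main()  # requires MONGO_CONNECTION_STRING to be exported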
Example #7
def scrape(data_collection_type, start_url, target_number):
    """Scrape data from goodreads starting with the starting url

    Args:
        data_collection_type (str):  Name of data collection, either 'book' or 'author'
        start_url (str): The url to start scraping from
        target_number (int): Number of books/authors to scrape
    """

    if data_collection_type == "book":
        if not re.search(r'(https://)?www\.goodreads\.com/book/show/.+',
                         start_url):
            print("Please provide a valid URL pointing to a book on Goodreads")
            sys.exit(1)
        if target_number > 200:
            print("Cannot scrape more than 200 books at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "book")
        book_scraper = BookScraper(data_collection)
        book_scraper.scrapeBooks(start_url, target_number)
    elif data_collection_type == "author":
        if not re.search(r'(https://)?www\.goodreads\.com/author/show/.+',
                         start_url):
            print(
                "Please provide a valid URL pointing to an author on Goodreads"
            )
            sys.exit(1)
        if target_number > 50:
            print("Cannot scrape more than 50 authors at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "author")
        author_scraper = AuthorScraper(data_collection)
        author_scraper.scrapeAuthors(start_url, target_number)
    else:
        print("Error: no collection named " + data_collection_type +
              ", please enter 'book' or 'author' ")
        return
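A sample invocation, assuming MONGO_CONNECTION_STRING is configured for DataCollection:

scrape("book",
       "https://www.goodreads.com/book/show/108986.Introduction_to_Algorithms",
       10)  # crawls Goodreads from the seed page until 10 books are stored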
"""Execute the scrape sub command from main"""
import pickle as pk
import os
import sys
from time import sleep
from book_scraper import BookScraper
from author_scraper import AuthorScraper
from mongo_manipulator import connect_to_mongo

SEP = "=" * 120  # log separator
book_scraper = BookScraper()  # scraper wrapper for books
author_scraper = AuthorScraper()  # scraper wrapper for authors


def save_progress(bfs_queue,
                  visited_books,
                  visited_authors,
                  progress_dir=None):
    """
    Save scraping progress to local disk as pickle files.
    """
    if progress_dir is None:
        progress_dir = "./progress/"  # fallback directory; the concrete default is an assumption
    os.makedirs(progress_dir, exist_ok=True)  # create the directory (and parents) if missing
    with open(os.path.join(progress_dir, "bfs_queue.pkl"), "wb") as file:
        pk.dump(bfs_queue, file)
    with open(os.path.join(progress_dir, "visited_books.pkl"), "wb") as file:
        pk.dump(visited_books, file)
    with open(os.path.join(progress_dir, "visited_authors.pkl"), "wb") as file:
        pk.dump(visited_authors, file)
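A matching loader is not shown in the source; under the same file-name assumptions, restoring the saved progress might look like:

def load_progress(progress_dir="./progress/"):
    """Hypothetical counterpart to save_progress: restore the pickled state."""
    with open(os.path.join(progress_dir, "bfs_queue.pkl"), "rb") as file:
        bfs_queue = pk.load(file)
    with open(os.path.join(progress_dir, "visited_books.pkl"), "rb") as file:
        visited_books = pk.load(file)
    with open(os.path.join(progress_dir, "visited_authors.pkl"), "rb") as file:
        visited_authors = pk.load(file)
    return bfs_queue, visited_books, visited_authors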
