Beispiel #1
0
def scrape():
    """Run a book or author scrape driven by the request's query parameters.

    Reads ``type``, ``start_id`` and ``number`` from ``request.args``. The
    start URL is built with ``build_start_url(type, start_id)`` and handed,
    together with ``number`` converted to ``int``, to either
    ``BookScraper.scrapeBooks`` or ``AuthorScraper.scrapeAuthors``.

    Returns:
        ``"success"`` once a scrape has actually been run. When a parameter
        is missing, ``type`` is not ``'book'``/``'author'``, or ``number``
        is not an integer, a ``(message, 400)`` tuple is returned instead of
        falsely reporting success.

    NOTE(review): the ``(body, status)`` return form assumes this is a Flask
    view function (the route decorator is not visible here) -- confirm.
    """
    args = request.args

    missing = [
        name for name in ('type', 'start_id', 'number') if name not in args
    ]
    if missing:
        return "missing query parameter(s): " + ", ".join(missing), 400

    kind = args['type']
    if kind not in ('book', 'author'):
        return "type must be 'book' or 'author'", 400

    # Parse up front so a malformed value yields a client error rather than
    # an unhandled ValueError.
    try:
        count = int(args['number'])
    except ValueError:
        return "number must be an integer", 400

    start_url = build_start_url(kind, args['start_id'])
    if kind == 'book':
        BookScraper(book_data_collection).scrapeBooks(start_url, count)
    else:
        AuthorScraper(author_data_collection).scrapeAuthors(start_url, count)
    return "success"
Beispiel #2
0
def scrape(data_collection_type, start_url, target_number):
    """Scrape data from goodreads starting with the starting url.

    The start url must begin with ``www.goodreads.com/<kind>/show/`` followed
    by at least one character, optionally prefixed with ``http://`` or
    ``https://``. On an invalid url or an over-limit ``target_number`` a
    message is printed and the process exits with status 1. An unknown
    ``data_collection_type`` only prints an error and returns.

    Args:
        data_collection_type (str):  Name of data collection, either 'book' or 'author'
        start_url (str): The url to start scraping from
        target_number (int): Number of books/authors to scrape
            (at most 200 books or 50 authors per call)
    """

    if data_collection_type == "book":
        # Anchored match with escaped dots: the url has to *start* with the
        # goodreads host, not merely contain it somewhere (e.g. in a query
        # string of another site).
        if not re.match(r'(?:https?://)?www\.goodreads\.com/book/show/.+',
                        start_url):
            print("Please provide a valid url pointing to a book in goodReads")
            sys.exit(1)
        if target_number > 200:
            print("Cannot scrape more than 200 books at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "book")
        book_scraper = BookScraper(data_collection)
        book_scraper.scrapeBooks(start_url, target_number)
    elif data_collection_type == "author":
        if not re.match(r'(?:https?://)?www\.goodreads\.com/author/show/.+',
                        start_url):
            print(
                "Please provide a valid url pointing to an author in goodReads"
            )
            sys.exit(1)
        if target_number > 50:
            print("Cannot scrape more than 50 authors at once")
            sys.exit(1)
        data_collection = DataCollection(MONGO_CONNECTION_STRING, "goodReads",
                                         "author")
        author_scraper = AuthorScraper(data_collection)
        author_scraper.scrapeAuthors(start_url, target_number)
    else:
        print("Error: no collection named " + data_collection_type +
              ", please enter 'book' or 'author' ")
        return