def list_books(soup, userid, print_status=True):
    """
    Takes in the user id and finds all that user's books, and the number of them. (W
    :param print_status: If true, print the status of the crawl
    :param soup: html of the page
    :return: list of htmlbooks
    """

    # figure out how many pages we must parse
    num_books = extract_num_books(soup)
    pages_to_parse = math.ceil(num_books / 30)

    # if empty user
    if num_books < 1:
        return "Empty User"

    # Parse the pages
    books = []
    for i in range(0, pages_to_parse):
        print_("Extracting books for user {}. Page {}/{}".format(userid, i + 1, pages_to_parse))

        # Extract books from current page
        books += extract_books(soup, i, userid)

    return books
def extract_books(soup, page_number, userid):
    """
    Gets the of the books in each page.
    Can also just return the source code of the page instead.
    :param return_soup: If true, return page code instead of list of books code
    :param soup: soup of the page
    :return: resultset of books
    """

    # If we are on the first page, we already have the soup information. No need to extract again.
    if page_number == 1:
        books = soup.find_all("tr", class_="bookalike review")
        return books

    # Get books from additional pages
    while True:  # Never give up. Stay determined.
        try:
            url = 'https://www.goodreads.com/review/list/{}?page={}&shelf=read'.format(userid, page_number + 1)
            html = urllib.request.urlopen(url).read()
            soup = BeautifulSoup(html, 'lxml')

            books = soup.find_all("tr", class_="bookalike review")
            break

        except OSError:  # Happens if there are connection problems
            print_("Cannot connect. Retrying...")

    return books
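
Goodreads shows 30 reviews per shelf page by default, which is where the math.ceil(num_books / 30) above comes from. A quick illustration of the page math:

import math

# 0 books -> 0 pages, 1-30 books -> 1 page, 31 books -> 2 pages, and so on.
for num_books in (0, 1, 30, 31, 95):
    print(num_books, math.ceil(num_books / 30))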
Example #3
    def add_to_bookshelf(self, book):
        """
        Add a book to the bookshelf
        :param book: A goodreads book object
        :return: None
        """

        if isinstance(book, GoodreadsBook):
            self.bookshelf[str(book.gid)] = book
            self.bookshelf.sync()

            general.print_("Book saved!")
    def add_to_bookshelf(self, abook, gbook, skipped=False):
        """
        Add a book to the bookshelf
        :param book: A goodreads book object
        :return: None
        """

        if abook != "Skipped":
            self.amazon_bookshelf[str(gbook.gid)] = abook
            print_("Data Saved!\n")
        else:
            self.amazon_bookshelf[str(gbook.gid)] = "Skipped"
def gather_book(goodreads_id):
    """
    Gathers a book with the goodreads id.
    :param goodreads_id: Book id to gather.
    :return: None
    """

    try:

        if str(goodreads_id) not in genre_db:

            start = time.time()

            general.print_("Downloading information...")
            book = goodreads.book(goodreads_id)

            # Make sure to not query goodreads more than once a second.
            end = time.time()
            total_time = end - start
            if 1 > total_time > 0:
                time.sleep(1 - total_time)

            add_to_bookshelf(book)

            general.print_("Book saved!")

        else:
            general.print_("Book is already in the bookshelf!")

    except Exception:  # Skip books that fail to download or parse
        general.print_("Cannot process book. Skipping.")
Example #6
    def gather_from_file(self, path):
        """
        Fetch the books in a user file and place them in the bookshelf.
        :param path: path to user file
        :return: None
        """

        general.print_("Opening {}".format(path))

        users = general.read(path)

        for user in users:
            self.fetch_books(user)
def fetch_books(user):
    """
    Get the goodreads book objects for the user.
    :param user:
    :return:
    """

    books = []

    for book in user.userbooks:
        general.print_("\n{}".format(book.title))
        gather_book(book.goodreads_id)

    return books
Example #8
    def get_book_list(self, userlist_name, i):
        """
        Get the next chunk of the list for the userlist name.
        :param userlist_name: Name of userlist
        :param i: Index of the chunk to fetch
        :return: path to the chunk, or "Finished" when there are none left
        """

        filenames = os.listdir("../data/userlists")

        try:
            filename = filenames[i]
            path = "../data/userlists/{}".format(filename)
            return path
        except IndexError:
            general.print_("Finished with {}".format(userlist_name))
            return "Finished"
    def make_amazon_book(self, abook):
        """
        Makes an amazon book object from the downloaded information.
        Note that we do this because we cannot pickle the downloaded object directly.
        :param abook: Downloaded amazon book object.
        :return: amazon book object.
        """

        abook = AmazonBook(abook)

        print_("Genres: {}".format(abook.genres))

        self.amazon_bookshelf.sync()

        return abook
def extract_username(soup):
    """
    Extracts username of a user
    This function does not work very well, so do not use.
    :param soup: html soup of bookshelf
    :return: The number of books
    """

    text = soup.find('title').text

    try:
        username = re.findall(r'| (.*?)\)\'s bookshelf', text)
    except:
        username = "******"

    print_("Username: {}".format(username))

    return username
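
The original pattern began with an unescaped pipe, which regex treats as alternation with an empty branch, so it matched the empty string at every position. A quick demonstration (the input string is arbitrary):

import re

print(re.findall(r'| (.*?)\)\'s bookshelf', "any text at all")[:3])   # ['', '', '']
print(re.findall(r'\| (.*?)\)\'s bookshelf', "any text at all"))      # []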
def gather_books(userlist_name):
    """
    Fetches the genres of books and makes a gid:genre entry in genre_db.
    :param userlist_name: Name of userlist to gather from.
    :return: None
    """
    i = 0
    while True:
        path = get_list(userlist_name, i)
        if path == "Finished":  # No more chunks left to process
            break
        gather_from_file(path)
        genre_db["processed files"][path] = path
        i += 1
        general.print_(
            "-------------- \nWe have completed {}! \n --------------- ".
            format(path))

    print("Finished gathering from {}".format(userlist_name))
    genre_db.close()
Example #12
    def download_goodreads_book(self, goodreads_id):
        """
        Download a goodreads book object.
        :return: goodreads book object
        """

        start = time.time()

        general.print_("Currently on {}".format(self.current_path))
        general.print_("Downloading information...")
        book = self.api_client.book(goodreads_id)

        # Make sure to not query goodreads more than once a second.
        end = time.time()
        total_time = end - start
        if 1 > total_time > 0:
            time.sleep(1 - total_time)

        return book
def gather_from_file(path):
    """
    Fetch the books in a user file and place them in the bookshelf.
    :param path: path to user file
    :return: None
    """

    if path not in genre_db["processed files"]:

        general.print_("Opening {}".format(path))

        users = general.read(path)

        for user in users:
            fetch_books(user)

    else:
        general.print_("We have already processed {}".format(path))
Example #14
    def gather_knowledge(self, userlist_name):
        """
        Fetches books from Goodreads and places them in the bookshelf database.
        :param userlist_name: Name of userlist to gather from.
        :return: None
        """

        i = 0
        while True:
            path = self.get_book_list(userlist_name, i)
            if path == "Finished":  # No more chunks left to process
                break
            self.current_path = path
            self.gather_from_file(path)

            i += 1
            general.print_(
                "-------------- \nWe have completed {}! \n --------------- ".
                format(path))

        print("Finished gathering from {}".format(userlist_name))
    def gather(self, skipped=False):
        """
        Fetches books from Goodreads and places them in the bookshelf database.
        :param userlist_name: Name of userlist to gather from.
        :return: None
        """

        for key in self.source_bookshelf.keys():
            gbook = None
            try:
                gbook = self.source_bookshelf.get(str(key), default=None)
            except Exception:
                print("Could not read {} from the source bookshelf.".format(key))
            if isinstance(gbook, GoodreadsBook) and hasattr(gbook, "gid"):
                # Process the book if it is new, or retry it if it was
                # skipped on an earlier run.
                gid = str(gbook.gid)
                if gid not in self.amazon_bookshelf or self.amazon_bookshelf[gid] == "Skipped":

                    print_(gbook.title)

                    # Download amazon data
                    abook = self.download_amazon_book(gbook)

                    # turn into an amazon book object if we could download it
                    if abook != "Skipped":
                        abook = self.make_amazon_book(abook)

                    # Add to shelve
                    self.add_to_bookshelf(abook, gbook)
                else:
                    print_(gbook.title)
                    print_("Already in bookshelf!\n")
                    print(self.amazon_bookshelf[gid])
def extract_user_type(soup):
    """
    Extract the type of user.
    :param soup:
    :return: "normal", "private", "restricted" or "empty".
    """

    if not page_exists(soup):
        print_("User does not exist.")
        return "does not exist."
    elif is_restricted(soup):
        print_("User is restricted.")
        return "restricted"
    elif is_private(soup):
        print_("User is private.")
        return "private"
    elif extract_num_books(soup) < 1:
        print_("User is empty.")
        return "empty"
    else:
        print_("User is normal")
        return "normal"
def get_soup(user_id, print_status=True):
    """
    Gets the html of page 1 of a user's read-bookshelf.
    :todo: We should be able to append ?per_page=100 to the end of url, but does not work.
    :param user_id: Id of user
    :return: beautifulsoup soup object
    """

    print_("Downloading Page...")

    while True:  # Determination
        try:
            # Get info from the correct page
            url = 'https://www.goodreads.com/review/list/{}?shelf=read'.format(user_id)
            html = urllib.request.urlopen(url).read()
            soup = BeautifulSoup(html, 'lxml')

            return soup

        except OSError:  # urllib's connection errors subclass OSError
            print_("Could not connect. Retrying...")
Example #18
def make_user(userid):
    """
    Make a user object, which holds information about the user along with all
    the books the user has. The books are stored as userbook objects
    :param userid: The id of the user.
    :return: a user object
    """

    try:

        soup = parser.get_soup(userid)

        print_("Parsing page...")

        # Make user object and save information
        user = User()
        user.profile_type = parser.extract_user_type(soup)
        user.number_books = parser.extract_num_books(soup)
        user.id = userid

        # Abort if user is private, empty, or non-existent
        if user.profile_type != "normal":
            return user

        # Get and save books
        books = parser.list_books(soup, userid)
        for book in books:
            userbook = user.make_userbook(book)
            user.userbooks.append(userbook)

        return user

    except Exception:  # If we cannot process the user

        user = User()
        user.profile_type = "error"
        print_("Error: Cannot process user.")
        return user
Example #19
    def gather_book(self, book, open_bookshelf=False):
        """
        Gathers an amazon or goodreads book.
        :param book: Book object
        :return: None
        """

        try:

            if str(book.goodreads_id) not in self.bookshelf or self.bookshelf[
                    str(book.goodreads_id)] == "Skipped":
                if self.api_type == "goodreads":
                    downloaded_book = self.download_goodreads_book(
                        book.goodreads_id)
                else:
                    raise Exception("api_type must be goodreads.")

                self.add_to_bookshelf(downloaded_book)

            else:
                general.print_("Book is already in the bookshelf!")

        except Exception:
            general.print_("Cannot process book.")
    def download_amazon_book(self, gbook):
        """
        Downloads amazon book information
        :param book: book object
        :return: amazon book as a list
        """

        abook = None

        # Lookup via isbn
        # Lookup via isbn
        try:
            print_("Looking up by isbn...")
            abook = self.amazon.lookup(IdType="ISBN",
                                       ItemId=str(gbook.isbn),
                                       SearchIndex='Books')
        except Exception:

            # If that fails, lookup via isbn13
            try:
                print_("Failed isbn lookup. Trying isbn13...")
                abook = self.amazon.lookup(IdType="ISBN",
                                           ItemId=str(gbook.isbn13),
                                           SearchIndex='Books')
            except Exception:

                # Finally, try a search and take the first item
                try:
                    print_("Failed isbn13 lookup. Trying a search...")
                    abook = self.amazon.search_n(
                        1,
                        Power="author:{} and title:{}".format(
                            gbook.authors, gbook.title),
                        SearchIndex='Books')

                except Exception:
                    print_("Failed search. Skipping...\n")
                    return "Skipped"

        # Make sure abook is a list before picking the best match
        if not isinstance(abook, list):
            abook = [abook]

        abook = self.pick_best(abook)

        return abook
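
The three nested try/excepts above form a fallback chain; the same logic can be flattened into a loop over lookup strategies. A sketch of the idea (try_lookups and the strategy list are illustrative, not part of the original code):

def try_lookups(strategies):
    """Run each (name, zero-argument lookup) in turn; return the first success."""
    for name, lookup in strategies:
        try:
            print("Trying {} lookup...".format(name))
            return lookup()
        except Exception:
            continue
    return "Skipped"

# Hypothetical usage, mirroring download_amazon_book:
#   abook = try_lookups([
#       ("isbn", lambda: self.amazon.lookup(IdType="ISBN", ItemId=str(gbook.isbn), SearchIndex='Books')),
#       ("isbn13", lambda: self.amazon.lookup(IdType="ISBN", ItemId=str(gbook.isbn13), SearchIndex='Books')),
#   ])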
Example #21
    def build_dataframe(self):
        """
        Makes a dataframe for the user.
        :return: A pandas dataframe
        """

        print_("Building Dataframe...")

        try:
            # Build one record per userbook.
            records = [{"Title": u.title,
                        "Rating": u.rating,
                        "Goodreads ID": u.goodreads_id,
                        "Read Count": u.readcount,
                        "Link": u.link,
                        "Format": u.format,
                        "Comments": u.comments,
                        "Condition": u.condition,
                        "Date Added": u.date_added,
                        "Date Edition Published": u.date_pub_edition,
                        "Date Purchased": u.date_purchased,
                        "Owned": u.owned,
                        "Purchase Location": u.purchase_location,
                        "Review": u.review,
                        "Recommender": u.recommender,
                        "Notes": u.notes,
                        "Votes": u.votes} for u in self.userbooks]

            # Fix the column order explicitly so an empty bookshelf still
            # produces a dataframe with the expected columns.
            columns = ["Title", "Rating", "Goodreads ID", "Read Count", "Link",
                       "Format", "Comments", "Condition", "Date Added",
                       "Date Edition Published", "Date Purchased", "Owned",
                       "Purchase Location", "Review", "Recommender", "Notes",
                       "Votes"]
            self.dataframe = pandas.DataFrame(records, columns=columns)

            return self.dataframe

        except Exception:
            print_("Failed to build dataframe.")
            return None
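
A short usage sketch for context (the user id is made up; make_user is the constructor shown in Example #18):

# user = make_user("12345")
# df = user.build_dataframe()
# if df is not None:
#     print_(df.head(n=10))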
Example #22
def crawl_and_save(userlist_name,
                   userlistpath="../data/userlist_db/",
                   load_data=True):
    """
    Crawls forever, and saves the data to the extracted data folder.
    We can stop it and it will start where it left off.
    (Though it will skip the user we were just on because it will think
    we got their data. Our data set is big enough that this should not
    be a real issue.)

    Makes a new file for the data every time we start and every 500 users
    so that we don't spend too much time rewriting the same file.

    :param load_data: If true, load our previous data. Should disable for testing
    or if re-running a portion of the list.
    :param userlist_name: Name of userlist file to crawl
    :return: "Finished" if finished.
    """

    users = []
    directory = "extracted_data/" + userlist_name + "_data/"

    # Make directory if needed
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Make new file for this chunk of data.
    counter = read(userlistpath + userlist_name + "_counter")
    file = directory + userlist_name + "_data_counter_" + str(counter) + ".dat"
    overwrite(users, file)

    # Main loop
    while True:

        # Get next user. If none, then run finished function.
        userid = userlist.next_user(userlist_name, path=userlistpath)
        if userid == "finished":
            finished(users, userlist_name)

        print_("Begin extraction for user {}.".format(userid))

        # Make each user.
        user = make_user(userid)
        users.append(user)

        # Build the dataframe for the user
        user.build_dataframe()
        if user.profile_type == "normal":
            print_(user.dataframe.head(n=10))

        # Decide if want new file for data.
        counter = read(userlistpath + userlist_name + "_counter")
        if counter % 500 == 0:
            file = "extracted_data/" + userlist_name + "_data/" + userlist_name + "_data_counter_" + str(
                counter) + ".dat"
            users = []

        # Save the data, but make sure not overwriting with less.
        print_("Saving data...")
        if overwrite_safe(file, users):
            overwrite(users, file)
            print_("Saved. \n")
        else:
            raise Exception("Overwriting {}_data with less data.".format(
                userlist_name, userlist_name))
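
overwrite_safe is the guard that keeps a fresh run from clobbering a bigger file with a shorter users list. Its implementation is not shown here; the body below is a guess at what such a check might look like, assuming the data files are pickled lists:

import pickle

def overwrite_safe_sketch(file, users):
    """Return True if writing users to file would not lose records."""
    try:
        with open(file, "rb") as f:
            existing = pickle.load(f)  # assumes the saved data is a pickled list
    except (FileNotFoundError, EOFError):
        return True  # nothing on disk yet, so it is safe to write
    return len(users) >= len(existing)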