def user_book_num(show_progress=True, save_results=True):
    """
    Count how many users own each distinct number of books.

    Scans every userlist file under ../data/userlists and, for each user
    with a "normal" profile, tallies the size of that user's book list.

    :param show_progress: If True, print a progress line per file.
    :param save_results: If True, write the result to
        results.smallscale/user_book_number_distribution.csv.
    :return: A dataframe with number of books and number of users with
        that number of books.
    """

    counts = {}
    # List the directory once; the original listed it twice.
    filenames = os.listdir("../data/userlists")
    total = len(filenames)

    for i, filename in enumerate(filenames, start=1):

        if show_progress:
            print_progress("Calculating book number distribution.", i, total, 1)

        userlist = read("../data/userlists/" + filename)
        for user in userlist:
            if user.profile_type == "normal":
                # Keys are stringified counts, matching the original CSV layout.
                book_num = str(len(user.userbooks))
                counts[book_num] = counts.get(book_num, 0) + 1

    # Make data into a dataframe and save it.
    genre_data = pandas.DataFrame(list(counts.items()),
                                  columns=['Book Number', 'Number of Users'])
    if save_results:
        genre_data.to_csv(
            "results.smallscale/user_book_number_distribution.csv")

    return genre_data
def book_read_number(show_progress=True, save_results=True):
    """
    Count, for each book id, how many users have that book on their list.

    Scans every userlist file under ../data/userlists and tallies each
    goodreads id across all "normal" profiles.

    :param show_progress: If True, print a progress line per file.
    :param save_results: If True, write the result to
        results.smallscale/book_reader_number_distribution.csv.
    :return: A dataframe with book id and number of times that book has been read.
    """

    # List the directory once; the original listed it twice.
    filenames = os.listdir("../data/userlists")
    total = len(filenames)
    counts = {}

    for i, filename in enumerate(filenames, start=1):

        if show_progress:
            print_progress("Calculating book reader number distribution.", i,
                           total, 1)

        userlist = read("../data/userlists/" + filename)
        for user in userlist:
            if user.profile_type == "normal":
                for book in user.userbooks:
                    # Stringify once per book (the original did it up to
                    # three times); keys match the original CSV layout.
                    key = str(book.goodreads_id)
                    counts[key] = counts.get(key, 0) + 1

    # Make data into a dataframe and save it.
    data = pandas.DataFrame(list(counts.items()),
                            columns=['Book ID', 'Number Readers'])
    if save_results:
        data.to_csv("results.smallscale/book_reader_number_distribution.csv")

    return data
Example #3
0
def load_file(name, path="userlist_db/"):
    """
    Load a saved userlist file by name.

    The directory prefix defaults to userlist_db/ and is prepended to
    the file name before reading.
    :param name: file name within the userlist database directory
    :param path: directory prefix, defaults to "userlist_db/"
    :return: the deserialized contents of the file
    """
    full_path = path + name
    return read(full_path)
def user_book_dataframe(show_progress=True, save_results=True):
    """
    Build one row per (user, book) pairing with the book's id and title.

    Bug fixes vs. the original: each per-book row is now appended to
    data_list (previously every row was discarded), the dataframe is
    built from all collected rows instead of only the last dict's items
    (with columns that did not match the data), and the shelve database
    is closed when done. The dead commented-out field list was removed.

    :param show_progress: If True, print a progress line per file.
    :param save_results: If True, write the result to
        results.smallscale/userbook_dataframe.csv.
    :return: A dataframe with columns "Goodreads ID" and "Title", one
        row per userbook across all "normal" profiles.
    """

    data_list = []

    # Init counter
    filenames = os.listdir("../data/userlists")
    total = len(filenames)

    goodreads_books = shelve.open("../data/book_db/goodreads_bookshelf.db",
                                  flag='r')
    try:
        for i, filename in enumerate(filenames, start=1):

            if show_progress:
                print_progress("Calculating book reader number distribution.",
                               i, total, 1)

            userlist = read("../data/userlists/" + filename)
            for user in userlist:
                if user.profile_type == "normal":
                    for book in user.userbooks:
                        data_list.append({
                            "Goodreads ID": book.goodreads_id,
                            # Title looked up from the shelved bookshelf by
                            # stringified goodreads id.
                            "Title": goodreads_books[str(book.goodreads_id)],
                        })
    finally:
        # Read-only shelf still holds an open file handle; always release it.
        goodreads_books.close()

    # Make data into a dataframe and save it.
    data = pandas.DataFrame(data_list)
    if save_results:
        data.to_csv("results.smallscale/userbook_dataframe.csv")

    return data
Example #5
0
    def gather_from_file(self, path):
        """
        Read a user file and add every user's books to the bookshelf.

        :param path: path to user file
        :return: None
        """

        general.print_("Opening {}".format(path))

        for user in general.read(path):
            self.fetch_books(user)
def gather_from_file(path):
    """
    Fetch the books in a user file and place them in the bookshelf.

    Files already recorded in genre_db["processed files"] are skipped.
    :param path: path to user file
    :return: None
    """

    # Guard clause: skip files we have already handled.
    if path in genre_db["processed files"]:
        general.print_("We have already processed {}".format(path))
        return

    general.print_("Opening {}".format(path))

    for user in general.read(path):
        fetch_books(user)
def user_profile_statistics():
    """
    Print counts of each user profile type across all userlist files.
    :return: None
    """

    # One tally per recognized profile type instead of parallel counters.
    type_counts = {"normal": 0, "private": 0, "empty": 0,
                   "error": 0, "no type": 0}
    total_users = 0
    users_with_no_id = 0

    for filename in os.listdir("../data/userlists"):

        print("Processing {}".format(filename))

        for user in read("../data/userlists/" + filename):
            total_users += 1
            if user.profile_type in type_counts:
                type_counts[user.profile_type] += 1
            if user.id == 0:
                users_with_no_id += 1

    print("Total Users: {}".format(total_users))
    print("Normal Users: {}".format(type_counts["normal"]))
    print("Private Users: {}".format(type_counts["private"]))
    print("Empty Users: {}".format(type_counts["empty"]))
    print("Error Users: {}".format(type_counts["error"]))
    print("No Type Users: {}".format(type_counts["no type"]))
    print("No ID Users: {}".format(users_with_no_id))
Example #8
0
def overwrite_safe(file, users):
    """
    Check whether it is safe to overwrite ``file`` with ``users``.

    Safe means the file does not exist yet, or the new list is strictly
    larger than the stored one, so no previously collected data is lost.
    (The original docstring stated the inverse of the actual return
    value and documented a nonexistent ``userlist_name`` parameter.)

    :param file: Path to the data file that would be overwritten.
    :param users: The new userlist to replace the data with.
    :return: True if overwriting loses no data, False otherwise.
    """

    # If file does not exist, is safe
    if not os.path.exists(file):
        return True

    # Only allow the overwrite when the new list is strictly larger.
    # NOTE(review): equal-sized data is also treated as unsafe, which
    # blocks re-saving an unchanged list — confirm this is intended.
    return len(users) > len(read(file))
"Scratch file to look at extracted data"

from crawler.general import read, overwrite

data = read("extracted_data/test_data")

print(data[100].id)
Example #10
0
"""
To look at extracted data
"""

import pandas

from crawler.general import read

data = read("test_userlist_data")


def print_full(x):
    """
    Print a dataframe without row truncation.

    Temporarily raises pandas' display.max_rows to the frame's length,
    prints it, then restores the previous setting.
    """
    row_cap = len(x)
    pandas.set_option('display.max_rows', row_cap)
    print(x)
    pandas.reset_option('display.max_rows')


def to_dataframe(data):
    """
    Puts the data in a Panda's dataframe with columns user id, book id, rating, book title
    :param data:
    :return:
    """
    # NOTE(review): this implementation is unfinished — only the column
    # accumulator lists are initialized; nothing is appended, no dataframe
    # is built, and the function implicitly returns None.
    userids = []
    titles = []
    ratings = []
    goodreadsids = []
Example #11
0
def crawl_and_save(userlist_name,
                   userlistpath="../data/userlist_db/",
                   load_data=True):
    """
    Crawls forever, and saves the data to the extracted data folder.
    We can stop it and it will start where it left off.
    (Though it will skip the user we were just on because it will think
    we got their data. Our data set is big enough that this should not
    be a real issue.)

    Makes a new file for the data every time we start or for 1000 users
    so that we don't spend too much time rewriting the same file.

    :param load_data: If true, load our previous data. Should disable for testing
    or if re-running a portion of the list.
    :param userlist_name: Name of userlist file to crawl
    :param userlistpath: Directory holding the userlist and its counter file.
    :return: "Finished" if finished.
    """

    # NOTE(review): load_data is never referenced in this body — confirm
    # whether it is dead or consumed by a helper not visible here.
    users = []
    directory = "extracted_data/" + userlist_name + "_data/"

    # Make directory if needed
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Make new file for this chunk of data.
    # The counter file tracks how many users have been processed so far;
    # it is embedded in the chunk file name so restarts start a new file.
    counter = read(userlistpath + userlist_name + "_counter")
    file = directory + userlist_name + "_data_counter_" + str(counter) + ".dat"
    overwrite(users, file)

    # Main loop
    while True:

        # Get next user. If none, then run finished function.
        userid = userlist.next_user(userlist_name, path=userlistpath)
        if userid == "finished":
            # NOTE(review): execution falls through after finished() — confirm
            # finished() terminates the run, otherwise the loop continues with
            # userid == "finished".
            finished(users, userlist_name)

        print_("Begin extraction for user {}.".format(userid))

        # Make each user.
        user = make_user(userid)
        users.append(user)

        # Build the dataframe for the user
        user.build_dataframe()
        if user.profile_type == "normal":
            print_(user.dataframe.head(n=10))

        # Decide if want new file for data.
        # A fresh chunk file is started every 500 users and the in-memory
        # list is reset so each file holds one chunk.
        # NOTE(review): the docstring says 1000 users — confirm which is intended.
        counter = read(userlistpath + userlist_name + "_counter")
        if counter % 500 == 0:
            file = "extracted_data/" + userlist_name + "_data/" + userlist_name + "_data_counter_" + str(
                counter) + ".dat"
            users = []

        # Save the data, but make sure not overwriting with less.
        print_("Saving data...")
        if overwrite_safe(file, users):
            overwrite(users, file)
            print_("Saved. \n")
        else:
            # NOTE(review): format string has one placeholder but two
            # arguments; the second userlist_name is ignored.
            raise Exception("Overwriting {}_data with less data.".format(
                userlist_name, userlist_name))