Ejemplo n.º 1
0
 def print_top_books(self, n):
     """
     Print up to ``n`` book titles for every k-means cluster.

     Each label in ``self.kmeans.labels_`` is paired with the entry at the
     same position of ``self.reviews.index`` (used as the book id). The
     pairs are inner-joined to the book table returned by
     ``load_data.get_books()`` on ``best_book_id == book_id``. For each
     cluster number in ``range(self.cluster_num)`` the method prints the
     cluster number, the first ``n`` matching titles and a separator line.

     Titles are taken in the row order of the merged frame; no ranking is
     applied inside this method.

     Side effects: sets ``self.k_counter`` (label -> number of paired rows)
     and ``self.df_k_books`` (the merged label/title frame).
     """
     # zip() stops at the shorter input, so only positions present in both
     # the label array and the review index end up in the frame.
     label_book_pairs = list(zip(list(self.kmeans.labels_),
                                 list(self.reviews.index)))
     k_books = pd.DataFrame(label_book_pairs, columns=['k_label', 'book_id'])
     self.k_counter = Counter(k_books['k_label'])

     book_titles = load_data.get_books()[['best_book_id', 'title']]
     self.df_k_books = book_titles.merge(k_books, how='inner',
                                         left_on='best_book_id',
                                         right_on='book_id')

     separator = "==" * 20
     for cluster_id in range(self.cluster_num):
         in_cluster = self.df_k_books['k_label'] == cluster_id
         cluster_titles = list(self.df_k_books.loc[in_cluster, 'title'])
         print(cluster_id)
         print(cluster_titles[:n])
         print(separator)
Ejemplo n.º 2
0
 def _get_books_data(self):
     """
     Load the book/author tables and build the race/gender summary.

     All tables come from the project's ``load_data`` helpers (the original
     note on this method says they are read from a Postgres database).

     Attributes set on ``self``:
         df_books, df_authors, df_authors_books, df_isbn_best_book_id:
             raw frames returned by the corresponding ``load_data`` calls.
         df_books_classified: result of ``merge_to_classify_books()`` with an
             added ``authorbook_id`` column ("<best_book_id> <author_id>").
         df_ab_classified: one row per (race, gender) with the number of
             distinct ``authorbook_id`` values, that count's share of the
             total (``percentage``) and a combined ``race_gender`` label.
     """
     self.df_books = load_data.get_books()
     self.df_authors = load_data.get_classified_authors()
     self.df_authors_books = load_data.get_books_to_authors()
     self.df_isbn_best_book_id = load_data.get_isbn_to_best_book_id()

     classified = load_data.merge_to_classify_books()
     # One key per (book, author) pair so co-authored books count once per author.
     book_part = classified['best_book_id'].map(str)
     author_part = classified['author_id'].map(str)
     classified['authorbook_id'] = book_part + ' ' + author_part
     self.df_books_classified = classified

     summary = (classified
                .groupby(['race', 'gender'])['authorbook_id']
                .nunique()
                .reset_index())
     unique_counts = summary['authorbook_id']
     summary['percentage'] = unique_counts / unique_counts.sum()
     summary['race_gender'] = summary['race'] + ' ' + summary['gender']
     self.df_ab_classified = summary
Ejemplo n.º 3
0
def load_books_data():
    """Load every book, author and rating table used by the analysis.

    All paths are relative to the current working directory (``../data``).

    Returns:
        A 6-tuple, in this order: ``(df_books, df_authors, df_authors_books,
        df_isbn_best_book_id, df_books_classified, df_k_ratings)``.
    """
    # GoodReads API export.
    books = load_data.get_books('../data/updated_books.csv')
    # GoodReads API export plus manual classification.
    authors = load_data.get_classified_authors(
        '../data/classified_authors.csv')
    # GoodReads API export.
    author_books = load_data.get_books_to_authors('../data/author_books.csv')
    # Built from the Amazon review file (ASIN) and the GoodReads API.
    isbn_to_best_id = load_data.get_isbn_to_best_book_id(
        '../data/asin_best_book_id.csv')

    books_classified = load_data.merge_to_classify_books(
        author_books, authors, books)

    # Kaggle's Goodbooks-10K ratings and book files.
    k_ratings = load_data.get_goodread_data(
        '../data/goodbooks-10k/ratings.csv',
        '../data/goodbooks-10k/books.csv')

    return (books, authors, author_books, isbn_to_best_id,
            books_classified, k_ratings)
Ejemplo n.º 4
0
    plt.xticks(rotation=0)
    x_labels = list(df_u_ab_classified['race'] + '\n' + df_u_ab_classified['gender'])
    ax.set_xticklabels(map(lambda x: x.title(), x_labels))
    ax.set_ylabel("Unique Author Books Combos", fontsize=12)
    rects = ax.patches
    labels = list(df_u_ab_classified['percentage'].map(lambda x: "{:.2%}".format(x)))
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2, height, label,
                ha='center', va='bottom')
    plt.show()


if __name__ == '__main__':
    # Script entry point: load the reference tables, fetch one GoodReads
    # user's read books and classify the authors of those books.

    df_books = load_data.get_books()
    df_authors = load_data.get_classified_authors()
    df_authors_books = load_data.get_books_to_authors()
    df_isbn_best_book_id = load_data.get_isbn_to_best_book_id()
    df_books_classified = load_data.merge_to_classify_books()

    # Raises KeyError when the GOODREADS_API_KEY environment variable is unset.
    api_key = os.environ['GOODREADS_API_KEY']

    # 2624891 is the hard-coded GoodReads user id used for this run.
    df_user_ratings, books_read_10k, books_read = get_user_read_books(
        2624891, api_key, df_isbn_best_book_id, df_books)
    print(len(df_user_ratings))
    # A bare `.head()` only renders in a notebook; print it so the preview
    # is actually shown when this file is run as a script.
    print(df_user_ratings.head())

    df_user_authorsbooks_classified = create_user_authorbook_classified(
                                                df_isbn_best_book_id,
                                                df_user_ratings,
                                                df_books_classified)
Ejemplo n.º 5
0
                    json_line['helpful'], json_line['overall'],
                    json_line['unixReviewTime']
                ])
                i += 1
                if i % 100000 == 0:
                    print("{} ratings processed".format(i))
            except KeyError:
                pass
    print("{} RATINGS COMPLETE".format(i))


if __name__ == '__main__':
    # Script entry point: build an ISBN -> best_book_id lookup restricted to
    # the books in our own book file, then run the Amazon review and rating
    # extraction with that lookup.

    # Raises KeyError when the GOODREADS_API_KEY environment variable is unset.
    # NOTE(review): api_key is not passed to anything in this block —
    # presumably read as a module global elsewhere in the file; verify.
    api_key = os.environ['GOODREADS_API_KEY']
    # Created from GoodReads API, should be the top 10K rated books
    book_file = 'updated_books.csv'
    # Created from Amazon Review file for ASIN and GoodReads API
    asin_best_file = 'asin_best_book_id_take_4.csv'

    df_books = load_data.get_books(book_file)
    # Set of the best_book_id values present in our book file; passed to the
    # ISBN loader below.
    our_best_book_ids = set(df_books['best_book_id'])
    df_isbn_best_book_id = load_data.get_isbn_to_best_book_id(
        asin_best_file, our_best_book_ids)
    # Plain dict keyed by the 'isbn' column with 'best_book_id' as the value.
    dict_isbn_best_id = df_isbn_best_book_id.set_index(
        ['isbn'])['best_book_id'].to_dict()

    # Both calls take the same gzipped review dump and the same lookup dict;
    # the second argument differs per call (presumably the output CSV path —
    # confirm against the function definitions).
    get_amazon_reviews('reviews_Books_5.json.gz', 'limited_amazon_reviews.csv',
                       dict_isbn_best_id)

    get_amazon_ratings('reviews_Books_5.json.gz', 'limited_amazon_ratings.csv',
                       dict_isbn_best_id)