def print_top_books(self, n):
    """
    Print up to ``n`` book titles for every k-means cluster.

    Each label in ``self.kmeans.labels_`` is paired with the entry at the
    same position of ``self.reviews.index`` (treated as a book id).  The
    pairs are inner-joined with the ``best_book_id`` / ``title`` columns
    of ``load_data.get_books()`` and, for each cluster number in
    ``range(self.cluster_num)``, the cluster number, the first ``n``
    titles (in merge order) and a divider line are printed.

    Side effects:
        self.k_counter  -- Counter of how many rows carry each label.
        self.df_k_books -- the merged title / label table.

    :param n: maximum number of titles printed per cluster.
    """
    cluster_labels = list(self.kmeans.labels_)
    reviewed_ids = list(self.reviews.index)
    assignments = pd.DataFrame(
        list(zip(cluster_labels, reviewed_ids)),
        columns=['k_label', 'book_id'],
    )
    self.k_counter = Counter(assignments['k_label'])

    titles = load_data.get_books()[['best_book_id', 'title']]
    # Inner join: only books present in both tables survive.
    self.df_k_books = pd.merge(
        titles,
        assignments,
        how='inner',
        left_on='best_book_id',
        right_on='book_id',
    )

    divider = "==" * 20
    for cluster in range(self.cluster_num):
        print(cluster)
        in_cluster = self.df_k_books['k_label'] == cluster
        cluster_titles = list(self.df_k_books[in_cluster]['title'])
        print(cluster_titles[:n])
        print(divider)
def _get_books_data(self):
    """
    Load the book/author tables and cache derived summaries on ``self``.

    All tables come from the ``load_data`` helpers (the original note on
    this method says they are read from a Postgres database -- confirm
    against ``load_data``).

    Attributes set:
        df_books, df_authors, df_authors_books, df_isbn_best_book_id
            -- raw tables, stored exactly as returned by ``load_data``.
        df_books_classified
            -- result of ``merge_to_classify_books()`` plus an
               ``authorbook_id`` column: "<best_book_id> <author_id>".
        df_ab_classified
            -- one row per (race, gender) with the number of distinct
               ``authorbook_id`` values, that count's share of the sum
               of all counts (``percentage``) and a combined
               ``race_gender`` label.
    """
    self.df_books = load_data.get_books()
    self.df_authors = load_data.get_classified_authors()
    self.df_authors_books = load_data.get_books_to_authors()
    self.df_isbn_best_book_id = load_data.get_isbn_to_best_book_id()

    classified = load_data.merge_to_classify_books()
    # Key identifying one (book, author) pairing as a single string.
    book_part = classified['best_book_id'].map(str)
    author_part = classified['author_id'].map(str)
    classified['authorbook_id'] = book_part + ' ' + author_part
    self.df_books_classified = classified

    by_demographic = classified.groupby(['race', 'gender'])
    summary = by_demographic['authorbook_id'].nunique().reset_index()
    total_pairs = summary['authorbook_id'].sum()
    summary['percentage'] = summary['authorbook_id'] / total_pairs
    summary['race_gender'] = summary['race'] + ' ' + summary['gender']
    self.df_ab_classified = summary
def load_books_data(data_dir='../data'):
    """
    Load every book-related table from the CSV files under ``data_dir``.

    The directory used to be hard-coded as ``'../data'``, which only
    resolves when the process runs from one specific working directory.
    It is now a parameter whose default reproduces the old paths exactly,
    so existing zero-argument callers are unaffected.

    :param data_dir: directory holding the CSV inputs, including the
        ``goodbooks-10k`` sub-directory.
    :return: tuple ``(df_books, df_authors, df_authors_books,
        df_isbn_best_book_id, df_books_classified, df_k_ratings)``.
    """
    def _path(relative_name):
        # A plain '/' join keeps the default paths byte-identical to the
        # previously hard-coded strings.
        return '{}/{}'.format(data_dir, relative_name)

    # Created from GoodReads API
    book_file = _path('updated_books.csv')
    # Created from GoodReads API, and manual classification
    author_file = _path('classified_authors.csv')
    # Created from GoodReads API
    author_book_file = _path('author_books.csv')
    # Created from Amazon Review file for ASIN and GoodReads API
    asin_best_file = _path('asin_best_book_id.csv')
    # From Kaggle's Goodbooks-10K
    k_rating_file = _path('goodbooks-10k/ratings.csv')
    k_book_file = _path('goodbooks-10k/books.csv')

    df_books = load_data.get_books(book_file)
    df_authors = load_data.get_classified_authors(author_file)
    df_authors_books = load_data.get_books_to_authors(author_book_file)
    df_isbn_best_book_id = load_data.get_isbn_to_best_book_id(asin_best_file)
    # Combines the three tables loaded above into the classified view.
    df_books_classified = load_data.merge_to_classify_books(
        df_authors_books, df_authors, df_books)
    df_k_ratings = load_data.get_goodread_data(k_rating_file, k_book_file)

    return (
        df_books,
        df_authors,
        df_authors_books,
        df_isbn_best_book_id,
        df_books_classified,
        df_k_ratings,
    )
plt.xticks(rotation=0) x_labels = list(df_u_ab_classified['race'] + '\n' + df_u_ab_classified['gender']) ax.set_xticklabels(map(lambda x: x.title(), x_labels)) ax.set_ylabel("Unique Author Books Combos", fontsize=12) rects = ax.patches labels = list(df_u_ab_classified['percentage'].map(lambda x: "{:.2%}".format(x))) for rect, label in zip(rects, labels): height = rect.get_height() ax.text(rect.get_x() + rect.get_width()/2, height, label, ha='center', va='bottom') plt.show() if __name__ == '__main__': df_books = load_data.get_books() df_authors = load_data.get_classified_authors() df_authors_books = load_data.get_books_to_authors() df_isbn_best_book_id = load_data.get_isbn_to_best_book_id() df_books_classified = load_data.merge_to_classify_books() api_key = os.environ['GOODREADS_API_KEY'] df_user_ratings, books_read_10k, books_read = get_user_read_books(2624891, api_key, df_isbn_best_book_id, df_books) print(len(df_user_ratings)) df_user_ratings.head() df_user_authorsbooks_classified = create_user_authorbook_classified( df_isbn_best_book_id, df_user_ratings, df_books_classified)
json_line['helpful'], json_line['overall'], json_line['unixReviewTime'] ]) i += 1 if i % 100000 == 0: print("{} ratings processed".format(i)) except KeyError: pass print("{} RATINGS COMPLETE".format(i)) if __name__ == '__main__': api_key = os.environ['GOODREADS_API_KEY'] # Created from GoodReads API, should be the top 10K rated books book_file = 'updated_books.csv' # Created from Amazon Review file for ASIN and GoodReads API asin_best_file = 'asin_best_book_id_take_4.csv' df_books = load_data.get_books(book_file) our_best_book_ids = set(df_books['best_book_id']) df_isbn_best_book_id = load_data.get_isbn_to_best_book_id( asin_best_file, our_best_book_ids) dict_isbn_best_id = df_isbn_best_book_id.set_index( ['isbn'])['best_book_id'].to_dict() get_amazon_reviews('reviews_Books_5.json.gz', 'limited_amazon_reviews.csv', dict_isbn_best_id) get_amazon_ratings('reviews_Books_5.json.gz', 'limited_amazon_ratings.csv', dict_isbn_best_id)