def analyze_data(self):
    """Load test.json and log/print summary statistics for the review dataset."""
    # Timestamp the start of the analysis
    default_logger.log_time()
    self.log("Starting analysis...")
    self.log("Reading json file")
    data = pd.read_json("test.json")
    # Report the dimensions of the loaded frame
    self.log("{0} Reviews (Rows), {1} Columns".format(
        data.shape[0], data.shape[1]))
    # Count how many reviews each reviewer wrote
    reviews_per_reviewer = data.groupby(['reviewerID'])['reviewerID'].count()
    print("# Unique Reviewers: {0}".format(len(reviews_per_reviewer)))
    print("# Unique Items: {0}".format(len(data['asin'].unique())))
    print("Average number of reviews: {0}".format(reviews_per_reviewer.mean()))
    print("Average review rating: {0}".format(data['overall'].mean()))
    # Users are connected if they rate the same item
    # graph(data)  # WARNING THIS IS VERY SLOW
    self.log("Finished analysis")
def __init__(self, raw_folder_path, cache_folder_path, csv_file_name):
    """Record the folder layout and derive the pickle-cache file paths.

    Args:
        raw_folder_path: directory holding the raw csv input files.
        cache_folder_path: directory where cached pickles are written.
        csv_file_name: base name (no extension) of the target csv file.
    """
    default_logger.log_time()
    self.raw_folder_path = raw_folder_path
    self.cache_folder_path = cache_folder_path
    self.target_csv_file = csv_file_name
    self.log("Initialized preprocessor")
    # All cache files share the same "<cache folder><csv name>" prefix
    cache_prefix = self.cache_folder_path + self.target_csv_file
    self.user_reviews_pkl_file_path = cache_prefix + '.pkl'
    self.product_integer_encoding_file_path = cache_prefix + '_item_integer_encoding.pkl'
    self.user_integer_encoding_file_path = cache_prefix + '_user_integer_encoding.pkl'
def __init__(self, dataset, model_options=AllOptions.ModelOptions, data_options=AllOptions.DataOptions):
    """Initialise the recommender on top of the shared NN base setup."""
    default_logger.log_time()
    NN.__init__(self, dataset, model_options, data_options)
    # NOTE(review): counts are hard-coded instead of using
    # dataset.get_num_users() / dataset.get_num_items() (previously
    # commented out here) — confirm they still match the dataset.
    self.num_users = 24303
    self.num_items = 10672
    self.log("Initialized UserItemRecommender")
def train(self):
    """Fit the built model on the partial training split.

    Returns the Keras training history, or None (after logging an error)
    when build_model() has not been called yet.
    """
    # Guard: a model must exist before we can train it
    if self.model is None:
        self.log("ERROR: Please create a model using build_model() first.")
        return
    default_logger.log_time()
    self.log("Beginning training...")
    opts = self.model_options
    # Assemble the optional callbacks requested by the model options
    callbacks = []
    if opts.checkpoint_enabled:
        callbacks.append(keras.callbacks.ModelCheckpoint(
            self.data_options.checkpoints_folder_path + opts.checkpoint_string,
            monitor=opts.checkpoint_monitor,
            verbose=opts.checkpoint_verbose,
            save_weights_only=True,
            period=opts.checkpoint_period))
    if opts.early_stopping_enabled:
        callbacks.append(keras.callbacks.EarlyStopping(
            monitor=opts.early_stopping_monitor,
            min_delta=opts.early_stopping_min_delta,
            patience=opts.early_stopping_patience,
            verbose=opts.early_stopping_verbose))
    training_history = self.model.fit(
        self.partial_train_data,
        self.partial_train_labels,
        epochs=opts.num_epochs,
        batch_size=opts.training_batch_size,
        validation_data=(self.validation_data, self.validation_labels),
        verbose=1,
        callbacks=callbacks)
    self.log("Finished training at:")
    default_logger.log_time()
    self.log("You can override this training function in a subclass")
    return training_history
def __init__(self, dataset, model_options=AllOptions.ModelOptions, data_options=AllOptions.DataOptions):
    """Base network setup: store options, fetch data, carve out validation."""
    default_logger.log_time()
    # Make sure you are running an adequate version of Tensorflow
    self.log("Tensorflow version: {}".format(tf.__version__))
    self.review_dataset = dataset
    self.data_options = data_options
    self.model_options = model_options
    self.model = None
    # Pull the train/test split from the dataset wrapper
    train_split, test_split = self.review_dataset.get_training_and_testing()
    self.train_data, self.train_labels = train_split
    self.test_data, self.test_labels = test_split
    self.log("Retrieved training and testing data from the dataset")
    # Carve a validation set out of the training data
    self.create_validation_from_training(model_options.num_validation_samples)
    self.log("Initialized NN")
def create_csv(self, target_file_name):
    """Creates a csv based on a 'target_file_name' in json.

    Two stages: strip the raw json down to proper json, then convert
    the stripped json to csv. Timestamps are logged around each stage.
    """
    self.update_paths(target_file_name)
    default_logger.log_time()
    self.log("Reading json file")
    self.log("Stripping unnecessary data and converting to proper json...")
    # Stage 1: filter the raw json into the stripped intermediate file
    self.filter_data(self.target_json_path, self.stripped_json_path)
    default_logger.log_time()
    self.log("Finished stripping unnecessary data. Now converting json to csv...")
    # Stage 2: convert the stripped json into the target csv
    self.convert_json_to_csv(self.stripped_json_path, self.target_csv_path)
    self.log("Finished converting json file at:")
    default_logger.log_time()
# Build and train the model user_item_recommender.build_model() training_history = user_item_recommender.train() # Evaluate the trained model user_item_recommender.evaluate_model() # Save the model saved_file_name = user_item_recommender.save_model(training_history) # Graph the results from training user_item_recommender.generate_graphs( training_history, AllOptions.DataOptions.graphs_folder_path + saved_file_name) else: print("Loading model...") # Load model from a checkpoint # user_item_recommender.load_model_from_checkpoint('weights_020_0.73loss.hdf5') user_item_recommender.load_model( '/2018-12-20/13h-44m-53s_user_item_NN_model_[1.851val_loss]_[1.066val_mean_absolute_error]_[0.341loss]_[0.424mean_absolute_error].h5' ) user_item_recommender.evaluate_model() default_logger.log_time() (user_embeddings, item_embeddings) = user_item_recommender.get_embeddings() default_logger.log_time() user_item_predictions = user_item_recommender.recommend_items_for_user(546) default_logger.log_time()
def get_user_reviews(self, reconstruct=False):
    '''Get user reviews.

    Loads the cached, integer-encoded review dataframe when it exists;
    otherwise reads the raw csv, integer-encodes the 'asin' (product)
    and 'reviewerID' (user) columns, and pickles the dataframe plus both
    encoding tables for future calls.

    Args:
        reconstruct: when True, ignore any cache and rebuild from the csv.

    Returns:
        (user_reviews, user_encoding_df, product_encoding_df)
    '''
    user_reviews = None
    if not reconstruct and os.path.exists(self.user_reviews_pkl_file_path):
        with open(self.user_reviews_pkl_file_path, 'rb') as user_reviews_file:
            self.log("Loaded stored user reviews data at: {}".format(
                self.user_reviews_pkl_file_path))
            user_reviews = pickle.load(user_reviews_file)
            # BUG FIX: these two loads were swapped — user_encoding_df was
            # fetched via get_item_encoding() and product_encoding_df via
            # get_user_encoding(), the opposite of what the rebuild branch
            # below stores in each pickle.
            user_encoding_df = self.get_user_encoding(reconstruct=reconstruct)
            product_encoding_df = self.get_item_encoding(reconstruct=reconstruct)
    else:
        # Log the time of this function call (reset checkpoint too)
        default_logger.set_checkpoint()
        default_logger.log_time()
        user_reviews = pd.read_csv(
            self.raw_folder_path + self.target_csv_file + '.csv')

        # --- Products ---
        product_ids = array(user_reviews.asin)
        self.log("First 10 products IDs: {}".format(product_ids[:10]))
        # Integer encode products
        product_label_encoder = LabelEncoder()
        integer_encoded_products = product_label_encoder.fit_transform(
            product_ids)
        self.log("First 10 ENCODED products IDs: {}".format(
            integer_encoded_products[:10]))
        # Keep the original ids, replace the column with the encoding
        user_reviews['original_asin'] = user_reviews['asin']
        user_reviews['asin'] = integer_encoded_products.astype(int)
        # Save the encoding for products (row index == encoded id)
        self.log("Saving product integer encoding...")
        product_encoding_df = pd.DataFrame(
            data={'original_asin': product_label_encoder.classes_})
        with open(self.product_integer_encoding_file_path, 'wb') as product_encoding_file:
            pickle.dump(product_encoding_df, product_encoding_file,
                        pickle.HIGHEST_PROTOCOL)
        self.log("_" * 100)

        # --- Reviewers ---
        user_ids = array(user_reviews.reviewerID)
        self.log("First 10 reviewer IDs: {}".format(user_ids[:10]))
        # Integer encode reviewers
        reviewer_label_encoder = LabelEncoder()
        integer_encoded_reviewers = reviewer_label_encoder.fit_transform(
            user_ids)
        self.log("First 10 ENCODED user IDs: {}".format(
            integer_encoded_reviewers[:10]))
        # Keep the original ids, replace the column with the encoding
        user_reviews['original_reviewerID'] = user_reviews['reviewerID']
        user_reviews['reviewerID'] = integer_encoded_reviewers.astype(int)
        # Save the encoding for users (row index == encoded id)
        self.log("Saving user integer encoding...")
        user_encoding_df = pd.DataFrame(
            data={'original_reviewerID': reviewer_label_encoder.classes_})
        with open(self.user_integer_encoding_file_path, 'wb') as user_encoding_file:
            pickle.dump(user_encoding_df, user_encoding_file,
                        pickle.HIGHEST_PROTOCOL)
        # Save the user reviews dataframe to a file
        self.log("Saving preprocessed user reviews data...")
        with open(self.user_reviews_pkl_file_path, 'wb') as user_reviews_file:
            pickle.dump(user_reviews, user_reviews_file,
                        pickle.HIGHEST_PROTOCOL)
        self.log("Saved user reviews data at: {}".format(
            self.user_reviews_pkl_file_path))
        # Log the time this function call took
        default_logger.log_time()
    return user_reviews, user_encoding_df, product_encoding_df