def run_cb_predictions(self):
    """Run every configured content-based algorithm for each combination of
    content vector and user vector, saving any predictions that do not
    already exist."""
    for cv in self.content_vector_types:
        content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
        content_vect = sl.load_from_hadoop(
            content_path, self.sc).repartition(self.num_partitions)
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)
            for cb_pred in self.cb_predictions:
                pred_save_loc = (self.directory + self.data_name + '_predictions_'
                                 + uv + '_' + cv + '_' + cb_pred + '.pkl')
                print pred_save_loc
                # Skip work already done. Note that os.path.isdir() inspects
                # the local filesystem, so this guard assumes pred_save_loc is
                # visible locally.
                if not os.path.isdir(pred_save_loc):
                    print 'Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv
                    if cb_pred == 'cb_vect':
                        predictions = content_based.predict(
                            train_ratings, content_vect,
                            num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cb_pred == 'cb_kmeans_100':
                        predictions = content_based_kmeans.predict(
                            train_ratings, content_vect, num_predictions=100,
                            num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cb_pred == 'cb_kmeans_1000':
                        predictions = content_based_kmeans.predict(
                            train_ratings, content_vect, num_predictions=1000,
                            num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    else:
                        # An unrecognized algorithm name halts the inner loop
                        # rather than being silently skipped.
                        break
    print 'All CB predictions saved'
def run_single_prediction(self, user_vector, content_vector, alg_type):
    """Run one algorithm and save its predictions. A truthy content_vector
    selects a content-based algorithm; otherwise alg_type is treated as a
    collaborative-filtering algorithm."""
    train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
    train_ratings = sl.load_from_hadoop(
        train_ratings_loc, self.sc).repartition(self.num_partitions)
    if content_vector:
        content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
        content_vect = sl.load_from_hadoop(
            content_path, self.sc).repartition(self.num_partitions)
        print 'Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector
        pred_save_loc = (self.directory + self.data_name + '_predictions_'
                         + user_vector + '_' + content_vector + '_' + alg_type + '.pkl')
        print pred_save_loc
        if alg_type == 'cb_vect':
            predictions = content_based.predict(
                train_ratings, content_vect, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cb_kmeans_100':
            predictions = content_based_kmeans.predict(
                train_ratings, content_vect, num_predictions=100,
                num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cb_kmeans_1000':
            predictions = content_based_kmeans.predict(
                train_ratings, content_vect, num_predictions=1000,
                num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
    else:
        print 'Running ' + alg_type + ' for user vector ' + user_vector
        pred_save_loc = (self.directory + self.data_name + '_predictions_'
                         + user_vector + '_' + alg_type + '.pkl')
        print pred_save_loc
        # Unrecognized alg_type values fall through without saving anything.
        if alg_type == 'cf_mllib':
            predictions = cf.calc_cf_mllib(
                train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_item':
            predictions = cf.calc_item_item_cf(
                train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_user':
            predictions = cf.calc_user_user_cf2(
                train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_bayes_map':
            predictions = cf.calc_naive_bayes_map(train_ratings, self.sc)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_bayes_mse':
            predictions = cf.calc_naive_bayes_mse(train_ratings, self.sc)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_bayes_mae':
            predictions = cf.calc_naive_bayes_mae(train_ratings, self.sc)
            sl.save_to_hadoop(predictions, pred_save_loc)
        elif alg_type == 'cf_random':
            predictions = random_recommender.predict(train_ratings, self.sc)
            sl.save_to_hadoop(predictions, pred_save_loc)
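# A minimal usage sketch of the two runners above. The enclosing class name
# ('RunPredictions') and the constructor signature are assumptions for
# illustration -- only the attributes the methods actually reference (sc,
# directory, data_name, num_partitions, user_vector_types,
# content_vector_types, cb_predictions) are confirmed by this file.
#
#   runner = RunPredictions(sc=sc,
#                           directory='hdfs://namenode/experiments/',  # hypothetical path
#                           data_name='movielens',
#                           user_vector_types=['ratings'],
#                           content_vector_types=['genres'],
#                           cb_predictions=['cb_vect', 'cb_kmeans_100'],
#                           num_partitions=60)
#
#   # Batch mode: every content vector x user vector x CB algorithm.
#   runner.run_cb_predictions()
#
#   # Single runs: pass content_vector=None to select a CF algorithm instead.
#   runner.run_single_prediction('ratings', None, 'cf_mllib')
#   runner.run_single_prediction('ratings', 'genres', 'cb_vect')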