def run_cb_predictions(self):
    """Compute and save content-based predictions for every configured combination.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Content vectors and training ratings are read
    with ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    A combination is skipped when ``os.path.isdir`` reports that its output
    location already exists; otherwise the predictions are computed and
    written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cb_vect``, ``cb_kmeans_100`` and
    ``cb_kmeans_1000``.
    """
    # num_predictions handed to the k-means predictor, keyed by algorithm name.
    kmeans_sizes = {'cb_kmeans_100': 100, 'cb_kmeans_1000': 1000}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + cv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + '_uv_train_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                print(pred_save_loc)

                # NOTE(review): os.path.isdir inspects the local filesystem;
                # confirm that matches where sl.save_to_hadoop writes.
                if os.path.isdir(pred_save_loc):
                    continue

                print('Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv)
                if cb_pred == 'cb_vect':
                    predictions = content_based.predict(
                        train_ratings, content_vect,
                        num_partitions=self.num_partitions)
                elif cb_pred in kmeans_sizes:
                    predictions = content_based_kmeans.predict(
                        train_ratings, content_vect,
                        num_predictions=kmeans_sizes[cb_pred],
                        num_partitions=self.num_partitions)
                else:
                    # Unknown algorithm name: stop processing the remaining
                    # algorithms for this user/content vector pair.
                    # NOTE(review): confirm break (not continue) is intended.
                    break
                sl.save_to_hadoop(predictions, pred_save_loc)

    print('All CB predictions saved')
def run_cb_predictions(self):
    """Compute and save content-based predictions for every configured combination.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Content vectors and training ratings are read
    with ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    A combination is skipped when ``os.path.isdir`` reports that its output
    location already exists; otherwise the predictions are computed and
    written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cb_vect``, ``cb_kmeans_100`` and
    ``cb_kmeans_1000``.
    """
    # num_predictions handed to the k-means predictor, keyed by algorithm name.
    kmeans_sizes = {"cb_kmeans_100": 100, "cb_kmeans_1000": 1000}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        content_vect = sl.load_from_hadoop(
            prefix + "_cv_" + cv + ".pkl", self.sc
        ).repartition(self.num_partitions)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + "_uv_train_" + uv + ".pkl", self.sc
            ).repartition(self.num_partitions)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                print(pred_save_loc)

                # NOTE(review): os.path.isdir inspects the local filesystem;
                # confirm that matches where sl.save_to_hadoop writes.
                if os.path.isdir(pred_save_loc):
                    continue

                print("Running " + cb_pred + " for user vector " + uv + " and content vector " + cv)
                if cb_pred == "cb_vect":
                    predictions = content_based.predict(
                        train_ratings, content_vect,
                        num_partitions=self.num_partitions,
                    )
                elif cb_pred in kmeans_sizes:
                    predictions = content_based_kmeans.predict(
                        train_ratings, content_vect,
                        num_predictions=kmeans_sizes[cb_pred],
                        num_partitions=self.num_partitions,
                    )
                else:
                    # Unknown algorithm name: stop processing the remaining
                    # algorithms for this user/content vector pair.
                    # NOTE(review): confirm break (not continue) is intended.
                    break
                sl.save_to_hadoop(predictions, pred_save_loc)

    print("All CB predictions saved")
def run_cf_results(self):
    """Evaluate every saved collaborative-filtering prediction set and store the metrics.

    For each user vector in ``self.user_vector_types`` the train/test ratings
    and the first entry of ``self.content_vector_types`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    For each algorithm in ``self.cf_predictions`` and each value in
    ``self.results_runs``, ``performance_metrics.get_perform_metrics`` is
    called, its result is merged with the dataset statistics and some
    identifying fields, printed, and written as ``str(results)`` to a file
    under ``self.results_directory``.
    """
    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name

        train_ratings = sl.load_from_hadoop(
            prefix + "_uv_train_" + uv + ".pkl", self.sc
        ).repartition(self.num_partitions)
        test_ratings = sl.load_from_hadoop(
            prefix + "_uv_test_" + uv + ".pkl", self.sc
        ).repartition(self.num_partitions)

        # get_perform_metrics takes a content vector; the first configured
        # one is used for all CF results.
        first_cv = self.content_vector_types[0]
        content_vect = sl.load_from_hadoop(
            prefix + "_cv_" + first_cv + ".pkl", self.sc
        ).repartition(self.num_partitions)

        # Dataset statistics do not change from run to run, so compute once.
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + "_predictions_" + uv + "_" + cf_pred + ".pkl"
            print("Getting results for: " + pred_save_loc)
            preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings,
                    train_ratings,
                    preds,
                    content_vect,
                    self.sqlCtx,
                    num_predictions=run,
                    num_partitions=self.num_partitions,
                )
                results.update(stats)

                # Identifying fields, in case result files get jumbled.
                results["N"] = run
                results["dataset"] = self.data_name
                results["CF_CB"] = "CF"
                results["alg_type"] = cf_pred
                results["user_vector"] = uv
                results["content_vector"] = first_cv
                print(results)

                results_path = (
                    self.results_directory + self.data_name + "_results_"
                    + uv + "_" + cf_pred + "_" + str(run) + ".pkl"
                )
                # Context manager closes the file even if the write fails.
                with open(results_path, "w") as results_file:
                    results_file.write(str(results))

    print("All CF predictions results aquired")
def run_single_prediction(self, user_vector, content_vector, alg_type):
    """Compute and save predictions for one user vector / algorithm combination.

    The training ratings for ``user_vector`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.

    When ``content_vector`` is truthy the content vector is loaded as well and
    ``alg_type`` is matched against the content-based names (``cb_vect``,
    ``cb_kmeans_100``, ``cb_kmeans_1000``).  Otherwise ``alg_type`` is matched
    against the collaborative-filtering names (``cf_mllib``, ``cf_item``,
    ``cf_user``, ``cf_bayes_map``, ``cf_bayes_mse``, ``cf_bayes_mae``,
    ``cf_random``).  An ``alg_type`` that matches nothing in the selected
    branch saves nothing.  Results are written with ``sl.save_to_hadoop``.
    """
    prefix = self.directory + self.data_name
    train_ratings = sl.load_from_hadoop(
        prefix + '_uv_train_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)

    if content_vector:
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + content_vector + '.pkl', self.sc
        ).repartition(self.num_partitions)

        print('Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector)
        pred_save_loc = prefix + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
        print(pred_save_loc)

        # num_predictions handed to the k-means predictor, keyed by algorithm name.
        kmeans_sizes = {'cb_kmeans_100': 100, 'cb_kmeans_1000': 1000}
        if alg_type == 'cb_vect':
            predictions = content_based.predict(
                train_ratings, content_vect,
                num_partitions=self.num_partitions)
        elif alg_type in kmeans_sizes:
            predictions = content_based_kmeans.predict(
                train_ratings, content_vect,
                num_predictions=kmeans_sizes[alg_type],
                num_partitions=self.num_partitions)
        else:
            return
        sl.save_to_hadoop(predictions, pred_save_loc)
        return

    print('Running ' + alg_type + ' for user vector ' + user_vector)
    pred_save_loc = prefix + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
    print(pred_save_loc)

    # cf functions called with num_partitions=..., by algorithm name.
    partitioned = {
        'cf_mllib': 'calc_cf_mllib',
        'cf_item': 'calc_item_item_cf',
        'cf_user': 'calc_user_user_cf2',
    }
    # cf functions called with self.sc as second positional argument.
    with_context = {
        'cf_bayes_map': 'calc_naive_bayes_map',
        'cf_bayes_mse': 'calc_naive_bayes_mse',
        'cf_bayes_mae': 'calc_naive_bayes_mae',
    }

    if alg_type in partitioned:
        compute = getattr(cf, partitioned[alg_type])
        predictions = compute(train_ratings, num_partitions=self.num_partitions)
    elif alg_type in with_context:
        compute = getattr(cf, with_context[alg_type])
        predictions = compute(train_ratings, self.sc)
    elif alg_type == 'cf_random':
        predictions = random_recommender.predict(train_ratings, self.sc)
    else:
        return
    sl.save_to_hadoop(predictions, pred_save_loc)
def run_cf_results(self):
    """Evaluate every saved collaborative-filtering prediction set and store the metrics.

    For each user vector in ``self.user_vector_types`` the train/test ratings
    and the first entry of ``self.content_vector_types`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    For each algorithm in ``self.cf_predictions`` and each value in
    ``self.results_runs``, ``performance_metrics.get_perform_metrics`` is
    called, its result is merged with the dataset statistics and some
    identifying fields, printed, and written as ``str(results)`` to a file
    under ``self.results_directory``.
    """
    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name

        train_ratings = sl.load_from_hadoop(
            prefix + '_uv_train_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)
        test_ratings = sl.load_from_hadoop(
            prefix + '_uv_test_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        # get_perform_metrics takes a content vector; the first configured
        # one is used for all CF results.
        first_cv = self.content_vector_types[0]
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + first_cv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        # Dataset statistics do not change from run to run, so compute once.
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'
            print('Getting results for: ' + pred_save_loc)
            preds = sl.load_from_hadoop(
                pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings, train_ratings, preds,
                    content_vect, self.sqlCtx,
                    num_predictions=run,
                    num_partitions=self.num_partitions)
                results.update(stats)

                # Identifying fields, in case result files get jumbled.
                results['N'] = run
                results['dataset'] = self.data_name
                results['CF_CB'] = 'CF'
                results['alg_type'] = cf_pred
                results['user_vector'] = uv
                results['content_vector'] = first_cv
                print(results)

                results_path = (self.results_directory + self.data_name + '_results_'
                                + uv + '_' + cf_pred + '_' + str(run) + '.pkl')
                # Context manager closes the file even if the write fails.
                with open(results_path, 'w') as results_file:
                    results_file.write(str(results))

    print('All CF predictions results aquired')
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Evaluate one saved prediction set and write its metrics to a results file.

    Args:
        user_vector: user vector name used in the train/test/prediction paths.
        content_vector: content vector name; always loaded and passed to
            ``performance_metrics.get_perform_metrics``, and part of the
            prediction/results paths when ``alg_type == 'cb'``.
        alg_type: ``'cb'`` selects the content-based path layout; any other
            value selects the layout without the content vector component.
        algorithm: algorithm name embedded in the prediction/results paths.
        num_preds: forwarded as ``num_predictions`` and stored as ``N``.

    The metrics are merged with the dataset statistics and identifying
    fields, printed, and written as ``str(results)`` under
    ``self.results_directory``.
    """
    prefix = self.directory + self.data_name

    train_ratings = sl.load_from_hadoop(
        prefix + '_uv_train_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)
    test_ratings = sl.load_from_hadoop(
        prefix + '_uv_test_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)
    content_vect = sl.load_from_hadoop(
        prefix + '_cv_' + content_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    is_content_based = alg_type == 'cb'
    if is_content_based:
        # Content-based files carry the content vector name in their path.
        tag = user_vector + '_' + content_vector + '_' + algorithm
    else:
        tag = user_vector + '_' + algorithm
    pred_save_loc = prefix + '_predictions_' + tag + '.pkl'
    results_path = (self.results_directory + self.data_name + '_results_'
                    + tag + '_' + str(num_preds) + '.csv')

    print('Getting results for: ' + pred_save_loc)
    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings, train_ratings, preds,
        content_vect, self.sqlCtx,
        num_predictions=num_preds,
        num_partitions=self.num_partitions)
    results.update(stats)

    # Identifying fields, in case result files get jumbled.
    results['N'] = num_preds
    results['dataset'] = self.data_name
    # Previously always 'CB', which mislabelled results produced through the
    # non-'cb' branch; the label now follows alg_type.
    results['CF_CB'] = 'CB' if is_content_based else 'CF'
    results['alg_type'] = algorithm
    results['user_vector'] = user_vector
    results['content_vector'] = content_vector
    print(results)

    print(results_path)
    # Context manager closes the file even if the write fails.
    with open(results_path, 'w') as results_file:
        results_file.write(str(results))
def run_cf_predictions(self):
    """Compute and save collaborative-filtering predictions for every configured combination.

    Loops over ``self.user_vector_types`` x ``self.cf_predictions``.  Training
    ratings are read with ``sl.load_from_hadoop`` and repartitioned to
    ``self.num_partitions``.  A combination is skipped when ``os.path.isdir``
    reports that its output location already exists; otherwise the
    predictions are computed and written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cf_mllib``, ``cf_item`` and ``cf_user``.
    """
    # Name of the function in ``cf`` to call for each algorithm; resolved
    # lazily so only the requested function is looked up.
    cf_function_names = {
        'cf_mllib': 'calc_cf_mllib',
        'cf_item': 'calc_item_item_cf',
        'cf_user': 'calc_user_user_cf2',
    }

    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name
        train_ratings = sl.load_from_hadoop(
            prefix + '_uv_train_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'

            # NOTE(review): os.path.isdir inspects the local filesystem;
            # confirm that matches where sl.save_to_hadoop writes.
            if os.path.isdir(pred_save_loc):
                continue

            print('Running ' + cf_pred + ' for user vector ' + uv)
            print(pred_save_loc)

            if cf_pred not in cf_function_names:
                # Unknown algorithm name: stop processing the remaining
                # algorithms for this user vector.
                # NOTE(review): confirm break (not continue) is intended.
                break

            compute = getattr(cf, cf_function_names[cf_pred])
            predictions = compute(train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)

    print('All CF predictions saved')
def run_cf_results(self):
    """Evaluate every saved collaborative-filtering prediction set and store the metrics.

    For each user vector in ``self.user_vector_types`` the train/test ratings
    and the first entry of ``self.content_vector_types`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    For each algorithm in ``self.cf_predictions`` and each value in
    ``self.results_runs``, ``performance_metrics.get_perform_metrics`` is
    called, its result is merged with the dataset statistics and some
    identifying fields, printed, and written as ``str(results)`` to a file
    under ``self.results_directory``.
    """
    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name

        train_ratings = sl.load_from_hadoop(
            prefix + '_uv_train_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)
        test_ratings = sl.load_from_hadoop(
            prefix + '_uv_test_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        # get_perform_metrics takes a content vector; the first configured
        # one is used for all CF results.
        first_cv = self.content_vector_types[0]
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + first_cv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        # Dataset statistics do not change from run to run, so compute once.
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'
            print('Getting results for: ' + pred_save_loc)
            preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings, train_ratings, preds,
                    content_vect, self.sqlCtx,
                    num_predictions=run,
                    num_partitions=self.num_partitions)
                results.update(stats)

                # Identifying fields, in case result files get jumbled.
                results['N'] = run
                results['dataset'] = self.data_name
                results['CF_CB'] = 'CF'
                results['alg_type'] = cf_pred
                results['user_vector'] = uv
                results['content_vector'] = first_cv
                print(results)

                results_path = (self.results_directory + self.data_name + '_results_'
                                + uv + '_' + cf_pred + '_' + str(run) + '.pkl')
                # Context manager closes the file even if the write fails.
                with open(results_path, 'w') as results_file:
                    results_file.write(str(results))

    print('All CF predictions results aquired')
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Evaluate one saved prediction set and write its metrics to a results file.

    Args:
        user_vector: user vector name used in the train/test/prediction paths.
        content_vector: content vector name; always loaded and passed to
            ``performance_metrics.get_perform_metrics``, and part of the
            prediction/results paths when ``alg_type == 'cb'``.
        alg_type: ``'cb'`` selects the content-based path layout; any other
            value selects the layout without the content vector component.
        algorithm: algorithm name embedded in the prediction/results paths.
        num_preds: forwarded as ``num_predictions`` and stored as ``N``.

    The metrics are merged with the dataset statistics and identifying
    fields, printed, and written as ``str(results)`` under
    ``self.results_directory``.
    """
    prefix = self.directory + self.data_name

    train_ratings = sl.load_from_hadoop(
        prefix + '_uv_train_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)
    test_ratings = sl.load_from_hadoop(
        prefix + '_uv_test_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)
    content_vect = sl.load_from_hadoop(
        prefix + '_cv_' + content_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    is_content_based = alg_type == 'cb'
    if is_content_based:
        # Content-based files carry the content vector name in their path.
        tag = user_vector + '_' + content_vector + '_' + algorithm
    else:
        tag = user_vector + '_' + algorithm
    pred_save_loc = prefix + '_predictions_' + tag + '.pkl'
    results_path = (self.results_directory + self.data_name + '_results_'
                    + tag + '_' + str(num_preds) + '.csv')

    print('Getting results for: ' + pred_save_loc)
    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings, train_ratings, preds,
        content_vect, self.sqlCtx,
        num_predictions=num_preds,
        num_partitions=self.num_partitions)
    results.update(stats)

    # Identifying fields, in case result files get jumbled.
    results['N'] = num_preds
    results['dataset'] = self.data_name
    # Previously always 'CB', which mislabelled results produced through the
    # non-'cb' branch; the label now follows alg_type.
    results['CF_CB'] = 'CB' if is_content_based else 'CF'
    results['alg_type'] = algorithm
    results['user_vector'] = user_vector
    results['content_vector'] = content_vector
    print(results)

    print(results_path)
    # Context manager closes the file even if the write fails.
    with open(results_path, 'w') as results_file:
        results_file.write(str(results))
def run_cb_predictions(self):
    """Compute and save content-based predictions for every configured combination.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Content vectors and training ratings are read
    with ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.
    A combination is skipped when ``os.path.isdir`` reports that its output
    location already exists; otherwise the predictions are computed and
    written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cb_vect``, ``cb_kmeans_100`` and
    ``cb_kmeans_1000``.
    """
    # num_predictions handed to the k-means predictor, keyed by algorithm name.
    kmeans_sizes = {'cb_kmeans_100': 100, 'cb_kmeans_1000': 1000}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + cv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + '_uv_train_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                print(pred_save_loc)

                # NOTE(review): os.path.isdir inspects the local filesystem;
                # confirm that matches where sl.save_to_hadoop writes.
                if os.path.isdir(pred_save_loc):
                    continue

                print('Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv)
                if cb_pred == 'cb_vect':
                    predictions = content_based.predict(
                        train_ratings, content_vect,
                        num_partitions=self.num_partitions)
                elif cb_pred in kmeans_sizes:
                    predictions = content_based_kmeans.predict(
                        train_ratings, content_vect,
                        num_predictions=kmeans_sizes[cb_pred],
                        num_partitions=self.num_partitions)
                else:
                    # Unknown algorithm name: stop processing the remaining
                    # algorithms for this user/content vector pair.
                    # NOTE(review): confirm break (not continue) is intended.
                    break
                sl.save_to_hadoop(predictions, pred_save_loc)

    print('All CB predictions saved')
def run_cf_predictions(self):
    """Compute and save collaborative-filtering predictions for every configured combination.

    Loops over ``self.user_vector_types`` x ``self.cf_predictions``.  Training
    ratings are read with ``sl.load_from_hadoop`` and repartitioned to
    ``self.num_partitions``.  A combination is skipped when ``os.path.isdir``
    reports that its output location already exists; otherwise the
    predictions are computed and written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cf_mllib``, ``cf_item`` and ``cf_user``.
    """
    # Name of the function in ``cf`` to call for each algorithm; resolved
    # lazily so only the requested function is looked up.
    cf_function_names = {
        'cf_mllib': 'calc_cf_mllib',
        'cf_item': 'calc_item_item_cf',
        'cf_user': 'calc_user_user_cf2',
    }

    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name
        train_ratings = sl.load_from_hadoop(
            prefix + '_uv_train_' + uv + '.pkl', self.sc
        ).repartition(self.num_partitions)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'

            # NOTE(review): os.path.isdir inspects the local filesystem;
            # confirm that matches where sl.save_to_hadoop writes.
            if os.path.isdir(pred_save_loc):
                continue

            print('Running ' + cf_pred + ' for user vector ' + uv)
            print(pred_save_loc)

            if cf_pred not in cf_function_names:
                # Unknown algorithm name: stop processing the remaining
                # algorithms for this user vector.
                # NOTE(review): confirm break (not continue) is intended.
                break

            compute = getattr(cf, cf_function_names[cf_pred])
            predictions = compute(train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)

    print('All CF predictions saved')
def run_cf_predictions(self):
    """Compute and save collaborative-filtering predictions for every configured combination.

    Loops over ``self.user_vector_types`` x ``self.cf_predictions``.  Training
    ratings are read with ``sl.load_from_hadoop`` and repartitioned to
    ``self.num_partitions``.  A combination is skipped when ``os.path.isdir``
    reports that its output location already exists; otherwise the
    predictions are computed and written with ``sl.save_to_hadoop``.

    Recognised algorithm names are ``cf_mllib``, ``cf_item`` and ``cf_user``.
    """
    # Name of the function in ``cf`` to call for each algorithm; resolved
    # lazily so only the requested function is looked up.
    cf_function_names = {
        "cf_mllib": "calc_cf_mllib",
        "cf_item": "calc_item_item_cf",
        "cf_user": "calc_user_user_cf2",
    }

    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name
        train_ratings = sl.load_from_hadoop(
            prefix + "_uv_train_" + uv + ".pkl", self.sc
        ).repartition(self.num_partitions)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + "_predictions_" + uv + "_" + cf_pred + ".pkl"

            # NOTE(review): os.path.isdir inspects the local filesystem;
            # confirm that matches where sl.save_to_hadoop writes.
            if os.path.isdir(pred_save_loc):
                continue

            print("Running " + cf_pred + " for user vector " + uv)
            print(pred_save_loc)

            if cf_pred not in cf_function_names:
                # Unknown algorithm name: stop processing the remaining
                # algorithms for this user vector.
                # NOTE(review): confirm break (not continue) is intended.
                break

            compute = getattr(cf, cf_function_names[cf_pred])
            predictions = compute(train_ratings, num_partitions=self.num_partitions)
            sl.save_to_hadoop(predictions, pred_save_loc)

    print("All CF predictions saved")
def run_single_prediction(self, user_vector, content_vector, alg_type):
    """Compute and save predictions for one user vector / algorithm combination.

    The training ratings for ``user_vector`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.

    When ``content_vector`` is truthy the content vector is loaded as well and
    ``alg_type`` is matched against the content-based names (``cb_vect``,
    ``cb_kmeans_100``, ``cb_kmeans_1000``).  Otherwise ``alg_type`` is matched
    against the collaborative-filtering names (``cf_mllib``, ``cf_item``,
    ``cf_user``, ``cf_bayes_map``, ``cf_bayes_mse``, ``cf_bayes_mae``,
    ``cf_random``).  An ``alg_type`` that matches nothing in the selected
    branch saves nothing.  Results are written with ``sl.save_to_hadoop``.
    """
    prefix = self.directory + self.data_name
    train_ratings = sl.load_from_hadoop(
        prefix + "_uv_train_" + user_vector + ".pkl", self.sc
    ).repartition(self.num_partitions)

    if content_vector:
        content_vect = sl.load_from_hadoop(
            prefix + "_cv_" + content_vector + ".pkl", self.sc
        ).repartition(self.num_partitions)

        print("Running " + alg_type + " for user vector " + user_vector + " and content vector " + content_vector)
        pred_save_loc = prefix + "_predictions_" + user_vector + "_" + content_vector + "_" + alg_type + ".pkl"
        print(pred_save_loc)

        # num_predictions handed to the k-means predictor, keyed by algorithm name.
        kmeans_sizes = {"cb_kmeans_100": 100, "cb_kmeans_1000": 1000}
        if alg_type == "cb_vect":
            predictions = content_based.predict(
                train_ratings, content_vect,
                num_partitions=self.num_partitions,
            )
        elif alg_type in kmeans_sizes:
            predictions = content_based_kmeans.predict(
                train_ratings, content_vect,
                num_predictions=kmeans_sizes[alg_type],
                num_partitions=self.num_partitions,
            )
        else:
            return
        sl.save_to_hadoop(predictions, pred_save_loc)
        return

    print("Running " + alg_type + " for user vector " + user_vector)
    pred_save_loc = prefix + "_predictions_" + user_vector + "_" + alg_type + ".pkl"
    print(pred_save_loc)

    # cf functions called with num_partitions=..., by algorithm name.
    partitioned = {
        "cf_mllib": "calc_cf_mllib",
        "cf_item": "calc_item_item_cf",
        "cf_user": "calc_user_user_cf2",
    }
    # cf functions called with self.sc as second positional argument.
    with_context = {
        "cf_bayes_map": "calc_naive_bayes_map",
        "cf_bayes_mse": "calc_naive_bayes_mse",
        "cf_bayes_mae": "calc_naive_bayes_mae",
    }

    if alg_type in partitioned:
        compute = getattr(cf, partitioned[alg_type])
        predictions = compute(train_ratings, num_partitions=self.num_partitions)
    elif alg_type in with_context:
        compute = getattr(cf, with_context[alg_type])
        predictions = compute(train_ratings, self.sc)
    elif alg_type == "cf_random":
        predictions = random_recommender.predict(train_ratings, self.sc)
    else:
        return
    sl.save_to_hadoop(predictions, pred_save_loc)
def run_cb_results(self):
    """Evaluate every saved content-based prediction set and store the metrics.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Inputs are read with ``sl.load_from_hadoop``.
    For ``cb_kmeans_100`` / ``cb_kmeans_1000`` the metrics are computed once,
    with ``num_predictions`` 100 / 1000 respectively; for every other
    algorithm they are computed once per value in ``self.results_runs``.
    Each result is merged with the dataset statistics and identifying fields,
    printed, and written as ``str(results)`` under ``self.results_directory``.
    """
    # The k-means predictions were generated for one fixed size, so only
    # that size is evaluated for them.
    fixed_runs = {'cb_kmeans_100': [100], 'cb_kmeans_1000': [1000]}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        # NOTE(review): unlike the ratings below, the content vector is not
        # repartitioned here; kept as in the original.
        content_vect = sl.load_from_hadoop(prefix + '_cv_' + cv + '.pkl', self.sc)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + '_uv_train_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)
            test_ratings = sl.load_from_hadoop(
                prefix + '_uv_test_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)

            # Dataset statistics do not change from run to run, so compute once.
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                if cb_pred in fixed_runs:
                    runs = fixed_runs[cb_pred]
                else:
                    runs = self.results_runs

                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds,
                        content_vect, self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions)
                    results.update(stats)

                    # Identifying fields, in case result files get jumbled.
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CB'
                    results['alg_type'] = cb_pred
                    results['user_vector'] = uv
                    results['content_vector'] = cv
                    print(results)

                    results_path = (self.results_directory + self.data_name + '_results_'
                                    + uv + '_' + cv + '_' + cb_pred + '_' + str(run) + '.csv')
                    print(results_path)
                    # Context manager closes the file even if the write fails.
                    with open(results_path, 'w') as results_file:
                        results_file.write(str(results))

    print('All CB predictions results aquired')
def run_cb_results(self):
    """Evaluate every saved content-based prediction set and store the metrics.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Inputs are read with ``sl.load_from_hadoop``.
    For ``cb_kmeans_100`` / ``cb_kmeans_1000`` the metrics are computed once,
    with ``num_predictions`` 100 / 1000 respectively; for every other
    algorithm they are computed once per value in ``self.results_runs``.
    Each result is merged with the dataset statistics and identifying fields,
    printed, and written as ``str(results)`` under ``self.results_directory``.
    """
    # The k-means predictions were generated for one fixed size, so only
    # that size is evaluated for them.
    fixed_runs = {"cb_kmeans_100": [100], "cb_kmeans_1000": [1000]}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        # NOTE(review): unlike the ratings below, the content vector is not
        # repartitioned here; kept as in the original.
        content_vect = sl.load_from_hadoop(prefix + "_cv_" + cv + ".pkl", self.sc)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + "_uv_train_" + uv + ".pkl", self.sc
            ).repartition(self.num_partitions)
            test_ratings = sl.load_from_hadoop(
                prefix + "_uv_test_" + uv + ".pkl", self.sc
            ).repartition(self.num_partitions)

            # Dataset statistics do not change from run to run, so compute once.
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                print("Getting results for: " + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                if cb_pred in fixed_runs:
                    runs = fixed_runs[cb_pred]
                else:
                    runs = self.results_runs

                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings,
                        train_ratings,
                        preds,
                        content_vect,
                        self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions,
                    )
                    results.update(stats)

                    # Identifying fields, in case result files get jumbled.
                    results["N"] = run
                    results["dataset"] = self.data_name
                    results["CF_CB"] = "CB"
                    results["alg_type"] = cb_pred
                    results["user_vector"] = uv
                    results["content_vector"] = cv
                    print(results)

                    results_path = (
                        self.results_directory + self.data_name + "_results_"
                        + uv + "_" + cv + "_" + cb_pred + "_" + str(run) + ".csv"
                    )
                    print(results_path)
                    # Context manager closes the file even if the write fails.
                    with open(results_path, "w") as results_file:
                        results_file.write(str(results))

    print("All CB predictions results aquired")
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Evaluate one saved prediction set and write its metrics to a results file.

    Args:
        user_vector: user vector name used in the train/test/prediction paths.
        content_vector: content vector name; always loaded and passed to
            ``performance_metrics.get_perform_metrics``, and part of the
            prediction/results paths when ``alg_type == "cb"``.
        alg_type: ``"cb"`` selects the content-based path layout; any other
            value selects the layout without the content vector component.
        algorithm: algorithm name embedded in the prediction/results paths.
        num_preds: forwarded as ``num_predictions`` and stored as ``N``.

    The metrics are merged with the dataset statistics and identifying
    fields, printed, and written as ``str(results)`` under
    ``self.results_directory``.
    """
    prefix = self.directory + self.data_name

    train_ratings = sl.load_from_hadoop(
        prefix + "_uv_train_" + user_vector + ".pkl", self.sc
    ).repartition(self.num_partitions)
    test_ratings = sl.load_from_hadoop(
        prefix + "_uv_test_" + user_vector + ".pkl", self.sc
    ).repartition(self.num_partitions)
    content_vect = sl.load_from_hadoop(
        prefix + "_cv_" + content_vector + ".pkl", self.sc
    ).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    is_content_based = alg_type == "cb"
    if is_content_based:
        # Content-based files carry the content vector name in their path.
        tag = user_vector + "_" + content_vector + "_" + algorithm
    else:
        tag = user_vector + "_" + algorithm
    pred_save_loc = prefix + "_predictions_" + tag + ".pkl"
    results_path = (
        self.results_directory + self.data_name + "_results_"
        + tag + "_" + str(num_preds) + ".csv"
    )

    print("Getting results for: " + pred_save_loc)
    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings,
        train_ratings,
        preds,
        content_vect,
        self.sqlCtx,
        num_predictions=num_preds,
        num_partitions=self.num_partitions,
    )
    results.update(stats)

    # Identifying fields, in case result files get jumbled.
    results["N"] = num_preds
    results["dataset"] = self.data_name
    # Previously always "CB", which mislabelled results produced through the
    # non-"cb" branch; the label now follows alg_type.
    results["CF_CB"] = "CB" if is_content_based else "CF"
    results["alg_type"] = algorithm
    results["user_vector"] = user_vector
    results["content_vector"] = content_vector
    print(results)

    print(results_path)
    # Context manager closes the file even if the write fails.
    with open(results_path, "w") as results_file:
        results_file.write(str(results))
def run_single_prediction(self, user_vector, content_vector, alg_type):
    """Compute and save predictions for one user vector / algorithm combination.

    The training ratings for ``user_vector`` are loaded with
    ``sl.load_from_hadoop`` and repartitioned to ``self.num_partitions``.

    When ``content_vector`` is truthy the content vector is loaded as well and
    ``alg_type`` is matched against the content-based names (``cb_vect``,
    ``cb_kmeans_100``, ``cb_kmeans_1000``).  Otherwise ``alg_type`` is matched
    against the collaborative-filtering names (``cf_mllib``, ``cf_item``,
    ``cf_user``, ``cf_bayes_map``, ``cf_bayes_mse``, ``cf_bayes_mae``,
    ``cf_random``).  An ``alg_type`` that matches nothing in the selected
    branch saves nothing.  Results are written with ``sl.save_to_hadoop``.
    """
    prefix = self.directory + self.data_name
    train_ratings = sl.load_from_hadoop(
        prefix + '_uv_train_' + user_vector + '.pkl', self.sc
    ).repartition(self.num_partitions)

    if content_vector:
        content_vect = sl.load_from_hadoop(
            prefix + '_cv_' + content_vector + '.pkl', self.sc
        ).repartition(self.num_partitions)

        print('Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector)
        pred_save_loc = prefix + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
        print(pred_save_loc)

        # num_predictions handed to the k-means predictor, keyed by algorithm name.
        kmeans_sizes = {'cb_kmeans_100': 100, 'cb_kmeans_1000': 1000}
        if alg_type == 'cb_vect':
            predictions = content_based.predict(
                train_ratings, content_vect,
                num_partitions=self.num_partitions)
        elif alg_type in kmeans_sizes:
            predictions = content_based_kmeans.predict(
                train_ratings, content_vect,
                num_predictions=kmeans_sizes[alg_type],
                num_partitions=self.num_partitions)
        else:
            return
        sl.save_to_hadoop(predictions, pred_save_loc)
        return

    print('Running ' + alg_type + ' for user vector ' + user_vector)
    pred_save_loc = prefix + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
    print(pred_save_loc)

    # cf functions called with num_partitions=..., by algorithm name.
    partitioned = {
        'cf_mllib': 'calc_cf_mllib',
        'cf_item': 'calc_item_item_cf',
        'cf_user': 'calc_user_user_cf2',
    }
    # cf functions called with self.sc as second positional argument.
    with_context = {
        'cf_bayes_map': 'calc_naive_bayes_map',
        'cf_bayes_mse': 'calc_naive_bayes_mse',
        'cf_bayes_mae': 'calc_naive_bayes_mae',
    }

    if alg_type in partitioned:
        compute = getattr(cf, partitioned[alg_type])
        predictions = compute(train_ratings, num_partitions=self.num_partitions)
    elif alg_type in with_context:
        compute = getattr(cf, with_context[alg_type])
        predictions = compute(train_ratings, self.sc)
    elif alg_type == 'cf_random':
        predictions = random_recommender.predict(train_ratings, self.sc)
    else:
        return
    sl.save_to_hadoop(predictions, pred_save_loc)
def run_cb_results(self):
    """Evaluate every saved content-based prediction set and store the metrics.

    Loops over ``self.content_vector_types`` x ``self.user_vector_types`` x
    ``self.cb_predictions``.  Inputs are read with ``sl.load_from_hadoop``.
    For ``cb_kmeans_100`` / ``cb_kmeans_1000`` the metrics are computed once,
    with ``num_predictions`` 100 / 1000 respectively; for every other
    algorithm they are computed once per value in ``self.results_runs``.
    Each result is merged with the dataset statistics and identifying fields,
    printed, and written as ``str(results)`` under ``self.results_directory``.
    """
    # The k-means predictions were generated for one fixed size, so only
    # that size is evaluated for them.
    fixed_runs = {'cb_kmeans_100': [100], 'cb_kmeans_1000': [1000]}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name
        # NOTE(review): unlike the ratings below, the content vector is not
        # repartitioned here; kept as in the original.
        content_vect = sl.load_from_hadoop(prefix + '_cv_' + cv + '.pkl', self.sc)

        for uv in self.user_vector_types:
            train_ratings = sl.load_from_hadoop(
                prefix + '_uv_train_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)
            test_ratings = sl.load_from_hadoop(
                prefix + '_uv_test_' + uv + '.pkl', self.sc
            ).repartition(self.num_partitions)

            # Dataset statistics do not change from run to run, so compute once.
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                if cb_pred in fixed_runs:
                    runs = fixed_runs[cb_pred]
                else:
                    runs = self.results_runs

                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds,
                        content_vect, self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions)
                    results.update(stats)

                    # Identifying fields, in case result files get jumbled.
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CB'
                    results['alg_type'] = cb_pred
                    results['user_vector'] = uv
                    results['content_vector'] = cv
                    print(results)

                    results_path = (self.results_directory + self.data_name + '_results_'
                                    + uv + '_' + cv + '_' + cb_pred + '_' + str(run) + '.csv')
                    print(results_path)
                    # Context manager closes the file even if the write fails.
                    with open(results_path, 'w') as results_file:
                        results_file.write(str(results))

    print('All CB predictions results aquired')