def run_cf_results(self):
    """Compute and save performance metrics for every CF prediction set.

    For each user vector type the train/test ratings and the first
    configured content vector are loaded and repartitioned to
    ``self.num_partitions``.  For each entry of ``self.cf_predictions`` the
    stored predictions are loaded, and metrics are computed once per value
    of ``self.results_runs`` (passed as ``num_predictions``).

    Each result dictionary is merged with the dataset statistics, tagged
    with identifying metadata, printed, and written as ``str(results)`` to
    ``<results_directory><data_name>_results_<uv>_<cf_pred>_<N>.pkl``.
    Note that the file holds the plain-text repr of the dictionary, not a
    pickle, despite its extension.
    """
    for uv in self.user_vector_types:
        prefix = self.directory + self.data_name

        train_ratings_loc = prefix + "_uv_train_" + uv + ".pkl"
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = prefix + "_uv_test_" + uv + ".pkl"
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        # get the first content vector for results purposes
        content_type = self.content_vector_types[0]
        content_path = prefix + "_cv_" + content_type + ".pkl"
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        # Calculate statistics about the dataset
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = prefix + "_predictions_" + uv + "_" + cf_pred + ".pkl"
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print("Getting results for: " + pred_save_loc)
            preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings,
                    train_ratings,
                    preds,
                    content_vect,
                    self.sqlCtx,
                    num_predictions=run,
                    num_partitions=self.num_partitions,
                )
                # Merge the stats (which do not change run to run) with the results
                results.update(stats)

                # add some information to the results dictionary if it gets jumbled
                results["N"] = run
                results["dataset"] = self.data_name
                results["CF_CB"] = "CF"
                results["alg_type"] = cf_pred
                results["user_vector"] = uv
                results["content_vector"] = content_type
                print(results)

                # save off the results
                results_path = (
                    self.results_directory
                    + self.data_name
                    + "_results_"
                    + uv
                    + "_"
                    + cf_pred
                    + "_"
                    + str(run)
                    + ".pkl"
                )
                # The context manager closes the handle even if write() raises.
                with open(results_path, "w") as f:
                    f.write(str(results))

    print("All CF predictions results aquired")
def run_cf_results(self):
    """Evaluate every collaborative-filtering prediction set and save the metrics.

    Iterates over ``self.user_vector_types`` x ``self.cf_predictions`` x
    ``self.results_runs``.  Ratings, the first configured content vector and
    the predictions are loaded via ``sl.load_from_hadoop`` and repartitioned
    to ``self.num_partitions``.  The metrics dictionary for each run is
    merged with the dataset statistics, labelled, printed, and written as
    text (``str(results)``) to a ``_results_<uv>_<cf_pred>_<N>.pkl`` file
    under ``self.results_directory``.
    """
    for uv in self.user_vector_types:
        base = self.directory + self.data_name

        train_ratings_loc = base + '_uv_train_' + uv + '.pkl'
        train_ratings = sl.load_from_hadoop(
            train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = base + '_uv_test_' + uv + '.pkl'
        test_ratings = sl.load_from_hadoop(
            test_ratings_loc, self.sc).repartition(self.num_partitions)

        # get the first content vector for results purposes
        first_cv = self.content_vector_types[0]
        content_path = base + '_cv_' + first_cv + '.pkl'
        content_vect = sl.load_from_hadoop(
            content_path, self.sc).repartition(self.num_partitions)

        # Calculate statistics about the dataset
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = base + '_predictions_' + uv + '_' + cf_pred + '.pkl'
            # print(...) with one argument is equivalent to the Python 2
            # print statement and remains valid syntax on Python 3.
            print('Getting results for: ' + pred_save_loc)
            preds = sl.load_from_hadoop(
                pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings, train_ratings, preds, content_vect,
                    self.sqlCtx, num_predictions=run,
                    num_partitions=self.num_partitions)
                # Merge the stats (which do not change run to run) with the results
                results.update(stats)

                # add some information to the results dictionary if it gets jumbled
                results['N'] = run
                results['dataset'] = self.data_name
                results['CF_CB'] = 'CF'
                results['alg_type'] = cf_pred
                results['user_vector'] = uv
                results['content_vector'] = first_cv
                print(results)

                # save off the results
                results_path = (self.results_directory + self.data_name
                                + '_results_' + uv + '_' + cf_pred
                                + '_' + str(run) + '.pkl')
                # "with" guarantees the file is closed if the write fails.
                with open(results_path, 'w') as f:
                    f.write(str(results))

    print('All CF predictions results aquired')
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Compute and save performance metrics for one stored prediction set.

    Args:
        user_vector: user vector type name; selects the train/test ratings
            files and is part of the prediction/result file names.
        content_vector: content vector type name; selects the content
            vector file and, for ``alg_type == 'cb'``, is part of the
            prediction/result file names.
        alg_type: ``'cb'`` selects the content-based file naming scheme;
            any other value selects the scheme without the content vector
            (the one used for collaborative filtering).
        algorithm: algorithm name embedded in the file names.
        num_preds: passed to the metrics as ``num_predictions`` and
            embedded in the result file name.

    The result dictionary is merged with the dataset statistics, tagged
    with metadata, printed, and written as ``str(results)`` to a ``.csv``
    path under ``self.results_directory``.
    """
    base = self.directory + self.data_name

    train_ratings_loc = base + '_uv_train_' + user_vector + '.pkl'
    train_ratings = sl.load_from_hadoop(
        train_ratings_loc, self.sc).repartition(self.num_partitions)
    test_ratings_loc = base + '_uv_test_' + user_vector + '.pkl'
    test_ratings = sl.load_from_hadoop(
        test_ratings_loc, self.sc).repartition(self.num_partitions)

    content_path = base + '_cv_' + content_vector + '.pkl'
    content_vect = sl.load_from_hadoop(
        content_path, self.sc).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    # Both file names share one tag; only the 'cb' scheme includes the
    # content vector.  The family label follows the same branch: it used to
    # be hard-coded to 'CB', which mislabelled non-'cb' runs.
    if alg_type == 'cb':
        run_tag = user_vector + '_' + content_vector + '_' + algorithm
        family = 'CB'
    else:
        run_tag = user_vector + '_' + algorithm
        family = 'CF'

    pred_save_loc = base + '_predictions_' + run_tag + '.pkl'
    results_path = (self.results_directory + self.data_name + '_results_'
                    + run_tag + '_' + str(num_preds) + '.csv')

    # Single-argument print(...) matches the Python 2 print statement and
    # is also valid on Python 3.
    print('Getting results for: ' + pred_save_loc)
    preds = sl.load_from_hadoop(
        pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
        num_predictions=num_preds, num_partitions=self.num_partitions)
    # Merge the stats (which do not change run to run) with the results
    results.update(stats)

    # add some information to the results dictionary if it gets jumbled
    results['N'] = num_preds
    results['dataset'] = self.data_name
    results['CF_CB'] = family
    results['alg_type'] = algorithm
    results['user_vector'] = user_vector
    results['content_vector'] = content_vector
    print(results)

    # save off the results
    print(results_path)
    # The context manager closes the handle even if write() raises.
    with open(results_path, 'w') as f:
        f.write(str(results))
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Evaluate a single prediction file and write its metrics to disk.

    Args:
        user_vector: user vector type; picks the ``_uv_train_`` and
            ``_uv_test_`` ratings files.
        content_vector: content vector type; picks the ``_cv_`` file and is
            included in file names when ``alg_type == 'cb'``.
        alg_type: ``'cb'`` for the content-based naming scheme; anything
            else uses the naming scheme without the content vector.
        algorithm: algorithm name used in the file names and stored as
            ``results['alg_type']``.
        num_preds: forwarded as ``num_predictions`` and stored as
            ``results['N']``.

    Output is the text ``str(results)`` written to a ``.csv`` path under
    ``self.results_directory``; the dictionary and the path are also
    printed.
    """
    data_prefix = self.directory + self.data_name

    train_ratings_loc = data_prefix + '_uv_train_' + user_vector + '.pkl'
    train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
    test_ratings_loc = data_prefix + '_uv_test_' + user_vector + '.pkl'
    test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

    content_path = data_prefix + '_cv_' + content_vector + '.pkl'
    content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    is_content_based = alg_type == 'cb'
    if is_content_based:
        name_parts = user_vector + '_' + content_vector + '_' + algorithm
    else:
        name_parts = user_vector + '_' + algorithm

    pred_save_loc = data_prefix + '_predictions_' + name_parts + '.pkl'
    results_path = (self.results_directory + self.data_name + '_results_'
                    + name_parts + '_' + str(num_preds) + '.csv')

    # print(...) with one argument is equivalent to the print statement on
    # Python 2 and keeps the block parseable on Python 3.
    print('Getting results for: ' + pred_save_loc)
    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
        num_predictions=num_preds, num_partitions=self.num_partitions)
    # Merge the stats (which do not change run to run) with the results
    results.update(stats)

    # add some information to the results dictionary if it gets jumbled
    results['N'] = num_preds
    results['dataset'] = self.data_name
    # Label the family from alg_type; it was previously always 'CB', even
    # for runs that used the non-content-based naming scheme.
    results['CF_CB'] = 'CB' if is_content_based else 'CF'
    results['alg_type'] = algorithm
    results['user_vector'] = user_vector
    results['content_vector'] = content_vector
    print(results)

    # save off the results
    print(results_path)
    # "with" closes the file even when the write raises.
    with open(results_path, 'w') as f:
        f.write(str(results))
def run_cf_results(self):
    """Produce result files for all collaborative-filtering predictions.

    For every user vector type, loads the train and test ratings plus the
    first entry of ``self.content_vector_types`` and computes the dataset
    statistics once.  Then, for every CF prediction name and every N in
    ``self.results_runs``, computes the performance metrics, merges in the
    statistics and identifying metadata, prints the dictionary and writes
    ``str(results)`` to
    ``<results_directory><data_name>_results_<uv>_<cf_pred>_<N>.pkl``.
    """
    for uv in self.user_vector_types:
        stem = self.directory + self.data_name

        train_ratings_loc = stem + '_uv_train_' + uv + '.pkl'
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = stem + '_uv_test_' + uv + '.pkl'
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        # get the first content vector for results purposes
        cv_name = self.content_vector_types[0]
        content_path = stem + '_cv_' + cv_name + '.pkl'
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        # Calculate statistics about the dataset
        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        for cf_pred in self.cf_predictions:
            pred_save_loc = stem + '_predictions_' + uv + '_' + cf_pred + '.pkl'
            # Call-style print with one argument is the same output on
            # Python 2 and valid syntax on Python 3.
            print('Getting results for: ' + pred_save_loc)
            preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

            for run in self.results_runs:
                results = performance_metrics.get_perform_metrics(
                    test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                    num_predictions=run, num_partitions=self.num_partitions)
                # Merge the stats (which do not change run to run) with the results
                results.update(stats)

                # add some information to the results dictionary if it gets jumbled
                results['N'] = run
                results['dataset'] = self.data_name
                results['CF_CB'] = 'CF'
                results['alg_type'] = cf_pred
                results['user_vector'] = uv
                results['content_vector'] = cv_name
                print(results)

                # save off the results
                results_path = (self.results_directory + self.data_name + '_results_'
                                + uv + '_' + cf_pred + '_' + str(run) + '.pkl')
                # Closing is handled by the context manager, including on error.
                with open(results_path, 'w') as f:
                    f.write(str(results))

    print('All CF predictions results aquired')
def run_cb_results(self):
    """Compute and save performance metrics for every content-based prediction set.

    Iterates over ``self.content_vector_types`` x ``self.user_vector_types``
    x ``self.cb_predictions``.  For each combination the stored predictions
    are loaded and metrics are computed for one or more values of N:

    * ``'cb_kmeans_100'`` is evaluated only at N=100 and
      ``'cb_kmeans_1000'`` only at N=1000;
    * every other prediction name is evaluated at each value in
      ``self.results_runs``.

    Each result dictionary is merged with the dataset statistics, tagged
    with metadata, printed, and written as ``str(results)`` to
    ``<results_directory><data_name>_results_<uv>_<cv>_<cb_pred>_<N>.csv``.
    """
    # Prediction names that are evaluated at one fixed N instead of at
    # every entry of self.results_runs.
    fixed_runs = {'cb_kmeans_100': 100, 'cb_kmeans_1000': 1000}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name

        content_path = prefix + '_cv_' + cv + '.pkl'
        # NOTE(review): unlike the ratings and predictions, the content
        # vector is not repartitioned here; kept as in the original.
        content_vect = sl.load_from_hadoop(content_path, self.sc)

        for uv in self.user_vector_types:
            train_ratings_loc = prefix + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = prefix + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                # Single-argument print(...) behaves the same as the print
                # statement on Python 2 and also parses on Python 3.
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                # if we ran the kmeans we do not need to complete both runs
                # otherwise we do
                if cb_pred in fixed_runs:
                    runs = [fixed_runs[cb_pred]]
                else:
                    runs = self.results_runs

                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                        num_predictions=run, num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # add some information to the results dictionary if it gets jumbled
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CB'
                    results['alg_type'] = cb_pred
                    results['user_vector'] = uv
                    results['content_vector'] = cv
                    print(results)

                    # save off the results
                    results_path = (self.results_directory + self.data_name + '_results_'
                                    + uv + '_' + cv + '_' + cb_pred + '_' + str(run) + '.csv')
                    print(results_path)
                    # The context manager closes the handle even if write() raises.
                    with open(results_path, 'w') as f:
                        f.write(str(results))

    print('All CB predictions results aquired')
def run_cb_results(self):
    """Evaluate every content-based prediction set and save the metrics.

    For each content vector type the content vector is loaded once; for
    each user vector type the train/test ratings are loaded, repartitioned
    to ``self.num_partitions`` and summarised with
    ``dataset_stats.get_dataset_stats``.  Each name in
    ``self.cb_predictions`` is then evaluated:

    * ``'cb_kmeans_100'`` / ``'cb_kmeans_1000'`` at the single N in the
      name (100 / 1000);
    * any other name at every N in ``self.results_runs``.

    The labelled metrics dictionary is printed and written as text
    (``str(results)``) to a ``_results_<uv>_<cv>_<cb_pred>_<N>.csv`` file
    under ``self.results_directory``.
    """
    for cv in self.content_vector_types:
        base = self.directory + self.data_name

        content_path = base + '_cv_' + cv + '.pkl'
        # NOTE(review): the content vector is loaded without repartition(),
        # unlike the other inputs; behaviour preserved from the original.
        content_vect = sl.load_from_hadoop(content_path, self.sc)

        for uv in self.user_vector_types:
            train_ratings_loc = base + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = base + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(
                test_ratings_loc, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(
                train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = (base + '_predictions_' + uv + '_' + cv
                                 + '_' + cb_pred + '.pkl')
                # print(...) with one argument is equivalent to the Python 2
                # print statement and remains valid syntax on Python 3.
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(
                    pred_save_loc, self.sc).repartition(self.num_partitions)

                # if we ran the kmeans we do not need to complete both runs
                # otherwise we do
                if cb_pred == 'cb_kmeans_1000':
                    runs = [1000]
                elif cb_pred == 'cb_kmeans_100':
                    runs = [100]
                else:
                    runs = self.results_runs

                # The k-means and generic cases previously duplicated this
                # whole body; only the list of N values differs.
                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect,
                        self.sqlCtx, num_predictions=run,
                        num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # add some information to the results dictionary if it gets jumbled
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CB'
                    results['alg_type'] = cb_pred
                    results['user_vector'] = uv
                    results['content_vector'] = cv
                    print(results)

                    # save off the results
                    results_path = (self.results_directory + self.data_name
                                    + '_results_' + uv + '_' + cv + '_'
                                    + cb_pred + '_' + str(run) + '.csv')
                    print(results_path)
                    # "with" guarantees the file is closed if the write fails.
                    with open(results_path, 'w') as f:
                        f.write(str(results))

    print('All CB predictions results aquired')
def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
    """Compute and save the performance metrics of one prediction set.

    Args:
        user_vector: user vector type name used to locate the train/test
            ratings and to build the prediction/result file names.
        content_vector: content vector type name used to locate the content
            vector; part of the file names only when ``alg_type == "cb"``.
        alg_type: ``"cb"`` selects file names that include the content
            vector; any other value selects names without it.
        algorithm: algorithm name embedded in the file names and stored
            under ``results["alg_type"]``.
        num_preds: forwarded to the metrics as ``num_predictions`` and
            stored under ``results["N"]``.

    The metrics dictionary is merged with the dataset statistics, tagged,
    printed, and written as ``str(results)`` to a ``.csv`` path under
    ``self.results_directory`` (the path is printed as well).
    """
    prefix = self.directory + self.data_name

    train_ratings_loc = prefix + "_uv_train_" + user_vector + ".pkl"
    train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
    test_ratings_loc = prefix + "_uv_test_" + user_vector + ".pkl"
    test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

    content_path = prefix + "_cv_" + content_vector + ".pkl"
    content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

    stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

    # The prediction and result names share the same middle section, which
    # includes the content vector only for content-based runs.  The family
    # label is chosen in the same branch; it was previously always "CB".
    if alg_type == "cb":
        suffix = user_vector + "_" + content_vector + "_" + algorithm
        family = "CB"
    else:
        suffix = user_vector + "_" + algorithm
        family = "CF"

    pred_save_loc = prefix + "_predictions_" + suffix + ".pkl"
    results_path = (
        self.results_directory
        + self.data_name
        + "_results_"
        + suffix
        + "_"
        + str(num_preds)
        + ".csv"
    )

    # Single-argument print(...) matches the Python 2 print statement and
    # is also valid on Python 3.
    print("Getting results for: " + pred_save_loc)
    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

    results = performance_metrics.get_perform_metrics(
        test_ratings,
        train_ratings,
        preds,
        content_vect,
        self.sqlCtx,
        num_predictions=num_preds,
        num_partitions=self.num_partitions,
    )
    # Merge the stats (which do not change run to run) with the results
    results.update(stats)

    # add some information to the results dictionary if it gets jumbled
    results["N"] = num_preds
    results["dataset"] = self.data_name
    results["CF_CB"] = family
    results["alg_type"] = algorithm
    results["user_vector"] = user_vector
    results["content_vector"] = content_vector
    print(results)

    # save off the results
    print(results_path)
    # The context manager closes the handle even if write() raises.
    with open(results_path, "w") as f:
        f.write(str(results))
def run_cb_results(self):
    """Produce result files for all content-based predictions.

    Loops over every content vector type, user vector type and entry of
    ``self.cb_predictions``.  Predictions named ``"cb_kmeans_100"`` or
    ``"cb_kmeans_1000"`` are evaluated at N=100 or N=1000 respectively;
    all other predictions are evaluated at each N in ``self.results_runs``.

    For every evaluation the metrics are merged with the dataset
    statistics, tagged with metadata, printed, and written as
    ``str(results)`` to
    ``<results_directory><data_name>_results_<uv>_<cv>_<cb_pred>_<N>.csv``.
    """
    # Predictions evaluated at a single fixed N rather than at all results_runs.
    single_run_for = {"cb_kmeans_100": 100, "cb_kmeans_1000": 1000}

    for cv in self.content_vector_types:
        prefix = self.directory + self.data_name

        content_path = prefix + "_cv_" + cv + ".pkl"
        # NOTE(review): no repartition() on the content vector here, unlike
        # the ratings and predictions; left unchanged.
        content_vect = sl.load_from_hadoop(content_path, self.sc)

        for uv in self.user_vector_types:
            train_ratings_loc = prefix + "_uv_train_" + uv + ".pkl"
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = prefix + "_uv_test_" + uv + ".pkl"
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cb_pred in self.cb_predictions:
                pred_save_loc = prefix + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                # Call-style print with one argument is the same output on
                # Python 2 and valid syntax on Python 3.
                print("Getting results for: " + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                # if we ran the kmeans we do not need to complete both runs
                # otherwise we do
                if cb_pred in single_run_for:
                    runs = [single_run_for[cb_pred]]
                else:
                    runs = self.results_runs

                for run in runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings,
                        train_ratings,
                        preds,
                        content_vect,
                        self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions,
                    )
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # add some information to the results dictionary if it gets jumbled
                    results["N"] = run
                    results["dataset"] = self.data_name
                    results["CF_CB"] = "CB"
                    results["alg_type"] = cb_pred
                    results["user_vector"] = uv
                    results["content_vector"] = cv
                    print(results)

                    # save off the results
                    results_path = (
                        self.results_directory
                        + self.data_name
                        + "_results_"
                        + uv
                        + "_"
                        + cv
                        + "_"
                        + cb_pred
                        + "_"
                        + str(run)
                        + ".csv"
                    )
                    print(results_path)
                    # Closing is handled by the context manager, including on error.
                    with open(results_path, "w") as f:
                        f.write(str(results))

    print("All CB predictions results aquired")