def for_each_fold(fold, folds, data, labels, model, error_function):
    """Fit ``model`` on one cross-validation fold and score its predictions.

    The data is split with ``partition_data(data, labels, fold, folds)``; the
    model is fitted on the training part and evaluated on the test part.

    Parameters
    ----------
    fold, folds, data, labels :
        Forwarded unchanged to ``partition_data``.
    model :
        Object exposing ``fit(x, y)`` and ``predict(x)``.
    error_function : None or str
        ``None`` or ``'precision'`` selects ``precision_score``;
        ``'accuracy'``, ``'recall'`` and ``'f1'`` select the matching scorer.

    Returns
    -------
    dict
        ``'expected labels'`` (test labels), ``'predicted labels'`` (model
        output) and ``'errors'`` (one-element list holding the score).

    Raises
    ------
    ValueError
        If ``error_function`` is not one of the supported values.
    """
    (x_train, y_train), (x_test, y_test) = partition_data(data, labels, fold, folds)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # One single if/elif chain: previously the 'accuracy' test started a new
    # ``if``, so None and 'precision' computed a score and then still fell
    # through to the final ``else`` and raised ValueError.
    if error_function is None or error_function == 'precision':
        # Precision is the default when no error function is given.
        error = precision_score(y_test, y_pred)
    elif error_function == 'accuracy':
        error = accuracy_score(y_test, y_pred)
    elif error_function == 'recall':
        error = recall_score(y_test, y_pred)
    elif error_function == 'f1':
        error = f1_score(y_test, y_pred)
    else:
        raise ValueError('%s error function is not defined.' % error_function)

    return {'expected labels': y_test, 'predicted labels': y_pred, 'errors': [error]}
def cross_validation_whole():
    """Cross-validate a random forest over ten fixed row groups and print stats.

    Reads the module-level ``data`` array. For every group the rows of that
    group form the test set and all remaining rows the training set. Features
    are a fixed subset of the first 48 columns; column 48 is the label.
    Prints min / median / max of accuracy and precision across the groups,
    first one value per line and then as two summary lines.
    """
    # Inclusive [first_row, last_row] bounds of the specified groups.
    group_bounds = [[0, 36], [37, 73], [74, 110], [111, 146], [147, 182],
                    [183, 216], [217, 253], [254, 290], [291, 328], [329, 365]]
    feature_columns = [0, 1, 38, 39, 40, 41, 42, 43]  # F1+F12

    accuracies = []
    precisions = []
    for first_row, last_row in group_bounds:
        held_out_rows = np.arange(first_row, last_row + 1)
        remaining = np.delete(data, held_out_rows, axis=0)

        X_test = data[held_out_rows, :48][:, feature_columns].astype(np.float64)
        X_train = remaining[:, :48][:, feature_columns].astype(np.float64)
        y_test = data[held_out_rows, 48]
        y_train = remaining[:, 48]

        # Alternative classifiers that were tried:
        # clf = svm.SVC(kernel='rbf', probability=0, C=1).fit(X_train, y_train)
        # clf = LogisticRegression(solver='liblinear', C=10).fit(X_train, y_train)
        # clf = DecisionTreeClassifier(criterion='entropy', max_features='auto', min_samples_leaf=2).fit(X_train, y_train)
        clf = RandomForestClassifier(criterion='gini', n_estimators=10,
                                     min_samples_leaf=1)
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracies.append(metrics.accuracy_score(y_test, y_pred, normalize=True))
        precisions.append(metrics.precision_score(y_test, y_pred, pos_label="1"))

    res_accuracy = np.array(accuracies)
    res_precision = np.array(precisions)

    # Raw values, one per line: min, median, max for each metric.
    for values in (res_accuracy, res_precision):
        print(values.min())
        print(statistics.median(values))
        print(values.max())

    # Labelled one-line summaries.
    acc_summary = ("max acc: " + str(res_accuracy.max())
                   + " min acc: " + str(res_accuracy.min())
                   + " mediana acc: " + str(statistics.median(res_accuracy)))
    prec_summary = ("max prec: " + str(res_precision.max())
                    + " min prec: " + str(res_precision.min())
                    + " mediana prec: " + str(statistics.median(res_precision)))
    print(acc_summary)
    print(prec_summary)
def evaluate(self, darray, thr):
    """Run the network over ``darray`` in batches and report binary metrics.

    Batches are pulled with ``self.get_batch(darray, self.batch_size, i)``
    for i = 0, 1, ... until an empty batch is returned. The network output
    for each sample is turned into a 0/1 prediction by comparing it with
    ``thr`` (strictly greater than ``thr`` means 1).

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` as computed by ``metrics``.
    """
    score_chunks = []
    label_chunks = []

    batch_index = 0
    X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                               batch_index)
    while len(X_batch) > 0:
        num_batch = len(y_batch)
        # Every dropout placeholder is fed 1.0 and train_phase is False
        # (presumably keep-probabilities, i.e. dropout off -- confirm).
        feed_dict = {
            self.vocab_index: X_batch,
            self.props: P_batch,
            self.label: y_batch,
            self.first_level_lstm_dropout_p:
                [1.0] * len(self.first_level_lstm_dropout),
            self.deep_dropout_p: [1.0] * len(self.deep_dropout),
            self.conv_pool_dropout_p: [1.0] * len(self.conv_pool_dropout),
            self.second_level_lstm_dropout_p:
                [1.0] * len(self.second_level_lstm_dropout),
            self.train_phase: False,
        }
        batch_out = self.sess.run(self.out, feed_dict=feed_dict)

        # Flatten this batch's outputs and labels to 1-D before collecting.
        score_chunks.append(np.reshape(batch_out, (num_batch, )))
        label_chunks.append(np.reshape(y_batch, (num_batch, )))

        batch_index += 1
        X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                                   batch_index)

    # With no batches at all both stay None, so the thresholding below
    # fails with a TypeError.
    y_pred = np.concatenate(score_chunks) if score_chunks else None
    y_label = np.concatenate(label_chunks) if label_chunks else None

    pred = [1 if score > thr else 0 for score in y_pred]

    accuracy = metrics.accuracy_score(y_label, pred)
    precision = metrics.precision_score(y_label, pred)
    recall = metrics.recall_score(y_label, pred)
    f1 = metrics.f1_score(y_label, pred)
    return accuracy, precision, recall, f1
def step(self):
    """
    Epochs step, training and validation.

    Runs ``self.train_step`` on every batch of ``self.train_loader``, then
    evaluates the model on every batch of ``self.val_loader`` with gradients
    disabled.

    Return:
        [training_loss, validation_loss, accuracy, precision, recall],
        each the mean over the epoch's batches as a 0-dim torch tensor.
    """
    batch_loss, batch_val_loss = [], []
    batch_accuracy, batch_precision, batch_recall = [], [], []

    # Training loop
    for x_batch, y_batch in self.train_loader:
        x_batch = x_batch.to(self.device)
        y_batch = y_batch.to(self.device)
        loss = self.train_step(x_batch, y_batch)
        batch_loss.append(loss)

    with torch.no_grad():
        # Validation loop
        for x_val, y_val in self.val_loader:
            x_val = x_val.to(self.device)
            y_val = y_val.to(self.device)
            self.model.eval()
            yhat = self.model(x_val)
            val_loss = self.criterion(yhat, y_val)
            batch_val_loss.append(val_loss)

            # Argmax over axis 1 of targets and outputs gives class indices;
            # computed once and shared by all three metrics.
            y_true = np.argmax(y_val.cpu().detach().numpy(), axis=1)
            y_pred = np.argmax(yhat.cpu().detach().numpy(), axis=1)

            # The scorers take (y_true, y_pred). Passing them the other way
            # round leaves accuracy unchanged but swaps precision and recall.
            batch_accuracy.append(accuracy_score(y_true, y_pred))
            batch_precision.append(precision_score(y_true, y_pred))
            batch_recall.append(recall_score(y_true, y_pred))

        # step lr scheduler using val_loss
        # NOTE(review): stepped once per epoch with the loss of the LAST
        # validation batch (not the epoch mean) -- confirm this is intended.
        # Skipped when the validation loader yielded no batch.
        if self.scheduler is not None and batch_val_loss:
            self.scheduler.step(batch_val_loss[-1])

    return [
        torch.mean(torch.Tensor(batch_loss)),
        torch.mean(torch.Tensor(batch_val_loss)),
        torch.mean(torch.Tensor(batch_accuracy)),
        torch.mean(torch.Tensor(batch_precision)),
        torch.mean(torch.Tensor(batch_recall)),
    ]
def generate_classification_perf(truths, pred_probs, multiclass=False):
    """Build a ClassificationModelPerf from truths and predicted probabilities.

    Hard class predictions are obtained by rounding ``pred_probs``. AUROC,
    AUPRC and cross-entropy loss are always computed; the ROC curve,
    accuracy, recall, precision and F1 are only computed when ``multiclass``
    is falsy and are otherwise reported as ``None``. Warnings raised while
    computing the metrics are suppressed.
    """
    def binary_only(scorer, predictions):
        # Metrics that are reported as None in multiclass mode.
        if multiclass:
            return None
        return scorer(truths, predictions)

    pred_classes = np.round(pred_probs).astype(int)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Computed one at a time, in the order of the constructor's fields.
        auroc = metrics.roc_auc_score(truths, pred_probs)
        auroc_curve = binary_only(metrics.roc_curve, pred_probs)
        auprc = metrics.average_precision_score(truths, pred_probs)
        accuracy = binary_only(metrics.accuracy_score, pred_classes)
        recall = binary_only(metrics.recall_score, pred_classes)
        precision = binary_only(metrics.precision_score, pred_classes)
        f1 = binary_only(metrics.f1_score, pred_classes)
        # Summed log loss divided by the total number of entries in truths.
        summed_loss = metrics.log_loss(truths, pred_probs, normalize=False)
        ce_loss = summed_loss / np.prod(truths.shape)

        perf = ClassificationModelPerf(
            auroc=auroc,
            auroc_curve=auroc_curve,
            auprc=auprc,
            accuracy=accuracy,
            recall=recall,
            precision=precision,
            f1=f1,
            ce_loss=ce_loss,
        )
    return perf
def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
    """
    Evaluate on the folds of a dataset split

    Two passes are made. The error pass (MAE / NMAE / RMSE) cross-validates
    over the individual ratings of the sampled users. The IR pass
    (precision / recall / F1) cross-validates over users: for every
    evaluated user the ``at`` highest-valued preferences are withheld, the
    recommender is rebuilt without them, and its top ``at`` recommendations
    are compared with the withheld items.

    Parameters
    ----------
    recommender: The BaseRecommender instance
        The recommender instance to be evaluated.

    metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
        If metrics is None, all metrics available will be evaluated.
        Otherwise only the specified metric is evaluated.

    sampling_users: float or sampling, optional, default = 0.7
        If an float is passed, it is the percentage of evaluated
        users. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    cv: integer or crossvalidation, optional, default = None
        If an integer is passed, it is the number of fold (default 3).
        Specific sampling objects can be passed, see
        scikits.crab.metrics.cross_validation module for the list of
        possible objects.

    at: integer, optional, default = 3
        This number at is the 'at' value, as in 'precision at 5'. For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.

    permutation: boolean, optional, default = True
        Forwarded to the ``split`` call of the user sampling object.

    Returns
    -------
    permutation_scores: dict
        Per-fold results: key 'error' holds one dict per rating fold and
        key 'ir' one dict per user fold. A key is absent when no result
        of that family was produced.

    scores: dict
        ``scores['final_error']`` holds the 'avg' and 'stdev' over the
        folds for every metric evaluated. The IR metrics are merged into
        this same 'final_error' entry.

    Raises
    ------
    ValueError
        If ``metric`` is neither None nor a key of ``evaluation_metrics``.
    """
    sampling_users = kwargs.pop('sampling_users', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric not in evaluation_metrics and metric is not None:
        raise ValueError('metric %s is not recognized. valid keywords '
                         'are %s' % (metric, list(evaluation_metrics.keys())))

    permutation_scores_error = []
    permutation_scores_ir = []
    final_score_error = {'avg': {}, 'stdev': {}}
    final_score_ir = {'avg': {}, 'stdev': {}}

    # Whether preferences are (item, value) pairs or bare item ids; the
    # model is not modified here, so this is asked once.
    has_values = recommender.model.has_preference_values()

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    # Select the users to be evaluated and flatten their ratings into one
    # list of (user_id, preference) so the folds can index single ratings.
    total_ratings = []
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        total_ratings.extend((user_id, preference)
                             for preference in preferences)

    n_ratings = len(total_ratings)
    cross_val = check_cv(cv, n_ratings)

    # Defining the splits and run on the splits.
    for train_set, test_set in cross_val:
        training_set = {}
        testing_set = {}

        for idx in train_set:
            user_id, pref = total_ratings[idx]
            user_prefs = training_set.setdefault(user_id, {})
            if has_values:
                user_prefs[pref[0]] = pref[1]
            else:
                user_prefs[pref] = 1.0

        for idx in test_set:
            user_id, pref = total_ratings[idx]
            testing_set.setdefault(user_id, []).append(
                pref if has_values else (pref, 1.0))

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        real_preferences = []
        estimated_preferences = []
        # .items() instead of the Python-2-only .iteritems().
        for user_id, preferences in testing_set.items():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                except Exception:
                    # It is possible that an item exists in the test data
                    # but not training data in which case an exception
                    # will be thrown. Just ignore it and move on.
                    # (``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.)
                    continue
                real_preferences.append(preference)
                estimated_preferences.append(estimated)

        # Record the error results of this fold.
        if metric in ('rmse', 'mae', 'nmae'):
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                fold_score = eval_function(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
            else:
                fold_score = eval_function(real_preferences,
                                           estimated_preferences)
            permutation_scores_error.append({metric: fold_score})
        elif metric is None:
            # All error metrics
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            permutation_scores_error.append(
                {'mae': mae, 'nmae': nmae, 'rmse': rmse})

    # IR_Statistics (Precision, Recall and F1-Score)
    n_users = recommender.model.users_count()
    cross_val = check_cv(cv, n_users)

    # Only the training indices of each user fold are evaluated.
    for train_idx, _test_idx in cross_val:
        relevant_arrays = []
        real_arrays = []
        for user_id in user_ids[train_idx]:
            preferences = list(
                recommender.model.preferences_from_user(user_id))
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # List some most-preferred items that would count as relevant
            if not has_values:
                preferences = [(preference, 1.0)
                               for preference in preferences]
            preferences = sorted(preferences, key=lambda x: x[1],
                                 reverse=True)
            relevant_item_ids = [item_id
                                 for item_id, _value in preferences[:at]]
            if not relevant_item_ids:
                continue

            # Build the training set: every user's preferences, minus the
            # relevant items of the user under evaluation.
            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)
                if not has_values:
                    preferences_other_user = [
                        (preference, 1.0)
                        for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = [
                        pref for pref in preferences_other_user
                        if pref[0] not in relevant_item_ids]
                    # The evaluated user is left out entirely when nothing
                    # remains after withholding.
                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set,
                                                           recommender)
            try:
                preferences = list(
                    recommender_training.model.preferences_from_user(
                        user_id))
            except Exception:
                # Excluded all prefs for the user. move on.
                continue
            if not preferences:
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Record the IR results of this fold.
        if metric in ('precision', 'recall', 'f1score'):
            eval_function = evaluation_metrics[metric]
            permutation_scores_ir.append(
                {metric: eval_function(real_arrays, relevant_arrays)})
        elif metric is None:
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)
            permutation_scores_ir.append(
                {'precision': p, 'recall': r, 'f1score': f})

    # Compute the final score for Error Statistics: gather the per-fold
    # values under 'avg', then replace each list by its mean after taking
    # the standard deviation.
    for result in permutation_scores_error:
        for key in result:
            final_score_error['avg'].setdefault(key, []).append(result[key])
    for key in final_score_error['avg']:
        final_score_error['stdev'][key] = np.std(
            final_score_error['avg'][key])
        final_score_error['avg'][key] = np.average(
            final_score_error['avg'][key])

    # Compute the final score for IR statistics
    for result in permutation_scores_ir:
        for key in result:
            final_score_ir['avg'].setdefault(key, []).append(result[key])
    for key in final_score_ir['avg']:
        final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
        final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

    permutation_scores = {}
    scores = {}
    if permutation_scores_error:
        permutation_scores['error'] = permutation_scores_error
        scores['final_error'] = final_score_error
    if permutation_scores_ir:
        permutation_scores['ir'] = permutation_scores_ir
        # IR summaries are merged into the 'final_error' entry.
        scores.setdefault('final_error', {})
        scores['final_error'].setdefault('avg', {})
        scores['final_error'].setdefault('stdev', {})
        scores['final_error']['avg'].update(final_score_ir['avg'])
        scores['final_error']['stdev'].update(final_score_ir['stdev'])

    return permutation_scores, scores
def evaluate(self, recommender, metric=None, **kwargs):
    """
    Evaluates the predictor

    The error metrics (MAE / NMAE / RMSE) are computed on a per-user
    train/test split of the ratings. The IR metrics (precision / recall /
    F1) are computed by withholding each evaluated user's ``at``
    highest-valued preferences, rebuilding the recommender and comparing
    its top ``at`` recommendations with the withheld items.

    Parameters
    ----------
    recommender: The BaseRecommender instance
        The recommender instance to be evaluated.

    metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
        If metrics is None, all metrics available will be evaluated.
        Otherwise only the specified metric is evaluated.

    sampling_users: float or sampling, optional, default = None
        If an float is passed, it is the percentage of evaluated
        users. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    sampling_ratings: float or sampling, optional, default = 0.7
        If an float is passed, it is the percentage of evaluated
        ratings. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    at: integer, optional, default = 3
        This number at is the 'at' value, as in 'precision at 5'. For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.

    permutation: boolean, optional, default = True
        Forwarded to the ``split`` calls of the sampling objects.

    Returns
    -------
    Returns a dictionary containing the evaluation results: the single
    requested metric, or all of
    (NMAE, MAE, RMSE, Precision, Recall, F1-Score) when metric is None.

    Raises
    ------
    ValueError
        If ``metric`` is neither None nor a key of ``evaluation_metrics``.
    """
    sampling_users = kwargs.pop('sampling_users', None)
    sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric not in evaluation_metrics and metric is not None:
        raise ValueError('metric %s is not recognized. valid keywords '
                         'are %s' % (metric, list(evaluation_metrics.keys())))

    # Whether preferences are (item, value) pairs or bare item ids; the
    # model is not modified here, so this is asked once.
    has_values = recommender.model.has_preference_values()

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    training_set = {}
    testing_set = {}

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        # Select the ratings to be evaluated.
        preferences = recommender.model.preferences_from_user(user_id)
        sampling_eval = check_sampling(sampling_ratings, len(preferences))
        train_set, test_set = sampling_eval.split(indices=True,
                                                  permutation=permutation)

        preferences = list(preferences)
        if not preferences:
            training_set[user_id] = {}
            testing_set[user_id] = []
        elif has_values:
            training_set[user_id] = dict(preferences[idx]
                                         for idx in train_set)
            testing_set[user_id] = [preferences[idx] for idx in test_set]
        else:
            # Bare item ids get an implicit preference value of 1.0.
            training_set[user_id] = dict((preferences[idx], 1.0)
                                         for idx in train_set)
            testing_set[user_id] = [(preferences[idx], 1.0)
                                    for idx in test_set]

    # Evaluate the recommender.
    recommender_training = self._build_recommender(training_set,
                                                   recommender)

    real_preferences = []
    estimated_preferences = []
    # .items() instead of the Python-2-only .iteritems().
    for user_id, preferences in testing_set.items():
        for item_id, preference in preferences:
            # Estimate the preferences
            try:
                estimated = recommender_training.estimate_preference(
                    user_id, item_id)
            except ItemNotFoundError:
                # It is possible that an item exists in the test data but
                # not training data in which case an exception will be
                # thrown. Just ignore it and move on.
                continue
            real_preferences.append(preference)
            estimated_preferences.append(estimated)

    # Return the error results.
    if metric in ('rmse', 'mae', 'nmae'):
        eval_function = evaluation_metrics[metric]
        if metric == 'nmae':
            return {
                metric: eval_function(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
            }
        return {
            metric: eval_function(real_preferences, estimated_preferences)
        }

    # IR_Statistics
    relevant_arrays = []
    real_arrays = []

    # Same sampled users as for the error statistics.
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        if len(preferences) < 2 * at:
            # Really not enough prefs to meaningfully evaluate the user
            continue

        # List some most-preferred items that would count as relevant
        if not has_values:
            preferences = [(preference, 1.0) for preference in preferences]
        preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
        relevant_item_ids = [item_id
                             for item_id, _value in preferences[:at]]
        if not relevant_item_ids:
            continue

        # Build the training set: every user's preferences, minus the
        # relevant items of the user under evaluation.
        training_set = {}
        for other_user_id in recommender.model.user_ids():
            preferences_other_user = \
                recommender.model.preferences_from_user(other_user_id)
            if not has_values:
                preferences_other_user = [
                    (preference, 1.0)
                    for preference in preferences_other_user]
            if other_user_id == user_id:
                preferences_other_user = [
                    pref for pref in preferences_other_user
                    if pref[0] not in relevant_item_ids]
                # The evaluated user is left out entirely when nothing
                # remains after withholding.
                if preferences_other_user:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)
            else:
                training_set[other_user_id] = dict(preferences_other_user)

        # Evaluate the recommender
        recommender_training = self._build_recommender(training_set,
                                                       recommender)
        try:
            preferences = list(
                recommender_training.model.preferences_from_user(user_id))
        except Exception:
            # Excluded all prefs for the user. move on.
            # (``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.)
            continue
        if not preferences:
            continue

        recommended_items = recommender_training.recommend(user_id, at)
        relevant_arrays.append(list(relevant_item_ids))
        real_arrays.append(list(recommended_items))

    relevant_arrays = np.array(relevant_arrays)
    real_arrays = np.array(real_arrays)

    # Return the IR results.
    if metric in ('precision', 'recall', 'f1score'):
        eval_function = evaluation_metrics[metric]
        return {metric: eval_function(real_arrays, relevant_arrays)}

    if metric is None:
        # Return all
        mae, nmae, rmse = evaluation_error(
            real_preferences, estimated_preferences,
            recommender.model.maximum_preference_value(),
            recommender.model.minimum_preference_value())
        f = f1_score(real_arrays, relevant_arrays)
        r = recall_score(real_arrays, relevant_arrays)
        p = precision_score(real_arrays, relevant_arrays)
        return {
            'mae': mae,
            'nmae': nmae,
            'rmse': rmse,
            'precision': p,
            'recall': r,
            'f1score': f,
        }
y = data[:, 48] # last column in file X_train, X_test, y_train, y_test = train_test_split( df, y, test_size=0.2, random_state=0) clf = RandomForestClassifier(n_estimators=ne, criterion=crit, max_features=mf, min_samples_leaf=msl) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) results.append( "criterion " + crit + ", n_estimators " + str(ne) + ", max_features " + mf + ", min_samples_leaf " + str(msl) + ", accuracy " + str(metrics.accuracy_score( y_test, y_pred, normalize=True)) + ", precision " + str(metrics.precision_score(y_test, y_pred, pos_label="1")) ) for res in results: print(res) #Decision Tree """ criterion = ['gini', 'entropy'] max_features = ['auto', 'sqrt', 'log2'] min_samples_leaf = [ 1, 2, 3, 4, 5] for crit in criterion: for mf in max_features: for msl in min_samples_leaf: df = pd.DataFrame(data[:, :48],
def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
    """
    Evaluate on the folds of a dataset split

    Two passes are made. The error pass (MAE / NMAE / RMSE) cross-validates
    over the individual ratings of the sampled users. The IR pass
    (precision / recall / F1) cross-validates over users: for every
    evaluated user the ``at`` highest-valued preferences are withheld, the
    recommender is rebuilt without them, and its top ``at`` recommendations
    are compared with the withheld items.

    Parameters
    ----------
    recommender: The BaseRecommender instance
        The recommender instance to be evaluated.

    metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
        If metrics is None, all metrics available will be evaluated.
        Otherwise only the specified metric is evaluated.

    sampling_users: float or sampling, optional, default = 0.7
        If an float is passed, it is the percentage of evaluated
        users. If sampling_users is None, all users are used in the
        evaluation. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    cv: integer or crossvalidation, optional, default = None
        If an integer is passed, it is the number of fold (default 3).
        Specific sampling objects can be passed, see
        scikits.crab.metrics.cross_validation module for the list of
        possible objects.

    at: integer, optional, default = 3
        This number at is the 'at' value, as in 'precision at 5'. For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.

    permutation: boolean, optional, default = True
        Forwarded to the ``split`` call of the user sampling object.

    Returns
    -------
    permutation_scores: dict
        Per-fold results: key 'error' holds one dict per rating fold and
        key 'ir' one dict per user fold. A key is absent when no result
        of that family was produced.

    scores: dict
        ``scores['final_error']`` holds the 'avg' and 'stdev' over the
        folds for every metric evaluated. The IR metrics are merged into
        this same 'final_error' entry.

    Raises
    ------
    ValueError
        If ``metric`` is neither None nor a key of ``evaluation_metrics``.
    """
    sampling_users = kwargs.pop('sampling_users', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric not in evaluation_metrics and metric is not None:
        raise ValueError('metric %s is not recognized. valid keywords '
                         'are %s' % (metric, list(evaluation_metrics.keys())))

    permutation_scores_error = []
    permutation_scores_ir = []
    final_score_error = {'avg': {}, 'stdev': {}}
    final_score_ir = {'avg': {}, 'stdev': {}}

    # Whether preferences are (item, value) pairs or bare item ids; the
    # model is not modified here, so this is asked once.
    has_values = recommender.model.has_preference_values()

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    # Select the users to be evaluated and flatten their ratings into one
    # list of (user_id, preference) so the folds can index single ratings.
    total_ratings = []
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        total_ratings.extend((user_id, preference)
                             for preference in preferences)

    n_ratings = len(total_ratings)
    cross_val = check_cv(cv, n_ratings)

    # Defining the splits and run on the splits.
    for train_set, test_set in cross_val:
        training_set = {}
        testing_set = {}

        for idx in train_set:
            user_id, pref = total_ratings[idx]
            user_prefs = training_set.setdefault(user_id, {})
            if has_values:
                user_prefs[pref[0]] = pref[1]
            else:
                user_prefs[pref] = 1.0

        for idx in test_set:
            user_id, pref = total_ratings[idx]
            testing_set.setdefault(user_id, []).append(
                pref if has_values else (pref, 1.0))

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        real_preferences = []
        estimated_preferences = []
        # .items() instead of the Python-2-only .iteritems().
        for user_id, preferences in testing_set.items():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                except Exception:
                    # It is possible that an item exists in the test data
                    # but not training data in which case an exception
                    # will be thrown. Just ignore it and move on.
                    # (``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.)
                    continue
                real_preferences.append(preference)
                estimated_preferences.append(estimated)

        # Record the error results of this fold.
        if metric in ('rmse', 'mae', 'nmae'):
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                fold_score = eval_function(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
            else:
                fold_score = eval_function(real_preferences,
                                           estimated_preferences)
            permutation_scores_error.append({metric: fold_score})
        elif metric is None:
            # All error metrics
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            permutation_scores_error.append(
                {'mae': mae, 'nmae': nmae, 'rmse': rmse})

    # IR_Statistics (Precision, Recall and F1-Score)
    n_users = recommender.model.users_count()
    cross_val = check_cv(cv, n_users)

    # Only the training indices of each user fold are evaluated.
    for train_idx, _test_idx in cross_val:
        relevant_arrays = []
        real_arrays = []
        for user_id in user_ids[train_idx]:
            preferences = list(
                recommender.model.preferences_from_user(user_id))
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # List some most-preferred items that would count as relevant
            if not has_values:
                preferences = [(preference, 1.0)
                               for preference in preferences]
            preferences = sorted(preferences, key=lambda x: x[1],
                                 reverse=True)
            relevant_item_ids = [item_id
                                 for item_id, _value in preferences[:at]]
            if not relevant_item_ids:
                continue

            # Build the training set: every user's preferences, minus the
            # relevant items of the user under evaluation.
            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)
                if not has_values:
                    preferences_other_user = [
                        (preference, 1.0)
                        for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = [
                        pref for pref in preferences_other_user
                        if pref[0] not in relevant_item_ids]
                    # The evaluated user is left out entirely when nothing
                    # remains after withholding.
                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set,
                                                           recommender)
            try:
                preferences = list(
                    recommender_training.model.preferences_from_user(
                        user_id))
            except Exception:
                # Excluded all prefs for the user. move on.
                continue
            if not preferences:
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Record the IR results of this fold.
        if metric in ('precision', 'recall', 'f1score'):
            eval_function = evaluation_metrics[metric]
            permutation_scores_ir.append(
                {metric: eval_function(real_arrays, relevant_arrays)})
        elif metric is None:
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)
            permutation_scores_ir.append(
                {'precision': p, 'recall': r, 'f1score': f})

    # Compute the final score for Error Statistics: gather the per-fold
    # values under 'avg', then replace each list by its mean after taking
    # the standard deviation.
    for result in permutation_scores_error:
        for key in result:
            final_score_error['avg'].setdefault(key, []).append(result[key])
    for key in final_score_error['avg']:
        final_score_error['stdev'][key] = np.std(
            final_score_error['avg'][key])
        final_score_error['avg'][key] = np.average(
            final_score_error['avg'][key])

    # Compute the final score for IR statistics
    for result in permutation_scores_ir:
        for key in result:
            final_score_ir['avg'].setdefault(key, []).append(result[key])
    for key in final_score_ir['avg']:
        final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
        final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

    permutation_scores = {}
    scores = {}
    if permutation_scores_error:
        permutation_scores['error'] = permutation_scores_error
        scores['final_error'] = final_score_error
    if permutation_scores_ir:
        permutation_scores['ir'] = permutation_scores_ir
        # IR summaries are merged into the 'final_error' entry.
        scores.setdefault('final_error', {})
        scores['final_error'].setdefault('avg', {})
        scores['final_error'].setdefault('stdev', {})
        scores['final_error']['avg'].update(final_score_ir['avg'])
        scores['final_error']['stdev'].update(final_score_ir['stdev'])

    return permutation_scores, scores
def evaluate(self, recommender, metric=None, **kwargs):
    """
    Evaluates the predictor

    Parameters
    ----------
    recommender: The BaseRecommender instance
        The recommender instance to be evaluated.

    metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
        If metric is None, all metrics available will be evaluated.
        Otherwise it will return the specified metric evaluated.

    sampling_users: float or sampling, optional, default = None
        If a float is passed, it is the percentage of evaluated users.
        If sampling_users is None, all users are used in the evaluation.
        Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    sampling_ratings: float or sampling, optional, default = 0.7
        If a float is passed, it is the percentage of evaluated ratings;
        with the default, 70% will be used in the training set and 30%
        in the test set. Specific sampling objects can be passed, see
        scikits.crab.metrics.sampling module for the list of possible
        objects.

    permutation: optional, default = True
        Forwarded unchanged as the ``permutation`` argument of every
        sampling ``split`` call made here.

    at: integer, optional, default = 3
        This number at is the 'at' value, as in 'precision at 5'. For
        example this would mean precision or recall evaluated by removing
        the top 5 preferences for a user and then finding the percentage of
        those 5 items included in the top 5 recommendations for that user.
        Users with fewer than ``2 * at`` preferences are skipped in the
        precision/recall/f1 evaluation.

    Returns
    -------
    Returns a dictionary containing the evaluation results. When a single
    metric is requested the dictionary has that metric as its only key;
    when metric is None it has the keys 'mae', 'nmae', 'rmse',
    'precision', 'recall' and 'f1score'. A metric that is registered in
    evaluation_metrics but is none of those six names falls through and
    yields None.

    Raises
    ------
    ValueError
        If metric is neither None nor a key of evaluation_metrics.
    """
    sampling_users = kwargs.pop('sampling_users', None)
    sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric is not None and metric not in evaluation_metrics:
        # Single-line literal: a backslash continuation inside the string
        # would leak the source indentation into the message.
        raise ValueError(
            'metric %s is not recognized. valid keywords are %s'
            % (metric, list(evaluation_metrics.keys())))

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    training_set = {}
    testing_set = {}

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        # Select the ratings to be evaluated.
        preferences = recommender.model.preferences_from_user(user_id)
        sampling_eval = check_sampling(sampling_ratings, len(preferences))
        train_set, test_set = sampling_eval.split(indices=True,
                                                  permutation=permutation)

        preferences = list(preferences)
        if not recommender.model.has_preference_values():
            # Boolean models only list item ids; give each one a unit rating
            # so both kinds of model yield (item_id, value) pairs.
            preferences = [(item_id, 1.0) for item_id in preferences]

        if preferences:
            training_set[user_id] = dict(preferences[idx]
                                         for idx in train_set)
            testing_set[user_id] = [preferences[idx] for idx in test_set]
        else:
            training_set[user_id] = {}
            testing_set[user_id] = []

    # Evaluate the recommender.
    recommender_training = self._build_recommender(training_set,
                                                   recommender)

    real_preferences = []
    estimated_preferences = []

    # .items() instead of the Python-2-only .iteritems().
    for user_id, preferences in testing_set.items():
        for item_id, preference in preferences:
            # Estimate the preferences
            try:
                estimated = recommender_training.estimate_preference(
                    user_id, item_id)
                real_preferences.append(preference)
            except ItemNotFoundError:
                # It is possible that an item exists in the test data but
                # not training data in which case an exception will be
                # thrown. Just ignore it and move on.
                continue
            estimated_preferences.append(estimated)

    # Return the error results.
    if metric in ['rmse', 'mae', 'nmae']:
        eval_function = evaluation_metrics[metric]
        if metric == 'nmae':
            return {metric: eval_function(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())}
        return {metric: eval_function(real_preferences,
                                      estimated_preferences)}

    # IR_Statistics
    relevant_arrays = []
    real_arrays = []

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        if len(preferences) < 2 * at:
            # Really not enough prefs to meaningfully evaluate the user
            continue

        # The `at` highest-valued preferences count as the relevant items.
        if not recommender.model.has_preference_values():
            preferences = [(item_id, 1.0) for item_id in preferences]

        preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
        relevant_item_ids = [item_id
                             for item_id, _value in preferences[:at]]

        if not relevant_item_ids:
            continue

        # Training data: everything from every user, except the relevant
        # items of the user under evaluation.
        training_set = {}
        for other_user_id in recommender.model.user_ids():
            preferences_other_user = \
                recommender.model.preferences_from_user(other_user_id)

            if not recommender.model.has_preference_values():
                preferences_other_user = [
                    (item_id, 1.0) for item_id in preferences_other_user]

            if other_user_id == user_id:
                preferences_other_user = [
                    pref for pref in preferences_other_user
                    if pref[0] not in relevant_item_ids]
                # The evaluated user is left out entirely when nothing
                # remains after removing the relevant items.
                if preferences_other_user:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)
            else:
                training_set[other_user_id] = dict(preferences_other_user)

        # Evaluate the recommender
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        try:
            remaining = list(
                recommender_training.model.preferences_from_user(user_id))
        except Exception:
            # Excluded all prefs for the user. move on.
            # (Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.)
            continue
        if not remaining:
            continue

        recommended_items = recommender_training.recommend(user_id, at)
        relevant_arrays.append(list(relevant_item_ids))
        real_arrays.append(list(recommended_items))

    relevant_arrays = np.array(relevant_arrays)
    real_arrays = np.array(real_arrays)

    # Return the IR results.
    if metric in ['precision', 'recall', 'f1score']:
        eval_function = evaluation_metrics[metric]
        return {metric: eval_function(real_arrays, relevant_arrays)}

    if metric is None:
        # Return all
        mae, nmae, rmse = evaluation_error(
            real_preferences, estimated_preferences,
            recommender.model.maximum_preference_value(),
            recommender.model.minimum_preference_value())
        f = f1_score(real_arrays, relevant_arrays)
        r = recall_score(real_arrays, relevant_arrays)
        p = precision_score(real_arrays, relevant_arrays)

        return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                'precision': p, 'recall': r, 'f1score': f}
def precision_score(self, x_test, y_test):
    """Score this object's predictions for ``x_test`` against ``y_test``.

    Calls ``self.predict(x_test)`` and returns the result of
    ``metrics.precision_score`` with ``y_test`` as the first argument and
    the predictions as the second.
    """
    return metrics.precision_score(y_test, self.predict(x_test))
recommended_items = recommender_training.recommend(user_id, at) relevant_arrays.append(list(relevant_item_ids)) real_arrays.append(list(recommended_items)) relevant_arrays = np.array(relevant_arrays) real_arrays = np.array(real_arrays) #Return the IR results. if metric in ['precision', 'recall', 'f1score']: eval_function = evaluation_metrics[metric] permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)}) elif metric is None: f = f1_score(real_arrays, relevant_arrays) r = recall_score(real_arrays, relevant_arrays) p = precision_score(real_arrays, relevant_arrays) permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f}) #Compute the final score for Error Statistics for result in permutation_scores_error: for key in result: final_score_error['avg'].setdefault(key, []) final_score_error['avg'][key].append(result[key]) for key in final_score_error['avg']: final_score_error['stdev'][key] = np.std(final_score_error['avg'][key]) final_score_error['avg'][key] = np.average(final_score_error['avg'][key]) #Compute the final score for IR statistics for result in permutation_scores_ir: for key in result: final_score_ir['avg'].setdefault(key, [])
def evaluate(self, recommender, metric=None, **kwargs):
    """
    Evaluate a recommender on a train/test split of its own data model.

    Parameters
    ----------
    recommender:
        The recommender to evaluate. Its ``model`` attribute supplies the
        users and preferences, and it is handed to
        ``self._build_recommender`` together with each training set.

    metric: None or a key of evaluation_metrics, optional
        'rmse', 'mae' and 'nmae' return right after the error evaluation;
        'precision', 'recall' and 'f1score' return after the ranking
        evaluation; None returns all six.

    sampling_users: optional, default = None
        Passed to ``check_sampling`` with the number of users to choose
        which users are evaluated.

    sampling_ratings: optional, default = 0.7
        Passed to ``check_sampling`` with each user's number of
        preferences to split them into training and test ratings.

    permutation: optional, default = True
        Forwarded unchanged as the ``permutation`` argument of every
        sampling ``split`` call made here.

    at: integer, optional, default = 3
        Number of top-valued preferences withheld per user and number of
        recommendations requested for the ranking metrics. Users with
        fewer than ``2 * at`` preferences are skipped there.

    Returns
    -------
    dict
        ``{metric: value}`` for a single metric, or a dictionary with the
        keys 'mae', 'nmae', 'rmse', 'precision', 'recall' and 'f1score'
        when metric is None. A metric registered in evaluation_metrics
        that is none of those six names falls through and yields None.

    Raises
    ------
    ValueError
        If metric is neither None nor a key of evaluation_metrics.
    """
    sampling_users = kwargs.pop('sampling_users', None)
    sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric is not None and metric not in evaluation_metrics:
        # Single-line literal: a backslash continuation inside the string
        # would leak the source indentation into the message.
        raise ValueError(
            'metric %s is not recognized. valid keywords are %s'
            % (metric, list(evaluation_metrics.keys())))

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    training_set = {}
    testing_set = {}

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        # Select the ratings to be evaluated.
        preferences = recommender.model.preferences_from_user(user_id)
        sampling_eval = check_sampling(sampling_ratings, len(preferences))
        train_set, test_set = sampling_eval.split(indices=True,
                                                  permutation=permutation)

        rated = list(preferences)
        if not recommender.model.has_preference_values():
            # Models without preference values only list item ids; pair
            # each with 1.0 so every entry is an (item_id, value) tuple.
            rated = [(item_id, 1.0) for item_id in rated]

        if rated:
            training_set[user_id] = dict(rated[idx] for idx in train_set)
            testing_set[user_id] = [rated[idx] for idx in test_set]
        else:
            training_set[user_id] = {}
            testing_set[user_id] = []

    # Evaluate the recommender.
    recommender_training = self._build_recommender(training_set,
                                                   recommender)

    real_preferences = []
    estimated_preferences = []

    # .items() instead of the Python-2-only .iteritems().
    for user_id, held_out in testing_set.items():
        for item_id, preference in held_out:
            # Estimate the preferences
            try:
                estimated = recommender_training.estimate_preference(
                    user_id, item_id)
                real_preferences.append(preference)
            except ItemNotFoundError:
                # It is possible that an item exists in the test data but
                # not training data in which case an exception will be
                # thrown. Just ignore it and move on.
                continue
            estimated_preferences.append(estimated)

    # Return the error results.
    if metric in ['rmse', 'mae', 'nmae']:
        eval_function = evaluation_metrics[metric]
        if metric == 'nmae':
            return {metric: eval_function(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())}
        return {metric: eval_function(real_preferences,
                                      estimated_preferences)}

    # IR_Statistics
    relevant_arrays = []
    real_arrays = []

    # Select the users to be evaluated.
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        if len(preferences) < 2 * at:
            # Really not enough prefs to meaningfully evaluate the user
            continue

        # The `at` highest-valued preferences count as the relevant items.
        if not recommender.model.has_preference_values():
            preferences = [(item_id, 1.0) for item_id in preferences]

        preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
        relevant_item_ids = [item_id
                             for item_id, _value in preferences[:at]]

        if not relevant_item_ids:
            continue

        # Training data: everything from every user, except the relevant
        # items of the user under evaluation.
        training_set = {}
        for other_user_id in recommender.model.user_ids():
            preferences_other_user = \
                recommender.model.preferences_from_user(other_user_id)

            if not recommender.model.has_preference_values():
                preferences_other_user = [
                    (item_id, 1.0) for item_id in preferences_other_user]

            if other_user_id == user_id:
                preferences_other_user = [
                    pref for pref in preferences_other_user
                    if pref[0] not in relevant_item_ids]
                # The evaluated user is left out entirely when nothing
                # remains after removing the relevant items.
                if preferences_other_user:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)
            else:
                training_set[other_user_id] = dict(preferences_other_user)

        # Evaluate the recommender
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        try:
            remaining = list(
                recommender_training.model.preferences_from_user(user_id))
        except Exception:
            # Excluded all prefs for the user. move on.
            # (Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.)
            continue
        if not remaining:
            continue

        recommended_items = recommender_training.recommend(user_id, at)
        relevant_arrays.append(list(relevant_item_ids))
        real_arrays.append(list(recommended_items))

    relevant_arrays = np.array(relevant_arrays)
    real_arrays = np.array(real_arrays)

    # Return the IR results.
    if metric in ['precision', 'recall', 'f1score']:
        eval_function = evaluation_metrics[metric]
        return {metric: eval_function(real_arrays, relevant_arrays)}

    if metric is None:
        # Return all
        mae, nmae, rmse = evaluation_error(
            real_preferences, estimated_preferences,
            recommender.model.maximum_preference_value(),
            recommender.model.minimum_preference_value())
        f = f1_score(real_arrays, relevant_arrays)
        r = recall_score(real_arrays, relevant_arrays)
        p = precision_score(real_arrays, relevant_arrays)

        return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                'precision': p, 'recall': r, 'f1score': f}
def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
    """
    Evaluate a recommender over cross-validation splits.

    Error metrics are computed on splits of the pooled ratings of the
    sampled users; ranking metrics are computed on splits of the users.

    Parameters
    ----------
    recommender:
        The recommender to evaluate. Its ``model`` attribute supplies the
        users and preferences, and it is handed to
        ``self._build_recommender`` together with each training set.

    metric: None or a key of evaluation_metrics, optional
        'rmse', 'mae' or 'nmae' record only that error metric per split;
        'precision', 'recall' or 'f1score' record only that ranking
        metric per split; None records all of them.

    cv: optional, default = None
        Passed to ``check_cv`` once with the number of pooled ratings and
        once with the number of users; each result is iterated as
        ``(train indices, test indices)`` pairs.

    sampling_users: optional, default = 0.7
        Passed to ``check_sampling`` with the number of users to choose
        the users whose ratings are pooled for the error metrics.

    permutation: optional, default = True
        Forwarded as the ``permutation`` argument of the user sampling
        ``split`` call.

    at: integer, optional, default = 3
        Number of top-valued preferences withheld per user and number of
        recommendations requested for the ranking metrics. Users with
        fewer than ``2 * at`` preferences are skipped there.

    Returns
    -------
    (permutation_scores, scores): tuple of dict
        ``permutation_scores`` maps 'error' and/or 'ir' to the list of
        per-split result dictionaries (a key is absent when its list is
        empty). ``scores`` holds, under the single key 'final_error', the
        dictionaries 'avg' and 'stdev' with the per-metric average and
        standard deviation over the splits; the ranking metrics are
        merged into that same 'final_error' entry.

    Raises
    ------
    ValueError
        If metric is neither None nor a key of evaluation_metrics.
    """

    def _summarize(split_scores):
        # Collapse a list of per-split {metric: value} dicts into
        # {'avg': {metric: mean}, 'stdev': {metric: std}}.
        collected = {}
        for result in split_scores:
            for key in result:
                collected.setdefault(key, []).append(result[key])
        summary = {'avg': {}, 'stdev': {}}
        for key in collected:
            summary['stdev'][key] = np.std(collected[key])
            summary['avg'][key] = np.average(collected[key])
        return summary

    sampling_users = kwargs.pop('sampling_users', 0.7)
    permutation = kwargs.pop('permutation', True)
    at = kwargs.pop('at', 3)

    if metric is not None and metric not in evaluation_metrics:
        # Single-line literal: a backslash continuation inside the string
        # would leak the source indentation into the message.
        raise ValueError(
            'metric %s is not recognized. valid keywords are %s'
            % (metric, list(evaluation_metrics.keys())))

    permutation_scores_error = []
    permutation_scores_ir = []

    n_users = recommender.model.users_count()
    sampling_users = check_sampling(sampling_users, n_users)
    users_set, _ = sampling_users.split(permutation=permutation)

    # Pool the ratings of the sampled users as (user_id, preference) pairs.
    total_ratings = []
    user_ids = recommender.model.user_ids()
    for user_id in user_ids[users_set]:
        preferences = list(recommender.model.preferences_from_user(user_id))
        total_ratings.extend([(user_id, preference)
                              for preference in preferences])

    n_ratings = len(total_ratings)
    cross_val = check_cv(cv, n_ratings)

    # Defining the splits and run on the splits.
    for train_set, test_set in cross_val:
        training_set = {}
        testing_set = {}

        for idx in train_set:
            user_id, pref = total_ratings[idx]
            user_ratings = training_set.setdefault(user_id, {})
            if recommender.model.has_preference_values():
                user_ratings[pref[0]] = pref[1]
            else:
                # Without preference values `pref` is the bare item id.
                user_ratings[pref] = 1.0

        for idx in test_set:
            user_id, pref = total_ratings[idx]
            held_out = testing_set.setdefault(user_id, [])
            if recommender.model.has_preference_values():
                held_out.append(pref)
            else:
                held_out.append((pref, 1.0))

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        real_preferences = []
        estimated_preferences = []

        # .items() instead of the Python-2-only .iteritems().
        for user_id, preferences in testing_set.items():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except Exception:
                    # It is possible that an item exists in the test data
                    # but not training data in which case an exception
                    # will be thrown. Just ignore it and move on.
                    # (Exception rather than a bare except, so
                    # KeyboardInterrupt and SystemExit still propagate.)
                    continue
                estimated_preferences.append(estimated)

        # Record the error results of this split.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                permutation_scores_error.append({
                    metric: eval_function(
                        real_preferences, estimated_preferences,
                        recommender.model.maximum_preference_value(),
                        recommender.model.minimum_preference_value())})
            else:
                permutation_scores_error.append(
                    {metric: eval_function(real_preferences,
                                           estimated_preferences)})
        elif metric is None:
            # Record all
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                             'rmse': rmse})

    # IR_Statistics (Precision, Recall and F1-Score)
    n_users = recommender.model.users_count()
    cross_val = check_cv(cv, n_users)

    for train_idx, _test_idx in cross_val:
        relevant_arrays = []
        real_arrays = []

        for user_id in user_ids[train_idx]:
            preferences = list(
                recommender.model.preferences_from_user(user_id))
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # The `at` highest-valued preferences count as relevant items.
            if not recommender.model.has_preference_values():
                preferences = [(item_id, 1.0) for item_id in preferences]

            preferences = sorted(preferences, key=lambda x: x[1],
                                 reverse=True)
            relevant_item_ids = [item_id
                                 for item_id, _value in preferences[:at]]

            if not relevant_item_ids:
                continue

            # Build the training set: everything from every user, except
            # the relevant items of the user under evaluation.
            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [
                        (item_id, 1.0)
                        for item_id in preferences_other_user]

                if other_user_id == user_id:
                    preferences_other_user = [
                        pref for pref in preferences_other_user
                        if pref[0] not in relevant_item_ids]
                    # The evaluated user is left out entirely when nothing
                    # remains after removing the relevant items.
                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = \
                        dict(preferences_other_user)

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set,
                                                           recommender)

            try:
                remaining = list(
                    recommender_training.model.preferences_from_user(
                        user_id))
            except Exception:
                # Excluded all prefs for the user. move on.
                continue
            if not remaining:
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Record the IR results of this split.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            permutation_scores_ir.append(
                {metric: eval_function(real_arrays, relevant_arrays)})
        elif metric is None:
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)
            permutation_scores_ir.append({'precision': p, 'recall': r,
                                          'f1score': f})

    # Compute the final scores for the error and IR statistics.
    final_score_error = _summarize(permutation_scores_error)
    final_score_ir = _summarize(permutation_scores_ir)

    permutation_scores = {}
    scores = {}
    if permutation_scores_error:
        permutation_scores['error'] = permutation_scores_error
        scores['final_error'] = final_score_error
    if permutation_scores_ir:
        permutation_scores['ir'] = permutation_scores_ir
        # The IR summary is merged into the same 'final_error' entry that
        # holds the error summary (created here if there was none).
        scores.setdefault('final_error', {})
        scores['final_error'].setdefault('avg', {})
        scores['final_error'].setdefault('stdev', {})
        scores['final_error']['avg'].update(final_score_ir['avg'])
        scores['final_error']['stdev'].update(final_score_ir['stdev'])

    return permutation_scores, scores