import statistics

import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def for_each_fold(fold, folds, data, labels, model, error_function):
    # partition_data is this project's own fold-splitting helper.
    (x_train, y_train), (x_test, y_test) = partition_data(data, labels, fold, folds)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Compute the requested metric; precision is the default when no
    # error_function is given.
    if error_function is None or error_function == 'precision':
        error = precision_score(y_test, y_pred)
    elif error_function == 'accuracy':
        error = accuracy_score(y_test, y_pred)
    elif error_function == 'recall':
        error = recall_score(y_test, y_pred)
    elif error_function == 'f1':
        error = f1_score(y_test, y_pred)
    else:
        raise ValueError('%s error function is not defined.' % error_function)

    return {'expected labels': y_test,
            'predicted labels': y_pred,
            'errors': [error]}
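
A minimal driver for the helper above, assuming `partition_data`, `data`, and `labels` come from the surrounding project and the labels are binary:

# Hypothetical usage: evaluate one model across all folds and average F1.
from sklearn.linear_model import LogisticRegression

folds = 5
results = [for_each_fold(fold, folds, data, labels,
                         LogisticRegression(solver='liblinear'),
                         error_function='f1')
           for fold in range(folds)]
mean_f1 = sum(r['errors'][0] for r in results) / folds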


def cross_validation_whole():
    # `data` is assumed to be a module-level array loaded elsewhere;
    # column 48 holds the label, columns 0-47 the features.
    indexes = [[0, 36], [37, 73], [74, 110], [111, 146],
               [147, 182], [183, 216], [217, 253], [254, 290], [291, 328],
               [329, 365]]  # index ranges of the predefined groups

    res_accuracy = []
    res_precision = []

    for i in range(10):
        test_rows = np.arange(indexes[i][0], indexes[i][1] + 1)
        X_test = data[test_rows, :48]
        X_test = X_test[:, [0, 1, 38, 39, 40, 41, 42, 43]]  # F1+F12
        X_test = X_test.astype(np.float64)
        X_train = np.delete(data, test_rows, axis=0)[:, :48]
        X_train = X_train[:, [0, 1, 38, 39, 40, 41, 42, 43]]  # F1+F12
        X_train = X_train.astype(np.float64)
        y_test = data[test_rows, 48]
        y_train = np.delete(data, test_rows, axis=0)[:, 48]
        #clf = svm.SVC(kernel='rbf', probability=False, C=1).fit(X_train, y_train)
        #clf = LogisticRegression(solver='liblinear', C=10).fit(X_train, y_train)
        clf = RandomForestClassifier(criterion='gini',
                                     n_estimators=10,
                                     min_samples_leaf=1).fit(X_train, y_train)
        #clf = DecisionTreeClassifier(criterion='entropy', max_features='auto', min_samples_leaf=2).fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        res_accuracy.append(
            metrics.accuracy_score(y_test, y_pred, normalize=True))
        res_precision.append(
            metrics.precision_score(y_test, y_pred, pos_label="1"))

    res_accuracy = np.array(res_accuracy)
    res_precision = np.array(res_precision)
    print("max acc: " + str(res_accuracy.max()) + " min acc: " +
          str(res_accuracy.min()) + " median acc: " +
          str(statistics.median(res_accuracy)))
    print("max prec: " + str(res_precision.max()) + " min prec: " +
          str(res_precision.min()) + " median prec: " +
          str(statistics.median(res_precision)))
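
The manual group bookkeeping above can also be expressed with scikit-learn's cross-validation utilities; a sketch with `cross_validate`, assuming `X` holds the F1+F12 feature columns and `y` the binary 0/1 labels already extracted from `data`:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Sketch only: X and y are assumed to be the feature matrix and binary labels.
cv_results = cross_validate(
    RandomForestClassifier(criterion='gini', n_estimators=10, min_samples_leaf=1),
    X, y, cv=10, scoring=('accuracy', 'precision'))
print(np.median(cv_results['test_accuracy']),
      np.median(cv_results['test_precision']))

Note that a plain KFold split would not reproduce the predefined, unevenly sized groups above; PredefinedSplit can, if the exact grouping matters.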
Example #3
    def evaluate(self, darray, thr):

        batch_index = 0
        X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                                   batch_index)
        y_pred = None
        y_label = None
        while len(X_batch) > 0:
            num_batch = len(y_batch)
            feed_dict = {
                self.vocab_index: X_batch,
                self.props: P_batch,
                self.label: y_batch,
                self.first_level_lstm_dropout_p:
                    [1.0] * len(self.first_level_lstm_dropout),
                self.deep_dropout_p: [1.0] * len(self.deep_dropout),
                self.conv_pool_dropout_p: [1.0] * len(self.conv_pool_dropout),
                self.second_level_lstm_dropout_p:
                    [1.0] * len(self.second_level_lstm_dropout),
                self.train_phase: False,
            }
            batch_out = self.sess.run(self.out, feed_dict=feed_dict)

            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch, ))
                y_label = np.reshape(y_batch, (num_batch, ))
            else:
                y_pred = np.concatenate(
                    (y_pred, np.reshape(batch_out, (num_batch, ))))
                y_label = np.concatenate(
                    (y_label, np.reshape(y_batch, (num_batch, ))))

            batch_index += 1
            X_batch, P_batch, y_batch = self.get_batch(darray, self.batch_size,
                                                       batch_index)

        # Binarize the predicted scores at the given threshold.
        pred = (y_pred > thr).astype(int)
        accuracy = metrics.accuracy_score(y_label, pred)
        precision = metrics.precision_score(y_label, pred)
        recall = metrics.recall_score(y_label, pred)
        f1 = metrics.f1_score(y_label, pred)

        return accuracy, precision, recall, f1
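
A typical call, assuming the enclosing class has been trained and `val_array` matches whatever format `get_batch` expects (both names are placeholders):

# Hypothetical usage of the evaluate() method above.
accuracy, precision, recall, f1 = model.evaluate(val_array, thr=0.5)
print("acc=%.3f prec=%.3f rec=%.3f f1=%.3f" % (accuracy, precision, recall, f1))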
Example #4
    def step(self):
        """ Epochs step, training and validation.
            Return:
                training_loss, validation_loss, accuracy, precision, recall
        """
        # Training loop
        batch_loss, batch_val_loss, batch_accuracy, batch_precision, batch_recall = [], [], [], [], []

        for x_batch, y_batch in self.train_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            loss = self.train_step(x_batch, y_batch)
            batch_loss.append(loss)

        with torch.no_grad():
            # Validation loop
            for i, (x_val, y_val) in enumerate(self.val_loader):
                x_val = x_val.to(self.device)
                y_val = y_val.to(self.device)

                self.model.eval()
                yhat = self.model(x_val)
                val_loss = self.criterion(yhat, y_val)
                batch_val_loss.append(val_loss.item())  # .item() keeps plain floats in the list

                # sklearn metrics expect (y_true, y_pred), so the one-hot
                # targets go first; for multiclass targets, precision_score and
                # recall_score would also need an `average=` argument.
                y_true = np.argmax(y_val.cpu().detach().numpy(), axis=1)
                y_hat = np.argmax(yhat.cpu().detach().numpy(), axis=1)
                batch_accuracy.append(accuracy_score(y_true, y_hat))
                batch_precision.append(precision_score(y_true, y_hat))
                batch_recall.append(recall_score(y_true, y_hat))

        # step lr scheduler using val_loss
        if self.scheduler is not None:
            self.scheduler.step(val_loss)

        return [torch.mean(torch.Tensor(batch_loss)),
                torch.mean(torch.Tensor(batch_val_loss)),
                torch.mean(torch.Tensor(batch_accuracy)),
                torch.mean(torch.Tensor(batch_precision)),
                torch.mean(torch.Tensor(batch_recall))]
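
A plausible epoch loop around `step()`, assuming the enclosing trainer object holds the model, loaders, and optimizer (`trainer` and `num_epochs` are placeholders):

# Hypothetical driver: step() returns epoch-mean loss and validation metrics.
for epoch in range(num_epochs):
    train_loss, val_loss, acc, prec, rec = trainer.step()
    print(f"epoch {epoch}: train={train_loss:.4f} val={val_loss:.4f} "
          f"acc={acc:.3f} prec={prec:.3f} rec={rec:.3f}")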
Example #5
def generate_classification_perf(truths, pred_probs, multiclass=False):
    """Given truths, and predicted probabilities, generate ModelPerf object"""
    pred_classes = np.round(pred_probs).astype(int)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        retval = ClassificationModelPerf(
            auroc=metrics.roc_auc_score(truths, pred_probs),
            auroc_curve=metrics.roc_curve(truths, pred_probs)
            if not multiclass else None,
            auprc=metrics.average_precision_score(truths, pred_probs),
            accuracy=metrics.accuracy_score(truths, pred_classes)
            if not multiclass else None,
            recall=metrics.recall_score(truths, pred_classes)
            if not multiclass else None,
            precision=metrics.precision_score(truths, pred_classes)
            if not multiclass else None,
            f1=metrics.f1_score(truths, pred_classes)
            if not multiclass else None,
            ce_loss=metrics.log_loss(truths, pred_probs, normalize=False) /
            np.prod(truths.shape),
        )
    return retval
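
A quick sketch of calling the helper on toy binary data; `ClassificationModelPerf` is assumed to be a namedtuple or dataclass defined alongside it:

import numpy as np

# Toy binary example: five labels and five predicted probabilities.
truths = np.array([0, 1, 1, 0, 1])
pred_probs = np.array([0.2, 0.8, 0.6, 0.4, 0.9])
perf = generate_classification_perf(truths, pred_probs)
print(perf.auroc, perf.f1)  # attribute access assumes a namedtuple-like container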
Example #6
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        """
        Evaluate on the folds of a dataset split

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated;
            otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the percentage of users to
            evaluate. If sampling_users is None, all users are used in
            the evaluation. Specific sampling objects can be passed; see
            the scikits.crab.metrics.sampling module for the list of
            possible objects.

        cv: integer or crossvalidation, optional, default = None
            If an integer is passed, it is the number of folds (default 3).
            Specific cross-validation objects can be passed; see the
            scikits.crab.metrics.cross_validation module for the list of
            possible objects.

        at: integer, optional, default = None
            The 'at' value, as in 'precision at 5': precision or recall is
            evaluated by removing a user's top `at` preferences and then
            finding the percentage of those items that appear in the top
            `at` recommendations for that user. If at is None, the top 3
            elements are considered.

        Returns
        -------
        score: dict
            a dictionary containing the average results over
            the different permutations on the split.

        permutation_scores : array, shape = [n_permutations]
            The scores obtained for each permutation.

        """
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords \
              are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        #Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            #Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                    recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.items():
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except ItemNotFoundError:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception is thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            #Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric:
                        eval_function(
                            real_preferences, estimated_preferences,
                            recommender.model.maximum_preference_value(),
                            recommender.model.minimum_preference_value())
                    })
                else:
                    permutation_scores_error.append({
                        metric:
                        eval_function(real_preferences, estimated_preferences)
                    })
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
                permutation_scores_error.append({
                    'mae': mae,
                    'nmae': nmae,
                    'rmse': rmse
                })

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # Collect the user's most-preferred items; these count as the relevant set
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0)
                                   for preference in preferences]

                preferences = sorted(preferences,
                                     key=lambda x: x[1],
                                     reverse=True)
                relevant_item_ids = [
                    item_id for item_id, preference in preferences[:at]
                ]

                if len(relevant_item_ids) == 0:
                    continue

                #Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = \
                        recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [
                            (preference, 1.0)
                            for preference in preferences_other_user
                        ]
                    if other_user_id == user_id:
                        preferences_other_user = \
                            [pref for pref in preferences_other_user \
                                if pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = \
                                dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(
                            preferences_other_user)

                #Evaluate the recommender
                recommender_training = self._build_recommender(training_set, \
                            recommender)

                try:
                    preferences = \
                        recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    # All prefs for the user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append(
                    {metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({
                    'precision': p,
                    'recall': r,
                    'f1score': f
                })

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(
                final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(
                final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
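
A hedged usage sketch, assuming this method lives on crab's CfEvaluator and that `recommender` was built elsewhere from a preference data model:

# Sketch: 3-fold evaluation of RMSE over a 70% user sample.
evaluator = CfEvaluator()
permutation_scores, scores = evaluator.evaluate_on_split(
    recommender, metric='rmse', cv=3, sampling_users=0.7)
print(scores['final_error']['avg'])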
Example #7
    def evaluate(self, recommender, metric=None, **kwargs):
        """
        Evaluates the predictor

        Parameters
        ----------
        recommender: The BaseRecommender instance
                The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated;
            otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the percentage of users to
            evaluate. If sampling_users is None, all users are used in
            the evaluation. Specific sampling objects can be passed; see
            the scikits.crab.metrics.sampling module for the list of
            possible objects.

        sampling_ratings: float or sampling, optional, default = None
            If a float is passed, it is the percentage of ratings to
            evaluate. If sampling_ratings is None, 70% will be used in the
            training set and 30% in the test set. Specific sampling
            objects can be passed; see the scikits.crab.metrics.sampling
            module for the list of possible objects.

        at: integer, optional, default = None
            The 'at' value, as in 'precision at 5': precision or recall is
            evaluated by removing a user's top `at` preferences and then
            finding the percentage of those items that appear in the top
            `at` recommendations for that user. If at is None, the top 3
            elements are considered.

        Returns
        -------
        Returns a dictionary containing the evaluation results:
        (NMAE, MAE, RMSE, Precision, Recall, F1-Score)

        """
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords \
              are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                             len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict(
                    (preferences[idx]
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx] for idx in test_set
                                        ] if preferences else []
            else:
                training_set[user_id] = dict(
                    ((preferences[idx], 1.0)
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [
                    (preferences[idx], 1.0) for idx in test_set
                ] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.items():
            for item_id, preference in preferences:
                #Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception is
                    # thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {
                    metric:
                    eval_function(real_preferences, estimated_preferences,
                                  recommender.model.maximum_preference_value(),
                                  recommender.model.minimum_preference_value())
                }
            return {
                metric: eval_function(real_preferences, estimated_preferences)
            }

        #IR_Statistics
        relevant_arrays = []
        real_arrays = []

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # Collect the user's most-preferred items; these count as the relevant set
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [
                item_id for item_id, preference in preferences[:at]
            ]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [
                        (preference, 1.0)
                        for preference in preferences_other_user
                    ]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                            if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            #Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                        recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for the user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            #Return all
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {
                'mae': mae,
                'nmae': nmae,
                'rmse': rmse,
                'precision': p,
                'recall': r,
                'f1score': f
            }
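
And the single-split counterpart, under the same assumptions; with `metric=None` it returns all six metrics at once:

# Sketch: one 70/30 split, precision/recall evaluated at the top 5.
results = evaluator.evaluate(recommender, metric=None, at=5)
print(results)  # {'mae': ..., 'nmae': ..., 'rmse': ..., 'precision': ..., 'recall': ..., 'f1score': ...}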
Example #8
                # (Snippet truncated: the enclosing grid-search loops that bind
                # crit, ne, mf, msl and build the DataFrame `df` are not shown.)
                y = data[:, 48]  # last column in file
                X_train, X_test, y_train, y_test = train_test_split(
                    df, y, test_size=0.2, random_state=0)
                clf = RandomForestClassifier(n_estimators=ne,
                                             criterion=crit,
                                             max_features=mf,
                                             min_samples_leaf=msl)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                results.append(
                    "criterion " + crit + ", n_estimators " + str(ne) +
                    ", max_features " + mf + ", min_samples_leaf " + str(msl) +
                    ", accuracy " +
                    str(metrics.accuracy_score(
                        y_test, y_pred, normalize=True)) + ", precision " +
                    str(metrics.precision_score(y_test, y_pred, pos_label="1"))
                )

for res in results:
    print(res)

#Decision Tree
"""
criterion = ['gini', 'entropy']
max_features = ['auto', 'sqrt', 'log2']
min_samples_leaf = [ 1, 2, 3, 4, 5]

for crit in criterion:
    for mf in max_features:
        for msl in min_samples_leaf:
            df = pd.DataFrame(data[:, :48],
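
The nested hyperparameter loops sketched above map directly onto scikit-learn's GridSearchCV; a minimal equivalent, assuming `df` holds the 48 feature columns and `y` the binary labels (the exact n_estimators list is truncated away above, so the values below are assumptions):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 50, 100],  # assumed; the original list is not shown
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}
search = GridSearchCV(RandomForestClassifier(), param_grid,
                      scoring='precision', cv=5)
search.fit(df, y)
print(search.best_params_, search.best_score_)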
Example #11
    def precision_score(self, x_test, y_test):
        _y_predict = self.predict(x_test)
        return metrics.precision_score(y_test, _y_predict)
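
Hedged usage, assuming the enclosing class wraps a fitted estimator exposing `predict()`:

# Hypothetical: `clf` is an instance of the wrapper class above, already fitted.
print(clf.precision_score(x_test, y_test))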
Example #12
                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays,
                    relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
    def evaluate(self, recommender, metric=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords \
                      are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                           len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                                        for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                                        for idx in test_set] if preferences else []

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                                       recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.items():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception is
                    # thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        # Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}

        # IR_Statistics
        relevant_arrays = []
        real_arrays = []

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # Collect the user's most-preferred items; these count as the relevant set
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [item_id for item_id, preference
                                 in preferences[:at]]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [(preference, 1.0)
                                              for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = [
                        pref for pref in preferences_other_user
                        if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)
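
            # The evaluated user keeps only the preferences outside the
            # relevant set, so the recommender has to re-surface the held-out
            # relevant items for this user to score well.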

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set,
                                                           recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except Exception:
                # All prefs for the user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}
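
        # A sketch of the IR metrics at cutoff `at`, assuming the functions in
        # `evaluation_metrics` follow the conventional definitions:
        #
        #   precision@at = |recommended ∩ relevant| / |recommended|
        #   recall@at    = |recommended ∩ relevant| / |relevant|
        #   f1@at        = 2 * precision * recall / (precision + recall)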

        if metric is None:
            # No metric specified: return all error and IR statistics.
            mae, nmae, rmse = evaluation_error(real_preferences,
                                               estimated_preferences,
                                               recommender.model.maximum_preference_value(),
                                               recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                    'precision': p, 'recall': r, 'f1score': f}
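
    # A minimal sketch of the error metrics used above, assuming the
    # conventional definitions (`max_pref`/`min_pref` are the model's
    # maximum/minimum preference values):
    #
    #   def rmse(real, estimated):
    #       return np.sqrt(np.mean((np.asarray(real) - np.asarray(estimated)) ** 2))
    #
    #   def mae(real, estimated):
    #       return np.mean(np.abs(np.asarray(real) - np.asarray(estimated)))
    #
    #   def nmae(real, estimated, max_pref, min_pref):
    #       return mae(real, estimated) / (max_pref - min_pref)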

    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, list(evaluation_metrics.keys())))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)
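        # `users_set` indexes the sampled fraction of users (70% by default,
        # per `sampling_users` above) whose ratings feed the cross-validation
        # below.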

        total_ratings = []
        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        # Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))
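
            # Unlike `evaluate`, this split is over individual (user, item)
            # ratings rather than over whole users, so the same user can
            # contribute ratings to both the training and testing side of a
            # fold.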

            # Evaluate the recommender.
            recommender_training = self._build_recommender(training_set,
                                                           recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.items():
                for item_id, preference in preferences:
                    # Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except ItemNotFoundError:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            # Collect the error results for this fold.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                        {metric: eval_function(real_preferences,
                                               estimated_preferences)})
            elif metric is None:
                # No metric specified: collect all error metrics for this fold.
                mae, nmae, rmse = evaluation_error(real_preferences,
                                                   estimated_preferences,
                                                   recommender.model.maximum_preference_value(),
                                                   recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                 'rmse': rmse})
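
            # One score dict is collected per fold; the per-fold scores are
            # averaged (and their stdev taken) near the end of the method.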

        # IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # List the most-preferred items that will count as 'relevant'.
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0) for preference in preferences]

                preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
                relevant_item_ids = [item_id for item_id, preference
                                     in preferences[:at]]

                if len(relevant_item_ids) == 0:
                    continue

                # Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [(preference, 1.0) for preference in preferences_other_user]
                    if other_user_id == user_id:
                        preferences_other_user = [pref for pref in preferences_other_user if
                                                  pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(preferences_other_user)

                # Evaluate the recommender
                recommender_training = self._build_recommender(training_set, recommender)

                try:
                    preferences = recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except Exception:
                    # All prefs for the user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            # Collect the IR results for this fold.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        # Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        # Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
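            # Note: the IR averages/stdevs are merged into the same
            # 'final_error' entry rather than getting a separate key.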
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
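
    # A minimal usage sketch (hypothetical: the evaluator class name and the
    # exact `evaluate` signature are assumed, since they are defined outside
    # this excerpt):
    #
    #   evaluator = CfEvaluator()
    #   single_run = evaluator.evaluate(recommender, metric='rmse')
    #   per_fold, final = evaluator.evaluate_on_split(recommender,
    #                                                 metric='rmse', cv=5)
    #   print(final['final_error']['avg']['rmse'])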