Example #1
def train_and_score(learner, train_examples, test_examples, metric):
    """
    A utility method to train a given learner instance on the given
    training examples, generate predictions on the training set itself
    and also the given test set, and score those predictions using the
    given metric. The method returns the train and test scores.

    Note that this method needs to be a top-level function since it is
    called from within ``joblib.Parallel()`` and, therefore, needs to be
    picklable, which it would not be as an instance method of the
    ``Learner`` class.

    Parameters
    ----------
    learner : skll.Learner
        A SKLL ``Learner`` instance.
    train_examples : array-like, with shape (n_samples, n_features)
        The training examples.
    test_examples : array-like, of length n_samples
        The test examples.
    metric : str
        The scoring function passed to ``use_score_func()``.

    Returns
    -------
    train_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``train_examples``.
    test_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``test_examples``.
    """

    _ = learner.train(train_examples, grid_search=False, shuffle=False)

    # get the train and test class indices (not labels)
    train_predictions = learner.predict(train_examples, class_labels=False)
    test_predictions = learner.predict(test_examples, class_labels=False)

    # now get the training and test labels and convert them to indices
    # but make sure to include any unseen labels in the test data
    if learner.model_type._estimator_type == 'classifier':
        test_label_list = np.unique(test_examples.labels).tolist()
        train_and_test_label_dict = add_unseen_labels(learner.label_dict,
                                                      test_label_list)
        train_labels = np.array([
            train_and_test_label_dict[label] for label in train_examples.labels
        ])
        test_labels = np.array([
            train_and_test_label_dict[label] for label in test_examples.labels
        ])
    else:
        train_labels = train_examples.labels
        test_labels = test_examples.labels

    # now compute and return the scores
    train_score = use_score_func(metric, train_labels, train_predictions)
    test_score = use_score_func(metric, test_labels, test_predictions)
    return train_score, test_score
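A minimal usage sketch, assuming SKLL-style FeatureSets loaded via Reader.for_path (as in Example #6); the file paths, learner name, and metric name are illustrative:

from skll.data import Reader
from skll.learner import Learner

# load the training and test sets (paths are hypothetical)
train_fs = Reader.for_path("train.jsonlines").read()
test_fs = Reader.for_path("test.jsonlines").read()

# train a classifier and score it on both sets with accuracy
learner = Learner("LogisticRegression")
train_score, test_score = train_and_score(learner, train_fs, test_fs, "accuracy")
print(f"train: {train_score:.3f}, test: {test_score:.3f}")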
Example #2
def train_and_score(learner,
                    train_examples,
                    test_examples,
                    metric):
    """
    A utility method to train a given learner instance on the given training examples,
    generate predictions on the training set itself and also the given
    test set, and score those predictions using the given metric.
    The method returns the train and test scores.

    Note that this method needs to be a top-level function since it is
    called from within ``joblib.Parallel()`` and, therefore, needs to be
    picklable, which it would not be as an instance method of the
    ``Learner`` class.

    Parameters
    ----------
    learner : skll.Learner
        A SKLL ``Learner`` instance.
    train_examples : array-like, with shape (n_samples, n_features)
        The training examples.
    test_examples : array-like, of length n_samples
        The test examples.
    metric : str
        The scoring function passed to ``use_score_func()``.

    Returns
    -------
    train_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``train_examples``.
    test_score : float
        Output of the score function applied to predictions of
        ``learner`` on ``test_examples``.
    """

    _ = learner.train(train_examples, grid_search=False, shuffle=False)
    # generate predictions on both the training and the test sets
    train_predictions = learner.predict(train_examples)
    test_predictions = learner.predict(test_examples)

    # for classifiers, convert the labels to class indices, making sure
    # to assign new indices to any labels unseen during training
    if learner.model_type._estimator_type == 'classifier':
        test_label_list = np.unique(test_examples.labels).tolist()
        # find labels that occur only in the test data and give them
        # indices that come after the training labels
        unseen_test_label_list = [label for label in test_label_list
                                  if label not in learner.label_list]
        unseen_label_dict = {label: i for i, label in enumerate(unseen_test_label_list,
                                                                start=len(learner.label_list))}
        # combine the two dictionaries
        train_and_test_label_dict = learner.label_dict.copy()
        train_and_test_label_dict.update(unseen_label_dict)
        train_labels = np.array([train_and_test_label_dict[label]
                                 for label in train_examples.labels])
        test_labels = np.array([train_and_test_label_dict[label]
                                for label in test_examples.labels])
    else:
        train_labels = train_examples.labels
        test_labels = test_examples.labels

    train_score = use_score_func(metric, train_labels, train_predictions)
    test_score = use_score_func(metric, test_labels, test_predictions)
    return train_score, test_score
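Example #1 delegates the unseen-label handling to an add_unseen_labels helper that this example inlines; a sketch of that helper, reconstructed from the inlined logic above (the real SKLL utility may differ in details):

def add_unseen_labels(train_label_dict, test_label_list):
    """Extend the training label-to-index mapping with any test-only labels."""
    # labels that appear in the test data but were not seen during training
    unseen_test_label_list = [label for label in test_label_list
                              if label not in train_label_dict]
    # give the unseen labels indices that come after the training labels
    unseen_label_dict = {label: i for i, label in
                         enumerate(unseen_test_label_list,
                                   start=len(train_label_dict))}
    # combine the two dictionaries
    train_and_test_label_dict = train_label_dict.copy()
    train_and_test_label_dict.update(unseen_label_dict)
    return train_and_test_label_dict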
Example #3
def test_register_custom_metric_values():
    """Test to check values of custom metrics"""

    # register two metrics in the same file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")
    register_custom_metric(custom_metrics_file, "ratio_of_ones")

    # check that the values that SKLL would compute match what we expect
    y_true = [1, 1, 1, 0, 2, 1, 2, 0, 1]
    y_pred = [0, 1, 1, 0, 1, 2, 0, 1, 2]
    skll_value = use_score_func("f075_macro", y_true, y_pred)
    sklearn_value = fbeta_score(y_true, y_pred, beta=0.75, average='macro')
    eq_(skll_value, sklearn_value)

    y_true = [1, 1, 1, 0]
    y_pred = [0, 1, 1, 0]
    skll_value = use_score_func("ratio_of_ones", y_true, y_pred)
    true_ones = len([true for true in y_true if true == 1])
    pred_ones = len([pred for pred in y_pred if pred == 1])
    expected_value = pred_ones / (true_ones + pred_ones)
    eq_(skll_value, expected_value)
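The custom metric functions themselves are not shown on this page; a sketch of what custom_metrics.py could contain, inferred from the expected values computed in the test (the actual file may differ):

from sklearn.metrics import fbeta_score


def f075_macro(y_true, y_pred):
    # macro-averaged F-beta with beta = 0.75, matching the sklearn call in the test
    return fbeta_score(y_true, y_pred, beta=0.75, average="macro")


def ratio_of_ones(y_true, y_pred):
    # ratio of predicted ones to the combined count of true and predicted ones
    true_ones = len([label for label in y_true if label == 1])
    pred_ones = len([label for label in y_pred if label == 1])
    return pred_ones / (true_ones + pred_ones)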
Example #5
def compute_eval_from_predictions(examples_file, predictions_file,
                                  metric_names):
    '''
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    :param examples_file: a SKLL examples file (in .jsonlines or other format)
    :param predictions_file: a SKLL predictions output TSV file with id
                             and prediction column names
    :param metric_names: a list of SKLL metric names
                         (e.g., [pearson, unweighted_kappa])

    :returns: a dictionary mapping metric names to values
    '''

    # read gold standard labels
    data = load_examples(examples_file)
    gold = dict(zip(data.ids, data.classes))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        next(reader)  # skip header
        for row in reader:
            pred[row[0]] = float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
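A hedged usage sketch, assuming an experiment has already produced a tab-separated predictions file with id and prediction columns; the paths are illustrative and the metric names are the ones given in the docstring:

scores = compute_eval_from_predictions("dev.jsonlines",
                                       "output/dev.predictions",
                                       ["pearson", "unweighted_kappa"])
for metric_name, value in scores.items():
    print(f"{metric_name}: {value:.4f}")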
Example #6
def compute_eval_from_predictions(examples_file,
                                  predictions_file,
                                  metric_names,
                                  prediction_method=None):
    """
    Compute evaluation metrics from prediction files after you have run an
    experiment.

    Parameters
    ----------
    examples_file: str
        Path to a SKLL examples file (in .jsonlines or other format).
    predictions_file: str
        Path to a SKLL predictions output TSV file with id and prediction column names.
    metric_names: list of str
        A list of SKLL metric names (e.g., [pearson, unweighted_kappa]).
    prediction_method: str or None
        Indicates how to get a single class prediction from the probabilities. Currently
        supported options are "highest", which selects the class with the highest
        probability, and "expected_value", which calculates an expected value over
        integer classes and rounds to the nearest int. If the predictions file does not
        contain probabilities, this should be set to None.

    Returns
    -------
    dict
        Maps metric names to the corresponding values.

    Raises
    ------
    ValueError
        If the requested prediction method is 'expected_value' but the class names can't
        be converted to ints.
    """

    # read gold standard labels
    data = Reader.for_path(examples_file).read()
    gold = dict(zip(data.ids, data.labels))

    # read predictions
    pred = {}
    with open(predictions_file) as pred_file:
        reader = csv.reader(pred_file, dialect=csv.excel_tab)
        header = next(reader)

        # If there are more than two columns, assume column 0 contains the ids, and
        # columns 1-n contain class probabilities. Convert them to a class prediction
        # using the specified `method`.
        if len(header) > 2:
            classes = [c for c in header[1:] if c]
            if prediction_method is None:
                prediction_method = "highest"
                logger.info("No prediction method specified. Using 'highest'.")
            if prediction_method == 'expected_value':
                # the expected-value method needs integer class names;
                # re-raise the error if they cannot be converted
                try:
                    classes = [int(c) for c in classes]
                except ValueError as e:
                    raise e
            for row in reader:
                probabilities = [safe_float(p) for p in row[1:]]
                prediction = get_prediction_from_probabilities(
                    classes, probabilities, prediction_method)
                pred[row[0]] = safe_float(prediction)
        else:
            if prediction_method is not None:
                logger.warning(
                    "A prediction method was provided, but the predictions "
                    "file doesn't contain probabilities. Ignoring prediction "
                    "method '{}'.".format(prediction_method))

            for row in reader:
                pred[row[0]] = safe_float(row[1])

    # make a sorted list of example ids in order to match up
    # labels and predictions
    if set(gold.keys()) != set(pred.keys()):
        raise ValueError('The example and prediction IDs do not match.')
    example_ids = sorted(gold.keys())

    res = {}
    for metric_name in metric_names:
        score = use_score_func(metric_name,
                               [gold[ex_id] for ex_id in example_ids],
                               [pred[ex_id] for ex_id in example_ids])
        res[metric_name] = score
    return res
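The get_prediction_from_probabilities helper is not shown here; a sketch of the two prediction methods the docstring describes, "highest" (argmax over the class probabilities) and "expected_value" (probability-weighted average over integer classes, rounded to the nearest int); the actual SKLL implementation may differ:

import numpy as np


def get_prediction_from_probabilities(classes, probabilities, prediction_method):
    """Collapse a probability distribution over classes into a single prediction."""
    if prediction_method == "highest":
        # pick the class with the largest probability
        return classes[int(np.argmax(probabilities))]
    elif prediction_method == "expected_value":
        # expected value over integer classes, rounded to the nearest int
        return int(round(np.dot(classes, probabilities)))
    else:
        raise ValueError(f"Unknown prediction method: {prediction_method}")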
Example #7
def compute_evaluation_metrics(metrics,
                               labels,
                               predictions,
                               model_type,
                               label_dict=None,
                               grid_objective=None,
                               probability=False,
                               logger=None):
    """
    Compute the given metrics to evaluate the given predictions, generated
    by the given type of estimator, against the given true labels.

    Parameters
    ----------
    metrics : list of str
        List of metrics to compute.
    labels : array-like
        True labels to be used for computing the metrics.
    predictions : array-like
        The predictions to be used for computing the metrics.
    model_type : str
        One of "classifier" or "regressor".
    label_dict : dict, optional
        Dictionary mapping class labels to indices for classification.
        Defaults to ``None``.
    grid_objective : str, optional
        The objective used for tuning the hyper-parameters of the model
        that generated the predictions. If ``None``, it means that no
        grid search was done.
        Defaults to ``None``.
    probability : bool, optional
        Does the model output class probabilities?
        Defaults to ``False``.
    logger : logging.Logger, optional
        A logger instance to use for logging messages and warnings.
        If ``None``, a new one is created.
        Defaults to ``None``.

    Returns
    -------
    res : 5-tuple
        The confusion matrix, the overall accuracy, the per-label
        PRFs, the grid search objective function score, and the
        additional evaluation metrics, if any. For regressors, the
        first two elements are ``None``.
    """
    # set up the logger
    logger = logger if logger else logging.getLogger(__name__)

    # warn if grid objective was also specified in metrics
    if len(metrics) > 0 and grid_objective in metrics:
        logger.warning(f"The grid objective '{grid_objective}' is also "
                       f"specified as an evaluation metric. Since its "
                       f"value is already included in the results as the "
                       f"objective score, it will not be printed "
                       f"again in the list of metrics.")
        metrics = [metric for metric in metrics if metric != grid_objective]

    # initialize a dictionary that will hold all of the metric scores
    metric_scores = {metric: None for metric in metrics}

    # if we are a classifier in probability mode, then `predictions`
    # contains class probabilities, so we need to compute the class
    # indices separately and also save the probabilities
    if probability and model_type == 'classifier':
        class_probs = predictions
        predictions = np.argmax(class_probs, axis=1)
    # if we are a regressor or classifier not in probability
    # mode, then we have the class indices already and there
    # are no probabilities
    else:
        class_probs = None

    # make a single list of metrics including the grid objective
    # since it's easier to compute everything together
    metrics_to_compute = [grid_objective] + metrics
    for metric in metrics_to_compute:

        # skip the None if we are not doing grid search
        if not metric:
            continue

        # CASE 1: in probability mode for classification which means we
        # need to either use the probabilities directly or infer the labels
        # from them depending on the metric
        if probability:

            # there are three possible cases here:
            # (a) if we are using a correlation metric or
            #     `average_precision` or `roc_auc` in a binary
            #     classification scenario, then we need to explicitly
            #     pass in the probabilities of the positive class.
            # (b) if we are using `neg_log_loss`, then we
            #     just pass in the full probability array
            # (c) we compute the most likely labels from the
            #     probabilities via argmax and use those
            #     for all other metrics
            if (len(label_dict) == 2
                    and (metric in CORRELATION_METRICS
                         or metric in ['average_precision', 'roc_auc'])
                    and metric != grid_objective):
                logger.info(f"using probabilities for the positive class to "
                            f"compute '{metric}' for evaluation.")
                preds_for_metric = class_probs[:, 1]
            elif metric == 'neg_log_loss':
                preds_for_metric = class_probs
            else:
                preds_for_metric = predictions

        # CASE 2: no probability mode for classifier or regressor
        # in which case we just use the predictions as they are
        else:
            preds_for_metric = predictions

        try:
            metric_scores[metric] = use_score_func(metric, labels,
                                                   preds_for_metric)
        except ValueError:
            metric_scores[metric] = float('NaN')

    # now separate out the grid objective score from the additional metric scores
    # if a grid objective was actually passed in; if no objective was passed in,
    # the objective score should just be None.
    objective_score = None
    additional_scores = metric_scores.copy()
    if grid_objective:
        objective_score = metric_scores[grid_objective]
        del additional_scores[grid_objective]

    # compute some basic statistics for regressors
    if model_type == 'regressor':
        result_dict = {'descriptive': defaultdict(dict)}
        for table_label, y in zip(['actual', 'predicted'],
                                  [labels, predictions]):
            result_dict['descriptive'][table_label]['min'] = min(y)
            result_dict['descriptive'][table_label]['max'] = max(y)
            result_dict['descriptive'][table_label]['avg'] = np.mean(y)
            result_dict['descriptive'][table_label]['std'] = np.std(y)
        result_dict['pearson'] = use_score_func('pearson', labels, predictions)
        res = (None, None, result_dict, objective_score, additional_scores)
    else:
        # compute the confusion matrix and precision/recall/f1
        # note that we are using the class indices here
        # and not the actual class labels themselves
        num_labels = len(label_dict)
        conf_mat = confusion_matrix(labels,
                                    predictions,
                                    labels=list(range(num_labels)))
        # Calculate metrics
        overall_accuracy = accuracy_score(labels, predictions)
        result_matrix = precision_recall_fscore_support(labels,
                                                        predictions,
                                                        labels=list(
                                                            range(num_labels)),
                                                        average=None)

        # Store results
        result_dict = defaultdict(dict)
        for actual_label in sorted(label_dict):
            col = label_dict[actual_label]
            result_dict[actual_label]["Precision"] = result_matrix[0][col]
            result_dict[actual_label]["Recall"] = result_matrix[1][col]
            result_dict[actual_label]["F-measure"] = result_matrix[2][col]

        res = (conf_mat.tolist(), overall_accuracy, result_dict,
               objective_score, additional_scores)

    return res
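A minimal sketch of calling this function for a classifier and unpacking the 5-tuple it returns; the label indices, label dictionary, metric, and objective names are illustrative:

# class indices for the true labels and the predictions
labels = [0, 1, 1, 0, 1]
predictions = [0, 1, 0, 0, 1]
label_dict = {"negative": 0, "positive": 1}

(conf_matrix, accuracy, per_label_prf,
 objective_score, additional_scores) = compute_evaluation_metrics(
    ["f1_score_macro"], labels, predictions, "classifier",
    label_dict=label_dict, grid_objective="accuracy")
print(accuracy, objective_score, additional_scores)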
Example #8
def check_f05_metrics(metric_name, average_method):
    """Check that the SKLL F0.5 metric value matches scikit-learn's ``fbeta_score``."""
    y_true = [1, 1, 1, 0, 0, 0]
    y_pred = [0, 1, 1, 1, 0, 0]
    skll_value = use_score_func(metric_name, y_true, y_pred)
    sklearn_value = fbeta_score(y_true, y_pred, beta=0.5, average=average_method)
    eq_(skll_value, sklearn_value)
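This check helper is presumably driven by a nose-style generator test; a sketch with hypothetical metric names (the actual SKLL names may differ):

def test_f05_metrics():
    # the metric names below are assumptions, not confirmed SKLL metric names
    for metric_name, average_method in [("f05", "binary"),
                                        ("f05_micro", "micro"),
                                        ("f05_macro", "macro")]:
        yield check_f05_metrics, metric_name, average_method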
Example #9
    def evaluate(self, examples, prediction_prefix=None, append=False,
                 grid_objective=None):
        '''
        Evaluates a given model on a given dev or test example set.

        :param examples: The examples to evaluate the performance of the model
                         on.
        :type examples: ExamplesTuple
        :param prediction_prefix: If saving the predictions, this is the
                                  prefix that will be used for the filename.
                                  It will be followed by ".predictions"
        :type prediction_prefix: str
        :param append: Should we append the current predictions to the file if
                       it exists?
        :type append: bool
        :param grid_objective: The objective function that was used when doing
                               the grid search.
        :type grid_objective: function

        :return: The confusion matrix, the overall accuracy, the per-class
                 PRFs, the model parameters, and the grid search objective
                 function score.
        :rtype: 5-tuple
        '''
        # initialize grid score
        grid_score = None

        # make the prediction on the test data
        yhat = self.predict(examples, prediction_prefix=prediction_prefix,
                            append=append)

        # extract actual labels (transformed for classification tasks)
        if self._model_type not in _REGRESSION_MODELS:
            ytest = np.array([self.label_dict[label] for label in
                              examples.classes])
        else:
            ytest = examples.classes

        # if run in probability mode, convert yhat to list of classes predicted
        if self.probability:
            # if we're using a correlation grid objective, calculate it here
            if grid_objective and grid_objective in _CORRELATION_METRICS:
                try:
                    grid_score = use_score_func(grid_objective, ytest,
                                                yhat[:, 1])
                except ValueError:
                    grid_score = float('NaN')

            yhat = np.array([max(range(len(row)),
                                 key=lambda i: row[i])
                             for row in yhat])

        # calculate grid search objective function score, if specified
        if (grid_objective and (grid_objective not in _CORRELATION_METRICS or
                                not self.probability)):
            try:
                grid_score = use_score_func(grid_objective, ytest, yhat)
            except ValueError:
                grid_score = float('NaN')

        if self._model_type in _REGRESSION_MODELS:
            result_dict = {'descriptive': defaultdict(dict)}
            for table_label, y in zip(['actual', 'predicted'], [ytest, yhat]):
                result_dict['descriptive'][table_label]['min'] = min(y)
                result_dict['descriptive'][table_label]['max'] = max(y)
                result_dict['descriptive'][table_label]['avg'] = np.mean(y)
                result_dict['descriptive'][table_label]['std'] = np.std(y)
            result_dict['pearson'] = SCORERS['pearson']._score_func(ytest, yhat)
            res = (None, None, result_dict, self._model.get_params(),
                   grid_score)
        else:
            # compute the confusion matrix
            num_labels = len(self.label_list)
            conf_mat = confusion_matrix(ytest, yhat,
                                        labels=list(range(num_labels)))
            # Calculate metrics
            overall_accuracy = accuracy_score(ytest, yhat)
            result_matrix = precision_recall_fscore_support(ytest,
                                                            yhat,
                                                            labels=list(range(num_labels)),
                                                            average=None)

            # Store results
            result_dict = defaultdict(dict)
            for actual_class in sorted(self.label_list):
                c_num = self.label_dict[actual_class]
                result_dict[actual_class]["Precision"] = result_matrix[0][c_num]
                result_dict[actual_class]["Recall"] = result_matrix[1][c_num]
                result_dict[actual_class]["F-measure"] = result_matrix[2][c_num]

            res = (conf_mat.tolist(), overall_accuracy, result_dict,
                   self._model.get_params(), grid_score)
        return res
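A hedged sketch of calling this older evaluate() API on a held-out set, assuming the examples are loaded with load_examples as in Example #5 and that the learner has already been trained; the paths are illustrative, and since no grid objective is passed, the returned grid score stays None:

# evaluate a trained learner on held-out examples (path is hypothetical)
test_examples = load_examples("test.jsonlines")
(conf_mat, accuracy, per_class_prf,
 model_params, grid_score) = learner.evaluate(test_examples,
                                              prediction_prefix="output/test")
print(accuracy, grid_score)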