Code Example #1
    def classify(self, dataset, output_frequency='per_row'):
        """
        Return a classification for each ``prediction_window`` examples in
        ``dataset``, using the trained activity classification model. The
        output SFrame contains predictions as both class labels and the
        probability that the predicted value is the associated label.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features and session id used for model training, but
            does not require a target column. Additional columns are ignored.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions which is one of:

            - 'per_row': Each prediction is returned ``prediction_window`` times.
            - 'per_window': Return a single prediction for each 
              ``prediction_window`` rows in ``dataset`` per ``session_id``.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions, i.e. class labels and probabilities.

        See Also
        ----------
        create, evaluate, predict

        Examples
        ----------
        >>> classes = model.classify(data)
        """
        _tkutl._check_categorical_option_type('output_frequency',
                                              output_frequency,
                                              ['per_window', 'per_row'])
        id_target_map = self._id_target_map
        preds = self.predict(dataset,
                             output_type='probability_vector',
                             output_frequency=output_frequency)

        if output_frequency == 'per_row':
            return _SFrame({
                'class': preds.apply(lambda p: id_target_map[_np.argmax(p)]),
                'probability': preds.apply(_np.max)
            })
        elif output_frequency == 'per_window':
            preds['class'] = preds['probability_vector'].apply(
                lambda p: id_target_map[_np.argmax(p)])
            preds['probability'] = preds['probability_vector'].apply(_np.max)
            preds = preds.remove_column('probability_vector')
            return preds
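A quick usage sketch for the ``classify`` method above. ``model`` (a trained activity classifier) and ``data`` (an SFrame with the training features and session id) are assumed, illustrative names:

    # One output row per input row: 'class' and 'probability' columns,
    # with each window's prediction repeated for every row in that window.
    per_row = model.classify(data, output_frequency='per_row')

    # One output row per prediction window and session; the result also keeps
    # the session id and a 'prediction_id' column.
    per_window = model.classify(data, output_frequency='per_window')

    print(per_row.column_names())   # expected: ['class', 'probability']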
Code Example #2
    def evaluate(self, dataset, metric='auto', batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the target, as well as the features used for model
            training. Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError('\'dataset\' parameter must be an SFrame')
        if(batch_size < 1):
            raise ValueError('\'batch_size\' must be greater than or equal to 1')

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tk_utils._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
            probs = self.predict(dataset, output_type='probability_vector', batch_size=batch_size)
        if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]):
            classes = self.predict(dataset, output_type='class', batch_size=batch_size)

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = evaluation.accuracy(dataset[self.target], classes)
        if 'auc' in metrics:
            ret['auc'] = evaluation.auc(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'precision' in metrics:
            ret['precision'] = evaluation.precision(dataset[self.target], classes)
        if 'recall' in metrics:
            ret['recall'] = evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = evaluation.f1_score(dataset[self.target], classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = evaluation.log_loss(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = evaluation.confusion_matrix(dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = evaluation.roc_curve(dataset[self.target], probs, index_map=self._class_label_to_id)

        return ret
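A short, hedged sketch of ``evaluate``. ``model`` and ``test_data`` (which must contain the target column) are assumed names:

    # All available metrics (metric='auto' is the default).
    results = model.evaluate(test_data)
    print(results['accuracy'])

    # A single metric with a smaller batch size to reduce memory pressure.
    cm = model.evaluate(test_data, metric='confusion_matrix', batch_size=32)
    print(cm['confusion_matrix'])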
Code Example #3
File: image_classifier.py Project: lbddk/turicreate
def create(
        dataset,
        target,
        feature=None,
        model='resnet-50',
        l2_penalty=0.01,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        class_weights=None,
        validation_set='auto',
        verbose=True,
        seed=None,
        batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    l2_penalty : float, optional
        Weight on l2 regularization of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized logistic regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink toward
        0. The l1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful
        for the model. The default weight of 0 prevents any features from
        being discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        For this model, the Newton-Raphson method is equivalent to the
        iteratively re-weighted least squares algorithm. If the l1_penalty is
        greater than 0, use the 'fista' solver.

        The model is trained using a carefully engineered collection of methods
        that are automatically picked based on the input data. The ``newton``
        method  works best for datasets with plenty of examples and few features
        (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for
        wide datasets (i.e datasets with many coefficients).  ``fista`` is the
        default solver for l1-regularized linear regression. The solvers are all
        automatically tuned and the default options should function well. See
        the solver options guide for setting additional parameters for each of
        the solvers.

        See the user guide for additional details on how the solver is chosen.
        (see `here
        <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_)

    feature_rescaling : boolean, optional
        Feature rescaling is an important pre-processing step that ensures that
        all features are on the same scale. An l2-norm rescaling is performed
        to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that are
        used to represent them. The coefficients are returned in original scale
        of the problem. This process is particularly useful when features
        vary widely in their ranges.

    convergence_threshold : float, optional
        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e a model that works well only on the training
        data) if this parameter is set to a very low value.

    lbfgs_memory_level : float, optional
        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only on available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    step_size : float, optional
        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0, this is an aggressive setting. If the first iteration takes
        a considerable amount of time, reducing this parameter may speed up
        model training.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assigned weight one. The
        `auto` mode sets the class weight to be inversely proportional to the
        number of examples in the training data for the given class.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append('VisionFeaturePrint_Scene')

        # Also, to make sure existing code doesn't break, replace incorrect name
        # with the correct name version
        if model == "VisionFeaturePrint_Screen":
            print(
                "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene; VisionFeaturePrint_Screen will be removed in subsequent versions."
            )
            model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if (batch_size < 1):
        raise ValueError("'batch_size' must be greater than or equal to 1")

    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto'
            or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)

    # Extract features
    extracted_features = _tc.SFrame({
        target: dataset[target],
        '__image_features__': feature_extractor.extract_features(
            dataset, feature, verbose=verbose, batch_size=batch_size),
    })
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _tc.SFrame({
            target: validation_set[target],
            '__image_features__': feature_extractor.extract_features(
                validation_set, feature, verbose=verbose, batch_size=batch_size),
        })
    else:
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    extracted_features[target] = dataset[target]
    lr_model = _tc.logistic_classifier.create(
        extracted_features,
        features=['__image_features__'],
        target=target,
        max_iterations=max_iterations,
        validation_set=extracted_features_validation,
        seed=seed,
        verbose=verbose,
        l2_penalty=l2_penalty,
        l1_penalty=l1_penalty,
        solver=solver,
        feature_rescaling=feature_rescaling,
        convergence_threshold=convergence_threshold,
        step_size=step_size,
        lbfgs_memory_level=lbfgs_memory_level,
        class_weights=class_weights)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:  # model == VisionFeaturePrint_Scene
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
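A minimal end-to-end sketch for ``create``. The directory layout, the './photos' path, and the 'label' column are illustrative assumptions; only the turicreate calls themselves come from the API documented above:

    import turicreate as tc

    # Load images recursively; load_images adds 'path' and 'image' columns.
    data = tc.image_analysis.load_images('./photos', with_path=True)

    # Derive a label from the parent directory of each file (assumed layout:
    # ./photos/<label>/<file>.jpg).
    data['label'] = data['path'].apply(lambda p: p.split('/')[-2])

    # Train with the smaller squeezenet backbone to keep the exported model small.
    model = tc.image_classifier.create(data, target='label',
                                       model='squeezenet_v1.1',
                                       max_iterations=20)

    predictions = model.predict(data)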
Code Example #4
File: topic_model.py Project: zoecarver/turicreate
    def get_topics(self,
                   topic_ids=None,
                   num_words=5,
                   cdf_cutoff=1.0,
                   output_type='topic_probabilities'):
        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics for which to retrieve words. Topic ids are zero-based.
            Throws an error if an id is greater than or equal to
            m['num_topics'], or if a requested topic name is not present.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is an SArray where
            each element is a list of the most probable words for each topic.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = turicreate.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = turicreate.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
        """
        _check_categorical_option_type('output_type', output_type,
                                       ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            topic_ids = list(range(self._get('num_topics')))

        assert isinstance(topic_ids, list), \
            "The provided topic_ids is not a list."

        if any([type(x) == str for x in topic_ids]):
            raise ValueError(
                "Only integer topic_ids can be used at this point in time.")
        if not all([x >= 0 and x < self.num_topics for x in topic_ids]):
            raise ValueError("Topic id values must be non-negative and less than the " + \
                "number of topics used to fit the model.")

        opts = {
            'model': self.__proxy__,
            'topic_ids': topic_ids,
            'num_words': num_words,
            'cdf_cutoff': cdf_cutoff
        }
        response = _turicreate.extensions._text.topicmodel_get_topic(opts)
        ret = response['top_words']

        def sort_wordlist_by_prob(z):
            words = sorted(z.items(),
                           key=_operator.itemgetter(1),
                           reverse=True)
            return [word for (word, prob) in words]

        if output_type != 'topic_probabilities':
            ret = ret.groupby(
                'topic',
                {'word': _turicreate.aggregate.CONCAT('word', 'score')})
            words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
            ret = _SFrame({'words': words})

        return ret
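A brief usage sketch for ``get_topics``. ``m`` is an assumed, already-trained topic model and the parameter values are illustrative:

    # Keep only the most probable words per topic whose cumulative
    # probability stays below 0.5.
    scored = m.get_topics(num_words=10, cdf_cutoff=0.5)

    # Word lists only, one entry per topic, for topics 0 and 1.
    word_lists = m.get_topics(topic_ids=[0, 1], output_type='topic_words')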
Code Example #5
    def predict_topk(self, dataset, output_type='probability', k=3, output_frequency='per_row'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `prediction_id`,
        `class`, and either `probability` or `rank`, depending on the
        ``output_type`` parameter.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features and session id used for model training, but
            does not require a target column. Additional columns are ignored.

        output_type : {'probability', 'rank'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions which is one of:

            - 'per_row': Each prediction is returned ``prediction_window`` times.
            - 'per_window': Return a single prediction for each 
              ``prediction_window`` rows in ``dataset`` per ``session_id``.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +---------------+-------+-------------------+
        |     row_id    | class |    probability    |
        +---------------+-------+-------------------+
        |       0       |   4   |   0.995623886585  |
        |       0       |   9   |  0.0038311756216  |
        |       0       |   7   | 0.000301006948575 |
        |       1       |   1   |   0.928708016872  |
        |       1       |   3   |  0.0440889261663  |
        |       1       |   2   |  0.0176190119237  |
        |       2       |   3   |   0.996967732906  |
        |       2       |   2   |  0.00151345680933 |
        |       2       |   7   | 0.000637513934635 |
        |       3       |   1   |   0.998070061207  |
        |      ...      |  ...  |        ...        |
        +---------------+-------+-------------------+
        """
        _tkutl._check_categorical_option_type('output_type', output_type, ['probability', 'rank'])
        id_target_map = self._id_target_map
        preds = self.predict(
            dataset, output_type='probability_vector', output_frequency=output_frequency)

        if output_frequency == 'per_row':
            probs = preds
        elif output_frequency == 'per_window':
            probs = preds['probability_vector']

        if output_type == 'rank':
            # Rank 0 corresponds to the most probable class for the example.
            probs = probs.apply(lambda p: [
                {'class': id_target_map[i],
                 'rank': rank}
                for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))]
            )
        elif output_type == 'probability':
            probs = probs.apply(lambda p: [
                {'class': id_target_map[i],
                 'probability': p[i]}
                for i in reversed(_np.argsort(p)[-k:])]
            )

        if output_frequency == 'per_row':
            output = _SFrame({'probs': probs})
            output = output.add_row_number(column_name='row_id')
        elif output_frequency == 'per_window':
            output = _SFrame({
                'probs': probs,
                self.session_id: preds[self.session_id],
                'prediction_id': preds['prediction_id']
            })

        output = output.stack('probs', new_column_name='probs')
        output = output.unpack('probs', column_name_prefix='')
        return output
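A usage sketch for ``predict_topk``; ``model`` and ``data`` are assumed as in the earlier examples:

    # Top-2 classes per row together with their probabilities.
    top2 = model.predict_topk(data, k=2, output_type='probability')

    # Rank output, one set of predictions per prediction window and session.
    ranked = model.predict_topk(data, k=2, output_type='rank',
                                output_frequency='per_window')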
Code Example #6
    def predict(self, dataset, output_type='class', output_frequency='per_row'):
        """
        Return predictions for ``dataset``, using the trained activity classifier.
        Predictions can be generated as class labels, or as a probability
        vector with probabilities for each class.

        The activity classifier generates a single prediction for each
        ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the
        number of predictions is smaller than the length of ``dataset``. By
        default each prediction is replicated ``prediction_window`` times to return
        a prediction for each row of ``dataset``. Use ``output_frequency`` to
        get the unreplicated predictions.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'class', 'probability_vector'}, optional
            Form of each prediction which is one of:

            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. This returns the class with maximum
              probability.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions which is one of:

            - 'per_window': Return a single prediction for each
              ``prediction_window`` rows in ``dataset`` per ``session_id``.
            - 'per_row': Convenience option to make sure the number of
              predictions matches the number of rows in the dataset. Each
              prediction from the model is repeated ``prediction_window``
              times during that window.

        Returns
        -------
        out : SArray | SFrame
            If ``output_frequency`` is 'per_row' return an SArray with predictions
            for each row in ``dataset``.
            If ``output_frequency`` is 'per_window' return an SFrame with
            predictions for ``prediction_window`` rows in ``dataset``.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------

        .. sourcecode:: python

            # One prediction per row
            >>> probability_predictions = model.predict(
            ...     data, output_type='probability_vector', output_frequency='per_row')[:4]
            >>> probability_predictions

            dtype: array
            Rows: 4
            [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

            # One prediction per window
            >>> class_predictions = model.predict(
            ...     data, output_type='class', output_frequency='per_window')
            >>> class_predictions

            +---------------+------------+-----+
            | prediction_id | session_id |class|
            +---------------+------------+-----+
            |       0       |     3      |  5  |
            |       1       |     3      |  5  |
            |       2       |     3      |  5  |
            |       3       |     3      |  5  |
            |       4       |     3      |  5  |
            |       5       |     3      |  5  |
            |       6       |     3      |  5  |
            |       7       |     3      |  4  |
            |       8       |     3      |  4  |
            |       9       |     3      |  4  |
            |      ...      |    ...     | ... |
            +---------------+------------+-----+
        """
        _tkutl._raise_error_if_not_sframe(dataset, 'dataset')
        _tkutl._check_categorical_option_type(
            'output_frequency', output_frequency, ['per_window', 'per_row'])
        _tkutl._check_categorical_option_type(
            'output_type', output_type, ['probability_vector', 'class'])
        from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
        from ._sframe_sequence_iterator import prep_data as _prep_data

        from ._sframe_sequence_iterator import _ceil_dev

        prediction_window = self.prediction_window
        chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id, prediction_window,
                                     self._predictions_in_chunk, verbose=False)
        data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features),
                                        prediction_window, self._predictions_in_chunk,
                                        self._recalibrated_batch_size, use_pad=True)

        chunked_data = data_iter.dataset
        preds = self._pred_model.predict(data_iter).asnumpy()

        if output_frequency == 'per_row':
            # Replicate each prediction times prediction_window
            preds = preds.repeat(prediction_window, axis=1)

            # Remove predictions for padded rows
            unpadded_len = chunked_data['chunk_len'].to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            # Reshape from (num_of_chunks, chunk_size, num_of_classes)
            # to (ceil(length / prediction_window), num_of_classes)
            # chunk_size is DIFFERENT between chunks - since padding was removed.
            out = _np.concatenate(preds)
            out = out.reshape((-1, len(self._target_id_map)))
            out = _SArray(out)

            if output_type == 'class':
                id_target_map = self._id_target_map
                out = out.apply(lambda c: id_target_map[_np.argmax(c)])

        elif output_frequency == 'per_window':
            # Calculate the number of expected predictions and
            # remove predictions for padded data
            unpadded_len = chunked_data['chunk_len'].apply(
                lambda l: _ceil_dev(l, prediction_window)).to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            out = _SFrame({
                self.session_id: chunked_data['session_id'],
                'preds': _SArray(preds, dtype=list)
            }).stack('preds', new_column_name='probability_vector')

            # Calculate the prediction index per session
            out = out.add_row_number(column_name='prediction_id')
            start_sess_idx = out.groupby(
                self.session_id, {'start_idx': _agg.MIN('prediction_id')})
            start_sess_idx = start_sess_idx.unstack(
                [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0]

            if output_type == 'class':
                id_target_map = self._id_target_map
                out['probability_vector'] = out['probability_vector'].apply(
                    lambda c: id_target_map[_np.argmax(c)])
                out = out.rename({'probability_vector': 'class'})

        return out
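A sketch of the two prediction forms of ``predict``; ``model`` and ``data`` are illustrative names:

    # A class label for every input row (each window's prediction is repeated).
    row_classes = model.predict(data, output_type='class',
                                output_frequency='per_row')

    # One probability vector per prediction window and session.
    window_probs = model.predict(data, output_type='probability_vector',
                                 output_frequency='per_window')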
Code Example #7
    def predict(self, dataset, output_type="class", missing_value_action="auto"):
        """
        Return predictions for ``dataset``, using the trained classifier.
        Predictions can be generated as class labels (0 or 1), or margins
        (i.e. the distance of the observations from the hyperplane separating
        the classes). By default, the predict method returns class labels.

        For each new example in ``dataset``, the margin---also known as the
        linear predictor---is the inner product of the example and the model
        coefficients plus the intercept term. Predicted classes are obtained by
        thresholding the margins at 0.

        Parameters
        ----------
        dataset : SFrame | dict
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'margin', 'class'}, optional
            Form of the predictions which are one of:

            - 'margin': Distance of the observations from the hyperplane
              separating the classes.
            - 'class': Class prediction.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error' : Do not proceed with prediction and terminate with
              an error message.

        Returns
        -------
        out : SArray
            An SArray with model predictions.

        See Also
        ----------
        create, evaluate, classify

        Examples
        ----------
        >>> data =  turicreate.SFrame('https://static.turi.com/datasets/regression/houses.csv')

        >>> data['is_expensive'] = data['price'] > 30000
        >>> model = turicreate.svm_classifier.create(data,
                                  target='is_expensive',
                                  features=['bath', 'bedroom', 'size'])

        >>> class_predictions = model.predict(data)
        >>> margin_predictions = model.predict(data, output_type='margin')

        """

        _check_categorical_option_type("output_type", output_type, ["class", "margin"])
        return super(_Classifier, self).predict(
            dataset, output_type=output_type, missing_value_action=missing_value_action
        )
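A hedged sketch that illustrates the margin-thresholding relationship described in the docstring. ``model`` and ``data`` are assumed to exist, and the target is assumed to be encoded as 0/1:

    margins = model.predict(data, output_type='margin')
    classes = model.predict(data, output_type='class')

    # A positive margin should correspond to the positive class.
    agreement = (margins > 0) == (classes == 1)
    print(agreement.all())   # expected to be true under the 0/1 encoding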
Code Example #8
    def predict(self,
                dataset,
                output_type='class',
                missing_value_action='auto'):
        """
        A flexible and advanced prediction API.

        The target column is provided during
        :func:`~turicreate.random_forest.create`. If the target column is in the
        `dataset` it will be ignored.

        Parameters
        ----------
        dataset : SFrame
          A dataset that has the same columns that were used during training.
          If the target column exists in ``dataset`` it will be ignored
          while making predictions.

        output_type : {'probability', 'margin', 'class', 'probability_vector'}, optional.
            Form of the predictions which are one of:

            - 'probability': Prediction probability associated with the True
               class (not applicable for multi-class classification)
            - 'margin': Margin associated with the prediction (not applicable
              for multi-class classification)
            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. For multi-class classification, this
               returns the class with maximum probability.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default the model will treat missing values as they are.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SArray
           Predicted target value for each example (i.e. row) in the dataset.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------
        >>> m.predict(testdata)
        >>> m.predict(testdata, output_type='probability')
        >>> m.predict(testdata, output_type='margin')
        """
        _check_categorical_option_type(
            'output_type', output_type,
            ['class', 'margin', 'probability', 'probability_vector'])
        return super(_Classifier,
                     self).predict(dataset,
                                   output_type=output_type,
                                   missing_value_action=missing_value_action)
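Illustrative calls covering the remaining output types; ``m`` and ``testdata`` are assumed as in the docstring examples:

    labels = m.predict(testdata)                                    # class labels
    probs = m.predict(testdata, output_type='probability_vector')   # per-class probabilities

    # Fail fast instead of imputing when test rows contain missing values.
    strict = m.predict(testdata, missing_value_action='error')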
Code Example #9
def create(dataset,
           annotations=None,
           feature=None,
           model="darknet-yolo",
           classes=None,
           batch_size=0,
           max_iterations=0,
           verbose=True,
           grid_shape=[13, 13],
           **kwargs):
    """
    Create a :class:`ObjectDetector` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The columns named by the ``feature`` and ``annotations``
        parameters will be extracted for training the detector.

    annotations : string
        Name of the column containing the object detection annotations.  This
        column should be a list of dictionaries (or a single dictionary), with
        each dictionary representing a bounding box of an object instance. Here
        is an example of the annotations for a single image with two object
        instances::

            [{'label': 'dog',
              'type': 'rectangle',
              'coordinates': {'x': 223, 'y': 198,
                              'width': 130, 'height': 230}},
             {'label': 'cat',
              'type': 'rectangle',
              'coordinates': {'x': 40, 'y': 73,
                              'width': 80, 'height': 123}}]

        The value for `x` is the horizontal center of the box paired with
        `width` and `y` is the vertical center of the box paired with `height`.
        'None' (the default) indicates the only list column in `dataset` should
        be used for the annotations.

    feature : string
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Object detection model to use:

           - "darknet-yolo" : Fast and medium-sized model

    grid_shape : array, optional
        Shape of the grid used for object detection. Higher values increase
        precision for small objects, but at a higher computational cost.

           - [13, 13] : Default grid value for the fast and medium-sized model

    classes : list, optional
        List of strings containing the names of the classes of objects.
        Inferred from the data if not provided.

    batch_size : int
        The number of images per training iteration. If 0, then it will be
        automatically determined based on resource availability.

    max_iterations : int
        The number of training iterations. If 0, then it will automatically be
        determined based on the amount of data you provide.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ObjectDetector
        A trained :class:`ObjectDetector` model.

    See Also
    --------
    ObjectDetector

    Examples
    --------
    .. sourcecode:: python

        # Train an object detector model
        >>> model = turicreate.object_detector.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _raise_error_if_not_sframe(dataset, "dataset")

    if len(dataset) == 0:
        raise _ToolkitError("Unable to train on empty dataset")

    _numeric_param_check_range("max_iterations", max_iterations, 0,
                               _six.MAXSIZE)
    start_time = _time.time()

    supported_detectors = ["darknet-yolo"]

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)
        if verbose:
            print("Using '%s' as feature column" % feature)
    if annotations is None:
        annotations = _tkutl._find_only_column_of_type(
            dataset,
            target_type=[list, dict],
            type_name="list",
            col_name="annotations")
        if verbose:
            print("Using '%s' as annotations column" % annotations)

    _raise_error_if_not_detection_sframe(dataset,
                                         feature,
                                         annotations,
                                         require_annotations=True)
    _tkutl._handle_missing_values(dataset, feature, "dataset")
    _tkutl._check_categorical_option_type("model", model, supported_detectors)

    base_model = model.split("-", 1)[0]
    ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]()

    pretrained_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[
        "darknet_mlmodel"]()
    pretrained_model_path = pretrained_model.get_model_path()

    params = {
        "anchors": [
            (1.0, 2.0), (1.0, 1.0), (2.0, 1.0),
            (2.0, 4.0), (2.0, 2.0), (4.0, 2.0),
            (4.0, 8.0), (4.0, 4.0), (8.0, 4.0),
            (8.0, 16.0), (8.0, 8.0), (16.0, 8.0),
            (16.0, 32.0), (16.0, 16.0), (32.0, 16.0),
        ],
        "grid_shape": grid_shape,
        "aug_resize": 0,
        "aug_rand_crop": 0.9,
        "aug_rand_pad": 0.9,
        "aug_rand_gray": 0.0,
        "aug_aspect_ratio": 1.25,
        "aug_hue": 0.05,
        "aug_brightness": 0.05,
        "aug_saturation": 0.05,
        "aug_contrast": 0.05,
        "aug_horizontal_flip": True,
        "aug_min_object_covered": 0,
        "aug_min_eject_coverage": 0.5,
        "aug_area_range": (0.15, 2),
        "aug_pca_noise": 0.0,
        "aug_max_attempts": 20,
        "aug_inter_method": 2,
        "lmb_coord_xy": 10.0,
        "lmb_coord_wh": 10.0,
        "lmb_obj": 100.0,
        "lmb_noobj": 5.0,
        "lmb_class": 2.0,
        "non_maximum_suppression_threshold": 0.45,
        "rescore": True,
        "clip_gradients": 0.025,
        "weight_decay": 0.0005,
        "sgd_momentum": 0.9,
        "learning_rate": 1.0e-3,
        "shuffle": True,
        "mps_loss_mult": 8,
        # This large buffer size (8 batches) is an attempt to mitigate against
        # the SFrame shuffle operation that can occur after each epoch.
        "io_thread_buffer_size": 8,
        "mlmodel_path": pretrained_model_path,
    }

    # create tensorflow model here
    import turicreate.toolkits.libtctensorflow

    if classes is None:
        classes = []

    _raise_error_if_not_iterable(classes)
    _raise_error_if_not_iterable(grid_shape)

    grid_shape = [int(x) for x in grid_shape]
    assert len(grid_shape) == 2

    tf_config = {
        "grid_height": params["grid_shape"][0],
        "grid_width": params["grid_shape"][1],
        "mlmodel_path": params["mlmodel_path"],
        "classes": classes,
        "compute_final_metrics": False,
        "verbose": verbose,
        "model": "darknet-yolo",
    }

    # If batch_size or max_iterations = 0, they will be automatically
    # generated in C++.
    if batch_size > 0:
        tf_config["batch_size"] = batch_size

    if max_iterations > 0:
        tf_config["max_iterations"] = max_iterations

    model = _tc.extensions.object_detector()
    model.train(
        data=dataset,
        annotations_column_name=annotations,
        image_column_name=feature,
        options=tf_config,
    )
    return ObjectDetector(model_proxy=model, name="object_detector")
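A minimal training sketch for the object detector. The 'images.sframe' path and the column names below are assumptions for illustration; the annotations column must follow the bounding-box dictionary format shown in the docstring above:

    import turicreate as tc

    # Illustrative pre-built SFrame with an image column and an 'annotations'
    # column of bounding-box dictionaries.
    data = tc.SFrame('images.sframe')

    model = tc.object_detector.create(data,
                                      feature='image',
                                      annotations='annotations',
                                      grid_shape=[13, 13],
                                      max_iterations=100)

    data['predictions'] = model.predict(data)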
Code Example #10
def roc_curve(targets, predictions, average=None, index_map=None):
    r"""
    Compute an ROC curve for the given targets and predictions. Currently,
    only binary classification is supported.

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        The prediction that corresponds to each target value.  This vector must
        have the same length as ``targets``. Target scores can be probability
        estimates of the positive class, confidence values, or binary
        decisions.

    average : string, [None (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.

    index_map : dict[int], [None (default)]
        For binary classification, a dictionary mapping the two target labels to
        either 0 (negative) or 1 (positive). For multi-class classification, a
        dictionary mapping potential target labels to the associated index into
        the vectors in ``predictions``.

    Returns
    -------
    out : SFrame
        Each row represents the predictive performance when using a given
        cutoff threshold, where all predictions above that cutoff are
        considered "positive". The following columns describe the
        performance:

            - tpr   : True positive rate, the number of true positives divided by the number of positives.
            - fpr   : False positive rate, the number of false positives divided by the number of negatives.
            - p     : Total number of positive values.
            - n     : Total number of negative values.
            - class : Reference class for this ROC curve.

    See Also
    --------
    confusion_matrix, auc

    References
    ----------
    `An introduction to ROC analysis. Tom Fawcett.
    <https://ccrma.stanford.edu/workshops/mir2009/references/ROCintro.pdf>`_

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case. This behavior can be overridden by providing
       an explicit ``index_map``.
     - For multi-class classification, when the target label is of type
       "string", then the probability vector is assumed to be a vector of
       probabilities of classes as sorted alphanumerically. Hence, for the
       probability vector [0.1, 0.2, 0.7] for a dataset with classes "cat",
       "dog", and "rat"; the 0.1 corresponds to "cat", the 0.2 to "dog" and the
       0.7 to "rat". This behavior can be overridden by providing an explicit
       ``index_map``.
     - The ROC curve is computed using a binning approximation with 1M bins and
       is hence accurate only to the 5th decimal.


    Examples
    --------
    .. sourcecode:: python

        >>> targets = turicreate.SArray([0, 1, 1, 0])
        >>> predictions = turicreate.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the roc-curve.
        >>> roc_curve =  turicreate.evaluation.roc_curve(targets, predictions)
        +-------------------+-----+-----+---+---+
        |     threshold     | fpr | tpr | p | n |
        +-------------------+-----+-----+---+---+
        |        0.0        | 1.0 | 1.0 | 2 | 2 |
        | 9.99999974738e-06 | 1.0 | 1.0 | 2 | 2 |
        | 1.99999994948e-05 | 1.0 | 1.0 | 2 | 2 |
        | 2.99999992421e-05 | 1.0 | 1.0 | 2 | 2 |
        | 3.99999989895e-05 | 1.0 | 1.0 | 2 | 2 |
        | 4.99999987369e-05 | 1.0 | 1.0 | 2 | 2 |
        | 5.99999984843e-05 | 1.0 | 1.0 | 2 | 2 |
        | 7.00000018696e-05 | 1.0 | 1.0 | 2 | 2 |
        |  7.9999997979e-05 | 1.0 | 1.0 | 2 | 2 |
        | 9.00000013644e-05 | 1.0 | 1.0 | 2 | 2 |
        +-------------------+-----+-----+---+---+
        [100001 rows x 5 columns]

    For reference, the ``average`` strategies used by related point metrics
    such as ``recall`` behave as follows in the multi-class setting.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = turicreate.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = turicreate.SArray([1, 0, 2, 1, 3, 1, 2, 1])

        # Micro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.375

        # Macro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.375

        # Recall score for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = None)
        {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0}

    For the multi-class setting, an ROC curve is returned for each class.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = turicreate.SArray([ 1, 0, 2, 1])
        >>> predictions = turicreate.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = turicreate.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |   0   |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = turicreate.SArray(["cat", "dog", "foosa", "dog"])
        >>> predictions = turicreate.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = turicreate.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]
    """
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average, [None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
    _check_index_map(index_map)

    opts = {"average": average, "binary": predictions.dtype in [int, float]}
    if index_map is not None:
        opts['index_map'] = index_map

    return _turicreate.extensions._supervised_streaming_evaluator(
        targets, predictions, "roc_curve", opts)
コード例 #11
0
def recall(targets, predictions, average='macro'):
    r"""
    Compute the recall score for classification tasks. The recall score
    quantifies the ability of a classifier to predict `positive` examples.
    Recall can be interpreted as the probability that a randomly selected
    `positive` example is correctly identified by the classifier. The score
    is in the range [0,1] with 0 being the worst, and 1 being perfect.


    The recall score is defined as the ratio:
        .. math::
            \frac{tp}{tp + fn}

    where `tp` is the number of true positives and `fn` the number of false
    negatives.

    Parameters
    ----------
    targets : SArray
        Ground truth class labels. The SArray can be of any type except float.

    predictions : SArray
        The prediction that corresponds to each target value.  This SArray must
        have the same length as ``targets`` and must be of the same type
        as the ``targets`` SArray.

    average : string, [None, 'macro' (default), 'micro']
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'micro': Calculate metrics globally by counting the total true
              positives, false negatives, and false positives.
            - 'macro': Calculate metrics for each label and find their
              unweighted mean. This does not take label imbalance into account.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.

    See Also
    --------
    confusion_matrix, accuracy, precision, f1_score

    Examples
    --------

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = turicreate.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = turicreate.SArray([1, 0, 2, 1, 3, 1, 2, 1])

        # Micro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.375

        # Macro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.375

        # Recall score for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = None)
        {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0}

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = turicreate.SArray(
        ...      ["cat", "dog", "foosa", "snake", "cat", "dog", "foosa", "snake"])
        >>> predictions = turicreate.SArray(
        ...      ["dog", "cat", "foosa", "dog", "snake", "dog", "cat", "dog"])

        # Micro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.375

        # Macro average of the recall scores for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.375

        # Recall score for each class.
        >>> turicreate.evaluation.recall(targets, predictions,
        ...                            average = None)
        {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0}
    """
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                                   ['micro', 'macro', None])
    _check_same_type_not_float(targets, predictions)
    opts = {"average": average}
    return _turicreate.extensions._supervised_streaming_evaluator(
        targets, predictions, "recall", opts)
コード例 #12
0
def fbeta_score(targets, predictions, beta=1.0, average='macro'):
    r"""
    Compute the F-beta score. The F-beta score is the weighted harmonic mean of
    precision and recall. The score lies in the range [0,1] with 1 being ideal
    and 0 being the worst.

    The `beta` value is the weight given to `precision` vs. `recall` in the
    combined score. `beta=0` considers only precision; as `beta` increases, more
    weight is given to recall, with `beta > 1` favoring recall over precision.

    The F-beta score is defined as:

        .. math::
            f_{\beta} = (1 + \beta^2) \times \frac{(p \times r)}{(\beta^2 p + r)}

    Where :math:`p` is the precision and :math:`r` is the recall.

    Parameters
    ----------
    targets : SArray
        An SArray of ground truth class labels. Can be of any type except
        float.

    predictions : SArray
        The prediction that corresponds to each target value.  This SArray must
        have the same length as ``targets`` and must be of the same type
        as the ``targets`` SArray.

    beta : float
        Weight of the `precision` term in the harmonic mean.

    average : string, [None, 'macro' (default), 'micro']
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'micro': Calculate metrics globally by counting the total true
              positives, false negatives and false positives.
            - 'macro': Calculate metrics for each label, and find their
              unweighted mean. This does not take label imbalance into account.

        For a more precise definition of `micro` and `macro` averaging refer
        to [1] below.

    Returns
    -------
    out : float (for binary classification) or dict[float] (for multi-class, average=None)
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    Notes
    -----
     - For binary classification, if the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.


    See Also
    --------
    confusion_matrix, accuracy, precision, recall, f1_score

    Examples
    --------

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = turicreate.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = turicreate.SArray([1, 0, 2, 1, 3, 1, 0, 1])

        # Micro average of the F-Beta score
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'micro')
        0.25

        # Macro average of the F-Beta score
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'macro')
        0.24305555555555558

        # F-Beta score for each class.
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = None)
        {0: 0.0, 1: 0.4166666666666667, 2: 0.5555555555555556, 3: 0.0}

    This metric also works when the targets are of type `str`

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = turicreate.SArray(
        ...      ["cat", "dog", "foosa", "snake", "cat", "dog", "foosa", "snake"])
        >>> predictions = turicreate.SArray(
        ...      ["dog", "cat", "foosa", "dog", "snake", "dog", "cat", "dog"])

        # Micro average of the F-Beta score
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'micro')
        0.25

        # Macro average of the F-Beta score
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'macro')
        0.24305555555555558

        # F-Beta score for each class.
        >>> turicreate.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = None)
        {'cat': 0.0, 'dog': 0.4166666666666667, 'foosa': 0.5555555555555556, 'snake': 0.0}

    References
    ----------
    - [1] Sokolova, Marina, and Guy Lapalme. "A systematic analysis of
      performance measures for classification tasks." Information Processing &
      Management 45.4 (2009): 427-437.

    """
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                                   ['micro', 'macro', None])
    _check_same_type_not_float(targets, predictions)

    opts = {"beta": beta, "average": average}
    return _turicreate.extensions._supervised_streaming_evaluator(
        targets, predictions, "fbeta_score", opts)
コード例 #13
0
def auc(targets, predictions, average='macro', index_map=None):
    r"""
    Compute the area under the ROC curve for the given targets and predictions.

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        Prediction probability that corresponds to each target value. This must
        be of same length as ``targets``.

    average : string, [None, 'macro' (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'macro': Calculate metrics for each label, and find their
              unweighted mean. This does not take label imbalance into account.

    index_map : dict[int], [None (default)]
        For binary classification, a dictionary mapping the two target labels to
        either 0 (negative) or 1 (positive). For multi-class classification, a
        dictionary mapping potential target labels to the associated index into
        the vectors in ``predictions``.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    See Also
    --------
    roc_curve, confusion_matrix

    Examples
    --------
    .. sourcecode:: python

        >>> targets = turicreate.SArray([0, 1, 1, 0])
        >>> predictions = turicreate.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  turicreate.evaluation.auc(targets, predictions)
        0.5

    This metric also works when the targets are strings (here "cat" is chosen
    as the reference class).

    .. sourcecode:: python

        >>> targets = turicreate.SArray(["cat", "dog", "dog", "cat"])
        >>> predictions = turicreate.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  turicreate.evaluation.auc(targets, predictions)
        0.5


    For the multi-class setting, the auc-score can be averaged.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = turicreate.SArray([ 1, 0, 2, 1])
        >>> predictions = turicreate.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        #  Macro average of the scores for each class.
        >>> turicreate.evaluation.auc(targets, predictions, average = 'macro')
        0.8888888888888888

        # Scores for each class.
        >>> turicreate.evaluation.auc(targets, predictions, average = None)
        {0: 1.0, 1: 1.0, 2: 0.6666666666666666}

    This metric also works for "string" targets in the multi-class setting

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = turicreate.SArray([ "dog", "cat", "foosa", "dog"])
        >>> predictions = turicreate.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Macro average.
        >>> auc =  turicreate.evaluation.auc(targets, predictions)
        0.8888888888888888

        # Score for each class.
        >>> auc =  turicreate.evaluation.auc(targets, predictions, average=None)
        {'cat': 1.0, 'dog': 1.0, 'foosa': 0.6666666666666666}

    """
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average, ['macro', None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
    _check_index_map(index_map)

    opts = {"average": average, "binary": predictions.dtype in [int, float]}
    if index_map is not None:
        opts['index_map'] = index_map

    return _turicreate.extensions._supervised_streaming_evaluator(
        targets, predictions, "auc", opts)
コード例 #14
0
    def evaluate(self, dataset, metric = 'auto', verbose = True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.
        
        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the feature and target columns used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

            >>> results = model.evaluate(data)
            >>> print(results['accuracy'])
        """

        if self.target not in dataset.column_names():
            raise _ToolkitError("Dataset provided to evaluate does not have " 
                + "ground truth in the " + self.target + " column.")

        predicted = self._predict_with_probabilities(dataset, verbose)

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'confusion_matrix', 'roc_curve']

        _tkutl._check_categorical_option_type(
                        'metric', metric, avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]
        
        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(
                dataset[self.target], predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(
                dataset[self.target], predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(
                dataset[self.target], predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(
                dataset[self.target], predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        
        return ret
コード例 #15
0
    def predict_topk(self, dataset, output_type='probability', k=3, batch_size=64):
        """
        Return top-k predictions for the ``dataset``.
        Predictions are returned as an SFrame with three columns: `id`,
        `class`, and `probability` or `rank` depending on the ``output_type``
        parameter.

        Parameters
        ----------
        dataset : SFrame
            Dataset to classify. Must include columns with the same
            names as the features. Additional columns are ignored.

        output_type : {'probability', 'rank'}, optional
            Choose the return type of the prediction:
            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +------+-------+-------------------+
        |  id  | class |    probability    |
        +------+-------+-------------------+
        |  0   |   4   |   0.995623886585  |
        |  0   |   9   |  0.0038311756216  |
        |  0   |   7   | 0.000301006948575 |
        |  1   |   1   |   0.928708016872  |
        |  1   |   3   |  0.0440889261663  |
        |  1   |   2   |  0.0176190119237  |
        |  2   |   3   |   0.996967732906  |
        |  2   |   2   |  0.00151345680933 |
        |  2   |   7   | 0.000637513934635 |
        |  3   |   1   |   0.998070061207  |
        | ...  |  ...  |        ...        |
        +------+-------+-------------------+
        """
        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError('\'dataset\' parameter must be an SFrame')
        _tk_utils._check_categorical_option_type('output_type', output_type, ['probability', 'rank'])
        if batch_size < 1:
            raise ValueError('\'batch_size\' must be greater than or equal to 1')

        # Compute the full probability vector for each example, honoring the
        # caller-supplied batch size.
        prob_vector = self.predict(dataset, output_type='probability_vector',
                                   batch_size=batch_size)
        id_to_label = self._id_to_class_label

        if output_type == 'probability':
            results = prob_vector.apply(lambda p: [
                {'class': id_to_label[i], 'probability': p[i]}
                for i in reversed(_np.argsort(p)[-k:])]
            )
        else:
            assert(output_type == 'rank')
            results = prob_vector.apply(lambda p: [
                {'class': id_to_label[i], 'rank': rank}
                for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))]
            )

        results = _tc.SFrame({'X': results})
        results = results.add_row_number()
        results = results.stack('X', new_column_name='X')
        results = results.unpack('X', column_name_prefix='')
        return results
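
The ``reversed(_np.argsort(p)[-k:])`` idiom used above is easy to misread; a toy check with a hypothetical probability vector shows what it yields:

import numpy as np

p = np.array([0.05, 0.70, 0.20, 0.05])   # hypothetical probability vector
k = 2

# argsort orders indices from smallest to largest probability; the last k are
# the k most probable classes, and reversing puts the best class first.
top_k = list(reversed(np.argsort(p)[-k:]))
# top_k == [1, 2]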
コード例 #16
0
    def evaluate(self, dataset, metric='auto', batch_size=256, verbose=True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the feature and target columns used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve
            performance.

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        import os, json, math

        if self.target not in dataset.column_names():
            raise _ToolkitError("Must provide ground truth column, '"
                + self.target + "' in the evaluation dataset.")

        predicted = self._predict_with_probabilities(dataset, batch_size, verbose)

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'confusion_matrix', 'roc_curve', 'log_loss']

        _tkutl._check_categorical_option_type(
                        'metric', metric, avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]

        labels = self.classes

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(
                dataset[self.target], predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(
                dataset[self.target], predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(
                dataset[self.target], predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(
                dataset[self.target], predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)

        from .._evaluate_utils import  (
            entropy,
            confidence,
            relative_confidence,
            get_confusion_matrix,
            hclusterSort,
            l2Dist
        )
        evaluation_result = {k: ret[k] for k in metrics}
        evaluation_result['num_test_examples'] = len(dataset)
        for k in ['num_classes', 'num_examples', 'training_loss', 'training_time', 'max_iterations']:
            evaluation_result[k] = getattr(self, k)

        #evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape')

        evaluation_result["model_name"] = "Drawing Classifier"
        extended_test = dataset.add_column(predicted["probability"], 'probs')
        extended_test['label'] = dataset[self.target]

        extended_test = extended_test.add_columns( [extended_test.apply(lambda d: labels[d['probs'].index(confidence(d['probs']))]),
            extended_test.apply(lambda d: entropy(d['probs'])),
            extended_test.apply(lambda d: confidence(d['probs'])),
            extended_test.apply(lambda d: relative_confidence(d['probs']))],
            ['predicted_label', 'entropy', 'confidence', 'relative_confidence'])

        extended_test = extended_test.add_column(extended_test.apply(lambda d: d['label'] == d['predicted_label']), 'correct')

        sf_conf_mat = get_confusion_matrix(extended_test, labels)
        confidence_threshold = 0.5
        hesitant_threshold = 0.2
        evaluation_result['confidence_threshold'] = confidence_threshold
        evaluation_result['hesitant_threshold'] = hesitant_threshold
        evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence'

        evaluation_result['conf_mat'] = list(sf_conf_mat)

        vectors = map(lambda l: {'name': l, 'pos':list(sf_conf_mat[sf_conf_mat['target_label']==l].sort('predicted_label')['norm_prob'])},
                    labels)
        evaluation_result['sorted_labels'] = hclusterSort(vectors, l2Dist)[0]['name'].split("|")

        per_l = extended_test.groupby(['label'], {'count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') })
        per_l['recall'] = per_l.apply(lambda l: l['correct_count']*1.0 / l['count'])

        per_pl = extended_test.groupby(['predicted_label'], {'predicted_count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') })
        per_pl['precision'] = per_pl.apply(lambda l: l['correct_count']*1.0 / l['predicted_count'])
        per_pl = per_pl.rename({'predicted_label': 'label'})
        evaluation_result['label_metrics'] = list(per_l.join(per_pl, on='label', how='outer').select_columns(['label', 'count', 'correct_count', 'predicted_count', 'recall', 'precision']))
        evaluation_result['labels'] = labels

        extended_test = extended_test.add_row_number('__idx').rename({'label': 'target_label'})

        evaluation_result['test_data'] = extended_test
        evaluation_result['feature'] = self.feature

        return _Evaluation(evaluation_result)
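
The helpers imported from ``_evaluate_utils`` are not shown in this snippet. The sketch below gives assumed, possibly simplified definitions for ``entropy``, ``confidence``, and ``relative_confidence`` purely to make the extended-metrics block easier to follow; the actual implementations may differ:

import math

def entropy(probs):
    # Shannon entropy of a probability vector; lower values indicate a more
    # peaked, decisive prediction (assumed definition).
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

def confidence(probs):
    # Probability assigned to the most likely class (assumed definition).
    return max(probs)

def relative_confidence(probs):
    # Margin between the best and second-best class probabilities
    # (assumed definition).
    best, runner_up = sorted(probs)[-2:][::-1]
    return best - runner_up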
コード例 #17
0
    def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the features used for model training. Additional columns
            are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError('\'dataset\' parameter must be an SFrame')

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tk_utils._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        if _is_deep_feature_sarray(dataset[self.feature]):
            deep_features = dataset[self.feature]
        else:
            deep_features = get_deep_features(dataset[self.feature], verbose=verbose)
        data = _tc.SFrame({'deep features': deep_features})
        data = data.add_row_number()
        missing_ids = data.filter_by([[]], 'deep features')['id']

        if len(missing_ids) > 0:
            data = data.filter_by([[]], 'deep features', exclude=True)
            # Remove the labels for entries without deep features
            _logging.warning("Dropping %d examples which are less than 975ms in length." % len(missing_ids))
            labels = dataset[[self.target]].add_row_number()
            labels = data.join(labels, how='left')[self.target]
        else:
            labels = dataset[self.target]
        assert(len(labels) == len(data))

        if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
            probs = self.predict(data['deep features'], output_type='probability_vector',
                                 verbose=verbose, batch_size=batch_size)
        if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]):
            classes = self.predict(data['deep features'], output_type='class',
                                   verbose=verbose, batch_size=batch_size)

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = evaluation.accuracy(labels, classes)
        if 'auc' in metrics:
            ret['auc'] = evaluation.auc(labels, probs, index_map=self._class_label_to_id)
        if 'precision' in metrics:
            ret['precision'] = evaluation.precision(labels, classes)
        if 'recall' in metrics:
            ret['recall'] = evaluation.recall(labels, classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = evaluation.f1_score(labels, classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = evaluation.log_loss(labels, probs, index_map=self._class_label_to_id)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = evaluation.confusion_matrix(labels, classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = evaluation.roc_curve(labels, probs, index_map=self._class_label_to_id)

        return ret
コード例 #18
0
    def predict_topk(self, dataset, output_type="probability", k=3,
        batch_size=256):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `id`,
        `class`, and `probability` or `rank`, depending on the ``output_type``
        parameter.

        Parameters
        ----------
        dataset : SFrame | SArray | turicreate.Image
            Drawings to be classified.
            If dataset is an SFrame, it must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'probability', 'rank'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the
                             prediction.
            - `rank`       : Rank associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve
            performance.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> print(pred)
        +----+-------+-------------------+
        | id | class |   probability     |
        +----+-------+-------------------+
        | 0  |   4   |   0.995623886585  |
        | 0  |   9   |  0.0038311756216  |
        | 0  |   7   | 0.000301006948575 |
        | 1  |   1   |   0.928708016872  |
        | 1  |   3   |  0.0440889261663  |
        | 1  |   2   |  0.0176190119237  |
        | 2  |   3   |   0.996967732906  |
        | 2  |   2   |  0.00151345680933 |
        | 2  |   7   | 0.000637513934635 |
        | 3  |   1   |   0.998070061207  |
        | .. |  ...  |        ...        |
        +----+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _tkutl._check_categorical_option_type("output_type", output_type,
            ["probability", "rank"])

        if not isinstance(k, int):
            raise TypeError("'k' must be an integer >= 1")
        if k <= 0:
            raise ValueError("'k' must be >= 1")
        if batch_size is not None and not isinstance(batch_size, int):
            raise TypeError("'batch_size' must be an integer >= 1")
        if batch_size is not None and batch_size < 1:
            raise ValueError("'batch_size' must be >= 1")

        prob_vector = self.predict(
            dataset, output_type='probability_vector', batch_size=batch_size)

        classes = self.classes
        if output_type == 'probability':
            results = prob_vector.apply(lambda p: [
                        {'class': classes[i], 'probability': p[i]}
                        for i in reversed(_np.argsort(p)[-k:])]
                      )
        else:
            assert(output_type == 'rank')
            results = prob_vector.apply(lambda p: [
                        {'class': classes[index], 'rank': rank}
                        for rank, index in enumerate(reversed(_np.argsort(p)[-k:]))]
                      )

        results = _tc.SFrame({'X': results})
        results = results.add_row_number()
        results = results.stack('X', new_column_name='X')
        results = results.unpack('X', column_name_prefix='')
        return results
コード例 #19
0
    def predict_topk(self,
                     dataset,
                     output_type="probability",
                     k=3,
                     missing_value_action='auto'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `id`,
        `class`, and `probability`, `margin`,  or `rank`, depending on the ``output_type``
        parameter. Input dataset size must be the same as for training of the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default the model will treat missing value as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | id     | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _check_categorical_option_type('output_type', output_type,
                                       ['rank', 'margin', 'probability'])
        if missing_value_action == 'auto':
            missing_value_action = _sl.select_default_missing_value_policy(
                self, 'predict')

        # Low latency path
        if isinstance(dataset, list):
            return _turicreate.extensions._fast_predict_topk(
                self.__proxy__, dataset, output_type, missing_value_action, k)
        if isinstance(dataset, dict):
            return _turicreate.extensions._fast_predict_topk(
                self.__proxy__, [dataset], output_type, missing_value_action,
                k)

        options = dict()
        options.update({
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': dataset,
            'output_type': output_type,
            'topk': k,
            'missing_value_action': missing_value_action
        })
        target = _turicreate.toolkits._main.run(
            'supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
コード例 #20
0
    def predict(self, data, output_type='class', batch_size=256, verbose=True):
        """
        Predict on an SFrame or SArray of drawings, or on a single drawing.

        Parameters
        ----------
        data : SFrame | SArray | tc.Image
            The drawing(s) on which to perform drawing classification.
            If dataset is an SFrame, it must have a column with the same name
            as the feature column during training. Additional columns are
            ignored.
            If the data is a single drawing, it can be either of type tc.Image,
            in which case it is a bitmap-based drawing input,
            or of type list, in which case it is a stroke-based drawing input.

        output_type : {'probability', 'class', 'probability_vector'}, optional
            Form of the predictions which are one of:

            - 'class': Class prediction. For multi-class classification, this
              returns the class with maximum probability.
            - 'probability': Prediction probability associated with the True
              class (not applicable for multi-class classification)
            - 'probability_vector': Prediction probability associated with each
              class as a vector. Label ordering is dictated by the ``classes``
              member variable.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve
            performance.

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : SArray
            An SArray with model predictions. Each element corresponds to
            a drawing and contains a single value corresponding to the
            predicted label. Each prediction will have type integer or string
            depending on the type of the classes the model was trained on.
            If `data` is a single drawing, the return value will be a single
            prediction.

        See Also
        --------
        evaluate

        Examples
        --------
        .. sourcecode:: python

            # Make predictions
            >>> pred = model.predict(data)

            # Print predictions, for a better overview
            >>> print(pred)
            dtype: int
            Rows: 10
            [3, 4, 3, 3, 4, 5, 8, 8, 8, 4]
        """
        _tkutl._check_categorical_option_type("output_type", output_type,
            ["probability", "class", "probability_vector"])
        if isinstance(data, _tc.SArray):
            predicted = self._predict_with_probabilities(
                _tc.SFrame({
                    self.feature: data
                }),
                batch_size,
                verbose
            )
        elif isinstance(data, _tc.SFrame):
            predicted = self._predict_with_probabilities(data, batch_size, verbose)
        else:
            # single input
            predicted = self._predict_with_probabilities(
                _tc.SFrame({
                    self.feature: [data]
                }),
                batch_size,
                verbose
            )
        if output_type == "class":
            return predicted[self.target]
        elif output_type == "probability":
            if len(self.classes) <= 2:
                _class_to_index = self._class_to_index
                target = self.target
                return predicted.apply(
                    lambda row: row["probability"][_class_to_index[row[target]]])
            else:
                raise _ToolkitError("Use probability vector in case of multi-class classification")
        else:
            assert (output_type == "probability_vector")
            return predicted["probability"]
コード例 #21
0
File: image_analysis.py Project: chrinide/turicreate
def get_deep_features(images, model_name, batch_size=64, verbose=True):
    """
    Extracts features from images from a specific model.

    Parameters
    ----------
    images : SArray
        Input data.

    model_name : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    Returns
    -------
    out : SArray
        Returns an SArray with all the extracted features.

    See Also
    --------
    turicreate.image_classifier.create
    turicreate.image_similarity.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/images/nested'
    >>> image_sframe = turicreate.load_images(url)
    >>> image_sarray = image_sframe["image"]
    >>> deep_features_sframe = turicreate.image_analysis.get_deep_features(image_sarray, model_name="resnet-50")
    """

    # Check model parameter
    allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append("VisionFeaturePrint_Scene")
    _tkutl._check_categorical_option_type("model", model_name, allowed_models)

    # Check images parameter
    if not isinstance(images, _tc.SArray):
        raise TypeError(
            "Unrecognized type for 'images'. An SArray is expected.")
    if len(images) == 0:
        raise _ToolkitError(
            "Unable to extract features on an empty SArray object")

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Extract features
    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model_name)
    images_sf = _tc.SFrame({"image": images})
    return feature_extractor.extract_features(images_sf,
                                              "image",
                                              verbose=verbose,
                                              batch_size=batch_size)
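
The extracted deep features can be reused by other toolkits. As one possibility (a sketch mirroring what the image-similarity example later in this listing does internally), they can be indexed with the nearest-neighbors toolkit to find visually similar images:

import turicreate as tc

url = 'https://static.turi.com/datasets/images/nested'
images = tc.load_images(url)['image']
deep_features = tc.image_analysis.get_deep_features(images, model_name='resnet-50')

# Index the feature vectors so that visually similar images can be queried.
feature_sf = tc.SFrame({'deep_features': deep_features})
nn_model = tc.nearest_neighbors.create(feature_sf, features=['deep_features'])
similar = nn_model.query(feature_sf, k=5)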
コード例 #22
0
    def predict(self, dataset, output_type="class", output_frequency="per_row"):
        """
        Return predictions for ``dataset``, using the trained activity classifier.
        Predictions can be generated as class labels, or as a probability
        vector with probabilities for each class.

        The activity classifier generates a single prediction for each
        ``prediction_window`` rows in ``dataset``, per ``session_id``. The number
        of these predictions is smaller than the length of ``dataset``. By default,
        when ``output_frequency='per_row'``, each prediction is repeated
        ``prediction_window`` times to return a prediction for each row of
        ``dataset``. Use ``output_frequency='per_window'`` to get the
        unreplicated predictions.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'class', 'probability_vector'}, optional
            Form of each prediction which is one of:

            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. This returns the class with maximum
              probability.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions which is one of:

            - 'per_window': Return a single prediction for each
              ``prediction_window`` rows in ``dataset`` per ``session_id``.
            - 'per_row': Convenience option to make sure the number of
              predictions match the number of rows in the dataset. Each
              prediction from the model is repeated ``prediction_window``
              times during that window.

        Returns
        -------
        out : SArray | SFrame
            If ``output_frequency`` is 'per_row' return an SArray with predictions
            for each row in ``dataset``.
            If ``output_frequency`` is 'per_window' return an SFrame with
            predictions for ``prediction_window`` rows in ``dataset``.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------

        .. sourcecode:: python

            # One prediction per row
            >>> probability_predictions = model.predict(
            ...     data, output_type='probability_vector', output_frequency='per_row')[:4]
            >>> probability_predictions

            dtype: array
            Rows: 4
            [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

            # One prediction per window
            >>> class_predictions = model.predict(
            ...     data, output_type='class', output_frequency='per_window')
            >>> class_predictions

            +---------------+------------+-----+
            | prediction_id | session_id |class|
            +---------------+------------+-----+
            |       0       |     3      |  5  |
            |       1       |     3      |  5  |
            |       2       |     3      |  5  |
            |       3       |     3      |  5  |
            |       4       |     3      |  5  |
            |       5       |     3      |  5  |
            |       6       |     3      |  5  |
            |       7       |     3      |  4  |
            |       8       |     3      |  4  |
            |       9       |     3      |  4  |
            |      ...      |    ...     | ... |
            +---------------+------------+-----+
        """
        _tkutl._check_categorical_option_type(
            "output_frequency", output_frequency, ["per_window", "per_row"]
        )
        if output_frequency == "per_row":
            return self.__proxy__.predict(dataset, output_type)
        elif output_frequency == "per_window":
            return self.__proxy__.predict_per_window(dataset, output_type)
コード例 #23
0
    def evaluate(self, dataset, metric='auto'):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the session_id, target and features used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tkutl._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        probs = self.predict(dataset, output_type='probability_vector')
        classes = self.predict(dataset, output_type='class')

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(dataset[self.target], classes)
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(dataset[self.target], probs)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(dataset[self.target], classes)
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(dataset[self.target], classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(dataset[self.target], probs)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(dataset[self.target], probs)

        return ret
コード例 #24
0
def create(dataset, label=None, feature=None, model='resnet-50', verbose=True):
    """
    Create a :class:`ImageSimilarityModel` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    label : string
        Name of the SFrame column with row labels to be used as uuid's to
        identify the data. If 'label' is set to None, row numbers are used to
        identify reference dataset rows when the model is queried.

    feature : string
        Name of the column containing the input images. 'None' (the default)
        indicates that the SFrame has only one column of Image type and that
        will be used for similarity.

    model : string, optional
        Uses a pretrained model to bootstrap an image similarity model:

           - "resnet-50" : Uses a pretrained resnet model.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ImageSimilarityModel
        A trained :class:`ImageSimilarityModel` model.

    See Also
    --------
    ImageSimilarityModel

    Examples
    --------
    .. sourcecode:: python

        # Train an image similarity model
        >>> model = turicreate.image_similarity.create(data)

        # Query the model for similar images
        >>> similar_images = model.query(data)
        +-------------+-----------------+-------------------+------+
        | query_label | reference_label |      distance     | rank |
        +-------------+-----------------+-------------------+------+
        |      0      |        0        |        0.0        |  1   |
        |      0      |       519       |   12.5319706301   |  2   |
        |      0      |       1619      |   12.5563764596   |  3   |
        |      0      |       186       |   12.6132604915   |  4   |
        |      0      |       1809      |   12.9180964745   |  5   |
        |      1      |        1        | 2.02304872852e-06 |  1   |
        |      1      |       1579      |   11.4288186151   |  2   |
        |      1      |       1237      |   12.3764325949   |  3   |
        |      1      |        80       |   12.7264363676   |  4   |
        |      1      |        58       |   12.7675058558   |  5   |
        +-------------+-----------------+-------------------+------+
        [500 rows x 4 columns]
    """
    start_time = _time.time()

    # Check parameters
    _tkutl._check_categorical_option_type('model', model,
                                          _pre_trained_models.MODELS.keys())
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (label is not None) and (label not in dataset.column_names()):
        raise _ToolkitError("Row label column '%s' does not exist" % label)
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)

    # Set defaults
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    # Load pre-trained model & feature extractor
    ptModel = _pre_trained_models.MODELS[model]()
    feature_extractor = _image_feature_extractor.MXFeatureExtractor(ptModel)

    # Extract features
    extracted_features = _tc.SFrame({
        '__image_features__':
        feature_extractor.extract_features(dataset, feature, verbose=verbose),
    })

    # Train a similarity model using the extracted features
    if label is not None:
        extracted_features[label] = dataset[label]
    nn_model = _tc.nearest_neighbors.create(extracted_features,
                                            label=label,
                                            features=['__image_features__'],
                                            verbose=verbose)

    # Save the model
    state = {
        'similarity_model': nn_model,
        'model': model,
        'feature_extractor': feature_extractor,
        'input_image_shape': ptModel.input_image_shape,
        'label': label,
        'feature': feature,
        'num_features': 1,
        'num_examples': nn_model.num_examples,
        'training_time': _time.time() - start_time,
    }
    return ImageSimilarityModel(state)
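
A brief sketch of how the model returned above is typically used; `ref_data` is a placeholder SFrame with an image column and an 'id' column, and the `k` argument to `query` is assumed to follow the nearest-neighbors interface that this model wraps:

import turicreate

# Hypothetical reference data with an image column and an 'id' label column.
model = turicreate.image_similarity.create(ref_data, label='id')

# Return the 5 most similar reference images for each query image.
neighbors = model.query(ref_data, k=5)
print(neighbors.head())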
Code example #25
File: topic_model.py  Project: zoecarver/turicreate
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           num_burnin=5,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
    document is a mixture of a set of topics, where for each topic some words
    are more likely than others. This method learns such a topic model for the
    given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing
        probability of that word given each topic. When provided,
        these values are used to initialize the algorithm.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document.
        Provided value must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    num_iterations : int, optional
        The number of iterations to perform.

    num_burnin : int, optional
        The number of iterations to perform when inferring the topics for
        documents at prediction time.

    verbose : bool, optional
        When True, print most probable words for each topic while printing
        progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These words
        are not considered during learning.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided validation
        set is randomly split: the first portion is used to estimate which topic
        each document belongs to, and the second portion is used to estimate
        the model's performance at predicting the unseen words in the test data.

    method : {'auto', 'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *auto:* Currently resolves to collapsed Gibbs sampling.
        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`. While fitting is in progress, several
        metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second   |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    turicreate.SArray.dict_trim_by_keys, TopicModel.evaluate

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity of
      Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_.
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where
    each element represents a document in "bag of words" representation
    -- a dictionary with word keys and whose values are the number of times
    that word occurred in the document:

    >>> docs = turicreate.SArray('https://static.turi.com/datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = turicreate.topic_model.create(docs)

    It is also easy to create a new topic model from an old one -- whether
    it was created using Turi Create or another package.

    >>> m2 = turicreate.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use
    the `associations` argument. The following will ensure that topic 0
    has the most probability for each of the provided words:

    >>> from turicreate import SFrame
    >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'],
                               'topic': [0, 0, 0]})
    >>> m = turicreate.topic_model.create(docs,
                                        associations=associations)

    More advanced usage allows you to control aspects of the model and the
    learning method.

    >>> import turicreate as tc
    >>> m = tc.topic_model.create(docs,
                                  num_topics=20,       # number of topics
                                  num_iterations=10,   # algorithm parameters
                                  alpha=.01, beta=.1)  # hyperparameters

    To evaluate the model's ability to generalize, we can create a train/test
    split where a portion of the words in each document are held out from
    training.

    >>> train, test = tc.text_analytics.random_split(docs, .8)
    >>> m = tc.topic_model.create(train)
    >>> results = m.evaluate(test)
    >>> print(results['perplexity'])

    """
    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check they are in the proper format
    if associations is None:
        associations = _turicreate.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _turicreate.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word column\
             and a topic column."
        assert associations['word'].dtype == str, \
            "Words must be strings."
        assert associations['topic'].dtype == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)  # Must be a single column
        if isinstance(validation_set, _turicreate.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {
        'model_name': model_name,
        'data': dataset,
        'num_topics': num_topics,
        'num_iterations': num_iterations,
        'print_interval': print_interval,
        'alpha': alpha,
        'beta': beta,
        'num_burnin': num_burnin,
        'associations': associations
    }

    # Initialize the model with basic parameters
    response = _turicreate.extensions._text.topicmodel_init(opts)
    m = TopicModel(response['model'])

    # If initial_topics provided, load it into the model
    if isinstance(initial_topics, _turicreate.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) ==              \
               set(initial_topics.column_names()),                        \
            "The provided initial_topics does not have the proper format, \
             e.g. wrong column names."

        observed_topics = initial_topics['topic_probabilities'].apply(
            lambda x: len(x))
        assert all(observed_topics == num_topics),                        \
            "Provided num_topics value does not match the number of provided initial_topics."

        # Rough estimate of total number of words
        weight = len(dataset) * 1000

        opts = {
            'model': m.__proxy__,
            'topics': initial_topics['topic_probabilities'],
            'vocabulary': initial_topics['vocabulary'],
            'weight': weight
        }
        response = _turicreate.extensions._text.topicmodel_set_topics(opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {
        'model': m.__proxy__,
        'data': dataset,
        'verbose': verbose,
        'validation_train': validation_train,
        'validation_test': validation_test
    }

    response = _turicreate.extensions._text.topicmodel_train(opts)
    m = TopicModel(response['model'])

    return m
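
A rough usage sketch for the function above, using only the methods referenced in its docstring (`get_topics` and `predict`); `docs` is a placeholder SArray of bag-of-words dictionaries:

import turicreate

# Hypothetical corpus: an SArray whose elements are {word: count} dictionaries.
m = turicreate.topic_model.create(docs, num_topics=20, num_iterations=10)

topics = m.get_topics()        # per-topic word probabilities
assignments = m.predict(docs)  # most likely topic id for each document
print(topics.head())
print(assignments.head())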
Code example #26
def create(dataset,
           target,
           feature=None,
           model='resnet-50',
           max_iterations=10,
           verbose=True,
           seed=None):
    """
    Create a :class:`ImageClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier

           - "resnet-50" : Uses a pretrained resnet model.
           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    max_iterations : float, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check parameters
    _tkutl._check_categorical_option_type('model', model,
                                          _pre_trained_models.MODELS.keys())
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    # Load pre-trained model & feature extractor
    ptModel = _pre_trained_models.MODELS[model]()
    feature_extractor = _image_feature_extractor.MXFeatureExtractor(ptModel)

    # Extract features
    extracted_features = _tc.SFrame({
        target:
        dataset[target],
        '__image_features__':
        feature_extractor.extract_features(dataset, feature, verbose=verbose),
    })

    # Train a classifier using the extracted features
    extracted_features[target] = dataset[target]
    lr_model = _tc.logistic_classifier.create(extracted_features,
                                              features=['__image_features__'],
                                              target=target,
                                              max_iterations=max_iterations,
                                              seed=seed,
                                              verbose=verbose)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': ptModel.input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
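
The body above is a transfer-learning pipeline: deep features are extracted once with the pretrained network, and a logistic classifier is then fit on them. A minimal usage sketch, assuming an SFrame `data` with one image column and a string target column named 'label' (placeholder names):

import turicreate

# Hypothetical dataset with one image column and a 'label' target column.
model = turicreate.image_classifier.create(data,
                                           target='label',
                                           model='squeezenet_v1.1',
                                           max_iterations=20)

predictions = model.predict(data)      # predicted class labels
top3 = model.predict_topk(data, k=3)   # top-3 classes with probabilities
print(model.evaluate(data)['accuracy'])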
Code example #27
def create(dataset,
           label=None,
           feature=None,
           model="resnet-50",
           verbose=True,
           batch_size=64):
    """
    Create a :class:`ImageSimilarityModel` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    label : string
        Name of the SFrame column with row labels to be used as uuid's to
        identify the data. If 'label' is set to None, row numbers are used to
        identify reference dataset rows when the model is queried.

    feature : string
        Name of the column containing the input images. 'None' (the default)
        indicates that the SFrame has only one column of Image type and that will
        be used for similarity.

    model : string, optional
        Uses a pretrained model to bootstrap an image similarity model:

           - "resnet-50" : Uses a pretrained resnet model.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    verbose : bool, optional
        If True, print progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageSimilarityModel
        A trained :class:`ImageSimilarityModel` model.

    See Also
    --------
    ImageSimilarityModel

    Examples
    --------
    .. sourcecode:: python

        # Train an image similarity model
        >>> model = turicreate.image_similarity.create(data)

        # Query the model for similar images
        >>> similar_images = model.query(data)
        +-------------+-----------------+-------------------+------+
        | query_label | reference_label |      distance     | rank |
        +-------------+-----------------+-------------------+------+
        |      0      |        0        |        0.0        |  1   |
        |      0      |       519       |   12.5319706301   |  2   |
        |      0      |       1619      |   12.5563764596   |  3   |
        |      0      |       186       |   12.6132604915   |  4   |
        |      0      |       1809      |   12.9180964745   |  5   |
        |      1      |        1        | 2.02304872852e-06 |  1   |
        |      1      |       1579      |   11.4288186151   |  2   |
        |      1      |       1237      |   12.3764325949   |  3   |
        |      1      |        80       |   12.7264363676   |  4   |
        |      1      |        58       |   12.7675058558   |  5   |
        +-------------+-----------------+-------------------+------+
        [500 rows x 4 columns]
    """
    start_time = _time.time()
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError("'dataset' must be of type SFrame.")

    # Check parameters
    allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append("VisionFeaturePrint_Scene")

        # Also, to make sure existing code doesn't break, replace incorrect name
        # with the correct name version
        if model == "VisionFeaturePrint_Screen":
            print(
                "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene.  VisionFeaturePrint_Screen will be removed in future releases."
            )
            model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type("model", model, allowed_models)
    if len(dataset) == 0:
        raise _ToolkitError("Unable to train on empty dataset")
    if (label is not None) and (label not in dataset.column_names()):
        raise _ToolkitError("Row label column '%s' does not exist" % label)
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Set defaults
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)

    # Extract features
    extracted_features = _tc.SFrame({
        "__image_features__":
        feature_extractor.extract_features(dataset,
                                           feature,
                                           verbose=verbose,
                                           batch_size=batch_size),
    })

    # Train a similarity model using the extracted features
    if label is not None:
        extracted_features[label] = dataset[label]
    nn_model = _tc.nearest_neighbors.create(
        extracted_features,
        label=label,
        features=["__image_features__"],
        verbose=verbose,
    )

    # set input image shape
    if model in _pre_trained_models.IMAGE_MODELS:
        input_image_shape = _pre_trained_models.IMAGE_MODELS[
            model].input_image_shape
    else:  # model == VisionFeaturePrint_Scene
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        "similarity_model": nn_model,
        "model": model,
        "feature_extractor": feature_extractor,
        "input_image_shape": input_image_shape,
        "label": label,
        "feature": feature,
        "num_features": 1,
        "num_examples": nn_model.num_examples,
        "training_time": _time.time() - start_time,
    }
    return ImageSimilarityModel(state)
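
A sketch of the options added in this variant; 'VisionFeaturePrint_Scene' is only accepted on macOS 10.14+ per the docstring, and `data` / 'id' are placeholder names (the `k` argument to `query` is assumed from the nearest-neighbors backend):

import turicreate

# Hypothetical data with an image column and an 'id' label column.
model = turicreate.image_similarity.create(data,
                                           label='id',
                                           model='VisionFeaturePrint_Scene',
                                           batch_size=32)  # lower this if memory is tight

similar = model.query(data, k=10)
print(similar.head())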
Code example #28
def create(dataset, target, feature = None, model = 'resnet-50',
           validation_set='auto', max_iterations = 10, verbose = True,
           seed = None, batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor.
                                           Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    max_iterations : float, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10,14):
        allowed_models.append('VisionFeaturePrint_Screen')
    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if(batch_size < 1):
        raise ValueError("'batch_size' must be greater than or equal to 1")

    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(model)

    # Extract features
    extracted_features = _tc.SFrame({
        target: dataset[target],
        '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size),
        })
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _tc.SFrame({
            target: validation_set[target],
            '__image_features__': feature_extractor.extract_features(validation_set, feature, verbose=verbose, batch_size=batch_size),
        })
    else:
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    extracted_features[target] = dataset[target]
    lr_model = _tc.logistic_classifier.create(extracted_features,
                                              features=['__image_features__'],
                                              target=target,
                                              max_iterations=max_iterations,
                                              validation_set=extracted_features_validation,
                                              seed=seed,
                                              verbose=verbose)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:    # model == VisionFeaturePrint_Screen
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
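
Since this variant accepts a validation_set, a common pattern (a sketch with placeholder names) is to split the data and let training report validation accuracy as it runs:

import turicreate

# Hypothetical dataset with an image column and a 'label' target column.
train, valid = data.random_split(0.8, seed=42)

model = turicreate.image_classifier.create(train,
                                           target='label',
                                           validation_set=valid,
                                           max_iterations=25,
                                           batch_size=32)
print(model.evaluate(valid)['accuracy'])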
Code example #29
def create(dataset, annotations=None, feature=None, model='darknet-yolo',
           classes=None, max_iterations=0, verbose=True, **kwargs):
    """
    Create a :class:`ObjectDetector` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The columns named by the ``feature`` and ``annotations``
        parameters will be extracted for training the detector.

    annotations : string
        Name of the column containing the object detection annotations.
        This column should be a list of dictionaries, with each dictionary
        representing a bounding box of an object instance. Here is an example
        of the annotations for a single image with two object instances::

            [{'label': 'dog',
              'type': 'rectangle',
              'coordinates': {'x': 223, 'y': 198,
                              'width': 130, 'height': 230}},
             {'label': 'cat',
              'type': 'rectangle',
              'coordinates': {'x': 40, 'y': 73,
                              'width': 80, 'height': 123}}]

        The value for `x` is the horizontal center of the box paired with
        `width` and `y` is the vertical center of the box paired with `height`.
        'None' (the default) indicates the only list column in `dataset` should
        be used for the annotations.

    feature : string
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Object detection model to use:

           - "darknet-yolo" : Fast and medium-sized model

    classes : list, optional
        List of strings containing the names of the classes of objects.
        Inferred from the data if not provided.

    max_iterations : int
        The number of training iterations. If 0, then it will be determined
        automatically based on the amount of data you provide.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ObjectDetector
        A trained :class:`ObjectDetector` model.

    See Also
    --------
    ObjectDetector

    Examples
    --------
    .. sourcecode:: python

        # Train an object detector model
        >>> model = turicreate.object_detector.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _raise_error_if_not_sframe(dataset, "dataset")
    from ._mx_detector import YOLOLoss as _YOLOLoss
    from ._model import tiny_darknet as _tiny_darknet
    from ._sframe_loader import SFrameDetectionIter as _SFrameDetectionIter
    from ._manual_scheduler import ManualScheduler as _ManualScheduler
    import mxnet as _mx
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')

    _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)
    start_time = _time.time()

    supported_detectors = ['darknet-yolo']

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)
        if verbose:
            print("Using '%s' as feature column" % feature)
    if annotations is None:
        annotations = _tkutl._find_only_column_of_type(dataset,
                                                       target_type=list,
                                                       type_name='list',
                                                       col_name='annotations')
        if verbose:
            print("Using '%s' as annotations column" % annotations)

    _raise_error_if_not_detection_sframe(dataset, feature, annotations,
                                         require_annotations=True)

    _tkutl._check_categorical_option_type('model', model,
            supported_detectors)

    base_model = model.split('-', 1)[0]
    ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]()

    params = {
        'anchors': [
            (1.0, 2.0), (1.0, 1.0), (2.0, 1.0),
            (2.0, 4.0), (2.0, 2.0), (4.0, 2.0),
            (4.0, 8.0), (4.0, 4.0), (8.0, 4.0),
            (8.0, 16.0), (8.0, 8.0), (16.0, 8.0),
            (16.0, 32.0), (16.0, 16.0), (32.0, 16.0),
        ],
        'grid_shape': [13, 13],
        'batch_size': 32,
        'aug_resize': 0,
        'aug_rand_crop': 0.9,
        'aug_rand_pad': 0.9,
        'aug_rand_gray': 0.0,
        'aug_aspect_ratio': 1.25,
        'aug_hue': 0.05,
        'aug_brightness': 0.05,
        'aug_saturation': 0.05,
        'aug_contrast': 0.05,
        'aug_horizontal_flip': True,
        'aug_min_object_covered': 0,
        'aug_min_eject_coverage': 0.5,
        'aug_area_range': (.15, 2),
        'aug_pca_noise': 0.0,
        'aug_max_attempts': 20,
        'aug_inter_method': 2,
        'lmb_coord_xy': 10.0,
        'lmb_coord_wh': 10.0,
        'lmb_obj': 100.0,
        'lmb_noobj': 5.0,
        'lmb_class': 2.0,
        'non_maximum_suppression_threshold': 0.45,
        'rescore': True,
        'clip_gradients': 0.025,
        'learning_rate': 1.0e-3,
        'shuffle': True,
    }

    if '_advanced_parameters' in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs['_advanced_parameters'].keys())
        set_keys = set(params.keys()) 
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported))

        params.update(kwargs['_advanced_parameters'])

    anchors = params['anchors']
    num_anchors = len(anchors)

    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=params['batch_size'])
    batch_size_each = params['batch_size'] // max(num_gpus, 1)
    # Note, this may slightly alter the batch size to fit evenly on the GPUs
    batch_size = max(num_gpus, 1) * batch_size_each

    grid_shape = params['grid_shape']
    input_image_shape = (3,
                         grid_shape[0] * ref_model.spatial_reduction,
                         grid_shape[1] * ref_model.spatial_reduction)

    try:
        instances = (dataset.stack(annotations, new_column_name='_bbox', drop_na=True)
                            .unpack('_bbox', limit=['label']))
    except (TypeError, RuntimeError):
        # If this fails, the annotation format is invalid at the coarsest level
        raise _ToolkitError("Annotations format is invalid. Must be a list of "
                            "dictionaries containing 'label' and 'coordinates'.")
    num_images = len(dataset)
    num_instances = len(instances)
    if classes is None:
        classes = instances['_bbox.label'].unique()
    classes = sorted(classes)

    # Make a class-to-index look-up table
    class_to_index = {name: index for index, name in enumerate(classes)}
    num_classes = len(classes)

    # Create data loader
    loader = _SFrameDetectionIter(dataset,
                                  batch_size=batch_size,
                                  input_shape=input_image_shape[1:],
                                  output_shape=grid_shape,
                                  anchors=anchors,
                                  class_to_index=class_to_index,
                                  aug_params=params,
                                  shuffle=params['shuffle'],
                                  loader_type='augmented',
                                  feature_column=feature,
                                  annotations_column=annotations)

    # Predictions per anchor box: x/y + w/h + object confidence + class probs
    preds_per_box = 5 + num_classes
    output_size = preds_per_box * num_anchors
    ymap_shape = (batch_size_each,) + tuple(grid_shape) + (num_anchors, preds_per_box)

    net = _tiny_darknet(output_size=output_size)

    loss = _YOLOLoss(input_shape=input_image_shape[1:],
                     output_shape=grid_shape,
                     batch_size=batch_size_each,
                     num_classes=num_classes,
                     anchors=anchors,
                     parameters=params)

    base_lr = params['learning_rate']
    if max_iterations == 0:
        # Set number of iterations through a heuristic
        num_iterations_raw = 5000 * _np.sqrt(num_instances) / batch_size
        num_iterations = 1000 * max(1, int(round(num_iterations_raw / 1000)))
    else:
        num_iterations = max_iterations

    steps = [num_iterations // 2, 3 * num_iterations // 4, num_iterations]
    steps_and_factors = [(step, 10**(-i)) for i, step in enumerate(steps)]

    steps, factors = zip(*steps_and_factors)
    lr_scheduler = _ManualScheduler(step=steps, factor=factors)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)

    net_params = net.collect_params()
    net_params.initialize(_mx.init.Xavier(), ctx=ctx)
    net_params['conv7_weight'].initialize(_mx.init.Xavier(factor_type='avg'), ctx=ctx, force_reinit=True)
    net_params['conv8_weight'].initialize(_mx.init.Uniform(0.00005), ctx=ctx, force_reinit=True)
    # Initialize object confidence low, preventing an unnecessary adjustment
    # period toward conservative estimates
    bias = _np.zeros(output_size, dtype=_np.float32)
    bias[4::preds_per_box] -= 6
    from ._mx_detector import ConstantArray
    net_params['conv8_bias'].initialize(ConstantArray(bias), ctx, force_reinit=True)

    # Take a subset and then load the rest of the parameters. It is possible to
    # do allow_missing=True directly on net_params. However, this will more
    # easily hide bugs caused by names getting out of sync.
    ref_model.available_parameters_subset(net_params).load(ref_model.model_path, ctx)

    options = {'learning_rate': base_lr, 'lr_scheduler': lr_scheduler,
               'momentum': 0.9, 'wd': 0.00005, 'rescale_grad': 1.0}
    clip_grad = params.get('clip_gradients')
    if clip_grad:
        options['clip_gradient'] = clip_grad

    trainer = _mx.gluon.Trainer(net.collect_params(), 'sgd', options)

    iteration = 0
    smoothed_loss = None
    last_time = 0
    while iteration < num_iterations:
        loader.reset()
        for batch in loader:
            data = _mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = _mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)

            Ls = []
            with _mx.autograd.record():
                for x, y in zip(data, label):
                    z = net(x)
                    z0 = _mx.nd.transpose(z, [0, 2, 3, 1]).reshape(ymap_shape)
                    L = loss(z0, y)
                    Ls.append(L)
                for L in Ls:
                    L.backward()

            cur_loss = _np.mean([L.asnumpy()[0] for L in Ls])
            if smoothed_loss is None:
                smoothed_loss = cur_loss
            else:
                smoothed_loss = 0.9 * smoothed_loss + 0.1 * cur_loss
            trainer.step(1)
            iteration += 1
            cur_time = _time.time()
            if verbose and cur_time > last_time + 10:
                print('{now:%Y-%m-%d %H:%M:%S}  Training {cur_iter:{width}d}/{num_iterations:{width}d}  Loss {loss:6.3f}'.format(
                    now=_datetime.now(), cur_iter=iteration, num_iterations=num_iterations,
                    loss=smoothed_loss, width=len(str(num_iterations))))
                last_time = cur_time
            if iteration == num_iterations:
                break

    training_time = _time.time() - start_time

    # Save the model
    state = {
        '_model': net,
        '_class_to_index': class_to_index,
        '_training_time_as_string': _seconds_as_string(training_time),
        '_grid_shape': grid_shape,
        'anchors': anchors,
        'model': model,
        'classes': classes,
        'batch_size': batch_size,
        'input_image_shape': input_image_shape,
        'feature': feature,
        'non_maximum_suppression_threshold': params['non_maximum_suppression_threshold'],
        'annotations': annotations,
        'num_classes': num_classes,
        'num_examples': num_images,
        'num_bounding_boxes': num_instances,
        'training_time': training_time,
        'training_epochs': loader.cur_epoch,
        'training_iterations': iteration,
        'max_iterations': max_iterations,
        'training_loss': smoothed_loss,
    }
    return ObjectDetector(state)
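
The network's output size above follows from preds_per_box = 5 + num_classes (x, y, width, height, objectness, plus one score per class), multiplied by the number of anchor boxes. A small worked sketch of that arithmetic with illustrative values (2 classes, the 15 default anchors, a 13x13 grid):

# Illustrative values only; they mirror the defaults used in the body above.
num_classes = 2
num_anchors = 15
grid_shape = (13, 13)

preds_per_box = 5 + num_classes            # 4 box coords + objectness + 2 class scores = 7
output_size = preds_per_box * num_anchors  # 7 * 15 = 105 output channels
total_predictions = output_size * grid_shape[0] * grid_shape[1]
print(preds_per_box, output_size, total_predictions)   # 7 105 17745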
Code example #30
    def predict_topk(self,
                     dataset,
                     output_type="probability",
                     k=3,
                     missing_value_action="auto"):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `id`,
        `class`, and a third column named `probability`, `margin`, or `rank`,
        depending on the ``output_type`` parameter. The input dataset must have
        the same columns as the data used to train the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | id     | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _check_categorical_option_type("output_type", output_type,
                                       ["rank", "margin", "probability"])
        _check_categorical_option_type("missing_value_action",
                                       missing_value_action,
                                       ["auto", "impute", "error"])
        if missing_value_action == "auto":
            missing_value_action = "impute"

        # Low latency path
        if isinstance(dataset, list):
            return self.__proxy__.fast_predict_topk(dataset,
                                                    missing_value_action,
                                                    output_type, k)
        if isinstance(dataset, dict):
            return self.__proxy__.fast_predict_topk([dataset],
                                                    missing_value_action,
                                                    output_type, k)
        # Fast path
        _raise_error_if_not_sframe(dataset, "dataset")
        if missing_value_action == "auto":
            missing_value_action = _sl.select_default_missing_value_policy(
                self, "predict")
        return self.__proxy__.predict_topk(dataset, missing_value_action,
                                           output_type, k)
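
A usage sketch for the method above; `m` and `validation_data` are placeholder names, and the dict form exercises the low-latency path shown in the body (the feature names are illustrative only):

# Hypothetical trained classifier `m` and SFrame `validation_data`.
top3 = m.predict_topk(validation_data, output_type='probability', k=3)
ranks = m.predict_topk(validation_data, output_type='rank', k=3)

# Low-latency path: a single row passed as a dict whose keys match the training columns.
single = m.predict_topk({'feature_1': 2.5, 'feature_2': 'a'}, k=3)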