def extract_features(self, dataset, missing_value_action='auto'):
    """
    Extract, for every example in ``dataset``, the leaf index reached in
    each tree of the model and return those indices as a feature vector.

    For multiclass classification each leaf index contains #num_class
    numbers. The resulting vectors can be fed to another supervised
    learning model such as a
    :py:class:`~turicreate.logistic_classifier.LogisticClassifier` or a
    :py:class:`~turicreate.svm_classifier.SVMClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not
        require a target column. Additional columns are ignored.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Choose a model dependent missing value policy (resolved
          through ``select_default_missing_value_policy``).
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
          missing value.
        - 'error': Do not proceed with prediction and terminate with an
          error message.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = turicreate.SFrame(
    ...     'https://static.turi.com/datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> data['regression_tree_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['classification_tree_features'] = model.extract_features(data)
    """
    # Validate the input container before touching the policy or the proxy.
    _raise_error_if_not_sframe(dataset, "dataset")

    # Only the 'auto' placeholder is translated; any explicit choice is
    # forwarded to the proxy untouched.
    policy = missing_value_action
    if policy == 'auto':
        policy = select_default_missing_value_policy(self, 'extract_features')

    return self.__proxy__.extract_features(dataset, policy)
def predict_topk(self, dataset, output_type="probability", k=3,
                 missing_value_action='auto'):
    """
    Return the top-k predictions for ``dataset`` using the trained model.

    The result is an SFrame with three columns: `id`, `class`, and one of
    `probability`, `margin`, or `rank`, selected by ``output_type``.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during
        training. If the target column exists in ``dataset`` it will be
        ignored while making predictions. A ``list`` or a single ``dict``
        is also accepted and is routed to the low latency prediction
        path.

    output_type : {'probability', 'rank', 'margin'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the
          prediction.
        - `margin`     : Margin associated with each label in the
          prediction.

    k : int, optional
        Number of classes to return for each input example.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Use the model's default policy, as returned by
          ``_sl.select_default_missing_value_policy(self, 'predict')``.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'error': Do not proceed with evaluation and terminate with an
          error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    |   id   | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    _check_categorical_option_type(
        'output_type', output_type, ['rank', 'margin', 'probability'])

    # Translate the 'auto' placeholder into the model's concrete policy.
    policy = (
        _sl.select_default_missing_value_policy(self, 'predict')
        if missing_value_action == 'auto'
        else missing_value_action
    )

    # Low latency path: in-memory rows skip the toolkit dispatcher. A lone
    # dict is treated as a one-row batch.
    if isinstance(dataset, (list, dict)):
        rows = dataset if isinstance(dataset, list) else [dataset]
        return _turicreate.extensions._fast_predict_topk(
            self.__proxy__, rows, output_type, policy, k)

    # Everything else goes through the generic toolkit entry point.
    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': policy,
    }
    response = _turicreate.toolkits._main.run(
        'supervised_learning_predict_topk', request)
    return _map_unity_proxy_to_object(response['predicted'])
def predict_topk(self, dataset, output_type="probability", k=3,
                 missing_value_action="auto"):
    """
    Return top-k predictions for the ``dataset``, using the trained model.

    Predictions are returned as an SFrame with three columns: `id`,
    `class`, and one of `probability`, `margin`, or `rank`, depending on
    the ``output_type`` parameter.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during
        training. If the target column exists in ``dataset`` it will be
        ignored while making predictions. A ``list`` or a single ``dict``
        is also accepted and is sent to the proxy's
        ``fast_predict_topk`` low latency path.

    output_type : {'probability', 'rank', 'margin'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the
          prediction.
        - `margin`     : Margin associated with each label in the
          prediction.

    k : int, optional
        Number of classes to return for each input example.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. One of:

        - 'auto': Default to 'impute'.
        - 'impute': Proceed with evaluation by filling in the missing
          values with the mean of the training data. Missing values are
          also imputed if an entire column of data is missing during
          evaluation.
        - 'error': Do not proceed with evaluation and terminate with an
          error message.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    |   id   | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    # Both categorical options are checked up front, before any dispatch.
    _check_categorical_option_type("output_type", output_type,
                                   ["rank", "margin", "probability"])
    _check_categorical_option_type("missing_value_action",
                                   missing_value_action,
                                   ["auto", "impute", "error"])

    # 'auto' always means 'impute' here. Because it is resolved at this
    # point, no later code can observe the value "auto"; the former second
    # "auto" check before the SFrame call was unreachable and was removed.
    if missing_value_action == "auto":
        missing_value_action = "impute"

    # Low latency path: in-memory rows; a lone dict is a one-row batch.
    if isinstance(dataset, list):
        return self.__proxy__.fast_predict_topk(
            dataset, missing_value_action, output_type, k)
    if isinstance(dataset, dict):
        return self.__proxy__.fast_predict_topk(
            [dataset], missing_value_action, output_type, k)

    # Batch path: anything that is not a list or dict must be an SFrame.
    _raise_error_if_not_sframe(dataset, "dataset")
    return self.__proxy__.predict_topk(
        dataset, missing_value_action, output_type, k)