def _raise_error_if_not_drawing_classifier_input_sframe( dataset, feature, target): """ Performs some sanity checks on the SFrame provided as input to `turicreate.drawing_classifier.create` and raises a ToolkitError if something in the dataset is missing or wrong. """ from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe _raise_error_if_not_sframe(dataset) if feature not in dataset.column_names(): raise _ToolkitError("Feature column '%s' does not exist" % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) if dataset[feature].dtype != _tc.Image and dataset[feature].dtype != list: raise _ToolkitError( "Feature column must contain images" + " or stroke-based drawings encoded as lists of strokes" + " where each stroke is a list of points and" + " each point is stored as a dictionary") if dataset[target].dtype != int and dataset[target].dtype != str: raise _ToolkitError( "Target column contains " + str(dataset[target].dtype) + " but it must contain strings or integers to represent" + " labels for drawings.") if len(dataset) == 0: raise _ToolkitError("Input Dataset is empty!")
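# Illustrative sketch (not from the original source) of an SFrame that would pass
# the checks above; the column names 'drawing' and 'label' are assumptions. Each
# drawing is a list of strokes, and each stroke a list of {'x': ..., 'y': ...} dicts.
#
# import turicreate as tc
# drawing = [[{'x': 0.0, 'y': 0.0}, {'x': 10.0, 'y': 12.5}],
#            [{'x': 3.0, 'y': 4.0}, {'x': 5.0, 'y': 9.0}]]
# dataset = tc.SFrame({'drawing': [drawing], 'label': [0]})
# _raise_error_if_not_drawing_classifier_input_sframe(dataset, 'drawing', 'label')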
def fit(self, data): """ Fit a transformer using the SFrame `data`. Parameters ---------- data : SFrame The data used to fit the transformer. Returns ------- self (A fitted version of the object) See Also -------- transform, fit_transform Examples -------- .. sourcecode:: python {examples} """ _raise_error_if_not_sframe(data, "data") self.__proxy__.fit(data) return self
def _raise_error_if_not_training_sframe(dataset, context_column):
    _raise_error_if_not_sframe(dataset, 'dataset')
    if context_column not in dataset.column_names():
        raise _ToolkitError("Context Image column '%s' does not exist"
                            % context_column)
    if dataset[context_column].dtype != _tc.Image:
        raise _ToolkitError("Context Image column must contain images")
def test_stylize_success(self): sf = self.content_sf[0:1] model = self.model styles = self._get_valid_style_cases() for style in styles: stylized_out = model.stylize(sf, style=style) feat_name = 'stylized_{}'.format(self.content_feature) self.assertEqual(set(stylized_out.column_names()), set(["row_id", "style", feat_name])) # Check the structure of the output _raise_error_if_not_sframe(stylized_out) if style is None: num_styles = self.num_styles elif isinstance(style, list): num_styles = len(style) else: num_styles = 1 self.assertEqual(len(stylized_out), len(sf) * num_styles) # Check if input and output image have the same shape input_size = (sf[self.content_feature][0].width, sf[self.content_feature][0].height) output_size = (stylized_out[0][feat_name].width, stylized_out[0][feat_name].height) self.assertEqual(input_size, output_size)
def create_regression_with_model_selector(dataset, target, model_selector,
                                          features=None, validation_set='auto',
                                          verbose=True):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`. This is a
    generic function that allows you to create any model that implements
    SupervisedLearningModel. This function is normally not called directly;
    use a specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_selector: function
        Function that takes a feature-only SFrame and returns the name of the
        model to create.

    features : list[string], optional
        List of feature column names used for training the model.

    validation_set : SFrame | string, optional
        Validation data, passed through to the selected model's create
        function. The default, 'auto', lets that function choose.

    verbose : boolean
        Whether to print progress messages during training.
    """

    # Error checking
    _raise_error_if_not_sframe(dataset, "training dataset")
    if features is None:
        features = dataset.column_names()
        if target in features:
            features.remove(target)
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for x in features:
        if not isinstance(x, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str" % x)

    # Sample the data
    features_sframe = _toolkits_select_columns(dataset, features)
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Run the model selector.
    selected_model_name = model_selector(features_sframe)
    model = create_selected(selected_model_name, dataset, target, features,
                            validation_set, verbose)

    return model
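# Illustrative sketch (an assumption, not toolkit code) of what a `model_selector`
# looks like: a callable that inspects the feature-only SFrame and returns the
# name of the model to build. The threshold below is made up for demonstration,
# and the returned names mirror the regression toolkits but the exact strings
# expected by `create_selected` may differ.

def _example_regression_selector(features_sframe):
    # Prefer a tree ensemble for wide datasets, a linear model otherwise.
    if features_sframe.num_columns() > 50:
        return 'boosted_trees_regression'
    return 'linear_regression'

# Hypothetical call, assuming `sf` holds training data with a 'price' target:
# model = create_regression_with_model_selector(sf, 'price',
#                                               _example_regression_selector)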
def fit_transform(self, data): """ First fit a transformer using the SFrame `data` and then return a transformed version of `data`. Parameters ---------- data : SFrame The data used to fit the transformer. The same data is then also transformed. Returns ------- Transformed SFrame. See Also -------- fit, transform Notes ------ - Fit transform modifies self. Examples -------- .. sourcecode:: python {examples} """ _raise_error_if_not_sframe(data, "data") return self.__proxy__.fit_transform(data)
def create(dataset, transformers):
    """
    Create a Transformer object to transform data for feature engineering.

    Parameters
    ----------
    dataset : SFrame
        The dataset to use for training the model.

    transformers: Transformer | list[Transformer]
        A Transformer or a list of Transformers.

    See Also
    --------
    turicreate.toolkits.feature_engineering._feature_engineering._TransformerBase

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = turicreate.SFrame({'a': [1,2,3], 'b' : [2,3,4]})
        >>> from turicreate.feature_engineering import FeatureHasher, \
                                        QuadraticFeatures, OneHotEncoder

        # Create a single transformer.
        >>> encoder = turicreate.feature_engineering.create(sf,
                                OneHotEncoder(max_categories = 10))

        # Create a chain of transformers.
        >>> chain = turicreate.feature_engineering.create(sf, [
                                QuadraticFeatures(),
                                FeatureHasher()
                            ])

        # Create a chain of transformers with names for each of the steps.
        >>> chain = turicreate.feature_engineering.create(sf, [
                                ('quadratic', QuadraticFeatures()),
                                ('hasher', FeatureHasher())
                            ])
    """
    err_msg = "The parameter 'transformers' must be a valid Transformer object."
    cls = transformers.__class__

    _raise_error_if_not_sframe(dataset, "dataset")

    # List of transformers.
    if cls == list:
        transformers = TransformerChain(transformers)
    # Transformer.
    else:
        if not issubclass(cls, TransformerBase):
            raise TypeError(err_msg)

    # Fit and return
    transformers.fit(dataset)
    return transformers
def transform(self, data): """ Transform the SFrame `data` using a fitted model. Parameters ---------- data : SFrame The data to be transformed. Returns ------- A transformed SFrame. See Also -------- fit, fit_transform Examples -------- .. sourcecode:: python {examples} """ _raise_error_if_not_sframe(data, "data") return self.__proxy__.transform(data)
def test_get_styles_success(self):
    style = [0, 1, 2]
    model = self.model
    model_styles = model.get_styles(style=style)

    # Check the structure of the output
    _raise_error_if_not_sframe(model_styles)
    self.assertEqual(len(model_styles), len(style))
def predict(self, dataset, missing_value_action='auto', output_type='', options={}, **kwargs): """ Return predictions for ``dataset``, using the trained supervised_learning model. Predictions are generated as class labels (0 or 1). Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose a model dependent missing value policy. - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'none': Treat missing value as is. Model must be able to handle missing value. - 'error' : Do not proceed with prediction and terminate with an error message. output_type : str, optional output type that maybe needed by some of the toolkits options : dict additional options to be passed in to prediction kwargs : dict additional options to be passed into prediction Returns ------- out : SArray An SArray with model predictions. """ if missing_value_action == 'auto': missing_value_action = select_default_missing_value_policy(self, 'predict') # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict( dataset, missing_value_action, output_type) if isinstance(dataset, dict): return self.__proxy__.fast_predict( [dataset], missing_value_action, output_type) # Batch predictions path else: _raise_error_if_not_sframe(dataset, "dataset") return self.__proxy__.predict( dataset, missing_value_action, output_type)
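# Usage sketch (hypothetical feature names; `model` is any trained supervised
# learning model). A dict or a list of dicts takes the low-latency fast_predict
# path, while an SFrame goes through batch prediction:
#
# >>> model.predict({'sqft': 1200, 'bedrooms': 3})                   # one row
# >>> model.predict([{'sqft': 1200, 'bedrooms': 3},
# ...                {'sqft': 1800, 'bedrooms': 4}])                 # small batch
# >>> model.predict(test_data, missing_value_action='impute')        # SFrame path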
def test_single_image(self):
    model = self.model
    single_image = self.sf[0][self.feature]

    prediction = model.predict(single_image)
    self.assertTrue(isinstance(prediction, (int, str)))

    prediction = model.predict_topk(single_image)
    _raise_error_if_not_sframe(prediction)

    prediction = model.classify(single_image)
    self.assertTrue(isinstance(prediction, dict)
                    and 'class' in prediction
                    and 'probability' in prediction)
def test_sarray(self):
    model = self.model
    sa = data[self.feature]

    predictions = model.predict(sa)
    _raise_error_if_not_sarray(predictions)

    predictions = model.predict_topk(sa, k=2)
    _raise_error_if_not_sframe(predictions)

    predictions = model.classify(sa)
    _raise_error_if_not_sframe(predictions)
def test_sarray(self):
    model = self.model
    data = self.sf[self.feature]

    predictions = model.predict(data)
    _raise_error_if_not_sarray(predictions)

    predictions = model.predict_topk(data)
    _raise_error_if_not_sframe(predictions)

    predictions = model.classify(data)
    _raise_error_if_not_sframe(predictions)
def evaluate(self, dataset, metric="auto", missing_value_action='auto', options={}, **kwargs): """ Evaluate the model by making predictions of target values and comparing these to actual values. Parameters ---------- dataset : SFrame Dataset in the same format used for training. The columns names and types of the dataset must be the same as that used in training. metric : str, list[str] Evaluation metric(s) to be computed. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose a model dependent missing value policy. - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'none': Treat missing value as is. Model must be able to handle missing value. - 'error' : Do not proceed with prediction and terminate with an error message. options : dict additional options to be passed in to prediction kwargs : dict additional options to be passed into prediction """ if missing_value_action == 'auto': missing_value_action = select_default_missing_value_policy( self, 'evaluate') _raise_error_if_not_sframe(dataset, "dataset") options = options.copy() options.update(kwargs) options.update({ 'model': self.__proxy__, 'dataset': dataset, 'model_name': self.__name__, 'missing_value_action': missing_value_action, 'metric': metric }) results = _turicreate.toolkits._main.run( 'supervised_learning_evaluate', options) return _map_unity_proxy_to_object(results)
def extract_features(self, dataset, missing_value_action='auto'): """ For each example in the dataset, extract the leaf indices of each tree as features. For multiclass classification, each leaf index contains #num_class numbers. The returned feature vectors can be used as input to train another supervised learning model such as a :py:class:`~turicreate.logistic_classifier.LogisticClassifier`, or a :py:class:`~turicreate.svm_classifier.SVMClassifier`. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose a model dependent missing value policy. - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'none': Treat missing value as is. Model must be able to handle missing value. - 'error' : Do not proceed with prediction and terminate with an error message. Returns ------- out : SArray An SArray of dtype array.array containing extracted features. Examples -------- >>> data = turicreate.SFrame( 'https://static.turi.com/datasets/regression/houses.csv') >>> # Regression Tree Models >>> data['regression_tree_features'] = model.extract_features(data) >>> # Classification Tree Models >>> data['classification_tree_features'] = model.extract_features(data) """ _raise_error_if_not_sframe(dataset, "dataset") if missing_value_action == 'auto': missing_value_action = select_default_missing_value_policy( self, 'extract_features') return self.__proxy__.extract_features(dataset, missing_value_action)
def _raise_error_if_not_detection_sframe(dataset, feature, annotations,
                                         require_annotations):
    _raise_error_if_not_sframe(dataset, 'dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Feature column '%s' does not exist" % feature)
    if dataset[feature].dtype != _tc.Image:
        raise _ToolkitError("Feature column must contain images")

    if require_annotations:
        if annotations not in dataset.column_names():
            raise _ToolkitError("Annotations column '%s' does not exist"
                                % annotations)
        if dataset[annotations].dtype != list:
            raise _ToolkitError("Annotations column must contain lists")
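# Illustrative sketch (hypothetical path and column names) of an SFrame that
# satisfies these checks: an image column plus an annotations column whose rows
# are lists of bounding-box dictionaries, in the format documented in
# object_detector.create.
#
# import turicreate as tc
# data = tc.image_analysis.load_images('/path/to/images')  # yields an 'image' column
# data['annotations'] = [[{'label': 'dog', 'type': 'rectangle',
#                          'coordinates': {'x': 100, 'y': 120,
#                                          'width': 50, 'height': 60}}]] * len(data)
# _raise_error_if_not_detection_sframe(data, 'image', 'annotations',
#                                      require_annotations=True)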
def classify(self, dataset, missing_value_action='auto'): """ Return predictions for ``dataset``, using the trained supervised_learning model. Predictions are generated as class labels (0 or 1). Parameters ---------- dataset: SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose model dependent missing value action - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'error': Do not proceed with prediction and terminate with an error message. Returns ------- out : SFrame An SFrame with model predictions. """ if (missing_value_action == 'auto'): missing_value_action = select_default_missing_value_policy( self, 'classify') # Low latency path if isinstance(dataset, list): return _turicreate.extensions._fast_classify( self.__proxy__, dataset, missing_value_action) if isinstance(dataset, dict): return _turicreate.extensions._fast_classify( self.__proxy__, [dataset], missing_value_action) _raise_error_if_not_sframe(dataset, "dataset") options = {} options.update({ 'model': self.__proxy__, 'model_name': self.__name__, 'dataset': dataset, 'missing_value_action': missing_value_action, }) target = _turicreate.toolkits._main.run('supervised_learning_classify', options) return _map_unity_proxy_to_object(target['classify'])
def evaluate(self, dataset, metric="auto", missing_value_action="auto", with_predictions=False, options={}, **kwargs): """ Evaluate the model by making predictions of target values and comparing these to actual values. Parameters ---------- dataset : SFrame Dataset in the same format used for training. The columns names and types of the dataset must be the same as that used in training. metric : str, list[str] Evaluation metric(s) to be computed. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose a model dependent missing value policy. - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'none': Treat missing value as is. Model must be able to handle missing value. - 'error' : Do not proceed with prediction and terminate with an error message. options : dict additional options to be passed in to prediction kwargs : dict additional options to be passed into prediction """ if missing_value_action == "auto": missing_value_action = select_default_missing_value_policy( self, "evaluate") _raise_error_if_not_sframe(dataset, "dataset") results = self.__proxy__.evaluate(dataset, missing_value_action, metric, with_predictions=with_predictions) return results
def classify(self, dataset, missing_value_action="auto"): """ Return predictions for ``dataset``, using the trained supervised_learning model. Predictions are generated as class labels (0 or 1). Parameters ---------- dataset: SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. missing_value_action: str, optional Action to perform when missing values are encountered. This can be one of: - 'auto': Choose model dependent missing value action - 'impute': Proceed with evaluation by filling in the missing values with the mean of the training data. Missing values are also imputed if an entire column of data is missing during evaluation. - 'error': Do not proceed with prediction and terminate with an error message. Returns ------- out : SFrame An SFrame with model predictions. """ if missing_value_action == "auto": missing_value_action = select_default_missing_value_policy( self, "classify") # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_classify(dataset, missing_value_action) if isinstance(dataset, dict): return self.__proxy__.fast_classify([dataset], missing_value_action) _raise_error_if_not_sframe(dataset, "dataset") return self.__proxy__.classify(dataset, missing_value_action)
def fit(self, data): """ Fits the transformer using the given data. """ _raise_error_if_not_sframe(data, "data") fitted_state = {} feature_columns = _internal_utils.get_column_names( data, self._exclude, self._features) if not feature_columns: raise RuntimeError( "No valid feature columns specified in transformation.") fitted_state['features'] = feature_columns fitted_state['fitted'] = True self.__proxy__.update(fitted_state) return self
def create(dataset, features=None, distance=None, radius=1., min_core_neighbors=10, verbose=True): """ Create a DBSCAN clustering model. The DBSCAN method partitions the input dataset into three types of points, based on the estimated probability density at each point. - **Core** points have a large number of points within a given neighborhood. Specifically, `min_core_neighbors` must be within distance `radius` of a point for it to be considered a core point. - **Boundary** points are within distance `radius` of a core point, but don't have sufficient neighbors of their own to be considered core. - **Noise** points comprise the remainder of the data. These points have too few neighbors to be considered core points, and are further than distance `radius` from all core points. Clusters are formed by connecting core points that are neighbors of each other, then assigning boundary points to their nearest core neighbor's cluster. Parameters ---------- dataset : SFrame Training data, with each row corresponding to an observation. Must include all features specified in the `features` parameter, but may have additional columns as well. features : list[str], optional Name of the columns with features to use in comparing records. 'None' (the default) indicates that all columns of the input `dataset` should be used to train the model. All features must be numeric, i.e. integer or float types. distance : str or list[list], optional Function to measure the distance between any two input data rows. This may be one of two types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (str) 2. standard distance name (str) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. For sparse vectors, missing keys are assumed to have value 0.0. If 'distance' is left unspecified, a composite distance is constructed automatically based on feature types. radius : int or float, optional Size of each point's neighborhood, with respect to the specified distance function. min_core_neighbors : int, optional Number of neighbors that must be within distance `radius` of a point in order for that point to be considered a "core point" of a cluster. verbose : bool, optional If True, print progress updates and model details during model creation. Returns ------- out : DBSCANModel A model containing a cluster label for each row in the input `dataset`. Also contains the indices of the core points, cluster boundary points, and noise points. See Also -------- DBSCANModel, turicreate.toolkits.distances Notes ----- - Our implementation of DBSCAN first computes the similarity graph on the input dataset, which can be a computationally intensive process. In the current implementation, some distances are substantially faster than others; in particular "euclidean", "squared_euclidean", "cosine", and "transformed_dot_product" are quite fast, while composite distances can be slow. 
- Any distance function in the GL Create library may be used with DBSCAN but the results may be poor for distances that violate the standard metric properties, i.e. symmetry, non-negativity, triangle inequality, and identity of indiscernibles. In particular, the DBSCAN algorithm is based on the concept of connecting high-density points that are *close* to each other into a single cluster, but the notion of *close* may be very counterintuitive if the chosen distance function is not a valid metric. The distances "euclidean", "manhattan", "jaccard", and "levenshtein" will likely yield the best results. References ---------- - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise <https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_. In Proceedings of the Second International Conference on Knowledge Discovery and Data Mining. pp. 226-231. - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_ - `Visualizing DBSCAN Clustering <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_ Examples -------- >>> sf = turicreate.SFrame({ ... 'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162, ... 8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020], ... 'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305, ... 5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]}) ... >>> model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3) >>> model.cluster_id.print_rows(15) +--------+------------+----------+ | row_id | cluster_id | type | +--------+------------+----------+ | 8 | 0 | core | | 7 | 2 | core | | 0 | 1 | core | | 2 | 2 | core | | 3 | 1 | core | | 11 | 2 | core | | 4 | 2 | core | | 1 | 0 | boundary | | 6 | 0 | boundary | | 5 | 0 | boundary | | 9 | 0 | boundary | | 12 | 2 | boundary | | 10 | 1 | boundary | | 13 | 1 | boundary | +--------+------------+----------+ [14 rows x 3 columns] """ ## Start the training time clock and instantiate an empty model logger = _logging.getLogger(__name__) start_time = _time.time() ## Validate the input dataset _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Validate neighborhood parameters if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0: raise ValueError("Input 'min_core_neighbors' must be a non-negative " + "integer.") if not isinstance(radius, (int, float)) or radius < 0: raise ValueError("Input 'radius' must be a non-negative integer " + "or float.") ## Compute all-point nearest neighbors within `radius` and count # neighborhood sizes knn_model = _tc.nearest_neighbors.create(dataset, features=features, distance=distance, method='brute_force', verbose=verbose) knn = knn_model.similarity_graph(k=None, radius=radius, include_self_edges=False, output_type='SFrame', verbose=verbose) neighbor_counts = knn.groupby('query_label', _agg.COUNT) ### NOTE: points with NO neighbors are already dropped here! ## Identify core points and boundary candidate points. Not all of the # boundary candidates will be boundary points - some are in small isolated # clusters. 
if verbose: logger.info("Identifying noise points and core points.") boundary_mask = neighbor_counts['Count'] < min_core_neighbors core_mask = 1 - boundary_mask # this includes too small clusters boundary_idx = neighbor_counts[boundary_mask]['query_label'] core_idx = neighbor_counts[core_mask]['query_label'] ## Build a similarity graph on the core points ## NOTE: careful with singleton core points - the second filter removes them # from the edge set so they have to be added separately as vertices. if verbose: logger.info("Constructing the core point similarity graph.") core_vertices = knn.filter_by(core_idx, 'query_label') core_edges = core_vertices.filter_by(core_idx, 'reference_label') core_graph = _tc.SGraph() core_graph = core_graph.add_vertices(core_vertices[['query_label']], vid_field='query_label') core_graph = core_graph.add_edges(core_edges, src_field='query_label', dst_field='reference_label') ## Compute core point connected components and relabel to be consecutive # integers cc = _tc.connected_components.create(core_graph, verbose=verbose) cc_labels = cc.component_size.add_row_number('__label') core_assignments = cc.component_id.join(cc_labels, on='component_id', how='left')[['__id', '__label']] core_assignments['type'] = 'core' ## Join potential boundary points to core cluster labels (points that aren't # really on a boundary are implicitly dropped) if verbose: logger.info("Processing boundary points.") boundary_edges = knn.filter_by(boundary_idx, 'query_label') # separate real boundary points from points in small isolated clusters boundary_core_edges = boundary_edges.filter_by(core_idx, 'reference_label') # join a boundary point to its single closest core point. boundary_assignments = boundary_core_edges.groupby('query_label', {'reference_label': _agg.ARGMIN('rank', 'reference_label')}) boundary_assignments = boundary_assignments.join(core_assignments, on={'reference_label': '__id'}) boundary_assignments = boundary_assignments.rename({'query_label': '__id'}, inplace=True) boundary_assignments = boundary_assignments.remove_column('reference_label', inplace=True) boundary_assignments['type'] = 'boundary' ## Identify boundary candidates that turned out to be in small clusters but # not on real cluster boundaries small_cluster_idx = set(boundary_idx).difference( boundary_assignments['__id']) ## Identify individual noise points by the fact that they have no neighbors. noise_idx = set(range(dataset.num_rows())).difference( neighbor_counts['query_label']) noise_idx = noise_idx.union(small_cluster_idx) noise_assignments = _tc.SFrame({'row_id': _tc.SArray(list(noise_idx), int)}) noise_assignments['cluster_id'] = None noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(int) noise_assignments['type'] = 'noise' ## Append core, boundary, and noise results to each other. 
master_assignments = _tc.SFrame() num_clusters = 0 if core_assignments.num_rows() > 0: core_assignments = core_assignments.rename({'__id': 'row_id', '__label': 'cluster_id'}, inplace=True) master_assignments = master_assignments.append(core_assignments) num_clusters = len(core_assignments['cluster_id'].unique()) if boundary_assignments.num_rows() > 0: boundary_assignments = boundary_assignments.rename({'__id': 'row_id', '__label': 'cluster_id'}, inplace=True) master_assignments = master_assignments.append(boundary_assignments) if noise_assignments.num_rows() > 0: master_assignments = master_assignments.append(noise_assignments) ## Post-processing and formatting state = {'verbose': verbose, 'radius': radius, 'min_core_neighbors': min_core_neighbors, 'distance': knn_model.distance, 'num_distance_components': knn_model.num_distance_components, 'num_examples': dataset.num_rows(), 'features': knn_model.features, 'num_features': knn_model.num_features, 'unpacked_features': knn_model.unpacked_features, 'num_unpacked_features': knn_model.num_unpacked_features, 'cluster_id': master_assignments, 'num_clusters': num_clusters, 'training_time': _time.time() - start_time} return DBSCANModel(state)
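# Tiny standalone illustration (not part of the toolkit) of the point-type rule
# used above: a point is "core" if it has at least `min_core_neighbors` neighbors
# within `radius`, "boundary" if it is not core but is within `radius` of a core
# point, and "noise" otherwise. Plain Python, toy 1-D data.

def _toy_dbscan_types(points, radius, min_core_neighbors):
    neighbors = {i: [j for j in range(len(points))
                     if i != j and abs(points[i] - points[j]) <= radius]
                 for i in range(len(points))}
    core = {i for i, nbrs in neighbors.items() if len(nbrs) >= min_core_neighbors}
    boundary = {i for i, nbrs in neighbors.items()
                if i not in core and any(j in core for j in nbrs)}
    noise = set(range(len(points))) - core - boundary
    return core, boundary, noise

# _toy_dbscan_types([0.0, 0.1, 0.2, 0.3, 5.0], radius=0.15, min_core_neighbors=2)
# returns core={1, 2}, boundary={0, 3}, noise={4}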
def create(dataset, annotations=None, feature=None, model='darknet-yolo', classes=None, max_iterations=0, verbose=True, **kwargs): """ Create a :class:`ObjectDetector` model. Parameters ---------- dataset : SFrame Input data. The columns named by the ``feature`` and ``annotations`` parameters will be extracted for training the detector. annotations : string Name of the column containing the object detection annotations. This column should be a list of dictionaries, with each dictionary representing a bounding box of an object instance. Here is an example of the annotations for a single image with two object instances:: [{'label': 'dog', 'type': 'rectangle', 'coordinates': {'x': 223, 'y': 198, 'width': 130, 'height': 230}}, {'label': 'cat', 'type': 'rectangle', 'coordinates': {'x': 40, 'y': 73, 'width': 80, 'height': 123}}] The value for `x` is the horizontal center of the box paired with `width` and `y` is the vertical center of the box paired with `height`. 'None' (the default) indicates the only list column in `dataset` should be used for the annotations. feature : string Name of the column containing the input images. 'None' (the default) indicates the only image column in `dataset` should be used as the feature. model : string optional Object detection model to use: - "darknet-yolo" : Fast and medium-sized model classes : list optional List of strings containing the names of the classes of objects. Inferred from the data if not provided. max_iterations : int The number of training iterations. If 0, then it will be automatically be determined based on the amount of data you provide. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : ObjectDetector A trained :class:`ObjectDetector` model. See Also -------- ObjectDetector Examples -------- .. 
sourcecode:: python # Train an object detector model >>> model = turicreate.object_detector.create(data) # Make predictions on the training set and as column to the SFrame >>> data['predictions'] = model.predict(data) # Visualize predictions by generating a new column of marked up images >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions']) """ _raise_error_if_not_sframe(dataset, "dataset") from ._mx_detector import YOLOLoss as _YOLOLoss from ._model import tiny_darknet as _tiny_darknet from ._sframe_loader import SFrameDetectionIter as _SFrameDetectionIter from ._manual_scheduler import ManualScheduler as _ManualScheduler import mxnet as _mx if len(dataset) == 0: raise _ToolkitError('Unable to train on empty dataset') _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE) start_time = _time.time() supported_detectors = ['darknet-yolo'] if feature is None: feature = _tkutl._find_only_image_column(dataset) if verbose: print("Using '%s' as feature column" % feature) if annotations is None: annotations = _tkutl._find_only_column_of_type(dataset, target_type=list, type_name='list', col_name='annotations') if verbose: print("Using '%s' as annotations column" % annotations) _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_annotations=True) _tkutl._check_categorical_option_type('model', model, supported_detectors) base_model = model.split('-', 1)[0] ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]() params = { 'anchors': [ (1.0, 2.0), (1.0, 1.0), (2.0, 1.0), (2.0, 4.0), (2.0, 2.0), (4.0, 2.0), (4.0, 8.0), (4.0, 4.0), (8.0, 4.0), (8.0, 16.0), (8.0, 8.0), (16.0, 8.0), (16.0, 32.0), (16.0, 16.0), (32.0, 16.0), ], 'grid_shape': [13, 13], 'batch_size': 32, 'aug_resize': 0, 'aug_rand_crop': 0.9, 'aug_rand_pad': 0.9, 'aug_rand_gray': 0.0, 'aug_aspect_ratio': 1.25, 'aug_hue': 0.05, 'aug_brightness': 0.05, 'aug_saturation': 0.05, 'aug_contrast': 0.05, 'aug_horizontal_flip': True, 'aug_min_object_covered': 0, 'aug_min_eject_coverage': 0.5, 'aug_area_range': (.15, 2), 'aug_pca_noise': 0.0, 'aug_max_attempts': 20, 'aug_inter_method': 2, 'lmb_coord_xy': 10.0, 'lmb_coord_wh': 10.0, 'lmb_obj': 100.0, 'lmb_noobj': 5.0, 'lmb_class': 2.0, 'non_maximum_suppression_threshold': 0.45, 'rescore': True, 'clip_gradients': 0.025, 'learning_rate': 1.0e-3, 'shuffle': True, } if '_advanced_parameters' in kwargs: # Make sure no additional parameters are provided new_keys = set(kwargs['_advanced_parameters'].keys()) set_keys = set(params.keys()) unsupported = new_keys - set_keys if unsupported: raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported)) params.update(kwargs['_advanced_parameters']) anchors = params['anchors'] num_anchors = len(anchors) num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=params['batch_size']) batch_size_each = params['batch_size'] // max(num_gpus, 1) # Note, this may slightly alter the batch size to fit evenly on the GPUs batch_size = max(num_gpus, 1) * batch_size_each grid_shape = params['grid_shape'] input_image_shape = (3, grid_shape[0] * ref_model.spatial_reduction, grid_shape[1] * ref_model.spatial_reduction) try: instances = (dataset.stack(annotations, new_column_name='_bbox', drop_na=True) .unpack('_bbox', limit=['label'])) except (TypeError, RuntimeError): # If this fails, the annotation format isinvalid at the coarsest level raise _ToolkitError("Annotations format is invalid. 
Must be a list of " "dictionaries containing 'label' and 'coordinates'.") num_images = len(dataset) num_instances = len(instances) if classes is None: classes = instances['_bbox.label'].unique() classes = sorted(classes) # Make a class-to-index look-up table class_to_index = {name: index for index, name in enumerate(classes)} num_classes = len(classes) # Create data loader loader = _SFrameDetectionIter(dataset, batch_size=batch_size, input_shape=input_image_shape[1:], output_shape=grid_shape, anchors=anchors, class_to_index=class_to_index, aug_params=params, shuffle=params['shuffle'], loader_type='augmented', feature_column=feature, annotations_column=annotations) # Predictions per anchor box: x/y + w/h + object confidence + class probs preds_per_box = 5 + num_classes output_size = preds_per_box * num_anchors ymap_shape = (batch_size_each,) + tuple(grid_shape) + (num_anchors, preds_per_box) net = _tiny_darknet(output_size=output_size) loss = _YOLOLoss(input_shape=input_image_shape[1:], output_shape=grid_shape, batch_size=batch_size_each, num_classes=num_classes, anchors=anchors, parameters=params) base_lr = params['learning_rate'] if max_iterations == 0: # Set number of iterations through a heuristic num_iterations_raw = 5000 * _np.sqrt(num_instances) / batch_size num_iterations = 1000 * max(1, int(round(num_iterations_raw / 1000))) else: num_iterations = max_iterations steps = [num_iterations // 2, 3 * num_iterations // 4, num_iterations] steps_and_factors = [(step, 10**(-i)) for i, step in enumerate(steps)] steps, factors = zip(*steps_and_factors) lr_scheduler = _ManualScheduler(step=steps, factor=factors) ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size) net_params = net.collect_params() net_params.initialize(_mx.init.Xavier(), ctx=ctx) net_params['conv7_weight'].initialize(_mx.init.Xavier(factor_type='avg'), ctx=ctx, force_reinit=True) net_params['conv8_weight'].initialize(_mx.init.Uniform(0.00005), ctx=ctx, force_reinit=True) # Initialize object confidence low, preventing an unnecessary adjustment # period toward conservative estimates bias = _np.zeros(output_size, dtype=_np.float32) bias[4::preds_per_box] -= 6 from ._mx_detector import ConstantArray net_params['conv8_bias'].initialize(ConstantArray(bias), ctx, force_reinit=True) # Take a subset and then load the rest of the parameters. It is possible to # do allow_missing=True directly on net_params. However, this will more # easily hide bugs caused by names getting out of sync. 
ref_model.available_parameters_subset(net_params).load(ref_model.model_path, ctx) options = {'learning_rate': base_lr, 'lr_scheduler': lr_scheduler, 'momentum': 0.9, 'wd': 0.00005, 'rescale_grad': 1.0} clip_grad = params.get('clip_gradients') if clip_grad: options['clip_gradient'] = clip_grad trainer = _mx.gluon.Trainer(net.collect_params(), 'sgd', options) iteration = 0 smoothed_loss = None last_time = 0 while iteration < num_iterations: loader.reset() for batch in loader: data = _mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) label = _mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) Ls = [] with _mx.autograd.record(): for x, y in zip(data, label): z = net(x) z0 = _mx.nd.transpose(z, [0, 2, 3, 1]).reshape(ymap_shape) L = loss(z0, y) Ls.append(L) for L in Ls: L.backward() cur_loss = _np.mean([L.asnumpy()[0] for L in Ls]) if smoothed_loss is None: smoothed_loss = cur_loss else: smoothed_loss = 0.9 * smoothed_loss + 0.1 * cur_loss trainer.step(1) iteration += 1 cur_time = _time.time() if verbose and cur_time > last_time + 10: print('{now:%Y-%m-%d %H:%M:%S} Training {cur_iter:{width}d}/{num_iterations:{width}d} Loss {loss:6.3f}'.format( now=_datetime.now(), cur_iter=iteration, num_iterations=num_iterations, loss=smoothed_loss, width=len(str(num_iterations)))) last_time = cur_time if iteration == num_iterations: break training_time = _time.time() - start_time # Save the model state = { '_model': net, '_class_to_index': class_to_index, '_training_time_as_string': _seconds_as_string(training_time), '_grid_shape': grid_shape, 'anchors': anchors, 'model': model, 'classes': classes, 'batch_size': batch_size, 'input_image_shape': input_image_shape, 'feature': feature, 'non_maximum_suppression_threshold': params['non_maximum_suppression_threshold'], 'annotations': annotations, 'num_classes': num_classes, 'num_examples': num_images, 'num_bounding_boxes': num_instances, 'training_time': training_time, 'training_epochs': loader.cur_epoch, 'training_iterations': iteration, 'max_iterations': max_iterations, 'training_loss': smoothed_loss, } return ObjectDetector(state)
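# Worked example (arithmetic only) of the automatic iteration heuristic above,
# assuming the default batch_size of 32 and a dataset with 10,000 bounding boxes:
#
#   num_iterations_raw = 5000 * sqrt(10000) / 32 = 5000 * 100 / 32 ≈ 15625
#   num_iterations     = 1000 * max(1, round(15625 / 1000)) = 16000
#
# so roughly 16,000 iterations would be run, with the learning rate dropping by a
# factor of 10 at iterations 8000 and 12000 (num_iterations // 2 and
# 3 * num_iterations // 4).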
def unstack_annotations(annotations_sframe, num_rows=None): """ Converts object detection annotations (ground truth or predictions) to unstacked format (an `SArray` where each element is a list of object instances). Parameters ---------- annotations_sframe: SFrame An `SFrame` with stacked predictions, produced by the `stack_annotations` function. num_rows: int Optionally specify the number of rows in your original dataset, so that all get represented in the unstacked format, regardless of whether or not they had instances or not. Returns ------- annotations_sarray: An `SArray` with unstacked annotations. See also -------- stack_annotations Examples -------- If you have annotations in stacked format: >>> stacked_predictions Data: +--------+------------+-------+-------+-------+-------+--------+ | row_id | confidence | label | x | y | width | height | +--------+------------+-------+-------+-------+-------+--------+ | 0 | 0.98 | dog | 123.0 | 128.0 | 80.0 | 182.0 | | 0 | 0.67 | cat | 150.0 | 183.0 | 129.0 | 101.0 | | 1 | 0.8 | dog | 50.0 | 432.0 | 65.0 | 98.0 | +--------+------------+-------+-------+-------+-------+--------+ [3 rows x 7 columns] They can be converted to unstacked format using this function: >>> turicreate.object_detector.util.unstack_annotations(stacked_predictions)[0] [{'confidence': 0.98, 'coordinates': {'height': 182.0, 'width': 80.0, 'x': 123.0, 'y': 128.0}, 'label': 'dog', 'type': 'rectangle'}, {'confidence': 0.67, 'coordinates': {'height': 101.0, 'width': 129.0, 'x': 150.0, 'y': 183.0}, 'label': 'cat', 'type': 'rectangle'}] """ _raise_error_if_not_sframe(annotations_sframe, variable_name="annotations_sframe") cols = ["label", "type", "coordinates"] has_confidence = "confidence" in annotations_sframe.column_names() if has_confidence: cols.append("confidence") if num_rows is None: if len(annotations_sframe) == 0: num_rows = 0 else: num_rows = annotations_sframe["row_id"].max() + 1 sf = annotations_sframe sf["type"] = "rectangle" sf = sf.pack_columns(["x", "y", "width", "height"], dtype=dict, new_column_name="coordinates") sf = sf.pack_columns(cols, dtype=dict, new_column_name="ann") sf = sf.unstack("ann", new_column_name="annotations") sf_all_ids = _tc.SFrame({"row_id": range(num_rows)}) sf = sf.join(sf_all_ids, on="row_id", how="right") sf = sf.fillna("annotations", []) sf = sf.sort("row_id") annotations_sarray = sf["annotations"] # Sort the confidences again, since the unstack does not preserve the order if has_confidence: annotations_sarray = annotations_sarray.apply( lambda x: sorted( x, key=lambda ann: ann["confidence"], reverse=True), dtype=list, ) return annotations_sarray
def create(dataset, label=None, features=None, distance=None, method='auto', verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of Turi Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *List*: list of integer or string values. Each element is treated as a separate variable in the model. - *String*: string values. Please note: if a composite distance is also specified, this parameter is ignored. distance : string, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~turicreate.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. - *ball_tree*: use a tree structure to find the k-closest neighbors to each query point. 
The ball tree model is slower to construct than the brute force model, but queries are faster than linear time. This method is not applicable for the cosine and dot product distances. See `Liu, et al (2004) <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_ for implementation details. - *brute_force*: compute the distance from a query point to all reference observations. There is no computation time for model creation with the brute force method (although the reference data is held in the model, but each query takes linear time. - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate nearest neighbors efficiently. The LSH model supports 'euclidean', 'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product' (deprecated), and 'transformed_dot_product' distances. Two options are provided for LSH -- ``num_tables`` and ``num_projections_per_table``. See the notes below for details. verbose: bool, optional If True, print progress updates and model details. **kwargs : optional Options for the distance function and query method. - *leaf_size*: for the ball tree method, the number of points in each leaf of the tree. The default is to use the max of 1,000 and n/(2^11), which ensures a maximum tree depth of 12. - *num_tables*: For the LSH method, the number of hash tables constructed. The default value is 20. We recommend choosing values from 10 to 30. - *num_projections_per_table*: For the LSH method, the number of projections/hash functions for each hash table. The default value is 4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other distances. We recommend using number 2 ~ 6 for 'jaccard' distance, 8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances. Returns ------- out : NearestNeighborsModel A structure for efficiently computing the nearest neighbors in 'dataset' of new query points. See Also -------- NearestNeighborsModel.query, turicreate.toolkits.distances Notes ----- - Missing data is not allowed in the 'dataset' provided to this function. Please use the :func:`turicreate.SFrame.fillna` and :func:`turicreate.SFrame.dropna` utilities to handle missing data before creating a nearest neighbors model. - Missing keys in sparse vectors are assumed to have value 0. - The `composite_params` parameter was removed as of Turi Create version 1.5. The `distance` parameter now accepts either standard or composite distances. Please see the :mod:`~turicreate.toolkits.distances` module documentation for more information on composite distances. - If the features should be weighted equally in the distance calculations but are measured on different scales, it is important to standardize the features. One way to do this is to subtract the mean of each column and divide by the standard deviation. **Locality Sensitive Hashing (LSH)** There are several efficient nearest neighbors search algorithms that work well for data with low dimensions :math:`d` (approximately 50). However, most of the solutions suffer from either space or query time that is exponential in :math:`d`. For large :math:`d`, they often provide little, if any, improvement over the 'brute_force' method. This is a well-known consequence of the phenomenon called `The Curse of Dimensionality`. `Locality Sensitive Hashing (LSH) <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach that is designed to efficiently solve the *approximate* nearest neighbor search problem for high dimensional data. 
The key idea of LSH is to hash the data points using several hash functions, so that the probability of collision is much higher for data points which are close to each other than those which are far apart. An LSH family is a family of functions :math:`h` which map points from the metric space to a bucket, so that - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`. - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`. LSH for efficient approximate nearest neighbor search: - We define a new family of hash functions :math:`g`, where each function :math:`g` is obtained by concatenating :math:`k` functions :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`. The algorithm constructs :math:`L` hash tables, each of which corresponds to a different randomly chosen hash function :math:`g`. There are :math:`k \\cdot L` hash functions used in total. - In the preprocessing step, we hash all :math:`n` reference points into each of the :math:`L` hash tables. - Given a query point :math:`q`, the algorithm iterates over the :math:`L` hash functions :math:`g`. For each :math:`g` considered, it retrieves the data points that are hashed into the same bucket as q. These data points from all the :math:`L` hash tables are considered as candidates that are then re-ranked by their real distances with the query data. **Note** that the number of tables :math:`L` and the number of hash functions per table :math:`k` are two main parameters. They can be set using the options ``num_tables`` and ``num_projections_per_table`` respectively. Hash functions for different distances: - `euclidean` and `squared_euclidean`: :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where :math:`a` is a vector, of which the elements are independently sampled from normal distribution, and :math:`b` is a number uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We set :math:`r` using the average all-pair `euclidean` distances from a small randomly sampled subset of the reference data. - `manhattan`: The hash function of `manhattan` is similar with that of `euclidean`. The only difference is that the elements of `a` are sampled from Cauchy distribution, instead of normal distribution. - `cosine`: Random Projection is designed to approximate the cosine distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled normal unit vector. - `jaccard`: We use a recently proposed method one permutation hashing by Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014] <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for details. - `dot_product`: The reference data points are first transformed to fixed-norm vectors, and then the minimum `dot_product` distance search problem can be solved via finding the reference data with smallest `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015] <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details. References ---------- - `Wikipedia - nearest neighbor search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_ - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_ - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of Practical Approximate Nearest Neighbor Algorithms <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural Information Processing Systems pp. 825-832. 
- `Wikipedia - Jaccard distance <http://en.wikipedia.org/wiki/Jaccard_index>`_ - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the Jaccard Median <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_. Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms. Society for Industrial and Applied Mathematics. - `Wikipedia - Cosine distance <http://en.wikipedia.org/wiki/Cosine_similarity>`_ - `Wikipedia - Levenshtein distance <http://en.wikipedia.org/wiki/Levenshtein_distance>`_ - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_. Examples -------- Construct a nearest neighbors model with automatically determined method and distance: >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11], ... 'X2': [0.69, 0.58, 0.36], ... 'str_feature': ['cat', 'dog', 'fossa']}) >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2']) For datasets with a large number of rows and up to about 100 variables, the ball tree method often leads to much faster queries. >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'], ... method='ball_tree') Often the final determination of a neighbor is based on several distance computations over different sets of features. Each part of this composite distance may have a different relative weight. >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.], ... [['str_feature'], 'levenshtein', 3.]] ... >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist) """ ## Validate the 'dataset' input _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Basic validation of the features input if features is not None and not isinstance(features, list): raise TypeError("If specified, input 'features' must be a list of " + "strings.") ## Clean the method options and create the options dictionary allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table'] _method_options = {} for k, v in kwargs.items(): if k in allowed_kwargs: _method_options[k] = v else: raise _ToolkitError( "'{}' is not a valid keyword argument".format(k) + " for the nearest neighbors model. Please " + "check for capitalization and other typos.") ## Exclude inappropriate combinations of method an distance if method == 'ball_tree' and ( distance == 'cosine' or distance == _turicreate.distances.cosine or distance == 'dot_product' or distance == _turicreate.distances.dot_product or distance == 'transformed_dot_product' or distance == _turicreate.distances.transformed_dot_product): raise TypeError( "The ball tree method does not work with 'cosine' " + "'dot_product', or 'transformed_dot_product' distance." 
+ "Please use the 'brute_force' method for these distances.") if method == 'lsh' and ('num_projections_per_table' not in _method_options): if distance == 'jaccard' or distance == _turicreate.distances.jaccard: _method_options['num_projections_per_table'] = 4 elif distance == 'cosine' or distance == _turicreate.distances.cosine: _method_options['num_projections_per_table'] = 16 else: _method_options['num_projections_per_table'] = 8 ## Initial validation and processing of the label if label is None: _label = _robust_column_name('__id', dataset.column_names()) _dataset = dataset.add_row_number(_label) else: _label = label _dataset = _copy.copy(dataset) col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()} _validate_row_label(_label, col_type_map) ref_labels = _dataset[_label] ## Determine the internal list of available feature names (may still include # the row label name). if features is None: _features = _dataset.column_names() else: _features = _copy.deepcopy(features) ## Check if there's only one feature and it's the same as the row label. # This would also be trapped by the composite distance validation, but the # error message is not very informative for the user. free_features = set(_features).difference([_label]) if len(free_features) < 1: raise _ToolkitError("The only available feature is the same as the " + "row label column. Please specify features " + "that are not also row labels.") ### Validate and preprocess the distance function ### --------------------------------------------- # - The form of the 'distance' controls how we interact with the 'features' # parameter as well. # - At this point, the row label 'label' may still be in the list(s) of # features. ## Convert any distance function input into a single composite distance. # distance is already a composite distance if isinstance(distance, list): distance = _copy.deepcopy(distance) # distance is a single name (except 'auto') or function handle. elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] # distance is unspecified and needs to be constructed. elif distance is None or distance == 'auto': sample = _dataset.head() distance = _construct_auto_distance(_features, _dataset.column_names(), _dataset.column_types(), sample) else: raise TypeError("Input 'distance' not understood. The 'distance' " " argument must be a string, function handle, or " + "composite distance.") ## Basic composite distance validation, remove the row label from all # feature lists, and convert string distance names into distance functions. 
distance = _scrub_composite_distance_features(distance, [_label]) distance = _convert_distance_names_to_functions(distance) _validate_composite_distance(distance) ## Raise an error if any distances are used with non-lists list_features_to_check = [] sparse_distances = [ 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product', 'transformed_dot_product' ] sparse_distances = [ getattr(_turicreate.distances, k) for k in sparse_distances ] for d in distance: feature_names, dist, _ = d list_features = [f for f in feature_names if _dataset[f].dtype == list] for f in list_features: if dist in sparse_distances: list_features_to_check.append(f) else: raise TypeError( "The chosen distance cannot currently be used " + "on list-typed columns.") for f in list_features_to_check: only_str_lists = _validate_lists(_dataset[f], [str]) if not only_str_lists: raise TypeError("Distances for sparse data, such as jaccard " + "and weighted_jaccard, can only be used on " + "lists containing only strings. Please modify " + "any list features accordingly before creating " + "the nearest neighbors model.") ## Raise an error if any component has string features are in single columns for d in distance: feature_names, dist, _ = d if (len(feature_names) > 1) and (dist == _turicreate.distances.levenshtein): raise ValueError( "Levenshtein distance cannot be used with multiple " + "columns. Please concatenate strings into a single " + "column before creating the nearest neighbors model.") ## Get the union of feature names and make a clean dataset. clean_features = _get_composite_distance_features(distance) sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features) ## Decide which method to use ## - If more than one distance component (specified either directly or # generated automatically because distance set to 'auto'), then do brute # force. if len(distance) > 1: _method = 'brute_force' if method != 'brute_force' and verbose is True: print("Defaulting to brute force instead of ball tree because " +\ "there are multiple distance components.") else: if method == 'auto': # get the total number of variables. Assume the number of elements in # array type columns does not change num_variables = sum([ len(x) if hasattr(x, '__iter__') else 1 for x in _six.itervalues(sf_clean[0]) ]) # flag if all the features in the single composite are of numeric # type. numeric_type_flag = all([ x in [int, float, list, array.array] for x in sf_clean.column_types() ]) ## Conditions necessary for ball tree to work and be worth it if ((distance[0][1] in [ 'euclidean', 'manhattan', _turicreate.distances.euclidean, _turicreate.distances.manhattan ]) and numeric_type_flag is True and num_variables <= 200): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' elif _method == 'lsh': model_name = 'nearest_neighbors_lsh' else: raise ValueError( "Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'.") ## Package the model options opts = {} opts.update(_method_options) opts.update({ 'model_name': model_name, 'ref_labels': ref_labels, 'label': label, 'sf_features': sf_clean, 'composite_params': distance }) ## Construct the nearest neighbors model with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.train(opts) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) return model
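# A minimal, standalone sketch (not part of the toolkit) of the method-selection
# heuristic implemented above: multiple distance components force brute force,
# while a single 'euclidean' or 'manhattan' component over purely numeric
# features with at most 200 variables selects the ball tree. The argument names
# below are hypothetical stand-ins for the values computed inside `create`.
def _sketch_choose_method(distance_components, column_types, num_variables):
    import array
    if len(distance_components) > 1:
        return 'brute_force'
    distance_name = distance_components[0][1]      # e.g. 'euclidean'
    numeric_only = all(t in (int, float, list, array.array) for t in column_types)
    if (distance_name in ('euclidean', 'manhattan')
            and numeric_only and num_variables <= 200):
        return 'ball_tree'
    return 'brute_force'

# Example with assumed inputs: one euclidean component over two float columns.
# _sketch_choose_method([[['X1', 'X2'], 'euclidean', 1.0]], [float, float], 2) -> 'ball_tree'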
def query(self, dataset, label=None, k=5, radius=None, verbose=True): """ For each row of the input 'dataset', retrieve the nearest neighbors from the model's stored data. In general, the query dataset does not need to be the same as the reference data stored in the model, but if it is, the 'include_self_edges' parameter can be set to False to exclude results that match query points to themselves. Parameters ---------- dataset : SFrame Query data. Must contain columns with the same names and types as the features used to train the model. Additional columns are allowed, but ignored. Please see the nearest neighbors :func:`~turicreate.nearest_neighbors.create` documentation for more detail on allowable data types. label : str, optional Name of the query SFrame column with row labels. If 'label' is not specified, row numbers are used to identify query dataset rows in the output SFrame. k : int, optional Number of nearest neighbors to return from the reference set for each query observation. The default is 5 neighbors, but setting it to ``None`` will return all neighbors within ``radius`` of the query point. radius : float, optional Only neighbors whose distance to a query point is smaller than this value are returned. The default is ``None``, in which case the ``k`` nearest neighbors are returned for each query point, regardless of distance. verbose: bool, optional If True, print progress updates and model details. Returns ------- out : SFrame An SFrame with the k-nearest neighbors of each query observation. The result contains four columns: the first is the label of the query observation, the second is the label of the nearby reference observation, the third is the distance between the query and reference observations, and the fourth is the rank of the reference observation among the query's k-nearest neighbors. See Also -------- similarity_graph Notes ----- - The `dataset` input to this method *can* have missing values (in contrast to the reference dataset used to create the nearest neighbors model). Missing numeric values are imputed to be the mean of the corresponding feature in the reference dataset, and missing strings are imputed to be empty strings. - If both ``k`` and ``radius`` are set to ``None``, each query point returns all of the reference set. If the reference dataset has :math:`n` rows and the query dataset has :math:`m` rows, the output is an SFrame with :math:`nm` rows. - For models created with the 'lsh' method, the query results may have fewer query labels than input query points. Because LSH is an approximate method, a query point may have fewer than 'k' neighbors. If LSH returns no neighbors at all for a query, the query point is omitted from the results. Examples -------- First construct a toy SFrame and create a nearest neighbors model: >>> sf = turicreate.SFrame({'label': range(3), ... 'feature1': [0.98, 0.62, 0.11], ... 'feature2': [0.69, 0.58, 0.36]}) >>> model = turicreate.nearest_neighbors.create(sf, 'label') A new SFrame contains query observations with same schema as the reference SFrame. This SFrame is passed to the ``query`` method. >>> queries = turicreate.SFrame({'label': range(3), ... 'feature1': [0.05, 0.61, 0.99], ... 
'feature2': [0.06, 0.97, 0.86]}) >>> model.query(queries, 'label', k=2) +-------------+-----------------+----------------+------+ | query_label | reference_label | distance | rank | +-------------+-----------------+----------------+------+ | 0 | 2 | 0.305941170816 | 1 | | 0 | 1 | 0.771556867638 | 2 | | 1 | 1 | 0.390128184063 | 1 | | 1 | 0 | 0.464004310325 | 2 | | 2 | 0 | 0.170293863659 | 1 | | 2 | 1 | 0.464004310325 | 2 | +-------------+-----------------+----------------+------+ """ ## Validate the 'dataset' input _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Get model features ref_features = self.features sf_features = _tkutl._toolkits_select_columns(dataset, ref_features) ## Validate and preprocess the 'label' input if label is None: query_labels = _turicreate.SArray.from_sequence(len(dataset)) else: if not label in dataset.column_names(): raise ValueError( "Input 'label' must be a string matching the name of a " +\ "column in the reference SFrame 'dataset'.") if not dataset[label].dtype == str and not dataset[ label].dtype == int: raise TypeError( "The label column must contain integers or strings.") if label in ref_features: raise ValueError( "The label column cannot be one of the features.") query_labels = dataset[label] ## Validate neighborhood parameters 'k' and 'radius' if k is not None: if not isinstance(k, int): raise ValueError("Input 'k' must be an integer.") if k <= 0: raise ValueError("Input 'k' must be larger than 0.") if radius is not None: if not isinstance(radius, (int, float)): raise ValueError("Input 'radius' must be an integer or float.") if radius < 0: raise ValueError("Input 'radius' must be non-negative.") ## Set k and radius to special values to indicate 'None' if k is None: k = -1 if radius is None: radius = -1.0 opts = { 'model': self.__proxy__, 'model_name': self.__name__, 'features': sf_features, 'query_labels': query_labels, 'k': k, 'radius': radius } with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.query(opts) return result['neighbors']
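# Hedged usage sketch for `query` (illustrative data, not from the toolkit's
# test suite): besides the k-nearest form shown in the docstring, passing
# `k=None` together with `radius` returns every reference point within the
# given distance of each query point, however many that is.
def _example_radius_query():
    import turicreate as tc
    sf = tc.SFrame({'feature1': [0.98, 0.62, 0.11],
                    'feature2': [0.69, 0.58, 0.36]})
    model = tc.nearest_neighbors.create(sf, features=['feature1', 'feature2'])
    # All neighbors within distance 0.5 of each query row.
    return model.query(sf, k=None, radius=0.5)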
def predict(self, dataset, output_type='class', output_frequency='per_row'): """ Return predictions for ``dataset``, using the trained activity classifier. Predictions can be generated as class labels, or as a probability vector with probabilities for each class. The activity classifier generates a single prediction for each ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the number of predictions is smaller than the length of ``dataset``. By default each prediction is replicated by ``prediction_window`` to return a prediction for each row of ``dataset``. Use ``output_frequency`` to get the unreplicated predictions. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. output_type : {'class', 'probability_vector'}, optional Form of each prediction which is one of: - 'probability_vector': Prediction probability associated with each class as a vector. The probability of the first class (sorted alphanumerically by name of the class in the training set) is in position 0 of the vector, the second in position 1 and so on. - 'class': Class prediction. This returns the class with maximum probability. output_frequency : {'per_row', 'per_window'}, optional The frequency of the predictions which is one of: - 'per_window': Return a single prediction for each ``prediction_window`` rows in ``dataset`` per ``session_id``. - 'per_row': Convenience option to make sure the number of predictions match the number of rows in the dataset. Each prediction from the model is repeated ``prediction_window`` times during that window. Returns ------- out : SArray | SFrame If ``output_frequency`` is 'per_row' return an SArray with predictions for each row in ``dataset``. If ``output_frequency`` is 'per_window' return an SFrame with predictions for ``prediction_window`` rows in ``dataset``. See Also ---------- create, evaluate, classify Examples -------- .. sourcecode:: python # One prediction per row >>> probability_predictions = model.predict( ... data, output_type='probability_vector', output_frequency='per_row')[:4] >>> probability_predictions dtype: array Rows: 4 [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]), array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]), array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]), array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])] # One prediction per window >>> class_predictions = model.predict( ... data, output_type='class', output_frequency='per_window') >>> class_predictions +---------------+------------+-----+ | prediction_id | session_id |class| +---------------+------------+-----+ | 0 | 3 | 5 | | 1 | 3 | 5 | | 2 | 3 | 5 | | 3 | 3 | 5 | | 4 | 3 | 5 | | 5 | 3 | 5 | | 6 | 3 | 5 | | 7 | 3 | 4 | | 8 | 3 | 4 | | 9 | 3 | 4 | | ... | ... | ... 
| +---------------+------------+-----+ """ _tkutl._raise_error_if_not_sframe(dataset, 'dataset') _tkutl._check_categorical_option_type( 'output_frequency', output_frequency, ['per_window', 'per_row']) _tkutl._check_categorical_option_type( 'output_type', output_type, ['probability_vector', 'class']) from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter from ._sframe_sequence_iterator import prep_data as _prep_data from ._sframe_sequence_iterator import _ceil_dev prediction_window = self.prediction_window chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id, prediction_window, self._predictions_in_chunk, verbose=False) data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features), prediction_window, self._predictions_in_chunk, self._recalibrated_batch_size, use_pad=True) chunked_data = data_iter.dataset preds = self._pred_model.predict(data_iter).asnumpy() if output_frequency == 'per_row': # Replicate each prediction times prediction_window preds = preds.repeat(prediction_window, axis=1) # Remove predictions for padded rows unpadded_len = chunked_data['chunk_len'].to_numpy() preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)] # Reshape from (num_of_chunks, chunk_size, num_of_classes) # to (ceil(length / prediction_window), num_of_classes) # chunk_size is DIFFERENT between chunks - since padding was removed. out = _np.concatenate(preds) out = out.reshape((-1, len(self._target_id_map))) out = _SArray(out) if output_type == 'class': id_target_map = self._id_target_map out = out.apply(lambda c: id_target_map[_np.argmax(c)]) elif output_frequency == 'per_window': # Calculate the number of expected predictions and # remove predictions for padded data unpadded_len = chunked_data['chunk_len'].apply( lambda l: _ceil_dev(l, prediction_window)).to_numpy() preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)] out = _SFrame({ self.session_id: chunked_data['session_id'], 'preds': _SArray(preds, dtype=list) }).stack('preds', new_column_name='probability_vector') # Calculate the prediction index per session out = out.add_row_number(column_name='prediction_id') start_sess_idx = out.groupby( self.session_id, {'start_idx': _agg.MIN('prediction_id')}) start_sess_idx = start_sess_idx.unstack( [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0] if output_type == 'class': id_target_map = self._id_target_map out['probability_vector'] = out['probability_vector'].apply( lambda c: id_target_map[_np.argmax(c)]) out = out.rename({'probability_vector': 'class'}) return out
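# The 'per_row' branch above turns one prediction per window into one
# prediction per input row: repeat along the window axis, trim each chunk's
# padded tail, then concatenate. A small NumPy-only sketch of that reshaping,
# using made-up shapes (2 chunks, 3 windows per chunk, 2 classes,
# prediction_window = 5):
def _sketch_per_row_expansion():
    import numpy as np
    prediction_window = 5
    preds = np.random.rand(2, 3, 2)        # (num_chunks, windows_per_chunk, num_classes)
    chunk_len = np.array([15, 12])          # true (unpadded) number of rows in each chunk
    per_row = preds.repeat(prediction_window, axis=1)             # -> (2, 15, 2)
    trimmed = [p[:chunk_len[i]] for i, p in enumerate(per_row)]   # drop padded rows
    out = np.concatenate(trimmed).reshape((-1, 2))                # one vector per input row
    assert out.shape == (chunk_len.sum(), 2)
    return out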
def create(dataset, session_id, target, features=None, prediction_window=100, validation_set='auto', max_iterations=10, batch_size=32, verbose=True): """ Create an :class:`ActivityClassifier` model. Parameters ---------- dataset : SFrame Input data which consists of `sessions` of data where each session is a sequence of data. The data must be in `stacked` format, grouped by session. Within each session, the data is assumed to be sorted temporally. Columns in `features` will be used to train a model that will make a prediction using labels in the `target` column. session_id : string Name of the column that contains a unique ID for each session. target : string Name of the column containing the target variable. The values in this column must be of string or integer type. Use `model.classes` to retrieve the order in which the classes are mapped. features : list[string], optional Name of the columns containing the input features that will be used for classification. If set to `None`, all columns except `session_id` and `target` will be used. prediction_window : int, optional Number of time units between predictions. For example, if your input data is sampled at 100Hz, and the `prediction_window` is set to 100, then this model will make a prediction every 1 second. validation_set : SFrame, optional A dataset for monitoring the model's generalization performance to prevent the model from overfitting to the training data. For each row of the progress table, accuracy is measured over the provided training dataset and the `validation_set`. The format of this SFrame must be the same as the training set. When set to 'auto', a validation set is automatically sampled from the training data (if the training data has > 100 sessions). If validation_set is set to None, then all the data will be used for training. max_iterations : int, optional Maximum number of iterations/epochs made over the data during the training phase. batch_size : int, optional Number of sequence chunks used per training step. Must be greater than the number of GPUs in use. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : ActivityClassifier A trained :class:`ActivityClassifier` model. Examples -------- .. sourcecode:: python >>> import turicreate as tc # Training on dummy data >>> data = tc.SFrame({ ... 'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10, ... 'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10, ... 'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10, ... 'session_id': [0, 0, 0] * 10 + [1, 1] * 10, ... 'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10 ... }) # Create an activity classifier >>> model = tc.activity_classifier.create(data, ... session_id='session_id', target='activity', ... 
features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z']) # Make predictions (as probability vector, or class) >>> predictions = model.predict(data) >>> predictions = model.predict(data, output_type='probability_vector') # Get both predictions and classes together >>> predictions = model.classify(data) # Get topk predictions (instead of only top-1) if your labels have more # 2 classes >>> predictions = model.predict_topk(data, k = 3) # Evaluate the model >>> results = model.evaluate(data) See Also -------- ActivityClassifier, util.random_split_by_session """ _tkutl._raise_error_if_not_sframe(dataset, "dataset") from ._model_architecture import _net_params from ._model_architecture import _define_model, _fit_model from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter from ._sframe_sequence_iterator import prep_data as _prep_data if not isinstance(target, str): raise _ToolkitError('target must be of type str') if not isinstance(session_id, str): raise _ToolkitError('session_id must be of type str') _tkutl._raise_error_if_sframe_empty(dataset, 'dataset') _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400) _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE) if features is None: features = _fe_tkutl.get_column_names(dataset, interpret_as_excluded=True, column_names=[session_id, target]) if not hasattr(features, '__iter__'): raise TypeError("Input 'features' must be a list.") if not all([isinstance(x, str) for x in features]): raise TypeError("Invalid feature %s: Feature names must be of type str." % x) if len(features) == 0: raise TypeError("Input 'features' must contain at least one column name.") start_time = _time.time() dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target]) _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int]) _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int]) # Encode the target column to numerical values use_target = target is not None dataset, target_map = _encode_target(dataset, target) predictions_in_chunk = 20 chunked_data, num_sessions = _prep_data(dataset, features, session_id, prediction_window, predictions_in_chunk, target=target, verbose=verbose) if isinstance(validation_set, str) and validation_set == 'auto': if num_sessions < 100: validation_set = None else: dataset, validation_set = _random_split_by_session(dataset, session_id) # Create data iterators num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions) user_provided_batch_size = batch_size batch_size = max(batch_size, num_gpus, 1) data_iter = _SFrameSequenceIter(chunked_data, len(features), prediction_window, predictions_in_chunk, batch_size, use_target=use_target) if validation_set is not None: _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set') _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set') validation_set = _tkutl._toolkits_select_columns( validation_set, features + [session_id, target]) validation_set = validation_set.filter_by(target_map.keys(), target) validation_set, mapping = _encode_target(validation_set, target, target_map) chunked_validation_set, _ = _prep_data(validation_set, features, session_id, prediction_window, predictions_in_chunk, target=target, verbose=False) valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features), prediction_window, predictions_in_chunk, batch_size, use_target=use_target) else: valid_iter = None # 
Define model architecture context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions) loss_model, pred_model = _define_model(features, target_map, prediction_window, predictions_in_chunk, context) # Train the model log = _fit_model(loss_model, data_iter, valid_iter, max_iterations, num_gpus, verbose) # Set up prediction model pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None, for_training=False) arg_params, aux_params = loss_model.get_params() pred_model.init_params(arg_params=arg_params, aux_params=aux_params) # Save the model state = { '_pred_model': pred_model, 'verbose': verbose, 'training_time': _time.time() - start_time, 'target': target, 'classes': sorted(target_map.keys()), 'features': features, 'session_id': session_id, 'prediction_window': prediction_window, 'max_iterations': max_iterations, 'num_examples': len(dataset), 'num_sessions': num_sessions, 'num_classes': len(target_map), 'num_features': len(features), 'training_accuracy': log['train_acc'], 'training_log_loss': log['train_loss'], '_target_id_map': target_map, '_id_target_map': {v: k for k, v in target_map.items()}, '_predictions_in_chunk': predictions_in_chunk, '_recalibrated_batch_size': data_iter.batch_size, 'batch_size' : user_provided_batch_size } if validation_set is not None: state['valid_accuracy'] = log['valid_acc'] state['valid_log_loss'] = log['valid_loss'] model = ActivityClassifier(state) return model
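# `_encode_target` is used above but not shown in this excerpt. A plausible
# minimal sketch, assuming it simply maps the raw class labels to consecutive
# integer ids (the real helper may differ in details such as ordering or
# in-place behavior):
def _sketch_encode_target(dataset, target, mapping=None):
    if mapping is None:
        mapping = {c: i for i, c in enumerate(sorted(dataset[target].unique()))}
    dataset[target] = dataset[target].apply(lambda c: mapping[c])
    return dataset, mapping

# Example (assumed data): an SFrame whose 'activity' column holds 'run', 'swim',
# and 'walk' would come back with mapping == {'run': 0, 'swim': 1, 'walk': 2}
# and the column rewritten to those integer ids.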
def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3, verbose=False): """ Return top-k most likely predictions for each observation in ``dataset``. Predictions are returned as an SFrame with three columns: `row_id`, `class`, and `probability`. Parameters ---------- dataset : SFrame Dataset of new observations. Must include the features used for model training, but does not require a target column. Additional columns are ignored. max_neighbors : int, optional Maximum number of neighbors to consider for each point. radius : float, optional Maximum distance from each point to a neighbor in the reference dataset. k : int, optional Number of classes to return for each input example. Returns ------- out : SFrame See Also ---------- create, classify, predict Notes ----- - If the 'radius' parameter is small, it is possible that a query point has no neighbors in the training dataset. In this case, the query is dropped from the SFrame output by this method. If all queries have no neighbors, then the result is an empty SFrame. If the target column in the training dataset has missing values, these predictions will be ambiguous. - Ties between predicted classes are broken randomly. Examples -------- >>> sf_train = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) ... >>> sf_new = turicreate.SFrame({'height': [26, 19], ... 'weight': [25, 35]}) ... >>> m = turicreate.nearest_neighbor_classifier.create(sf_train, target='species') >>> ystar = m.predict_topk(sf_new, max_neighbors=2) >>> print ystar +--------+-------+-------------+ | row_id | class | probability | +--------+-------+-------------+ | 0 | dog | 1.0 | | 1 | fossa | 0.5 | | 1 | dog | 0.5 | +--------+-------+-------------+ """ ## Validate the number of results to return. Note that the # 'max_neighbors' and 'radius' parameters are validated by the nearest # neighbor model's query method. if not isinstance(k, int) or k < 1: raise TypeError("The number of results to return for each point, " + "'k', must be an integer greater than 0.") ## Validate the query dataset. _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") ## Validate neighborhood parameters 'max_neighbors'. # - NOTE: when the parameter name is changed in nearest neighbors, the # query call will do this itself, and this block can be removed. if max_neighbors is not None: if not isinstance(max_neighbors, int): raise ValueError("Input 'max_neighbors' must be an integer.") if max_neighbors <= 0: raise ValueError("Input 'max_neighbors' must be larger than 0.") ## Find the nearest neighbors for each query and count the number of # votes for each class. knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius, verbose=verbose) ## If there are *no* results for *any* query make an empty SFrame. 
if knn.num_rows() == 0: ystar = _tc.SFrame({'row_id': [], 'class': [], 'probability': []}) ystar['row_id'] = ystar['row_id'].astype(int) ystar['class'] = ystar['class'].astype(str) else: ## Find the classes with the top-k vote totals grp = knn.groupby(['query_label', 'reference_label'], _tc.aggregate.COUNT) ystar = grp.unstack(column_names=['reference_label', 'Count'], new_column_name='votes') ystar['topk'] = ystar['votes'].apply( lambda x: _sort_topk_votes(x, k)) ystar['total_votes'] = ystar['votes'].apply( lambda x: sum(x.values())) ## Re-stack, unpack, and rename the results ystar = ystar.stack('topk', new_column_name='topk') ystar = ystar.unpack('topk') ystar.rename({'topk.class': 'class', 'query_label': 'row_id'}, inplace=True) ystar['probability'] = ystar['topk.votes'] / ystar['total_votes'] ystar = ystar[['row_id', 'class', 'probability']] return ystar
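# `_sort_topk_votes` is used above but not shown in this excerpt. Judging from
# how its output is stacked and then unpacked into 'topk.class' and
# 'topk.votes', a plausible sketch is a helper that keeps the k classes with
# the most votes:
def _sketch_sort_topk_votes(votes, k):
    """Return the top-k entries of a {class: vote_count} dict as a list of
    {'class': ..., 'votes': ...} dicts, sorted by descending vote count."""
    top = sorted(votes.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return [{'class': c, 'votes': v} for c, v in top]

# Example: _sketch_sort_topk_votes({'dog': 3, 'cat': 1, 'fossa': 2}, 2)
# -> [{'class': 'dog', 'votes': 3}, {'class': 'fossa', 'votes': 2}]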
def classify(self, dataset, max_neighbors=10, radius=None, verbose=True): """ Return the predicted class for each observation in *dataset*. This prediction is made based on the closest neighbors stored in the nearest neighbors classifier model. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the features used for model training, but does not require a target column. Additional columns are ignored. verbose : bool, optional If True, print progress updates. max_neighbors : int, optional Maximum number of neighbors to consider for each point. radius : float, optional Maximum distance from each point to a neighbor in the reference dataset. Returns ------- out : SFrame An SFrame with model predictions. The first column is the most likely class according to the model, and the second column is the predicted probability for that class. See Also -------- create, predict, predict_topk Notes ----- - If the 'radius' parameter is small, it is possible that a query point has no qualified neighbors in the training dataset. In this case, the resulting class and probability for that query are 'None' in the SFrame output by this method. If the target column in the training dataset has missing values, these predictions will be ambiguous. - Ties between predicted classes are broken randomly. Examples -------- >>> sf_train = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) ... >>> sf_new = turicreate.SFrame({'height': [26, 19], ... 'weight': [25, 35]}) ... >>> m = turicreate.nearest_neighbor_classifier.create(sf_train, target='species') >>> ystar = m.classify(sf_new, max_neighbors=2) >>> print ystar +-------+-------------+ | class | probability | +-------+-------------+ | dog | 1.0 | | fossa | 0.5 | +-------+-------------+ """ ## Validate the query 'dataset'. Note that the 'max_neighbors' and # 'radius' parameters are validated by the nearest neighbor model's # query method. _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") n_query = dataset.num_rows() ## Validate neighborhood parameters 'max_neighbors'. # - NOTE: when the parameter name is changed in nearest neighbors, the # query call will do this itself, and this block can be removed. if max_neighbors is not None: if not isinstance(max_neighbors, int): raise ValueError("Input 'max_neighbors' must be an integer.") if max_neighbors <= 0: raise ValueError("Input 'max_neighbors' must be larger than 0.") ## Find the nearest neighbors for each query and count the number of # votes for each class. knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius, verbose=verbose) ## If there are *no* results for *any* query make an SFrame of nothing. if knn.num_rows() == 0: ystar = _tc.SFrame( {'class': _tc.SArray([None] * n_query, self._target_type), 'probability': _tc.SArray([None] * n_query, int)}) else: ## Find the class with the most votes for each query and postprocess. grp = knn.groupby(['query_label', 'reference_label'], _tc.aggregate.COUNT) ystar = grp.groupby('query_label', {'class': _tc.aggregate.ARGMAX('Count', 'reference_label'), 'max_votes': _tc.aggregate.MAX('Count'), 'total_votes': _tc.aggregate.SUM('Count')}) ystar['probability'] = ystar['max_votes'] / ystar['total_votes'] ## Fill in 'None' for query points that don't have any near neighbors. 
row_ids = _tc.SFrame({'query_label': range(n_query)}) ystar = ystar.join(row_ids, how='right') ## Sort by row number (because row number is not returned) and return ystar = ystar.sort('query_label', ascending=True) ystar = ystar[['class', 'probability']] return ystar
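# The probability reported by `classify` is simply the vote share of the
# winning class among the retrieved neighbors (max_votes / total_votes). A toy
# check of that arithmetic with made-up vote counts:
def _sketch_vote_probability():
    votes = {'dog': 3, 'cat': 2}              # neighbor classes for one query point
    winner = max(votes, key=votes.get)         # 'dog'
    probability = votes[winner] / float(sum(votes.values()))   # 3 / 5 = 0.6
    return winner, probability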
def create(dataset, target, features=None, distance=None, verbose=True): """ Create a :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier` model. This model predicts the class of a query instance by finding the most common class among the query's nearest neighbors. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of Turi Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Dataset for training the model. target : str Name of the column containing the target variable. The values in this column must be of string or integer type. features : list[str], optional Name of the columns with features to use in comparing records. 'None' (the default) indicates that all columns except the target variable should be used. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: array of numeric (integer or float) values. Each array element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. distance : str, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~turicreate.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (str) 2. standard distance name (str) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. For sparse vectors, missing keys are assumed to have value 0.0. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : NearestNeighborClassifier A trained model of type :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier`. See Also -------- NearestNeighborClassifier turicreate.toolkits.nearest_neighbors turicreate.toolkits.distances References ---------- - `Wikipedia - nearest neighbors classifier <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_ - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of Statistical Learning <https://web.stanford.edu/~hastie/ElemStatLearn/>`_. Vol. 2. New York. Springer. pp. 463-481. Examples -------- >>> sf = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) ... 
>>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species') As with the nearest neighbors toolkit, the nearest neighbor classifier accepts composite distance functions. >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7], ... [('height', 'weight'), 'manhattan', 1.6]] ... >>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species', ... distance=my_dist) """ ## Set up ## ------ start_time = _time.time() ## Validation and preprocessing ## ---------------------------- ## 'dataset' must be a non-empty SFrame _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") ## 'target' must be a string, in 'dataset', and the type of the target must # be string or integer. if not isinstance(target, str) or target not in dataset.column_names(): raise _ToolkitError("The 'target' parameter must be the name of a " "column in the input dataset.") if not dataset[target].dtype == str and not dataset[target].dtype == int: raise TypeError("The target column must contain integers or strings.") ## Warn that 'None' values in the target may lead to ambiguous predictions. if dataset[target].countna() > 0: _logging.warning("Missing values detected in the target column. This " + "may lead to ambiguous 'None' predictions, if the " + "'radius' parameter is set too small in the prediction, " + "classification, or evaluation methods.") ## convert features and distance arguments into a composite distance ## NOTE: this is done here instead of in the nearest neighbors toolkit # because the automatic distance construction may be different for the two # toolkits. if features is None: _features = [x for x in dataset.column_names() if x != target] else: _features = [x for x in features if x != target] if isinstance(distance, list): distance = _copy.deepcopy(distance) elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] elif distance is None or distance == 'auto': col_types = {k: v for k, v in zip(dataset.column_names(), dataset.column_types())} distance = _construct_auto_distance(_features, col_types) else: raise TypeError("Input 'distance' not understood. The 'distance' " + "parameter must be a string or a composite distance, " + " or left unspecified.") ## Construct and query the nearest neighbors model ## ----------------------------------------------- knn_model = _tc.nearest_neighbors.create(dataset, label=target, distance=distance, verbose=verbose) ## Postprocessing and formatting ## ----------------------------- state = { 'verbose' : verbose, 'distance' : knn_model.distance, 'num_distance_components' : knn_model.num_distance_components, 'num_examples' : dataset.num_rows(), 'features' : knn_model.features, 'target': target, 'num_classes': len(dataset[target].unique()), 'num_features': knn_model.num_features, 'num_unpacked_features': knn_model.num_unpacked_features, 'training_time': _time.time() - start_time, '_target_type': dataset[target].dtype, } model = NearestNeighborClassifier(knn_model, state) return model
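# `_construct_auto_distance(features, col_types)` above is referenced but not
# shown in this excerpt. Based on the docstring ("a composite distance is
# constructed automatically based on feature types"), a plausible sketch is
# shown below: string columns get individual levenshtein components, dictionary
# columns a shared weighted_jaccard component, and numeric/array columns a
# shared euclidean component. The exact rules and weights in the toolkit may
# differ from this sketch.
def _sketch_construct_auto_distance(features, column_types):
    import array
    string_ftrs = [f for f in features if column_types[f] == str]
    dict_ftrs = [f for f in features if column_types[f] == dict]
    numeric_ftrs = [f for f in features
                    if column_types[f] in (int, float, array.array)]
    composite = [[[f], 'levenshtein', 1] for f in string_ftrs]
    if dict_ftrs:
        composite.append([dict_ftrs, 'weighted_jaccard', 1])
    if numeric_ftrs:
        composite.append([numeric_ftrs, 'euclidean', 1])
    return composite

# Example (assumed column types):
# _sketch_construct_auto_distance(['height', 'weight', 'species'],
#                                 {'height': int, 'weight': int, 'species': str})
# -> [[['species'], 'levenshtein', 1], [['height', 'weight'], 'euclidean', 1]]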