def create(dataset, num_clusters=None, features=None, label=None,
           initial_centers=None, max_iterations=10, batch_size=None,
           verbose=True):
    """
    Create a k-means clustering model. The KmeansModel object contains the
    computed cluster centers and the cluster assignment for each instance in
    the input 'dataset'.

    Given a number of clusters, k-means iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[str], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the following
        types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (int or float) values. Each list element
          is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    label : str, optional
        Name of the column to use as row labels in the Kmeans output. The
        values in this column must be integers or strings. If not specified,
        row numbers are used by default.

    initial_centers : SFrame, optional
        Initial centers to use when starting the K-means algorithm. If
        specified, this parameter overrides the *num_clusters* parameter. The
        'initial_centers' SFrame must contain the same features used in the
        input 'dataset'.

        If not specified (the default), initial centers are chosen
        intelligently with the K-means++ algorithm.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations. If set to
        0, the model returns clusters defined by the initial centers and
        assignments to those centers.

    batch_size : int, optional
        Number of randomly-chosen data points to use in each iteration. If
        'None' (the default) or greater than the number of rows in 'dataset',
        then this parameter is ignored: all rows of `dataset` are used in each
        iteration and model training terminates once point assignments stop
        changing or `max_iterations` is reached.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each instance, and the
        centers of the clusters.

    Raises
    ------
    ValueError
        If `label` is 'cluster_id' or 'distance' (reserved output column
        names), or if the selected features cannot be extracted from
        `initial_centers`.

    TypeError
        If the feature column types of `initial_centers` differ from those of
        `dataset`.

    See Also
    --------
    KmeansModel

    Notes
    -----
    - Integer features in the 'dataset' or 'initial_centers' inputs are
      converted internally to float type, and the corresponding features in the
      output centers are float-typed.

    - It can be important for the K-means model to standardize the features so
      they have the same scale. This function does *not* standardize
      automatically.

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_. In
      Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings
      of the Twentieth International Conference on Machine Learning, Volume 3,
      pp. 147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web. pp.
      1177-1178

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.kmeans.create(sf, num_clusters=3)
    """
    opts = {
        'model_name': 'kmeans',
        'max_iterations': max_iterations,
    }

    ## Validate the input dataset and initial centers.
    _validate_dataset(dataset)

    if initial_centers is not None:
        _validate_initial_centers(initial_centers)

    ## Validate and determine the correct number of clusters.
    opts['num_clusters'] = _validate_num_clusters(num_clusters,
                                                  initial_centers,
                                                  dataset.num_rows())

    ## Validate the row label
    col_type_map = {c: dataset[c].dtype for c in dataset.column_names()}

    if label is not None:
        _validate_row_label(label, col_type_map)

        # These two names are used for columns of the model output, so a row
        # label column with either name would collide with them.
        if label in ['cluster_id', 'distance']:
            raise ValueError("Row label column name cannot be 'cluster_id' " +
                             "or 'distance'; these are reserved for other " +
                             "columns in the Kmeans model's output.")

        opts['row_labels'] = dataset[label]
        opts['row_label_name'] = label

    else:
        # No label column given: fall back to sequential row numbers.
        opts['row_labels'] = _tc.SArray.from_sequence(dataset.num_rows())
        opts['row_label_name'] = 'row_id'

    ## Validate the features relative to the input dataset.
    if features is None:
        features = dataset.column_names()

    valid_features = _validate_features(features, col_type_map,
                                        valid_types=[_array, dict, int, float],
                                        label=label)

    sf_features = dataset.select_columns(valid_features)
    opts['features'] = sf_features

    ## Validate the features in the initial centers (if provided)
    if initial_centers is not None:
        try:
            initial_centers = initial_centers.select_columns(valid_features)
        except Exception:
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit propagate instead of being reported as a ValueError.
            raise ValueError("Specified features cannot be extracted from " +
                             "the provided initial centers.")

        if initial_centers.column_types() != sf_features.column_types():
            raise TypeError("Feature types are different in the dataset and " +
                            "initial centers.")

    else:
        # An empty SFrame stands in for "no initial centers provided".
        initial_centers = _tc.SFrame()

    opts['initial_centers'] = initial_centers

    ## Validate the batch size and determine the training method.
    # NOTE(review): the docstring says a batch_size larger than the number of
    # rows is ignored; that is not enforced here, so it is presumably handled
    # by the training backend - confirm.
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()
    else:
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size

    ## Create and return the model
    with _QuietProgress(verbose):
        params = _tc.extensions._kmeans.train(opts)

    return KmeansModel(params['model'])
def annotate(data, image_column=None, annotation_column="annotations"):
    """
    Annotate images using a GUI assisted application. When the GUI is
    terminated an SFrame with the representative images and annotations is
    returned.

    Parameters
    ----------
    data : SFrame
        The data containing the input images. Note that the initial type
        check rejects any input that is not an SFrame, so the SArray / Image
        handling further down is currently not reachable.

    image_column : string, optional
        The name of the input column in the SFrame that contains the image
        that needs to be annotated. If None, the column is looked up with
        `_tkutl._find_only_image_column(data)`.

    annotation_column : string, optional
        The column containing the annotations in the output SFrame. None is
        treated as the empty string.

    Returns
    -------
    out : SFrame
        A new SFrame that contains the newly annotated data.

    Raises
    ------
    TypeError
        If `data` is not an SFrame, if `image_column` or `annotation_column`
        is not a string, or if `data[image_column]` does not hold images.

    ValueError
        If no image column name is given and none could be determined.

    Exception
        If `data` has no rows.

    Examples
    --------
    >>> import turicreate as tc
    >>> images = tc.image_analysis.load_images("path/to/images")
    >>> print(images)
        +------------------------+--------------------------+
        |          path          |          image           |
        +------------------------+--------------------------+
        | /Users/username/Doc... | Height: 1712 Width: 1952 |
        | /Users/username/Doc... | Height: 1386 Width: 1000 |
        | /Users/username/Doc... |  Height: 536 Width: 858  |
        | /Users/username/Doc... | Height: 1512 Width: 2680 |
        +------------------------+--------------------------+
        [4 rows x 2 columns]

    >>> images = tc.image_classifier.annotate(images)
    >>> print(images)
        +------------------------+--------------------------+-------------------+
        |          path          |          image           |    annotations    |
        +------------------------+--------------------------+-------------------+
        | /Users/username/Doc... | Height: 1712 Width: 1952 |        dog        |
        | /Users/username/Doc... | Height: 1386 Width: 1000 |        dog        |
        | /Users/username/Doc... |  Height: 536 Width: 858  |        cat        |
        | /Users/username/Doc... | Height: 1512 Width: 2680 |       mouse       |
        +------------------------+--------------------------+-------------------+
        [4 rows x 3 columns]
    """
    # Check Value of Column Variables
    if not isinstance(data, __tc.SFrame):
        raise TypeError('"data" must be of type SFrame.')

    # Check if Value is Empty
    if data.num_rows() == 0:
        raise Exception("input data cannot be empty")

    if image_column is None:
        image_column = _tkutl._find_only_image_column(data)

    if image_column is None:
        raise ValueError("'image_column' cannot be 'None'")

    if not isinstance(image_column, str):
        raise TypeError("'image_column' has to be of type 'str'")

    if annotation_column is None:
        annotation_column = ""

    if not isinstance(annotation_column, str):
        raise TypeError("'annotation_column' has to be of type 'str'")

    # Check Data Structure
    # NOTE(review): because of the isinstance guard above, only the SFrame
    # branch (or, for an SFrame subclass, the final else) can be taken here.
    # The Image / SArray branches are kept as-is pending a decision on whether
    # those input types should be supported.
    if type(data) == __tc.data_structures.image.Image:
        data = __tc.SFrame({image_column: __tc.SArray([data])})

    elif type(data) == __tc.data_structures.sframe.SFrame:
        if data.shape[0] == 0:
            return data

        if not (data[image_column].dtype == __tc.data_structures.image.Image):
            # The check is on the column's element type, so say so.
            raise TypeError("'data[image_column]' must be of type Image")

    elif type(data) == __tc.data_structures.sarray.SArray:
        if data.shape[0] == 0:
            return data

        data = __tc.SFrame({image_column: data})

    else:
        raise TypeError("'data' must be an SFrame or SArray")

    annotation_window = __tc.extensions.create_image_classification_annotation(
        data,
        [image_column],
        annotation_column)

    # Progress output is always suppressed while the annotation app runs.
    with _QuietProgress(False):
        annotation_window.annotate(_get_client_app_path())
        return annotation_window.returnAnnotations()
def predict(self, dataset, output_type='cluster_id', verbose=True):
    """
    Return predicted cluster label for instances in the new 'dataset'.
    K-means predictions are made by assigning each new instance to the
    closest cluster center.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include the features used for
        model training; additional columns are ignored.

    output_type : {'cluster_id', 'distance'}, optional
        Form of the prediction. 'cluster_id' (the default) returns the
        cluster label assigned to each input instance, while 'distance'
        returns the Euclidean distance between the instance and its
        assigned cluster's center.

    verbose : bool, optional
        If True, print progress updates to the screen.

    Returns
    -------
    out : SArray
        Model predictions. Depending on the specified `output_type`, either
        the assigned cluster label or the distance of each point to its
        closest cluster center. The order of the predictions is the same as
        order of the input data rows.

    Raises
    ------
    TypeError
        If `output_type` is not a string.

    ValueError
        If `output_type` is neither 'cluster_id' nor 'distance'.

    See Also
    --------
    create

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.kmeans.create(sf, num_clusters=3)
    ...
    >>> sf_new = turicreate.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
    ...                             'x2': [-6.3803, -3.7937, -1.1022]})
    >>> clusters = model.predict(sf_new, output_type='cluster_id')
    >>> print(clusters)
    [1, 0, 1]
    """
    ## Validate the input dataset.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate the output type.
    if not isinstance(output_type, str):
        raise TypeError("The 'output_type' parameter must be a string.")

    if output_type not in ('cluster_id', 'distance'):
        # The message names the values that are actually accepted.
        raise ValueError("The 'output_type' parameter must be either " +
                         "'cluster_id' or 'distance'.")

    ## Get model features.
    ref_features = self.features
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    ## Compute predictions.
    opts = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': sf_features,
    }

    with _QuietProgress(verbose):
        result = _tc.extensions._kmeans.predict(opts)

    sf_result = result['predictions']

    # Both columns are present in the result; return the one asked for.
    if output_type == 'distance':
        return sf_result['distance']
    else:
        return sf_result['cluster_id']