def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for each
    data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best
    cluster centers and assigns nearby points to the best cluster. If no
    points change cluster membership between iterations, the algorithm
    terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[string], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list
          element is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique
          key is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    initial_centers : SFrame, optional
        If None (default), k-means++ intelligently chooses initial cluster
        centers. Otherwise, the algorithm starts with the centers provided in
        this SFrame. If this SFrame is provided, the ``num_clusters``
        parameter does not need to be specified. ``initial_centers`` must
        have the columns specified in the ``features`` argument.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations. If set
        to 0, the model returns clusters defined by the initial centers and
        assignments to those centers.

    batch_size : int, optional
        Number of randomly-chosen data points to use in each iteration. If
        `None` (the default) or greater than the number of rows in `dataset`,
        then this parameter is ignored: all rows of `dataset` are used in
        each iteration and model training terminates once point assignments
        stop changing or `max_iterations` is reached.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each vertex, and the
        centers of the clusters.

    See Also
    --------
    KmeansModel

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_.
      In Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings
      of the Twentieth International Conference on Machine Learning,
      Volume 3, pp. 147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web.
      pp. 1177-1178.

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     "d1": [ 0.46973508, 0.0063261, 0.14143399, 0.35025834,
    ...             0.83728709, 0.81438336, 0.74205833, 0.36273747,
    ...             0.00793858, 0.02298716],
    ...     "d2": [ 0.51050977, 0.82167952, 0.61451765, 0.51179513,
    ...             0.35223035, 0.59366481, 0.48848649, 0.90026032,
    ...             0.78798728, 0.40125452],
    ...     "d3": [ 0.71716265, 0.54163387, 0.55577274, 0.12619953,
    ...             0.80172228, 0.21519973, 0.21014113, 0.54207596,
    ...             0.65432528, 0.00754797],
    ...     "d4": [ 0.69486673, 0.92585721, 0.95461882, 0.72658554,
    ...             0.86590678, 0.18017175, 0.60361348, 0.89223113,
    ...             0.37992791, 0.44700959]})

    It's important to standardize our columns to get the best results
    possible from the k-means algorithm.

    >>> for col in ['d1', 'd2', 'd3', 'd4']:
    ...     sf[col] = (sf[col] - sf[col].mean()) / sf[col].std()

    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
    """
    _mt._get_metric_tracker().track('toolkit.kmeans.create')

    opts = {'model_name': 'kmeans',
            'max_iterations': max_iterations,
            'verbose': verbose}

    ## Validate the input dataset.
    if not isinstance(dataset, _SFrame):
        raise TypeError("Input 'dataset' must be an SFrame.")

    if dataset.num_rows() == 0 or dataset.num_cols() == 0:
        raise ValueError("Input 'dataset' has no data.")

    ## Validate the input initial centers (if provided).
    if initial_centers is not None:
        if not isinstance(initial_centers, _SFrame):
            raise TypeError("Input 'initial_centers' must be an SFrame.")

        if initial_centers.num_rows() == 0 or initial_centers.num_cols() == 0:
            raise ValueError("An 'initial_centers' argument is provided " +
                             "but has no data.")

    ## Determine the number of clusters: either 'num_clusters' explicitly,
    ## or inferred from the number of rows in 'initial_centers'. If both are
    ## given they must agree.
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +
                             "'num_clusters' or 'initial_centers'. You must " +
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters
    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match " +
                                 "the number of provided initial centers. " +
                                 "Please provide only one of these arguments " +
                                 "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if not isinstance(_num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    if _num_clusters > dataset.num_rows():
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    opts['num_clusters'] = _num_clusters

    ## Validate and select the feature columns from the dataset.
    features = _select_valid_features(dataset, features,
                                      [_array, dict, int, float])
    sf_features = dataset.select_columns(features)
    opts['features'] = sf_features

    ## Validate the features in the initial centers (if provided).
    if initial_centers is not None:
        try:
            initial_centers = initial_centers.select_columns(features)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed; any failure to extract the
            # feature columns is still reported as a ValueError.
            raise ValueError("Specified features cannot be extracted from the " +
                             "provided initial centers.")

        if initial_centers.column_types() != sf_features.column_types():
            raise TypeError("Feature types are different in the dataset and " +
                            "initial centers.")
    else:
        # An empty SFrame signals the backend to choose centers itself.
        initial_centers = _graphlab.SFrame()

    opts['initial_centers'] = initial_centers

    ## Validate the batch size and determine the training method. A batch
    ## size of None selects the full-data Elkan algorithm; otherwise the
    ## minibatch method is used with the requested batch size.
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()
    else:
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size

    ## Create and return the model.
    params = _graphlab.toolkits._main.run('kmeans_train', opts, verbose)
    return KmeansModel(params['model'])