def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for
    each data point in the dataset.

    Given a number of clusters, k-means++ chooses the initial cluster centers
    intelligently, then iteratively assigns each point to its nearest center
    and recomputes each center as the mean of its assigned points. If no
    points change cluster membership between iterations, the algorithm
    terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[string], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the following
        types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    initial_centers : SFrame, optional
        If None (default), k-means++ intelligently chooses initial cluster
        centers. Otherwise, the algorithm starts with the centers provided in
        this SFrame. If this SFrame is provided, the ``num_clusters`` parameter
        does not need to be specified. ``initial_centers`` must have the columns
        specified in the ``features`` argument. See the second code example
        below.

    max_iterations : int, optional
        The maximum number of iterations to run. A warning is printed if the
        algorithm does not converge within ``max_iterations`` iterations. If
        set to 0, the model returns clusters defined by the initial centers
        and assignments to those centers.

    batch_size : int, optional
        Number of randomly-chosen data points to use in each iteration. If
        None (the default) or greater than the number of rows in ``dataset``,
        this parameter is ignored: all rows of ``dataset`` are used in each
        iteration and model training terminates once point assignments stop
        changing or ``max_iterations`` is reached. See the second code example
        below.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A model object containing a cluster ID for each data point and the
        centers of the clusters.

    See Also
    --------
    KmeansModel

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_. In
      Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings of
      the Twentieth International Conference on Machine Learning, Volume 3, pp.
      147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web. pp.
      1177-1178.

    Examples
    --------
    >>> sf = graphlab.SFrame({
    ...     "d1": [ 0.46973508, 0.0063261, 0.14143399, 0.35025834,
    ...             0.83728709, 0.81438336, 0.74205833, 0.36273747,
    ...             0.00793858, 0.02298716],
    ...     "d2": [ 0.51050977, 0.82167952, 0.61451765, 0.51179513,
    ...             0.35223035, 0.59366481, 0.48848649, 0.90026032,
    ...             0.78798728, 0.40125452],
    ...     "d3": [ 0.71716265, 0.54163387, 0.55577274, 0.12619953,
    ...             0.80172228, 0.21519973, 0.21014113, 0.54207596,
    ...             0.65432528, 0.00754797],
    ...     "d4": [ 0.69486673, 0.92585721, 0.95461882, 0.72658554,
    ...             0.86590678, 0.18017175, 0.60361348, 0.89223113,
    ...             0.37992791, 0.44700959]})

    Standardize the columns before clustering: k-means is distance-based, so
    features on larger scales would otherwise dominate the cluster assignments.

    >>> for col in ['d1', 'd2', 'd3', 'd4']:
    ...     sf[col] = (sf[col] - sf[col].mean()) / sf[col].std()
    >>> model = graphlab.kmeans.create(sf, num_clusters=3)
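
    The algorithm can also be started from user-supplied centers, in which
    case ``num_clusters`` is inferred from the number of rows in
    ``initial_centers``, and trained with the minibatch method by passing
    ``batch_size``. The choice of starting rows and batch size below is
    purely illustrative.

    >>> init = sf.head(3)  # any SFrame containing the feature columns works
    >>> model = graphlab.kmeans.create(sf, initial_centers=init,
    ...                                max_iterations=20, batch_size=5)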
    """

    _mt._get_metric_tracker().track('toolkit.kmeans.create')

    opts = {'model_name': 'kmeans',
            'max_iterations': max_iterations,
            'verbose': verbose}

    ## Validate input dataset
    if not isinstance(dataset, _SFrame):
        raise TypeError("Input 'dataset' must be an SFrame.")

    if dataset.num_rows() == 0 or dataset.num_cols() == 0:
        raise ValueError("Input 'dataset' has no data.")

    ## Validate input initial centers
    if initial_centers is not None:
        if not isinstance(initial_centers, _SFrame):
            raise TypeError("Input 'initial_centers' must be an SFrame.")

        if initial_centers.num_rows() == 0 or initial_centers.num_cols() == 0:
            raise ValueError("An 'initial_centers' argument is provided " +\
                             "but has no data.")

    ## Validate number of clusters
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +\
                             "'num_clusters' or 'initial_centers'. You must " +\
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters

    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match " +\
                                 "the number of provided initial centers. " +\
                                 "Please provide only one of these arguments " +\
                                 "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if not isinstance(_num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    if _num_clusters > dataset.num_rows():
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    opts['num_clusters'] = _num_clusters

    ## Validate the features in the dataset
    features = _select_valid_features(dataset, features, [_array, dict, int, float])
    sf_features = dataset.select_columns(features)
    opts['features'] = sf_features

    ## Validate the features in the initial centers (if provided)
    if initial_centers is not None:
        try:
            initial_centers = initial_centers.select_columns(features)
        except Exception:
            raise ValueError("Specified features cannot be extracted from the " +\
                             "provided initial centers.")

        if initial_centers.column_types() != sf_features.column_types():
            raise TypeError("Feature types are different in the dataset and " +\
                             "initial centers.")

    else:
        initial_centers = _graphlab.SFrame()

    opts['initial_centers'] = initial_centers


    ## Validate the batch size and determine the training method.
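    ## With batch_size=None, every row is used in each iteration (Elkan's
    ## accelerated full-batch algorithm); otherwise 'batch_size' rows are
    ## sampled per iteration (minibatch k-means, Sculley 2010).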
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()

    else:
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size


    ## Create and return the model
    params = _graphlab.toolkits._main.run('kmeans_train', opts, verbose)
    return KmeansModel(params['model'])