Ejemplo n.º 1
0
def create(dataset,
           num_clusters=None,
           features=None,
           label=None,
           initial_centers=None,
           max_iterations=10,
           batch_size=None,
           verbose=True):
    """
    Create a k-means clustering model. The KmeansModel object contains the
    computed cluster centers and the cluster assignment for each instance in
    the input 'dataset'.

    Given a number of clusters, k-means iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[str], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the following
        types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (int or float) values. Each list element
          is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    label : str, optional
        Name of the column to use as row labels in the Kmeans output. The
        values in this column must be integers or strings. If not specified,
        row numbers are used by default. The names 'cluster_id' and 'distance'
        are reserved and cannot be used.

    initial_centers : SFrame, optional
        Initial centers to use when starting the K-means algorithm. If
        specified, this parameter overrides the *num_clusters* parameter. The
        'initial_centers' SFrame must contain the same features used in the
        input 'dataset'.

        If not specified (the default), initial centers are chosen
        intelligently with the K-means++ algorithm.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations. If set to
        0, the model returns clusters defined by the initial centers and
        assignments to those centers.

    batch_size : int, optional
        Number of randomly-chosen data points to use in each iteration. If
        'None' (the default) or greater than the number of rows in 'dataset',
        then this parameter is ignored: all rows of `dataset` are used in each
        iteration and model training terminates once point assignments stop
        changing or `max_iterations` is reached.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each instance, and the
        centers of the clusters.

    Raises
    ------
    ValueError
        If `label` is 'cluster_id' or 'distance', or if the selected features
        cannot be extracted from `initial_centers`.

    TypeError
        If the feature column types of `initial_centers` differ from those of
        `dataset`.

    The input validation helpers called by this function may raise further
    errors of their own.

    See Also
    --------
    KmeansModel

    Notes
    -----
    - Integer features in the 'dataset' or 'initial_centers' inputs are
      converted internally to float type, and the corresponding features in the
      output centers are float-typed.

    - It can be important for the K-means model to standardize the features so
      they have the same scale. This function does *not* standardize
      automatically.

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_. In
      Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings
      of the Twentieth International Conference on Machine Learning, Volume 3,
      pp. 147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web. pp.
      1177-1178

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.kmeans.create(sf, num_clusters=3)
    """
    # Options dictionary handed to the training extension at the end.
    opts = {
        'model_name': 'kmeans',
        'max_iterations': max_iterations,
    }

    ## Validate the input dataset and initial centers.
    _validate_dataset(dataset)

    if initial_centers is not None:
        _validate_initial_centers(initial_centers)

    ## Validate and determine the correct number of clusters.
    opts['num_clusters'] = _validate_num_clusters(num_clusters,
                                                  initial_centers,
                                                  dataset.num_rows())

    ## Validate the row label
    col_type_map = {c: dataset[c].dtype for c in dataset.column_names()}

    if label is not None:
        _validate_row_label(label, col_type_map)

        if label in ['cluster_id', 'distance']:
            raise ValueError("Row label column name cannot be 'cluster_id' " +
                             "or 'distance'; these are reserved for other " +
                             "columns in the Kmeans model's output.")

        opts['row_labels'] = dataset[label]
        opts['row_label_name'] = label

    else:
        # No label column given: fall back to sequential row numbers.
        opts['row_labels'] = _tc.SArray.from_sequence(dataset.num_rows())
        opts['row_label_name'] = 'row_id'

    ## Validate the features relative to the input dataset.
    if features is None:
        features = dataset.column_names()

    valid_features = _validate_features(features,
                                        col_type_map,
                                        valid_types=[_array, dict, int, float],
                                        label=label)

    sf_features = dataset.select_columns(valid_features)
    opts['features'] = sf_features

    ## Validate the features in the initial centers (if provided)
    if initial_centers is not None:
        try:
            initial_centers = initial_centers.select_columns(valid_features)
        except Exception:
            # Catch only ordinary errors; a bare `except` would also convert
            # KeyboardInterrupt/SystemExit into a misleading ValueError.
            raise ValueError("Specified features cannot be extracted from " +
                             "the provided initial centers.")

        if initial_centers.column_types() != sf_features.column_types():
            raise TypeError("Feature types are different in the dataset and " +
                            "initial centers.")

    else:
        # An empty SFrame is passed to the extension when no centers are given.
        initial_centers = _tc.SFrame()

    opts['initial_centers'] = initial_centers

    ## Validate the batch size and determine the training method.
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()

    else:
        # NOTE(review): a batch_size larger than the dataset is passed through
        # unchanged here; presumably the extension handles it -- confirm.
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size

    ## Create and return the model
    with _QuietProgress(verbose):
        params = _tc.extensions._kmeans.train(opts)

    return KmeansModel(params['model'])
Ejemplo n.º 2
0
def annotate(data, image_column=None, annotation_column="annotations"):
    """
    Annotate images using a GUI assisted application. When the GUI is
    terminated an SFrame with the representative images and annotations is
    returned.

    Parameters
    ----------
    data : SFrame
        The data containing the input images. Must contain at least one row.

    image_column: string, optional
        The name of the column in `data` that contains the images that need
        to be annotated. If None (the default), the column is looked up with
        `_tkutl._find_only_image_column(data)`.

    annotation_column : string, optional
        The column containing the annotations in the output SFrame. None is
        treated as the empty string.

    Returns
    -------
    out : SFrame
        A new SFrame that contains the newly annotated data.

    Raises
    ------
    TypeError
        If `data` is not an SFrame, if `image_column` or `annotation_column`
        is not a string, or if `data[image_column]` does not have the Image
        dtype.

    ValueError
        If `image_column` is None and no image column could be determined.

    Exception
        If `data` has no rows.

    Examples
    --------
    >>> import turicreate as tc
    >>> images = tc.image_analysis.load_images("path/to/images")
    >>> print(images)
        +------------------------+--------------------------+
        |          path          |          image           |
        +------------------------+--------------------------+
        | /Users/username/Doc... | Height: 1712 Width: 1952 |
        | /Users/username/Doc... | Height: 1386 Width: 1000 |
        | /Users/username/Doc... |  Height: 536 Width: 858  |
        | /Users/username/Doc... | Height: 1512 Width: 2680 |
        +------------------------+--------------------------+
        [4 rows x 2 columns]

    >>> images = tc.image_classifier.annotate(images)
    >>> print(images)
        +------------------------+--------------------------+-------------------+
        |          path          |          image           |    annotations    |
        +------------------------+--------------------------+-------------------+
        | /Users/username/Doc... | Height: 1712 Width: 1952 |        dog        |
        | /Users/username/Doc... | Height: 1386 Width: 1000 |        dog        |
        | /Users/username/Doc... |  Height: 536 Width: 858  |        cat        |
        | /Users/username/Doc... | Height: 1512 Width: 2680 |       mouse       |
        +------------------------+--------------------------+-------------------+
        [4 rows x 3 columns]

    """
    # Only SFrame input is accepted. Because of this check (and the emptiness
    # check below), the former Image/SArray conversion branches and the
    # empty-input early return could never run, so they have been removed.
    if not isinstance(data, __tc.SFrame):
        raise TypeError('"data" must be of type SFrame.')

    # Reject empty input before any column lookup.
    if data.num_rows() == 0:
        raise Exception("input data cannot be empty")

    # Resolve the image column when the caller did not name one.
    if image_column is None:
        image_column = _tkutl._find_only_image_column(data)

    if image_column is None:
        raise ValueError("'image_column' cannot be 'None'")

    if not isinstance(image_column, str):
        raise TypeError("'image_column' has to be of type 'str'")

    if annotation_column is None:
        annotation_column = ""

    if not isinstance(annotation_column, str):
        raise TypeError("'annotation_column' has to be of type 'str'")

    # The selected column must actually hold images.
    if data[image_column].dtype != __tc.data_structures.image.Image:
        raise TypeError("'data[image_column]' must be of type Image")

    annotation_window = __tc.extensions.create_image_classification_annotation(
        data, [image_column], annotation_column)

    # Progress output is suppressed while the annotation GUI is running.
    with _QuietProgress(False):
        annotation_window.annotate(_get_client_app_path())
        return annotation_window.returnAnnotations()
Ejemplo n.º 3
0
    def predict(self, dataset, output_type='cluster_id', verbose=True):
        """
        Return predicted cluster label for instances in the new 'dataset'.
        K-means predictions are made by assigning each new instance to the
        closest cluster center.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include the features used for
            model training; additional columns are ignored.

        output_type : {'cluster_id', 'distance'}, optional
            Form of the prediction. 'cluster_id' (the default) returns the
            cluster label assigned to each input instance, while 'distance'
            returns the Euclidean distance between the instance and its
            assigned cluster's center.

        verbose : bool, optional
            If True, print progress updates to the screen.

        Returns
        -------
        out : SArray
            Model predictions. Depending on the specified `output_type`, either
            the assigned cluster label or the distance of each point to its
            closest cluster center. The order of the predictions is the same as
            order of the input data rows.

        Raises
        ------
        TypeError
            If `output_type` is not a string.

        ValueError
            If `output_type` is neither 'cluster_id' nor 'distance'.

        The dataset validation helpers may raise further errors of their own.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = turicreate.SFrame({
        ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
        ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
        ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
        ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
        ...
        >>> model = turicreate.kmeans.create(sf, num_clusters=3)
        ...
        >>> sf_new = turicreate.SFrame({'x1': [-5.6584, -1.0167, -9.6181],
        ...                           'x2': [-6.3803, -3.7937, -1.1022]})
        >>> clusters = model.predict(sf_new, output_type='cluster_id')
        >>> print(clusters)
        [1, 0, 1]
        """

        ## Validate the input dataset.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Validate the output type.
        if not isinstance(output_type, str):
            raise TypeError("The 'output_type' parameter must be a string.")

        if output_type not in ('cluster_id', 'distance'):
            # The message names the values that are actually accepted; it
            # previously advertised a non-existent 'cluster_label' option.
            raise ValueError("The 'output_type' parameter must be either " +
                             "'cluster_id' or 'distance'.")

        ## Get model features.
        ref_features = self.features
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Compute predictions.
        opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': sf_features
        }

        with _QuietProgress(verbose):
            result = _tc.extensions._kmeans.predict(opts)

        sf_result = result['predictions']

        # `output_type` was validated above, so it is exactly the name of the
        # prediction column to return.
        return sf_result[output_type]