コード例 #1
0
    def tag(self,
            dataset,
            query_name=None,
            k=5,
            similarity_threshold=None,
            exclude_zeros=True,
            verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged. Must be a non-empty SFrame.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. Must be between 0 and 1
            (inclusive); it is passed to the nearest neighbors query as a
            radius of ``1 - similarity_threshold``. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - ``<query column>_id``: row number of the query in ``dataset``
            - the query column (named as the `query_name` parameter to the
              `tag` method, or the only column of ``dataset``)
            - the tag column (named as the `tag_name` parameter to the
              `create` method)
            - ``score``: a similarity score between 0 and 1, indicating the
              strength of the match between the query data and the suggested
              reference tag, where a score of zero indicates a poor match and
              a strength of 1 corresponds to a perfect match

        Raises
        ------
        ToolkitError
            If neither ``k`` nor ``similarity_threshold`` is set, if
            ``query_name`` is missing for a multi-column ``dataset``, if the
            query column does not exist in ``dataset``, or if
            ``similarity_threshold`` is not a number between 0 and 1.

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # A threshold of 0 is a legitimate value, so test it against None
        # instead of relying on truthiness (which would treat 0 as "unset").
        has_threshold = similarity_threshold is not None

        # ensure that either k or similarity_threshold is set
        if not (k or has_threshold):
            raise _ToolkitError("Either k or similarity_threshold parameters "
                                "must be set")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on "
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that the query column exists
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset'
                                % query_column)

        # validate the threshold before doing any expensive preprocessing
        if has_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a "
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be "
                                    "between 0 and 1.")

        # similarity = 1 - distance, so a minimum similarity is a maximum radius
        radius = (1 - similarity_threshold) if has_threshold else None

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({
            "id": range(len(query_sa)),
            query_column: query_sa
        })

        features = _preprocess(query_sa)
        features = features.add_row_number()

        results = self.__proxy__['nearest_neighbors_model'].query(
            features, label="id", k=k, radius=radius, verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({
                query_column + "_id": [],
                query_column: [],
                self.get("tag_name"): [],
                "score": []
            })

        # attach the original query text; "query_label" temporarily holds the
        # text because the score conversion below works on the NN result names
        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id"})
        results.rename({query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({
            "reference_label": self.get("tag_name"),
            "query_label": query_column
        })
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError:  # nothing to join
                _logging.getLogger(__name__).warning(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results
コード例 #2
0
    def tag(self, dataset, query_name=None, k=5, similarity_threshold=None,
            exclude_zeros=True, verbose=True):
        """
        Match the reference tags passed when a model is created to a new set of
        queries. This is a many-to-many match: each query may have any number of
        occurrences of a reference tag.

        Parameters
        ----------
        dataset : SFrame
            Query data to be tagged. Must be a non-empty SFrame.

        query_name : string, optional
            Name of the column in ``dataset`` to be auto-tagged. If ``dataset``
            has more than one column, ``query_name`` must be specified.

        k : int, optional
            Number of results to return from the reference set for each query
            observation. The default is 5, but setting it to ``None`` will
            return all results whose score is greater than or equal to
            ``similarity_threshold``.

        similarity_threshold : float, optional
            Only results whose score is greater than or equal to the specified
            ``similarity_threshold`` are returned. Must be between 0 and 1
            (inclusive); it is passed to the nearest neighbors query as a
            radius of ``1 - similarity_threshold``. The default is ``None``, in
            which case the ``k`` best results are returned for each query point.

        exclude_zeros : boolean, optional
            If True, only entries for which there is a tag with a nonzero score
            are preserved in the output. This is the default behavior.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with four columns:

            - ``<query column>_id``: row number of the query in ``dataset``
            - the query column (named as the `query_name` parameter to the
              `tag` method, or the only column of ``dataset``)
            - the tag column (named as the `tag_name` parameter to the
              `create` method)
            - ``score``: a similarity score between 0 and 1, indicating the
              strength of the match between the query data and the suggested
              reference tag, where a score of zero indicates a poor match and
              a strength of 1 corresponds to a perfect match

        Raises
        ------
        ToolkitError
            If neither ``k`` nor ``similarity_threshold`` is set, if
            ``query_name`` is missing for a multi-column ``dataset``, if the
            query column does not exist in ``dataset``, or if
            ``similarity_threshold`` is not a number between 0 and 1.

        Notes
        -----
        - By default, only rows for which there is a tag with a nonzero score
          are included in the output. To guarantee at least one output row for
          every input row in ``dataset``, set the ``exclude_zeros`` parameter
          to False.

        - If both ``k`` and ``similarity_threshold`` are set to ``None``, a
          ToolkitError is raised.

        Examples
        --------
        First construct a toy `SFrame` of actor names, which will serve as the
        reference set for our autotagger model.

        >>> actors_sf = gl.SFrame(
                {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                           "Tom Cruise", "Jude Law", "Robert Pattinson",
                           "Matt Damon", "Brad Pitt", "Johnny Depp",
                           "Leonardo DiCaprio", "Jennifer Aniston",
                           "Jessica Alba", "Emma Stone", "Cameron Diaz",
                           "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                           "Charlize Theron", "Marion Cotillard",
                           "Angelina Jolie"]})
        >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor")

        Then we load some IMDB movie reviews into an `SFrame` and tag them using
        the model we created above. The score field in the output is a
        similarity score, indicating the strength of the match between the query
        data and the suggested reference tag.

        >>> reviews_sf = gl.SFrame(
                "s3://dato-datasets/imdb_reviews/reviews.sframe")
        >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
        +-----------+-------------------------------+------------------+-----------------+
        | review_id |             review            |      actor       |      score      |
        +-----------+-------------------------------+------------------+-----------------+
        |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
        |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
        |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
        |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
        |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
        |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
        |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
        |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
        |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
        |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
        |    ...    |              ...              |       ...        |       ...       |
        +-----------+-------------------------------+------------------+-----------------+

        The initial results look a little noisy. To filter out obvious spurious
        matches, we can set the `tag` method's `similarity_threshold` parameter.

        >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
                  similarity_threshold=.8)
        +-----------+-------------------------------+------------------+----------------+
        | review_id |             review            |      actor       |     score      |
        +-----------+-------------------------------+------------------+----------------+
        |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
        |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
        |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
        |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
        +-----------+-------------------------------+------------------+----------------+

        """
        _mt._get_metric_tracker().track(self.__module__ + '.tag')

        # validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # A threshold of 0 is a legitimate value, so test it against None
        # instead of relying on truthiness (which would treat 0 as "unset").
        has_threshold = similarity_threshold is not None

        # ensure that either k or similarity_threshold is set
        if not (k or has_threshold):
            raise _ToolkitError("Either k or similarity_threshold parameters "
                                "must be set")

        # ensure that query_name is provided if dataset has > 1 column
        if dataset.num_cols() > 1 and not query_name:
            raise _ToolkitError("No query_name parameter specified on "
                                "dataset with %d columns" % dataset.num_cols())

        query_column = query_name or dataset.column_names()[0]

        # ensure that the query column exists
        if query_column not in dataset.column_names():
            raise _ToolkitError('No column named "%s" in dataset'
                                % query_column)

        # validate the threshold before doing any expensive preprocessing
        if has_threshold:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be a "
                                    "float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be "
                                    "between 0 and 1.")

        # similarity = 1 - distance, so a minimum similarity is a maximum radius
        radius = (1 - similarity_threshold) if has_threshold else None

        query_sa = dataset.select_column(query_column)
        query_sf = _gl.SFrame({"id": range(len(query_sa)),
                               query_column: query_sa})

        features = _preprocess(query_sa)
        features = features.add_row_number()

        results = self._nn_model.query(features, label="id", k=k,
                                       radius=radius,
                                       verbose=verbose)

        # return empty SFrame immediately if no NN results
        if len(results) == 0:
            return _gl.SFrame({query_column + "_id": [],
                               query_column: [],
                               self.get("tag_name"): [],
                               "score": []})

        # attach the original query text; "query_label" temporarily holds the
        # text because the score conversion below works on the NN result names
        results = results.join(query_sf, on={"query_label": "id"})
        results.rename({"query_label": query_column + "_id",
                        query_column: "query_label"})

        # convert distances to similarity scores
        scores = _dists_to_sim_scores("weighted_jaccard", results)

        results.add_column(scores, "score")
        results.remove_column("distance")
        results.remove_column("rank")
        results.rename({"reference_label": self.get("tag_name"),
                        "query_label": query_column})
        results.swap_columns(self.get("tag_name"), query_column)

        if exclude_zeros:
            try:
                results = results.filter_by(0.0, "score", exclude=True)
            except RuntimeError:  # nothing to join
                _logging.getLogger(__name__).warning(
                    "Empty results after filtering scores of 0.")
                results = results.head(0)

        return results
コード例 #3
0
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a :class:`NearestNeighborAutoTagger`
    model, which can be used to quickly apply tags from a reference set of text
    labels to a new query set using the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    Raises
    ------
    ToolkitError
        If ``tag_name`` is missing for a multi-column ``dataset``, if the
        ``tag_name`` column does not exist, or if a column listed in
        ``features`` is not found in ``dataset``.

    TypeError
        If the ``tag_name`` column is not of type string, if ``features`` is
        not a list of strings, or if a feature column is neither of type
        string nor of type list.

    See Also
    --------
    graphlab.nearest_neighbors.NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "https://static.turi.com/datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # Accept both byte and unicode strings as column names where the
    # interpreter distinguishes them (Python 2); plain ``str`` otherwise.
    try:
        string_types = (basestring,)
    except NameError:
        string_types = (str,)

    # if additional features are specified, ensure they are a list of strings;
    # the check must reject anything that is NOT (a list AND all strings)
    if features and not (isinstance(features, list) and
                         all(isinstance(x, string_types) for x in features)):
        raise TypeError("The feature parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features;
    # the tag column always comes first and is never duplicated
    feature_names = [tag_name] + [x for x in (features or [])
                                  if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in feature_names:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(feature_names)
    feature_col, features_sf = _concat_string_features(features_sf,
                                                       feature_names)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    ref_features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = ref_features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    ref_features.add_column(dataset[tag_name], tag_name)
    ref_features = ref_features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(ref_features,
                                     label=tag_name,
                                     distance=distance,
                                     features=feature_cols,
                                     verbose=verbose)

    # add standard toolkit state attributes
    state = {
        "nearest_neighbors_model": m,
        "training_time": m.get("training_time"),
        "tag_name": tag_name,
        "verbose": verbose,
        "num_examples": len(ref_features),
        "features": feature_cols,
        "num_features": len(feature_cols),
        "distance": m.get("distance")
    }

    model = NearestNeighborAutoTagger(state)
    return model
コード例 #4
0
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly apply
    tags from a reference set of text labels to a new query set using the
    ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    Raises
    ------
    ToolkitError
        If ``tag_name`` is omitted for a ``dataset`` with more than one
        column, if ``tag_name`` is not a column of ``dataset``, or if any
        name in ``features`` is not a column of ``dataset``.

    TypeError
        If the ``tag_name`` column is not of type string, if ``features`` is
        not a list of strings, or if any feature column is not of type str or
        list.

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a
    similarity score, indicating the strength of the match between the query
    data and the suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the specified
    similarity threshold (.8) were excluded from the output.

    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    dataset_columns = dataset.column_names()

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset_columns[0]

    # ensure that column with name tag_name exists
    if tag_name not in dataset_columns:
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use reasonable default for general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate
    # types: reject anything that is not a list, and any list that holds a
    # non-string entry. The isinstance check short-circuits so that
    # non-iterable inputs are reported with this message as well.
    if features and (not isinstance(features, list) or
                     not all(isinstance(x, str) for x in features)):
        raise TypeError("The feature parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features; the tag
    # column always comes first and is never duplicated
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset_columns)
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")

    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique; the feature column
    # names are captured before the tag column is attached so that the tag
    # itself is used only as the label, not as a nearest-neighbors feature
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features, label=tag_name, distance=distance,
        features=feature_cols, verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()

    return model