コード例 #1
0
ファイル: object_detector.py プロジェクト: uthens/turicreate
def _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_annotations):
    """
    Validate that `dataset` can be used as object-detection input.

    Parameters
    ----------
    dataset : SFrame
        Data to validate.

    feature : string
        Name of the column that must exist and have dtype `_tc.Image`.

    annotations : string
        Name of the annotations column. Only inspected when
        `require_annotations` is true.

    require_annotations : bool
        When true, the annotations column must exist and have dtype
        `list` or `dict`.

    Raises
    ------
    ToolkitError
        If a required column is missing or has an unsupported dtype.
        Whatever `_raise_error_if_not_sframe` raises also propagates.
    """
    # The second argument is presumably the parameter name that the helper
    # reports to the user, so it has to match this function's `dataset`.
    _raise_error_if_not_sframe(dataset, 'dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Feature column '%s' does not exist" % feature)
    if dataset[feature].dtype != _tc.Image:
        raise _ToolkitError("Feature column must contain images")

    if require_annotations:
        if annotations not in dataset.column_names():
            raise _ToolkitError("Annotations column '%s' does not exist" % annotations)
        if dataset[annotations].dtype not in [list, dict]:
            raise _ToolkitError("Annotations column must be of type dict or list")
コード例 #2
0
ファイル: _mxnet_utils.py プロジェクト: zoecarver/turicreate
def assert_valid_num_gpus():
    """
    Check the configured GPU count against what this platform supports.

    The count is read from `_tc_config.get_num_gpus()`. On macOS without
    CUDA GPUs, one GPU requires macOS 10.14 or newer and two or more GPUs
    are rejected. The count is finally passed to
    `_numeric_param_check_range` with the bounds -1 and `_six.MAXSIZE`.

    Raises
    ------
    ToolkitError
        If the configured count is not supported on this Mac.
    """
    from turicreate.util import _CUDA_GPUS

    gpu_count = _tc_config.get_num_gpus()
    mac_without_cuda = (not _CUDA_GPUS) and _sys.platform == 'darwin'

    if mac_without_cuda:
        if gpu_count >= 2:
            raise _ToolkitError(
                'Using more than one GPU is currently not supported on Mac')
        # The OS version is only queried when exactly one GPU is requested.
        if gpu_count == 1 and _mac_ver() < (10, 14):
            raise _ToolkitError(
                'GPU acceleration requires at least macOS 10.14')

    _numeric_param_check_range('num_gpus', gpu_count, -1, _six.MAXSIZE)
コード例 #3
0
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Split an SFrame in two so that every session lands entirely in one part.

    The unique values of the `session_id` column are split randomly; the
    first returned SFrame holds the rows of the sessions chosen for the
    `fraction` part, the second one holds the rows of all other sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        Name of the column in `dataset` that holds the unique identifier
        of each session.

    fraction : float, optional
        Fraction of the sessions to put into the first returned SFrame.
        Must be a float between 0 and 1.

    seed : int, optional
        Seed for the random number generator used to split.

    Returns
    -------
    out : tuple [SFrame]
        The two parts of `dataset`.

    Raises
    ------
    ToolkitError
        If `dataset` has no column named `session_id`.

    Examples
    --------

    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """

    # Argument type checks, performed in declaration order.
    type_checks = (
        (dataset, _SFrame, 'dataset'),
        (session_id, str, 'session_id'),
        (fraction, float, 'fraction'),
        (seed, [int, type(None)], 'seed'),
    )
    for value, expected_type, arg_name in type_checks:
        _raise_error_if_not_of_type(value, expected_type, arg_name)
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    # Split the distinct session ids, then pull the matching rows back out
    # of the full dataset.
    session_table = _SFrame({'session': dataset[session_id].unique()})
    picked, remaining = session_table.random_split(fraction, seed)
    first_part = dataset.filter_by(picked['session'], session_id)
    second_part = dataset.filter_by(remaining['session'], session_id)
    return first_part, second_part
コード例 #4
0
    def _load_version(cls, state, version):
        """
        Rebuild an ImageSimilarityModel from previously saved state.

        Parameters
        ----------
        state : dict
            Saved model state (presumably a dict; it is indexed by string
            keys). Its "similarity_model", "model", "feature_extractor" and
            "input_image_shape" entries are rewritten in place.

        version : int
            Version number of the saved model; checked against
            `cls._PYTHON_IMAGE_SIMILARITY_VERSION`.

        Returns
        -------
        out : ImageSimilarityModel
            Model constructed from the updated `state`.

        Raises
        ------
        ToolkitError
            If the model uses VisionFeaturePrint_Scene and the macOS version
            is below 10.14.
        """
        _tkutl._model_version_check(
            version, cls._PYTHON_IMAGE_SIMILARITY_VERSION)
        from turicreate.toolkits.nearest_neighbors import NearestNeighborsModel

        saved_neighbors = state["similarity_model"]
        state["similarity_model"] = NearestNeighborsModel(saved_neighbors)

        # Older models were saved with a misspelled model name; normalize it.
        if state["model"] == "VisionFeaturePrint_Screen":
            state["model"] = "VisionFeaturePrint_Scene"

        model_name = state["model"]
        if model_name == "VisionFeaturePrint_Scene":
            if _mac_ver() < (10, 14):
                raise _ToolkitError(
                    "Can not load model on this operating system. "
                    "This model uses VisionFeaturePrint_Scene, "
                    "which is only supported on macOS 10.14 and higher.")

        extractor = _image_feature_extractor._create_feature_extractor(
            model_name)
        state["feature_extractor"] = extractor
        state["input_image_shape"] = tuple(
            int(dim) for dim in state["input_image_shape"])
        return ImageSimilarityModel(state)
コード例 #5
0
def draw_strokes(stroke_based_drawings):
    """
    Turn stroke-based drawings into images for visualization.

    Parameters
    ----------
    stroke_based_drawings: SArray or list
        Either an `SArray` of dtype `list` with one drawing per element, or
        a plain `list` holding a single drawing. A drawing is a list of
        strokes, a stroke is a list of points, and a point is a dictionary
        with the two keys "x" and "y".

    Returns
    -------
    drawings: SArray or _tc.Image
        The "drawings" column produced by
        `_extensions._drawing_classifier_prepare_data`; only its first
        element when a single drawing was passed as a list. According to
        the original documentation each drawing becomes a 28x28 grayscale
        image.

    Raises
    ------
    ToolkitError
        If the input is neither an SArray nor a list, or is an SArray whose
        dtype is not `list`.
    """
    given_sarray = isinstance(stroke_based_drawings, _tc.SArray)
    given_single = isinstance(stroke_based_drawings, list)

    if not (given_sarray or given_single):
        raise _ToolkitError(
            "Input to draw_strokes must be of type "
            "turicreate.SArray or list (for a single stroke-based drawing)")
    if given_sarray and stroke_based_drawings.dtype != list:
        raise _ToolkitError(
            'SArray input to draw_strokes must have dtype '
            'list. Each element in the SArray should be a list of strokes, '
            'where each stroke is a list of points, '
            'and each point is represented as a dictionary '
            'with two keys, "x" and "y".')

    # A single drawing is wrapped so both input kinds share one code path.
    if given_single:
        drawings = _tc.SArray([stroke_based_drawings])
    else:
        drawings = stroke_based_drawings

    prepared = _extensions._drawing_classifier_prepare_data(
        _tc.SFrame({"drawings": drawings}), "drawings")
    rendered = prepared["drawings"]
    return rendered[0] if given_single else rendered
コード例 #6
0
def _validate_num_clusters(num_clusters, initial_centers, num_rows):
    """
    Validate the combination of the `num_clusters` and `initial_centers`
    parameters in the Kmeans model create function. If the combination is
    valid, determine and return the correct number of clusters.

    Parameters
    ----------
    num_clusters : int
        Specified number of clusters.

    initial_centers : SFrame
        Specified initial cluster center locations, in SFrame form. If the
        number of rows in this SFrame does not match `num_clusters`, there is a
        problem.

    num_rows : int
        Number of rows in the input dataset.

    Returns
    -------
    _num_clusters : int
        The correct number of clusters to use going forward
    """

    ## Basic validation
    if num_clusters is not None and not isinstance(num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    ## Determine the correct number of clusters.
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +
                             "'num_clusters' or 'initial_centers'. You must " +
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters

    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError(
                    "The value of 'num_clusters' does not match " +
                    "the number of provided initial centers. " +
                    "Please provide only one of these arguments " +
                    "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if _num_clusters > num_rows:
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    return _num_clusters
コード例 #7
0
ファイル: evaluation.py プロジェクト: zxybdfz/turicreate
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Basic input checks shared by the evaluation metrics.

    Both inputs are passed through `_raise_error_if_not_sarray`, then their
    lengths are compared.

    Raises
    ------
    ToolkitError
        If `targets` and `predictions` differ in length.
    """
    for values, arg_name in ((targets, "targets"), (predictions, "predictions")):
        _raise_error_if_not_sarray(values, arg_name)

    if len(targets) == len(predictions):
        return
    raise _ToolkitError(
        "Input SArrays 'targets' and 'predictions' must be of the same length.")
コード例 #8
0
ファイル: evaluation.py プロジェクト: zxybdfz/turicreate
def _check_index_map(index_map):
    if index_map is None:
        return

    if not isinstance(index_map, dict):
        raise TypeError("Input `index_map` must be a dict mapping target label to prediction-vector index.")

    indices = [v for k,v in index_map.items()]
    indices.sort()
    if indices != list(range(len(index_map))):
        raise _ToolkitError("Invalid index_map: each target label must map to a distinct index into the prediction vector.")
コード例 #9
0
    def _style_input_check(self, style):
        set_of_all_idx = self._style_indices()
        scalar = False
        if isinstance(style, (list, tuple)):
            if len(style) == 0:
                raise _ToolkitError("the `style` list cannot be empty")
            elif set(style).issubset(set_of_all_idx):
                pass
            else:
                raise _ToolkitError("the `style` variable cannot be parsed")
        elif isinstance(style, _six.integer_types):
            scalar = True
            if style in set_of_all_idx:
                style = [style]
            else:
                raise _ToolkitError("the `style` variable cannot be parsed")
        elif style is None:
            style = list(set_of_all_idx)
        else:
            raise _ToolkitError("the `style` variable cannot be parsed")

        return style, scalar
コード例 #10
0
 def is_valid(ann):
     """
     Decide whether the annotation `ann` should be kept.

     Returns False for annotations whose 'type' is present and is not
     'rectangle'. For the remaining annotations, returns whether the label
     is a key of `self.class_to_index` (`self` comes from the enclosing
     scope).

     Raises
     ------
     ToolkitError
         If a rectangle annotation lacks a 'label', or its 'coordinates' is
         not a dict with exactly the keys 'x', 'y', 'width' and 'height'.
     """
     # Non-rectangle annotations are not cared about here: they are
     # rejected without going through the stricter format checks.
     if 'type' in ann and ann['type'] != 'rectangle':
         return False

     expected_keys = {'x', 'y', 'width', 'height'}
     has_required = False
     if 'coordinates' in ann:
         coords = ann['coordinates']
         has_required = (isinstance(coords, dict) and
                         set(coords.keys()) == expected_keys and
                         'label' in ann)
     if not has_required:
         raise _ToolkitError("Detected an bounding box annotation with improper format: {}".format(ann))

     return ann['label'] in self.class_to_index
コード例 #11
0
def _raise_error_if_not_drawing_classifier_input_sframe(
    dataset, feature, target):
    """
    Sanity-check the SFrame given to `turicreate.drawing_classifier.create`.

    Checks, in order: `dataset` is an SFrame, the `feature` and `target`
    columns exist, the feature column has dtype `_tc.Image` or `list`, the
    target column has dtype `int` or `str`, and the dataset is not empty.

    Raises
    ------
    ToolkitError
        On the first check that fails (the SFrame check raises whatever
        `_raise_error_if_not_sframe` raises).
    """
    from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe
    _raise_error_if_not_sframe(dataset)

    columns = dataset.column_names()
    if feature not in columns:
        raise _ToolkitError("Feature column '%s' does not exist" % feature)
    if target not in columns:
        raise _ToolkitError("Target column '%s' does not exist" % target)

    feature_dtype = dataset[feature].dtype
    if feature_dtype != _tc.Image and feature_dtype != list:
        raise _ToolkitError(
            "Feature column must contain images"
            " or stroke-based drawings encoded as lists of strokes"
            " where each stroke is a list of points and"
            " each point is stored as a dictionary")

    target_dtype = dataset[target].dtype
    if target_dtype != int and target_dtype != str:
        raise _ToolkitError(
            "Target column contains " + str(target_dtype)
            + " but it must contain strings or integers to represent"
              " labels for drawings.")

    if len(dataset) == 0:
        raise _ToolkitError("Input Dataset is empty!")
コード例 #12
0
ファイル: image_analysis.py プロジェクト: chrinide/turicreate
def _is_image_deep_feature_sarray(feature_sarray, model_name):
    """
    Finds if the given `SArray` has extracted features for a given model_name.
    """
    from array import array

    if not (len(feature_sarray) > 0):
        return False
    if feature_sarray.dtype != array:
        return False
    if type(feature_sarray[0]) != array:
        return False
    if len(feature_sarray[0]) != MODEL_TO_FEATURE_SIZE_MAPPING[model_name]:
        raise _ToolkitError(
            "The given deep features are for a model other than {model_name}.".
            format(model_name=model_name))
    return True
コード例 #13
0
 def _extract_features(self, dataset, verbose=False, batch_size=64):
     """
     Build an SFrame with a single "__image_features__" column for `dataset`.

     If the `self.feature` column already holds deep features for
     `self.model`, it is used directly. If it holds images, the features
     are computed with `self.feature_extractor.extract_features`.

     Parameters
     ----------
     dataset : SFrame
         Data containing the `self.feature` column.

     verbose : bool, optional
         Forwarded to the feature extractor.

     batch_size : int, optional
         Forwarded to the feature extractor.

     Returns
     -------
     out : SFrame
         SFrame with the column "__image_features__".

     Raises
     ------
     ToolkitError
         If the feature column holds neither images nor extracted features.
     """
     if image_analysis._is_image_deep_feature_sarray(
             dataset[self.feature], self.model):
         return _tc.SFrame({"__image_features__": dataset[self.feature]})
     elif dataset[self.feature].dtype is _tc.Image:
         return _tc.SFrame({
             "__image_features__":
             self.feature_extractor.extract_features(dataset,
                                                     self.feature,
                                                     verbose=verbose,
                                                     batch_size=batch_size)
         })
     else:
         # The column name lives on the instance; a bare `feature` is not
         # defined in this scope and would raise NameError instead of the
         # intended ToolkitError.
         raise _ToolkitError(
             'The "{feature}" column of the SFrame neither has the dataype image or extracted features array.'
             .format(feature=self.feature) +
             ' "Datasets" consists of columns with types: ' +
             ", ".join([x.__name__ for x in dataset.column_types()]) + ".")
コード例 #14
0
ファイル: image_analysis.py プロジェクト: chrinide/turicreate
def _find_only_image_extracted_features_column(sframe, model_name):
    """
    Return the name of the single `array.array` column in `sframe` that
    holds deep features for `model_name`.

    The column is located with `_tkutl._find_only_column_of_type` (which
    presumably raises when zero or several array columns exist) and then
    verified with `_is_image_deep_feature_sarray`.

    Raises
    ------
    ToolkitError
        If the located column does not hold deep features.
    """
    from array import array

    candidate = _tkutl._find_only_column_of_type(
        sframe,
        target_type=array,
        type_name="array",
        col_name="deep_features")

    if not _is_image_deep_feature_sarray(sframe[candidate], model_name):
        raise _ToolkitError(
            'No "{col_name}" column specified and no column with expected type "{type_name}" is found.'
            .format(col_name="deep_features", type_name="array"))
    return candidate
コード例 #15
0
ファイル: _error_handling.py プロジェクト: zxybdfz/turicreate
def check_one_shot_input(data, target, backgrounds):
    """
    Validate one-shot inputs and normalize `data` into an SFrame.

    Parameters
    ----------
    data : SFrame or Image
        An SFrame, or a single image that gets wrapped into a one-row
        SFrame with the columns "image" and "target".

    target : str
        Target value for a single image. For SFrame input it is passed to
        `_tkutl._raise_error_if_column_exists` and returned as the target
        column name.

    backgrounds : SArray or None
        Optional background images; must not be empty when given.

    Returns
    -------
    out : tuple
        `(dataset_to_augment, image_column_name, target_column_name)`.

    Raises
    ------
    TypeError
        If `backgrounds`, `target` or `data` has an unsupported type.

    ToolkitError
        If `backgrounds` is an empty SArray.
    """
    if backgrounds is not None:
        if not isinstance(backgrounds, _tc.SArray):
            raise TypeError("'backgrounds' must be None or an SArray.")
        if len(backgrounds) == 0:
            raise _ToolkitError('Unable to train with no background images')

    if not isinstance(target, str):
        raise TypeError("'target' must be of type string.")

    if isinstance(data, _tc.SFrame):
        _tkutl._raise_error_if_column_exists(data, target, "data", target)
        image_column = _tkutl._find_only_image_column(data)
        return data, image_column, target

    if isinstance(data, _tc.Image):
        # Wrap the single image into a one-row SFrame.
        image_column, target_column = "image", "target"
        wrapped = _tc.SFrame({image_column: [data],
                              target_column: [target]})
        return wrapped, image_column, target_column

    raise TypeError("'data' must be of type SFrame or Image.")
コード例 #16
0
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Split an SFrame in two so that every session lands entirely in one part.

    Each session is assigned to the first part with probability `fraction`,
    so the first part holds that fraction of the sessions in expectation
    and the second part holds the rest.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        Name of the column in `dataset` that holds the unique identifier
        of each session.

    fraction : float, optional
        Probability for a session to go into the first returned SFrame.
        Must be a float between 0 and 1.

    seed : int, optional
        Seed used when hashing the session ids. When None, a seed is
        derived from the current time.

    Returns
    -------
    out : tuple [SFrame]
        The two parts of `dataset`; row order is preserved in each.

    Raises
    ------
    ToolkitError
        If `dataset` has no column named `session_id`.

    Examples
    --------

    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    from random import Random

    # Argument type checks, performed in declaration order.
    type_checks = (
        (dataset, _SFrame, 'dataset'),
        (session_id, str, 'session_id'),
        (fraction, float, 'fraction'),
        (seed, [int, type(None)], 'seed'),
    )
    for value, expected_type, arg_name in type_checks:
        _raise_error_if_not_of_type(value, expected_type, arg_name)
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    if seed is None:
        # Derive a seed from the clock, including its fractional part.
        import time
        seed = abs(hash("%0.20f" % time.time())) % (2**31)

    # The cython bindings require an int seed, so cast if possible.
    try:
        seed = int(seed)
    except ValueError:
        raise ValueError("The 'seed' parameter must be of type int.")

    rng = Random()

    def _goes_to_first_split(session_hash):
        # Re-seeding with the session's hash gives every row of a session
        # the same draw, so a session is never divided between the parts.
        rng.seed(session_hash)
        return rng.uniform(0, 1) < fraction

    # Hashing with the global seed makes the split vary between runs while
    # staying a pure function of (session id, seed). A boolean filter keeps
    # row order, so no re-sorting within sessions is needed.
    in_first_split = dataset[session_id].hash(seed).apply(_goes_to_first_split)

    first_part = dataset[in_first_split]
    second_part = dataset[1 - in_first_split]
    return first_part, second_part
コード例 #17
0
ファイル: image_analysis.py プロジェクト: chrinide/turicreate
def get_deep_features(images, model_name, batch_size=64, verbose=True):
    """
    Extract deep features from images with a pretrained model.

    Parameters
    ----------
    images : SArray
        Input data.

    model_name : string
        Name of the pretrained model used as feature extractor:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    batch_size : int, optional
        Forwarded to the feature extractor. Must be at least 1.

    verbose : bool, optional
        Forwarded to the feature extractor.

    Returns
    -------
    out : SArray
        Returns an SArray with all the extracted features.

    Raises
    ------
    TypeError
        If `images` is not an SArray.

    ToolkitError
        If `images` is empty.

    ValueError
        If `batch_size` is smaller than 1.

    See Also
    --------
    turicreate.image_classifier.create
    turicreate.image_similarity.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/images/nested'
    >>> image_sframe = turicreate.load_images(url)
    >>> image_sarray = image_sframe["image"]
    >>> deep_features_sframe = turicreate.image_analysis.get_deep_features(image_sarray, model_name="resnet-50")
    """

    # Validate the model name; the OS-provided extractor is only offered
    # on macOS 10.14 and newer.
    supported_models = list(_pre_trained_models.IMAGE_MODELS.keys())
    if _mac_ver() >= (10, 14):
        supported_models.append("VisionFeaturePrint_Scene")
    _tkutl._check_categorical_option_type("model", model_name, supported_models)

    # Validate the images.
    if not isinstance(images, _tc.SArray):
        raise TypeError(
            "Unrecognized type for 'images'. An SArray is expected.")
    if len(images) == 0:
        raise _ToolkitError(
            "Unable to extract features on an empty SArray object")

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # The extractor works on an SFrame column, so wrap the SArray.
    extractor = _image_feature_extractor._create_feature_extractor(
        model_name)
    image_column = "image"
    wrapped_images = _tc.SFrame({image_column: images})
    return extractor.extract_features(wrapped_images,
                                      image_column,
                                      verbose=verbose,
                                      batch_size=batch_size)
コード例 #18
0
def assert_valid_num_gpus():
    """
    Check the configured GPU count against what this platform supports.

    The count is read from `_tc_config.get_num_gpus()`. On macOS without
    CUDA GPU ids any positive count is rejected. The count is finally
    passed to `_numeric_param_check_range` with the bounds -1 and
    `_six.MAXSIZE`.

    Raises
    ------
    ToolkitError
        If GPUs are requested on a Mac without CUDA GPUs.
    """
    from turicreate.util import _CUDA_GPU_IDS

    gpu_count = _tc_config.get_num_gpus()
    mac_without_cuda = (not _CUDA_GPU_IDS) and _sys.platform == 'darwin'
    if mac_without_cuda and gpu_count > 0:
        raise _ToolkitError('Using GPUs is currently not supported on Mac')

    _numeric_param_check_range('num_gpus', gpu_count, -1, _six.MAXSIZE)
コード例 #19
0
def create(dataset, target, feature, max_iterations=10,
           custom_layer_sizes=[100, 100], verbose=True,
           validation_set='auto', batch_size=64):
    '''
    Creates a :class:`SoundClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string
        Name of the column containing the feature column. This column must
        contain audio data or deep audio features.
        Audio data is represented as dicts with key 'data' and 'sample_rate',
        see `turicreate.load_audio(...)`.
        Deep audio features are represented as a list of numpy arrays, each of
        size 12288, see `turicreate.sound_classifier.get_deep_features(...)`.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low. With 0 no training pass is run and the reported accuracies are
        None.

    custom_layer_sizes : list of ints
        Specifies the architecture of the custom neural network. This neural
        network is made up of a series of dense layers. This parameter allows
        you to specify how many layers and the number of units in each layer.
        The custom neural network will always have one more layer than the
        length of this list. The last layer is always a soft max with units
        equal to the number of classes.

    verbose : bool, optional
        If True, prints progress updates and model details.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. The
        format of this SFrame must be the same as the training dataset. By
        default ('auto'), 5 percent of the training data is sampled as
        validation set when the dataset has at least 100 rows. If
        `validation_set` is set to None, no validation is used. You can also
        pass a validation set you have constructed yourself.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : SoundClassifier
        The trained model.

    Raises
    ------
    ToolkitError
        If the dataset is empty, a column is missing, the feature column is
        not audio data, or `custom_layer_sizes` is not a non-empty list of
        integers.

    TypeError
        If `validation_set` is not an SFrame, 'auto' or None.

    ValueError
        If `validation_set` lacks the feature or target column, or
        `batch_size` is smaller than 1.
    '''
    import time
    import mxnet as mx

    from ._audio_feature_extractor import _get_feature_extractor

    start_time = time.time()

    # check parameters
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Audio feature column '%s' does not exist" % feature)
    if not _is_deep_feature_sarray(dataset[feature]) and not _is_audio_data_sarray(dataset[feature]):
        raise _ToolkitError("'%s' column is not audio data." % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)
    if not _tc.util._is_non_string_iterable(custom_layer_sizes) or len(custom_layer_sizes) == 0:
        raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.")
    for i in custom_layer_sizes:
        if not isinstance(i, int):
            raise _ToolkitError("'custom_layer_sizes' must contain only integers.")
    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'")
    if isinstance(validation_set, _tc.SFrame):
        if feature not in validation_set.column_names() or target not in validation_set.column_names():
            raise ValueError("The 'validation_set' SFrame must be in the same format as the 'dataset'")
    if batch_size < 1:
        raise ValueError('\'batch_size\' must be greater than or equal to 1')

    # Work on a private copy: the value is stored in the returned model's
    # state, and storing the caller's list (or the shared default list)
    # would let later mutations leak between models and calls.
    custom_layer_sizes = list(custom_layer_sizes)

    classes = list(dataset[target].unique().sort())
    num_labels = len(classes)
    feature_extractor_name = 'VGGish'
    feature_extractor = _get_feature_extractor(feature_extractor_name)
    class_label_to_id = {l: i for i, l in enumerate(classes)}

    # create the validation set
    if not isinstance(validation_set, _tc.SFrame) and validation_set == 'auto':
        if len(dataset) >= 100:
            print("Creating a validation set from 5 percent of training data. This may take a while.\n"
                  "\tYou can set ``validation_set=None`` to disable validation tracking.\n")
            dataset, validation_set = dataset.random_split(0.95, exact=True)
        else:
            validation_set = None

    encoded_target = dataset[target].apply(lambda x: class_label_to_id[x])

    if _is_deep_feature_sarray(dataset[feature]):
        train_deep_features = dataset[feature]
    else:
        # do the preprocess and VGGish feature extraction
        train_deep_features = get_deep_features(dataset[feature], verbose=verbose)

    # One row per deep-feature frame; rows whose stacked feature is missing
    # are split off and reported.
    train_data = _tc.SFrame({'deep features': train_deep_features, 'labels': encoded_target})
    train_data = train_data.stack('deep features', new_column_name='deep features')
    train_data, missing_ids = train_data.dropna_split(columns=['deep features'])

    if len(missing_ids) > 0:
        _logging.warning("Dropping %d examples which are less than 975ms in length." % len(missing_ids))

    if validation_set is not None:
        if verbose:
            print("Preparing validataion set")
        validation_encoded_target = validation_set[target].apply(lambda x: class_label_to_id[x])

        if _is_deep_feature_sarray(validation_set[feature]):
            validation_deep_features = validation_set[feature]
        else:
            validation_deep_features = get_deep_features(validation_set[feature], verbose=verbose)

        validation_data = _tc.SFrame({'deep features': validation_deep_features, 'labels': validation_encoded_target})
        validation_data = validation_data.stack('deep features', new_column_name='deep features')
        validation_data = validation_data.dropna(columns=['deep features'])

        validation_batch_size = min(len(validation_data), batch_size)
        validation_data = mx.io.NDArrayIter(validation_data['deep features'].to_numpy(),
                                            label=validation_data['labels'].to_numpy(),
                                            batch_size=validation_batch_size)
    else:
        # An empty list is falsy and iterates zero times, so the validation
        # steps below are skipped without extra branching.
        validation_data = []

    if verbose:
        print("\nTraining a custom neural network -")

    training_batch_size = min(len(train_data), batch_size)
    train_data = mx.io.NDArrayIter(train_data['deep features'].to_numpy(),
                                   label=train_data['labels'].to_numpy(),
                                   batch_size=training_batch_size, shuffle=True)

    custom_NN = SoundClassifier._build_custom_neural_network(feature_extractor.output_length, num_labels, custom_layer_sizes)
    ctx = _mxnet_utils.get_mxnet_context()
    custom_NN.initialize(mx.init.Xavier(), ctx=ctx)

    trainer = mx.gluon.Trainer(custom_NN.collect_params(), 'nag', {'learning_rate': 0.01, 'momentum': 0.9})

    if verbose:
        # Setup progress table
        row_ids = ['epoch', 'train_accuracy', 'time']
        row_display_names = ['Epoch', 'Training Accuracy (%)', 'Elapsed Time (seconds)']
        if validation_data:
            row_ids.insert(2, 'validation_accuracy')
            row_display_names.insert(2, 'Validation Accuracy (%)')
        table_printer = _tc.util._ProgressTablePrinter(row_ids, row_display_names)

    train_metric = mx.metric.Accuracy()
    if validation_data:
        validation_metric = mx.metric.Accuracy()
    softmax_cross_entropy_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()

    # Bound up front so that building `state` below does not fail with an
    # unbound local when the loop body never runs (max_iterations == 0).
    train_accuracy = None
    validation_accuracy = None

    for i in range(max_iterations):
        # TODO: early stopping

        for batch in train_data:
            data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False)

            # Inside training scope
            with mx.autograd.record():
                for x, y in zip(data, label):
                    z = custom_NN(x)
                    # Computes softmax cross entropy loss.
                    loss = softmax_cross_entropy_loss(z, y)
                    # Backpropagate the error for one iteration.
                    loss.backward()
            # Make one step of parameter update. Trainer needs to know the
            # batch size of data to normalize the gradient by 1/batch_size.
            trainer.step(batch.data[0].shape[0])
        train_data.reset()

        # Calculate training metric
        for batch in train_data:
            data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False)
            outputs = [custom_NN(x) for x in data]
            train_metric.update(label, outputs)
        train_data.reset()

        # Calculate validation metric
        for batch in validation_data:
            data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False)
            outputs = [custom_NN(x) for x in data]
            validation_metric.update(label, outputs)

        # Get metrics, print progress table
        _, train_accuracy = train_metric.get()
        train_metric.reset()
        printed_row_values = {'epoch': i, 'train_accuracy': train_accuracy}
        if validation_data:
            _, validation_accuracy = validation_metric.get()
            printed_row_values['validation_accuracy'] = validation_accuracy
            validation_metric.reset()
            validation_data.reset()
        if verbose:
            printed_row_values['time'] = time.time()-start_time
            table_printer.print_row(**printed_row_values)


    state = {
        '_class_label_to_id': class_label_to_id,
        '_custom_classifier': custom_NN,
        '_feature_extractor': feature_extractor,
        '_id_to_class_label': {v: k for k, v in class_label_to_id.items()},
        'classes': classes,
        'custom_layer_sizes': custom_layer_sizes,
        'feature': feature,
        'feature_extractor_name': feature_extractor.name,
        'num_classes': num_labels,
        'num_examples': len(dataset),
        'target': target,
        'training_accuracy': train_accuracy,
        'training_time': time.time() - start_time,
        'validation_accuracy': validation_accuracy if validation_data else None,
    }
    return SoundClassifier(state)
コード例 #20
0
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32, verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically split off from
        the training data by session (if the training data has at least 100
        sessions); the held-out sessions are not used for training. If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Raises
    ------
    ToolkitError
        If `target` or `session_id` is not a string.

    TypeError
        If `features` is not a list-like of strings, or is empty.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get topk predictions (instead of only top-1) if your labels have more
        # than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    from ._model_architecture import _net_params
    from ._model_architecture import _define_model, _fit_model
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(dataset,
                                              interpret_as_excluded=True,
                                              column_names=[session_id, target])
    # A bare string is iterable on Python 3, so reject it explicitly instead
    # of silently treating each character as a column name.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Materialize once so tuples/other iterables can be concatenated below.
    features = list(features)
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        # Report the first offending entry; the 1-tuple keeps %-formatting
        # safe even when the offending value is itself a tuple.
        raise TypeError("Invalid feature %s: Feature names must be of type str."
                        % (invalid_features[0],))
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int])

    # Carve out the automatic validation split BEFORE encoding and chunking
    # the training data. Doing it afterwards would leave the held-out sessions
    # inside the chunked training data and inflate the validation metrics.
    if isinstance(validation_set, str) and validation_set == 'auto':
        if len(dataset[session_id].unique()) < 100:
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(dataset, session_id)

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset, features, session_id, prediction_window,
                                            predictions_in_chunk, target=target, verbose=verbose)

    # Create data iterators
    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    user_provided_batch_size = batch_size
    # Never use a batch smaller than the number of devices it is spread over.
    batch_size = max(batch_size, num_gpus, 1)
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        # Keep only rows whose label was seen in the training data.
        validation_set = validation_set.filter_by(target_map.keys(), target)
        validation_set, _ = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(validation_set, features, session_id, prediction_window,
                                               predictions_in_chunk, target=target, verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                         prediction_window, predictions_in_chunk,
                                         batch_size, use_target=use_target)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)
    loss_model, pred_model = _define_model(features, target_map, prediction_window,
                                           predictions_in_chunk, context)

    # Train the model
    log = _fit_model(loss_model, data_iter, valid_iter,
                     max_iterations, num_gpus, verbose)

    # Set up prediction model: bind it for inference and copy over the
    # parameters learned by the training (loss) model.
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)
    arg_params, aux_params = loss_model.get_params()
    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size' : user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
コード例 #21
0
def create(dataset, annotations=None, feature=None, model='darknet-yolo',
           classes=None, max_iterations=0, verbose=True, **kwargs):
    """
    Create a :class:`ObjectDetector` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The columns named by the ``feature`` and ``annotations``
        parameters will be extracted for training the detector.

    annotations : string, optional
        Name of the column containing the object detection annotations.
        This column should be a list of dictionaries, with each dictionary
        representing a bounding box of an object instance. Here is an example
        of the annotations for a single image with two object instances::

            [{'label': 'dog',
              'type': 'rectangle',
              'coordinates': {'x': 223, 'y': 198,
                              'width': 130, 'height': 230}},
             {'label': 'cat',
              'type': 'rectangle',
              'coordinates': {'x': 40, 'y': 73,
                              'width': 80, 'height': 123}}]

        The value for `x` is the horizontal center of the box paired with
        `width` and `y` is the vertical center of the box paired with `height`.
        'None' (the default) indicates the only list column in `dataset` should
        be used for the annotations.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Object detection model to use:

           - "darknet-yolo" : Fast and medium-sized model

    classes : list, optional
        List of strings containing the names of the classes of objects.
        Inferred from the data if not provided.

    max_iterations : int, optional
        The number of training iterations. If 0, then it will be automatically
        determined based on the amount of data you provide.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Only the ``_advanced_parameters`` key is read by this function: a
        dictionary whose entries override the built-in training parameters.
        Keys that are not among the built-in parameters are rejected.

    Returns
    -------
    out : ObjectDetector
        A trained :class:`ObjectDetector` model.

    Raises
    ------
    ToolkitError
        If `dataset` is empty, if ``_advanced_parameters`` contains unknown
        keys, or if the annotations column cannot be stacked and unpacked
        into per-instance labels.

    See Also
    --------
    ObjectDetector

    Examples
    --------
    .. sourcecode:: python

        # Train an object detector model
        >>> model = turicreate.object_detector.create(data)

        # Make predictions on the training set and add them as a column to
        # the SFrame
        >>> data['predictions'] = model.predict(data)

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _raise_error_if_not_sframe(dataset, "dataset")
    from ._mx_detector import YOLOLoss as _YOLOLoss
    from ._model import tiny_darknet as _tiny_darknet
    from ._sframe_loader import SFrameDetectionIter as _SFrameDetectionIter
    from ._manual_scheduler import ManualScheduler as _ManualScheduler
    import mxnet as _mx
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')

    _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)
    # The reported training time includes all set-up below, not only the
    # optimization loop.
    start_time = _time.time()

    supported_detectors = ['darknet-yolo']

    # Infer the image / annotation columns when the caller did not name them.
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)
        if verbose:
            print("Using '%s' as feature column" % feature)
    if annotations is None:
        annotations = _tkutl._find_only_column_of_type(dataset,
                                                       target_type=list,
                                                       type_name='list',
                                                       col_name='annotations')
        if verbose:
            print("Using '%s' as annotations column" % annotations)

    _raise_error_if_not_detection_sframe(dataset, feature, annotations,
                                         require_annotations=True)

    _tkutl._check_categorical_option_type('model', model,
            supported_detectors)

    # The part of the model name before the first '-' selects the pre-trained
    # base model (e.g. 'darknet' for 'darknet-yolo').
    base_model = model.split('-', 1)[0]
    ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]()

    # Built-in training parameters; any of them can be overridden through
    # kwargs['_advanced_parameters'] (handled right below).
    params = {
        'anchors': [
            (1.0, 2.0), (1.0, 1.0), (2.0, 1.0),
            (2.0, 4.0), (2.0, 2.0), (4.0, 2.0),
            (4.0, 8.0), (4.0, 4.0), (8.0, 4.0),
            (8.0, 16.0), (8.0, 8.0), (16.0, 8.0),
            (16.0, 32.0), (16.0, 16.0), (32.0, 16.0),
        ],
        'grid_shape': [13, 13],
        'batch_size': 32,
        'aug_resize': 0,
        'aug_rand_crop': 0.9,
        'aug_rand_pad': 0.9,
        'aug_rand_gray': 0.0,
        'aug_aspect_ratio': 1.25,
        'aug_hue': 0.05,
        'aug_brightness': 0.05,
        'aug_saturation': 0.05,
        'aug_contrast': 0.05,
        'aug_horizontal_flip': True,
        'aug_min_object_covered': 0,
        'aug_min_eject_coverage': 0.5,
        'aug_area_range': (.15, 2),
        'aug_pca_noise': 0.0,
        'aug_max_attempts': 20,
        'aug_inter_method': 2,
        'lmb_coord_xy': 10.0,
        'lmb_coord_wh': 10.0,
        'lmb_obj': 100.0,
        'lmb_noobj': 5.0,
        'lmb_class': 2.0,
        'non_maximum_suppression_threshold': 0.45,
        'rescore': True,
        'clip_gradients': 0.025,
        'learning_rate': 1.0e-3,
        'shuffle': True,
    }

    if '_advanced_parameters' in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs['_advanced_parameters'].keys())
        set_keys = set(params.keys()) 
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported))

        params.update(kwargs['_advanced_parameters'])

    anchors = params['anchors']
    num_anchors = len(anchors)

    # Split the requested batch evenly across the devices in use (treating
    # "no GPU" as a single device).
    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=params['batch_size'])
    batch_size_each = params['batch_size'] // max(num_gpus, 1)
    # Note, this may slightly alter the batch size to fit evenly on the GPUs
    batch_size = max(num_gpus, 1) * batch_size_each

    # The input image size is the output grid size scaled by the reference
    # model's spatial_reduction, with 3 as the leading dimension.
    grid_shape = params['grid_shape']
    input_image_shape = (3,
                         grid_shape[0] * ref_model.spatial_reduction,
                         grid_shape[1] * ref_model.spatial_reduction)

    # Flatten the annotations to one row per bounding box, keeping only the
    # label; used to count instances and to infer the class list.
    try:
        instances = (dataset.stack(annotations, new_column_name='_bbox', drop_na=True)
                            .unpack('_bbox', limit=['label']))
    except (TypeError, RuntimeError):
        # If this fails, the annotation format is invalid at the coarsest level
        raise _ToolkitError("Annotations format is invalid. Must be a list of "
                            "dictionaries containing 'label' and 'coordinates'.")
    num_images = len(dataset)
    num_instances = len(instances)
    if classes is None:
        classes = instances['_bbox.label'].unique()
    # Sorting makes the class-to-index assignment deterministic.
    classes = sorted(classes)

    # Make a class-to-index look-up table
    class_to_index = {name: index for index, name in enumerate(classes)}
    num_classes = len(classes)

    # Create data loader
    loader = _SFrameDetectionIter(dataset,
                                  batch_size=batch_size,
                                  input_shape=input_image_shape[1:],
                                  output_shape=grid_shape,
                                  anchors=anchors,
                                  class_to_index=class_to_index,
                                  aug_params=params,
                                  shuffle=params['shuffle'],
                                  loader_type='augmented',
                                  feature_column=feature,
                                  annotations_column=annotations)

    # Predictions per anchor box: x/y + w/h + object confidence + class probs
    preds_per_box = 5 + num_classes
    output_size = preds_per_box * num_anchors
    # Per-device target shape: (batch, grid rows, grid cols, anchors, preds).
    ymap_shape = (batch_size_each,) + tuple(grid_shape) + (num_anchors, preds_per_box)

    net = _tiny_darknet(output_size=output_size)

    loss = _YOLOLoss(input_shape=input_image_shape[1:],
                     output_shape=grid_shape,
                     batch_size=batch_size_each,
                     num_classes=num_classes,
                     anchors=anchors,
                     parameters=params)

    base_lr = params['learning_rate']
    if max_iterations == 0:
        # Set number of iterations through a heuristic
        # (grows with the square root of the number of bounding boxes and is
        # rounded to a multiple of 1000, with a minimum of 1000).
        num_iterations_raw = 5000 * _np.sqrt(num_instances) / batch_size
        num_iterations = 1000 * max(1, int(round(num_iterations_raw / 1000)))
    else:
        num_iterations = max_iterations

    # Step schedule: factors 1, 0.1 and 0.01 paired with the steps at 1/2,
    # 3/4 and the full number of iterations.
    steps = [num_iterations // 2, 3 * num_iterations // 4, num_iterations]
    steps_and_factors = [(step, 10**(-i)) for i, step in enumerate(steps)]

    steps, factors = zip(*steps_and_factors)
    lr_scheduler = _ManualScheduler(step=steps, factor=factors)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)

    # Initialize all parameters, then re-initialize the last two conv layers
    # with their own schemes. NOTE(review): the order of these calls likely
    # matters for reproducibility of the random initial weights - confirm
    # before reordering.
    net_params = net.collect_params()
    net_params.initialize(_mx.init.Xavier(), ctx=ctx)
    net_params['conv7_weight'].initialize(_mx.init.Xavier(factor_type='avg'), ctx=ctx, force_reinit=True)
    net_params['conv8_weight'].initialize(_mx.init.Uniform(0.00005), ctx=ctx, force_reinit=True)
    # Initialize object confidence low, preventing an unnecessary adjustment
    # period toward conservative estimates
    bias = _np.zeros(output_size, dtype=_np.float32)
    # Index 4 within each group of preds_per_box values is the object
    # confidence (after x/y and w/h).
    bias[4::preds_per_box] -= 6
    from ._mx_detector import ConstantArray
    net_params['conv8_bias'].initialize(ConstantArray(bias), ctx, force_reinit=True)

    # Take a subset and then load the rest of the parameters. It is possible to
    # do allow_missing=True directly on net_params. However, this will more
    # easily hide bugs caused by names getting out of sync.
    ref_model.available_parameters_subset(net_params).load(ref_model.model_path, ctx)

    options = {'learning_rate': base_lr, 'lr_scheduler': lr_scheduler,
               'momentum': 0.9, 'wd': 0.00005, 'rescale_grad': 1.0}
    # A falsy 'clip_gradients' (0 or None) disables gradient clipping.
    clip_grad = params.get('clip_gradients')
    if clip_grad:
        options['clip_gradient'] = clip_grad

    trainer = _mx.gluon.Trainer(net.collect_params(), 'sgd', options)

    iteration = 0
    smoothed_loss = None
    # Starting at 0 makes the first progress line print on the first iteration.
    last_time = 0
    # Outer loop restarts the loader each epoch; the inner loop stops as soon
    # as the iteration budget is exhausted, possibly mid-epoch.
    while iteration < num_iterations:
        loader.reset()
        for batch in loader:
            # Shard the batch across devices along the batch axis.
            data = _mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = _mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)

            Ls = []
            with _mx.autograd.record():
                for x, y in zip(data, label):
                    z = net(x)
                    # Move axis 1 of the network output last and reshape to
                    # the per-device target layout expected by the loss.
                    z0 = _mx.nd.transpose(z, [0, 2, 3, 1]).reshape(ymap_shape)
                    L = loss(z0, y)
                    Ls.append(L)
                for L in Ls:
                    L.backward()

            # Average the per-device losses, then track an exponential moving
            # average (0.9 / 0.1) for progress reporting.
            cur_loss = _np.mean([L.asnumpy()[0] for L in Ls])
            if smoothed_loss is None:
                smoothed_loss = cur_loss
            else:
                smoothed_loss = 0.9 * smoothed_loss + 0.1 * cur_loss
            trainer.step(1)
            iteration += 1
            cur_time = _time.time()
            # Print progress at most once every 10 seconds.
            if verbose and cur_time > last_time + 10:
                print('{now:%Y-%m-%d %H:%M:%S}  Training {cur_iter:{width}d}/{num_iterations:{width}d}  Loss {loss:6.3f}'.format(
                    now=_datetime.now(), cur_iter=iteration, num_iterations=num_iterations,
                    loss=smoothed_loss, width=len(str(num_iterations))))
                last_time = cur_time
            if iteration == num_iterations:
                break

    training_time = _time.time() - start_time

    # Save the model
    state = {
        '_model': net,
        '_class_to_index': class_to_index,
        '_training_time_as_string': _seconds_as_string(training_time),
        '_grid_shape': grid_shape,
        'anchors': anchors,
        'model': model,
        'classes': classes,
        'batch_size': batch_size,
        'input_image_shape': input_image_shape,
        'feature': feature,
        'non_maximum_suppression_threshold': params['non_maximum_suppression_threshold'],
        'annotations': annotations,
        'num_classes': num_classes,
        'num_examples': num_images,
        'num_bounding_boxes': num_instances,
        'training_time': training_time,
        'training_epochs': loader.cur_epoch,
        'training_iterations': iteration,
        'max_iterations': max_iterations,
        'training_loss': smoothed_loss,
    }
    return ObjectDetector(state)
コード例 #22
0
def create(dataset,
           label=None,
           features=None,
           distance=None,
           method='auto',
           verbose=True,
           **kwargs):
    """
    Create a nearest neighbor model, which can be searched efficiently and
    quickly for the nearest neighbors of a query observation. If the `method`
    argument is specified as `auto`, the type of model is chosen automatically
    based on the type of data in `dataset`.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of Turi Create. Please use 'transformed_dot_product'
        distance instead, although note that this is more than a name change;
        it is a *different* transformation of the dot product of two vectors.
        Please see the distances module documentation for more details.

    Parameters
    ----------
    dataset : SFrame
        Reference data. If the features for each observation are numeric, they
        may be in separate columns of 'dataset' or a single column with lists
        of values. The features may also be in the form of a column of sparse
        vectors (i.e. dictionaries), with string keys and numeric values.

    label : string, optional
        Name of the SFrame column with row labels. If 'label' is not specified,
        row numbers are used to identify reference dataset rows when the model
        is queried.

    features : list[string], optional
        Name of the columns with features to use in computing distances between
        observations and the query points. 'None' (the default) indicates that
        all columns except the label should be used as features. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *List*: list of integer or string values. Each element is treated as
          a separate variable in the model.

        - *String*: string values.

        Please note: if a composite distance is also specified, this parameter
        is ignored.

    distance : string, function, or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of three types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~turicreate.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional
        Method for computing nearest neighbors. The options are:

        - *auto* (default): the method is chosen automatically, based on the
          type of data and the distance. If the distance is 'manhattan' or
          'euclidean' and the features are numeric or vectors of numeric
          values, then the 'ball_tree' method is used. Otherwise, the
          'brute_force' method is used.

        - *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than the
          brute force model, but queries are faster than linear time. This
          method is not applicable for the cosine and dot product distances.
          See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-p
          ractical-approximat e-nearest-neighbor-algorithms>`_ for
          implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data is
          held in the model, but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
          (deprecated), and 'transformed_dot_product' distances. Two options
          are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose: bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value is
          4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other
          distances. We recommend using number 2 ~ 6 for 'jaccard' distance, 8
          ~ 20 for 'cosine' distance and 4 ~ 12 for other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in 'dataset'
        of new query points.

    See Also
    --------
    NearestNeighborsModel.query, turicreate.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`turicreate.SFrame.fillna` and
      :func:`turicreate.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of Turi Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~turicreate.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize the
      features. One way to do this is to subtract the mean of each column and
      divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach
    that is designed to efficiently solve the *approximate* nearest neighbor
    search problem for high dimensional data. The key idea of LSH is to hash
    the data points using several hash functions, so that the probability of
    collision is much higher for data points which are close to each other than
    those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from the
    metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`.
      The algorithm constructs :math:`L` hash tables, each of which
      corresponds to a different randomly chosen hash function :math:`g`.
      There are :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as q.
      These data points from all the :math:`L` hash tables are considered as
      candidates that are then re-ranked by their real distances with the query
      data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are two main parameters. They can be set
    using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where
      :math:`a` is a vector, of which the elements are independently
      sampled from normal distribution, and :math:`b` is a number
      uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the
      bucket width. We set :math:`r` using the average all-pair `euclidean`
      distances from a small randomly sampled subset of the reference data.

    - `manhattan`: The hash function of `manhattan` is similar with that of
      `euclidean`. The only difference is that the elements of `a` are sampled
      from Cauchy distribution, instead of normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot
      q)`, where :math:`a` is randomly sampled normal unit vector.

    - `jaccard`: We use a recently proposed method one permutation hashing by
      Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and then the minimum `dot_product` distance search
      problem can be solved via finding the reference data with smallest
      `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015]
      <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details.

    References
    ----------
    - `Wikipedia - nearest neighbor
      search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-p
      ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural
      Information Processing Systems pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables, the
    ball tree method often leads to much faster queries.

    >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method an distance
    if method == 'ball_tree' and (
            distance == 'cosine' or distance == _turicreate.distances.cosine
            or distance == 'dot_product'
            or distance == _turicreate.distances.dot_product
            or distance == 'transformed_dot_product'
            or distance == _turicreate.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine' " +
            "'dot_product', or 'transformed_dot_product' distance." +
            "Please use the 'brute_force' method for these distances.")

    if method == 'lsh' and ('num_projections_per_table'
                            not in _method_options):
        if distance == 'jaccard' or distance == _turicreate.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _turicreate.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    if label is None:
        _label = _robust_column_name('__id', dataset.column_names())
        _dataset = dataset.add_row_number(_label)
    else:
        _label = label
        _dataset = _copy.copy(dataset)

    col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()}
    _validate_row_label(_label, col_type_map)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still include
    #  the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but the
    #  error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the 'features'
    #   parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or function handle.
    elif (hasattr(distance, '__call__')
          or (isinstance(distance, str) and not distance == 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        sample = _dataset.head()
        distance = _construct_auto_distance(_features, _dataset.column_names(),
                                            _dataset.column_types(), sample)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        " argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any distances are used with non-lists
    list_features_to_check = []
    sparse_distances = [
        'jaccard', 'weighted_jaccard', 'cosine', 'dot_product',
        'transformed_dot_product'
    ]
    sparse_distances = [
        getattr(_turicreate.distances, k) for k in sparse_distances
    ]
    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names if _dataset[f].dtype == list]
        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError(
                    "The chosen distance cannot currently be used " +
                    "on list-typed columns.")
    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if any component has string features are in single columns
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and (dist
                                         == _turicreate.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #  generated automatically because distance set to 'auto'), then do brute
    #  force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +\
                "there are multiple distance components.")

    else:
        if method == 'auto':

            # get the total number of variables. Assume the number of elements in
            # array type columns does not change
            num_variables = sum([
                len(x) if hasattr(x, '__iter__') else 1
                for x in _six.itervalues(sf_clean[0])
            ])

            # flag if all the features in the single composite are of numeric
            # type.
            numeric_type_flag = all([
                x in [int, float, list, array.array]
                for x in sf_clean.column_types()
            ])

            ## Conditions necessary for ball tree to work and be worth it
            if ((distance[0][1] in [
                    'euclidean', 'manhattan', _turicreate.distances.euclidean,
                    _turicreate.distances.manhattan
            ]) and numeric_type_flag is True and num_variables <= 200):

                _method = 'ball_tree'

            else:
                _method = 'brute_force'

        else:
            _method = method

    ## Pick the right model name for the method
    if _method == 'ball_tree':
        model_name = 'nearest_neighbors_ball_tree'

    elif _method == 'brute_force':
        model_name = 'nearest_neighbors_brute_force'

    elif _method == 'lsh':
        model_name = 'nearest_neighbors_lsh'

    else:
        raise ValueError(
            "Method must be 'auto', 'ball_tree', 'brute_force', " +
            "or 'lsh'.")

    ## Package the model options
    opts = {}
    opts.update(_method_options)
    opts.update({
        'model_name': model_name,
        'ref_labels': ref_labels,
        'label': label,
        'sf_features': sf_clean,
        'composite_params': distance
    })

    ## Construct the nearest neighbors model
    with QuietProgress(verbose):
        result = _turicreate.extensions._nearest_neighbors.train(opts)

    model_proxy = result['model']
    model = NearestNeighborsModel(model_proxy)

    return model
コード例 #23
0
    def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an ROC curve

            For more flexibility in calculating evaluation metrics, use the
            :class:`~turicreate.toolkits.evaluation` module.

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : _Evaluation
            Object built from a dictionary of evaluation results where the key
            is the name of the evaluation metric (e.g. `accuracy`) and the
            value is the evaluation score. The dictionary additionally carries
            model metadata, the confusion matrix rows (`conf_mat`), per-label
            precision/recall (`label_metrics`), the cluster-sorted labels
            (`sorted_labels`) and the annotated test data (`test_data`).

        Raises
        ------
        ValueError
            If `batch_size` is less than 1.

        ToolkitError
            If the target column does not exist in `dataset`.

        See Also
        --------
        create, predict, classify

        Examples
        --------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        if batch_size < 1:
            raise ValueError("'batch_size' must be greater than or equal to 1")
        if self.target not in dataset.column_names():
            raise _ToolkitError("Target column '%s' does not exist" %
                                self.target)

        extracted_features = self._extract_features(dataset,
                                                    verbose=verbose,
                                                    batch_size=batch_size)
        extracted_features[self.target] = dataset[self.target]

        metrics = self.classifier.evaluate(extracted_features,
                                           metric=metric,
                                           with_predictions=True)

        predictions = metrics["predictions"]["probs"]
        state = self.__proxy__.get_state()
        labels = state["classes"]

        from .._evaluate_utils import (entropy, confidence,
                                       relative_confidence,
                                       get_confusion_matrix, hclusterSort,
                                       l2Dist)

        # NOTE(review): every key below is read unconditionally, so this
        # presumably only works when the classifier returned all metrics
        # (metric='auto') -- confirm behaviour for a single named metric.
        evaluation_result = {
            k: metrics[k]
            for k in [
                'accuracy', 'f1_score', 'log_loss', 'precision', 'recall',
                'auc', 'roc_curve', 'confusion_matrix'
            ]
        }
        evaluation_result['num_test_examples'] = len(dataset)
        for k in [
                'num_classes', 'num_features', 'input_image_shape',
                'num_examples', 'training_loss', 'training_time', 'model',
                'max_iterations'
        ]:
            evaluation_result[k] = getattr(self, k)

        # Extend the given test data with the predicted probabilities and the
        # per-row statistics derived from them.
        # NOTE(review): relies on add_column returning the extended SFrame --
        # confirm whether it also modifies `dataset` in place.
        extended_test = dataset.add_column(predictions, 'probs')
        extended_test['label'] = dataset[self.target]
        extended_test = extended_test.add_columns([
            # The predicted label is the class at the position of the value
            # that confidence() returns for the probability vector.
            extended_test.apply(
                lambda d: labels[d['probs'].index(confidence(d['probs']))]),
            extended_test.apply(lambda d: entropy(d['probs'])),
            extended_test.apply(lambda d: confidence(d['probs'])),
            extended_test.apply(lambda d: relative_confidence(d['probs']))
        ], ['predicted_label', 'entropy', 'confidence', 'relative_confidence'])
        extended_test = extended_test.add_column(
            extended_test.apply(lambda d: d['label'] == d['predicted_label']),
            'correct')

        evaluation_result['model_name'] = state['model']
        # Calculate the confusion matrix
        sf_conf_mat = get_confusion_matrix(extended_test, labels)
        confidence_threshold = 0.5
        hesitant_threshold = 0.2
        evaluation_result['confidence_threshold'] = confidence_threshold
        evaluation_result['hesitant_threshold'] = hesitant_threshold
        evaluation_result[
            'confidence_metric_for_threshold'] = 'relative_confidence'

        evaluation_result['conf_mat'] = list(sf_conf_mat)

        # Get sorted labels (sorted by hCluster). Build a real list rather
        # than a lazy `map` object: on Python 3 `map` yields a one-shot
        # iterator that supports neither len() nor indexing.
        vectors = [{
            'name': lbl,
            'pos': list(sf_conf_mat[sf_conf_mat['target_label'] == lbl].sort(
                'predicted_label')['norm_prob'])
        } for lbl in labels]
        evaluation_result['sorted_labels'] = hclusterSort(
            vectors, l2Dist)[0]['name'].split("|")

        # Get recall and precision per label
        per_l = extended_test.groupby(
            ['label'], {
                'count': _tc.aggregate.COUNT,
                'correct_count': _tc.aggregate.SUM('correct')
            })
        per_l['recall'] = per_l.apply(
            lambda row: row['correct_count'] * 1.0 / row['count'])

        per_pl = extended_test.groupby(
            ['predicted_label'], {
                'predicted_count': _tc.aggregate.COUNT,
                'correct_count': _tc.aggregate.SUM('correct')
            })
        per_pl['precision'] = per_pl.apply(
            lambda row: row['correct_count'] * 1.0 / row['predicted_count'])
        per_pl = per_pl.rename({'predicted_label': 'label'})
        # Outer join so labels that were never predicted (or never occur as a
        # target) still get a row.
        evaluation_result['label_metrics'] = list(
            per_l.join(per_pl, on='label', how='outer').select_columns([
                'label', 'count', 'correct_count', 'predicted_count', 'recall',
                'precision'
            ]))
        evaluation_result['labels'] = labels

        extended_test = extended_test.add_row_number('__idx').rename(
            {'label': 'target_label'})

        evaluation_result['test_data'] = extended_test
        evaluation_result['feature'] = self.feature

        return _Evaluation(evaluation_result)
コード例 #24
0
def create(dataset, target, feature=None, model='resnet-50',
           validation_set='auto', max_iterations=10, verbose=True,
           seed=None, batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Raises
    ------
    TypeError
        If `dataset` is not an SFrame, or if `validation_set` is neither an
        SFrame, 'auto', nor None.

    ValueError
        If `batch_size` is less than 1.

    ToolkitError
        If `dataset` is empty, or if the `feature` or `target` column does not
        exist in `dataset`.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append('VisionFeaturePrint_Screen')
    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter. Reject non-SFrame input up front so the caller
    # gets a clear message instead of an attribute error further down.
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError("'dataset' must be of type SFrame.")
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(model)

    def _extract(sf):
        # Pair the target column of `sf` with the deep features of its images.
        return _tc.SFrame({
            target: sf[target],
            '__image_features__': feature_extractor.extract_features(
                sf, feature, verbose=verbose, batch_size=batch_size),
        })

    # Extract features
    extracted_features = _extract(dataset)
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _extract(validation_set)
    else:
        # 'auto' and None are passed through to the classifier unchanged.
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    lr_model = _tc.logistic_classifier.create(extracted_features,
                                              features=['__image_features__'],
                                              target=target,
                                              max_iterations=max_iterations,
                                              validation_set=extracted_features_validation,
                                              seed=seed,
                                              verbose=verbose)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:    # model == VisionFeaturePrint_Screen
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
コード例 #25
0
def create(dataset,
           label=None,
           feature=None,
           model="resnet-50",
           verbose=True,
           batch_size=64):
    """
    Train an :class:`ImageSimilarityModel` on the images of an SFrame.

    Deep features are extracted from every image with a pretrained network
    and a nearest neighbors model is then fitted on those features.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    label : string, optional
        Name of the SFrame column with row labels to be used as uuid's to
        identify the data. If 'label' is set to None (the default), row
        numbers are used to identify reference dataset rows when the model is
        queried.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates that the SFrame has only one column of Image type and that
        column will be used for similarity.

    model : string, optional
        Pretrained model used to bootstrap the image similarity model:

           - "resnet-50" : Uses a pretrained resnet model.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                         Only available on iOS 12.0+,
                                         macOS 10.14+ and tvOS 12.0+.

        The misspelled name "VisionFeaturePrint_Screen" is still accepted on
        macOS 10.14+; it prints a warning and is replaced by the correct name.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    verbose : bool, optional
        If True, print progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageSimilarityModel
        A trained :class:`ImageSimilarityModel` model.

    Raises
    ------
    TypeError
        If `dataset` is not an SFrame.

    ValueError
        If `batch_size` is less than 1.

    ToolkitError
        If `dataset` is empty, or if the `label` or `feature` column does not
        exist in `dataset`.

    See Also
    --------
    ImageSimilarityModel

    Examples
    --------
    .. sourcecode:: python

        # Train an image similarity model
        >>> model = turicreate.image_similarity.create(data)

        # Query the model for similar images
        >>> similar_images = model.query(data)
        +-------------+-----------------+-------------------+------+
        | query_label | reference_label |      distance     | rank |
        +-------------+-----------------+-------------------+------+
        |      0      |        0        |        0.0        |  1   |
        |      0      |       519       |   12.5319706301   |  2   |
        |      0      |       1619      |   12.5563764596   |  3   |
        |      0      |       186       |   12.6132604915   |  4   |
        |      0      |       1809      |   12.9180964745   |  5   |
        |      1      |        1        | 2.02304872852e-06 |  1   |
        |      1      |       1579      |   11.4288186151   |  2   |
        |      1      |       1237      |   12.3764325949   |  3   |
        |      1      |        80       |   12.7264363676   |  4   |
        |      1      |        58       |   12.7675058558   |  5   |
        +-------------+-----------------+-------------------+------+
        [500 rows x 4 columns]
    """
    start_time = _time.time()
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError("'dataset' must be of type SFrame.")

    # Work out which extractor names are acceptable on this machine.
    supported_models = list(_pre_trained_models.IMAGE_MODELS.keys())
    vision_available = _mac_ver() >= (10, 14)
    if vision_available:
        supported_models.append("VisionFeaturePrint_Scene")

    # Keep existing code working: map the old misspelled name to the correct
    # one, but only where the OS extractor is offered at all.
    if vision_available and model == "VisionFeaturePrint_Screen":
        print(
            "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene.  VisionFeaturePrint_Screen will be removed in future releases."
        )
        model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type("model", model, supported_models)

    # Remaining argument validation, in the order callers may rely on.
    if len(dataset) == 0:
        raise _ToolkitError("Unable to train on empty dataset")
    if label is not None and label not in dataset.column_names():
        raise _ToolkitError("Row label column '%s' does not exist" % label)
    if feature is not None and feature not in dataset.column_names():
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Fall back to the single image column when none was named.
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    extractor = _image_feature_extractor._create_feature_extractor(model)

    # Run the images through the extractor and wrap the result in an SFrame.
    deep_features = extractor.extract_features(dataset,
                                               feature,
                                               verbose=verbose,
                                               batch_size=batch_size)
    reference_sf = _tc.SFrame({"__image_features__": deep_features})

    # Carry the row labels along so queries can report them.
    if label is not None:
        reference_sf[label] = dataset[label]

    # Fit the nearest neighbors model on the extracted features.
    similarity_model = _tc.nearest_neighbors.create(
        reference_sf,
        label=label,
        features=["__image_features__"],
        verbose=verbose,
    )

    # Pretrained networks declare their own input shape; anything else is
    # the OS extractor (VisionFeaturePrint_Scene) with a fixed shape.
    if model in _pre_trained_models.IMAGE_MODELS:
        image_shape = _pre_trained_models.IMAGE_MODELS[model].input_image_shape
    else:
        image_shape = (3, 299, 299)

    # Assemble the model state and hand it to the model class.
    return ImageSimilarityModel({
        "similarity_model": similarity_model,
        "model": model,
        "feature_extractor": extractor,
        "input_image_shape": image_shape,
        "label": label,
        "feature": feature,
        "num_features": 1,
        "num_examples": similarity_model.num_examples,
        "training_time": _time.time() - start_time,
    })
コード例 #26
0
def create(dataset, target, features=None, distance=None, verbose=True):
    """
    Train a
    :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier`.

    The resulting model labels a query instance with the class that occurs
    most often among that instance's nearest neighbors in the training data.

    .. warning::

        The 'dot_product' distance is deprecated and will be removed in future
        versions of Turi Create. Use 'transformed_dot_product' instead, but be
        aware that this is more than a rename: it applies a *different*
        transformation to the dot product of two vectors. See the distances
        module documentation for details.

    Parameters
    ----------
    dataset : SFrame
        Training data. Must be a non-empty SFrame.

    target : str
        Name of the column holding the target variable. The column must be of
        string or integer type.

    features : list[str], optional
        Names of the columns used to compare records. 'None' (the default)
        selects every column except the target. When `distance` is supplied
        as a composite distance, the composite distance alone decides which
        features are used. Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values.
          Each key indicates a separate variable in the model.

        - *String*: string values.

    distance : str, function, or list[list], optional
        How to measure the distance between two rows. One of three forms:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Function*: a function handle from the
          :mod:`~turicreate.toolkits.distances` module.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. It is a list of distance
          components, each itself a list of three items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, see the
        :py:mod:`~turicreate.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite distance
        is constructed automatically based on feature types.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborClassifier
        A trained model of type
        :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier`.

    Raises
    ------
    ToolkitError
        If `target` is not a string naming a column of `dataset`.

    TypeError
        If the target column is neither string nor integer typed, or if
        `distance` is not one of the accepted forms.

    See Also
    --------
    NearestNeighborClassifier
    turicreate.toolkits.nearest_neighbors
    turicreate.toolkits.distances

    References
    ----------
    - `Wikipedia - nearest neighbors classifier
      <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_

    - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of
      Statistical Learning <https://web.stanford.edu/~hastie/ElemStatLearn/>`_.
      Vol. 2. New York. Springer. pp. 463-481.

    Examples
    --------
    >>> sf = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                       'height': [9, 25, 20, 23],
    ...                       'weight': [13, 28, 33, 22]})
    ...
    >>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species')

    Composite distance functions are accepted, just as in the nearest
    neighbors toolkit.

    >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7],
    ...            [('height', 'weight'), 'manhattan', 1.6]]
    ...
    >>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species',
    ...                                                     distance=my_dist)
    """

    # The clock starts before validation so that 'training_time' covers the
    # whole call.
    tic = _time.time()

    # --- Input validation --------------------------------------------------
    # The training data has to be an SFrame with at least one row.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_sframe_empty(dataset, "dataset")

    # 'target' has to be a string that names one of the dataset's columns.
    target_is_column = (isinstance(target, str)
                        and target in dataset.column_names())
    if not target_is_column:
        raise _ToolkitError("The 'target' parameter must be the name of a "
                            "column in the input dataset.")

    # Only string and integer class labels are supported.
    labels = dataset[target]
    label_type = labels.dtype
    if label_type != str and label_type != int:
        raise TypeError("The target column must contain integers or strings.")

    # Missing labels are allowed, but the caller is warned that they can show
    # up as ambiguous 'None' predictions.
    if labels.countna() > 0:
        _logging.warning(
            "Missing values detected in the target column. This "
            "may lead to ambiguous 'None' predictions, if the "
            "'radius' parameter is set too small in the prediction, "
            "classification, or evaluation methods.")

    # --- Resolve features + distance into a composite distance -------------
    # This happens here rather than in the nearest neighbors toolkit because
    # the automatic distance construction may differ between the two toolkits.
    feature_pool = dataset.column_names() if features is None else features
    usable_features = [name for name in feature_pool if name != target]

    if isinstance(distance, list):
        # Already a composite distance; copy it so the caller's list is never
        # shared with the model.
        composite = _copy.deepcopy(distance)

    elif hasattr(distance, '__call__') or (
            isinstance(distance, str) and distance != 'auto'):
        # A single named or callable distance applied to all usable features.
        composite = [[usable_features, distance, 1]]

    elif distance is None or distance == 'auto':
        # Derive a composite distance from the column types.
        type_of_column = dict(zip(dataset.column_names(),
                                  dataset.column_types()))
        composite = _construct_auto_distance(usable_features, type_of_column)

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' "
                        "parameter must be a string or a composite distance, "
                        " or left unspecified.")

    # --- Fit the underlying nearest neighbors model -------------------------
    knn_model = _tc.nearest_neighbors.create(
        dataset,
        label=target,
        distance=composite,
        verbose=verbose,
    )

    # --- Assemble the classifier state --------------------------------------
    state = {
        'verbose': verbose,
        'distance': knn_model.distance,
        'num_distance_components': knn_model.num_distance_components,
        'num_examples': dataset.num_rows(),
        'features': knn_model.features,
        'target': target,
        'num_classes': len(labels.unique()),
        'num_features': knn_model.num_features,
        'num_unpacked_features': knn_model.num_unpacked_features,
        'training_time': _time.time() - tic,
        '_target_type': label_type,
    }
    return NearestNeighborClassifier(knn_model, state)
コード例 #27
0
ファイル: image_classifier.py プロジェクト: lbddk/turicreate
def create(
        dataset,
        target,
        feature=None,
        model='resnet-50',
        l2_penalty=0.01,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        class_weights=None,
        validation_set='auto',
        verbose=True,
        seed=None,
        batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Image features are extracted with a pretrained network and a logistic
    classifier is then trained on top of those features.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are provided.
        For example, a target variable with 'cat' and 'dog' as possible
        values is mapped to 0 and 1 respectively with 0 being the base class
        and 1 being the reference class. Use `model.classes` to retrieve
        the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.
                                          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    l2_penalty : float, optional
        Weight on l2 regularization of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized logistic regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink toward
        0. The l1 penalty, however, completely zeros out sufficiently small
        coefficients, automatically indicating features that are not useful
        for the model. The default weight of 0 prevents any features from
        being discarded. See the LASSO regression reference for more detail.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        For this model, the Newton-Raphson method is equivalent to the
        iteratively re-weighted least squares algorithm. If the l1_penalty is
        greater than 0, use the 'fista' solver.

        The model is trained using a carefully engineered collection of methods
        that are automatically picked based on the input data. The ``newton``
        method  works best for datasets with plenty of examples and few features
        (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for
        wide datasets (i.e. datasets with many coefficients).  ``fista`` is the
        default solver for l1-regularized linear regression. The solvers are all
        automatically tuned and the default options should function well. See
        the solver options guide for setting additional parameters for each of
        the solvers.

        See the user guide for additional details on how the solver is chosen.
        (see `here
        <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_)

    feature_rescaling : boolean, optional
        Feature rescaling is an important pre-processing step that ensures that
        all features are on the same scale. An l2-norm rescaling is performed
        to make sure that all features are of the same norm. Categorical
        features are also rescaled by rescaling the dummy variables that are
        used to represent them. The coefficients are returned in original scale
        of the problem. This process is particularly useful when features
        vary widely in their ranges.

    convergence_threshold : float, optional
        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e. a model that works well only on the training
        data) if this parameter is set to a very low value.

    step_size : float, optional
        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0, this is an aggressive setting. If the first iteration takes
        a considerable amount of time, reducing this parameter may speed up
        model training.

    lbfgs_memory_level : float, optional
        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement for
        each of these gradients is the ``num_coefficients`` in the problem.
        Increasing the ``lbfgs_memory_level`` can help improve the quality of
        the model trained. Setting this to more than ``max_iterations`` has the
        same effect as setting it to ``max_iterations``.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are supposed to have weight one. The
        `auto` mode sets the class weight to be inversely proportional to number of
        examples in the training data with the given class.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Raises
    ------
    ToolkitError
        If `dataset` is empty, or if the `feature` or `target` column does not
        exist in `dataset`.

    ValueError
        If `batch_size` is smaller than 1.

    TypeError
        If `validation_set` is neither an SFrame, 'auto', nor None.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append('VisionFeaturePrint_Scene')

        # Keep existing code working: accept the historical misspelling and
        # map it to the correctly spelled model name.
        if model == "VisionFeaturePrint_Screen":
            print(
                "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene; VisionFeaturePrint_Screen will be removed in subsequent versions."
            )
            model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" %
                            feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # The isinstance test comes first so that an SFrame is never compared
    # against the string 'auto'.
    if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto'
            or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)

    def _target_and_image_features(sframe):
        # Pair the target column of `sframe` with its extracted image features.
        return _tc.SFrame({
            target:
            sframe[target],
            '__image_features__':
            feature_extractor.extract_features(sframe,
                                               feature,
                                               verbose=verbose,
                                               batch_size=batch_size),
        })

    # Extract features. The target column is already part of the result, so no
    # separate assignment of it is needed afterwards.
    extracted_features = _target_and_image_features(dataset)
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _target_and_image_features(
            validation_set)
    else:
        # 'auto' or None is forwarded unchanged to the classifier.
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    lr_model = _tc.logistic_classifier.create(
        extracted_features,
        features=['__image_features__'],
        target=target,
        max_iterations=max_iterations,
        validation_set=extracted_features_validation,
        seed=seed,
        verbose=verbose,
        l2_penalty=l2_penalty,
        l1_penalty=l1_penalty,
        solver=solver,
        feature_rescaling=feature_rescaling,
        convergence_threshold=convergence_threshold,
        step_size=step_size,
        lbfgs_memory_level=lbfgs_memory_level,
        class_weights=class_weights)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:  # model == VisionFeaturePrint_Scene
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
コード例 #28
0
    def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
        """
        Evaluate the model's predictive accuracy. This is done by predicting the
        target class for instances in a new dataset and comparing to known
        target values.

        Only the predictions needed for the requested metric are computed:
        class predictions for 'accuracy' and 'confusion_matrix', probability
        predictions for 'roc_curve', and both for 'auto'.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training. Additional
            columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto': Returns all available metrics.

            - 'accuracy': Classification accuracy.

            - 'confusion_matrix': An SFrame with counts of possible
              prediction/true label combinations.

            - 'roc_curve': An SFrame containing information needed for an roc
              curve (binary classification only).

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : dict
            Evaluation results. The dictionary keys are *accuracy* and
            *confusion_matrix* and *roc_curve* (if applicable).

        Raises
        ------
        TypeError
            If the target column of `dataset` is neither string nor integer
            typed.

        ToolkitError
            If 'roc_curve' is requested for a model that does not have exactly
            two classes.

        See also
        --------
        create, predict, predict_topk, classify

        Notes
        -----
        - Because the model randomly breaks ties between predicted classes, the
          results of repeated calls to `evaluate` method may differ.

        - When `metric` is 'auto' and the model is not a binary classifier, the
          ROC curve is skipped and a warning is printed.

        Examples
        --------
        >>> sf_train = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        >>> m = turicreate.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ans = m.evaluate(sf_train, max_neighbors=2,
        ...                  metric='confusion_matrix')
        >>> print(ans['confusion_matrix'])
        +--------------+-----------------+-------+
        | target_label | predicted_label | count |
        +--------------+-----------------+-------+
        |     cat      |       dog       |   1   |
        |     dog      |       dog       |   2   |
        |    fossa     |       dog       |   1   |
        +--------------+-----------------+-------+
        """

        ## Validate the metric name
        _raise_error_evaluation_metric_is_valid(metric,
                    ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

        ## Make sure the input dataset has a target column with an appropriate
        #  type.
        target = self.target
        _raise_error_if_column_exists(dataset, target, 'dataset', target)

        if not dataset[target].dtype == str and not dataset[target].dtype == int:
            raise TypeError("The target column of the evaluation dataset must "
                            "contain integers or strings.")

        ## The ROC curve is only available for binary classifiers.
        is_binary = self.num_classes == 2
        if not is_binary:
            if (metric == 'roc_curve') or (metric == ['roc_curve']):
                err_msg  = "Currently, ROC curve is not supported for "
                err_msg += "multi-class classification in this model."
                raise _ToolkitError(err_msg)
            elif metric == 'auto':
                # Only 'auto' would otherwise have included the ROC curve, so
                # it is the only metric that needs the warning. An explicit
                # 'accuracy' or 'confusion_matrix' request never asked for it.
                warn_msg  = "WARNING: Ignoring `roc_curve`. "
                warn_msg += "Not supported for multi-class classification."
                print(warn_msg)

        ## Compile accuracy metrics, computing predictions only when a
        #  requested metric actually consumes them.
        results = {}

        if metric in ['accuracy', 'confusion_matrix', 'auto']:
            ystar = self.predict(dataset, output_type='class',
                                 max_neighbors=max_neighbors, radius=radius)

            if metric in ['accuracy', 'auto']:
                results['accuracy'] = _evaluation.accuracy(
                    targets=dataset[target], predictions=ystar)

            if metric in ['confusion_matrix', 'auto']:
                results['confusion_matrix'] = _evaluation.confusion_matrix(
                    targets=dataset[target], predictions=ystar)

        if is_binary and metric in ['roc_curve', 'auto']:
            ystar_prob = self.predict(dataset, output_type='probability',
                                      max_neighbors=max_neighbors,
                                      radius=radius)
            results['roc_curve'] = _evaluation.roc_curve(
                targets=dataset[target], predictions=ystar_prob)

        return results
コード例 #29
0
    def evaluate(self, dataset, metric='auto', output_type='dict', verbose=True):
        """
        Evaluate the model by making predictions and comparing these to ground
        truth bounding box annotations.

        Both `metric` and `output_type` are validated before any prediction is
        made, so an unsupported value fails immediately instead of after a
        full pass over `dataset`.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the annotations and feature used for model training.
            Additional columns are ignored.

        metric : str or list, optional
            Name of the evaluation metric or list of several names. The primary
            metric is average precision, which is the area under the
            precision/recall curve and reported as a value between 0 and 1 (1
            being perfect). Possible values are:

            - 'auto'                      : Returns all primary metrics.
            - 'all'                       : Returns all available metrics.
            - 'average_precision'         : Average precision per class calculated over multiple
                                            intersection-over-union thresholds
                                            (at 50%, 55%, ..., 95%) and averaged.
            - 'average_precision_50'      : Average precision per class with
                                            intersection-over-union threshold at
                                            50% (PASCAL VOC metric).
            - 'mean_average_precision'    : Mean over all classes (for ``'average_precision'``)
                                            This is the primary single-value metric.
            - 'mean_average_precision_50' : Mean over all classes (for ``'average_precision_50'``).

        output_type : str
            Type of output:

            - 'dict'      : You are given a dictionary where each key is a metric name and the
                            value is another dictionary containing class-to-metric entries.
            - 'sframe'    : All metrics are returned as a single `SFrame`, where each row is a
                            class and each column is a metric. Metrics that are averaged over
                            class cannot be returned and are ignored under this format.
                            However, these are easily computed from the `SFrame` (e.g.
                            ``results['average_precision'].mean()``).

        verbose : bool
            If True, prints evaluation progress.

        Returns
        -------
        out : dict / SFrame
            Output type depends on the option `output_type`.

        Raises
        ------
        ToolkitError
            If `metric` is a string that is not a supported metric name, or if
            `output_type` is neither 'dict' nor 'sframe'.

        See Also
        --------
        create, predict

        Examples
        --------
        >>> results = model.evaluate(data)
        >>> print('mAP: {:.1%}'.format(results['mean_average_precision']))
        mAP: 43.2%
        """
        AP = 'average_precision'
        MAP = 'mean_average_precision'
        AP50 = 'average_precision_50'
        MAP50 = 'mean_average_precision_50'
        ALL_METRICS = {AP, MAP, AP50, MAP50}

        # Resolve the requested metric(s) into a collection of metric names.
        # A user-supplied collection is used as given; names in it that are not
        # supported metrics simply produce no output.
        if isinstance(metric, (list, tuple, set)):
            metrics = metric
        elif metric == 'all':
            metrics = ALL_METRICS
        elif metric == 'auto':
            metrics = {AP, MAP}
        elif metric in ALL_METRICS:
            metrics = {metric}
        else:
            raise _ToolkitError("Metric '{}' not supported".format(metric))

        # Fail fast: reject a bad output type before the expensive prediction
        # and average-precision computation below.
        if output_type not in ('dict', 'sframe'):
            raise _ToolkitError("Output type '{}' not supported".format(output_type))

        pred, gt = self._predict_with_options(dataset, with_ground_truth=True,
                                              verbose=verbose)

        pred_df = pred.to_dataframe()
        gt_df = gt.to_dataframe()

        # IoU thresholds 0.50, 0.55, ..., 0.95. The result is indexed first by
        # threshold and then by class index, so row 0 is the 50% threshold.
        thresholds = _np.arange(0.5, 1.0, 0.05)
        all_th_aps = _average_precision(pred_df, gt_df,
                                        class_to_index=self._class_to_index,
                                        iou_thresholds=thresholds)

        def class_dict(aps):
            # Map every class name to its entry in a per-class metric array.
            return {classname: aps[index]
                    for classname, index in self._class_to_index.items()}

        if output_type == 'dict':
            ret = {}
            if AP50 in metrics:
                ret[AP50] = class_dict(all_th_aps[0])
            if AP in metrics:
                ret[AP] = class_dict(all_th_aps.mean(0))
            if MAP50 in metrics:
                ret[MAP50] = all_th_aps[0].mean()
            if MAP in metrics:
                ret[MAP] = all_th_aps.mean()
        else:  # output_type == 'sframe' (validated above)
            # Class-averaged metrics have no per-class row and are omitted.
            ret = _tc.SFrame({'label': self.classes})
            if AP50 in metrics:
                ret[AP50] = all_th_aps[0]
            if AP in metrics:
                ret[AP] = all_th_aps.mean(0)

        return ret
コード例 #30
0
    def predict(self, dataset, output_type='class', verbose=True, batch_size=64):
        """
        Return predictions for ``dataset``. Predictions can be generated
        as class labels or probabilities.

        Each example is split into frames of deep features, every frame is
        scored by the custom classifier, and the per-frame softmax outputs
        are combined per example.

        Parameters
        ----------
        dataset : SFrame | SArray | dict
            The audio data to be classified.
            If dataset is an SFrame, it must have a column with the same name as
            the feature used for model training, but does not require a target
            column. Additional columns are ignored.
            If dataset is a dict, its keys must be exactly ``'sample_rate'``
            and ``'data'``; it is treated as a single audio example.
            An SArray may hold either audio data or precomputed deep features.

        output_type : {'probability', 'class', 'probability_vector'}, optional
            Form of the predictions which are one of:

            - 'class': Class prediction. This returns the class whose summed
              per-frame probability is largest.
            - 'probability': Only accepted for binary classification (a
              ``ToolkitError`` is raised otherwise).
              NOTE(review): this is currently computed exactly like
              'probability_vector' (a per-class list), not a single scalar
              for the True class -- confirm which is intended.
            - 'probability_vector': Prediction probability associated with each
              class as a vector (the per-frame probabilities averaged over
              the frames of the example). Label ordering is dictated by the
              ``classes`` member variable.

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : SArray
            An SArray with the predictions, ordered by input row. Examples
            for which no deep features could be extracted get a ``None``
            prediction.

        Raises
        ------
        TypeError
            If ``dataset`` is not an SFrame, SArray or dict.
        ValueError
            If ``dataset`` is not audio data or deep features, if
            ``output_type`` is not one of the supported values, or if
            ``batch_size`` is less than 1.
        ToolkitError
            If ``output_type`` is 'probability' and the model is not a
            binary classifier.

        See Also
        --------
        evaluate, classify

        Examples
        --------
        >>> probability_predictions = model.predict(data, output_type='probability')
        >>> prediction_vector = model.predict(data, output_type='probability_vector')
        >>> class_predictions = model.predict(data, output_type='class')

        """
        import mxnet as mx

        if not isinstance(dataset, (_tc.SFrame, _tc.SArray, dict)):
            raise TypeError('\'dataset\' parameter must be either an SFrame, SArray or dictionary')

        if isinstance(dataset, dict):
            if set(dataset.keys()) != {'sample_rate', 'data'}:
                raise ValueError('\'dataset\' parameter is a dictionary but does not appear to be audio data.')
            # Wrap the single example so the rest of the pipeline sees an SArray.
            dataset = _tc.SArray([dataset])
        elif isinstance(dataset, _tc.SFrame):
            dataset = dataset[self.feature]

        if not _is_deep_feature_sarray(dataset) and not _is_audio_data_sarray(dataset):
            raise ValueError('\'dataset\' must be either audio data or audio deep features.')

        if output_type not in ('probability', 'probability_vector', 'class'):
            # Fixed: this previously reported a problem with 'dataset'.
            raise ValueError("'output_type' must be one of 'probability', 'probability_vector' or 'class'")
        if output_type == 'probability' and self.num_classes != 2:
            raise _ToolkitError('Output type \'probability\' is only supported for binary'
                                ' classification. For multi-class classification, use'
                                ' predict_topk() instead.')
        if batch_size < 1:
            raise ValueError("'batch_size' must be greater than or equal to 1")

        # Skip feature extraction when the caller already supplies deep features.
        if _is_deep_feature_sarray(dataset):
            deep_features = dataset
        else:
            deep_features = get_deep_features(dataset, verbose=verbose)

        # One row per frame, tagged with the row number ('id') of the example
        # it came from, so per-frame predictions can be regrouped afterwards.
        deep_features = _tc.SFrame({'deep features': deep_features})
        deep_features = deep_features.add_row_number()
        deep_features = deep_features.stack('deep features', new_column_name='deep features')
        deep_features, missing_ids = deep_features.dropna_split(columns=['deep features'])

        if len(missing_ids) > 0:
            _logging.warning("Unable to make predictions for %d examples because they are less than 975ms in length."
                             % len(missing_ids))

        # NOTE(review): if every example is missing, this sets batch_size to 0;
        # behavior of the iterator below in that case is not verified here.
        if batch_size > len(deep_features):
            batch_size = len(deep_features)

        y = []
        for batch in mx.io.NDArrayIter(deep_features['deep features'].to_numpy(), batch_size=batch_size):
            ctx = _mxnet_utils.get_mxnet_context()
            # Never use more contexts than there are rows in this batch.
            if len(batch.data[0]) < len(ctx):
                ctx = ctx[:len(batch.data[0])]

            batch_data = batch.data[0]
            if batch.pad != 0:
                batch_data = batch_data[:-batch.pad]    # prevent batches looping back

            batch_data = mx.gluon.utils.split_and_load(batch_data, ctx_list=ctx, batch_axis=0, even_split=False)

            for x in batch_data:
                forward_output = self._custom_classifier.forward(x)
                y += mx.nd.softmax(forward_output).asnumpy().tolist()
        assert(len(y) == len(deep_features))

        # Combine predictions from multiple frames
        sf = _tc.SFrame({'predictions': y, 'id': deep_features['id']})
        probabilities_sum = sf.groupby('id', {'prob_sum': _tc.aggregate.SUM('predictions')})

        if output_type == 'class':
            predicted_ids = probabilities_sum['prob_sum'].apply(lambda x: _np.argmax(x))
            mappings = self._id_to_class_label
            probabilities_sum['results'] = predicted_ids.apply(lambda x: mappings[x])
        else:
            assert output_type in ('probability', 'probability_vector')
            # Average the summed probabilities over the number of frames.
            frame_per_example_count = sf.groupby('id', _tc.aggregate.COUNT())
            probabilities_sum = probabilities_sum.join(frame_per_example_count)
            probabilities_sum['results'] = probabilities_sum.apply(lambda row: [i / row['Count'] for i in row['prob_sum']])

        if len(missing_ids) > 0:
            # Give skipped examples a None result of the same dtype so the
            # output has one entry per input row. A dedicated local is used
            # instead of rebinding the 'output_type' parameter.
            results_dtype = probabilities_sum['results'].dtype
            missing_predictions = _tc.SFrame({'id': missing_ids['id'],
                                              'results': _tc.SArray([None] * len(missing_ids), dtype=results_dtype)
                                              })
            probabilities_sum = probabilities_sum[['id', 'results']].append(missing_predictions)

        # Restore the original input order.
        probabilities_sum = probabilities_sum.sort('id')
        return probabilities_sum['results']