Code Example #1
def create(input_dataset, target, feature=None, validation_set='auto',
            warm_start='auto', batch_size=256,
            max_iterations=100, verbose=True):
    """
    Create a :class:`DrawingClassifier` model.

    Parameters
    ----------
    input_dataset : SFrame
        Input data. The columns named by the ``feature`` and ``target``
        parameters will be extracted for training the drawing classifier.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string, optional
        Name of the column containing the input drawings. ``None`` (the
        default) indicates that the column in ``input_dataset`` named
        "drawing" should be used as the feature.
        The feature column can contain either bitmap-based drawings or
        stroke-based drawings. Bitmap-based drawing input can be a grayscale
        tc.Image of any size.
        Stroke-based drawing input must be in the following format:
        Every drawing must be represented by a list of strokes, where each
        stroke must be a list of points in the order in which they were drawn
        on the canvas.
        Each point must be a dictionary with two keys, "x" and "y", and their
        respective values must be numerical, i.e. either integer or float
        (see the stroke-based example in the Examples section below).

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto', and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed.

    warm_start : string, optional
        A string denoting which pretrained model to use. Set to "auto"
        by default, which uses a model trained on 245 of the 345 classes in
        the Quick, Draw! dataset. To disable warm start, pass None to this
        argument. The accepted values for this argument are:
        "auto": uses "quickdraw_245_v0"
        "quickdraw_245_v0": a model trained on 245 of the 345 classes in
                            the Quick, Draw! dataset
        None: no warm start

    batch_size : int, optional
        The number of drawings per training step. If not set, a default
        value of 256 will be used. If you are getting memory errors,
        try decreasing this value. If you have a powerful computer, increasing
        this value may improve performance.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : DrawingClassifier
        A trained :class:`DrawingClassifier` model.

    See Also
    --------
    DrawingClassifier

    Examples
    --------
    .. sourcecode:: python

        # Train a drawing classifier model
        >>> model = turicreate.drawing_classifier.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)
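
        # Stroke-based input format, per the `feature` parameter description
        # above (all values and the "label" column name are hypothetical):
        # each drawing is a list of strokes, and each stroke is a list of
        # {"x": ..., "y": ...} points in drawing order.
        >>> drawing = [
        ...     [{"x": 0.0, "y": 0.0}, {"x": 10.0, "y": 10.0}],
        ...     [{"x": 5.0, "y": 0.0}, {"x": 5.0, "y": 10.0}]]
        >>> stroke_data = turicreate.SFrame({"drawing": [drawing],
        ...                                  "label": ["plus"]})
        >>> model = turicreate.drawing_classifier.create(stroke_data,
        ...                                              target="label")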

    """

    import mxnet as _mx
    from mxnet import autograd as _autograd
    from ._model_architecture import Model as _Model
    from ._sframe_loader import SFrameClassifierIter as _SFrameClassifierIter
    from .._mxnet import _mxnet_utils

    start_time = _time.time()
    accepted_values_for_warm_start = ["auto", "quickdraw_245_v0", None]

    # @TODO: Should be able to automatically choose number of iterations
    # based on data size: Tracked in Github Issue #1576

    # automatically infer feature column
    if feature is None:
        feature = _tkutl._find_only_drawing_column(input_dataset)

    _raise_error_if_not_drawing_classifier_input_sframe(
        input_dataset, feature, target)

    if batch_size is not None and not isinstance(batch_size, int):
        raise TypeError("'batch_size' must be an integer >= 1")
    if batch_size is not None and batch_size < 1:
        raise ValueError("'batch_size' must be >= 1")
    if max_iterations is not None and not isinstance(max_iterations, int):
        raise TypeError("'max_iterations' must be an integer >= 1")
    if max_iterations is not None and max_iterations < 1:
        raise ValueError("'max_iterations' must be >= 1")

    is_stroke_input = (input_dataset[feature].dtype != _tc.Image)
    dataset = _extensions._drawing_classifier_prepare_data(
        input_dataset, feature) if is_stroke_input else input_dataset

    iteration = -1

    classes = dataset[target].unique()
    classes = sorted(classes)

    if len(classes) == 1:
        raise _ToolkitError("The number of classes has to be greater than one")

    class_to_index = {name: index for index, name in enumerate(classes)}

    validation_set_corrective_string = ("'validation_set' parameter must be "
        + "an SFrame, or None, or must be set to 'auto' for the toolkit to "
        + "automatically create a validation set.")
    if isinstance(validation_set, _tc.SFrame):
        _raise_error_if_not_drawing_classifier_input_sframe(
            validation_set, feature, target)
        is_validation_stroke_input = (validation_set[feature].dtype != _tc.Image)
        validation_dataset = _extensions._drawing_classifier_prepare_data(
            validation_set, feature) if is_validation_stroke_input else validation_set
    elif isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print("PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                          "          You can set ``validation_set=None`` to disable validation tracking.\n")
                dataset, validation_dataset = dataset.random_split(TRAIN_VALIDATION_SPLIT, exact=True)
            else:
                validation_set = None
                validation_dataset = _tc.SFrame()
        else:
            raise _ToolkitError("Unrecognized value for 'validation_set'. "
                + validation_set_corrective_string)
    elif validation_set is None:
        validation_dataset = _tc.SFrame()
    else:
        raise TypeError("Unrecognized type for 'validation_set'."
            + validation_set_corrective_string)

    dataset = _drop_missing_values(dataset, feature, is_train=True)
    if len(validation_dataset) > 0:
        validation_dataset = _drop_missing_values(validation_dataset, feature, is_train=False)

    train_loader = _SFrameClassifierIter(dataset, batch_size,
                 feature_column=feature,
                 target_column=target,
                 class_to_index=class_to_index,
                 load_labels=True,
                 shuffle=True,
                 iterations=max_iterations)
    train_loader_to_compute_accuracy = _SFrameClassifierIter(dataset, batch_size,
                 feature_column=feature,
                 target_column=target,
                 class_to_index=class_to_index,
                 load_labels=True,
                 shuffle=True,
                 iterations=1)
    validation_loader = _SFrameClassifierIter(validation_dataset, batch_size,
                 feature_column=feature,
                 target_column=target,
                 class_to_index=class_to_index,
                 load_labels=True,
                 shuffle=True,
                 iterations=1)


    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)
    model = _Model(num_classes=len(classes), prefix="drawing_")
    model_params = model.collect_params()
    model_params.initialize(_mx.init.Xavier(), ctx=ctx)

    if warm_start is not None:
        if type(warm_start) is not str:
            raise TypeError("'warm_start' must be a string or None. "
                + "'warm_start' can take in the following values: "
                + str(accepted_values_for_warm_start))
        if warm_start not in accepted_values_for_warm_start:
            raise _ToolkitError("Unrecognized value for 'warm_start': "
                + warm_start + ". 'warm_start' can take in the following "
                + "values: " + str(accepted_values_for_warm_start))
        pretrained_model = _pre_trained_models.DrawingClassifierPreTrainedModel(
            warm_start)
        pretrained_model_params_path = pretrained_model.get_model_path()
        model.load_params(pretrained_model_params_path,
            ctx=ctx,
            allow_missing=True)
    softmax_cross_entropy = _mx.gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    trainer = _mx.gluon.Trainer(model.collect_params(), 'adam')

    if verbose and iteration == -1:
        column_names = ['iteration', 'train_loss', 'train_accuracy', 'time']
        column_titles = ['Iteration', 'Training Loss', 'Training Accuracy', 'Elapsed Time (seconds)']
        if validation_set is not None:
            column_names.insert(3, 'validation_accuracy')
            column_titles.insert(3, 'Validation Accuracy')
        table_printer = _tc.util._ProgressTablePrinter(
            column_names, column_titles)

    train_accuracy = _mx.metric.Accuracy()
    validation_accuracy = _mx.metric.Accuracy()

    def get_data_and_label_from_batch(batch):
        # If the batch was padded to fill out the last batch of an epoch,
        # slice off the padding rows before splitting across devices.
        if batch.pad is not None:
            size = batch_size - batch.pad
            sliced_data  = _mx.nd.slice_axis(batch.data[0], axis=0, begin=0, end=size)
            sliced_label = _mx.nd.slice_axis(batch.label[0], axis=0, begin=0, end=size)
            num_devices = min(sliced_data.shape[0], len(ctx))
            batch_data = _mx.gluon.utils.split_and_load(sliced_data, ctx_list=ctx[:num_devices], even_split=False)
            batch_label = _mx.gluon.utils.split_and_load(sliced_label, ctx_list=ctx[:num_devices], even_split=False)
        else:
            batch_data = _mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            batch_label = _mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        return batch_data, batch_label

    def compute_accuracy(accuracy_metric, batch_loader):
        # Run one full pass over the loader, accumulating the metric.
        batch_loader.reset()
        accuracy_metric.reset()
        for batch in batch_loader:
            batch_data, batch_label = get_data_and_label_from_batch(batch)
            outputs = []
            for x, y in zip(batch_data, batch_label):
                if x is None or y is None: continue
                z = model(x)
                outputs.append(z)
            accuracy_metric.update(batch_label, outputs)

    for train_batch in train_loader:
        train_batch_data, train_batch_label = get_data_and_label_from_batch(train_batch)
        with _autograd.record():
            # Inside training scope
            for x, y in zip(train_batch_data, train_batch_label):
                z = model(x)
                # Computes softmax cross entropy loss.
                loss = softmax_cross_entropy(z, y)
                # Backpropagate the error for one iteration.
                loss.backward()

        # Make one step of parameter update. Trainer needs to know the
        # batch size of data to normalize the gradient by 1/batch_size.
        trainer.step(train_batch.data[0].shape[0])
        # calculate training metrics
        train_loss = loss.mean().asscalar()

        if train_batch.iteration > iteration:

            # Compute training accuracy
            compute_accuracy(train_accuracy, train_loader_to_compute_accuracy)
            # Compute validation accuracy
            if validation_set is not None:
                compute_accuracy(validation_accuracy, validation_loader)
            iteration = train_batch.iteration
            if verbose:
                kwargs = {  "iteration": iteration + 1,
                            "train_loss": float(train_loss),
                            "train_accuracy": train_accuracy.get()[1],
                            "time": _time.time() - start_time}
                if validation_set is not None:
                    kwargs["validation_accuracy"] = validation_accuracy.get()[1]
                table_printer.print_row(**kwargs)

    state = {
        '_model': model,
        '_class_to_index': class_to_index,
        'num_classes': len(classes),
        'classes': classes,
        'input_image_shape': (1, BITMAP_WIDTH, BITMAP_HEIGHT),
        'training_loss': train_loss,
        'training_accuracy': train_accuracy.get()[1],
        'training_time': _time.time() - start_time,
        # None when validation_set is None
        'validation_accuracy': validation_accuracy.get()[1] if validation_set else None,
        'max_iterations': max_iterations,
        'target': target,
        'feature': feature,
        'num_examples': len(input_dataset)
    }
    return DrawingClassifier(state)
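
A minimal usage sketch for the function above, assuming a training SFrame
named `data` with a "drawing" feature column and a hypothetical "label"
target column:

import turicreate as tc

# Hold out 20 percent of the rows for validation (the split fraction and
# seed are illustrative choices, not toolkit defaults).
train, valid = data.random_split(0.8, seed=42)

# Train from scratch (warm_start=None disables the pretrained weights)
# with an explicit validation set instead of the automatic 'auto' split.
model = tc.drawing_classifier.create(train, target="label",
                                     validation_set=valid,
                                     warm_start=None,
                                     max_iterations=25)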
Code Example #2
def create(input_dataset,
           target,
           feature=None,
           validation_set="auto",
           warm_start="auto",
           batch_size=256,
           max_iterations=500,
           verbose=True,
           random_seed=None,
           **kwargs):
    """
    Create a :class:`DrawingClassifier` model.

    Parameters
    ----------
    input_dataset : SFrame
        Input data. The columns named by the ``feature`` and ``target``
        parameters will be extracted for training the drawing classifier.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string, optional
        Name of the column containing the input drawings. ``None`` (the
        default) indicates that the column in ``input_dataset`` named
        "drawing" should be used as the feature.
        The feature column can contain either bitmap-based drawings or
        stroke-based drawings. Bitmap-based drawing input can be a grayscale
        tc.Image of any size.
        Stroke-based drawing input must be in the following format:
        Every drawing must be represented by a list of strokes, where each
        stroke must be a list of points in the order in which they were drawn
        on the canvas.
        Each point must be a dictionary with two keys, "x" and "y", and their
        respective values must be numerical, i.e. either integer or float.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto', and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed.

    warm_start : string, optional
        A string denoting which pretrained model to use. Set to "auto"
        by default, which uses a model trained on 245 of the 345 classes in
        the Quick, Draw! dataset. To disable warm start, pass None to this
        argument. The accepted values for this argument are:
        "auto": uses "quickdraw_245_v0"
        "quickdraw_245_v0": a model trained on 245 of the 345 classes in
                            the Quick, Draw! dataset
        None: no warm start

    batch_size : int, optional
        The number of drawings per training step. If not set, a default
        value of 256 will be used. If you are getting memory errors,
        try decreasing this value. If you have a powerful computer, increasing
        this value may improve performance.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model.

    verbose : bool, optional
        If True, print progress updates and model details.

    random_seed : int, optional
        The results can be reproduced when given the same seed.

    Returns
    -------
    out : DrawingClassifier
        A trained :class:`DrawingClassifier` model.

    See Also
    --------
    DrawingClassifier

    Examples
    --------
    .. sourcecode:: python

        # Train a drawing classifier model
        >>> model = turicreate.drawing_classifier.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)
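
        # For reproducible runs, disable the pretrained warm start and fix
        # the random seed (the 'label' target name and seed value below are
        # illustrative, not part of the original example)
        >>> model = turicreate.drawing_classifier.create(data, target='label',
        ...             warm_start=None, random_seed=77)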
    """

    accepted_values_for_warm_start = ["auto", "quickdraw_245_v0", None]
    if warm_start is not None:
        if type(warm_start) is not str:
            raise TypeError("'warm_start' must be a string or None. " +
                            "'warm_start' can take in the following values: " +
                            str(accepted_values_for_warm_start))
        if warm_start not in accepted_values_for_warm_start:
            raise _ToolkitError("Unrecognized value for 'warm_start': " +
                                warm_start +
                                ". 'warm_start' can take in the following " +
                                "values: " +
                                str(accepted_values_for_warm_start))
        # Replace 'auto' with name of current default Warm Start model.
        warm_start = warm_start.replace("auto", "quickdraw_245_v0")

    if "_advanced_parameters" in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs["_advanced_parameters"].keys())
        set_keys = set(params.keys())
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError(
                "Unknown advanced parameters: {}".format(unsupported))

        params.update(kwargs["_advanced_parameters"])

    # @TODO: Should be able to automatically choose number of iterations
    # based on data size: Tracked in Github Issue #1576
    if not isinstance(input_dataset, _tc.SFrame):
        raise TypeError('"input_dataset" must be of type SFrame.')

    # automatically infer feature column
    if feature is None:
        feature = _tkutl._find_only_drawing_column(input_dataset)

    _raise_error_if_not_drawing_classifier_input_sframe(
        input_dataset, feature, target)

    if batch_size is not None and not isinstance(batch_size, int):
        raise TypeError("'batch_size' must be an integer >= 1")
    if batch_size is not None and batch_size < 1:
        raise ValueError("'batch_size' must be >= 1")
    if max_iterations is not None and not isinstance(max_iterations, int):
        raise TypeError("'max_iterations' must be an integer >= 1")
    if max_iterations is not None and max_iterations < 1:
        raise ValueError("'max_iterations' must be >= 1")

    import turicreate.toolkits.libtctensorflow

    model = _tc.extensions.drawing_classifier()
    options = dict()
    options["batch_size"] = batch_size
    options["max_iterations"] = max_iterations
    options["verbose"] = verbose
    options["_show_loss"] = False
    if validation_set is None:
        validation_set = _tc.SFrame()
    if warm_start:
        # Load the Core ML warm start model
        pretrained_mlmodel = _pre_trained_models.DrawingClassifierPreTrainedMLModel()
        options["mlmodel_path"] = pretrained_mlmodel.get_model_path()
    if random_seed is not None:
        options["random_seed"] = random_seed
    options["warm_start"] = "" if warm_start is None else warm_start
    model.train(input_dataset, target, feature, validation_set, options)
    return DrawingClassifier(model_proxy=model, name="drawing_classifier")
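
A brief reproducibility sketch for this TensorFlow-backed variant, assuming
an SFrame `data` with a "drawing" column and a hypothetical "label" target:
two runs with the same random_seed are expected to behave identically.

import turicreate as tc

# Two runs with the same seed (the seed value is illustrative).
model_a = tc.drawing_classifier.create(data, target="label", random_seed=42,
                                       max_iterations=10, verbose=False)
model_b = tc.drawing_classifier.create(data, target="label", random_seed=42,
                                       max_iterations=10, verbose=False)

# Predictions from the two runs should agree on the training data.
assert (model_a.predict(data) == model_b.predict(data)).all()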
Code Example #3
def create(input_dataset, target, feature=None, 
            pretrained_model_url=None, batch_size=256, 
            max_iterations=100, verbose=True):
    """
    Create a :class:`DrawingClassifier` model.

    Parameters
    ----------
    input_dataset : SFrame
        Input data. The columns named by the ``feature`` and ``target``
        parameters will be extracted for training the drawing classifier.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string, optional
        Name of the column containing the input drawings. ``None`` (the
        default) indicates that the column in ``input_dataset`` named
        "drawing" should be used as the feature.
        The feature column can contain either bitmap-based drawings or
        stroke-based drawings. Bitmap-based drawing input can be a grayscale
        tc.Image of any size.
        Stroke-based drawing input must be in the following format:
        Every drawing must be represented by a list of strokes, where each
        stroke must be a list of points in the order in which they were drawn
        on the canvas.
        Each point must be a dictionary with two keys, "x" and "y", and their
        respective values must be numerical, i.e. either integer or float.

    pretrained_model_url : string, optional
        A URL to a pretrained model to use for a warm start before training.

    batch_size : int, optional
        The number of images per training step. If not set, a default
        value of 256 will be used. If you are getting memory errors,
        try decreasing this value. If you have a powerful computer, increasing
        this value may improve performance.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. 

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : DrawingClassifier
        A trained :class:`DrawingClassifier` model.

    See Also
    --------
    DrawingClassifier

    Examples
    --------
    .. sourcecode:: python

        # Train a drawing classifier model
        >>> model = turicreate.drawing_classifier.create(data)

        # Make predictions on the training set and add them as a column to the SFrame
        >>> data['predictions'] = model.predict(data)

    """
    import mxnet as _mx
    from mxnet import autograd as _autograd
    from ._model_architecture import Model as _Model
    from ._sframe_loader import SFrameClassifierIter as _SFrameClassifierIter
    from .._mxnet import _mxnet_utils
    
    start_time = _time.time()

    # @TODO: Should be able to automatically choose number of iterations
    # based on data size: Tracked in Github Issue #1576

    # automatically infer feature column
    if feature is None:
        feature = _tkutl._find_only_drawing_column(input_dataset)

    _raise_error_if_not_drawing_classifier_input_sframe(
        input_dataset, feature, target)

    is_stroke_input = (input_dataset[feature].dtype != _tc.Image)
    dataset = _extensions._drawing_classifier_prepare_data(
        input_dataset, feature) if is_stroke_input else input_dataset

    column_names = ['Iteration', 'Loss', 'Elapsed Time']
    num_columns = len(column_names)
    column_width = max(len(name) for name in column_names) + 2
    hr = '+' + '+'.join(['-' * column_width] * num_columns) + '+'

    progress = {'smoothed_loss': None, 'last_time': 0}
    iteration = 0

    classes = dataset[target].unique()
    classes = sorted(classes)
    class_to_index = {name: index for index, name in enumerate(classes)}

    def update_progress(cur_loss, iteration):
        iteration_base1 = iteration + 1
        # Smooth the reported loss with an exponential moving average.
        if progress['smoothed_loss'] is None:
            progress['smoothed_loss'] = cur_loss
        else:
            progress['smoothed_loss'] = (0.9 * progress['smoothed_loss']
                + 0.1 * cur_loss)
        cur_time = _time.time()

        # Printing of table header is deferred, so that start-of-training
        # warnings appear above the table
        if verbose and iteration == 0:
            # Print progress table header
            print(hr)
            print(('| {:<{width}}' * num_columns + '|').format(*column_names, 
                width=column_width-1))
            print(hr)

        if verbose and (cur_time > progress['last_time'] + 10 or
                        iteration_base1 == max_iterations):
            # Print progress table row
            elapsed_time = cur_time - start_time
            print(
                "| {cur_iter:<{width}}| {loss:<{width}.3f}| {time:<{width}.1f}|".format(
                cur_iter=iteration_base1, loss=progress['smoothed_loss'],
                time=elapsed_time, width=column_width-1))
            progress['last_time'] = cur_time

    loader = _SFrameClassifierIter(dataset, batch_size,
                 feature_column=feature,
                 target_column=target,
                 class_to_index=class_to_index,
                 load_labels=True,
                 shuffle=True,
                 epochs=max_iterations,
                 iterations=None)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)
    model = _Model(num_classes=len(classes), prefix="drawing_")
    model_params = model.collect_params()
    model_params.initialize(_mx.init.Xavier(), ctx=ctx)

    if pretrained_model_url is not None:
        pretrained_model = _pre_trained_models.DrawingClassifierPreTrainedModel(pretrained_model_url)
        pretrained_model_params_path = pretrained_model.get_model_path()
        model.load_params(pretrained_model_params_path, 
            ctx=ctx, 
            allow_missing=True)
    softmax_cross_entropy = _mx.gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    trainer = _mx.gluon.Trainer(model.collect_params(), 'adam')

    train_loss = 0.
    for batch in loader:
        # Only the first context's shard is used, so each step trains on a
        # single device even when multiple contexts are available.
        data = _mx.gluon.utils.split_and_load(batch.data[0],
            ctx_list=ctx, batch_axis=0)[0]
        label = _mx.nd.array(
            _mx.gluon.utils.split_and_load(batch.label[0],
                ctx_list=ctx, batch_axis=0)[0]
            )

        with _autograd.record():
            output = model(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # update parameters
        trainer.step(1)
        # calculate training metrics
        cur_loss = loss.mean().asscalar()
        
        update_progress(cur_loss, batch.iteration)
        iteration = batch.iteration

    training_time = _time.time() - start_time
    if verbose:
        print(hr)   # progress table footer
    state = {
        '_model': model,
        '_class_to_index': class_to_index,
        'num_classes': len(classes),
        'classes': classes,
        'input_image_shape': (1, BITMAP_WIDTH, BITMAP_HEIGHT),
        'batch_size': batch_size,
        'training_loss': cur_loss,
        'training_time': training_time,
        'max_iterations': max_iterations,
        'target': target,
        'feature': feature,
        'num_examples': len(input_dataset)
    }
    return DrawingClassifier(state)
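
The progress reporting in this variant smooths the per-batch loss with an
exponential moving average, smoothed = 0.9 * smoothed + 0.1 * current. A
standalone sketch of that smoothing with made-up loss values:

def smoothed_losses(losses, alpha=0.1):
    # Mirrors update_progress above: the first loss seeds the average,
    # and each later loss blends in at rate alpha.
    smoothed = None
    for cur in losses:
        smoothed = cur if smoothed is None else (1 - alpha) * smoothed + alpha * cur
        yield smoothed

print(list(smoothed_losses([2.0, 1.0, 1.0])))  # -> approximately [2.0, 1.9, 1.81]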