def create(
    dataset,
    session_id,
    target,
    features=None,
    prediction_window=100,
    validation_set="auto",
    max_iterations=10,
    batch_size=32,
    verbose=True,
):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data, where each session
        is a sequence of rows. The data must be in `stacked` format, grouped
        by session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Names of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance, to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of
        this SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from
        the training data (if the training data has > 100 sessions). If
        `validation_set` is set to None, then all the data will be used for
        training.

    max_iterations : int, optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater
        than the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...     'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...     'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...     'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...     'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...     'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only top-1) if your labels have
        # more than two classes
        >>> predictions = model.predict_topk(data, k=3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError("target must be of type str")
    if not isinstance(session_id, str):
        raise _ToolkitError("session_id must be of type str")
    if not isinstance(batch_size, int):
        raise _ToolkitError("batch_size must be of type int")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")
    _tkutl._numeric_param_check_range("prediction_window", prediction_window, 1, 400)
    _tkutl._numeric_param_check_range("max_iterations", max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True, column_names=[session_id, target]
        )
    if not hasattr(features, "__iter__"):
        raise TypeError("Input 'features' must be a list.")
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str."
            % invalid_features[0]
        )
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[target], target, [str, int]
    )
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[session_id], session_id, [str, int]
    )

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, "training_dataset")

    # Check for missing values in an SFrame validation set
    if isinstance(validation_set, _SFrame):
        _tkutl._raise_error_if_sframe_empty(validation_set, "validation_set")
        for feature in features:
            _tkutl._handle_missing_values(validation_set, feature, "validation_set")

    # C++ model
    name = "activity_classifier"

    import turicreate as _turicreate

    # Imports tensorflow
    import turicreate.toolkits.libtctensorflow

    model = _turicreate.extensions.activity_classifier()
    options = {}
    options["prediction_window"] = prediction_window
    options["batch_size"] = batch_size
    options["max_iterations"] = max_iterations
    options["verbose"] = verbose
    options["_show_loss"] = False
    model.train(dataset, target, session_id, validation_set, options)
    return ActivityClassifier(model_proxy=model, name=name)
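
# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal end-to-end run of the C++-backed create() above. The data path
# 'hapt_data.sframe' and the column names are hypothetical; the splitter is
# the one the docstring's "See Also" section points to. Holding out whole
# sessions (rather than individual rows) avoids leaking temporally correlated
# samples between train and validation.
def _example_train_with_held_out_sessions():
    import turicreate as tc
    from turicreate.toolkits.activity_classifier.util import random_split_by_session

    data = tc.SFrame("hapt_data.sframe")  # hypothetical on-disk SFrame
    train, valid = random_split_by_session(data, session_id="exp_id", fraction=0.8)
    model = tc.activity_classifier.create(
        train, session_id="exp_id", target="activity",
        prediction_window=50, validation_set=valid)
    return model.evaluate(valid)
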
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32, verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data, where each session
        is a sequence of rows. The data must be in `stacked` format, grouped
        by session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Names of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance, to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of
        this SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from
        the training data (if the training data has > 100 sessions). If
        `validation_set` is set to None, then all the data will be used for
        training.

    max_iterations : int, optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater
        than the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...     'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...     'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...     'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...     'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...     'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only top-1) if your labels have
        # more than two classes
        >>> predictions = model.predict_topk(data, k=3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")

    from ._model_architecture import _net_params
    from ._model_architecture import _define_model, _fit_model
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(dataset,
                                              interpret_as_excluded=True,
                                              column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        raise TypeError("Invalid feature %s: Feature names must be of type str."
                        % invalid_features[0])
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int])

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset, features, session_id,
                                            prediction_window, predictions_in_chunk,
                                            target=target, verbose=verbose)

    if isinstance(validation_set, str) and validation_set == 'auto':
        if num_sessions < 100:
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(dataset, session_id)

    # Create data iterators
    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_gpus, 1)
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        validation_set = validation_set.filter_by(list(target_map.keys()), target)
        validation_set, mapping = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(validation_set, features, session_id,
                                               prediction_window, predictions_in_chunk,
                                               target=target, verbose=False)
        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                         prediction_window, predictions_in_chunk,
                                         batch_size, use_target=use_target)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)
    loss_model, pred_model = _define_model(features, target_map, prediction_window,
                                           predictions_in_chunk, context)

    # Train the model
    log = _fit_model(loss_model, data_iter, valid_iter,
                     max_iterations, num_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)
    arg_params, aux_params = loss_model.get_params()
    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
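
# --- Hedged sketch (illustrative; not part of the original module) ---
# The MXNet-based create() above relies on _encode_target, defined elsewhere
# in this module, to re-code class labels as contiguous integer ids before
# chunking. A minimal version under that assumption, operating on an SFrame:
def _encode_target_sketch(data, target, mapping=None):
    """Return (data, mapping) with `target` re-coded to integer ids 0..K-1."""
    if mapping is None:
        # Sorting keeps the id order consistent with the sorted `classes`
        # list that create() saves into the model state.
        mapping = {t: i for i, t in enumerate(sorted(data[target].unique()))}
    data[target] = data[target].apply(lambda t: mapping[t])
    return data, mapping
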
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32, verbose=True,
           **kwargs):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data, where each session
        is a sequence of rows. The data must be in `stacked` format, grouped
        by session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Names of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance, to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of
        this SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from
        the training data (if the training data has > 100 sessions). If
        `validation_set` is set to None, then all the data will be used for
        training.

    max_iterations : int, optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater
        than the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...     'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...     'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...     'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...     'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...     'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get top-k predictions (instead of only top-1) if your labels have
        # more than two classes
        >>> predictions = model.predict_topk(data, k=3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    from .._mxnet import _mxnet_utils
    from ._mx_model_architecture import _net_params
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._mx_model_architecture import _define_model_mxnet, _fit_model_mxnet
    from ._mps_model_architecture import _define_model_mps, _fit_model_mps
    from .._mps_utils import (use_mps as _use_mps,
                              mps_device_name as _mps_device_name,
                              ac_weights_mps_to_mxnet as _ac_weights_mps_to_mxnet)

    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True, column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str."
            % invalid_features[0])
    if len(features) == 0:
        raise TypeError(
            "Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int])

    params = {'use_tensorflow': False, 'show_deprecated_warnings': False}

    if '_advanced_parameters' in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs['_advanced_parameters'].keys())
        set_keys = set(params.keys())
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError(
                'Unknown advanced parameters: {}'.format(unsupported))
        params.update(kwargs['_advanced_parameters'])

    if params['use_tensorflow']:
        # Imports tensorflow. (Importing unconditionally when the TensorFlow
        # backend is requested; the original guard skipped the import entirely
        # when show_deprecated_warnings was set, leaving the names undefined.)
        import tensorflow as _tf
        from ._tf_model_architecture import ActivityTensorFlowModel, _fit_model_tf
        if not params['show_deprecated_warnings']:
            # Suppresses verbosity to only errors
            _tf.compat.v1.logging.set_verbosity(_tf.compat.v1.logging.ERROR)

    if isinstance(validation_set, str) and validation_set == 'auto':
        # Computing the number of unique sessions in this way is relatively
        # expensive. Ideally we'd incorporate this logic into the C++ code
        # that chunks the raw data by prediction window.
        # TODO: https://github.com/apple/turicreate/issues/991
        unique_sessions = _SFrame({'session': dataset[session_id].unique()})
        if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
            print("The dataset has fewer than the minimum of",
                  _MIN_NUM_SESSIONS_FOR_SPLIT,
                  "sessions required for train-validation split. "
                  "Continuing without validation set")
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(dataset, session_id)

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, 'training_dataset')

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(
        dataset, features, session_id, prediction_window,
        predictions_in_chunk, target=target, verbose=verbose)

    # Decide whether to use MPS GPU, MXNet GPU or CPU
    num_mxnet_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    use_mps = _use_mps() and num_mxnet_gpus == 0 and not params['use_tensorflow']

    if verbose:
        if use_mps:
            print('Using GPU to create model ({})'.format(_mps_device_name()))
        elif num_mxnet_gpus == 1:
            print('Using GPU to create model (CUDA)')
        elif num_mxnet_gpus > 1:
            print('Using {} GPUs to create model (CUDA)'.format(num_mxnet_gpus))
        elif params['use_tensorflow']:
            print('Using Tensorflow to create model')
        else:
            print('Using CPU to create model')

    # Create data iterators
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_mxnet_gpus, 1)
    use_mx_data_batch = not (use_mps or params['use_tensorflow'])
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target,
                                    mx_output=use_mx_data_batch)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        for feature in features:
            # Check the validation set itself (the original checked `dataset`
            # here a second time, leaving validation_set unvalidated).
            _tkutl._handle_missing_values(validation_set, feature, 'validation_set')
        validation_set = validation_set.filter_by(list(target_map.keys()), target)
        validation_set, mapping = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(
            validation_set, features, session_id, prediction_window,
            predictions_in_chunk, target=target, verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                         prediction_window, predictions_in_chunk,
                                         batch_size, use_target=use_target,
                                         mx_output=use_mx_data_batch)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)

    # Always create MXNet models, as the pred_model is later saved to the
    # state. If MPS is used, the loss_model will be overwritten.
    loss_model, pred_model = _define_model_mxnet(
        len(target_map), prediction_window, predictions_in_chunk, context)

    if use_mps:
        loss_model = _define_model_mps(batch_size, len(features), len(target_map),
                                       prediction_window, predictions_in_chunk,
                                       is_prediction_model=False)
        log = _fit_model_mps(loss_model, data_iter, valid_iter,
                             max_iterations, verbose)
    else:
        if params['use_tensorflow']:
            net_params = _initialize_with_mxnet_weights(
                loss_model, chunked_data, features, prediction_window,
                predictions_in_chunk, batch_size, use_target)
            ac_model = ActivityTensorFlowModel(
                net_params, batch_size, len(features), len(target_map),
                prediction_window, predictions_in_chunk)
            # Train the model using TensorFlow
            log = _fit_model_tf(ac_model, net_params, data_iter, valid_iter,
                                max_iterations, verbose, 1e-3)
        else:
            # Train the model using MXNet
            log = _fit_model_mxnet(loss_model, data_iter, valid_iter,
                                   max_iterations, num_mxnet_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)

    if use_mps:
        mps_params = loss_model.export()
        arg_params, aux_params = _ac_weights_mps_to_mxnet(
            mps_params, _net_params['lstm_h'])
    elif params['use_tensorflow']:
        # Copy the weights back in the MXNet format
        arg_params, aux_params = ac_model.get_weights()
    else:
        arg_params, aux_params = loss_model.get_params()

    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
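
# --- Hedged sketch (illustrative; not part of the original module) ---
# All three create() variants split train/validation by whole sessions, never
# by individual rows, so no session contributes to both sides. The real helper
# is _random_split_by_session, imported at module level; a minimal version
# under assumed defaults (fraction of sessions kept for training):
def _random_split_by_session_sketch(dataset, session_id, fraction=0.9, seed=None):
    import random
    rng = random.Random(seed)
    session_ids = dataset[session_id].unique()
    # Assign each session, in its entirety, to one side of the split
    train_ids = [s for s in session_ids if rng.random() < fraction]
    train = dataset.filter_by(train_ids, session_id)
    valid = dataset.filter_by(train_ids, session_id, exclude=True)
    return train, valid
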