def from_model(cls, model, tree_id=0):
    """
    Build a ``DecisionTree`` from one tree of a trained model.

    Parameters
    ----------
    model : object
        Trained model. This function reads ``model.num_trees``,
        ``model.__proxy__`` and ``model._get(<attribute name>)``.

    tree_id : int, optional
        Index of the tree to extract. It is type-checked and range-checked
        against ``[0, model.num_trees - 1]``.

    Returns
    -------
    out : DecisionTree
        The parsed tree, with ``root_id`` set to 0 and the attributes
        ``num_examples``, ``num_features``, ``num_unpacked_features`` and
        ``max_depth`` copied over from ``model``.
    """
    import turicreate as _tc
    import json as _json

    _raise_error_if_not_of_type(tree_id, [int, long], "tree_id")
    _numeric_param_check_range("tree_id", tree_id, 0, model.num_trees - 1)

    tree = DecisionTree()
    tree_str = _tc.extensions._xgboost_get_tree(model.__proxy__, tree_id)
    metadata_mapping = _tc.extensions._get_metadata_mapping(model.__proxy__)
    trees_json = _json.loads(tree_str)

    # Parse the tree from the JSON.
    tree._make_tree(trees_json, metadata_mapping)
    tree.root_id = 0

    # Keep track of the attributes.
    for key in ("num_examples", "num_features", "num_unpacked_features",
                "max_depth"):
        setattr(tree, key, model._get(key))
    return tree
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Split an SFrame in two so that every session lands entirely on one side.

    The distinct values of the `session_id` column are split at random; the
    first returned SFrame holds all rows of the sessions drawn for the
    `fraction` side and the second holds all rows of the remaining sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        Name of the column in `dataset` that holds the unique identifier of
        each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.
        Must be between 0 and 1.

    seed : int, optional
        Seed for the random number generator used to split.

    Returns
    -------
    out : tuple (SFrame, SFrame)
        ``(train, valid)``.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the sessions.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    # Same type checks, in the same order, expressed as a table.
    type_checks = (
        (dataset, _SFrame, 'dataset'),
        (session_id, str, 'session_id'),
        (fraction, float, 'fraction'),
        (seed, [int, type(None)], 'seed'),
    )
    for value, expected, label in type_checks:
        _raise_error_if_not_of_type(value, expected, label)
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    # Split the distinct session ids first, then pull each side's rows.
    session_table = _SFrame({'session': dataset[session_id].unique()})
    train_sessions, valid_sessions = session_table.random_split(fraction, seed)

    train_rows = dataset.filter_by(train_sessions['session'], session_id)
    valid_rows = dataset.filter_by(valid_sessions['session'], session_id)
    return train_rows, valid_rows
def get_prediction_score(self, node_id):
    """
    Return the prediction score of a leaf node, or None for any other node.

    Parameters
    ----------
    node_id : int
        Id of the node whose prediction value is requested. It is
        range-checked against ``[0, self.num_nodes - 1]``.

    Returns
    -------
    float or None
        The node's ``value`` if it is a leaf node, None otherwise.

    Examples
    --------
    .. sourcecode:: python

        >>> tree.get_prediction_score(120)  # Leaf node
        0.251092

        >>> tree.get_prediction_score(60)  # Not a leaf node
        None
    """
    _raise_error_if_not_of_type(node_id, [int, long], "node_id")
    _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1)

    target = self.nodes[node_id]
    # Explicit comparison with False is kept on purpose: only a value equal
    # to False marks the node as a non-leaf here.
    if target.is_leaf == False:
        return None
    return target.value
def draw_bounding_boxes(images, annotations, confidence_threshold=0):
    """
    Return copies of the images with bounding boxes (ground truth or
    predictions) drawn on them.

    Parameters
    ----------
    images : SArray or Image
        An `SArray` of type `Image`. A single `Image` instance may also be
        given.

    annotations : SArray or list
        An `SArray` of annotations (either output from the
        `ObjectDetector.predict` function or ground truth). A single list of
        annotations may also be given, provided that it is coupled with a
        single image.

    confidence_threshold : float
        Confidence threshold that can limit the number of boxes to draw.
        Range-checked against [0, 1]. Defaults to 0, since the predictions
        may already have been pruned with an appropriate threshold.

    Returns
    -------
    annotated_images : SArray or Image
        Same form as the input `images`, with the boxes drawn on each image.

    See also
    --------
    unstack_annotations
    """
    _numeric_param_check_range('confidence_threshold', confidence_threshold,
                               0.0, 1.0)
    from PIL import Image

    def draw_single_image(row):
        source_image = row['image']
        boxes = row['annotations']

        # Draw on a PIL copy, then convert the result back to a raw image.
        canvas = Image.fromarray(source_image.pixel_data)
        _annotate_image(canvas, boxes,
                        confidence_threshold=confidence_threshold)
        pixels = _np.array(canvas)

        FORMAT_RAW = 2
        return _tc.Image(_image_data=pixels.tobytes(),
                         _width=pixels.shape[1],
                         _height=pixels.shape[0],
                         _channels=pixels.shape[2],
                         _format_enum=FORMAT_RAW,
                         _image_data_size=pixels.size)

    single_input = isinstance(images, _tc.Image) and isinstance(annotations, list)
    if single_input:
        return draw_single_image({'image': images, 'annotations': annotations})

    paired = _tc.SFrame({'image': images, 'annotations': annotations})
    return paired.apply(draw_single_image)
def evaluate(
    self,
    dataset,
    metric="auto",
    output_type="dict",
    confidence_threshold=0.001,
    iou_threshold=0.45,
):
    """
    Evaluate the model by making predictions and comparing them to ground
    truth bounding box annotations.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the annotations and feature used for model training.
        Additional columns are ignored.

    metric : str or list, optional
        Name of the evaluation metric or list of several names. The primary
        metric is average precision, which is the area under the
        precision/recall curve and reported as a value between 0 and 1 (1
        being perfect). Possible values are:

        - 'auto'                      : Returns all primary metrics.
        - 'all'                       : Returns all available metrics.
        - 'average_precision_50'      : Average precision per class with
                                        intersection-over-union threshold at
                                        50% (PASCAL VOC metric).
        - 'average_precision'         : Average precision per class
                                        calculated over multiple
                                        intersection-over-union thresholds
                                        (at 50%, 55%, ..., 95%) and averaged.
        - 'mean_average_precision_50' : Mean over all classes (for
                                        ``'average_precision_50'``). This is
                                        the primary single-value metric.
        - 'mean_average_precision'    : Mean over all classes (for
                                        ``'average_precision'``)

    output_type : str, optional
        Forwarded unchanged to the backend; selects the type of the returned
        object (see Returns).

    confidence_threshold : float, optional
        Range-checked against [0, 1] and forwarded to the backend as the
        ``"confidence_threshold"`` option.

    iou_threshold : float, optional
        Range-checked against [0, 1] and forwarded to the backend as the
        ``"iou_threshold"`` option.

    Returns
    -------
    out : dict / SFrame
        Output type depends on the option `output_type`.

    See Also
    --------
    create, predict

    Examples
    --------
    >>> results = model.evaluate(data)
    >>> print('mAP: {:.1%}'.format(results['mean_average_precision']))
    mAP: 43.2%
    """
    for label, threshold in (
        ("confidence_threshold", confidence_threshold),
        ("iou_threshold", iou_threshold),
    ):
        _numeric_param_check_range(label, threshold, 0.0, 1.0)

    backend_options = {
        "confidence_threshold": confidence_threshold,
        "iou_threshold": iou_threshold,
    }
    return self.__proxy__.evaluate(dataset, metric, output_type, backend_options)
def predict(self, dataset, confidence_threshold=0.25, iou_threshold=0.45,
            verbose=True):
    """
    Predict object instances in an SFrame of images.

    Parameters
    ----------
    dataset : SFrame | SArray | turicreate.Image
        The images on which to perform object detection. If dataset is an
        SFrame, it must have a column with the same name as the feature
        column during training. Additional columns are ignored.

    confidence_threshold : float, optional
        Range-checked against [0, 1] and forwarded to the backend as the
        ``"confidence_threshold"`` option.

    iou_threshold : float, optional
        Range-checked against [0, 1] and forwarded to the backend as the
        ``"iou_threshold"`` option.

    verbose : bool, optional
        Forwarded to the backend as the ``"verbose"`` option.

    Returns
    -------
    out : SArray
        An SArray with model predictions. Each element corresponds to an
        image and contains a list of dictionaries. Each dictionary describes
        an object instance that was found in the image. If `dataset` is a
        single image, the return value will be a single prediction.

    See Also
    --------
    evaluate

    Examples
    --------
    .. sourcecode:: python

        # Make predictions
        >>> pred = model.predict(data)

        # Stack predictions, for a better overview
        >>> turicreate.object_detector.util.stack_annotations(pred)
        Data:
        +--------+------------+-------+-------+-------+-------+--------+
        | row_id | confidence | label |   x   |   y   | width | height |
        +--------+------------+-------+-------+-------+-------+--------+
        |   0    |    0.98    |  dog  | 123.0 | 128.0 |  80.0 | 182.0  |
        |   0    |    0.67    |  cat  | 150.0 | 183.0 | 129.0 | 101.0  |
        |   1    |    0.8     |  dog  |  50.0 | 432.0 |  65.0 |  98.0  |
        +--------+------------+-------+-------+-------+-------+--------+
        [3 rows x 7 columns]

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    for label, threshold in (
        ("confidence_threshold", confidence_threshold),
        ("iou_threshold", iou_threshold),
    ):
        _numeric_param_check_range(label, threshold, 0.0, 1.0)

    backend_options = {
        "confidence_threshold": confidence_threshold,
        "iou_threshold": iou_threshold,
        "verbose": verbose,
    }
    return self.__proxy__.predict(dataset, backend_options)
def predict(self, dataset, confidence_threshold=0.25, verbose=True):
    """
    Predict object instances in an SFrame of images.

    Parameters
    ----------
    dataset : SFrame
        A dataset that has the same columns that were used during training.
        If the annotations column exists in ``dataset`` it will be ignored
        while making predictions.

    confidence_threshold : float
        Only return predictions above this level of confidence. The
        threshold can range from 0 to 1.

    verbose : bool
        If True, prints prediction progress.

    Returns
    -------
    out : SArray
        An SArray with model predictions. Each element corresponds to an
        image and contains a list of dictionaries. Each dictionary describes
        an object instance that was found in the image.

    See Also
    --------
    evaluate

    Examples
    --------
    .. sourcecode:: python

        # Make predictions
        >>> pred = model.predict(data)

        # Stack predictions, for a better overview
        >>> turicreate.object_detector.util.stack_annotations(pred)
        Data:
        +--------+------------+-------+-------+-------+-------+--------+
        | row_id | confidence | label |   x   |   y   | width | height |
        +--------+------------+-------+-------+-------+-------+--------+
        |   0    |    0.98    |  dog  | 123.0 | 128.0 |  80.0 | 182.0  |
        |   0    |    0.67    |  cat  | 150.0 | 183.0 | 129.0 | 101.0  |
        |   1    |    0.8     |  dog  |  50.0 | 432.0 |  65.0 |  98.0  |
        +--------+------------+-------+-------+-------+-------+--------+
        [3 rows x 7 columns]

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _numeric_param_check_range('confidence_threshold', confidence_threshold,
                               0.0, 1.0)

    # Predictions come back stacked (one row per detection) ...
    stacked = self._predict_with_options(
        dataset,
        with_ground_truth=False,
        confidence_threshold=confidence_threshold,
        verbose=verbose,
    )

    # ... and are regrouped into one entry per input row.
    from . import util as _util
    row_count = len(dataset)
    return _util.unstack_annotations(stacked, num_rows=row_count)
def assert_valid_num_gpus():
    """
    Validate the configured number of GPUs.

    Reads the GPU count from ``_tc_config.get_num_gpus()``. When no CUDA GPU
    ids are listed and the platform is 'darwin', a ``_ToolkitError`` is
    raised if one GPU is requested on a macOS version below 10.14, or if two
    or more GPUs are requested. The count is then range-checked against
    ``[-1, _six.MAXSIZE]``.
    """
    from turicreate.util import _CUDA_GPU_IDS

    gpu_count = _tc_config.get_num_gpus()

    mac_without_cuda = (not _CUDA_GPU_IDS) and _sys.platform == 'darwin'
    if mac_without_cuda:
        if gpu_count >= 2:
            raise _ToolkitError(
                'Using more than one GPU is currently not supported on Mac')
        # GPU acceleration requires macOS 10.14+
        if gpu_count == 1 and _mac_ver() < (10, 14):
            raise _ToolkitError(
                'GPU acceleration requires at least macOS 10.14')

    _numeric_param_check_range('num_gpus', gpu_count, -1, _six.MAXSIZE)
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32,
           verbose=True):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session. The
        column must be of string or integer type.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second. Range-checked
        against [1, 400].

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of
        this SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from
        the training data (if the training data has at least 100 sessions).
        If validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int, optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. The value actually
        used is raised, if necessary, to the number of GPUs in use (and to at
        least 1); the user-provided value is still stored on the model.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Raises
    ------
    _ToolkitError
        If `target` or `session_id` is not a str.
    TypeError
        If `features` is not iterable, contains a non-str entry, or is empty.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...     'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...     'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...     'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...     'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...     'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get topk predictions (instead of only top-1) if your labels have
        # more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    from ._model_architecture import _net_params
    from ._model_architecture import _define_model, _fit_model
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data

    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window,
                                      1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0,
                                      _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(dataset,
                                              interpret_as_excluded=True,
                                              column_names=[session_id, target])
    if not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Report the first offending entry. Referring to a comprehension variable
    # here would be a NameError on Python 3 and the wrong element on Python 2.
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str."
                % (feature,))
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset,
                                              features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target,
                                                     [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id],
                                                     session_id, [str, int])

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset, features, session_id,
                                            prediction_window,
                                            predictions_in_chunk,
                                            target=target, verbose=verbose)

    if isinstance(validation_set, str) and validation_set == 'auto':
        if num_sessions < 100:
            validation_set = None
        else:
            # NOTE(review): chunked_data above was built from the full
            # dataset before this split, so the training iterator appears to
            # still contain the sessions moved to validation_set -- confirm
            # whether that is intended.
            dataset, validation_set = _random_split_by_session(dataset,
                                                               session_id)

    # Create data iterators
    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_gpus, 1)
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        # Keep only rows whose label is one of the classes seen in training.
        validation_set = validation_set.filter_by(target_map.keys(), target)
        validation_set, _ = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(validation_set, features,
                                               session_id, prediction_window,
                                               predictions_in_chunk,
                                               target=target, verbose=False)
        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                         prediction_window,
                                         predictions_in_chunk, batch_size,
                                         use_target=use_target)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)
    loss_model, pred_model = _define_model(features, target_map,
                                           prediction_window,
                                           predictions_in_chunk, context)

    # Train the model
    log = _fit_model(loss_model, data_iter, valid_iter, max_iterations,
                     num_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)
    arg_params, aux_params = loss_model.get_params()
    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size,
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Split an SFrame in two so that every session lands entirely on one side.

    Each row is kept for the first SFrame with probability `fraction`, where
    the draw depends only on the row's session id and the seed, so all rows
    of a session receive the same decision. The requested fraction is
    therefore reached in expectation rather than exactly.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        Name of the column in `dataset` that holds the unique identifier of
        each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.
        Must be between 0 and 1.

    seed : int, optional
        Seed for the random number generator used to split. When None, a
        seed is derived from the current time.

    Returns
    -------
    out : tuple (SFrame, SFrame)
        ``(train, valid)``.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the sessions.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has approximately 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining sessions
        >>> len(valid['session_id'].unique())
        205
    """
    from random import Random

    type_checks = (
        (dataset, _SFrame, 'dataset'),
        (session_id, str, 'session_id'),
        (fraction, float, 'fraction'),
        (seed, [int, type(None)], 'seed'),
    )
    for value, expected, label in type_checks:
        _raise_error_if_not_of_type(value, expected, label)
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    if seed is None:
        # Include the nanosecond component as well.
        import time
        timestamp_text = "%0.20f" % time.time()
        seed = abs(hash(timestamp_text)) % (2**31)

    # The cython bindings require this to be an int, so cast if we can.
    try:
        seed = int(seed)
    except ValueError:
        raise ValueError("The 'seed' parameter must be of type int.")

    rng = Random()

    # Build a boolean SArray that is constant within each session: the
    # generator is re-seeded with the (seeded) hash of the session id before
    # every draw, so the decision is a pseudorandom function of the session
    # id and the global seed. Boolean filters preserve row order, so rows
    # need no re-sorting within a session.
    def random_session_pick(session_id_hash):
        rng.seed(session_id_hash)
        return rng.uniform(0, 1) < fraction

    hashed_sessions = dataset[session_id].hash(seed)
    keep_mask = hashed_sessions.apply(random_session_pick)

    train_rows = dataset[keep_mask]
    valid_rows = dataset[1 - keep_mask]
    return train_rows, valid_rows
def assert_valid_num_gpus():
    """
    Validate the configured number of GPUs.

    Reads the GPU count from ``_tc_config.get_num_gpus()``. A
    ``_ToolkitError`` is raised when no CUDA GPU ids are listed, the
    platform is 'darwin' and a positive GPU count is configured. The count
    is then range-checked against ``[-1, _six.MAXSIZE]``.
    """
    from turicreate.util import _CUDA_GPU_IDS

    gpu_count = _tc_config.get_num_gpus()

    if not _CUDA_GPU_IDS:
        if _sys.platform == 'darwin':
            if gpu_count > 0:
                raise _ToolkitError(
                    'Using GPUs is currently not supported on Mac')

    _numeric_param_check_range('num_gpus', gpu_count, -1, _six.MAXSIZE)
def get_prediction_path(self, node_id, missing_id=None):
    """
    Return the prediction path from the root of the tree down to this node.

    Parameters
    ----------
    node_id : int
        Id of the node to get the prediction path for. It is range-checked
        against ``[0, self.num_nodes - 1]``.

    missing_id : collection of node ids, optional
        Additional info that contains nodes with missing features. A node on
        the path whose id is in this collection is passed to the parent's
        ``get_decision`` as missing. Defaults to no such nodes.

    Returns
    -------
    list
        The list of decisions (top to bottom) from the root to this node.
        Decisions on the same (feature, index) pair are merged into the
        first one on the path.

    Examples
    --------
    .. sourcecode:: python

        >>> tree.get_prediction_path(5)  # Any node
        [{'child_id': 2,
          'feature': 'Quantity_features_90',
          'index': 'sum_timegaplast_gap',
          'node_id': 0,
          'sign': '>',
          'value': 53.5},
         {'child_id': 5,
          'feature': 'Quantity_features_90',
          'index': 'sum_sum',
          'node_id': 2,
          'sign': '<=',
          'value': 146.5}]
    """
    _raise_error_if_not_of_type(node_id, [int, long], "node_id")
    _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1)

    # A None default avoids sharing one mutable list between calls.
    if missing_id is None:
        missing_id = ()

    def _deduplicate_path(path):
        # Merge decisions that test the same (feature, index) into the first
        # such decision (the "super node"). Only the signs "<" and ">=" are
        # merged; two opposite bounds become a range with sign "in".
        s_nodes = {}  # super nodes, keyed by (feature, index)
        s_path = []   # path of super nodes, in first-seen order

        for node in path:
            feature = node['feature']
            index = node['index']
            if (feature, index) not in s_nodes:
                s_nodes[feature, index] = node
                s_path.append(node)
            else:
                s_node = s_nodes[feature, index]
                s_sign = s_node['sign']
                sign = node['sign']
                value = node['value']

                # Supernode has no range.
                if s_sign == "<":
                    if sign == ">=":
                        s_node["value"] = [value, s_node["value"]]
                        s_node["sign"] = "in"
                    elif sign == "<":
                        s_node["value"] = value
                elif s_sign == ">=":
                    if sign == ">=":
                        s_node["value"] = value
                    elif sign == "<":
                        s_node["value"] = [s_node["value"], value]
                        s_node["sign"] = "in"

                # Supernode has a range.
                elif s_sign == "in":
                    if sign == ">=":
                        s_node["value"][0] = value
                    elif sign == "<":
                        s_node["value"][1] = value

        # Return super node path.
        return s_path

    # Walk up from the node to the root, collecting decisions bottom-up,
    # then reverse once to get top-to-bottom order.
    path = []
    node = self.nodes[node_id]
    while node.parent is not None:
        parent = node.parent
        is_missing = node.node_id in missing_id
        path.append(parent.get_decision(node, is_missing))
        node = parent
    path.reverse()

    return _deduplicate_path(path)
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Split an SFrame in two so that every session lands entirely on one side.

    Each row is kept for the first SFrame with probability `fraction`, where
    the draw depends only on the row's session id and the seed, so all rows
    of a session receive the same decision. The requested fraction is
    therefore reached in expectation rather than exactly.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        Name of the column in `dataset` that holds the unique identifier of
        each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame.
        Must be between 0 and 1.

    seed : int, optional
        Seed for the random number generator used to split. When None, a
        seed is derived from the current time.

    Returns
    -------
    out : tuple (SFrame, SFrame or None)
        ``(train, valid)``. When the dataset has fewer than
        ``_MIN_NUM_SESSIONS_FOR_SPLIT`` distinct sessions, a message is
        printed and ``(dataset, None)`` is returned.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the sessions.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has approximately 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining sessions
        >>> len(valid['session_id'].unique())
        205
    """
    type_checks = (
        (dataset, _SFrame, 'dataset'),
        (session_id, str, 'session_id'),
        (fraction, float, 'fraction'),
        (seed, [int, type(None)], 'seed'),
    )
    for value, expected, label in type_checks:
        _raise_error_if_not_of_type(value, expected, label)
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    session_table = _SFrame({'session': dataset[session_id].unique()})
    if len(session_table) < _MIN_NUM_SESSIONS_FOR_SPLIT:
        print ("The dataset has less than the minimum of",
               _MIN_NUM_SESSIONS_FOR_SPLIT,
               "sessions required for train-validation split. "
               "Continuing without validation set")
        return dataset, None

    # An actual seed number is needed for the apply function below. If the
    # user did not provide one, derive it from the current system time
    # (similar to the mechanism behind random.seed(None)).
    if seed is None:
        import time
        seed = long(time.time() * 256)

    rng = Random()

    # Build a boolean SArray that is constant within each session, so that
    # in expectation the desired fraction of sessions goes to the training
    # set. Boolean filters preserve row order, so rows need no re-sorting
    # within a session.
    def random_session_pick(sid):
        # Seeding with the session id alone would give the same split on
        # every run for the same dataset, hence the added global seed.
        # NOTE(review): on Python 3 the builtin hash() of str values is
        # salted per process; confirm session ids hash consistently wherever
        # this function is executed.
        rng.seed(hash(sid) + seed)
        return rng.uniform(0, 1) < fraction

    keep_mask = dataset[session_id].apply(random_session_pick)

    train_rows = dataset[keep_mask]
    valid_rows = dataset[1 - keep_mask]
    return train_rows, valid_rows
def to_json(self, root_id=0, output=None):
    """
    Recursively dump the sub-tree rooted at `root_id` as a JSON-style dict.

    Parameters
    ----------
    root_id : int, optional
        Root id of the sub-tree. It is range-checked against
        ``[0, self.num_nodes - 1]``.

    output : dict, optional
        Ignored. Kept only so that existing callers passing it keep working;
        the result is always built from the node's own ``to_dict()``.

    Returns
    -------
    dict
        A tree in JSON format. Starts at the root node and recursively
        represents each node in JSON.

        - node_id              : ID of the node.
        - left_id              : ID of left child (None if it doesn't exist).
        - right_id             : ID of right child (None if it doesn't exist).
        - split_feature_column : Feature column on which a decision is made.
        - split_feature_index  : Feature index (within that column) on which
                                 the decision is made.
        - is_leaf              : Is this node a leaf node?
        - node_type            : Node type (categorical, numerical, leaf etc.)
        - value                : Prediction (if leaf), decision split point
                                 (if not leaf).
        - left                 : JSON representation of the left node.
        - right                : JSON representation of the right node.

    Examples
    --------
    .. sourcecode:: python

        >>> tree.to_json(60)  # Sub-tree rooted at node 60
        {'is_leaf': False,
         'left': {'is_leaf': True,
                  'left_id': None,
                  'node_id': 115,
                  'node_type': u'leaf',
                  'parent_id': 60,
                  'right_id': None,
                  'split_feature_column': None,
                  'split_feature_index': None,
                  'value': 0.436364},
         'left_id': 115,
         'node_id': 60,
         'node_type': u'float',
         'parent_id': 29,
         'right': {'is_leaf': True,
                   'left_id': None,
                   'node_id': 116,
                   'node_type': u'leaf',
                   'parent_id': 60,
                   'right_id': None,
                   'split_feature_column': None,
                   'split_feature_index': None,
                   'value': -0.105882},
         'right_id': 116,
         'split_feature_column': 'Quantity_features_14',
         'split_feature_index': 'count_sum',
         'value': 22.5}
    """
    _raise_error_if_not_of_type(root_id, [int, long], "root_id")
    _numeric_param_check_range("root_id", root_id, 0, self.num_nodes - 1)

    node = self.nodes[root_id]
    result = node.to_dict()

    # Attach each existing child as a nested dict.
    if node.left_id is not None:
        result['left'] = self.to_json(node.left_id)
    if node.right_id is not None:
        result['right'] = self.to_json(node.right_id)
    return result
def predict_topk(
    self, dataset, output_type="probability", k=3, output_frequency="per_row"
):
    """
    Return top-k predictions for the ``dataset``, using the trained model.
    Predictions are returned as an SFrame with three columns:
    `prediction_id`, `class`, and `probability`, or `rank`, depending on the
    ``output_type`` parameter.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features and session id used for model training, but
        does not require a target column. Additional columns are ignored.

    output_type : {'probability', 'rank'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the prediction.

    k : int, optional
        Number of classes to return for each input example. Must be an int;
        it is range-checked against ``[1, _six.MAXSIZE]``.

    output_frequency : {'per_row', 'per_window'}, optional
        The frequency of the predictions which is one of:

        - 'per_row': Each prediction is returned ``prediction_window`` times.
        - 'per_window': Return a single prediction for each
          ``prediction_window`` rows in ``dataset`` per ``session_id``.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    Raises
    ------
    TypeError
        If `k` is not an int.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +---------------+-------+-------------------+
    |     row_id    | class |    probability    |
    +---------------+-------+-------------------+
    |       0       |   4   |   0.995623886585  |
    |       0       |   9   |  0.0038311756216  |
    |       0       |   7   | 0.000301006948575 |
    |       1       |   1   |   0.928708016872  |
    |       1       |   3   |  0.0440889261663  |
    |       1       |   2   |  0.0176190119237  |
    |       2       |   3   |   0.996967732906  |
    |       2       |   2   |  0.00151345680933 |
    |       2       |   7   | 0.000637513934635 |
    |       3       |   1   |   0.998070061207  |
    |      ...      |  ...  |        ...        |
    +---------------+-------+-------------------+
    """
    k_is_int = isinstance(k, int)
    if not k_is_int:
        raise TypeError("k must be of type int")
    _tkutl._numeric_param_check_range("k", k, 1, _six.MAXSIZE)

    backend = self.__proxy__
    return backend.predict_topk(dataset, output_type, k, output_frequency)
def create(
    dataset,
    session_id,
    target,
    features=None,
    prediction_window=100,
    validation_set="auto",
    max_iterations=10,
    batch_size=32,
    verbose=True,
):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Raises
    ------
    ToolkitError
        If `target` or `session_id` is not a ``str``, or `batch_size` is not
        an ``int``.
    TypeError
        If `features` is not an iterable of ``str`` column names, or is empty.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get topk predictions (instead of only top-1) if your labels have
        # more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError("target must be of type str")
    if not isinstance(session_id, str):
        raise _ToolkitError("session_id must be of type str")
    if not isinstance(batch_size, int):
        raise _ToolkitError("batch_size must be of type int")

    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")
    _tkutl._numeric_param_check_range("prediction_window", prediction_window, 1, 400)
    _tkutl._numeric_param_check_range("max_iterations", max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True, column_names=[session_id, target]
        )
    # A bare string is iterable on Python 3 but is never a valid feature list.
    if isinstance(features, str) or not hasattr(features, "__iter__"):
        raise TypeError("Input 'features' must be a list.")
    # Normalise to a list so the concatenation below works for any iterable.
    features = list(features)
    # Explicit loop: the offending name must be available for the message
    # (a comprehension variable is not visible outside the comprehension).
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str." % (feature,)
            )
    if len(features) == 0:
        raise TypeError("Input 'features' must contain at least one column name.")

    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[target], target, [str, int]
    )
    _tkutl._raise_error_if_sarray_not_expected_dtype(
        dataset[session_id], session_id, [str, int]
    )

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, "training_dataset")

    # Check for missing values for sframe validation set
    if isinstance(validation_set, _SFrame):
        _tkutl._raise_error_if_sframe_empty(validation_set, "validation_set")
        for feature in features:
            _tkutl._handle_missing_values(validation_set, feature, "validation_set")

    # C++ model
    name = "activity_classifier"

    import turicreate as _turicreate

    # Imports tensorflow
    import turicreate.toolkits.libtctensorflow

    model = _turicreate.extensions.activity_classifier()
    options = {}
    options["prediction_window"] = prediction_window
    options["batch_size"] = batch_size
    options["max_iterations"] = max_iterations
    options["verbose"] = verbose
    options["_show_loss"] = False

    # validation_set is forwarded as given ('auto', None or an SFrame).
    model.train(dataset, target, session_id, validation_set, options)
    return ActivityClassifier(model_proxy=model, name=name)
def predict_topk(
    self, dataset, output_type="probability", k=3, verbose=True, batch_size=64
):
    """
    Return top-k predictions for the ``dataset``.

    Predictions are returned as an SFrame with three columns: `id`, `class`,
    and `probability` or `rank` depending on the ``output_type`` parameter.

    Parameters
    ----------
    dataset : SFrame | SArray | dict
        The audio data to be classified.
        If dataset is an SFrame, it must have a column with the same name as
        the feature used for model training, but does not require a target
        column. Additional columns are ignored.

    output_type : {'probability', 'rank'}, optional
        Choose the return type of the prediction:

        - `probability`: Probability associated with each label in the
          prediction.
        - `rank`       : Rank associated with each label in the prediction
          (0 for the most probable class).

    k : int, optional
        Number of classes to return for each input example.

    verbose : bool, optional
        If True, prints progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    Raises
    ------
    TypeError
        If ``k`` is not an ``int``.
    ValueError
        If ``output_type`` is neither 'probability' nor 'rank'.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +------+-------+-------------------+
    |  id  | class |    probability    |
    +------+-------+-------------------+
    |  0   |   4   |   0.995623886585  |
    |  0   |   9   |  0.0038311756216  |
    |  0   |   7   | 0.000301006948575 |
    |  1   |   1   |   0.928708016872  |
    |  1   |   3   |  0.0440889261663  |
    |  1   |   2   |  0.0176190119237  |
    |  2   |   3   |   0.996967732906  |
    |  2   |   2   |  0.00151345680933 |
    |  2   |   7   | 0.000637513934635 |
    |  3   |   1   |   0.998070061207  |
    | ...  |  ...  |        ...        |
    +------+-------+-------------------+
    """
    if not isinstance(k, int):
        raise TypeError("'k' must be of type int.")
    # Validate up front with a real exception: an `assert` is stripped under
    # `python -O`, and failing before `predict` avoids wasted inference work.
    if output_type not in ("probability", "rank"):
        raise ValueError("'output_type' must be either 'probability' or 'rank'.")
    _tk_utils._numeric_param_check_range("k", k, 1, _six.MAXSIZE)

    prob_vector = self.predict(
        dataset,
        output_type="probability_vector",
        verbose=verbose,
        batch_size=batch_size,
    )
    id_to_label = self._id_to_class_label

    # argsort is ascending, so the last k indices reversed give the top k
    # classes from most to least probable.
    if output_type == "probability":
        results = prob_vector.apply(
            lambda p: [
                {"class": id_to_label[i], "probability": p[i]}
                for i in reversed(_np.argsort(p)[-k:])
            ]
        )
    else:
        results = prob_vector.apply(
            lambda p: [
                {"class": id_to_label[i], "rank": rank}
                for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))
            ]
        )

    # One row per (example, class): number the examples, then explode the
    # per-example list and unpack each dict into columns.
    results = _tc.SFrame({"X": results})
    results = results.add_row_number()
    results = results.stack("X", new_column_name="X")
    results = results.unpack("X", column_name_prefix="")
    return results
def assert_valid_num_gpus():
    """
    Validate the configured number of GPUs.

    Reads the GPU count from ``_tc_config.get_num_gpus()`` and checks that it
    is at least -1.

    Raises
    ------
    ToolkitError
        If running on macOS (``sys.platform == 'darwin'``) with a positive
        GPU count.
    """
    num_gpus = _tc_config.get_num_gpus()
    if _sys.platform == 'darwin' and num_gpus > 0:
        raise _ToolkitError('Using GPUs is currently not supported on Mac')
    # sys.maxint was removed in Python 3; sys.maxsize exists on both 2.6+
    # and 3.x and serves the same "no practical upper bound" purpose here.
    _numeric_param_check_range('num_gpus', num_gpus, -1, _sys.maxsize)
def create(dataset, session_id, target, features=None, prediction_window=100,
           validation_set='auto', max_iterations=10, batch_size=32, verbose=True,
           **kwargs):
    """
    Create an :class:`ActivityClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data which consists of `sessions` of data where each session is
        a sequence of data. The data must be in `stacked` format, grouped by
        session. Within each session, the data is assumed to be sorted
        temporally. Columns in `features` will be used to train a model that
        will make a prediction using labels in the `target` column.

    session_id : string
        Name of the column that contains a unique ID for each session.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    features : list[string], optional
        Name of the columns containing the input features that will be used
        for classification. If set to `None`, all columns except `session_id`
        and `target` will be used.

    prediction_window : int, optional
        Number of time units between predictions. For example, if your input
        data is sampled at 100Hz, and the `prediction_window` is set to 100,
        then this model will make a prediction every 1 second.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance to
        prevent the model from overfitting to the training data.

        For each row of the progress table, accuracy is measured over the
        provided training dataset and the `validation_set`. The format of this
        SFrame must be the same as the training set.

        When set to 'auto', a validation set is automatically sampled from the
        training data (if the training data has > 100 sessions). If
        validation_set is set to None, then all the data will be used for
        training.

    max_iterations : int , optional
        Maximum number of iterations/epochs made over the data during the
        training phase.

    batch_size : int, optional
        Number of sequence chunks used per training step. Must be greater than
        the number of GPUs in use.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs
        Only the key ``_advanced_parameters`` is read. It must map to a dict
        whose keys are a subset of ``use_tensorflow`` and
        ``show_deprecated_warnings``.

    Returns
    -------
    out : ActivityClassifier
        A trained :class:`ActivityClassifier` model.

    Raises
    ------
    ToolkitError
        If `target` or `session_id` is not a ``str``, or if
        ``_advanced_parameters`` contains an unknown key.
    TypeError
        If `features` is not an iterable of ``str`` column names, or is empty.

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate as tc

        # Training on dummy data
        >>> data = tc.SFrame({
        ...    'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10,
        ...    'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10,
        ...    'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10,
        ...    'session_id': [0, 0, 0] * 10 + [1, 1] * 10,
        ...    'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10
        ... })

        # Create an activity classifier
        >>> model = tc.activity_classifier.create(data,
        ...     session_id='session_id', target='activity',
        ...     features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z'])

        # Make predictions (as probability vector, or class)
        >>> predictions = model.predict(data)
        >>> predictions = model.predict(data, output_type='probability_vector')

        # Get both predictions and classes together
        >>> predictions = model.classify(data)

        # Get topk predictions (instead of only top-1) if your labels have
        # more than 2 classes
        >>> predictions = model.predict_topk(data, k = 3)

        # Evaluate the model
        >>> results = model.evaluate(data)

    See Also
    --------
    ActivityClassifier, util.random_split_by_session
    """
    from .._mxnet import _mxnet_utils
    from ._mx_model_architecture import _net_params
    from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
    from ._sframe_sequence_iterator import prep_data as _prep_data
    from ._mx_model_architecture import _define_model_mxnet, _fit_model_mxnet
    from ._mps_model_architecture import _define_model_mps, _fit_model_mps
    from .._mps_utils import (use_mps as _use_mps,
                              mps_device_name as _mps_device_name,
                              ac_weights_mps_to_mxnet as _ac_weights_mps_to_mxnet)

    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    if not isinstance(target, str):
        raise _ToolkitError('target must be of type str')
    if not isinstance(session_id, str):
        raise _ToolkitError('session_id must be of type str')
    _tkutl._raise_error_if_sframe_empty(dataset, 'dataset')
    _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400)
    _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)

    if features is None:
        features = _fe_tkutl.get_column_names(
            dataset, interpret_as_excluded=True, column_names=[session_id, target])
    # A bare string is iterable on Python 3 but is never a valid feature list.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Normalise to a list so the concatenations below work for any iterable.
    features = list(features)
    # Explicit loop: the offending name must be available for the message
    # (a comprehension variable is not visible outside the comprehension).
    for feature in features:
        if not isinstance(feature, str):
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str." % (feature,))
    if len(features) == 0:
        raise TypeError(
            "Input 'features' must contain at least one column name.")

    start_time = _time.time()
    dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int])
    _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int])

    params = {'use_tensorflow': False, 'show_deprecated_warnings': False}

    if '_advanced_parameters' in kwargs:
        # Make sure no additional parameters are provided
        new_keys = set(kwargs['_advanced_parameters'].keys())
        set_keys = set(params.keys())
        unsupported = new_keys - set_keys
        if unsupported:
            raise _ToolkitError(
                'Unknown advanced parameters: {}'.format(unsupported))

        params.update(kwargs['_advanced_parameters'])

    if params['use_tensorflow']:
        # The TensorFlow names are needed whenever the TensorFlow backend is
        # selected; only the log suppression depends on the warnings flag.
        import tensorflow as _tf
        from ._tf_model_architecture import ActivityTensorFlowModel, _fit_model_tf

        if not params['show_deprecated_warnings']:
            # Suppresses verbosity to only errors
            _tf.compat.v1.logging.set_verbosity(_tf.compat.v1.logging.ERROR)

    if isinstance(validation_set, str) and validation_set == 'auto':
        # Computing the number of unique sessions in this way is relatively
        # expensive. Ideally we'd incorporate this logic into the C++ code that
        # chunks the raw data by prediction window.
        # TODO: https://github.com/apple/turicreate/issues/991
        unique_sessions = _SFrame({'session': dataset[session_id].unique()})
        if len(unique_sessions) < _MIN_NUM_SESSIONS_FOR_SPLIT:
            print(
                "The dataset has less than the minimum of",
                _MIN_NUM_SESSIONS_FOR_SPLIT,
                "sessions required for train-validation split. Continuing without validation set"
            )
            validation_set = None
        else:
            dataset, validation_set = _random_split_by_session(
                dataset, session_id)

    for feature in features:
        _tkutl._handle_missing_values(dataset, feature, 'training_dataset')

    # Encode the target column to numerical values
    use_target = target is not None
    dataset, target_map = _encode_target(dataset, target)

    predictions_in_chunk = 20
    chunked_data, num_sessions = _prep_data(dataset, features, session_id,
                                            prediction_window,
                                            predictions_in_chunk,
                                            target=target, verbose=verbose)

    # Decide whether to use MPS GPU, MXnet GPU or CPU
    num_mxnet_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions)
    use_mps = _use_mps() and num_mxnet_gpus == 0 and not (
        params['use_tensorflow'])

    if verbose:
        if use_mps:
            print('Using GPU to create model ({})'.format(_mps_device_name()))
        elif num_mxnet_gpus == 1:
            print('Using GPU to create model (CUDA)')
        elif num_mxnet_gpus > 1:
            print(
                'Using {} GPUs to create model (CUDA)'.format(num_mxnet_gpus))
        elif params['use_tensorflow']:
            print('Using Tensorflow to create model')
        else:
            print('Using CPU to create model')

    # Create data iterators
    user_provided_batch_size = batch_size
    batch_size = max(batch_size, num_mxnet_gpus, 1)
    use_mx_data_batch = not (use_mps or params['use_tensorflow'])
    data_iter = _SFrameSequenceIter(chunked_data, len(features),
                                    prediction_window, predictions_in_chunk,
                                    batch_size, use_target=use_target,
                                    mx_output=use_mx_data_batch)

    if validation_set is not None:
        _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set')
        _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set')
        validation_set = _tkutl._toolkits_select_columns(
            validation_set, features + [session_id, target])
        for feature in features:
            # Check the validation set itself, not the training data again.
            _tkutl._handle_missing_values(validation_set, feature, 'validation_set')
        # Keep only rows whose label was seen in training.
        validation_set = validation_set.filter_by(list(target_map.keys()), target)
        validation_set, _ = _encode_target(validation_set, target, target_map)
        chunked_validation_set, _ = _prep_data(validation_set, features,
                                               session_id, prediction_window,
                                               predictions_in_chunk,
                                               target=target, verbose=False)

        valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features),
                                         prediction_window,
                                         predictions_in_chunk, batch_size,
                                         use_target=use_target,
                                         mx_output=use_mx_data_batch)
    else:
        valid_iter = None

    # Define model architecture
    context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions)

    # Always create MXNet models, as the pred_model is later saved to the state
    # If MPS is used - the loss_model will be overwritten
    loss_model, pred_model = _define_model_mxnet(len(target_map),
                                                 prediction_window,
                                                 predictions_in_chunk, context)

    if use_mps:
        loss_model = _define_model_mps(batch_size, len(features),
                                       len(target_map), prediction_window,
                                       predictions_in_chunk,
                                       is_prediction_model=False)

        log = _fit_model_mps(loss_model, data_iter, valid_iter,
                             max_iterations, verbose)
    else:
        if params['use_tensorflow']:
            net_params = _initialize_with_mxnet_weights(
                loss_model, chunked_data, features, prediction_window,
                predictions_in_chunk, batch_size, use_target)
            ac_model = ActivityTensorFlowModel(net_params, batch_size,
                                               len(features), len(target_map),
                                               prediction_window,
                                               predictions_in_chunk)
            # Train the model using Tensorflow
            log = _fit_model_tf(ac_model, net_params, data_iter, valid_iter,
                                max_iterations, verbose, 1e-3)
        else:
            # Train the model using Mxnet
            log = _fit_model_mxnet(loss_model, data_iter, valid_iter,
                                   max_iterations, num_mxnet_gpus, verbose)

    # Set up prediction model
    pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None,
                    for_training=False)

    if use_mps:
        mps_params = loss_model.export()
        arg_params, aux_params = _ac_weights_mps_to_mxnet(
            mps_params, _net_params['lstm_h'])
    elif params['use_tensorflow']:
        # Copy the weights back in the MXNet format
        arg_params, aux_params = ac_model.get_weights()
    else:
        arg_params, aux_params = loss_model.get_params()

    pred_model.init_params(arg_params=arg_params, aux_params=aux_params)

    # Save the model
    state = {
        '_pred_model': pred_model,
        'verbose': verbose,
        'training_time': _time.time() - start_time,
        'target': target,
        'classes': sorted(target_map.keys()),
        'features': features,
        'session_id': session_id,
        'prediction_window': prediction_window,
        'max_iterations': max_iterations,
        'num_examples': len(dataset),
        'num_sessions': num_sessions,
        'num_classes': len(target_map),
        'num_features': len(features),
        'training_accuracy': log['train_acc'],
        'training_log_loss': log['train_loss'],
        '_target_id_map': target_map,
        '_id_target_map': {v: k for k, v in target_map.items()},
        '_predictions_in_chunk': predictions_in_chunk,
        '_recalibrated_batch_size': data_iter.batch_size,
        'batch_size': user_provided_batch_size
    }

    if validation_set is not None:
        state['valid_accuracy'] = log['valid_acc']
        state['valid_log_loss'] = log['valid_loss']

    model = ActivityClassifier(state)
    return model
def draw_bounding_boxes(images, annotations, confidence_threshold=0):
    """
    Visualizes bounding boxes (ground truth or predictions) by
    returning annotated copies of the images.

    Parameters
    ----------
    images: SArray or Image
        An `SArray` of type `Image`. A single `Image` instance may also be
        given.

    annotations: SArray or list
        An `SArray` of annotations (either output from the
        `ObjectDetector.predict` function or ground truth). A single list of
        annotations may also be given, provided that it is coupled with a
        single image.

    confidence_threshold: float
        Confidence threshold can limit the number of boxes to draw. By
        default, this is set to 0, since the prediction may have already pruned
        with an appropriate confidence threshold. Must be between 0 and 1.

    Returns
    -------
    annotated_images: SArray or Image
        Similar to the input `images`, except the images are decorated with
        boxes to visualize the object instances.

    Raises
    ------
    ToolkitError
        If annotating an image fails. For SArray input the message includes
        the failing row number.

    See also
    --------
    unstack_annotations
    """
    _numeric_param_check_range('confidence_threshold', confidence_threshold, 0.0, 1.0)
    from PIL import Image

    def draw_single_image(row):
        image = row['image']
        anns = row['annotations']
        row_number = row['id']
        # Normalise annotations to a list: missing -> empty, single dict -> [dict].
        if anns is None:
            anns = []
        elif isinstance(anns, dict):
            anns = [anns]
        try:
            pil_img = Image.fromarray(image.pixel_data)
            _annotate_image(pil_img, anns, confidence_threshold=confidence_threshold)
            image = _np.array(pil_img)
            if len(image.shape) == 2:
                # Grayscale image, reshape image shape
                image = image.reshape(image.shape[0], image.shape[1], 1)
            FORMAT_RAW = 2
            annotated_image = _tc.Image(_image_data=image.tobytes(),
                                        _width=image.shape[1],
                                        _height=image.shape[0],
                                        _channels=image.shape[2],
                                        _format_enum=FORMAT_RAW,
                                        _image_data_size=image.size)
        except Exception as e:
            if row_number == -1:
                # indication that it was a single image and not an SFrame
                raise _ToolkitError(e)
            # str(e): concatenating the exception object itself would raise a
            # TypeError and hide the original failure.
            raise _ToolkitError("Received exception at row " + str(row_number) + ": " + str(e))
        return annotated_image

    if isinstance(images, _tc.Image) and isinstance(annotations, list):
        # Single-image mode; id -1 marks "no row number" for error reporting.
        return draw_single_image({
            'image': images,
            'annotations': annotations,
            'id': -1
        })
    else:
        sf = _tc.SFrame({'image': images, 'annotations': annotations})
        sf = sf.add_row_number()
        annotated_images = sf.apply(draw_single_image)
        return annotated_images
def create(
    dataset,
    target,
    feature,
    max_iterations=10,
    custom_layer_sizes=[100, 100],
    verbose=True,
    validation_set="auto",
    batch_size=64,
):
    """
    Creates a :class:`SoundClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string
        Name of the column containing the feature column. This column must
        contain audio data or deep audio features.
        Audio data is represented as dicts with key 'data' and 'sample_rate',
        see `turicreate.load_audio(...)`.
        Deep audio features are represented as a list of numpy arrays, each of
        size 12288, see `turicreate.sound_classifier.get_deep_features(...)`.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low.

    custom_layer_sizes : list of ints, optional
        Specifies the architecture of the custom neural network. This neural
        network is made up of a series of dense layers. This parameter allows
        you to specify how many layers and the number of units in each layer.
        The custom neural network will always have one more layer than the
        length of this list. The last layer is always a soft max with units
        equal to the number of classes.

    verbose : bool, optional
        If True, prints progress updates and model details.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. The
        format of this SFrame must be the same as the training dataset. By
        default ('auto'), 5 percent of the training data is held out when the
        dataset has at least 100 rows; smaller datasets train without
        validation. If `validation_set` is set to None, no validation is used.
        You can also pass a validation set you have constructed yourself.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : SoundClassifier
        A trained :class:`SoundClassifier` model.

    Raises
    ------
    TypeError
        If `dataset` is not an SFrame, `validation_set` is not an SFrame,
        'auto' or None, or `batch_size` / `max_iterations` is not an int.
    ValueError
        If `batch_size` is below 1, the validation SFrame lacks the feature
        or target column, or fewer than two classes are present.
    ToolkitError
        If the dataset is empty, a named column is missing or is not audio
        data, or `custom_layer_sizes` is invalid.
    """
    import time
    from ._audio_feature_extractor import _get_feature_extractor

    start_time = time.time()
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError('"dataset" must be of type SFrame.')
    # check parameters
    if len(dataset) == 0:
        raise _ToolkitError("Unable to train on empty dataset")
    if feature not in dataset.column_names():
        raise _ToolkitError("Audio feature column '%s' does not exist" % feature)
    if not _is_deep_feature_sarray(dataset[feature]) and not _is_audio_data_sarray(
        dataset[feature]
    ):
        raise _ToolkitError("'%s' column is not audio data." % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)
    if (
        not _tc.util._is_non_string_iterable(custom_layer_sizes)
        or len(custom_layer_sizes) == 0
    ):
        raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.")
    for i in custom_layer_sizes:
        if not isinstance(i, int):
            raise _ToolkitError("'custom_layer_sizes' must contain only integers.")
        if not i >= 1:
            raise _ToolkitError("'custom_layer_sizes' must contain integers >= 1.")
    # Copy so the model state never aliases the shared default list (or the
    # caller's list); otherwise later mutation would leak across models.
    custom_layer_sizes = list(custom_layer_sizes)
    if not (
        isinstance(validation_set, _tc.SFrame)
        or validation_set == "auto"
        or validation_set is None
    ):
        raise TypeError("Unrecognized value for 'validation_set'")
    if isinstance(validation_set, _tc.SFrame):
        if (
            feature not in validation_set.column_names()
            or target not in validation_set.column_names()
        ):
            raise ValueError(
                "The 'validation_set' SFrame must be in the same format as the 'dataset'"
            )
    if not isinstance(batch_size, int):
        raise TypeError("'batch_size' must be of type int.")
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")
    if not isinstance(max_iterations, int):
        raise TypeError("'max_iterations' must be type int.")
    _tk_utils._numeric_param_check_range(
        "max_iterations", max_iterations, 1, _six.MAXSIZE
    )

    # Classes are taken from the full dataset, before any validation split.
    classes = list(dataset[target].unique().sort())
    num_labels = len(classes)
    if num_labels <= 1:
        raise ValueError("The number of classes must be greater than one.")
    feature_extractor_name = "VGGish"
    feature_extractor = _get_feature_extractor(feature_extractor_name)
    class_label_to_id = {l: i for i, l in enumerate(classes)}

    # create the validation set
    if not isinstance(validation_set, _tc.SFrame) and validation_set == "auto":
        if len(dataset) >= 100:
            print(
                "Creating a validation set from 5 percent of training data. This may take a while.\n"
                "\tYou can set ``validation_set=None`` to disable validation tracking.\n"
            )
            dataset, validation_set = dataset.random_split(0.95, exact=True)
        else:
            validation_set = None

    encoded_target = dataset[target].apply(lambda x: class_label_to_id[x])

    if _is_deep_feature_sarray(dataset[feature]):
        train_deep_features = dataset[feature]
    else:
        # do the preprocess and VGGish feature extraction
        train_deep_features = get_deep_features(dataset[feature], verbose=verbose)

    train_data = _tc.SFrame(
        {"deep features": train_deep_features, "labels": encoded_target}
    )
    # One row per deep-feature vector; rows with no vector are split off.
    train_data = train_data.stack("deep features", new_column_name="deep features")
    train_data, missing_ids = train_data.dropna_split(columns=["deep features"])

    training_batch_size = min(len(train_data), batch_size)

    train_data = _create_data_iterator(
        train_data["deep features"].to_numpy(),
        train_data["labels"].to_numpy(),
        batch_size=training_batch_size,
        shuffle=True,
    )

    if len(missing_ids) > 0:
        _logging.warning(
            "Dropping %d examples which are less than 975ms in length."
            % len(missing_ids)
        )

    if validation_set is not None:
        if verbose:
            print("Preparing validation set")
        validation_encoded_target = validation_set[target].apply(
            lambda x: class_label_to_id[x]
        )

        if _is_deep_feature_sarray(validation_set[feature]):
            validation_deep_features = validation_set[feature]
        else:
            validation_deep_features = get_deep_features(
                validation_set[feature], verbose=verbose
            )

        validation_data = _tc.SFrame(
            {
                "deep features": validation_deep_features,
                "labels": validation_encoded_target,
            }
        )
        validation_data = validation_data.stack(
            "deep features", new_column_name="deep features"
        )
        validation_data = validation_data.dropna(columns=["deep features"])

        validation_batch_size = min(len(validation_data), batch_size)
        validation_data = _create_data_iterator(
            validation_data["deep features"].to_numpy(),
            validation_data["labels"].to_numpy(),
            batch_size=validation_batch_size,
        )
    else:
        # Empty list: falsy for the `if validation_data` checks below and a
        # no-op when iterated in the training loop.
        validation_data = []

    train_metric = _get_accuracy_metric()
    if validation_data:
        validation_metric = _get_accuracy_metric()

    if verbose:
        print("\nTraining a custom neural network -")

    from ._tf_sound_classifier import SoundClassifierTensorFlowModel

    custom_NN = SoundClassifierTensorFlowModel(
        feature_extractor.output_length, num_labels, custom_layer_sizes
    )

    if verbose:
        # Setup progress table
        row_ids = ["iteration", "train_accuracy", "time"]
        row_display_names = ["Iteration", "Training Accuracy", "Elapsed Time"]
        if validation_data:
            row_ids.insert(2, "validation_accuracy")
            row_display_names.insert(2, "Validation Accuracy (%)")
        table_printer = _tc.util._ProgressTablePrinter(row_ids, row_display_names)

    for i in range(max_iterations):
        # TODO: early stopping

        for data, label in train_data:
            custom_NN.train(data, label)
        train_data.reset()

        # Calculate training metric
        for data, label in train_data:
            outputs = custom_NN.predict(data)
            train_metric.update(label, outputs)
        train_data.reset()

        for data, label in validation_data:
            outputs = custom_NN.predict(data)
            validation_metric.update(label, outputs)

        # Get metrics, print progress table
        train_accuracy = train_metric.get()
        train_metric.reset()
        printed_row_values = {"iteration": i + 1, "train_accuracy": train_accuracy}
        if validation_data:
            validation_accuracy = validation_metric.get()
            printed_row_values["validation_accuracy"] = validation_accuracy
            validation_metric.reset()
            validation_data.reset()
        if verbose:
            printed_row_values["time"] = time.time() - start_time
            table_printer.print_row(**printed_row_values)

    state = {
        "_class_label_to_id": class_label_to_id,
        "_custom_classifier": custom_NN,
        "_feature_extractor": feature_extractor,
        "_id_to_class_label": {v: k for k, v in class_label_to_id.items()},
        "classes": classes,
        "custom_layer_sizes": custom_layer_sizes,
        "feature": feature,
        "feature_extractor_name": feature_extractor.name,
        "num_classes": num_labels,
        "num_examples": len(dataset),
        "target": target,
        "training_accuracy": train_accuracy,
        "training_time": time.time() - start_time,
        "validation_accuracy": validation_accuracy if validation_data else None,
    }
    return SoundClassifier(state)
def create(dataset, annotations=None, feature=None, model='darknet-yolo',
           classes=None, max_iterations=0, verbose=True, **kwargs):
    """
    Create a :class:`ObjectDetector` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The columns named by ``feature`` and ``annotations``
        are the ones used to train the detector.

    annotations : string
        Name of the column holding the object detection annotations. Each
        entry should be a list of dictionaries, one dictionary per bounding
        box of an object instance. Annotations for a single image with two
        object instances look like this::

            [{'label': 'dog',
              'type': 'rectangle',
              'coordinates': {'x': 223, 'y': 198,
                              'width': 130, 'height': 230}},
             {'label': 'cat',
              'type': 'rectangle',
              'coordinates': {'x': 40, 'y': 73,
                              'width': 80, 'height': 123}}]

        `x` is the horizontal center of the box (paired with `width`) and
        `y` is the vertical center of the box (paired with `height`).
        'None' (the default) means the only list column in `dataset` is
        used for the annotations.

    feature : string
        Name of the column holding the input images. 'None' (the default)
        means the only image column in `dataset` is used as the feature.

    model : string optional
        Object detection model to use:

           - "darknet-yolo" : Fast and medium-sized model

    classes : list optional
        List of strings with the names of the object classes. Inferred
        from the data if not provided.

    max_iterations : int
        The number of training iterations. If 0, the number is chosen
        automatically from the amount of data provided.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs
        Only ``_advanced_parameters`` (a dict overriding entries of the
        internal training parameters) is read; unknown keys inside it
        raise a ToolkitError.

    Returns
    -------
    out : ObjectDetector
        A trained :class:`ObjectDetector` model.

    See Also
    --------
    ObjectDetector

    Examples
    --------
    .. sourcecode:: python

        # Train an object detector model
        >>> model = turicreate.object_detector.create(data)

        # Make predictions on the training set and as column to the SFrame
        >>> data['predictions'] = model.predict(data)

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _raise_error_if_not_sframe(dataset, "dataset")
    from ._mx_detector import YOLOLoss as _YOLOLoss
    from ._model import tiny_darknet as _tiny_darknet
    from ._sframe_loader import SFrameDetectionIter as _SFrameDetectionIter
    from ._manual_scheduler import ManualScheduler as _ManualScheduler
    import mxnet as _mx

    if not len(dataset):
        raise _ToolkitError('Unable to train on empty dataset')

    _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE)
    start_time = _time.time()

    supported_detectors = ['darknet-yolo']

    # Resolve the image column when the caller did not name one.
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)
        if verbose:
            print("Using '%s' as feature column" % feature)

    # Resolve the annotations column when the caller did not name one.
    if annotations is None:
        annotations = _tkutl._find_only_column_of_type(
            dataset, target_type=list, type_name='list',
            col_name='annotations')
        if verbose:
            print("Using '%s' as annotations column" % annotations)

    _raise_error_if_not_detection_sframe(dataset, feature, annotations,
                                         require_annotations=True)

    _tkutl._check_categorical_option_type('model', model, supported_detectors)

    # The part of the model name before the first dash selects the
    # pre-trained reference network.
    base_model = model.split('-', 1)[0]
    ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]()

    # Default training parameters; entries may be overridden through
    # `_advanced_parameters` below.
    params = {
        'anchors': [
            (1.0, 2.0), (1.0, 1.0), (2.0, 1.0),
            (2.0, 4.0), (2.0, 2.0), (4.0, 2.0),
            (4.0, 8.0), (4.0, 4.0), (8.0, 4.0),
            (8.0, 16.0), (8.0, 8.0), (16.0, 8.0),
            (16.0, 32.0), (16.0, 16.0), (32.0, 16.0),
        ],
        'grid_shape': [13, 13],
        'batch_size': 32,
        'aug_resize': 0,
        'aug_rand_crop': 0.9,
        'aug_rand_pad': 0.9,
        'aug_rand_gray': 0.0,
        'aug_aspect_ratio': 1.25,
        'aug_hue': 0.05,
        'aug_brightness': 0.05,
        'aug_saturation': 0.05,
        'aug_contrast': 0.05,
        'aug_horizontal_flip': True,
        'aug_min_object_covered': 0,
        'aug_min_eject_coverage': 0.5,
        'aug_area_range': (.15, 2),
        'aug_pca_noise': 0.0,
        'aug_max_attempts': 20,
        'aug_inter_method': 2,
        'lmb_coord_xy': 10.0,
        'lmb_coord_wh': 10.0,
        'lmb_obj': 100.0,
        'lmb_noobj': 5.0,
        'lmb_class': 2.0,
        'non_maximum_suppression_threshold': 0.45,
        'rescore': True,
        'clip_gradients': 0.025,
        'learning_rate': 1.0e-3,
        'shuffle': True,
    }

    if '_advanced_parameters' in kwargs:
        overrides = kwargs['_advanced_parameters']
        # Reject any override that does not correspond to a known parameter.
        unsupported = set(overrides.keys()) - set(params.keys())
        if unsupported:
            raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported))
        params.update(overrides)

    anchors = params['anchors']
    num_anchors = len(anchors)

    num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=params['batch_size'])
    num_devices = max(num_gpus, 1)
    batch_size_each = params['batch_size'] // num_devices
    # Rounded down to a multiple of the device count, so it can differ
    # slightly from the requested batch size.
    batch_size = num_devices * batch_size_each

    grid_shape = params['grid_shape']
    input_image_shape = (
        3,
        grid_shape[0] * ref_model.spatial_reduction,
        grid_shape[1] * ref_model.spatial_reduction,
    )

    try:
        stacked = dataset.stack(annotations, new_column_name='_bbox',
                                drop_na=True)
        instances = stacked.unpack('_bbox', limit=['label'])
    except (TypeError, RuntimeError):
        # A failure here means the annotations are malformed at the
        # coarsest level.
        raise _ToolkitError("Annotations format is invalid. Must be a list of "
                            "dictionaries containing 'label' and 'coordinates'.")

    num_images = len(dataset)
    num_instances = len(instances)
    if classes is None:
        classes = instances['_bbox.label'].unique()
    classes = sorted(classes)

    # Look-up table from class name to its index in the sorted class list.
    class_to_index = dict((name, index) for index, name in enumerate(classes))
    num_classes = len(classes)

    # Data loader that feeds the training loop.
    loader = _SFrameDetectionIter(
        dataset,
        batch_size=batch_size,
        input_shape=input_image_shape[1:],
        output_shape=grid_shape,
        anchors=anchors,
        class_to_index=class_to_index,
        aug_params=params,
        shuffle=params['shuffle'],
        loader_type='augmented',
        feature_column=feature,
        annotations_column=annotations)

    # Per anchor box: x/y + w/h + object confidence + class probabilities.
    preds_per_box = 5 + num_classes
    output_size = preds_per_box * num_anchors
    ymap_shape = (batch_size_each,) + tuple(grid_shape) + (num_anchors, preds_per_box)

    net = _tiny_darknet(output_size=output_size)

    loss = _YOLOLoss(
        input_shape=input_image_shape[1:],
        output_shape=grid_shape,
        batch_size=batch_size_each,
        num_classes=num_classes,
        anchors=anchors,
        parameters=params)

    base_lr = params['learning_rate']
    if max_iterations == 0:
        # Heuristic: scale with the square root of the number of instances
        # and round to a multiple of 1000 (at least 1000).
        num_iterations_raw = 5000 * _np.sqrt(num_instances) / batch_size
        num_iterations = 1000 * max(1, int(round(num_iterations_raw / 1000)))
    else:
        num_iterations = max_iterations

    # Learning-rate schedule: factor 1 up to the halfway point, then 0.1 up
    # to three quarters, then 0.01 until the end.
    steps = (num_iterations // 2, 3 * num_iterations // 4, num_iterations)
    factors = tuple(10**(-i) for i in range(len(steps)))
    lr_scheduler = _ManualScheduler(step=steps, factor=factors)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)

    net_params = net.collect_params()
    net_params.initialize(_mx.init.Xavier(), ctx=ctx)
    net_params['conv7_weight'].initialize(
        _mx.init.Xavier(factor_type='avg'), ctx=ctx, force_reinit=True)
    net_params['conv8_weight'].initialize(
        _mx.init.Uniform(0.00005), ctx=ctx, force_reinit=True)
    # Start the object-confidence bias low so training does not need an
    # adjustment period toward conservative estimates.
    bias = _np.zeros(output_size, dtype=_np.float32)
    bias[4::preds_per_box] -= 6
    from ._mx_detector import ConstantArray
    net_params['conv8_bias'].initialize(ConstantArray(bias), ctx,
                                        force_reinit=True)

    # Load pre-trained weights through an explicit parameter subset rather
    # than allow_missing=True on net_params, which would more easily hide
    # bugs caused by parameter names getting out of sync.
    ref_model.available_parameters_subset(net_params).load(ref_model.model_path, ctx)

    options = {
        'learning_rate': base_lr,
        'lr_scheduler': lr_scheduler,
        'momentum': 0.9,
        'wd': 0.00005,
        'rescale_grad': 1.0,
    }
    clip_grad = params.get('clip_gradients')
    if clip_grad:
        options['clip_gradient'] = clip_grad

    trainer = _mx.gluon.Trainer(net.collect_params(), 'sgd', options)

    iteration = 0
    smoothed_loss = None
    last_time = 0
    while iteration < num_iterations:
        loader.reset()
        for batch in loader:
            data_parts = _mx.gluon.utils.split_and_load(
                batch.data[0], ctx_list=ctx, batch_axis=0)
            label_parts = _mx.gluon.utils.split_and_load(
                batch.label[0], ctx_list=ctx, batch_axis=0)

            part_losses = []
            with _mx.autograd.record():
                for x, y in zip(data_parts, label_parts):
                    raw_output = net(x)
                    # Move channels last, then split them into
                    # (anchor, prediction) pairs.
                    ymap = _mx.nd.transpose(raw_output, [0, 2, 3, 1]).reshape(ymap_shape)
                    part_losses.append(loss(ymap, y))
                for part_loss in part_losses:
                    part_loss.backward()

            cur_loss = _np.mean([part_loss.asnumpy()[0]
                                 for part_loss in part_losses])
            # Exponential moving average of the loss, seeded with the
            # first observed value.
            smoothed_loss = (cur_loss if smoothed_loss is None
                             else 0.9 * smoothed_loss + 0.1 * cur_loss)
            trainer.step(1)
            iteration += 1

            cur_time = _time.time()
            # Report progress at most once every 10 seconds.
            if verbose and cur_time > last_time + 10:
                print('{now:%Y-%m-%d %H:%M:%S} Training {cur_iter:{width}d}/{num_iterations:{width}d} Loss {loss:6.3f}'.format(
                    now=_datetime.now(), cur_iter=iteration,
                    num_iterations=num_iterations, loss=smoothed_loss,
                    width=len(str(num_iterations))))
                last_time = cur_time

            # Stop mid-epoch once the iteration budget is used up.
            if iteration == num_iterations:
                break

    training_time = _time.time() - start_time

    # Everything the resulting model needs to keep.
    state = {
        '_model': net,
        '_class_to_index': class_to_index,
        '_training_time_as_string': _seconds_as_string(training_time),
        '_grid_shape': grid_shape,
        'anchors': anchors,
        'model': model,
        'classes': classes,
        'batch_size': batch_size,
        'input_image_shape': input_image_shape,
        'feature': feature,
        'non_maximum_suppression_threshold': params['non_maximum_suppression_threshold'],
        'annotations': annotations,
        'num_classes': num_classes,
        'num_examples': num_images,
        'num_bounding_boxes': num_instances,
        'training_time': training_time,
        'training_epochs': loader.cur_epoch,
        'training_iterations': iteration,
        'max_iterations': max_iterations,
        'training_loss': smoothed_loss,
    }
    return ObjectDetector(state)
def create(dataset, annotations=None, feature=None, model="darknet-yolo",
           classes=None, batch_size=0, max_iterations=0, verbose=True,
           grid_shape=[13, 13], **kwargs):
    """
    Create a :class:`ObjectDetector` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The columns named by the ``feature`` and ``annotations``
        parameters will be extracted for training the detector.

    annotations : string
        Name of the column containing the object detection annotations.
        This column should be a list of dictionaries (or a single
        dictionary), with each dictionary representing a bounding box of an
        object instance. Here is an example of the annotations for a single
        image with two object instances::

            [{'label': 'dog',
              'type': 'rectangle',
              'coordinates': {'x': 223, 'y': 198,
                              'width': 130, 'height': 230}},
             {'label': 'cat',
              'type': 'rectangle',
              'coordinates': {'x': 40, 'y': 73,
                              'width': 80, 'height': 123}}]

        The value for `x` is the horizontal center of the box paired with
        `width` and `y` is the vertical center of the box paired with
        `height`. 'None' (the default) indicates the only list column in
        `dataset` should be used for the annotations.

    feature : string
        Name of the column containing the input images. 'None' (the
        default) indicates the only image column in `dataset` should be
        used as the feature.

    model : string optional
        Object detection model to use:

           - "darknet-yolo" : Fast and medium-sized model

    classes : list optional
        List of strings containing the names of the classes of objects.
        Inferred from the data if not provided.

    batch_size : int
        The number of images per training iteration. If 0, then it will be
        automatically determined based on resource availability.

    max_iterations : int
        The number of training iterations. If 0, then it will be
        automatically determined based on the amount of data you provide.

    verbose : bool, optional
        If True, print progress updates and model details.

    grid_shape : array optional
        Shape of the grid used for object detection, as
        ``[grid_height, grid_width]``. Each entry is converted with
        ``int()``. Higher values increase precision for small objects, but
        at a higher computational cost.

           - [13, 13] : Default grid value for a Fast and medium-sized model

    **kwargs
        Accepted for call compatibility; not read by this function.

    Returns
    -------
    out : ObjectDetector
        A trained :class:`ObjectDetector` model.

    Raises
    ------
    ToolkitError
        If `dataset` is empty, or if `grid_shape` does not contain exactly
        two values.

    See Also
    --------
    ObjectDetector

    Examples
    --------
    .. sourcecode:: python

        # Train an object detector model
        >>> model = turicreate.object_detector.create(data)

        # Make predictions on the training set and as column to the SFrame
        >>> data['predictions'] = model.predict(data)

        # Visualize predictions by generating a new column of marked up images
        >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions'])
    """
    _raise_error_if_not_sframe(dataset, "dataset")

    if len(dataset) == 0:
        raise _ToolkitError("Unable to train on empty dataset")

    _numeric_param_check_range("max_iterations", max_iterations, 0, _six.MAXSIZE)

    supported_detectors = ["darknet-yolo"]

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)
        if verbose:
            print("Using '%s' as feature column" % feature)
    if annotations is None:
        annotations = _tkutl._find_only_column_of_type(
            dataset, target_type=[list, dict], type_name="list",
            col_name="annotations")
        if verbose:
            print("Using '%s' as annotations column" % annotations)

    _raise_error_if_not_detection_sframe(dataset, feature, annotations,
                                         require_annotations=True)
    _tkutl._handle_missing_values(dataset, feature, "dataset")
    _tkutl._check_categorical_option_type("model", model, supported_detectors)

    base_model = model.split("-", 1)[0]
    # NOTE(review): `ref_model` is not used below; the construction is kept
    # in case it has side effects (e.g. fetching weights) — confirm before
    # removing.
    ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]()

    pretrained_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[
        "darknet_mlmodel"]()
    pretrained_model_path = pretrained_model.get_model_path()

    # create tensorflow model here
    import turicreate.toolkits.libtctensorflow

    # Identity comparison: `== None` is evaluated element-wise by array-like
    # inputs and then fails or misbehaves inside `if`.
    if classes is None:
        classes = []

    _raise_error_if_not_iterable(classes)
    _raise_error_if_not_iterable(grid_shape)

    # Normalize the grid shape BEFORE it is stored in `params`, so the
    # integer-converted values are the ones handed to the backend. Building
    # a new list also means the shared default argument is never aliased.
    grid_shape = [int(x) for x in grid_shape]
    if len(grid_shape) != 2:
        # Explicit raise instead of `assert`, which is stripped under -O.
        raise _ToolkitError(
            "grid_shape must contain exactly two values: "
            "[grid_height, grid_width]")

    # Only "grid_shape" and "mlmodel_path" are forwarded from this dict in
    # this function; the remaining entries are not passed to `tf_config`.
    params = {
        "anchors": [
            (1.0, 2.0), (1.0, 1.0), (2.0, 1.0),
            (2.0, 4.0), (2.0, 2.0), (4.0, 2.0),
            (4.0, 8.0), (4.0, 4.0), (8.0, 4.0),
            (8.0, 16.0), (8.0, 8.0), (16.0, 8.0),
            (16.0, 32.0), (16.0, 16.0), (32.0, 16.0),
        ],
        "grid_shape": grid_shape,
        "aug_resize": 0,
        "aug_rand_crop": 0.9,
        "aug_rand_pad": 0.9,
        "aug_rand_gray": 0.0,
        "aug_aspect_ratio": 1.25,
        "aug_hue": 0.05,
        "aug_brightness": 0.05,
        "aug_saturation": 0.05,
        "aug_contrast": 0.05,
        "aug_horizontal_flip": True,
        "aug_min_object_covered": 0,
        "aug_min_eject_coverage": 0.5,
        "aug_area_range": (0.15, 2),
        "aug_pca_noise": 0.0,
        "aug_max_attempts": 20,
        "aug_inter_method": 2,
        "lmb_coord_xy": 10.0,
        "lmb_coord_wh": 10.0,
        "lmb_obj": 100.0,
        "lmb_noobj": 5.0,
        "lmb_class": 2.0,
        "non_maximum_suppression_threshold": 0.45,
        "rescore": True,
        "clip_gradients": 0.025,
        "weight_decay": 0.0005,
        "sgd_momentum": 0.9,
        "learning_rate": 1.0e-3,
        "shuffle": True,
        "mps_loss_mult": 8,
        # This large buffer size (8 batches) is an attempt to mitigate
        # against the SFrame shuffle operation that can occur after each
        # epoch.
        "io_thread_buffer_size": 8,
        "mlmodel_path": pretrained_model_path,
    }

    tf_config = {
        "grid_height": params["grid_shape"][0],
        "grid_width": params["grid_shape"][1],
        "mlmodel_path": params["mlmodel_path"],
        "classes": classes,
        "compute_final_metrics": False,
        "verbose": verbose,
        "model": "darknet-yolo",
    }

    # If batch_size or max_iterations = 0, they will be automatically
    # generated in C++.
    if batch_size > 0:
        tf_config["batch_size"] = batch_size

    if max_iterations > 0:
        tf_config["max_iterations"] = max_iterations

    # Distinct local name so the `model` argument is not shadowed.
    detector_proxy = _tc.extensions.object_detector()
    detector_proxy.train(
        data=dataset,
        annotations_column_name=annotations,
        image_column_name=feature,
        options=tf_config,
    )
    return ObjectDetector(model_proxy=detector_proxy, name="object_detector")