def _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_annotations):
    _raise_error_if_not_sframe(dataset, 'dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Feature column '%s' does not exist" % feature)
    if dataset[feature].dtype != _tc.Image:
        raise _ToolkitError("Feature column must contain images")

    if require_annotations:
        if annotations not in dataset.column_names():
            raise _ToolkitError("Annotations column '%s' does not exist" % annotations)
        if dataset[annotations].dtype not in [list, dict]:
            raise _ToolkitError("Annotations column must be of type dict or list")
def assert_valid_num_gpus():
    from turicreate.util import _CUDA_GPUS
    num_gpus = _tc_config.get_num_gpus()
    if not _CUDA_GPUS and _sys.platform == 'darwin':
        # GPU acceleration requires macOS 10.14+
        if num_gpus == 1 and _mac_ver() < (10, 14):
            raise _ToolkitError(
                'GPU acceleration requires at least macOS 10.14')
        elif num_gpus >= 2:
            raise _ToolkitError(
                'Using more than one GPU is currently not supported on Mac')
    _numeric_param_check_range('num_gpus', num_gpus, -1, _six.MAXSIZE)
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame. Must
        be between 0 and 1. Once the sessions are split, all data from a
        single session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    unique_sessions = _SFrame({'session': dataset[session_id].unique()})
    chosen, not_chosen = unique_sessions.random_split(fraction, seed)
    train = dataset.filter_by(chosen['session'], session_id)
    valid = dataset.filter_by(not_chosen['session'], session_id)
    return train, valid
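
# Illustrative sketch (not part of the toolkit): a quick check that the split
# above keeps each session entirely on one side. The tiny SFrame is made up
# for the example, and the import path assumes the function is exposed as
# turicreate.activity_classifier.util.random_split_by_session, as the
# docstring above suggests.
def _example_session_split_keeps_sessions_intact():
    import turicreate as tc
    from turicreate.activity_classifier.util import random_split_by_session

    data = tc.SFrame({
        'session_id': [0, 0, 0, 1, 1, 2, 2, 2, 3, 3],
        'value': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    })
    train, valid = random_split_by_session(data, session_id='session_id',
                                           fraction=0.5, seed=42)
    # No session id should appear in both splits.
    overlap = set(train['session_id'].unique()) & set(valid['session_id'].unique())
    assert len(overlap) == 0
    return train, valid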
def _load_version(cls, state, version):
    """
    A function to load a previously saved ImageSimilarityModel instance.

    Parameters
    ----------
    state : dict
        Serialized state of the model, as produced when it was saved.

    version : int
        Version number maintained by the class writer.
    """
    _tkutl._model_version_check(version, cls._PYTHON_IMAGE_SIMILARITY_VERSION)
    from turicreate.toolkits.nearest_neighbors import NearestNeighborsModel
    state["similarity_model"] = NearestNeighborsModel(state["similarity_model"])

    # Correct models saved with a previous typo
    if state["model"] == "VisionFeaturePrint_Screen":
        state["model"] = "VisionFeaturePrint_Scene"

    if state["model"] == "VisionFeaturePrint_Scene" and _mac_ver() < (10, 14):
        raise _ToolkitError(
            "Can not load model on this operating system. This model uses VisionFeaturePrint_Scene, "
            "which is only supported on macOS 10.14 and higher.")

    state["feature_extractor"] = _image_feature_extractor._create_feature_extractor(
        state["model"])
    state["input_image_shape"] = tuple(
        [int(i) for i in state["input_image_shape"]])
    return ImageSimilarityModel(state)
def draw_strokes(stroke_based_drawings):
    """
    Visualizes drawings (ground truth or predictions) by returning images to
    represent the stroke-based data from the user.

    Parameters
    ----------
    stroke_based_drawings : SArray or list
        An `SArray` of type `list`. Each element in the SArray should be a
        list of strokes, where each stroke is a list of points, and each
        point is represented as a dictionary with two keys, "x" and "y". A
        single stroke-based drawing is also supported, in which case, the
        type of the input would be list.

    Returns
    -------
    drawings : SArray or _tc.Image
        Each stroke-based drawing is converted into a 28x28 grayscale
        drawing for the user to visualize what their strokes traced.
    """
    single_input = False
    if (not isinstance(stroke_based_drawings, _tc.SArray)
            and not isinstance(stroke_based_drawings, list)):
        raise _ToolkitError(
            "Input to draw_strokes must be of type "
            + "turicreate.SArray or list (for a single stroke-based drawing)")
    if (isinstance(stroke_based_drawings, _tc.SArray)
            and stroke_based_drawings.dtype != list):
        raise _ToolkitError(
            "SArray input to draw_strokes must have dtype "
            + "list. Each element in the SArray should be a list of strokes, "
            + "where each stroke is a list of points, "
            + "and each point is represented as a dictionary "
            + "with two keys, \"x\" and \"y\".")
    if isinstance(stroke_based_drawings, list):
        single_input = True
        stroke_based_drawings = _tc.SArray([stroke_based_drawings])
    sf = _tc.SFrame({"drawings": stroke_based_drawings})
    sf_with_drawings = _extensions._drawing_classifier_prepare_data(
        sf, "drawings")
    if single_input:
        return sf_with_drawings["drawings"][0]
    return sf_with_drawings["drawings"]
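
# Illustrative sketch (not part of the toolkit): the stroke-based format that
# draw_strokes() expects. The drawing below (two strokes of three points each)
# is made up, and the call assumes the function is exposed as
# turicreate.drawing_classifier.util.draw_strokes.
def _example_draw_strokes_input():
    import turicreate as tc

    one_drawing = [
        [{"x": 0.0, "y": 0.0}, {"x": 10.0, "y": 10.0}, {"x": 20.0, "y": 5.0}],    # stroke 1
        [{"x": 5.0, "y": 25.0}, {"x": 15.0, "y": 25.0}, {"x": 25.0, "y": 25.0}],  # stroke 2
    ]
    # A single drawing (a list) yields one 28x28 grayscale tc.Image ...
    image = tc.drawing_classifier.util.draw_strokes(one_drawing)
    # ... while an SArray of drawings yields an SArray of images.
    images = tc.drawing_classifier.util.draw_strokes(
        tc.SArray([one_drawing, one_drawing]))
    return image, images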
def _validate_num_clusters(num_clusters, initial_centers, num_rows):
    """
    Validate the combination of the `num_clusters` and `initial_centers`
    parameters in the Kmeans model create function. If the combination is
    valid, determine and return the correct number of clusters.

    Parameters
    ----------
    num_clusters : int
        Specified number of clusters.

    initial_centers : SFrame
        Specified initial cluster center locations, in SFrame form. If the
        number of rows in this SFrame does not match `num_clusters`, there is
        a problem.

    num_rows : int
        Number of rows in the input dataset.

    Returns
    -------
    _num_clusters : int
        The correct number of clusters to use going forward
    """
    ## Basic validation
    if num_clusters is not None and not isinstance(num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    ## Determine the correct number of clusters.
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from "
                             + "'num_clusters' or 'initial_centers'. You must "
                             + "specify one of these arguments.")
        else:
            _num_clusters = num_clusters
    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match "
                                 + "the number of provided initial centers. "
                                 + "Please provide only one of these arguments "
                                 + "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if _num_clusters > num_rows:
        raise ValueError("The desired number of clusters exceeds the number "
                         + "of data points. Please set 'num_clusters' to be "
                         + "smaller than the number of data points.")

    return _num_clusters
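
# Illustrative sketch (not part of the toolkit): the three valid ways to
# specify the number of clusters that the check above allows, using a tiny
# made-up dataset.
def _example_kmeans_num_clusters_options():
    import turicreate as tc

    data = tc.SFrame({'x': [0.0, 0.1, 0.2, 5.0, 5.1, 5.2],
                      'y': [0.0, 0.2, 0.1, 5.0, 5.2, 5.1]})
    centers = data[0:2]  # two rows used as initial centers

    m1 = tc.kmeans.create(data, num_clusters=2, verbose=False)           # count only
    m2 = tc.kmeans.create(data, initial_centers=centers, verbose=False)  # centers only
    m3 = tc.kmeans.create(data, num_clusters=2,                          # both, and they match
                          initial_centers=centers, verbose=False)
    return m1, m2, m3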
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check types and
    sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if (len(targets) != len(predictions)):
        raise _ToolkitError(
            "Input SArrays 'targets' and 'predictions' must be of the same length.")
def _check_index_map(index_map):
    if index_map is None:
        return

    if not isinstance(index_map, dict):
        raise TypeError("Input `index_map` must be a dict mapping target label to prediction-vector index.")

    indices = [v for k, v in index_map.items()]
    indices.sort()
    if indices != list(range(len(index_map))):
        raise _ToolkitError("Invalid index_map: each target label must map to a distinct index into the prediction vector.")
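
# Illustrative sketch (not part of the toolkit): what the check above accepts.
# `index_map` must map every target label to a distinct position in the
# prediction vector, and those positions must be exactly 0 .. len(index_map)-1.
# The labels below are made up for the example.
valid_index_map = {'cat': 0, 'dog': 1, 'fossa': 2}    # passes _check_index_map
invalid_index_map = {'cat': 0, 'dog': 2, 'fossa': 3}  # gap at index 1, so it is rejected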
def _style_input_check(self, style):
    set_of_all_idx = self._style_indices()
    scalar = False
    if isinstance(style, (list, tuple)):
        if len(style) == 0:
            raise _ToolkitError("the `style` list cannot be empty")
        elif set(style).issubset(set_of_all_idx):
            pass
        else:
            raise _ToolkitError("the `style` variable cannot be parsed")

    elif isinstance(style, _six.integer_types):
        scalar = True
        if style in set_of_all_idx:
            style = [style]
        else:
            raise _ToolkitError("the `style` variable cannot be parsed")

    elif style is None:
        style = list(set_of_all_idx)

    else:
        raise _ToolkitError("the `style` variable cannot be parsed")

    return style, scalar
def is_valid(ann):
    is_rect = ('type' not in ann or ann['type'] == 'rectangle')
    if not is_rect:
        # Not valid, but we bypass stricter checks (we simply
        # do not care about non rectangle types)
        return False
    ok_required = ('coordinates' in ann
                   and isinstance(ann['coordinates'], dict)
                   and set(ann['coordinates'].keys()) == {'x', 'y', 'width', 'height'}
                   and 'label' in ann)
    if not ok_required:
        raise _ToolkitError("Detected a bounding box annotation with improper format: {}".format(ann))
    ok_optional = ann['label'] in self.class_to_index
    return ok_optional
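
# Illustrative sketch (not part of the toolkit): annotations that pass and
# fail the rectangle check above. The label values are made up; a missing
# 'type' key counts as a rectangle.
well_formed_annotation = {
    'label': 'dog',
    'type': 'rectangle',
    'coordinates': {'x': 223, 'y': 198, 'width': 130, 'height': 230},
}  # valid, provided 'dog' is one of the known classes

malformed_annotation = {
    'label': 'dog',
    'coordinates': {'x': 223, 'y': 198},  # missing 'width'/'height', so is_valid raises
}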
def _raise_error_if_not_drawing_classifier_input_sframe(
        dataset, feature, target):
    """
    Performs some sanity checks on the SFrame provided as input to
    `turicreate.drawing_classifier.create` and raises a ToolkitError
    if something in the dataset is missing or wrong.
    """
    from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe
    _raise_error_if_not_sframe(dataset)
    if feature not in dataset.column_names():
        raise _ToolkitError("Feature column '%s' does not exist" % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)
    if (dataset[feature].dtype != _tc.Image
            and dataset[feature].dtype != list):
        raise _ToolkitError("Feature column must contain images"
                            + " or stroke-based drawings encoded as lists of strokes"
                            + " where each stroke is a list of points and"
                            + " each point is stored as a dictionary")
    if dataset[target].dtype != int and dataset[target].dtype != str:
        raise _ToolkitError("Target column contains " + str(dataset[target].dtype)
                            + " but it must contain strings or integers to represent"
                            + " labels for drawings.")
    if len(dataset) == 0:
        raise _ToolkitError("Input Dataset is empty!")
def _is_image_deep_feature_sarray(feature_sarray, model_name):
    """
    Finds if the given `SArray` has extracted features for a given model_name.
    """
    from array import array
    if not (len(feature_sarray) > 0):
        return False
    if feature_sarray.dtype != array:
        return False
    if type(feature_sarray[0]) != array:
        return False
    if len(feature_sarray[0]) != MODEL_TO_FEATURE_SIZE_MAPPING[model_name]:
        raise _ToolkitError(
            "The given deep features are for a model other than {model_name}.".format(
                model_name=model_name))
    return True
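
# Illustrative sketch (not part of the toolkit): the shape of a deep-feature
# column that the check above accepts. The 2048-element length is only an
# assumed example of what a resnet-50 extractor produces; in practice the
# expected size comes from MODEL_TO_FEATURE_SIZE_MAPPING.
def _example_deep_feature_sarray():
    import array
    import turicreate as tc

    fake_feature = array.array('d', [0.0] * 2048)  # one deep-feature vector
    deep_features = tc.SArray([fake_feature, fake_feature])
    assert deep_features.dtype == array.array
    return deep_features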
def _extract_features(self, dataset, verbose=False, batch_size=64):
    if image_analysis._is_image_deep_feature_sarray(
            dataset[self.feature], self.model):
        return _tc.SFrame({"__image_features__": dataset[self.feature]})
    elif dataset[self.feature].dtype is _tc.Image:
        return _tc.SFrame({
            "__image_features__": self.feature_extractor.extract_features(
                dataset, self.feature, verbose=verbose, batch_size=batch_size)
        })
    else:
        raise _ToolkitError(
            'The "{feature}" column of the SFrame contains neither images nor '
            'extracted feature arrays.'.format(feature=self.feature)
            + ' The dataset consists of columns with types: '
            + ", ".join([x.__name__ for x in dataset.column_types()]) + ".")
def _find_only_image_extracted_features_column(sframe, model_name):
    """
    Finds the only column in `sframe` of type array.array whose length matches
    the size of the last layer of the model in use. If there are zero or more
    than one such columns, an exception will be raised.
    """
    from array import array
    feature_column = _tkutl._find_only_column_of_type(sframe,
                                                      target_type=array,
                                                      type_name="array",
                                                      col_name="deep_features")
    if _is_image_deep_feature_sarray(sframe[feature_column], model_name):
        return feature_column
    else:
        raise _ToolkitError(
            'No "{col_name}" column specified and no column with expected type "{type_name}" is found.'.format(
                col_name="deep_features", type_name="array"))
def check_one_shot_input(data, target, backgrounds):
    if backgrounds is not None and not (isinstance(backgrounds, _tc.SArray)):
        raise TypeError("'backgrounds' must be None or an SArray.")
    if (isinstance(backgrounds, _tc.SArray) and len(backgrounds) == 0):
        raise _ToolkitError('Unable to train with no background images')
    if not isinstance(target, str):
        raise TypeError("'target' must be of type string.")
    if isinstance(data, _tc.SFrame):
        _tkutl._raise_error_if_column_exists(data, target, "data", target)
        image_column_name = _tkutl._find_only_image_column(data)
        target_column_name = target
        dataset_to_augment = data
    elif isinstance(data, _tc.Image):
        image_column_name = "image"
        target_column_name = "target"
        dataset_to_augment = _tc.SFrame({image_column_name: [data],
                                         target_column_name: [target]})
    else:
        raise TypeError("'data' must be of type SFrame or Image.")
    return dataset_to_augment, image_column_name, target_column_name
def random_split_by_session(dataset, session_id, fraction=0.9, seed=None):
    """
    Randomly split an SFrame into two SFrames based on the `session_id` such
    that one split contains data for a `fraction` of the sessions while the
    second split contains all data for the rest of the sessions.

    Parameters
    ----------
    dataset : SFrame
        Dataset to split. It must contain a column of session ids.

    session_id : string
        The name of the column in `dataset` that corresponds to a unique
        identifier for each session.

    fraction : float, optional
        Fraction of the sessions to fetch for the first returned SFrame. Must
        be between 0 and 1. Once the sessions are split, all data from a
        single session is in the same SFrame.

    seed : int, optional
        Seed for the random number generator used to split.

    Examples
    --------
    .. sourcecode:: python

        # Split the data so that train has 90% of the users.
        >>> train, valid = tc.activity_classifier.util.random_split_by_session(
        ...     dataset, session_id='session_id', fraction=0.9)

        # For example: If dataset has 2055 sessions
        >>> len(dataset['session_id'].unique())
        2055

        # The training set now has 90% of the sessions
        >>> len(train['session_id'].unique())
        1850

        # The validation set has the remaining 10% of the sessions
        >>> len(valid['session_id'].unique())
        205
    """
    from random import Random

    _raise_error_if_not_of_type(dataset, _SFrame, 'dataset')
    _raise_error_if_not_of_type(session_id, str, 'session_id')
    _raise_error_if_not_of_type(fraction, float, 'fraction')
    _raise_error_if_not_of_type(seed, [int, type(None)], 'seed')
    _numeric_param_check_range('fraction', fraction, 0, 1)

    if session_id not in dataset.column_names():
        raise _ToolkitError(
            'Input "dataset" must contain a column called %s.' % session_id)

    if seed is None:
        # Include the nanosecond component as well.
        import time
        seed = abs(hash("%0.20f" % time.time())) % (2 ** 31)

    # The cython bindings require this to be an int, so cast if we can.
    try:
        seed = int(seed)
    except ValueError:
        raise ValueError('The \'seed\' parameter must be of type int.')

    random = Random()

    # Create a random binary filter (boolean SArray), using the same probability
    # across all lines that belong to the same session. In expectation, the
    # desired fraction of the sessions will go to the training set.
    # Since boolean filters preserve order, there is no need to re-sort the
    # lines within each session.
    # The boolean filter is a pseudorandom function of the session_id and the
    # global seed above, allowing the train-test split to vary across runs using
    # the same dataset.
    def random_session_pick(session_id_hash):
        random.seed(session_id_hash)
        return random.uniform(0, 1) < fraction

    chosen_filter = dataset[session_id].hash(seed).apply(random_session_pick)
    train = dataset[chosen_filter]
    valid = dataset[1 - chosen_filter]
    return train, valid
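
# Illustrative sketch (not part of the toolkit): why the hash-based filter
# above keeps each session on one side of the split. Every row of a session
# hashes to the same value, so re-seeding the generator with that hash yields
# the same uniform draw, and therefore the same train/valid decision, for
# every row of that session.
def _example_per_session_coin_flip(session_id_hash, fraction=0.9):
    from random import Random
    rng = Random()
    rng.seed(session_id_hash)            # same hash -> same seeded state
    return rng.uniform(0, 1) < fraction  # so every row of the session agrees

# e.g. the two calls below always return the same boolean:
# _example_per_session_coin_flip(123456789) == _example_per_session_coin_flip(123456789)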
def get_deep_features(images, model_name, batch_size=64, verbose=True):
    """
    Extracts features from images using a specific model.

    Parameters
    ----------
    images : SArray
        Input data.

    model_name : string
        Name of the pretrained model to use for extracting features:

        - "resnet-50" : Uses a pretrained resnet model.
          Exported Core ML model will be ~90M.

        - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
          Exported Core ML model will be ~4.7M.

        - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
          Only available on iOS 12.0+, macOS 10.14+ and tvOS 12.0+.
          Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    Returns
    -------
    out : SArray
        Returns an SArray with all the extracted features.

    See Also
    --------
    turicreate.image_classifier.create
    turicreate.image_similarity.create

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/images/nested'
    >>> image_sframe = turicreate.load_images(url)
    >>> image_sarray = image_sframe["image"]
    >>> deep_features_sframe = turicreate.image_analysis.get_deep_features(image_sarray, model_name="resnet-50")
    """
    # Check model parameter
    allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append("VisionFeaturePrint_Scene")
    _tkutl._check_categorical_option_type("model", model_name, allowed_models)

    # Check images parameter
    if not isinstance(images, _tc.SArray):
        raise TypeError("Unrecognized type for 'images'. An SArray is expected.")
    if len(images) == 0:
        raise _ToolkitError("Unable to extract features on an empty SArray object")

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Extract features
    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model_name)
    images_sf = _tc.SFrame({"image": images})
    return feature_extractor.extract_features(images_sf, "image",
                                              verbose=verbose,
                                              batch_size=batch_size)
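
# Illustrative sketch (not part of the toolkit): extracting deep features once
# and reusing them. The URL is the one from the docstring above; passing a
# precomputed deep-feature column as `feature` to image_similarity.create is
# an assumption about the toolkit accepting such columns.
def _example_reuse_deep_features():
    import turicreate as tc

    images = tc.load_images('https://static.turi.com/datasets/images/nested')
    images['deep_features'] = tc.image_analysis.get_deep_features(
        images['image'], model_name='resnet-50')

    # Later model creation can start from the cached column instead of
    # re-running the (slow) feature extractor on the raw images.
    model = tc.image_similarity.create(images, feature='deep_features',
                                       model='resnet-50')
    return model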
def assert_valid_num_gpus():
    from turicreate.util import _CUDA_GPU_IDS
    num_gpus = _tc_config.get_num_gpus()
    if not _CUDA_GPU_IDS and _sys.platform == 'darwin' and num_gpus > 0:
        raise _ToolkitError('Using GPUs is currently not supported on Mac')
    _numeric_param_check_range('num_gpus', num_gpus, -1, _six.MAXSIZE)
def create(dataset, target, feature, max_iterations=10, custom_layer_sizes=[100, 100], verbose=True, validation_set='auto', batch_size=64): ''' Creates a :class:`SoundClassifier` model. Parameters ---------- dataset : SFrame Input data. The column named by the 'feature' parameter will be extracted for modeling. target : string or int Name of the column containing the target variable. The values in this column must be of string or integer type. feature : string, optional Name of the column containing the feature column. This column must contain audio data or deep audio features. Audio data is represented as dicts with key 'data' and 'sample_rate', see `turicreate.load_audio(...)`. Deep audio features are represented as a list of numpy arrays, each of size 12288, see `turicreate.sound_classifier.get_deep_features(...)`. max_iterations : int, optional The maximum number of allowed passes through the data. More passes over the data can result in a more accurately trained model. Consider increasing this (the default value is 10) if the training accuracy is low. custom_layer_sizes : list of ints Specifies the architecture of the custom neural network. This neural network is made up of a series of dense layers. This parameter allows you to specify how many layers and the number of units in each layer. The custom neural network will always have one more layer than the length of this list. The last layer is always a soft max with units equal to the number of classes. verbose : bool, optional If True, prints progress updates and model details. validation_set : SFrame, optional A dataset for monitoring the model's generalization performance. The format of this SFrame must be the same as the training dataset. By default, a validation set is automatically sampled. If `validation_set` is set to None, no validataion is used. You can also pass a validation set you have constructed yourself. batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. ''' import time import mxnet as mx from ._audio_feature_extractor import _get_feature_extractor start_time = time.time() # check parameters if len(dataset) == 0: raise _ToolkitError('Unable to train on empty dataset') if feature not in dataset.column_names(): raise _ToolkitError("Audio feature column '%s' does not exist" % feature) if not _is_deep_feature_sarray(dataset[feature]) and not _is_audio_data_sarray(dataset[feature]): raise _ToolkitError("'%s' column is not audio data." 
% feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) if not _tc.util._is_non_string_iterable(custom_layer_sizes) or len(custom_layer_sizes) == 0: raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.") for i in custom_layer_sizes: if not isinstance(i, int): raise _ToolkitError("'custom_layer_sizes' must contain only integers.") if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): raise TypeError("Unrecognized value for 'validation_set'") if isinstance(validation_set, _tc.SFrame): if feature not in validation_set.column_names() or target not in validation_set.column_names(): raise ValueError("The 'validation_set' SFrame must be in the same format as the 'dataset'") if batch_size < 1: raise ValueError('\'batch_size\' must be greater than or equal to 1') classes = list(dataset[target].unique().sort()) num_labels = len(classes) feature_extractor_name = 'VGGish' feature_extractor = _get_feature_extractor(feature_extractor_name) class_label_to_id = {l: i for i, l in enumerate(classes)} # create the validation set if not isinstance(validation_set, _tc.SFrame) and validation_set == 'auto': if len(dataset) >= 100: print ( "Creating a validation set from 5 percent of training data. This may take a while.\n" "\tYou can set ``validation_set=None`` to disable validation tracking.\n") dataset, validation_set = dataset.random_split(0.95, exact=True) else: validation_set = None encoded_target = dataset[target].apply(lambda x: class_label_to_id[x]) if _is_deep_feature_sarray(dataset[feature]): train_deep_features = dataset[feature] else: # do the preprocess and VGGish feature extraction train_deep_features = get_deep_features(dataset[feature], verbose=verbose) train_data = _tc.SFrame({'deep features': train_deep_features, 'labels': encoded_target}) train_data = train_data.stack('deep features', new_column_name='deep features') train_data, missing_ids = train_data.dropna_split(columns=['deep features']) if len(missing_ids) > 0: _logging.warning("Dropping %d examples which are less than 975ms in length." 
% len(missing_ids)) if validation_set is not None: if verbose: print("Preparing validataion set") validation_encoded_target = validation_set[target].apply(lambda x: class_label_to_id[x]) if _is_deep_feature_sarray(validation_set[feature]): validation_deep_features = validation_set[feature] else: validation_deep_features = get_deep_features(validation_set[feature], verbose=verbose) validation_data = _tc.SFrame({'deep features': validation_deep_features, 'labels': validation_encoded_target}) validation_data = validation_data.stack('deep features', new_column_name='deep features') validation_data = validation_data.dropna(columns=['deep features']) validation_batch_size = min(len(validation_data), batch_size) validation_data = mx.io.NDArrayIter(validation_data['deep features'].to_numpy(), label=validation_data['labels'].to_numpy(), batch_size=validation_batch_size) else: validation_data = [] if verbose: print("\nTraining a custom neural network -") training_batch_size = min(len(train_data), batch_size) train_data = mx.io.NDArrayIter(train_data['deep features'].to_numpy(), label=train_data['labels'].to_numpy(), batch_size=training_batch_size, shuffle=True) custom_NN = SoundClassifier._build_custom_neural_network(feature_extractor.output_length, num_labels, custom_layer_sizes) ctx = _mxnet_utils.get_mxnet_context() custom_NN.initialize(mx.init.Xavier(), ctx=ctx) trainer = mx.gluon.Trainer(custom_NN.collect_params(), 'nag', {'learning_rate': 0.01, 'momentum': 0.9}) if verbose: # Setup progress table row_ids = ['epoch', 'train_accuracy', 'time'] row_display_names = ['Epoch', 'Training Accuracy (%)', 'Elapsed Time (seconds)'] if validation_data: row_ids.insert(2, 'validation_accuracy') row_display_names.insert(2, 'Validation Accuracy (%)') table_printer = _tc.util._ProgressTablePrinter(row_ids, row_display_names) train_metric = mx.metric.Accuracy() if validation_data: validation_metric = mx.metric.Accuracy() softmax_cross_entropy_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss() for i in range(max_iterations): # TODO: early stopping for batch in train_data: data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False) # Inside training scope with mx.autograd.record(): for x, y in zip(data, label): z = custom_NN(x) # Computes softmax cross entropy loss. loss = softmax_cross_entropy_loss(z, y) # Backpropagate the error for one iteration. loss.backward() # Make one step of parameter update. Trainer needs to know the # batch size of data to normalize the gradient by 1/batch_size. 
trainer.step(batch.data[0].shape[0]) train_data.reset() # Calculate training metric for batch in train_data: data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False) outputs = [custom_NN(x) for x in data] train_metric.update(label, outputs) train_data.reset() # Calculate validataion metric for batch in validation_data: data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0, even_split=False) outputs = [custom_NN(x) for x in data] validation_metric.update(label, outputs) # Get metrics, print progress table _, train_accuracy = train_metric.get() train_metric.reset() printed_row_values = {'epoch': i, 'train_accuracy': train_accuracy} if validation_data: _, validataion_accuracy = validation_metric.get() printed_row_values['validation_accuracy'] = validataion_accuracy validation_metric.reset() validation_data.reset() if verbose: printed_row_values['time'] = time.time()-start_time table_printer.print_row(**printed_row_values) state = { '_class_label_to_id': class_label_to_id, '_custom_classifier': custom_NN, '_feature_extractor': feature_extractor, '_id_to_class_label': {v: k for k, v in class_label_to_id.items()}, 'classes': classes, 'custom_layer_sizes': custom_layer_sizes, 'feature': feature, 'feature_extractor_name': feature_extractor.name, 'num_classes': num_labels, 'num_examples': len(dataset), 'target': target, 'training_accuracy': train_accuracy, 'training_time': time.time() - start_time, 'validation_accuracy': validataion_accuracy if validation_data else None, } return SoundClassifier(state)
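
# Illustrative sketch (not part of the toolkit): a typical call into the sound
# classifier create() function above. The './sounds' folder and the label
# derivation from the parent directory are hypothetical; precomputing the
# VGGish deep features avoids re-running the extractor on every experiment.
def _example_train_sound_classifier():
    import turicreate as tc

    data = tc.load_audio('./sounds')  # hypothetical folder of audio files
    data['label'] = data['path'].apply(lambda p: p.split('/')[-2])
    data['deep_features'] = tc.sound_classifier.get_deep_features(data['audio'])

    train, test = data.random_split(0.8, seed=1)
    model = tc.sound_classifier.create(train, target='label',
                                       feature='deep_features',
                                       custom_layer_sizes=[100, 100],
                                       max_iterations=10)
    print(model.evaluate(test))
    return model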
def create(dataset, session_id, target, features=None, prediction_window=100, validation_set='auto', max_iterations=10, batch_size=32, verbose=True): """ Create an :class:`ActivityClassifier` model. Parameters ---------- dataset : SFrame Input data which consists of `sessions` of data where each session is a sequence of data. The data must be in `stacked` format, grouped by session. Within each session, the data is assumed to be sorted temporally. Columns in `features` will be used to train a model that will make a prediction using labels in the `target` column. session_id : string Name of the column that contains a unique ID for each session. target : string Name of the column containing the target variable. The values in this column must be of string or integer type. Use `model.classes` to retrieve the order in which the classes are mapped. features : list[string], optional Name of the columns containing the input features that will be used for classification. If set to `None`, all columns except `session_id` and `target` will be used. prediction_window : int, optional Number of time units between predictions. For example, if your input data is sampled at 100Hz, and the `prediction_window` is set to 100, then this model will make a prediction every 1 second. validation_set : SFrame, optional A dataset for monitoring the model's generalization performance to prevent the model from overfitting to the training data. For each row of the progress table, accuracy is measured over the provided training dataset and the `validation_set`. The format of this SFrame must be the same as the training set. When set to 'auto', a validation set is automatically sampled from the training data (if the training data has > 100 sessions). If validation_set is set to None, then all the data will be used for training. max_iterations : int , optional Maximum number of iterations/epochs made over the data during the training phase. batch_size : int, optional Number of sequence chunks used per training step. Must be greater than the number of GPUs in use. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : ActivityClassifier A trained :class:`ActivityClassifier` model. Examples -------- .. sourcecode:: python >>> import turicreate as tc # Training on dummy data >>> data = tc.SFrame({ ... 'accelerometer_x': [0.1, 0.2, 0.3, 0.4, 0.5] * 10, ... 'accelerometer_y': [0.5, 0.4, 0.3, 0.2, 0.1] * 10, ... 'accelerometer_z': [0.01, 0.01, 0.02, 0.02, 0.01] * 10, ... 'session_id': [0, 0, 0] * 10 + [1, 1] * 10, ... 'activity': ['walk', 'run', 'run'] * 10 + ['swim', 'swim'] * 10 ... }) # Create an activity classifier >>> model = tc.activity_classifier.create(train, ... session_id='session_id', target='activity', ... 
features=['accelerometer_x', 'accelerometer_y', 'accelerometer_z']) # Make predictions (as probability vector, or class) >>> predictions = model.predict(data) >>> predictions = model.predict(data, output_type='probability_vector') # Get both predictions and classes together >>> predictions = model.classify(data) # Get topk predictions (instead of only top-1) if your labels have more # 2 classes >>> predictions = model.predict_topk(data, k = 3) # Evaluate the model >>> results = model.evaluate(data) See Also -------- ActivityClassifier, util.random_split_by_session """ _tkutl._raise_error_if_not_sframe(dataset, "dataset") from ._model_architecture import _net_params from ._model_architecture import _define_model, _fit_model from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter from ._sframe_sequence_iterator import prep_data as _prep_data if not isinstance(target, str): raise _ToolkitError('target must be of type str') if not isinstance(session_id, str): raise _ToolkitError('session_id must be of type str') _tkutl._raise_error_if_sframe_empty(dataset, 'dataset') _tkutl._numeric_param_check_range('prediction_window', prediction_window, 1, 400) _tkutl._numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE) if features is None: features = _fe_tkutl.get_column_names(dataset, interpret_as_excluded=True, column_names=[session_id, target]) if not hasattr(features, '__iter__'): raise TypeError("Input 'features' must be a list.") if not all([isinstance(x, str) for x in features]): raise TypeError("Invalid feature %s: Feature names must be of type str." % x) if len(features) == 0: raise TypeError("Input 'features' must contain at least one column name.") start_time = _time.time() dataset = _tkutl._toolkits_select_columns(dataset, features + [session_id, target]) _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[target], target, [str, int]) _tkutl._raise_error_if_sarray_not_expected_dtype(dataset[session_id], session_id, [str, int]) # Encode the target column to numerical values use_target = target is not None dataset, target_map = _encode_target(dataset, target) predictions_in_chunk = 20 chunked_data, num_sessions = _prep_data(dataset, features, session_id, prediction_window, predictions_in_chunk, target=target, verbose=verbose) if isinstance(validation_set, str) and validation_set == 'auto': if num_sessions < 100: validation_set = None else: dataset, validation_set = _random_split_by_session(dataset, session_id) # Create data iterators num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=num_sessions) user_provided_batch_size = batch_size batch_size = max(batch_size, num_gpus, 1) data_iter = _SFrameSequenceIter(chunked_data, len(features), prediction_window, predictions_in_chunk, batch_size, use_target=use_target) if validation_set is not None: _tkutl._raise_error_if_not_sframe(validation_set, 'validation_set') _tkutl._raise_error_if_sframe_empty(validation_set, 'validation_set') validation_set = _tkutl._toolkits_select_columns( validation_set, features + [session_id, target]) validation_set = validation_set.filter_by(target_map.keys(), target) validation_set, mapping = _encode_target(validation_set, target, target_map) chunked_validation_set, _ = _prep_data(validation_set, features, session_id, prediction_window, predictions_in_chunk, target=target, verbose=False) valid_iter = _SFrameSequenceIter(chunked_validation_set, len(features), prediction_window, predictions_in_chunk, batch_size, use_target=use_target) else: valid_iter = None # 
Define model architecture context = _mxnet_utils.get_mxnet_context(max_devices=num_sessions) loss_model, pred_model = _define_model(features, target_map, prediction_window, predictions_in_chunk, context) # Train the model log = _fit_model(loss_model, data_iter, valid_iter, max_iterations, num_gpus, verbose) # Set up prediction model pred_model.bind(data_shapes=data_iter.provide_data, label_shapes=None, for_training=False) arg_params, aux_params = loss_model.get_params() pred_model.init_params(arg_params=arg_params, aux_params=aux_params) # Save the model state = { '_pred_model': pred_model, 'verbose': verbose, 'training_time': _time.time() - start_time, 'target': target, 'classes': sorted(target_map.keys()), 'features': features, 'session_id': session_id, 'prediction_window': prediction_window, 'max_iterations': max_iterations, 'num_examples': len(dataset), 'num_sessions': num_sessions, 'num_classes': len(target_map), 'num_features': len(features), 'training_accuracy': log['train_acc'], 'training_log_loss': log['train_loss'], '_target_id_map': target_map, '_id_target_map': {v: k for k, v in target_map.items()}, '_predictions_in_chunk': predictions_in_chunk, '_recalibrated_batch_size': data_iter.batch_size, 'batch_size' : user_provided_batch_size } if validation_set is not None: state['valid_accuracy'] = log['valid_acc'] state['valid_log_loss'] = log['valid_loss'] model = ActivityClassifier(state) return model
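
# Illustrative sketch (not part of the toolkit): choosing prediction_window for
# the activity classifier create() function above. The sensor file name, the
# 50 Hz sampling rate, and the column names are made up for the example.
def _example_train_activity_classifier():
    import turicreate as tc
    from turicreate.activity_classifier.util import random_split_by_session

    data = tc.SFrame('sensor_readings.sframe')  # hypothetical stacked sensor data
    train, valid = random_split_by_session(data, session_id='session_id',
                                           fraction=0.9)

    # Data sampled at 50 Hz; one prediction every 2 seconds -> window of 100 samples.
    model = tc.activity_classifier.create(train,
                                          session_id='session_id',
                                          target='activity',
                                          prediction_window=100,
                                          validation_set=valid,
                                          max_iterations=10)
    return model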
def create(dataset, annotations=None, feature=None, model='darknet-yolo', classes=None, max_iterations=0, verbose=True, **kwargs): """ Create a :class:`ObjectDetector` model. Parameters ---------- dataset : SFrame Input data. The columns named by the ``feature`` and ``annotations`` parameters will be extracted for training the detector. annotations : string Name of the column containing the object detection annotations. This column should be a list of dictionaries, with each dictionary representing a bounding box of an object instance. Here is an example of the annotations for a single image with two object instances:: [{'label': 'dog', 'type': 'rectangle', 'coordinates': {'x': 223, 'y': 198, 'width': 130, 'height': 230}}, {'label': 'cat', 'type': 'rectangle', 'coordinates': {'x': 40, 'y': 73, 'width': 80, 'height': 123}}] The value for `x` is the horizontal center of the box paired with `width` and `y` is the vertical center of the box paired with `height`. 'None' (the default) indicates the only list column in `dataset` should be used for the annotations. feature : string Name of the column containing the input images. 'None' (the default) indicates the only image column in `dataset` should be used as the feature. model : string optional Object detection model to use: - "darknet-yolo" : Fast and medium-sized model classes : list optional List of strings containing the names of the classes of objects. Inferred from the data if not provided. max_iterations : int The number of training iterations. If 0, then it will be automatically be determined based on the amount of data you provide. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : ObjectDetector A trained :class:`ObjectDetector` model. See Also -------- ObjectDetector Examples -------- .. 
sourcecode:: python # Train an object detector model >>> model = turicreate.object_detector.create(data) # Make predictions on the training set and as column to the SFrame >>> data['predictions'] = model.predict(data) # Visualize predictions by generating a new column of marked up images >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions']) """ _raise_error_if_not_sframe(dataset, "dataset") from ._mx_detector import YOLOLoss as _YOLOLoss from ._model import tiny_darknet as _tiny_darknet from ._sframe_loader import SFrameDetectionIter as _SFrameDetectionIter from ._manual_scheduler import ManualScheduler as _ManualScheduler import mxnet as _mx if len(dataset) == 0: raise _ToolkitError('Unable to train on empty dataset') _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE) start_time = _time.time() supported_detectors = ['darknet-yolo'] if feature is None: feature = _tkutl._find_only_image_column(dataset) if verbose: print("Using '%s' as feature column" % feature) if annotations is None: annotations = _tkutl._find_only_column_of_type(dataset, target_type=list, type_name='list', col_name='annotations') if verbose: print("Using '%s' as annotations column" % annotations) _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_annotations=True) _tkutl._check_categorical_option_type('model', model, supported_detectors) base_model = model.split('-', 1)[0] ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]() params = { 'anchors': [ (1.0, 2.0), (1.0, 1.0), (2.0, 1.0), (2.0, 4.0), (2.0, 2.0), (4.0, 2.0), (4.0, 8.0), (4.0, 4.0), (8.0, 4.0), (8.0, 16.0), (8.0, 8.0), (16.0, 8.0), (16.0, 32.0), (16.0, 16.0), (32.0, 16.0), ], 'grid_shape': [13, 13], 'batch_size': 32, 'aug_resize': 0, 'aug_rand_crop': 0.9, 'aug_rand_pad': 0.9, 'aug_rand_gray': 0.0, 'aug_aspect_ratio': 1.25, 'aug_hue': 0.05, 'aug_brightness': 0.05, 'aug_saturation': 0.05, 'aug_contrast': 0.05, 'aug_horizontal_flip': True, 'aug_min_object_covered': 0, 'aug_min_eject_coverage': 0.5, 'aug_area_range': (.15, 2), 'aug_pca_noise': 0.0, 'aug_max_attempts': 20, 'aug_inter_method': 2, 'lmb_coord_xy': 10.0, 'lmb_coord_wh': 10.0, 'lmb_obj': 100.0, 'lmb_noobj': 5.0, 'lmb_class': 2.0, 'non_maximum_suppression_threshold': 0.45, 'rescore': True, 'clip_gradients': 0.025, 'learning_rate': 1.0e-3, 'shuffle': True, } if '_advanced_parameters' in kwargs: # Make sure no additional parameters are provided new_keys = set(kwargs['_advanced_parameters'].keys()) set_keys = set(params.keys()) unsupported = new_keys - set_keys if unsupported: raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported)) params.update(kwargs['_advanced_parameters']) anchors = params['anchors'] num_anchors = len(anchors) num_gpus = _mxnet_utils.get_num_gpus_in_use(max_devices=params['batch_size']) batch_size_each = params['batch_size'] // max(num_gpus, 1) # Note, this may slightly alter the batch size to fit evenly on the GPUs batch_size = max(num_gpus, 1) * batch_size_each grid_shape = params['grid_shape'] input_image_shape = (3, grid_shape[0] * ref_model.spatial_reduction, grid_shape[1] * ref_model.spatial_reduction) try: instances = (dataset.stack(annotations, new_column_name='_bbox', drop_na=True) .unpack('_bbox', limit=['label'])) except (TypeError, RuntimeError): # If this fails, the annotation format isinvalid at the coarsest level raise _ToolkitError("Annotations format is invalid. 
Must be a list of " "dictionaries containing 'label' and 'coordinates'.") num_images = len(dataset) num_instances = len(instances) if classes is None: classes = instances['_bbox.label'].unique() classes = sorted(classes) # Make a class-to-index look-up table class_to_index = {name: index for index, name in enumerate(classes)} num_classes = len(classes) # Create data loader loader = _SFrameDetectionIter(dataset, batch_size=batch_size, input_shape=input_image_shape[1:], output_shape=grid_shape, anchors=anchors, class_to_index=class_to_index, aug_params=params, shuffle=params['shuffle'], loader_type='augmented', feature_column=feature, annotations_column=annotations) # Predictions per anchor box: x/y + w/h + object confidence + class probs preds_per_box = 5 + num_classes output_size = preds_per_box * num_anchors ymap_shape = (batch_size_each,) + tuple(grid_shape) + (num_anchors, preds_per_box) net = _tiny_darknet(output_size=output_size) loss = _YOLOLoss(input_shape=input_image_shape[1:], output_shape=grid_shape, batch_size=batch_size_each, num_classes=num_classes, anchors=anchors, parameters=params) base_lr = params['learning_rate'] if max_iterations == 0: # Set number of iterations through a heuristic num_iterations_raw = 5000 * _np.sqrt(num_instances) / batch_size num_iterations = 1000 * max(1, int(round(num_iterations_raw / 1000))) else: num_iterations = max_iterations steps = [num_iterations // 2, 3 * num_iterations // 4, num_iterations] steps_and_factors = [(step, 10**(-i)) for i, step in enumerate(steps)] steps, factors = zip(*steps_and_factors) lr_scheduler = _ManualScheduler(step=steps, factor=factors) ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size) net_params = net.collect_params() net_params.initialize(_mx.init.Xavier(), ctx=ctx) net_params['conv7_weight'].initialize(_mx.init.Xavier(factor_type='avg'), ctx=ctx, force_reinit=True) net_params['conv8_weight'].initialize(_mx.init.Uniform(0.00005), ctx=ctx, force_reinit=True) # Initialize object confidence low, preventing an unnecessary adjustment # period toward conservative estimates bias = _np.zeros(output_size, dtype=_np.float32) bias[4::preds_per_box] -= 6 from ._mx_detector import ConstantArray net_params['conv8_bias'].initialize(ConstantArray(bias), ctx, force_reinit=True) # Take a subset and then load the rest of the parameters. It is possible to # do allow_missing=True directly on net_params. However, this will more # easily hide bugs caused by names getting out of sync. 
ref_model.available_parameters_subset(net_params).load(ref_model.model_path, ctx) options = {'learning_rate': base_lr, 'lr_scheduler': lr_scheduler, 'momentum': 0.9, 'wd': 0.00005, 'rescale_grad': 1.0} clip_grad = params.get('clip_gradients') if clip_grad: options['clip_gradient'] = clip_grad trainer = _mx.gluon.Trainer(net.collect_params(), 'sgd', options) iteration = 0 smoothed_loss = None last_time = 0 while iteration < num_iterations: loader.reset() for batch in loader: data = _mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) label = _mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) Ls = [] with _mx.autograd.record(): for x, y in zip(data, label): z = net(x) z0 = _mx.nd.transpose(z, [0, 2, 3, 1]).reshape(ymap_shape) L = loss(z0, y) Ls.append(L) for L in Ls: L.backward() cur_loss = _np.mean([L.asnumpy()[0] for L in Ls]) if smoothed_loss is None: smoothed_loss = cur_loss else: smoothed_loss = 0.9 * smoothed_loss + 0.1 * cur_loss trainer.step(1) iteration += 1 cur_time = _time.time() if verbose and cur_time > last_time + 10: print('{now:%Y-%m-%d %H:%M:%S} Training {cur_iter:{width}d}/{num_iterations:{width}d} Loss {loss:6.3f}'.format( now=_datetime.now(), cur_iter=iteration, num_iterations=num_iterations, loss=smoothed_loss, width=len(str(num_iterations)))) last_time = cur_time if iteration == num_iterations: break training_time = _time.time() - start_time # Save the model state = { '_model': net, '_class_to_index': class_to_index, '_training_time_as_string': _seconds_as_string(training_time), '_grid_shape': grid_shape, 'anchors': anchors, 'model': model, 'classes': classes, 'batch_size': batch_size, 'input_image_shape': input_image_shape, 'feature': feature, 'non_maximum_suppression_threshold': params['non_maximum_suppression_threshold'], 'annotations': annotations, 'num_classes': num_classes, 'num_examples': num_images, 'num_bounding_boxes': num_instances, 'training_time': training_time, 'training_epochs': loader.cur_epoch, 'training_iterations': iteration, 'max_iterations': max_iterations, 'training_loss': smoothed_loss, } return ObjectDetector(state)
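
# Illustrative sketch (not part of the toolkit): a typical call into the object
# detector create() function above. The image folder and annotation values are
# made up; the bounding boxes use the center-x/center-y/width/height convention
# described in the docstring.
def _example_train_object_detector():
    import turicreate as tc

    data = tc.load_images('./training_images')  # hypothetical folder of images
    data['annotations'] = [[{'label': 'dog',
                             'type': 'rectangle',
                             'coordinates': {'x': 223, 'y': 198,
                                             'width': 130, 'height': 230}}]] * len(data)

    model = tc.object_detector.create(data,
                                      annotations='annotations',
                                      feature='image',
                                      classes=['dog', 'cat'],
                                      max_iterations=1000)
    data['predictions'] = model.predict(data)
    return model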
def create(dataset, label=None, features=None, distance=None, method='auto', verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of Turi Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *List*: list of integer or string values. Each element is treated as a separate variable in the model. - *String*: string values. Please note: if a composite distance is also specified, this parameter is ignored. distance : string, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~turicreate.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. - *ball_tree*: use a tree structure to find the k-closest neighbors to each query point. 
The ball tree model is slower to construct than the brute force model, but queries are faster than linear time. This method is not applicable for the cosine and dot product distances. See `Liu, et al (2004) <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_ for implementation details. - *brute_force*: compute the distance from a query point to all reference observations. There is no computation time for model creation with the brute force method (although the reference data is held in the model, but each query takes linear time. - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate nearest neighbors efficiently. The LSH model supports 'euclidean', 'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product' (deprecated), and 'transformed_dot_product' distances. Two options are provided for LSH -- ``num_tables`` and ``num_projections_per_table``. See the notes below for details. verbose: bool, optional If True, print progress updates and model details. **kwargs : optional Options for the distance function and query method. - *leaf_size*: for the ball tree method, the number of points in each leaf of the tree. The default is to use the max of 1,000 and n/(2^11), which ensures a maximum tree depth of 12. - *num_tables*: For the LSH method, the number of hash tables constructed. The default value is 20. We recommend choosing values from 10 to 30. - *num_projections_per_table*: For the LSH method, the number of projections/hash functions for each hash table. The default value is 4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other distances. We recommend using number 2 ~ 6 for 'jaccard' distance, 8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances. Returns ------- out : NearestNeighborsModel A structure for efficiently computing the nearest neighbors in 'dataset' of new query points. See Also -------- NearestNeighborsModel.query, turicreate.toolkits.distances Notes ----- - Missing data is not allowed in the 'dataset' provided to this function. Please use the :func:`turicreate.SFrame.fillna` and :func:`turicreate.SFrame.dropna` utilities to handle missing data before creating a nearest neighbors model. - Missing keys in sparse vectors are assumed to have value 0. - The `composite_params` parameter was removed as of Turi Create version 1.5. The `distance` parameter now accepts either standard or composite distances. Please see the :mod:`~turicreate.toolkits.distances` module documentation for more information on composite distances. - If the features should be weighted equally in the distance calculations but are measured on different scales, it is important to standardize the features. One way to do this is to subtract the mean of each column and divide by the standard deviation. **Locality Sensitive Hashing (LSH)** There are several efficient nearest neighbors search algorithms that work well for data with low dimensions :math:`d` (approximately 50). However, most of the solutions suffer from either space or query time that is exponential in :math:`d`. For large :math:`d`, they often provide little, if any, improvement over the 'brute_force' method. This is a well-known consequence of the phenomenon called `The Curse of Dimensionality`. `Locality Sensitive Hashing (LSH) <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach that is designed to efficiently solve the *approximate* nearest neighbor search problem for high dimensional data. 
The key idea of LSH is to hash the data points using several hash functions, so that the probability of collision is much higher for data points which are close to each other than those which are far apart. An LSH family is a family of functions :math:`h` which map points from the metric space to a bucket, so that - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`. - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`. LSH for efficient approximate nearest neighbor search: - We define a new family of hash functions :math:`g`, where each function :math:`g` is obtained by concatenating :math:`k` functions :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`. The algorithm constructs :math:`L` hash tables, each of which corresponds to a different randomly chosen hash function :math:`g`. There are :math:`k \\cdot L` hash functions used in total. - In the preprocessing step, we hash all :math:`n` reference points into each of the :math:`L` hash tables. - Given a query point :math:`q`, the algorithm iterates over the :math:`L` hash functions :math:`g`. For each :math:`g` considered, it retrieves the data points that are hashed into the same bucket as q. These data points from all the :math:`L` hash tables are considered as candidates that are then re-ranked by their real distances with the query data. **Note** that the number of tables :math:`L` and the number of hash functions per table :math:`k` are two main parameters. They can be set using the options ``num_tables`` and ``num_projections_per_table`` respectively. Hash functions for different distances: - `euclidean` and `squared_euclidean`: :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where :math:`a` is a vector, of which the elements are independently sampled from normal distribution, and :math:`b` is a number uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We set :math:`r` using the average all-pair `euclidean` distances from a small randomly sampled subset of the reference data. - `manhattan`: The hash function of `manhattan` is similar with that of `euclidean`. The only difference is that the elements of `a` are sampled from Cauchy distribution, instead of normal distribution. - `cosine`: Random Projection is designed to approximate the cosine distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled normal unit vector. - `jaccard`: We use a recently proposed method one permutation hashing by Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014] <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for details. - `dot_product`: The reference data points are first transformed to fixed-norm vectors, and then the minimum `dot_product` distance search problem can be solved via finding the reference data with smallest `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015] <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details. References ---------- - `Wikipedia - nearest neighbor search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_ - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_ - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of Practical Approximate Nearest Neighbor Algorithms <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural Information Processing Systems pp. 825-832. 
- `Wikipedia - Jaccard distance <http://en.wikipedia.org/wiki/Jaccard_index>`_ - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the Jaccard Median <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_. Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms. Society for Industrial and Applied Mathematics. - `Wikipedia - Cosine distance <http://en.wikipedia.org/wiki/Cosine_similarity>`_ - `Wikipedia - Levenshtein distance <http://en.wikipedia.org/wiki/Levenshtein_distance>`_ - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_. Examples -------- Construct a nearest neighbors model with automatically determined method and distance: >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11], ... 'X2': [0.69, 0.58, 0.36], ... 'str_feature': ['cat', 'dog', 'fossa']}) >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2']) For datasets with a large number of rows and up to about 100 variables, the ball tree method often leads to much faster queries. >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'], ... method='ball_tree') Often the final determination of a neighbor is based on several distance computations over different sets of features. Each part of this composite distance may have a different relative weight. >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.], ... [['str_feature'], 'levenshtein', 3.]] ... >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist) """ ## Validate the 'dataset' input _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Basic validation of the features input if features is not None and not isinstance(features, list): raise TypeError("If specified, input 'features' must be a list of " + "strings.") ## Clean the method options and create the options dictionary allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table'] _method_options = {} for k, v in kwargs.items(): if k in allowed_kwargs: _method_options[k] = v else: raise _ToolkitError( "'{}' is not a valid keyword argument".format(k) + " for the nearest neighbors model. Please " + "check for capitalization and other typos.") ## Exclude inappropriate combinations of method an distance if method == 'ball_tree' and ( distance == 'cosine' or distance == _turicreate.distances.cosine or distance == 'dot_product' or distance == _turicreate.distances.dot_product or distance == 'transformed_dot_product' or distance == _turicreate.distances.transformed_dot_product): raise TypeError( "The ball tree method does not work with 'cosine' " + "'dot_product', or 'transformed_dot_product' distance." 
+ "Please use the 'brute_force' method for these distances.") if method == 'lsh' and ('num_projections_per_table' not in _method_options): if distance == 'jaccard' or distance == _turicreate.distances.jaccard: _method_options['num_projections_per_table'] = 4 elif distance == 'cosine' or distance == _turicreate.distances.cosine: _method_options['num_projections_per_table'] = 16 else: _method_options['num_projections_per_table'] = 8 ## Initial validation and processing of the label if label is None: _label = _robust_column_name('__id', dataset.column_names()) _dataset = dataset.add_row_number(_label) else: _label = label _dataset = _copy.copy(dataset) col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()} _validate_row_label(_label, col_type_map) ref_labels = _dataset[_label] ## Determine the internal list of available feature names (may still include # the row label name). if features is None: _features = _dataset.column_names() else: _features = _copy.deepcopy(features) ## Check if there's only one feature and it's the same as the row label. # This would also be trapped by the composite distance validation, but the # error message is not very informative for the user. free_features = set(_features).difference([_label]) if len(free_features) < 1: raise _ToolkitError("The only available feature is the same as the " + "row label column. Please specify features " + "that are not also row labels.") ### Validate and preprocess the distance function ### --------------------------------------------- # - The form of the 'distance' controls how we interact with the 'features' # parameter as well. # - At this point, the row label 'label' may still be in the list(s) of # features. ## Convert any distance function input into a single composite distance. # distance is already a composite distance if isinstance(distance, list): distance = _copy.deepcopy(distance) # distance is a single name (except 'auto') or function handle. elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] # distance is unspecified and needs to be constructed. elif distance is None or distance == 'auto': sample = _dataset.head() distance = _construct_auto_distance(_features, _dataset.column_names(), _dataset.column_types(), sample) else: raise TypeError("Input 'distance' not understood. The 'distance' " " argument must be a string, function handle, or " + "composite distance.") ## Basic composite distance validation, remove the row label from all # feature lists, and convert string distance names into distance functions. 
distance = _scrub_composite_distance_features(distance, [_label]) distance = _convert_distance_names_to_functions(distance) _validate_composite_distance(distance) ## Raise an error if any distances are used with non-lists list_features_to_check = [] sparse_distances = [ 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product', 'transformed_dot_product' ] sparse_distances = [ getattr(_turicreate.distances, k) for k in sparse_distances ] for d in distance: feature_names, dist, _ = d list_features = [f for f in feature_names if _dataset[f].dtype == list] for f in list_features: if dist in sparse_distances: list_features_to_check.append(f) else: raise TypeError( "The chosen distance cannot currently be used " + "on list-typed columns.") for f in list_features_to_check: only_str_lists = _validate_lists(_dataset[f], [str]) if not only_str_lists: raise TypeError("Distances for sparse data, such as jaccard " + "and weighted_jaccard, can only be used on " + "lists containing only strings. Please modify " + "any list features accordingly before creating " + "the nearest neighbors model.") ## Raise an error if any component has string features are in single columns for d in distance: feature_names, dist, _ = d if (len(feature_names) > 1) and (dist == _turicreate.distances.levenshtein): raise ValueError( "Levenshtein distance cannot be used with multiple " + "columns. Please concatenate strings into a single " + "column before creating the nearest neighbors model.") ## Get the union of feature names and make a clean dataset. clean_features = _get_composite_distance_features(distance) sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features) ## Decide which method to use ## - If more than one distance component (specified either directly or # generated automatically because distance set to 'auto'), then do brute # force. if len(distance) > 1: _method = 'brute_force' if method != 'brute_force' and verbose is True: print("Defaulting to brute force instead of ball tree because " +\ "there are multiple distance components.") else: if method == 'auto': # get the total number of variables. Assume the number of elements in # array type columns does not change num_variables = sum([ len(x) if hasattr(x, '__iter__') else 1 for x in _six.itervalues(sf_clean[0]) ]) # flag if all the features in the single composite are of numeric # type. numeric_type_flag = all([ x in [int, float, list, array.array] for x in sf_clean.column_types() ]) ## Conditions necessary for ball tree to work and be worth it if ((distance[0][1] in [ 'euclidean', 'manhattan', _turicreate.distances.euclidean, _turicreate.distances.manhattan ]) and numeric_type_flag is True and num_variables <= 200): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' elif _method == 'lsh': model_name = 'nearest_neighbors_lsh' else: raise ValueError( "Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'.") ## Package the model options opts = {} opts.update(_method_options) opts.update({ 'model_name': model_name, 'ref_labels': ref_labels, 'label': label, 'sf_features': sf_clean, 'composite_params': distance }) ## Construct the nearest neighbors model with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.train(opts) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) return model
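# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): the docstring above describes
# the LSH scheme used for 'euclidean' distance, h(q) = floor((a . q + b) / w),
# with ``num_projections_per_table`` hash functions concatenated per table and
# ``num_tables`` independent tables. The toy helper below shows how such
# bucket keys could be computed with NumPy; the function name, the fixed
# bucket width ``w`` and the seed are assumptions made for illustration only.
import numpy as np

def _demo_lsh_bucket_keys(points, num_tables=4, num_projections_per_table=8,
                          w=1.0, seed=0):
    """Return, for each point, one hashable bucket key per table."""
    rng = np.random.RandomState(seed)
    n, dim = points.shape
    keys_per_table = []
    for _ in range(num_tables):
        # One random projection matrix and offset vector per table.
        a = rng.normal(size=(dim, num_projections_per_table))
        b = rng.uniform(0.0, w, size=num_projections_per_table)
        # h(q) = floor((a . q + b) / w), concatenated into a tuple per point.
        h = np.floor((points @ a + b) / w).astype(int)
        keys_per_table.append([tuple(row) for row in h])
    return list(zip(*keys_per_table))

# Points whose keys collide with the query in at least one table become
# candidates, which are then re-ranked by their exact euclidean distance.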
def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): """ Evaluate the model by making predictions of target values and comparing these to actual values. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the target and features used for model training. Additional columns are ignored. metric : str, optional Name of the evaluation metric. Possible values are: - 'auto' : Returns all available metrics. - 'accuracy' : Classification accuracy (micro average). - 'auc' : Area under the ROC curve (macro average) - 'precision' : Precision score (macro average) - 'recall' : Recall score (macro average) - 'f1_score' : F1 score (macro average) - 'log_loss' : Log loss - 'confusion_matrix' : An SFrame with counts of possible prediction/true label combinations. - 'roc_curve' : An SFrame containing information needed for an ROC curve For more flexibility in calculating evaluation metrics, use the :class:`~turicreate.toolkits.evaluation` module. verbose : bool, optional If True, prints progress updates and model details. batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. Returns ------- out : dict Dictionary of evaluation results where the key is the name of the evaluation metric (e.g. `accuracy`) and the value is the evaluation score. See Also ---------- create, predict, classify Examples ---------- .. sourcecode:: python >>> results = model.evaluate(data) >>> print results['accuracy'] """ import os, json, math if (batch_size < 1): raise ValueError("'batch_size' must be greater than or equal to 1") if self.target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % self.target) extracted_features = self._extract_features(dataset, verbose=verbose, batch_size=batch_size) extracted_features[self.target] = dataset[self.target] metrics = self.classifier.evaluate(extracted_features, metric=metric, with_predictions=True) predictions = metrics["predictions"]["probs"] state = self.__proxy__.get_state() labels = state["classes"] from .._evaluate_utils import (entropy, confidence, relative_confidence, get_confusion_matrix, hclusterSort, l2Dist) evaluation_result = { k: metrics[k] for k in [ 'accuracy', 'f1_score', 'log_loss', 'precision', 'recall', 'auc', 'roc_curve', 'confusion_matrix' ] } evaluation_result['num_test_examples'] = len(dataset) for k in [ 'num_classes', 'num_features', 'input_image_shape', 'num_examples', 'training_loss', 'training_time', 'model', 'max_iterations' ]: evaluation_result[k] = getattr(self, k) # Extend the given test data extended_test = dataset.add_column(predictions, 'probs') extended_test['label'] = dataset[self.target] extended_test = extended_test.add_columns([ extended_test.apply( lambda d: labels[d['probs'].index(confidence(d['probs']))]), extended_test.apply(lambda d: entropy(d['probs'])), extended_test.apply(lambda d: confidence(d['probs'])), extended_test.apply(lambda d: relative_confidence(d['probs'])) ], ['predicted_label', 'entropy', 'confidence', 'relative_confidence']) extended_test = extended_test.add_column( extended_test.apply(lambda d: d['label'] == d['predicted_label']), 'correct') evaluation_result['model_name'] = state['model'] # Calculate the confusion matrix sf_conf_mat = get_confusion_matrix(extended_test, labels) confidence_threshold = 0.5 hesitant_threshold = 0.2 evaluation_result['confidence_threshold'] = confidence_threshold 
evaluation_result['hesitant_threshold'] = hesitant_threshold evaluation_result[ 'confidence_metric_for_threshold'] = 'relative_confidence' evaluation_result['conf_mat'] = list(sf_conf_mat) # Get sorted labels (sorted by hCluster) vectors = map( lambda l: { 'name': l, 'pos': list(sf_conf_mat[sf_conf_mat['target_label'] == l].sort( 'predicted_label')['norm_prob']) }, labels) evaluation_result['sorted_labels'] = hclusterSort( vectors, l2Dist)[0]['name'].split("|") # Get recall and precision per label per_l = extended_test.groupby( ['label'], { 'count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) per_l['recall'] = per_l.apply( lambda l: l['correct_count'] * 1.0 / l['count']) per_pl = extended_test.groupby( ['predicted_label'], { 'predicted_count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) per_pl['precision'] = per_pl.apply( lambda l: l['correct_count'] * 1.0 / l['predicted_count']) per_pl = per_pl.rename({'predicted_label': 'label'}) evaluation_result['label_metrics'] = list( per_l.join(per_pl, on='label', how='outer').select_columns([ 'label', 'count', 'correct_count', 'predicted_count', 'recall', 'precision' ])) evaluation_result['labels'] = labels extended_test = extended_test.add_row_number('__idx').rename( {'label': 'target_label'}) evaluation_result['test_data'] = extended_test evaluation_result['feature'] = self.feature return _Evaluation(evaluation_result)
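# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): the 'label_metrics' entries
# assembled above reduce to per-class recall (correct predictions over the
# count of each true label) and per-class precision (correct predictions over
# the count of each predicted label). The toy function below computes the same
# quantities from plain Python lists; it is a simplified stand-in for the
# SFrame groupby/join logic used in evaluate().
from collections import Counter

def _demo_per_class_metrics(true_labels, predicted_labels):
    true_counts = Counter(true_labels)
    pred_counts = Counter(predicted_labels)
    correct = Counter(t for t, p in zip(true_labels, predicted_labels) if t == p)
    metrics = {}
    for label in set(true_labels) | set(predicted_labels):
        metrics[label] = {
            'recall': correct[label] / true_counts[label] if true_counts[label] else None,
            'precision': correct[label] / pred_counts[label] if pred_counts[label] else None,
        }
    return metrics

# Example: _demo_per_class_metrics(['cat', 'dog', 'dog'], ['cat', 'dog', 'cat'])
# gives recall 0.5 / precision 1.0 for 'dog' and recall 1.0 / precision 0.5 for 'cat'.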
def create(dataset, target, feature=None, model='resnet-50', validation_set='auto', max_iterations=10, verbose=True, seed=None, batch_size=64): """ Create a :class:`ImageClassifier` model. Parameters ---------- dataset : SFrame Input data. The column named by the 'feature' parameter will be extracted for modeling. target : string, or int Name of the column containing the target variable. The values in this column must be of string or integer type. String target variables are automatically mapped to integers in the order in which they are provided. For example, a target variable with 'cat' and 'dog' as possible values is mapped to 0 and 1 respectively with 0 being the base class and 1 being the reference class. Use `model.classes` to retrieve the order in which the classes are mapped. feature : string, optional Name of the column containing the input images. 'None' (the default) indicates that the only image column in `dataset` should be used as the feature. model : string, optional Uses a pretrained model to bootstrap an image classifier: - "resnet-50" : Uses a pretrained resnet model. Exported Core ML model will be ~90M. - "squeezenet_v1.1" : Uses a pretrained squeezenet model. Exported Core ML model will be ~4.7M. - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor. Only available on iOS 12.0+, macOS 10.14+ and tvOS 12.0+. Exported Core ML model will be ~41K. Models are downloaded from the internet if not available locally. Once downloaded, the models are cached for future use. validation_set : SFrame, optional A dataset for monitoring the model's generalization performance. The format of this SFrame must be the same as the training set. By default this argument is set to 'auto' and a validation set is automatically sampled and used for progress printing. If validation_set is set to None, then no additional metrics are computed. The default value is 'auto'. max_iterations : float, optional The maximum number of allowed passes through the data. More passes over the data can result in a more accurately trained model. Consider increasing this (the default value is 10) if the training accuracy is low and the *Grad-Norm* in the display is large. verbose : bool, optional If True, prints progress updates and model details. seed : int, optional Seed for random number generation. Set this value to ensure that the same model is created every time. batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. Returns ------- out : ImageClassifier A trained :class:`ImageClassifier` model. Examples -------- ..
sourcecode:: python >>> model = turicreate.image_classifier.create(data, target='is_expensive') # Make predictions (in various forms) >>> predictions = model.predict(data) # predictions >>> predictions = model.classify(data) # predictions with confidence >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass) # Evaluate the model with ground truth data >>> results = model.evaluate(data) See Also -------- ImageClassifier """ start_time = _time.time() # Check model parameter allowed_models = list(_pre_trained_models.MODELS.keys()) if _mac_ver() >= (10,14): allowed_models.append('VisionFeaturePrint_Screen') _tkutl._check_categorical_option_type('model', model, allowed_models) # Check dataset parameter if len(dataset) == 0: raise _ToolkitError('Unable to train on empty dataset') if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) if(batch_size < 1): raise ValueError("'batch_size' must be greater than or equal to 1") if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): raise TypeError("Unrecognized value for 'validation_set'.") if feature is None: feature = _tkutl._find_only_image_column(dataset) feature_extractor = _image_feature_extractor._create_feature_extractor(model) # Extract features extracted_features = _tc.SFrame({ target: dataset[target], '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size), }) if isinstance(validation_set, _tc.SFrame): extracted_features_validation = _tc.SFrame({ target: validation_set[target], '__image_features__': feature_extractor.extract_features(validation_set, feature, verbose=verbose, batch_size=batch_size), }) else: extracted_features_validation = validation_set # Train a classifier using the extracted features extracted_features[target] = dataset[target] lr_model = _tc.logistic_classifier.create(extracted_features, features=['__image_features__'], target=target, max_iterations=max_iterations, validation_set=extracted_features_validation, seed=seed, verbose=verbose) # set input image shape if model in _pre_trained_models.MODELS: input_image_shape = _pre_trained_models.MODELS[model].input_image_shape else: # model == VisionFeaturePrint_Screen input_image_shape = (3, 299, 299) # Save the model state = { 'classifier': lr_model, 'model': model, 'max_iterations': max_iterations, 'feature_extractor': feature_extractor, 'input_image_shape': input_image_shape, 'target': target, 'feature': feature, 'num_features': 1, 'num_classes': lr_model.num_classes, 'classes': lr_model.classes, 'num_examples': lr_model.num_examples, 'training_time': _time.time() - start_time, 'training_loss': lr_model.training_loss, } return ImageClassifier(state)
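# ---------------------------------------------------------------------------
# Usage sketch (assumed data: 'my_images.sframe' and its 'label' column are
# placeholders, not files shipped with the toolkit): create() above performs
# transfer learning, so a small labeled SFrame is enough to train, inspect the
# class ordering the string target was mapped to, and export a Core ML model.
import turicreate as tc

data = tc.SFrame('my_images.sframe')                 # assumed path to a saved SFrame
model = tc.image_classifier.create(data, target='label',
                                   model='squeezenet_v1.1',
                                   max_iterations=20)
print(model.classes)                                 # order in which string labels were mapped
predictions = model.classify(data)                   # predicted class plus confidence per image
model.export_coreml('MyImageClassifier.mlmodel')     # deployable Core ML model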
def create(dataset, label=None, feature=None, model="resnet-50", verbose=True, batch_size=64): """ Create a :class:`ImageSimilarityModel` model. Parameters ---------- dataset : SFrame Input data. The column named by the 'feature' parameter will be extracted for modeling. label : string Name of the SFrame column with row labels to be used as uuid's to identify the data. If 'label' is set to None, row numbers are used to identify reference dataset rows when the model is queried. feature : string Name of the column containing the input images. 'None' (the default) indicates that the SFrame has only one column of Image type and that will be used for similarity. model: string, optional Uses a pretrained model to bootstrap an image similarity model - "resnet-50" : Uses a pretrained resnet model. - "squeezenet_v1.1" : Uses a pretrained squeezenet model. - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor. Only on available on iOS 12.0+, macOS 10.14+ and tvOS 12.0+. Models are downloaded from the internet if not available locally. Once downloaded, the models are cached for future use. verbose : bool, optional If True, print progress updates and model details. batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. Returns ------- out : ImageSimilarityModel A trained :class:`ImageSimilarityModel` model. See Also -------- ImageSimilarityModel Examples -------- .. sourcecode:: python # Train an image similarity model >>> model = turicreate.image_similarity.create(data) # Query the model for similar images >>> similar_images = model.query(data) +-------------+-----------------+-------------------+------+ | query_label | reference_label | distance | rank | +-------------+-----------------+-------------------+------+ | 0 | 0 | 0.0 | 1 | | 0 | 519 | 12.5319706301 | 2 | | 0 | 1619 | 12.5563764596 | 3 | | 0 | 186 | 12.6132604915 | 4 | | 0 | 1809 | 12.9180964745 | 5 | | 1 | 1 | 2.02304872852e-06 | 1 | | 1 | 1579 | 11.4288186151 | 2 | | 1 | 1237 | 12.3764325949 | 3 | | 1 | 80 | 12.7264363676 | 4 | | 1 | 58 | 12.7675058558 | 5 | +-------------+-----------------+-------------------+------+ [500 rows x 4 columns] """ start_time = _time.time() if not isinstance(dataset, _tc.SFrame): raise TypeError("'dataset' must be of type SFrame.") # Check parameters allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys()) if _mac_ver() >= (10, 14): allowed_models.append("VisionFeaturePrint_Scene") # Also, to make sure existing code doesn't break, replace incorrect name # with the correct name version if model == "VisionFeaturePrint_Screen": print( "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene. VisionFeaturePrint_Screen will be removed in future releases." 
) model = "VisionFeaturePrint_Scene" _tkutl._check_categorical_option_type("model", model, allowed_models) if len(dataset) == 0: raise _ToolkitError("Unable to train on empty dataset") if (label is not None) and (label not in dataset.column_names()): raise _ToolkitError("Row label column '%s' does not exist" % label) if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") # Set defaults if feature is None: feature = _tkutl._find_only_image_column(dataset) feature_extractor = _image_feature_extractor._create_feature_extractor( model) # Extract features extracted_features = _tc.SFrame({ "__image_features__": feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size), }) # Train a similarity model using the extracted features if label is not None: extracted_features[label] = dataset[label] nn_model = _tc.nearest_neighbors.create( extracted_features, label=label, features=["__image_features__"], verbose=verbose, ) # set input image shape if model in _pre_trained_models.IMAGE_MODELS: input_image_shape = _pre_trained_models.IMAGE_MODELS[ model].input_image_shape else: # model == VisionFeaturePrint_Scene input_image_shape = (3, 299, 299) # Save the model state = { "similarity_model": nn_model, "model": model, "feature_extractor": feature_extractor, "input_image_shape": input_image_shape, "label": label, "feature": feature, "num_features": 1, "num_examples": nn_model.num_examples, "training_time": _time.time() - start_time, } return ImageSimilarityModel(state)
def create(dataset, target, features=None, distance=None, verbose=True): """ Create a :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier` model. This model predicts the class of a query instance by finding the most common class among the query's nearest neighbors. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of Turi Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Dataset for training the model. target : str Name of the column containing the target variable. The values in this column must be of string or integer type. features : list[str], optional Name of the columns with features to use in comparing records. 'None' (the default) indicates that all columns except the target variable should be used. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: array of numeric (integer or float) values. Each array element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. distance : str, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~turicreate.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (str) 2. standard distance name (str) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. For sparse vectors, missing keys are assumed to have value 0.0. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : NearestNeighborClassifier A trained model of type :class:`~turicreate.nearest_neighbor_classifier.NearestNeighborClassifier`. See Also -------- NearestNeighborClassifier turicreate.toolkits.nearest_neighbors turicreate.toolkits.distances References ---------- - `Wikipedia - nearest neighbors classifier <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_ - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of Statistical Learning <https://web.stanford.edu/~hastie/ElemStatLearn/>`_. Vol. 2. New York. Springer. pp. 463-481. Examples -------- >>> sf = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) ... 
>>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species') As with the nearest neighbors toolkit, the nearest neighbor classifier accepts composite distance functions. >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7], ... [('height', 'weight'), 'manhattan', 1.6]] ... >>> model = turicreate.nearest_neighbor_classifier.create(sf, target='species', ... distance=my_dist) """ ## Set up ## ------ start_time = _time.time() ## Validation and preprocessing ## ---------------------------- ## 'dataset' must be a non-empty SFrame _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") ## 'target' must be a string, in 'dataset', and the type of the target must # be string or integer. if not isinstance(target, str) or target not in dataset.column_names(): raise _ToolkitError("The 'target' parameter must be the name of a " "column in the input dataset.") if not dataset[target].dtype == str and not dataset[target].dtype == int: raise TypeError("The target column must contain integers or strings.") ## Warn that 'None' values in the target may lead to ambiguous predictions. if dataset[target].countna() > 0: _logging.warning("Missing values detected in the target column. This " + "may lead to ambiguous 'None' predictions, if the " + "'radius' parameter is set too small in the prediction, " + "classification, or evaluation methods.") ## convert features and distance arguments into a composite distance ## NOTE: this is done here instead of in the nearest neighbors toolkit # because the automatic distance construction may be different for the two # toolkits. if features is None: _features = [x for x in dataset.column_names() if x != target] else: _features = [x for x in features if x != target] if isinstance(distance, list): distance = _copy.deepcopy(distance) elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] elif distance is None or distance == 'auto': col_types = {k: v for k, v in zip(dataset.column_names(), dataset.column_types())} distance = _construct_auto_distance(_features, col_types) else: raise TypeError("Input 'distance' not understood. The 'distance' " + "parameter must be a string or a composite distance, " + " or left unspecified.") ## Construct and query the nearest neighbors model ## ----------------------------------------------- knn_model = _tc.nearest_neighbors.create(dataset, label=target, distance=distance, verbose=verbose) ## Postprocessing and formatting ## ----------------------------- state = { 'verbose' : verbose, 'distance' : knn_model.distance, 'num_distance_components' : knn_model.num_distance_components, 'num_examples' : dataset.num_rows(), 'features' : knn_model.features, 'target': target, 'num_classes': len(dataset[target].unique()), 'num_features': knn_model.num_features, 'num_unpacked_features': knn_model.num_unpacked_features, 'training_time': _time.time() - start_time, '_target_type': dataset[target].dtype, } model = NearestNeighborClassifier(knn_model, state) return model
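# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): a composite distance is a
# weighted sum of standard distances, each applied to its own subset of
# features. The toy function below evaluates such a specification for two rows
# given as dictionaries; it mirrors the [feature_names, distance, weight]
# layout documented above but uses plain-Python euclidean and manhattan
# components rather than the toolkit's distance functions.
import math

def _euclidean(x, y):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

def _manhattan(x, y):
    return sum(abs(a - b) for a, b in zip(x, y))

def _demo_composite_distance(row_a, row_b, composite):
    total = 0.0
    for feature_names, dist_fn, weight in composite:
        x = [row_a[f] for f in feature_names]
        y = [row_b[f] for f in feature_names]
        total += weight * dist_fn(x, y)
    return total

my_dist = [[('height', 'weight'), _euclidean, 2.7],
           [('height', 'weight'), _manhattan, 1.6]]
row_a = {'height': 9, 'weight': 13}
row_b = {'height': 25, 'weight': 28}
print(_demo_composite_distance(row_a, row_b, my_dist))   # 2.7 * euclidean + 1.6 * manhattan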
def create( dataset, target, feature=None, model='resnet-50', l2_penalty=0.01, l1_penalty=0.0, solver='auto', feature_rescaling=True, convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'], step_size=_DEFAULT_SOLVER_OPTIONS['step_size'], lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'], class_weights=None, validation_set='auto', verbose=True, seed=None, batch_size=64): """ Create a :class:`ImageClassifier` model. Parameters ---------- dataset : SFrame Input data. The column named by the 'feature' parameter will be extracted for modeling. target : string, or int Name of the column containing the target variable. The values in this column must be of string or integer type. String target variables are automatically mapped to integers in the order in which they are provided. For example, a target variable with 'cat' and 'dog' as possible values is mapped to 0 and 1 respectively with 0 being the base class and 1 being the reference class. Use `model.classes` to retrieve the order in which the classes are mapped. feature : string, optional indicates that the SFrame has only column of Image type and that will Name of the column containing the input images. 'None' (the default) indicates the only image column in `dataset` should be used as the feature. l2_penalty : float, optional Weight on l2 regularization of the model. The larger this weight, the more the model coefficients shrink toward 0. This introduces bias into the model but decreases variance, potentially leading to better predictions. The default value is 0.01; setting this parameter to 0 corresponds to unregularized logistic regression. See the ridge regression reference for more detail. l1_penalty : float, optional Weight on l1 regularization of the model. Like the l2 penalty, the higher the l1 penalty, the more the estimated coefficients shrink toward 0. The l1 penalty, however, completely zeros out sufficiently small coefficients, automatically indicating features that are not useful for the model. The default weight of 0 prevents any features from being discarded. See the LASSO regression reference for more detail. solver : string, optional Name of the solver to be used to solve the regression. See the references for more detail on each solver. Available solvers are: - *auto (default)*: automatically chooses the best solver for the data and model parameters. - *newton*: Newton-Raphson - *lbfgs*: limited memory BFGS - *fista*: accelerated gradient descent For this model, the Newton-Raphson method is equivalent to the iteratively re-weighted least squares algorithm. If the l1_penalty is greater than 0, use the 'fista' solver. The model is trained using a carefully engineered collection of methods that are automatically picked based on the input data. The ``newton`` method works best for datasets with plenty of examples and few features (long datasets). Limited memory BFGS (``lbfgs``) is a robust solver for wide datasets (i.e datasets with many coefficients). ``fista`` is the default solver for l1-regularized linear regression. The solvers are all automatically tuned and the default options should function well. See the solver options guide for setting additional parameters for each of the solvers. See the user guide for additional details on how the solver is chosen. 
(see `here <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_) feature_rescaling : boolean, optional Feature rescaling is an important pre-processing step that ensures that all features are on the same scale. An l2-norm rescaling is performed to make sure that all features are of the same norm. Categorical features are also rescaled by rescaling the dummy variables that are used to represent them. The coefficients are returned in original scale of the problem. This process is particularly useful when features vary widely in their ranges. convergence_threshold : float, optional Convergence is tested using variation in the training objective. The variation in the training objective is calculated using the difference between the objective values between two steps. Consider reducing this below the default value (0.01) for a more accurately trained model. Beware of overfitting (i.e a model that works well only on the training data) if this parameter is set to a very low value. lbfgs_memory_level : float, optional The L-BFGS algorithm keeps track of gradient information from the previous ``lbfgs_memory_level`` iterations. The storage requirement for each of these gradients is the ``num_coefficients`` in the problem. Increasing the ``lbfgs_memory_level ``can help improve the quality of the model trained. Setting this to more than ``max_iterations`` has the same effect as setting it to ``max_iterations``. model : string optional Uses a pretrained model to bootstrap an image classifier: - "resnet-50" : Uses a pretrained resnet model. Exported Core ML model will be ~90M. - "squeezenet_v1.1" : Uses a pretrained squeezenet model. Exported Core ML model will be ~4.7M. - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor. Only on available on iOS 12.0+, macOS 10.14+ and tvOS 12.0+. Exported Core ML model will be ~41K. Models are downloaded from the internet if not available locally. Once downloaded, the models are cached for future use. step_size : float, optional The starting step size to use for the ``fista`` solver. The default is set to 1.0, this is an aggressive setting. If the first iteration takes a considerable amount of time, reducing this parameter may speed up model training. class_weights : {dict, `auto`}, optional Weights the examples in the training data according to the given class weights. If set to `None`, all classes are supposed to have weight one. The `auto` mode set the class weight to be inversely proportional to number of examples in the training data with the given class. validation_set : SFrame, optional A dataset for monitoring the model's generalization performance. The format of this SFrame must be the same as the training set. By default this argument is set to 'auto' and a validation set is automatically sampled and used for progress printing. If validation_set is set to None, then no additional metrics are computed. The default value is 'auto'. max_iterations : int, optional The maximum number of allowed passes through the data. More passes over the data can result in a more accurately trained model. Consider increasing this (the default value is 10) if the training accuracy is low and the *Grad-Norm* in the display is large. verbose : bool, optional If True, prints progress updates and model details. seed : int, optional Seed for random number generation. Set this value to ensure that the same model is created every time. batch_size : int, optional If you are getting memory errors, try decreasing this value. 
If you have a powerful computer, increasing this value may improve performance. Returns ------- out : ImageClassifier A trained :class:`ImageClassifier` model. Examples -------- .. sourcecode:: python >>> model = turicreate.image_classifier.create(data, target='is_expensive') # Make predictions (in various forms) >>> predictions = model.predict(data) # predictions >>> predictions = model.classify(data) # predictions with confidence >>> predictions = model.predict_topk(data) # Top-5 predictions (multiclass) # Evaluate the model with ground truth data >>> results = model.evaluate(data) See Also -------- ImageClassifier """ start_time = _time.time() # Check model parameter allowed_models = list(_pre_trained_models.MODELS.keys()) if _mac_ver() >= (10, 14): allowed_models.append('VisionFeaturePrint_Scene') # Also, to make sure existing code doesn't break, replace incorrect name # with the correct name version if model == "VisionFeaturePrint_Screen": print( "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene; VisionFeaturePrint_Screen will be removed in subsequent versions." ) model = "VisionFeaturePrint_Scene" _tkutl._check_categorical_option_type('model', model, allowed_models) # Check dataset parameter if len(dataset) == 0: raise _ToolkitError('Unable to train on empty dataset') if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) if (batch_size < 1): raise ValueError("'batch_size' must be greater than or equal to 1") if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): raise TypeError("Unrecognized value for 'validation_set'.") if feature is None: feature = _tkutl._find_only_image_column(dataset) feature_extractor = _image_feature_extractor._create_feature_extractor( model) # Extract features extracted_features = _tc.SFrame({ target: dataset[target], '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size), }) if isinstance(validation_set, _tc.SFrame): extracted_features_validation = _tc.SFrame({ target: validation_set[target], '__image_features__': feature_extractor.extract_features(validation_set, feature, verbose=verbose, batch_size=batch_size), }) else: extracted_features_validation = validation_set # Train a classifier using the extracted features extracted_features[target] = dataset[target] lr_model = _tc.logistic_classifier.create( extracted_features, features=['__image_features__'], target=target, max_iterations=max_iterations, validation_set=extracted_features_validation, seed=seed, verbose=verbose, l2_penalty=l2_penalty, l1_penalty=l1_penalty, solver=solver, feature_rescaling=feature_rescaling, convergence_threshold=convergence_threshold, step_size=step_size, lbfgs_memory_level=lbfgs_memory_level, class_weights=class_weights) # set input image shape if model in _pre_trained_models.MODELS: input_image_shape = _pre_trained_models.MODELS[model].input_image_shape else: # model == VisionFeaturePrint_Scene input_image_shape = (3, 299, 299) # Save the model state = { 'classifier': lr_model, 'model': model, 'max_iterations': max_iterations, 'feature_extractor': feature_extractor, 'input_image_shape': input_image_shape, 'target': target, 'feature': feature, 'num_features': 1, 'num_classes': lr_model.num_classes, 'classes': lr_model.classes, 'num_examples': 
lr_model.num_examples, 'training_time': _time.time() - start_time, 'training_loss': lr_model.training_loss, } return ImageClassifier(state)
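# ---------------------------------------------------------------------------
# Usage sketch (assumed data: 'photos.sframe' and its 'label' column are
# placeholders): the extended create() above forwards the logistic-regression
# options to the underlying classifier, so regularization and class weighting
# can be tuned directly. The penalty values below are illustrative starting
# points, not recommendations.
import turicreate as tc

data = tc.SFrame('photos.sframe')                     # assumed path
train, valid = data.random_split(0.8, seed=42)

model = tc.image_classifier.create(train,
                                   target='label',
                                   l2_penalty=0.05,        # stronger shrinkage than the 0.01 default
                                   l1_penalty=0.0,
                                   class_weights='auto',   # reweight imbalanced classes
                                   validation_set=valid,
                                   max_iterations=25)
print(model.evaluate(valid)['accuracy'])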
def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None): """ Evaluate the model's predictive accuracy. This is done by predicting the target class for instances in a new dataset and comparing to known target values. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the target and features used for model training. Additional columns are ignored. metric : str, optional Name of the evaluation metric. Possible values are: - 'auto': Returns all available metrics. - 'accuracy': Classification accuracy. - 'confusion_matrix': An SFrame with counts of possible prediction/true label combinations. - 'roc_curve': An SFrame containing information needed for an roc curve (binary classification only). max_neighbors : int, optional Maximum number of neighbors to consider for each point. radius : float, optional Maximum distance from each point to a neighbor in the reference dataset. Returns ------- out : dict Evaluation results. The dictionary keys are *accuracy* and *confusion_matrix* and *roc_curve* (if applicable). See also -------- create, predict, predict_topk, classify Notes ----- - Because the model randomly breaks ties between predicted classes, the results of repeated calls to `evaluate` method may differ. Examples -------- >>> sf_train = turicreate.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) >>> m = turicreate.nearest_neighbor_classifier.create(sf, target='species') >>> ans = m.evaluate(sf_train, max_neighbors=2, ... metric='confusion_matrix') >>> print ans['confusion_matrix'] +--------------+-----------------+-------+ | target_label | predicted_label | count | +--------------+-----------------+-------+ | cat | dog | 1 | | dog | dog | 2 | | fossa | dog | 1 | +--------------+-----------------+-------+ """ ## Validate the metric name _raise_error_evaluation_metric_is_valid(metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve']) ## Make sure the input dataset has a target column with an appropriate # type. target = self.target _raise_error_if_column_exists(dataset, target, 'dataset', target) if not dataset[target].dtype == str and not dataset[target].dtype == int: raise TypeError("The target column of the evaluation dataset must " "contain integers or strings.") if self.num_classes != 2: if (metric == 'roc_curve') or (metric == ['roc_curve']): err_msg = "Currently, ROC curve is not supported for " err_msg += "multi-class classification in this model." raise _ToolkitError(err_msg) else: warn_msg = "WARNING: Ignoring `roc_curve`. " warn_msg += "Not supported for multi-class classification." print(warn_msg) ## Compute predictions with the input dataset. ystar = self.predict(dataset, output_type='class', max_neighbors=max_neighbors, radius=radius) ystar_prob = self.predict(dataset, output_type='probability', max_neighbors=max_neighbors, radius=radius) ## Compile accuracy metrics results = {} if metric in ['accuracy', 'auto']: results['accuracy'] = _evaluation.accuracy(targets=dataset[target], predictions=ystar) if metric in ['confusion_matrix', 'auto']: results['confusion_matrix'] = \ _evaluation.confusion_matrix(targets=dataset[target], predictions=ystar) if self.num_classes == 2: if metric in ['roc_curve', 'auto']: results['roc_curve'] = \ _evaluation.roc_curve(targets=dataset[target], predictions=ystar_prob) return results
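# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): the confusion matrix returned
# by evaluate() above is an SFrame with 'target_label', 'predicted_label' and
# 'count' columns, so per-class recall can be recovered with two groupbys and
# a join. 'm' and 'sf_train' follow the example in the docstring above.
import turicreate as tc

results = m.evaluate(sf_train, max_neighbors=2, metric='confusion_matrix')
conf_mat = results['confusion_matrix']

per_class = conf_mat.groupby('target_label', {'total': tc.aggregate.SUM('count')})
correct = conf_mat[conf_mat['target_label'] == conf_mat['predicted_label']] \
    .groupby('target_label', {'correct': tc.aggregate.SUM('count')})

recall = per_class.join(correct, on='target_label', how='left').fillna('correct', 0)
recall['recall'] = recall['correct'] * 1.0 / recall['total']
print(recall)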
def evaluate(self, dataset, metric='auto', output_type='dict', verbose=True): """ Evaluate the model by making predictions and comparing these to ground truth bounding box annotations. Parameters ---------- dataset : SFrame Dataset of new observations. Must include columns with the same names as the annotations and feature used for model training. Additional columns are ignored. metric : str or list, optional Name of the evaluation metric or list of several names. The primary metric is average precision, which is the area under the precision/recall curve and reported as a value between 0 and 1 (1 being perfect). Possible values are: - 'auto' : Returns all primary metrics. - 'all' : Returns all available metrics. - 'average_precision' : Average precision per class calculated over multiple intersection-over-union thresholds (at 50%, 55%, ..., 95%) and averaged. - 'average_precision_50' : Average precision per class with intersection-over-union threshold at 50% (PASCAL VOC metric). - 'mean_average_precision' : Mean over all classes (for ``'average_precision'``) This is the primary single-value metric. - 'mean_average_precision_50' : Mean over all classes (for ``'average_precision_50'``). output_type : str Type of output: - 'dict' : You are given a dictionary where each key is a metric name and the value is another dictionary containing class-to-metric entries. - 'sframe' : All metrics are returned as a single `SFrame`, where each row is a class and each column is a metric. Metrics that are averaged over class cannot be returned and are ignored under this format. However, these are easily computed from the `SFrame` (e.g. ``results['average_precision'].mean()``). verbose : bool If True, prints evaluation progress. Returns ------- out : dict / SFrame Output type depends on the option `output_type`. See Also -------- create, predict Examples -------- >>> results = model.evaluate(data) >>> print('mAP: {:.1%}'.format(results['mean_average_precision'])) mAP: 43.2% """ AP = 'average_precision' MAP = 'mean_average_precision' AP50 = 'average_precision_50' MAP50 = 'mean_average_precision_50' ALL_METRICS = {AP, MAP, AP50, MAP50} if isinstance(metric, (list, tuple, set)): metrics = metric elif metric == 'all': metrics = ALL_METRICS elif metric == 'auto': metrics = {AP, MAP} elif metric in ALL_METRICS: metrics = {metric} else: raise _ToolkitError("Metric '{}' not supported".format(metric)) pred, gt = self._predict_with_options(dataset, with_ground_truth=True, verbose=verbose) pred_df = pred.to_dataframe() gt_df = gt.to_dataframe() thresholds = _np.arange(0.5, 1.0, 0.05) all_th_aps = _average_precision(pred_df, gt_df, class_to_index=self._class_to_index, iou_thresholds=thresholds) def class_dict(aps): return {classname: aps[index] for classname, index in self._class_to_index.items()} if output_type == 'dict': ret = {} if AP50 in metrics: ret[AP50] = class_dict(all_th_aps[0]) if AP in metrics: ret[AP] = class_dict(all_th_aps.mean(0)) if MAP50 in metrics: ret[MAP50] = all_th_aps[0].mean() if MAP in metrics: ret[MAP] = all_th_aps.mean() elif output_type == 'sframe': ret = _tc.SFrame({'label': self.classes}) if AP50 in metrics: ret[AP50] = all_th_aps[0] if AP in metrics: ret[AP] = all_th_aps.mean(0) else: raise _ToolkitError("Output type '{}' not supported".format(output_type)) return ret
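# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): the average-precision metrics
# above are thresholded on intersection-over-union (IoU) between predicted and
# ground-truth boxes. The helper below computes IoU for two axis-aligned boxes
# given as (x, y, width, height) with (x, y) the box center, matching the
# center-based annotation format used by this toolkit.
def _demo_iou(box_a, box_b):
    def to_corners(box):
        x, y, w, h = box
        return x - w / 2.0, y - h / 2.0, x + w / 2.0, y + h / 2.0

    ax0, ay0, ax1, ay1 = to_corners(box_a)
    bx0, by0, bx1, by1 = to_corners(box_b)

    inter_w = max(0.0, min(ax1, bx1) - max(ax0, bx0))
    inter_h = max(0.0, min(ay1, by1) - max(ay0, by0))
    intersection = inter_w * inter_h
    union = box_a[2] * box_a[3] + box_b[2] * box_b[3] - intersection
    return intersection / union if union > 0 else 0.0

# A prediction counts as a true positive for 'average_precision_50' when its
# IoU with a ground-truth box of the same class is at least 0.5.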
def predict(self, dataset, output_type='class', verbose=True, batch_size=64): """ Return predictions for ``dataset``. Predictions can be generated as class labels or probabilities. Parameters ---------- dataset : SFrame | SArray | dict The audio data to be classified. If dataset is an SFrame, it must have a column with the same name as the feature used for model training, but does not require a target column. Additional columns are ignored. output_type : {'probability', 'class', 'probability_vector'}, optional Form of the predictions, which is one of: - 'class': Class prediction. For multi-class classification, this returns the class with maximum probability. - 'probability': Prediction probability associated with the True class (not applicable for multi-class classification) - 'probability_vector': Prediction probability associated with each class as a vector. Label ordering is dictated by the ``classes`` member variable. verbose : bool, optional If True, prints progress updates and model details. batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. Returns ------- out : SArray An SArray with the predictions. See Also ---------- evaluate, classify Examples ---------- >>> probability_predictions = model.predict(data, output_type='probability') >>> prediction_vector = model.predict(data, output_type='probability_vector') >>> class_predictions = model.predict(data, output_type='class') """ import mxnet as mx if not isinstance(dataset, (_tc.SFrame, _tc.SArray, dict)): raise TypeError('\'dataset\' parameter must be either an SFrame, SArray or dictionary') if isinstance(dataset, dict): if(set(dataset.keys()) != {'sample_rate', 'data'}): raise ValueError('\'dataset\' parameter is a dictionary but does not appear to be audio data.') dataset = _tc.SArray([dataset]) elif isinstance(dataset, _tc.SFrame): dataset = dataset[self.feature] if not _is_deep_feature_sarray(dataset) and not _is_audio_data_sarray(dataset): raise ValueError('\'dataset\' must be either audio data or audio deep features.') if output_type not in ('probability', 'probability_vector', 'class'): raise ValueError('\'output_type\' parameter must be one of: \'probability\', \'probability_vector\', or \'class\'') if output_type == 'probability' and self.num_classes != 2: raise _ToolkitError('Output type \'probability\' is only supported for binary' ' classification. For multi-class classification, use' ' predict_topk() instead.') if(batch_size < 1): raise ValueError("'batch_size' must be greater than or equal to 1") if _is_deep_feature_sarray(dataset): deep_features = dataset else: deep_features = get_deep_features(dataset, verbose=verbose) deep_features = _tc.SFrame({'deep features': deep_features}) deep_features = deep_features.add_row_number() deep_features = deep_features.stack('deep features', new_column_name='deep features') deep_features, missing_ids = deep_features.dropna_split(columns=['deep features']) if len(missing_ids) > 0: _logging.warning("Unable to make predictions for %d examples because they are less than 975ms in length."
% len(missing_ids)) if batch_size > len(deep_features): batch_size = len(deep_features) y = [] for batch in mx.io.NDArrayIter(deep_features['deep features'].to_numpy(), batch_size=batch_size): ctx = _mxnet_utils.get_mxnet_context() if(len(batch.data[0]) < len(ctx)): ctx = ctx[:len(batch.data[0])] batch_data = batch.data[0] if batch.pad != 0: batch_data = batch_data[:-batch.pad] # prevent batches looping back batch_data = mx.gluon.utils.split_and_load(batch_data, ctx_list=ctx, batch_axis=0, even_split=False) for x in batch_data: forward_output = self._custom_classifier.forward(x) y += mx.nd.softmax(forward_output).asnumpy().tolist() assert(len(y) == len(deep_features)) # Combine predictions from multiple frames sf = _tc.SFrame({'predictions': y, 'id': deep_features['id']}) probabilities_sum = sf.groupby('id', {'prob_sum': _tc.aggregate.SUM('predictions')}) if output_type == 'class': predicted_ids = probabilities_sum['prob_sum'].apply(lambda x: _np.argmax(x)) mappings = self._id_to_class_label probabilities_sum['results'] = predicted_ids.apply(lambda x: mappings[x]) else: assert output_type in ('probability', 'probability_vector') frame_per_example_count = sf.groupby('id', _tc.aggregate.COUNT()) probabilities_sum = probabilities_sum.join(frame_per_example_count) probabilities_sum['results'] = probabilities_sum.apply(lambda row: [i / row['Count'] for i in row['prob_sum']]) if len(missing_ids) > 0: output_type = probabilities_sum['results'].dtype missing_predictions = _tc.SFrame({'id': missing_ids['id'], 'results': _tc.SArray([ None ] * len(missing_ids), dtype=output_type) }) probabilities_sum = probabilities_sum[['id', 'results']].append(missing_predictions) probabilities_sum = probabilities_sum.sort('id') return probabilities_sum['results']
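# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): an audio example longer than
# one analysis window yields several frame-level probability vectors, and
# predict() above averages them per example (the summed softmax outputs
# divided by the frame count). The toy function below reproduces that
# aggregation for frame predictions tagged with their example id.
import numpy as np

def _demo_average_frame_probabilities(example_ids, frame_probs):
    """example_ids: one id per frame; frame_probs: per-frame probability lists."""
    sums, counts = {}, {}
    for example_id, probs in zip(example_ids, np.asarray(frame_probs, dtype=float)):
        sums[example_id] = sums.get(example_id, 0.0) + probs
        counts[example_id] = counts.get(example_id, 0) + 1
    return {example_id: (sums[example_id] / counts[example_id]).tolist()
            for example_id in sums}

# Example: two frames belong to example 0 and one frame to example 1.
print(_demo_average_frame_probabilities(
    [0, 0, 1],
    [[0.5, 0.5], [1.0, 0.0], [0.25, 0.75]]))
# {0: [0.75, 0.25], 1: [0.25, 0.75]}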