def _get_data(feature, annotations):
    """Build a synthetic, reproducible object-detection SFrame for tests.

    Generates 100 random RGB images (each side between 100 and 999 pixels),
    stored in a randomly chosen format (PNG, JPEG or raw), together with a
    list of 0-9 random rectangle annotations per image whose labels are drawn
    from ``_CLASSES``. A fixed seed (1234) makes the output deterministic.

    Parameters
    ----------
    feature : str
        Name of the output column holding the images.
    annotations : str
        Name of the output column holding the per-image annotation lists.

    Returns
    -------
    out : tc.SFrame
        SFrame with the two columns named by ``feature`` and ``annotations``.
    """
    from PIL import Image as _PIL_Image

    rs = np.random.RandomState(1234)

    def from_pil_image(pil_img, image_format="png"):
        """Convert a PIL image into a ``tc.Image`` of the requested format."""
        if image_format == "raw":
            image = np.array(pil_img)
            FORMAT_RAW = 2
            return tc.Image(
                _image_data=image.tobytes(),
                _width=image.shape[1],
                _height=image.shape[0],
                _channels=image.shape[2],
                _format_enum=FORMAT_RAW,
                _image_data_size=image.size,
            )

        # Encoded formats go through a temporary file that tc.Image reads
        # back by path.
        with tempfile.NamedTemporaryFile(mode="w+b",
                                         suffix="." + image_format) as f:
            pil_img.save(f, format=image_format)
            # Make sure every buffered byte is on disk before the file is
            # reopened by name; otherwise the reader may see a truncated file.
            f.flush()
            return tc.Image(f.name)

    num_examples = 100
    max_num_boxes_per_image = 10
    classes = _CLASSES
    images = []
    anns = []
    FORMATS = ["png", "jpeg", "raw"]

    for _ in range(num_examples):
        # Randomly determine image size (should handle large and small)
        img_shape = tuple(rs.randint(100, 1000, size=2)) + (3, )
        # NOTE(review): `img` has NumPy's default integer dtype here, not
        # uint8. With an explicit mode PIL appears to reinterpret the raw
        # buffer rather than convert it - confirm whether a uint8 cast is
        # wanted before changing it, since that alters the generated pixels.
        img = rs.randint(255, size=img_shape)
        pil_img = _PIL_Image.fromarray(img, mode="RGB")

        # Randomly select image format
        image_format = FORMATS[rs.randint(len(FORMATS))]
        images.append(from_pil_image(pil_img, image_format=image_format))

        ann = []
        for _ in range(rs.randint(max_num_boxes_per_image)):
            # Sorting guarantees left <= right and top <= bottom.
            left, right = np.sort(rs.randint(0, img_shape[1], size=2))
            top, bottom = np.sort(rs.randint(0, img_shape[0], size=2))

            # Annotations use center coordinates plus width/height; the
            # extents are clamped to at least 1 to avoid degenerate boxes.
            x = (left + right) / 2
            y = (top + bottom) / 2
            width = max(right - left, 1)
            height = max(bottom - top, 1)

            label = {
                "coordinates": {
                    "x": x,
                    "y": y,
                    "width": width,
                    "height": height,
                },
                "label": classes[rs.randint(len(classes))],
                "type": "rectangle",
            }
            ann.append(label)
        anns.append(ann)

    data = tc.SFrame({
        feature: tc.SArray(images),
        annotations: tc.SArray(anns),
    })
    return data
def predict_topk(self, dataset, output_type='probability', k=3, verbose=True, batch_size=64):
    """
    Return top-k predictions for the ``dataset``.
    Predictions are returned as an SFrame with three columns: `id`, `class`,
    and `probability` or `rank` depending on the ``output_type`` parameter.

    Parameters
    ----------
    dataset : SFrame | SArray | dict
        The audio data to be classified.
        If dataset is an SFrame, it must have a column with the same name as
        the feature used for model training, but does not require a target
        column. Additional columns are ignored.

    output_type : {'probability', 'rank'}, optional
        Choose the return type of the prediction:
        - `probability`: Probability associated with each label in the prediction.
        - `rank`       : Rank associated with each label in the prediction
          (0 for the most probable class).

    k : int, optional
        Number of classes to return for each input example.

    verbose : bool, optional
        If True, prints progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    Raises
    ------
    ValueError
        If ``output_type`` is neither 'probability' nor 'rank'.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +------+-------+-------------------+
    |  id  | class |    probability    |
    +------+-------+-------------------+
    |  0   |   4   |   0.995623886585  |
    |  0   |   9   |  0.0038311756216  |
    |  0   |   7   | 0.000301006948575 |
    |  1   |   1   |   0.928708016872  |
    |  1   |   3   |  0.0440889261663  |
    |  1   |   2   |  0.0176190119237  |
    |  2   |   3   |   0.996967732906  |
    |  2   |   2   |  0.00151345680933 |
    |  2   |   7   | 0.000637513934635 |
    |  3   |   1   |   0.998070061207  |
    | ...  |  ...  |        ...        |
    +------+-------+-------------------+
    """
    # Reject bad options before running the (potentially expensive)
    # prediction, and do not rely on `assert`, which is stripped under -O.
    if output_type not in ('probability', 'rank'):
        raise ValueError(
            "'output_type' must be either 'probability' or 'rank'.")

    # Forward the caller's batch_size instead of a hard-coded value.
    prob_vector = self.predict(dataset, output_type='probability_vector',
                               verbose=verbose, batch_size=batch_size)
    id_to_label = self._id_to_class_label

    # argsort is ascending, so the last k indices reversed give the k most
    # probable classes in descending order of probability.
    if output_type == 'probability':
        results = prob_vector.apply(lambda p: [
            {'class': id_to_label[i], 'probability': p[i]}
            for i in reversed(_np.argsort(p)[-k:])])
    else:
        results = prob_vector.apply(lambda p: [
            {'class': id_to_label[i], 'rank': rank}
            for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))])

    # One list of dicts per example -> one row per (example, class) pair,
    # with the dict keys unpacked into columns.
    results = _tc.SFrame({'X': results})
    results = results.add_row_number()
    results = results.stack('X', new_column_name='X')
    results = results.unpack('X', column_name_prefix='')
    return results
def create(dataset, label=None, feature=None, model='resnet-50', verbose=True, batch_size=64):
    """
    Create a :class:`ImageSimilarityModel` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    label : string, optional
        Name of the SFrame column with row labels to be used as uuid's to
        identify the data. If 'label' is set to None, row numbers are used to
        identify reference dataset rows when the model is queried.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates that the SFrame has only one column of Image type and that
        column will be used for similarity.

    model : string, optional
        Uses a pretrained model to bootstrap an image similarity model

           - "resnet-50" : Uses a pretrained resnet model.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.

           - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor.
                                          Only available on iOS 12.0+,
                                          macOS 10.14+ and tvOS 12.0+.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    verbose : bool, optional
        If True, print progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : ImageSimilarityModel
        A trained :class:`ImageSimilarityModel` model.

    See Also
    --------
    ImageSimilarityModel

    Examples
    --------
    .. sourcecode:: python

        # Train an image similarity model
        >>> model = turicreate.image_similarity.create(data)

        # Query the model for similar images
        >>> similar_images = model.query(data)
        +-------------+-----------------+-------------------+------+
        | query_label | reference_label |      distance     | rank |
        +-------------+-----------------+-------------------+------+
        |      0      |        0        |        0.0        |  1   |
        |      0      |       519       |   12.5319706301   |  2   |
        |      0      |       1619      |   12.5563764596   |  3   |
        |      0      |       186       |   12.6132604915   |  4   |
        |      0      |       1809      |   12.9180964745   |  5   |
        |      1      |        1        | 2.02304872852e-06 |  1   |
        |      1      |       1579      |   11.4288186151   |  2   |
        |      1      |       1237      |   12.3764325949   |  3   |
        |      1      |        80       |   12.7264363676   |  4   |
        |      1      |        58       |   12.7675058558   |  5   |
        +-------------+-----------------+-------------------+------+
        [500 rows x 4 columns]
    """
    start_time = _time.time()

    # --- Parameter validation ---------------------------------------------
    # The OS-provided extractor is only offered on macOS 10.14 or newer.
    supported_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        supported_models.append('VisionFeaturePrint_Screen')
    _tkutl._check_categorical_option_type('model', model, supported_models)

    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if label is not None and label not in dataset.column_names():
        raise _ToolkitError("Row label column '%s' does not exist" % label)
    if feature is not None and feature not in dataset.column_names():
        raise _ToolkitError("Image feature column '%s' does not exist" % feature)
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # --- Defaults ----------------------------------------------------------
    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(model)

    # --- Feature extraction -------------------------------------------------
    image_features = feature_extractor.extract_features(
        dataset, feature, verbose=verbose, batch_size=batch_size)
    extracted_features = _tc.SFrame({'__image_features__': image_features})

    # --- Nearest-neighbors model on the extracted features ------------------
    if label is not None:
        extracted_features[label] = dataset[label]
    nn_model = _tc.nearest_neighbors.create(
        extracted_features,
        label=label,
        features=['__image_features__'],
        verbose=verbose)

    # Input image shape: taken from the pretrained-model registry when the
    # model is listed there; otherwise (VisionFeaturePrint_Screen) a fixed
    # shape is used.
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:
        input_image_shape = (3, 299, 299)

    # --- Assemble the model state -------------------------------------------
    state = {
        'similarity_model': nn_model,
        'model': model,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'label': label,
        'feature': feature,
        'num_features': 1,
        'num_examples': nn_model.num_examples,
        'training_time': _time.time() - start_time,
    }
    return ImageSimilarityModel(state)
def create(
    dataset,
    features=None,
    distance=None,
    radius=1.0,
    min_core_neighbors=10,
    verbose=True,
):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given
      neighborhood. Specifically, `min_core_neighbors` must be within distance
      `radius` of a point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have
      too few neighbors to be considered core points, and are further than
      distance `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may
        have additional columns as well.

    features : list[str], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns of the input `dataset` should
        be used to train the model. All features must be numeric, i.e. integer
        or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', or
          'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is constructed
        automatically based on feature types.

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point in
        order for that point to be considered a "core point" of a cluster.

    verbose : bool, optional
        If True, print progress updates and model details during model
        creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input `dataset`.
        Also contains the indices of the core points, cluster boundary points,
        and noise points.

    See Also
    --------
    DBSCANModel, turicreate.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on the
      input dataset, which can be a computationally intensive process. In the
      current implementation, some distances are substantially faster than
      others; in particular "euclidean", "squared_euclidean", "cosine", and
      "transformed_dot_product" are quite fast, while composite distances can
      be slow.

    - Any distance function in the Turi Create library may be used with DBSCAN
      but the results may be poor for distances that violate the standard
      metric properties, i.e. symmetry, non-negativity, triangle inequality,
      and identity of indiscernibles. In particular, the DBSCAN algorithm is
      based on the concept of connecting high-density points that are *close*
      to each other into a single cluster, but the notion of *close* may be
      very counterintuitive if the chosen distance function is not a valid
      metric. The distances "euclidean", "manhattan", "jaccard", and
      "levenshtein" will likely yield the best results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_. In Proceedings
      of the Second International Conference on Knowledge Discovery and Data
      Mining. pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model.cluster_id.print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """
    # Logger for progress messages; the clock feeds 'training_time'.
    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    # --- Input validation -----------------------------------------------------
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError(
            "Input 'min_core_neighbors' must be a non-negative integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError(
            "Input 'radius' must be a non-negative integer or float.")

    # --- All-pairs neighbors within `radius`, and neighborhood sizes ----------
    nn_model = _tc.nearest_neighbors.create(
        dataset,
        features=features,
        distance=distance,
        method="brute_force",
        verbose=verbose,
    )

    neighbor_edges = nn_model.similarity_graph(
        k=None,
        radius=radius,
        include_self_edges=False,
        output_type="SFrame",
        verbose=verbose,
    )

    # NOTE: a point with no neighbors has no edges, so it never shows up in
    # these counts at all.
    neighbor_counts = neighbor_edges.groupby("query_label", _agg.COUNT)

    # --- Core points and boundary candidates ----------------------------------
    # Boundary candidates are not all true boundary points: some sit in small
    # isolated clusters and are reclassified as noise further down.
    if verbose:
        logger.info("Identifying noise points and core points.")

    boundary_mask = neighbor_counts["Count"] < min_core_neighbors
    core_mask = 1 - boundary_mask

    # this includes too small clusters
    boundary_idx = neighbor_counts[boundary_mask]["query_label"]
    core_idx = neighbor_counts[core_mask]["query_label"]

    # --- Similarity graph restricted to core points ---------------------------
    # Singleton core points lose all their edges in the second filter, so the
    # vertices are added explicitly from the first filter's result.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    core_vertices = neighbor_edges.filter_by(core_idx, "query_label")
    core_edges = core_vertices.filter_by(core_idx, "reference_label")

    core_graph = _tc.SGraph().add_vertices(
        core_vertices[["query_label"]], vid_field="query_label")
    core_graph = core_graph.add_edges(
        core_edges, src_field="query_label", dst_field="reference_label")

    # --- Connected components, relabelled as consecutive integers -------------
    components = _tc.connected_components.create(core_graph, verbose=verbose)
    component_labels = components.component_size.add_row_number("__label")

    core_assignments = components.component_id.join(
        component_labels, on="component_id", how="left")
    core_assignments = core_assignments[["__id", "__label"]]
    core_assignments["type"] = "core"

    # --- Attach boundary candidates to core clusters --------------------------
    # Candidates with no core neighbor drop out implicitly in the filter.
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = neighbor_edges.filter_by(boundary_idx, "query_label")

    # Keep only the edges that lead to a core point.
    boundary_core_edges = boundary_edges.filter_by(core_idx, "reference_label")

    # Each boundary point joins the cluster of its single closest core point.
    closest_core = {"reference_label": _agg.ARGMIN("rank", "reference_label")}
    boundary_assignments = boundary_core_edges.groupby("query_label",
                                                       closest_core)

    boundary_assignments = boundary_assignments.join(
        core_assignments, on={"reference_label": "__id"})

    boundary_assignments = boundary_assignments.rename(
        {"query_label": "__id"}, inplace=True)
    boundary_assignments = boundary_assignments.remove_column(
        "reference_label", inplace=True)
    boundary_assignments["type"] = "boundary"

    # Candidates that never reached a core point belong to small clusters,
    # not to a real cluster boundary.
    small_cluster_idx = set(boundary_idx).difference(
        boundary_assignments["__id"])

    # --- Noise points -----------------------------------------------------------
    # Rows that never appear as a query label have no neighbors at all.
    all_row_ids = set(range(dataset.num_rows()))
    noise_idx = all_row_ids.difference(neighbor_counts["query_label"])
    noise_idx = noise_idx.union(small_cluster_idx)

    noise_assignments = _tc.SFrame(
        {"row_id": _tc.SArray(list(noise_idx), int)})
    # Noise rows get a missing cluster id, cast to int to match the others.
    noise_assignments["cluster_id"] = None
    noise_assignments["cluster_id"] = \
        noise_assignments["cluster_id"].astype(int)
    noise_assignments["type"] = "noise"

    # --- Combine core, boundary and noise results -----------------------------
    final_names = {"__id": "row_id", "__label": "cluster_id"}
    master_assignments = _tc.SFrame()
    num_clusters = 0

    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename(dict(final_names),
                                                   inplace=True)
        master_assignments = master_assignments.append(core_assignments)
        num_clusters = len(core_assignments["cluster_id"].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename(dict(final_names),
                                                           inplace=True)
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)

    # --- Model state --------------------------------------------------------------
    state = {
        "verbose": verbose,
        "radius": radius,
        "min_core_neighbors": min_core_neighbors,
        "distance": nn_model.distance,
        "num_distance_components": nn_model.num_distance_components,
        "num_examples": dataset.num_rows(),
        "features": nn_model.features,
        "num_features": nn_model.num_features,
        "unpacked_features": nn_model.unpacked_features,
        "num_unpacked_features": nn_model.num_unpacked_features,
        "cluster_id": master_assignments,
        "num_clusters": num_clusters,
        "training_time": _time.time() - start_time,
    }

    return DBSCANModel(state)
def create(dataset, target, feature, max_iterations=10,
           custom_layer_sizes=[100, 100], verbose=True,
           validation_set='auto', batch_size=64):
    '''
    Creates a :class:`SoundClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string
        Name of the column containing the feature column. This column must
        contain audio data or deep audio features.
        Audio data is represented as dicts with key 'data' and 'sample_rate',
        see `turicreate.load_audio(...)`.
        Deep audio features are represented as a list of numpy arrays, each of
        size 12288, see `turicreate.sound_classifier.get_deep_features(...)`.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes over
        the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low.

    custom_layer_sizes : list of ints
        Specifies the architecture of the custom neural network. This neural
        network is made up of a series of dense layers. This parameter allows
        you to specify how many layers and the number of units in each layer.
        The custom neural network will always have one more layer than the
        length of this list. The last layer is always a soft max with units
        equal to the number of classes.

    verbose : bool, optional
        If True, prints progress updates and model details.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. The
        format of this SFrame must be the same as the training dataset. By
        default ('auto'), a validation set is automatically sampled when the
        dataset has at least 100 rows. If `validation_set` is set to None, no
        validation is used. You can also pass a validation set you have
        constructed yourself.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : SoundClassifier
        The trained model. Its 'training_accuracy' and 'validation_accuracy'
        state entries are None when no training iteration was run, and
        'validation_accuracy' is also None when no validation set is used.
    '''
    import time
    from .._mxnet import _mxnet_utils
    import mxnet as mx

    from ._audio_feature_extractor import _get_feature_extractor

    start_time = time.time()

    # check parameters
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if feature not in dataset.column_names():
        raise _ToolkitError("Audio feature column '%s' does not exist" % feature)
    if not _is_deep_feature_sarray(dataset[feature]) and \
            not _is_audio_data_sarray(dataset[feature]):
        raise _ToolkitError("'%s' column is not audio data." % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)
    if not _tc.util._is_non_string_iterable(custom_layer_sizes) or \
            len(custom_layer_sizes) == 0:
        raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.")
    for i in custom_layer_sizes:
        if not isinstance(i, int):
            raise _ToolkitError(
                "'custom_layer_sizes' must contain only integers.")
    if not (isinstance(validation_set, _tc.SFrame) or
            validation_set == 'auto' or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'")
    if isinstance(validation_set, _tc.SFrame):
        if feature not in validation_set.column_names() or \
                target not in validation_set.column_names():
            raise ValueError(
                "The 'validation_set' SFrame must be in the same format as the 'dataset'"
            )
    if batch_size < 1:
        raise ValueError('\'batch_size\' must be greater than or equal to 1')

    classes = list(dataset[target].unique().sort())
    num_labels = len(classes)
    if num_labels <= 1:
        raise ValueError('The number of classes must be greater than one.')
    feature_extractor_name = 'VGGish'
    feature_extractor = _get_feature_extractor(feature_extractor_name)
    class_label_to_id = {l: i for i, l in enumerate(classes)}

    # create the validation set
    if not isinstance(validation_set, _tc.SFrame) and validation_set == 'auto':
        if len(dataset) >= 100:
            print(
                "Creating a validation set from 5 percent of training data. This may take a while.\n"
                "\tYou can set ``validation_set=None`` to disable validation tracking.\n"
            )
            dataset, validation_set = dataset.random_split(0.95, exact=True)
        else:
            validation_set = None

    encoded_target = dataset[target].apply(lambda x: class_label_to_id[x])

    if _is_deep_feature_sarray(dataset[feature]):
        train_deep_features = dataset[feature]
    else:
        # do the preprocess and VGGish feature extraction
        train_deep_features = get_deep_features(dataset[feature],
                                                verbose=verbose)

    train_data = _tc.SFrame({
        'deep features': train_deep_features,
        'labels': encoded_target
    })
    # One row per deep-feature vector; examples that produced no feature
    # vector end up as missing rows and are dropped.
    train_data = train_data.stack('deep features',
                                  new_column_name='deep features')
    train_data, missing_ids = train_data.dropna_split(
        columns=['deep features'])

    if len(missing_ids) > 0:
        _logging.warning(
            "Dropping %d examples which are less than 975ms in length." %
            len(missing_ids))

    if validation_set is not None:
        if verbose:
            print("Preparing validataion set")
        validation_encoded_target = validation_set[target].apply(
            lambda x: class_label_to_id[x])

        if _is_deep_feature_sarray(validation_set[feature]):
            validation_deep_features = validation_set[feature]
        else:
            validation_deep_features = get_deep_features(
                validation_set[feature], verbose=verbose)

        validation_data = _tc.SFrame({
            'deep features': validation_deep_features,
            'labels': validation_encoded_target
        })
        validation_data = validation_data.stack(
            'deep features', new_column_name='deep features')
        validation_data = validation_data.dropna(columns=['deep features'])

        validation_batch_size = min(len(validation_data), batch_size)
        validation_data = mx.io.NDArrayIter(
            validation_data['deep features'].to_numpy(),
            label=validation_data['labels'].to_numpy(),
            batch_size=validation_batch_size)
    else:
        # An empty list is falsy and iterates zero times, so the validation
        # steps below are skipped without extra branching.
        validation_data = []

    if verbose:
        print("\nTraining a custom neural network -")

    training_batch_size = min(len(train_data), batch_size)
    train_data = mx.io.NDArrayIter(train_data['deep features'].to_numpy(),
                                   label=train_data['labels'].to_numpy(),
                                   batch_size=training_batch_size,
                                   shuffle=True)

    custom_NN = SoundClassifier._build_custom_neural_network(
        feature_extractor.output_length, num_labels, custom_layer_sizes)
    ctx = _mxnet_utils.get_mxnet_context()
    custom_NN.initialize(mx.init.Xavier(), ctx=ctx)

    trainer = mx.gluon.Trainer(custom_NN.collect_params(), 'nag', {
        'learning_rate': 0.01,
        'momentum': 0.9
    })

    if verbose:
        # Setup progress table
        row_ids = ['iteration', 'train_accuracy', 'time']
        row_display_names = ['Iteration', 'Training Accuracy', 'Elapsed Time']
        if validation_data:
            row_ids.insert(2, 'validation_accuracy')
            row_display_names.insert(2, 'Validation Accuracy (%)')
        table_printer = _tc.util._ProgressTablePrinter(row_ids,
                                                       row_display_names)

    train_metric = mx.metric.Accuracy()
    if validation_data:
        validation_metric = mx.metric.Accuracy()
    softmax_cross_entropy_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()

    def _split_batch(batch):
        # Distribute one batch's data and labels across the contexts.
        data = mx.gluon.utils.split_and_load(batch.data[0], ctx_list=ctx,
                                             batch_axis=0, even_split=False)
        label = mx.gluon.utils.split_and_load(batch.label[0], ctx_list=ctx,
                                              batch_axis=0, even_split=False)
        return data, label

    # Defined up front so the state below is well-formed even when the loop
    # body never runs (max_iterations == 0).
    train_accuracy = None
    validation_accuracy = None

    for i in range(max_iterations):
        # TODO: early stopping

        for batch in train_data:
            data, label = _split_batch(batch)

            # Inside training scope
            with mx.autograd.record():
                for x, y in zip(data, label):
                    z = custom_NN(x)
                    # Computes softmax cross entropy loss.
                    loss = softmax_cross_entropy_loss(z, y)
                    # Backpropagate the error for one iteration.
                    loss.backward()
            # Make one step of parameter update. Trainer needs to know the
            # batch size of data to normalize the gradient by 1/batch_size.
            trainer.step(batch.data[0].shape[0])
        train_data.reset()

        # Calculate training metric
        for batch in train_data:
            data, label = _split_batch(batch)
            outputs = [custom_NN(x) for x in data]
            train_metric.update(label, outputs)
        train_data.reset()

        # Calculate validation metric
        for batch in validation_data:
            data, label = _split_batch(batch)
            outputs = [custom_NN(x) for x in data]
            validation_metric.update(label, outputs)

        # Get metrics, print progress table
        _, train_accuracy = train_metric.get()
        train_metric.reset()
        printed_row_values = {
            'iteration': i + 1,
            'train_accuracy': train_accuracy
        }
        if validation_data:
            _, validation_accuracy = validation_metric.get()
            printed_row_values['validation_accuracy'] = validation_accuracy
            validation_metric.reset()
            validation_data.reset()
        if verbose:
            printed_row_values['time'] = time.time() - start_time
            table_printer.print_row(**printed_row_values)

    state = {
        '_class_label_to_id': class_label_to_id,
        '_custom_classifier': custom_NN,
        '_feature_extractor': feature_extractor,
        '_id_to_class_label': {v: k for k, v in class_label_to_id.items()},
        'classes': classes,
        # Store a copy so the model never aliases the shared default list
        # (or a list the caller may later mutate).
        'custom_layer_sizes': list(custom_layer_sizes),
        'feature': feature,
        'feature_extractor_name': feature_extractor.name,
        'num_classes': num_labels,
        'num_examples': len(dataset),
        'target': target,
        'training_accuracy': train_accuracy,
        'training_time': time.time() - start_time,
        'validation_accuracy': validation_accuracy,
    }
    return SoundClassifier(state)
def test_categorical_2(self):
    """Run the shared check on categorical columns with brackets in their names."""
    # Two string-valued columns of 500 rows each; the bracketed names
    # ('cat[1]', 'cat[2]') are the distinguishing part of this case.
    first_column = ['1', '1', '2', '2', '2'] * 100
    second_column = ['1', '3', '3', '1', '1'] * 100
    sf = tc.SFrame({'cat[1]': first_column, 'cat[2]': second_column})

    # The meaning of the second argument is defined by `_run_test`.
    self._run_test(sf, 4)
@classmethod
def setUpClass(cls):
    """Create the fixture shared by every test in the class.

    unittest invokes ``setUpClass`` on the class itself, so it must be a
    classmethod. Stores ``n`` (the row count) and ``sf``, an SFrame built
    from an ``n`` x 2 array of uniform random numbers, as class attributes.
    """
    # Fixed seed keeps the random fixture reproducible across runs.
    np.random.seed(37)
    cls.n = 30
    cls.sf = tc.SFrame(np.random.rand(cls.n, 2))
def get_confusion_matrix(extended_test, labels):
    """Build a confusion matrix with counts and summed/normalized probabilities.

    Parameters
    ----------
    extended_test : SFrame
        Must contain the columns 'label' (true label), 'predicted_label'
        and 'probs' (a per-row sequence indexable by the position of each
        class in ``labels``).
    labels : list
        All class labels; every (target, predicted) pair of them gets a row
        in the result, even when it never occurs in ``extended_test``.

    Returns
    -------
    out : SFrame
        One row per (target_label, predicted_label) pair with the columns
        'count' (number of matching rows), 'prob' (sum of the probabilities
        assigned to ``predicted_label`` over rows whose true label is
        ``target_label``) and 'norm_prob' ('prob' divided by the total
        'prob' of the target label, or 0 when that total is 0).
    """
    # Seed the matrix with every (target, predicted) combination so that
    # pairs missing from the data still appear with zero values.
    label_pairs = [(target_l, predicted_l)
                   for target_l in labels
                   for predicted_l in labels]
    sf_confusion_matrix = _tc.SFrame({
        'label': [target_l for target_l, _ in label_pairs],
        'predicted_label': [predicted_l for _, predicted_l in label_pairs],
        'prob_default': [0] * len(label_pairs),
    })

    # Attach observed counts; unobserved pairs get a count of 0.
    sf_confusion_matrix = sf_confusion_matrix.join(
        extended_test.groupby(['label', 'predicted_label'],
                              {'count': _tc.aggregate.COUNT}),
        how='left',
        on=['label', 'predicted_label'])
    sf_confusion_matrix = sf_confusion_matrix.fillna('count', 0)

    # Reshape the per-row probability vectors into long form: one row per
    # (example, class) holding the probability assigned to that class.
    label_column = _tc.SFrame({'label': extended_test['label']})
    predictions = extended_test['probs']
    for i, current_label in enumerate(labels):
        # Bind the loop values as defaults so each lambda keeps its own
        # index/label regardless of when it is evaluated.
        new_test_data = label_column.add_columns([
            predictions.apply(lambda probs, i=i: probs[i]),
            predictions.apply(
                lambda probs, current_label=current_label: current_label)
        ], ['prob', 'predicted_label'])
        if i == 0:
            test_longer_form = new_test_data
        else:
            test_longer_form = test_longer_form.append(new_test_data)

    # Compare by value: `is 0` tests object identity and only happens to
    # work for small ints in CPython.
    if len(extended_test) == 0:
        sf_confusion_matrix = sf_confusion_matrix.rename({
            'prob_default': 'prob',
            'label': 'target_label'
        })
    else:
        sf_confusion_matrix = sf_confusion_matrix.join(
            test_longer_form.groupby(
                ['label', 'predicted_label'],
                {'prob': _tc.aggregate.SUM('prob')}),
            how='left',
            on=['label', 'predicted_label'])
        sf_confusion_matrix = sf_confusion_matrix.rename({
            'label': 'target_label'
        }).fillna('prob', 0)

    def wo_divide_by_zero(a, b):
        # Yield a missing value instead of raising when the total is 0.
        if b == 0:
            return None
        else:
            return a * 1.0 / b

    # Normalize each pair's probability by its target label's total.
    # NOTE(review): this assigns a column computed from a joined frame back
    # onto the original, which assumes the join keeps row order - confirm.
    sf_confusion_matrix['norm_prob'] = sf_confusion_matrix.join(
        sf_confusion_matrix.groupby(
            'target_label', {'sum_prob': _tc.aggregate.SUM('prob')}),
        how='left').apply(
            lambda x: wo_divide_by_zero(x['prob'], x['sum_prob']))
    return sf_confusion_matrix.fillna('norm_prob', 0)
def test_categorical(self):
    """
    Check the tree built from two categorical features: node count, the
    root node's dictionary form, and the prediction path of nodes 0-6.

    Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and was
    removed from ``unittest`` in Python 3.12.
    """
    def step(node_id, child_id, feature, sign):
        # One expected hop of a prediction path. Every split in this tree
        # is an indicator split on category '1' with value 1.
        return {
            'child_id': child_id,
            'feature': feature,
            'index': '1',
            'node_id': node_id,
            'sign': sign,
            'value': 1,
            'node_type': 'indicator',
            'is_missing': False,
        }

    # Arrange
    sf = tc.SFrame({
        'cat1': ['1', '1', '2', '2', '2'] * 100,
        'cat2': ['1', '3', '3', '1', '1'] * 100,
        'target': ['1', '2', '1', '2', '1'] * 100,
    })

    # Act
    tree = _make_tree(sf)
    root = tree.root

    # Check the root node.
    self.assertEqual(len(tree.nodes), 7)
    self.assertEqual(
        root.to_dict(), {
            'is_leaf': False,
            'left_id': 2,
            'node_id': 0,
            'missing_id': 1,
            'node_type': 'indicator',
            'parent_id': None,
            'right_id': 1,
            'split_feature_column': 'cat1',
            'split_feature_index': '1',
            'value': 1,
        })

    # Check prediction paths, keyed by the node id the path leads to.
    expected_paths = {
        0: [],
        1: [step(0, 1, 'cat1', '!=')],
        2: [step(0, 2, 'cat1', '=')],
        3: [step(0, 1, 'cat1', '!='), step(1, 3, 'cat2', '!=')],
        4: [step(0, 1, 'cat1', '!='), step(1, 4, 'cat2', '=')],
        5: [step(0, 2, 'cat1', '='), step(2, 5, 'cat2', '!=')],
        6: [step(0, 2, 'cat1', '='), step(2, 6, 'cat2', '=')],
    }
    for node_id in sorted(expected_paths):
        self.assertEqual(tree.get_prediction_path(node_id),
                         expected_paths[node_id])
class MyfirstappConfig(AppConfig): name = 'myFirstApp' #loading ML model test_keys_model = tc.load_model('/home/supriy/myENV/venv/DjangoProject/src/myFirstApp/models/keys_model') #loading movie data movie_sframe = tc.SFrame('/home/supriy/myENV/venv/DjangoProject/src/myFirstApp/models/final_django_sframe')
def create(
        dataset,
        target,
        feature=None,
        model='resnet-50',
        l2_penalty=0.01,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        class_weights=None,
        validation_set='auto',
        verbose=True,
        seed=None,
        batch_size=64):
    """
    Create a :class:`ImageClassifier` model.

    Parameters
    ----------
    dataset : SFrame
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    target : string, or int
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in the order in which they are
        provided. For example, a target variable with 'cat' and 'dog' as
        possible values is mapped to 0 and 1 respectively with 0 being the
        base class and 1 being the reference class. Use `model.classes` to
        retrieve the order in which the classes are mapped.

    feature : string, optional
        Name of the column containing the input images. 'None' (the default)
        indicates the only image column in `dataset` should be used as the
        feature.

    l2_penalty : float, optional
        Weight on l2 regularization of the model. The larger this weight, the
        more the model coefficients shrink toward 0. This introduces bias into
        the model but decreases variance, potentially leading to better
        predictions. The default value is 0.01; setting this parameter to 0
        corresponds to unregularized logistic regression. See the ridge
        regression reference for more detail.

    l1_penalty : float, optional
        Weight on l1 regularization of the model. Like the l2 penalty, the
        higher the l1 penalty, the more the estimated coefficients shrink
        toward 0. The l1 penalty, however, completely zeros out sufficiently
        small coefficients, automatically indicating features that are not
        useful for the model. The default weight of 0 prevents any features
        from being discarded. See the LASSO regression reference for more
        detail.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. Available solvers are:

        - *auto (default)*: automatically chooses the best solver for the data
          and model parameters.
        - *newton*: Newton-Raphson
        - *lbfgs*: limited memory BFGS
        - *fista*: accelerated gradient descent

        For this model, the Newton-Raphson method is equivalent to the
        iteratively re-weighted least squares algorithm. If the l1_penalty is
        greater than 0, use the 'fista' solver.

        The model is trained using a carefully engineered collection of
        methods that are automatically picked based on the input data. The
        ``newton`` method works best for datasets with plenty of examples and
        few features (long datasets). Limited memory BFGS (``lbfgs``) is a
        robust solver for wide datasets (i.e datasets with many coefficients).
        ``fista`` is the default solver for l1-regularized linear regression.
        The solvers are all automatically tuned and the default options should
        function well. See the solver options guide for setting additional
        parameters for each of the solvers.

        See the user guide for additional details on how the solver is chosen.
        (see `here
        <https://apple.github.io/turicreate/docs/userguide/supervised-learning/linear-regression.html>`_)

    feature_rescaling : boolean, optional
        Feature rescaling is an important pre-processing step that ensures
        that all features are on the same scale. An l2-norm rescaling is
        performed to make sure that all features are of the same norm.
        Categorical features are also rescaled by rescaling the dummy
        variables that are used to represent them. The coefficients are
        returned in original scale of the problem. This process is
        particularly useful when features vary widely in their ranges.

    convergence_threshold : float, optional
        Convergence is tested using variation in the training objective. The
        variation in the training objective is calculated using the difference
        between the objective values between two steps. Consider reducing this
        below the default value (0.01) for a more accurately trained model.
        Beware of overfitting (i.e a model that works well only on the
        training data) if this parameter is set to a very low value.

    lbfgs_memory_level : float, optional
        The L-BFGS algorithm keeps track of gradient information from the
        previous ``lbfgs_memory_level`` iterations. The storage requirement
        for each of these gradients is the ``num_coefficients`` in the
        problem. Increasing the ``lbfgs_memory_level`` can help improve the
        quality of the model trained. Setting this to more than
        ``max_iterations`` has the same effect as setting it to
        ``max_iterations``.

    model : string, optional
        Uses a pretrained model to bootstrap an image classifier:

           - "resnet-50" : Uses a pretrained resnet model.
                           Exported Core ML model will be ~90M.

           - "squeezenet_v1.1" : Uses a pretrained squeezenet model.
                                 Exported Core ML model will be ~4.7M.

           - "VisionFeaturePrint_Scene": Uses an OS internal feature extractor.
                                         Only available on iOS 12.0+,
                                         macOS 10.14+ and tvOS 12.0+.
                                         Exported Core ML model will be ~41K.

        Models are downloaded from the internet if not available locally. Once
        downloaded, the models are cached for future use.

    step_size : float, optional
        The starting step size to use for the ``fista`` solver. The default is
        set to 1.0, this is an aggressive setting. If the first iteration
        takes a considerable amount of time, reducing this parameter may speed
        up model training.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are supposed to have weight
        one. The `auto` mode sets the class weight to be inversely
        proportional to number of examples in the training data with the given
        class.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes
        over the data can result in a more accurately trained model. Consider
        increasing this (the default value is 10) if the training accuracy is
        low and the *Grad-Norm* in the display is large.

    verbose : bool, optional
        If True, prints progress updates and model details.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : ImageClassifier
        A trained :class:`ImageClassifier` model.

    Raises
    ------
    _ToolkitError
        If `dataset` is empty, or the `feature` or `target` column does not
        exist in `dataset`.
    ValueError
        If `batch_size` is smaller than 1.
    TypeError
        If `validation_set` is not an SFrame, 'auto' or None.

    Examples
    --------
    .. sourcecode:: python

        >>> model = turicreate.image_classifier.create(data, target='is_expensive')

        # Make predictions (in various forms)
        >>> predictions = model.predict(data)      # predictions
        >>> predictions = model.classify(data)     # predictions with confidence
        >>> predictions = model.predict_topk(data) # Top-k predictions (multiclass)

        # Evaluate the model with ground truth data
        >>> results = model.evaluate(data)

    See Also
    --------
    ImageClassifier
    """
    start_time = _time.time()

    # Check model parameter
    allowed_models = list(_pre_trained_models.MODELS.keys())
    if _mac_ver() >= (10, 14):
        allowed_models.append('VisionFeaturePrint_Scene')

        # Also, to make sure existing code doesn't break, replace incorrect name
        # with the correct name version
        if model == "VisionFeaturePrint_Screen":
            print(
                "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene; VisionFeaturePrint_Screen will be removed in subsequent versions."
            )
            model = "VisionFeaturePrint_Scene"

    _tkutl._check_categorical_option_type('model', model, allowed_models)

    # Check dataset parameter
    if len(dataset) == 0:
        raise _ToolkitError('Unable to train on empty dataset')
    if (feature is not None) and (feature not in dataset.column_names()):
        raise _ToolkitError("Image feature column '%s' does not exist" % feature)
    if target not in dataset.column_names():
        raise _ToolkitError("Target column '%s' does not exist" % target)

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    if not (isinstance(validation_set, _tc.SFrame)
            or validation_set == 'auto' or validation_set is None):
        raise TypeError("Unrecognized value for 'validation_set'.")

    if feature is None:
        feature = _tkutl._find_only_image_column(dataset)

    feature_extractor = _image_feature_extractor._create_feature_extractor(
        model)

    def _extract_features_frame(sframe):
        # Pair the target column with the deep features of the image column;
        # shared by the training and validation data.
        return _tc.SFrame({
            target: sframe[target],
            '__image_features__': feature_extractor.extract_features(
                sframe, feature, verbose=verbose, batch_size=batch_size),
        })

    # Extract features
    extracted_features = _extract_features_frame(dataset)
    if isinstance(validation_set, _tc.SFrame):
        extracted_features_validation = _extract_features_frame(validation_set)
    else:
        # 'auto' or None is forwarded unchanged to the logistic classifier.
        extracted_features_validation = validation_set

    # Train a classifier using the extracted features
    lr_model = _tc.logistic_classifier.create(
        extracted_features,
        features=['__image_features__'],
        target=target,
        max_iterations=max_iterations,
        validation_set=extracted_features_validation,
        seed=seed,
        verbose=verbose,
        l2_penalty=l2_penalty,
        l1_penalty=l1_penalty,
        solver=solver,
        feature_rescaling=feature_rescaling,
        convergence_threshold=convergence_threshold,
        step_size=step_size,
        lbfgs_memory_level=lbfgs_memory_level,
        class_weights=class_weights)

    # set input image shape
    if model in _pre_trained_models.MODELS:
        input_image_shape = _pre_trained_models.MODELS[model].input_image_shape
    else:  # model == VisionFeaturePrint_Scene
        input_image_shape = (3, 299, 299)

    # Save the model
    state = {
        'classifier': lr_model,
        'model': model,
        'max_iterations': max_iterations,
        'feature_extractor': feature_extractor,
        'input_image_shape': input_image_shape,
        'target': target,
        'feature': feature,
        'num_features': 1,
        'num_classes': lr_model.num_classes,
        'classes': lr_model.classes,
        'num_examples': lr_model.num_examples,
        'training_time': _time.time() - start_time,
        'training_loss': lr_model.training_loss,
    }
    return ImageClassifier(state)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 6 15:02:14 2020

@author: sandeepkompella
"""
import turicreate as tc

song_data = tc.SFrame('song_data')
print(song_data.column_names())
print(len(song_data))

# Count the distinct users: take the unique values of the user_id column
# and measure the length of the result.
user_ids = song_data['user_id']
# print(user_ids.unique())
print(len(user_ids.unique()))

train_data, test_data = song_data.random_split(0.8, seed=0)

# Baseline recommender: simple popularity model.
popularity_model = tc.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song')
# Predictions for the first two users (disabled):
# print(popularity_model.recommend(users=[user_ids[0]]))
# print(popularity_model.recommend(users=[user_ids[1]]))

# Personalized recommender: item-similarity model.
personalized_model = tc.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song')
for row in (0, 1):
    print(personalized_model.recommend(users=[user_ids[row]]))

# Songs the personalized model considers similar to the given titles.
for title in ('With Or Without You - U2',
              'Chan Chan (Live) - Buena Vista Social Club'):
    print(personalized_model.get_similar_items([title]))
# Reference the dataset path
url = "/Users/ahmedbekhit/Documents/Data/Development/TuriCreate/repo/turicreate-notebook/notebooks/data/food_images"


def _food_type(path):
    # Images whose path contains "eggs" are eggs; everything else is soup.
    if "eggs" in path:
        return "Eggs"
    return "Soup"


# Label the dataset
## Load the images found under the dataset folder.
data = turi.image_analysis.load_images(url)
## Add a "foodType" column derived from the folder each image lives in.
data["foodType"] = data["path"].apply(_food_type)
## Persist the labeled images as an SFrame for the training step below.
data.save("egg_or_soup.sframe")
## Visualize the labeled images.
data.explore()

# Load the SFrame that contains the labeled images.
dataBuffer = turi.SFrame("egg_or_soup.sframe")

# Randomly split the SFrame:
#   90% of the rows are used to train the image classifier,
#   10% of the rows are used to test it.
trainingBuffers, testingBuffers = dataBuffer.random_split(0.9)

# Train the image classifier using the SqueezeNet architecture and
# pre-trained model.
model = turi.image_classifier.create(
    trainingBuffers,
    target="foodType",
    model="squeezenet_v1.1")

# Evaluate on the held-out rows to determine the model accuracy.
evaluations = model.evaluate(testingBuffers)
def test_label_propagation(self):
    """
    Exercise ``tc.label_propagation.create`` on a copy of ``self.graph``:
    default options, weighted/undirected options, early termination via
    ``max_iterations``, and the prediction for an isolated vertex.

    Does nothing when "label_propagation" is not among the toolkit
    functions reported by ``get_unity()``.
    """
    if "label_propagation" not in get_unity().list_toolkit_functions():
        return

    graph = self.graph.copy()
    num_vertices = len(graph.vertices)
    num_classes = 2
    prob_columns = ['P%d' % c for c in range(num_classes)]

    def get_label(vid):
        # Label the first and last ~100 vertex ids; leave the rest unlabeled.
        if vid < 100:
            return 0
        if vid > num_vertices - 100:
            return 1
        return None

    graph.vertices['label'] = graph.vertices['__id'].apply(get_label, int)
    model = tc.label_propagation.create(graph, label_field='label')
    model.summary()
    self.__test_model_save_load_helper__(model)

    for row in model.graph.vertices:
        predicted = row['predicted_label']
        if predicted is None:
            # No prediction: every class must carry the same probability.
            for column in prob_columns:
                self.assertAlmostEqual(row[column], 1.0 / num_classes)
            continue
        # Predicted class must have the largest probability and the
        # probabilities must sum to one.
        total = 0.0
        for column in prob_columns:
            total += row[column]
            self.assertGreaterEqual(row['P%d' % predicted], row[column])
        self.assertAlmostEqual(total, 1.0)

    # Add more options: weighted edges, change self weight, and undirected edges
    def get_edge_weight(vid):
        return float(vid) * 10 / num_vertices

    graph.edges['weight'] = graph.edges['__src_id'].apply(
        get_edge_weight, float)
    model = tc.label_propagation.create(
        graph,
        label_field='label',
        threshold=1e-2,
        weight_field='weight',
        self_weight=0.5,
        undirected=True)

    # Test early termination using max_iteration
    max_iter = 3
    model = tc.label_propagation.create(
        graph,
        label_field='label',
        threshold=1e-10,
        max_iterations=max_iter)
    self.assertEqual(model.num_iterations, max_iter)

    # Test that the predict class should be None if all class probabilities are equal
    graph = graph.add_vertices(tc.SFrame({'__id': [-1]}))
    model = tc.label_propagation.create(
        graph,
        label_field='label',
        threshold=1e-10,
        max_iterations=max_iter)
    vertices = model.graph.vertices
    isolated = vertices[vertices['__id'] == -1]
    self.assertEqual(isolated['predicted_label'][0], None)
# selected_data = removeFront(selected_data)
# user_interface(SELECTED_DATA)
# return selected_data

# ============================ Main execution section ============================
# Run the UI
var_list = []

# Create the "selected data" table, seeded with one placeholder row.
_placeholder_row = {
    'code': ['000000'],
    'name': ['数据不存在'],
    'bankuai': ['二次元'],
    'close': [0.0],
    'percent_chg': [0.0],
    'change': [0.0],
    'volume': [0.0],
    'turn_volume': [0.0],
    'amplitude': [0.0],
    'volume_rate': [0.0],
    'turnover_rate': [0.0],
    'news_url': ['http://www.bilibili.com'],
    'income_increase': [0.0],
    'profit_increase': [0.0],
}
SELECTED_DATA = tc.SFrame(_placeholder_row)

# UI
user_interface(SELECTED_DATA)
def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`,

    This is generic function that allows you to create any model that
    implements SupervisedLearningModel This function is normally not called,
    call specific model's create function instead

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model

    features : list[string], optional
        List of feature names used by feature column. By default every
        column of `dataset` other than `target` is used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed: env
        The distributed environment. Not read by this function body.

    verbose : boolean
        whether print out messages during training

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before being handed to the model.

    Returns
    -------
    out : SupervisedLearningModel
        Wrapper around the trained model.

    Raises
    ------
    TypeError
        If `features` is not an iterable of ``str`` names, or
        `validation_set` is neither 'auto', None nor an SFrame.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Determine columns to keep
    if features is None:
        features = [feat for feat in dataset.column_names() if feat != target]
    # A bare string is iterable too, but is never a valid list of names.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    # Normalize so that `features + [target]` below works for any iterable.
    features = list(features)
    invalid_features = [x for x in features if not isinstance(x, str)]
    if invalid_features:
        # Report the first offending name. The comprehension variable is not
        # visible outside the comprehension on Python 3, so it must be
        # captured explicitly; the 1-tuple keeps tuple-valued names printable.
        raise TypeError(
            "Invalid feature %s: Feature names must be of type str"
            % (invalid_features[0],))

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95, seed=seed)
            else:
                # Too few rows to hold any out.
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    if validation_set is None:
        validation_set = _turicreate.SFrame()
    else:
        if not isinstance(validation_set, _turicreate.SFrame):
            raise TypeError("validation_set must be either 'auto' or an SFrame "
                            "matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        # Reduce validation set to requested columns
        validation_set = _toolkits_select_columns(
            validation_set, features + [target])

    # Reduce training set to requested columns
    dataset = _toolkits_select_columns(dataset, features + [target])

    # Sanitize model-specific options
    options = {k.lower(): kwargs[k] for k in kwargs}

    # Create a model instance and train it
    model = _turicreate.extensions.__dict__[model_name]()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
def parseSFrame(fileName):
    """
    Load the SFrame stored in the directory ``./SelectedData/<fileName>/``.

    NOTE(review): as visible here the loaded SFrame is only bound to a local
    name and the function returns None -- confirm whether a ``global``
    declaration or a ``return`` is missing.
    """
    SELECTED_DATA = tc.SFrame(data='./SelectedData/' + fileName + '/')
def test_create_with_missing_value(self):
    """
    Appending a row whose image is missing (None) to the training data must
    make ``one_shot_object_detector.create`` raise ``_ToolkitError``.
    """
    # One extra row: no image, target copied from the first training row.
    row_without_image = tc.SFrame({
        self.feature: tc.SArray([None], dtype=tc.Image),
        self.target: [self.train[self.target][0]],
    })
    training_data = self.train.append(row_without_image)

    with self.assertRaises(_ToolkitError):
        tc.one_shot_object_detector.create(training_data, target=self.target)
# ### From the above summary, we select the Cosine similarity on scaled number of seconds approach as our final model, because this combination gives the best results (the desirable outcome has low RMSE and precision-recall close to 1).

# ## 9. Final Output

# In[32]:

# data_norm

# In[31]:

# The final model was chosen using the train data and evaluated on the test
# set, so it is now re-fitted on the whole dataset.
final_model = tc.item_similarity_recommender.create(
    tc.SFrame(data_norm),
    user_id=user_id,
    item_id=item_id,
    target='Scaled_SecNumF',
    similarity_type='cosine',
)
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

# #### 9.1. CSV output file

# In[32]:

# Convert the recommendations to a DataFrame and report its dimensions.
df_rec = recom.to_dataframe()
print(df_rec.shape)
# Turi is a platform for Machine Learning
# turicreate is Apple's version of Turi
# https://apple.github.io/turicreate/docs/api/index.html#
import turicreate as turi

# Path of the CSV data set
url = "hypothyroiddataset/hypothyroid.data.csv"

# Load the CSV into an SFrame
data = turi.SFrame(url)

# Interactive table view of the data
data.explore()

# Plots of the data
data.show()

# Split the data into 80% training, 20% evaluation
trainingBuffers, testingBuffers = data.random_split(0.80)

# Create the model. The generic classifier was tried first:
#model = turi.classifier.create(trainingBuffers,
#                               target='diagnosis')
model = turi.random_forest_classifier.create(trainingBuffers,
                                             target='diagnosis')

# Evaluate the model on the held-out rows
evaluations = model.evaluate(testingBuffers)
# print() with a single argument behaves the same on Python 2 and 3; the
# former bare `print x` statement is a syntax error on Python 3.
print(evaluations["accuracy"])

#save & export
def setUp(self):
    """
    Build the small recommender fixtures used by the tests and store them
    on ``self``: observation data, user/item side data, "new" versions of
    each, an exclusion set, id arrays and the expected internal indices.
    """
    def build(*columns):
        # Assemble an SFrame by adding the (name, values) pairs one column
        # at a time, in the order given.
        frame = tc.SFrame()
        for name, values in columns:
            frame[name] = values
        return frame

    self.data = build(
        ("user_id", ["a", "b", "b", "c", "c", "c"]),
        ("item_id", ["x", "x", "y", "v", "w", "z"]),
        ("rating", [0, 1, 2, 3, 4, 5]),
    )

    # Make internal indices so that we can check predictions/ranking.
    # IDs are in the order they are seen in the above data SFrame.
    self.user_index = {"a": 0, "b": 1, "c": 2}
    self.item_index = {"x": 0, "y": 1, "v": 2, "w": 3, "z": 4}

    self.user_data = build(
        ("user_id", ["a", "b"]),
        ("user_feature_value", [0.5, 0.9]),
        ("user_dict_value", [{1: 0.5}, {4: 0.9}]),
        ("user_vect_value", [[0, 1, 2], [2, 3, 4]]),
        ("user_str_dict_value", [{"tt": 0.5}, {"ttt": 0.9}]),
    )

    self.item_data = build(
        ("item_id", ["x", "v", "w", "y"]),
        ("item_feature_value", [-0.3, 0.7, 0.3, 0.05]),
        ("item_dict_value", [{1: 0.5}, {4: 0.9}, {4: 0.9}, {5: 1, 6: 2}]),
        ("item_vect_value", [[0, 1, 2], [2, 3, 4], [2, 3, 4], [2, 3, 5]]),
        ("item_str_dict_value", [
            {"tt": 0.5},
            {"tt": 0.9},
            {"t": 0.9},
            {"ttt": 0.9},
        ]),
    )

    self.new_data = build(
        ("user_id", ["a", "b"]),
        ("item_id", ["v", "z"]),
        ("rating", [7, 8]),
    )

    self.new_user_data = build(
        ("user_id", ["a", "c"]),
        ("user_feature_value", [0.0, 2.9]),
        ("user_dict_value", [{1: 0.5}, {4: 0.9}]),
        ("user_vect_value", [[0, 1, 2], [2, 3, 4]]),
        ("user_str_dict_value", [{"tt": 0.5}, {"ttt": 0.9}]),
    )

    self.new_item_data = build(
        ("item_id", ["y", "z"]),
        ("item_feature_value", [0.5, 0.6]),
        ("item_dict_value", [{1: 0.5}, {4: 0.9}]),
        ("item_vect_value", [[0, 1, 2], [2, 3, 4]]),
        ("item_str_dict_value", [{"tt": 0.5}, {"ttt": 0.9}]),
    )

    self.exclude = build(
        ("user_id", ["a"]),
        ("item_id", ["x"]),
    )

    self.users_all = tc.SArray(["a", "b", "c"])
    self.items_all = tc.SArray(["v", "w", "x", "y", "z"])
    self.items_some = tc.SArray(["v", "w"])
import turicreate as tc
import coremltools

# Locations of the input data and of the exported models.
SFRAME_PATH = '/storage/xy_signs.sframe'
MLMODEL_PATH = '/storage/xy_signs.mlmodel'
MLMODEL_FP16_PATH = '/storage/xy_signs_16bit.mlmodel'

# Configure the number of GPUs (0 here)
print("set_num_gpus")
tc.config.set_num_gpus(0)

# Load the SFrame
print("Load SFrame")
data = tc.SFrame(SFRAME_PATH)

# Make a train-test split
print("split")
train_data, test_data = data.random_split(0.8)

# Create and train the model, evaluate it and export it to Core ML
print("create and train")
model = tc.object_detector.create(train_data)
model.evaluate(test_data)
model.export_coreml(MLMODEL_PATH)

# Reduce model size: reload the exported spec, convert the network weights
# to fp16 and save the result next to the original.
model_spec = coremltools.utils.load_spec(MLMODEL_PATH)
model_fp16_spec = coremltools.utils.convert_neural_network_spec_weights_to_fp16(
    model_spec)
coremltools.utils.save_spec(model_fp16_spec, MLMODEL_FP16_PATH)
def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset to use for evaluation, must include a column with the same
        name as the features used for model training. Additional columns
        are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'log_loss'         : Log loss
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    verbose : bool, optional
        If True, prints progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    classify, predict

    Examples
    --------
    .. sourcecode:: python

      >>> results = model.evaluate(data)
      >>> print(results['accuracy'])
    """
    from turicreate.toolkits import evaluation

    # parameter checking
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError('\'dataset\' parameter must be an SFrame')

    avail_metrics = [
        'accuracy', 'auc', 'precision', 'recall', 'f1_score', 'log_loss',
        'confusion_matrix', 'roc_curve'
    ]
    _tk_utils._check_categorical_option_type('metric', metric,
                                             avail_metrics + ['auto'])
    metrics = avail_metrics if metric == 'auto' else [metric]

    # Accept either precomputed deep features or raw input to extract from.
    feature_column = dataset[self.feature]
    if _is_deep_feature_sarray(feature_column):
        deep_features = feature_column
    else:
        deep_features = get_deep_features(dataset[self.feature],
                                          verbose=verbose)

    data = _tc.SFrame({'deep features': deep_features}).add_row_number()

    # Rows whose deep-feature entry is an empty list cannot be scored.
    missing_ids = data.filter_by([[]], 'deep features')['id']
    if len(missing_ids) > 0:
        data = data.filter_by([[]], 'deep features', exclude=True)
        # Remove the labels for entries without deep features
        _logging.warning(
            "Dropping %d examples which are less than 975ms in length."
            % len(missing_ids))
        numbered_labels = dataset[[self.target]].add_row_number()
        labels = data.join(numbered_labels, how='left')[self.target]
    else:
        labels = dataset[self.target]
    assert (len(labels) == len(data))

    # Metrics computed from probability vectors vs. from predicted classes.
    from_probabilities = {
        'auc': evaluation.auc,
        'log_loss': evaluation.log_loss,
        'roc_curve': evaluation.roc_curve,
    }
    from_classes = {
        'accuracy': evaluation.accuracy,
        'precision': evaluation.precision,
        'recall': evaluation.recall,
        'f1_score': evaluation.f1_score,
        'confusion_matrix': evaluation.confusion_matrix,
    }

    # Only run the predictions that the requested metrics need.
    if not set(from_probabilities).isdisjoint(metrics):
        probs = self.predict(data['deep features'],
                             output_type='probability_vector',
                             verbose=verbose,
                             batch_size=batch_size)
    if not set(from_classes).isdisjoint(metrics):
        classes = self.predict(data['deep features'],
                               output_type='class',
                               verbose=verbose,
                               batch_size=batch_size)

    ret = {}
    for name in avail_metrics:
        if name not in metrics:
            continue
        if name in from_probabilities:
            ret[name] = from_probabilities[name](
                labels, probs, index_map=self._class_label_to_id)
        else:
            ret[name] = from_classes[name](labels, classes)
    return ret
import turicreate as tc

# Column that identifies the recording session of each row.
session_column = 'exp_id'

# Load sessions from preprocessed data
data = tc.SFrame('hapt_data.sframe')

# Train/test split by recording sessions
train, test = tc.activity_classifier.util.random_split_by_session(
    data,
    session_id=session_column,
    fraction=0.8)

# Create an activity classifier
model = tc.activity_classifier.create(
    train,
    session_id=session_column,
    target='activity',
    prediction_window=50)

# Evaluate the model and save the results into a dictionary
metrics = model.evaluate(test)
print(metrics['accuracy'])

# Save the model for later use in Turi Create
model.save('TuriActivityClassify.model')

# Export for use in Core ML
model.export_coreml('CoreMLActivityClassify.mlmodel')
def predict(self, dataset, output_type='class', verbose=True, batch_size=64):
    """
    Return predictions for ``dataset``. Predictions can be generated
    as class labels or probabilities.

    Parameters
    ----------
    dataset : SFrame | SArray | dict
        The audio data to be classified.
        If dataset is an SFrame, it must have a column with the same name as
        the feature used for model training, but does not require a target
        column. Additional columns are ignored.
        An SArray may hold either audio data or already extracted deep
        features; deep features skip the feature extraction step.

    output_type : {'probability', 'class', 'probability_vector'}, optional
        Form of the predictions which are one of:

        - 'class': Class prediction. For multi-class classification, this
          returns the class with maximum probability.
        - 'probability': Prediction probability associated with the True
          class (not applicable for multi-class classification)
        - 'probability_vector': Prediction probability associated with each
          class as a vector. Label ordering is dictated by the ``classes``
          member variable.

    verbose : bool, optional
        If True, prints progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : SArray
        An SArray with the predictions, ordered by input row. Rows for which
        no deep-feature frame is available (the warning below reports them as
        shorter than 975ms) hold ``None``.

    Raises
    ------
    TypeError
        If ``dataset`` is not an SFrame, SArray or dict.
    ValueError
        If ``dataset`` is not audio data / deep features, if ``output_type``
        is not one of the supported values, or if ``batch_size`` < 1.
    ToolkitError
        If ``output_type`` is 'probability' and the model is not binary.

    See Also
    --------
    evaluate, classify

    Examples
    --------
    >>> probability_predictions = model.predict(data, output_type='probability')
    >>> prediction_vector = model.predict(data, output_type='probability_vector')
    >>> class_predictions = model.predict(data, output_type='class')

    """
    from .._mxnet import _mxnet_utils
    import mxnet as mx

    if not isinstance(dataset, (_tc.SFrame, _tc.SArray, dict)):
        raise TypeError(
            "'dataset' parameter must be either an SFrame, SArray or dictionary"
        )

    # Normalize every accepted input form to a single SArray.
    if isinstance(dataset, dict):
        if set(dataset.keys()) != {'sample_rate', 'data'}:
            raise ValueError(
                "'dataset' parameter is a dictionary but does not appear to be audio data."
            )
        dataset = _tc.SArray([dataset])
    elif isinstance(dataset, _tc.SFrame):
        dataset = dataset[self.feature]

    if not _is_deep_feature_sarray(dataset) and not _is_audio_data_sarray(dataset):
        raise ValueError(
            "'dataset' must be either audio data or audio deep features."
        )

    if output_type not in ('probability', 'probability_vector', 'class'):
        # Previously this branch reported a problem with 'dataset', which
        # pointed callers at the wrong argument.
        raise ValueError(
            "'output_type' parameter must be one of 'probability', "
            "'probability_vector' or 'class'"
        )
    if output_type == 'probability' and self.num_classes != 2:
        raise _ToolkitError(
            'Output type \'probability\' is only supported for binary'
            ' classification. For multi-class classification, use'
            ' predict_topk() instead.')
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    if _is_deep_feature_sarray(dataset):
        deep_features = dataset
    else:
        deep_features = get_deep_features(dataset, verbose=verbose)

    # One row per (example id, frame); examples without any frame end up in
    # ``missing_ids`` and are re-attached as None at the end.
    deep_features = _tc.SFrame({'deep features': deep_features})
    deep_features = deep_features.add_row_number()
    deep_features = deep_features.stack('deep features',
                                        new_column_name='deep features')
    deep_features, missing_ids = deep_features.dropna_split(
        columns=['deep features'])

    if len(missing_ids) > 0:
        _logging.warning(
            "Unable to make predictions for %d examples because they are less than 975ms in length."
            % len(missing_ids))

    # NOTE(review): if every example is too short, this clamps batch_size to
    # 0 - confirm how NDArrayIter behaves in that case.
    if batch_size > len(deep_features):
        batch_size = len(deep_features)

    y = []
    for batch in mx.io.NDArrayIter(
            deep_features['deep features'].to_numpy(),
            batch_size=batch_size):
        batch_data = batch.data[0]
        if batch.pad != 0:
            batch_data = batch_data[:-batch.pad]  # prevent batches looping back

        # Trim the context list against the rows that remain *after* padding
        # is removed; trimming against the padded length could hand
        # split_and_load more contexts than there are rows in the last batch.
        ctx = _mxnet_utils.get_mxnet_context()
        if len(batch_data) < len(ctx):
            ctx = ctx[:len(batch_data)]

        batch_data = mx.gluon.utils.split_and_load(batch_data,
                                                   ctx_list=ctx,
                                                   batch_axis=0,
                                                   even_split=False)

        for x in batch_data:
            forward_output = self._custom_classifier.forward(x)
            y += mx.nd.softmax(forward_output).asnumpy().tolist()
    assert len(y) == len(deep_features)

    # Combine predictions from multiple frames
    sf = _tc.SFrame({'predictions': y, 'id': deep_features['id']})
    probabilities_sum = sf.groupby(
        'id', {'prob_sum': _tc.aggregate.SUM('predictions')})

    if output_type == 'class':
        # argmax of the summed vector equals argmax of the mean vector.
        predicted_ids = probabilities_sum['prob_sum'].apply(
            lambda x: _np.argmax(x))
        mappings = self._id_to_class_label
        probabilities_sum['results'] = predicted_ids.apply(
            lambda x: mappings[x])
    else:
        assert output_type in ('probability', 'probability_vector')
        # NOTE(review): both output types yield the full averaged vector
        # here; the docstring describes 'probability' as the True-class
        # probability - confirm which contract callers rely on.
        frame_per_example_count = sf.groupby('id', _tc.aggregate.COUNT())
        probabilities_sum = probabilities_sum.join(frame_per_example_count)
        probabilities_sum['results'] = probabilities_sum.apply(
            lambda row: [i / row['Count'] for i in row['prob_sum']])

    if len(missing_ids) > 0:
        # Pad the skipped examples with None, typed like the real results so
        # the two frames can be appended.
        result_dtype = probabilities_sum['results'].dtype
        missing_predictions = _tc.SFrame({
            'id': missing_ids['id'],
            'results': _tc.SArray([None] * len(missing_ids),
                                  dtype=result_dtype)
        })
        probabilities_sum = probabilities_sum[['id', 'results']].append(
            missing_predictions)

    # Restore the input row order before returning.
    probabilities_sum = probabilities_sum.sort('id')
    return probabilities_sum['results']
def extract_features(self, dataset, feature, batch_size=512, verbose=False):
    """
    Run the MXNet feature extractor over one image column of ``dataset``.

    Parameters
    ----------
    dataset: SFrame
        SFrame of images

    feature: str
        Name of the column in ``dataset`` that is resized and fed to the
        model.

    batch_size: int, optional
        Upper bound on the number of images per forward pass; clamped to
        ``len(dataset)``.

    verbose: bool, optional
        If True, print progress messages.

    Returns
    -------
    out : SArray
        One ``array.array`` of features per input row. An empty SArray is
        returned for an empty ``dataset``.

    Raises
    ------
    RuntimeError
        If the iterator's per-image shape differs from ``self.image_shape``.
    """
    from ..mx import SFrameImageIter as _SFrameImageIter
    import turicreate as _tc
    import array

    total = len(dataset)
    if total == 0:
        return _tc.SArray([], array.array)

    # Resize images if needed. The resize arguments are image_shape in
    # reverse order.
    if verbose:
        print("Resizing images...")
    resize_args = tuple(reversed(self.image_shape))
    resized = _tc.SFrame()
    resized[feature] = _tc.image_analysis.resize(dataset[feature], *resize_args)

    batch_size = min(total, batch_size)

    # Data iterator over the resized column.
    image_iter = _SFrameImageIter(sframe=resized,
                                  data_field=[feature],
                                  batch_size=batch_size)

    # MXNet module bound for this batch size.
    module = MXFeatureExtractor._get_mx_module(self.ptModel.mxmodel,
                                               self.data_layer,
                                               self.feature_layer,
                                               self.context,
                                               self.image_shape,
                                               batch_size)

    builder = _tc.SArrayBuilder(dtype=array.array)
    done = 0
    width = len(str(total))
    if verbose:
        print("Performing feature extraction on resized images...")

    while image_iter.has_next:
        if image_iter.data_shape[1:] != self.image_shape:
            raise RuntimeError(
                "Expected image of size %s. Got %s instead." %
                (self.image_shape, image_iter.data_shape[1:]))

        module.forward(next(image_iter))
        features = module.get_outputs()[0].asnumpy()

        pad = image_iter.getpad()
        if pad != 0:
            # When the dataset length is not a multiple of the batch size
            # the iterator loops back around; drop those repeated rows.
            features = features[:-pad]
        builder.append_multiple(features)

        # Progress is capped at the dataset size for the final batch.
        done = min(total, done + batch_size)
        if verbose:
            print('Completed {num_processed:{width}d}/{total:{width}d}'.
                  format(num_processed=done, total=total, width=width))

    return builder.close()
def create(dataset, target, features=None, validation_set="auto", verbose=True):
    """
    Automatically create a suitable regression model based on the provided
    training data.

    To use specific options of a desired model, use the ``create`` function
    of the corresponding model.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type (int/float).

    features : list[string], optional
        Names of the columns containing features. 'None' (the default)
        indicates that all columns except the target variable should be used
        as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list
          element is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key of a dictionary is treated as a separate feature
          and the value in the dictionary corresponds to the value of the
          feature. Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate them out
        into different columns.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set. By default this
        argument is set to 'auto' and a validation set is automatically
        sampled and used for progress printing. If validation_set is set to
        None, then no additional metrics are computed. The default value is
        'auto'.

    verbose : boolean, optional
        If True, print progress information during training.
        NOTE(review): this function's body does not read ``verbose`` -
        confirm whether it is meant to be forwarded.

    Returns
    -------
    out : A trained regression model.

    See Also
    --------
    turicreate.linear_regression.LinearRegression,
    turicreate.boosted_trees_regression.BoostedTreesRegression

    Examples
    --------
    .. sourcecode:: python

      # Setup the data
      >>> import turicreate as tc
      >>> data = tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')

      # Selects the best model based on your data.
      >>> model = tc.regression.create(data, target='price',
      ...                              features=['bath', 'bedroom', 'size'])

      # Make predictions and evaluate results.
      >>> predictions = model.predict(data)
      >>> results = model.evaluate(data)

    """
    train_sf, valid_sf = _validate_data(dataset, target, features,
                                        validation_set)

    # A None validation set is passed on as an empty SFrame.
    valid_sf = _turicreate.SFrame() if valid_sf is None else valid_sf

    # No model options are supplied to the automatic selector.
    options = {}
    return _sl.wrap_model_proxy(
        _turicreate.extensions.create_automatic_regression_model(
            train_sf, target, valid_sf, options))
# In[1]:

import pandas as pd
import numpy as np
import turicreate as tc

# In[2]:

# Load the raw review table from CSV.
beer2 = pd.read_csv("/beer2.csv")

# In[7]:

# Keep only the (user, item, rating) columns, convert the result to an SFrame
# for turicreate, and drop rows with missing values.
beer2_1 = tc.SFrame(
    beer2[["userId", "beer_beerid", "review_overall"]]
).dropna()

# In[8]:

# Side information about each beer for the model: de-duplicated
# (id, style, abv) rows, converted to an SFrame.
beer_info = tc.SFrame(
    beer2[["beer_beerid", "beer_style", "beer_abv"]].drop_duplicates()
)

# In[9]:

# Split the ratings into training and validation sets, keyed on the user and
# item columns.
training_data, validation_data = tc.recommender.util.random_split_by_user(
    beer2_1,
    "userId",
    "beer_beerid",
)

# In[10]:
def query(self, dataset, label=None, k=5, radius=None, verbose=True, batch_size=64):
    """
    For each image, retrieve the nearest neighbors from the model's stored
    data. In general, the query dataset does not need to be the same as
    the reference data stored in the model.

    Parameters
    ----------
    dataset : SFrame | SArray | turicreate.Image
        Query data.
        If dataset is an SFrame, it must contain columns with the same
        names and types as the features used to train the model.
        Additional columns are ignored. An SArray or a single Image is
        wrapped into a one-column SFrame named after the model's feature.

    label : str, optional
        Name of the query SFrame column with row labels. If 'label' is not
        specified, row numbers are used to identify query dataset rows in
        the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for
        each query observation. The default is 5 neighbors, but setting it
        to ``None`` will return all neighbors within ``radius`` of the
        query point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each query point,
        regardless of distance.

    verbose: bool, optional
        If True, print progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve performance.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the
        query observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    TypeError
        If ``dataset`` is not an SFrame, SArray or turicreate.Image.
    ValueError
        If ``batch_size`` is smaller than 1.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output
      is an SFrame with :math:`nm` rows.

    Examples
    --------
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    accepted_types = (_tc.SFrame, _tc.SArray, _tc.Image)
    if not isinstance(dataset, accepted_types):
        raise TypeError(
            'dataset must be either an SFrame, SArray or turicreate.Image')
    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    # Wrap non-SFrame inputs in a single-column SFrame keyed by the model's
    # feature name; an SFrame input passes through untouched.
    if isinstance(dataset, _tc.Image):
        dataset = _tc.SFrame({self.feature: [dataset]})
    elif isinstance(dataset, _tc.SArray):
        dataset = _tc.SFrame({self.feature: dataset})

    query_features = self._extract_features(dataset,
                                            verbose=verbose,
                                            batch_size=batch_size)

    # Carry the caller's label column over next to the extracted features so
    # the underlying model can report it.
    if label is not None:
        query_features[label] = dataset[label]

    return self.similarity_model.query(query_features, label, k, radius,
                                       verbose)
import turicreate as tc

# Workflow: configure GPUs -> load -> split -> train -> predict/evaluate -> persist.

# presumably -1 asks Turi Create to use every available GPU - confirm against
# the tc.config documentation.
tc.config.set_num_gpus(-1)

# Annotated images are stored as an SFrame on disk.
data = tc.SFrame("plate.sframe")

# Hold out part of the rows for testing (0.8 is the split fraction).
train_data, test_data = data.random_split(0.8)

# Train the object detector on the training portion.
model = tc.object_detector.create(train_data)

# Predictions for the held-out rows, kept as an SArray.
predictions = model.predict(test_data)

# evaluate() returns its results as a dictionary.
metrics = model.evaluate(test_data)

# Persist the model twice: once in Turi Create's own format for later reuse,
# and once exported for Core ML.
model.save("model_plate_turi.model")
model.export_coreml("model_plate_turi.mlmodel")