def _run_toolkit_function(fnname, arguments, args, kwargs):
    """
    Dispatches arguments to a toolkit function.

    Parameters
    ----------
    fnname : string
        The toolkit function to run.

    arguments : list[string]
        The names of all the arguments the function takes.

    args : list
        The positional arguments that were passed.

    kwargs : dictionary
        The keyword arguments that were passed.
    """
    ## Check that the number of arguments matches what the function expects.
    num_args_got = len(args) + len(kwargs)
    num_args_required = len(arguments)
    if num_args_got != num_args_required:
        raise TypeError("Expecting " + str(num_args_required) +
                        " arguments, got " + str(num_args_got))

    ## Fill the dict first with the positional args...
    argument_dict = {}
    for i in range(len(args)):
        argument_dict[arguments[i]] = args[i]

    ## ...then with the kwargs, rejecting duplicates.
    for k in kwargs.keys():
        if k in argument_dict:
            raise TypeError("Got multiple values for keyword argument '" + k + "'")
        argument_dict[k] = kwargs[k]

    argument_dict = _translate_function_arguments(argument_dict)

    ## Dispatch to the unity backend.
    with cython_context():
        ret = _gl.connect.main.get_unity().run_toolkit(fnname, argument_dict)

    ## Handle errors; ret is a (success, message, result) triple.
    if not ret[0]:
        if len(ret[1]) > 0:
            raise _ToolkitError(ret[1])
        else:
            raise _ToolkitError("Toolkit failed with unknown error")

    ret = _wrap_function_return(ret[2])
    if type(ret) == dict and 'return_value' in ret:
        return ret['return_value']
    else:
        return ret
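# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the toolkit): the argument-binding step of
# _run_toolkit_function, isolated from the unity backend so it can be
# exercised directly. `_bind_arguments` is a hypothetical helper introduced
# here only for illustration.
def _bind_arguments(arguments, args, kwargs):
    if len(args) + len(kwargs) != len(arguments):
        raise TypeError("Expecting %d arguments, got %d"
                        % (len(arguments), len(args) + len(kwargs)))
    bound = dict(zip(arguments, args))  # positional args fill the first slots
    for k, v in kwargs.items():
        if k in bound:
            raise TypeError("Got multiple values for keyword argument '%s'" % k)
        bound[k] = v
    return bound

# _bind_arguments(['x', 'y'], [1], {'y': 2})  ->  {'x': 1, 'y': 2}
#
# Note that, like _run_toolkit_function, this checks only the argument
# *count*, so an unknown keyword name can slip through when a positional slot
# is left unfilled.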
def __init__(self, data, row_label=None, feature=None, feature_model='auto',
             method='brute_force', verbose=False):
    start_time = _time.time()

    self._state = {'row_label': row_label,
                   'method': method,
                   'verbose': verbose,
                   'features': feature,
                   'num_examples': data.num_rows()}

    ## Keep only the feature column (plus the row label, if given).
    if row_label is not None:
        data_subset = data[[feature, row_label]]
    else:
        data_subset = data[[feature]]

    self._feature_type = data_subset[feature].dtype()

    ## For image features, extract deep features before indexing.
    if data_subset[feature].dtype() == _Image:
        prefix = 'extracted'
        extractor = _gl.feature_engineering.DeepFeatureExtractor(
            features=feature, output_column_prefix=prefix, model=feature_model)
        self._state['output_column_name'] = prefix + '.' + feature
        self._state['feature_model'] = extractor['model']
        self._extractor = extractor.fit(data_subset)
        self._data = self._extractor.transform(data_subset)
    else:
        raise _ToolkitError('Feature type not supported.')

    ## Build the underlying nearest neighbors index.
    if method == 'brute_force':
        self._neighbors_model = _gl.toolkits.nearest_neighbors.create(
            self._data, label=row_label,
            features=[self._state['output_column_name']],
            distance='cosine', method='brute_force', verbose=verbose)
    elif method == 'lsh':
        num_tables = 20
        num_projections_per_table = 16
        self._neighbors_model = _gl.toolkits.nearest_neighbors.create(
            self._data, label=row_label,
            features=[self._state['output_column_name']],
            distance='cosine', method='lsh',
            num_tables=num_tables,
            num_projections_per_table=num_projections_per_table,
            verbose=verbose)
    else:
        raise _ToolkitError("Unsupported method '%s'." % method)

    self._state['training_time'] = _time.time() - start_time
def __init__(self, *args, **kwargs):
    tkclass_name = getattr(self.__init__, "tkclass_name")

    _proxy = None
    if "_proxy" in kwargs:
        _proxy = kwargs['_proxy']
        del kwargs['_proxy']

    if _proxy:
        self.__dict__['_tkclass'] = _proxy
    elif tkclass_name:
        self.__dict__['_tkclass'] = \
            _gl.connect.main.get_unity().create_toolkit_class(tkclass_name)

    try:
        # fill the functions and properties
        self.__dict__['_functions'] = self._tkclass.get('list_functions')
        self.__dict__['_get_properties'] = self._tkclass.get('list_get_properties')
        self.__dict__['_set_properties'] = self._tkclass.get('list_set_properties')
        # rewrite the doc string for this class
        try:
            self.__dict__['__doc__'] = self._tkclass.get('get_docstring',
                                                         {'__symbol__': '__doc__'})
            self.__class__.__dict__['__doc__'] = self.__dict__['__doc__']
        except:
            pass
    except:
        raise _ToolkitError("Cannot create Toolkit Class for this class. "
                            "This class was not created with the new "
                            "toolkit class system.")

    # for compatibility with older classes / models
    self.__dict__['__proxy__'] = self.__dict__['_tkclass']

    if '__init__' in self.__dict__['_functions']:
        self.__run_class_function("__init__", args, kwargs)
    elif len(args) != 0 or len(kwargs) != 0:
        raise TypeError("This constructor takes no arguments")
def validate_distance_feature_types(dataset, distance, allowed_types):
    """
    Check that the features passed to each standard distance function are
    allowed for that distance.

    NOTE: this function *does not* check that each distance function is one of
    the standard types; only that the feature types are correct if a distance
    function *is* standard.

    Parameters
    ----------
    dataset : SFrame
        Input dataset.

    distance : list[list]
        Composite distance.

    allowed_types : dict(string, list[type])
        Feature types allowed for each distance function.
    """
    for d in distance:
        ftr_names, dist, weight = d

        if dist in allowed_types:
            for ftr in ftr_names:
                try:
                    ftr_type = dataset[ftr].dtype()
                except:
                    raise _ToolkitError("Feature '{}' could not be found in".format(ftr) +
                                        " the input dataset.")

                if ftr_type not in allowed_types[dist]:
                    raise TypeError("Feature '{}' is type '{}'".format(ftr, ftr_type.__name__) +
                                    " in the input dataset, which is not allowed " +
                                    "for distance function '{}'.".format(dist))
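# Usage sketch (values assumed for illustration): given an SFrame `sf` with
# columns 'city' (str) and 'x0', 'x1' (float), this composite distance passes
# validation against an allowed-types table like the one in create() below:
#
#     distance = [[['city'], 'levenshtein', 2.0],
#                 [['x0', 'x1'], 'euclidean', 1.5]]
#     allowed = {'levenshtein': [str],
#                'euclidean': [int, float, _array.array]}
#     validate_distance_feature_types(sf, distance, allowed)
#
# whereas a component [['x0'], 'levenshtein', 1.0] would raise a TypeError,
# because 'x0' is a float column and 'levenshtein' only allows str.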
def __init__(self, model, state=None):
    assert(isinstance(model, _gl.nearest_neighbors.NearestNeighborsModel))

    ## 'dot_product' and 'transformed_dot_product' distances do not yield
    ## valid similarity scores for the auto-tagger.
    for dist in ("dot_product", "transformed_dot_product"):
        if model.get("distance") == dist:
            raise _ToolkitError("%s is not a supported distance function for "
                                "the NearestNeighborAutoTagger. Use %s "
                                "instead." % (dist, "cosine"))

    self._state = state or {}
    self._nn_model = model
def _validate_num_clusters(num_clusters, initial_centers, num_rows):
    """
    Validate the combination of the `num_clusters` and `initial_centers`
    parameters in the Kmeans model create function. If the combination is
    valid, determine and return the correct number of clusters.

    Parameters
    ----------
    num_clusters : int
        Specified number of clusters.

    initial_centers : SFrame
        Specified initial cluster center locations, in SFrame form. If the
        number of rows in this SFrame does not match `num_clusters`, there is
        a problem.

    num_rows : int
        Number of rows in the input dataset.

    Returns
    -------
    _num_clusters : int
        The correct number of clusters to use going forward.
    """
    ## Basic validation
    if num_clusters is not None and not isinstance(num_clusters, int):
        raise _ToolkitError("Parameter 'num_clusters' must be an integer.")

    ## Determine the correct number of clusters.
    if initial_centers is None:
        if num_clusters is None:
            raise ValueError("Number of clusters cannot be determined from " +
                             "'num_clusters' or 'initial_centers'. You must " +
                             "specify one of these arguments.")
        else:
            _num_clusters = num_clusters

    else:
        num_centers = initial_centers.num_rows()

        if num_clusters is None:
            _num_clusters = num_centers
        else:
            if num_clusters != num_centers:
                raise ValueError("The value of 'num_clusters' does not match " +
                                 "the number of provided initial centers. " +
                                 "Please provide only one of these arguments " +
                                 "or ensure the values match.")
            else:
                _num_clusters = num_clusters

    if _num_clusters > num_rows:
        raise ValueError("The desired number of clusters exceeds the number " +
                         "of data points. Please set 'num_clusters' to be " +
                         "smaller than the number of data points.")

    return _num_clusters
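# Resolution examples (assume `centers` is an SFrame with 3 rows):
#
#     _validate_num_clusters(3, None, 100)        # -> 3
#     _validate_num_clusters(None, centers, 100)  # -> 3, inferred from centers
#     _validate_num_clusters(5, centers, 100)     # ValueError: 5 != 3
#     _validate_num_clusters(None, None, 100)     # ValueError: underdetermined
#     _validate_num_clusters(200, None, 100)      # ValueError: more clusters than rows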
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check types and
    sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if targets.size() != predictions.size():
        raise _ToolkitError(
            "Input SArrays 'targets' and 'predictions' must be of the same length.")
def __init__(self, state=None):
    ## Avoid a mutable default argument; an empty dict means "no state".
    state = state if state is not None else {}

    if 'nearest_neighbors_model' in state:
        model = state['nearest_neighbors_model']
    else:
        model = None

    assert(isinstance(model, _gl.nearest_neighbors.NearestNeighborsModel))

    ## 'dot_product' and 'transformed_dot_product' distances do not yield
    ## valid similarity scores for the auto-tagger.
    for dist in ("dot_product", "transformed_dot_product"):
        if model.get("distance") == dist:
            raise _ToolkitError("%s is not a supported distance function for "
                                "the NearestNeighborAutoTagger. Use %s "
                                "instead." % (dist, "cosine"))

    self.__proxy__ = _PythonProxy(state)
def __init__(self, tkclass_name=None, _proxy=None):
    if _proxy:
        self.__dict__['_tkclass'] = _proxy
    elif tkclass_name:
        self.__dict__['_tkclass'] = \
            _gl.connect.main.get_unity().create_toolkit_class(tkclass_name)

    try:
        # fill the functions and properties
        self.__dict__['_functions'] = self._tkclass.get('list_functions')
        self.__dict__['_get_properties'] = self._tkclass.get('list_get_properties')
        self.__dict__['_set_properties'] = self._tkclass.get('list_set_properties')
        # rewrite the doc string for this class
        try:
            self.__dict__['__doc__'] = self._tkclass.get('get_docstring',
                                                         {'__symbol__': '__doc__'})
            self.__class__.__dict__['__doc__'] = self.__dict__['__doc__']
        except:
            pass
    except:
        raise _ToolkitError("Cannot create Toolkit Class for this class. "
                            "This class was not created with the new "
                            "toolkit class system.")
def create(datasets, row_label=None, features=None, grouping_features=None,
           distance=None, k=2, radius=None, verbose=True):
    """
    Create a deduplication model based on nearest neighbors and SGraph
    connected components.

    This method creates a :class:`NearestNeighborDeduplication` model by
    constructing a nearest neighbors similarity graph on all of the rows in
    the input 'datasets', then using the connected components tool in the
    :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label
    to each record. Records which share the same label are considered to be
    duplicates.

    .. warning:: The 'dot_product' distance is deprecated and will be removed
        in future versions of GraphLab Create. Please use
        'transformed_dot_product' distance instead, although note that this is
        more than a name change; it is a *different* transformation of the dot
        product of two vectors. Please see the distances module documentation
        for more details.

    Parameters
    ----------
    datasets : SFrame or list[SFrame] or dict(string: SFrame)
        Input datasets. Each SFrame in the list must include all of the
        features specified in the `features` or 'distance' parameters, but may
        have additional columns as well. SFrames can be input as values in a
        dictionary, where the keys are strings used in the output to identify
        the SFrame from which each record originated.

    row_label : string, optional
        Name of the SFrame column with row labels. If not specified, row
        numbers are used to identify rows in the output.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates the intersection of columns over all SFrames
        in `datasets` should be used (except the label column, if specified).
        Each column can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if `distance` is specified as a composite distance, then
        that parameter controls which features are used in the model. Any
        additional columns named in 'features' will be included in the model
        output but not used for distance computations.

    grouping_features : list[string], optional
        Names of features to use in grouping records before finding
        approximate matches. These columns must have string or integer type
        data. See the Notes section for more details on grouping.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    k : int, optional
        Number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a potential duplicate.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : NearestNeighborDeduplication model
        The NearestNeighborDeduplication object contains a field 'entities'
        which shows the entity label for each input record. It also shows the
        features for each record that are used to construct the model, as well
        as the original SFrame and row label for each record. If the original
        `datasets` are passed in a list, the SFrame identifier is the index of
        the SFrame in that list.

    See Also
    --------
    NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors,
    graphlab.SFrame.groupby

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For datasets with more than about 10,000 records, *grouping* (also known
      as *blocking*) is a critical step to avoid computing distances between
      all pairs of records. The grouping step simply assigns each record to a
      group that has identical values for all `grouping_features`, and only
      looks for duplicates within each group.

    - Records with missing data in the `grouping_features` are removed from
      consideration as duplicates. These records are given the entity label
      "None".

    - For tasks that require *only* exact matches on certain features, it is
      generally more natural to use the SFrame `groupby` function.

    - For features that all have the same type, the distance parameter may be
      a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> sf1 = graphlab.SFrame({'id': [0, 1, 2],
    ...                        'x0': [0.5, 0.5, 0.3],
    ...                        'x1': [1., 0.8, 0.6],
    ...                        'city': ['seattle', 'olympia', 'boston'],
    ...                        'state': ['WA', 'WA', 'MA']})
    ...
    ... # note: misspellings in the following dataset do not prevent correct
    ... # matches.
    >>> sf2 = graphlab.SFrame({'id': [9, 10],
    ...                        'x0': [0.35, 0.4],
    ...                        'x1': [0.65, 0.8],
    ...                        'city': ['bostan', 'seatle'],
    ...                        'state': ['MA', 'WA']})
    ...
    >>> dist = [[('city',), 'levenshtein', 2],
    ...         [('x0', 'x1'), 'euclidean', 1.5]]
    ...
    >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2},
    ...                                                    row_label='id',
    ...                                                    grouping_features=['state'],
    ...                                                    distance=dist, k=None,
    ...                                                    radius=3)
    ...
    >>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model._state['verbose'] = verbose
    model._state['k'] = k
    model._state['radius'] = radius

    ### ----------------------------- ###
    ### Validation and preprocessing  ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check that the keys are all strings.
    if isinstance(datasets, dict):
        if not all([isinstance(x, str) for x in datasets.keys()]):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert a singleton SFrame dataset into a dict with one entry.
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict keyed by list index.
    if isinstance(datasets, list):
        datasets = {k: sf for k, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be a dict. If it's not, something is
    #  wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model._state['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames.
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")

    ### Validate row label
    ### ------------------
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    "exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column " +
                                    "in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model._state['row_label'] = row_label

    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the
    #   'features' parameter as well.

    ## Find the intersection of all feature sets and feature types.
    col_types = {k: v for k, v in zip(datasets.values()[0].column_names(),
                                      datasets.values()[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]

    ## Convert the features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None:
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list.")

    ## Validate the form of the composite distance and add to the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  allowed_dists.keys(),
                                                  verbose)
    model._state['distance'] = _copy.deepcopy(distance)

    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed
    model._state['features'] = fuzzy_features
    model._state['num_features'] = len(fuzzy_features)

    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and
    #  "ancillary" features, which are named by the user in the 'features'
    #  parameter but not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model._state['grouping_features'] = grouping_features
    model._state['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))

    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate multiple input datasets into a single SFrame, with a useful
    #  row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                          features=master_features,
                                          sf_index_name='__sframe')
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))

    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)

    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]

    ## Feature engineering, distance-component-wise. Also update the list of
    #  features and a map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly.
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest " +
                         "match group. For many uses, approximate matches " +
                         "within each match group are computed with brute " +
                         "force nearest neighbors, which may be slow. " +
                         "Consider using smaller groups by requiring " +
                         "different features to match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):
        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(
                block['Count'], i + 1, len(blocks)))

        ## Retrieve the records in the block and impute the mean for missing
        #  numeric values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records,
                                                       transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors.
            if verbose:
                _logging.info("Building the similarity graph....")

            m = _gl.nearest_neighbors.create(complete_records,
                                             label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k,
                          radius=radius, verbose=verbose)

            ## Construct a similarity graph to resolve the transitive closure.
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]],
                                 vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')

            ## Cut the similarity graph to establish an entity for each
            #  vertex.
            if verbose:
                _logging.info("Finding duplicate records in the similarity graph....")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers starting
            #  with the max index of the previous block's entity labels.
            block_labels = cc['component_size'].add_row_number('__entity')
            block_labels['__entity'] += max_entity_number
            max_entity_number += block_labels.num_rows()
            block_entity_labels = cc['component_id'].join(block_labels,
                                                          on='component_id',
                                                          how='left')

            ## Join the entity labels for the block back to the block's
            #  records, then append to the master output.
            records = records.join(block_entity_labels[['__id', '__entity']],
                                   on={overall_label: '__id'}, how='left')
            records = records.sort('__entity')

        else:  # no fuzzy features, so no nearest neighbors; just the block ID
            records['__entity'] = _gl.SArray.from_const(i, len(records))

        sf_entity = sf_entity.append(records[output_features])

    ### ------------------------------------- ###
    ### Postprocessing and results formatting ###
    ### ------------------------------------- ###

    ## Add rows missing from the blocking back to the master results.
    if len(block_errors) > 0:
        block_errors['__entity'] = _gl.SArray.from_const(None,
                                                         len(block_errors)).astype(int)
        sf_entity = sf_entity.append(block_errors[output_features])

    ## Rearrange the columns.
    sf_entity.swap_columns('__sframe', sf_entity.column_names()[0])
    sf_entity.swap_columns(row_label, sf_entity.column_names()[1])
    sf_entity.swap_columns('__entity', sf_entity.column_names()[2])

    ## Finalize the model state.
    model._state['training_time'] = _time.time() - start_time
    model._state['entities'] = sf_entity
    model._state['num_entities'] = max_entity_number

    return model
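# Why connected components: approximate matching is not transitive on its
# own. If record A matches B and B matches C (each within `radius`), the
# similarity graph contains edges A-B and B-C, and the connected-components
# cut assigns all three records the same '__entity' label even though A and C
# were never directly compared.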
def create(dataset, tag_name=None, features=None, verbose=True):
    """
    Create a NearestNeighborAutotagger model, which can be used to quickly
    apply tags from a reference set of text labels to a new query set using
    the ``tag`` method.

    Parameters
    ----------
    dataset : SFrame
        Reference data. This SFrame must contain at least one column. By
        default, only the ``tag_name`` column is used as the basis for
        tagging. You may optionally include additional columns with the
        ``features`` parameter.

    tag_name : string, optional
        Name of the column in ``dataset`` with the tags. This column must
        contain string values. If ``dataset`` contains more than one column,
        ``tag_name`` must be specified.

    features : list[string], optional
        Names of the columns with features to use as the basis for tagging.
        'None' (the default) indicates that only the column specified by the
        ``tag_name`` parameter should be used. Only str or list fields are
        allowed. If a column of type list is specified, all values must be
        either of type string or convertible to type string.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : model
        A model for quickly tagging new query observations with entries from
        `dataset`. Currently, the only implementation is the following:

        - NearestNeighborAutoTagger

    See Also
    --------
    NearestNeighborsModel

    Examples
    --------
    First construct a toy `SFrame` of actor names, which will serve as the
    reference set for our autotagger model.

    >>> actors_sf = gl.SFrame(
            {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper",
                       "Tom Cruise", "Jude Law", "Robert Pattinson",
                       "Matt Damon", "Brad Pitt", "Johnny Depp",
                       "Leonardo DiCaprio", "Jennifer Aniston",
                       "Jessica Alba", "Emma Stone", "Cameron Diaz",
                       "Scarlett Johansson", "Mila Kunis", "Julia Roberts",
                       "Charlize Theron", "Marion Cotillard",
                       "Angelina Jolie"]})
    >>> m = gl.data_matching.nearest_neighbor_autotagger.create(
                actors_sf, tag_name="actor")

    Then we load some IMDB movie reviews into an `SFrame` and tag them using
    the model we created above. The score field in the output is a similarity
    score, indicating the strength of the match between the query data and the
    suggested reference tag.

    >>> reviews_sf = gl.SFrame(
            "s3://dato-datasets/imdb_reviews/reviews.sframe")
    >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False)
    +-----------+-------------------------------+------------------+-----------------+
    | review_id |             review            |      actor       |      score      |
    +-----------+-------------------------------+------------------+-----------------+
    |     0     | Story of a man who has unn... |   Cameron Diaz   | 0.0769230769231 |
    |     0     | Story of a man who has unn... |  Angelina Jolie  | 0.0666666666667 |
    |     0     | Story of a man who has unn... | Charlize Theron  |      0.0625     |
    |     0     | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 |
    |     1     | Bromwell High is a cartoon... |   Jessica Alba   |      0.125      |
    |     1     | Bromwell High is a cartoon... | Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... | Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... | Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... | Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |  Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |       ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's similarity_threshold parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is
    much more sparse. This is because all results whose score was below the
    specified similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset "
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that a column with the name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that the tag column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type "
                        "string.")

    # use a reasonable default for the general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate
    # types
    if features and not (isinstance(features, list) and
                         all([isinstance(x, str) for x in features])):
        raise TypeError("The feature parameter must be a list of strings "
                        "and those strings must correspond to columns in "
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found "
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as "
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")
    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create the nearest neighbors model
    m = _gl.nearest_neighbors.create(
        features,
        label=tag_name,
        distance=distance,
        features=feature_cols,
        verbose=verbose)

    # add standard toolkit state attributes
    state = {"training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(m, state)
    model.summary()

    return model
def create(data, features=None, bm25_k1=1.5, bm25_b=0.75,
           tfidf_threshold=0.01, verbose=True):
    """
    Create a searchable index of text columns in an SFrame.

    Parameters
    ----------
    data : SFrame
        An SFrame containing at least one str column containing text that
        should be indexed.

    features : list of str
        A list of column names that contain text that should be indexed.
        Default: all str columns in the provided dataset.

    bm25_k1 : float
        Tuning parameter for the relative importance of term frequencies when
        computing the BM25 score between a query token and a document.

    bm25_b : float
        Tuning parameter to downweight scores of long documents when computing
        the BM25 score between a query token and a document.

    tfidf_threshold : float
        Tuning parameter to skip indexing words that have a TF-IDF score below
        this value.

    verbose : bool
        Controls whether or not to print progress during model creation.

    Returns
    -------
    out : SearchModel

    See Also
    --------
    SearchModel.query

    References
    ----------
    Christopher D. Manning, Hinrich Schutze, and Prabhakar Raghavan.
    Introduction to Information Retrieval.
    http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    Examples
    --------
    >>> import graphlab as gl
    >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
    >>> m = gl.toolkits._internal.search.create(sf)
    >>> print m.query('burrito')
    """
    # Input validation on data and features. Validate 'data' before using it
    # to infer the default feature set.
    _raise_error_if_not_of_type(data, [_gl.SFrame])

    if features is None:
        features = _get_str_columns(data)

    _raise_error_if_not_of_type(features, [list])
    for f in features:
        if data[f].dtype() != str:
            raise _ToolkitError("Feature `%s` must be of type str" % f)

    # Store options
    options = {}
    options['bm25_b'] = bm25_b
    options['bm25_k1'] = bm25_k1
    options['tfidf_threshold'] = tfidf_threshold
    options['verbose'] = verbose
    options['features'] = features

    # Construct the model
    proxy = _gl.extensions._SearchIndex()
    proxy.init_options(options)
    proxy.index(data)
    return SearchModel(proxy)
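# For reference, bm25_k1 and bm25_b tune the standard Okapi BM25 score
# (assuming the usual formulation; the _SearchIndex extension's exact variant
# may differ):
#
#     score(q, d) = sum over tokens t in q of
#         IDF(t) * tf(t, d) * (k1 + 1)
#             / (tf(t, d) + k1 * (1 - b + b * |d| / avgdl))
#
# k1 controls term-frequency saturation; b controls how strongly long
# documents are downweighted.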
def distances_to_similarity_scores(distance_fn, distances):
    """
    Convert distances to similarity scores.

    Parameters
    ----------
    distance_fn : str
        The name of the distance function.

    distances : SArray or SFrame
        An `SArray` or `SFrame` of distances to convert to similarity scores.
        If distances is an SFrame, it is expected to contain the following
        columns: "distance", "query_label", and "reference_label", of types
        float, str, and str respectively. If an SFrame is provided that does
        not contain these fields, a ToolkitError is raised.

    Returns
    -------
    out : SArray
        The converted similarity scores.

    Notes
    -----
    - To convert Levenshtein distances to similarities, the distances
      parameter must be an `SFrame`, since we require both of the strings
      being compared in order to normalize.
    """
    if not (isinstance(distances, _gl.SFrame) or
            isinstance(distances, _gl.SArray)):
        raise TypeError("The distances parameter is of type %s; it must be "
                        "an SFrame or an SArray" % type(distances))

    if isinstance(distances, _gl.SFrame):
        column_names = distances.column_names()
        required_names = ["distance", "query_label", "reference_label"]
        if not all([name in column_names for name in required_names]):
            raise _ToolkitError("distances SFrame is missing required "
                                "columns; at a minimum, it should have the "
                                "following columns: \"distance\", "
                                "\"query_label\", and \"reference_label\"")

    if isinstance(distances, _gl.SArray):
        if distance_fn == "levenshtein":
            raise TypeError("Expected an SFrame but got an SArray")
        distances = _gl.SFrame({"distance": distances})

    def levenshtein_sim(dist, s1, s2):
        # Normalize by the length of the longer string, so the score lies in
        # [0, 1]. float() guards against integer division.
        return 1 - float(dist) / max(len(s1), len(s2))

    scores = None
    if distance_fn == "levenshtein" and isinstance(distances, _gl.SFrame):
        scores = distances.apply(
            lambda x: levenshtein_sim(
                x["distance"], x["query_label"], x["reference_label"]))
    elif distance_fn in ("jaccard", "weighted_jaccard", "cosine"):
        scores = distances["distance"].apply(lambda dist: 1 - dist)
    elif distance_fn in ("manhattan", "euclidean", "squared_euclidean"):
        scores = distances["distance"].apply(
            lambda dist: 1 - dist / _MAX_SIMILARITY_RADIUS)
    else:
        raise _ToolkitError("Unsupported distance function: %s" % distance_fn)

    return scores
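# Worked conversions (illustrative values):
#
#     levenshtein: dist=1 between "seattle" (length 7) and "seatle" (length 6)
#                  -> 1 - 1/7 ~= 0.857
#     jaccard / weighted_jaccard / cosine: dist=0.25 -> 1 - 0.25 = 0.75
#     manhattan / euclidean / squared_euclidean:
#                  score = 1 - dist / _MAX_SIMILARITY_RADIUS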
def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
    """
    Evaluate the model's predictive accuracy. This is done by predicting the
    target class for instances in a new dataset and comparing to known target
    values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same names
        as the target and features used for model training. Additional columns
        are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto': Returns all available metrics.

        - 'accuracy': Classification accuracy.

        - 'confusion_matrix': An SFrame with counts of possible
          prediction/true label combinations.

        - 'roc_curve': An SFrame containing information needed for an roc
          curve (binary classification only).

    max_neighbors : int, optional
        Maximum number of neighbors to consider for each point.

    radius : float, optional
        Maximum distance from each point to a neighbor in the reference
        dataset.

    Returns
    -------
    out : dict
        Evaluation results. The dictionary keys are *accuracy* and
        *confusion_matrix* and *roc_curve* (if applicable).

    See also
    --------
    create, predict, predict_topk, classify

    Notes
    -----
    - Because the model randomly breaks ties between predicted classes, the
      results of repeated calls to the `evaluate` method may differ.

    Examples
    --------
    >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
    ...                             'height': [9, 25, 20, 23],
    ...                             'weight': [13, 28, 33, 22]})
    >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
    >>> ans = m.evaluate(sf_train, max_neighbors=2,
    ...                  metric='confusion_matrix')
    >>> print ans['confusion_matrix']
    +--------------+-----------------+-------+
    | target_label | predicted_label | count |
    +--------------+-----------------+-------+
    |     cat      |       dog       |   1   |
    |     dog      |       dog       |   2   |
    |    fossa     |       dog       |   1   |
    +--------------+-----------------+-------+
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.nearest_neighbor_classifier.evaluate')

    ## Validate the metric name.
    _raise_error_evaluation_metric_is_valid(
        metric, ['auto', 'accuracy', 'confusion_matrix', 'roc_curve'])

    ## Make sure the input dataset has a target column with an appropriate
    #  type.
    target = self.get('target')
    _raise_error_if_column_exists(dataset, target, 'dataset', target)

    if dataset[target].dtype() not in (str, int):
        raise TypeError("The target column of the evaluation dataset must "
                        "contain integers or strings.")

    ## The ROC curve is only defined for binary classification.
    if self._state["num_classes"] != 2:
        if (metric == 'roc_curve') or (metric == ['roc_curve']):
            err_msg = "Currently, ROC curve is not supported for "
            err_msg += "multi-class classification in this model."
            raise _ToolkitError(err_msg)
        elif metric == 'auto':
            warn_msg = "WARNING: Ignoring `roc_curve`. "
            warn_msg += "Not supported for multi-class classification."
            print(warn_msg)

    ## Compute predictions with the input dataset.
    ystar = self.predict(dataset, output_type='class',
                         max_neighbors=max_neighbors, radius=radius)
    ystar_prob = self.predict(dataset, output_type='probability',
                              max_neighbors=max_neighbors, radius=radius)

    ## Compile the accuracy metrics.
    results = {}

    if metric in ['accuracy', 'auto']:
        results['accuracy'] = _gl.evaluation.accuracy(
            targets=dataset[target], predictions=ystar)

    if metric in ['confusion_matrix', 'auto']:
        results['confusion_matrix'] = \
            _gl.evaluation.confusion_matrix(targets=dataset[target],
                                            predictions=ystar)

    if self._state["num_classes"] == 2:
        if metric in ['roc_curve', 'auto']:
            results['roc_curve'] = \
                _gl.evaluation.roc_curve(targets=dataset[target],
                                         predictions=ystar_prob)

    return results
def create(dataset, feature=None, expected_runlength=250, lag=7):
    """
    Create a `BayesianChangepointsModel`. Changepoint detection finds where
    there is a shift in the mean or variance of a univariate timeseries. This
    model calculates the probability that a given point is a changepoint,
    given the data up to that point.

    The BayesianChangepointsModel works with either TimeSeries, SArray, or
    SFrame inputs.

    The model created by this function contains a table `scores` that contains
    the computed anomaly scores. The type of `scores` matches the type of the
    input `dataset`, and the table contains 4 columns:

    - *row id/time*: ID of the corresponding row in the input `dataset`. If
      `dataset` is an SFrame, this is the row numbers of the input data; if
      `dataset` is a TimeSeries, it is the index of the time series.

    - *changepoint score*: The probability that the given point is a
      changepoint. This value is in a range between 0 and 1.

    - *value*: input data. The name of this column matches the input
      `feature`.

    - *model update time*: time the model was updated. This is particularly
      useful if the `window_size` is larger than the number of rows in the
      input datasets, because the `scores` table has results from several
      updates.

    Note that any `None` values in the dataset will have a
    `changepoint_score` of `None`, and will be ignored in subsequent
    changepoint probability calculations.

    Parameters
    ----------
    dataset : SFrame, SArray, or TimeSeries
        Input data. The column named by 'feature' will be extracted for
        modeling.

    feature : str, optional
        Name of the column to model. Any data provided to the model in this
        function or with `BayesianChangepointsModel.update` must have a column
        with this name, unless the datasets are in SArray form.

    expected_runlength : int or float, optional
        The a priori expected number of samples between changepoints. This
        helps condition the model. Note that this parameter must be set to a
        value larger than 0.

    lag : int, optional
        The model waits `lag` samples before evaluating the probability of a
        change happening `lag` samples prior. This is useful because it can be
        difficult to evaluate a change after a single sample of a new
        distribution. Note that this causes the last `lag` samples to not have
        enough data to evaluate changepoint scores, so they are filled with
        'None' values. Also note that this value cannot be larger than 100,
        due to only keeping the previous 100 points in memory. The minimum lag
        is 0, which allows immediate detection of changepoints, but with less
        certainty.

    Returns
    -------
    out : BayesianChangepointsModel

    See Also
    --------
    MovingZScoreModel, graphlab.TimeSeries, local_outlier_factor

    References
    ----------
    - The model implemented is described in `'Bayesian Online Changepoint
      Prediction' by Ryan Adams, <http://arxiv.org/pdf/0710.3742v1.pdf>`_.

    Examples
    --------
    >>> sf = graphlab.SFrame({'series': [100]*25 + [200]*25})
    >>> model = graphlab.anomaly_detection.bayesian_changepoints.create(sf,
    ...                                                                 lag=5,
    ...                                                                 feature='series')
    >>> model['scores'][24:28].print_rows(max_column_width=20)
    +--------+-------------------+--------+---------------------+
    | row_id | changepoint_score | series |  model_update_time  |
    +--------+-------------------+--------+---------------------+
    |   24   |   0.136735367681  |  100   | 2016-01-27 14:02... |
    |   25   |   0.831430606595  |  200   | 2016-01-27 14:02... |
    |   26   | 0.000347138442071 |  200   | 2016-01-27 14:02... |
    |   27   | 3.40869782692e-05 |  200   | 2016-01-27 14:02... |
    +--------+-------------------+--------+---------------------+
    [4 rows x 4 columns]
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()
    logger = _logging.getLogger(__name__)

    ## Validate the required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if not isinstance(lag, int):
        raise TypeError("Input 'lag' must be an integer if specified.")

    if lag > 100 or lag < 0:
        raise ValueError("Input 'lag' cannot be greater than 100 or less than 0.")

    if not isinstance(expected_runlength, (int, float)):
        raise TypeError("'expected_runlength' must be either an integer or float.")

    if expected_runlength < 1:
        raise ValueError("Input 'expected_runlength' must be greater than 0.")

    ## Determine the feature name if left unspecified.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the input 'dataset' has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")

    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except:
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")

    ## Validate the type of the feature.
    if series.dtype() not in (int, float):
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")

    ## Initialize the options.
    opts = {}
    opts['expected_runlength'] = expected_runlength
    opts['lag'] = lag
    opts['feature'] = feature

    ## Create the SDK proxy.
    proxy = _gl.extensions._BayesianOnlineChangepoint()
    proxy.init_changepoint_detector(opts, False, series.dropna()[0])

    ## Construct the Python model from the proxy.
    model = BayesianChangepointsModel(proxy)

    ## Construct the scores SFrame from the calculated changepoints.
    scores = _gl.SFrame()
    scores[feature] = series
    changepoints = model.__proxy__.calculate_changepoints(series)

    ## Append None's at the end, where there hasn't been enough data to
    ## determine whether there was a changepoint.
    changepoints = changepoints.append(
        _gl.SArray([None] * (len(scores) - len(changepoints))))
    scores['changepoint_score'] = changepoints
    scores['model_update_time'] = _dt.datetime.now()
    scores = scores[['changepoint_score',  # reorder the columns
                     feature,
                     'model_update_time']]

    ## Add a row id to SFrame input.
    if isinstance(dataset, _gl.SFrame):
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    ## Add the index to TimeSeries input.
    if isinstance(dataset, _gl.TimeSeries):
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]

    dataset_type = 'TimeSeries' if isinstance(dataset, _gl.TimeSeries) else 'SFrame'

    ## Set up the model state.
    state = {
        'dataset_type': dataset_type,
        'num_examples': len(dataset),
        'training_time': _time.time() - start_time
    }

    if isinstance(dataset, _gl.TimeSeries):
        model.__proxy__.set_index_col_name(dataset.index_col_name)

    model.__proxy__.set_state_sframe(scores, state)

    return model
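# In the Adams formulation cited above, `expected_runlength` typically enters
# the model as a constant hazard rate H = 1 / expected_runlength, i.e. the
# prior probability that any given sample starts a new run. (This
# parameterization is assumed here; the exact form lives in the
# _BayesianOnlineChangepoint extension.)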
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps.

    NOTE: If you are using a CPU for the creation step with
    feature_model='auto', creation time may take a while. This is because
    extracting features for images on a CPU is expensive. With a GPU, one can
    expect large speedups.

    .. warning:: The similarity search toolkit is currently in beta, and
        feedback is welcome! Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model, including
        at least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor`
        for more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate
    the similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl
    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct the SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the
    query set:

    >>> model.search(query)
    """
    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
                                 feature_model=feature_model, method=method,
                                 verbose=verbose)
def update(self, dataset, window_size=None, min_observations=None, verbose=True): """ Create a new `MovingZScoreModel` with a new dataset. The `window_size` and `min_observations` parameters can also be updated with this method. The new model contains anomaly scores for each observation in the new `dataset`. In addition, the last `window_size` rows of the existing model's data and anomaly scores are prepended, for continuity and to show how the anomaly score is computed for the first few rows of the new `dataset`. Parameters ---------- dataset : SFrame or TimeSeries New data to use for updating the model. The type of the input 'dataset' must match the type of the data already in the model (if the model has data already). window_size : int, optional Length of the time window to use for defining the moving z-score value, in terms of number of observations. The window size will be the same as the current model's window size if a new window is not specified. min_observations : int, optional Minimum number of non-missing observations in the moving window required to compute the moving Z-score. If unspecified, the entire moving window preceding an observation must not contain any missing values in order for the observation to get an anomaly score. This parameter will be the same as the current model's value if not specified. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : MovingZScoreModel A *new* MovingZScoreModel, with an updated dataset and anomaly scores for the updated dataset. The `scores` field of the new model has the same schema as the `scores` field of the existing model, but data prepended from the existing results have a row ID of 'None'. See Also -------- create Examples -------- >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010], ... 'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]}) >>> model = graphlab.anomaly_detection.moving_zscore.create(sf, ... window_size=3, ... feature='value') ... >>> sf2 = graphlab.SFrame({'year': [2010, 2011, 2012, 2013], ... 'value': [18.4, 12.1, 12.0, 3.6]}) >>> model2 = model.update(sf2) >>> model2['scores'].print_rows(max_column_width=20) +--------+----------------+-------+----------------+---------------------+ | row_id | anomaly_score | value | moving_average | model_update_time | +--------+----------------+-------+----------------+---------------------+ | None | 28.0822407386 | 21.4 | 12.1333333333 | 2016-01-04 16:58... | | None | 1.00086199482 | 10.8 | 15.2 | 2016-01-04 16:58... | | None | 0.795990414837 | 11.2 | 14.9 | 2016-01-04 16:58... | | 0 | 0.801849542822 | 18.4 | 14.4666666667 | 2016-01-04 16:58... | | 1 | 0.391346818515 | 12.1 | 13.4666666667 | 2016-01-04 16:58... | | 2 | 0.593171014002 | 12.0 | 13.9 | 2016-01-04 16:58... | | 3 | 3.52963789428 | 3.6 | 14.1666666667 | 2016-01-04 16:58... 
| +--------+----------------+-------+----------------+---------------------+ [7 rows x 5 columns] """ start_time = _time.time() _mt._get_metric_tracker().track( 'toolkit.anomaly_detection.moving_zscore.update') logger = _logging.getLogger(__name__) ## Validate the new dataset if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)): raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.") if len(dataset) < 1: raise TypeError("Input 'dataset' is empty.") if ((self.__proxy__['dataset_type'] == 'TimeSeries' and not isinstance(dataset, _gl.TimeSeries)) or (self.__proxy__['dataset_type'] == 'SFrame' and not isinstance(dataset, _gl.SFrame))): raise TypeError("New input 'dataset' must have the same type " + "as the data already in the model.") ## Validate the new window size (if there is one), and figure out what # the new window size will be. if window_size is None: window_size = self.__proxy__['window_size'] else: if not isinstance(window_size, int): raise TypeError("Input 'window_size' must be an integer.") if window_size < 1: raise ValueError("Input 'window_size' must greater than or " + "equal to 1.") ## Validate and determine the `min_observations` parameter. if min_observations is None: min_observations = self.__proxy__['min_observations'] else: if not isinstance(min_observations, int): raise TypeError("If specified, input 'min_observations' must " + "be a positive integer.") if min_observations < 1: raise ValueError("If specified, input 'min_observations' must " + "be a positive integer.") ## TimeSeries-specific dataset validation ## Make the sure new data occurs *after* the existing data. scores = self.__proxy__['scores'] if isinstance(dataset, _gl.TimeSeries): first_new_timestamp = dataset[0][dataset.index_col_name] last_old_timestamp = scores[-1][scores.index_col_name] if first_new_timestamp < last_old_timestamp: raise _ToolkitError("The new dataset has data with " + "earlier timestamps than the existing " + "dataset. Please ensure that new data " + "occurs after existing data.") ## Extract the feature from the new dataset and validate it. feature = self.__proxy__['feature'] try: series = dataset[feature] except: raise _ToolkitError("The feature specified by the original " + "model could not be found in the input " + "'dataset'.") if not series.dtype() in [int, float]: raise ValueError("The values in the specified feature must be " + "integers or floats.") ## Create a new model and cut the old score object to the window size. new_state = {k: self.__proxy__[k] for k in ['verbose', 'feature', 'dataset_type']} new_state['window_size'] = window_size new_state['min_observations'] = min_observations new_model = MovingZScoreModel(new_state) ## Save just the old data needed for the moving statistics on the new # data. if len(scores) < window_size: old_scores = scores[:] else: old_scores = scores[-window_size:] ## Compute Z-scores and anomaly scores. series = old_scores[feature].append(series) moving_average, moving_zscore, sufficient_data = \ _moving_z_score(series, window_size, min_observations) anomaly_score = abs(moving_zscore) if not sufficient_data: logger.warning("The number of observations is smaller than " + "the minimum number needed to compute a " + "moving Z-score, so all anomaly scores are 'None'. " + "Consider adding more data with the model's `update` " + "method, or reducing the `window_size` or " + "`min_observations` parameters.") ## General post-processing and formatting. 
scores = _gl.SFrame({feature: series, 'moving_average': moving_average, 'anomaly_score': anomaly_score}) scores['model_update_time'] = _dt.datetime.now() scores = scores[[feature, # reorder the columns 'moving_average', 'anomaly_score', 'model_update_time']] ## Replace the new Z-scores for the *old* data with the original # Z-score for that data. num_new_examples = len(dataset) new_scores = scores[-num_new_examples:] if isinstance(dataset, _gl.TimeSeries): new_scores[dataset.index_col_name] = dataset[dataset.index_col_name] new_scores = _gl.TimeSeries(new_scores, index=dataset.index_col_name) ## The index column should have the same name in the old and new # data. If it doesn't, change the name in the old scores. if dataset.index_col_name != old_scores.index_col_name: old_scores = old_scores.rename( {old_scores.index_col_name: dataset.index_col_name}) if verbose: logger.warning("The new dataset's index column name " + "does not match the existing index " + "column name. The new name is used in " + "the new model.") final_scores = old_scores.union(new_scores) else: new_scores = new_scores.add_row_number('row_id') old_scores['row_id'] = None old_scores['row_id'] = old_scores['row_id'].astype(int) final_scores = old_scores.append(new_scores) ## Finalize and return the model. new_model.__proxy__['num_examples'] = len(scores) new_model.__proxy__['scores'] = final_scores new_model.__proxy__['training_time'] = _time.time() - start_time return new_model
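# A minimal sketch (hypothetical helper names, not the toolkit internals) of
# the bookkeeping `update` performs above: seed the computation with the last
# `window_size` old values, score only the appended rows, and mark the
# prepended old rows with a row ID of None so they keep their original
# anomaly scores. `score_fn` stands in for the internal `_moving_z_score`
# pipeline and is assumed to return one score per element of its input.
def _stitch_update_sketch(old_values, old_scores, new_values, score_fn,
                          window_size):
    tail_values = list(old_values[-window_size:])
    tail_scores = list(old_scores[-window_size:])
    combined = tail_values + list(new_values)

    # recompute scores over the combined series, but keep only the scores
    # for the genuinely new rows
    new_scores = score_fn(combined, window_size)[len(tail_values):]

    rows = [{'row_id': None, 'anomaly_score': s} for s in tail_scores]
    rows += [{'row_id': i, 'anomaly_score': s}
             for i, s in enumerate(new_scores)]
    return rows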
def evaluate(self, data, methods=['average_similarity', 'average_quality', 'log_det']):
    """
    Objectively evaluate the quality and diversity of a data subset.

    There are several quantitative measures of the quality and diversity of a
    set of items. This method provides three:

    - Average quality: The average over the quality features of each of the
      items in ``data``.

    - Average similarity: The average of the pairwise similarities between
      every item in ``data``.

    - Log-determinant: This simultaneously measures both the quality and
      diversity of a set. To measure the log-determinant of a given set, we
      first form the similarity matrix L, where a diagonal entry L_ii
      corresponds to the quality of item i, and an off-diagonal entry L_ij
      corresponds to the similarity between items i and j. We then take the
      log of the determinant of this matrix. This type of matrix is also
      referred to as a Gramian matrix. The determinant of a Gramian matrix
      corresponds to the volume spanned by the vectors used to construct the
      matrix. If an item has a large quality, it corresponds to a longer
      vector, which will increase the volume (and determinant) of L. If two
      feature vectors are similar, then the volume decreases (because the
      vectors point in a similar direction), which correspondingly decreases
      the determinant. Thus, both quality and similarity are encapsulated by
      the log-determinant.

    Parameters
    ----------
    data : SFrame or SGraph
        The subset of data to evaluate.

    methods : list[string], {'average_similarity', 'average_quality', 'log_det'}
        The set of methods to measure. If ``methods`` is None, all possible
        evaluation methods are used.

    Returns
    -------
    out : dict
        Dictionary of values with keys corresponding to measurement types and
        values corresponding to the actual evaluation scores.

    Examples
    --------
    >>> cars = graphlab.SFrame.read_csv('https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
    >>> sampler = graphlab.diverse_sampler.create(data=cars,
    ...                                           item_id='name',
    ...                                           quality_feature='accel',
    ...                                           similarity_features=['mpg', 'displ', 'hp', 'weight', 'origin'])
    >>>
    >>> sf_simple_dd = gl.SFrame({'id': [0, 1, 2],
    ...                           'q': [10, 10, 10],
    ...                           's1': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]})
    >>> sampler = gl.diverse_sampler.create(data=sf_simple_dd,
    ...                                     item_id='id',
    ...                                     quality_feature='q',
    ...                                     similarity_features=['s1'])
    >>> sf = sampler.sample(5, greedy=True, diversity=0.2)
    >>> sampler.evaluate(sf)
    {'log_det': 15.819720050211457,
     'average_quality': 23.76,
     'average_similarity': 0.999730969627407}
    """
    eval_frame = False
    if isinstance(data, _gl.SFrame):
        eval_frame = True
    elif not isinstance(data, _gl.SGraph):
        raise ValueError("Unknown data type " + str(type(data)) + ".")

    div_eval = _gl.extensions.diversity_eval()

    options = dict()
    options["eval_methods"] = methods
    if self._quality_feature is not None:
        options["quality_feature"] = self._quality_feature
    if self._similarity_features is not None:
        options["similarity_features"] = self._similarity_features

    if eval_frame:
        if not self._init_with_frame:
            raise _ToolkitError("Sampler initialized with SGraph, but eval " +
                                "was called with an SFrame.")
        return div_eval.evaluate_frame(data, options)
    else:
        if self._init_with_frame:
            raise _ToolkitError("Sampler initialized with SFrame, but eval " +
                                "was called with an SGraph.")
        return div_eval.evaluate_graph(data, options)
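# A worked sketch of the log-determinant measure described in the docstring
# above: quality scores sit on the diagonal of the Gramian matrix L and
# pairwise similarities sit off the diagonal. This is an illustration only
# (cosine similarity is assumed for the off-diagonal entries; the toolkit's
# internal kernel construction is not shown in this file).
import numpy as np

def _log_det_sketch(quality, features):
    """quality: length-n array of quality scores; features: (n, d) array."""
    X = np.asarray(features, dtype=float)
    q = np.asarray(quality, dtype=float)

    # cosine similarity between every pair of rows
    unit = X / np.linalg.norm(X, axis=1, keepdims=True)
    L = unit.dot(unit.T)

    # quality on the diagonal, similarity off the diagonal
    np.fill_diagonal(L, q)

    # slogdet is numerically stabler than log(det(L))
    sign, logdet = np.linalg.slogdet(L)
    return logdet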
def create(dataset, label=None, features=None, distance=None, method='auto', verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of GraphLab Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *List*: list of integer or string values. Each element is treated as a separate variable in the model. - *String*: string values. Please note: if a composite distance is also specified, this parameter is ignored. distance : string, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~graphlab.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about GraphLab Create distance functions, please see the :py:mod:`~graphlab.toolkits.distances` module. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. - *ball_tree*: use a tree structure to find the k-closest neighbors to each query point. 
The ball tree model is slower to construct than the brute force
      model, but queries are faster than linear time. This method is not
      applicable to the cosine and dot product distances. See
      `Liu, et al (2004)
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
      for implementation details.

    - *brute_force*: compute the distance from a query point to all reference
      observations. There is no computation time for model creation with the
      brute force method (although the reference data is held in the model),
      but each query takes linear time.

    - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate nearest
      neighbors efficiently. The LSH model supports 'euclidean',
      'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product'
      (deprecated), and 'transformed_dot_product' distances. Two options are
      provided for LSH -- ``num_tables`` and ``num_projections_per_table``.
      See the notes below for details.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: For the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: For the LSH method, the number of
          projections/hash functions for each hash table. The default value
          is 4 for 'jaccard' distance, 16 for 'cosine' distance, and 8 for
          other distances. We recommend using 2 to 6 projections for
          'jaccard' distance, 8 to 20 for 'cosine' distance, and 4 to 12 for
          other distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this function.
      Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance calculations
      but are measured on different scales, it is important to standardize
      the features. One way to do this is to subtract the mean of each column
      and divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that work
    well for data with low dimensions :math:`d` (approximately 50). However,
    most of the solutions suffer from either space or query time that is
    exponential in :math:`d`. For large :math:`d`, they often provide little,
    if any, improvement over the 'brute_force' method. This is a well-known
    consequence of the phenomenon called `The Curse of Dimensionality`.

    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an
    approach that is designed to efficiently solve the *approximate* nearest
    neighbor search problem for high dimensional data.
The key idea of LSH is to hash the data points using several hash
    functions, so that the probability of collision is much higher for data
    points which are close to each other than for those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from
    the metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least
      probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability
      at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each function
      :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`. The
      algorithm constructs :math:`L` hash tables, each of which corresponds
      to a different randomly chosen hash function :math:`g`. There are
      :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points into
      each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as
      :math:`q`. These data points from all the :math:`L` hash tables are
      considered as candidates that are then re-ranked by their real
      distances to the query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are the two main parameters. They can be
    set using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - `euclidean` and `squared_euclidean`:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor`, where
      :math:`a` is a vector whose elements are independently sampled from a
      normal distribution, and :math:`b` is a number uniformly sampled from
      :math:`[0, w]`. :math:`w` is a parameter for the bucket width. We set
      :math:`w` using the average of all pairwise `euclidean` distances from
      a small, randomly sampled subset of the reference data.

    - `manhattan`: The hash function of `manhattan` is similar to that of
      `euclidean`. The only difference is that the elements of :math:`a` are
      sampled from a Cauchy distribution instead of a normal distribution.

    - `cosine`: Random Projection is designed to approximate the cosine
      distance between vectors. The hash function is
      :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is a randomly sampled
      unit vector.

    - `jaccard`: We use one-permutation hashing, a method recently proposed
      by Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014]
      <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for
      details.

    - `dot_product`: The reference data points are first transformed to
      fixed-norm vectors, and the minimum `dot_product` distance search
      problem can then be solved by finding the reference data with the
      smallest `cosine` distances. See the paper
      `[Neyshabur and Srebro, ICML 2015]
      <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_ for
      details.

    References
    ----------
    - `Wikipedia - nearest neighbor search
      <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of
      Practical Approximate Nearest Neighbor Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems,
pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the
      Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing: Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables,
    the ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Basic validation of the features input
    if features is not None and not isinstance(features, list):
        raise TypeError("If specified, input 'features' must be a list of " +
                        "strings.")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (
            distance == 'cosine' or
            distance == _graphlab.distances.cosine or
            distance == 'dot_product' or
            distance == _graphlab.distances.dot_product or
            distance == 'transformed_dot_product' or
            distance == _graphlab.distances.transformed_dot_product):
        raise TypeError(
            "The ball tree method does not work with 'cosine', " +
            "'dot_product', or 'transformed_dot_product' distance. "
+ "Please use the 'brute_force' method for these distances.") if method == 'lsh' and ('num_projections_per_table' not in _method_options): if distance == 'jaccard' or distance == _graphlab.distances.jaccard: _method_options['num_projections_per_table'] = 4 elif distance == 'cosine' or distance == _graphlab.distances.cosine: _method_options['num_projections_per_table'] = 16 else: _method_options['num_projections_per_table'] = 8 ## Initial validation and processing of the label if label is None: _label = _robust_column_name('__id', dataset.column_names()) _dataset = dataset.add_row_number(_label) else: _label = label _dataset = _copy.copy(dataset) col_type_map = {c: _dataset[c].dtype() for c in _dataset.column_names()} _validate_row_label(_label, col_type_map) ref_labels = _dataset[_label] ## Determine the internal list of available feature names (may still include # the row label name). if features is None: _features = _dataset.column_names() else: _features = _copy.deepcopy(features) ## Check if there's only one feature and it's the same as the row label. # This would also be trapped by the composite distance validation, but the # error message is not very informative for the user. free_features = set(_features).difference([_label]) if len(free_features) < 1: raise _ToolkitError("The only available feature is the same as the " + "row label column. Please specify features " + "that are not also row labels.") ### Validate and preprocess the distance function ### --------------------------------------------- # - The form of the 'distance' controls how we interact with the 'features' # parameter as well. # - At this point, the row label 'label' may still be in the list(s) of # features. ## Convert any distance function input into a single composite distance. # distance is already a composite distance if isinstance(distance, list): distance = _copy.deepcopy(distance) # distance is a single name (except 'auto') or function handle. elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] # distance is unspecified and needs to be constructed. elif distance is None or distance == 'auto': sample = _dataset.head() distance = _construct_auto_distance(_features, _dataset.column_names(), _dataset.column_types(), sample) else: raise TypeError("Input 'distance' not understood. The 'distance' " " argument must be a string, function handle, or " + "composite distance.") ## Basic composite distance validation, remove the row label from all # feature lists, and convert string distance names into distance functions. 
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if a distance that doesn't support list features is
    #  used with a list-typed column.
    list_features_to_check = []
    sparse_distances = ['jaccard', 'weighted_jaccard', 'cosine',
                        'dot_product', 'transformed_dot_product']
    sparse_distances = [_graphlab.distances.__dict__[k]
                        for k in sparse_distances]

    for d in distance:
        feature_names, dist, _ = d
        list_features = [f for f in feature_names
                         if _dataset[f].dtype() == list]

        for f in list_features:
            if dist in sparse_distances:
                list_features_to_check.append(f)
            else:
                raise TypeError("The chosen distance cannot currently be " +
                                "used on list-typed columns.")

    for f in list_features_to_check:
        only_str_lists = _validate_lists(_dataset[f], [str])
        if not only_str_lists:
            raise TypeError("Distances for sparse data, such as jaccard " +
                            "and weighted_jaccard, can only be used on " +
                            "lists containing only strings. Please modify " +
                            "any list features accordingly before creating " +
                            "the nearest neighbors model.")

    ## Raise an error if Levenshtein distance is applied to multiple columns
    #  in a single distance component.
    for d in distance:
        feature_names, dist, _ = d

        if (len(feature_names) > 1) and \
                (dist == _graphlab.distances.levenshtein):
            raise ValueError(
                "Levenshtein distance cannot be used with multiple " +
                "columns. Please concatenate strings into a single " +
                "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If more than one distance component (specified either directly or
    #    generated automatically because 'distance' is set to 'auto'), then
    #    use brute force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force because there are multiple " +
                  "distance components.")

    else:
        if method == 'auto':
            # get the total number of variables. Assume the number of
            # elements in array type columns does not change
            num_variables = sum([len(x) if hasattr(x, '__iter__') else 1
                                 for x in _six.itervalues(sf_clean[0])])

            # flag if all the features in the single composite are of
            # numeric type.
numeric_type_flag = all([ x in [int, float, list, array.array] for x in sf_clean.column_types() ]) ## Conditions necessary for ball tree to work and be worth it if ((distance[0][1] in [ 'euclidean', 'manhattan', _graphlab.distances.euclidean, _graphlab.distances.manhattan ]) and numeric_type_flag is True and num_variables <= 200): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' _mt._get_metric_tracker().track( 'toolkit.nearest_neighbors_balltree.create') elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' _mt._get_metric_tracker().track( 'toolkit.nearest_neighbors_brute.create') elif _method == 'lsh': model_name = 'nearest_neighbors_lsh' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create') else: raise ValueError( "Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'.") ## Package the model options opts = {} opts.update(_method_options) opts.update({ 'model_name': model_name, 'ref_labels': ref_labels, 'label': label, 'sf_features': sf_clean, 'composite_params': distance }) ## Construct the nearest neighbors model if not verbose: _mt.main.get_server().set_log_progress(False) result = _graphlab.extensions._nearest_neighbors.train(opts) _mt.main.get_server().set_log_progress(True) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) return model
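# To make the LSH notes above concrete, here is a small sketch of the
# 'euclidean' hash family h(q) = floor((a . q + b) / w): each table
# concatenates `num_projections_per_table` such hashes into a bucket key.
# This is an illustration, not the toolkit implementation; in particular the
# bucket width `w` is fixed here, whereas the toolkit sets it from the
# average pairwise distance of a sampled subset of the reference data.
import numpy as np

def _euclidean_lsh_buckets_sketch(X, num_tables=20,
                                  num_projections_per_table=8,
                                  w=4.0, seed=0):
    rng = np.random.RandomState(seed)
    n, d = X.shape
    tables = []
    for _ in range(num_tables):
        A = rng.normal(size=(d, num_projections_per_table))      # directions a
        b = rng.uniform(0.0, w, size=num_projections_per_table)  # offsets b
        codes = np.floor((X.dot(A) + b) / w).astype(int)
        # the tuple of k hash values identifies a point's bucket in this table
        tables.append([tuple(row) for row in codes])
    return tables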
def __init__(self, data=None, item_id=None, quality_feature=None, similarity_features=None, model_proxy = None, _class = None): """ Create a DiverseSampler object. This should never be called directly, because it is necessary to set up an SDK proxy prior to calling __init__. """ if _class: self.__class__ = _class self._init_with_frame = False self.__proxy__ = model_proxy self.__name__ = 'diverse_sampler' self._quality_feature = quality_feature self._similarity_features = similarity_features if data is None and model_proxy is None: raise ValueError("The diverse sampler must be initialized with a " + "reference SFrame or SGraph.") elif data is not None: if not (isinstance(data, _gl.SFrame) or isinstance(data, _gl.SGraph)): raise ValueError("Unknown data type " + str(type(data)) + ".") if item_id is None and model_proxy is None: # Note that for SGraphs, the __id vertex field is intrinsic to each # gl.Vertex, so we don't actually need to specify item_id if isinstance(data, _gl.SFrame): raise ValueError("An item_id must be specified.") if isinstance(data, _gl.SFrame): col_names = data.column_names() elif isinstance(data, _gl.SGraph): if similarity_features is not None and len(similarity_features) > 1: raise _ToolkitError("Only 1 similarity feature is supported for SGraph.") col_names = data.get_fields() if isinstance(data, _gl.SFrame) and item_id not in col_names: raise ValueError("Item ID "+item_id+" does not name " + "a column in the SFrame.") if quality_feature is not None and quality_feature not in col_names: raise ValueError("Quality feature "+quality_feature+" does not name " + "a column in the SFrame.") if similarity_features is not None: for sname in similarity_features: if sname not in col_names: raise ValueError("Similarity feature "+sname+" does not name " + "a column in the SFrame.") opts = dict() if item_id is None and isinstance(data, _gl.SGraph): item_id = "__id" opts["item_id"] = item_id if quality_feature is not None: opts["quality_feature"] = quality_feature if similarity_features is not None: opts["similarity_features"] = similarity_features if isinstance(data, _gl.SFrame): self._init_with_frame = True self.__proxy__.init_with_frame(data, opts) elif isinstance(data, _gl.SGraph): self._init_with_frame = False self.__proxy__.init_with_graph(data, opts)
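# A small usage sketch for the SFrame path validated above, following the
# `create` signature shown in the `evaluate` docstring (the toy column names
# are hypothetical):
#
# >>> sf = gl.SFrame({'id': [0, 1, 2],
# ...                 'q': [1.0, 2.0, 3.0],
# ...                 's1': [[1, 0], [0, 1], [1, 1]]})
# >>> sampler = gl.diverse_sampler.create(data=sf, item_id='id',
# ...                                     quality_feature='q',
# ...                                     similarity_features=['s1'])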
def evaluate(self, dataset, query_name=None, k=5, similarity_threshold=None, exclude_zeros=True, verbose=True): """ Match the reference tags to a set of queries labeled with their true tags, and then evaluate the model's performance on those queries. The true tags should be provided as an additional column in ``dataset``, and that column's name should be the same as the ``tag_name`` parameter specified when the model was created. The type of the tags column should be either string or list (of strings). Parameters ---------- dataset : SFrame Query data to be tagged. query_name : string, optional Name of the column in ``dataset`` to be auto-tagged. If ``dataset`` has more than one column, ``query_name`` must be specified. k : int, optional Number of results to return from the reference set for each query observation. The default is 5, but setting it to ``None`` will return all results whose score is greater than or equal to ``similarity_threshold``. similarity_threshold : float, optional Only results whose score is greater than or equal to the specified ``similarity_threshold`` are returned. The default is ``None``, in which case the ``k`` best results are returned for each query point regardless of score. exclude_zeros : boolean, optional If True, only entries for which there is a tag with a nonzero score are preserved in the output. This is the default behavior. verbose: bool, optional If True, print progress updates and model details. Returns ------- out : dict A dictionary containing the entire confusion matrix, as well as the following evaluation metrics: - Precision - Recall - F1 score See Also -------- tag, graphlab.evaluation.confusion_matrix Notes ----- - Autotagging is a variation on multiclass classification, where in contrast to a multiclass classifier, an autotagger model can output zero tags for a particular query (either because there were no tags with non-zero scores, or as a result of specifying a value for the similarity_threshold parameter). As is standard practice in multiclass classification, we report Precision, Recall, and F1 score as our evaluation metrics. Specifically, we microaverage Precision and Recall by counting type I errors (false positives) and type II errors (false negatives) over the entire confusion matrix. References ---------- - `Wikipedia - Precision and recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_ - Manning, C., Raghavan P., and Schutze H. (2008). Introduction to Information Retrieval. Examples -------- Continuing with the actor autotagger model referenced in previous example (for the ```tag``` method): >>> labeled_reviews_sf = gl.SFrame( "s3://dato-datasets/imdb_reviews/reviews.10.tagged.sframe") >>> labeled_reviews_sf +-------------------------------+---------------------+ | review | actor | +-------------------------------+---------------------+ | When I saw this movie I wa... | [Leonardo DiCaprio] | | I rented this movie last w... | [Matt Damon] | | You've gotta hand it to St... | [Angelina Jolie] | | I caught this film at a te... | [Julia Roberts] | | I took a flyer in renting ... | [Jennifer Aniston] | | Frankly I'm rather incense... | [] | | This movie looked as if it... | [Jude Law] | | My wife and I watch a film... | [] | | A story of amazing disinte... | [] | | I don't remember a movie w... | [] | +-------------------------------+---------------------+ >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False, k=1) .. 
sourcecode:: python

        {'confusion_matrix': Columns:
                count	int
                target_label	str
                predicted_label	str

         Rows: 10

         Data:
         +-------+-------------------+-------------------+
         | count |    target_label   |  predicted_label  |
         +-------+-------------------+-------------------+
         |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
         |   1   |     Matt Damon    |     Matt Damon    |
         |   1   |   Angelina Jolie  |   Angelina Jolie  |
         |   1   |   Julia Roberts   |   Julia Roberts   |
         |   1   |  Jennifer Aniston |  Jennifer Aniston |
         |   1   |      Jude Law     |      Jude Law     |
         |   1   |        None       |     Will Smith    |
         |   1   |        None       |     Emma Stone    |
         |   1   |        None       |  Jennifer Aniston |
         |   1   |        None       |  Charlize Theron  |
         +-------+-------------------+-------------------+
         [10 rows x 3 columns],
         'f1_score': 0.7499999999999999,
         'precision': 0.6,
         'recall': 1.0}

    >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
                   k=1, similarity_threshold=.6)

    .. sourcecode:: python

        {'confusion_matrix': Columns:
                count	int
                target_label	str
                predicted_label	str

         Rows: 7

         Data:
         +-------+-------------------+-------------------+
         | count |    target_label   |  predicted_label  |
         +-------+-------------------+-------------------+
         |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
         |   1   |   Angelina Jolie  |   Angelina Jolie  |
         |   1   |   Julia Roberts   |   Julia Roberts   |
         |   4   |        None       |        None       |
         |   1   |      Jude Law     |      Jude Law     |
         |   1   |     Matt Damon    |        None       |
         |   1   |  Jennifer Aniston |        None       |
         +-------+-------------------+-------------------+
         [7 rows x 3 columns],
         'f1_score': 0.8,
         'precision': 1.0,
         'recall': 0.6666666666666666}
    """
    _mt._get_metric_tracker().track(self.__module__ + '.evaluate')

    tag_name = self.get("tag_name")
    true_tags = dataset.select_column(tag_name)
    if true_tags.dtype() not in (list, str):
        raise TypeError("The %s column must either be of type str or list"
                        % tag_name)

    if true_tags.dtype() == str:
        true_tags = true_tags.apply(lambda x: [x] if x else [])

    true_tags = true_tags.fillna([])

    dataset = dataset.select_columns(
        [x for x in dataset.column_names() if x != tag_name])

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            raise _ToolkitError("similarity_threshold parameter must be a " \
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be " \
                                "between 0 and 1.")

    results = self.tag(dataset, query_name=query_name, k=k,
                       similarity_threshold=similarity_threshold,
                       exclude_zeros=exclude_zeros, verbose=verbose)

    if len(results) == 0:
        raise ValueError("There is no data to evaluate. Try reducing the " \
                         "similarity_threshold or increasing k.")

    group_column = (query_name or dataset.column_names()[0]) + "_id"
    dataset = dataset.add_row_number(group_column)

    results = results.groupby(group_column,
                              {"labels": _gl.aggregate.CONCAT(tag_name)})
    results = dataset.join(results, on={group_column: group_column},
                           how="left")
    results = results.fillna("labels", [])
    results = results.sort(group_column)

    def precision(tps, fps):
        return tps / float(tps + fps)

    def recall(tps, fns):
        return tps / float(tps + fns)

    def f1_score(p, r):
        return 2 * p * r / (p + r)

    confusion_matrix = _gl.evaluation.confusion_matrix(true_tags,
                                                       results["labels"])
    confusion_matrix = confusion_matrix.stack("target_label", "target_label")

    # TODO: this next line will be removed once .stack type-inference is
    # fixed or a type_hint parameter is exposed
    confusion_matrix = _gl.SFrame({"predicted_label": [["stub"]],
                                   "count": [1],
                                   "target_label": ["stub"]})\
                          .append(confusion_matrix)

    confusion_matrix = confusion_matrix.stack("predicted_label",
                                              "predicted_label")

    # TODO: remove this next line, per note above
    confusion_matrix = confusion_matrix[1:]

    tps = confusion_matrix[confusion_matrix.apply(
        lambda row: row["predicted_label"] is not None and
        row["target_label"] == row["predicted_label"])]["count"].sum() or 0

    fps = confusion_matrix[confusion_matrix.apply(
        lambda row: row["predicted_label"] is not None and
        row["target_label"] != row["predicted_label"])]["count"].sum() or 0

    fns = confusion_matrix[confusion_matrix.apply(
        lambda row: row["predicted_label"] is None and
        row["target_label"] is not None)]["count"].sum() or 0

    p = precision(tps, fps)
    r = recall(tps, fns)
    f1 = f1_score(p, r)

    return {"precision": p, "recall": r, "f1_score": f1,
            'confusion_matrix': confusion_matrix}
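# The micro-averaged metrics described in the Notes section can be computed
# directly from (target, predicted) label pairs, without the SFrame
# machinery above. A self-contained sketch (the helper name is hypothetical;
# None means 'no tag'):
def _micro_prf_sketch(pairs):
    pairs = list(pairs)
    tps = sum(1 for t, p in pairs if p is not None and t == p)
    fps = sum(1 for t, p in pairs if p is not None and t != p)
    fns = sum(1 for t, p in pairs if p is None and t is not None)

    precision = tps / float(tps + fps)
    recall = tps / float(tps + fns)
    f1 = 2 * precision * recall / (precision + recall)
    return {'precision': precision, 'recall': recall, 'f1_score': f1}

# For the first evaluate example above: 6 true positives, 4 false positives,
# and 0 false negatives give precision 0.6, recall 1.0, and F1 0.75.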
def tag(self, dataset, query_name=None, k=5, similarity_threshold=None, exclude_zeros=True, verbose=True): """ Match the reference tags passed when a model is created to a new set of queries. This is a many-to-many match: each query may have any number of occurrences of a reference tag. Parameters ---------- dataset : SFrame Query data to be tagged. query_name : string, optional Name of the column in ``dataset`` to be auto-tagged. If ``dataset`` has more than one column, ``query_name`` must be specified. k : int, optional Number of results to return from the reference set for each query observation. The default is 5, but setting it to ``None`` will return all results whose score is greater than or equal to ``similarity_threshold``. similarity_threshold : float, optional Only results whose score is greater than or equal to the specified ``similarity_threshold`` are returned. The default is ``None``, in which case the ``k`` best results are returned for each query point. verbose : bool, optional If True, print progress updates and model details. exclude_zeros : boolean, optional If True, only entries for which there is a tag with a nonzero score are preserved in the output. This is the default behavior. Returns ------- out : SFrame An SFrame with four columns: - row ID - column name specified as `tag_name` parameter to `create` method - column name specified as `query_name` parameter to `tag` method - a similarity score between 0 and 1, indicating the strength of the match between the query data and the suggested reference tag, where a score of zero indicates a poor match and a strength of 1 corresponds to a perfect match Notes ----- - By default, only rows for which there is a tag with a nonzero score are included in the output. To guarantee at least one output row for every input row in ``dataset``, set the ``exclude_zeros`` parameter to False. - If both ``k`` and ``similarity_threshold`` are set to ``None``, a ToolkitError is raised. Examples -------- First construct a toy `SFrame` of actor names, which will serve as the reference set for our autotagger model. >>> actors_sf = gl.SFrame( {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper", "Tom Cruise", "Jude Law", "Robert Pattinson", "Matt Damon", "Brad Pitt", "Johnny Depp", "Leonardo DiCaprio", "Jennifer Aniston", "Jessica Alba", "Emma Stone", "Cameron Diaz", "Scarlett Johansson", "Mila Kunis", "Julia Roberts", "Charlize Theron", "Marion Cotillard", "Angelina Jolie"]}) >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor") Then we load some IMDB movie reviews into an `SFrame` and tag them using the model we created above. The score field in the output is a similarity score, indicating the strength of the match between the query data and the suggested reference tag. >>> reviews_sf = gl.SFrame( "s3://dato-datasets/imdb_reviews/reviews.sframe") >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False) +-----------+-------------------------------+------------------+-----------------+ | review_id | review | actor | score | +-----------+-------------------------------+------------------+-----------------+ | 0 | Story of a man who has unn... | Cameron Diaz | 0.0769230769231 | | 0 | Story of a man who has unn... | Angelina Jolie | 0.0666666666667 | | 0 | Story of a man who has unn... | Charlize Theron | 0.0625 | | 0 | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 | | 1 | Bromwell High is a cartoon... | Jessica Alba | 0.125 | | 1 | Bromwell High is a cartoon... 
|  Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... |  Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... |  Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... |  Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |   Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |        ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set
    if not (k or similarity_threshold):
        raise _ToolkitError("Either the k or the similarity_threshold " \
                            "parameter must be set.")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on " \
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that a column with the specified query name exists
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' \
                            % query_column)

    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({"id": range(len(query_sa)),
                           query_column: query_sa})

    features = _preprocess(query_sa)
    features = features.add_row_number()

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            raise _ToolkitError("similarity_threshold parameter must be a " \
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be " \
                                "between 0 and 1.")

    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self._nn_model.query(features, label="id", k=k,
                                   radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({query_column + "_id": [],
                           query_column: [],
                           self.get("tag_name"): [],
                           "score": []})

    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id",
                    query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)

    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({"reference_label": self.get("tag_name"),
                    "query_label": query_column})
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:  # nothing to join
            _logging.getLogger(__name__).warn(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results
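# `_dists_to_sim_scores` is internal and not shown in this file. A plausible
# reading for the 'weighted_jaccard' case (an assumption, consistent with
# `radius = 1 - similarity_threshold` above): the distance lies in [0, 1],
# so the similarity score is simply one minus the distance.
def _dists_to_sim_scores_sketch(distances):
    return [1.0 - d for d in distances]

# e.g. a weighted-Jaccard distance of 0.0625 maps to a score of 0.9375,
# matching the strongest match in the thresholded example table above.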
def tag(self, dataset, query_name=None, k=5, similarity_threshold=None, exclude_zeros=True, verbose=True): """ Match the reference tags passed when a model is created to a new set of queries. This is a many-to-many match: each query may have any number of occurrences of a reference tag. Parameters ---------- dataset : SFrame Query data to be tagged. query_name : string, optional Name of the column in ``dataset`` to be auto-tagged. If ``dataset`` has more than one column, ``query_name`` must be specified. k : int, optional Number of results to return from the reference set for each query observation. The default is 5, but setting it to ``None`` will return all results whose score is greater than or equal to ``similarity_threshold``. similarity_threshold : float, optional Only results whose score is greater than or equal to the specified ``similarity_threshold`` are returned. The default is ``None``, in which case the ``k`` best results are returned for each query point. verbose : bool, optional If True, print progress updates and model details. exclude_zeros : boolean, optional If True, only entries for which there is a tag with a nonzero score are preserved in the output. This is the default behavior. Returns ------- out : SFrame An SFrame with four columns: - row ID - column name specified as `tag_name` parameter to `create` method - column name specified as `query_name` parameter to `tag` method - a similarity score between 0 and 1, indicating the strength of the match between the query data and the suggested reference tag, where a score of zero indicates a poor match and a strength of 1 corresponds to a perfect match Notes ----- - By default, only rows for which there is a tag with a nonzero score are included in the output. To guarantee at least one output row for every input row in ``dataset``, set the ``exclude_zeros`` parameter to False. - If both ``k`` and ``similarity_threshold`` are set to ``None``, a ToolkitError is raised. Examples -------- First construct a toy `SFrame` of actor names, which will serve as the reference set for our autotagger model. >>> actors_sf = gl.SFrame( {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper", "Tom Cruise", "Jude Law", "Robert Pattinson", "Matt Damon", "Brad Pitt", "Johnny Depp", "Leonardo DiCaprio", "Jennifer Aniston", "Jessica Alba", "Emma Stone", "Cameron Diaz", "Scarlett Johansson", "Mila Kunis", "Julia Roberts", "Charlize Theron", "Marion Cotillard", "Angelina Jolie"]}) >>> m = gl.data_matching.autotagger.create(actors_sf, tag_name="actor") Then we load some IMDB movie reviews into an `SFrame` and tag them using the model we created above. The score field in the output is a similarity score, indicating the strength of the match between the query data and the suggested reference tag. >>> reviews_sf = gl.SFrame( "https://static.turi.com/datasets/imdb_reviews/reviews.sframe") >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False) +-----------+-------------------------------+------------------+-----------------+ | review_id | review | actor | score | +-----------+-------------------------------+------------------+-----------------+ | 0 | Story of a man who has unn... | Cameron Diaz | 0.0769230769231 | | 0 | Story of a man who has unn... | Angelina Jolie | 0.0666666666667 | | 0 | Story of a man who has unn... | Charlize Theron | 0.0625 | | 0 | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 | | 1 | Bromwell High is a cartoon... | Jessica Alba | 0.125 | | 1 | Bromwell High is a cartoon... 
|  Jennifer Aniston |       0.1       |
    |     1     | Bromwell High is a cartoon... |  Charlize Theron  |       0.05      |
    |     1     | Bromwell High is a cartoon... |  Robert Pattinson |  0.047619047619 |
    |     1     | Bromwell High is a cartoon... |  Marion Cotillard |  0.047619047619 |
    |     2     | Airport '77 starts as a br... |   Julia Roberts   | 0.0961538461538 |
    |    ...    |              ...              |        ...        |       ...       |
    +-----------+-------------------------------+------------------+-----------------+

    The initial results look a little noisy. To filter out obvious spurious
    matches, we can set the `tag` method's `similarity_threshold` parameter.

    >>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
              similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.tag')

    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that either k or similarity_threshold is set
    if not (k or similarity_threshold):
        raise _ToolkitError("Either the k or the similarity_threshold " \
                            "parameter must be set.")

    # ensure that query_name is provided if dataset has > 1 column
    if dataset.num_cols() > 1 and not query_name:
        raise _ToolkitError("No query_name parameter specified on " \
                            "dataset with %d columns" % dataset.num_cols())

    query_column = query_name or dataset.column_names()[0]

    # ensure that a column with the specified query name exists
    if query_column not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' \
                            % query_column)

    query_sa = dataset.select_column(query_column)
    query_sf = _gl.SFrame({"id": range(len(query_sa)),
                           query_column: query_sa})

    features = _preprocess(query_sa)
    features = features.add_row_number()

    if similarity_threshold:
        if not isinstance(similarity_threshold, (float, int)):
            raise _ToolkitError("similarity_threshold parameter must be a " \
                                "float or an int.")

        if similarity_threshold < 0 or similarity_threshold > 1:
            raise _ToolkitError("similarity_threshold parameter must be " \
                                "between 0 and 1.")

    radius = (1 - similarity_threshold) if similarity_threshold else None

    results = self.__proxy__['nearest_neighbors_model'].query(
        features, label="id", k=k, radius=radius, verbose=verbose)

    # return empty SFrame immediately if no NN results
    if len(results) == 0:
        return _gl.SFrame({query_column + "_id": [],
                           query_column: [],
                           self.get("tag_name"): [],
                           "score": []})

    results = results.join(query_sf, on={"query_label": "id"})
    results.rename({"query_label": query_column + "_id"})
    results.rename({query_column: "query_label"})

    # convert distances to similarity scores
    scores = _dists_to_sim_scores("weighted_jaccard", results)

    results.add_column(scores, "score")
    results.remove_column("distance")
    results.remove_column("rank")
    results.rename({"reference_label": self.get("tag_name"),
                    "query_label": query_column})
    results.swap_columns(self.get("tag_name"), query_column)

    if exclude_zeros:
        try:
            results = results.filter_by(0.0, "score", exclude=True)
        except RuntimeError:  # nothing to join
            _logging.getLogger(__name__).warn(
                "Empty results after filtering scores of 0.")
            results = results.head(0)

    return results
def create(dataset, window_size, feature=None, min_observations=None,
           verbose=True):
    """
    Create a :class:`MovingZScoreModel` model. This model fits a moving
    average to a univariate time series and identifies points that are far
    from the fitted curve.

    The MovingZScoreModel works with either TimeSeries or SFrame inputs. A
    uniform sampling rate is assumed and the data window must be defined in
    terms of number of observations.

    This model differs from other GraphLab Create models in that it can be
    created from an existing `MovingZScoreModel`. To create a new model in
    this fashion, use the existing model's `update` method.

    The model created by this function includes a table `scores` that
    contains the computed anomaly scores. The type of `scores` matches the
    type of the input `dataset`, and the table contains 5 columns:

    - *row id/time*: ID of the corresponding row in the input `dataset`. If
      `dataset` is an SFrame, these are the row numbers of the input data; if
      `dataset` is a TimeSeries, it is the index of the time series.

    - *anomaly score*: absolute value of the moving Z-score. A score of 0
      indicates that the value is identical to the moving average. The higher
      the score, the more likely a point is to be an anomaly.

    - *value*: input data. The name of this column matches the input
      `feature`.

    - *moving average*: moving average of each point's preceding
      `window_size` values.

    - *model update time*: time the model was updated. This is particularly
      useful if the `window_size` is larger than the number of rows in the
      input datasets, because the `scores` table has results from several
      updates.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    window_size : int
        Length of the time window to use for defining the moving z-score
        value, in terms of number of observations.

    feature : str, optional
        Name of the column to model. Any data provided to the model with
        either the `create` or `update` functions must have a column with
        this name. The feature name is not necessary if `dataset` is an
        SFrame with a single column or a TimeSeries with a single value
        column; it can be determined automatically in this case.

    min_observations : int, optional
        Minimum number of non-missing observations in the moving window
        required to compute the moving Z-score. If unspecified, the entire
        moving window preceding an observation must not contain any missing
        values in order for the observation to get an anomaly score.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : MovingZScoreModel
        A trained :class:`MovingZScoreModel`, which contains a table called
        `scores` that includes the anomaly score for each input data point.
        The type of the `scores` table matches the type of the input
        `dataset`.

    See Also
    --------
    MovingZScoreModel, MovingZScoreModel.update

    Notes
    -----
    - The moving Z-score for a data point :math:`x_t` is simply the value of
      :math:`x_t` standardized by subtracting the moving mean just prior to
      time :math:`t` and dividing by the moving standard deviation just prior
      to :math:`t`. Suppose :math:`w` stands for the `window_size` in terms
      of the number of observations. Then the moving Z-score is:

      .. math:: z(x_t) = \\frac{x_t - \\bar{x}_t}{s_t}

      where the moving average is:

      .. math:: \\bar{x}_t = (1/w) \\sum_{i=t-w}^{t-1} x_i

      and the moving standard deviation is:

      .. math:: s_t = \\sqrt{(1/w) \\sum_{i=t-w}^{t-1} (x_i - \\bar{x}_t)^2}
    - The moving Z-score at points within `window_size` observations of the
      beginning of a series is not defined, because there are insufficient
      points to compute the moving average and moving standard deviation.
      This is represented by missing values.

    - Missing values in the input dataset are assigned missing values
      ('None') for their anomaly scores as well.

    - If there is no variation in the values preceding a given observation,
      the moving Z-score can be infinite or undefined. If the given
      observation is equal to the moving average, the anomaly score is coded
      as 'nan'; if the observation is *not* equal to the moving average, the
      anomaly score is 'inf'.

    Examples
    --------
    >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
    ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
    >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
    ...                                                         window_size=3,
    ...                                                         feature='value')
    >>> model['scores'].print_rows(max_column_width=20)
    +--------+----------------+-------+----------------+---------------------+
    | row_id | anomaly_score  | value | moving_average |  model_update_time  |
    +--------+----------------+-------+----------------+---------------------+
    |   0    |      None      |  12.2 |      None      | 2016-01-04 16:55... |
    |   1    |      None      |  11.7 |      None      | 2016-01-04 16:55... |
    |   2    |      None      |  12.5 |      None      | 2016-01-04 16:55... |
    |   3    | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:55... |
    |   4    | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:55... |
    |   5    | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:55... |
    +--------+----------------+-------+----------------+---------------------+
    [6 rows x 5 columns]
    """
    _mt._get_metric_tracker().track(
        'toolkit.anomaly_detection.moving_zscore.create')

    start_time = _time.time()
    logger = _logging.getLogger(__name__)

    ## Validate required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if not isinstance(window_size, int):
        raise TypeError("Input 'window_size' must be an integer.")

    if window_size < 1:
        raise ValueError("Input 'window_size' must be greater than or " +
                         "equal to 1.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if min_observations is not None:
        if not isinstance(min_observations, int):
            raise TypeError("If specified, input 'min_observations' must " +
                            "be a positive integer.")

        if min_observations < 1:
            raise ValueError("If specified, input 'min_observations' must " +
                             "be a positive integer.")

    ## Determine the feature name if left unspecified.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the input 'dataset' has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")

    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except:
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")

    ## Validate the type of the feature.
    if series.dtype() not in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")

    ## Compute the moving average, Z-score, and a final anomaly score.
    #  For all anomaly detection models, the final score should be in the
    #  range [0, \infty), with higher values indicating more outlier-ness.
    moving_average, moving_zscore, sufficient_data = \
        _moving_z_score(series, window_size, min_observations)

    anomaly_score = abs(moving_zscore)

    if not sufficient_data:
        logger.warning("The number of observations is smaller than " +
                       "the minimum number needed to compute a " +
                       "moving Z-score, so all anomaly scores are 'None'. " +
                       "Consider adding more data with the model's `update` " +
                       "method, or reducing the `window_size` or " +
                       "`min_observations` parameters.")

    ## Format the results.
    scores = _gl.SFrame({feature: series,
                         'moving_average': moving_average,
                         'anomaly_score': anomaly_score})
    scores['model_update_time'] = _dt.datetime.now()
    scores = scores[['anomaly_score',  # reorder the columns
                     feature,
                     'moving_average',
                     'model_update_time']]

    if isinstance(dataset, _gl.SFrame):
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    if isinstance(dataset, _gl.TimeSeries):
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]
        scores = _gl.TimeSeries(scores, index=dataset.index_col_name)

    dataset_type = 'TimeSeries' if isinstance(dataset, _gl.TimeSeries) \
        else 'SFrame'

    ## Set up the model.
    state = {
        'dataset_type': dataset_type,
        'verbose': verbose,
        'window_size': window_size,
        'min_observations': min_observations,
        'num_examples': len(dataset),
        'feature': feature,
        'training_time': _time.time() - start_time,
        'scores': scores}

    model = MovingZScoreModel(state)

    return model
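# A minimal sketch of the `_moving_z_score` semantics documented above (the
# real helper is internal and not shown here; this version also omits the
# `sufficient_data` flag). Each point is scored against the mean and standard
# deviation of the `window_size` values preceding it; windows with fewer than
# `min_observations` non-missing values yield None, and zero-variance windows
# yield 'nan' or 'inf' per the Notes.
import math

def _moving_z_score_sketch(values, window_size, min_observations=None):
    if min_observations is None:
        min_observations = window_size

    averages, zscores = [], []
    for t, x in enumerate(values):
        window = [v for v in values[max(0, t - window_size):t]
                  if v is not None]

        if t < window_size or x is None or len(window) < min_observations:
            averages.append(None)
            zscores.append(None)
            continue

        mean = sum(window) / float(len(window))
        var = sum((v - mean) ** 2 for v in window) / float(len(window))

        if var == 0.0:
            z = float('nan') if x == mean else float('inf')
        else:
            z = (x - mean) / math.sqrt(var)

        averages.append(mean)
        zscores.append(z)

    return averages, zscores

# Sanity check against the docstring example: for value 21.4 at row 3 with
# preceding window [12.2, 11.7, 12.5], the moving average is 12.133... and
# abs(z) = 28.082..., matching the `scores` table above.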
def evaluate(self, dataset, query_name=None, k=5, similarity_threshold=None, exclude_zeros=True, verbose=True): """ Match the reference tags to a set of queries labeled with their true tags, and then evaluate the model's performance on those queries. The true tags should be provided as an additional column in ``dataset``, and that column's name should be the same as the ``tag_name`` parameter specified when the model was created. The type of the tags column should be either string or list (of strings). Parameters ---------- dataset : SFrame Query data to be tagged. query_name : string, optional Name of the column in ``dataset`` to be auto-tagged. If ``dataset`` has more than one column, ``query_name`` must be specified. k : int, optional Number of results to return from the reference set for each query observation. The default is 5, but setting it to ``None`` will return all results whose score is greater than or equal to ``similarity_threshold``. similarity_threshold : float, optional Only results whose score is greater than or equal to the specified ``similarity_threshold`` are returned. The default is ``None``, in which case the ``k`` best results are returned for each query point regardless of score. exclude_zeros : boolean, optional If True, only entries for which there is a tag with a nonzero score are preserved in the output. This is the default behavior. verbose: bool, optional If True, print progress updates and model details. Returns ------- out : dict A dictionary containing the entire confusion matrix, as well as the following evaluation metrics: - Precision - Recall - F1 score See Also -------- tag, graphlab.evaluation.confusion_matrix Notes ----- - Autotagging is a variation on multiclass classification, where in contrast to a multiclass classifier, an autotagger model can output zero tags for a particular query (either because there were no tags with non-zero scores, or as a result of specifying a value for the similarity_threshold parameter). As is standard practice in multiclass classification, we report Precision, Recall, and F1 score as our evaluation metrics. Specifically, we microaverage Precision and Recall by counting type I errors (false positives) and type II errors (false negatives) over the entire confusion matrix. References ---------- - `Wikipedia - Precision and recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_ - Manning, C., Raghavan P., and Schutze H. (2008). Introduction to Information Retrieval. Examples -------- Continuing with the actor autotagger model referenced in previous example (for the ```tag``` method): >>> labeled_reviews_sf = gl.SFrame( "https://static.turi.com/datasets/imdb_reviews/reviews.10.tagged.sframe") >>> labeled_reviews_sf +-------------------------------+---------------------+ | review | actor | +-------------------------------+---------------------+ | When I saw this movie I wa... | [Leonardo DiCaprio] | | I rented this movie last w... | [Matt Damon] | | You've gotta hand it to St... | [Angelina Jolie] | | I caught this film at a te... | [Julia Roberts] | | I took a flyer in renting ... | [Jennifer Aniston] | | Frankly I'm rather incense... | [] | | This movie looked as if it... | [Jude Law] | | My wife and I watch a film... | [] | | A story of amazing disinte... | [] | | I don't remember a movie w... | [] | +-------------------------------+---------------------+ >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False, k=1) .. 
sourcecode:: python

            {'confusion_matrix': Columns:
                    count           int
                    target_label    str
                    predicted_label str

             Rows: 10

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |     Matt Damon    |     Matt Damon    |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   1   |  Jennifer Aniston |  Jennifer Aniston |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |        None       |     Will Smith    |
             |   1   |        None       |     Emma Stone    |
             |   1   |        None       |  Jennifer Aniston |
             |   1   |        None       |  Charlize Theron  |
             +-------+-------------------+-------------------+
             [10 rows x 3 columns],
             'f1_score': 0.7499999999999999,
             'precision': 0.6,
             'recall': 1.0}

        >>> m.evaluate(labeled_reviews_sf, query_name="review", verbose=False,
        ...            k=1, similarity_threshold=.6)

        .. sourcecode:: python

            {'confusion_matrix': Columns:
                    count           int
                    target_label    str
                    predicted_label str

             Rows: 7

             Data:
             +-------+-------------------+-------------------+
             | count |    target_label   |  predicted_label  |
             +-------+-------------------+-------------------+
             |   1   | Leonardo DiCaprio | Leonardo DiCaprio |
             |   1   |   Angelina Jolie  |   Angelina Jolie  |
             |   1   |   Julia Roberts   |   Julia Roberts   |
             |   4   |        None       |        None       |
             |   1   |      Jude Law     |      Jude Law     |
             |   1   |     Matt Damon    |        None       |
             |   1   |  Jennifer Aniston |        None       |
             +-------+-------------------+-------------------+
             [7 rows x 3 columns],
             'f1_score': 0.8,
             'precision': 1.0,
             'recall': 0.6666666666666666}
        """
        _mt._get_metric_tracker().track(self.__module__ + '.evaluate')

        tag_name = self.get("tag_name")
        true_tags = dataset.select_column(tag_name)
        if true_tags.dtype() not in (list, str):
            raise TypeError(
                "The %s column must either be of type str or list" % tag_name)

        if true_tags.dtype() == str:
            true_tags = true_tags.apply(lambda x: [x] if x else [])

        true_tags = true_tags.fillna([])
        dataset = dataset.select_columns(
            [x for x in dataset.column_names() if x != tag_name])

        if similarity_threshold is not None:
            if not isinstance(similarity_threshold, (float, int)):
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "a float or an int.")

            if similarity_threshold < 0 or similarity_threshold > 1:
                raise _ToolkitError("similarity_threshold parameter must be " \
                                    "between 0 and 1.")

        results = self.tag(dataset, query_name=query_name, k=k,
                           similarity_threshold=similarity_threshold,
                           exclude_zeros=exclude_zeros, verbose=verbose)

        if len(results) == 0:
            raise ValueError("There is no data to evaluate. Try reducing " \
                             "the similarity_threshold or increasing k.")

        group_column = (query_name or dataset.column_names()[0]) + "_id"
        dataset = dataset.add_row_number(group_column)
        results = results.groupby(group_column,
                                  {"labels": _gl.aggregate.CONCAT(tag_name)})
        results = dataset.join(results, on={group_column: group_column},
                               how="left")
        results = results.fillna("labels", [])
        results = results.sort(group_column)

        def precision(tps, fps):
            return tps / float(tps + fps)

        def recall(tps, fns):
            return tps / float(tps + fns)

        def f1_score(p, r):
            return 2 * p * r / (p + r)

        confusion_matrix = _gl.evaluation.confusion_matrix(
            true_tags, results["labels"])
        confusion_matrix = confusion_matrix.stack("target_label",
                                                  "target_label")

        # TODO: this next line can be removed once .stack type-inference is
        # fixed or a type_hint parameter is exposed.
        confusion_matrix = _gl.SFrame({"predicted_label": [["stub"]],
                                       "count": [1],
                                       "target_label": ["stub"]})\
                                       .append(confusion_matrix)
        confusion_matrix = confusion_matrix.stack("predicted_label",
                                                  "predicted_label")
        # TODO: remove this next line, per the note above.
        confusion_matrix = confusion_matrix[1:]

        tps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] == row["predicted_label"])]["count"].sum() or 0

        fps = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is not None and \
            row["target_label"] != row["predicted_label"])]["count"].sum() or 0

        fns = confusion_matrix[confusion_matrix.apply(
            lambda row: row["predicted_label"] is None and \
            row["target_label"] is not None)]["count"].sum() or 0

        p = precision(tps, fps)
        r = recall(tps, fns)
        f1 = f1_score(p, r)

        return {
            "precision": p,
            "recall": r,
            "f1_score": f1,
            'confusion_matrix': confusion_matrix
        }
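For intuition, the micro-averaged metrics returned above can be computed directly from aligned lists of true and predicted tag sets, without building the stacked confusion matrix. A self-contained sketch follows; the function name and the zero-denominator handling are illustrative assumptions (the method above would raise ZeroDivisionError in those edge cases).

def micro_prf_sketch(true_tag_lists, predicted_tag_lists):
    tps = fps = fns = 0
    for truth, predicted in zip(true_tag_lists, predicted_tag_lists):
        truth, predicted = set(truth), set(predicted)
        tps += len(truth & predicted)   # correct tags
        fps += len(predicted - truth)   # type I errors
        fns += len(truth - predicted)   # type II errors
    p = tps / float(tps + fps) if tps + fps else 0.0
    r = tps / float(tps + fns) if tps + fns else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return {"precision": p, "recall": r, "f1_score": f1}

# e.g. micro_prf_sketch([["Matt Damon"], []], [["Matt Damon"], ["Emma Stone"]])
# -> {'precision': 0.5, 'recall': 1.0, 'f1_score': 0.666...}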
    def evaluate(self, data, methods=['average_similarity', 'average_quality',
                                      'log_det']):
        """
        Objectively evaluate the quality and diversity of a data subset.

        There are several quantitative measures of the quality and diversity
        of a set. This method provides three:

        - Average quality: The average over the quality features of each of
          the items in data.

        - Average similarity: The average of the pairwise similarities between
          every item in data.

        - Log-determinant: This simultaneously measures both the quality and
          diversity of a set. To measure the log-determinant of a given set,
          we first form the similarity matrix L, where a diagonal entry L_ii
          corresponds to the quality of item i, and an off-diagonal entry L_ij
          corresponds to the similarity between items i and j. We then take
          the log of the determinant of this matrix.

          This type of matrix is also referred to as a Gramian matrix. The
          determinant of a Gramian matrix corresponds to the volume spanned by
          the vectors used to construct the matrix. If an item has a large
          quality, it corresponds to a longer vector, which will increase the
          volume (and determinant) of L. If two feature vectors are similar,
          then the volume decreases (because the vectors point in a similar
          direction), which correspondingly decreases the determinant. Thus,
          both quality and similarity are encapsulated by the log-determinant.

        Parameters
        ----------
        data : SFrame or SGraph
            The subset of data to evaluate.

        methods : list[string], {'average_similarity', 'average_quality', 'log_det'}
            The set of methods to measure. If methods is None, then all
            possible evaluation methods will be used.

        Returns
        -------
        out : dict
            Dictionary of values with keys corresponding to measurement types
            and values corresponding to the actual evaluation scores.

        Examples
        --------
        >>> cars = graphlab.SFrame.read_csv('https://static.turi.com/datasets/auto-mpg/auto-mpg.csv')
        >>> sampler = graphlab.diverse_sampler.create(data=cars,
        ...                                           item_id='name',
        ...                                           quality_feature='accel',
        ...                                           similarity_features=['mpg', 'displ', 'hp',
        ...                                                                'weight', 'origin'])
        >>>
        >>> sf_simple_dd = gl.SFrame({'id': [0, 1, 2],
        ...                           'q': [10, 10, 10],
        ...                           's1': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]})
        >>> sampler = gl.diverse_sampler.create(data=sf_simple_dd,
        ...                                     item_id='id',
        ...                                     quality_feature='q',
        ...                                     similarity_features=['s1'])
        >>> sf = sampler.sample(5, greedy=True, diversity=0.2)
        >>> sampler.evaluate(sf)
        {'log_det': 15.819720050211457,
         'average_quality': 23.76,
         'average_similarity': 0.999730969627407}
        """
        eval_frame = False
        if isinstance(data, _gl.SFrame):
            eval_frame = True
        elif not isinstance(data, _gl.SGraph):
            raise ValueError("Unknown data type " + str(type(data)) + ".")

        div_eval = _gl.extensions.diversity_eval()

        options = dict()
        options["eval_methods"] = methods
        if self._quality_feature is not None:
            options["quality_feature"] = self._quality_feature
        if self._similarity_features is not None:
            options["similarity_features"] = self._similarity_features

        if eval_frame:
            if not self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SGraph, but eval " + \
                                    "was called with an SFrame.")
            return div_eval.evaluate_frame(data, options)
        else:
            if self._init_with_frame:
                raise _ToolkitError("Sampler initialized with SFrame, but eval " + \
                                    "was called with an SGraph.")
            return div_eval.evaluate_graph(data, options)
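The log-determinant measure described in the docstring is easy to state with numpy. Below is a sketch under the stated construction (qualities on the diagonal of L, pairwise similarities off the diagonal); this is illustrative only, not the `diversity_eval` extension's implementation, and the function name is hypothetical.

import numpy as np

def log_det_sketch(qualities, similarities):
    # similarities: n-by-n symmetric matrix of pairwise similarities.
    L = np.array(similarities, dtype=float)
    L[np.diag_indices(len(qualities))] = qualities
    sign, logdet = np.linalg.slogdet(L)  # numerically stable log|det L|
    return logdet

# Higher pairwise similarity shrinks the spanned volume, hence the score:
# log_det_sketch([2.0, 2.0], [[1.0, 0.1], [0.1, 1.0]])  # log(4 - 0.01) ~ 1.384
# log_det_sketch([2.0, 2.0], [[1.0, 0.9], [0.9, 1.0]])  # log(4 - 0.81) ~ 1.160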
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps.

    NOTE: If you are using a CPU for the creation step with
    feature_model='auto', creation time may take a while. This is because
    extracting features for images on a CPU is expensive. With a GPU, one can
    expect large speedups.

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model, including
        at least one column of images.

    row_label : str, optional
        Name of the SFrame column with row ids. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows
        when the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor`
        for more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to evaluate
    the similarity between each query and candidate results.

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl
    >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build a neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct a SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features='image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the
    query set:

    >>> model.search(query)
    """
    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" \
                            % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
                                 feature_model=feature_model, method=method,
                                 verbose=verbose)
    def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None):
        """
        Evaluate the model's predictive accuracy. This is done by predicting
        the target class for instances in a new dataset and comparing to known
        target values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the target and features used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto': Returns all available metrics.
            - 'accuracy': Classification accuracy.
            - 'confusion_matrix': An SFrame with counts of possible
              prediction/true label combinations.
            - 'roc_curve': An SFrame containing information needed for an ROC
              curve (binary classification only).

        max_neighbors : int, optional
            Maximum number of neighbors to consider for each point.

        radius : float, optional
            Maximum distance from each point to a neighbor in the reference
            dataset.

        Returns
        -------
        out : dict
            Evaluation results. The dictionary keys are *accuracy*,
            *confusion_matrix*, and *roc_curve* (if applicable).

        See Also
        --------
        create, predict, predict_topk, classify

        Notes
        -----
        - Because the model randomly breaks ties between predicted classes,
          the results of repeated calls to the `evaluate` method may differ.

        Examples
        --------
        >>> sf_train = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'],
        ...                             'height': [9, 25, 20, 23],
        ...                             'weight': [13, 28, 33, 22]})
        >>> m = graphlab.nearest_neighbor_classifier.create(sf_train, target='species')
        >>> ans = m.evaluate(sf_train, max_neighbors=2,
        ...                  metric='confusion_matrix')
        >>> print(ans['confusion_matrix'])
        +--------------+-----------------+-------+
        | target_label | predicted_label | count |
        +--------------+-----------------+-------+
        |     cat      |       dog       |   1   |
        |     dog      |       dog       |   2   |
        |    fossa     |       dog       |   1   |
        +--------------+-----------------+-------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.nearest_neighbor_classifier.evaluate')

        ## Validate the metric name
        _raise_error_evaluation_metric_is_valid(metric,
                                                ['auto', 'accuracy',
                                                 'confusion_matrix',
                                                 'roc_curve'])

        ## Make sure the input dataset has a target column with an
        #  appropriate type.
        target = self.get('target')
        _raise_error_if_column_exists(dataset, target, 'dataset', target)

        if not dataset[target].dtype() == str and not dataset[target].dtype() == int:
            raise TypeError("The target column of the evaluation dataset must "
                            "contain integers or strings.")

        if self._state["num_classes"] != 2:
            if (metric == 'roc_curve') or (metric == ['roc_curve']):
                err_msg = "Currently, ROC curve is not supported for "
                err_msg += "multi-class classification in this model."
                raise _ToolkitError(err_msg)
            else:
                warn_msg = "WARNING: Ignoring `roc_curve`. "
                warn_msg += "Not supported for multi-class classification."
                print(warn_msg)

        ## Compute predictions with the input dataset.
ystar = self.predict(dataset, output_type='class', max_neighbors=max_neighbors, radius=radius) ystar_prob = self.predict(dataset, output_type='probability', max_neighbors=max_neighbors, radius=radius) ## Compile accuracy metrics results = {} if metric in ['accuracy', 'auto']: results['accuracy'] = _gl.evaluation.accuracy(targets=dataset[target], predictions=ystar) if metric in ['confusion_matrix', 'auto']: results['confusion_matrix'] = \ _gl.evaluation.confusion_matrix(targets=dataset[target], predictions=ystar) if self._state["num_classes"] == 2: if metric in ['roc_curve', 'auto']: results['roc_curve'] = \ _gl.evaluation.roc_curve(targets=dataset[target], predictions=ystar_prob) return results
def _validate_features(features, column_type_map, valid_types, label):
    """
    Identify the subset of desired `features` that are valid for the Kmeans
    model. A warning is emitted for each feature that is excluded.

    Parameters
    ----------
    features : list[str]
        Desired feature names.

    column_type_map : dict[str, type]
        Dictionary mapping each column name to the type of values in the
        column.

    valid_types : list[type]
        Exclude features whose type is not in this list.

    label : str
        Name of the row label column.

    Returns
    -------
    valid_features : list[str]
        Names of features to include in the model.
    """
    if not isinstance(features, list):
        raise TypeError("Input 'features' must be a list, if specified.")

    if len(features) == 0:
        raise ValueError("If specified, input 'features' must contain " +
                         "at least one column name.")

    ## Remove duplicates
    num_original_features = len(features)
    features = set(features)

    if len(features) < num_original_features:
        _logging.warning("Duplicates have been removed from the list of "
                         "features.")

    ## Remove the row label
    if label in features:
        features.remove(label)
        _logging.warning("The row label has been removed from the list of "
                         "features.")

    ## Check the type of each feature against the list of valid types
    valid_features = []

    for ftr in features:
        if not isinstance(ftr, str):
            _logging.warning("Feature '{}' excluded. ".format(ftr) +
                             "Features must be specified as strings " +
                             "corresponding to column names in the input "
                             "dataset.")

        elif ftr not in column_type_map:
            _logging.warning("Feature '{}' excluded because ".format(ftr) +
                             "it is not in the input dataset.")

        elif column_type_map[ftr] not in valid_types:
            _logging.warning("Feature '{}' excluded because of its type. ".format(ftr) +
                             "Kmeans features must be int, float, dict, or "
                             "array.array type.")

        else:
            valid_features.append(ftr)

    if len(valid_features) == 0:
        raise _ToolkitError("All specified features have been excluded. " +
                            "Please specify valid features.")

    return valid_features
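A usage sketch of the filtering behavior, with hypothetical column names and an assumed `valid_types` list (the `array` import name here is an assumption for illustration):

# >>> import array
# >>> _validate_features(features=['x', 'name', 'row_id', 'missing'],
# ...                    column_type_map={'x': float, 'name': str,
# ...                                     'row_id': str},
# ...                    valid_types=[int, float, dict, array.array],
# ...                    label='row_id')
# ['x']    # 'name' has an invalid type, 'row_id' is the label, and
#          # 'missing' is not in the dataset; each emits a warning.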
    def update(self, dataset):
        """
        Create a new BayesianChangepointsModel using the same parameters, but
        an updated dataset. Knowledge about the data is retained from the
        previous model, and it is assumed the data is a continuation of the
        previous model's data.

        Parameters
        ----------
        dataset : SFrame, SArray, or TimeSeries
            New data to use for an updated changepoint detection model. The
            type of the input 'dataset' must match the type of the data
            already in the model (if the model has data already).

        Returns
        -------
        out : BayesianChangepointsModel
            A *new* BayesianChangepointsModel, with an updated dataset and
            changepoint scores for the updated dataset. The `scores` field of
            the new model has the same schema as the `scores` field of the
            existing model. The last `lag` rows of the existing data are
            prepended, though, because there is now enough data to evaluate
            their changepoint probability.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = graphlab.SFrame({'series': [100]*25})
        >>> model = graphlab.anomaly_detection.bayesian_changepoints.create(sf,
        ...                                                                 lag=5,
        ...                                                                 feature='series')
        >>> sf2 = graphlab.SFrame({'series': [200]*25})
        >>> model2 = model.update(sf2)
        >>> model2['scores'].print_rows(max_column_width=20)
        +-------------------+--------+---------------------+
        | changepoint_score | series |  model_update_time  |
        +-------------------+--------+---------------------+
        |   0.831430606595  |  200   | 2016-01-27 14:06... |
        | 0.000347138442071 |  200   | 2016-01-27 14:06... |
        | 3.40869782692e-05 |  200   | 2016-01-27 14:06... |
        | 1.40792637711e-05 |  200   | 2016-01-27 14:06... |
        | 7.50780005726e-06 |  200   | 2016-01-27 14:06... |
        | 4.49582032092e-06 |  200   | 2016-01-27 14:06... |
        | 2.90328065455e-06 |  200   | 2016-01-27 14:06... |
        | 1.98060675567e-06 |  200   | 2016-01-27 14:06... |
        | 1.40930691121e-06 |  200   | 2016-01-27 14:06... |
        | 1.03700199168e-06 |  200   | 2016-01-27 14:06... |
        +-------------------+--------+---------------------+
        [25 rows x 3 columns]
        """
        start_time = _time.time()
        _mt._get_metric_tracker().track(
            'toolkit.anomaly_detection.bayesian_changepoints.update')

        logger = _logging.getLogger(__name__)

        ## Validate the new dataset
        if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
            raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

        if len(dataset) < 1:
            raise _ToolkitError("Input 'dataset' is empty.")

        if ((self.get('dataset_type') == 'TimeSeries'
             and not isinstance(dataset, _gl.TimeSeries))
                or (self.get('dataset_type') == 'SFrame'
                    and not isinstance(dataset, _gl.SFrame))):
            raise TypeError("New input 'dataset' must have the same type " +
                            "as the data already in the model.")

        ## TimeSeries-specific dataset validation
        ## Make sure the new data occurs *after* the existing data.
        scores = self.get('scores')

        if isinstance(dataset, _gl.TimeSeries):
            first_new_timestamp = dataset[0][dataset.index_col_name]
            last_old_timestamp = scores[-1][scores.index_col_name]

            if first_new_timestamp < last_old_timestamp:
                raise _ToolkitError("The new dataset has data with " +
                                    "earlier timestamps than the existing " +
                                    "dataset. Please ensure that new data " +
                                    "occurs after existing data.")

        ## Extract the feature from the new dataset and validate it.
        feature = self.get('feature')

        try:
            series = dataset[feature]
        except:
            raise _ToolkitError("The feature specified by the original " +
                                "model could not be found in the input " +
                                "'dataset'.")

        if not series.dtype() in [int, float]:
            raise ValueError("The values in the specified feature must be " +
                             "integers or floats.")

        ## Create a new model and initialize it.
        new_state = {k: self.get(k) for k in ['dataset_type']}
        opts = self.__proxy__.get_most_likely_hyperparams()
        proxy = _gl.extensions._BayesianOnlineChangepoint()

        ## Initialize the new model with state from the old model. This
        ## allows detecting changepoints using knowledge learned previously,
        ## and also detecting changepoint probabilities for points which
        ## didn't yet have `lag` points following them.
        proxy.init_changepoint_detector(
            opts, True, self.get('scores')[feature].dropna()[0])
        new_model = BayesianChangepointsModel(proxy)

        ## Once again, calculate changepoints with information known from
        ## model creation. Prepend `lag` points from the previous dataset;
        ## we now have enough information to check if they were changepoints.
        lag = self.get('lag')

        ## If `lag` is greater than 0, we want to prepend the points that we
        ## couldn't find a changepoint score for before, due to not enough
        ## data. These are the last `lag` non-None points.
        if lag > 0:
            ## Grab previous scores
            if isinstance(dataset, _gl.SFrame):
                old_scores = self.get('scores')[[feature, 'model_update_time']]
            else:
                old_scores = self.get('scores').to_sframe()[[
                    feature, 'model_update_time'
                ]]

            # Copy the SFrame and select only the feature column
            prepend_index_calc_temp_sf = old_scores[[feature]]
            # Rename, in case the feature column is 'id'
            prepend_index_calc_temp_sf.rename({feature: 'series'})
            # Identify the last `lag` points that are non-None
            prepend_index_calc_temp_sf = prepend_index_calc_temp_sf.add_row_number()
            prepend_index_calc_temp_sf = prepend_index_calc_temp_sf.dropna()

            # If `lag` is longer than scores, just take all previous points
            if lag >= len(prepend_index_calc_temp_sf):
                prepend_index = 0
            else:
                prepend_index = prepend_index_calc_temp_sf['id'][-(lag + 1)] + 1
            old_scores = old_scores[prepend_index:]

        ## Otherwise, we don't prepend anything, so the index can be the
        ## input data length
        else:
            prepend_index = len(series)

        ## Calculate changepoints
        scores = _gl.SFrame()
        scores[feature] = series
        changepoints = new_model.__proxy__.calculate_changepoints(series)
        scores['model_update_time'] = _dt.datetime.now()

        if lag > 0:
            scores = old_scores.append(scores)
            changepoints = changepoints.append(
                _gl.SArray([None] * (len(scores) - len(changepoints))))

        scores['changepoint_score'] = changepoints
        scores = scores[[
            'changepoint_score',  # reorder the columns
            feature,
            'model_update_time'
        ]]

        ## Add row_id to SFrame
        if isinstance(dataset, _gl.SFrame):
            if feature != 'row_id':
                scores = scores.add_row_number('row_id')
            else:
                logger.warning("Feature name is 'row_id', so the " +
                               "index in the model's 'scores' SFrame " +
                               "is called '_row_id'.")
                scores = scores.add_row_number('_row_id')

        ## Finalize and return the model.
        new_state['num_examples'] = len(scores)
        new_state['training_time'] = _time.time() - start_time

        ## If the time-series index name has changed, rename the old
        ## time series' index name
        if isinstance(dataset, _gl.TimeSeries):
            old_index_col_name = self.__proxy__.get_index_col_name()
            old_timeseries = self.get('scores')

            if dataset.index_col_name != old_index_col_name:
                old_timeseries = old_timeseries.rename(
                    {old_index_col_name: dataset.index_col_name})
                logger.warning("The new dataset's index column name " +
                               "does not match the existing index " +
                               "column name. The new name is used in " +
                               "the new model.")

            ## In model creation, the last `lag` points cannot be evaluated
            ## for changepoint probability. Now there's more data, so that
            ## data is prepended.
new_index = old_timeseries[ dataset.index_col_name][prepend_index:].append( dataset[dataset.index_col_name]) scores[dataset.index_col_name] = new_index new_model.__proxy__.set_index_col_name(dataset.index_col_name) new_model.__proxy__.set_state_sframe(scores, new_state) else: new_model.__proxy__.set_state_sframe(scores, new_state) return new_model
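In isolation, the prepend-index selection above amounts to: keep the last `lag` rows whose feature value is non-None, plus anything after them. A pure-Python sketch with a hypothetical name:

def prepend_start_index_sketch(values, lag):
    non_none = [i for i, v in enumerate(values) if v is not None]
    if lag >= len(non_none):
        return 0                       # keep all previous points
    return non_none[-(lag + 1)] + 1    # first row of the kept tail

# e.g. prepend_start_index_sketch([None, 1.0, 2.0, None, 3.0], lag=2) -> 2,
# keeping rows 2-4: the last two non-None values and the None between them.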
def __init__(self, data=None, item_id=None, quality_feature=None, similarity_features=None, model_proxy=None, _class=None): """ Create a DiverseSampler object. This should never be called directly, because it is necessary to set up an SDK proxy prior to calling __init__. """ if _class: self.__class__ = _class self._init_with_frame = False self.__proxy__ = model_proxy self.__name__ = 'diverse_sampler' self._quality_feature = quality_feature self._similarity_features = similarity_features if data is None and model_proxy is None: raise ValueError( "The diverse sampler must be initialized with a " + "reference SFrame or SGraph.") elif data is not None: if not (isinstance(data, _gl.SFrame) or isinstance(data, _gl.SGraph)): raise ValueError("Unknown data type " + str(type(data)) + ".") if item_id is None and model_proxy is None: # Note that for SGraphs, the __id vertex field is intrinsic to each # gl.Vertex, so we don't actually need to specify item_id if isinstance(data, _gl.SFrame): raise ValueError("An item_id must be specified.") if isinstance(data, _gl.SFrame): col_names = data.column_names() elif isinstance(data, _gl.SGraph): if similarity_features is not None and len( similarity_features) > 1: raise _ToolkitError( "Only 1 similarity feature is supported for SGraph.") col_names = data.get_fields() if isinstance(data, _gl.SFrame) and item_id not in col_names: raise ValueError("Item ID " + item_id + " does not name " + "a column in the SFrame.") if quality_feature is not None and quality_feature not in col_names: raise ValueError("Quality feature " + quality_feature + " does not name " + "a column in the SFrame.") if similarity_features is not None: for sname in similarity_features: if sname not in col_names: raise ValueError("Similarity feature " + sname + " does not name " + "a column in the SFrame.") opts = dict() if item_id is None and isinstance(data, _gl.SGraph): item_id = "__id" opts["item_id"] = item_id if quality_feature is not None: opts["quality_feature"] = quality_feature if similarity_features is not None: opts["similarity_features"] = similarity_features if isinstance(data, _gl.SFrame): self._init_with_frame = True self.__proxy__.init_with_frame(data, opts) elif isinstance(data, _gl.SGraph): self._init_with_frame = False self.__proxy__.init_with_graph(data, opts)
def create(dataset, target, features=None, distance=None, verbose=True): """ Create a :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier` model. This model predicts the class of a query instance by finding the most common class among the query's nearest neighbors. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of GraphLab Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Dataset for training the model. target : str Name of the column containing the target variable. The values in this column must be of string or integer type. features : list[str], optional Name of the columns with features to use in comparing records. 'None' (the default) indicates that all columns except the target variable should be used. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: array of numeric (integer or float) values. Each array element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. distance : str, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~graphlab.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (str) 2. standard distance name (str) 3. scaling factor (int or float) For more information about GraphLab Create distance functions, please see the :py:mod:`~graphlab.toolkits.distances` module. For sparse vectors, missing keys are assumed to have value 0.0. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : NearestNeighborClassifier A trained model of type :class:`~graphlab.nearest_neighbor_classifier.NearestNeighborClassifier`. See Also -------- NearestNeighborClassifier graphlab.toolkits.nearest_neighbors graphlab.toolkits.distances References ---------- - `Wikipedia - nearest neighbors classifier <http://en.wikipedia.org/wiki/Nearest_neighbour_classifiers>`_ - Hastie, T., Tibshirani, R., Friedman, J. (2009). `The Elements of Statistical Learning <http://statweb.stanford.edu/~tibs/ElemStatLearn/>`_. Vol. 2. New York. Springer. pp. 463-481. Examples -------- >>> sf = graphlab.SFrame({'species': ['cat', 'dog', 'fossa', 'dog'], ... 'height': [9, 25, 20, 23], ... 'weight': [13, 28, 33, 22]}) ... 
>>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species') As with the nearest neighbors toolkit, the nearest neighbor classifier accepts composite distance functions. >>> my_dist = [[('height', 'weight'), 'euclidean', 2.7], ... [('height', 'weight'), 'manhattan', 1.6]] ... >>> model = graphlab.nearest_neighbor_classifier.create(sf, target='species', ... distance=my_dist) """ ## Set up ## ------ _mt._get_metric_tracker().track( 'toolkit.classifier.nearest_neighbor_classifier.create') start_time = _time.time() ## Validation and preprocessing ## ---------------------------- ## 'dataset' must be a non-empty SFrame _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") ## 'target' must be a string, in 'dataset', and the type of the target must # be string or integer. if not isinstance(target, str) or target not in dataset.column_names(): raise _ToolkitError("The 'target' parameter must be the name of a " "column in the input dataset.") if not dataset[target].dtype() == str and not dataset[target].dtype( ) == int: raise TypeError("The target column must contain integers or strings.") ## Warn that 'None' values in the target may lead to ambiguous predictions. if dataset[target].num_missing() > 0: _logging.warning( "Missing values detected in the target column. This " + "may lead to ambiguous 'None' predictions, if the " + "'radius' parameter is set too small in the prediction, " + "classification, or evaluation methods.") ## convert features and distance arguments into a composite distance ## NOTE: this is done here instead of in the nearest neighbors toolkit # because the automatic distance construction may be different for the two # toolkits. if features is None: _features = [x for x in dataset.column_names() if x != target] else: _features = [x for x in features if x != target] if isinstance(distance, list): distance = _copy.deepcopy(distance) elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] elif distance is None or distance == 'auto': col_types = { k: v for k, v in zip(dataset.column_names(), dataset.column_types()) } distance = _construct_auto_distance(_features, col_types) else: raise TypeError( "Input 'distance' not understood. The 'distance' " + "parameter must be a string or a composite distance, " + " or left unspecified.") ## Construct and query the nearest neighbors model ## ----------------------------------------------- knn_model = _gl.nearest_neighbors.create(dataset, label=target, distance=distance, verbose=verbose) ## Postprocessing and formatting ## ----------------------------- model = NearestNeighborClassifier(knn_model) model._state['verbose'] = verbose model._state['distance'] = knn_model['distance'] model._state['num_distance_components'] = knn_model[ 'num_distance_components'] model._state['num_examples'] = dataset.num_rows() model._state['features'] = knn_model['features'] model._state['target'] = target model._state['num_classes'] = len(dataset[target].unique()) model._state['num_features'] = knn_model['num_features'] model._state['num_unpacked_features'] = knn_model['num_unpacked_features'] model._state['training_time'] = _time.time() - start_time model._target_type = dataset[target].dtype() return model
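The distance-normalization branches above always reduce the `features`/`distance` arguments to the composite form [[feature_list, distance_name_or_fn, weight], ...]. A condensed sketch follows; the 'auto' branch here uses a simplified per-feature euclidean rule as a placeholder, whereas the real `_construct_auto_distance` chooses distances by column type.

def normalize_distance_sketch(features, distance):
    if isinstance(distance, list):            # already composite
        return distance
    if callable(distance) or (isinstance(distance, str) and distance != 'auto'):
        return [[features, distance, 1]]      # one component, weight 1
    # None or 'auto': per-feature defaults (placeholder rule)
    return [[[f], 'euclidean', 1] for f in features]

# >>> normalize_distance_sketch(['height', 'weight'], 'manhattan')
# [[['height', 'weight'], 'manhattan', 1]]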
def create(observation_data, user_id='user_id', timestamp='timestamp', user_data=None, time_unit=0, features=[], time_aggregate=_datetime.timedelta(days=1), lookback_feature_periods=[7, 14, 21, 60, 90], time_boundaries=[], tree_depth = 100, verbose=True): """ Create a model of type :class:`~graphlab.churn_predictor.ChurnPredictor` that performs churn analysis on provided user activity logs. The Churn Prediction toolkit allows predicting which users will churn (stop using) a product or website given user activity logs. Training datasets should contain columns with user id, time stamp, and user events. Given the same, or a different data set, the toolkit will compute the probability of a user churning. For instance, given a dataset of the form: .. sourcecode:: python +-------------------------------+---------+---------------+------------+ | user_id | action | timestamp | product_id | +-------------------------------+---------+---------------+------------+ | ONE | open | 04/15/1981 | 205075200 | | ONE | bought | 04/15/1981 | 88441100 | | ONE | clicked | 04/17/1981 | 205075200 | | TWO | clicked | 09/01/2015 | 205075200 | | TWO | bought | 09/21/2015 | 88414900 | +-------------------------------+---------+---------------+------------+ If we are looking at this data set, the last time stamp is September 21st 2015. Given that last date, user TWO is unlikely to churn. Whereas user ONE, who was a customer in 1981 and has not come back for 34 years is likely to churn (not come back). The toolkit does not use current wall time, it uses the last provided time stamp for predictions (unless another time stamp is specified). This toolkit will look at users, time and activity types, form an internal feature set based on user behavior, and train a model to predict user churn. Predicted churn is provided as a probability, where 0% means the user will definitely churn, and 100% means the user will definitely stay. The prediction is set by default to execute on the latest time stamp provided in the prediction set, but can also be set manually to some other date. Since, internally, training requires generating training labels, the last 10% of the data is not used for training. Therefore, it is safe to reuse the training data set as a prediction data set to know who is likely to churn. A trained model can also be used to predict on new data set safely. Parameters ---------- observation_data : SFrame The dataset to use for training the model. It must contain a column of user ids, a column of timestamps, along with one or more activity columns. Each row represents a user action at a given time. A user activity column can contain numeric data (length of visit in seconds, number of items in cart) or categorical data (item purchased, page visited). The user id must be of type 'int' or 'str'. The time column must be of type int or datetime. User activity columns of type 'int' and 'str' will be considered categorical. Columns of type 'float' will be considered numerical. user_id : string, optional The name of the column in `observation_data` that corresponds to the user id. Default: user_id timestamp : string, optional The name of the column in `observation_data` that corresponds to the timestamp. The column can be of datetime.datetime type, or int type. If the column contains ints the time_unit parameter can be used to define the unit of time represented by the column. Default: timestamp user_data : SFrame, optional Side information for the users. 
        This SFrame must have a column with the same name as what is specified
        by the `user_id` input parameter. `user_data` can provide any amount
        of additional user-specific information. The join performed is an
        inner join.

    features : list of string, optional
        If specified, only the features in the list will be used. Columns of
        type integer and string will be treated as categorical; columns of
        type float will be treated as numeric.

    time_unit : int, optional
        If the timestamp column is of integral type, this sets the unit
        interval of the column. For instance, if the value of the timestamp
        column is in milliseconds, the value would be 1000. If the timestamp
        column is in seconds, the value would be 1.

        Default: 0 (auto-detect)

    time_aggregate : datetime.timedelta, optional
        Internal time frame to roll up user actions. In order to make the
        inner computation efficient, the user actions are rolled up by time.
        The default is to aggregate by day. This can be shortened if hourly
        rates of actions are of importance, or stretched if only weekly rates
        matter. The larger the roll-up, the faster the model will run. The
        timedelta must be positive.

        Default: 1 day

    lookback_feature_periods : list of int, optional
        Intervals of time to look back over for feature generation. Each
        number is in units of time_aggregate (the default being days, as
        defined above). For instance, if the list contains [7, 14], it will
        generate features for weekly patterns and biweekly patterns. If
        time_aggregate was set to hours, [7, 14] would generate features for
        7-hour and 14-hour patterns.

        Default: [7, 14, 21, 60, 90]

    time_boundaries : list of datetime.datetime, optional
        List of time boundaries used to compute the training set. At each
        time boundary, users that are present before the boundary will be
        used to compute features, and their presence after the boundary will
        make up the label. By having multiple time boundaries, more training
        data can be generated.

        Default: [] (if an empty list is specified, 10 evenly separated
        boundaries are used, based on the first and last timestamp of the
        observation_data).

    tree_depth : integer, optional
        The depth of the decision tree built internally to train. By default,
        the model assumes a large, complex dataset and will assign up to 100
        levels to the internal decision tree. This can be too many for
        smaller data sets with fewer columns.

        Default: 100

    verbose : boolean, optional
        When set to True, more status output is generated.

        Default: True

    Returns
    -------
    out : ChurnPredictor
        A trained model of type
        :class:`~graphlab.churn_predictor.ChurnPredictor`.

    See Also
    --------
    ChurnPredictor

    Examples
    --------
    .. sourcecode:: python

        # Load a data set. The data set has 3 columns: user_id, timestamp,
        # event, and must contain at least 100 rows of user activity.
        >>> sf = gl.SFrame('~/data/churn/actions_top_k.csv')

        # Our timestamps are in milliseconds; with time_unit=0 (the default),
        # the toolkit detects this automatically. We could also pass
        # time_unit=1000 explicitly (or 1 for timestamps in seconds).
>>> model = gl.churn_predictor.create(sf, user_id="user_id", timestamp="timestamp") # For simplicity, we will predict on the input data set >>> model.predict(sf) # Output is in the form: +-------------------------------+----------------------+ | user_id | stay_probability | +-------------------------------+----------------------+ | ONE | 0.001 | | TWO | 99.99 | +-------------------------------+----------------------+ """ _mt._get_metric_tracker().track('{}.create'.format("toolkit.churn_predictor.create")) _raise_error_if_not_sframe(observation_data, "observation_data") _raise_error_if_not_of_type(user_id, [str]) _raise_error_if_not_of_type(timestamp, [str]) if (user_data): _raise_error_if_not_sframe(user_data, "observation_data") time_aggregate_int = int(time_aggregate.total_seconds()) if (time_aggregate_int <= 0): raise _ToolkitError("time_aggregate must be a positive time delta") if (observation_data.num_rows() < 100): raise _ToolkitError("This toolkit requires at least 100 rows of activity") # Cheap way to determine time units if not user-specified if time_unit == 0: # Set to 1 in case of using datetimes or other non-int formats time_unit = 1 first_timestamp = observation_data[timestamp][0] if isinstance(first_timestamp, int): if (verbose): print("PROGRESS: Determining timestamp unit") max_timestamp = observation_data[timestamp].max() if len(str(max_timestamp)) >= 11: if (verbose): print("PROGRESS: Assuming timestamps are in milliseconds since 01/01/1970") time_unit = 1000 else: if (verbose): print("PROGRESS: Assuming timestamps are in seconds since 01/01/1970") time_unit = 1 if (verbose): print("PROGRESS: Initializing churn predictor") proxy = _gl.extensions._ChurnPredictor() proxy.define_columns(observation_data, timestamp, user_id, time_unit) proxy.define_columns3(observation_data, features) proxy.define_lookback_feature_periods(lookback_feature_periods) proxy.define_default_time_aggregate(time_aggregate_int) proxy.define_model_options(tree_depth) if (verbose): print("PROGRESS: Sorting input data by time order") sorted_data = proxy.sort_by_time(observation_data, "") if (verbose): print("PROGRESS: Aggregating input data by groups of " + str(time_aggregate)) aggregated_by_time = proxy.aggregate_by_time(sorted_data, True, 0, "", "") if (time_boundaries): unix_timestamps = [] for dt in time_boundaries: unix_timestamps.append(int(_time.mktime(dt.timetuple()))) time_boundaries = unix_timestamps stpcnt = 10 if (not time_boundaries and isinstance(sorted_data[timestamp][0], _datetime.datetime)): min_time = sorted_data[timestamp][0] max_time = sorted_data[timestamp][sorted_data[timestamp].size() - 1] step = (max_time - min_time) / stpcnt if (verbose): print("PROGRESS: No time boundaries specified, computing 10 boundaries from " + str(min_time) + " to " + str(max_time)) time_boundaries = [(i+1) * step + min_time for i in range(stpcnt - 1)] if (not time_boundaries): min_time = sorted_data[timestamp][0] / time_unit max_time = sorted_data[timestamp][sorted_data[timestamp].size() - 1] / time_unit step = (max_time - min_time) / stpcnt if step <= 0: raise _ToolkitError("Not enough time in the training data. 
There should be more than " + str(stpcnt) + " units of time.") if (verbose): print("PROGRESS: No time boundaries specified, computing 10 boundaries from " + str(_datetime.datetime.fromtimestamp(min_time)) + " to " + str(_datetime.datetime.fromtimestamp(max_time))) time_boundaries = range(min_time + step, max_time, step - 1) if (not time_boundaries): raise _ToolkitError("Not enough time boundaries defined, at least one must be defined") big_user_aggregate = None for time_boundary in time_boundaries: if isinstance(time_boundary, _datetime.datetime): time_boundary = int(_time.mktime(time_boundary.timetuple())) if (verbose): print("PROGRESS: Generating user data for aggregate " + str(_datetime.datetime.fromtimestamp(time_boundary))) user_aggregate = proxy.per_user_aggregate(aggregated_by_time, time_boundary, "", lookback_feature_periods) if (not big_user_aggregate): big_user_aggregate = user_aggregate else: big_user_aggregate = big_user_aggregate.append(user_aggregate) if (user_data): if (verbose): print("PROGRESS: Joining with user data") big_user_aggregate = big_user_aggregate.join(user_data, on=user_id, how="inner") if (verbose): print("PROGRESS: Training model") proxy.train_model(big_user_aggregate, "") if (verbose): print("PROGRESS: All done!") return ChurnPredictor(proxy, big_user_aggregate)
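When no boundaries are given, the code above cuts the observed time range into `stpcnt` equal steps and uses the interior cut points as training-label boundaries. A distilled sketch for the datetime case (the function name is hypothetical):

def default_boundaries_sketch(min_time, max_time, n=10):
    # n equal steps between the first and last timestamp; the n - 1
    # interior cut points become the training-label boundaries.
    step = (max_time - min_time) / n
    return [min_time + step * (i + 1) for i in range(n - 1)]

# e.g. with datetimes 2015-01-01 and 2015-12-27 (360 days apart), this
# yields 9 boundaries spaced 36 days apart.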
def create(dataset, tag_name=None, features=None, verbose=True): """ Create a :class:`NearestNeighborAutoTagger` model, which can be used to quickly apply tags from a reference set of text labels to a new query set using the ``tag`` method. Parameters ---------- dataset : SFrame Reference data. This SFrame must contain at least one column. By default, only the ``tag_name`` column is used as the basis for tagging. You may optionally include additional columns with the ``features`` parameter. tag_name : string, optional Name of the column in ``dataset`` with the tags. This column must contain string values. If ``dataset`` contains more than one column, ``tag_name`` must be specified. features : list[string], optional Names of the columns with features to use as the basis for tagging. 'None' (the default) indicates that only the column specified by the ``tag_name`` parameter should be used. Only str or list fields are allowed. If a column of type list is specified, all values must be either of type string or convertible to type string. verbose : bool, optional If True, print verbose output during model creation. Returns ------- out : model A model for quickly tagging new query observations with entries from `dataset`. Currently, the only implementation is the following: - NearestNeighborAutoTagger See Also -------- graphlab.nearest_neighbors.NearestNeighborsModel Examples -------- First construct a toy `SFrame` of actor names, which will serve as the reference set for our autotagger model. >>> actors_sf = gl.SFrame( {"actor": ["Will Smith", "Tom Hanks", "Bradley Cooper", "Tom Cruise", "Jude Law", "Robert Pattinson", "Matt Damon", "Brad Pitt", "Johnny Depp", "Leonardo DiCaprio", "Jennifer Aniston", "Jessica Alba", "Emma Stone", "Cameron Diaz", "Scarlett Johansson", "Mila Kunis", "Julia Roberts", "Charlize Theron", "Marion Cotillard", "Angelina Jolie"]}) >>> m = gl.data_matching.nearest_neighbor_autotagger.create( actors_sf, tag_name="actor") Then we load some IMDB movie reviews into an `SFrame` and tag them using the model we created above. The score field in the output is a similarity score, indicating the strength of the match between the query data and the suggested reference tag. >>> reviews_sf = gl.SFrame( "https://static.turi.com/datasets/imdb_reviews/reviews.sframe") >>> m.tag(reviews_sf.head(10), query_name="review", verbose=False) +-----------+-------------------------------+------------------+-----------------+ | review_id | review | actor | score | +-----------+-------------------------------+------------------+-----------------+ | 0 | Story of a man who has unn... | Cameron Diaz | 0.0769230769231 | | 0 | Story of a man who has unn... | Angelina Jolie | 0.0666666666667 | | 0 | Story of a man who has unn... | Charlize Theron | 0.0625 | | 0 | Story of a man who has unn... | Robert Pattinson | 0.0588235294118 | | 1 | Bromwell High is a cartoon... | Jessica Alba | 0.125 | | 1 | Bromwell High is a cartoon... | Jennifer Aniston | 0.1 | | 1 | Bromwell High is a cartoon... | Charlize Theron | 0.05 | | 1 | Bromwell High is a cartoon... | Robert Pattinson | 0.047619047619 | | 1 | Bromwell High is a cartoon... | Marion Cotillard | 0.047619047619 | | 2 | Airport '77 starts as a br... | Julia Roberts | 0.0961538461538 | | ... | ... | ... | ... | +-----------+-------------------------------+------------------+-----------------+ The initial results look a little noisy. To filter out obvious spurious matches, we can set the `tag` method's similarity_threshold parameter. 
>>> m.tag(reviews_sf.head(1000), query_name="review", verbose=False,
    ...       similarity_threshold=.8)
    +-----------+-------------------------------+------------------+----------------+
    | review_id |             review            |      actor       |     score      |
    +-----------+-------------------------------+------------------+----------------+
    |    341    | I caught this film at a te... |  Julia Roberts   | 0.857142857143 |
    |    657    | Fairly funny Jim Carrey ve... | Jennifer Aniston | 0.882352941176 |
    |    668    | A very funny movie. It was... | Jennifer Aniston | 0.833333333333 |
    |    673    | This film is the best film... | Jennifer Aniston |     0.9375     |
    +-----------+-------------------------------+------------------+----------------+

    In this second example, you'll notice that the ``review_id`` column is much
    more sparse. This is because all results whose score was below the
    specified similarity threshold (.8) were excluded from the output.
    """
    # validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # ensure that tag_name is provided if dataset has more than one column
    if dataset.num_cols() > 1 and not tag_name:
        raise _ToolkitError("No tag_name parameter specified on dataset " \
                            "with %d columns" % dataset.num_cols())
    tag_name = tag_name or dataset.column_names()[0]

    # ensure that a column with the name tag_name exists
    if tag_name not in dataset.column_names():
        raise _ToolkitError('No column named "%s" in dataset' % tag_name)

    # ensure that the tag column is of type string
    if dataset[tag_name].dtype() != str:
        raise TypeError("The column used as the tag name must be of type " \
                        "string.")

    # use a reasonable default distance for the general case
    distance = _gl.distances.weighted_jaccard

    # if additional features are specified, ensure they are of appropriate types
    if features and (not isinstance(features, list) or
                     not all([isinstance(x, str) for x in features])):
        raise TypeError("The feature parameter must be a list of strings " \
                        "and those strings must correspond to columns in " \
                        "`dataset`.")

    # at a minimum, this SFrame will contain the tags as features
    features = features or []
    features = [tag_name] + [x for x in features if x != tag_name]

    # ensure that each specified feature column is either of type list or str
    column_names = set(dataset.column_names())
    for col_name in features:
        if col_name not in column_names:
            raise _ToolkitError("Specified feature column (%s) not found " \
                                "in dataset" % col_name)

        if dataset.select_column(col_name).dtype() not in (str, list):
            raise TypeError("Only string and list columns are allowed as " \
                            "features.")

    # concatenate the feature columns into a single column
    features_sf = dataset.select_columns(features)
    feature_col, features_sf = _concat_string_features(features_sf, features)

    # compute features
    if verbose:
        _logging.getLogger().info("Extracting features...")
    features = _preprocess(features_sf.select_column(feature_col))

    # group by tag_name to ensure that tags are unique
    feature_cols = features.column_names()
    select_cols = {col_name: _gl.aggregate.SELECT_ONE(col_name)
                   for col_name in feature_cols}
    features.add_column(dataset[tag_name], tag_name)
    features = features.groupby(tag_name, select_cols)

    # create the nearest neighbors model
    m = _gl.nearest_neighbors.create(features, label=tag_name,
                                     distance=distance,
                                     features=feature_cols,
                                     verbose=verbose)

    # add standard toolkit state attributes
    state = {"nearest_neighbors_model": m,
             "training_time": m.get("training_time"),
             "tag_name": tag_name,
             "verbose": verbose,
             "num_examples": len(features),
             "features": feature_cols,
             "num_features": len(feature_cols),
             "distance": m.get("distance")}

    model = NearestNeighborAutoTagger(state)
    return model
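
# The autotagger above scores matches with weighted Jaccard distance between
# bag-of-words feature dictionaries. A pure-Python sketch of the conventional
# weighted Jaccard formula; the toolkit's own implementation is
# graphlab.distances.weighted_jaccard, and exact equivalence is assumed here
# rather than verified.
def _example_weighted_jaccard(a, b):
    """Weighted Jaccard distance between two {token: weight} dictionaries.

    >>> _example_weighted_jaccard({'tom': 1, 'hanks': 1}, {'tom': 1, 'cruise': 1})
    0.6666666666666667
    """
    keys = set(a) | set(b)
    min_sum = sum(min(a.get(k, 0.0), b.get(k, 0.0)) for k in keys)
    max_sum = sum(max(a.get(k, 0.0), b.get(k, 0.0)) for k in keys)
    return 1.0 - float(min_sum) / max_sum if max_sum > 0 else 0.0
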
def create(datasets, row_label=None, features=None, grouping_features=None, distance=None, k=2, radius=None, verbose=True): """ Create a deduplication model based on nearest neighbors and SGraph connected components. This method creates a :class:`NearestNeighborDeduplication` model by constructing a nearest neighbors similarity graph on all of the rows in the input 'datasets', then using the connected components tool in the :mod:`~graphlab.toolkits.graph_analytics` module to assign an entity label to each record. Records which share the same label are considered to be duplicates. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of GraphLab Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- datasets : SFrame or list[SFrame] or dict(string: SFrame) Input datasets. Each SFrame in the list must include all of the features specified in the `features` or 'distance' parameters, but may have additional columns as well. SFrames can be input as values in a dictionary, where the keys are strings used in the output to identify the SFrame from which each record originated. row_label : string, optional Name of the SFrame column with row labels. If not specified, row numbers are used to identify rows in the output. features : list[string], optional Name of the columns with features to use in comparing records. 'None' (the default) indicates the intersection of columns over all SFrames in `datasets` should be used (except the label column, if specified). Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: array of numeric (integer or float) values. Each array element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Please note: if `distance` is specified as a composite distance, then that parameter controls which features are used in the model. Any additional columns named in 'features' will be included in the model output but not used for distance computations. grouping_features : list[string], optional Names of features to use in grouping records before finding approximate matches. These columns must have string or integer type data. See the Notes section for more details on grouping. distance : string or list[list], optional Function to measure the distance between any two input data rows. This may be one of two types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about GraphLab Create distance functions, please see the :py:mod:`~graphlab.toolkits.distances` module. For sparse vectors, missing keys are assumed to have value 0.0. 
If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. k : int, optional Number of neighbors to consider for each point. radius : float, optional Maximum distance from each point to a potential duplicate. verbose : bool, optional If True, print progress updates and model details. Returns ------- out : NearestNeighborDeduplication model The NearestNeighborDeduplication object contains a field 'entities' which shows the entity label for each input record. It also shows the features for each record that are used to construct the model, as well as the original SFrame and row label for each record. If the original `datasets` are passed in a list, the SFrame identifier is the index of the SFrame in that list. See Also -------- NearestNeighborDeduplication, graphlab.toolkits.nearest_neighbors, graphlab.SFrame.groupby Notes ----- - Standardizing features is often a good idea with distance-based methods, but this model does *not* standardize features. - For datasets with more than about 10,000 records, *grouping* (also known as *blocking*) is a critical step to avoid computing distances between all pairs of records. The grouping step simply assigns each record to a group that has identical values for all `grouping_features`, and only looks for duplicates within each group. - Records with missing data in the `grouping_features` are removed from consideration as duplicates. These records are given the entity label "None". - For tasks that require *only* exact matches on certain features, it is generally more natural to use the SFrame `groupby` function. - For features that all have the same type, the distance parameter may be a single standard distance function name (e.g. "euclidean"). In the model, however, all distances are first converted to composite distance functions; as a result, the 'distance' field in the model is always a composite distance. References ---------- - Christen, Peter. "Data matching: concepts and techniques for record linkage, entity resolution, and duplicate detection." Springer Science & Business Media, 2012. Examples -------- >>> sf1 = graphlab.SFrame({'id': [0, 1, 2], ... 'x0': [0.5, 0.5, 0.3], ... 'x1': [1., 0.8, 0.6], ... 'city': ['seattle', 'olympia', 'boston'], ... 'state': ['WA', 'WA', 'MA']}) ... ... # note: misspellings in the following dataset do not prevent correct ... # matches. >>> sf2 = graphlab.SFrame({'id': [9, 10], ... 'x0': [0.35, 0.4], ... 'x1': [0.65, 0.8], ... 'city': ['bostan', 'seatle'], ... 'state': ['MA', 'WA']}) ... >>> dist = [[('city',), 'levenshtein', 2], ... [('x0', 'x1'), 'euclidean', 1.5]] ... >>> m = graphlab.nearest_neighbor_deduplication.create({'a': sf1, 'b': sf2}, ... row_label='id', ... grouping_features=['state'], ... distance=dist, k=None, ... radius=3) ... 
>>> print m['entities']
    +----------+----+----------+-------+------+---------+------+
    | __sframe | id | __entity | state |  x0  |   city  |  x1  |
    +----------+----+----------+-------+------+---------+------+
    |    a     | 1  |    0     |   WA  | 0.5  | olympia | 0.8  |
    |    a     | 0  |    1     |   WA  | 0.5  | seattle | 1.0  |
    |    b     | 10 |    1     |   WA  | 0.4  |  seatle | 0.8  |
    |    a     | 2  |    2     |   MA  | 0.3  |  boston | 0.6  |
    |    b     | 9  |    2     |   MA  | 0.35 |  bostan | 0.65 |
    +----------+----+----------+-------+------+---------+------+
    [5 rows x 7 columns]
    """

    ## Set up
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    model = NearestNeighborDeduplication()
    model.__proxy__['verbose'] = verbose
    model.__proxy__['k'] = k
    model.__proxy__['radius'] = radius

    ### ----------------------------- ###
    ### Validation and preprocessing ###
    ### ----------------------------- ###

    ### Validate input datasets
    ### -----------------------

    ## If datasets is already a dict, check that the keys are all strings
    if isinstance(datasets, dict):
        if not all([isinstance(x, str) for x in datasets.keys()]):
            raise ValueError("Keys in the 'datasets' dict must be strings.")

    ## Convert a singleton SFrame dataset into a one-element dict
    if isinstance(datasets, _gl.SFrame):
        _raise_error_if_sframe_empty(datasets, "dataset")
        datasets = {0: datasets}

    ## Convert a list of SFrames into a dict
    if isinstance(datasets, list):
        datasets = {i: sf for i, sf in enumerate(datasets)}

    ## At this point, 'datasets' must be a dict. If it's not, something is wrong.
    if not isinstance(datasets, dict):
        raise TypeError("Input 'datasets' must be an SFrame, a list of SFrames, " +
                        "or a dictionary of (string, SFrame) pairs.")

    model.__proxy__['num_datasets'] = len(datasets)

    ## Ensure that all datasets are SFrames
    for d in datasets.values():
        _raise_error_if_not_sframe(d, "dataset")

    ### Validate row label
    ### ------------------
    if row_label:
        if not isinstance(row_label, str):
            raise TypeError("The 'row_label' parameter must be the name (string " +
                            "type) of a column in each of the input datasets.")

        for d in datasets.values():
            if row_label not in d.column_names():
                raise _ToolkitError("The specified row_label column does not " +
                                    "exist in all input datasets.")
    else:
        row_label = 'row_number'

        for d in datasets.values():
            if row_label in d.column_names():
                raise _ToolkitError("Input 'row_label' defaulted to " +
                                    "'row_number', which is already a column " +
                                    "in at least one input dataset. Please " +
                                    "specify a row label column manually.")

    model.__proxy__['row_label'] = row_label

    ### Validate 'features' and 'grouping_features' parameters
    ### ------------------------------------------------------
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")

    if grouping_features is not None:
        if not hasattr(grouping_features, '__iter__'):
            raise TypeError("Input 'grouping_features' must be a list.")

        if not all([isinstance(x, str) for x in grouping_features]):
            raise TypeError("Input 'grouping_features' must contain only strings.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the
    #   'features' parameter as well.
    ## Find the intersection of all feature sets and feature types
    col_types = {c: t for c, t in zip(list(datasets.values())[0].column_names(),
                                      list(datasets.values())[0].column_types())}

    all_features = [sf.column_names() for sf in datasets.values()]
    ftr_intersection = list(set(all_features[0]).intersection(*all_features))
    ftr_intersection = [x for x in ftr_intersection if x != row_label]

    ## Convert the features and distance arguments into a composite distance.
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        if features is not None:
            distance = [[features, distance, 1]]
        else:
            distance = [[ftr_intersection, distance, 1]]

    elif distance is None:
        if features is not None:
            distance = _construct_auto_distance(features, col_types)
        else:
            distance = _construct_auto_distance(ftr_intersection, col_types)

    else:
        raise TypeError("Input 'distance' not understood. Note that for the " +
                        "data matching toolkit, 'distance' must be a string or " +
                        "a composite distance list.")

    ## Validate the form of the composite distance and add it to the model
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(distance, row_label,
                                                  list(allowed_dists.keys()),
                                                  verbose)
    model.__proxy__['distance'] = _copy.deepcopy(distance)

    ## Figure out which features are 'fuzzy', i.e. used for approximate
    #  matching, and set them in the model state.
    fuzzy_features = _dmutl.extract_composite_features(distance)  # already has row_label removed
    model.__proxy__['features'] = fuzzy_features
    model.__proxy__['num_features'] = len(fuzzy_features)

    ## Compile a master list of all features. This includes grouping features,
    #  fuzzy features (the ones used for approximate matching), and
    #  "ancillary" features, which are named by the user in the 'features'
    #  parameter but not included in the 'distance' specification.
    if features is None:
        features = []
    else:
        features = [x for x in features if x != row_label]

    if grouping_features is None:
        grouping_features = []
    else:
        grouping_features = [x for x in grouping_features if x != row_label]

    model.__proxy__['grouping_features'] = grouping_features
    model.__proxy__['num_grouping_features'] = len(grouping_features)

    master_features = list(set(features + grouping_features + fuzzy_features))

    ### Consolidate data and engineer features
    ### --------------------------------------

    ## Consolidate the multiple input datasets into a single SFrame, with a
    #  useful row label.
    sf_union = _dmutl.concatenate_sframes(datasets, row_label=row_label,
                                          features=master_features,
                                          sf_index_name='__sframe')
    overall_label = '__sframe.' + row_label
    sf_union[overall_label] = (sf_union['__sframe'].astype(str) + "." +
                               sf_union[row_label].astype(str))

    ## Validate the feature types in the consolidated dataset against the
    #  specified distance functions.
    _dmutl.validate_distance_feature_types(sf_union, distance, allowed_dists)

    ## Clean string-type features in the fuzzy feature set.
    for ftr in fuzzy_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            sf_union[new_ftr] = sf_union[ftr].fillna("")
            sf_union[new_ftr] = sf_union[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in distance:
                dist_comp[0] = [new_ftr if x == ftr else x for x in dist_comp[0]]

    ## Feature engineering, distance-component-wise. Also update the list of
    #  features and the map to their types.
    sf_union, distance = _engineer_distance_features(sf_union, distance)
    transformed_features = _dmutl.extract_composite_features(distance)

    ### -------------------------------------------- ###
    ### Main loop over blocks of neighbor candidates ###
    ### -------------------------------------------- ###

    ## Construct blocks on features that must match exactly
    if verbose:
        _logging.info("Constructing groups of records that match exactly on " +
                      "the 'grouping_features'.")

    sf_union, block_errors, blocks = \
        _dmutl.construct_exact_blocks(sf_union, grouping_features)

    if verbose and len(distance) > 0 and blocks['Count'].max() > 10000:
        _logging.warning("There are more than 10,000 records in the largest " +
                         "match group. Approximate matches within each match " +
                         "group are computed with brute force nearest " +
                         "neighbors, which may be slow for groups this large. " +
                         "Consider making the groups smaller by requiring " +
                         "more features to match exactly.")

    max_entity_number = 0
    sf_entity = _gl.SFrame()
    output_features = (master_features + [row_label, '__sframe', '__entity'])

    ## Main loop over blocks
    for i, block in enumerate(blocks):
        if verbose:
            _logging.info("Processing {} records in match group: {}/{}".format(
                block['Count'], i + 1, len(blocks)))

        ## Retrieve the records in the block and impute the mean for missing
        #  numeric values.
        records = sf_union[block['min_idx']:(block['max_idx'] + 1)]
        complete_records = _dmutl.impute_numeric_means(records,
                                                       transformed_features)

        if len(distance) > 0:
            ## Run all-point nearest neighbors
            if verbose:
                _logging.info("Building the similarity graph...")

            m = _gl.nearest_neighbors.create(complete_records,
                                             label=overall_label,
                                             distance=distance, verbose=False)
            knn = m.query(complete_records, label=overall_label, k=k,
                          radius=radius, verbose=verbose)

            ## Construct the similarity graph to resolve transitive closure
            sg = _gl.SGraph()
            sg = sg.add_vertices(records[[overall_label]],
                                 vid_field=overall_label)
            sg = sg.add_edges(knn, src_field='query_label',
                              dst_field='reference_label')

            ## Cut the similarity graph to establish an entity for each vertex
            if verbose:
                _logging.info("Finding duplicate records in the similarity " +
                              "graph...")

            cc = _gl.connected_components.create(sg, verbose=verbose)

            ## Relabel the component IDs to be consecutive integers, starting
            #  with the max index of the previous block's entity labels.
block_labels = cc['component_size'].add_row_number('__entity') block_labels['__entity'] += max_entity_number max_entity_number += block_labels.num_rows() block_entity_labels = cc['component_id'].join(block_labels, on='component_id', how='left') ## Join the entity labels for the block back to the block's records, # then append to the master output records = records.join(block_entity_labels[['__id', '__entity']], on={overall_label: '__id'}, how='left') records = records.sort('__entity') else: # no fuzzy features, so no nearest neighbors, just block ID records['__entity'] = _gl.SArray.from_const(i, len(records)) sf_entity = sf_entity.append(records[output_features]) ### ------------------------------------- ### ### Postprocessing and results formatting ### ### ------------------------------------- ### ## Add rows missing from the blocking back to the master results if len(block_errors) > 0: block_errors['__entity'] = _gl.SArray.from_const(None, len(block_errors)).astype(int) sf_entity = sf_entity.append(block_errors[output_features]) ## Rearrange columns sf_entity.swap_columns('__sframe', sf_entity.column_names()[0]) sf_entity.swap_columns(row_label, sf_entity.column_names()[1]) sf_entity.swap_columns('__entity', sf_entity.column_names()[2]) ## Finalize the model state model.__proxy__['training_time'] = _time.time() - start_time model.__proxy__['entities'] = sf_entity model.__proxy__['num_entities'] = max_entity_number return model
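
# Entity labels in the model above come from connected components on the
# similarity graph: if record A matches B and B matches C, all three receive
# one entity label even when A and C were never directly matched. A minimal
# pure-Python sketch of that transitive-closure step using union-find; the
# model itself uses graphlab.connected_components on an SGraph, and the names
# below are illustrative only.
def _example_entity_labels(record_ids, candidate_pairs):
    """Assign an integer entity label to each record via union-find.

    >>> labels = _example_entity_labels(['a.0', 'a.1', 'b.9'],
    ...                                 [('a.0', 'a.1'), ('a.1', 'b.9')])
    >>> sorted(labels.items())
    [('a.0', 0), ('a.1', 0), ('b.9', 0)]
    """
    parent = {r: r for r in record_ids}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]   # path halving
            x = parent[x]
        return x

    for u, v in candidate_pairs:
        parent[find(u)] = find(v)

    # Relabel component roots as consecutive integers, in record order.
    root_labels, labels = {}, {}
    for r in record_ids:
        root = find(r)
        if root not in root_labels:
            root_labels[root] = len(root_labels)
        labels[r] = root_labels[root]

    return labels
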
def create(dataset, features=None, verbose=True):
    """
    Create an anomaly detection model. Based on the type of the input data,
    this function automatically chooses the anomaly detection model and the
    type of anomalies to search for. Generally speaking, if the input data
    appears to be a time series---if the dataset type is TimeSeries, one of
    the features is of type datetime.datetime, or there is only a single
    feature---the toolkit chooses the moving Z-score model.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input dataset. Determines the type of anomaly detection model and the
        types of anomalies to search for.

    features : list[str], optional
        Names of columns in the input 'dataset' to use as features.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : GraphLab Create model

    See Also
    --------
    local_outlier_factor.create, graphlab.toolkits.dbscan.create

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    ...
    >>> m = graphlab.anomaly_detection.create(sf)
    >>> type(m)
    graphlab.toolkits.anomaly_detection.local_outlier_factor.LocalOutlierFactorModel
    ...
    >>> m['scores']
    +--------+----------------------+
    | row_id | local_outlier_factor |
    +--------+----------------------+
    |   2    |    0.951567102896    |
    |   0    |    0.951567102896    |
    |   5    |    1.00783754045     |
    |   4    |    0.982224576307    |
    |   3    |    1.05829898642     |
    |   1    |    1.05829898642     |
    |   6    |    2.52792223974     |
    +--------+----------------------+
    [7 rows x 2 columns]
    """
    _mt._get_metric_tracker().track('toolkit.anomaly_detection.create')

    ## Basic validation of the input dataset.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1 or len(dataset.column_names()) < 1:
        raise TypeError("Input 'dataset' is empty.")

    ## Figure out the features and do basic validation.
    if features is None:
        features = dataset.column_names()

    if (not isinstance(features, list) or
            not all([type(c) == str for c in features])):
        raise TypeError("If specified, input 'features' must be a list " +
                        "of strings.")

    if not all([c in dataset.column_names() for c in features]):
        raise _ToolkitError("The specified features could not all be found " +
                            "in the input 'dataset'.")

    ## If any valid features are datetime types, LOF is not valid. If there
    ## is more than one value feature, the moving Z-score is not valid.

    # Figure out if there is a datetime column.
    col_types = {k: v for k, v in zip(dataset.column_names(),
                                      dataset.column_types())}

    datetime_features = [c for c in features if col_types[c] == _dt.datetime]
    value_features = [c for c in features if col_types[c] != _dt.datetime]

    ## Decide which model to use.
    try_zscore = False

    if isinstance(dataset, _gl.TimeSeries):
        try_zscore = True
    else:  # dataset is an SFrame
        if len(datetime_features) > 0:
            try_zscore = True

        if len(value_features) == 1 and (col_types[value_features[0]] in
                                         (int, float)):
            try_zscore = True

    ## Create the relevant model.
    bandwidth = max(1, int(0.05 * len(dataset)))

    if try_zscore:
        if len(value_features) != 1 or len(datetime_features) > 1:
            raise _ToolkitError("Cannot select an appropriate anomaly " +
                                "detection model. For a local outlier factor " +
                                "model, please remove any datetime-type " +
                                "features. For a moving Z-score model, please " +
                                "identify one data feature (integer- or " +
                                "float-type) and at most one datetime column " +
                                "as an index (this indexing is done " +
                                "automatically for TimeSeries objects).")

        if isinstance(dataset, _gl.SFrame) and len(datetime_features) == 1:
            _dataset = _gl.TimeSeries(dataset, index=datetime_features[0])
        else:
            _dataset = dataset[:]

        if verbose:
            print("Creating a moving Z-score anomaly detection model.")

        model = _gl.moving_zscore.create(dataset=_dataset,
                                         feature=value_features[0],
                                         window_size=bandwidth,
                                         verbose=verbose)

    ## If not doing the moving Z-score, use the local outlier factor model.
    else:
        if verbose:
            print("Creating a local outlier factor model.")

        model = _gl.local_outlier_factor.create(dataset=dataset,
                                                features=features,
                                                num_neighbors=bandwidth,
                                                verbose=verbose)

    return model
def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for each
    data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
    ----------
    dataset : SFrame
        Each row in the SFrame is an observation.

    num_clusters : int
        Number of clusters. This is the 'k' in k-means.

    features : list[string], optional
        Names of feature columns to use in computing distances between
        observations and cluster centers. 'None' (the default) indicates that
        all columns should be used as features. Columns may be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: list of numeric (integer or float) values. Each list
          element is treated as a distinct feature in the model.

        - *Dict*: dictionary of keys mapped to numeric values. Each unique key
          is treated as a distinct feature in the model.

        Note that columns of type *list* are not supported. Convert them to
        array columns if all entries in the list are of numeric types.

    initial_centers : SFrame, optional
        If None (default), k-means++ intelligently chooses initial cluster
        centers. Otherwise, the algorithm starts with the centers provided in
        this SFrame. If this SFrame is provided, the ``num_clusters``
        parameter does not need to be specified. ``initial_centers`` must have
        the columns specified in the ``features`` argument.

    max_iterations : int, optional
        The maximum number of iterations to run. Prints a warning if the
        algorithm does not converge after max_iterations iterations. If set
        to 0, the model returns clusters defined by the initial centers and
        assignments to those centers.

    batch_size : int, optional
        Number of randomly-chosen data points to use in each iteration. If
        `None` (the default) or greater than the number of rows in `dataset`,
        then this parameter is ignored: all rows of `dataset` are used in
        each iteration and model training terminates once point assignments
        stop changing or `max_iterations` is reached.

    verbose : bool, optional
        If True, print model training progress to the screen.

    Returns
    -------
    out : KmeansModel
        A Model object containing a cluster id for each data point, and the
        centers of the clusters.

    See Also
    --------
    KmeansModel

    References
    ----------
    - `Wikipedia - k-means clustering
      <http://en.wikipedia.org/wiki/K-means_clustering>`_

    - Arthur, D. and Vassilvitskii, S. (2007) `k-means++: The Advantages of
      Careful Seeding <http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf>`_.
      In Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
      Algorithms. pp. 1027-1035.

    - Elkan, C. (2003) `Using the triangle inequality to accelerate k-means
      <http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf>`_. In Proceedings
      of the Twentieth International Conference on Machine Learning, Volume
      3, pp. 147-153.

    - Sculley, D. (2010) `Web Scale K-Means Clustering
      <http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf>`_. In
      Proceedings of the 19th International Conference on World Wide Web. pp.
1177-1178 Examples -------- >>> sf = graphlab.SFrame({ "d1": [ 0.46973508, 0.0063261, 0.14143399, 0.35025834, 0.83728709, 0.81438336, 0.74205833, 0.36273747, 0.00793858, 0.02298716], "d2": [ 0.51050977, 0.82167952, 0.61451765, 0.51179513, 0.35223035, 0.59366481, 0.48848649, 0.90026032, 0.78798728, 0.40125452], "d3": [ 0.71716265, 0.54163387, 0.55577274, 0.12619953, 0.80172228, 0.21519973, 0.21014113, 0.54207596, 0.65432528, 0.00754797], "d4": [ 0.69486673, 0.92585721, 0.95461882, 0.72658554, 0.86590678, 0.18017175, 0.60361348, 0.89223113, 0.37992791, 0.44700959] }) It's important to standardize our columns to get the best results possible from the k-means algorithm. >>> for col in ['d1', 'd2', 'd3', 'd4']: sf[col] = (sf[col] - sf[col].mean()) / sf[col].std() >>> model = graphlab.kmeans.create(sf, num_clusters=3) """ _mt._get_metric_tracker().track('toolkit.kmeans.create') opts = {'model_name': 'kmeans', 'max_iterations': max_iterations, 'verbose': verbose} ## Validate input dataset if not (isinstance(dataset, _SFrame)): raise TypeError("Input 'dataset' must be an SFrame.") if dataset.num_rows() == 0 or dataset.num_cols() == 0: raise ValueError("Input 'dataset' has no data.") ## Validate input initial centers if initial_centers is not None: if not (isinstance(initial_centers, _SFrame)): raise TypeError("Input 'initial_centers' must be an SFrame.") if initial_centers.num_rows() == 0 or initial_centers.num_cols() == 0: raise ValueError("An 'initial_centers' argument is provided " +\ "but has no data.") ## Validate number of clusters if initial_centers is None: if num_clusters is None: raise ValueError("Number of clusters cannot be determined from " +\ "'num_clusters' or 'initial_centers'. You must " +\ "specify one of these arguments.") else: _num_clusters = num_clusters else: num_centers = initial_centers.num_rows() if num_clusters is None: _num_clusters = num_centers else: if num_clusters != num_centers: raise ValueError("The value of 'num_clusters' does not match " +\ "the number of provided initial centers. " +\ "Please provide only one of these arguments " +\ "or ensure the values match.") else: _num_clusters = num_clusters if not isinstance(_num_clusters, int): raise _ToolkitError("Parameter 'num_clusters' must be an integer.") if _num_clusters > dataset.num_rows(): raise ValueError("The desired number of clusters exceeds the number " + "of data points. Please set 'num_clusters' to be " + "smaller than the number of data points.") opts['num_clusters'] = _num_clusters ## Validate the features in the dataset features = _select_valid_features(dataset, features, [_array, dict, int, float]) sf_features = dataset.select_columns(features) opts['features'] = sf_features ## Validate the features in the initial centers (if provided) if initial_centers is not None: try: initial_centers = initial_centers.select_columns(features) except: raise ValueError("Specified features cannot be extracted from the " +\ "provided initial centers.") if initial_centers.column_types() != sf_features.column_types(): raise TypeError("Feature types are different in the dataset and " +\ "initial centers.") else: initial_centers = _graphlab.SFrame() opts['initial_centers'] = initial_centers ## Validate the batch size and determine the training method. 
    if batch_size is None:
        opts['method'] = 'elkan'
        opts['batch_size'] = dataset.num_rows()
    else:
        opts['method'] = 'minibatch'
        opts['batch_size'] = batch_size

    ## Create and return the model
    params = _graphlab.toolkits._main.run('kmeans_train', opts, verbose)
    return KmeansModel(params['model'])
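
# Standardizing features, as in the docstring example above, usually improves
# k-means results when columns are measured on different scales. A small
# helper sketch using only SFrame operations shown elsewhere in this file;
# the function name is illustrative and not part of the toolkit's API.
def _example_standardize(sf, columns):
    """Return a copy of `sf` with each named column centered and scaled.

    Assumes each named column is numeric with nonzero standard deviation.
    """
    out = sf[:]  # copy, so the caller's SFrame is untouched
    for col in columns:
        out[col] = (out[col] - out[col].mean()) / out[col].std()
    return out
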
def create(dataset, label=None, features=None, distance=None, method='auto', verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of GraphLab Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *String*: string values. Columns of type *list* are not supported. Convert them to array columns if all entries in the list are of numeric types. Please note: if a composite distance is also specified, this parameter is ignored. distance : string, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~graphlab.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about GraphLab Create distance functions, please see the :py:mod:`~graphlab.toolkits.distances` module. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. 
- *ball_tree*: use a tree structure to find the k-closest neighbors to
          each query point. The ball tree model is slower to construct than
          the brute force model, but queries are faster than linear time.
          This method is not applicable for the cosine and dot product
          distances. See `Liu, et al (2004)
          <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_
          for implementation details.

        - *brute_force*: compute the distance from a query point to all
          reference observations. There is no computation time for model
          creation with the brute force method (although the reference data
          is held in the model), but each query takes linear time.

        - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate
          nearest neighbors efficiently. The LSH model supports 'euclidean',
          'squared_euclidean', 'manhattan', 'cosine', 'jaccard',
          'dot_product' (deprecated), and 'transformed_dot_product'
          distances. Two options are provided for LSH -- ``num_tables`` and
          ``num_projections_per_table``. See the notes below for details.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options for the distance function and query method.

        - *leaf_size*: for the ball tree method, the number of points in
          each leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: for the LSH method, the number of hash tables
          constructed. The default value is 20. We recommend choosing values
          from 10 to 30.

        - *num_projections_per_table*: for the LSH method, the number of
          projections/hash functions for each hash table. The default value
          is 4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for
          other distances. We recommend values from 2 to 6 for 'jaccard'
          distance, 8 to 20 for 'cosine' distance, and 4 to 12 for other
          distances.

    Returns
    -------
    out : NearestNeighborsModel
        A structure for efficiently computing the nearest neighbors in
        'dataset' of new query points.

    See Also
    --------
    NearestNeighborsModel.query, graphlab.toolkits.distances

    Notes
    -----
    - Missing data is not allowed in the 'dataset' provided to this
      function. Please use the :func:`graphlab.SFrame.fillna` and
      :func:`graphlab.SFrame.dropna` utilities to handle missing data before
      creating a nearest neighbors model.

    - Missing keys in sparse vectors are assumed to have value 0.

    - The `composite_params` parameter was removed as of GraphLab Create
      version 1.5. The `distance` parameter now accepts either standard or
      composite distances. Please see the :mod:`~graphlab.toolkits.distances`
      module documentation for more information on composite distances.

    - If the features should be weighted equally in the distance
      calculations but are measured on different scales, it is important to
      standardize the features. One way to do this is to subtract the mean
      of each column and divide by the standard deviation.

    **Locality Sensitive Hashing (LSH)**

    There are several efficient nearest neighbors search algorithms that
    work well for data with low dimensionality :math:`d` (up to about 50).
    However, most of these solutions suffer from either space or query time
    that is exponential in :math:`d`. For large :math:`d`, they often
    provide little, if any, improvement over the 'brute_force' method. This
    is a well-known consequence of the phenomenon called `The Curse of
    Dimensionality`.
    `Locality Sensitive Hashing (LSH)
    <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an
    approach designed to efficiently solve the *approximate* nearest
    neighbor search problem for high dimensional data. The key idea of LSH
    is to hash the data points using several hash functions, so that the
    probability of collision is much higher for data points which are close
    to each other than for those which are far apart.

    An LSH family is a family of functions :math:`h` which map points from
    the metric space to a bucket, so that

    - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least
      probability :math:`p_1`.
    - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability
      at most :math:`p_2`.

    LSH for efficient approximate nearest neighbor search:

    - We define a new family of hash functions :math:`g`, where each
      function :math:`g` is obtained by concatenating :math:`k` functions
      :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`. The
      algorithm constructs :math:`L` hash tables, each of which corresponds
      to a different randomly chosen hash function :math:`g`. There are
      :math:`k \\cdot L` hash functions used in total.

    - In the preprocessing step, we hash all :math:`n` reference points
      into each of the :math:`L` hash tables.

    - Given a query point :math:`q`, the algorithm iterates over the
      :math:`L` hash functions :math:`g`. For each :math:`g` considered, it
      retrieves the data points that are hashed into the same bucket as
      :math:`q`. These data points from all the :math:`L` hash tables are
      considered as candidates that are then re-ranked by their real
      distances to the query data.

    **Note** that the number of tables :math:`L` and the number of hash
    functions per table :math:`k` are the two main parameters. They can be
    set using the options ``num_tables`` and ``num_projections_per_table``
    respectively.

    Hash functions for different distances:

    - ``euclidean`` and ``squared_euclidean``:
      :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor`, where
      :math:`a` is a vector whose elements are sampled independently from a
      normal distribution, and :math:`b` is a number sampled uniformly from
      :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We set
      :math:`r` using the average of all pairwise `euclidean` distances in
      a small, randomly sampled subset of the reference data.

    - ``manhattan``: the hash function for ``manhattan`` is similar to that
      of ``euclidean``. The only difference is that the elements of
      :math:`a` are sampled from a Cauchy distribution instead of a normal
      distribution.

    - ``cosine``: random projection is designed to approximate the cosine
      distance between vectors. The hash function is
      :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is a randomly sampled
      vector with normally distributed elements.

    - ``jaccard``: we use the recently proposed one permutation hashing
      method of Shrivastava and Li. See the paper `[Shrivastava and Li, UAI
      2014] <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_
      for details.

    - ``dot_product``: the reference data points are first transformed to
      fixed-norm vectors, so that the minimum ``dot_product`` distance
      search problem can be solved by finding the reference data with the
      smallest ``cosine`` distances. See the paper `[Neyshabur and Srebro,
      ICML 2015] <http://jmlr.org/proceedings/papers/v37/neyshabur15.html>`_
      for details.

    References
    ----------
    - `Wikipedia - nearest neighbor search
      <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_

    - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_

    - Ball tree implementation: Liu, T., et al.
      (2004) `An Investigation of Practical Approximate Nearest Neighbor
      Algorithms
      <http://papers.nips.cc/paper/2666-an-investigation-of-practical-approximate-nearest-neighbor-algorithms>`_.
      Advances in Neural Information Processing Systems. pp. 825-832.

    - `Wikipedia - Jaccard distance
      <http://en.wikipedia.org/wiki/Jaccard_index>`_

    - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding
      the Jaccard Median
      <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_.
      Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete
      Algorithms. Society for Industrial and Applied Mathematics.

    - `Wikipedia - Cosine distance
      <http://en.wikipedia.org/wiki/Cosine_similarity>`_

    - `Wikipedia - Levenshtein distance
      <http://en.wikipedia.org/wiki/Levenshtein_distance>`_

    - Locality Sensitive Hashing: Chapter 3 of the book `Mining Massive
      Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_.

    Examples
    --------
    Construct a nearest neighbors model with automatically determined method
    and distance:

    >>> sf = graphlab.SFrame({'X1': [0.98, 0.62, 0.11],
    ...                       'X2': [0.69, 0.58, 0.36],
    ...                       'str_feature': ['cat', 'dog', 'fossa']})
    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'])

    For datasets with a large number of rows and up to about 100 variables,
    the ball tree method often leads to much faster queries.

    >>> model = graphlab.nearest_neighbors.create(sf, features=['X1', 'X2'],
    ...                                           method='ball_tree')

    Often the final determination of a neighbor is based on several distance
    computations over different sets of features. Each part of this composite
    distance may have a different relative weight.

    >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.],
    ...            [['str_feature'], 'levenshtein', 3.]]
    ...
    >>> model = graphlab.nearest_neighbors.create(sf, distance=my_dist)
    """

    ## Validate the 'dataset' input
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError("'{}' is not a valid keyword argument".format(k) +
                                " for the nearest neighbors model. Please " +
                                "check for capitalization and other typos.")

    ## Exclude inappropriate combinations of method and distance
    if method == 'ball_tree' and (distance == 'cosine'
                                  or distance == _graphlab.distances.cosine
                                  or distance == 'dot_product'
                                  or distance == _graphlab.distances.dot_product
                                  or distance == 'transformed_dot_product'
                                  or distance == _graphlab.distances.transformed_dot_product):
        raise TypeError("The ball tree method does not work with 'cosine', " +
                        "'dot_product', or 'transformed_dot_product' distance. " +
                        "Please use the 'brute_force' method for these distances.")

    if method == 'lsh' and 'num_projections_per_table' not in _method_options:
        if distance == 'jaccard' or distance == _graphlab.distances.jaccard:
            _method_options['num_projections_per_table'] = 4
        elif distance == 'cosine' or distance == _graphlab.distances.cosine:
            _method_options['num_projections_per_table'] = 16
        else:
            _method_options['num_projections_per_table'] = 8

    ## Initial validation and processing of the label
    _dataset, _label = _tkutl._validate_row_label(dataset, label=label)
    ref_labels = _dataset[_label]

    ## Determine the internal list of available feature names (may still
    #  include the row label name).
    if features is None:
        _features = _dataset.column_names()
    else:
        _features = _copy.deepcopy(features)

    ## Check if there's only one feature and it's the same as the row label.
    #  This would also be trapped by the composite distance validation, but
    #  the error message is not very informative for the user.
    free_features = set(_features).difference([_label])
    if len(free_features) < 1:
        raise _ToolkitError("The only available feature is the same as the " +
                            "row label column. Please specify features " +
                            "that are not also row labels.")

    ### Validate and preprocess the distance function
    ### ---------------------------------------------
    # - The form of the 'distance' controls how we interact with the
    #   'features' parameter as well.
    # - At this point, the row label 'label' may still be in the list(s) of
    #   features.

    ## Convert any distance function input into a single composite distance.
    # distance is already a composite distance
    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    # distance is a single name (except 'auto') or a function handle.
    elif (hasattr(distance, '__call__') or
          (isinstance(distance, str) and distance != 'auto')):
        distance = [[_features, distance, 1]]

    # distance is unspecified and needs to be constructed.
    elif distance is None or distance == 'auto':
        distance = _construct_auto_distance(_features,
                                            _dataset.column_names(),
                                            _dataset.column_types())

    else:
        raise TypeError("Input 'distance' not understood. The 'distance' " +
                        "argument must be a string, function handle, or " +
                        "composite distance.")

    ## Basic composite distance validation, remove the row label from all
    #  feature lists, and convert string distance names into distance
    #  functions.
    distance = _scrub_composite_distance_features(distance, [_label])
    distance = _convert_distance_names_to_functions(distance)
    _validate_composite_distance(distance)

    ## Raise an error if any Levenshtein distance component uses multiple
    #  feature columns; string features must be in a single column.
    for d in distance:
        feature_names, dist, _ = d

        if len(feature_names) > 1 and dist == _graphlab.distances.levenshtein:
            raise ValueError("Levenshtein distance cannot be used with multiple " +
                             "columns. Please concatenate strings into a single " +
                             "column before creating the nearest neighbors model.")

    ## Get the union of feature names and make a clean dataset.
    clean_features = _get_composite_distance_features(distance)
    sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features)

    ## Decide which method to use
    ## - If there is more than one distance component (specified either
    #    directly or generated automatically because distance was set to
    #    'auto'), then use brute force.
    if len(distance) > 1:
        _method = 'brute_force'

        if method != 'brute_force' and verbose is True:
            print("Defaulting to brute force instead of ball tree because " +
                  "there are multiple distance components.")

    else:
        if method == 'auto':
            # get the total number of variables. Assume the number of
            # elements in array-type columns does not change.
            num_variables = sum([len(x) if hasattr(x, '__iter__') else 1
                                 for x in sf_clean[0].itervalues()])

            # flag if all the features in the single composite distance are
            # of numeric type.
numeric_type_flag = all([x in [int, float, list, array.array] for x in sf_clean.column_types()]) ## Conditions necessary for ball tree to work and be worth it if ((distance[0][1] in ['euclidean', 'manhattan', _graphlab.distances.euclidean, _graphlab.distances.manhattan]) and numeric_type_flag is True and num_variables <= 200): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_balltree.create') elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_brute.create') elif _method == 'lsh': model_name = 'nearest_neighbors_lsh' _mt._get_metric_tracker().track('toolkit.nearest_neighbors_lsh.create') else: raise ValueError("Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'.") ## Package the model options opts = {} opts.update(_method_options) opts.update( {'model_name': model_name, 'ref_labels': ref_labels, 'label': label, 'sf_features': sf_clean, 'composite_params': distance}) ## Construct the nearest neighbors model if not verbose: _mt.main.get_client().set_log_progress(False) result = _graphlab.extensions._nearest_neighbors.train(opts) _mt.main.get_client().set_log_progress(True) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) return model
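
# The LSH notes above define the hash for euclidean distance as
# h(q) = floor((a . q + b) / w). A numpy sketch of the projections for a
# single hash table; the parameterization below is an assumption for
# illustration and does not reflect the toolkit's internal implementation.
def _example_lsh_hash(points, num_projections, bucket_width, seed=0):
    """Hash each row of `points` (shape (n, d)) to a tuple of bucket ids.

    >>> import numpy as np
    >>> buckets = _example_lsh_hash(np.random.rand(5, 3), 4, 1.0)
    >>> len(buckets), len(buckets[0])
    (5, 4)
    """
    import numpy as np

    rng = np.random.RandomState(seed)
    d = points.shape[1]
    a = rng.normal(size=(d, num_projections))                 # directions
    b = rng.uniform(0.0, bucket_width, size=num_projections)  # offsets
    buckets = np.floor((points.dot(a) + b) / bucket_width).astype(int)
    return [tuple(row) for row in buckets]
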