def create(dataset, features=None, distance=None, method='auto',
           verbose=True, **kwargs):
    """
    Create a RecordLinker model to match query records to a reference dataset
    of records, assuming both sets have the same general form.

    Parameters
    ----------
    dataset : SFrame
        Reference data, against which to link new queries with the 'link'
        method. The 'dataset' SFrame must include at least the features
        specified in the 'features' or 'distance' parameter.

    features : list[string], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns should be used. Each column
        can be one of the following types:

        - *Numeric*: values of numeric type integer or float.

        - *Array*: array of numeric (integer or float) values. Each array
          element is treated as a separate variable in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key indicates a separate variable in the model.

        - *String*: string values.

        Please note: if 'distance' is specified as a composite distance, then
        that parameter controls which features are used in the model.

    distance : string or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list
          of distance components, each of which is itself a list containing
          three items:

          1. list or tuple of feature names (strings)

          2. standard distance name (string)

          3. scaling factor (int or float)

        For more information about GraphLab Create distance functions, please
        see the :py:mod:`~graphlab.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified or set to 'auto', a composite
        distance is constructed automatically based on feature types.

    method : {'auto', 'brute_force', 'lsh', 'ball_tree'}, optional
        Strategy for the nearest neighbors search. If not specified or 'auto',
        the search strategy is chosen automatically based on the data type and
        dimension.

    verbose : bool, optional
        If True, print progress updates and model details.

    **kwargs : optional
        Options passed through to the nearest_neighbors toolkit for particular
        nearest neighbors search strategies:

        - *leaf_size*: for the ball tree method, the number of points in each
          leaf of the tree. The default is to use the max of 1,000 and
          n/(2^11), which ensures a maximum tree depth of 12.

        - *num_tables*: for the LSH method, the number of hash tables
          constructed.

        - *num_projections_per_table*: for the LSH method, the number of
          projections for each hash table.

    Returns
    -------
    out : RecordLinker model.

    See Also
    --------
    RecordLinker, graphlab.toolkits.nearest_neighbors

    Notes
    -----
    - Standardizing features is often a good idea with distance-based methods,
      but this model does *not* standardize features.

    - For features that all have the same type, the distance parameter may be
      a single standard distance function name (e.g. "euclidean"). In the
      model, however, all distances are first converted to composite distance
      functions; as a result, the 'distance' field in the model is always a
      composite distance.

    References
    ----------
    - Christen, Peter. "Data matching: concepts and techniques for record
      linkage, entity resolution, and duplicate detection." Springer Science &
      Business Media, 2012.

    Examples
    --------
    >>> homes = graphlab.SFrame({'sqft': [1230, 875, 1745],
    ...                          'street': ['phinney', 'fairview', 'cottage'],
    ...                          'city': ['seattle', 'olympia', 'boston'],
    ...                          'state': ['WA', 'WA', 'MA']})
    ...
    >>> model = graphlab.record_linker.create(homes, features=['city'],
    ...                                       distance='levenshtein')
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    start_time = _time.time()

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Clean the method options and create the options dictionary.
    allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table']
    _method_options = {}

    for k, v in kwargs.items():
        if k in allowed_kwargs:
            _method_options[k] = v
        else:
            raise _ToolkitError(
                "'{}' is not a valid keyword argument".format(k) +
                " for the nearest neighbors model. Please " +
                "check for capitalization and other typos.")

    ## Validate the features input.
    if features is not None:
        if not hasattr(features, '__iter__'):
            raise TypeError("Input 'features' must be a list.")

        if not all([isinstance(x, str) for x in features]):
            raise TypeError("Input 'features' must contain only strings.")
    else:
        features = dataset.column_names()

    ## Validate and preprocess the distance input.
    col_types = {k: v for k, v in zip(dataset.column_names(),
                                      dataset.column_types())}

    if isinstance(distance, list):
        distance = _copy.deepcopy(distance)

    elif isinstance(distance, str):
        # this will likely produce errors downstream if 'features' was not
        # specified by the user.
        distance = [[features, distance, 1]]

    elif distance is None:
        distance = _construct_auto_distance(features, col_types)

    else:
        raise TypeError("Input 'distance' not understood. For the " +
                        "data matching toolkit, 'distance' must be a string " +
                        "or a composite distance list.")

    ## Validate the composite distance and set it in the model.
    allowed_dists = {
        'euclidean': [int, float, _array.array],
        'squared_euclidean': [int, float, _array.array],
        'manhattan': [int, float, _array.array],
        'levenshtein': [str],
        'jaccard': [str, dict],
        'weighted_jaccard': [str, dict],
        'cosine': [int, float, str, dict, _array.array],
        'dot_product': [int, float, str, dict, _array.array],
        'transformed_dot_product': [int, float, str, dict, _array.array]}

    distance = _dmutl.validate_composite_distance(
        distance, row_label=None,
        allowed_dists=list(allowed_dists.keys()),
        verbose=verbose)

    ## Validate feature types against distance functions.
    _dmutl.validate_distance_feature_types(dataset, distance, allowed_dists)

    ## Clean and impute string data.
    # *** NOTE: after this, the composite distance and feature set will be
    # modified and useless to the user, so set the state here. ***
    state = {'distance': distance,
             'num_distance_components': len(distance)}

    union_features = _dmutl.extract_composite_features(distance)
    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(distance)

    for ftr in union_features:
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [new_ftr if x == ftr else x
                                for x in dist_comp[0]]

    ## Convert strings to dicts if the distance isn't levenshtein, and
    #  concatenate string columns within a distance component into a single
    #  feature.
    _dataset, _distance = _engineer_distance_features(_dataset, _distance)

    ## Create the nearest neighbors model and set it in the model state.
    nn_model = _gl.nearest_neighbors.create(_dataset, distance=_distance,
                                            method=method, verbose=verbose,
                                            **kwargs)

    ## Postprocessing and formatting
    state.update({'verbose': verbose,
                  'num_examples': dataset.num_rows(),
                  'features': union_features,
                  'nearest_neighbors_model': nn_model,
                  'num_features': len(union_features),
                  'method': nn_model['method'],
                  'training_time': _time.time() - start_time})

    model = RecordLinker(state)
    return model
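
# The Examples section above only demonstrates a single string distance. As a
# rough sketch (not part of the original module), a composite distance over
# the same 'homes' SFrame might look like the following; the feature
# groupings and weights are illustrative only:
#
# >>> composite_dist = [[('city', 'street'), 'levenshtein', 1.0],
# ...                   [('sqft',), 'euclidean', 0.01]]
# >>> model = graphlab.record_linker.create(homes, distance=composite_dist)
#
# Method-specific options from **kwargs are passed straight through to the
# nearest neighbors toolkit. For example, assuming an LSH-compatible distance
# such as 'euclidean' (values here are illustrative):
#
# >>> model = graphlab.record_linker.create(homes, features=['sqft'],
# ...                                       distance='euclidean',
# ...                                       method='lsh', num_tables=20,
# ...                                       num_projections_per_table=4)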
def link(self, dataset, k=5, radius=None, verbose=True):
    """
    Find matching records from the reference dataset (entered when the model
    was created) for each record in the 'dataset' passed to this function. The
    query dataset must include columns with the same names as the label and
    feature columns used to create the RecordLinker model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as the
        features used to train the model. Additional columns are allowed, but
        ignored. Please see the nearest neighbors
        :func:`~graphlab.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    k : int, optional
        Maximum number of nearest neighbors to return from the reference set
        for each query observation. The default is 5, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the ``k``
        nearest neighbors are returned for each query point, regardless of
        distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation. The
        result contains four columns: the first is the row label of the query
        observation, the second is the row label of the nearby reference
        observation, the third is the distance between the query and reference
        observations, and the fourth is the rank of the reference observation
        among the query's k-nearest neighbors.

    Notes
    -----
    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output is
      an SFrame with :math:`nm` rows.

    Examples
    --------
    Assume we've created the model from the example in the RecordLinker
    'create' function.

    >>> queries = graphlab.SFrame({'sqft': [986, 1320],
    ...                            'street': ['fremont', 'phiney'],
    ...                            'city': ['sea', 'seattle'],
    ...                            'state': ['WA', 'WA']})
    ...
    >>> model.link(queries, k=2, radius=5.)
    +-------------+-----------------+----------+------+
    | query_label | reference_label | distance | rank |
    +-------------+-----------------+----------+------+
    |      0      |        0        |   4.0    |  1   |
    |      0      |        2        |   5.0    |  2   |
    |      1      |        0        |   0.0    |  1   |
    +-------------+-----------------+----------+------+
    """
    _mt._get_metric_tracker().track(self.__module__ + '.link_records')

    ## Validate the 'dataset' input.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Make sure all of the necessary features are present at 'link' time.
    sf_features = _tkutl._toolkits_select_columns(dataset,
                                                  self.get('features'))

    ## Clean and impute string data. *** Think about consolidating this and
    #  the next step into a feature transformer. ***
    col_types = {k: v for k, v in zip(dataset.column_names(),
                                      dataset.column_types())}

    _dataset = _copy.copy(dataset)
    _distance = _copy.deepcopy(self.__proxy__['distance'])

    for ftr in self.get('features'):
        if col_types[ftr] == str:
            new_ftr = '__clean.' + ftr
            _dataset[new_ftr] = _dataset[ftr].fillna("")
            _dataset[new_ftr] = _dataset[new_ftr].apply(
                lambda x: _dmutl.cleanse_string(x), dtype=str)

            for dist_comp in _distance:
                dist_comp[0] = [new_ftr if x == ftr else x
                                for x in dist_comp[0]]

    ## Convert strings to dicts and concatenate string features.
    _dataset, _ = _engineer_distance_features(_dataset, _distance)

    ## Query the nearest neighbors model.
    result = self.__proxy__['nearest_neighbors_model'].query(
        _dataset, k=k, radius=radius, verbose=verbose)

    return result
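
# A minimal follow-on sketch (not part of the original module): the
# 'query_label' and 'reference_label' columns returned by link() are row
# indices, so matches can be joined back to the reference SFrame. 'homes',
# 'queries', and 'model' are the objects from the docstring examples above.
#
# >>> matches = model.link(queries, k=1)
# >>> matches = matches.join(homes.add_row_number('reference_label'),
# ...                        on='reference_label', how='left')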