def create(graph, verbose=True):
    """
    Count, for every vertex, the triangles that vertex belongs to.

    Edge directions are ignored. A triangle is a complete subgraph with only
    three vertices. The returned model object carries the total number of
    triangles as well as the triangle count of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute triangle counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : TriangleCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - T. Schank. (2007) `Algorithmic Aspects of Triangle-Based Network
      Analysis
      <http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/4541>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create a
    :class:`~turicreate.triangle_counting.TriangleCountingModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    >>> tc = turicreate.triangle_counting.create(g)

    We can obtain the number of triangles that each vertex in the graph ``g``
    is present in:

    >>> tc_out = tc['triangle_count']  # SFrame

    We can add the new "triangle_count" field to the original graph g using:

    >>> g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    TriangleCountingModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # The toolkit call runs under QuietProgress so `verbose` controls output.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.triangle_counting
        response = toolkit.create({"graph": graph.__proxy__})

    model_proxy = response["model"]
    return TriangleCountingModel(model_proxy)
def create(graph, verbose=True):
    """
    Compute a coloring of the graph.

    Every vertex is assigned a color such that no adjacent vertices have the
    same color. The returned model object carries the total number of colors
    used as well as the color ID of each vertex in the graph. The algorithm
    is greedy and is not guaranteed to find the **minimum** graph coloring.
    It is also not deterministic, so successive runs may return different
    answers.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the coloring.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : GraphColoringModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - graph coloring
      <http://en.wikipedia.org/wiki/Graph_coloring>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create a
    :class:`~turicreate.graph_coloring.GraphColoringModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    >>> gc = turicreate.graph_coloring.create(g)

    We can obtain the ``color id`` corresponding to each vertex in the graph
    ``g`` as follows:

    >>> color_id = gc['color_id']  # SFrame

    We can obtain the total number of colors required to color the graph
    ``g`` as follows:

    >>> num_colors = gc['num_colors']

    See Also
    --------
    GraphColoringModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # The toolkit call runs under QuietProgress so `verbose` controls output.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.graph_coloring
        response = toolkit.create({'graph': graph.__proxy__})

    model_proxy = response['model']
    return GraphColoringModel(model_proxy)
def query(self, dataset, label=None, k=5, radius=None, verbose=True):
    """
    Retrieve, for each row of ``dataset``, the nearest neighbors from the
    model's stored (reference) data.

    In general, the query dataset does not need to be the same as the
    reference data stored in the model.

    Parameters
    ----------
    dataset : SFrame
        Query data. Must contain columns with the same names and types as the
        features used to train the model. Additional columns are allowed, but
        ignored. Please see the nearest neighbors
        :func:`~turicreate.nearest_neighbors.create` documentation for more
        detail on allowable data types.

    label : str, optional
        Name of the query SFrame column with row labels. The column must
        contain integers or strings and cannot be one of the model's
        features. If 'label' is not specified, row numbers are used to
        identify query dataset rows in the output SFrame.

    k : int, optional
        Number of nearest neighbors to return from the reference set for each
        query observation. The default is 5 neighbors, but setting it to
        ``None`` will return all neighbors within ``radius`` of the query
        point.

    radius : float, optional
        Only neighbors whose distance to a query point is smaller than this
        value are returned. The default is ``None``, in which case the ``k``
        nearest neighbors are returned for each query point, regardless of
        distance.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame
        An SFrame with the k-nearest neighbors of each query observation.
        The result contains four columns: the first is the label of the query
        observation, the second is the label of the nearby reference
        observation, the third is the distance between the query and
        reference observations, and the fourth is the rank of the reference
        observation among the query's k-nearest neighbors.

    Raises
    ------
    ValueError
        If ``label`` is not a column of ``dataset`` or is one of the model's
        features, if ``k`` is not a positive integer, or if ``radius`` is not
        a non-negative integer or float.
    TypeError
        If the label column holds neither integers nor strings.

    See Also
    --------
    similarity_graph

    Notes
    -----
    - The `dataset` input to this method *can* have missing values (in
      contrast to the reference dataset used to create the nearest neighbors
      model). Missing numeric values are imputed to be the mean of the
      corresponding feature in the reference dataset, and missing strings are
      imputed to be empty strings.

    - If both ``k`` and ``radius`` are set to ``None``, each query point
      returns all of the reference set. If the reference dataset has
      :math:`n` rows and the query dataset has :math:`m` rows, the output is
      an SFrame with :math:`nm` rows.

    - For models created with the 'lsh' method, the query results may have
      fewer query labels than input query points. Because LSH is an
      approximate method, a query point may have fewer than 'k' neighbors. If
      LSH returns no neighbors at all for a query, the query point is omitted
      from the results.

    Examples
    --------
    First construct a toy SFrame and create a nearest neighbors model:

    >>> sf = turicreate.SFrame({'label': range(3),
    ...                         'feature1': [0.98, 0.62, 0.11],
    ...                         'feature2': [0.69, 0.58, 0.36]})
    >>> model = turicreate.nearest_neighbors.create(sf, 'label')

    A new SFrame contains query observations with same schema as the
    reference SFrame. This SFrame is passed to the ``query`` method.

    >>> queries = turicreate.SFrame({'label': range(3),
    ...                              'feature1': [0.05, 0.61, 0.99],
    ...                              'feature2': [0.06, 0.97, 0.86]})
    >>> model.query(queries, 'label', k=2)
    +-------------+-----------------+----------------+------+
    | query_label | reference_label |    distance    | rank |
    +-------------+-----------------+----------------+------+
    |      0      |        2        | 0.305941170816 |  1   |
    |      0      |        1        | 0.771556867638 |  2   |
    |      1      |        1        | 0.390128184063 |  1   |
    |      1      |        0        | 0.464004310325 |  2   |
    |      2      |        0        | 0.170293863659 |  1   |
    |      2      |        1        | 0.464004310325 |  2   |
    +-------------+-----------------+----------------+------+
    """
    # The query data must be a non-empty SFrame.
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    # Keep only the columns the model was built on.
    ref_features = self.features
    sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

    # Resolve the row labels: a validated user column, or row numbers.
    if label is not None:
        if label not in dataset.column_names():
            raise ValueError(
                "Input 'label' must be a string matching the name of a "
                "column in the reference SFrame 'dataset'.")

        label_dtype = dataset[label].dtype
        if not (label_dtype == str or label_dtype == int):
            raise TypeError(
                "The label column must contain integers or strings.")

        if label in ref_features:
            raise ValueError(
                "The label column cannot be one of the features.")

        query_labels = dataset[label]
    else:
        query_labels = _turicreate.SArray.from_sequence(len(dataset))

    # Validate 'k'; None is passed to the backend as the special value -1.
    if k is None:
        k = -1
    elif not isinstance(k, int):
        raise ValueError("Input 'k' must be an integer.")
    elif k <= 0:
        raise ValueError("Input 'k' must be larger than 0.")

    # Validate 'radius'; None is passed to the backend as -1.0.
    if radius is None:
        radius = -1.0
    elif not isinstance(radius, (int, float)):
        raise ValueError("Input 'radius' must be an integer or float.")
    elif radius < 0:
        raise ValueError("Input 'radius' must be non-negative.")

    opts = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'features': sf_features,
        'query_labels': query_labels,
        'k': k,
        'radius': radius,
    }

    with QuietProgress(verbose):
        result = _turicreate.extensions._nearest_neighbors.query(opts)

    return result['neighbors']
def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. It is normally not called directly;
    call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model. It is looked up as an attribute of
        ``turicreate.extensions`` and called to instantiate the model.

    features : list[string], optional
        List of feature names used by feature column.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing: when the
        training data has at least 100 rows, it is split 95%/5% with
        ``seed``; otherwise an empty SFrame is used. If validation_set is set
        to None, then no additional metrics are computed (an empty SFrame is
        passed to training).

    distributed : env
        The distributed environment. Not referenced by this function's body.

    verbose : boolean
        Whether to print out messages during training.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before being handed to the model.

    Returns
    -------
    out : SupervisedLearningModel
    """
    # Perform error-checking and trim inputs to specified columns.
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    if validation_set is None:
        # No validation requested: train against an empty validation frame.
        validation_set = _turicreate.SFrame()
    elif isinstance(validation_set, str):
        # Sample a validation set from the training data if requested.
        # NOTE(review): presumably _validate_data only lets the string
        # 'auto' through -- confirm against that helper.
        assert validation_set == 'auto'
        if dataset.num_rows() < 100:
            validation_set = _turicreate.SFrame()
        else:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95, seed=seed,
                                                           exact=True)

    # Sanitize model-specific options.
    options = {name.lower(): value for name, value in kwargs.items()}

    # Create a model instance and train it.
    model_factory = getattr(_turicreate.extensions, model_name)
    model = model_factory()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
def create(observation_data,
           user_id='user_id', item_id='item_id', target=None,
           user_data=None, item_data=None,
           num_factors=32,
           regularization=1e-9,
           linear_regularization=1e-9,
           side_data_factorization=True,
           ranking_regularization=0.25,
           unobserved_rating_value=None,
           num_sampled_negative_examples=4,
           max_iterations=25,
           sgd_step_size=0,
           random_seed=0,
           binary_target=False,
           solver='auto',
           verbose=True,
           **kwargs):
    """Create a RankingFactorizationRecommender that learns latent factors
    for each user and item and uses them to make rating predictions.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column
        of user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item. The (user, item) pairs are
        stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target
        ratings column. All other columns are interpreted by the underlying
        model as side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified via `target`. When `target` is None,
        `binary_target` is forced to True.

    user_data : SFrame, optional
        Side information for the users. This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items. This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    num_factors : int, optional
        Number of latent factors. Default: 32.

    regularization : float, optional
        L2 regularization for interaction terms. Default: 1e-9; a typical
        range for this parameter is between 1e-12 and 1. Setting this to 0
        may cause numerical issues.

    linear_regularization : float, optional
        L2 regularization for linear term. Default: 1e-9; a typical range for
        this parameter is between 1e-12 and 1. Setting this to 0 may cause
        numerical issues.

    side_data_factorization : boolean, optional
        Use factorization for modeling any additional features beyond the
        user and item columns. If True, and side features or any additional
        columns are present, then a Factorization Machine model is trained.
        Otherwise, only the linear terms are fit to these features. See
        :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`
        for more information. Default: True.

    ranking_regularization : float, optional
        Penalize the predicted value of user-item pairs not in the training
        set. Larger values increase this penalization. Suggested values: 0,
        0.1, 0.5, 1. NOTE: if no target column is present, this parameter is
        ignored. Default: 0.25.

    unobserved_rating_value : float, optional
        Penalize unobserved items with a larger predicted score than this
        value. By default, the estimated 5% quantile is used
        (mean - 1.96*std_dev). The option is only forwarded to the model when
        a value other than None is given.

    num_sampled_negative_examples : integer, optional
        For each (user, item) pair in the data, the ranking sgd solver
        evaluates this many randomly chosen unseen items for the negative
        example step. Increasing this can give better performance at the
        expense of speed, particularly when the number of items is large.
        Default is 4.

    binary_target : boolean, optional
        Assume the target column is composed of 0's and 1's. If True, use
        logistic loss to fit the model.

    max_iterations : int, optional
        The training algorithm will make at most this many iterations through
        the observed data. Default: 25.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values generally
        lead to more accurate models that take more time to train. The
        default setting of 0 means that the step size is chosen by trying
        several options on a small subset of the data.

    random_seed : int, optional
        The random seed used to choose the initial starting point for model
        training. Note that some randomness in the training is unavoidable,
        so models trained with the same random seed may still differ.
        Default: 0.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. The available solvers for
        this model are:

        - *auto (default)*: automatically chooses the best solver for the
          data and model parameters.
        - *ials*: Implicit Alternating Least Squares [1].
        - *adagrad*: Adaptive Gradient Stochastic Gradient Descent.
        - *sgd*: Stochastic Gradient Descent

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Optional advanced keyword arguments passed in to the model
        optimization procedure. These parameters do not typically need to be
        changed. Names that are not among the model's default options are
        rejected.

    Returns
    -------
    out : RankingFactorizationRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if `kwargs` contains a
        name that is not a recognized option.

    Examples
    --------
    **Basic usage**

    When given just user and item pairs, one can create a
    RankingFactorizationRecommender as follows.

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                         'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> from turicreate.recommender import ranking_factorization_recommender
    >>> m1 = ranking_factorization_recommender.create(sf)

    When a target column is present, one can include this to try and
    recommend items that are rated highly.

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                         'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                         'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m1 = ranking_factorization_recommender.create(sf, target='rating')

    **Including side features**

    >>> user_info = turicreate.SFrame({'user_id': ["0", "1", "2"],
    ...                                'name': ["Alice", "Bob", "Charlie"],
    ...                                'numeric_feature': [0.1, 12, 22]})
    >>> item_info = turicreate.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                                'name': ["item1", "item2", "item3", "item4"],
    ...                                'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                                 {'b' : 1},
    ...                                                 {'a' : 23, 'b' : 32}]})
    >>> m2 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               user_data=user_info,
    ...                                               item_data=item_info)

    **Customizing ranking regularization**

    Create a model that pushes predicted ratings of unobserved user-item
    pairs toward 1 or below.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               ranking_regularization = 0.1,
    ...                                               unobserved_rating_value = 1)

    **Using the implicit alternating least squares model**

    Ranking factorization also implements implicit alternating least squares
    [1] as an alternative solver. This is enabled using ``solver = 'ials'``.

    >>> m3 = ranking_factorization_recommender.create(sf, target='rating',
    ...                                               solver = 'ials')

    See Also
    --------
    :class:`turicreate.recommender.factorization_recommender.FactorizationRecommender`,
    :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    -----------

    [1] Collaborative Filtering for Implicit Feedback Datasets Hu, Y.; Koren,
        Y.; Volinsky, C. IEEE International Conference on Data Mining
        (ICDM 2008), IEEE (2008).
    """
    from turicreate._cython.cy_server import QuietProgress

    if not (isinstance(observation_data, _SFrame)):
        raise TypeError('observation_data input must be a SFrame')

    # Instantiate the backend model and initialize it with an empty option
    # set; the real options are handed over at train() time below.
    opts = {}
    model_proxy = _turicreate.extensions.ranking_factorization_recommender()
    model_proxy.init_options(opts)

    # Absent side data is represented by empty SFrames.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()

    # Without a target column the model is trained with a binary target.
    if target is None:
        binary_target = True

    opts = {
        'user_id': user_id,
        'item_id': item_id,
        'target': target,
        'random_seed': random_seed,
        'num_factors': num_factors,
        'regularization': regularization,
        'linear_regularization': linear_regularization,
        'ranking_regularization': ranking_regularization,
        'binary_target': binary_target,
        'max_iterations': max_iterations,
        'side_data_factorization': side_data_factorization,
        'num_sampled_negative_examples': num_sampled_negative_examples,
        'solver': solver,

        # Has no effect here.
        'sgd_step_size': sgd_step_size,
    }

    # Only forward this option when the caller chose a value, so the
    # backend's own default applies otherwise.
    if unobserved_rating_value is not None:
        opts["unobserved_rating_value"] = unobserved_rating_value

    if kwargs:
        # Advanced options must be among the model's known option names; if
        # those cannot be retrieved, every extra keyword is rejected.
        try:
            possible_args = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

        opts.update(kwargs)

    extra_data = {"nearest_items": _turicreate.SFrame()}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts,
                          extra_data)

    return RankingFactorizationRecommender(model_proxy)
def create(
    graph,
    reset_probability=0.15,
    threshold=1e-2,
    max_iterations=20,
    _single_precision=False,
    _distributed="auto",
    verbose=True,
):
    """
    Compute the PageRank of every vertex in the graph.

    The returned model object carries the total PageRank as well as the
    PageRank value of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, running pagerank in single precision. The resulting
        pagerank values may not be accurate for large graph, but
        should run faster and use less memory.

    _distributed : distributed environment, internal
        Not referenced by this function's body.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order
      to the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.pagerank.PagerankModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    >>> pr = turicreate.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph
    ``g`` using:

    >>> pr_out = pr['pagerank']  # SFrame

    We can add the new pagerank field to the original graph g using:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # Options understood by the backend; note the public `_single_precision`
    # argument is forwarded under the key "single_precision".
    opts = {
        "threshold": threshold,
        "reset_probability": reset_probability,
        "max_iterations": max_iterations,
        "single_precision": _single_precision,
        "graph": graph.__proxy__,
    }

    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.pagerank
        response = toolkit.create(opts)

    return PagerankModel(response["model"])
def create(observation_data,
           user_id="user_id", item_id="item_id", target=None,
           user_data=None, item_data=None,
           num_factors=8,
           regularization=1e-8,
           linear_regularization=1e-10,
           side_data_factorization=True,
           nmf=False,
           binary_target=False,
           max_iterations=50,
           sgd_step_size=0,
           random_seed=0,
           solver="auto",
           verbose=True,
           **kwargs):
    """Create a FactorizationRecommender that learns latent factors for each
    user and item and uses them to make rating predictions.

    This includes both standard matrix factorization as well as factorization
    machines models (in the situation where side data is available for users
    and/or items).

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column
        of user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item. The (user, item) pairs are
        stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target
        ratings column. All other columns are interpreted by the underlying
        model as side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string
        The `observation_data` must contain a column of scores representing
        ratings given by the users; this is the name of that column (the
        signature default is None). If no such column is present, consider
        using the ranking version of the factorization model,
        RankingFactorizationRecommender,
        :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    user_data : SFrame, optional
        Side information for the users. This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items. This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    num_factors : int, optional
        Number of latent factors. Default: 8.

    regularization : float, optional
        Regularization for interaction terms. The type of regularization is
        L2. Default: 1e-8; a typical range for this parameter is between
        1e-12 and 1.

    linear_regularization : float, optional
        Regularization for linear term. Default: 1e-10; a typical range for
        this parameter is between 1e-12 and 1.

    side_data_factorization : boolean, optional
        Use factorization for modeling any additional features beyond the
        user and item columns. If True, and side features or any additional
        columns are present, then a Factorization Machine model is trained.
        Otherwise, only the linear terms are fit to these features. See
        :class:`turicreate.recommender.factorization_recommender.FactorizationRecommender`
        for more information. Default: True.

    nmf : boolean, optional
        Use nonnegative matrix factorization, which forces the factors to be
        nonnegative. Disables linear and intercept terms.

    binary_target : boolean, optional
        Assume the target column is composed of 0's and 1's. If True, use
        logistic loss to fit the model.

    max_iterations : int, optional
        The training algorithm will make at most this many iterations through
        the observed data. Default: 50.

    sgd_step_size : float, optional
        Step size for stochastic gradient descent. Smaller values generally
        lead to more accurate models that take more time to train. The
        default setting of 0 means that the step size is chosen by trying
        several options on a small subset of the data.

    random_seed : int, optional
        The random seed used to choose the initial starting point for model
        training. Note that some randomness in the training is unavoidable,
        so models trained with the same random seed may still differ
        slightly. Default: 0.

    solver : string, optional
        Name of the solver to be used to solve the regression. See the
        references for more detail on each solver. The available solvers for
        this model are:

        - *auto (default)*: automatically chooses the best solver for the
          data and model parameters.
        - *sgd*: Stochastic Gradient Descent.
        - *adagrad*: Adaptive Gradient Stochastic Gradient Descent [1].
        - *als*: Alternating Least Squares.

    verbose : bool, optional
        Enables verbose output.

    kwargs : optional
        Optional advanced keyword arguments passed in to the model
        optimization procedure. These parameters do not typically need to be
        changed. Names that are not among the model's default options are
        rejected.

    Returns
    -------
    out : FactorizationRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if `kwargs` contains a
        name that is not a recognized option.

    Examples
    --------
    **Basic usage**

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                         'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                         'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m1 = turicreate.factorization_recommender.create(sf, target='rating')

    When a target column is present, :meth:`~turicreate.recommender.create`
    defaults to creating a
    :class:`~turicreate.recommender.factorization_recommender.FactorizationRecommender`.

    **Including side features**

    >>> user_info = turicreate.SFrame({'user_id': ["0", "1", "2"],
    ...                                'name': ["Alice", "Bob", "Charlie"],
    ...                                'numeric_feature': [0.1, 12, 22]})
    >>> item_info = turicreate.SFrame({'item_id': ["a", "b", "c", "d"],
    ...                                'name': ["item1", "item2", "item3", "item4"],
    ...                                'dict_feature': [{'a' : 23}, {'a' : 13},
    ...                                                 {'b' : 1},
    ...                                                 {'a' : 23, 'b' : 32}]})
    >>> m2 = turicreate.factorization_recommender.create(sf, target='rating',
    ...                                                  user_data=user_info,
    ...                                                  item_data=item_info)

    **Using the Alternating Least Squares (ALS) solver**

    The factorization model can also be solved using alternating least
    squares (ALS) as a solver option. This solver does not support side
    columns or other similar features.

    >>> m3 = turicreate.factorization_recommender.create(sf, target='rating',
    ...                                                  solver = 'als')

    See Also
    --------
    RankingFactorizationRecommender,
    :class:`turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender`

    References
    -----------

    [1] Duchi, John, Elad Hazan, and Yoram Singer. "Adaptive subgradient
        methods for online learning and stochastic optimization." The Journal
        of Machine Learning Research 12 (2011).
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(observation_data, _SFrame):
        raise TypeError("observation_data input must be a SFrame")

    # Instantiate the backend model and initialize it with an empty option
    # set; the real options are handed over at train() time below.
    opts = {}
    model_proxy = _turicreate.extensions.factorization_recommender()
    model_proxy.init_options(opts)

    # Absent side data is represented by empty SFrames.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()

    opts = dict(
        user_id=user_id,
        item_id=item_id,
        target=target,
        random_seed=random_seed,
        num_factors=num_factors,
        regularization=regularization,
        linear_regularization=linear_regularization,
        binary_target=binary_target,
        max_iterations=max_iterations,
        sgd_step_size=sgd_step_size,
        solver=solver,
        side_data_factorization=side_data_factorization,
        # has no effect in the c++ end; ignore.
        nmf=nmf,
    )

    if kwargs:
        # Advanced options must be among the model's known option names; if
        # those cannot be retrieved, every extra keyword is rejected.
        try:
            allowed_names = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            allowed_names = set()

        unknown_names = set(kwargs.keys()).difference(allowed_names)
        if unknown_names:
            message = "Bad Keyword Arguments: " + ", ".join(unknown_names)
            raise TypeError(message)

        opts.update(kwargs)

    extra_data = {"nearest_items": _turicreate.SFrame()}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts,
                          extra_data)

    return FactorizationRecommender(model_proxy)
def create(graph, kmin=0, kmax=10, verbose=True):
    """
    Compute the K-core decomposition of the graph.

    The returned model object carries the total number of cores as well as
    the core id of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the k-core decomposition.

    kmin : int, optional
        Minimum core id. Vertices having smaller core id than `kmin` will be
        assigned with core_id = `kmin`.

    kmax : int, optional
        Maximum core id. Vertices having larger core id than `kmax` will be
        assigned with core_id=`kmax`.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : KcoreModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - Alvarez-Hamelin, J.I., et al. (2005) `K-Core Decomposition: A Tool for
      the Visualization of Large Networks
      <http://arxiv.org/abs/cs/0504107>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.kcore.KcoreModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    >>> kc = turicreate.kcore.create(g)

    We can obtain the ``core id`` corresponding to each vertex in the graph
    ``g`` using:

    >>> kcore_id = kc['core_id']  # SFrame

    We can add the new core id field to the original graph g using:

    >>> g.vertices['core_id'] = kc['graph'].vertices['core_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    KcoreModel
    """
    from turicreate._cython.cy_server import QuietProgress

    # Reject anything that is not an SGraph before touching the toolkit.
    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    opts = {
        'graph': graph.__proxy__,
        'kmin': kmin,
        'kmax': kmax,
    }

    # The toolkit call runs under QuietProgress so `verbose` controls output.
    with QuietProgress(verbose):
        toolkit = _tc.extensions._toolkits.graph.kcore
        response = toolkit.create(opts)

    model_proxy = response['model']
    return KcoreModel(model_proxy)
def create(item_data, item_id,
           observation_data=None,
           user_id=None, target=None,
           weights='auto',
           similarity_metrics='auto',
           item_data_transform='auto',
           max_item_neighborhood_size=64, verbose=True):
    """Create a content-based recommender model in which the similarity
    between the items recommended is determined by the content of
    those items rather than learned from user interaction data.

    The similarity score between two items is calculated by first
    computing the similarity between the item data for each column,
    then taking a weighted average of the per-column similarities to
    get the final similarity.  The recommendations are generated
    according to the average similarity of a candidate item to all the
    items in a user's set of rated items.

    Parameters
    ----------

    item_data : SFrame
        An SFrame giving the content of the items to use to learn the
        structure of similar items.  The SFrame must have one column
        that matches the name of the `item_id`; this gives a unique
        identifier that can then be used to make recommendations.  The rest
        of the columns are then used in the distance calculations
        below.

    item_id : string
        The name of the column in item_data (and `observation_data`,
        if given) that represents the item ID.

    observation_data : None (optional)
        An SFrame giving user and item interaction data.  This
        information is stored in the model, and the recommender will
        recommend the items with the most similar content to the
        items that were present and/or highly rated for that user.

    user_id : None (optional)
        If observation_data is given, then this specifies the column
        name corresponding to the user identifier.

    target : None (optional)
        If observation_data is given, then this specifies the column
        name corresponding to the target or rating.

    weights : dict or 'auto' (optional)
        If given, then weights must be a dictionary mapping column names
        present in item_data (other than the `item_id` column) to the
        weight of that column.  Columns of item_data that are not listed
        get weight 0.  The dictionary passed in is not modified.  If
        'auto' is given, then all columns are weighted equally.

    similarity_metrics : 'auto' (optional)
        Accepted for interface compatibility; this function does not
        currently read it.

    item_data_transform : 'auto' or a feature_engineering transformer (optional)
        Transformer applied to `item_data` (via `fit_transform`) before the
        nearest neighbors model is built.  If 'auto', an AutoVectorizer
        that excludes the `item_id` column is used.

    max_item_neighborhood_size : int, 64
        For each item, we hold this many similar items to use when
        aggregating models for predictions.  Decreasing this value
        decreases the memory required by the model and decreases the
        time required to generate recommendations, but it may also
        decrease recommendation accuracy.

    verbose : True or False (optional)
        If set to False, then less information is printed.

    Returns
    -------
    out : ItemContentRecommender

    Raises
    ------
    TypeError
        If `item_data` is not a non-empty SFrame, if `weights` is neither
        'auto' nor a dictionary, or if `item_data_transform` is neither
        'auto' nor a feature_engineering transformer instance.

    ValueError
        If `item_id` is not a column of `item_data`, if `weights` names a
        column that is not a non-id column of `item_data`, or if all
        weights are 0.

    Examples
    --------

      >>> item_data = tc.SFrame({"my_item_id" : range(4),
                                 "data_1" : [ [1, 0], [1, 0], [0, 1], [0.5, 0.5] ],
                                 "data_2" : [ [0, 1], [1, 0], [0, 1], [0.5, 0.5] ] })

      >>> m = tc.recommender.item_content_recommender.create(item_data, "my_item_id")
      >>> m.recommend_from_interactions([0])

      Columns:
              my_item_id      int
              score   float
              rank    int

      Rows: 3

      Data:
      +------------+----------------+------+
      | my_item_id |     score      | rank |
      +------------+----------------+------+
      |     3      | 0.707106769085 |  1   |
      |     1      |      0.5       |  2   |
      |     2      |      0.5       |  3   |
      +------------+----------------+------+
      [3 rows x 3 columns]

      >>> m.recommend_from_interactions([0, 1])

      Columns:
              my_item_id      int
              score   float
              rank    int

      Rows: 2

      Data:
      +------------+----------------+------+
      | my_item_id |     score      | rank |
      +------------+----------------+------+
      |     3      | 0.707106769085 |  1   |
      |     2      |      0.25      |  2   |
      +------------+----------------+------+
      [2 rows x 3 columns]

    """
    from turicreate._cython.cy_server import QuietProgress

    # item_data is correct type
    if not isinstance(item_data, _SFrame) or item_data.num_rows() == 0:
        raise TypeError("`item_data` argument must be a non-empty SFrame giving item data to use for similarities.")

    # Error checking on column names
    item_columns = set(item_data.column_names())

    if item_id not in item_columns:
        raise ValueError("Item column given as 'item_id = %s', but this is not found in `item_data` SFrame."
                         % item_id)

    # Now, get the set ready to test for other argument issues.
    item_columns.remove(item_id)

    if weights != 'auto':
        if not isinstance(weights, dict):
            raise TypeError("`weights` parameter must be 'auto' or a dictionary of column "
                            "names in `item_data` to weight values.")

        # Validate the keys of `weights` (not the item columns themselves)
        # against the non-id columns of item_data.
        bad_columns = [col_name for col_name in weights if col_name not in item_columns]
        if bad_columns:
            raise ValueError("Columns %s given in weights, but these are not found in item_data."
                             % ', '.join(str(col_name) for col_name in bad_columns))

        # Work on a copy so the caller's dictionary is left untouched.
        weights = dict(weights)

        # Now, set any columns not given in the weights column to be
        # weight 0.
        for col_name in item_columns:
            weights.setdefault(col_name, 0)

    ################################################################################
    # Now, check the feature transformer stuff.

    # Pass it through a feature transformer.
    if item_data_transform == 'auto':
        item_data_transform = _turicreate.toolkits._feature_engineering.AutoVectorizer(
            excluded_features=[item_id])

    if not isinstance(item_data_transform, _turicreate.toolkits._feature_engineering.TransformerBase):
        raise TypeError("item_data_transform must be 'auto' or a valid feature_engineering transformer instance.")

    # Transform the input data.
    item_data = item_data_transform.fit_transform(item_data)

    # Translate any string columns to actually work in nearest
    # neighbors by making it a categorical list.  Also translate lists
    # into dicts, and normalize numeric columns.
    gaussian_kernel_metrics = set()

    for c in item_columns:
        if item_data[c].dtype is str:
            item_data[c] = item_data[c].apply(lambda s: {s : 1})
        elif item_data[c].dtype in [float, int]:
            # Standardize; the floor on the std avoids division by zero.
            item_data[c] = (item_data[c] - item_data[c].mean()) / max(item_data[c].std(), 1e-8)
            gaussian_kernel_metrics.add(c)

    if verbose:
        print("Applying transform:")
        print(item_data_transform)

    model_proxy = _turicreate.extensions.item_content_recommender()
    model_proxy.init_options({})

    # The user_id is implicit if none is given.
    if user_id is None:
        user_id = "__implicit_user__"

    # Set the observation data.
    if observation_data is None:
        # In this case, it's important to make this a string type.  If
        # the user column is not given, it may be given at recommend
        # time, in which case it is cast to a string type and cast
        # back if necessary.

        empty_user = _turicreate.SArray([], dtype=str)
        empty_item = _turicreate.SArray([], dtype=item_data[item_id].dtype)

        observation_data = _turicreate.SFrame({user_id : empty_user, item_id : empty_item})

    # Now, work out stuff for the observation_data component
    normalization_factor = 1

    # 1 for the item_id column.
    if item_data.num_columns() >= 3:

        if weights == "auto":
            # TODO: automatically tune this.
            weights = {col_name : 1 for col_name in item_data.column_names() if col_name != item_id}

        # Use the abs value here in case users pass in weights with negative values.
        normalization_factor = sum(abs(v) for v in weights.values())
        if normalization_factor == 0:
            raise ValueError("Weights cannot all be set to 0.")

        distance = [([col_name], ("gaussian_kernel" if col_name in gaussian_kernel_metrics else "cosine"), weight)
                    for col_name, weight in weights.items()]

    else:
        distance = "cosine"

    # Now, build the nearest neighbors model:
    nn = _turicreate.nearest_neighbors.create(item_data, label=item_id, distance=distance, verbose=verbose)
    graph = nn.query(item_data, label=item_id, k=max_item_neighborhood_size, verbose=verbose)
    graph = graph.rename({"query_label" : item_id,
                          "reference_label" : "similar",
                          "distance" : "score"}, inplace=True)

    def process_weights(x):
        # Turn a distance into a score clamped to [-1, 1].
        return max(-1, min(1, 1 - x / normalization_factor))

    graph["score"] = graph["score"].apply(process_weights)

    opts = {'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'similarity_type' : "cosine",
            'max_item_neighborhood_size' : max_item_neighborhood_size}

    user_data = _turicreate.SFrame()
    extra_data = {"nearest_items" : graph}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts, extra_data)

    return ItemContentRecommender(model_proxy)
def create(graph, source_vid, weight_field="", max_distance=1e30, verbose=True):
    """
    Compute the single source shortest path distance from a source vertex to
    every vertex in the graph.

    Because an SGraph is directed, the shortest paths are directed as well.
    To obtain undirected shortest paths, add each edge to the SGraph in both
    directions. The returned model object holds the distance of each vertex
    in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute shortest paths.

    source_vid : vertex ID
        ID of the source vertex.

    weight_field : string, optional
        The edge field representing the edge weights. If empty, unit weights
        are used.

    max_distance : float, optional
        Forwarded unchanged to the underlying toolkit as its ``max_distance``
        option. NOTE(review): presumably an upper bound on the distances
        considered -- confirm against the toolkit.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ShortestPathModel

    Raises
    ------
    TypeError
        If `graph` is not an SGraph.

    References
    ----------
    - `Wikipedia - ShortestPath <http://en.wikipedia.org/wiki/Shortest_path_problem>`_

    Examples
    --------
    Given an :class:`~turicreate.SGraph` ``g``, a
    :class:`~turicreate.shortest_path.ShortestPathModel` is created as
    follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> sp = turicreate.shortest_path.create(g, source_vid=1)

    The shortest path distance from the source vertex to each vertex in ``g``
    is available through:

    >>> sp_sframe = sp['distance']   # SFrame

    The distance field can be attached to the original graph ``g`` with:

    >>> g.vertices['distance_to_1'] = sp['graph'].vertices['distance']

    No join is needed for the step above because ``create()`` preserves the
    vertex ordering.

    To get the actual path from the source vertex to any destination vertex:

    >>> path = sp.get_path(vid=10)

    An auxiliary graph carrying the additional shortest path information for
    each vertex in ``g`` is available through:

    >>> sp_graph = sp['graph']       # SGraph

    See Also
    --------
    ShortestPathModel
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(graph, _SGraph):
        raise TypeError("graph input must be a SGraph object.")

    # Options handed to the native single-source shortest path toolkit.
    toolkit_options = dict(
        source_vid=source_vid,
        weight_field=weight_field,
        max_distance=max_distance,
        graph=graph.__proxy__,
    )

    # QuietProgress controls progress printing according to `verbose`.
    with QuietProgress(verbose):
        toolkit_result = _tc.extensions._toolkits.graph.sssp.create(toolkit_options)

    return ShortestPathModel(toolkit_result["model"])
def create(observation_data,
           user_id="user_id",
           item_id="item_id",
           target=None,
           user_data=None,
           item_data=None,
           nearest_items=None,
           similarity_type="jaccard",
           threshold=0.001,
           only_top_k=64,
           verbose=True,
           target_memory_usage=8 * 1024 * 1024 * 1024,
           **kwargs):
    """
    Create a recommender that uses item-item similarities based on
    users in common.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified via `target`.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information. (NB: This argument is currently ignored by this model.)

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information. (NB: This argument is currently ignored by this model.)

    similarity_type : {'jaccard', 'cosine', 'pearson'}, optional
        Similarity metric to use. See ItemSimilarityRecommender for details.
        Default: 'jaccard'.

    threshold : float, optional
        Predictions ignore items below this similarity value.
        Default: 0.001.

    only_top_k : int, optional
        Number of similar items to store for each item. Default value is
        64.  Decreasing this decreases the amount of memory required for the
        model, but may also decrease the accuracy.

    nearest_items : SFrame, optional
        A set of each item's nearest items. When provided, this overrides
        the similarity computed above.
        See Notes in the documentation for ItemSimilarityRecommender.
        Default: None.

    verbose : bool, optional
        Passed to ``QuietProgress``, which wraps the training call.
        NOTE(review): presumably False suppresses progress output -- confirm.

    target_memory_usage : int, optional
        The target memory usage for the processing buffers and lookup
        tables.  The actual memory usage may be higher or lower than this,
        but decreasing this decreases memory usage at the expense of
        training time, and increasing this can dramatically speed up the
        training time.  Default is 8GB = 8589934592.

    seed_item_set_size : int, optional
        For users that have not yet rated any items, or have only
        rated uniquely occurring items with no similar item info,
        the model seeds the user's item set with the average
        ratings of the seed_item_set_size most popular items when
        making predictions and recommendations.  If set to 0, then
        recommendations based on either popularity (no target present)
        or average item score (target present) are made in this case.

    nearest_neighbors_interaction_proportion_threshold : (advanced) float
        Any item that was rated by more than this proportion of
        users is treated by doing a nearest neighbors search.  For
        frequent items, this is almost always faster, but it is slower
        for infrequent items.  Furthermore, decreasing this causes more
        items to be processed using the nearest neighbor path, which may
        decrease memory requirements.

    degree_approximation_threshold : (advanced) int, optional
        Users with more than this many item interactions may be
        approximated.  The approximation is done by a combination of
        sampling and choosing the interactions likely to have the most
        impact on the model.  Increasing this can increase the training time
        and may or may not increase the quality of the model.  Default = 4096.

    max_data_passes : (advanced) int, optional
        The maximum number of passes through the data allowed in
        building the similarity lookup tables.  If it is not possible to
        build the recommender in this many passes (calculated before
        that stage of training), then additional approximations are
        applied; namely decreasing degree_approximation_threshold.  If
        this is not possible, an error is raised.  To decrease the
        number of passes required, increase target_memory_usage or
        decrease nearest_neighbors_interaction_proportion_threshold.
        Default = 1024.

    Returns
    -------
    out : ItemSimilarityRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame, or if a keyword argument in
        `kwargs` is not among the option names reported by
        ``_get_default_options()``.

    Examples
    --------
    Given basic user-item observation data, an
    :class:`~turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender` is created:

    >>> sf = turicreate.SFrame({'user_id': ['0', '0', '0', '1', '1', '2', '2', '2'],
    ...                       'item_id': ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd']})
    >>> m = turicreate.item_similarity_recommender.create(sf)
    >>> recs = m.recommend()

    When a target is available, one can specify the desired similarity. For
    example we may choose to use a cosine similarity, and use it to make
    predictions or recommendations.

    >>> sf2 = turicreate.SFrame({'user_id': ['0', '0', '0', '1', '1', '2', '2', '2'],
    ...                        'item_id': ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd'],
    ...                        'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m2 = turicreate.item_similarity_recommender.create(sf2, target="rating",
    ...                                                  similarity_type='cosine')
    >>> m2.predict(sf)
    >>> m2.recommend()

    Notes
    -----
    Currently, :class:`~turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender`
    does not leverage the use of side features `user_data` and `item_data`.

    **Incorporating pre-defined similar items**

    For item similarity models, one may choose to provide user-specified
    nearest neighbors graph using the keyword argument `nearest_items`. This is
    an SFrame containing, for each item, the nearest items and the similarity
    score between them. If provided, these item similarity scores are used for
    recommendations. The SFrame must contain (at least) three columns:

    * 'item_id': a column with the same name as that provided to the `item_id`
      argument (which defaults to the string "item_id").
    * 'similar': a column containing the nearest items for the given item id.
      This should have the same type as the `item_id` column.
    * 'score': a numeric score measuring how similar these two items are.

    For example, suppose you first create an ItemSimilarityRecommender and use
    :class:`~turicreate.recommender.ItemSimilarityRecommender.get_similar_items`:

    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"]})
    >>> m = turicreate.item_similarity_recommender.create(sf)
    >>> nn = m.get_similar_items()
    >>> m2 = turicreate.item_similarity_recommender.create(sf, nearest_items=nn)

    With the above code, the item similarities computed for model `m` can be
    used to create a new recommender object, `m2`. Note that we could have
    created `nn` from some other means, but now use `m2` to make
    recommendations via `m2.recommend()`.

    See Also
    --------
    ItemSimilarityRecommender

    """
    from turicreate._cython.cy_server import QuietProgress

    if not (isinstance(observation_data, _SFrame)):
        raise TypeError("observation_data input must be a SFrame")

    model_proxy = _turicreate.extensions.item_similarity()
    model_proxy.init_options({})

    # The native train() call always receives SFrames, so substitute empty
    # ones for the optional inputs that were not supplied.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()
    if nearest_items is None:
        nearest_items = _turicreate.SFrame()

    opts = {
        "user_id": user_id,
        "item_id": item_id,
        "target": target,
        "similarity_type": similarity_type,
        "threshold": threshold,
        "target_memory_usage": float(target_memory_usage),
        "max_item_neighborhood_size": only_top_k,
    }

    if kwargs:
        # Extra keyword arguments must name a known option. If the option
        # names cannot be retrieved, every extra argument is rejected.
        try:
            possible_args = set(_get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            # Sorted so that the error message is deterministic.
            raise TypeError("Bad Keyword Arguments: " + ", ".join(sorted(bad_arguments)))

        opts.update(kwargs)

    extra_data = {"nearest_items": nearest_items}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts, extra_data)

    return ItemSimilarityRecommender(model_proxy)
def create(graph, verbose=True):
    """
    Compute the number of weakly connected components in the graph.

    The returned model object holds the total number of weakly connected
    components as well as the component ID of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the connected components.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ConnectedComponentsModel

    Raises
    ------
    TypeError
        If `graph` is not an SGraph.

    References
    ----------
    - `Mathworld Wolfram - Weakly Connected Component
      <http://mathworld.wolfram.com/WeaklyConnectedComponent.html>`_

    Examples
    --------
    Given an :class:`~turicreate.SGraph` ``g``, a
    :class:`~turicreate.connected_components.ConnectedComponentsModel` is
    created as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> cc = turicreate.connected_components.create(g)
    >>> cc.summary()

    The ``component id`` of each vertex in ``g`` is available through:

    >>> cc_ids = cc['component_id']  # SFrame

    A graph carrying the ``component id`` of each vertex is available
    through:

    >>> cc_graph = cc['graph']      # SGraph

    The component_id field can be attached to the original graph ``g`` with:

    >>> g.vertices['component_id'] = cc['graph'].vertices['component_id']

    No join is needed for the step above because ``create()`` preserves the
    vertex ordering.

    See Also
    --------
    ConnectedComponentsModel
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(graph, _SGraph):
        raise TypeError('"graph" input must be a SGraph object.')

    # The toolkit only needs the graph's native proxy object.
    toolkit_options = dict(graph=graph.__proxy__)

    # QuietProgress controls progress printing according to `verbose`.
    with QuietProgress(verbose):
        toolkit_result = _tc.extensions._toolkits.graph.connected_components.create(
            toolkit_options)

    return ConnectedComponentsModel(toolkit_result["model"])
def create(graph, verbose=True):
    """
    Compute the in degree, out degree and total degree of each vertex.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute degree counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : DegreeCountingModel

    Raises
    ------
    TypeError
        If `graph` is not an SGraph.

    Examples
    --------
    Given an :class:`~turicreate.SGraph` ``g``, a
    :class:`~turicreate.degree_counting.DegreeCountingModel` is created as
    follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/web-Google.txt.gz',
    ...                         format='snap')
    >>> m = turicreate.degree_counting.create(g)
    >>> g2 = m['graph']
    >>> g2
    SGraph({'num_edges': 5105039, 'num_vertices': 875713})
    Vertex Fields:['__id', 'in_degree', 'out_degree', 'total_degree']
    Edge Fields:['__src_id', '__dst_id']

    >>> g2.vertices.head(5)
    Columns:
        __id        int
        in_degree   int
        out_degree  int
        total_degree        int
    <BLANKLINE>
    Rows: 5
    <BLANKLINE>
    Data:
    +------+-----------+------------+--------------+
    | __id | in_degree | out_degree | total_degree |
    +------+-----------+------------+--------------+
    |  5   |     15    |     7      |      22      |
    |  7   |     3     |     16     |      19      |
    |  8   |     1     |     2      |      3       |
    |  10  |     13    |     11     |      24      |
    |  27  |     19    |     16     |      35      |
    +------+-----------+------------+--------------+

    See Also
    --------
    DegreeCountingModel
    """
    from turicreate._cython.cy_server import QuietProgress

    if not isinstance(graph, _SGraph):
        raise TypeError('"graph" input must be a SGraph object.')

    # The toolkit only needs the graph's native proxy object.
    toolkit_options = dict(graph=graph.__proxy__)

    # QuietProgress controls progress printing according to `verbose`.
    with QuietProgress(verbose):
        toolkit_result = _tc.extensions._toolkits.graph.degree_count.create(
            toolkit_options)

    return DegreeCountingModel(toolkit_result['model'])
def create(observation_data,
           user_id='user_id', item_id='item_id', target=None,
           user_data=None, item_data=None,
           random_seed=0,
           verbose=True):
    """
    Create a model that makes recommendations using item popularity. When no
    target column is provided, the popularity is determined by the number of
    observations involving each item. When a target is provided, popularity
    is computed using the item's mean target value. When the target column
    contains ratings, for example, the model computes the mean rating for
    each item and uses this to rank items for recommendations.

    Parameters
    ----------
    observation_data : SFrame
        The dataset to use for training the model. It must contain a column of
        user ids and a column of item ids. Each row represents an observed
        interaction between the user and the item.  The (user, item) pairs
        are stored with the model so that they can later be excluded from
        recommendations if desired. It can optionally contain a target ratings
        column. All other columns are interpreted by the underlying model as
        side features for the observations.

        The user id and item id columns must be of type 'int' or 'str'. The
        target column must be of type 'int' or 'float'.

    user_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        user id.

    item_id : string, optional
        The name of the column in `observation_data` that corresponds to the
        item id.

    target : string, optional
        The `observation_data` can optionally contain a column of scores
        representing ratings given by the users. If present, the name of this
        column may be specified via `target`.

    user_data : SFrame, optional
        Side information for the users.  This SFrame must have a column with
        the same name as what is specified by the `user_id` input parameter.
        `user_data` can provide any amount of additional user-specific
        information.

    item_data : SFrame, optional
        Side information for the items.  This SFrame must have a column with
        the same name as what is specified by the `item_id` input parameter.
        `item_data` can provide any amount of additional item-specific
        information.

    random_seed : int, optional
        Forwarded to the underlying model as its ``random_seed`` option.

    verbose : bool, optional
        Enables verbose output.

    Returns
    -------
    out : PopularityRecommender

    Raises
    ------
    TypeError
        If `observation_data` is not an SFrame.

    Examples
    --------
    >>> sf = turicreate.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
    ...                       'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
    ...                       'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
    >>> m = turicreate.popularity_recommender.create(sf, target='rating')

    See Also
    --------
    PopularityRecommender
    """
    from turicreate._cython.cy_server import QuietProgress

    if not (isinstance(observation_data, _SFrame)):
        raise TypeError('observation_data input must be a SFrame')

    model_proxy = _turicreate.extensions.popularity()
    model_proxy.init_options({})

    # The native train() call always receives SFrames, so substitute empty
    # ones for the optional inputs that were not supplied.
    if user_data is None:
        user_data = _turicreate.SFrame()
    if item_data is None:
        item_data = _turicreate.SFrame()

    # Honor the caller's seed instead of a hard-coded value, so that the
    # `random_seed` argument actually reaches the model.
    opts = {'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'random_seed': random_seed}

    # This model takes no precomputed neighbors; pass an empty SFrame.
    extra_data = {"nearest_items" : _turicreate.SFrame()}

    with QuietProgress(verbose):
        model_proxy.train(observation_data, user_data, item_data, opts, extra_data)

    return PopularityRecommender(model_proxy)
def create(dataset, label=None, features=None, distance=None, method='auto', verbose=True, **kwargs): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` argument is specified as `auto`, the type of model is chosen automatically based on the type of data in `dataset`. .. warning:: The 'dot_product' distance is deprecated and will be removed in future versions of Turi Create. Please use 'transformed_dot_product' distance instead, although note that this is more than a name change; it is a *different* transformation of the dot product of two vectors. Please see the distances module documentation for more details. Parameters ---------- dataset : SFrame Reference data. If the features for each observation are numeric, they may be in separate columns of 'dataset' or a single column with lists of values. The features may also be in the form of a column of sparse vectors (i.e. dictionaries), with string keys and numeric values. label : string, optional Name of the SFrame column with row labels. If 'label' is not specified, row numbers are used to identify reference dataset rows when the model is queried. features : list[string], optional Name of the columns with features to use in computing distances between observations and the query points. 'None' (the default) indicates that all columns except the label should be used as features. Each column can be one of the following types: - *Numeric*: values of numeric type integer or float. - *Array*: list of numeric (integer or float) values. Each list element is treated as a separate variable in the model. - *Dictionary*: key-value pairs with numeric (integer or float) values. Each key indicates a separate variable in the model. - *List*: list of integer or string values. Each element is treated as a separate variable in the model. - *String*: string values. 
Please note: if a composite distance is also specified, this parameter is ignored. distance : string, function, or list[list], optional Function to measure the distance between any two input data rows. This may be one of three types: - *String*: the name of a standard distance function. One of 'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein', 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated), or 'transformed_dot_product'. - *Function*: a function handle from the :mod:`~turicreate.toolkits.distances` module. - *Composite distance*: the weighted sum of several standard distance functions applied to various features. This is specified as a list of distance components, each of which is itself a list containing three items: 1. list or tuple of feature names (strings) 2. standard distance name (string) 3. scaling factor (int or float) For more information about Turi Create distance functions, please see the :py:mod:`~turicreate.toolkits.distances` module. If 'distance' is left unspecified or set to 'auto', a composite distance is constructed automatically based on feature types. method : {'auto', 'ball_tree', 'brute_force', 'lsh'}, optional Method for computing nearest neighbors. The options are: - *auto* (default): the method is chosen automatically, based on the type of data and the distance. If the distance is 'manhattan' or 'euclidean' and the features are numeric or vectors of numeric values, then the 'ball_tree' method is used. Otherwise, the 'brute_force' method is used. - *ball_tree*: use a tree structure to find the k-closest neighbors to each query point. The ball tree model is slower to construct than the brute force model, but queries are faster than linear time. This method is not applicable for the cosine and dot product distances. See `Liu, et al (2004) <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_ for implementation details. 
- *brute_force*: compute the distance from a query point to all reference observations. There is no computation time for model creation with the brute force method (although the reference data is held in the model, but each query takes linear time. - *lsh*: use Locality Sensitive Hashing (LSH) to find approximate nearest neighbors efficiently. The LSH model supports 'euclidean', 'squared_euclidean', 'manhattan', 'cosine', 'jaccard', 'dot_product' (deprecated), and 'transformed_dot_product' distances. Two options are provided for LSH -- ``num_tables`` and ``num_projections_per_table``. See the notes below for details. verbose: bool, optional If True, print progress updates and model details. **kwargs : optional Options for the distance function and query method. - *leaf_size*: for the ball tree method, the number of points in each leaf of the tree. The default is to use the max of 1,000 and n/(2^11), which ensures a maximum tree depth of 12. - *num_tables*: For the LSH method, the number of hash tables constructed. The default value is 20. We recommend choosing values from 10 to 30. - *num_projections_per_table*: For the LSH method, the number of projections/hash functions for each hash table. The default value is 4 for 'jaccard' distance, 16 for 'cosine' distance and 8 for other distances. We recommend using number 2 ~ 6 for 'jaccard' distance, 8 ~ 20 for 'cosine' distance and 4 ~ 12 for other distances. Returns ------- out : NearestNeighborsModel A structure for efficiently computing the nearest neighbors in 'dataset' of new query points. See Also -------- NearestNeighborsModel.query, turicreate.toolkits.distances Notes ----- - Missing data is not allowed in the 'dataset' provided to this function. Please use the :func:`turicreate.SFrame.fillna` and :func:`turicreate.SFrame.dropna` utilities to handle missing data before creating a nearest neighbors model. - Missing keys in sparse vectors are assumed to have value 0. 
- The `composite_params` parameter was removed as of Turi Create version 1.5. The `distance` parameter now accepts either standard or composite distances. Please see the :mod:`~turicreate.toolkits.distances` module documentation for more information on composite distances. - If the features should be weighted equally in the distance calculations but are measured on different scales, it is important to standardize the features. One way to do this is to subtract the mean of each column and divide by the standard deviation. **Locality Sensitive Hashing (LSH)** There are several efficient nearest neighbors search algorithms that work well for data with low dimensions :math:`d` (approximately 50). However, most of the solutions suffer from either space or query time that is exponential in :math:`d`. For large :math:`d`, they often provide little, if any, improvement over the 'brute_force' method. This is a well-known consequence of the phenomenon called `The Curse of Dimensionality`. `Locality Sensitive Hashing (LSH) <https://en.wikipedia.org/wiki/Locality-sensitive_hashing>`_ is an approach that is designed to efficiently solve the *approximate* nearest neighbor search problem for high dimensional data. The key idea of LSH is to hash the data points using several hash functions, so that the probability of collision is much higher for data points which are close to each other than those which are far apart. An LSH family is a family of functions :math:`h` which map points from the metric space to a bucket, so that - if :math:`d(p, q) \\leq R`, then :math:`h(p) = h(q)` with at least probability :math:`p_1`. - if :math:`d(p, q) \\geq cR`, then :math:`h(p) = h(q)` with probability at most :math:`p_2`. LSH for efficient approximate nearest neighbor search: - We define a new family of hash functions :math:`g`, where each function :math:`g` is obtained by concatenating :math:`k` functions :math:`h_1, ..., h_k`, i.e., :math:`g(p)=[h_1(p),...,h_k(p)]`. 
The algorithm constructs :math:`L` hash tables, each of which corresponds to a different randomly chosen hash function :math:`g`. There are :math:`k \\cdot L` hash functions used in total. - In the preprocessing step, we hash all :math:`n` reference points into each of the :math:`L` hash tables. - Given a query point :math:`q`, the algorithm iterates over the :math:`L` hash functions :math:`g`. For each :math:`g` considered, it retrieves the data points that are hashed into the same bucket as q. These data points from all the :math:`L` hash tables are considered as candidates that are then re-ranked by their real distances with the query data. **Note** that the number of tables :math:`L` and the number of hash functions per table :math:`k` are two main parameters. They can be set using the options ``num_tables`` and ``num_projections_per_table`` respectively. Hash functions for different distances: - `euclidean` and `squared_euclidean`: :math:`h(q) = \\lfloor \\frac{a \\cdot q + b}{w} \\rfloor` where :math:`a` is a vector, of which the elements are independently sampled from normal distribution, and :math:`b` is a number uniformly sampled from :math:`[0, r]`. :math:`r` is a parameter for the bucket width. We set :math:`r` using the average all-pair `euclidean` distances from a small randomly sampled subset of the reference data. - `manhattan`: The hash function of `manhattan` is similar with that of `euclidean`. The only difference is that the elements of `a` are sampled from Cauchy distribution, instead of normal distribution. - `cosine`: Random Projection is designed to approximate the cosine distance between vectors. The hash function is :math:`h(q) = sgn(a \\cdot q)`, where :math:`a` is randomly sampled normal unit vector. - `jaccard`: We use a recently proposed method one permutation hashing by Shrivastava and Li. See the paper `[Shrivastava and Li, UAI 2014] <http://www.auai.org/uai2014/proceedings/individuals/225.pdf>`_ for details. 
- `dot_product`: The reference data points are first transformed to fixed-norm vectors, and then the minimum `dot_product` distance search problem can be solved via finding the reference data with smallest `cosine` distances. See the paper `[Neyshabur and Srebro, ICML 2015] <http://proceedings.mlr.press/v37/neyshabur15.html>`_ for details. References ---------- - `Wikipedia - nearest neighbor search <http://en.wikipedia.org/wiki/Nearest_neighbor_search>`_ - `Wikipedia - ball tree <http://en.wikipedia.org/wiki/Ball_tree>`_ - Ball tree implementation: Liu, T., et al. (2004) `An Investigation of Practical Approximate Nearest Neighbor Algorithms <http://papers.nips.cc/paper/2666-an-investigation-of-p ractical-approximat e-nearest-neighbor-algorithms>`_. Advances in Neural Information Processing Systems pp. 825-832. - `Wikipedia - Jaccard distance <http://en.wikipedia.org/wiki/Jaccard_index>`_ - Weighted Jaccard distance: Chierichetti, F., et al. (2010) `Finding the Jaccard Median <http://theory.stanford.edu/~sergei/papers/soda10-jaccard.pdf>`_. Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms. Society for Industrial and Applied Mathematics. - `Wikipedia - Cosine distance <http://en.wikipedia.org/wiki/Cosine_similarity>`_ - `Wikipedia - Levenshtein distance <http://en.wikipedia.org/wiki/Levenshtein_distance>`_ - Locality Sensitive Hashing : Chapter 3 of the book `Mining Massive Datasets <http://infolab.stanford.edu/~ullman/mmds/ch3.pdf>`_. Examples -------- Construct a nearest neighbors model with automatically determined method and distance: >>> sf = turicreate.SFrame({'X1': [0.98, 0.62, 0.11], ... 'X2': [0.69, 0.58, 0.36], ... 'str_feature': ['cat', 'dog', 'fossa']}) >>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2']) For datasets with a large number of rows and up to about 100 variables, the ball tree method often leads to much faster queries. 
>>> model = turicreate.nearest_neighbors.create(sf, features=['X1', 'X2'], ... method='ball_tree') Often the final determination of a neighbor is based on several distance computations over different sets of features. Each part of this composite distance may have a different relative weight. >>> my_dist = [[['X1', 'X2'], 'euclidean', 2.], ... [['str_feature'], 'levenshtein', 3.]] ... >>> model = turicreate.nearest_neighbors.create(sf, distance=my_dist) """ ## Validate the 'dataset' input _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") ## Basic validation of the features input if features is not None and not isinstance(features, list): raise TypeError("If specified, input 'features' must be a list of " + "strings.") ## Clean the method options and create the options dictionary allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table'] _method_options = {} for k, v in kwargs.items(): if k in allowed_kwargs: _method_options[k] = v else: raise _ToolkitError( "'{}' is not a valid keyword argument".format(k) + " for the nearest neighbors model. Please " + "check for capitalization and other typos.") ## Exclude inappropriate combinations of method an distance if method == 'ball_tree' and ( distance == 'cosine' or distance == _turicreate.distances.cosine or distance == 'dot_product' or distance == _turicreate.distances.dot_product or distance == 'transformed_dot_product' or distance == _turicreate.distances.transformed_dot_product): raise TypeError( "The ball tree method does not work with 'cosine' " + "'dot_product', or 'transformed_dot_product' distance." 
+ "Please use the 'brute_force' method for these distances.") if method == 'lsh' and ('num_projections_per_table' not in _method_options): if distance == 'jaccard' or distance == _turicreate.distances.jaccard: _method_options['num_projections_per_table'] = 4 elif distance == 'cosine' or distance == _turicreate.distances.cosine: _method_options['num_projections_per_table'] = 16 else: _method_options['num_projections_per_table'] = 8 ## Initial validation and processing of the label if label is None: _label = _robust_column_name('__id', dataset.column_names()) _dataset = dataset.add_row_number(_label) else: _label = label _dataset = _copy.copy(dataset) col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()} _validate_row_label(_label, col_type_map) ref_labels = _dataset[_label] ## Determine the internal list of available feature names (may still include # the row label name). if features is None: _features = _dataset.column_names() else: _features = _copy.deepcopy(features) ## Check if there's only one feature and it's the same as the row label. # This would also be trapped by the composite distance validation, but the # error message is not very informative for the user. free_features = set(_features).difference([_label]) if len(free_features) < 1: raise _ToolkitError("The only available feature is the same as the " + "row label column. Please specify features " + "that are not also row labels.") ### Validate and preprocess the distance function ### --------------------------------------------- # - The form of the 'distance' controls how we interact with the 'features' # parameter as well. # - At this point, the row label 'label' may still be in the list(s) of # features. ## Convert any distance function input into a single composite distance. # distance is already a composite distance if isinstance(distance, list): distance = _copy.deepcopy(distance) # distance is a single name (except 'auto') or function handle. 
elif (hasattr(distance, '__call__') or (isinstance(distance, str) and not distance == 'auto')): distance = [[_features, distance, 1]] # distance is unspecified and needs to be constructed. elif distance is None or distance == 'auto': sample = _dataset.head() distance = _construct_auto_distance(_features, _dataset.column_names(), _dataset.column_types(), sample) else: raise TypeError("Input 'distance' not understood. The 'distance' " " argument must be a string, function handle, or " + "composite distance.") ## Basic composite distance validation, remove the row label from all # feature lists, and convert string distance names into distance functions. distance = _scrub_composite_distance_features(distance, [_label]) distance = _convert_distance_names_to_functions(distance) _validate_composite_distance(distance) ## Raise an error if any distances are used with non-lists list_features_to_check = [] sparse_distances = [ 'jaccard', 'weighted_jaccard', 'cosine', 'dot_product', 'transformed_dot_product' ] sparse_distances = [ getattr(_turicreate.distances, k) for k in sparse_distances ] for d in distance: feature_names, dist, _ = d list_features = [f for f in feature_names if _dataset[f].dtype == list] for f in list_features: if dist in sparse_distances: list_features_to_check.append(f) else: raise TypeError( "The chosen distance cannot currently be used " + "on list-typed columns.") for f in list_features_to_check: only_str_lists = _validate_lists(_dataset[f], [str]) if not only_str_lists: raise TypeError("Distances for sparse data, such as jaccard " + "and weighted_jaccard, can only be used on " + "lists containing only strings. 
Please modify " + "any list features accordingly before creating " + "the nearest neighbors model.") ## Raise an error if any component has string features are in single columns for d in distance: feature_names, dist, _ = d if (len(feature_names) > 1) and (dist == _turicreate.distances.levenshtein): raise ValueError( "Levenshtein distance cannot be used with multiple " + "columns. Please concatenate strings into a single " + "column before creating the nearest neighbors model.") ## Get the union of feature names and make a clean dataset. clean_features = _get_composite_distance_features(distance) sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features) ## Decide which method to use ## - If more than one distance component (specified either directly or # generated automatically because distance set to 'auto'), then do brute # force. if len(distance) > 1: _method = 'brute_force' if method != 'brute_force' and verbose is True: print("Defaulting to brute force instead of ball tree because " +\ "there are multiple distance components.") else: if method == 'auto': # get the total number of variables. Assume the number of elements in # array type columns does not change num_variables = sum([ len(x) if hasattr(x, '__iter__') else 1 for x in _six.itervalues(sf_clean[0]) ]) # flag if all the features in the single composite are of numeric # type. 
numeric_type_flag = all([ x in [int, float, list, array.array] for x in sf_clean.column_types() ]) ## Conditions necessary for ball tree to work and be worth it if ((distance[0][1] in [ 'euclidean', 'manhattan', _turicreate.distances.euclidean, _turicreate.distances.manhattan ]) and numeric_type_flag is True and num_variables <= 200): _method = 'ball_tree' else: _method = 'brute_force' else: _method = method ## Pick the right model name for the method if _method == 'ball_tree': model_name = 'nearest_neighbors_ball_tree' elif _method == 'brute_force': model_name = 'nearest_neighbors_brute_force' elif _method == 'lsh': model_name = 'nearest_neighbors_lsh' else: raise ValueError( "Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'.") ## Package the model options opts = {} opts.update(_method_options) opts.update({ 'model_name': model_name, 'ref_labels': ref_labels, 'label': label, 'sf_features': sf_clean, 'composite_params': distance }) ## Construct the nearest neighbors model with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.train(opts) model_proxy = result['model'] model = NearestNeighborsModel(model_proxy) return model
def similarity_graph(self, k=5, radius=None, include_self_edges=False,
                     output_type='SGraph', verbose=True):
    """
    Build the similarity graph over the model's own reference dataset.

    The reference data is already stored in the model, so no second dataset
    is required. Conceptually this is very close to calling `query` with the
    reference set as the query set, but this method is optimized for that
    purpose, is syntactically simpler, and leaves out self-edges by default.

    Parameters
    ----------
    k : int, optional
        Maximum number of neighbors to return for each point in the
        dataset. Must be a positive integer. Passing ``None`` switches the
        constraint off, so that every neighbor within ``radius`` of a given
        point is returned.

    radius : float, optional
        Only neighbors within this distance of a given point are returned.
        Must be non-negative. The default is ``None``, in which case the
        ``k`` nearest neighbors are returned for each point regardless of
        distance.

    include_self_edges : bool, optional
        For most distance functions, each point in the model's reference
        dataset is its own nearest neighbor. When this parameter is False,
        that result is ignored and the nearest neighbors are returned
        *excluding* the point itself.

    output_type : {'SGraph', 'SFrame'}, optional
        By default the results come back as an SGraph in which every point
        of the reference dataset is a vertex and an edge A -> B means that
        vertex B is a nearest neighbor of vertex A. With 'SFrame', the
        output has the same form as the results of the 'query' method: an
        SFrame with columns for the query label (here the query data is the
        reference data), the reference label, the distance between the two
        points, and the rank of the neighbor. Any value other than 'SFrame'
        produces an SGraph.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : SFrame or SGraph
        The type of the output object depends on the 'output_type'
        parameter. See the parameter description for more detail.

    Raises
    ------
    ValueError
        If ``k`` is neither ``None`` nor an integer larger than 0, or if
        ``radius`` is neither ``None`` nor a non-negative int or float.

    Notes
    -----
    - If both ``k`` and ``radius`` are ``None``, each data point is matched
      to the entire dataset. For a reference dataset with :math:`n` rows,
      the output is an SFrame with :math:`n^2` rows (or an SGraph with
      :math:`n^2` edges).

    - For models created with the 'lsh' method, the output similarity graph
      may have fewer vertices than there are data points in the original
      reference set. Because LSH is an approximate method, a query point
      may have fewer than 'k' neighbors. If LSH returns no neighbors at all
      for a query and self-edges are excluded, the query point is omitted
      from the results.

    Examples
    --------
    First construct an SFrame and create a nearest neighbors model:

    >>> sf = turicreate.SFrame({'x1': [0.98, 0.62, 0.11],
    ...                         'x2': [0.69, 0.58, 0.36]})
    ...
    >>> model = turicreate.nearest_neighbors.create(sf, distance='euclidean')

    Unlike the ``query`` method, there is no need for a second dataset with
    ``similarity_graph``.

    >>> g = model.similarity_graph(k=1)  # an SGraph
    >>> g.edges
    +----------+----------+----------------+------+
    | __src_id | __dst_id |    distance    | rank |
    +----------+----------+----------------+------+
    |    0     |    1     | 0.376430604494 |  1   |
    |    2     |    1     | 0.55542776308  |  1   |
    |    1     |    0     | 0.376430604494 |  1   |
    +----------+----------+----------------+------+
    """

    ## Check 'k' first, then 'radius'. A value of None skips its checks.
    if k is not None and not isinstance(k, int):
        raise ValueError("Input 'k' must be an integer.")
    if k is not None and k <= 0:
        raise ValueError("Input 'k' must be larger than 0.")

    if radius is not None and not isinstance(radius, (int, float)):
        raise ValueError("Input 'radius' must be an integer or float.")
    if radius is not None and radius < 0:
        raise ValueError("Input 'radius' must be non-negative.")

    ## 'None' is passed on as a special negative value for each constraint.
    k_arg = -1 if k is None else k
    radius_arg = -1.0 if radius is None else radius

    request = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'k': k_arg,
        'radius': radius_arg,
        'include_self_edges': include_self_edges,
    }

    with QuietProgress(verbose):
        response = _turicreate.extensions._nearest_neighbors.similarity_graph(
            request)

    neighbors = response['neighbors']

    ## Everything except an explicit 'SFrame' request is wrapped in an SGraph.
    if output_type != "SFrame":
        return _SGraph(edges=neighbors,
                       src_field='query_label',
                       dst_field='reference_label')
    return neighbors
def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Infer label probabilities for unlabeled vertices with the "label
    propagation" algorithm.

    Given a weighted graph in which a subset of the vertices carries an
    observed class label, the algorithm repeatedly updates the label
    probability of each vertex as a weighted sum of its own label
    probability and that of its neighboring vertices, until convergence.

    See :class:`turicreate.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Notes: label propagation works well with a small number of labels, i.e.
    binary labels, or fewer than 1000 classes. The toolkit will throw an
    error if the number of classes exceeds the maximum value (1000).

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field: str
        Vertex field storing the initial vertex labels. The column must be
        integer typed, with values in [0, num_classes). None values indicate
        unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    max_iterations: int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or the convergence threshold is reached.

    weight_field: str, optional
        Name of the field holding the edge weight. If empty, all edges are
        assumed to have unit weight.

    self_weight: float, optional
        The weight for self edge.

    undirected: bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the computation should
        run faster and use less memory.

    _distributed : distributed environment, internal
        Not forwarded to the toolkit call made by this function.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not integer typed.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled data
      with label propagation <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.label_propagation.LabelPropagationModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = turicreate.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    from turicreate._cython.cy_server import QuietProgress

    ## Both field names have to be strings; the label field is checked first.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    ## Collect the toolkit arguments. The graph is handed over as its proxy.
    toolkit_args = {
        'label_field': label_field,
        'threshold': threshold,
        'weight_field': weight_field,
        'self_weight': self_weight,
        'undirected': undirected,
        'max_iterations': max_iterations,
        'single_precision': _single_precision,
        'graph': graph.__proxy__,
    }

    with QuietProgress(verbose):
        response = _tc.extensions._toolkits.graph.label_propagation.create(
            toolkit_args)

    return LabelPropagationModel(response['model'])