def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Run the "label propagation" algorithm on a weighted graph.

    Starting from the class labels observed on a subset of the vertices,
    infer the label probability of the unobserved vertices. The algorithm
    repeatedly replaces the label probability of each vertex with a weighted
    sum of its own label probability and that of its neighbors, until the
    values converge. See
    :class:`graphlab.label_propagation.LabelPropagationModel` for the details
    of the algorithm.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The column must be
        integer typed and its values must lie in [0, num_classes). None
        values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm (the sum
        of squared values) of the delta of each vertex's label probability
        vector.

    weight_field : str, optional
        Name of the field storing the edge weight. If empty, all edges are
        assumed to have unit weight.

    self_weight : float, optional
        The weight for the self edge.

    undirected : bool, optional
        If True, treat each edge as undirected and propagate labels in both
        directions.

    max_iterations : int, optional
        The maximum number of iterations to run. Default is unlimited. If
        set, the algorithm terminates when either max_iterations or the
        convergence threshold is reached.

    _single_precision : bool, optional
        If True, run label propagation in single precision. The resulting
        probability values may be less accurate, but the computation should
        run faster and use less memory.

    _distributed : distributed environment, internal
        Forwarded as ``env`` to the distributed runner when a distributed
        execution environment is active.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not integer typed.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled
      data with label propagation
      <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create a
    :class:`~graphlab.label_propagation.LabelPropagationModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.label_propagation.create')

    # Both field names must be plain strings before they are used as keys.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype()
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    opts = dict(label_field=label_field,
                threshold=threshold,
                weight_field=weight_field,
                self_weight=self_weight,
                undirected=undirected,
                max_iterations=max_iterations,
                single_precision=_single_precision,
                graph=graph.__proxy__)

    # An active distributed environment selects the distributed toolkit;
    # otherwise the local toolkit runs and returns the model under 'model'.
    if _get_distributed_execution_environment() is not None:
        model = _distributed_run('distributed_labelprop', opts,
                                 env=_distributed, verbose=verbose)
    else:
        model = _main.run('label_propagation', opts, verbose)['model']

    return LabelPropagationModel(model)
def create(dataset, target, model_name, features=None,
           validation_set='auto', verbose=True, distributed='auto',
           **kwargs):
    """
    Create a :class:`~graphlab.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. It is normally not called directly;
    call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model.

    features : list[string], optional
        Names of the columns to use as features. If None, every column of
        ``dataset`` except ``target`` is used.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set. By default this
        argument is set to 'auto': when the training data has at least 100
        rows, a validation set is split off from it with
        ``dataset.random_split(.95)``; with fewer rows no validation set is
        used. If validation_set is set to None, no additional metrics are
        computed.

    verbose : boolean
        Whether to print out messages during training.

    distributed : env
        The distributed environment.

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before being forwarded to the toolkit.

    Returns
    -------
    out : SupervisedLearningModel
        The trained model, wrapped together with ``model_name``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto', or is neither
        None nor an SFrame; if ``features`` is a single string or is not
        iterable; or if any feature name is not of type str.
    """
    _raise_error_if_not_sframe(dataset, "training dataset")

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set != 'auto':
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95)
        else:
            validation_set = None

    # Target
    target_sframe = _toolkits_select_columns(dataset, [target])

    # Features
    if features is None:
        features = dataset.column_names()
        features.remove(target)
    # A bare string has no __iter__ on Python 2 but does on Python 3; reject
    # it explicitly so both interpreters refuse a single feature name.
    if isinstance(features, str) or not hasattr(features, '__iter__'):
        raise TypeError("Input 'features' must be a list.")
    for feature in features:
        if not isinstance(feature, str):
            # Report the offending entry itself; the one-element tuple keeps
            # the formatting safe when that entry is itself a tuple.
            raise TypeError(
                "Invalid feature %s: Feature names must be of type str"
                % (feature,))
    features_sframe = _toolkits_select_columns(dataset, features)

    # Option names are case-insensitive: normalize them to lower case.
    options = dict((k.lower(), v) for k, v in kwargs.items())
    options.update({'target': target_sframe,
                    'features': features_sframe,
                    'model_name': model_name})

    if validation_set is not None:
        if not isinstance(validation_set, _graphlab.SFrame):
            raise TypeError("validation_set must be either 'auto' or an "
                            "SFrame matching the training data.")

        # Attempt to append the two datasets together to check schema
        validation_set.head().append(dataset.head())

        options.update({
            'features_validation': _toolkits_select_columns(validation_set,
                                                            features),
            'target_validation': _toolkits_select_columns(validation_set,
                                                          [target])})

    execution_env = get_distributed_execution_environment()
    if distributed == 'auto' and execution_env is None:
        # Local run: the toolkit returns a dict holding the model.
        ret = _graphlab.toolkits._main.run("supervised_learning_train",
                                           options, verbose)
        model = SupervisedLearningModel(ret['model'], model_name)
    else:
        # Distributed run: the return value is wrapped directly.
        ret = _distributed_run("distributed_supervised_train", options,
                               env=distributed, verbose=verbose)
        model = SupervisedLearningModel(ret, model_name)

    return model
def create(graph,
           reset_probability=0.15,
           threshold=1e-2,
           max_iterations=20,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Compute the PageRank for each vertex in the graph.

    Return a model object with the total PageRank as well as the PageRank
    value for each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm (the sum of
        absolute value) of the delta of each vertex's pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If True, run pagerank in single precision. The resulting pagerank
        values may not be accurate for a large graph, but the computation
        should run faster and use less memory.

    _distributed : distributed environment, internal
        Forwarded as ``env`` to the distributed runner when a distributed
        execution environment is active.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order
      to the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create a
    :class:`~graphlab.pagerank.PagerankModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = graphlab.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph
    ``g`` using:

    >>> pr_out = pr['pagerank']     # SFrame

    We can add the new pagerank field to the original graph g using:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.pagerank.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    opts = dict(threshold=threshold,
                reset_probability=reset_probability,
                max_iterations=max_iterations,
                single_precision=_single_precision,
                graph=graph.__proxy__)

    # An active distributed environment selects the distributed toolkit;
    # otherwise the local toolkit runs and returns the model under 'model'.
    if _get_distributed_execution_environment() is not None:
        model = _distributed_run('distributed_pagerank', opts,
                                 env=_distributed, verbose=verbose)
    else:
        model = _main.run('pagerank', opts, verbose)['model']

    return PagerankModel(model)