def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Infer class-label probabilities for unlabeled vertices of a weighted
    graph with the "label propagation" algorithm.

    Starting from the observed class labels on a subset of vertices, the
    algorithm repeatedly replaces the label probability of each vertex with
    a weighted sum of its own label probability and that of its neighbors,
    until convergence. See
    :class:`graphlab.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The values must be
        in [0, num_classes). None values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    weight_field : str, optional
        Name of the field holding the edge weight. If empty, all edges are
        assumed to have unit weight.

    self_weight : float, optional
        The weight for self edge.

    undirected : bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    max_iterations : int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or the convergence threshold is reached.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the computation should
        run faster and use less memory.

    _distributed : distributed environment, internal

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not integer typed.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled
      data with label propagation
      <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.label_propagation.LabelPropagationModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.label_propagation.create')

    # Both field names are type-checked (label first, then weight) before
    # the graph itself is inspected.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype()
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    # Option names are the keys handed to the toolkit; note that the
    # underscore-prefixed `_single_precision` is sent as 'single_precision'.
    opts = dict(
        label_field=label_field,
        threshold=threshold,
        weight_field=weight_field,
        self_weight=self_weight,
        undirected=undirected,
        max_iterations=max_iterations,
        single_precision=_single_precision,
        graph=graph.__proxy__,
    )

    # No distributed execution environment: run through `_main.run` and
    # pull the model out of the returned parameters.
    if _get_distributed_execution_environment() is None:
        local_result = _main.run('label_propagation', opts, verbose)
        return LabelPropagationModel(local_result['model'])

    # Otherwise hand the job to the distributed runner, forwarding the
    # caller-supplied `_distributed` value as its environment.
    remote_model = _distributed_run('distributed_labelprop', opts,
                                    env=_distributed, verbose=verbose)
    return LabelPropagationModel(remote_model)
def create(graph,
           reset_probability=0.15,
           threshold=1e-2,
           max_iterations=20,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Compute the PageRank of every vertex in the graph.

    Returns a model object holding the total PageRank as well as the
    PageRank value of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, run pagerank in single precision. The resulting pagerank
        values may not be accurate for a large graph, but the computation
        should run faster and use less memory.

    _distributed : distributed environment, internal

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order
      to the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.pagerank.PagerankModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = graphlab.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph
    ``g`` using:

    >>> pr_out = pr['pagerank']     # SFrame

    We can add the new pagerank field to the original graph g using:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.pagerank.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # Option names are the keys handed to the toolkit; note that the
    # underscore-prefixed `_single_precision` is sent as 'single_precision'.
    opts = dict(
        threshold=threshold,
        reset_probability=reset_probability,
        max_iterations=max_iterations,
        single_precision=_single_precision,
        graph=graph.__proxy__,
    )

    # No distributed execution environment: run through `_main.run` and
    # pull the model out of the returned parameters.
    if _get_distributed_execution_environment() is None:
        local_result = _main.run('pagerank', opts, verbose)
        return PagerankModel(local_result['model'])

    # Otherwise hand the job to the distributed runner, forwarding the
    # caller-supplied `_distributed` value as its environment.
    remote_model = _distributed_run('distributed_pagerank', opts,
                                    env=_distributed, verbose=verbose)
    return PagerankModel(remote_model)