def create(graph, verbose=True):
    """
    Compute the in degree, out degree and total degree of each vertex.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute degree counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : DegreeCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.degree_counting.DegreeCountingModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/web-Google.txt.gz',
    ...                            format='snap')
    >>> m = turicreate.degree_counting.create(g)
    >>> g2 = m['graph']
    >>> g2
    SGraph({'num_edges': 5105039, 'num_vertices': 875713})
    Vertex Fields:['__id', 'in_degree', 'out_degree', 'total_degree']
    Edge Fields:['__src_id', '__dst_id']

    >>> g2.vertices.head(5)
    Columns:
        __id	int
        in_degree	int
        out_degree	int
        total_degree	int
    <BLANKLINE>
    Rows: 5
    <BLANKLINE>
    Data:
    +------+-----------+------------+--------------+
    | __id | in_degree | out_degree | total_degree |
    +------+-----------+------------+--------------+
    |  5   |     15    |     7      |      22      |
    |  7   |     3     |     16     |      19      |
    |  8   |     1     |     2      |      3       |
    |  10  |     13    |     11     |      24      |
    |  27  |     19    |     16     |      35      |
    +------+-----------+------------+--------------+

    See Also
    --------
    DegreeCountingModel
    """
    is_sgraph = isinstance(graph, _SGraph)
    if not is_sgraph:
        raise TypeError('graph input must be a SGraph object.')

    # Hand the graph's backend proxy to the 'degree_count' toolkit and wrap
    # the model it returns.
    toolkit_options = {'graph': graph.__proxy__}
    toolkit_output = _main.run('degree_count', toolkit_options, verbose)
    return DegreeCountingModel(toolkit_output['model'])
def create(graph, verbose=True):
    """
    Compute the number of triangles each vertex belongs to, ignoring edge
    directions. A triangle is a complete subgraph with only three vertices.
    Return a model object with the total number of triangles as well as the
    triangle counts for each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute triangle counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : TriangleCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - T. Schank. (2007) `Algorithmic Aspects of Triangle-Based Network Analysis
      <http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/4541>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create a
    :class:`~turicreate.triangle_counting.TriangleCountingModel` as follows:

    >>> g = turicreate.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                           format='snap')
    >>> tc = turicreate.triangle_counting.create(g)

    We can obtain the number of triangles that each vertex in the graph ``g``
    is present in:

    >>> tc_out = tc['triangle_count']  # SFrame

    We can add the new "triangle_count" field to the original graph g using:

    >>> g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    TriangleCountingModel
    """
    if isinstance(graph, _SGraph):
        # Run the 'triangle_counting' toolkit on the graph's backend proxy.
        toolkit_output = _main.run(
            'triangle_counting', {'graph': graph.__proxy__}, verbose)
        trained_proxy = toolkit_output['model']
        return TriangleCountingModel(trained_proxy)

    raise TypeError('graph input must be a SGraph object.')
def create(graph, verbose=True):
    """
    Compute the graph coloring. Assign a color to each vertex such that no
    adjacent vertices have the same color. Return a model object with the
    total number of colors used as well as the color ID for each vertex in
    the graph. This algorithm is greedy and is not guaranteed to find the
    **minimum** graph coloring. It is also not deterministic, so successive
    runs may return different answers.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the coloring.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : GraphColoringModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - graph coloring <http://en.wikipedia.org/wiki/Graph_coloring>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.graph_coloring.GraphColoringModel` as follows:

    >>> g = turicreate.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> gc = turicreate.graph_coloring.create(g)

    We can obtain the ``color id`` corresponding to each vertex in the graph
    ``g`` as follows:

    >>> color_id = gc['color_id']  # SFrame

    We can obtain the total number of colors required to color the graph ``g``
    as follows:

    >>> num_colors = gc['num_colors']

    See Also
    --------
    GraphColoringModel
    """
    graph_is_valid = isinstance(graph, _SGraph)
    if not graph_is_valid:
        raise TypeError('graph input must be a SGraph object.')

    # The toolkit receives only the graph's backend proxy.
    coloring_options = {'graph': graph.__proxy__}
    outcome = _main.run('graph_coloring', coloring_options, verbose)
    return GraphColoringModel(outcome['model'])
def _describe_fields(cls):
    """
    Return a dictionary for the class fields description.
    Fields should NOT be wrapped by _precomputed_field, if necessary.

    Parameters
    ----------
    cls : type
        The model class whose ``__name__`` selects the toolkit function that
        provides the fields description.

    Returns
    -------
    out
        Whatever the selected toolkit function returns.

    Raises
    ------
    RuntimeError
        If ``cls.__name__`` has no entry in the dispatch table, or if running
        the selected toolkit function fails.
    """
    # Maps a model class name to the toolkit function describing its fields.
    dispatch_table = {
        'ShortestPathModel': 'sssp_model_fields',
        'GraphColoringModel': 'graph_coloring_model_fields',
        'PagerankModel': 'pagerank_model_fields',
        'ConnectedComponentsModel': 'connected_components_model_fields',
        'TriangleCountingModel': 'triangle_counting_model_fields',
        'KcoreModel': 'kcore_model_fields',
        'DegreeCountingModel': 'degree_count_model_fields',
        'LabelPropagationModel': 'label_propagation_model_fields'
    }
    try:
        return _main.run(dispatch_table[cls.__name__], {})
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit propagate instead of being
        # reported as a missing fields description.
        raise RuntimeError('Model %s does not have fields description' % cls.__name__)
def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Given a weighted graph with observed class labels of a subset of
    vertices, infer the label probability for the unobserved vertices using
    the "label propagation" algorithm.

    The algorithm iteratively updates the label probability of the current
    vertex as a weighted sum of the label probability of itself and of the
    neighboring vertices until convergence. See
    :class:`turicreate.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Notes: label propagation works well with a small number of labels, i.e.
    binary labels, or fewer than 1000 classes. The toolkit will throw an error
    if the number of classes exceeds the maximum value (1000).

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The values must be in
        [0, num_classes). None values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    max_iterations : int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or the convergence threshold is reached.

    weight_field : str, optional
        Field holding the edge weight. If empty, all edges are assumed
        to have unit weight.

    self_weight : float, optional
        The weight for the self edge.

    undirected : bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but it should run faster
        and use less memory.

    _distributed : distributed environment, internal
        Not read by this function.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not integer typed.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled data
      with label propagation <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.label_propagation.LabelPropagationModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                            format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = turicreate.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    # Both field names must be strings; the label field is checked first.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    toolkit_options = {
        'label_field': label_field,
        'threshold': threshold,
        'weight_field': weight_field,
        'self_weight': self_weight,
        'undirected': undirected,
        'max_iterations': max_iterations,
        'single_precision': _single_precision,
        'graph': graph.__proxy__,
    }
    toolkit_output = _main.run('label_propagation', toolkit_options, verbose)
    return LabelPropagationModel(toolkit_output['model'])
def create(graph, verbose=True):
    """
    Compute the number of weakly connected components in the graph. Return a
    model object with the total number of weakly connected components as well
    as the component ID for each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the connected components.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ConnectedComponentsModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Mathworld Wolfram - Weakly Connected Component
      <http://mathworld.wolfram.com/WeaklyConnectedComponent.html>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.connected_components.ConnectedComponentsModel` as
    follows:

    >>> g = turicreate.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> cc = turicreate.connected_components.create(g)
    >>> cc.summary()

    We can obtain the ``component id`` corresponding to each vertex in the
    graph ``g`` as follows:

    >>> cc_ids = cc['component_id']  # SFrame

    We can obtain a graph with additional information about the ``component
    id`` corresponding to each vertex as follows:

    >>> cc_graph = cc['graph']      # SGraph

    We can add the new component_id field to the original graph g using:

    >>> g.vertices['component_id'] = cc['graph'].vertices['component_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    ConnectedComponentsModel
    """
    if isinstance(graph, _SGraph):
        # Only the graph's backend proxy is forwarded to the toolkit.
        component_options = {'graph': graph.__proxy__}
        outcome = _main.run('connected_components', component_options, verbose)
        return ConnectedComponentsModel(outcome['model'])

    raise TypeError('graph input must be a SGraph object.')
def create(graph, kmin=0, kmax=10, verbose=True):
    """
    Compute the K-core decomposition of the graph. Return a model object with
    the total number of cores as well as the core id for each vertex in the
    graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the k-core decomposition.

    kmin : int, optional
        Minimum core id. Vertices having a smaller core id than `kmin` will
        be assigned core_id = `kmin`.

    kmax : int, optional
        Maximum core id. Vertices having a larger core id than `kmax` will
        be assigned core_id = `kmax`.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : KcoreModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - Alvarez-Hamelin, J.I., et al. (2005) `K-Core Decomposition: A Tool for the
      Visualization of Large Networks <http://arxiv.org/abs/cs/0504107>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.kcore.KcoreModel` as follows:

    >>> g = turicreate.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> kc = turicreate.kcore.create(g)

    We can obtain the ``core id`` corresponding to each vertex in the graph
    ``g`` using:

    >>> kcore_id = kc['core_id']     # SFrame

    We can add the new core id field to the original graph g using:

    >>> g.vertices['core_id'] = kc['graph'].vertices['core_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    KcoreModel
    """
    graph_is_valid = isinstance(graph, _SGraph)
    if not graph_is_valid:
        raise TypeError('graph input must be a SGraph object.')

    # kmin/kmax are forwarded to the toolkit unchanged.
    kcore_options = {
        'graph': graph.__proxy__,
        'kmin': kmin,
        'kmax': kmax,
    }
    toolkit_output = _main.run('kcore', kcore_options, verbose)
    trained_proxy = toolkit_output['model']
    return KcoreModel(trained_proxy)
def create(graph, source_vid, weight_field="", max_distance=1e30, verbose=True):
    """
    Compute the single source shortest path distance from the source vertex to
    all vertices in the graph. Note that because SGraph is directed, shortest
    paths are also directed. To find undirected shortest paths add edges to the
    SGraph in both directions. Return a model object with the distance of each
    vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute shortest paths.

    source_vid : vertex ID
        ID of the source vertex.

    weight_field : string, optional
        The edge field representing the edge weights. If empty, uses unit
        weights.

    max_distance : float, optional
        Forwarded unchanged to the toolkit as ``max_distance``.
        NOTE(review): presumably an upper bound on path distance -- confirm
        against the toolkit implementation.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ShortestPathModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - ShortestPath <http://en.wikipedia.org/wiki/Shortest_path_problem>`_

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.shortest_path.ShortestPathModel` as follows:

    >>> g = turicreate.load_sgraph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> sp = turicreate.shortest_path.create(g, source_vid=1)

    We can obtain the shortest path distance from the source vertex to each
    vertex in the graph ``g`` as follows:

    >>> sp_sframe = sp['distance']   # SFrame

    We can add the new distance field to the original graph g using:

    >>> g.vertices['distance_to_1'] = sp['graph'].vertices['distance']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    To get the actual path from the source vertex to any destination vertex:

    >>> path = sp.get_path(vid=10)

    We can obtain an auxiliary graph with additional information corresponding
    to the shortest path from the source vertex to each vertex in the graph
    ``g`` as follows:

    >>> sp_graph = sp['graph']       # SGraph

    See Also
    --------
    ShortestPathModel
    """
    if isinstance(graph, _SGraph):
        # 'sssp' is the toolkit name for single source shortest path.
        sssp_options = {
            'source_vid': source_vid,
            'weight_field': weight_field,
            'max_distance': max_distance,
            'graph': graph.__proxy__,
        }
        outcome = _main.run('sssp', sssp_options, verbose)
        return ShortestPathModel(outcome['model'])

    raise TypeError('graph input must be a SGraph object.')
def create(graph, reset_probability=0.15,
           threshold=1e-2,
           max_iterations=20,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Compute the PageRank for each vertex in the graph. Return a model object
    with the total PageRank as well as the PageRank value for each vertex in
    the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, run pagerank in single precision. The resulting pagerank
        values may not be accurate for a large graph, but it should run
        faster and use less memory.

    _distributed : distributed environment, internal
        Not read by this function.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order to
      the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~turicreate.SGraph` ``g``, we can create
    a :class:`~turicreate.pagerank.PagerankModel` as follows:

    >>> g = turicreate.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = turicreate.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph ``g``
    using:

    >>> pr_out = pr['pagerank']     # SFrame

    We can add the new pagerank field to the original graph g using:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    graph_is_valid = isinstance(graph, _SGraph)
    if not graph_is_valid:
        raise TypeError('graph input must be a SGraph object.')

    # The toolkit key is 'single_precision' (no leading underscore).
    pagerank_options = {
        'threshold': threshold,
        'reset_probability': reset_probability,
        'max_iterations': max_iterations,
        'single_precision': _single_precision,
        'graph': graph.__proxy__,
    }
    toolkit_output = _main.run('pagerank', pagerank_options, verbose)
    return PagerankModel(toolkit_output['model'])
def extract_features(self, dataset, missing_value_action='auto'):
    """
    For each example in the dataset, extract the leaf indices of
    each tree as features.

    For multiclass classification, each leaf index contains #num_class
    numbers.

    The returned feature vectors can be used as input to train another
    supervised learning model such as a
    :py:class:`~turicreate.logistic_classifier.LogisticClassifier` or an
    :py:class:`~turicreate.svm_classifier.SVMClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    missing_value_action : str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
                    values with the mean of the training data. Missing
                    values are also imputed if an entire column of data is
                    missing during evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
                  missing value.
        - 'error' : Do not proceed with prediction and terminate with
                    an error message.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = turicreate.SFrame(
    ...     'https://static.turi.com/datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> data['regression_tree_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['classification_tree_features'] = model.extract_features(data)
    """
    _raise_error_if_not_sframe(dataset, "dataset")

    # 'auto' is resolved to a concrete policy before reaching the toolkit.
    if missing_value_action == 'auto':
        missing_value_action = select_default_missing_value_policy(
            self, 'extract_features')

    # NOTE(review): this reads ``self.__name__`` from the instance; confirm
    # that the enclosing class (not visible here) provides that attribute.
    options = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'missing_value_action': missing_value_action,
        'dataset': dataset,
    }
    target = _toolkits_main.run('supervised_learning_feature_extraction',
                                options)
    return _map_unity_proxy_to_object(target['extracted'])