def get_current_options(self):
    """
    Return a dictionary with the options used to define and create this
    graph analytics model instance.

    The option names come from the model's ``*_default_options`` toolkit
    function; every value is then replaced by the value stored on this
    model, as returned by ``self.get``.

    Returns
    -------
    out : dict
        Dictionary of options used to train this model.

    Raises
    ------
    RuntimeError
        If the model name has no entry in the dispatch table, or if the
        options cannot be retrieved from the toolkit or the model.

    See Also
    --------
    get_default_options, list_fields, get
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.get_current_options')

    # Maps the model name to the toolkit function that lists its options.
    dispatch_table = {
        'ShortestPathModel': 'sssp_default_options',
        'GraphColoringModel': 'graph_coloring_default_options',
        'PagerankModel': 'pagerank_default_options',
        'ConnectedComponentsModel': 'connected_components_default_options',
        'TriangleCountingModel': 'triangle_counting_default_options',
        'KcoreModel': 'kcore_default_options',
        'DegreeCountingModel': 'degree_count_default_options',
        'LabelPropagationModel': 'label_propagation_default_options'
    }

    try:
        model_options = _main.run(dispatch_table[self.name()], {})
        # The toolkit hands back default values; overwrite each of them
        # with the value currently stored on the model.
        for key in model_options:
            model_options[key] = self.get(key)
        return model_options
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must
        # propagate instead of being reported as "no options".
        raise RuntimeError('Model %s does not have options' % self.name())
def create(graph, verbose=True):
    """
    Count the triangles every vertex of a graph takes part in, without
    regard to edge direction.

    A triangle is a complete subgraph made of exactly three vertices. The
    returned model holds the total number of triangles together with the
    triangle count of each vertex.

    Parameters
    ----------
    graph : SGraph
        Graph whose triangles are counted.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : TriangleCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - T. Schank. (2007) `Algorithmic Aspects of Triangle-Based Network
      Analysis <http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/4541>`_.

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.triangle_counting.TriangleCountingModel` is created
    as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> tc = graphlab.triangle_counting.create(g)

    The number of triangles each vertex of ``g`` belongs to:

    >>> tc_out = tc['triangle_count']  # SFrame

    The new "triangle_count" field can be added to the original graph:

    >>> g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

    No join is needed for this because the vertex ordering is preserved
    through ``create()``.

    See Also
    --------
    TriangleCountingModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.triangle_counting.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'graph': graph.__proxy__}
    response = _main.run('triangle_counting', toolkit_args, verbose)
    return TriangleCountingModel(response['model'])
def create(graph, kmin=0, kmax=10, verbose=True):
    """
    Run the K-core decomposition on a graph.

    The returned model holds the total number of cores together with the
    core id of each vertex.

    Parameters
    ----------
    graph : SGraph
        Graph to decompose.

    kmin : int, optional
        Minimum core id. Vertices having a smaller core id than `kmin` are
        assigned core_id = `kmin`.

    kmax : int, optional
        Maximum core id. Vertices having a larger core id than `kmax` are
        assigned core_id = `kmax`.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : KcoreModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - Alvarez-Hamelin, J.I., et al. (2005) `K-Core Decomposition: A Tool for
      the Visualization of Large Networks <http://arxiv.org/abs/cs/0504107>`_.

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.kcore.KcoreModel` is created as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> kc = graphlab.kcore.create(g)

    The ``core id`` of each vertex of ``g``:

    >>> kcore_id = kc['core_id']  # SFrame

    See Also
    --------
    KcoreModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.kcore.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {
        'graph': graph.__proxy__,
        'kmin': kmin,
        'kmax': kmax,
    }
    response = _main.run('kcore', toolkit_args, verbose)
    return KcoreModel(response['model'])
def create(graph, verbose=True):
    """
    Compute, for each vertex, how many triangles it is part of; edge
    directions are ignored.

    A triangle is a complete subgraph with only three vertices. The model
    that is returned carries the total triangle count as well as the
    triangle count of every vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which triangle counts are computed.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : TriangleCountingModel

    Raises
    ------
    TypeError
        When ``graph`` is not an SGraph instance.

    References
    ----------
    - T. Schank. (2007) `Algorithmic Aspects of Triangle-Based Network
      Analysis <http://digbib.ubka.uni-karlsruhe.de/volltexte/documents/4541>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.triangle_counting.TriangleCountingModel` can be
    created like this:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> tc = graphlab.triangle_counting.create(g)

    Per-vertex triangle counts are available as an SFrame:

    >>> tc_out = tc['triangle_count']  # SFrame

    They can be copied onto the original graph ``g``:

    >>> g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

    This does not require a join because the vertex ordering is preserved
    through ``create()``.

    See Also
    --------
    TriangleCountingModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.triangle_counting.create')

    graph_is_valid = isinstance(graph, _SGraph)
    if not graph_is_valid:
        raise TypeError('graph input must be a SGraph object.')

    result = _main.run(
        'triangle_counting',
        {'graph': graph.__proxy__},
        verbose)
    model_proxy = result['model']
    return TriangleCountingModel(model_proxy)
def create(graph, verbose=True):
    """
    Color the vertices of a graph so that no two adjacent vertices share a
    color.

    The returned model holds the total number of colors used together with
    the color ID of each vertex. The algorithm is greedy and is not
    guaranteed to find the **minimum** graph coloring. It is also not
    deterministic, so successive runs may return different answers.

    Parameters
    ----------
    graph : SGraph
        Graph to color.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : GraphColoringModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - graph coloring <http://en.wikipedia.org/wiki/Graph_coloring>`_

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.graph_coloring.GraphColoringModel` is created as
    follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> gc = graphlab.graph_coloring.create(g)

    The ``color id`` of each vertex of ``g``:

    >>> color_id = gc['color_id']  # SFrame

    The total number of colors needed to color ``g``:

    >>> num_colors = gc['num_colors']

    See Also
    --------
    GraphColoringModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.graph_coloring.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'graph': graph.__proxy__}
    response = _main.run('graph_coloring', toolkit_args, verbose)
    return GraphColoringModel(response['model'])
def create(graph, verbose=True):
    """
    Find the weakly connected components of a graph.

    The returned model holds the total number of weakly connected
    components together with the component ID of each vertex.

    Parameters
    ----------
    graph : SGraph
        Graph whose connected components are computed.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : ConnectedComponentsModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Mathworld Wolfram - Weakly Connected Component
      <http://mathworld.wolfram.com/WeaklyConnectedComponent.html>`_

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.connected_components.ConnectedComponentsModel` is
    created as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> cc = graphlab.connected_components.create(g)
    >>> cc.summary()

    The ``component id`` of each vertex of ``g``:

    >>> cc_ids = cc['component_id']  # SFrame

    A graph carrying the ``component id`` of each vertex:

    >>> cc_graph = cc['graph']  # SGraph

    See Also
    --------
    ConnectedComponentsModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.connected_components.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'graph': graph.__proxy__}
    response = _main.run('connected_components', toolkit_args, verbose)
    return ConnectedComponentsModel(response['model'])
def create(graph, verbose=True):
    """
    Compute a graph coloring: every vertex receives a color such that no
    adjacent vertices have the same color.

    The model that is returned carries the total number of colors used as
    well as the color ID of every vertex in the graph. This algorithm is
    greedy and is not guaranteed to find the **minimum** graph coloring.
    It is also not deterministic, so successive runs may return different
    answers.

    Parameters
    ----------
    graph : SGraph
        The graph on which the coloring is computed.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : GraphColoringModel

    Raises
    ------
    TypeError
        When ``graph`` is not an SGraph instance.

    References
    ----------
    - `Wikipedia - graph coloring <http://en.wikipedia.org/wiki/Graph_coloring>`_

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.graph_coloring.GraphColoringModel` can be created
    like this:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> gc = graphlab.graph_coloring.create(g)

    Color ids per vertex are available as an SFrame:

    >>> color_id = gc['color_id']  # SFrame

    The total number of colors required to color ``g``:

    >>> num_colors = gc['num_colors']

    See Also
    --------
    GraphColoringModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.graph_coloring.create')

    graph_is_valid = isinstance(graph, _SGraph)
    if not graph_is_valid:
        raise TypeError('graph input must be a SGraph object.')

    result = _main.run(
        'graph_coloring',
        {'graph': graph.__proxy__},
        verbose)
    model_proxy = result['model']
    return GraphColoringModel(model_proxy)
def get_default_options():
    """
    Return the default options of :func:`graphlab.graph_coloring.create`.

    Returns
    -------
    out : dict
        Result of the ``graph_coloring_default_options`` toolkit function.

    Examples
    --------
    >>> graphlab.graph_coloring.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.graph_coloring.get_default_options')

    defaults = _main.run('graph_coloring_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of
    :func:`graphlab.connected_components.create`.

    Returns
    -------
    out : dict
        Result of the ``connected_components_default_options`` toolkit
        function.

    Examples
    --------
    >>> graphlab.connected_components.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.connected_components.get_default_options')

    defaults = _main.run('connected_components_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of :func:`graphlab.degree_counting.create`.

    Returns
    -------
    out : dict
        Result of the ``degree_count_default_options`` toolkit function.

    Examples
    --------
    >>> graphlab.degree_counting.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.degree_counting.get_default_options')

    defaults = _main.run('degree_count_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of
    :func:`graphlab.label_propagation.create`.

    Returns
    -------
    out : dict
        Result of the ``label_propagation_default_options`` toolkit
        function.

    See Also
    --------
    LabelPropagationModel.get_current_options

    Examples
    --------
    >>> graphlab.label_propagation.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.label_propagation.get_default_options')

    defaults = _main.run('label_propagation_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of :func:`graphlab.shortest_path.create`.

    Returns
    -------
    out : dict
        Result of the ``sssp_default_options`` toolkit function.

    See Also
    --------
    ShortestPathModel.get_current_options

    Examples
    --------
    >>> graphlab.shortest_path.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.sssp.get_default_options')

    defaults = _main.run('sssp_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of :func:`graphlab.pagerank.create`.

    Returns
    -------
    out : dict
        Result of the ``pagerank_default_options`` toolkit function.

    See Also
    --------
    PagerankModel.get_current_options

    Examples
    --------
    >>> graphlab.pagerank.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.pagerank.get_default_options')

    defaults = _main.run('pagerank_default_options', {})
    return defaults
def get_default_options():
    """
    Return the default options of :func:`graphlab.kcore.create`.

    Returns
    -------
    out : dict
        Result of the ``kcore_default_options`` toolkit function.

    See Also
    --------
    KcoreModel.get_current_options

    Examples
    --------
    >>> graphlab.kcore.get_default_options()
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.kcore.get_default_options')

    defaults = _main.run('kcore_default_options', {})
    return defaults
def get_default_options():
    """
    Fetch the options :func:`graphlab.label_propagation.create` uses by
    default.

    Returns
    -------
    out : dict
        Whatever the ``label_propagation_default_options`` toolkit function
        returns.

    See Also
    --------
    LabelPropagationModel.get_current_options

    Examples
    --------
    >>> graphlab.label_propagation.get_default_options()
    """
    metric_name = 'toolkit.graph_analytics.label_propagation.get_default_options'
    _mt._get_metric_tracker().track(metric_name)

    no_arguments = {}
    return _main.run('label_propagation_default_options', no_arguments)
def get_default_options():
    """
    Fetch the options :func:`graphlab.shortest_path.create` uses by
    default.

    Returns
    -------
    out : dict
        Whatever the ``sssp_default_options`` toolkit function returns.

    See Also
    --------
    ShortestPathModel.get_current_options

    Examples
    --------
    >>> graphlab.shortest_path.get_default_options()
    """
    metric_name = 'toolkit.graph_analytics.sssp.get_default_options'
    _mt._get_metric_tracker().track(metric_name)

    no_arguments = {}
    return _main.run('sssp_default_options', no_arguments)
def _describe_fields(cls):
    """
    Return a dictionary for the class fields description.
    Fields should NOT be wrapped by _precomputed_field, if necessary.

    The description is produced by the ``*_model_fields`` toolkit function
    registered for ``cls.__name__`` in the dispatch table below.

    Returns
    -------
    out : dict
        Fields description as returned by the toolkit.

    Raises
    ------
    RuntimeError
        If the class name has no entry in the dispatch table, or if the
        toolkit call fails.
    """
    # Maps the model class name to the toolkit function describing its fields.
    dispatch_table = {
        'ShortestPathModel': 'sssp_model_fields',
        'GraphColoringModel': 'graph_coloring_model_fields',
        'PagerankModel': 'pagerank_model_fields',
        'ConnectedComponentsModel': 'connected_components_model_fields',
        'TriangleCountingModel': 'triangle_counting_model_fields',
        'KcoreModel': 'kcore_model_fields',
        'DegreeCountingModel': 'degree_count_model_fields',
        'LabelPropagationModel': 'label_propagation_model_fields'
    }
    try:
        fields_description = _main.run(dispatch_table[cls.__name__], {})
        return fields_description
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must
        # propagate instead of being reported as a missing description.
        raise RuntimeError('Model %s does not have fields description' % cls.__name__)
def _describe_fields(cls):
    """
    Return a pretty table for the class fields description.

    The description is produced by the ``*_model_fields`` toolkit function
    registered for ``cls.__name__`` in the dispatch table below; each
    (field, description) pair becomes one left-aligned table row.

    Returns
    -------
    out : _PrettyTable
        Table with the columns 'Field' and 'Description'.

    Raises
    ------
    RuntimeError
        If the class name has no entry in the dispatch table, or if the
        description cannot be retrieved or rendered.
    """
    # Maps the model class name to the toolkit function describing its fields.
    dispatch_table = {
        'ShortestPathModel': 'sssp_model_fields',
        'GraphColoringModel': 'graph_coloring_model_fields',
        'PagerankModel': 'pagerank_model_fields',
        'ConnectedComponentsModel': 'connected_components_model_fields',
        'TriangleCountingModel': 'triangle_counting_model_fields',
        'KcoreModel': 'kcore_model_fields',
        'DegreeCountingModel': 'degree_count_model_fields',
        'LabelPropagationModel': 'label_propagation_model_fields'
    }
    try:
        fields_description = _main.run(dispatch_table[cls.__name__], {})
        tbl = _PrettyTable(['Field', 'Description'])
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` would
        # fail on Python 3 and be misreported as a missing description.
        for k, v in fields_description.items():
            tbl.add_row([k, v])
        # Alignment does not depend on the row, so set it once.
        tbl.align['Field'] = 'l'
        tbl.align['Description'] = 'l'
        return tbl
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must
        # propagate instead of being reported as a missing description.
        raise RuntimeError('Model %s does not have fields description' % cls.__name__)
def create(graph, reset_probability=0.15,
           threshold=1e-2,
           max_iterations=20,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Run PageRank on a graph.

    The returned model holds the total PageRank together with the PageRank
    value of each vertex.

    Parameters
    ----------
    graph : SGraph
        Graph on which the pagerank values are computed.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, run pagerank in single precision. The resulting pagerank
        values may not be accurate for large graphs, but the computation
        should run faster and use less memory.

    _distributed : distributed environment, internal
        Forwarded as ``env`` to the distributed runner when a distributed
        execution environment is active.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order
      to the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.pagerank.PagerankModel` is created as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> pr = graphlab.pagerank.create(g)

    The page rank of each vertex of ``g``:

    >>> pr_out = pr['pagerank']  # SFrame

    The new pagerank field can be added to the original graph:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    No join is needed for this because the vertex ordering is preserved
    through ``create()``.

    See Also
    --------
    PagerankModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.pagerank.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {
        'threshold': threshold,
        'reset_probability': reset_probability,
        'max_iterations': max_iterations,
        'single_precision': _single_precision,
        'graph': graph.__proxy__,
    }

    # Use the distributed runner only when a distributed execution
    # environment is available; otherwise run locally.
    if _get_distributed_execution_environment() is not None:
        model_proxy = _distributed_run('distributed_pagerank', toolkit_args,
                                       env=_distributed, verbose=verbose)
        return PagerankModel(model_proxy)

    response = _main.run('pagerank', toolkit_args, verbose)
    return PagerankModel(response['model'])
def create(graph, source_vid, weight_field="", max_distance=1e30,
           verbose=True):
    """
    Compute the single source shortest path distance from the source
    vertex to all vertices in the graph.

    Because SGraph is directed, shortest paths are also directed. To find
    undirected shortest paths, add edges to the SGraph in both directions.
    The returned model holds the distance of each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        Graph on which shortest paths are computed.

    source_vid : vertex ID
        ID of the source vertex.

    weight_field : string, optional
        The edge field representing the edge weights. If empty, unit
        weights are used.

    max_distance : float, optional
        Forwarded unchanged to the toolkit as the ``max_distance`` option.
        NOTE(review): presumably an upper bound on the distance between
        two vertices -- confirm against the toolkit.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : ShortestPathModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - ShortestPath <http://en.wikipedia.org/wiki/Shortest_path_problem>`_

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.shortest_path.ShortestPathModel` is created as
    follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    >>> sp = graphlab.shortest_path.create(g, source_vid=1)

    The shortest path distance from the source vertex to each vertex of
    ``g``:

    >>> sp_sframe = sp['distance']  # SFrame

    The new distance field can be added to the original graph:

    >>> g.vertices['distance_to_1'] = sp['graph'].vertices['distance']

    No join is needed for this because the vertex ordering is preserved
    through ``create()``.

    To get the actual path from the source vertex to any destination
    vertex:

    >>> path = sp.get_path(vid=10)

    An auxiliary graph with additional information about the shortest path
    from the source vertex to each vertex of ``g``:

    >>> sp_graph = sp['graph']  # SGraph

    See Also
    --------
    ShortestPathModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.sssp.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'source_vid': source_vid,
                    'weight_field': weight_field,
                    'max_distance': max_distance,
                    'graph': graph.__proxy__}
    response = _main.run('sssp', toolkit_args, verbose)
    return ShortestPathModel(response['model'])
def extract_features(self, dataset, layer_id=None):
    """
    Propagate every example of ``dataset`` through the network and return
    an SArray of dense feature vectors, each of which is the concatenation
    of all the hidden unit values at layer[layer_id].

    These feature vectors can be used as input to train another classifier
    such as a :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    an :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
    :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
    Input dataset size must be the same as for the training of the model,
    except for images which are automatically resized.

    A pre-trained model for ImageNet, as described by Alex Krizhevsky
    et al., is also released. It is located at
    https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45 .
    Using it requires 256 x 256 x 3 images. Please see Examples and
    References for more.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not
        require a target column. Additional columns are ignored.

    layer_id : int, optional
        The index of the layer in the neuralnet at which the activations
        are taken to be a dense feature vector. Must be a fully-connected
        layer. When None, the index is derived from the end of the
        network: the second-to-last layer if the last layer has type
        "CONNECTION", otherwise the third-to-last layer.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Raises
    ------
    ValueError
        If no layer up to and including ``layer_id`` has type "CONNECTION"
        or "TRANSITION".

    See Also
    --------
    graphlab.deeplearning.layers

    References
    ----------
    - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
      classification with deep convolutional neural networks." Advances in
      neural information processing systems. 2012.

    Examples
    --------
    >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    >>> # Now, let's extract features from the last layer
    >>> data['features'] = m.extract_features(data)
    >>> # Now, let's build a new classifier on top of extracted features
    >>> m = graphlab.classifier.create(data,
    ...                                features = ['features'],
    ...                                target='label')

    Now, let's see how to load the ImageNet model, and use it for
    extracting features after resizing the data:

    >>> imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45')
    >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)
    >>> data['imagenet_features'] = imagenet_model.extract_features(data)
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.classifier.neuralnet_classifier.extract_features')

    _raise_error_if_not_sframe(dataset, "dataset")

    layers = self.get('network').layers
    last_index = len(layers) - 1

    if layer_id is None:
        # Step back one layer from the end when the network ends in a
        # CONNECTION layer, two layers otherwise.
        if layers[last_index]._type == "CONNECTION":
            step_back = 1
        else:
            step_back = 2
        layer_id = last_index - step_back

    _numeric_param_check_range("layer_id", layer_id, 0, last_index)

    # Types of every layer from the input up to and including layer_id.
    layer_types = [layers[idx]._type for idx in range(layer_id + 1)]
    has_flat_layer = any(kind in ("CONNECTION", "TRANSITION")
                         for kind in layer_types)
    if not has_flat_layer:
        raise ValueError(
            "Features must be extracted from either a network "
            "with non-image input or a layer after a FlattenLayer. "
            "Try extracting features from layer following a FlattenLayer.")

    toolkit_args = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'missing_value_action': "error",
        'layer_id': layer_id,
    }
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  toolkit_args)
    return _map_unity_proxy_to_object(response['extracted'])
def predict_topk(self, dataset, output_type="probability", k=3):
    """
    Return top-k predictions for the ``dataset``, using the trained model.

    Predictions are returned as an SFrame with three columns: `row_id`,
    `class`, and one of `probability`, `rank`, or `score`, depending on the
    ``output_type`` parameter. Input dataset size must be the same as for
    training of the model, except for images which are automatically
    resized.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not
        require a target column. Additional columns are ignored.

    output_type : {'probability', 'rank', 'score'}, optional
        Choose the return type of the prediction:

        - `rank`: outputs rank along with class label.
        - `probability`: outputs learned probability along with class label.
        - `score`: Same as probability

    k : int, optional
        Number of classes to return for each input example.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train')
    >>> training_data, validation_data = data.random_split(0.8)
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(training_data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    ...
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.classifier.neuralnet_classifier.predict_topk')

    _raise_error_if_not_sframe(dataset, "dataset")

    toolkit_args = {
        'model': self.__proxy__,
        'model_name': self.__name__,
        'dataset': dataset,
        'output_type': output_type,
        'topk': k,
        'missing_value_action': 'error',
    }
    response = _toolkits_main.run('supervised_learning_predict_topk',
                                  toolkit_args)
    return _map_unity_proxy_to_object(response['predicted'])
def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Given a weighted graph with observed class labels of a subset of
    vertices, infer the label probability for the unobserved vertices
    using the "label propagation" algorithm.

    The algorithm iteratively updates the label probability of the current
    vertex as a weighted sum of the label probability of itself and its
    neighboring vertices until convergence. See
    :class:`graphlab.label_propagation.LabelPropagationModel` for the
    details of the algorithm.

    Parameters
    ----------
    graph : SGraph
        Graph on which label propagation is computed.

    label_field : str
        Vertex field storing the initial vertex labels. The values must be
        in [0, num_classes). None values indicate unobserved vertex labels.
        The field must be integer typed.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    max_iterations : int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or the convergence threshold is reached.

    weight_field : str, optional
        Field storing the edge weights. If empty, all edges are assumed
        to have unit weight.

    self_weight : float, optional
        The weight for the self edge.

    undirected : bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the computation
        should run faster and use less memory.

    _distributed : distributed environment, internal
        Forwarded as ``env`` to the distributed runner when a distributed
        execution environment is active.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not of dtype int.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled
      data with label propagation
      <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.label_propagation.LabelPropagationModel` is created
    as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    The predicted label and the probability of each label, for every
    vertex of ``g``:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.label_propagation.create')

    # Argument validation, in the same order as the checks raise.
    _raise_error_if_not_of_type(label_field, str)
    _raise_error_if_not_of_type(weight_field, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype()
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    toolkit_args = {
        'label_field': label_field,
        'threshold': threshold,
        'weight_field': weight_field,
        'self_weight': self_weight,
        'undirected': undirected,
        'max_iterations': max_iterations,
        'single_precision': _single_precision,
        'graph': graph.__proxy__,
    }

    # Use the distributed runner only when a distributed execution
    # environment is available; otherwise run locally.
    if _get_distributed_execution_environment() is not None:
        model_proxy = _distributed_run('distributed_labelprop', toolkit_args,
                                       env=_distributed, verbose=verbose)
        return LabelPropagationModel(model_proxy)

    response = _main.run('label_propagation', toolkit_args, verbose)
    return LabelPropagationModel(response['model'])
def create(graph, verbose=True):
    """
    Compute the in degree, out degree and total degree of each vertex.

    Parameters
    ----------
    graph : SGraph
        Graph whose vertex degrees are counted.

    verbose : bool, optional
        When True, progress updates are printed.

    Returns
    -------
    out : DegreeCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    Examples
    --------
    Given an :class:`~graphlab.SGraph` ``g``, a
    :class:`~graphlab.degree_counting.DegreeCountingModel` is created as
    follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/web-Google.txt.gz',
    ...                         format='snap')
    >>> m = graphlab.degree_counting.create(g)
    >>> g2 = m['graph']
    >>> g2
    SGraph({'num_edges': 5105039, 'num_vertices': 875713})
    Vertex Fields:['__id', 'in_degree', 'out_degree', 'total_degree']
    Edge Fields:['__src_id', '__dst_id']

    >>> g2.vertices.head(5)
    Columns:
        __id    int
        in_degree       int
        out_degree      int
        total_degree    int
    <BLANKLINE>
    Rows: 5
    <BLANKLINE>
    Data:
    +------+-----------+------------+--------------+
    | __id | in_degree | out_degree | total_degree |
    +------+-----------+------------+--------------+
    |  5   |     15    |     7      |      22      |
    |  7   |     3     |     16     |      19      |
    |  8   |     1     |     2      |      3       |
    |  10  |     13    |     11     |      24      |
    |  27  |     19    |     16     |      35      |
    +------+-----------+------------+--------------+

    See Also
    --------
    DegreeCountingModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.degree_counting.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'graph': graph.__proxy__}
    response = _main.run('degree_count', toolkit_args, verbose)
    return DegreeCountingModel(response['model'])
def create(graph, reset_probability=0.15, threshold=1e-2,
           max_iterations=20, verbose=True):
    """
    Compute the PageRank for each vertex in the graph. Return a model object
    with total PageRank as well as the PageRank value for each vertex in the
    graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order to
      the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.pagerank.PagerankModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = graphlab.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph ``g``
    using:

    >>> pr_out = pr['pagerank']     # SFrame

    See Also
    --------
    PagerankModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.pagerank.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # Keyword form yields the same string keys, in the same order, as the
    # literal the toolkit has always received.
    toolkit_args = dict(threshold=threshold,
                        reset_probability=reset_probability,
                        max_iterations=max_iterations,
                        graph=graph.__proxy__)
    response = _main.run('pagerank', toolkit_args, verbose)
    return PagerankModel(response['model'])
def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Given a weighted graph with observed class labels of a subset of vertices,
    infer the label probability for the unobserved vertices using the
    "label propagation" algorithm.

    The algorithm iteratively updates the label probability of the current
    vertex as a weighted sum of the label probability of itself and of the
    neighboring vertices until convergence.

    See
    :class:`graphlab.label_propagation.LabelPropagationModel`
    for the details of the algorithm.

    Notes: label propagation works well with a small number of labels, i.e.
    binary labels, or less than 1000 classes. The toolkit will throw an error
    if the number of classes exceeds the maximum value (1000).

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field: str
        Vertex field storing the initial vertex labels. The values must be in
        [0, num_classes). None values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured in the average L2 norm
        (the sum of squared values) of the delta of each vertex's
        label probability vector.

    max_iterations: int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or convergence threshold is reached.

    weight_field: str, optional
        Field storing the edge weight. If empty, all edges are assumed
        to have unit weight.

    self_weight: float, optional
        The weight for self edge.

    undirected: bool, optional
        If true, treat each edge as undirected, and propagate labels in
        both directions.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but it should run faster
        and use less memory.

    _distributed : distributed environment, internal
        Accepted for interface compatibility; this function body never
        reads it.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph, or if the ``label_field`` vertex
        column is not of dtype ``int``.

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled data
      with label propagation <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``,
    we can create a :class:`~graphlab.label_propagation.LabelPropagationModel`
    as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.label_propagation.create')

    # Field names are validated first, in the same order as before.
    for field_name in (label_field, weight_field):
        _raise_error_if_not_of_type(field_name, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    label_dtype = graph.vertices[label_field].dtype()
    if label_dtype != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    toolkit_args = dict(label_field=label_field,
                        threshold=threshold,
                        weight_field=weight_field,
                        self_weight=self_weight,
                        undirected=undirected,
                        max_iterations=max_iterations,
                        single_precision=_single_precision,
                        graph=graph.__proxy__)

    response = _main.run('label_propagation', toolkit_args, verbose)
    return LabelPropagationModel(response['model'])
def create(graph, source_vid, weight_field="", max_distance=1e30, verbose=True):
    """
    Compute the single source shortest path distance from the source vertex to
    all vertices in the graph. Note that because SGraph is directed, shortest
    paths are also directed. To find undirected shortest paths add edges to the
    SGraph in both directions. Return a model object with the distance of each
    vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute shortest paths.

    source_vid : vertex ID
        ID of the source vertex.

    weight_field : string, optional
        The edge field representing the edge weights. If empty, uses unit
        weights.

    max_distance : float, optional
        Forwarded unchanged to the toolkit under the ``max_distance`` option.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ShortestPathModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - ShortestPath <http://en.wikipedia.org/wiki/Shortest_path_problem>`_

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.shortest_path.ShortestPathModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> sp = graphlab.shortest_path.create(g, source_vid=1)

    We can obtain the shortest path distance from the source vertex to each
    vertex in the graph ``g`` as follows:

    >>> sp_sframe = sp['distance']   # SFrame

    We can add the new distance field to the original graph g using:

    >>> g.vertices['distance_to_1'] = sp['graph'].vertices['distance']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    To get the actual path from the source vertex to any destination vertex:

    >>> path = sp.get_path(vid=10)

    We can obtain an auxiliary graph with additional information corresponding
    to the shortest path from the source vertex to each vertex in the graph
    ``g`` as follows:

    >>> sp_graph = sp['graph']       # SGraph

    See Also
    --------
    ShortestPathModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.sssp.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = dict(source_vid=source_vid,
                        weight_field=weight_field,
                        max_distance=max_distance,
                        graph=graph.__proxy__)
    response = _main.run('sssp', toolkit_args, verbose)
    return ShortestPathModel(response['model'])
def create(graph, verbose=True):
    """
    Compute the in degree, out degree and total degree of each vertex.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute degree counts.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : DegreeCountingModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.degree_counting.DegreeCountingModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/web-Google.txt.gz',
    ...                         format='snap')
    >>> m = graphlab.degree_counting.create(g)
    >>> g2 = m['graph']
    >>> g2
    SGraph({'num_edges': 5105039, 'num_vertices': 875713})
    Vertex Fields:['__id', 'in_degree', 'out_degree', 'total_degree']
    Edge Fields:['__src_id', '__dst_id']

    >>> g2.vertices.head(5)
    Columns:
        __id	int
        in_degree	int
        out_degree	int
        total_degree	int
    <BLANKLINE>
    Rows: 5
    <BLANKLINE>
    Data:
    +------+-----------+------------+--------------+
    | __id | in_degree | out_degree | total_degree |
    +------+-----------+------------+--------------+
    |  5   |     15    |     7      |      22      |
    |  7   |     3     |     16     |      19      |
    |  8   |     1     |     2      |      3       |
    |  10  |     13    |     11     |      24      |
    |  27  |     19    |     16     |      35      |
    +------+-----------+------------+--------------+

    See Also
    --------
    DegreeCountingModel
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.graph_analytics.degree_counting.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    # The toolkit only needs the graph's proxy handle.
    run_output = _main.run('degree_count', {'graph': graph.__proxy__}, verbose)
    fitted = run_output['model']
    return DegreeCountingModel(fitted)
def create(graph, verbose=True):
    """
    Compute the number of weakly connected components in the graph. Return a
    model object with total number of weakly connected components as well as
    the component ID for each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the connected components.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : ConnectedComponentsModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Mathworld Wolfram - Weakly Connected Component
      <http://mathworld.wolfram.com/WeaklyConnectedComponent.html>`_

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.connected_components.ConnectedComponentsModel` as
    follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> cc = graphlab.connected_components.create(g)
    >>> cc.summary()

    We can obtain the ``component id`` corresponding to each vertex in the
    graph ``g`` as follows:

    >>> cc_ids = cc['component_id']  # SFrame

    We can obtain a graph with additional information about the ``component
    id`` corresponding to each vertex as follows:

    >>> cc_graph = cc['graph']      # SGraph

    We can add the new component_id field to the original graph g using:

    >>> g.vertices['component_id'] = cc['graph'].vertices['component_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    ConnectedComponentsModel
    """
    _mt._get_metric_tracker().track(
        'toolkit.graph_analytics.connected_components.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = {'graph': graph.__proxy__}
    response = _main.run('connected_components', toolkit_args, verbose)
    return ConnectedComponentsModel(response['model'])
def extract_features(self, dataset, layer_id=None):
    """
    Takes an input dataset, propagates each example through the network,
    and returns an SArray of dense feature vectors, each of which is the
    concatenation of all the hidden unit values at layer[layer_id]. These
    feature vectors can be used as input to train another classifier such
    as a :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    an :py:class:`~graphlab.svm_classifier.SVMClassifier`, another
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`, or a
    :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`.
    Input dataset size must be the same as for the training of the model,
    except for images which are automatically resized.

    We also are releasing a pre-trained model for ImageNet, as described by
    Alex Krizhevsky et. al. It is located at
    http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45 .
    Using it requires 256 x 256 x 3 images.
    Please see Examples and References for more.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    layer_id : int , optional
        The index of the layer in neuralnet at which the activations are
        taken to be a dense feature vector. Must be a fully-connected layer.
        Default is None, in which case the layer before the connection
        layer to the output is used: the second-to-last layer when the
        last layer's ``_type`` is "CONNECTION", otherwise the
        third-to-last layer.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Raises
    ------
    ValueError
        If no layer at or before ``layer_id`` has ``_type`` "CONNECTION"
        or "TRANSITION".

    See Also
    --------
    graphlab.deeplearning.layers

    References
    ----------
    - Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet
      classification with deep convolutional neural networks." Advances in
      neural information processing systems. 2012.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    >>> # Now, let's extract features from the last layer
    >>> data['features'] = m.extract_features(data)
    >>> # Now, let's build a new classifier on top of extracted features
    >>> m = graphlab.classifier.create(data,
    ...                                features = ['features'],
    ...                                target='label')

    Now, let's see how to load the ImageNet model, and use it for extracting
    features after resizing the data:

    >>> imagenet_model = graphlab.load_model('http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45')
    >>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3)
    >>> data['imagenet_features'] = imagenet_model.extract_features(data)
    """
    _mt._get_metric_tracker().track('toolkit.classifier.neuralnet_classifier.extract_features')
    _raise_error_if_not_sframe(dataset, "dataset")

    layers = self.get('network').layers
    last_index = len(layers) - 1

    if layer_id is None:
        # Step back one layer from a trailing CONNECTION layer, two otherwise.
        step_back = 1 if layers[last_index]._type == "CONNECTION" else 2
        layer_id = last_index - step_back

    _numeric_param_check_range("layer_id", layer_id, 0, last_index)

    # Visit every layer up to and including layer_id (no short-circuit, as
    # before) and record whether it is a CONNECTION or TRANSITION layer.
    flattening_flags = [
        layers[idx]._type == "CONNECTION" or layers[idx]._type == "TRANSITION"
        for idx in range(0, layer_id + 1)
    ]
    if not any(flattening_flags):
        raise ValueError("Features must be extracted from either a network "
                "with non-image input or a layer after a FlattenLayer. "
                "Try extracting features from layer following a FlattenLayer.")

    options = {'model': self.__proxy__,
               'model_name': self.__name__,
               'dataset': dataset,
               'layer_id': layer_id}
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  options)
    return _map_unity_proxy_to_object(response['extracted'])
def predict_topk(self, dataset, output_type="probability", k=3):
    """
    Return top-k predictions for the ``dataset``, using the trained model.
    Predictions are returned as an SFrame with three columns: `row_id`,
    `class`, and one of `probability`, `rank`, or `score`, depending on the
    ``output_type`` parameter. Input dataset size must be the same as for
    training of the model, except for images which are automatically resized.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    output_type : {'probability', 'rank', 'score'}, optional
        Choose the return type of the prediction:

        - `rank`: outputs rank along with class label.
        - `probability`: outputs learned probability along with class label.
        - `score`: Same as probability

    k : int, optional
        Number of classes to return for each input example.

    Returns
    -------
    out : SFrame
        An SFrame with model predictions.

    See Also
    --------
    predict, classify, evaluate

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train')
    >>> training_data, validation_data = data.random_split(0.8)
    >>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
    >>> m = graphlab.neuralnet_classifier.create(training_data,
    ...                                          target='label',
    ...                                          network=net,
    ...                                          max_iterations=3)
    ...
    >>> pred = m.predict_topk(validation_data, k=3)
    >>> pred
    +--------+-------+-------------------+
    | row_id | class |    probability    |
    +--------+-------+-------------------+
    |   0    |   4   |   0.995623886585  |
    |   0    |   9   |  0.0038311756216  |
    |   0    |   7   | 0.000301006948575 |
    |   1    |   1   |   0.928708016872  |
    |   1    |   3   |  0.0440889261663  |
    |   1    |   2   |  0.0176190119237  |
    |   2    |   3   |   0.996967732906  |
    |   2    |   2   |  0.00151345680933 |
    |   2    |   7   | 0.000637513934635 |
    |   3    |   1   |   0.998070061207  |
    |  ...   |  ...  |        ...        |
    +--------+-------+-------------------+
    [35688 rows x 3 columns]
    """
    _mt._get_metric_tracker().track('toolkit.classifier.neuralnet_classifier.predict_topk')
    _raise_error_if_not_sframe(dataset, "dataset")

    # The missing-value policy is fixed to 'error' for this call.
    options = {'model': self.__proxy__,
               'model_name': self.__name__,
               'dataset': dataset,
               'output_type': output_type,
               'topk': k,
               'missing_value_action': 'error'}
    response = _toolkits_main.run('supervised_learning_predict_topk', options)
    return _map_unity_proxy_to_object(response['predicted'])
def create(graph, reset_probability=0.15,
           threshold=1e-2,
           max_iterations=20,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Compute the PageRank for each vertex in the graph. Return a model object
    with total PageRank as well as the PageRank value for each vertex in the
    graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the pagerank value.

    reset_probability : float, optional
        Probability that a random surfer jumps to an arbitrary page.

    threshold : float, optional
        Threshold for convergence, measured in the L1 norm
        (the sum of absolute value) of the delta of each vertex's
        pagerank value.

    max_iterations : int, optional
        The maximum number of iterations to run.

    _single_precision : bool, optional
        If true, run pagerank in single precision. The resulting pagerank
        values may not be accurate for large graphs, but it should run faster
        and use less memory.

    _distributed : distributed environment, internal
        Accepted for interface compatibility; this function body never
        reads it.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : PagerankModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - `Wikipedia - PageRank <http://en.wikipedia.org/wiki/PageRank>`_
    - Page, L., et al. (1998) `The PageRank Citation Ranking: Bringing Order to
      the Web <http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.pagerank.PagerankModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> pr = graphlab.pagerank.create(g)

    We can obtain the page rank corresponding to each vertex in the graph ``g``
    using:

    >>> pr_out = pr['pagerank']     # SFrame

    We can add the new pagerank field to the original graph g using:

    >>> g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    PagerankModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.pagerank.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = dict(threshold=threshold,
                        reset_probability=reset_probability,
                        max_iterations=max_iterations,
                        single_precision=_single_precision,
                        graph=graph.__proxy__)
    response = _main.run('pagerank', toolkit_args, verbose)
    return PagerankModel(response['model'])
def create(graph, kmin=0, kmax=10, verbose=True):
    """
    Compute the K-core decomposition of the graph. Return a model object with
    total number of cores as well as the core id for each vertex in the graph.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the k-core decomposition.

    kmin : int, optional
        Minimum core id. Vertices having smaller core id than `kmin` will be
        assigned with core_id = `kmin`.

    kmax : int, optional
        Maximum core id. Vertices having larger core id than `kmax` will be
        assigned with core_id = `kmax`.

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : KcoreModel

    Raises
    ------
    TypeError
        If ``graph`` is not an SGraph.

    References
    ----------
    - Alvarez-Hamelin, J.I., et al. (2005) `K-Core Decomposition: A Tool for the
      Visualization of Large Networks <http://arxiv.org/abs/cs/0504107>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.kcore.KcoreModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', format='snap')
    >>> kc = graphlab.kcore.create(g)

    We can obtain the ``core id`` corresponding to each vertex in the graph
    ``g`` using:

    >>> kcore_id = kc['core_id']     # SFrame

    We can add the new core id field to the original graph g using:

    >>> g.vertices['core_id'] = kc['graph'].vertices['core_id']

    Note that the task above does not require a join because the vertex
    ordering is preserved through ``create()``.

    See Also
    --------
    KcoreModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.kcore.create')

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be a SGraph object.')

    toolkit_args = dict(graph=graph.__proxy__, kmin=kmin, kmax=kmax)
    response = _main.run('kcore', toolkit_args, verbose)
    return KcoreModel(response['model'])
def extract_features(self, dataset):
    """
    For each example in the dataset, extract the leaf indices of
    each tree as features.

    For multiclass classification, each leaf index contains #num_class
    numbers.

    The returned feature vectors can be used as input to train another
    supervised learning model such as a
    :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    an :py:class:`~graphlab.svm_classifier.SVMClassifier`, or a
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> model = graphlab.boosted_trees_regression.create(data,
    ...                           target='price',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['boosted_tree_features'] = model.extract_features(data)
    >>> model = graphlab.random_forest_regression.create(data,
    ...                           target='price',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['random_forest_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['is_expensive'] = data['price'] > 30000
    >>> model = graphlab.boosted_trees_classifier.create(data,
    ...                           target='is_expensive',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['boosted_tree_features'] = model.extract_features(data)
    >>> model = graphlab.random_forest_classifier.create(data,
    ...                           target='is_expensive',
    ...                           features=['bath', 'bedroom', 'size'])
    >>> data['random_forest_features'] = model.extract_features(data)
    """
    # The metric is keyed on the module of the concrete model class.
    tracked_name = '.'.join((self.__module__, 'extract_features'))
    _mt._get_metric_tracker().track(tracked_name)
    _raise_error_if_not_sframe(dataset, "dataset")

    options = {'model': self.__proxy__,
               'model_name': self.__name__,
               'dataset': dataset}
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  options)
    return _map_unity_proxy_to_object(response['extracted'])
def extract_features(self, dataset, missing_value_action='auto'):
    """
    For each example in the dataset, extract the leaf indices of
    each tree as features.

    For multiclass classification, each leaf index contains #num_class
    numbers.

    The returned feature vectors can be used as input to train another
    supervised learning model such as a
    :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    an :py:class:`~graphlab.svm_classifier.SVMClassifier`, or a
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the features used for model training, but does not require
        a target column. Additional columns are ignored.

    missing_value_action: str, optional
        Action to perform when missing values are encountered. This can be
        one of:

        - 'auto': Choose a model dependent missing value policy.
        - 'impute': Proceed with evaluation by filling in the missing
                    values with the mean of the training data. Missing
                    values are also imputed if an entire column of data is
                    missing during evaluation.
        - 'none': Treat missing value as is. Model must be able to handle
                  missing value.
        - 'error' : Do not proceed with prediction and terminate with
                    an error message.

    Returns
    -------
    out : SArray
        An SArray of dtype array.array containing extracted features.

    Examples
    --------
    >>> data = graphlab.SFrame(
        'https://static.turi.com/datasets/regression/houses.csv')

    >>> # Regression Tree Models
    >>> data['regression_tree_features'] = model.extract_features(data)

    >>> # Classification Tree Models
    >>> data['classification_tree_features'] = model.extract_features(data)
    """
    tracked_name = '.'.join((self.__module__, 'extract_features'))
    _mt._get_metric_tracker().track(tracked_name)
    _raise_error_if_not_sframe(dataset, "dataset")

    # 'auto' is resolved to a concrete policy before reaching the toolkit.
    resolved_action = missing_value_action
    if resolved_action == 'auto':
        resolved_action = select_default_missing_value_policy(
            self, 'extract_features')

    options = {'model': self.__proxy__,
               'model_name': self.__name__,
               'missing_value_action': resolved_action,
               'dataset': dataset}
    response = _toolkits_main.run('supervised_learning_feature_extraction',
                                  options)
    return _map_unity_proxy_to_object(response['extracted'])