def test_shortest_path(self):
    if "sssp" in get_unity().list_toolkit_functions():
        m = tc.shortest_path.create(self.graph, source_vid=0)
        print(m)
        m.summary()
        self.__test_model_save_load_helper__(m)

        m2 = tc.shortest_path.create(self.graph, source_vid=0)
        print(m2)
        self.__test_model_save_load_helper__(m2)

        # Test get_path function on a simple chain graph and star graph
        chain_graph = tc.SGraph().add_edges([tc.Edge(i, i + 1) for i in range(10)])
        m3 = tc.shortest_path.create(chain_graph, source_vid=0)
        for i in range(10):
            self.assertSequenceEqual(
                m3.get_path(i), [(j, float(j)) for j in range(i + 1)]
            )

        star_graph = tc.SGraph().add_edges([tc.Edge(0, i + 1) for i in range(10)])
        m4 = tc.shortest_path.create(star_graph, source_vid=0)
        for i in range(1, 11):
            self.assertSequenceEqual(m4.get_path(i), [(0, 0.0), (i, 1.0)])

        # Test that get_path with the 'show' parameter set to True doesn't
        # break.
        #
        # Showing is problematic when there is actually a browser.
        # This will pause scripts.
        # m4.get_path(i, show=True)

        # Test sssp ignoring the existing distance field
        star_graph.vertices['distance'] = 0
        m5 = tc.shortest_path.create(star_graph, source_vid=0)
        for i in range(1, 11):
            self.assertSequenceEqual(m5.get_path(i), [(0, 0.0), (i, 1.0)])
def test_pickling_sgraph_types(self):
    sg_test_1 = tc.SGraph().add_vertices([
        tc.Vertex(0, {'fluffy': 1}),
        tc.Vertex(1, {'fluffy': 1, 'woof': 1}),
        tc.Vertex(2, {}),
    ])

    sg_test_2 = tc.SGraph()
    sg_test_2 = sg_test_2.add_vertices([tc.Vertex(x) for x in [0, 1, 2]])
    sg_test_2 = sg_test_2.add_edges([
        tc.Edge(0, 1, attr={'relationship': 'dislikes'}),
        tc.Edge(1, 2, attr={'relationship': 'likes'}),
        tc.Edge(1, 0, attr={'relationship': 'likes'}),
    ])

    sgraph_list = [sg_test_1, sg_test_2]
    for obj in sgraph_list:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(obj)
        pickler.close()
        unpickler = gl_pickle.GLUnpickler(self.filename)
        obj_ret = unpickler.load()
        unpickler.close()
        assert_sframe_equal(obj.get_vertices(), obj_ret.get_vertices())
        assert_sframe_equal(obj.get_edges(), obj_ret.get_edges())
def setUp(self):
    self.pr_model = tc.pagerank.create(tc.SGraph())
    self.cc_model = tc.connected_components.create(tc.SGraph())
    self.__remove_file('~/tmp/tmp_model-%d' % temp_number)
    self.__remove_file('./tmp_model-%d' % temp_number)
    self.__remove_file('/tmp/tmp_model-%d' % temp_number)
    self.__remove_file('/tmp/tmp_model2-%d' % temp_number)
def do_load_graph_twitter():
    # Load data
    print("[PB]: Loading Twitter graph")
    if os.path.exists(data_w_path_edges):
        print("[PB]: Loading edges from local path")
        edges = turicreate.SFrame.read_csv(data_w_path_edges, header=False)
        edges = edges.rename({'X1': 'src_node', 'X2': 'dst_node'})
        print(edges)
    else:
        print("[PB-ERROR]: Can't find data! " + data_w_path_edges)
        exit(1)

    if os.path.exists(data_w_path_nodes):
        print("[PB]: Loading nodes from local path")
        nodes = turicreate.SFrame.read_csv(data_w_path_nodes, header=False)
        nodes = nodes.rename({'X1': 'node_id'})
        print(nodes)
    else:
        print("[PB-ERROR]: Can't find nodes data! " + data_w_path_nodes)
        exit(1)

    # Create graph
    sg = turicreate.SGraph()
    sg = sg.add_vertices(nodes, vid_field='node_id')
    sg = sg.add_edges(edges, src_field='src_node', dst_field='dst_node')
    return sg
def test_combination_gl_python_types(self):
    sg_test_1 = tc.SGraph().add_vertices([
        tc.Vertex(1, {'fluffy': 1}),
        tc.Vertex(2, {'fluffy': 1, 'woof': 1}),
        tc.Vertex(3, {}),
    ])
    sarray_test_1 = tc.SArray([1, 2, 3])
    sframe_test_1 = tc.SFrame([1, 2, 3])

    obj_list = [
        [sg_test_1, sframe_test_1, sarray_test_1],
        {0: sg_test_1, 1: sframe_test_1, 2: sarray_test_1},
    ]
    for obj in obj_list:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(obj)
        pickler.close()
        unpickler = gl_pickle.GLUnpickler(self.filename)
        obj_ret = unpickler.load()
        unpickler.close()
        assert_sframe_equal(obj[0].get_vertices(), obj_ret[0].get_vertices())
        assert_sframe_equal(obj[0].get_edges(), obj_ret[0].get_edges())
        assert_sframe_equal(obj[1], obj_ret[1])
        assert list(obj[2]) == list(obj_ret[2])
def do_load_graph_twitter():
    # Load data
    print("[K]: Loading Twitter graph")
    if os.path.exists(data_w_path_edges):
        print("[K]: Loading edges from local path")
        edges = turicreate.SFrame.read_csv(data_w_path_edges, header=False)
        edges = edges.rename({'X1': 'src_node', 'X2': 'dst_node'})
        print(edges)
    else:
        print("[K][ERROR]: Can't find data! " + data_w_path_edges)
        exit(1)

    if os.path.exists(data_w_path_nodes):
        print("[K]: Loading nodes from local path")
        nodes = turicreate.SFrame.read_csv(data_w_path_nodes, header=False)
        nodes = nodes.rename({'X1': 'node_id'})
        print(nodes)
    else:
        print("[K][ERROR]: Can't find nodes data! " + data_w_path_nodes)
        exit(1)

    # Create graph
    sg = turicreate.SGraph()
    sg = sg.add_vertices(nodes, vid_field='node_id')
    sg = sg.add_edges(edges, src_field='src_node', dst_field='dst_node')
    print(sg.summary())
    sys.stdout.flush()
    time.sleep(10)
    return sg
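# For quick local testing without the real Twitter dump, a tiny synthetic
# pair of CSVs is enough to exercise the loader above. A minimal sketch:
# the make_fake_twitter_csvs helper and the /tmp paths are hypothetical,
# not part of the benchmark scripts, and it assumes the module-level
# data_w_path_edges / data_w_path_nodes globals the loader reads.

def make_fake_twitter_csvs(edges_path, nodes_path):
    # Headerless CSVs in the shape read_csv(header=False) expects:
    # edges as "src,dst" pairs, nodes as one id per line.
    with open(edges_path, 'w') as f:
        for src, dst in [(0, 1), (0, 2), (1, 2), (2, 3)]:
            f.write('%d,%d\n' % (src, dst))
    with open(nodes_path, 'w') as f:
        for node_id in range(4):
            f.write('%d\n' % node_id)

data_w_path_edges = '/tmp/fake_twitter_edges.csv'  # illustrative path
data_w_path_nodes = '/tmp/fake_twitter_nodes.csv'  # illustrative path
make_fake_twitter_csvs(data_w_path_edges, data_w_path_nodes)
sg = do_load_graph_twitter()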
def test_pickle_unity_object_exception(self):
    sa = tc.SArray()
    sf = tc.SFrame()
    g = tc.SGraph()
    sk = sa.summary()
    m = tc.pagerank.create(g)
    for obj in [sa, sf, g, sk, m]:
        self.assertRaises(PicklingError, lambda: cloudpickle.dumps(obj))
def test_compute_shortest_path(self):
    edge_src_ids = ['src1', 'src2', 'a', 'b', 'c']
    edge_dst_ids = ['a', 'b', 'dst', 'c', 'dst']
    edges = tc.SFrame({'__src_id': edge_src_ids, '__dst_id': edge_dst_ids})
    g = tc.SGraph().add_edges(edges)
    res = list(tc.shortest_path._compute_shortest_path(g, ["src1", "src2"], "dst"))
    self.assertEqual(res, [["src1", "a", "dst"]])
    res = list(tc.shortest_path._compute_shortest_path(g, "src2", "dst"))
    self.assertEqual(res, [["src2", "b", "c", "dst"]])

    edge_src_ids = [0, 1, 2, 3, 4]
    edge_dst_ids = [2, 3, 5, 4, 5]
    edge_weights = [1, 0.1, 1, 0.1, 0.1]
    g = tc.SFrame({'__src_id': edge_src_ids, '__dst_id': edge_dst_ids,
                   'weights': edge_weights})
    g = tc.SGraph(edges=g)
    t = tc.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights")
    self.assertEqual(t.astype(list)[0], [1, 3, 4, 5])
def test_pickle_unity_object_exception(self):
    sa = tc.SArray()
    sf = tc.SFrame()
    g = tc.SGraph()
    sk = sa.summary()
    m = tc.pagerank.create(g)
    expected_error = TypeError if (version_info[0] == 3) else PicklingError
    for obj in [sa, sf, g, sk, m]:
        self.assertRaises(expected_error, lambda: cloudpickle.dumps(obj))
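# Since unity-backed objects refuse generic pickling (as the test above
# asserts), persistence goes through the toolkit's native save/load path
# instead. A minimal sketch using turicreate's standard model API; the
# /tmp path is illustrative.

import turicreate as tc

g = tc.SGraph().add_edges([tc.Edge(0, 1), tc.Edge(1, 2)])
m = tc.pagerank.create(g)
m.save('/tmp/pagerank_model')           # native on-disk format
m2 = tc.load_model('/tmp/pagerank_model')
print(m2.pagerank.head(3))              # round-tripped model is usable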
def test_compute_shortest_path(self):
    edge_src_ids = ["src1", "src2", "a", "b", "c"]
    edge_dst_ids = ["a", "b", "dst", "c", "dst"]
    edges = tc.SFrame({"__src_id": edge_src_ids, "__dst_id": edge_dst_ids})
    g = tc.SGraph().add_edges(edges)
    res = tc.shortest_path._compute_shortest_path(g, ["src1", "src2"], "dst")
    self.assertEqual(res, [["src1", "a", "dst"]])
    res = tc.shortest_path._compute_shortest_path(g, "src2", "dst")
    self.assertEqual(res, [["src2", "b", "c", "dst"]])

    edge_src_ids = [0, 1, 2, 3, 4]
    edge_dst_ids = [2, 3, 5, 4, 5]
    edge_weights = [1, 0.1, 1, 0.1, 0.1]
    g = tc.SFrame({
        "__src_id": edge_src_ids,
        "__dst_id": edge_dst_ids,
        "weights": edge_weights,
    })
    g = tc.SGraph(edges=g)
    t = tc.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights")
    self.assertEqual(t, [[1, 3, 4, 5]])
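# The weighted expectation above is easy to verify by hand: from the two
# sources {0, 1}, the route 0 -> 2 -> 5 costs 1 + 1 = 2, while
# 1 -> 3 -> 4 -> 5 costs 0.1 + 0.1 + 0.1 = 0.3, so [1, 3, 4, 5] wins.
# A minimal pure-Python Dijkstra over the same edge list (an independent
# sketch, not the toolkit's implementation) reproduces it:

import heapq

def dijkstra_path(edges, sources, target):
    # edges: list of (src, dst, weight); directed, as in the test above
    adj = {}
    for s, d, w in edges:
        adj.setdefault(s, []).append((d, w))
    heap = [(0.0, src, [src]) for src in sources]
    best = {}
    while heap:
        cost, node, path = heapq.heappop(heap)
        if node == target:
            return path
        if node in best and best[node] <= cost:
            continue
        best[node] = cost
        for nxt, w in adj.get(node, []):
            heapq.heappush(heap, (cost + w, nxt, path + [nxt]))
    return None

edges = list(zip([0, 1, 2, 3, 4], [2, 3, 5, 4, 5], [1, 0.1, 1, 0.1, 0.1]))
assert dijkstra_path(edges, [0, 1], 5) == [1, 3, 4, 5]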
def create(dataset, features=None, distance=None, radius=1.,
           min_core_neighbors=10, verbose=True):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given
      neighborhood. Specifically, `min_core_neighbors` must be within
      distance `radius` of a point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have
      too few neighbors to be considered core points, and are further than
      distance `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may
        have additional columns as well.

    features : list[str], optional
        Name of the columns with features to use in comparing records.
        'None' (the default) indicates that all columns of the input
        `dataset` should be used to train the model. All features must be
        numeric, i.e. integer or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows.
        This may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product'
          (deprecated), or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard
          distance functions applied to various features. This is specified
          as a list of distance components, each of which is itself a list
          containing three items:

          1. list or tuple of feature names (str)
          2. standard distance name (str)
          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is
        constructed automatically based on feature types.

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point
        in order for that point to be considered a "core point" of a
        cluster.

    verbose : bool, optional
        If True, print progress updates and model details during model
        creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input
        `dataset`. Also contains the indices of the core points, cluster
        boundary points, and noise points.

    See Also
    --------
    DBSCANModel, turicreate.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on
      the input dataset, which can be a computationally intensive process.
      In the current implementation, some distances are substantially faster
      than others; in particular "euclidean", "squared_euclidean", "cosine",
      and "transformed_dot_product" are quite fast, while composite
      distances can be slow.
    - Any distance function in the Turi Create library may be used with
      DBSCAN, but the results may be poor for distances that violate the
      standard metric properties, i.e. symmetry, non-negativity, triangle
      inequality, and identity of indiscernibles. In particular, the DBSCAN
      algorithm is based on the concept of connecting high-density points
      that are *close* to each other into a single cluster, but the notion
      of *close* may be very counterintuitive if the chosen distance
      function is not a valid metric. The distances "euclidean",
      "manhattan", "jaccard", and "levenshtein" will likely yield the best
      results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_. In Proceedings
      of the Second International Conference on Knowledge Discovery and Data
      Mining. pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model.cluster_id.print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """

    ## Start the training time clock and instantiate an empty model
    logger = _logging.getLogger(__name__)
    start_time = _time.time()

    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

    ## Validate neighborhood parameters
    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError("Input 'min_core_neighbors' must be a non-negative "
                         + "integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError("Input 'radius' must be a non-negative integer "
                         + "or float.")

    ## Compute all-point nearest neighbors within `radius` and count
    #  neighborhood sizes
    knn_model = _tc.nearest_neighbors.create(dataset, features=features,
                                             distance=distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=None, radius=radius,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    neighbor_counts = knn.groupby('query_label', _agg.COUNT)

    ### NOTE: points with NO neighbors are already dropped here!

    ## Identify core points and boundary candidate points. Not all of the
    #  boundary candidates will be boundary points - some are in small
    #  isolated clusters.
    if verbose:
        logger.info("Identifying noise points and core points.")

    boundary_mask = neighbor_counts['Count'] < min_core_neighbors
    core_mask = 1 - boundary_mask  # this includes too-small clusters

    boundary_idx = neighbor_counts[boundary_mask]['query_label']
    core_idx = neighbor_counts[core_mask]['query_label']

    ## Build a similarity graph on the core points
    ## NOTE: careful with singleton core points - the second filter removes
    #  them from the edge set, so they have to be added separately as
    #  vertices.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    core_vertices = knn.filter_by(core_idx, 'query_label')
    core_edges = core_vertices.filter_by(core_idx, 'reference_label')

    core_graph = _tc.SGraph()
    core_graph = core_graph.add_vertices(core_vertices[['query_label']],
                                         vid_field='query_label')
    core_graph = core_graph.add_edges(core_edges, src_field='query_label',
                                      dst_field='reference_label')

    ## Compute core point connected components and relabel to be consecutive
    #  integers
    cc = _tc.connected_components.create(core_graph, verbose=verbose)
    cc_labels = cc.component_size.add_row_number('__label')
    core_assignments = cc.component_id.join(cc_labels, on='component_id',
                                            how='left')[['__id', '__label']]
    core_assignments['type'] = 'core'

    ## Join potential boundary points to core cluster labels (points that
    #  aren't really on a boundary are implicitly dropped)
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = knn.filter_by(boundary_idx, 'query_label')

    # separate real boundary points from points in small isolated clusters
    boundary_core_edges = boundary_edges.filter_by(core_idx,
                                                   'reference_label')

    # join a boundary point to its single closest core point.
    boundary_assignments = boundary_core_edges.groupby(
        'query_label',
        {'reference_label': _agg.ARGMIN('rank', 'reference_label')})

    boundary_assignments = boundary_assignments.join(
        core_assignments, on={'reference_label': '__id'})
    boundary_assignments = boundary_assignments.rename(
        {'query_label': '__id'}, inplace=True)
    boundary_assignments = boundary_assignments.remove_column(
        'reference_label', inplace=True)
    boundary_assignments['type'] = 'boundary'

    ## Identify boundary candidates that turned out to be in small clusters
    #  but not on real cluster boundaries
    small_cluster_idx = set(boundary_idx).difference(
        boundary_assignments['__id'])

    ## Identify individual noise points by the fact that they have no
    #  neighbors.
    noise_idx = set(range(dataset.num_rows())).difference(
        neighbor_counts['query_label'])

    noise_idx = noise_idx.union(small_cluster_idx)

    noise_assignments = _tc.SFrame(
        {'row_id': _tc.SArray(list(noise_idx), int)})
    noise_assignments['cluster_id'] = None
    noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(int)
    noise_assignments['type'] = 'noise'

    ## Append core, boundary, and noise results to each other.
    master_assignments = _tc.SFrame()
    num_clusters = 0

    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename(
            {'__id': 'row_id', '__label': 'cluster_id'}, inplace=True)
        master_assignments = master_assignments.append(core_assignments)
        num_clusters = len(core_assignments['cluster_id'].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename(
            {'__id': 'row_id', '__label': 'cluster_id'}, inplace=True)
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)

    ## Post-processing and formatting
    state = {'verbose': verbose,
             'radius': radius,
             'min_core_neighbors': min_core_neighbors,
             'distance': knn_model.distance,
             'num_distance_components': knn_model.num_distance_components,
             'num_examples': dataset.num_rows(),
             'features': knn_model.features,
             'num_features': knn_model.num_features,
             'unpacked_features': knn_model.unpacked_features,
             'num_unpacked_features': knn_model.num_unpacked_features,
             'cluster_id': master_assignments,
             'num_clusters': num_clusters,
             'training_time': _time.time() - start_time}

    return DBSCANModel(state)
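# The composite distance format described in the docstring above (each
# component is [feature names, distance name, weight]) plugs directly into
# create's `distance` argument. A minimal sketch on synthetic data; the
# column names, weights, and radius below are illustrative choices, not
# recommended settings.

import turicreate as tc

# Two numeric feature blocks we want weighted differently.
sf = tc.SFrame({'x1': [0.0, 0.2, 5.0, 5.1],
                'x2': [1.0, 1.1, 9.0, 9.2],
                'x3': [10., 10., 20., 20.]})

# Weighted sum: euclidean on (x1, x2) at weight 1.0, manhattan on x3 at 0.5.
my_dist = [[['x1', 'x2'], 'euclidean', 1.0],
           [['x3'], 'manhattan', 0.5]]

model = tc.dbscan.create(sf, distance=my_dist, radius=3.0,
                         min_core_neighbors=1, verbose=False)
print(model.cluster_id)  # row_id / cluster_id / type assignments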