def test_shortest_path(self):
        if "sssp" in get_unity().list_toolkit_functions():
            m = tc.shortest_path.create(self.graph, source_vid=0)
            print(m)
            m.summary()
            self.__test_model_save_load_helper__(m)

            m2 = tc.shortest_path.create(self.graph, source_vid=0)
            print(m2)
            self.__test_model_save_load_helper__(m2)

            # Test get_path function on a simple chain graph and star graph
            chain_graph = tc.SGraph().add_edges([tc.Edge(i, i + 1) for i in range(10)])
            m3 = tc.shortest_path.create(chain_graph, source_vid=0)
            for i in range(10):
                self.assertSequenceEqual(m3.get_path(i), [(j, float(j)) for j in range(i + 1)])

            star_graph = tc.SGraph().add_edges([tc.Edge(0, i + 1) for i in range(10)])
            m4 = tc.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m4.get_path(i), [(0, 0.0), (i, 1.0)])

            # Test that get_path with the 'show' parameter set to True doesn't
            # break.
            #
            # Showing is problematic when there is actually a browser.
            # This will pause scripts.
            # m4.get_path(i, show=True)

            # Test sssp ignoring the existing distance field
            star_graph.vertices['distance'] = 0
            m5 = tc.shortest_path.create(star_graph, source_vid=0)
            for i in range(1, 11):
                self.assertSequenceEqual(m5.get_path(i), [(0, 0.0), (i, 1.0)])
Example #2
    def test_pickling_sgraph_types(self):

        sg_test_1 = tc.SGraph().add_vertices([
            tc.Vertex(0, {'fluffy': 1}),
            tc.Vertex(1, {
                'fluffy': 1,
                'woof': 1
            }),
            tc.Vertex(2, {})
        ])

        sg_test_2 = tc.SGraph()
        sg_test_2 = sg_test_2.add_vertices([tc.Vertex(x) for x in [0, 1, 2]])
        sg_test_2 = sg_test_2.add_edges([
            tc.Edge(0, 1, attr={'relationship': 'dislikes'}),
            tc.Edge(1, 2, attr={'relationship': 'likes'}),
            tc.Edge(1, 0, attr={'relationship': 'likes'})
        ])

        sarray_list = [sg_test_1, sg_test_2]
        for obj in sarray_list:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(obj)
            pickler.close()
            unpickler = gl_pickle.GLUnpickler(self.filename)
            obj_ret = unpickler.load()
            unpickler.close()
            assert_sframe_equal(obj.get_vertices(), obj_ret.get_vertices())
            assert_sframe_equal(obj.get_edges(), obj_ret.get_edges())
Example #3
    def setUp(self):

        self.pr_model = tc.pagerank.create(tc.SGraph())
        self.cc_model = tc.connected_components.create(tc.SGraph())

        self.__remove_file('~/tmp/tmp_model-%d' % temp_number)
        self.__remove_file('./tmp_model-%d' % temp_number)
        self.__remove_file('/tmp/tmp_model-%d' % temp_number)
        self.__remove_file('/tmp/tmp_model2-%d' % temp_number)
Example #4
def do_load_graph_twitter():
    # Load data
    print "[PB]: Loading Twitter graph"
    if os.path.exists(data_w_path_edges):
        print "[PB]: Loading graph from local path"
        edges = turicreate.SFrame.read_csv(data_w_path_edges, header=False)
        edges = edges.rename({'X1': 'src_node', 'X2': 'dst_node'})
        print edges
    else:
        print "[PB-ERROR]: Can't find data! " + data_w_path_edges
        exit(1)

    if os.path.exists(data_w_path_nodes):
        print "[PB]: Loading graph from local path"
        nodes = turicreate.SFrame.read_csv(data_w_path_nodes, header=False)
        nodes = nodes.rename({'X1': 'node_id'})
        print nodes
    else:
        print "[PB-ERROR]: Can't find nodes data! " + data_w_path_nodes
        exit(1)

    # Create graph
    sg = turicreate.SGraph()
    sg = sg.add_vertices(nodes, vid_field='node_id')
    sg = sg.add_edges(edges, src_field='src_node', dst_field='dst_node')
    return sg
Example #5
    def test_combination_gl_python_types(self):

        sg_test_1 = tc.SGraph().add_vertices([
            tc.Vertex(1, {'fluffy': 1}),
            tc.Vertex(2, {
                'fluffy': 1,
                'woof': 1
            }),
            tc.Vertex(3, {})
        ])
        sarray_test_1 = tc.SArray([1, 2, 3])
        sframe_test_1 = tc.SFrame([1, 2, 3])

        obj_list = [[sg_test_1, sframe_test_1, sarray_test_1], {
            0: sg_test_1,
            1: sframe_test_1,
            2: sarray_test_1
        }]

        for obj in obj_list:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(obj)
            pickler.close()
            unpickler = gl_pickle.GLUnpickler(self.filename)
            obj_ret = unpickler.load()
            unpickler.close()
            assert_sframe_equal(obj[0].get_vertices(),
                                obj_ret[0].get_vertices())
            assert_sframe_equal(obj[0].get_edges(), obj_ret[0].get_edges())
            assert_sframe_equal(obj[1], obj_ret[1])
            assert list(obj[2]) == list(obj_ret[2])
def do_load_graph_twitter():
    # Load data
    print("[K]: Loading Twitter graph")
    if os.path.exists(data_w_path_edges):
        print("[K]: Loading graph from local path")
        edges = turicreate.SFrame.read_csv(data_w_path_edges, header=False)
        edges = edges.rename({'X1': 'src_node', 'X2': 'dst_node'})
        print(edges)
    else:
        print("[K][ERROR]: Can't find data! " + data_w_path_edges)
        exit(1)

    if os.path.exists(data_w_path_nodes):
        print("[K]: Loading graph from local path")
        nodes = turicreate.SFrame.read_csv(data_w_path_nodes, header=False)
        nodes = nodes.rename({'X1': 'node_id'})
        print(nodes)
    else:
        print("[K][ERROR]: Can't find nodes data! " + data_w_path_nodes)
        exit(1)

    # Create graph
    sg = turicreate.SGraph()
    sg = sg.add_vertices(nodes, vid_field='node_id')
    sg = sg.add_edges(edges, src_field='src_node', dst_field='dst_node')
    print(sg.summary())
    sys.stdout.flush()

    time.sleep(10)
    return sg
    def test_pickle_unity_object_exception(self):
        sa = tc.SArray()
        sf = tc.SFrame()
        g = tc.SGraph()
        sk = sa.summary()
        m = tc.pagerank.create(g)
        for obj in [sa, sf, g, sk, m]:
            self.assertRaises(PicklingError, lambda: cloudpickle.dumps(obj))
    def test_compute_shortest_path(self):
        edge_src_ids = ['src1', 'src2', 'a', 'b', 'c']
        edge_dst_ids = ['a', 'b', 'dst', 'c', 'dst']
        edges = tc.SFrame({'__src_id': edge_src_ids, '__dst_id': edge_dst_ids})
        g = tc.SGraph().add_edges(edges)
        res = list(tc.shortest_path._compute_shortest_path(g, ["src1", "src2"], "dst"))
        self.assertEqual(res, [["src1", "a", "dst"]])
        res = list(tc.shortest_path._compute_shortest_path(g, "src2", "dst"))
        self.assertEqual(res, [["src2", "b", "c", "dst"]])

        edge_src_ids = [0, 1, 2, 3, 4]
        edge_dst_ids = [2, 3, 5, 4, 5]
        edge_weights = [1, 0.1, 1, 0.1, 0.1]
        g = tc.SFrame({'__src_id': edge_src_ids, '__dst_id': edge_dst_ids, 'weights': edge_weights})
        g = tc.SGraph(edges=g)
        t = tc.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights")
        self.assertEqual(t.astype(list)[0], [1, 3, 4, 5])
    def test_pickle_unity_object_exception(self):
        sa = tc.SArray()
        sf = tc.SFrame()
        g = tc.SGraph()
        sk = sa.summary()
        m = tc.pagerank.create(g)

        expected_error = TypeError if (version_info[0] == 3) else PicklingError

        for obj in [sa, sf, g, sk, m]:
            self.assertRaises(expected_error, lambda: cloudpickle.dumps(obj))
Example #10
    def test_compute_shortest_path(self):
        edge_src_ids = ["src1", "src2", "a", "b", "c"]
        edge_dst_ids = ["a", "b", "dst", "c", "dst"]
        edges = tc.SFrame({"__src_id": edge_src_ids, "__dst_id": edge_dst_ids})
        g = tc.SGraph().add_edges(edges)
        res = tc.shortest_path._compute_shortest_path(g, ["src1", "src2"],
                                                      "dst")
        self.assertEqual(res, [["src1", "a", "dst"]])
        res = tc.shortest_path._compute_shortest_path(g, "src2", "dst")
        self.assertEqual(res, [["src2", "b", "c", "dst"]])

        edge_src_ids = [0, 1, 2, 3, 4]
        edge_dst_ids = [2, 3, 5, 4, 5]
        edge_weights = [1, 0.1, 1, 0.1, 0.1]
        g = tc.SFrame({
            "__src_id": edge_src_ids,
            "__dst_id": edge_dst_ids,
            "weights": edge_weights,
        })
        g = tc.SGraph(edges=g)
        t = tc.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights")
        self.assertEqual(t, [[1, 3, 4, 5]])
Example #11
def create(dataset, features=None, distance=None, radius=1.,
           min_core_neighbors=10, verbose=True):
    """
    Create a DBSCAN clustering model. The DBSCAN method partitions the input
    dataset into three types of points, based on the estimated probability
    density at each point.

    - **Core** points have a large number of points within a given neighborhood.
      Specifically, `min_core_neighbors` must be within distance `radius` of a
      point for it to be considered a core point.

    - **Boundary** points are within distance `radius` of a core point, but
      don't have sufficient neighbors of their own to be considered core.

    - **Noise** points comprise the remainder of the data. These points have too
      few neighbors to be considered core points, and are further than distance
      `radius` from all core points.

    Clusters are formed by connecting core points that are neighbors of each
    other, then assigning boundary points to their nearest core neighbor's
    cluster.
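
    For example, with `radius=1.0` and `min_core_neighbors=3`, a point with at
    least three other points within distance 1.0 is a core point; a point that
    is within distance 1.0 of some core point but has fewer than three such
    neighbors of its own is a boundary point; everything else is noise.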

    Parameters
    ----------
    dataset : SFrame
        Training data, with each row corresponding to an observation. Must
        include all features specified in the `features` parameter, but may have
        additional columns as well.

    features : list[str], optional
        Name of the columns with features to use in comparing records. 'None'
        (the default) indicates that all columns of the input `dataset` should
        be used to train the model. All features must be numeric, i.e. integer
        or float types.

    distance : str or list[list], optional
        Function to measure the distance between any two input data rows. This
        may be one of two types:

        - *String*: the name of a standard distance function. One of
          'euclidean', 'squared_euclidean', 'manhattan', 'levenshtein',
          'jaccard', 'weighted_jaccard', 'cosine', 'dot_product' (deprecated),
          or 'transformed_dot_product'.

        - *Composite distance*: the weighted sum of several standard distance
          functions applied to various features. This is specified as a list of
          distance components, each of which is itself a list containing three
          items:

          1. list or tuple of feature names (str)

          2. standard distance name (str)

          3. scaling factor (int or float)

        For more information about Turi Create distance functions, please
        see the :py:mod:`~turicreate.toolkits.distances` module.

        For sparse vectors, missing keys are assumed to have value 0.0.

        If 'distance' is left unspecified, a composite distance is constructed
        automatically based on feature types.
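
        For example, a composite distance that applies Euclidean distance to
        the features 'x1' and 'x2' with a scaling factor of 1.0 would be
        written as ``[[['x1', 'x2'], 'euclidean', 1.0]]`` (the feature names
        here are only illustrative).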

    radius : int or float, optional
        Size of each point's neighborhood, with respect to the specified
        distance function.

    min_core_neighbors : int, optional
        Number of neighbors that must be within distance `radius` of a point in
        order for that point to be considered a "core point" of a cluster.

    verbose : bool, optional
        If True, print progress updates and model details during model creation.

    Returns
    -------
    out : DBSCANModel
        A model containing a cluster label for each row in the input `dataset`.
        Also contains the indices of the core points, cluster boundary points,
        and noise points.

    See Also
    --------
    DBSCANModel, turicreate.toolkits.distances

    Notes
    -----
    - Our implementation of DBSCAN first computes the similarity graph on the
      input dataset, which can be a computationally intensive process. In the
      current implementation, some distances are substantially faster than
      others; in particular "euclidean", "squared_euclidean", "cosine", and
      "transformed_dot_product" are quite fast, while composite distances can be
      slow.

    - Any distance function in the Turi Create library may be used with DBSCAN, but
      the results may be poor for distances that violate the standard metric
      properties, i.e. symmetry, non-negativity, triangle inequality, and
      identity of indiscernibles. In particular, the DBSCAN algorithm is based
      on the concept of connecting high-density points that are *close* to each
      other into a single cluster, but the notion of *close* may be very
      counterintuitive if the chosen distance function is not a valid metric.
      The distances "euclidean", "manhattan", "jaccard", and "levenshtein" will
      likely yield the best results.

    References
    ----------
    - Ester, M., et al. (1996) `A Density-Based Algorithm for Discovering
      Clusters in Large Spatial Databases with Noise
      <https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf>`_. In Proceedings of the
      Second International Conference on Knowledge Discovery and Data Mining.
      pp. 226-231.

    - `Wikipedia - DBSCAN <https://en.wikipedia.org/wiki/DBSCAN>`_

    - `Visualizing DBSCAN Clustering
      <http://www.naftaliharris.com/blog/visualizing-dbscan-clustering/>`_

    Examples
    --------
    >>> sf = turicreate.SFrame({
    ...     'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
    ...            8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    ...     'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
    ...            5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})
    ...
    >>> model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3)
    >>> model.cluster_id.print_rows(15)
    +--------+------------+----------+
    | row_id | cluster_id |   type   |
    +--------+------------+----------+
    |   8    |     0      |   core   |
    |   7    |     2      |   core   |
    |   0    |     1      |   core   |
    |   2    |     2      |   core   |
    |   3    |     1      |   core   |
    |   11   |     2      |   core   |
    |   4    |     2      |   core   |
    |   1    |     0      | boundary |
    |   6    |     0      | boundary |
    |   5    |     0      | boundary |
    |   9    |     0      | boundary |
    |   12   |     2      | boundary |
    |   10   |     1      | boundary |
    |   13   |     1      | boundary |
    +--------+------------+----------+
    [14 rows x 3 columns]
    """

    ## Start the training time clock and set up logging
    logger = _logging.getLogger(__name__)
    start_time = _time.time()


    ## Validate the input dataset
    _tkutl._raise_error_if_not_sframe(dataset, "dataset")
    _tkutl._raise_error_if_sframe_empty(dataset, "dataset")


    ## Validate neighborhood parameters
    if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0:
        raise ValueError("Input 'min_core_neighbors' must be a non-negative " +
                         "integer.")

    if not isinstance(radius, (int, float)) or radius < 0:
        raise ValueError("Input 'radius' must be a non-negative integer " +
                         "or float.")


    ## Compute all-point nearest neighbors within `radius` and count
    #  neighborhood sizes
    knn_model = _tc.nearest_neighbors.create(dataset, features=features,
                                             distance=distance,
                                             method='brute_force',
                                             verbose=verbose)

    knn = knn_model.similarity_graph(k=None, radius=radius,
                                     include_self_edges=False,
                                     output_type='SFrame',
                                     verbose=verbose)

    neighbor_counts = knn.groupby('query_label', _agg.COUNT)


    ### NOTE: points with NO neighbors are already dropped here!

    ## Identify core points and boundary candidate points. Not all of the
    #  boundary candidates will be boundary points - some are in small isolated
    #  clusters.
    if verbose:
        logger.info("Identifying noise points and core points.")

    boundary_mask = neighbor_counts['Count'] < min_core_neighbors
    core_mask = 1 - boundary_mask

    # this includes too small clusters
    boundary_idx = neighbor_counts[boundary_mask]['query_label']
    core_idx = neighbor_counts[core_mask]['query_label']


    ## Build a similarity graph on the core points
    ## NOTE: careful with singleton core points - the second filter removes them
    #  from the edge set so they have to be added separately as vertices.
    if verbose:
        logger.info("Constructing the core point similarity graph.")

    core_vertices = knn.filter_by(core_idx, 'query_label')
    core_edges = core_vertices.filter_by(core_idx, 'reference_label')

    core_graph = _tc.SGraph()
    core_graph = core_graph.add_vertices(core_vertices[['query_label']],
                                         vid_field='query_label')
    core_graph = core_graph.add_edges(core_edges, src_field='query_label',
                                      dst_field='reference_label')


    ## Compute core point connected components and relabel to be consecutive
    #  integers
    cc = _tc.connected_components.create(core_graph, verbose=verbose)
    cc_labels = cc.component_size.add_row_number('__label')
    core_assignments = cc.component_id.join(cc_labels, on='component_id',
                                               how='left')[['__id', '__label']]
    core_assignments['type'] = 'core'


    ## Join potential boundary points to core cluster labels (points that aren't
    #  really on a boundary are implicitly dropped)
    if verbose:
        logger.info("Processing boundary points.")

    boundary_edges = knn.filter_by(boundary_idx, 'query_label')

    # separate real boundary points from points in small isolated clusters
    boundary_core_edges = boundary_edges.filter_by(core_idx, 'reference_label')

    # join a boundary point to its single closest core point.
    boundary_assignments = boundary_core_edges.groupby('query_label',
                    {'reference_label': _agg.ARGMIN('rank', 'reference_label')})

    boundary_assignments = boundary_assignments.join(core_assignments,
                                                 on={'reference_label': '__id'})

    boundary_assignments = boundary_assignments.rename({'query_label': '__id'}, inplace=True)
    boundary_assignments = boundary_assignments.remove_column('reference_label', inplace=True)
    boundary_assignments['type'] = 'boundary'


    ## Identify boundary candidates that turned out to be in small clusters but
    #  not on real cluster boundaries
    small_cluster_idx = set(boundary_idx).difference(
                                                   boundary_assignments['__id'])


    ## Identify individual noise points by the fact that they have no neighbors.
    noise_idx = set(range(dataset.num_rows())).difference(
                                                 neighbor_counts['query_label'])

    noise_idx = noise_idx.union(small_cluster_idx)

    noise_assignments = _tc.SFrame({'row_id': _tc.SArray(list(noise_idx), int)})
    noise_assignments['cluster_id'] = None
    noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(int)
    noise_assignments['type'] = 'noise'


    ## Append core, boundary, and noise results to each other.
    master_assignments = _tc.SFrame()
    num_clusters = 0

    if core_assignments.num_rows() > 0:
        core_assignments = core_assignments.rename({'__id': 'row_id',
                                                    '__label': 'cluster_id'}, inplace=True)
        master_assignments = master_assignments.append(core_assignments)
        num_clusters = len(core_assignments['cluster_id'].unique())

    if boundary_assignments.num_rows() > 0:
        boundary_assignments = boundary_assignments.rename({'__id': 'row_id',
                                                       '__label': 'cluster_id'}, inplace=True)
        master_assignments = master_assignments.append(boundary_assignments)

    if noise_assignments.num_rows() > 0:
        master_assignments = master_assignments.append(noise_assignments)


    ## Post-processing and formatting
    state = {'verbose': verbose,
             'radius': radius,
             'min_core_neighbors': min_core_neighbors,
             'distance': knn_model.distance,
             'num_distance_components': knn_model.num_distance_components,
             'num_examples': dataset.num_rows(),
             'features': knn_model.features,
             'num_features': knn_model.num_features,
             'unpacked_features': knn_model.unpacked_features,
             'num_unpacked_features': knn_model.num_unpacked_features,
             'cluster_id': master_assignments,
             'num_clusters': num_clusters,
             'training_time': _time.time() - start_time}

    return DBSCANModel(state)
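

# A minimal usage sketch of the `create` function above; this is not part of
# the library source. It reuses the toy data from the docstring example, and
# the variable names are illustrative. The `cluster_id` and `num_clusters`
# fields follow the state dictionary constructed above, and the 'type' column
# takes the values 'core', 'boundary', or 'noise' as described in the docstring.
import turicreate

sf = turicreate.SFrame({
    'x1': [0.6777, -9.391, 7.0385, 2.2657, 7.7864, -10.16, -8.162,
           8.8817, -9.525, -9.153, 2.0860, 7.6619, 6.5511, 2.7020],
    'x2': [5.6110, 8.5139, 5.3913, 5.4743, 8.3606, 7.8843, 2.7305,
           5.1679, 6.7231, 3.7051, 1.7682, 7.4608, 3.1270, 6.5624]})

model = turicreate.dbscan.create(sf, radius=4.25, min_core_neighbors=3,
                                 verbose=False)

# Split the per-row assignments by point type.
assignments = model.cluster_id
core_points = assignments.filter_by(['core'], 'type')
boundary_points = assignments.filter_by(['boundary'], 'type')
noise_points = assignments.filter_by(['noise'], 'type')

print('%d clusters, %d core, %d boundary, %d noise points' %
      (model.num_clusters, core_points.num_rows(),
       boundary_points.num_rows(), noise_points.num_rows()))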