Beispiel #1
0
    def __init__(self, incoming_graph_data=None, default_label=None, **attr):
        """Initialize the graph, optionally loading data into it.

        Args:
            incoming_graph_data: Optional data to populate the graph with. When
                ``self._is_gs_graph`` accepts it, it is loaded through
                ``_init_with_arrow_property_graph``; otherwise it is handed to
                ``to_networkx_graph`` with this instance as ``create_using``.
            default_label: Stored as ``self._default_label``.
            **attr: Graph attributes, merged into ``self.graph`` at the end.
                Two keys are consumed first and never stored as attributes:
                ``create_empty_in_engine`` (default ``True``) and ``dist``
                (default ``False``).
        """

        # Re-bind the factories on the instance (presumably they are
        # class-level attributes -- confirm against the class body).
        self.graph_attr_dict_factory = self.graph_attr_dict_factory
        self.node_dict_factory = self.node_dict_factory
        self.adjlist_dict_factory = self.adjlist_dict_factory
        self.graph = self.graph_attr_dict_factory()
        self.cache = self.graph_cache_factory(self)

        # init node and adj (must be after cache)
        self._node = self.node_dict_factory(self)
        self._adj = self.adjlist_dict_factory(self)
        self._pred = self.adjlist_dict_factory(self, pred=True)
        # successors share the very same object as the adjacency structure
        self._succ = self._adj

        self._key = None
        self._op = None
        self._session_id = None
        # same instance re-binding as for the factories above
        self._graph_type = self._graph_type
        self._schema = GraphSchema()
        self._schema.init_nx_schema()

        # cache for add_node and add_edge
        self._add_node_cache = []
        self._add_edge_cache = []
        self._remove_node_cache = []
        self._remove_edge_cache = []

        create_empty_in_engine = attr.pop("create_empty_in_engine",
                                          True)  # a hidden parameter
        # "dist" is likewise removed from attr before attr reaches self.graph
        self._distributed = attr.pop("dist", False)
        if incoming_graph_data is not None and self._is_gs_graph(
                incoming_graph_data):
            # convert from gs graph always use distributed mode
            self._distributed = True
            # NOTE(review): self._session is read before this method ever
            # assigns it, so it is presumably a class-level default -- confirm.
            if self._session is None:
                self._session = get_session_by_id(
                    incoming_graph_data.session_id)
        self._default_label = default_label

        # still no session: fall back to the default-session lookup
        if self._session is None:
            self._try_to_get_default_session()

        # Anything that is not a gs graph gets an empty engine-side graph
        # (unless the hidden parameter disabled it); only its key is kept.
        if not self._is_gs_graph(
                incoming_graph_data) and create_empty_in_engine:
            graph_def = empty_graph_in_engine(self, self.is_directed(),
                                              self._distributed)
            self._key = graph_def.key

        # attempt to load graph with data
        if incoming_graph_data is not None:
            if self._is_gs_graph(incoming_graph_data):
                self._init_with_arrow_property_graph(incoming_graph_data)
                self.cache.warmup()
            else:
                g = to_networkx_graph(incoming_graph_data, create_using=self)
                check_argument(isinstance(g, Graph))

        # load graph attributes (must be after to_networkx_graph)
        self.graph.update(attr)
        self._saved_signature = self.signature
Beispiel #2
0
    def __init__(self, session_id, incoming_data=None):
        """Construct a :class:`Graph` object.

        Args:
            session_id (str): Session id of the session the graph is created in.
            incoming_data: Source the graph is initialized from, one of:

                - :class:`GraphDef`
                - :class:`nx.Graph`
                - :class:`Graph`
                - :class:`vineyard.Object`, :class:`vineyard.ObjectID` or
                  :class:`vineyard.ObjectName`

        Raises:
            ValueError: If ``incoming_data`` is none of the supported types
                (this includes the default ``None``).
        """

        # Don't import the :code:`NXGraph` in top-level statements to improve the
        # performance of :code:`import graphscope`.
        from graphscope.experimental.nx.classes.graph import Graph as NXGraph

        self._key = None
        self._op = None
        self._graph_type = None
        # NOTE(review): the value copied from the graph definition below goes
        # to `_directed`; this public `directed` looks like a leftover --
        # confirm before removing it.
        self.directed = False
        self._vineyard_id = 0
        self._schema = GraphSchema()

        self._session_id = session_id
        self._detached = False

        self._interactive_instance_list = []
        self._learning_instance_list = []

        if isinstance(incoming_data, GraphDef):
            graph_def = incoming_data
        elif isinstance(incoming_data, NXGraph):
            graph_def = self._from_nx_graph(incoming_data)
        elif isinstance(incoming_data, Graph):
            graph_def = self._copy_from(incoming_data)
        elif isinstance(
                incoming_data,
            (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName)):
            graph_def = self._from_vineyard(incoming_data)
        else:
            # Exceptions do not interpolate logging-style arguments, so the
            # message has to be formatted before it is raised.
            raise ValueError(
                "Failed to create a graph on graphscope engine: %s" %
                (incoming_data, ))

        if graph_def:
            self._key = graph_def.key
            self._vineyard_id = graph_def.vineyard_id
            self._graph_type = graph_def.graph_type
            self._directed = graph_def.directed
            self._schema.get_schema_from_def(graph_def.schema_def)
            self._schema_path = graph_def.schema_path
            # init saved_signature (must be after init schema)
            self._saved_signature = self.signature
Beispiel #3
0
class Graph:
    """Handle that pairs a graph schema with a write connection.

    Every mutating method converts its records with ``to_write_requests_pb``
    and submits the result through the connection's ``batch_write``,
    returning whatever that call returns.
    """

    def __init__(self, graph_def, conn=None) -> None:
        """Build the schema from ``graph_def`` and remember ``conn``.

        Args:
            graph_def: Definition handed to ``GraphSchema.from_graph_def``.
            conn: Connection used for all writes; also stored on the schema.
        """
        schema = GraphSchema()
        self._schema = schema
        schema.from_graph_def(graph_def)
        self._conn: Connection = conn
        self._schema._conn = conn

    def _submit(self, element_kind, records, operation):
        """Send ``records`` of ``element_kind`` as one batch write."""
        payload = to_write_requests_pb(element_kind, records, operation)
        return self._conn.batch_write(payload)

    def schema(self):
        """Return the schema built at construction time."""
        return self._schema

    # -- vertices -----------------------------------------------------------

    def insert_vertex(self, vertex: VertexRecordKey, properties: dict):
        """Insert one vertex; a single-element :meth:`insert_vertices`."""
        single = [[vertex, properties]]
        return self.insert_vertices(single)

    def insert_vertices(self, vertices: list):
        """Insert ``vertices``, a list of ``[key, properties]`` entries."""
        return self._submit("VERTEX", vertices, write_service_pb2.INSERT)

    def update_vertex_properties(self, vertex: VertexRecordKey,
                                 properties: dict):
        """Update the properties of one vertex."""
        return self._submit("VERTEX", [[vertex, properties]],
                            write_service_pb2.UPDATE)

    def delete_vertex(self, vertex_pk: VertexRecordKey):
        """Delete one vertex; a single-element :meth:`delete_vertices`."""
        single = [vertex_pk]
        return self.delete_vertices(single)

    def delete_vertices(self, vertex_pks: list):
        """Delete the vertices identified by ``vertex_pks``."""
        # Deletions carry no properties, hence the empty dict per key.
        return self._submit("VERTEX", [[pk, {}] for pk in vertex_pks],
                            write_service_pb2.DELETE)

    # -- edges --------------------------------------------------------------

    def insert_edge(self, edge: EdgeRecordKey, properties: dict):
        """Insert one edge; a single-element :meth:`insert_edges`."""
        single = [[edge, properties]]
        return self.insert_edges(single)

    def insert_edges(self, edges: list):
        """Insert ``edges``, a list of ``[key, properties]`` entries."""
        return self._submit("EDGE", edges, write_service_pb2.INSERT)

    def update_edge_properties(self, edge: EdgeRecordKey, properties: dict):
        """Update the properties of one edge."""
        return self._submit("EDGE", [[edge, properties]],
                            write_service_pb2.UPDATE)

    def delete_edge(self, edge: EdgeRecordKey):
        """Delete one edge; a single-element :meth:`delete_edges`."""
        single = [edge]
        return self.delete_edges(single)

    def delete_edges(self, edge_pks: list):
        """Delete the edges identified by ``edge_pks``."""
        # Deletions carry no properties, hence the empty dict per key.
        return self._submit("EDGE", [[pk, {}] for pk in edge_pks],
                            write_service_pb2.DELETE)
Beispiel #4
0
    def __init__(
        self,
        graph_node,
    ):
        """Construct a :class:`Graph` object from a graph node.

        Args:
            graph_node: Node whose ``session`` and ``op`` are used. Its ``op``
                is replaced by a deep copy, the node is flagged as evaluated,
                and the op is added to the session's DAG.
        """
        self._graph_node = graph_node
        node = self._graph_node
        self._session = node.session

        # Work on a private copy of the op, mark the node evaluated, then
        # register the op with the session's DAG.
        op_copy = deepcopy(node.op)
        node.op = op_copy
        node.evaluated = True
        self._session.dag.add_op(node.op)

        # Metadata defaults.
        self._schema = GraphSchema()
        self._vineyard_id = 0
        self._key = None
        self._detached = False

        # Interactive / learning instances attached to this graph.
        self._interactive_instance_list = []
        self._learning_instance_list = []
        self._interactive_instance_launching_thread = None
Beispiel #5
0
    def __init__(
        self,
        session,
        incoming_data=None,
        oid_type="int64",
        directed=True,
        generate_eid=True,
    ):
        """Construct a :class:`Graph` object.

        Args:
            session: Session the graph is created in; kept on ``self._session``.
            incoming_data: Optional source the graph is built from, one of:

                    - :class:`Operation`
                    - :class:`nx.Graph`
                    - :class:`Graph`
                    - :class:`vineyard.Object`, :class:`vineyard.ObjectID` or :class:`vineyard.ObjectName`
            oid_type (str): Passed through ``utils.normalize_data_type_str``;
                the normalized value must be ``"int64_t"`` or ``"std::string"``.
            directed (bool): Stored on ``self._directed``. Defaults to True.
            generate_eid (bool): Stored on ``self._generate_eid``. Defaults to True.

        Raises:
            ValueError: If the normalized ``oid_type`` is not supported.
            RuntimeError: If ``incoming_data`` is of an unsupported type.
        """

        # Metadata defaults.
        self._key = None
        self._vineyard_id = 0
        self._graph_type = types_pb2.ARROW_PROPERTY
        self._schema = GraphSchema()

        self._session = session
        self._detached = False

        # Interactive / learning instances attached to this graph.
        self._interactive_instance_list = []
        self._learning_instance_list = []
        self._interactive_instance_launching_thread = None

        # Base graph of a modify operation, referenced here so that it is
        # not garbage collected.
        self._base_graph = None
        # Uncompleted operation, held for lazy evaluation.
        self._pending_op = None

        normalized_oid = utils.normalize_data_type_str(oid_type)
        if normalized_oid not in ("int64_t", "std::string"):
            raise ValueError("oid_type can only be int64_t or string.")
        self._oid_type = normalized_oid
        self._generate_eid = generate_eid
        self._directed = directed

        # Kept locally to display the schema without loading into vineyard,
        # and to sanity-check newly added vertices and edges.
        self._v_labels = []
        self._e_labels = []
        self._e_relationships = []
        self._unsealed_vertices = {}
        self._unsealed_edges = {}

        if incoming_data is None:
            return

        # Imported here rather than at module level to keep
        # :code:`import graphscope` fast.
        from graphscope.experimental import nx

        if isinstance(incoming_data, Operation):
            self._pending_op = incoming_data
            is_projection = self._pending_op.type == types_pb2.PROJECT_GRAPH
            if is_projection:
                self._graph_type = types_pb2.ARROW_PROJECTED
            return
        if isinstance(incoming_data, nx.Graph):
            self._pending_op = self._from_nx_graph(incoming_data)
            return
        if isinstance(incoming_data, Graph):
            self._pending_op = self._copy_from(incoming_data)
            return
        if isinstance(
                incoming_data,
            (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName)):
            self._pending_op = self._from_vineyard(incoming_data)
            return
        raise RuntimeError("Not supported incoming data.")
Beispiel #6
0
class Graph(object):
    """A class for representing metadata of a graph in the GraphScope.

    A :class:`Graph` object holds the metadata of a graph, such as its key, its schema, and whether the graph is directed.

    It is worth noting that the graph is stored by the backend such as Analytical Engine, Vineyard.
    In other words, the graph object holds nothing but metadata.

    The following example demonstrates its usage:

    .. code:: python

        >>> import graphscope as gs
        >>> from graphscope.framework.loader import Loader
        >>> sess = gs.session()
        >>> graph = sess.g()
        >>> graph = graph.add_vertices("person.csv","person")
        >>> graph = graph.add_vertices("software.csv", "software")
        >>> graph = graph.add_edges("knows.csv", "knows", src_label="person", dst_label="person")
        >>> graph = graph.add_edges("created.csv", "created", src_label="person", dst_label="software")
        >>> print(graph)
        >>> print(graph.schema)
    """
    def __init__(
        self,
        session,
        incoming_data=None,
        oid_type="int64",
        directed=True,
        generate_eid=True,
    ):
        """Construct a :class:`Graph` object.

        Args:
            session: Session the graph is created in; kept on ``self._session``.
            incoming_data: Optional source the graph is built from, one of:

                    - :class:`Operation`
                    - :class:`nx.Graph`
                    - :class:`Graph`
                    - :class:`vineyard.Object`, :class:`vineyard.ObjectID` or :class:`vineyard.ObjectName`
            oid_type (str): Passed through ``utils.normalize_data_type_str``;
                the normalized value must be ``"int64_t"`` or ``"std::string"``.
            directed (bool): Stored on ``self._directed``. Defaults to True.
            generate_eid (bool): Stored on ``self._generate_eid``. Defaults to True.

        Raises:
            ValueError: If the normalized ``oid_type`` is not supported.
            RuntimeError: If ``incoming_data`` is of an unsupported type.
        """

        # Metadata defaults.
        self._key = None
        self._vineyard_id = 0
        self._graph_type = types_pb2.ARROW_PROPERTY
        self._schema = GraphSchema()

        self._session = session
        self._detached = False

        # Interactive / learning instances attached to this graph.
        self._interactive_instance_list = []
        self._learning_instance_list = []
        self._interactive_instance_launching_thread = None

        # Base graph of a modify operation, referenced here so that it is
        # not garbage collected.
        self._base_graph = None
        # Uncompleted operation, held for lazy evaluation.
        self._pending_op = None

        normalized_oid = utils.normalize_data_type_str(oid_type)
        if normalized_oid not in ("int64_t", "std::string"):
            raise ValueError("oid_type can only be int64_t or string.")
        self._oid_type = normalized_oid
        self._generate_eid = generate_eid
        self._directed = directed

        # Kept locally to display the schema without loading into vineyard,
        # and to sanity-check newly added vertices and edges.
        self._v_labels = []
        self._e_labels = []
        self._e_relationships = []
        self._unsealed_vertices = {}
        self._unsealed_edges = {}

        if incoming_data is None:
            return

        # Imported here rather than at module level to keep
        # :code:`import graphscope` fast.
        from graphscope.experimental import nx

        if isinstance(incoming_data, Operation):
            self._pending_op = incoming_data
            is_projection = self._pending_op.type == types_pb2.PROJECT_GRAPH
            if is_projection:
                self._graph_type = types_pb2.ARROW_PROJECTED
            return
        if isinstance(incoming_data, nx.Graph):
            self._pending_op = self._from_nx_graph(incoming_data)
            return
        if isinstance(incoming_data, Graph):
            self._pending_op = self._copy_from(incoming_data)
            return
        if isinstance(
                incoming_data,
            (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName)):
            self._pending_op = self._from_vineyard(incoming_data)
            return
        raise RuntimeError("Not supported incoming data.")

    def __del__(self):
        # cleanly ignore all exceptions, cause session may already closed / destroyed.
        try:
            self.unload()
        except Exception:  # pylint: disable=broad-except
            pass

    def _close_interactive_instances(self):
        # Close related interactive instances when graph unloaded.
        # Since the graph is gone, quering via interactive client is meaningless.
        for instance in self._interactive_instance_list:
            instance.close()
        self._interactive_instance_list.clear()

    def _close_learning_instances(self):
        for instance in self._learning_instance_list:
            instance.close()
        self._learning_instance_list.clear()

    def _launch_interactive_instance_impl(self):
        try:
            self._session.gremlin(self)
        except:  # noqa: E722
            # Record error msg in `InteractiveQuery` when launching failed.
            # Unexpect and suppress all exceptions here.
            pass

    def _from_graph_def(self, graph_def):
        """Copy the metadata carried by ``graph_def`` onto this object.

        Args:
            graph_def: Graph definition whose ``graph_type`` must equal this
                graph's type (enforced with ``check_argument``).
        """
        types_match = self._graph_type == graph_def.graph_type
        check_argument(types_match, "Graph type doesn't match.")

        self._key = graph_def.key
        self._vineyard_id = graph_def.vineyard_id
        schema_def = graph_def.schema_def
        self._oid_type = schema_def.oid_type
        self._directed = graph_def.directed
        self._generate_eid = graph_def.generate_eid
        self._schema_path = graph_def.schema_path

        # Rebuild the schema, then mirror its labels and relationships.
        schema = self._schema
        schema.get_schema_from_def(graph_def.schema_def)
        self._v_labels = schema.vertex_labels
        self._e_labels = schema.edge_labels
        self._e_relationships = schema.edge_relationships

    def _ensure_loaded(self):
        if self._key is not None and self._pending_op is None:
            return
        # Unloaded
        if self._session is None:
            raise RuntimeError("The graph is not loaded")
        # Empty graph
        if self._key is None and self._pending_op is None:
            raise RuntimeError("Empty graph.")
        # Try to load
        if self._pending_op is not None:
            # Create a graph from scratch.
            graph_def = self._pending_op.eval()
            self._from_graph_def(graph_def)
            self._pending_op = None
            self._base_graph = None
            self._unsealed_vertices.clear()
            self._unsealed_edges.clear()
            # init saved_signature (must be after init schema)
            self._saved_signature = self.signature
            # create gremlin server pod asynchronously
            if gs_config.initializing_interactive_engine:
                self._interactive_instance_launching_thread = threading.Thread(
                    target=self._launch_interactive_instance_impl, args=())
                self._interactive_instance_launching_thread.start()

    @property
    def key(self):
        """The key of the corresponding graph in engine."""
        self._ensure_loaded()
        return self._key

    @property
    def graph_type(self):
        """The type of the graph object.

        Returns:
            type (`types_pb2.GraphType`): the type of the graph.
        """
        return self._graph_type

    @property
    def schema(self):
        """Schema of the graph.

        Returns:
            :class:`GraphSchema`: the schema of the graph
        """
        self._ensure_loaded()
        return self._schema

    @property
    def schema_path(self):
        """Path that Coordinator will write interactive schema path to.

        Returns:
            str: The path contains the schema. for interactive engine.
        """
        self._ensure_loaded()
        return self._schema_path

    @property
    def signature(self):
        self._ensure_loaded()
        return hashlib.sha256("{}.{}".format(
            self._schema.signature(), self._key).encode("utf-8")).hexdigest()

    @property
    def template_str(self):
        """C++ fragment template string matching this graph's type.

        Raises:
            ValueError: If the graph type has no known template.
        """
        self._ensure_loaded()

        # All four type strings are resolved up front, whichever template
        # ends up being used.
        # transform str/string to std::string
        oid_type = utils.normalize_data_type_str(self._oid_type)
        vid_type = self._schema.vid_type
        vdata_type = utils.data_type_to_cpp(self._schema.vdata_type)
        edata_type = utils.data_type_to_cpp(self._schema.edata_type)

        if self._graph_type == types_pb2.ARROW_PROPERTY:
            return f"vineyard::ArrowFragment<{oid_type},{vid_type}>"
        if self._graph_type == types_pb2.ARROW_PROJECTED:
            return f"gs::ArrowProjectedFragment<{oid_type},{vid_type},{vdata_type},{edata_type}>"
        if self._graph_type == types_pb2.DYNAMIC_PROJECTED:
            return f"gs::DynamicProjectedFragment<{vdata_type},{edata_type}>"
        raise ValueError(f"Unsupported graph type: {self._graph_type}")

    @property
    def vineyard_id(self):
        """Get the vineyard object_id of this graph.

        Returns:
            str: return vineyard id of this graph
        """
        self._ensure_loaded()
        return self._vineyard_id

    @property
    def session_id(self):
        """Get the currrent session_id.

        Returns:
            str: Return session id that the graph belongs to.
        """
        return self._session.session_id

    def detach(self):
        """Detaching a graph makes it being left in vineyard even when the varaible for
        this :class:`Graph` object leaves the lexical scope.

        The graph can be accessed using the graph's :code:`ObjectID` or its name later.
        """
        self._detached = True

    def loaded(self):
        try:
            self._ensure_loaded()
        except RuntimeError:
            return False
        return self._key is not None

    def __str__(self):
        """Describe the graph: its type, vertex labels and edge relationships.

        Only the locally kept labels are read, so this does not load the graph.
        """
        v_str = "\n".join(f"VERTEX: {label}" for label in self._v_labels)

        edge_lines = []
        for idx, label in enumerate(self._e_labels):
            # Relationships are stored per edge label, at the same index.
            for src, dst in self._e_relationships[idx]:
                edge_lines.append(f"EDGE: {label}\tsrc: {src}\tdst: {dst}")
        e_str = "\n".join(edge_lines)

        return f"graphscope.Graph\n{types_pb2.GraphType.Name(self._graph_type)}\n{v_str}\n{e_str}"

    def __repr__(self):
        return self.__str__()

    def unload(self):
        """Unload this graph from graphscope engine."""
        if self._session is None:
            raise RuntimeError("The graph is not loaded")

        if self._key is None:
            self._session = None
            self._pending_op = None
            return

        # close interactive instances first
        try:
            if (self._interactive_instance_launching_thread is not None and
                    self._interactive_instance_launching_thread.is_alive()):
                # join raises a RuntimeError if an attempt is made to join the current thread.
                # this exception occurs when a object collected by gc mechanism contains a running thread.
                if (threading.current_thread() !=
                        self._interactive_instance_launching_thread):
                    self._interactive_instance_launching_thread.join()
            self._close_interactive_instances()
        except Exception as e:
            logger.error("Failed to close interactive instances: %s" % e)
        try:
            self._close_learning_instances()
        except Exception as e:
            logger.error("Failed to close learning instances: %s" % e)
        if not self._detached:
            op = dag_utils.unload_graph(self)
            op.eval()
        self._key = None
        self._session = None
        self._pending_op = None

    def project_to_simple(self,
                          v_label="_",
                          e_label="_",
                          v_prop=None,
                          e_prop=None):
        """Project a property graph to a simple graph, useful for analytical engine.

        Labels and properties may be given by name or by integer index; names
        are translated to the indices used internally by the engine.

        Args:
            v_label (str or int, optional): vertex label to project. Defaults to "_".
            e_label (str or int, optional): edge label to project. Defaults to "_".
            v_prop (str or int, optional): vertex property of the v_label.
                Defaults to None, which is encoded as index -1.
            e_prop (str or int, optional): edge property of the e_label.
                Defaults to None, which is encoded as index -1.

        Returns:
            :class:`Graph`: A new `Graph` built from the projection operation
            (expected to be of type `ARROW_PROJECTED`).

        Raises:
            ValueError: If a schema lookup of a label or property raises
                ``ValueError``, or if the schema has no
                ``v_label -> e_label <- v_label`` relationship.
            IndexError: If an integer label or property index is out of range.
        """
        self._ensure_loaded()
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)

        self._check_unmodified()

        schema = self._schema

        def require_in_range(index, size):
            if index < 0 or index >= size:
                raise IndexError("id {} is out of range.".format(index))

        # Resolve both labels to an (index, name) pair.
        try:
            if not isinstance(v_label, str):
                v_label_id = v_label
                require_in_range(v_label_id, schema.vertex_label_num)
                v_label = schema.vertex_labels[v_label_id]
            else:
                v_label_id = schema.vertex_label_index(v_label)

            if not isinstance(e_label, str):
                e_label_id = e_label
                require_in_range(e_label_id, schema.edge_label_num)
                e_label = schema.edge_labels[e_label_id]
            else:
                e_label_id = schema.edge_label_index(e_label)
        except ValueError as e:
            raise ValueError("Label does not exists.") from e

        # Check relation v_label -> e_label <- v_label exists.
        if (v_label, v_label) not in schema.edge_relationships[e_label_id]:
            raise ValueError(
                f"Graph doesn't contain such relationship: {v_label} -> {e_label} <- {v_label}."
            )

        # Resolve the optional properties to an index and a data type.
        try:
            v_prop_id, vdata_type = -1, None
            if v_prop is not None:
                if isinstance(v_prop, str):
                    v_prop_id = schema.vertex_property_index(
                        v_label_id, v_prop)
                else:
                    v_prop_id = v_prop
                vertex_props = schema.vertex_properties[v_label_id]
                require_in_range(v_prop_id, len(vertex_props))
                vdata_type = list(vertex_props.values())[v_prop_id]

            e_prop_id, edata_type = -1, None
            if e_prop is not None:
                if isinstance(e_prop, str):
                    e_prop_id = schema.edge_property_index(e_label_id, e_prop)
                else:
                    e_prop_id = e_prop
                edge_props = schema.edge_properties[e_label_id]
                require_in_range(e_prop_id, len(edge_props))
                edata_type = list(edge_props.values())[e_prop_id]
        except ValueError as e:
            raise ValueError("Property does not exists.") from e

        projection = dag_utils.project_arrow_property_graph(
            self,
            v_label_id,
            v_prop_id,
            e_label_id,
            e_prop_id,
            vdata_type,
            edata_type,
            schema.oid_type,
            schema.vid_type,
        )
        return Graph(self._session, projection)

    def add_column(self, results, selector):
        """Add the results as a column to the graph. Modification rules are given by the selector.

        Args:
            results (:class:`Context`): A `Context` that created by doing a query.
            selector (dict): Select results to add as column. Each value is
                transformed with ``results._transform_selector``.

        Returns:
            :class:`Graph`: A new `Graph` built from the add-column operation.
        """
        self._ensure_loaded()
        check_argument(isinstance(selector, Mapping),
                       "selector of add column must be a dict")
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        self._check_unmodified()

        transformed = {}
        for name, raw in selector.items():
            transformed[name] = results._transform_selector(raw)
        # The operation receives the selector as a JSON string.
        encoded_selector = json.dumps(transformed)
        add_op = dag_utils.add_column(self, results, encoded_selector)
        return Graph(self._session, add_op)

    def to_numpy(self, selector, vertex_range=None):
        """Select some elements of the graph and output to numpy.

        Args:
            selector (str): Select a portion of graph as a numpy.ndarray.
            vertex_range(dict, optional): Slice vertices. Defaults to None.

        Returns:
            `numpy.ndarray`
        """
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        self._ensure_loaded()
        self._check_unmodified()

        data_selector = utils.transform_labeled_vertex_property_data_selector(
            self, selector)
        range_attr = utils.transform_vertex_range(vertex_range)
        # The operation is evaluated right away; its result is then decoded.
        encoded = dag_utils.graph_to_numpy(self, data_selector,
                                           range_attr).eval()
        return utils.decode_numpy(encoded)

    def to_dataframe(self, selector, vertex_range=None):
        """Select some elements of the graph and output as a pandas.DataFrame

        Args:
            selector (dict): Select some portions of graph. Each value is
                transformed with
                ``utils.transform_labeled_vertex_property_data_selector``.
            vertex_range (dict, optional): Slice vertices. Defaults to None.

        Returns:
            `pandas.DataFrame`
        """
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        self._ensure_loaded()
        self._check_unmodified()
        # The message used to name `to_vineyard_dataframe`, which is not the
        # method being called here.
        check_argument(
            isinstance(selector, Mapping),
            "selector of to_dataframe must be a dict",
        )
        selector = {
            key:
            utils.transform_labeled_vertex_property_data_selector(self, value)
            for key, value in selector.items()
        }
        # The operation receives the selector as a JSON string.
        selector = json.dumps(selector)
        vertex_range = utils.transform_vertex_range(vertex_range)

        op = dag_utils.graph_to_dataframe(self, selector, vertex_range)
        ret = op.eval()
        return utils.decode_dataframe(ret)

    def is_directed(self):
        self._ensure_loaded()
        return self._directed

    def _check_unmodified(self):
        """Check that the current signature still equals the saved one."""
        self._ensure_loaded()
        unchanged = self.signature == self._saved_signature
        check_argument(unchanged, "Graph has been modified!")

    def _from_nx_graph(self, incoming_graph):
        """Create a gs graph from a nx graph.
        Args:
            incoming_graph (:class:`nx.graph`): A nx graph that contains graph data.

        Returns:
            that will be used to construct a gs.Graph

        Raises:
            TypeError: Raise Error if graph type not match.

        Examples:
            >>> nx_g = nx.path_graph(10)
            >>> gs_g = gs.Graph(nx_g)
        """
        if hasattr(incoming_graph, "_graph"):
            msg = "graph view can not convert to gs graph"
            raise TypeError(msg)
        return dag_utils.dynamic_to_arrow(incoming_graph)

    def _copy_from(self, incoming_graph):
        """Create the copy operation for an existing graph.

        Args:
            incoming_graph (:class:`Graph`): Source graph to be copied from;
                it must be a loaded graph of type `ARROW_PROPERTY`.

        Returns:
            The result of ``dag_utils.copy_graph``, which the constructor
            stores as the pending operation.
        """
        is_property_graph = (
            incoming_graph.graph_type == types_pb2.ARROW_PROPERTY)
        check_argument(is_property_graph)
        check_argument(incoming_graph.loaded())
        return dag_utils.copy_graph(incoming_graph)

    def _from_vineyard(self, vineyard_object):
        """Create the load operation for a graph that already exists in vineyard.

        Args:
            vineyard_object (:class:`vineyard.Object`, :class:`vineyard.ObjectID`
                            or :class:`vineyard.ObjectName`): vineyard object,
                            which represents a graph.

        Returns:
            The result of :meth:`_from_vineyard_id` or
            :meth:`_from_vineyard_name`, or None for any other argument type.
        """
        created = None
        if isinstance(vineyard_object, vineyard.Object):
            # A full object is loaded through its id.
            created = self._from_vineyard_id(vineyard_object.id)
        elif isinstance(vineyard_object, vineyard.ObjectID):
            created = self._from_vineyard_id(vineyard_object)
        elif isinstance(vineyard_object, vineyard.ObjectName):
            created = self._from_vineyard_name(vineyard_object)
        return created

    def _from_vineyard_id(self, vineyard_id):
        """Create the operation that loads an `ARROW_PROPERTY` graph by vineyard id.

        Args:
            vineyard_id: Id of the graph in vineyard; converted with ``int``.

        Returns:
            The result of ``dag_utils.create_graph``.
        """
        attrs = {
            types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(True),
            types_pb2.VINEYARD_ID: utils.i_to_attr(int(vineyard_id)),
            # FIXME(hetao) hardcode oid/vid type for codegen, when loading from vineyard
            #
            # the metadata should be retrieved from vineyard
            types_pb2.OID_TYPE: utils.s_to_attr("int64_t"),
            types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"),
        }
        return dag_utils.create_graph(self.session_id,
                                      types_pb2.ARROW_PROPERTY,
                                      attrs=attrs)

    def _from_vineyard_name(self, vineyard_name):
        """Create the operation that loads an `ARROW_PROPERTY` graph by vineyard name.

        Args:
            vineyard_name: Name of the graph in vineyard; converted with ``str``.

        Returns:
            The result of ``dag_utils.create_graph``.
        """
        attrs = {
            types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(True),
            types_pb2.VINEYARD_NAME: utils.s_to_attr(str(vineyard_name)),
            # FIXME(hetao) hardcode oid/vid type for codegen, when loading from vineyard
            #
            # the metadata should be retrieved from vineyard
            types_pb2.OID_TYPE: utils.s_to_attr("int64_t"),
            types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"),
        }
        return dag_utils.create_graph(self.session_id,
                                      types_pb2.ARROW_PROPERTY,
                                      attrs=attrs)

    def _attach_interactive_instance(self, instance):
        """Remember an interactive instance that was started on this graph.

        Args:
            instance: The interactive instance to keep track of.
        """
        tracked = self._interactive_instance_list
        tracked.append(instance)

    def _attach_learning_instance(self, instance):
        """Remember a learning instance that was created on this graph.

        Args:
            instance: The learning instance to keep track of.
        """
        tracked = self._learning_instance_list
        tracked.append(instance)

    def save_to(self, path, **kwargs):
        """Serialize the graph to a location.

        ``_ensure_loaded()`` is called first, then the graph's vineyard object
        is handed to ``vineyard.io.serialize`` as a "global" object. The dump
        can be restored with `load_from`, also from other sessions.

        Each worker will write a `path_{worker_id}.meta` file and
        a `path_{worker_id}` file to storage.

        Args:
            path (str): supported storages are local, hdfs, oss, s3
            **kwargs: passed to vineyard unchanged as ``storage_options``.
        """
        import vineyard
        import vineyard.io

        self._ensure_loaded()
        session = self._session
        on_k8s = session.info["type"] == "k8s"
        deployment = "kubernetes" if on_k8s else "ssh"
        engine_config = session.info["engine_config"]
        rpc_endpoint = engine_config["vineyard_rpc_endpoint"]
        ipc_socket = engine_config["vineyard_socket"]

        # Any session type other than "k8s" uses the plain host list.
        hosts = session.info["engine_hosts"].split(",")
        if on_k8s:
            # On kubernetes every entry is prefixed with the namespace.
            hosts = [
                "{}:{}".format(session.info["namespace"], host) for host in hosts
            ]

        vineyard.io.serialize(
            path,
            vineyard.ObjectID(self._vineyard_id),
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )

    @classmethod
    def load_from(cls, path, sess, **kwargs):
        """Construct a `Graph` by deserializing the files found at `path`.

        It reads the serialization files that were dumped by `save_to`.
        If any of those files is missing or broken, loading errors out.

        Args:
            path (str): Path that contains the serialization files.
            sess (`graphscope.Session`): The target session
                that the graph will be constructed in.
            **kwargs: passed to vineyard unchanged as ``storage_options``.

        Returns:
            `Graph`: A new graph object. Schema and data are supposed to be
                identical to those of the graph that was serialized.
        """
        import vineyard
        import vineyard.io

        on_k8s = sess.info["type"] == "k8s"
        deployment = "kubernetes" if on_k8s else "ssh"
        engine_config = sess.info["engine_config"]
        rpc_endpoint = engine_config["vineyard_rpc_endpoint"]
        ipc_socket = engine_config["vineyard_socket"]

        # Any session type other than "k8s" uses the plain host list.
        hosts = sess.info["engine_hosts"].split(",")
        if on_k8s:
            # On kubernetes every entry is prefixed with the namespace.
            hosts = ["{}:{}".format(sess.info["namespace"], host) for host in hosts]

        restored_id = vineyard.io.deserialize(
            path,
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
        return cls(sess, vineyard.ObjectID(restored_id))

    def draw(self, vertices, hop=1):
        """Visualize graph data in the result cell via an ipygraphin model.

        Args:
            vertices (list): selected vertices.
            hop (int): draw induced subgraph with hop extension. Defaults to 1.

        Returns:
            A GraphModel.
        """
        # Imported lazily, so ipygraphin is only needed when drawing.
        from ipygraphin import GraphModel

        self._ensure_loaded()
        gremlin_client = self._session.gremlin(self)

        model = GraphModel()
        model.queryGraphData(vertices, hop, gremlin_client)

        # listen on the 1~2 hops operation of node
        model.on_msg(model.queryNeighbor)
        return model

    def _construct_graph(self,
                         vertices,
                         edges,
                         v_labels,
                         e_labels,
                         e_relations,
                         mutation_func=None):
        """Construct a new graph from pending vertex and edge definitions.

           1. Construct a graph from scratch (no `mutation_func`).
              If both vertices and edges are empty, return an empty graph.
           2. Construct a graph on top of an already built graph
              (`mutation_func` given).
              If both vertices and edges are empty, return a copied graph.

        Args:
            vertices: Pending vertex definitions; must support ``.values()``
                (presumably a dict keyed by label - confirm with callers).
            edges: Pending edge definitions; must support ``.values()``.
            v_labels: Vertex labels stored on the new graph as `_v_labels`.
            e_labels: Edge labels stored on the new graph as `_e_labels`.
            e_relations: Stored on the new graph as `_e_relationships`.
            mutation_func (callable, optional): Called as
                ``mutation_func(self, attrs=config)`` to create the op.
                When falsy, ``dag_utils.create_graph`` is used instead.
                Defaults to None.

        Returns:
            Graph: The newly constructed graph object.
        """
        op_attrs = graph_utils.assemble_op_config(
            vertices.values(),
            edges.values(),
            self._oid_type,
            self._directed,
            self._generate_eid,
        )

        # Edge case: there is nothing to build from.
        if not (vertices or edges):
            if mutation_func:
                # Rely on `self._key`
                return Graph(self._session, self)
            return Graph(
                self._session,
                None,
                self._oid_type,
                self._directed,
                self._generate_eid,
            )

        op = (mutation_func(self, attrs=op_attrs)
              if mutation_func else dag_utils.create_graph(
                  self.session_id, types_pb2.ARROW_PROPERTY, attrs=op_attrs))

        result = Graph(self._session, op, self._oid_type, self._directed,
                       self._generate_eid)
        result._unsealed_vertices = vertices
        result._unsealed_edges = edges
        result._v_labels = v_labels
        result._e_labels = e_labels
        result._e_relationships = e_relations
        if mutation_func:
            # A mutated graph remembers the graph it was derived from; reuse
            # the existing base graph if this graph already has one.
            result._base_graph = self._base_graph or self
        return result

    def add_vertices(self, vertices, label="_", properties=[], vid_field=0):
        """Return a new graph that additionally contains one vertex label.

        Args:
            vertices: Vertex data source, passed to `VertexLabel` as `loader`.
            label (str, optional): Name of the new vertex label. Defaults to "_".
            properties (optional): Passed to `VertexLabel` unchanged.
                NOTE(review): the default list object is shared between calls;
                that is only safe while nothing mutates it - confirm.
            vid_field (optional): Passed to `VertexLabel` unchanged.
                Defaults to 0.

        Raises:
            ValueError: If `label` already exists, or if the graph has edge
                labels but no vertex labels (vertices were inferred).

        Returns:
            The graph returned by `_construct_graph`.
        """
        # The graph counts as "existed" when the pending (unsealed)
        # definitions do not account for every known label.
        vertices_differ = len(self._unsealed_vertices) != len(self._v_labels)
        is_from_existed_graph = vertices_differ or (
            len(self._unsealed_edges) != len(self._e_labels))

        if label in self._v_labels:
            raise ValueError(f"Label {label} already existed in graph.")
        if self._e_labels and not self._v_labels:
            raise ValueError(
                "Cannot manually add vertices after inferred vertices.")

        pending_vertices = deepcopy(self._unsealed_vertices)
        pending_vertices[label] = VertexLabel(label=label,
                                              loader=vertices,
                                              properties=properties,
                                              vid_field=vid_field)
        new_v_labels = deepcopy(self._v_labels)
        new_v_labels.append(label)

        # Load after validity check and before create add_vertices op.
        # TODO(zsy): Add ability to add vertices and edges to existed graph simultaneously.
        if is_from_existed_graph:
            if self._unsealed_edges:
                self._ensure_loaded()
            mutation = dag_utils.add_vertices
        else:
            mutation = None

        return self._construct_graph(
            pending_vertices,
            self._unsealed_edges,
            new_v_labels,
            self._e_labels,
            self._e_relationships,
            mutation,
        )

    def add_edges(
        self,
        edges,
        label="_",
        properties=[],
        src_label=None,
        dst_label=None,
        src_field=0,
        dst_field=1,
    ):
        """Return a new graph that additionally contains the given edges.

        1. Add edges to an uninitialized graph.

            i.   src_label and dst_label both unspecified. In this case the current
                 graph must have 0 vertex labels (both labels are set to '_'),
                 or 1 vertex label (both labels are set to that label).
            ii.  src_label and dst_label both specified and existing in the current
                 graph's vertex labels.
            iii. src_label and dst_label both specified and there are no vertex labels
                 in the current graph; all vertex labels are deduced from edge tables.
                 Note that you either provide all vertex labels, or let graphscope
                 deduce all vertex labels. Mixed style is not supported.

        2. Add edges to an existed graph.
            Must add a new kind of edge label, not a new relation to the built graph.
            But you can add a new relation to the uninitialized part of the graph.
            src_label and dst_label must be specified and exist in the current graph.

        Args:
            edges: Edge data source, passed to `EdgeSubLabel` unchanged.
            label (str, optional): Edge label name. Defaults to "_".
            properties (optional): Passed to `EdgeSubLabel` unchanged.
                NOTE(review): the default list object is shared between calls;
                that is only safe while nothing mutates it - confirm.
            src_label (optional): Label of source vertices. Defaults to None.
            dst_label (optional): Label of destination vertices. Defaults to None.
            src_field (optional): Passed to `EdgeSubLabel`. Defaults to 0.
            dst_field (optional): Passed to `EdgeSubLabel`. Defaults to 1.

        Raises:
            ValueError: If the label combination violates the rules above.
            AssertionError: If a pending edge label is missing from `_e_labels`.

            `check_argument` additionally rejects ambiguous vertex labels and
            `src_field == dst_field`; the exception type is defined by that helper.

        Returns:
            Graph: The graph returned by `_construct_graph`.
        """
        # The graph counts as "existed" when the pending (unsealed)
        # definitions do not account for every known label.
        vertices_differ = len(self._unsealed_vertices) != len(self._v_labels)
        is_from_existed_graph = vertices_differ or (
            len(self._unsealed_edges) != len(self._e_labels))

        if is_from_existed_graph:
            if label in self._e_labels and label not in self._unsealed_edges:
                raise ValueError("Cannot add new relation to existed graph.")
            if src_label is None or dst_label is None:
                raise ValueError("src label and dst label cannot be None.")
            if not (src_label in self._v_labels
                    and dst_label in self._v_labels):
                raise ValueError(
                    "src label or dst_label not existed in graph.")
        elif src_label is None and dst_label is None:
            check_argument(
                len(self._v_labels) <= 1, "ambiguous vertex label")
            # Zero labels: fall back to "_"; one label: use it for both ends.
            inferred = self._v_labels[0] if len(self._v_labels) == 1 else "_"
            src_label = dst_label = inferred
        elif src_label is None or dst_label is None:
            # Exactly one of the two labels was given.
            raise ValueError(
                "src and dst label must be both specified or either unspecified."
            )
        elif self._v_labels and not (src_label in self._v_labels
                                     and dst_label in self._v_labels):
            # With no vertex labels at all, every label is inferred from the
            # edge tables later, so there is nothing to validate here.
            raise ValueError(
                "src label or dst_label not existed in graph.")

        check_argument(src_field != dst_field,
                       "src and dst field cannot refer to the same field")

        pending_edges = deepcopy(self._unsealed_edges)
        new_e_labels = deepcopy(self._e_labels)
        new_relations = deepcopy(self._e_relationships)
        endpoints = (src_label, dst_label)
        if label in pending_edges:
            assert label in self._e_labels
            # Will check conflict in `add_sub_label`
            new_relations[self._e_labels.index(label)].append(endpoints)
            edge_label = pending_edges[label]
        else:
            new_e_labels.append(label)
            new_relations.append([endpoints])
            edge_label = EdgeLabel(label)
        sub_label = EdgeSubLabel(edges, properties, src_label, dst_label,
                                 src_field, dst_field)
        edge_label.add_sub_label(sub_label)
        pending_edges[label] = edge_label

        # Load after validity check and before create add_edges op.
        # TODO(zsy): Add ability to add vertices and edges to existed graph simultaneously.
        if is_from_existed_graph:
            if self._unsealed_vertices:
                self._ensure_loaded()
            mutation = dag_utils.add_edges
        else:
            mutation = None

        return self._construct_graph(
            self._unsealed_vertices,
            pending_edges,
            self._v_labels,
            new_e_labels,
            new_relations,
            mutation,
        )
Beispiel #7
0
class Graph(object):
    """A class for representing metadata of a graph in the GraphScope.

    A :class:`Graph` object holds the metadata of a graph, such as key, schema, and whether the graph is directed or not.

    It is worth noting that the graph is stored by the backend such as Analytical Engine, Vineyard.
    In other words, the graph object holds nothing but metadata.

    The graph object should not be created directly from :class:`Graph`.
    Instead, the graph should be created by `Session.load_from`

    The following example demonstrates its usage:

    .. code:: python

        >>> import graphscope as gs
        >>> from graphscope.framework.loader import Loader
        >>> sess = gs.session()
        >>> g = sess.load_from(
        ...     edges={
        ...         "knows": (
        ...             Loader("{}/p2p-31_property_e_0".format(property_dir), header_row=True),
        ...             ["src_label_id", "dst_label_id", "dist"],
        ...             ("src_id", "person"),
        ...             ("dst_id", "person"),
        ...         ),
        ...     },
        ...     vertices={
        ...         "person": Loader(
        ...             "{}/p2p-31_property_v_0".format(property_dir), header_row=True
        ...         ),
        ...     }
        ... )
    """

    def __init__(self, session_id, incoming_data=None):
        """Construct a :class:`Graph` object.

        Args:
            session_id (str): Session id of the session the graph is created in.
            incoming_data: Graph can be initialized through various type of sources,
                which can be one of:
                    - :class:`GraphDef`
                    - :class:`nx.Graph`
                    - :class:`Graph`
                    - :class:`VineyardObject`

        Raises:
            ValueError: If `incoming_data` is none of the supported types
                (this includes the default ``None``).
        """

        # Don't import the :code:`NXGraph` in top-level statements to improve the
        # performance of :code:`import graphscope`.
        from graphscope.experimental.nx.classes.graph import Graph as NXGraph

        self._key = None
        self._op = None
        self._graph_type = None
        # `is_directed()` reads `_directed`, so it must exist even when no
        # graph definition gets applied below.
        self._directed = False
        # Legacy public attribute, kept so existing readers do not break.
        # It is never updated; use `is_directed()` instead.
        self.directed = False
        self._vineyard_id = 0
        self._schema = GraphSchema()
        # Initialised here so `schema_path` does not raise AttributeError
        # when no graph definition gets applied below.
        self._schema_path = None
        self._saved_signature = None

        self._session_id = session_id
        self._detached = False

        self._interactive_instance_list = []
        self._learning_instance_list = []

        if isinstance(incoming_data, GraphDef):
            graph_def = incoming_data
        elif isinstance(incoming_data, NXGraph):
            graph_def = self._from_nx_graph(incoming_data)
        elif isinstance(incoming_data, Graph):
            graph_def = self._copy_from(incoming_data)
        elif isinstance(incoming_data, VineyardObject):
            graph_def = self._from_vineyard(incoming_data)
        else:
            # Exceptions do not interpolate extra arguments the way logging
            # calls do, so the message is formatted explicitly.
            raise ValueError(
                "Failed to create a graph on graphscope engine: {}".format(
                    incoming_data
                )
            )

        if graph_def:
            self._key = graph_def.key
            self._vineyard_id = graph_def.vineyard_id
            self._graph_type = graph_def.graph_type
            self._directed = graph_def.directed
            self._schema.get_schema_from_def(graph_def.schema_def)
            self._schema_path = graph_def.schema_path
            # init saved_signature (must be after init schema)
            self._saved_signature = self.signature

    def __del__(self):
        # cleanly ignore all exceptions, cause session may already closed / destroyed.
        try:
            self.unload()
        except Exception:  # pylint: disable=broad-except
            pass

    def _close_interactive_instances(self):
        # Close related interactive instances when graph unloaded.
        # Since the graph is gone, querying via interactive client is meaningless.
        for instance in self._interactive_instance_list:
            instance.close()
        self._interactive_instance_list.clear()

    def _close_learning_instances(self):
        # Close related learning instances when graph unloaded.
        for instance in self._learning_instance_list:
            instance.close()
        self._learning_instance_list.clear()

    @property
    def op(self):
        """The DAG op of this graph."""
        return self._op

    @property
    def key(self):
        """The key of the corresponding graph in engine."""
        return self._key

    @property
    def graph_type(self):
        """The type of the graph object.

        Returns:
            type (`types_pb2.GraphType`): the type of the graph.
        """
        return self._graph_type

    @property
    def schema(self):
        """Schema of the graph.

        Returns:
            :class:`GraphSchema`: the schema of the graph
        """
        return self._schema

    @property
    def schema_path(self):
        """Path that Coordinator will write interactive schema path to.

        Returns:
            str: The path contains the schema. for interactive engine.
                None if no graph definition has been applied.
        """
        return self._schema_path

    @property
    def signature(self):
        """SHA-256 hex digest of the schema signature combined with the graph key.

        Raises:
            RuntimeError: If the graph has no key.
        """
        if self._key is None:
            raise RuntimeError("graph should be registered in remote.")
        return hashlib.sha256(
            "{}.{}".format(self._schema.signature(), self._key).encode("utf-8")
        ).hexdigest()

    @property
    def template_sigature(self):
        """SHA-256 hex digest of the graph type and the schema's oid/vid/vdata/edata types.

        Raises:
            RuntimeError: If the graph has no key.
        """
        if self._key is None:
            raise RuntimeError("graph should be registered in remote.")
        return hashlib.sha256(
            "{}.{}.{}.{}.{}".format(
                self._graph_type,
                self._schema.oid_type,
                self._schema.vid_type,
                self._schema.vdata_type,
                self._schema.edata_type,
            ).encode("utf-8")
        ).hexdigest()

    @property
    def vineyard_id(self):
        """Get the vineyard object_id of this graph.

        Returns:
            The vineyard id taken from the graph definition
            (0 if no graph definition has been applied).
        """
        return self._vineyard_id

    @property
    def session_id(self):
        """Get the current session_id.

        Returns:
            str: Return session id that the graph belongs to.
        """
        return self._session_id

    def detach(self):
        """Detaching a graph makes it being left in vineyard even when the variable for
        this :class:`Graph` object leaves the lexical scope.

        The graph can be accessed using the graph's :code:`ObjectID` or its name later.
        """
        self._detached = True

    def loaded(self):
        """Return True while the graph has a key."""
        return self._key is not None

    def __repr__(self):
        return "<grape.Graph '%s'>" % self._key

    def unload(self):
        """Unload this graph from graphscope engine.

        Attached interactive and learning instances are closed first; a
        failure to close them is logged and does not stop the unload. A
        detached graph is not unloaded from the engine, only its key is
        forgotten.

        Raises:
            RuntimeError: If the graph is not loaded.
        """
        if not self.loaded():
            raise RuntimeError("The graph is not registered in remote.")
        # close interactive instances first
        try:
            self._close_interactive_instances()
        except Exception as e:
            logger.error("Failed to close interactive instances: %s", e)
        try:
            self._close_learning_instances()
        except Exception as e:
            logger.error("Failed to close learning instances: %s", e)
        if not self._detached:
            op = unload_graph(self)
            op.eval()
        self._key = None

    def project_to_simple(self, v_label="_", e_label="_", v_prop=None, e_prop=None):
        """Project a property graph to a simple graph, useful for analytical engine.
        Will translate name represented label or property to index, which is broadly used
        in internal engine.

        Args:
            v_label (str, optional): vertex label to project. Defaults to "_".
            e_label (str, optional): edge label to project. Defaults to "_".
            v_prop (str, optional): vertex property of the v_label. Defaults to None.
            e_prop (str, optional): edge property of the e_label. Defaults to None.

        Labels and properties may also be given as integer indices.

        Raises:
            RuntimeError: If the graph is not loaded.
            KeyError: If an integer label or property index is out of range.
            ValueError: If a lookup by name raises ValueError for the vertex
                label or for a vertex/edge property.
            InvalidArgumentError: If the lookup by name raises ValueError for
                the edge label.

        Returns:
            :class:`Graph`: A `Graph` instance, which graph_type is `ARROW_PROJECTED`
        """
        if not self.loaded():
            raise RuntimeError(
                "The graph is not registered in remote, and can't project to simple"
            )
        self.check_unmodified()
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        check_argument(isinstance(v_label, (int, str)))
        check_argument(isinstance(e_label, (int, str)))

        def check_out_of_range(index, length):
            # Integer labels/properties are used as-is once bounds-checked.
            if -1 < index < length:
                return index
            raise KeyError("id {} is out of range.".format(index))

        try:
            v_label_id = (
                check_out_of_range(v_label, self._schema.vertex_label_num)
                if isinstance(v_label, int)
                else self._schema.vertex_label_index(v_label)
            )
        except ValueError as e:
            raise ValueError(
                "graph not contains the vertex label {}.".format(v_label)
            ) from e

        try:
            e_label_id = (
                check_out_of_range(e_label, self._schema.edge_label_num)
                if isinstance(e_label, int)
                else self._schema.edge_label_index(e_label)
            )
        except ValueError as e:
            raise InvalidArgumentError(
                "graph not contains the edge label {}.".format(e_label)
            ) from e

        if v_prop is None:
            # NB: -1 means vertex property is None
            v_prop_id = -1
            v_properties = None
        else:
            check_argument(isinstance(v_prop, (int, str)))
            v_properties = self._schema.vertex_properties[v_label_id]
            try:
                v_prop_id = (
                    check_out_of_range(v_prop, len(v_properties))
                    if isinstance(v_prop, int)
                    else self._schema.vertex_property_index(v_label_id, v_prop)
                )
            except ValueError as e:
                raise ValueError(
                    "vertex label {} not contains the property {}".format(
                        v_label, v_prop
                    )
                ) from e

        if e_prop is None:
            # NB: -1 means edge property is None
            e_prop_id = -1
            e_properties = None
        else:
            check_argument(isinstance(e_prop, (int, str)))
            e_properties = self._schema.edge_properties[e_label_id]
            try:
                e_prop_id = (
                    check_out_of_range(e_prop, len(e_properties))
                    if isinstance(e_prop, int)
                    else self._schema.edge_property_index(e_label_id, e_prop)
                )
            except ValueError as e:
                raise ValueError(
                    "edge label {} not contains the property {}".format(e_label, e_prop)
                ) from e

        oid_type = self._schema.oid_type
        vid_type = self._schema.vid_type
        # The data types stay None unless a property was selected and the
        # label actually has properties.
        vdata_type = None
        if v_properties:
            vdata_type = list(v_properties.values())[v_prop_id]
        edata_type = None
        if e_properties:
            edata_type = list(e_properties.values())[e_prop_id]

        op = project_arrow_property_graph(
            self,
            v_label_id,
            v_prop_id,
            e_label_id,
            e_prop_id,
            vdata_type,
            edata_type,
            oid_type,
            vid_type,
        )
        graph_def = op.eval()
        return Graph(self.session_id, graph_def)

    def add_column(self, results, selector):
        """Add the results as a column to the graph. Modification rules are given by the selector.

        Args:
            results (:class:`Context`): A `Context` that created by doing a query.
            selector (dict): Select results to add as column. Format is similar to selectors in `Context`

        Returns:
            :class:`Graph`: A new `Graph` with new columns.
        """
        check_argument(
            isinstance(selector, Mapping), "selector of add column must be a dict"
        )
        self.check_unmodified()
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        selector = {
            key: results._transform_selector(value) for key, value in selector.items()
        }
        selector = json.dumps(selector)
        op = add_column(self, results, selector)
        graph_def = op.eval()
        return Graph(self.session_id, graph_def)

    def to_numpy(self, selector, vertex_range=None):
        """Select some elements of the graph and output to numpy.

        Args:
            selector (str): Select a portion of graph as a numpy.ndarray.
            vertex_range(dict, optional): Slice vertices. Defaults to None.
        Returns:
            `numpy.ndarray`
        """
        self.check_unmodified()
        selector = transform_labeled_vertex_property_data_selector(self, selector)
        vertex_range = transform_vertex_range(vertex_range)
        op = graph_to_numpy(self, selector, vertex_range)
        ret = op.eval()
        return decode_numpy(ret)

    def to_dataframe(self, selector, vertex_range=None):
        """Select some elements of the graph and output as a pandas.DataFrame

        Args:
            selector (dict): Select some portions of graph.
            vertex_range (dict, optional): Slice vertices. Defaults to None.

        Returns:
            `pandas.DataFrame`
        """
        self.check_unmodified()
        check_argument(
            isinstance(selector, Mapping),
            "selector of to_vineyard_dataframe must be a dict",
        )
        selector = {
            key: transform_labeled_vertex_property_data_selector(self, value)
            for key, value in selector.items()
        }
        selector = json.dumps(selector)
        vertex_range = transform_vertex_range(vertex_range)

        op = graph_to_dataframe(self, selector, vertex_range)
        ret = op.eval()
        return decode_dataframe(ret)

    def is_directed(self):
        """Return the directed flag (False until a graph definition sets it)."""
        return self._directed

    def check_unmodified(self):
        """Check that the current signature equals the one saved at construction."""
        check_argument(
            self.signature == self._saved_signature, "Graph has been modified!"
        )

    def _from_nx_graph(self, incoming_graph):
        """Create a gs graph from a nx graph.
        Args:
            incoming_graph (:class:`nx.graph`): A nx graph that contains graph data.

        Returns:
            The graph_def that will be used to construct a gs.Graph

        Raises:
            TypeError: Raised if `incoming_graph` has a `_graph` attribute,
                which is treated as a graph view.

        Examples:
            >>> nx_g = nx.path_graph(10)
            >>> gs_g = gs.Graph(nx_g)
        """
        if hasattr(incoming_graph, "_graph"):
            msg = "graph view can not convert to gs graph"
            raise TypeError(msg)
        op = dynamic_to_arrow(incoming_graph)
        graph_def = op.eval()
        return graph_def

    def _copy_from(self, incoming_graph):
        """Copy a graph.

        Args:
            incoming_graph (:class:`Graph`): Source graph to be copied from.
                It must be a loaded `ARROW_PROPERTY` graph.

        Returns:
            The graph_def of the copy, as returned by evaluating the copy op.
        """
        check_argument(incoming_graph.graph_type == types_pb2.ARROW_PROPERTY)
        check_argument(incoming_graph.loaded())
        op = copy_graph(incoming_graph)
        graph_def = op.eval()
        return graph_def

    def _from_vineyard(self, vineyard_object):
        """Load a graph from an already existing vineyard graph.

        Args:
            vineyard_object (:class:`VineyardObject`): vineyard object, which contains a graph.

        Returns:
            A graph_def, or None when the object carries neither an id nor a
            name (the constructor then leaves the graph unloaded).
        """
        if vineyard_object.object_id is not None:
            return self._from_vineyard_id(vineyard_object.object_id)
        if vineyard_object.object_name is not None:
            return self._from_vineyard_name(vineyard_object.object_name)
        return None

    def _from_vineyard_id(self, vineyard_id):
        """Create the graph from a vineyard object id and return its graph_def."""
        config = {}
        config[types_pb2.IS_FROM_VINEYARD_ID] = b_to_attr(True)
        config[types_pb2.VINEYARD_ID] = i_to_attr(vineyard_id)
        # FIXME(hetao) hardcode oid/vid type for codegen, when loading from vineyard
        #
        # the metadata should be retrieved from vineyard
        config[types_pb2.OID_TYPE] = s_to_attr("int64_t")
        config[types_pb2.VID_TYPE] = s_to_attr("uint64_t")
        op = create_graph(self._session_id, types_pb2.ARROW_PROPERTY, attrs=config)
        graph_def = op.eval()
        return graph_def

    def _from_vineyard_name(self, vineyard_name):
        """Create the graph from a vineyard object name and return its graph_def."""
        config = {}
        config[types_pb2.IS_FROM_VINEYARD_ID] = b_to_attr(True)
        config[types_pb2.VINEYARD_NAME] = s_to_attr(vineyard_name)
        # FIXME(hetao) hardcode oid/vid type for codegen, when loading from vineyard
        #
        # the metadata should be retrieved from vineyard
        config[types_pb2.OID_TYPE] = s_to_attr("int64_t")
        config[types_pb2.VID_TYPE] = s_to_attr("uint64_t")
        op = create_graph(self._session_id, types_pb2.ARROW_PROPERTY, attrs=config)
        graph_def = op.eval()
        return graph_def

    def attach_interactive_instance(self, instance):
        """Store the instance when a new interactive instance is started.

        Args:
            instance: interactive instance
        """
        self._interactive_instance_list.append(instance)

    def attach_learning_instance(self, instance):
        """Store the instance when a new learning instance is created.

        Args:
            instance: learning instance
        """
        self._learning_instance_list.append(instance)
Beispiel #8
0
class DiGraph(Graph):
    """
    Base class for directed graphs.

    A DiGraph that holds the metadata of a graph, and provides NetworkX-like DiGraph APIs.

    It is worth noticing that the graph is actually stored by the Analytical Engine backend.
    In other words, the Graph object holds nothing but metadata of a graph.

    DiGraphs support nodes and edges with optional data, or attributes.

    DiGraphs support directed edges.  Self loops are allowed but multiple
    (parallel) edges are not.

    Nodes can be arbitrary int/str/float/bool objects with optional
    key/value attributes.

    Edges are represented as links between nodes with optional
    key/value attributes.

    DiGraphs support node labels if created from a GraphScope graph object;
    nodes are then identified by a `(label, id)` tuple.

    Parameters
    ----------
    incoming_graph_data : input graph (optional, default: None)
        Data to initialize graph. If None (default) an empty
        graph is created.  The data can be any format that is supported
        by the to_networkx_graph() function, currently including edge list,
        dict of dicts, dict of lists, NetworkX graph, NumPy matrix
        or 2d ndarray, Pandas DataFrame, SciPy sparse matrix, or a GraphScope
        graph object.

    default_label : default node label (optional, default: None)
        if incoming_graph_data is a GraphScope graph object, default label means
        the nodes of the label can be identified by id directly, other label nodes
        need to use `(label, id)` to identify.

    attr : keyword arguments, optional (default= no attributes)
        Attributes to add to graph as key=value pairs.

    See Also
    --------
    Graph

    Examples
    --------
    Create an empty graph structure (a "null graph") with no nodes and
    no edges.

    >>> G = nx.DiGraph()

    G can be grown in several ways.

    **Nodes:**

    Add one node at a time:

    >>> G.add_node(1)

    Add the nodes from any container (a list, dict, set or
    even the lines from a file or the nodes from another graph).

    >>> G.add_nodes_from([2, 3])
    >>> G.add_nodes_from(range(100, 110))
    >>> H = nx.path_graph(10)
    >>> G.add_nodes_from(H)

    In addition to integers, strings can represent a node.

    >>> G.add_node('a node')

    **Edges:**

    G can also be grown by adding edges.

    Add one edge,

    >>> G.add_edge(1, 2)

    a list of edges,

    >>> G.add_edges_from([(1, 2), (1, 3)])

    or a collection of edges,

    >>> G.add_edges_from(H.edges)

    If some edges connect nodes not yet in the graph, the nodes
    are added automatically.  There are no errors when adding
    nodes or edges that already exist.

    **Attributes:**

    Each graph, node, and edge can hold key/value attribute pairs
    in an associated attribute dictionary (the keys must be hashable).
    By default these are empty, but can be added or changed using
    add_edge, add_node or direct manipulation of the attribute
    dictionaries named graph, node and edge respectively.

    >>> G = nx.DiGraph(day="Friday")
    >>> G.graph
    {'day': 'Friday'}

    Add node attributes using add_node(), add_nodes_from() or G.nodes

    >>> G.add_node(1, time='5pm')
    >>> G.add_nodes_from([3], time='2pm')
    >>> G.nodes[1]
    {'time': '5pm'}
    >>> G.nodes[1]['room'] = 714
    >>> del G.nodes[1]['room'] # remove attribute
    >>> list(G.nodes(data=True))
    [(1, {'time': '5pm'}), (3, {'time': '2pm'})]

    Add edge attributes using add_edge(), add_edges_from(), subscript
    notation, or G.edges.

    >>> G.add_edge(1, 2, weight=4.7 )
    >>> G.add_edges_from([(3, 4), (4, 5)], color='red')
    >>> G.add_edges_from([(1, 2, {'color':'blue'}), (2, 3, {'weight':8})])
    >>> G[1][2]['weight'] = 4.7
    >>> G.edges[1, 2]['weight'] = 4

    Warning: we protect the graph data structure by making `G.edges[1, 2]` a
    read-only dict-like structure. However, you can assign to attributes
    in e.g. `G.edges[1, 2]`. Thus, use 2 sets of brackets to add/change
    data attributes: `G.edges[1, 2]['weight'] = 4`
    (For multigraphs: `MG.edges[u, v, key][name] = value`).

    **Shortcuts:**

    Many common graph features allow python syntax to speed reporting.

    >>> 1 in G     # check if node in graph
    True
    >>> [n for n in G if n < 3]  # iterate through nodes
    [1, 2]
    >>> len(G)  # number of nodes in graph
    5

    Often the best way to traverse all edges of a graph is via the neighbors.
    The neighbors are reported as an adjacency-dict `G.adj` or `G.adjacency()`

    >>> for n, nbrsdict in G.adjacency():
    ...     for nbr, eattr in nbrsdict.items():
    ...        if 'weight' in eattr:
    ...            # Do something useful with the edges
    ...            pass

    But the edges reporting object is often more convenient:

    >>> for u, v, weight in G.edges(data='weight'):
    ...     if weight is not None:
    ...         # Do something useful with the edges
    ...         pass

    **Transformation**

    Create a graph with GraphScope graph object. First we init a GraphScope graph
    with two node labels: `person` and `comment`

    >>> g = graphscope.g(directed=True).add_vertices("person.csv", label="person").add_vertices("comment.csv", label="comment")

    create a graph with g, set default_label to 'person'

    >>> G = nx.DiGraph(g, default_label="person")

    `person` label nodes can be identified by id directly; for the `comment`
    label, we have to use the tuple `("comment", id)` to identify a node.
    For example, add a person label node and a comment label node

    >>> G.add_node(0, type="person")
    >>> G.add_node(("comment", 0), type="comment")

    print property of two nodes

    >>> G.nodes[0]
    {'type': 'person'}
    >>> G.nodes[("comment", 0)]
    {'type': 'comment'}

    **Reporting:**

    Simple graph information is obtained using object-attributes and methods.
    Reporting usually provides views instead of containers to reduce memory
    usage. The views update as the graph is updated similarly to dict-views.
    The objects `nodes`, `edges` and `adj` provide access to data attributes
    via lookup (e.g. `nodes[n]`, `edges[u, v]`, `adj[u][v]`) and iteration
    (e.g. `nodes.items()`, `nodes.data('color')`,
    `nodes.data('color', default='blue')` and similarly for `edges`)
    Views exist for `nodes`, `edges`, `neighbors()`/`adj` and `degree`.

    For details on these and other miscellaneous methods, see below.
    """
    @patch_docstring(Graph.__init__)
    def __init__(self, incoming_graph_data=None, default_label=None, **attr):

        # Bind the (class-level) factories on the instance.
        self.graph_attr_dict_factory = self.graph_attr_dict_factory
        self.node_dict_factory = self.node_dict_factory
        self.adjlist_dict_factory = self.adjlist_dict_factory
        self.graph = self.graph_attr_dict_factory()
        self.cache = self.graph_cache_factory(self)

        # init node and adj (must be after cache)
        self._node = self.node_dict_factory(self)
        self._adj = self.adjlist_dict_factory(self)
        self._pred = self.adjlist_dict_factory(self, pred=True)
        # successors share the adjacency structure
        self._succ = self._adj

        self._key = None
        self._op = None
        self._session_id = None
        self._graph_type = self._graph_type
        self._schema = GraphSchema()
        self._schema.init_nx_schema()

        # cache for add_node and add_edge
        self._add_node_cache = []
        self._add_edge_cache = []
        self._remove_node_cache = []
        self._remove_edge_cache = []

        create_empty_in_engine = attr.pop("create_empty_in_engine",
                                          True)  # a hidden parameter
        self._distributed = attr.pop("dist", False)
        if incoming_graph_data is not None and self._is_gs_graph(
                incoming_graph_data):
            # convert from gs graph always use distributed mode
            self._distributed = True
            if self._session is None:
                self._session = get_session_by_id(
                    incoming_graph_data.session_id)
        self._default_label = default_label

        if self._session is None:
            self._try_to_get_default_session()

        # A GraphScope graph is loaded below instead of creating an empty one.
        if not self._is_gs_graph(
                incoming_graph_data) and create_empty_in_engine:
            graph_def = empty_graph_in_engine(self, self.is_directed(),
                                              self._distributed)
            self._key = graph_def.key

        # attempt to load graph with data
        if incoming_graph_data is not None:
            if self._is_gs_graph(incoming_graph_data):
                self._init_with_arrow_property_graph(incoming_graph_data)
                self.cache.warmup()
            else:
                g = to_networkx_graph(incoming_graph_data, create_using=self)
                check_argument(isinstance(g, Graph))

        # load graph attributes (must be after to_networkx_graph)
        self.graph.update(attr)
        self._saved_signature = self.signature

    @property
    @clear_mutation_cache
    @patch_docstring(RefDiGraph.adj)
    def adj(self):
        return AdjacencyView(self._succ)

    succ = adj

    @property
    @clear_mutation_cache
    @patch_docstring(RefDiGraph.pred)
    def pred(self):
        return AdjacencyView(self._pred)

    @clear_mutation_cache
    @patch_docstring(RefDiGraph.has_successor)
    def has_successor(self, u, v):
        return self.has_edge(u, v)

    @clear_mutation_cache
    @patch_docstring(RefDiGraph.has_predecessor)
    def has_predecessor(self, u, v):
        return self.has_edge(v, u)

    @clear_mutation_cache
    @patch_docstring(RefDiGraph.successors)
    def successors(self, n):
        try:
            return iter(self._succ[n])
        except KeyError as err:
            # Keep the original lookup failure attached as the cause.
            raise NetworkXError(
                "The node %s is not in the digraph." % (n, )) from err

    # digraph definitions
    neighbors = successors

    @clear_mutation_cache
    @patch_docstring(RefDiGraph.predecessors)
    def predecessors(self, n):
        try:
            return iter(self._pred[n])
        except KeyError as err:
            # Keep the original lookup failure attached as the cause.
            raise NetworkXError(
                "The node %s is not in the digraph." % (n, )) from err

    @property
    @clear_mutation_cache
    def edges(self):
        """An OutEdgeView of the DiGraph as G.edges or G.edges().

        edges(self, nbunch=None, data=False, default=None)

        The OutEdgeView provides set-like operations on the edge-tuples
        as well as edge attribute lookup. When called, it also provides
        an EdgeDataView object which allows control of access to edge
        attributes (but does not provide set-like operations).
        Hence, `G.edges[u, v]['color']` provides the value of the color
        attribute for edge `(u, v)` while
        `for (u, v, c) in G.edges.data('color', default='red'):`
        iterates through all the edges yielding the color attribute
        with default `'red'` if no color attribute exists.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            The view will only report edges incident to these nodes.
        data : string or bool, optional (default=False)
            The edge attribute returned in 3-tuple (u, v, ddict[data]).
            If True, return edge attribute dict in 3-tuple (u, v, ddict).
            If False, return 2-tuple (u, v).
        default : value, optional (default=None)
            Value used for edges that don't have the requested attribute.
            Only relevant if data is not True or False.

        Returns
        -------
        edges : OutEdgeView
            A view of edge attributes, usually it iterates over (u, v)
            or (u, v, d) tuples of edges, but can also be used for
            attribute lookup as `edges[u, v]['foo']`.

        See Also
        --------
        in_edges, out_edges

        Notes
        -----
        Nodes in nbunch that are not in the graph will be (quietly) ignored.
        For directed graphs this returns the out-edges.

        Examples
        --------
        >>> G = nx.DiGraph()
        >>> nx.add_path(G, [0, 1, 2])
        >>> G.add_edge(2, 3, weight=5)
        >>> [e for e in G.edges]
        [(0, 1), (1, 2), (2, 3)]
        >>> G.edges.data()  # default data is {} (empty dict)
        OutEdgeDataView([(0, 1, {}), (1, 2, {}), (2, 3, {'weight': 5})])
        >>> G.edges.data("weight", default=1)
        OutEdgeDataView([(0, 1, 1), (1, 2, 1), (2, 3, 5)])
        >>> G.edges([0, 2])  # only edges incident to these nodes
        OutEdgeDataView([(0, 1), (2, 3)])
        >>> G.edges(0)  # only edges incident to a single node (use G.adj[0]?)
        OutEdgeDataView([(0, 1)])

        """
        return OutEdgeView(self)

    # alias out_edges to edges
    out_edges = edges

    @property
    @clear_mutation_cache
    @patch_docstring(RefDiGraph.in_edges)
    def in_edges(self):
        return InEdgeView(self)

    @property
    @clear_mutation_cache
    def degree(self):
        """A DegreeView for the Graph as G.degree or G.degree().

        The node degree is the number of edges adjacent to the node.
        The weighted node degree is the sum of the edge weights for
        edges incident to that node.

        This object provides an iterator for (node, degree) as well as
        lookup for the degree for a single node.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            The view will only report edges incident to these nodes.

        weight : string or None, optional (default=None)
           The name of an edge attribute that holds the numerical value used
           as a weight.  If None, then each edge has weight 1.
           The degree is the sum of the edge weights adjacent to the node.

        Returns
        -------
        If a single node is requested
        deg : int
            Degree of the node

        OR if multiple nodes are requested
        nd_iter : iterator
            The iterator returns two-tuples of (node, degree).

        See Also
        --------
        in_degree, out_degree

        Examples
        --------
        >>> G = nx.DiGraph()
        >>> nx.add_path(G, [0, 1, 2, 3])
        >>> G.degree(0) # node 0 with degree 1
        1
        >>> list(G.degree([0, 1, 2]))
        [(0, 1), (1, 2), (2, 2)]

        """
        return DiDegreeView(self)

    @property
    @clear_mutation_cache
    @patch_docstring(RefDiGraph.in_degree)
    def in_degree(self):
        return InDegreeView(self)

    @property
    @clear_mutation_cache
    @patch_docstring(RefDiGraph.out_degree)
    def out_degree(self):
        return OutDegreeView(self)

    @patch_docstring(RefDiGraph.is_directed)
    def is_directed(self):
        return True

    @patch_docstring(RefDiGraph.is_multigraph)
    def is_multigraph(self):
        return False

    @clear_mutation_cache
    @patch_docstring(RefDiGraph.reverse)
    def reverse(self, copy=True):
        self._convert_arrow_to_dynamic()

        if not copy:
            # Client-side view: reuses this graph's op and key.
            g = reverse_view(self)
            g._op = self._op
            g._key = self._key
            g._schema = deepcopy(self._schema)
            g._is_client_view = True
        else:
            # New graph object whose key comes from evaluating a "reverse" copy op.
            g = self.__class__(create_empty_in_engine=False)
            g.graph = self.graph
            g.name = self.name
            op = copy_graph(self, "reverse")
            g._op = op
            graph_def = op.eval()
            g._key = graph_def.key
            g._schema = deepcopy(self._schema)
            g.cache.warmup()
        g._session = self._session
        return g
Beispiel #9
0
class Graph(object):
    """A class for representing metadata of a graph in the GraphScope.

    A :class:`Graph` object holds the metadata of a graph, such as key, schema, and the graph is directed or not.

    It is worth noting that the graph is stored by the backend such as Analytical Engine, Vineyard.
    In other words, the graph object holds nothing but metadata.

    The graph object should not be created directly from :class:`Graph`.
    Instead, the graph should be created by `Session.load_from`

    The following example demonstrates its usage:

    .. code:: python

        >>> import graphscope as gs
        >>> from graphscope.framework.loader import Loader
        >>> sess = gs.session()
        >>> g = sess.load_from(
        ...     edges={
        ...         "knows": (
        ...             Loader("{}/p2p-31_property_e_0".format(property_dir), header_row=True),
        ...             ["src_label_id", "dst_label_id", "dist"],
        ...             ("src_id", "person"),
        ...             ("dst_id", "person"),
        ...         ),
        ...     },
        ...     vertices={
        ...         "person": Loader(
        ...             "{}/p2p-31_property_v_0".format(property_dir), header_row=True
        ...         ),
        ...     }
        ... )
    """
    def __init__(self, session_id, incoming_data=None):
        """Construct a :class:`Graph` object.

        Args:
            session_id (str): Session id of the session the graph is created in.
            incoming_data: Graph can be initialized through various type of sources,
                which can be one of:
                    - :class:`GraphDef`
                    - :class:`nx.Graph`
                    - :class:`Graph`
                    - :class:`vineyard.Object`, :class:`vineyard.ObjectId` or :class:`vineyard.ObjectName`

        Raises:
            ValueError: If `incoming_data` is none of the supported types
                (this includes the default `None`).
        """

        # Don't import the :code:`NXGraph` in top-level statements to improve the
        # performance of :code:`import graphscope`.
        from graphscope.experimental.nx.classes.graph import Graph as NXGraph

        self._key = None
        self._op = None
        self._graph_type = None
        # Public attribute kept for backward compatibility; it is never updated
        # from the graph definition. `is_directed()` reads `_directed` instead.
        self.directed = False
        # Defaults for the fields that are otherwise only assigned once a graph
        # definition has been loaded, so accessors never hit AttributeError.
        self._directed = False
        self._generate_eid = None
        self._schema_path = None
        self._saved_signature = None
        self._vineyard_id = 0
        self._schema = GraphSchema()

        self._session_id = session_id
        self._detached = False

        self._interactive_instance_launching_thread = None
        self._interactive_instance_list = []
        self._learning_instance_list = []

        # Obtain a graph definition according to the kind of incoming data.
        if isinstance(incoming_data, GraphDef):
            graph_def = incoming_data
        elif isinstance(incoming_data, NXGraph):
            graph_def = self._from_nx_graph(incoming_data)
        elif isinstance(incoming_data, Graph):
            graph_def = self._copy_from(incoming_data)
        elif isinstance(
                incoming_data,
            (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName)):
            graph_def = self._from_vineyard(incoming_data)
        else:
            # Exceptions do not interpolate logging-style arguments, so the
            # message is formatted explicitly.
            raise ValueError(
                f"Failed to create a graph on graphscope engine: {incoming_data}")

        if graph_def:
            self._key = graph_def.key
            self._vineyard_id = graph_def.vineyard_id
            self._graph_type = graph_def.graph_type
            self._directed = graph_def.directed
            self._generate_eid = graph_def.generate_eid
            self._schema.get_schema_from_def(graph_def.schema_def)
            self._schema_path = graph_def.schema_path
            # init saved_signature (must be after init schema)
            self._saved_signature = self.signature

            # create gremlin server pod asynchronously
            if gs_config.initializing_interactive_engine:
                self._interactive_instance_launching_thread = threading.Thread(
                    target=self._launch_interactive_instance_impl, args=())
                self._interactive_instance_launching_thread.start()

    def __del__(self):
        # cleanly ignore all exceptions, cause session may already closed / destroyed.
        try:
            self.unload()
        except Exception:  # pylint: disable=broad-except
            pass

    def _close_interactive_instances(self):
        # Close related interactive instances when graph unloaded.
        # Since the graph is gone, quering via interactive client is meaningless.
        for instance in self._interactive_instance_list:
            instance.close()
        self._interactive_instance_list.clear()

    def _close_learning_instances(self):
        for instance in self._learning_instance_list:
            instance.close()
        self._learning_instance_list.clear()

    def _launch_interactive_instance_impl(self):
        try:
            sess = get_session_by_id(self.session_id)
            sess.gremlin(self)
        except:  # noqa: E722
            # Record error msg in `InteractiveQuery` when launching failed.
            # Unexpect and suppress all exceptions here.
            pass

    @property
    def op(self):
        """The DAG op of this graph.

        `__init__` sets the backing attribute to None.
        """
        return self._op

    @property
    def key(self):
        """The key of the corresponding graph in engine.

        None when no graph definition has been loaded, and reset to None by
        `unload()`.
        """
        return self._key

    @property
    def graph_type(self):
        """The type of the graph object.

        Returns:
            type (`types_pb2.GraphType`): the type of the graph, as taken from
            the graph definition; None if no definition was loaded.
        """
        return self._graph_type

    @property
    def schema(self):
        """Schema of the graph.

        Returns:
            :class:`GraphSchema`: the schema object created in `__init__` and
            filled from the graph definition's `schema_def` when one is loaded.
        """
        return self._schema

    @property
    def schema_path(self):
        """Path that Coordinator will write the interactive schema to.

        The value is copied from the graph definition's `schema_path` in
        `__init__`.

        Returns:
            str: The path containing the schema for the interactive engine.
        """
        return self._schema_path

    @property
    def signature(self):
        if self._key is None:
            raise RuntimeError("graph should be registered in remote.")
        return hashlib.sha256("{}.{}".format(
            self._schema.signature(), self._key).encode("utf-8")).hexdigest()

    @property
    def template_str(self):
        if self._key is None:
            raise RuntimeError("graph should be registered in remote.")
        graph_type = self._graph_type
        # transform str/string to std::string
        oid_type = utils.normalize_data_type_str(self._schema.oid_type)
        vid_type = self._schema.vid_type
        vdata_type = utils.data_type_to_cpp(self._schema.vdata_type)
        edata_type = utils.data_type_to_cpp(self._schema.edata_type)
        if graph_type == types_pb2.ARROW_PROPERTY:
            template = f"vineyard::ArrowFragment<{oid_type},{vid_type}>"
        elif graph_type == types_pb2.ARROW_PROJECTED:
            template = f"gs::ArrowProjectedFragment<{oid_type},{vid_type},{vdata_type},{edata_type}>"
        elif graph_type == types_pb2.DYNAMIC_PROJECTED:
            template = f"gs::DynamicProjectedFragment<{vdata_type},{edata_type}>"
        else:
            raise ValueError(f"Unsupported graph type: {graph_type}")
        return template

    @property
    def vineyard_id(self):
        """Get the vineyard object_id of this graph.

        Returns:
            The vineyard id copied from the graph definition; `__init__`
            initialises it to 0 before a definition is loaded.
        """
        return self._vineyard_id

    @property
    def session_id(self):
        """Get the current session_id.

        Returns:
            str: The id of the session that the graph belongs to, as passed to
            the constructor.
        """
        return self._session_id

    def detach(self):
        """Detaching a graph leaves it in vineyard even when the variable for
        this :class:`Graph` object leaves the lexical scope.

        The graph can be accessed using the graph's :code:`ObjectID` or its name later.

        Only a flag is set here; `unload()` checks it and skips the engine-side
        unload op for a detached graph.
        """
        self._detached = True

    def loaded(self):
        """Return True if the graph currently has a key, False otherwise.

        The key is assigned from the graph definition in `__init__` and is
        reset to None by `unload()`.
        """
        return self._key is not None

    def __str__(self):
        return f"graphscope.Graph <{self.template_str}> {self._vineyard_id}"

    def __repr__(self):
        return ("graphscope.Graph\n"
                f"type: {self.template_str.split('<')[0]}\n"
                f"vineyard_id: {self._vineyard_id}\n\n"
                f"{str(self._schema)}")

    def unload(self):
        """Unload this graph from graphscope engine."""
        if not self.loaded():
            raise RuntimeError("The graph is not registered in remote.")
        # close interactive instances first
        try:
            if (self._interactive_instance_launching_thread is not None and
                    self._interactive_instance_launching_thread.is_alive()):
                # join raises a RuntimeError if an attempt is made to join the current thread.
                # this exception occurs when a object collected by gc mechanism contains a running thread.
                if (threading.current_thread() !=
                        self._interactive_instance_launching_thread):
                    self._interactive_instance_launching_thread.join()
            self._close_interactive_instances()
        except Exception as e:
            logger.error("Failed to close interactive instances: %s" % e)
        try:
            self._close_learning_instances()
        except Exception as e:
            logger.error("Failed to close learning instances: %s" % e)
        if not self._detached:
            op = dag_utils.unload_graph(self)
            op.eval()
        self._key = None

    def project_to_simple(self,
                          v_label="_",
                          e_label="_",
                          v_prop=None,
                          e_prop=None):
        """Project a property graph to a simple graph, useful for analytical engine.
        Will translate name represented label or property to index, which is broadedly used
        in internal engine.

        Args:
            v_label (str, optional): vertex label to project. Defaults to "_".
            e_label (str, optional): edge label to project. Defaults to "_".
            v_prop (str, optional): vertex property of the v_label. Defaults to None.
            e_prop (str, optional): edge property of the e_label. Defaults to None.

        Returns:
            :class:`Graph`: A `Graph` instance, which graph_type is `ARROW_PROJECTED`
        """
        if not self.loaded():
            raise RuntimeError(
                "The graph is not registered in remote, and can't project to simple"
            )
        self.check_unmodified()
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        check_argument(isinstance(v_label, (int, str)))
        check_argument(isinstance(e_label, (int, str)))

        def check_out_of_range(id, length):
            if id < length and id > -1:
                return id
            else:
                raise KeyError("id {} is out of range.".format(id))

        try:
            v_label_id = (check_out_of_range(
                v_label, self._schema.vertex_label_num) if isinstance(
                    v_label, int) else
                          self._schema.vertex_label_index(v_label))
        except ValueError as e:
            raise ValueError("graph not contains the vertex label {}.".format(
                v_label)) from e

        try:
            e_label_id = (check_out_of_range(
                e_label, self._schema.edge_label_num) if isinstance(
                    e_label, int) else self._schema.edge_label_index(e_label))
        except ValueError as e:
            raise InvalidArgumentError(
                "graph not contains the edge label {}.".format(e_label)) from e

        if v_prop is None:
            # NB: -1 means vertex property is None
            v_prop_id = -1
            v_properties = None
        else:
            check_argument(isinstance(v_prop, (int, str)))
            v_properties = self._schema.vertex_properties[v_label_id]
            try:
                v_prop_id = (check_out_of_range(v_prop, len(v_properties))
                             if isinstance(v_prop, int) else
                             self._schema.vertex_property_index(
                                 v_label_id, v_prop))
            except ValueError as e:
                raise ValueError(
                    "vertex label {} not contains the property {}".format(
                        v_label, v_prop)) from e

        if e_prop is None:
            # NB: -1 means edge property is None
            e_prop_id = -1
            e_properties = None
        else:
            check_argument(isinstance(e_prop, (int, str)))
            e_properties = self._schema.edge_properties[e_label_id]
            try:
                e_prop_id = (check_out_of_range(e_prop, len(e_properties))
                             if isinstance(e_prop, int) else
                             self._schema.edge_property_index(
                                 e_label_id, e_prop))
            except ValueError as e:
                raise ValueError(
                    "edge label {} not contains the property {}".format(
                        e_label, e_prop)) from e

        oid_type = self._schema.oid_type
        vid_type = self._schema.vid_type
        vdata_type = None
        if v_properties:
            vdata_type = list(v_properties.values())[v_prop_id]
        edata_type = None
        if e_properties:
            edata_type = list(e_properties.values())[e_prop_id]

        op = dag_utils.project_arrow_property_graph(
            self,
            v_label_id,
            v_prop_id,
            e_label_id,
            e_prop_id,
            vdata_type,
            edata_type,
            oid_type,
            vid_type,
        )
        graph_def = op.eval()
        return Graph(self.session_id, graph_def)

    def add_column(self, results, selector):
        """Add the results as a column to the graph. Modification rules are given by the selector.

        Args:
            results (:class:`Context`): A `Context` that created by doing a query.
            selector (dict): Select results to add as column. Format is similar to selectors in `Context`

        Returns:
            :class:`Graph`: A new `Graph` with new columns.
        """
        check_argument(isinstance(selector, Mapping),
                       "selector of add column must be a dict")
        self.check_unmodified()
        check_argument(self.graph_type == types_pb2.ARROW_PROPERTY)
        # Let the context rewrite every selector value, then ship them as JSON.
        transformed = {}
        for column, expression in selector.items():
            transformed[column] = results._transform_selector(expression)
        encoded = json.dumps(transformed)
        column_op = dag_utils.add_column(self, results, encoded)
        new_def = column_op.eval()
        return Graph(self.session_id, new_def)

    def to_numpy(self, selector, vertex_range=None):
        """Select some elements of the graph and output to numpy.

        Args:
            selector (str): Select a portion of graph as a numpy.ndarray.
            vertex_range(dict, optional): Slice vertices. Defaults to None.
        Returns:
            `numpy.ndarray`
        """
        self.check_unmodified()
        resolved_selector = utils.transform_labeled_vertex_property_data_selector(
            self, selector)
        resolved_range = utils.transform_vertex_range(vertex_range)
        numpy_op = dag_utils.graph_to_numpy(
            self, resolved_selector, resolved_range)
        raw = numpy_op.eval()
        return utils.decode_numpy(raw)

    def to_dataframe(self, selector, vertex_range=None):
        """Select some elements of the graph and output as a pandas.DataFrame

        Args:
            selector (dict): Select some portions of graph. Every value is
                passed through the labeled-vertex-property selector transform.
            vertex_range (dict, optional): Slice vertices. Defaults to None.

        Returns:
            `pandas.DataFrame`
        """
        self.check_unmodified()
        # The message names this method (it previously referred to
        # `to_vineyard_dataframe`, a different API).
        check_argument(
            isinstance(selector, Mapping),
            "selector of to_dataframe must be a dict",
        )
        selector = {
            key:
            utils.transform_labeled_vertex_property_data_selector(self, value)
            for key, value in selector.items()
        }
        # The engine receives the selector as a JSON string.
        selector = json.dumps(selector)
        vertex_range = utils.transform_vertex_range(vertex_range)

        op = dag_utils.graph_to_dataframe(self, selector, vertex_range)
        ret = op.eval()
        return utils.decode_dataframe(ret)

    def is_directed(self):
        """Return the `directed` flag copied from the graph definition in `__init__`."""
        return self._directed

    def check_unmodified(self):
        """Verify that the current signature equals the saved signature.

        The comparison result is handed to `check_argument` together with
        the message "Graph has been modified!".
        """
        signature_unchanged = self.signature == self._saved_signature
        check_argument(signature_unchanged, "Graph has been modified!")

    def _from_nx_graph(self, incoming_graph):
        """Convert a nx graph into a graph_def through the `dynamic_to_arrow` op.

        Args:
            incoming_graph (:class:`nx.graph`): A nx graph that contains graph data.

        Returns:
            The evaluated graph_def that will be used to construct a gs.Graph.

        Raises:
            TypeError: If `incoming_graph` has a `_graph` attribute, in which
                case it is treated as a graph view and rejected.

        Examples:
            >>> nx_g = nx.path_graph(10)
            >>> gs_g = gs.Graph(nx_g)
        """
        is_view = hasattr(incoming_graph, "_graph")
        if is_view:
            raise TypeError("graph view can not convert to gs graph")
        return dag_utils.dynamic_to_arrow(incoming_graph).eval()

    def _copy_from(self, incoming_graph):
        """Copy a graph by evaluating the `copy_graph` op on it.

        The source must be an ARROW_PROPERTY graph and must be loaded; both
        conditions are enforced with `check_argument`.

        Args:
            incoming_graph (:class:`Graph`): Source graph to be copied from

        Returns:
            The graph_def produced by evaluating the copy op.
        """
        is_property_graph = incoming_graph.graph_type == types_pb2.ARROW_PROPERTY
        check_argument(is_property_graph)
        check_argument(incoming_graph.loaded())
        return dag_utils.copy_graph(incoming_graph).eval()

    def _from_vineyard(self, vineyard_object):
        """Load a graph from an already existing vineyard graph.

        Dispatches on the type of `vineyard_object`: objects and object ids
        are loaded by id, object names are loaded by name.

        Args:
            vineyard_object (:class:`vineyard.Object`, :class:`vineyard.ObjectID`
                            or :class:`vineyard.ObjectName`): vineyard object,
                            which represents a graph.

        Returns:
            A graph_def.

        Raises:
            TypeError: If `vineyard_object` is none of the supported types.
        """
        if isinstance(vineyard_object, vineyard.Object):
            return self._from_vineyard_id(vineyard_object.id)
        if isinstance(vineyard_object, vineyard.ObjectID):
            return self._from_vineyard_id(vineyard_object)
        if isinstance(vineyard_object, vineyard.ObjectName):
            return self._from_vineyard_name(vineyard_object)
        # Falling through used to return None implicitly, which only
        # surfaced later as an unrelated failure in the caller.
        raise TypeError(
            "Unsupported vineyard object type: "
            f"{type(vineyard_object).__name__}"
        )

    def _from_vineyard_id(self, vineyard_id):
        """Build and evaluate a `create_graph` op that loads by vineyard id.

        Args:
            vineyard_id: Identifier of the graph; it is converted with `int()`.

        Returns:
            The graph_def produced by evaluating the op.
        """
        # FIXME(hetao): the oid/vid types are hardcoded for codegen when
        # loading from vineyard; the metadata should be retrieved from
        # vineyard instead.
        attrs = {
            types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(True),
            types_pb2.VINEYARD_ID: utils.i_to_attr(int(vineyard_id)),
            types_pb2.OID_TYPE: utils.s_to_attr("int64_t"),
            types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"),
        }
        create_op = dag_utils.create_graph(self._session_id,
                                           types_pb2.ARROW_PROPERTY,
                                           attrs=attrs)
        return create_op.eval()

    def _from_vineyard_name(self, vineyard_name):
        """Build and evaluate a `create_graph` op that loads by vineyard name.

        Args:
            vineyard_name: Name of the graph; it is converted with `str()`.

        Returns:
            The graph_def produced by evaluating the op.
        """
        # FIXME(hetao): the oid/vid types are hardcoded for codegen when
        # loading from vineyard; the metadata should be retrieved from
        # vineyard instead.
        attrs = {
            types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(True),
            types_pb2.VINEYARD_NAME: utils.s_to_attr(str(vineyard_name)),
            types_pb2.OID_TYPE: utils.s_to_attr("int64_t"),
            types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"),
        }
        create_op = dag_utils.create_graph(self._session_id,
                                           types_pb2.ARROW_PROPERTY,
                                           attrs=attrs)
        return create_op.eval()

    def attach_interactive_instance(self, instance):
        """Remember a newly started interactive instance on this graph.

        Args:
            instance: interactive instance to append to the internal list.
        """
        tracked_instances = self._interactive_instance_list
        tracked_instances.append(instance)

    def attach_learning_instance(self, instance):
        """Remember a newly created learning instance on this graph.

        Args:
            instance: learning instance to append to the internal list.
        """
        tracked_instances = self._learning_instance_list
        tracked_instances.append(instance)

    def serialize(self, path, **kwargs):
        """Dump the graph to a storage location through `vineyard.io`.

        Both the meta and the data of the graph are written, and can be
        restored with `Graph.deserialize` in other sessions.

        Each worker will write a `path_{worker_id}.meta` file and
        a `path_{worker_id}` file to storage.

        Args:
            path (str): supported storages are local, hdfs, oss, s3
            **kwargs: Forwarded to vineyard as `storage_options`.
        """
        import vineyard
        import vineyard.io

        session = get_session_by_id(self.session_id)
        on_k8s = session.info["type"] == "k8s"
        engine_config = session.info["engine_config"]
        rpc_endpoint = engine_config["vineyard_rpc_endpoint"]
        ipc_socket = engine_config["vineyard_socket"]

        engine_hosts = session.info["engine_hosts"].split(",")
        if on_k8s:
            # On kubernetes every host is qualified with the namespace.
            namespace = session.info["namespace"]
            hosts = ["{}:{}".format(namespace, host) for host in engine_hosts]
        else:  # type == "hosts"
            hosts = engine_hosts

        vineyard.io.serialize(
            path,
            vineyard.ObjectID(self._vineyard_id),
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment="kubernetes" if on_k8s else "ssh",
            hosts=hosts,
        )

    @classmethod
    def deserialize(cls, path, sess, **kwargs):
        """Rebuild a `Graph` from the files written by `Graph.serialize`.

        All serialization files under `path` are read.  If any of them is
        missing or broken, the operation errors out.

        Args:
            path (str): Path contains the serialization files.
            sess (`graphscope.Session`): The target session
                that the graph will be construct in
            **kwargs: Forwarded to vineyard as `storage_options`.

        Returns:
            `Graph`: A new graph object. Schema and data is supposed to be
                identical with the one that called serialized method.
        """
        import vineyard
        import vineyard.io

        on_k8s = sess.info["type"] == "k8s"
        engine_config = sess.info["engine_config"]
        rpc_endpoint = engine_config["vineyard_rpc_endpoint"]
        ipc_socket = engine_config["vineyard_socket"]

        engine_hosts = sess.info["engine_hosts"].split(",")
        if on_k8s:
            # On kubernetes every host is qualified with the namespace.
            namespace = sess.info["namespace"]
            hosts = ["{}:{}".format(namespace, host) for host in engine_hosts]
        else:  # type == "hosts"
            hosts = engine_hosts

        restored_id = vineyard.io.deserialize(
            path,
            type="global",
            vineyard_ipc_socket=ipc_socket,
            vineyard_endpoint=rpc_endpoint,
            storage_options=kwargs,
            deployment="kubernetes" if on_k8s else "ssh",
            hosts=hosts,
        )
        return cls(sess.session_id, vineyard.ObjectID(restored_id))

    def draw(self, vertices, hop=1):
        """Build a `GraphModel` widget that visualizes part of the graph.

        A gremlin interactive query is opened on this graph through its
        session and handed to the model, which queries the data for the
        selected vertices.

        Args:
            vertices (list): selected vertices.
            hop (int): draw induced subgraph with hop extension. Defaults to 1.

        Returns:
            A GraphModel.
        """
        from ipygraphin import GraphModel

        gremlin_query = get_session_by_id(self.session_id).gremlin(self)

        model = GraphModel()
        model.queryGraphData(vertices, hop, gremlin_query)

        # React to the 1~2 hop expansion requests sent for a node.
        model.on_msg(model.queryNeighbor)
        return model

    def add_vertices(self, vertices):
        """Return a new graph with extra vertex labels added.

        The parameter is normalized first; every new label is then checked
        against the vertex labels already in the schema.  The directedness,
        oid type and eid generation settings are inherited from this graph.

        Args:
            vertices: Vertex description accepted by
                `graph_utils.normalize_parameter_vertices`.

        Returns:
            Graph: A graph built from the evaluated `add_vertices` op.
        """
        vertices = graph_utils.normalize_parameter_vertices(vertices)
        existing_labels = self._schema.vertex_labels
        for vertex in vertices:
            label_is_new = vertex.label not in existing_labels
            check_argument(
                label_is_new,
                f"Duplicate label name with existing vertex labels: {vertex.label}",
            )

        op_attrs = graph_utils.assemble_op_config([], vertices, self._directed,
                                                  self._schema.oid_type,
                                                  self._generate_eid)
        new_graph_def = dag_utils.add_vertices(self, attrs=op_attrs).eval()
        return Graph(self.session_id, new_graph_def)

    def add_edges(self, edges):
        """Return a new graph with extra edge labels added.

        After normalization two checks are made:
          1. `graph_utils.check_edge_validity` validates the edges against
             the existing vertex labels.
          2. No new edge label may duplicate an existing edge label.
        The directedness, oid type and eid generation settings are
        inherited from this graph.

        Args:
            edges: Edge description accepted by
                `graph_utils.normalize_parameter_edges`.

        Returns:
            Graph: A graph built from the evaluated `add_edges` op.
        """
        edges = graph_utils.normalize_parameter_edges(edges)
        known_vertex_labels = self._schema.vertex_labels
        known_edge_labels = self.schema.edge_labels
        graph_utils.check_edge_validity(edges, known_vertex_labels)
        for edge in edges:
            label_is_new = edge.label not in known_edge_labels
            check_argument(
                label_is_new,
                f"Duplicate label name with existing edge labels: {edge.label}",
            )

        op_attrs = graph_utils.assemble_op_config(edges, [], self._directed,
                                                  self._schema.oid_type,
                                                  self._generate_eid)
        new_graph_def = dag_utils.add_edges(self, attrs=op_attrs).eval()
        return Graph(self.session_id, new_graph_def)
Beispiel #10
0
 def __init__(self, graph_def, conn=None) -> None:
     """Build the schema from `graph_def` and remember the connection.

     The same `conn` object is stored on this instance and on its schema.
     """
     schema = self._schema = GraphSchema()
     schema.from_graph_def(graph_def)
     self._conn: Connection = conn
     schema._conn = conn
Beispiel #11
0
class DiGraph(Graph):
    """
    Base class for directed graphs.

    A DiGraph stores nodes and edges with optional data, or attributes.

    DiGraphs hold directed edges.  Self loops are allowed but multiple
    (parallel) edges are not.

    Nodes can be strings or integers objects with optional key/value attributes.

    Edges are represented as links between nodes with optional
    key/value attributes.

    Parameters
    ----------
    incoming_graph_data : input graph (optional, default: None)
        Data to initialize graph. If None (default) an empty
        graph is created.  The data can be any format that is supported
        by the to_networkx_graph() function, currently including edge list,
        dict of dicts, dict of lists, NetworkX graph, NumPy matrix
        or 2d ndarray, SciPy sparse matrix, or a graphscope graph.

    attr : keyword arguments, optional (default= no attributes)
        Attributes to add to graph as key=value pairs.

    See Also
    --------
    Graph
    graphscope.Graph

    Examples
    --------
    Create an empty graph structure (a "null graph") with no nodes and
    no edges.

    >>> G = nx.DiGraph()

    G can be grown in several ways.

    **Nodes:**

    Add one node at a time:

    >>> G.add_node(1)

    Add the nodes from any container (a list, dict, set or
    even the lines from a file or the nodes from another graph).

    >>> G.add_nodes_from([2, 3])
    >>> G.add_nodes_from(range(100, 110))
    >>> H = nx.path_graph(10)
    >>> G.add_nodes_from(H)

    In addition to integers, strings can represent a node.

    >>> G.add_node('a node')

    **Edges:**

    G can also be grown by adding edges.

    Add one edge,

    >>> G.add_edge(1, 2)

    a list of edges,

    >>> G.add_edges_from([(1, 2), (1, 3)])

    or a collection of edges,

    >>> G.add_edges_from(H.edges)

    If some edges connect nodes not yet in the graph, the nodes
    are added automatically.  There are no errors when adding
    nodes or edges that already exist.

    **Attributes:**

    Each graph, node, and edge can hold key/value attribute pairs
    in an associated attribute dictionary (the keys must be hashable).
    By default these are empty, but can be added or changed using
    add_edge, add_node or direct manipulation of the attribute
    dictionaries named graph, node and edge respectively.

    >>> G = nx.DiGraph(day="Friday")
    >>> G.graph
    {'day': 'Friday'}

    Add node attributes using add_node(), add_nodes_from() or G.nodes

    >>> G.add_node(1, time='5pm')
    >>> G.add_nodes_from([3], time='2pm')
    >>> G.nodes[1]
    {'time': '5pm'}
    >>> G.nodes[1]['room'] = 714
    >>> del G.nodes[1]['room'] # remove attribute
    >>> list(G.nodes(data=True))
    [(1, {'time': '5pm'}), (3, {'time': '2pm'})]

    Add edge attributes using add_edge(), add_edges_from(), subscript
    notation, or G.edges.

    >>> G.add_edge(1, 2, weight=4.7 )
    >>> G.add_edges_from([(3, 4), (4, 5)], color='red')
    >>> G.add_edges_from([(1, 2, {'color':'blue'}), (2, 3, {'weight':8})])
    >>> G[1][2]['weight'] = 4.7
    >>> G.edges[1, 2]['weight'] = 4

    Warning: we protect the graph data structure by making `G.edges[1, 2]` a
    read-only dict-like structure. However, you can assign to attributes
    in e.g. `G.edges[1, 2]`. Thus, use 2 sets of brackets to add/change
    data attributes: `G.edges[1, 2]['weight'] = 4`
    (For multigraphs: `MG.edges[u, v, key][name] = value`).

    **Shortcuts:**

    Many common graph features allow python syntax to speed reporting.

    >>> 1 in G     # check if node in graph
    True
    >>> [n for n in G if n < 3]  # iterate through nodes
    [1, 2]
    >>> len(G)  # number of nodes in graph
    5

    Often the best way to traverse all edges of a graph is via the neighbors.
    The neighbors are reported as an adjacency-dict `G.adj` or `G.adjacency()`

    >>> for n, nbrsdict in G.adjacency():
    ...     for nbr, eattr in nbrsdict.items():
    ...        if 'weight' in eattr:
    ...            # Do something useful with the edges
    ...            pass

    But the edges reporting object is often more convenient:

    >>> for u, v, weight in G.edges(data='weight'):
    ...     if weight is not None:
    ...         # Do something useful with the edges
    ...         pass

    **Reporting:**

    Simple graph information is obtained using object-attributes and methods.
    Reporting usually provides views instead of containers to reduce memory
    usage. The views update as the graph is updated similarly to dict-views.
    The objects `nodes`, `edges` and `adj` provide access to data attributes
    via lookup (e.g. `nodes[n]`, `edges[u, v]`, `adj[u][v]`) and iteration
    (e.g. `nodes.items()`, `nodes.data('color')`,
    `nodes.data('color', default='blue')` and similarly for `edges`)
    Views exist for `nodes`, `edges`, `neighbors()`/`adj` and `degree`.

    For details on these and other miscellaneous methods, see below.
    """
    def __init__(self, incoming_graph_data=None, **attr):
        """Initialize a directed graph with edges, name, or graph attributes.

        The graph is bound to the default session.  An empty graph is
        created in the engine unless the incoming data is a graphscope
        graph or the hidden ``create_empty_in_engine`` keyword is false.

        Parameters
        ----------
        incoming_graph_data : input graph (optional, default: None)
            Data to initialize the graph.  If None (default) an empty graph
            is created.  Any format supported by the to_nx_graph() function
            may be given, currently including edge list, dict of dicts,
            dict of lists, NetworkX graph, NumPy matrix or 2d ndarray,
            Pandas DataFrame, SciPy sparse matrix, or a graphscope graph.

        attr : keyword arguments, optional (default= no attributes)
            Attributes to add to graph as key=value pairs.

        Raises
        ------
        ValueError
            If no default session has been registered.

        See Also
        --------
        convert

        Examples
        --------
        >>> G = nx.Graph()  # or DiGraph
        >>> G = nx.Graph(name='my graph')
        >>> e = [(1, 2), (2, 3), (3, 4)]  # list of edges
        >>> G = nx.Graph(e)

        Graph attributes can be passed as arbitrary key=value pairs

        >>> G = nx.Graph(e, day="Friday")
        >>> G.graph
        {'day': 'Friday'}

        """
        default_session = get_default_session()
        if default_session is None:
            raise ValueError(
                "Cannot find a default session. "
                "Please register a session using graphscope.session(...).as_default()"
            )
        self._session_id = default_session.session_id

        self._key = None
        self._op = None
        self._graph_type = self._graph_type
        self._schema = GraphSchema()
        self._schema.init_nx_schema()
        # "create_empty_in_engine" is a hidden parameter; it is removed from
        # `attr` so that it does not end up among the graph attributes.
        build_empty = attr.pop("create_empty_in_engine", True)
        if not self.is_gs_graph(incoming_graph_data) and build_empty:
            empty_def = empty_graph_in_engine(self, self.is_directed())
            self._key = empty_def.key

        # Rebind the factories as instance attributes.
        self.graph_attr_dict_factory = self.graph_attr_dict_factory
        self.node_dict_factory = self.node_dict_factory
        self.adjlist_dict_factory = self.adjlist_dict_factory

        self.graph = self.graph_attr_dict_factory()
        self._node = self.node_dict_factory(self)
        self._adj = self.adjlist_dict_factory(self)
        self._pred = self.adjlist_dict_factory(self, types_pb2.PREDS_BY_NODE)
        self._succ = self._adj

        # Try to populate the graph from the incoming data.
        if incoming_graph_data is not None:
            if self.is_gs_graph(incoming_graph_data):
                converted_def = from_gs_graph(incoming_graph_data, self)
                self._key = converted_def.key
                self._schema.init_nx_schema(incoming_graph_data.schema)
            else:
                to_nx_graph(incoming_graph_data, create_using=self)

        # Graph attributes are applied last (must be after to_nx_graph).
        self.graph.update(attr)
        self._saved_signature = self.signature

    def __repr__(self):
        s = "graphscope.nx.DiGraph\n"
        s += "type: " + self.template_str.split("<")[0]
        s += str(self._schema)
        return s

    @property
    def adj(self):
        """Read-only adjacency view over the successors of every node.

        The returned object behaves like a dict keyed by node whose values
        are neighbor-dicts; each neighbor-dict maps a neighbor to the
        edge-data-dict of that edge.  For example,
        `G.succ[3][2]['color'] = 'blue'` sets the color of the edge
        `(3, 2)` to `"blue"`.

        Iteration works as for a dict, e.g.
        `for nbr, datadict in G.succ[n].items():`.  Beyond the dict API a
        data-view is available:
        `for nbr, foovalue in G.succ[node].data('foo'):`, and its `data`
        method accepts a `default` argument.

        Subscripting the graph gives the same neighbor information, so
        `for nbr, foovalue in G[node].data('foo', default=1):` works too.

        On a directed graph `G.adj` and `G.succ` are identical.
        """
        successor_map = self._succ
        return AdjacencyView(successor_map)

    succ = adj

    @property
    def pred(self):
        """Read-only adjacency view over the predecessors of every node.

        The returned object behaves like a dict keyed by node whose values
        are neighbor-dicts; each neighbor-dict maps a neighbor to the
        edge-data-dict of that edge.  For example,
        `G.pred[2][3]['color'] = 'blue'` sets the color of the edge
        `(3, 2)` to `"blue"`.

        Iteration works as for a dict, e.g.
        `for nbr, datadict in G.pred[n].items():`.  Beyond the dict API a
        data-view is available:
        `for nbr, foovalue in G.pred[node].data('foo'):`, and its `data`
        method accepts a `default` argument.
        """
        predecessor_map = self._pred
        return AdjacencyView(predecessor_map)

    def is_gs_graph(self, incoming_graph_data):
        """Tell whether `incoming_graph_data` is a graphscope property graph.

        The answer is True only when the object has a `graph_type` attribute
        equal to `types_pb2.ARROW_PROPERTY`; objects without that attribute
        yield False.
        """
        if not hasattr(incoming_graph_data, "graph_type"):
            return False
        return incoming_graph_data.graph_type == types_pb2.ARROW_PROPERTY

    def has_successor(self, u, v):
        """Report whether v is a successor of node u.

        Equivalent to asking whether the graph has the edge u->v.
        """
        edge_exists = self.has_edge(u, v)
        return edge_exists

    def has_predecessor(self, u, v):
        """Report whether v is a predecessor of node u.

        Equivalent to asking whether the graph has the edge u<-v, so the
        edge lookup is made with the arguments swapped.
        """
        edge_exists = self.has_edge(v, u)
        return edge_exists

    def successors(self, n):
        """Returns an iterator over successor nodes of n.

        A successor of n is a node m such that there exists a directed
        edge from n to m.

        Parameters
        ----------
        n : node
           A node in the graph

        Raises
        -------
        NetworkXError
           If n is not in the graph.

        See Also
        --------
        predecessors

        Notes
        -----
        neighbors() and successors() are the same.
        """
        try:
            return iter(self._succ[n])
        except KeyError as err:
            # Chain explicitly so the traceback shows the lookup failure as
            # the cause rather than as an error raised during handling.
            raise NetworkXError(
                "The node %s is not in the digraph." % (n, )) from err

    # digraph definitions
    neighbors = successors

    def predecessors(self, n):
        """Returns an iterator over predecessor nodes of n.

        A predecessor of n is a node m such that there exists a directed
        edge from m to n.

        Parameters
        ----------
        n : node
           A node in the graph

        Raises
        -------
        NetworkXError
           If n is not in the graph.

        See Also
        --------
        successors
        """
        try:
            return iter(self._pred[n])
        except KeyError as err:
            # Chain explicitly so the traceback shows the lookup failure as
            # the cause rather than as an error raised during handling.
            raise NetworkXError(
                "The node %s is not in the digraph." % (n, )) from err

    @property
    def edges(self):
        """The OutEdgeView of the DiGraph, used as G.edges or G.edges().

        edges(self, nbunch=None, data=False, default=None)

        The view supports set-like operations on the edge-tuples and
        edge attribute lookup.  Calling it yields an EdgeDataView that
        gives control over which edge attributes are reported (that object
        has no set-like operations).  Thus `G.edges[u, v]['color']` is the
        value of the color attribute of edge `(u, v)`, whereas
        `for (u, v, c) in G.edges.data('color', default='red'):`
        walks over all edges and yields the color attribute, substituting
        `'red'` where an edge has no color attribute.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            Restrict the view to edges incident to these nodes.
        data : string or bool, optional (default=False)
            Name of the edge attribute reported in the 3-tuple
            (u, v, ddict[data]).  With True the whole attribute dict is
            reported as (u, v, ddict); with False only the 2-tuple (u, v).
        default : value, optional (default=None)
            Substitute for edges lacking the requested attribute.
            Only meaningful when data is neither True nor False.

        Returns
        -------
        edges : OutEdgeView
            A view of edge attributes.  It normally iterates over (u, v)
            or (u, v, d) tuples of edges, and also supports attribute
            lookup as `edges[u, v]['foo']`.

        See Also
        --------
        in_edges, out_edges

        Notes
        -----
        Nodes in nbunch that are not in the graph will be (quietly) ignored.
        For directed graphs this returns the out-edges.

        Examples
        --------
        >>> G = nx.DiGraph()   # or MultiDiGraph, etc
        >>> nx.add_path(G, [0, 1, 2])
        >>> G.add_edge(2, 3, weight=5)
        >>> [e for e in G.edges]
        [(0, 1), (1, 2), (2, 3)]
        >>> G.edges.data()  # default data is {} (empty dict)
        OutEdgeDataView([(0, 1, {}), (1, 2, {}), (2, 3, {'weight': 5})])
        >>> G.edges.data('weight', default=1)
        OutEdgeDataView([(0, 1, 1), (1, 2, 1), (2, 3, 5)])
        >>> G.edges([0, 2])  # only edges incident to these nodes
        OutEdgeDataView([(0, 1), (2, 3)])
        >>> G.edges(0)  # only edges incident to a single node (use G.adj[0]?)
        OutEdgeDataView([(0, 1)])

        """
        out_edge_view = OutEdgeView(self)
        return out_edge_view

    # alias out_edges to edges
    out_edges = edges

    @property
    def in_edges(self):
        """The InEdgeView of the graph, used as G.in_edges or G.in_edges().

        in_edges(self, nbunch=None, data=False, default=None):

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            Restrict the view to edges incident to these nodes.
        data : string or bool, optional (default=False)
            Name of the edge attribute reported in the 3-tuple
            (u, v, ddict[data]).  With True the whole attribute dict is
            reported as (u, v, ddict); with False only the 2-tuple (u, v).
        default : value, optional (default=None)
            Substitute for edges lacking the requested attribute.
            Only meaningful when data is neither True nor False.

        Returns
        -------
        in_edges : InEdgeView
            A view of edge attributes.  It normally iterates over (u, v)
            or (u, v, d) tuples of edges, and also supports attribute
            lookup as `edges[u, v]['foo']`.

        See Also
        --------
        edges
        """
        in_edge_view = InEdgeView(self)
        return in_edge_view

    @property
    def degree(self):
        """The DegreeView of the graph, used as G.degree or G.degree().

        A node's degree is the number of edges adjacent to it.  Its
        weighted degree is the sum of the weights of the edges incident
        to it.

        The view can be iterated as (node, degree) pairs and also
        supports looking up the degree of a single node.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            Restrict the view to edges incident to these nodes.

        weight : string or None, optional (default=None)
           Name of the edge attribute holding the numerical value used as
           a weight.  With None every edge counts with weight 1.
           The degree is the sum of the edge weights adjacent to the node.

        Returns
        -------
        If a single node is requested
        deg : int
            Degree of the node

        OR if multiple nodes are requested
        nd_iter : iterator
            The iterator returns two-tuples of (node, degree).

        See Also
        --------
        in_degree, out_degree

        Examples
        --------
        >>> G = nx.DiGraph()   # or MultiDiGraph
        >>> nx.add_path(G, [0, 1, 2, 3])
        >>> G.degree(0) # node 0 with degree 1
        1
        >>> list(G.degree([0, 1, 2]))
        [(0, 1), (1, 2), (2, 2)]

        """
        degree_view = DiDegreeView(self)
        return degree_view

    @property
    def in_degree(self):
        """The InDegreeView: (node, in_degree) pairs or a single in_degree.

        A node's in_degree is the number of edges pointing to it.  Its
        weighted degree is the sum of the weights of the edges incident
        to it.

        The view can be iterated as (node, in_degree) pairs and also
        supports looking up the degree of a single node.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            Restrict the view to edges incident to these nodes.

        weight : string or None, optional (default=None)
           Name of the edge attribute holding the numerical value used as
           a weight.  With None every edge counts with weight 1.
           The degree is the sum of the edge weights adjacent to the node.

        Returns
        -------
        If a single node is requested
        deg : int
            In-degree of the node

        OR if multiple nodes are requested
        nd_iter : iterator
            The iterator returns two-tuples of (node, in-degree).

        See Also
        --------
        degree, out_degree

        Examples
        --------
        >>> G = nx.DiGraph()
        >>> nx.add_path(G, [0, 1, 2, 3])
        >>> G.in_degree(0) # node 0 with degree 0
        0
        >>> list(G.in_degree([0, 1, 2]))
        [(0, 0), (1, 1), (2, 1)]
        """
        in_degree_view = InDegreeView(self)
        return in_degree_view

    @property
    def out_degree(self):
        """The OutDegreeView giving (node, out_degree) pairs.

        A node's out_degree is the number of edges pointing out of it.
        Its weighted degree is the sum of the weights of the edges
        incident to it.

        The view can be iterated as (node, out_degree) pairs and also
        supports looking up the degree of a single node.

        Parameters
        ----------
        nbunch : single node, container, or all nodes (default= all nodes)
            Restrict the view to edges incident to these nodes.

        weight : string or None, optional (default=None)
           Name of the edge attribute holding the numerical value used as
           a weight.  With None every edge counts with weight 1.
           The degree is the sum of the edge weights adjacent to the node.

        Returns
        -------
        If a single node is requested
        deg : int
            Out-degree of the node

        OR if multiple nodes are requested
        nd_iter : iterator
            The iterator returns two-tuples of (node, out-degree).

        See Also
        --------
        degree, in_degree

        Examples
        --------
        >>> G = nx.DiGraph()
        >>> nx.add_path(G, [0, 1, 2, 3])
        >>> G.out_degree(0) # node 0 with degree 1
        1
        >>> list(G.out_degree([0, 1, 2]))
        [(0, 1), (1, 1), (2, 1)]
        """
        out_degree_view = OutDegreeView(self)
        return out_degree_view

    def is_directed(self):
        """Report that this graph is directed; the answer is always True."""
        return True

    def is_multigraph(self):
        """Report that this graph is not a multigraph; always False."""
        return False

    def reverse(self, copy=True):
        """Returns the reverse of the graph.

        The reverse is a graph with the same nodes and edges
        but with the directions of the edges reversed.

        Parameters
        ----------
        copy : bool optional (default=True)
            If True, return a new DiGraph holding the reversed edges.
            If False, the reverse graph is created using a view of
            the original graph.

        Returns
        -------
        The reversed graph: a new instance of this graph's class when
        `copy` is True, otherwise the result of `reverse_view(self)`.
        """
        if not copy:
            return reverse_view(self)
        # The engine-side graph comes from the copy op below, so no empty
        # graph is created in the engine for the new object.
        g = self.__class__(create_empty_in_engine=False)
        # Give the copy its own attribute dict.  Assigning `self.graph`
        # directly made both graphs share one dict, so attribute changes on
        # either graph (including the name assignment below) leaked into
        # the other.
        g.graph.update(deepcopy(self.graph))
        g.name = self.name
        g._op = self._op
        op = copy_graph(self, "reverse")
        graph_def = op.eval()
        g._key = graph_def.key
        g._schema = deepcopy(self._schema)
        return g
Beispiel #12
0
    def __init__(self, incoming_graph_data=None, **attr):
        """Initialize a graph with edges, name, or graph attributes.

        The graph is bound to the default session.  An empty graph is
        created in the engine unless the incoming data is a graphscope
        graph or the hidden ``create_empty_in_engine`` keyword is false.

        Parameters
        ----------
        incoming_graph_data : input graph (optional, default: None)
            Data to initialize the graph.  If None (default) an empty graph
            is created.  Any format supported by the to_nx_graph() function
            may be given, currently including edge list, dict of dicts,
            dict of lists, NetworkX graph, NumPy matrix or 2d ndarray,
            Pandas DataFrame, SciPy sparse matrix, or a graphscope graph.

        attr : keyword arguments, optional (default= no attributes)
            Attributes to add to graph as key=value pairs.

        Raises
        ------
        ValueError
            If no default session has been registered.

        See Also
        --------
        convert

        Examples
        --------
        >>> G = nx.Graph()  # or DiGraph
        >>> G = nx.Graph(name='my graph')
        >>> e = [(1, 2), (2, 3), (3, 4)]  # list of edges
        >>> G = nx.Graph(e)

        Graph attributes can be passed as arbitrary key=value pairs

        >>> G = nx.Graph(e, day="Friday")
        >>> G.graph
        {'day': 'Friday'}

        """
        default_session = get_default_session()
        if default_session is None:
            raise ValueError(
                "Cannot find a default session. "
                "Please register a session using graphscope.session(...).as_default()"
            )
        self._session_id = default_session.session_id

        self._key = None
        self._op = None
        self._graph_type = self._graph_type
        self._schema = GraphSchema()
        self._schema.init_nx_schema()
        # "create_empty_in_engine" is a hidden parameter; it is removed from
        # `attr` so that it does not end up among the graph attributes.
        build_empty = attr.pop("create_empty_in_engine", True)
        if not self.is_gs_graph(incoming_graph_data) and build_empty:
            empty_def = empty_graph_in_engine(self, self.is_directed())
            self._key = empty_def.key

        # Rebind the factories as instance attributes.
        self.graph_attr_dict_factory = self.graph_attr_dict_factory
        self.node_dict_factory = self.node_dict_factory
        self.adjlist_dict_factory = self.adjlist_dict_factory

        self.graph = self.graph_attr_dict_factory()
        self._node = self.node_dict_factory(self)
        self._adj = self.adjlist_dict_factory(self)
        self._pred = self.adjlist_dict_factory(self, types_pb2.PREDS_BY_NODE)
        self._succ = self._adj

        # Try to populate the graph from the incoming data.
        if incoming_graph_data is not None:
            if self.is_gs_graph(incoming_graph_data):
                converted_def = from_gs_graph(incoming_graph_data, self)
                self._key = converted_def.key
                self._schema.init_nx_schema(incoming_graph_data.schema)
            else:
                to_nx_graph(incoming_graph_data, create_using=self)

        # Graph attributes are applied last (must be after to_nx_graph).
        self.graph.update(attr)
        self._saved_signature = self.signature
Beispiel #13
0
class Graph(GraphInterface):
    """A class for representing metadata of a graph in the GraphScope.

    A :class:`Graph` object holds the metadata of a graph, such as key, schema, and the graph is directed or not.

    It is worth noticing that the graph is stored by the backend such as Analytical Engine, Vineyard.
    In other words, the graph object holds nothing but metadata.

    The following example demonstrates its usage:

    .. code:: python

        >>> import graphscope as gs
        >>> sess = gs.session()
        >>> graph = sess.g()
        >>> graph = graph.add_vertices("person.csv", "person")
        >>> graph = graph.add_vertices("software.csv", "software")
        >>> graph = graph.add_edges("knows.csv", "knows", src_label="person", dst_label="person")
        >>> graph = graph.add_edges("created.csv", "created", src_label="person", dst_label="software")
        >>> print(graph)
        >>> print(graph.schema)
    """

    def __init__(
        self,
        graph_node,
    ):
        """Build a :class:`Graph` around an existing graph DAG node.

        Args:
            graph_node: The DAG node wrapped by this object. Its ``session``
                is reused, its ``op`` is replaced by a deep copy and the node
                is flagged as evaluated.
        """
        # Assigned before anything else: ``__getattr__`` falls back to this node.
        self._graph_node = graph_node
        node = self._graph_node
        self._session = node.session

        # Swap in a private copy of the op, flag the node as evaluated and
        # register the copied op with the session's DAG.
        node.op = deepcopy(node.op)
        node.evaluated = True
        self._session.dag.add_op(node.op)

        # Placeholders; the real values arrive through ``update_from_graph_def``.
        self._key = None
        self._vineyard_id = 0
        self._schema = GraphSchema()
        self._detached = False

        # Book-keeping for interactive / learning instances attached to the graph.
        self._interactive_instance_launching_thread = None
        self._interactive_instance_list = []
        self._learning_instance_list = []

    def __del__(self):
        """Best-effort unload of the graph when this object is finalized."""
        # Cleanly ignore all exceptions, because the session may already be
        # closed / destroyed by the time the finalizer runs.
        try:
            self.unload()
        except Exception:  # pylint: disable=broad-except
            pass

    def _close_interactive_instances(self):
        # Close related interactive instances when graph unloaded.
        # Since the graph is gone, quering via interactive client is meaningless.
        for instance in self._interactive_instance_list:
            instance.close()
        self._interactive_instance_list.clear()

    def _close_learning_instances(self):
        for instance in self._learning_instance_list:
            instance.close()
        self._learning_instance_list.clear()

    def _launch_interactive_instance_impl(self):
        """Ask the session to start an interactive (gremlin) instance for this graph.

        Used as the thread target in ``update_from_graph_def``. Every failure
        is swallowed here, so nothing propagates out of the thread.
        """
        try:
            self._session.gremlin(self)
        except:  # noqa: E722
            # The error message is recorded in `InteractiveQuery` when launching
            # fails (per the original note; not visible from this method).
            # Unexpected exceptions are deliberately suppressed here.
            pass

    def update_from_graph_def(self, graph_def):
        """Refresh this object's metadata from a graph definition message.

        Args:
            graph_def: Graph definition whose ``extension`` field is unpacked
                into a ``VineyardInfoPb`` message.
        """
        # A flattened graph definition forces the node's type to flattened too.
        flattened = graph_def_pb2.ARROW_FLATTENED
        if graph_def.graph_type == flattened:
            self._graph_node._graph_type = flattened
        node_type = self._graph_node.graph_type
        check_argument(
            node_type == graph_def.graph_type,
            f"Graph type doesn't match {node_type} versus {graph_def.graph_type}",
        )

        # Plain fields of the definition.
        self._key = graph_def.key
        self._directed = graph_def.directed
        self._is_multigraph = graph_def.is_multigraph

        # Vineyard specific details travel in the packed extension.
        vineyard_info = graph_def_pb2.VineyardInfoPb()
        graph_def.extension.Unpack(vineyard_info)
        self._vineyard_id = vineyard_info.vineyard_id
        self._oid_type = data_type_to_cpp(vineyard_info.oid_type)
        self._generate_eid = vineyard_info.generate_eid
        self._schema_path = vineyard_info.schema_path

        # Rebuild the schema and cache its label views.
        schema = self._schema
        schema.from_graph_def(graph_def)
        self._v_labels = schema.vertex_labels
        self._e_labels = schema.edge_labels
        self._e_relationships = schema.edge_relationships

        # The saved signature depends on the schema, so take it only now.
        self._saved_signature = self.signature

        # Create the gremlin server pod asynchronously.
        if self._session.eager() and gs_config.initializing_interactive_engine:
            launcher = threading.Thread(target=self._launch_interactive_instance_impl)
            self._interactive_instance_launching_thread = launcher
            launcher.start()

    def __getattr__(self, name):
        if hasattr(self._graph_node, name):
            return getattr(self._graph_node, name)
        raise AttributeError("{0} not found.".format(name))

    @property
    def key(self):
        """The key of the corresponding graph in the engine.

        Returns:
            The key copied from the graph definition in
            ``update_from_graph_def``; ``None`` before that call and again
            after :meth:`unload`.
        """
        return self._key

    @property
    def schema(self):
        """Schema of the graph.

        Returns:
            :class:`GraphSchema`: The schema object created in ``__init__`` and
                filled in by ``update_from_graph_def``.
        """
        return self._schema

    @property
    def schema_path(self):
        """Path to which the Coordinator writes the interactive engine schema.

        Returns:
            str: The path of the schema used by the interactive engine.
        """
        # Only assigned in ``update_from_graph_def`` (from the vineyard info payload).
        return self._schema_path

    @property
    def signature(self):
        return hashlib.sha256(
            "{}.{}".format(self._schema.signature(), self._key).encode("utf-8")
        ).hexdigest()

    @property
    def op(self):
        """The op currently held by the wrapped graph node."""
        return self._graph_node.op

    @property
    def template_str(self):
        """C++ fragment template string matching this graph's type.

        Returns:
            str: The fragment class name with its template arguments filled in
                from the oid type and the schema's vid / vdata / edata types.

        Raises:
            ValueError: If the graph type is none of ``ARROW_PROPERTY``,
                ``ARROW_PROJECTED``, ``ARROW_FLATTENED`` or ``DYNAMIC_PROJECTED``.
        """
        # transform str/string to std::string
        oid_type = utils.normalize_data_type_str(self._oid_type)
        vid_type = utils.data_type_to_cpp(self._schema._vid_type)
        vdata_type = utils.data_type_to_cpp(self._schema.vdata_type)
        edata_type = utils.data_type_to_cpp(self._schema.edata_type)
        if self._graph_type == graph_def_pb2.ARROW_PROPERTY:
            template = f"vineyard::ArrowFragment<{oid_type},{vid_type}>"
        elif self._graph_type == graph_def_pb2.ARROW_PROJECTED:
            template = f"gs::ArrowProjectedFragment<{oid_type},{vid_type},{vdata_type},{edata_type}>"
        elif self._graph_type == graph_def_pb2.ARROW_FLATTENED:
            # Spelled out in full and namespace-qualified like the sibling
            # branches; this used to read "ArrowFlattenedFragmen".
            template = f"gs::ArrowFlattenedFragment<{oid_type},{vid_type},{vdata_type},{edata_type}>"
        elif self._graph_type == graph_def_pb2.DYNAMIC_PROJECTED:
            template = f"gs::DynamicProjectedFragment<{vdata_type},{edata_type}>"
        else:
            raise ValueError(f"Unsupported graph type: {self._graph_type}")
        return template

    @property
    def vineyard_id(self):
        """Get the vineyard object_id of this graph.

        Returns:
            str: The vineyard id of this graph.
        """
        # NOTE(review): ``__init__`` seeds this with the int ``0`` and
        # ``update_from_graph_def`` overwrites it from the vineyard info
        # payload; the documented ``str`` return type is unconfirmed.
        return self._vineyard_id

    @property
    def session_id(self):
        """Get the current session_id.

        Returns:
            str: The id of the session that the graph belongs to.
        """
        return self._session.session_id

    def detach(self):
        """Detaching a graph leaves it in vineyard even when the variable for
        this :class:`Graph` object leaves the lexical scope.

        The graph can be accessed using the graph's :code:`ObjectID` or its name later.
        """
        # ``unload`` skips the engine-side unload once this flag is set.
        self._detached = True

    def loaded(self):
        """True if current graph has been loaded in the session."""
        return self._session.info["status"] == "active" and self._key is not None

    def __str__(self):
        """Render the graph type followed by one line per vertex label and edge relation."""
        vertex_lines = [f"VERTEX: {label}" for label in self._v_labels]
        v_text = "\n".join(vertex_lines)

        # One line per (edge label, source label, destination label) triple;
        # relationships are looked up by the position of the edge label.
        edge_lines = [
            f"EDGE: {label}\tsrc: {src}\tdst: {dst}"
            for position, label in enumerate(self._e_labels)
            for src, dst in self._e_relationships[position]
        ]
        e_text = "\n".join(edge_lines)

        type_name = graph_def_pb2.GraphTypePb.Name(self._graph_type)
        return f"graphscope.Graph\n{type_name}\n{v_text}\n{e_text}"

    def __repr__(self):
        return self.__str__()

    def unload(self):
        """Unload this graph from graphscope engine."""
        if self._session.info["status"] != "active" or self._key is None:
            return

        # close interactive instances first
        try:
            if (
                self._interactive_instance_launching_thread is not None
                and self._interactive_instance_launching_thread.is_alive()
            ):
                # join raises a RuntimeError if an attempt is made to join the current thread.
                # this exception occurs when a object collected by gc mechanism contains a running thread.
                if (
                    threading.current_thread()
                    != self._interactive_instance_launching_thread
                ):
                    self._interactive_instance_launching_thread.join()
            self._close_interactive_instances()
        except Exception as e:
            logger.error("Failed to close interactive instances: %s" % e)
        try:
            self._close_learning_instances()
        except Exception as e:
            logger.error("Failed to close learning instances: %s" % e)
        rlt = None
        if not self._detached:
            rlt = self._session._wrapper(self._graph_node.unload())
        self._key = None
        return rlt

    def _project_to_simple(self, v_prop=None, e_prop=None):
        return self._session._wrapper(
            self._graph_node._project_to_simple(v_prop, e_prop)
        )

    def add_column(self, results, selector):
        return self._session._wrapper(self._graph_node.add_column(results, selector))

    def to_numpy(self, selector, vertex_range=None):
        """Select some elements of the graph and output to numpy.

        Args:
            selector (str): Select a portion of graph as a numpy.ndarray.
            vertex_range(dict, optional): Slice vertices. Defaults to None.

        Returns:
            `numpy.ndarray`
        """
        self._check_unmodified()
        return self._session._wrapper(self._graph_node.to_numpy(selector, vertex_range))

    def to_dataframe(self, selector, vertex_range=None):
        """Select some elements of the graph and output as a pandas.DataFrame

        Args:
            selector (dict): Select some portions of graph.
            vertex_range (dict, optional): Slice vertices. Defaults to None.

        Returns:
            `pandas.DataFrame`
        """
        self._check_unmodified()
        return self._session._wrapper(
            self._graph_node.to_dataframe(selector, vertex_range)
        )

    def to_directed(self):
        """Returns a directed representation of the graph.

        Returns:
            :class:`Graph`: A directed graph with the same name, same nodes, and
                with each edge (u, v, data) replaced by two directed edges (u, v, data) and (v, u, data).

        """
        if self._directed:
            return self
        return self._session._wrapper(self._graph_node.to_directed())

    def to_undirected(self):
        """Returns an undirected representation of the digraph.

        Returns:
            :class:`Graph`: An undirected graph with the same name and nodes and
                with edge (u, v, data) if either (u, v, data) or (v, u, data) is in the digraph.
                If both edges exist in digraph, they will both be preserved.
                You must check and correct for this manually if desired.
        """
        if not self._directed:
            return self
        return self._session._wrapper(self._graph_node.to_undirected())

    def is_directed(self):
        """Return the ``directed`` flag recorded by ``update_from_graph_def``."""
        return self._directed

    def is_multigraph(self):
        """Return the ``is_multigraph`` flag recorded by ``update_from_graph_def``."""
        return self._is_multigraph

    def _check_unmodified(self):
        """Validate, via ``check_argument``, that the signature still equals the saved one."""
        unchanged = self.signature == self._saved_signature
        check_argument(unchanged, "Graph has been modified!")

    def _attach_interactive_instance(self, instance):
        """Store the instance when a new interactive instance is started.

        Args:
            instance: interactive instance
        """
        self._interactive_instance_list.append(instance)

    def _attach_learning_instance(self, instance):
        """Store the instance when a new learning instance is created.

        Args:
            instance: learning instance
        """
        self._learning_instance_list.append(instance)

    def save_to(self, path, **kwargs):
        """Serialize graph to a location.
        The meta and data of graph is dumped to specified location,
        and can be restored by `Graph.deserialize` in other sessions.

        Each worker will write a `path_{worker_id}.meta` file and
        a `path_{worker_id}` file to storage.
        Args:
            path (str): supported storages are local, hdfs, oss, s3
        """
        try:
            import vineyard
            import vineyard.io
        except ImportError:
            raise RuntimeError(
                "Saving context to locations requires 'vineyard', "
                "please install those two dependencies via "
                "\n"
                "\n"
                "    pip3 install vineyard vineyard-io"
                "\n"
                "\n"
            )

        sess = self._session
        deployment = "kubernetes" if sess.info["type"] == "k8s" else "ssh"
        conf = sess.info["engine_config"]
        vineyard_endpoint = conf["vineyard_rpc_endpoint"]
        vineyard_ipc_socket = conf["vineyard_socket"]
        if sess.info["type"] == "k8s":
            hosts = [
                "{}:{}".format(sess.info["namespace"], s)
                for s in sess.info["engine_hosts"].split(",")
            ]
        else:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        vineyard.io.serialize(
            path,
            vineyard.ObjectID(self._vineyard_id),
            type="global",
            vineyard_ipc_socket=vineyard_ipc_socket,
            vineyard_endpoint=vineyard_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )

    @classmethod
    def load_from(cls, path, sess, **kwargs):
        """Construct a `Graph` by deserialize from `path`.
        It will read all serialization files, which is dumped by
        `Graph.serialize`.
        If any serialize file doesn't exists or broken, will error out.

        Args:
            path (str): Path contains the serialization files.
            sess (`graphscope.Session`): The target session
                that the graph will be construct in

        Returns:
            `Graph`: A new graph object. Schema and data is supposed to be
                identical with the one that called serialized method.
        """
        try:
            import vineyard
            import vineyard.io
        except ImportError:
            raise RuntimeError(
                "Saving context to locations requires 'vineyard', "
                "please install those two dependencies via "
                "\n"
                "\n"
                "    pip3 install vineyard vineyard-io"
                "\n"
                "\n"
            )

        deployment = "kubernetes" if sess.info["type"] == "k8s" else "ssh"
        conf = sess.info["engine_config"]
        vineyard_endpoint = conf["vineyard_rpc_endpoint"]
        vineyard_ipc_socket = conf["vineyard_socket"]
        if sess.info["type"] == "k8s":
            hosts = [
                "{}:{}".format(sess.info["namespace"], s)
                for s in sess.info["engine_hosts"].split(",")
            ]
        else:  # type == "hosts"
            hosts = sess.info["engine_hosts"].split(",")
        graph_id = vineyard.io.deserialize(
            path,
            type="global",
            vineyard_ipc_socket=vineyard_ipc_socket,
            vineyard_endpoint=vineyard_endpoint,
            storage_options=kwargs,
            deployment=deployment,
            hosts=hosts,
        )
        return sess._wrapper(GraphDAGNode(sess, vineyard.ObjectID(graph_id)))

    def add_vertices(self, vertices, label="_", properties=None, vid_field=0):
        if not self.loaded():
            raise RuntimeError("The graph is not loaded")
        return self._session._wrapper(
            self._graph_node.add_vertices(vertices, label, properties, vid_field)
        )

    def add_edges(
        self,
        edges,
        label="_",
        properties=None,
        src_label=None,
        dst_label=None,
        src_field=0,
        dst_field=1,
    ):
        if not self.loaded():
            raise RuntimeError("The graph is not loaded")
        return self._session._wrapper(
            self._graph_node.add_edges(
                edges, label, properties, src_label, dst_label, src_field, dst_field
            )
        )

    def project(
        self,
        vertices: Mapping[str, Union[List[str], None]],
        edges: Mapping[str, Union[List[str], None]],
    ):
        if not self.loaded():
            raise RuntimeError("The graph is not loaded")
        return self._session._wrapper(self._graph_node.project(vertices, edges))