def load_ogbn_arxiv(sess=None, prefix=None):
    """Load ogbn_arxiv graph.

    The ogbn-arxiv dataset is a directed graph, representing the citation
    network between all Computer Science (CS) arXiv papers indexed by
    Microsoft Academic Graph (MAG).
    See more details here:

        https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_arxiv
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_ogbn_arxiv(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_arxiv
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_ogbn_arxiv(sess, "/path/to/dataset")
    """
    if prefix is not None:
        # Only environment variables are expanded; "~" is left untouched.
        prefix = os.path.expandvars(prefix)
    else:
        fname = "ogbn_arxiv.tar.gz"
        origin = f"{DATA_SITE}/ogbn_arxiv.tar.gz"
        fpath = download_file(
            fname,
            origin=origin,
            extract=True,
            file_hash="d920922681e8369da5dc8e0f28fffae2eb0db056dc626097f4159351d4ea4389",
        )
        # assumed dirname is ogbn_arxiv after extracting from ogbn_arxiv.tar.gz,
        # so drop the archive suffix from the returned path
        prefix = fpath[: -len(".tar.gz")]

    if sess is None:
        sess = get_default_session()

    graph = sess.g()
    graph = graph.add_vertices(os.path.join(prefix, "nodes.csv"), "paper").add_edges(
        os.path.join(prefix, "edge.csv"), "citation"
    )
    return graph
def __init__(self, incoming_graph_data=None, default_label=None, **attr):
    """Initialize the graph, optionally creating it in the engine and loading data.

    Args:
        incoming_graph_data: Optional data used to populate the graph. When
            it is not None it is passed to ``to_networkx_graph`` with this
            instance as ``create_using``. If ``self._is_gs_graph`` reports
            True for it, distributed mode is forced on.
        default_label: Stored on the instance as ``_default_label``.
        **attr: Graph attributes merged into ``self.graph``. Two hidden keys
            are popped first and never stored as attributes:
            ``create_empty_in_engine`` (default True) and ``dist``
            (default False).
    """
    # Rebind the factories on the instance.
    # NOTE(review): presumably these are class-level attributes — confirm.
    self.graph_attr_dict_factory = self.graph_attr_dict_factory
    self.node_dict_factory = self.node_dict_factory
    self.adjlist_outer_dict_factory = self.adjlist_outer_dict_factory
    self.cache = self.graph_cache_factory(self)

    # init node and adj (must be after cache)
    self.graph = self.graph_attr_dict_factory()
    self._node = self.node_dict_factory(self)
    self._adj = self.adjlist_outer_dict_factory(self)
    # successors share the adjacency object; predecessors get their own view
    self._succ = self._adj
    self._pred = self.adjlist_outer_dict_factory(self, pred=True)

    self._key = None
    self._op = None
    self._graph_type = self._graph_type
    self._schema = GraphSchema()

    # cache for add_node and add_edge
    self._add_node_cache = []
    self._add_edge_cache = []
    self._remove_node_cache = []
    self._remove_edge_cache = []

    create_empty_in_engine = attr.pop(
        "create_empty_in_engine", True
    )  # a hidden parameter
    self._distributed = attr.pop("dist", False)
    if incoming_graph_data is not None and self._is_gs_graph(incoming_graph_data):
        # convert from gs graph always use distributed mode
        self._distributed = True
        # self._session is read before this method assigns it.
        # NOTE(review): presumably set at class level or by a factory — confirm.
        if self._session is None:
            self._session = get_session_by_id(incoming_graph_data.session_id)
    self._default_label = default_label
    self._default_label_id = -1

    # fall back to the default session when none was resolved above
    if self._session is None:
        self._session = get_default_session()

    if not self._is_gs_graph(incoming_graph_data) and create_empty_in_engine:
        graph_def = init_empty_graph_in_engine(
            self, self.is_directed(), self._distributed
        )
        self._key = graph_def.key

    # attempt to load graph with data
    if incoming_graph_data is not None:
        to_networkx_graph(incoming_graph_data, create_using=self)
        self.cache.warmup()

    # load graph attributes (must be after to_networkx_graph)
    self.graph.update(attr)
    self._saved_signature = self.signature
    self._is_client_view = False
def test_import(self):
    """Check that graphs created via each ``nx`` module bind to the right session.

    A graph built from the top-level ``graphscope.nx`` module must report the
    default session's id, while graphs built from ``session.nx()`` must
    report the id of the session that produced the module.

    Both sessions are closed in ``finally`` blocks so that a failing
    assertion does not leave them open.
    """
    import graphscope.nx as nx_default

    try:
        nx1 = self.session1.nx()
        nx2 = self.session2.nx()
        G = nx_default.Graph()
        G1 = nx1.Graph()
        G2 = nx2.Graph()
        assert G.session_id == get_default_session().session_id
        assert G1.session_id == self.session1.session_id
        assert G2.session_id == self.session2.session_id
    finally:
        # Nested so that session2 is still closed if closing session1 raises.
        try:
            self.session1.close()
        finally:
            self.session2.close()
def finish(self):
    """Run the optional preprocessor once and mark this object as finished.

    Does nothing when ``self.finished`` is already true. Otherwise, if a
    preprocessor is set, it is called with the current source, the storage
    options, the options as a dict, and the default session; its result
    replaces ``self.protocol`` and ``self.source``.
    """
    if self.finished:
        return

    preprocess = self.preprocessor
    if preprocess is not None:
        resolved = preprocess(
            self.source,
            self.storage_options,
            self.options.to_dict(),
            get_default_session(),
        )
        # the preprocessor yields a (protocol, source) pair
        self.protocol, self.source = resolved
        logger.debug(
            f"processed protocol = {self.protocol}, source = {self.source}"
        )

    self.finished = True
def process_vineyard(self, source):
    """Open ``source`` through ``vineyard.io.open`` and record the result.

    The source string is normalized first: a leading ``vineyard://`` is
    removed, a ``file://`` scheme is added when no scheme is present, and the
    string form of ``self.options`` is appended after ``#`` (or ``&`` when
    the source already contains ``#``). Connection settings are read from
    the ``info`` of ``self.session``, or of the default session when
    ``self.session`` is None.

    On return ``self.protocol`` is ``"vineyard"`` and ``self.source`` is the
    ``repr`` of the object returned by ``vineyard.io.open``.

    Raises:
        RuntimeError: If the ``vineyard`` module is not available.
    """
    if vineyard is None:
        raise RuntimeError("Vineyard is not installed")

    scheme_prefix = "vineyard://"
    if source.startswith(scheme_prefix):
        source = source[len(scheme_prefix):]
    if not urlparse(source).scheme:
        source = "file://%s" % source

    # options are attached as a fragment; extend an existing one with "&"
    template = "%s&%s" if "#" in source else "%s#%s"
    source = template % (source, str(self.options))

    sess = self.session if self.session is not None else get_default_session()
    info = sess.info
    engine_conf = info["engine_config"]
    rpc_endpoint = engine_conf["vineyard_rpc_endpoint"]
    ipc_socket = engine_conf["vineyard_socket"]
    hosts = info["engine_hosts"].split(",")

    deployment = "ssh"
    if "namespace" in info:
        deployment = "kubernetes"
        namespace = info["namespace"]
        hosts = ["%s:%s" % (namespace, host) for host in hosts]
    num_workers = info["num_workers"]

    self.protocol = "vineyard"
    stream = vineyard.io.open(
        source,
        mode="r",
        vineyard_endpoint=rpc_endpoint,
        vineyard_ipc_socket=ipc_socket,
        hosts=hosts,
        num_workers=num_workers,
        deployment=deployment,
    )
    self.source = repr(stream)
    logger.debug("opened vineyard stream id = %s", self.source)
def load_ogbn_proteins(sess=None, prefix=None):
    """Load ogbn_proteins graph.

    The ogbn-proteins dataset is an undirected, weighted, and typed
    (according to species) graph. Nodes represent proteins, and edges
    indicate different types of biologically meaningful associations between
    proteins, e.g., physical interactions, co-expression or homology [1,2].
    All edges come with 8-dimensional features, where each dimension
    represents the approximate confidence of a single association type and
    takes values between 0 and 1 (the larger the value is, the more confident
    we are about the association). The proteins come from 8 species.
    See more details here:

        https://ogb.stanford.edu/docs/nodeprop/#ogbn-proteins

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_proteins
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_ogbn_proteins(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_proteins
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_ogbn_proteins(sess, "/path/to/dataset")
    """
    if prefix is not None:
        # Only environment variables are expanded; "~" is left untouched.
        prefix = os.path.expandvars(prefix)
    else:
        fname = "ogbn_proteins.tar.gz"
        origin = f"{DATA_SITE}/ogbn_proteins.tar.gz"
        fpath = download_file(
            fname,
            origin=origin,
            extract=True,
            file_hash="ea427e520bf068f3d6788d940b3bdc6773b965d792f2fa4a52311eab478acbde",
        )
        # assumed dirname is ogbn_proteins after extracting from
        # ogbn_proteins.tar.gz, so drop the archive suffix from the path
        prefix = fpath[: -len(".tar.gz")]

    if sess is None:
        sess = get_default_session()

    graph = sess.g()
    graph = graph.add_vertices(os.path.join(prefix, "nodes.csv"), "proteins").add_edges(
        os.path.join(prefix, "edge.csv"), "associations"
    )
    return graph
def load_u2i(sess=None, prefix=None, directed=True):
    """Load the user-to-item (u2i) dataset.

    The dataset consists of 5241 nodes, covering both user and item nodes,
    and 42876 edges representing buying relationships. It is owned by
    graphlearn and can be downloaded from:

        https://github.com/alibaba/graph-learn/blob/graphscope/examples/data/u2i.py

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or undirected
            graph. Defaults to True.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_u2i
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_u2i(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_u2i
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_u2i(sess, "/path/to/dataset")
    """
    if prefix is None:
        archive = download_file(
            "u2i.tar.gz",
            origin=f"{DATA_SITE}/u2i.tar.gz",
            extract=True,
            file_hash="b5475a0f6f13b0964ba0c38804d06003a44627653df3371d938e47fb9eedced6",
        )
        # assumed dirname is u2i after extracting from u2i.tar.gz
        prefix = archive[0:-7]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    graph = sess.g(directed=directed)

    # Both vertex labels are read from the same tab-separated node file.
    for vertex_label in ("u", "i"):
        graph = graph.add_vertices(
            Loader(os.path.join(prefix, "node.csv"), delimiter="\t"),
            label=vertex_label,
            properties=[("feature", "str")],
            vid_field="id",
        )

    # Forward edges: user -> item.
    graph = graph.add_edges(
        Loader(os.path.join(prefix, "edge.csv"), delimiter="\t"),
        label="u-i",
        properties=["weight"],
        src_label="u",
        dst_label="i",
        src_field="src_id",
        dst_field="dst_id",
    )
    # Reverse edges: same file, with source and destination columns swapped.
    graph = graph.add_edges(
        Loader(os.path.join(prefix, "edge.csv"), delimiter="\t"),
        label="u-i_reverse",
        properties=["weight"],
        src_label="i",
        dst_label="u",
        src_field="dst_id",
        dst_field="src_id",
    )
    return graph
def load_from(
    edges: Union[
        Mapping[str, Union[LoaderVariants, Sequence, Mapping]], LoaderVariants, Sequence
    ],
    vertices: Union[
        Mapping[str, Union[LoaderVariants, Sequence, Mapping]],
        LoaderVariants,
        Sequence,
        None,
    ] = None,
    directed=True,
    oid_type="int64_t",
    generate_eid=True,
    vformat=None,
    eformat=None,
) -> Graph:
    """Load a Arrow property graph using a list of vertex/edge specifications.

    .. deprecated:: version 0.3
       Use :class:`graphscope.Graph()` instead.

    - Use Dict of tuples to setup a graph.
        We can use a dict to set vertex and edge configurations,
        which can be used to build graphs.

        Examples:

        .. code:: ipython

            g = graphscope_session.load_from(
                edges={
                    "group": [
                        (
                            "file:///home/admin/group.e",
                            ["group_id", "member_size"],
                            ("leader_student_id", "student"),
                            ("member_student_id", "student"),
                        ),
                        (
                            "file:///home/admin/group_for_teacher_student.e",
                            ["group_id", "group_name", "establish_date"],
                            ("teacher_in_charge_id", "teacher"),
                            ("member_student_id", "student"),
                        ),
                    ]
                },
                vertices={
                    "student": (
                        "file:///home/admin/student.v",
                        ["name", "lesson_nums", "avg_score"],
                        "student_id",
                    ),
                    "teacher": (
                        "file:///home/admin/teacher.v",
                        ["name", "salary", "age"],
                        "teacher_id",
                    ),
                },
            )

        Each dict key is a label. An edge tuple holds, in order, the loader,
        the property names, the source and the destination; a vertex tuple
        holds the loader, the property names and the vid. These are the same
        elements that the dict form below spells out by name.

    - Use Dict of dict to setup a graph.
        We can also give each element inside the tuple a meaningful name,
        makes it more understandable.

        Examples:

        .. code:: ipython

            g = graphscope_session.load_from(
                edges={
                    "group": [
                        {
                            "loader": "file:///home/admin/group.e",
                            "properties": ["group_id", "member_size"],
                            "source": ("leader_student_id", "student"),
                            "destination": ("member_student_id", "student"),
                        },
                        {
                            "loader": "file:///home/admin/group_for_teacher_student.e",
                            "properties": ["group_id", "group_name", "establish_date"],
                            "source": ("teacher_in_charge_id", "teacher"),
                            "destination": ("member_student_id", "student"),
                        },
                    ]
                },
                vertices={
                    "student": {
                        "loader": "file:///home/admin/student.v",
                        "properties": ["name", "lesson_nums", "avg_score"],
                        "vid": "student_id",
                    },
                    "teacher": {
                        "loader": "file:///home/admin/teacher.v",
                        "properties": ["name", "salary", "age"],
                        "vid": "teacher_id",
                    },
                },
            )

    Args:
        edges: Edge configuration of the graph. If it is already a
            :class:`Graph`, an ``nx.Graph`` or one of the vineyard object
            types, it is passed straight to ``sess.g`` and every other
            argument is ignored.
        vertices (optional): Vertices configurations of the graph.
            Defaults to None. If None, we assume all edge's src_label and
            dst_label are deduced and unambiguous.
        directed (bool, optional): Indicate whether the graph
            should be treated as directed or undirected. Defaults to True.
        oid_type (str, optional): ID type of graph. Can be "int64_t" or
            "string". Defaults to "int64_t".
        generate_eid (bool, optional): Whether to generate a unique edge id
            for each edge. Generated eid will be placed in third column.
            This feature is for cooperating with interactive engine.
            If you only need to work with analytical engine, set it to False.
            Defaults to True.
        vformat (optional): Forwarded unchanged to
            ``normalize_parameter_vertices``. Defaults to None.
        eformat (optional): Forwarded unchanged to
            ``normalize_parameter_edges``. Defaults to None.

    Returns:
        The graph returned by ``sess.g`` on the default session.

    Raises:
        ValueError: If ``oid_type`` does not normalize to "int64_t" or
            "std::string".
    """
    # Don't import the :code:`nx` in top-level statements to improve the
    # performance of :code:`import graphscope`.
    from graphscope import nx

    sess = get_default_session()
    # Already-built graphs need no loader; let the session wrap them directly.
    if isinstance(edges, (Graph, nx.Graph, *VineyardObjectTypes)):
        return sess.g(edges)

    oid_type = utils.normalize_data_type_str(oid_type)
    if oid_type not in ("int64_t", "std::string"):
        raise ValueError("oid_type can only be int64_t or string.")

    v_labels = normalize_parameter_vertices(vertices, oid_type, vformat)
    e_labels = normalize_parameter_edges(edges, oid_type, eformat)

    # generate and add a loader op to dag
    loader_op = dag_utils.create_loader(v_labels + e_labels)
    sess.dag.add_op(loader_op)

    # construct create graph op
    config = {
        types_pb2.DIRECTED: utils.b_to_attr(directed),
        types_pb2.OID_TYPE: utils.s_to_attr(oid_type),
        types_pb2.GENERATE_EID: utils.b_to_attr(generate_eid),
        types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"),
        types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(False),
    }
    op = dag_utils.create_graph(
        sess.session_id,
        graph_def_pb2.ARROW_PROPERTY,
        inputs=[loader_op],
        attrs=config,
    )
    graph = sess.g(op)
    return graph
def __init__(self, incoming_graph_data=None, **attr):
    """Initialize a graph with edges, name, or graph attributes

    Parameters
    ----------
    incoming_graph_data : input graph (optional, default: None)
        Data to initialize graph. If None (default) an empty
        graph is created.  The data can be any format that is supported
        by the to_nx_graph() function, currently including edge list,
        dict of dicts, dict of lists, NetworkX graph, NumPy matrix
        or 2d ndarray, Pandas DataFrame, SciPy sparse matrix, or a graphscope
        graph.

    attr : keyword arguments, optional (default= no attributes)
        Attributes to add to graph as key=value pairs.

    Raises
    ------
    ValueError
        If no default session is available.

    See Also
    --------
    convert

    Examples
    --------
    >>> G = nx.Graph()  # or DiGraph
    >>> G = nx.Graph(name='my graph')
    >>> e = [(1, 2), (2, 3), (3, 4)]  # list of edges
    >>> G = nx.Graph(e)

    Arbitrary graph attribute pairs (key=value) may be assigned

    >>> G = nx.Graph(e, day="Friday")
    >>> G.graph
    {'day': 'Friday'}

    """
    default_sess = get_default_session()
    if default_sess is None:
        raise ValueError(
            "Cannot find a default session. "
            "Please register a session using graphscope.session(...).as_default()"
        )
    self._session_id = default_sess.session_id

    self._key = None
    self._op = None
    self._graph_type = self._graph_type
    self._schema = GraphSchema()
    self._schema.init_nx_schema()

    # "create_empty_in_engine" is a hidden parameter; it is removed from
    # attr here so it never ends up among the graph attributes.
    create_empty_in_engine = attr.pop("create_empty_in_engine", True)
    wraps_gs_graph = self.is_gs_graph(incoming_graph_data)
    if not wraps_gs_graph and create_empty_in_engine:
        engine_graph = empty_graph_in_engine(self, self.is_directed())
        self._key = engine_graph.key

    self.graph_attr_dict_factory = self.graph_attr_dict_factory
    self.node_dict_factory = self.node_dict_factory
    self.adjlist_dict_factory = self.adjlist_dict_factory

    self.graph = self.graph_attr_dict_factory()
    self._node = self.node_dict_factory(self)
    self._adj = self.adjlist_dict_factory(self)
    self._pred = self.adjlist_dict_factory(self, types_pb2.PREDS_BY_NODE)
    self._succ = self._adj

    # attempt to load graph with data
    if incoming_graph_data is None:
        pass
    elif self.is_gs_graph(incoming_graph_data):
        engine_graph = from_gs_graph(incoming_graph_data, self)
        self._key = engine_graph.key
        self._schema.init_nx_schema(incoming_graph_data.schema)
    else:
        to_nx_graph(incoming_graph_data, create_using=self)

    # load graph attributes (must be after to_nx_graph)
    self.graph.update(attr)
    self._saved_signature = self.signature
def load_p2p_network(sess=None, prefix=None, directed=False):
    """Load p2p graph.

    A peer-to-peer dataset derived from Gnutella peer-to-peer network,
    August 31 2002, with generated data on vertices and edges.
    See more details here:

        http://snap.stanford.edu/data/p2p-Gnutella31.html

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or undirected
            graph. Defaults to False.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_p2p_network
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_p2p_network(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_p2p_network
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_p2p_network(sess, "/path/to/dataset")
    """
    if prefix is not None:
        # Only environment variables are expanded; "~" is left untouched.
        prefix = os.path.expandvars(prefix)
    else:
        fname = "p2p_network.tar.gz"
        origin = f"{DATA_SITE}/p2p_network.tar.gz"
        fpath = download_file(
            fname,
            origin=origin,
            extract=True,
            file_hash="117131735186caff23ea127beec61b5396662c0815fc7918186451fe957e8c2f",
        )
        # assumed dirname is p2p_network after extracting from
        # p2p_network.tar.gz, so drop the archive suffix from the path
        prefix = fpath[: -len(".tar.gz")]

    if sess is None:
        sess = get_default_session()

    graph = sess.g(directed=directed)
    graph = graph.add_vertices(
        os.path.join(prefix, "p2p-31_property_v_0"), "host"
    ).add_edges(
        os.path.join(prefix, "p2p-31_property_e_0"),
        "connect",
        src_label="host",
        dst_label="host",
    )
    return graph
def __init__(
    self,
    session=None,
    incoming_data=None,
    oid_type="int64",
    directed=True,
    generate_eid=True,
):
    """Construct a :class:`Graph` object.

    Args:
        session: Session the graph is created in. When None, the session
            returned by ``get_default_session()`` is used. Defaults to None.
        incoming_data: Graph can be initialized through various type of
            sources, which can be one of:

                - :class:`Operation`
                - :class:`nx.Graph`
                - :class:`Graph`
                - :class:`vineyard.Object`, :class:`vineyard.ObjectID`
                  or :class:`vineyard.ObjectName`

            Defaults to None, in which case no pending operation is set.
        oid_type (str, optional): ID type of the graph. It is normalized with
            ``utils.normalize_data_type_str`` and must then be "int64_t" or
            "std::string". Defaults to "int64".
        directed (bool, optional): Stored as ``_directed``. Defaults to True.
        generate_eid (bool, optional): Stored as ``_generate_eid``.
            Defaults to True.

    Raises:
        ValueError: If ``oid_type`` does not normalize to "int64_t" or
            "std::string".
        RuntimeError: If ``incoming_data`` is not None and is none of the
            supported types listed above.
    """
    self._key = None
    self._graph_type = types_pb2.ARROW_PROPERTY
    self._vineyard_id = 0
    self._schema = GraphSchema()
    if session is None:
        session = get_default_session()
    self._session = session
    self._detached = False

    self._interactive_instance_launching_thread = None
    self._interactive_instance_list = []
    self._learning_instance_list = []

    # Hold uncompleted operation for lazy evaluation
    self._pending_op = None
    # Hold a reference to base graph of modify operation,
    # to avoid being garbage collected
    self._base_graph = None

    oid_type = utils.normalize_data_type_str(oid_type)
    if oid_type not in ("int64_t", "std::string"):
        raise ValueError("oid_type can only be int64_t or string.")
    self._oid_type = oid_type
    self._directed = directed
    self._generate_eid = generate_eid

    self._unsealed_vertices = {}
    self._unsealed_edges = {}
    # Used to display schema without load into vineyard,
    # and do sanity checking for newly added vertices and edges.
    self._v_labels = []
    self._e_labels = []
    self._e_relationships = []

    if incoming_data is not None:
        # Don't import the :code:`NXGraph` in top-level statements to improve the
        # performance of :code:`import graphscope`.
        from graphscope.experimental import nx

        if isinstance(incoming_data, Operation):
            self._pending_op = incoming_data
            # A project operation yields a projected graph, not a property graph.
            if self._pending_op.type == types_pb2.PROJECT_GRAPH:
                self._graph_type = types_pb2.ARROW_PROJECTED
        elif isinstance(incoming_data, nx.Graph):
            self._pending_op = self._from_nx_graph(incoming_data)
        elif isinstance(incoming_data, Graph):
            self._pending_op = self._copy_from(incoming_data)
        elif isinstance(
            incoming_data,
            (vineyard.Object, vineyard.ObjectID, vineyard.ObjectName),
        ):
            self._pending_op = self._from_vineyard(incoming_data)
        else:
            raise RuntimeError("Not supported incoming data.")
def load_ogbn_mag(sess=None, prefix=None):
    """Load ogbn_mag graph.

    The ogbn-mag dataset is a heterogeneous network composed of a subset of
    the Microsoft Academic Graph (MAG).
    See more details here:

        https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_mag
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_ogbn_mag(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ogbn_mag
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_ogbn_mag(sess, "/path/to/dataset")
    """
    if prefix is None:
        archive = download_file(
            "ogbn_mag_small.tar.gz",
            origin=f"{DATA_SITE}/ogbn_mag_small.tar.gz",
            extract=True,
            file_hash="ccd128ab673e5d7dd1cceeaa4ba5d65b67a18212c4a27b0cd090359bd7042b10",
        )
        # assumed dirname is ogbn_mag_small after extracting from
        # ogbn_mag_small.tar.gz
        prefix = archive[0:-7]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    # (file name, vertex label), added in this order
    vertex_files = (
        ("paper.csv", "paper"),
        ("author.csv", "author"),
        ("institution.csv", "institution"),
        ("field_of_study.csv", "field_of_study"),
    )
    # (file name, edge label, source vertex label, destination vertex label)
    edge_files = (
        ("author_affiliated_with_institution.csv", "affiliated", "author", "institution"),
        ("paper_has_topic_field_of_study.csv", "hasTopic", "paper", "field_of_study"),
        ("paper_cites_paper.csv", "cites", "paper", "paper"),
        ("author_writes_paper.csv", "writes", "author", "paper"),
    )

    graph = sess.g()
    for filename, label in vertex_files:
        graph = graph.add_vertices(os.path.join(prefix, filename), label)
    for filename, label, src, dst in edge_files:
        graph = graph.add_edges(
            os.path.join(prefix, filename),
            label,
            src_label=src,
            dst_label=dst,
        )
    return graph
def load_cora(sess=None, prefix=None, directed=False):
    """Load cora datasets.

    The Cora dataset consists of 2708 scientific publications classified into
    one of seven classes. The citation network consists of 5429 links. Each
    publication in the dataset is described by a 0/1-valued word vector
    indicating the absence/presence of the corresponding word from the
    dictionary.
    See more details here:

        https://linqs.soe.ucsc.edu/data

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or undirected
            graph. Defaults to False.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_cora
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_cora(sess, "/path/to/dataset")
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_cora
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_cora(sess, "/path/to/dataset")
    """
    if prefix is not None:
        # Only environment variables are expanded; "~" is left untouched.
        prefix = os.path.expandvars(prefix)
    else:
        fname = "cora.tar.gz"
        origin = f"{DATA_SITE}/cora.tar.gz"
        fpath = download_file(
            fname,
            origin=origin,
            extract=True,
            file_hash="2dae0c5ec6eca4321fc94614381d6c74a216726b930e4de228bc15fa1ab504e8",
        )
        # assumed dirname is cora after extracting from cora.tar.gz,
        # so drop the archive suffix from the returned path
        prefix = fpath[: -len(".tar.gz")]

    if sess is None:
        sess = get_default_session()

    graph = sess.g(directed=directed)
    graph = graph.add_vertices(os.path.join(prefix, "node.csv"), "paper").add_edges(
        os.path.join(prefix, "edge.csv"),
        "cites",
        src_label="paper",
        dst_label="paper",
    )
    return graph
def load_ldbc(sess=None, prefix=None, directed=True):
    """Load ldbc dataset as a ArrowProperty Graph.

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None. Defaults to None.
        prefix: `PathLike` object that represents a path.
            With standalone mode, set prefix None will try to download from
            source URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or undirected
            graph. Defaults to True.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode.

    Examples:
        .. code:: python

            >>> # lazy mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ldbc
            >>> sess = graphscope.session(mode="lazy")
            >>> g = load_ldbc(sess, "/path/to/dataset", True)
            >>> g1 = sess.run(g)

            >>> # eager mode
            >>> import graphscope
            >>> from graphscope.dataset import load_ldbc
            >>> sess = graphscope.session(mode="eager")
            >>> g = load_ldbc(sess, "/path/to/dataset", True)
    """
    if prefix is None:
        archive = download_file(
            "ldbc_sample.tar.gz",
            origin=f"{DATA_SITE}/ldbc_sample.tar.gz",
            extract=True,
            file_hash="1a3d3c36fbf416c2a02ca4163734192eed602649220d7ceef2735fc11173fc6c",
        )
        # assumed dirname is ldbc_sample after extracting from ldbc_sample.tar.gz
        prefix = archive[0:-7]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    def csv_loader(filename):
        # Every LDBC file has a header row and is pipe-delimited.
        return Loader(os.path.join(prefix, filename), header_row=True, delimiter="|")

    # (vertex label, file name, property names); the id column is always "id"
    vertex_table = (
        (
            "comment",
            "comment_0_0.csv",
            ("creationDate", "locationIP", "browserUsed", "content", "length"),
        ),
        ("organisation", "organisation_0_0.csv", ("type", "name", "url")),
        ("tagclass", "tagclass_0_0.csv", ("name", "url")),
        (
            "person",
            "person_0_0.csv",
            (
                "firstName",
                "lastName",
                "gender",
                "birthday",
                "creationDate",
                "locationIP",
                "browserUsed",
            ),
        ),
        ("forum", "forum_0_0.csv", ("title", "creationDate")),
        ("place", "place_0_0.csv", ("name", "url", "type")),
        (
            "post",
            "post_0_0.csv",
            (
                "imageFile",
                "creationDate",
                "locationIP",
                "browserUsed",
                "language",
                "content",
                "length",
            ),
        ),
        ("tag", "tag_0_0.csv", ("name", "url")),
    )

    # Endpoint descriptors, written as (id column in the edge file, vertex label).
    # The ".1" variants are the destination column of same-label relations.
    comment = ("Comment.id", "comment")
    comment_dst = ("Comment.id.1", "comment")
    forum = ("Forum.id", "forum")
    organisation = ("Organisation.id", "organisation")
    person = ("Person.id", "person")
    person_dst = ("Person.id.1", "person")
    place = ("Place.id", "place")
    place_dst = ("Place.id.1", "place")
    post = ("Post.id", "post")
    tag = ("Tag.id", "tag")
    tagclass = ("TagClass.id", "tagclass")
    tagclass_dst = ("TagClass.id.1", "tagclass")

    # (edge label, relations); each relation is
    # (file name, property names, source endpoint, destination endpoint)
    edge_table = (
        (
            "replyOf",
            (
                ("comment_replyOf_comment_0_0.csv", (), comment, comment_dst),
                ("comment_replyOf_post_0_0.csv", (), comment, post),
            ),
        ),
        ("isPartOf", (("place_isPartOf_place_0_0.csv", (), place, place_dst),)),
        (
            "isSubclassOf",
            (("tagclass_isSubclassOf_tagclass_0_0.csv", (), tagclass, tagclass_dst),),
        ),
        (
            "hasTag",
            (
                ("forum_hasTag_tag_0_0.csv", (), forum, tag),
                ("comment_hasTag_tag_0_0.csv", (), comment, tag),
                ("post_hasTag_tag_0_0.csv", (), post, tag),
            ),
        ),
        (
            "knows",
            (("person_knows_person_0_0.csv", ("creationDate",), person, person_dst),),
        ),
        ("hasModerator", (("forum_hasModerator_person_0_0.csv", (), forum, person),)),
        ("hasInterest", (("person_hasInterest_tag_0_0.csv", (), person, tag),)),
        (
            "isLocatedIn",
            (
                ("post_isLocatedIn_place_0_0.csv", (), post, place),
                ("comment_isLocatedIn_place_0_0.csv", (), comment, place),
                ("organisation_isLocatedIn_place_0_0.csv", (), organisation, place),
                ("person_isLocatedIn_place_0_0.csv", (), person, place),
            ),
        ),
        ("hasType", (("tag_hasType_tagclass_0_0.csv", (), tag, tagclass),)),
        (
            "hasCreator",
            (
                ("post_hasCreator_person_0_0.csv", (), post, person),
                ("comment_hasCreator_person_0_0.csv", (), comment, person),
            ),
        ),
        ("containerOf", (("forum_containerOf_post_0_0.csv", (), forum, post),)),
        (
            "hasMember",
            (("forum_hasMember_person_0_0.csv", ("joinDate",), forum, person),),
        ),
        (
            "workAt",
            (
                (
                    "person_workAt_organisation_0_0.csv",
                    ("workFrom",),
                    person,
                    organisation,
                ),
            ),
        ),
        (
            "likes",
            (
                ("person_likes_comment_0_0.csv", ("creationDate",), person, comment),
                ("person_likes_post_0_0.csv", ("creationDate",), person, post),
            ),
        ),
        (
            "studyAt",
            (
                (
                    "person_studyAt_organisation_0_0.csv",
                    ("classYear",),
                    person,
                    organisation,
                ),
            ),
        ),
    )

    # Property names are copied into fresh lists so every entry owns its list.
    vertices = {
        label: (csv_loader(filename), list(props), "id")
        for label, filename, props in vertex_table
    }
    edges = {
        label: [
            (csv_loader(filename), list(props), src, dst)
            for filename, props, src, dst in relations
        ]
        for label, relations in edge_table
    }
    return sess.load_from(edges, vertices, directed, generate_eid=True)
def load_ogbl_collab(sess=None, prefix=None):
    """Load ogbl_collab graph.

    The ogbl-collab dataset is an undirected graph, representing a subset of
    the collaboration network between authors indexed by MAG. Each node
    represents an author and edges indicate the collaboration between authors.
    All nodes come with 128-dimensional features, obtained by averaging the
    word embeddings of papers that are published by the authors. All edges are
    associated with two meta-information: the year and the edge weight,
    representing the number of co-authored papers published in that year.
    The graph can be viewed as a dynamic multi-graph since there can be
    multiple edges between two nodes if they collaborate in more than one year.

    See more details here:

        https://ogb.stanford.edu/docs/linkprop/#ogbl-collab

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None.
            Defaults to None.
        prefix: `PathLike` object that represents a path. Environment
            variables in it are expanded. With standalone mode, setting
            prefix to None will try to download the dataset from the source
            URL. Defaults to None.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode. It has "author" vertices read from ``nodes.csv`` and
            "collaboration" edges read from ``edge.csv`` under ``prefix``.

    Examples:

    .. code:: python

        >>> # lazy mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ogbl_collab
        >>> sess = graphscope.session(mode="lazy")
        >>> g = load_ogbl_collab(sess, "/path/to/dataset")
        >>> g1 = sess.run(g)

        >>> # eager mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ogbl_collab
        >>> sess = graphscope.session(mode="eager")
        >>> g = load_ogbl_collab(sess, "/path/to/dataset")
    """
    if prefix is None:
        archive_name = "ogbl_collab.tar.gz"
        fpath = download_file(
            archive_name,
            origin=f"{DATA_SITE}/{archive_name}",
            extract=True,
            file_hash="abb49a2f7c6c16ed355ea83ec7ce65ece1278eec40e6fef6ee9918b4383ae459",
        )
        # The archive is assumed to extract into a directory named after it,
        # i.e. the archive path without its ".tar.gz" suffix.
        prefix = fpath[: -len(".tar.gz")]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    graph = sess.g()
    graph = graph.add_vertices(os.path.join(prefix, "nodes.csv"), "author").add_edges(
        os.path.join(prefix, "edge.csv"), "collaboration"
    )
    return graph
def load_from(
    edges: Union[
        Mapping[str, Union[LoaderVariants, Sequence, Mapping]], LoaderVariants, Sequence
    ],
    vertices: Union[
        Mapping[str, Union[LoaderVariants, Sequence, Mapping]],
        LoaderVariants,
        Sequence,
        None,
    ] = None,
    directed=True,
    oid_type="int64_t",
    generate_eid=True,
) -> Graph:
    """Load an Arrow property graph from vertex/edge specifications.

    The graph is created inside the default session. If ``edges`` is already
    a graph-like object (a :class:`Graph`, an ``nx.Graph`` or one of the
    ``VineyardObjectTypes``), it is wrapped in a :class:`Graph` directly and
    the remaining arguments are not used.

    - Use Dict of tuples to setup a graph.
        We can use a dict to set vertex and edge configurations,
        which can be used to build graphs.

        Examples:

        .. code:: ipython

            g = graphscope_session.load_from(
                edges={
                    "group": [
                        (
                            "file:///home/admin/group.e",
                            ["group_id", "member_size"],
                            ("leader_student_id", "student"),
                            ("member_student_id", "student"),
                        ),
                        (
                            "file:///home/admin/group_for_teacher_student.e",
                            ["group_id", "group_name", "establish_date"],
                            ("teacher_in_charge_id", "teacher"),
                            ("member_student_id", "student"),
                        ),
                    ]
                },
                vertices={
                    "student": (
                        "file:///home/admin/student.v",
                        ["name", "lesson_nums", "avg_score"],
                        "student_id",
                    ),
                    "teacher": (
                        "file:///home/admin/teacher.v",
                        ["name", "salary", "age"],
                        "teacher_id",
                    ),
                },
            )

        Here 'group' is the edge label, 'student' and 'teacher' are vertex
        labels, and 'group' edges link 'student' to 'student' as well as
        'teacher' to 'student'.

    - Use Dict of dict to setup a graph.
        We can also give each element inside the tuple a meaningful name,
        which makes it more understandable.

        Examples:

        .. code:: ipython

            g = graphscope_session.load_from(
                edges={
                    "group": [
                        {
                            "loader": "file:///home/admin/group.e",
                            "properties": ["group_id", "member_size"],
                            "source": ("leader_student_id", "student"),
                            "destination": ("member_student_id", "student"),
                        },
                        {
                            "loader": "file:///home/admin/group_for_teacher_student.e",
                            "properties": ["group_id", "group_name", "establish_date"],
                            "source": ("teacher_in_charge_id", "teacher"),
                            "destination": ("member_student_id", "student"),
                        },
                    ]
                },
                vertices={
                    "student": {
                        "loader": "file:///home/admin/student.v",
                        "properties": ["name", "lesson_nums", "avg_score"],
                        "vid": "student_id",
                    },
                    "teacher": {
                        "loader": "file:///home/admin/teacher.v",
                        "properties": ["name", "salary", "age"],
                        "vid": "teacher_id",
                    },
                },
            )

    Args:
        edges: Edge configuration of the graph, or an existing graph-like
            object to wrap.
        vertices (optional): Vertices configurations of the graph.
            Defaults to None. If None, we assume all edge's src_label and
            dst_label are deduced and unambiguous.
        directed (bool, optional): Indicate whether the graph should be
            treated as directed or undirected. Defaults to True.
        oid_type (str, optional): ID type of graph. Can be "int64_t" or
            "string". Defaults to "int64_t".
        generate_eid (bool, optional): Whether to generate a unique edge id
            for each edge. Generated eid will be placed in third column.
            This feature is for cooperating with interactive engine.
            If you only need to work with analytical engine, set it to False.
            Defaults to True.

    Returns:
        Graph: The graph bound to the default session.

    Raises:
        ValueError: If no default session is available.
    """
    # Don't import :code:`nx` in top-level statements, to improve the
    # performance of :code:`import graphscope`.
    from graphscope.experimental import nx

    session = get_default_session()
    if session is None:
        raise ValueError("No default session found.")

    # An existing graph-like object is wrapped as-is; no loading config needed.
    graph_like_types = (Graph, nx.Graph, *VineyardObjectTypes)
    if isinstance(edges, graph_like_types):
        return Graph(session.session_id, edges)

    normalized_oid_type = utils.normalize_data_type_str(oid_type)
    edge_labels, vertex_labels = _sanity_check(
        normalize_parameter_edges(edges),
        normalize_parameter_vertices(vertices),
    )
    attrs = _get_config(
        edge_labels, vertex_labels, directed, normalized_oid_type, generate_eid
    )
    create_op = dag_utils.create_graph(
        session.session_id, types_pb2.ARROW_PROPERTY, attrs=attrs
    )
    graph_def = session.run(create_op)
    return Graph(session.session_id, graph_def)
def load_ogbl_ddi(sess=None, prefix=None):
    """Load ogbl_ddi graph.

    The ogbl-ddi dataset is a homogeneous, unweighted, undirected graph,
    representing the drug-drug interaction network. Each node represents an
    FDA-approved or experimental drug. Edges represent interactions between
    drugs and can be interpreted as a phenomenon where the joint effect of
    taking the two drugs together is considerably different from the expected
    effect in which drugs act independently of each other.

    See more details here:

        https://ogb.stanford.edu/docs/linkprop/#ogbl-ddi

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None.
            Defaults to None.
        prefix: `PathLike` object that represents a path. Environment
            variables in it are expanded. With standalone mode, setting
            prefix to None will try to download the dataset from the source
            URL. Defaults to None.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode. It has "drug" vertices read from ``nodes.csv`` and
            "effect" edges read from ``edge.csv`` under ``prefix``.

    Examples:

    .. code:: python

        >>> # lazy mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ogbl_ddi
        >>> sess = graphscope.session(mode="lazy")
        >>> g = load_ogbl_ddi(sess, "/path/to/dataset")
        >>> g1 = sess.run(g)

        >>> # eager mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ogbl_ddi
        >>> sess = graphscope.session(mode="eager")
        >>> g = load_ogbl_ddi(sess, "/path/to/dataset")
    """
    if prefix is None:
        archive_name = "ogbl_ddi.tar.gz"
        fpath = download_file(
            archive_name,
            origin=f"{DATA_SITE}/{archive_name}",
            extract=True,
            file_hash="2a66bf265a217fd6148ba1f0ed9c9a297e778bf539b2b7262edf4a0dc1f4c8b9",
        )
        # The archive is assumed to extract into a directory named after it,
        # i.e. the archive path without its ".tar.gz" suffix.
        prefix = fpath[: -len(".tar.gz")]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    graph = sess.g()
    graph = graph.add_vertices(os.path.join(prefix, "nodes.csv"), "drug").add_edges(
        os.path.join(prefix, "edge.csv"), "effect"
    )
    return graph
def load_ppi(sess=None, prefix=None, directed=False):
    """Load protein-protein links datasets.

    In protein-protein links graph, every node represents a protein, and
    edges represent the links between them.

    See more details here:

        https://humgenomics.biomedcentral.com/articles/10.1186/1479-7364-3-3-291

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None.
            Defaults to None.
        prefix: `PathLike` object that represents a path. Environment
            variables in it are expanded. With standalone mode, setting
            prefix to None will try to download the dataset from the source
            URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or
            undirected graph. Defaults to False.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode. It has "protein" vertices read from ``node.csv`` and
            "link" edges (protein to protein) read from ``edge.csv`` under
            ``prefix``.

    Examples:

    .. code:: python

        >>> # lazy mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ppi
        >>> sess = graphscope.session(mode="lazy")
        >>> g = load_ppi(sess, "/path/to/dataset")
        >>> g1 = sess.run(g)

        >>> # eager mode
        >>> import graphscope
        >>> from graphscope.dataset import load_ppi
        >>> sess = graphscope.session(mode="eager")
        >>> g = load_ppi(sess, "/path/to/dataset")
    """
    if prefix is None:
        archive_name = "ppi.tar.gz"
        fpath = download_file(
            archive_name,
            origin=f"{DATA_SITE}/{archive_name}",
            extract=True,
            file_hash="2ffe7207626f5b177cb05871b65ee7c95fc9ebc45cc9f628d36efef8b5c0b642",
        )
        # The archive is assumed to extract into a directory named after it,
        # i.e. the archive path without its ".tar.gz" suffix.
        prefix = fpath[: -len(".tar.gz")]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    graph = sess.g(directed=directed)
    graph = graph.add_vertices(os.path.join(prefix, "node.csv"), "protein").add_edges(
        os.path.join(prefix, "edge.csv"),
        "link",
        src_label="protein",
        dst_label="protein",
    )
    return graph
def load_modern_graph(sess=None, prefix=None, directed=True):
    """Load modern graph.

    Modern graph consists of 6 vertices and 6 edges, useful to test the
    basic functionalities.

    Args:
        sess (:class:`graphscope.Session`): Load graph within the session.
            Default session will be used when setting to None.
            Defaults to None.
        prefix (str): `PathLike` object that represents a path. Environment
            variables in it are expanded. With standalone mode, setting
            prefix to None will try to download the dataset from the source
            URL. Defaults to None.
        directed (bool, optional): Determine to load a directed or
            undirected graph. Defaults to True.

    Returns:
        :class:`graphscope.framework.graph.GraphDAGNode`:
            A Graph node which graph type is ArrowProperty, evaluated in
            eager mode. It has "person" and "software" vertices plus "knows"
            (person to person) and "created" (person to software) edges.

    Examples:

    .. code:: python

        >>> # lazy mode
        >>> import graphscope
        >>> from graphscope.dataset.modern_graph import load_modern_graph
        >>> sess = graphscope.session(mode="lazy")
        >>> g = load_modern_graph(sess, "/path/to/dataset", True)
        >>> g1 = sess.run(g)

        >>> # eager mode
        >>> import graphscope
        >>> from graphscope.dataset.modern_graph import load_modern_graph
        >>> sess = graphscope.session(mode="eager")
        >>> g = load_modern_graph(sess, "/path/to/dataset", True)
    """
    if prefix is None:
        fpath = download_file(
            "modern_graph.tar.gz",
            origin=f"{DATA_SITE}/modern_graph.tar.gz",
            extract=True,
            file_hash="a67c02191ea9dfa618a83d94087349a25937b92973f42206a28fdf6fa5299dec",
        )
        # assumed dirname is modern_graph after extracting from
        # modern_graph.tar.gz, so strip the 7-character ".tar.gz" suffix
        prefix = fpath[:-7]
    else:
        prefix = os.path.expandvars(prefix)

    if sess is None:
        sess = get_default_session()

    def _pipe_separated(filename):
        # Every data file of this dataset lives under `prefix` and uses "|"
        # as its column delimiter.
        return Loader(os.path.join(prefix, filename), delimiter="|")

    graph = sess.g(directed=directed)

    # Vertices first, then edges; each step works on the previous result.
    graph = graph.add_vertices(
        _pipe_separated("person.csv"),
        "person",
        ["name", ("age", "int")],
        "id",
    )
    graph = graph.add_vertices(
        _pipe_separated("software.csv"),
        "software",
        ["name", "lang"],
        "id",
    )
    graph = graph.add_edges(
        _pipe_separated("knows.csv"),
        "knows",
        ["weight"],
        src_label="person",
        dst_label="person",
        src_field="src_id",
        dst_field="dst_id",
    )
    graph = graph.add_edges(
        _pipe_separated("created.csv"),
        "created",
        ["weight"],
        src_label="person",
        dst_label="software",
        src_field="src_id",
        dst_field="dst_id",
    )
    return graph