def ldbc_sample_multi_labels(prefix, directed):
    """Build the LDBC sample graph with several vertex and edge labels.

    Vertices labelled ``comment``, ``person`` and ``post`` and edges labelled
    ``replyOf`` (comment -> comment) and ``knows`` (person -> person) are added
    from ``|``-delimited CSV files located under ``prefix``.

    Args:
        prefix: Directory that holds the LDBC sample CSV files.
        directed: Forwarded to ``graphscope.g`` as ``directed``.

    Returns:
        The graph object returned by the final ``add_edges`` call.
    """

    def pipe_csv(file_name):
        # Every file of this data set is read with "|" as the delimiter.
        return Loader(os.path.join(prefix, file_name), delimiter="|")

    graph = graphscope.g(directed=directed)

    # Vertex labels.
    graph = graph.add_vertices(pipe_csv("comment_0_0.csv"), "comment")
    graph = graph.add_vertices(pipe_csv("person_0_0.csv"), "person")
    graph = graph.add_vertices(pipe_csv("post_0_0.csv"), "post")

    # Edge labels.
    graph = graph.add_edges(
        pipe_csv("comment_replyOf_comment_0_0.csv"),
        "replyOf",
        src_label="comment",
        dst_label="comment",
    )
    graph = graph.add_edges(
        pipe_csv("person_knows_person_0_0.csv"),
        "knows",
        ["creationDate"],
        src_label="person",
        dst_label="person",
    )
    return graph
def test_group(graphscope_session, student_group_e, student_v):
    """Build a student/group graph step by step and check it has a schema."""
    empty_graph = graphscope.Graph(graphscope_session)

    student_loader = Loader(student_v, session=graphscope_session)
    with_students = empty_graph.add_vertices(student_loader, "student")

    group_loader = Loader(student_group_e, session=graphscope_session)
    with_groups = with_students.add_edges(group_loader, "group")

    assert with_groups.schema is not None
def test_serialize_roundtrip(gs_session_distributed, p2p_property_dir):
    """Serialize a p2p property graph, read it back and run SSSP on the copy."""
    edge_path = "{}/p2p-31_property_e_0".format(p2p_property_dir)
    vertex_path = "{}/p2p-31_property_v_0".format(p2p_property_dir)

    knows_spec = (
        Loader(edge_path, header_row=True),
        ["src_label_id", "dst_label_id", "dist"],
        ("src_id", "person"),
        ("dst_id", "person"),
    )
    person_spec = Loader(vertex_path, header_row=True)
    graph = gs_session_distributed.load_from(
        edges={"knows": knows_spec},
        vertices={"person": person_spec},
        generate_eid=False,
    )

    # Round trip through the serialized form on disk.
    dump_path = "/tmp/serialize"
    graph.serialize(dump_path)
    restored = Graph.deserialize(dump_path, gs_session_distributed)

    simple = restored.project_to_simple(0, 0, 0, 2)
    context = graphscope.sssp(simple, src=6)

    frame = context.to_dataframe(
        {"node": "v.id", "r": "r"},
        vertex_range={"end": 6},
    )
    ret = frame.sort_values(by=["node"]).to_numpy(dtype=float)

    expect = np.array(
        [
            [1.0, 260.0],
            [2.0, 229.0],
            [3.0, 310.0],
            [4.0, 256.0],
            [5.0, 303.0],
        ]
    )
    assert np.all(ret == expect)
def ldbc_sample_single_label(prefix, directed):
    """Load the single-label LDBC sample (comments and their replies).

    Args:
        prefix: Directory containing the LDBC sample CSV files.
        directed: Forwarded to ``load_from`` as ``directed``.

    Returns:
        The result of ``load_from`` for the assembled edge/vertex description.
    """
    comment_loader = Loader(
        os.path.join(prefix, "comment_0_0.csv"),
        header_row=True,
        delimiter="|",
    )
    comment_spec = (
        comment_loader,
        ["creationDate", "locationIP", "browserUsed", "content", "length"],
        "id",
    )

    reply_loader = Loader(
        os.path.join(prefix, "comment_replyOf_comment_0_0.csv"),
        header_row=True,
        delimiter="|",
    )
    # Both endpoints of a reply are comments; no edge properties are selected.
    reply_spec = (
        reply_loader,
        [],
        ("Comment.id", "comment"),
        ("Comment.id.1", "comment"),
    )

    vertices = {"comment": comment_spec}
    edges = {"replyOf": [reply_spec]}
    return load_from(edges, vertices, directed=directed)
def test_add_vertices_edges(graphscope_session):
    """Add labels to the modern graph one by one and check the schema.

    Also checks that re-adding the existing ``knows`` edge label raises
    ``ValueError``.
    """
    prefix = os.path.expandvars("${GS_TEST_DIR}/modern_graph")

    def csv(file_name):
        return Loader(f"{prefix}/{file_name}", delimiter="|")

    graph = graphscope_session.g()
    graph = graph.add_vertices(csv("person.csv"), "person")
    graph = graph.add_edges(csv("knows.csv"), "knows")

    schema = graph.schema
    assert schema.vertex_labels == ["person"]
    assert graph.schema.edge_labels == ["knows"]

    graph = graph.add_vertices(csv("software.csv"), "software")

    # The "knows" label is already present, so adding it again must fail.
    with pytest.raises(ValueError, match="already existed in graph"):
        graph = graph.add_edges(
            csv("knows.csv"),
            "knows",
            src_label="software",
            dst_label="software",
        )

    graph = graph.add_edges(
        csv("created.csv"),
        "created",
        src_label="person",
        dst_label="software",
    )

    assert graph.schema.vertex_labels == ["person", "software"]
    assert graph.schema.edge_labels == ["knows", "created"]
def test_unload(graphscope_session):
    """Check that an unloaded graph rejects every further operation."""
    prefix = os.path.expandvars("${GS_TEST_DIR}/property")

    knows_spec = (
        Loader("{}/p2p-31_property_e_0".format(prefix)),
        ["src_label_id", "dst_label_id", "dist"],
        ("src_id", "person"),
        ("dst_id", "person"),
    )
    person_spec = Loader("{}/p2p-31_property_v_0".format(prefix))
    g = graphscope_session.load_from(
        edges={"knows": knows_spec},
        vertices={"person": person_spec},
    )
    assert g.vineyard_id is not None

    g.unload()
    assert not g.loaded()

    # A second unload is an error.
    with pytest.raises(RuntimeError, match="The graph is not registered in remote."):
        g.unload()

    # Projection needs a registered graph.
    with pytest.raises(RuntimeError, match="The graph is not registered in remote"):
        g.project_to_simple(v_label="person", e_label="knows")

    # Loading from an unloaded graph trips an assertion.
    with pytest.raises(AssertionError):
        g2 = graphscope_session.load_from(g)

    # Running an application needs a registered graph as well.
    with pytest.raises(RuntimeError, match="graph should be registered in remote."):
        property_sssp(g, src=6)
def test_error_on_remove_vertices_edges(graphscope_session):
    """Exercise the error paths of ``remove_vertices`` and ``remove_edges``."""
    prefix = os.path.expandvars("${GS_TEST_DIR}/modern_graph")

    def csv(file_name):
        return Loader(f"{prefix}/{file_name}", delimiter="|")

    graph = graphscope_session.g()
    graph = graph.add_vertices(csv("person.csv"), "person")
    graph = graph.add_edges(csv("knows.csv"), "knows")
    graph = graph.add_vertices(csv("software.csv"), "software")
    graph = graph.add_edges(
        csv("created.csv"),
        "created",
        src_label="person",
        dst_label="software",
    )

    # "software" is still an endpoint of the "created" edges.
    with pytest.raises(ValueError, match="Vertex software has usage in relation"):
        graph = graph.remove_vertices("software")

    # Unknown labels.
    with pytest.raises(ValueError, match="label xxx not in vertices"):
        graph = graph.remove_vertices("xxx")
    with pytest.raises(ValueError, match="label xxx not in edges"):
        graph = graph.remove_edges("xxx")

    # Known edge label, unknown endpoint labels.
    with pytest.raises(ValueError, match="Cannot find edges to remove"):
        graph = graph.remove_edges("knows", src_label="xxx", dst_label="xxx")

    assert graph.loaded()

    # With the graph reporting itself as loaded, removal is rejected.
    vertices_msg = "Remove vertices from a loaded graph doesn't supported yet"
    with pytest.raises(ValueError, match=vertices_msg):
        graph = graph.remove_vertices("person")

    edges_msg = "Remove edges from a loaded graph doesn't supported yet"
    with pytest.raises(ValueError, match=edges_msg):
        graph = graph.remove_edges("knows")
def test_add_vertices_edges(gs_session_distributed, modern_graph_data_dir):
    """Extend the modern graph with extra labels and count through gremlin."""
    graph = load_modern_graph(gs_session_distributed, modern_graph_data_dir)

    person_loader = Loader(
        os.path.join(modern_graph_data_dir, "person.csv"),
        delimiter="|",
    )
    graph = graph.add_vertices(
        person_loader,
        "person2",
        ["name", ("age", "int")],
        "id",
    )
    assert "person2" in graph.schema.vertex_labels

    knows_loader = Loader(
        os.path.join(modern_graph_data_dir, "knows.csv"),
        delimiter="|",
    )
    graph = graph.add_edges(
        knows_loader,
        "knows2",
        ["weight"],
        src_label="person2",
        dst_label="person2",
    )
    assert "knows2" in graph.schema.edge_labels

    # Count vertices and edges of the extended graph via the interactive engine.
    interactive = gs_session_distributed.gremlin(graph)
    traversal = interactive.traversal_source()
    assert traversal.V().count().toList()[0] == 10
    assert traversal.E().count().toList()[0] == 8
def test_p2p(graphscope_session, p2p_31_e, p2p_31_v):
    """Build a person/knows graph from the p2p-31 files and check its schema."""
    empty_graph = graphscope.Graph(graphscope_session)

    person_loader = Loader(p2p_31_v, session=graphscope_session)
    with_people = empty_graph.add_vertices(person_loader, "person")

    knows_loader = Loader(p2p_31_e, session=graphscope_session)
    with_knows = with_people.add_edges(knows_loader, "knows")

    assert with_knows.schema is not None
def simple_label_multigraph(prefix, directed):
    """Build a small graph with two vertex labels and three edge labels.

    Args:
        prefix: Directory that holds the ``simple_*.csv`` files.
        directed: Forwarded to ``graphscope.g`` as ``directed``.

    Returns:
        The graph object returned by the last ``add_edges`` call.
    """
    # (file name, vertex label)
    vertex_files = (
        ("simple_v_0.csv", "v-0"),
        ("simple_v_1.csv", "v-1"),
    )
    # (file name, edge label, source vertex label, destination vertex label)
    edge_files = (
        ("simple_e_0.csv", "e-0", "v-0", "v-0"),
        ("simple_e_1_multiple.csv", "e-1", "v-0", "v-1"),
        ("simple_e_2.csv", "e-2", "v-1", "v-1"),
    )

    graph = graphscope.g(directed=directed, generate_eid=False)
    for file_name, label in vertex_files:
        graph = graph.add_vertices(Loader(os.path.join(prefix, file_name)), label)
    for file_name, label, src, dst in edge_files:
        graph = graph.add_edges(
            Loader(os.path.join(prefix, file_name)),
            label,
            src_label=src,
            dst_label=dst,
        )
    return graph
def p3_graph(prefix, directed):
    """Build the ``vertex``/``edge`` graph from ``3v.csv`` and ``p3_directed.csv``.

    Args:
        prefix: Directory that holds the two ``|``-delimited CSV files.
        directed: Forwarded to ``graphscope.g`` as ``directed``.

    Returns:
        The graph object returned by ``add_edges``.
    """
    base = graphscope.g(directed=directed, generate_eid=False)

    vertex_loader = Loader(os.path.join(prefix, "3v.csv"), delimiter="|")
    with_vertices = base.add_vertices(vertex_loader, "vertex")

    edge_loader = Loader(os.path.join(prefix, "p3_directed.csv"), delimiter="|")
    return with_vertices.add_edges(edge_loader, "edge")
def ldbc_sample_single_label_with_sess(sess, prefix, directed):
    """Build the comment/replyOf LDBC sample graph inside a given session.

    Args:
        sess: Object whose ``g`` method creates the empty graph.
        prefix: Directory that holds the LDBC sample CSV files.
        directed: Forwarded to ``sess.g`` as ``directed``.

    Returns:
        The graph object returned by ``add_edges``.
    """
    base = sess.g(directed=directed, generate_eid=False)

    comment_loader = Loader(
        os.path.join(prefix, "comment_0_0.csv"),
        delimiter="|",
    )
    with_comments = base.add_vertices(comment_loader, "comment")

    reply_loader = Loader(
        os.path.join(prefix, "comment_replyOf_comment_0_0.csv"),
        delimiter="|",
    )
    return with_comments.add_edges(reply_loader, "replyOf")
def ldbc_sample_string_oid(prefix, directed):
    """Build the comment/replyOf LDBC sample graph with ``oid_type="string"``.

    Args:
        prefix: Directory that holds the LDBC sample CSV files.
        directed: Forwarded to ``graphscope.g`` as ``directed``.

    Returns:
        The graph object returned by ``add_edges``.
    """
    base = graphscope.g(directed=directed, oid_type="string", generate_eid=False)

    comment_loader = Loader(
        os.path.join(prefix, "comment_0_0.csv"),
        delimiter="|",
    )
    with_comments = base.add_vertices(comment_loader, "comment")

    reply_loader = Loader(
        os.path.join(prefix, "comment_replyOf_comment_0_0.csv"),
        delimiter="|",
    )
    return with_comments.add_edges(reply_loader, "replyOf")
def ldbc_sample_single_label(prefix, directed):
    """Build the comment/replyOf LDBC sample graph via ``graphscope.Graph``.

    Args:
        prefix: Directory that holds the LDBC sample CSV files.
        directed: Forwarded to ``graphscope.Graph`` as ``directed``.

    Returns:
        The graph object returned by ``add_edges``.
    """
    base = graphscope.Graph(directed=directed)

    comment_loader = Loader(
        os.path.join(prefix, "comment_0_0.csv"),
        delimiter="|",
    )
    with_comments = base.add_vertices(comment_loader, "comment")

    reply_loader = Loader(
        os.path.join(prefix, "comment_replyOf_comment_0_0.csv"),
        delimiter="|",
    )
    return with_comments.add_edges(reply_loader, "replyOf")
def test_demo_on_hdfs(gs_session_distributed):
    """Run the demo workflow with input and output located on HDFS.

    The HDFS directory and host are read from the ``HDFS_TEST_DIR`` and
    ``HDFS_HOST`` environment variables; port 9000 is used throughout.
    """
    graph = gs_session_distributed.g()

    hdfs_dir = os.environ["HDFS_TEST_DIR"]
    hdfs_host = os.environ["HDFS_HOST"]
    hdfs_port = 9000

    def hdfs_csv(file_name):
        return Loader(
            hdfs_dir + file_name,
            host=hdfs_host,
            port=hdfs_port,
            delimiter="|",
        )

    # NOTE(review): "creationDate" appears commented out of this property list
    # in the original source - confirm it is meant to stay excluded.
    person_properties = [
        "firstName",
        "lastName",
        "gender",
        "birthday",
        "locationIP",
        "browserUsed",
    ]
    graph = graph.add_vertices(
        hdfs_csv("/person_0_0.csv"),
        "person",
        person_properties,
        "id",
    )
    graph = graph.add_edges(
        hdfs_csv("/person_knows_person_0_0.csv"),
        "knows",
        [],
        src_label="person",
        dst_label="person",
    )

    # Interactive engine: extract a subgraph with a gremlin query.
    interactive = gs_session_distributed.gremlin(graph)
    sub_graph = interactive.subgraph('g.V().hasLabel("person").outE("knows")')

    # Analytical engine: project the subgraph, then run pagerank on it.
    simple_g = sub_graph.project(vertices={"person": []}, edges={"knows": []})
    pr_result = graphscope.pagerank(simple_g, delta=0.8)

    # Write the ranks back to HDFS.
    pr_result.output(
        hdfs_dir + "/res.csv",
        selector={"id": "v.id", "rank": "r"},
        host=hdfs_host,
        port=hdfs_port,
    )
def ldbc_sample_with_duplicated_oid(prefix, directed):
    """Load the place/person LDBC sample through ``load_from``.

    Args:
        prefix: Directory containing the LDBC sample CSV files.
        directed: Forwarded to ``load_from`` as ``directed``.

    Returns:
        The result of ``load_from`` for the assembled edge/vertex description.
    """

    def header_csv(file_name):
        # All files have a header row and use "|" as the delimiter.
        return Loader(
            os.path.join(prefix, file_name),
            header_row=True,
            delimiter="|",
        )

    place_spec = (
        header_csv("place_0_0.csv"),
        ["name", "url", "type"],
        "id",
    )
    person_spec = (
        header_csv("person_0_0.csv"),
        [
            "firstName",
            "lastName",
            "gender",
            "birthday",
            "creationDate",
            "locationIP",
            "browserUsed",
        ],
        "id",
    )
    vertices = {"place": place_spec, "person": person_spec}

    is_part_of_spec = (
        header_csv("place_isPartOf_place_0_0.csv"),
        [],
        ("Place.id", "place"),
        ("Place.id.1", "place"),
    )
    knows_spec = (
        header_csv("person_knows_person_0_0.csv"),
        ["creationDate"],
        ("Person.id", "person"),
        ("Person.id.1", "person"),
    )
    edges = {"isPartOf": [is_part_of_spec], "knows": [knows_spec]}

    return load_from(edges, vertices, directed=directed)
def load_subgraph(name):
    """Load a graph from the vineyard streams named after ``name``.

    The edge and vertex sources are the vineyard object names
    ``__<name>_edge_stream`` and ``__<name>_vertex_stream``.

    NOTE(review): ``self`` is not a parameter here - presumably this function
    is nested inside a method and picks it up from the enclosing scope.
    """
    import vineyard

    edge_source = Loader(vineyard.ObjectName("__%s_edge_stream" % name))
    vertex_source = Loader(vineyard.ObjectName("__%s_vertex_stream" % name))

    # invoke load_from
    subgraph = self._graphscope_session.load_from(
        edges=[edge_source],
        vertices=[vertex_source],
        generate_eid=False,
    )
    logger.info("subgraph has been loaded")
    return subgraph
def load_subgraph(name):
    """Build a graph from the vineyard streams named after ``name``.

    The vertex and edge sources are the vineyard object names
    ``__<name>_vertex_stream`` and ``__<name>_edge_stream``.

    NOTE(review): ``self`` is not a parameter here - presumably this function
    is nested inside a method and picks it up from the enclosing scope.
    """
    import vineyard

    # Not referenced below; kept because the original imports it here.
    import graphscope

    vertex_source = Loader(vineyard.ObjectName("__%s_vertex_stream" % name))
    subgraph = self._session.g(generate_eid=False).add_vertices(vertex_source)

    edge_source = Loader(vineyard.ObjectName("__%s_edge_stream" % name))
    subgraph = subgraph.add_edges(edge_source)

    subgraph._ensure_loaded()
    logger.info("subgraph has been loaded")
    return subgraph
def test_p2p_form_loader(graphscope_session, p2p_31_e, p2p_31_v):
    """Call ``load_from`` with dict entries that only carry a ``loader`` key."""
    group_entry = {"loader": Loader(p2p_31_e, session=graphscope_session)}
    student_entry = {"loader": Loader(p2p_31_v, session=graphscope_session)}

    # The test passes as long as load_from does not raise.
    g = graphscope_session.load_from(
        edges={"group": group_entry},
        vertices={"student": student_entry},
    )
def p2p_31_graph(prefix, directed):
    """Build the ``vertex``/``edge`` graph from the p2p-31 files.

    Both files are read as space-delimited with no header row.

    Args:
        prefix: Directory that holds ``p2p-31.v`` and ``p2p-31.e``.
        directed: Forwarded to ``graphscope.g`` as ``directed``.

    Returns:
        The graph object returned by ``add_edges``.
    """

    def space_separated(file_name):
        return Loader(
            os.path.join(prefix, file_name),
            delimiter=" ",
            header_row=False,
        )

    base = graphscope.g(directed=directed, generate_eid=False)
    with_vertices = base.add_vertices(space_separated("p2p-31.v"), "vertex")
    return with_vertices.add_edges(space_separated("p2p-31.e"), "edge")
def test_multiple_add_vertices_edges(graphscope_session):
    """Add labels to the modern graph in three rounds, checking the schema."""
    prefix = os.path.expandvars("${GS_TEST_DIR}/modern_graph")

    def csv(file_name):
        return Loader(f"{prefix}/{file_name}", delimiter="|")

    # Round 1: the plain modern graph.
    graph = graphscope_session.g()
    graph = graph.add_vertices(csv("person.csv"), "person")
    graph = graph.add_edges(csv("knows.csv"), "knows")
    graph = graph.add_vertices(csv("software.csv"), "software")
    graph = graph.add_edges(
        csv("created.csv"),
        "created",
        src_label="person",
        dst_label="software",
    )
    assert graph.schema.vertex_labels == ["person", "software"]
    assert graph.schema.edge_labels == ["knows", "created"]

    # Round 2: a second copy of the person/knows data under new labels.
    graph = graph.add_vertices(csv("person.csv"), "person2")
    graph = graph.add_edges(
        csv("knows.csv"),
        "knows2",
        src_label="person2",
        dst_label="person2",
    )
    expected_vertices = ["person", "person2", "software"]
    expected_edges = ["created", "knows", "knows2"]
    assert sorted(graph.schema.vertex_labels) == expected_vertices
    assert sorted(graph.schema.edge_labels) == expected_edges

    # Round 3: a second copy of the software/created data under new labels.
    graph = graph.add_vertices(csv("software.csv"), "software2")
    graph = graph.add_edges(
        csv("created.csv"),
        "created2",
        src_label="person2",
        dst_label="software2",
    )
    expected_vertices = ["person", "person2", "software", "software2"]
    expected_edges = ["created", "created2", "knows", "knows2"]
    assert sorted(graph.schema.vertex_labels) == expected_vertices
    assert sorted(graph.schema.edge_labels) == expected_edges
def __init__(
    self,
    loader,
    properties=None,
    source=None,
    destination=None,
    load_strategy="both_out_in",
):
    """Initialise the edge description.

    Args:
        loader: A ``Loader`` instance, or any value accepted by ``Loader(...)``.
        properties: Stored unchanged as ``raw_properties``.
        source: Passed to ``set_source`` when not ``None``.
        destination: Passed to ``set_destination`` when not ``None``.
        load_strategy: Passed to ``set_load_strategy``.

    Raises:
        SyntaxError: If one of ``source_vid`` / ``destination_vid`` is an
            ``int`` while the other one is a ``str``.
    """
    self.loader = loader if isinstance(loader, Loader) else Loader(loader)

    self.raw_properties = properties
    self.properties = []

    # Defaults; set_source / set_destination are called below when given.
    self.source_vid = 0
    self.source_label = ""
    self.destination_vid = 1
    self.destination_label = ""
    self.load_strategy = ""

    if source is not None:
        self.set_source(source)
    if destination is not None:
        self.set_destination(destination)

    # The two endpoint ids must both be indices or both be names.
    src_vid = self.source_vid
    dst_vid = self.destination_vid
    index_then_name = isinstance(src_vid, int) and isinstance(dst_vid, str)
    name_then_index = isinstance(src_vid, str) and isinstance(dst_vid, int)
    if index_then_name or name_then_index:
        raise SyntaxError(
            "Source vid and destination vid must have same formats, both use name or both use index"
        )

    self.set_load_strategy(load_strategy)
def test_serialize_roundtrip(p2p_property_dir):
    """Serialize a graph, unload it, read it back and run SSSP on the copy.

    The test creates its own two-worker session and always closes it, so the
    session is released even when an assertion fails.
    """
    gs_image, gie_manager_image = get_gs_image_on_ci_env()
    sess = graphscope.session(
        num_workers=2,
        k8s_gs_image=gs_image,
        k8s_gie_graph_manager_image=gie_manager_image,
        k8s_coordinator_cpu=0.5,
        k8s_coordinator_mem="2500Mi",
        k8s_vineyard_cpu=0.1,
        k8s_vineyard_mem="512Mi",
        k8s_engine_cpu=0.1,
        k8s_engine_mem="1500Mi",
        k8s_vineyard_shared_mem="2Gi",
        k8s_volumes=get_k8s_volumes(),
    )
    try:
        graph = sess.load_from(
            edges={
                "knows": (
                    Loader(
                        "{}/p2p-31_property_e_0".format(p2p_property_dir),
                        header_row=True,
                    ),
                    ["src_label_id", "dst_label_id", "dist"],
                    ("src_id", "person"),
                    ("dst_id", "person"),
                ),
            },
            vertices={
                "person": Loader(
                    "{}/p2p-31_property_v_0".format(p2p_property_dir),
                    header_row=True,
                ),
            },
            generate_eid=False,
        )

        # Round trip through the serialized form; the original is unloaded
        # before the copy is read back.
        graph.serialize("/tmp/serialize")
        graph.unload()
        new_graph = Graph.deserialize("/tmp/serialize", sess)

        pg = new_graph.project_to_simple(0, 0, 0, 2)
        ctx = graphscope.sssp(pg, src=6)
        ret = (
            ctx.to_dataframe({"node": "v.id", "r": "r"}, vertex_range={"end": 6})
            .sort_values(by=["node"])
            .to_numpy(dtype=float)
        )
        expect = np.array(
            [
                [1.0, 260.0],
                [2.0, 229.0],
                [3.0, 310.0],
                [4.0, 256.0],
                [5.0, 303.0],
            ]
        )
        assert np.all(ret == expect)
    finally:
        # Previously the session was never closed by this test.
        sess.close()
def p2p_property_graph(graphscope_session):
    """Yield the p2p-31 property graph loaded through ``graphscope_session``.

    The data files are looked up under ``property_dir``, a name that is not
    defined in this function.
    """
    knows_spec = (
        Loader("{}/p2p-31_property_e_0".format(property_dir), header_row=True),
        ["src_label_id", "dst_label_id", "dist"],
        ("src_id", "person"),
        ("dst_id", "person"),
    )
    person_spec = Loader(
        "{}/p2p-31_property_v_0".format(property_dir),
        header_row=True,
    )
    graph = graphscope_session.load_from(
        edges={"knows": knows_spec},
        vertices={"person": person_spec},
        generate_eid=False,
    )
    yield graph
def load_modern_graph(sess, prefix, directed=True):
    """Load the modern graph.

    The modern graph consists of 6 vertices and 6 edges and is useful for
    testing basic functionality.

    Args:
        sess (:class:`graphscope.Session`): Session in which to load the graph.
        prefix (str): Data directory.
        directed (bool, optional): Whether to load a directed or an
            undirected graph. Defaults to True.

    Returns:
        :class:`graphscope.Graph`: A Graph object whose graph type is
        ArrowProperty.
    """

    def pipe_csv(file_name):
        return Loader(os.path.join(prefix, file_name), delimiter="|")

    graph = sess.g(directed=directed)

    # Vertex labels, both keyed by the "id" column.
    graph = graph.add_vertices(
        pipe_csv("person.csv"),
        "person",
        ["name", ("age", "int")],
        "id",
    )
    graph = graph.add_vertices(
        pipe_csv("software.csv"),
        "software",
        ["name", "lang"],
        "id",
    )

    # Edge labels, both carrying a "weight" property.
    graph = graph.add_edges(
        pipe_csv("knows.csv"),
        "knows",
        ["weight"],
        src_label="person",
        dst_label="person",
        src_field="src_id",
        dst_field="dst_id",
    )
    graph = graph.add_edges(
        pipe_csv("created.csv"),
        "created",
        ["weight"],
        src_label="person",
        dst_label="software",
        src_field="src_id",
        dst_field="dst_id",
    )
    return graph
def test_dict_in_dict_form_loader(graphscope_session, student_group_e, student_v):
    """Call ``load_from`` with fully spelled-out dict entries for each label."""
    group_entry = {
        "loader": Loader(student_group_e, session=graphscope_session),
        "properties": ["member_size"],
        "source": ("leader_student_id", "student"),
        "destination": ("member_student_id", "student"),
        "load_strategy": "both_out_in",
    }
    student_entry = {
        "loader": Loader(student_v, session=graphscope_session),
        "properties": ["name", "lesson_nums", "avg_score"],
        "vid": "student_id",
    }

    # The test passes as long as load_from does not raise.
    g = graphscope_session.load_from(
        edges={"group": group_entry},
        vertices={"student": student_entry},
    )
def test_remove_vertices_edges(graphscope_session):
    """Check that removing added labels restores the previous schema.

    A person/knows graph is extended with ``software`` vertices and
    ``created`` edges; after removing both again, the vertex and edge labels
    must equal those of the graph before the extension.
    """
    prefix = os.path.expandvars("${GS_TEST_DIR}/modern_graph")

    graph = (
        Graph(graphscope_session)
        .add_vertices(Loader(f"{prefix}/person.csv", delimiter="|"), "person")
        .add_edges(Loader(f"{prefix}/knows.csv", delimiter="|"), "knows")
    )

    another_graph = graph.add_vertices(
        Loader(f"{prefix}/software.csv", delimiter="|"), "software"
    ).add_edges(
        # Fixed: the path lacked the ``f`` prefix, so the literal text
        # "{prefix}" was passed to Loader instead of the data directory.
        Loader(f"{prefix}/created.csv", delimiter="|"),
        "created",
        src_label="person",
        dst_label="software",
    )

    # Edges first, then the vertex label they refer to.
    another_graph = another_graph.remove_edges("created")
    another_graph = another_graph.remove_vertices("software")

    assert graph.schema.vertex_labels == another_graph.schema.vertex_labels
    assert graph.schema.edge_labels == another_graph.schema.edge_labels
def ldbc_sample_with_duplicated_oid(prefix, directed):
    """Build the place/person LDBC sample graph via ``graphscope.Graph``.

    Args:
        prefix: Directory that holds the LDBC sample CSV files.
        directed: Forwarded to ``graphscope.Graph`` as ``directed``.

    Returns:
        The graph object returned by the last ``add_edges`` call.
    """

    def pipe_csv(file_name):
        return Loader(os.path.join(prefix, file_name), delimiter="|")

    graph = graphscope.Graph(directed=directed)

    # Vertex labels.
    graph = graph.add_vertices(pipe_csv("place_0_0.csv"), "place")
    graph = graph.add_vertices(pipe_csv("person_0_0.csv"), "person")

    # Edge labels.
    graph = graph.add_edges(
        pipe_csv("place_isPartOf_place_0_0.csv"),
        "isPartOf",
        src_label="place",
        dst_label="place",
    )
    graph = graph.add_edges(
        pipe_csv("person_knows_person_0_0.csv"),
        "knows",
        ["creationDate"],
        src_label="person",
        dst_label="person",
    )
    return graph
def load_subgraph(name):
    """Load a graph from the vineyard streams registered for ``name``.

    The stream ids are resolved through a vineyard client connected to the
    session's ``vineyard_rpc_endpoint``; the loaded graph is then registered
    in vineyard under ``graph_name``.

    NOTE(review): ``self`` and ``graph_name`` are not defined in this
    function - presumably they come from the enclosing scope; confirm.
    """
    import vineyard

    engine_config = self._graphscope_session.info["engine_config"]
    rpc_endpoint = engine_config["vineyard_rpc_endpoint"]
    host, port = rpc_endpoint.split(":")
    client = vineyard.connect(host, int(port))

    # get vertex/edge stream id
    vertex_stream_id = client.get_name("__%s_vertex_stream" % name, True)
    edge_stream_id = client.get_name("__%s_edge_stream" % name, True)

    # invoke load_from
    subgraph = self._graphscope_session.load_from(
        edges=[Loader(edge_stream_id)],
        vertices=[Loader(vertex_stream_id)],
        generate_eid=False,
    )

    loaded_id = vineyard.ObjectID(subgraph.vineyard_id)
    client.put_name(loaded_id, graph_name)
    logger.info("subgraph has been loaded")
    return subgraph
def __init__(
    self,
    loader,
    properties=None,
    src_label: str = "_",
    dst_label: str = "_",
    src_field: Union[str, int] = 0,
    dst_field: Union[str, int] = 1,
    load_strategy="both_out_in",
    id_type: str = "int64_t",
    eformat=None,
):
    """Initialise the edge description and normalise its property list.

    Args:
        loader: A ``Loader`` instance, or any value accepted by ``Loader(...)``.
        properties: Properties given by the caller; stored as
            ``raw_properties``. When falsy, the loader's
            ``deduced_properties`` are used instead (if any).
        src_label: Label of the source vertices.
        dst_label: Label of the destination vertices.
        src_field: Column name or column index of the source vertex id.
        dst_field: Column name or column index of the destination vertex id.
        load_strategy: One of ``"only_out"``, ``"only_in"``, ``"both_out_in"``.
        id_type: Type of the original vertex id; should be consistent with
            the original graph.
        eformat: Stored unchanged as ``_eformat``.

    Raises:
        SyntaxError: If one of ``src_field`` / ``dst_field`` is an ``int``
            while the other one is a ``str``.

    An invalid ``load_strategy`` is reported through ``check_argument``.
    """
    self.loader = loader if isinstance(loader, Loader) else Loader(loader)

    # raw properties passed by user parameters
    self.raw_properties = properties
    # final properties used for constructing the graph
    self.properties = []

    # type of vertex original id
    # should be consistent with the original graph
    self.id_type = id_type
    self.src_label = src_label
    self.dst_label = dst_label
    self.src_field = src_field
    self.dst_field = dst_field

    # check that the load strategy is one of the available ones
    check_argument(
        load_strategy in ("only_out", "only_in", "both_out_in"),
        "invalid load strategy: " + load_strategy,
    )
    self.load_strategy = load_strategy

    # Both id fields must be column indices, or both must be column names.
    index_then_name = isinstance(self.src_field, int) and isinstance(
        self.dst_field, str
    )
    name_then_index = isinstance(self.src_field, str) and isinstance(
        self.dst_field, int
    )
    if index_then_name or name_then_index:
        # The offending values used to be printed to stdout by a leftover
        # debug statement; they are reported in the exception instead.
        raise SyntaxError(
            "Source vid and destination vid must have same formats, both use name or both use index"
            f" (got src_field={self.src_field!r}, dst_field={self.dst_field!r})"
        )

    # normalize properties
    # add src/dst to property list
    self.add_property(str(self.src_field), self.id_type)
    self.add_property(str(self.dst_field), self.id_type)

    if self.raw_properties:
        self.add_properties(self.raw_properties)
    elif self.loader.deduced_properties:
        self.add_properties(self.loader.deduced_properties)

    # set selected columns to loader
    self.loader.select_columns(
        self.properties,
        include_all=self.raw_properties is None,
    )

    self._eformat = eformat