def test_node_iterate_using_gsl(self):
    """Iterate 'user' nodes through GSL queries, unshuffled then shuffled.

    Loads a generated attributed node file into a fresh graph. The first
    query (batch size 4, ``window=1``) is drained for at most 100 steps and
    the collected ids are compared with ``range(*self.value_range_)`` via
    ``utils.check_sorted_equal``. The second query adds ``shuffle()`` and is
    read for 10 steps, each batch being checked with
    ``utils.check_node_attrs`` and ``utils.check_subset``.
    """
    file_path = self.gen_test_data([utils.ATTRIBUTED])
    decoder = gl.Decoder(attr_types=utils.ATTR_TYPES)
    g = gl.Graph() \
        .node(source=file_path, node_type=self.node_type_, decoder=decoder)
    g.init(tracker=utils.TRACKER_PATH)
    # Once the graph is initialised it must be closed even when a check
    # below raises; otherwise a failing test leaves the graph open.
    try:
        batch_size = 4
        query = g.V("user").batch(batch_size).alias('n').values()
        ds = gl.Dataset(query, window=1)
        res_ids = []
        max_iter = 100
        for _ in range(max_iter):
            try:
                nodes = ds.next()['n']
                utils.check_node_attrs(nodes)
                res_ids.extend(list(nodes.ids))
            except gl.OutOfRangeError:
                # The dataset signals the end of the traversal this way.
                break
        ids = range(self.value_range_[0], self.value_range_[1])
        utils.check_sorted_equal(res_ids, ids)

        query = g.V('user').batch(batch_size).shuffle().alias('n').values()
        ds = gl.Dataset(query)
        max_iter = 10
        for _ in range(max_iter):
            nodes = ds.next()['n']
            utils.check_node_attrs(nodes)
            utils.check_subset(nodes.ids, ids)
    finally:
        g.close()
def test_conditional_negative_sample(self):
    """Sample conditional negatives for one edge batch and verify them.

    For each edge of ``self._cond_edge_type`` (batch of 4) the query yields
    the destination node ('dst'), the source node ('src') and 4 negatives
    ('neg') drawn with ``by('random')`` and a ``where`` condition on 'dst'.
    Only the first batch is inspected.
    """
    def _check_ids(pos_id, neg_ids):
        # Negative k must agree with the positive id modulo the k-th divisor.
        for slot, modulus in enumerate((5, 4, 3, 3)):
            utils.check_val_equal(neg_ids[slot] % modulus, pos_id % modulus)

    def _expand(e):
        # Destination branch first, then source -> negatives, as a tuple.
        dst_branch = e.inV().alias('dst')
        neg_branch = e.outV().alias('src') \
            .outNeg(self._cond_edge_type).sample(4).by('random').where(
                "dst",
                condition={
                    "int_cols": [0,1],
                    "int_props": [0.25,0.25],
                    "str_cols": [0],
                    "str_props": [0.5]}).alias('neg')
        return (dst_branch, neg_branch)

    seed_edges = self.g.E(self._cond_edge_type).batch(4).alias("e")
    q = seed_edges.each(_expand).values()
    dataset = gl.Dataset(q)
    res = dataset.next()
    src_ids = res["src"].ids
    dst_ids = res["dst"].ids
    neg_ids = res["neg"].ids
    for idx, src_id in enumerate(src_ids):
        print('src_id:', src_id, 'dst_id:', dst_ids[idx], 'neg_ids:', neg_ids[idx])
        # Ids the sampled negatives must not contain for this source.
        nbr_ids = [src_id+2, src_id+3, src_id+5]
        utils.check_disjoint(neg_ids[idx], nbr_ids)
        _check_ids(dst_ids[idx], neg_ids[idx])
def test_edge_shuffle(self):
    """Traverse weighted 'first' edges with ``shuffle(traverse=True)``.

    Loads a generated weighted edge file into a fresh graph, drains the
    shuffled query (batch size 4) for at most 100 steps, and compares the
    collected source / destination ids with ``range(*self.src_range_)`` and
    ``range(*self.dst_range_)`` via ``utils.check_sorted_equal``.
    """
    file_path = self.gen_test_data([utils.WEIGHTED], False)
    decoder = gl.Decoder(weighted=True)
    g = gl.Graph() \
        .edge(source=file_path, edge_type=self.edge_tuple_, decoder=decoder)
    g.init(tracker=utils.TRACKER_PATH)
    # Close the graph even if a check fails, so a failing test does not
    # leave an initialised graph behind.
    try:
        batch_size = 4
        sampler = g.E('first').batch(batch_size) \
            .shuffle(traverse=True).alias('seed').values()
        res_src = []
        res_dst = []
        max_iter = 100
        ds = gl.Dataset(sampler)
        for _ in range(max_iter):
            try:
                edges = ds.next()['seed']
                utils.check_edge_weights(edges)
                res_src.extend(list(edges.src_ids))
                res_dst.extend(list(edges.dst_ids))
            except gl.OutOfRangeError:
                # The dataset signals the end of the traversal this way.
                break
        src_ids = range(self.src_range_[0], self.src_range_[1])
        dst_ids = range(self.dst_range_[0], self.dst_range_[1])
        utils.check_sorted_equal(res_src, src_ids)
        utils.check_sorted_equal(res_dst, dst_ids)
    finally:
        g.close()
def test_sampling_with_mask(self):
    """Traverse TEST-masked edges with neighbor sampling and verify sources.

    Eager mode is switched off first. Each batch of 8 masked
    ``self._edge1_type`` edges yields source ids, edge labels, destination
    ids / weights / labels and 3 'topk' neighbors of the destination over
    ``self._edge2_type``. The source ids from all batches are compared with
    an expected list built from ``self._test_node_range`` and
    ``utils.fixed_dst_ids``.
    """
    gl.set_eager_mode(False)
    bs = 8

    def _expand(e):
        src_branch = e.outV().alias('src')
        nbr_branch = e.inV().alias('dst') \
            .outV(self._edge2_type).sample(3).by('topk').alias('nbr')
        return (src_branch, nbr_branch)

    def _select(x):
        return (x['src'].ids, x['test'].labels,
                x['dst'].ids, x['dst'].weights, x['dst'].labels,
                x['nbr'].ids, x['nbr'].int_attrs)

    seeds = self.g.E(self._edge1_type, mask=gl.Mask.TEST).batch(bs).alias('test')
    q = seeds.each(_expand).values(_select)
    dataset = gl.Dataset(q)

    iteration = 0
    res = []
    while True:
        try:
            sid, elb, did, dwei, dlb, nid, ni = dataset.next()
            utils.check_id_weights(did, dwei)
            utils.check_equal(dlb, did)
            iteration += 1
            res.extend(list(sid))
        except gl.OutOfRangeError:
            break

    # Each test node is expected once per entry of its fixed_dst_ids result.
    lower, upper = self._test_node_range[0], self._test_node_range[1]
    expected = []
    for elem in range(lower, upper):
        repeat = len(utils.fixed_dst_ids(elem, self._node2_range))
        expected.extend([elem] * repeat)
    utils.check_sorted_equal(res, expected)
def test_negative_sample(self):
    """Check output shapes of random negative sampling for one batch.

    Two ``self._node1_type`` nodes are taken as seeds ('a') and 5 negatives
    ('b') are drawn per seed over ``self._edge1_type``; the seed ids must
    have shape ``[2]`` and the negatives' weights shape ``[2, 5]``.
    """
    seeds = self.g.V(self._node1_type).batch(2).alias('a')
    negatives = seeds.outNeg(self._edge1_type).sample(5).by("random").alias('b')
    q = negatives.values(lambda x: (x['a'].ids, x['b'].weights))
    dataset = gl.Dataset(q)
    seed_ids, neg_weights = dataset.next()
    utils.check_equal(list(seed_ids.shape), [2])
    utils.check_equal(list(neg_weights.shape), [2, 5])
def test_sample_edge(self):
    """Check output shapes when sampling out-edges and their end nodes.

    Eight ``self._node1_type`` seeds ('a') each sample 3 random out-edges
    ('b') of ``self._edge1_type``; 'c' is the ``inV()`` side of those edges.
    Only the first batch is inspected.
    """
    seeds = self.g.V(self._node1_type).batch(8).alias('a')
    sampled_edges = seeds.outE(self._edge1_type).sample(3).by("random").alias('b')
    q = sampled_edges.inV().alias('c').values()
    dataset = gl.Dataset(q)
    res = dataset.next()
    expected_shapes = (('a', [8]), ('b', [8, 3]), ('c', [8, 3]))
    for alias, shape in expected_shapes:
        utils.check_equal(list(res[alias].shape), shape)
def test_iterate_node_with_2hop(self):
    """Drain a 2-hop sampling query and check the shape of every batch.

    Seeds 'a' are ``self._node1_type`` nodes in batches of 2; 'b' samples 3
    random neighbors over ``self._edge1_type`` and 'c' samples 4 random
    neighbors of each 'b' node over ``self._edge2_type``. The dataset is
    created with ``10`` as its second positional argument.
    """
    seeds = self.g.V(self._node1_type).batch(2).alias('a')
    hop1 = seeds.outV(self._edge1_type).sample(3).by('random').alias('b')
    hop2 = hop1.outV(self._edge2_type).sample(4).by('random').alias('c')
    q = hop2.values()
    dataset = gl.Dataset(q, 10)
    expected_shapes = (('a', [2]), ('b', [2, 3]), ('c', [2 * 3, 4]))
    while True:
        try:
            res = dataset.next()
            for alias, shape in expected_shapes:
                utils.check_equal(list(res[alias].shape), shape)
        except gl.OutOfRangeError:
            break
def test_iterate_edge_with_1hop(self):
    """Drain an edge traversal with one sampled hop and check batch shapes.

    Edges 'a' of ``self._edge1_type`` come in batches of 4, 'b' is their
    ``outV()`` side and 'c' samples 2 random neighbors of 'b' over the same
    edge type.
    """
    seed_edges = self.g.E(self._edge1_type).batch(4).alias("a")
    src_nodes = seed_edges.outV().alias("b")
    nbrs = src_nodes.outV(self._edge1_type).sample(2).by("random").alias("c")
    q = nbrs.values()
    dataset = gl.Dataset(q)
    while True:
        try:
            res = dataset.next()
            edge_batch, node_batch, nbr_batch = res['a'], res['b'], res['c']
            utils.check_equal(list(edge_batch.shape), [4])
            utils.check_equal(list(node_batch.shape), [4])
            # [batch_size, int_attr_num]
            utils.check_equal(list(node_batch.int_attrs.shape), [4, 2])
            utils.check_equal(list(nbr_batch.shape), [4, 2])
        except gl.OutOfRangeError:
            break
def test_iterate_edge_with_each(self):
    """Drain an edge query that fans out through ``each`` on both end nodes.

    From every batch of 4 ``self._edge1_type`` edges ('a'), the ``outV()``
    side ('b') samples 2 random neighbors ('d') over ``self._edge1_type``
    and the ``inV()`` side ('c') samples 2 random neighbors ('e') over
    ``self._edge2_type``. No values are asserted; the test only requires
    that iteration ends with ``gl.OutOfRangeError``.
    """
    def _expand(x):
        out_branch = x.outV().alias('b') \
            .outV(self._edge1_type).sample(2).by('random').alias('d')
        in_branch = x.inV().alias('c') \
            .outV(self._edge2_type).sample(2).by('random').alias('e')
        return (out_branch, in_branch)

    def _select(x):
        return (x['a'].int_attrs, x['d'].weights, x['e'].ids)

    seed_edges = self.g.E(self._edge1_type).batch(4).alias('a')
    q = seed_edges.each(_expand).values(_select)
    dataset = gl.Dataset(q)
    while True:
        try:
            dataset.next()
        except gl.OutOfRangeError:
            break
def test_full_sample(self):
    """Drain a 'full' neighbor sampling query and verify each seed's slice.

    Seeds are ``self._node2_type`` nodes in batches of 4, with
    ``sample(3).by("full")`` over ``self._edge2_type``. For every seed the
    reported offset must equal ``min(len(expected), 3)`` and the matching
    slice of neighbor ids must pass ``utils.check_subset`` against
    ``utils.fixed_dst_ids(seed, self._node1_range)``.
    """
    seeds = self.g.V(self._node2_type).batch(4).alias('a')
    nbr_query = seeds.outV(self._edge2_type).sample(3).by("full").alias('b')
    q = nbr_query.values(lambda x: (x['a'].ids, x['b'].ids, x['b'].offsets))
    dataset = gl.Dataset(q)
    while True:
        try:
            src, nbrs, offsets = dataset.next()
            # Neighbors arrive flattened; offsets give each seed's count.
            begin = 0
            for pos, count in enumerate(offsets):
                expected_nbrs = utils.fixed_dst_ids(src[pos], self._node1_range)
                assert count == min(len(expected_nbrs), 3)
                end = begin + count
                utils.check_subset(nbrs[begin: end], expected_nbrs)
                begin = end
        except gl.OutOfRangeError:
            break
def test_basic(self):
    """Read one batch of plain 'first' edges and check their end-point ids.

    Loads a generated edge file (no optional fields, default decoder) into a
    fresh graph, fetches the first batch of 4 edges and checks its source /
    destination ids against ``range(*self.src_range_)`` and
    ``range(*self.dst_range_)`` with ``utils.check_ids``.
    """
    file_path = self.gen_test_data([], False)
    decoder = gl.Decoder()
    g = gl.Graph() \
        .edge(source=file_path, edge_type=self.edge_tuple_, decoder=decoder)
    g.init(tracker=utils.TRACKER_PATH)
    # Close the graph even if a check fails, so a failing test does not
    # leave an initialised graph behind.
    try:
        query = g.E("first").batch(4).alias('e').values()
        ds = gl.Dataset(query)
        edges = ds.next()['e']
        utils.check_ids(edges.src_ids,
                        range(self.src_range_[0], self.src_range_[1]))
        utils.check_ids(edges.dst_ids,
                        range(self.dst_range_[0], self.dst_range_[1]))
    finally:
        g.close()
def test_labeled(self):
    """Read one batch of labeled 'first' edges and check ids and labels.

    Loads a generated labeled edge file into a fresh graph, fetches the
    first batch of ``self.batch_size_`` edges (dataset ``window=1``) and
    checks the source / destination ids with ``utils.check_ids`` and the
    labels with ``utils.check_edge_labels``.
    """
    file_path = self.gen_test_data([utils.LABELED], False)
    decoder = gl.Decoder(labeled=True)
    g = gl.Graph() \
        .edge(source=file_path, edge_type=self.edge_tuple_, decoder=decoder)
    g.init(tracker=utils.TRACKER_PATH)
    # Close the graph even if a check fails, so a failing test does not
    # leave an initialised graph behind.
    try:
        query = g.E("first").batch(self.batch_size_).alias('e').values()
        ds = gl.Dataset(query, window=1)
        edges = ds.next()['e']
        utils.check_ids(edges.src_ids,
                        range(self.src_range_[0], self.src_range_[1]))
        utils.check_ids(edges.dst_ids,
                        range(self.dst_range_[0], self.dst_range_[1]))
        utils.check_edge_labels(edges)
    finally:
        g.close()
def test_sample_with_filter(self):
    """Drain a sampling query filtered by 'dst' and verify the exclusion.

    For each batch of 4 ``self._edge1_type`` edges, the ``outV()`` side
    ('src') samples 2 random neighbors ('b') with ``filter('dst')``. Every
    batch must have 'b' of shape ``[4, 2]`` and no row of 'b' may contain
    the 'dst' id of the same edge.
    """
    def _expand(e):
        dst_branch = e.inV().alias('dst')
        nbr_branch = e.outV().alias('src') \
            .outV(self._edge1_type).sample(2).by("random") \
            .filter('dst').alias("b")
        return (dst_branch, nbr_branch)

    seed_edges = self.g.E(self._edge1_type).batch(4).alias("a")
    q = seed_edges.each(_expand).values()
    dataset = gl.Dataset(q)
    while True:
        try:
            res = dataset.next()
            sampled = res['b']
            utils.check_equal(list(sampled.shape), [4, 2])
            filter_ids = res['dst'].ids
            remained_ids = sampled.ids
            for excluded, kept_row in zip(filter_ids, remained_ids):
                assert excluded not in kept_row
        except gl.OutOfRangeError:
            break
def test_traverse_with_mask(self):
    """Traverse TEST-masked nodes for two epochs and verify ids and attrs.

    Each batch of 8 masked ``self._node1_type`` nodes yields ids plus int,
    float and string attributes, which are checked with the matching
    ``utils.check_*_attrs`` helper. After each epoch the collected ids are
    compared with ``range(*self._test_node_range)``.
    """
    bs = 8

    def _select(x):
        masked = x['test']
        return (x['test'].ids, x['test'].int_attrs,
                x['test'].float_attrs, x['test'].string_attrs) \
            if masked is None else \
            (x['test'].ids, x['test'].int_attrs,
             x['test'].float_attrs, x['test'].string_attrs)

    q = self.g.V(self._node1_type, mask=gl.Mask.TEST).batch(bs).alias('test') \
        .values(lambda x: (x['test'].ids, x['test'].int_attrs,
                           x['test'].float_attrs, x['test'].string_attrs))
    dataset = gl.Dataset(q)
    iteration = 0
    for _ in range(2):
        res = []
        while True:
            try:
                ids, int_attrs, float_attrs, string_attrs = dataset.next()
                utils.check_i_attrs(int_attrs, ids)
                utils.check_f_attrs(float_attrs, ids)
                utils.check_s_attrs(string_attrs, ids)
                iteration += 1
                res.extend(list(ids))
            except gl.OutOfRangeError:
                break
        lower, upper = self._test_node_range[0], self._test_node_range[1]
        utils.check_sorted_equal(res, range(lower, upper))
def test_weighted_labeled_attributed(self):
    """Read one batch of 'first' edges carrying weights, labels and attrs.

    Loads a generated edge file with all three optional fields into a fresh
    graph, fetches the first batch of ``self.batch_size_`` edges and checks
    the end-point ids, labels, attributes and weights with the matching
    ``utils.check_*`` helpers.
    """
    file_path = self.gen_test_data(
        [utils.WEIGHTED, utils.LABELED, utils.ATTRIBUTED], False)
    decoder = gl.Decoder(
        weighted=True, labeled=True, attr_types=utils.ATTR_TYPES)
    g = gl.Graph() \
        .edge(source=file_path, edge_type=self.edge_tuple_, decoder=decoder)
    g.init(tracker=utils.TRACKER_PATH)
    # Close the graph even if a check fails, so a failing test does not
    # leave an initialised graph behind.
    try:
        query = g.E("first").batch(self.batch_size_).alias('e').values()
        ds = gl.Dataset(query)
        edges = ds.next()['e']
        utils.check_ids(edges.src_ids,
                        range(self.src_range_[0], self.src_range_[1]))
        utils.check_ids(edges.dst_ids,
                        range(self.dst_range_[0], self.dst_range_[1]))
        utils.check_edge_labels(edges)
        utils.check_edge_attrs(edges)
        utils.check_edge_weights(edges)
    finally:
        g.close()
def test_truncated_full_edge_sample(graph):
    """Iterate "buy" edges and fully sample in-edges of their dst nodes.

    Edges are read in shuffled batches of 3 (``traverse=True``). For each
    destination node up to 200 "buy" in-edges are taken with ``by("full")``.
    Every sampled edge must satisfy ``weight == 0.1 * (src_id + dst_id)``
    within ``10**-6``. The offsets of the first batch are printed.
    """
    edges = graph.E("buy").batch(3).shuffle(traverse=True).alias("edges")
    dst = edges.inV().alias("dst")
    hop1_edges = dst.inE("buy").sample(200).by("full").alias("dst_hop1_edges")
    hop1_edges.inV().alias("dst_hop1")

    ds = gl.Dataset(edges.values())
    step = 0
    while True:
        try:
            res = ds.next()
            step += 1
            sampled = res["dst_hop1_edges"]
            if step == 1:
                print(sampled.offsets)
            triples = zip(list(sampled.src_ids.flatten()),
                          list(sampled.dst_ids.flatten()),
                          list(sampled.weights.flatten()))
            for src_id, dst_id, weight in triples:
                assert abs(0.1 * (src_id + dst_id) - weight) < 10**-6
        except gl.OutOfRangeError:
            break
def test_conditional_negtaive_sample(graph):
    """Run conditional negative sampling on "cond_edge" and print one batch.

    Edges are read in shuffled batches of 4; for each source node 5
    negatives are drawn with ``by("in_degree")`` under a ``where`` condition
    targeting "dst". The first batch is printed: ids of src/dst and, for
    the first four negatives, an id together with one int or string
    attribute. ``gl.OutOfRangeError`` is reported by printing a message.
    """
    condition = dict(
        unique=False,
        batch_share=True,
        int_cols=[0, 1],
        int_props=[0.25, 0.25],
        str_cols=[0],
        str_props=[0.5],
    )
    edges = graph.E("cond_edge").batch(4).shuffle(traverse=True).alias("edges")
    src = edges.outV().alias("src")
    dst = edges.inV().alias("dst")
    negatives = src.outNeg("cond_edge").sample(5).by("in_degree")
    negatives.where(target="dst", condition=condition).alias("neg").values()

    ds = gl.Dataset(edges.values())
    try:
        res = ds.next()
        src_ids = res["src"].ids
        dst_ids = res["dst"].ids
        neg_nodes = res["neg"]
        for row in range(src_ids.size):
            neg_ids = neg_nodes.ids[row]
            print('src_id:%d\tdst_id:%d' % (src_ids[row], dst_ids[row]))
            print('neg_id_1:%d\tint_0_attr:%d' %
                  (neg_ids[0], neg_nodes.int_attrs[row][0][0]))
            print('neg_id_2:%d\tint_1_attr:%d' %
                  (neg_ids[1], neg_nodes.int_attrs[row][1][1]))
            print('neg_id_3:%d\tstr_0_attr:%s' %
                  (neg_ids[2], neg_nodes.string_attrs[row][2][0]))
            print('neg_id_4:%d\tstr_0_attr:%s\n' %
                  (neg_ids[3], neg_nodes.string_attrs[row][3][0]))
    except gl.OutOfRangeError:
        print("OutOfRange...")
def test_edge_iterate(graph, local=False):
    """Iterate "buy" edges for two epochs and validate sampled neighborhoods.

    Query aliases:
      * "edges": "buy" edges in shuffled batches of 32 (``traverse=True``).
      * "src" / "dst": ``outV()`` / ``inV()`` of each edge.
      * "neg": 2 negatives per src by "in_degree"; "neg_hop1": 4 random
        ``inV("buy")`` samples of each negative.
      * "src_hop1_edges" / "src_hop1": 5 random "buy" out-edges of src and
        their ``inV()`` nodes.
      * "dst_hop1_edges" / "dst_hop1": 3 "buy" in-edges of dst sampled by
        "edge_weight" and their ``inV()`` nodes.

    Args:
      graph: graph object exposing the GSL entry point ``E``.
      local: when true, shapes are asserted on every step, using the
        remainder ``1000 % 32`` on step ``1000 // 32 + 1``; otherwise
        shapes are asserted on the first step of each epoch only.
    """
    batch_size = 32
    total_edges = 1000  # total buy edges count is 1000
    last_step = total_edges // batch_size + 1
    remainder = total_edges % batch_size

    def _assert_weights_follow_ids(sampled_edges):
        # Every sampled edge weight must equal 0.1 * (src_id + dst_id).
        src_ids = list(sampled_edges.src_ids.flatten())
        dst_ids = list(sampled_edges.dst_ids.flatten())
        weights = list(sampled_edges.weights.flatten())
        for src_id, dst_id, weight in zip(src_ids, dst_ids, weights):
            assert abs(0.1 * (src_id + dst_id) - weight) < 10**-6

    edges = graph.E("buy").batch(32).shuffle(traverse=True).alias("edges")
    src = edges.outV().alias("src")
    dst = edges.inV().alias("dst")
    neg = src.outNeg("buy").sample(2).by("in_degree").alias("neg")
    neg.inV("buy").sample(4).by("random").alias("neg_hop1")
    src_out_edges = src.outE("buy").sample(5).by("random").alias("src_hop1_edges")
    src_out_edges.inV().alias("src_hop1")
    dst_in_edges = dst.inE("buy").sample(3).by("edge_weight").alias("dst_hop1_edges")
    dst_in_edges.inV().alias("dst_hop1")
    query = edges.values()
    ds = gl.Dataset(query)

    epoch = 2
    for _ in range(epoch):
        step = 0
        while True:
            try:
                res = ds.next()
                step += 1

                edge_batch = res["edges"]
                src_nodes = res["src"]
                dst_nodes = res["dst"]
                neg_nodes = res["neg"]
                src_hop1_edges = res["src_hop1_edges"]
                src_hop1_nodes = res["src_hop1"]
                neg_hop1_nodes = res["neg_hop1"]
                dst_hop1_edges = res["dst_hop1_edges"]
                dst_hop1_nodes = res["dst_hop1"]

                assert edge_batch.type == ("user", "item", "buy")
                assert src_nodes.type == "user"
                assert dst_nodes.type == "item"
                assert neg_nodes.type == "item"
                assert src_hop1_edges.type == ("user", "item", "buy")
                assert src_hop1_nodes.type == "item"
                assert neg_hop1_nodes.type == "user"
                assert dst_hop1_edges.type == ("item", "user", "buy_reverse")
                assert dst_hop1_nodes.type == "user"

                # Pick the leading dimension to verify, if any, this step.
                if local and step == last_step:
                    rows = remainder
                elif local or step == 1:
                    rows = batch_size
                else:
                    rows = None

                if rows is not None:
                    assert tuple(neg_nodes.float_attrs.shape) == (rows, 2, 2)
                    assert tuple(neg_hop1_nodes.weights.shape) == (rows * 2, 4)
                    assert tuple(src_hop1_edges.weights.shape) == (rows, 5)
                    assert tuple(src_hop1_nodes.float_attrs.shape) == (rows, 5, 2)
                    assert tuple(dst_hop1_edges.weights.shape) == (rows, 3)
                    assert tuple(dst_hop1_nodes.weights.shape) == (rows, 3)

                _assert_weights_follow_ids(dst_hop1_edges)
                _assert_weights_follow_ids(src_hop1_edges)
            except gl.OutOfRangeError:
                break
def test_node_iterate(graph, local=False):
    """Iterate users for two epochs with a 2-hop user-item-user sampling path.

    Query aliases:
      * "src": "user" nodes in shuffled batches of 32 (``traverse=True``).
      * "hop1": 5 ``outV("buy")`` neighbors per user by "edge_weight".
      * "hop1-hop2": 2 random ``inE("buy")`` edges per hop1 node.
      * "hop2": ``inV()`` nodes of those edges.

    Args:
      graph: graph object exposing the GSL entry point ``V``.
      local: when true, shapes are asserted on every step, using the
        remainder ``100 % 32`` on step ``100 // 32 + 1``; otherwise shapes
        are asserted on the first step of each epoch only.
    """
    batch_size = 32
    total_users = 100  # total user nodes count is 100
    last_step = total_users // batch_size + 1
    remainder = total_users % batch_size

    seeds = graph.V("user").batch(32).shuffle(traverse=True).alias("src")
    hop1 = seeds.outV("buy").sample(5).by("edge_weight").alias("hop1")
    hop1_to_hop2 = hop1.inE("buy").sample(2).by("random").alias("hop1-hop2")
    query = hop1_to_hop2.inV().alias("hop2").values()
    ds = gl.Dataset(query)

    epoch = 2
    for _ in range(epoch):
        step = 0
        while True:
            try:
                res = ds.next()
                step += 1

                src_nodes = res["src"]
                hop1_nodes = res["hop1"]
                hop1_hop2_edges = res["hop1-hop2"]
                hop2_nodes = res["hop2"]

                assert isinstance(src_nodes, gl.Nodes)
                assert isinstance(hop1_nodes, gl.Nodes)
                assert isinstance(hop1_hop2_edges, gl.Edges)
                assert isinstance(hop2_nodes, gl.Nodes)

                assert src_nodes.type == "user"
                assert hop1_nodes.type == "item"
                assert hop1_hop2_edges.type == ("item", "user", "buy_reverse")
                assert hop1_hop2_edges.edge_type == "buy_reverse"
                assert hop2_nodes.type == "user"

                # Pick the leading dimension to verify, if any, this step.
                if local and step == last_step:
                    rows = remainder
                elif local or step == 1:
                    rows = batch_size
                else:
                    rows = None

                if rows is not None:
                    assert tuple(src_nodes.ids.shape) == (rows, )
                    assert tuple(hop1_nodes.ids.shape) == (rows, 5)
                    assert tuple(hop1_hop2_edges.src_ids.shape) == (rows * 5, 2)
                    assert tuple(hop1_hop2_edges.dst_ids.shape) == (rows * 5, 2)
                    assert tuple(hop2_nodes.ids.shape) == (rows * 5, 2)
                    # 2 float attrs
                    assert tuple(hop1_nodes.float_attrs.shape) == (rows, 5, 2)
                    assert tuple(hop1_hop2_edges.weights.shape) == (rows * 5, 2)
                    assert tuple(hop2_nodes.weights.shape) == (rows * 5, 2)
            except gl.OutOfRangeError:
                break