def build_node_from_tensors(feature_schema, tensors):
    """Builds a node `Data` object out of a stream of tensors.

    Tensors are pulled from `tensors` in a fixed order: int attributes,
    float attributes, string attributes (each only when the decoder
    reports a positive count for that kind), and finally the ids.

    Args:
      feature_schema: A (name, Decoder) tuple used to parse the feature.
        Only the decoder (index 1) is read here.
      tensors: An iterator yielding the tensors in the order described
        above; it is advanced with `next`.

    Returns:
      A `Data` object in Tensor format.
    """
    decoder = feature_schema[1]
    # The consumption order below must match the order the tensors were
    # produced in, so keep ints -> floats -> strings -> ids.
    int_attrs = next(tensors) if decoder.int_attr_num > 0 else None
    float_attrs = next(tensors) if decoder.float_attr_num > 0 else None
    string_attrs = next(tensors) if decoder.string_attr_num > 0 else None
    ids = next(tensors)
    return Data(ids,
                ints=int_attrs,
                floats=float_attrs,
                strings=string_attrs)
def test_only_ints_with_fusion(self):
    """Checks a spec made only of int features with embedding fusion on.

    Ten sparse features are registered (eight with a random embedding
    dimension in [8, 10], two with dimension 4 and the third
    `append_sparse` argument set to True), then the handler output for a
    batch of two samples is checked to have shape [2, total_dim].
    """
    spec = FeatureSpec(10)
    total_dim = 0
    for i in range(8):
        dim = random.randint(8, 10)
        spec.append_sparse(100 + 10 * i, dim, False)
        total_dim += dim
    spec.append_sparse(100, 4, True)  # two features need hash
    spec.append_sparse(100, 4, True)
    total_dim += 8

    handler = FeatureHandler("ints_with_fusion", spec, fuse_embedding=True)
    self.assertEqual(len(handler._float_fg), 0)
    # assertLess / assertGreater report the actual length on failure,
    # unlike comparing a boolean against True.
    self.assertLess(len(handler._int_fg), 10)
    self.assertGreater(len(handler._fused_int_fg), 0)
    self.assertEqual(len(handler._string_fg), 0)

    batch_ints = np.array(
        [[i, 2 * i] for i in range(10)]).transpose()  # [2, 10]
    input_data = Data(
        ints=tf.convert_to_tensor(batch_ints, dtype=tf.int64))
    output = handler(input_data)  # [2, total_dim]

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        ret = sess.run(output)
        # 2d array, batch_size = 2
        self.assertListEqual(list(ret.shape), [2, total_dim])
def _build_data(cls, graphs, type='node'): def list_append(list, item): if item is not None: list.append(item) return list def np_concat(list): if list: return np.concatenate(list, axis=0) return None #TODO(baole): support other labels and weights. ids_list = [] int_attrs_list = [] float_attrs_list = [] string_attrs_list = [] for graph in graphs: if type == 'node': item = graph.nodes else: return None # flatten format. ids_list = list_append(ids_list, item.ids) int_attrs_list = list_append(int_attrs_list, item.int_attrs) float_attrs_list = list_append(float_attrs_list, item.float_attrs) string_attrs_list = list_append(string_attrs_list, item.string_attrs) ids = np_concat(ids_list) ints = np_concat(int_attrs_list) floats = np_concat(float_attrs_list) strings = np_concat(string_attrs_list) return Data(ids, ints, floats, strings)
def test_floats_and_fused_ints(self):
    """Checks a spec mixing dense float features and sparse int features.

    Three dense features and seven sparse features (random embedding
    dimension in [8, 10]) are registered; the handler output for a batch
    of two samples is checked to have shape [2, total_dim].
    """
    spec = FeatureSpec(10)
    for _ in range(3):
        spec.append_dense()
    total_dim = 3
    for i in range(7):
        dim = random.randint(8, 10)
        spec.append_sparse(20 + 10 * i, dim, False)
        total_dim += dim

    handler = FeatureHandler("floats_and_fused_ints", spec)
    self.assertEqual(len(handler._float_fg), 3)
    self.assertEqual(len(handler._int_fg), 0)
    # assertGreater reports the actual length on failure, unlike
    # comparing a boolean against True.
    self.assertGreater(len(handler._fused_int_fg), 0)
    self.assertEqual(len(handler._string_fg), 0)

    batch_floats = np.array(
        [[1.0 * i, 2.0 * i] for i in range(3)],
        dtype=np.float32).transpose()  # [2, 3]
    batch_ints = np.array(
        [[i, 2 * i] for i in range(7)]).transpose()  # [2, 7]
    input_data = Data(
        floats=tf.convert_to_tensor(batch_floats, dtype=tf.float32),
        ints=tf.convert_to_tensor(batch_ints, dtype=tf.int64))
    output = handler(input_data)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        ret = sess.run(output)
        self.assertListEqual(list(ret.shape), [2, total_dim])
def test_only_strings(self):
    """Checks a spec made only of multi-valued string features.

    Three multi-value features (random embedding dimension in [8, 10],
    "," as delimiter) are registered; the handler output for a batch of
    two samples is checked to have shape [2, total_dim].
    """
    feature_count = 3
    spec = FeatureSpec(feature_count)
    total_dim = 0
    for idx in range(feature_count):
        emb_dim = random.randint(8, 10)
        spec.append_multival(10 + 10 * idx, emb_dim, ",")
        total_dim += emb_dim

    handler = FeatureHandler("only_strings", spec)
    self.assertEqual(len(handler._float_fg), 0)
    self.assertEqual(len(handler._int_fg), 0)
    self.assertEqual(len(handler._fused_int_fg), 0)
    self.assertEqual(len(handler._string_fg), 3)

    # One row per feature, one column per sample; transposed to [2, 3].
    per_feature = [
        ["f1,batch1", "f1,batch2,others"],
        ["f2,batch1,others", "f2,batch2"],
        ["f3,batch1,haha", "f3,batch2,hh,kk"],
    ]
    batch_ss = np.array(per_feature).transpose()
    string_tensor = tf.convert_to_tensor(batch_ss, dtype=tf.string)
    output = handler(Data(strings=string_tensor))  # [2, total_dim]

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        ret = sess.run(output)
        # 2d array, batch_size = 2
        self.assertListEqual(list(ret.shape), [2, total_dim])
def induce_graph_with_edge(src_nodes, dst_nodes, src_nbrs, dst_nbrs):
    """Induces one SubGraph per target edge from the edge and its neighbors.

    For the i-th (src, dst) pair the node set starts with the two target
    nodes, followed by the src neighbors and then the dst neighbors.
    `src_nbrs.offsets[i]` / `dst_nbrs.offsets[i]` are added to running
    offsets, i.e. they are used as the i-th neighbor counts into the
    flattened neighbor data.

    Args:
      src_nodes: A gl.Nodes instance with shape [batch_size].
      dst_nodes: A gl.Nodes instance with shape [batch_size] or
        [batch_size, 1] for negative sample.
      src_nbrs: The src_nodes' full neighbors with 1D shape.
      dst_nbrs: The dst_nodes' full neighbors with 1D shape.

    Returns:
      A list of SubGraphs, one per entry of `src_nodes.ids`.
    """
    subgraphs = []
    src_offset, dst_offset = 0, 0
    for i in range(src_nodes.ids.size):
        # induce k-hop enclosing SubGraph of target edge.
        # Flatten both endpoints before joining them: the dst entry may be
        # a scalar or a length-1 array (neg dst is 2D), and wrapping a
        # scalar and an array in np.array() is a ragged sequence that
        # recent NumPy versions refuse to build.
        ids = np.concatenate([np.reshape(src_nodes.ids[i], [-1]),
                              np.reshape(dst_nodes.ids[i], [-1])])
        int_attrs = _get_target_attrs(
            src_nodes.int_attrs, dst_nodes.int_attrs, i)
        float_attrs = _get_target_attrs(
            src_nodes.float_attrs, dst_nodes.float_attrs, i)
        string_attrs = _get_target_attrs(
            src_nodes.string_attrs, dst_nodes.string_attrs, i)

        src_count = src_nbrs.offsets[i]
        dst_count = dst_nbrs.offsets[i]
        row, col = [], []
        # Neighbor nodes are numbered after the target nodes.
        col_offset = ids.size

        ids, int_attrs, float_attrs, string_attrs = _concat_node_with_nbr(
            ids, int_attrs, float_attrs, string_attrs,
            src_nbrs, src_offset, src_offset + src_count)
        ids, int_attrs, float_attrs, string_attrs = _concat_node_with_nbr(
            ids, int_attrs, float_attrs, string_attrs,
            dst_nbrs, dst_offset, dst_offset + dst_count)

        row, col = gen_edge_index(i, src_nbrs, col_offset, row, col, src=True)
        # dst neighbors come right after the src neighbors.
        col_offset += src_count
        row, col = gen_edge_index(i, dst_nbrs, col_offset, row, col, src=False)

        src_offset += src_count
        dst_offset += dst_count

        edge_index = np.stack([np.array(row), np.array(col)], axis=0)
        nodes = Data(ids,
                     ints=int_attrs,
                     floats=float_attrs,
                     strings=string_attrs)
        subgraphs.append(SubGraph(edge_index, nodes))
    return subgraphs
def build_data_dict(self, flatten_values):
    """Rebuilds the per-alias `Data` objects from a flat list of values.

    For every alias in `self._masks` the masks are flattened into eight
    flags, in the order ints, floats, strings, labels, weights, ids,
    dst_ids, offsets. A truthy flag consumes the next entry of
    `flatten_values`; a falsy flag yields None for that field.

    Args:
      flatten_values: An indexable sequence holding the values of all
        present fields, in the order the masks are traversed.

    Returns:
      dict: key is alias, value is `Data`.
    """
    data_dict = {}
    position = 0  # index of the next unconsumed entry of flatten_values
    for alias, masks in self._masks.items():
        fields = []
        for mask in sum(masks, []):
            if mask:
                fields.append(flatten_values[position])
                position += 1
            else:
                fields.append(None)
        (ints, floats, strings, labels,
         weights, ids, dst_ids, offsets) = fields
        data_dict[alias] = Data(ids, ints, floats, strings, labels, weights,
                                offsets=offsets, dst_ids=dst_ids)
    return data_dict
def transform(self, transform_func=None):
    """Returns a `BatchGraph` whose node features are encoded.

    The nodes' ids and int/float/string attributes are wrapped in a
    `Data` and passed through a `FeatureHandler` built from the node
    schema. When no node schema is set, the graph is returned unchanged.

    Args:
      transform_func: A function that takes in an `BatchGraph` object
        and returns a transformed version. It is accepted but not read
        by this implementation.

    Returns:
      `self` when `node_schema` is None, otherwise a new `BatchGraph`
      carrying the encoded nodes and the same additional keys.
    """
    schema = self.node_schema
    if schema is None:
        return self

    # TODO(baole): supports heterogeneous graph.
    vertex_handler = FeatureHandler(schema[0], schema[1].feature_spec)
    raw_nodes = self.nodes
    node = Data(raw_nodes.ids,
                raw_nodes.int_attrs,
                raw_nodes.float_attrs,
                raw_nodes.string_attrs)
    node_tensor = vertex_handler.forward(node)

    graph = BatchGraph(self.edge_index,
                       node_tensor,
                       schema,
                       self.graph_node_offsets,
                       additional_keys=self.additional_keys)
    # Carry the extra per-graph entries over to the new graph.
    for key in self.additional_keys:
        graph[key] = self[key]
    return graph
def test_only_floats(self):
    """Checks a spec made only of dense float features.

    Ten dense features are registered; the handler output for a batch of
    two samples must have the same shape and the same values as the
    input floats.
    """
    feature_count = 10
    spec = FeatureSpec(feature_count)
    for _ in range(feature_count):
        spec.append_dense()

    handler = FeatureHandler("only_floats", spec)
    self.assertEqual(len(handler._float_fg), 10)
    self.assertEqual(len(handler._int_fg), 0)
    self.assertEqual(len(handler._fused_int_fg), 0)
    self.assertEqual(len(handler._string_fg), 0)

    per_feature = [[1.0 * i, 2.0 * i] for i in range(feature_count)]
    batch_floats = np.array(per_feature).T  # [2, 10]
    float_tensor = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
    output = handler(Data(floats=float_tensor))  # [2, 10]

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        ret = sess.run(output)
        self.assertListEqual(list(ret.shape), list(batch_floats.shape))
        # Shapes match at this point, so rows can be compared pairwise.
        for got_row, want_row in zip(ret, batch_floats):
            self.assertListEqual(list(got_row), list(want_row))