def merged_embedding(name, sparse_inputs, initializer, emb_dim, feature_dim,
                     combiner='sum', vtype=VarType.Index, length=50,
                     reverse=False, feature_add_probability=1.0):
    """xdl embedding over a list of sparse inputs merged into one lookup.

    Args:
      name: name for embedding, will be used for declaring variable on ps-plus
      sparse_inputs: a list of sparse tensors representing input data
      initializer: initializer for the weights
      emb_dim: embedding dimension
      feature_dim: sparse input dimension, for pre-allocating memory
      combiner: reduce operator, one of 'sum' | 'mean' | 'tile'
      vtype: variable type on ps-plus
      length: tile length, used only when combiner == 'tile'
      reverse: tile direction flag, used only when combiner == 'tile'
      feature_add_probability: save_ratio forwarded to gather. BUGFIX: this
        name was previously read without being a parameter or local, which
        raised NameError at runtime; it is now a keyword parameter with a
        default of 1.0 (keep every feature), preserving the call interface.

    Returns:
      a tensor representing the combined embedding result

    Raises:
      Exception: if combiner is not one of the supported values
    """
    import xdl.python.framework.variable as variable
    var = variable.Variable(name=name,
                            dtype=DataType.float,
                            shape=[feature_dim, emb_dim],
                            initializer=initializer,
                            vtype=vtype,
                            trainable=True)
    merged_sparse_inputs = merge_sparse(sparse_inputs)
    ids = merged_sparse_inputs.ids
    # De-duplicate ids before the ps-plus lookup; idx maps each original id
    # back to its row in the gathered table.
    unique_ids, idx = xdl.unique(ids, itype=DataType.int32)
    embeddings = var.gather(unique_ids, save_ratio=feature_add_probability)
    global _EMBEDDING_TENSOR
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    if combiner == 'sum':
        embeddings = embedding_ops.merged_ksum(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups)
    elif combiner == 'mean':
        embeddings = embedding_ops.merged_kmean(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups)
    elif combiner == 'tile':
        embeddings = embedding_ops.merged_tile(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups,
            length, reverse)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))
    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                             embeddings)
    set_embedding_info([var], emb_info)
    return embeddings
def test_unique_cpu_2d(self):
    """Unique over id pairs: verify unique rows and per-row inverse indices."""
    res_uniq = np.array([[3, 1], [2, 1], [2, 0], [1, 3], [1, 1], [3, 2]])
    res_idx = np.array([0, 1, 0, 2, 3, 4, 2, 3, 5, 5, 1, 2])
    # BUGFIX: use floor division -- on Python 3 `data.size / 2` is a float
    # and np.reshape raises for non-integer dimensions.
    uniq, idx = xdl.unique(data.reshape((data.size // 2, 2)),
                           itype=DataType.int32)
    uniq, idx = xdl.execute([uniq, idx])
    self.assertTrue((uniq == res_uniq).all())
    self.assertTrue((idx == res_idx).all())
def test_unique_cpu_1d(self):
    """Segmented unique on 1-D ids.

    Checks unique values, inverse indices, segment-sorted indices and
    segment offsets for int32 and int64 segment arrays, plus a segment
    list containing repeated boundaries (empty segments).
    """
    expected_uniq = np.array([0, 2, 1, 3])
    expected_idx = np.array([3, 2, 1, 2, 3, 2, 1, 0, 2, 3, 2, 2,
                             1, 0, 2, 3, 3, 1, 3, 1, 1, 2, 1, 0])
    expected_sidx = np.array([2, 6, 10, 0, 2, 6, 7, 8, 9, 10, 0, 1,
                              2, 3, 4, 5, 6, 9, 0, 1, 3, 6, 7, 8])
    expected_sseg = np.array([3, 10, 18, 24])

    def run_and_check(seg, itype, sidx_ref):
        # One unique() round-trip followed by the four output checks.
        uniq, idx, sidx, sseg = xdl.unique(input=data, segment=seg,
                                           itype=itype)
        uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])
        self.assertTrue((uniq == expected_uniq).all())
        self.assertTrue((idx == expected_idx).all())
        self.assertTrue((sidx == sidx_ref).all())
        self.assertTrue((sseg == expected_sseg).all())

    run_and_check(
        np.array([3, 5, 8, 10, 11, 12, 16, 18, 20, 22, 24], np.int32),
        DataType.int32, expected_sidx)
    run_and_check(
        np.array([3, 5, 8, 10, 11, 12, 16, 18, 20, 22, 24], np.int64),
        DataType.int64, expected_sidx)
    # Repeated boundaries create empty segments; sidx shifts accordingly
    # while uniq/idx/sseg stay the same.
    run_and_check(
        np.array([3, 5, 5, 8, 10, 11, 12, 12, 16, 18, 20, 22, 24], np.int32),
        DataType.int32,
        np.array([3, 8, 12, 0, 3, 8, 9, 10, 11, 12, 0, 1,
                  3, 4, 5, 6, 8, 11, 0, 1, 4, 8, 9, 10]))
def test_unique_cpu_1d(self):
    """Plain (unsegmented) unique on 1-D ids: check values and inverse map."""
    want_uniq = np.array([3, 1, 2, 0])
    want_idx = np.array([0, 1, 2, 1, 0, 1, 2, 3, 1, 0, 1, 1,
                         2, 3, 1, 0, 0, 2, 0, 2, 2, 1, 2, 3])
    uniq_op, idx_op = xdl.unique(data, itype=DataType.int32)
    uniq_out, idx_out = xdl.execute([uniq_op, idx_op])
    self.assertTrue((uniq_out == want_uniq).all())
    self.assertTrue((idx_out == want_idx).all())
def enqueue_sparse_list(self, name, sparse_list):
    """Register a list of sparse tensors under one logical name.

    Each tensor's ids are de-duplicated first; the unique ids and the
    inverse indices are written back onto the tensor before it is
    enqueued under a mangled per-element key. Finally the whole list is
    recorded in the offset map under the given name.
    """
    for index, tensor in enumerate(sparse_list):
        unique_ids, idx, sidx, sseg = xdl.unique(
            tensor.ids, tensor.segments, itype=DataType.int32)
        # NOTE(review): writes private fields directly -- presumably the
        # tensor exposes no setter; confirm against the tensor class.
        tensor._ids = unique_ids
        tensor._indices = idx
        self.enqueue_sparse("__" + name + "_" + str(index), tensor)
    self._regist_in_map(name, Offset("sparse_list", name, sparse_list))
def test_unique_gpu_2d(self):
    """Segmented unique over id pairs on the GPU device."""
    with xdl.device("GPU"):
        res_uniq = np.array([[1, 1], [1, 3], [2, 0], [2, 1], [3, 1], [3, 2]])
        res_idx = np.array([4, 3, 4, 2, 1, 0, 2, 1, 5, 5, 3, 2])
        segment = np.array([2, 2, 5, 6, 9, 12], np.int32)
        res_sidx = np.array([3, 2, 4, 2, 4, 5, 0, 5, 0, 2, 4, 5])
        res_sseg = np.array([1, 3, 6, 8, 10, 12])
        # BUGFIX: use floor division -- on Python 3 `data.size / 2` is a
        # float and np.reshape raises for non-integer dimensions.
        uniq, idx, sidx, sseg = xdl.unique(
            data.reshape((data.size // 2, 2)), segment,
            itype=DataType.int32)
        uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])
        self.assertTrue((uniq == res_uniq).all())
        self.assertTrue((idx == res_idx).all())
        self.assertTrue((sidx == res_sidx).all())
        self.assertTrue((sseg == res_sseg).all())
def test_unique_cpu_2d(self):
    """Segmented unique over id pairs on CPU, with locally defined data."""
    data = np.array([
        3, 1, 2, 1, 3, 1, 2, 0, 1, 3, 1, 1,
        2, 0, 1, 3, 3, 2, 3, 2, 2, 1, 2, 0
    ])
    res_uniq = np.array([[3, 2], [1, 1], [1, 3], [2, 0], [2, 1], [3, 1]])
    res_idx = np.array([5, 4, 5, 3, 2, 1, 3, 2, 0, 0, 4, 3])
    res_sidx = np.array([4, 5, 3, 2, 4, 2, 4, 5, 0, 5, 0, 2])
    res_sseg = np.array([2, 3, 5, 8, 10, 12])
    segment = np.array([2, 2, 5, 6, 9, 12], np.int32)
    # BUGFIX: use floor division -- on Python 3 `data.size / 2` is a float
    # and np.reshape raises for non-integer dimensions.
    uniq, idx, sidx, sseg = xdl.unique(data.reshape((data.size // 2, 2)),
                                       segment, itype=DataType.int32)
    uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])
    self.assertTrue((uniq == res_uniq).all())
    self.assertTrue((idx == res_idx).all())
    self.assertTrue((sidx == res_sidx).all())
    self.assertTrue((sseg == res_sseg).all())
def merged_embedding(name, sparse_inputs, initializer, emb_dim, feature_dim,
                     combiner='sum', vtype=VarType.Index, length=50,
                     reverse=False, batch_read=3000,
                     feature_add_probability=1.0, cbf=0, device='CPU',
                     **device_attr):
    """xdl embedding over merged sparse inputs, with device placement.

    Args:
      name: name for embedding, will be used for declaring variable on ps-plus
      sparse_inputs: a list/tuple of sparse tensors, or an already merged
        MergedSparseTensor
      initializer: initializer for the weights
      emb_dim: embedding dimension (multiplied by the number of inputs when
        a list is merged here)
      feature_dim: sparse input dimension, for pre-allocating memory
      combiner: reduce operator, one of 'sum' | 'mean' | 'tile'
      vtype: variable type on ps-plus
      length: tile length, used only when combiner == 'tile'
      reverse: tile direction flag, used only when combiner == 'tile'
      batch_read: batch-read hint forwarded to variable_info
      feature_add_probability: save_ratio forwarded to variable_info
      cbf: bloom-filter setting forwarded to variable_info
      device: device string for unique/combine ops
      **device_attr: extra device placement attributes

    Returns:
      a tensor representing the combined embedding result

    Raises:
      Exception: if combiner is not one of the supported values
    """
    import xdl.python.framework.variable as variable
    with variable.variable_info(batch_read=batch_read,
                                save_ratio=feature_add_probability,
                                bloom_filter=cbf):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)
    if isinstance(sparse_inputs, (list, tuple)):
        merged_sparse_inputs = merge_sparse(sparse_inputs)
        # The merged lookup concatenates one emb_dim slice per input.
        emb_dim *= len(sparse_inputs)
    else:
        assert isinstance(sparse_inputs, MergedSparseTensor)
        merged_sparse_inputs = sparse_inputs
    if merged_sparse_inputs.has_unique_ids():
        # Producer already de-duplicated: reuse its ids/index/segment data.
        unique_ids = merged_sparse_inputs.ids
        idx = merged_sparse_inputs.indices
        sidx = merged_sparse_inputs.sidx
        sseg = merged_sparse_inputs.sseg
    else:
        with xdl.device(device, **device_attr):
            # BUGFIX: `ids` was an undefined free name here; the ids to
            # de-duplicate come from the merged sparse input.
            unique_ids, idx, sidx, sseg = xdl.unique(
                merged_sparse_inputs.ids, merged_sparse_inputs.groups,
                itype=DataType.int32)
    embeddings = var.gather(unique_ids)
    global _EMBEDDING_TENSOR
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    if combiner == 'sum':
        embeddings = embedding_ops.merged_ksum(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups,
            sidx, sseg, device, **device_attr)
    elif combiner == 'mean':
        embeddings = embedding_ops.merged_kmean(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups,
            sidx, sseg, device, **device_attr)
    elif combiner == 'tile':
        embeddings = embedding_ops.merged_tile(
            embeddings, idx, merged_sparse_inputs.values,
            merged_sparse_inputs.segments, merged_sparse_inputs.groups,
            length, reverse, device, **device_attr)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))
    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                             length, embeddings)
    set_embedding_info([var], emb_info)
    return embeddings
def embedding(name, sparse_input, initializer, emb_dim, feature_dim,
              combiner='sum', vtype=VarType.Index, length=50, reverse=False,
              batch_read=3000, feature_add_probability=1.0, cbf=0,
              device='CPU', statis_list=None, statis_decay=0.07,
              statis_decay_period=100, labels=None, save=True,
              **device_attr):
    """xdl embedding

    Args:
      name: name for embedding, will be used for declaring variable on ps-plus
      sparse_input: a sparse tensor represent input data
      initializer: intializer for the variable on ps-plus
      emb_dim: embedding dimension
      feature_dim: sparse input dimension, for pre-allocate memory
      combiner: reduce operator, support sum|mean (also 'tile', see below)
      vtype: variable type on ps-plus
      length, reverse: tile parameters, used only when combiner == 'tile'
      batch_read, feature_add_probability, cbf, save: forwarded to
        variable_info for the ps-plus variable declaration
      device, **device_attr: device placement for unique/combine ops
      statis_list: optional list of statistics types; when given, one extra
        non-trainable [feature_dim, 1] variable per type is declared under
        the same name and per-id statistics are computed (requires labels)
      statis_decay, statis_decay_period: statistics decay settings
      labels: labels tensor, required when statis_list is not None
    Returns:
      a tensor represent embedding result; when statis_list is not None,
      returns (embeddings, statis_results)
    Raises:
      Exception: if combiner is unrecognized
    """
    # Track first-seen embedding names in declaration order.
    global EMBEDDING_LIST, EMBEDDING_SET
    if name not in EMBEDDING_SET:
        EMBEDDING_SET.add(name)
        EMBEDDING_LIST.append(name)
    import xdl.python.framework.variable as variable
    with variable.variable_info(batch_read=batch_read,
                                save_ratio=feature_add_probability,
                                bloom_filter=cbf,
                                save="true" if save else "false"):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)
        # NOTE(review): statis variables reconstructed as declared inside the
        # same variable_info scope -- confirm against the original layout.
        if statis_list is not None:
            statis_vars = []
            for statis_type in statis_list:
                # Non-trainable per-feature statistics column sharing `name`.
                statis_var = variable.Variable(
                    name=name,
                    dtype=DataType.float,
                    shape=[feature_dim, 1],
                    initializer=xdl.Zeros(),
                    vtype=vtype,
                    trainable=False,
                    statis_type=statis_type,
                    statis_decay=statis_decay,
                    statis_decay_period=statis_decay_period)
                statis_vars.append(statis_var)
    if sparse_input.has_unique_ids():
        # Producer already de-duplicated ids; identity_op keeps a graph node.
        unique_ids = xdl.identity_op(sparse_input.ids)
        idx = sparse_input.indices
        embeddings = var.gather(unique_ids)
        sidx = sparse_input.sidx
        sseg = sparse_input.sseg
    else:
        with xdl.device(device, **device_attr):
            unique_ids, idx, sidx, sseg = xdl.unique(sparse_input.ids,
                                                     sparse_input.segments,
                                                     itype=DataType.int32)
        embeddings = var.gather(unique_ids)
    if statis_list is not None:
        # Statistics need labels and the global step for decay bookkeeping.
        assert labels is not None
        from xdl.python.training.training_utils import get_global_step
        global_step = get_global_step()
        statis_results = []
        for statis_var in statis_vars:
            statis_result = statis_var.statis(sparse_input.ids, idx,
                                              sparse_input.segments, sidx,
                                              sseg, labels,
                                              global_step.value)
            statis_results.append(statis_result)
    global _EMBEDDING_TENSOR
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    import numpy as np
    if combiner == 'sum':
        embeddings = embedding_ops.ksum(
            embeddings, idx, sparse_input.values, sparse_input.segments,
            sidx, sseg, device, **device_attr)
    elif combiner == 'mean':
        embeddings = embedding_ops.kmean(
            embeddings, idx, sparse_input.values, sparse_input.segments,
            sidx, sseg, device, **device_attr)
    elif combiner == 'tile':
        # Values deliberately replaced with an empty array for tile.
        embeddings = embedding_ops.tile(
            embeddings, idx,
            np.array([], dtype=np.float32),  #sparse_input.values,
            sparse_input.segments, length, reverse, device, **device_attr)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))
    if sparse_input.shape is not None and len(sparse_input.shape) > 0:
        # Batch dimension is known: fix the static output shape.
        embeddings.set_shape([sparse_input.shape[0], emb_dim]);
    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                             length, embeddings)
    set_embedding_info([var], emb_info)
    if statis_list is not None:
        return embeddings, statis_results
    return embeddings
def embedding(name, sparse_input, initializer, emb_dim, feature_dim,
              combiner='sum', vtype=VarType.Index, length=50, reverse=False,
              batch_read=3000, feature_add_probability=1.0):
    """xdl embedding for a single sparse input.

    Declares a [feature_dim, emb_dim] variable on ps-plus, gathers rows for
    the (de-duplicated) input ids, and reduces them with the requested
    combiner.

    Args:
      name: variable name declared on ps-plus
      sparse_input: a sparse tensor representing input data
      initializer: initializer for the ps-plus variable
      emb_dim: embedding dimension
      feature_dim: sparse input dimension, for pre-allocating memory
      combiner: reduce operator, one of 'sum' | 'mean' | 'tile'
      vtype: variable type on ps-plus
      length, reverse: tile parameters, used only when combiner == 'tile'
      batch_read: batch-read hint forwarded to variable_info
      feature_add_probability: save_ratio forwarded to gather

    Returns:
      a tensor representing the embedding result

    Raises:
      Exception: if combiner is unrecognized
    """
    import xdl.python.framework.variable as variable
    with variable.variable_info(batch_read=batch_read):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)
    # Obtain unique ids and the inverse index map, either precomputed by the
    # producer or computed here; then do a single gather.
    if sparse_input.has_unique_ids():
        unique_ids, idx = sparse_input.ids, sparse_input.indices
    else:
        unique_ids, idx = xdl.unique(sparse_input.ids, itype=DataType.int32)
    embeddings = var.gather(unique_ids, save_ratio=feature_add_probability)
    global _EMBEDDING_TENSOR
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    combiners = {
        'sum': lambda e: embedding_ops.ksum(
            e, idx, sparse_input.values, sparse_input.segments),
        'mean': lambda e: embedding_ops.kmean(
            e, idx, sparse_input.values, sparse_input.segments),
        'tile': lambda e: embedding_ops.tile(
            e, idx, sparse_input.values, sparse_input.segments,
            length, reverse),
    }
    if combiner not in combiners:
        raise Exception("Unrecognized combiner:" + str(combiner))
    embeddings = combiners[combiner](embeddings)
    if sparse_input.shape is not None and len(sparse_input.shape) > 0:
        embeddings.set_shape([sparse_input.shape[0], emb_dim])
    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                             embeddings)
    set_embedding_info([var], emb_info)
    return embeddings