Exemple #1
0
def merged_embedding(name,
                     sparse_inputs,
                     initializer,
                     emb_dim,
                     feature_dim,
                     combiner='sum',
                     vtype=VarType.Index,
                     length=50,
                     reverse=False,
                     feature_add_probability=1.0):
    """xdl merged embedding
       Args:
         name: name for embedding, will be used for declaring variable on ps-plus
         sparse_inputs: a list of sparse tensors represent input data
         initializer: intializer for the weights
         emb_dim: embedding dimension
         feature_dim: sparse input dimension, for pre-allocate memory
         combiner: reduce operator, support sum|mean|tile
         vtype: variable type handed to the Variable constructor
         length: forwarded to merged_tile, only read when combiner is 'tile'
         reverse: forwarded to merged_tile, only read when combiner is 'tile'
         feature_add_probability: forwarded to var.gather as save_ratio
       Returns:
         a tensor represent embedding result
       Raises:
         Exception: when combiner is not one of sum|mean|tile
    """
    import xdl.python.framework.variable as variable
    var = variable.Variable(name=name,
                            dtype=DataType.float,
                            shape=[feature_dim, emb_dim],
                            initializer=initializer,
                            vtype=vtype,
                            trainable=True)
    merged_sparse_inputs = merge_sparse(sparse_inputs)
    ids = merged_sparse_inputs.ids
    unique_ids, idx = xdl.unique(ids, itype=DataType.int32)
    # feature_add_probability used to be read here without being defined in
    # this function; it is now an explicit keyword argument.
    embeddings = var.gather(unique_ids, save_ratio=feature_add_probability)
    global _EMBEDDING_TENSOR
    # Remember which variable produced the gathered tensor.
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    if combiner == 'sum':
        embeddings = embedding_ops.merged_ksum(embeddings, idx,
                                               merged_sparse_inputs.values,
                                               merged_sparse_inputs.segments,
                                               merged_sparse_inputs.groups)
    elif combiner == 'mean':
        embeddings = embedding_ops.merged_kmean(embeddings, idx,
                                                merged_sparse_inputs.values,
                                                merged_sparse_inputs.segments,
                                                merged_sparse_inputs.groups)
    elif combiner == 'tile':
        embeddings = embedding_ops.merged_tile(embeddings, idx,
                                               merged_sparse_inputs.values,
                                               merged_sparse_inputs.segments,
                                               merged_sparse_inputs.groups,
                                               length, reverse)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))

    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                             embeddings)
    set_embedding_info([var], emb_info)
    return embeddings
Exemple #2
0
 def test_unique_cpu_2d(self):
     """Check xdl.unique on `data` viewed as rows of two columns.

     `data` is not defined in this method; it is taken from an enclosing
     scope. The expected unique rows and per-row indices are hard-coded.
     """
     res_uniq = np.array([[3, 1], [2, 1], [2, 0], [1, 3], [1, 1], [3, 2]])
     res_idx = np.array([0, 1, 0, 2, 3, 4, 2, 3, 5, 5, 1, 2])
     # Floor division keeps the row count an integer; with `/` Python 3
     # produces a float, which numpy rejects as a reshape dimension.
     uniq, idx = xdl.unique(data.reshape((data.size // 2, 2)),
                            itype=DataType.int32)
     uniq, idx = xdl.execute([uniq, idx])
     self.assertTrue((uniq == res_uniq).all())
     self.assertTrue((idx == res_idx).all())
    def test_unique_cpu_1d(self):
        """Check segmented xdl.unique on the 1-d `data` array.

        `data` is taken from an enclosing scope. Three scenarios run in
        order: int32 segments, int64 segments, and int32 segments that
        contain repeated boundary values. Only the expected `sidx` output
        differs for the last scenario.
        """
        expected_uniq = np.array([0, 2, 1, 3])
        expected_idx = np.array([
            3, 2, 1, 2, 3, 2, 1, 0, 2, 3, 2, 2, 1, 0, 2, 3, 3, 1, 3, 1, 1, 2,
            1, 0
        ])
        expected_sseg = np.array([3, 10, 18, 24])
        sidx_plain = np.array([
            2, 6, 10, 0, 2, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 9, 0, 1, 3, 6,
            7, 8
        ])
        sidx_repeated = np.array([
            3, 8, 12, 0, 3, 8, 9, 10, 11, 12, 0, 1, 3, 4, 5, 6, 8, 11, 0, 1, 4,
            8, 9, 10
        ])

        bounds = [3, 5, 8, 10, 11, 12, 16, 18, 20, 22, 24]
        bounds_repeated = [3, 5, 5, 8, 10, 11, 12, 12, 16, 18, 20, 22, 24]

        # (segment boundaries, numpy dtype, xdl index type, expected sidx)
        scenarios = (
            (bounds, np.int32, DataType.int32, sidx_plain),
            (bounds, np.int64, DataType.int64, sidx_plain),
            (bounds_repeated, np.int32, DataType.int32, sidx_repeated),
        )
        for boundaries, np_dtype, index_type, expected_sidx in scenarios:
            segment = np.array(boundaries, np_dtype)
            uniq, idx, sidx, sseg = xdl.unique(input=data,
                                               segment=segment,
                                               itype=index_type)
            uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])
            self.assertTrue((uniq == expected_uniq).all())
            self.assertTrue((idx == expected_idx).all())
            self.assertTrue((sidx == expected_sidx).all())
            self.assertTrue((sseg == expected_sseg).all())
Exemple #4
0
 def test_unique_cpu_1d(self):
     """Check xdl.unique (no segments) on the 1-d `data` array.

     `data` is taken from an enclosing scope; expected outputs are
     hard-coded below.
     """
     expected_uniq = np.array([3, 1, 2, 0])
     expected_idx = np.array([
         0, 1, 2, 1, 0, 1, 2, 3, 1, 0, 1, 1, 2, 3, 1, 0, 0, 2, 0, 2, 2, 1,
         2, 3
     ])
     uniq_op, idx_op = xdl.unique(data, itype=DataType.int32)
     uniq, idx = xdl.execute([uniq_op, idx_op])
     for actual, expected in ((uniq, expected_uniq), (idx, expected_idx)):
         self.assertTrue((actual == expected).all())
Exemple #5
0
 def enqueue_sparse_list(self, name, sparse_list):
     """Enqueue every sparse tensor of `sparse_list` and register the list.

     For each tensor, xdl.unique is run over its ids and segments; the
     tensor's `_ids` and `_indices` are overwritten with the first two
     outputs and the tensor is enqueued under "__<name>_<position>".
     Finally the whole list is registered under `name` through
     `_regist_in_map`.
     """
     for position, tensor in enumerate(sparse_list):
         # The last two outputs of xdl.unique are not used here.
         deduped_ids, indices, _sidx, _sseg = xdl.unique(
             tensor.ids,
             tensor.segments,
             itype=DataType.int32)
         tensor._ids = deduped_ids
         tensor._indices = indices
         queue_key = "__" + name + "_" + str(position)
         self.enqueue_sparse(queue_key, tensor)
     offset = Offset("sparse_list", name, sparse_list)
     self._regist_in_map(name, offset)
 def test_unique_gpu_2d(self):
     """Check segmented xdl.unique on two-column `data` under a GPU scope.

     `data` is taken from an enclosing scope. Everything, including the
     assertions, runs inside `xdl.device("GPU")`.
     """
     with xdl.device("GPU"):
         res_uniq = np.array([[1, 1], [1, 3], [2, 0], [2, 1], [3, 1],
                              [3, 2]])
         res_idx = np.array([4, 3, 4, 2, 1, 0, 2, 1, 5, 5, 3, 2])
         segment = np.array([2, 2, 5, 6, 9, 12], np.int32)
         res_sidx = np.array([3, 2, 4, 2, 4, 5, 0, 5, 0, 2, 4, 5])
         res_sseg = np.array([1, 3, 6, 8, 10, 12])
         # Floor division keeps the row count an integer; with `/` Python 3
         # produces a float, which numpy rejects as a reshape dimension.
         uniq, idx, sidx, sseg = xdl.unique(
             data.reshape((data.size // 2, 2)),
             segment,
             itype=DataType.int32)
         uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])
         self.assertTrue((uniq == res_uniq).all())
         self.assertTrue((idx == res_idx).all())
         self.assertTrue((sidx == res_sidx).all())
         self.assertTrue((sseg == res_sseg).all())
    def test_unique_cpu_2d(self):
        """Check segmented xdl.unique on a local array viewed as two columns.

        The 24-element `data` array is reshaped into rows of two values and
        the four outputs of xdl.unique are compared with hard-coded results.
        """
        data = np.array([
            3, 1, 2, 1, 3, 1, 2, 0, 1, 3, 1, 1, 2, 0, 1, 3, 3, 2, 3, 2, 2, 1,
            2, 0
        ])
        res_uniq = np.array([[3, 2], [1, 1], [1, 3], [2, 0], [2, 1], [3, 1]])
        res_idx = np.array([5, 4, 5, 3, 2, 1, 3, 2, 0, 0, 4, 3])
        res_sidx = np.array([4, 5, 3, 2, 4, 2, 4, 5, 0, 5, 0, 2])
        res_sseg = np.array([2, 3, 5, 8, 10, 12])

        segment = np.array([2, 2, 5, 6, 9, 12], np.int32)
        # Floor division keeps the row count an integer; with `/` Python 3
        # produces a float, which numpy rejects as a reshape dimension.
        uniq, idx, sidx, sseg = xdl.unique(data.reshape((data.size // 2, 2)),
                                           segment,
                                           itype=DataType.int32)
        uniq, idx, sidx, sseg = xdl.execute([uniq, idx, sidx, sseg])

        self.assertTrue((uniq == res_uniq).all())
        self.assertTrue((idx == res_idx).all())
        self.assertTrue((sidx == res_sidx).all())
        self.assertTrue((sseg == res_sseg).all())
Exemple #8
0
def merged_embedding(name, sparse_inputs, initializer, emb_dim, feature_dim,
                     combiner='sum', vtype=VarType.Index, length=50, reverse=False,
                     batch_read=3000, feature_add_probability=1.0, cbf=0, device='CPU', **device_attr):
    """xdl merged embedding
       Args:
         name: name for embedding, will be used for declaring variable on ps-plus
         sparse_inputs: a list/tuple of sparse tensors (merged here with
           merge_sparse) or an already built MergedSparseTensor
         initializer: intializer for the weights
         emb_dim: embedding dimension
         feature_dim: sparse input dimension, for pre-allocate memory
         combiner: reduce operator, support sum|mean|tile
         vtype: variable type handed to the Variable constructor
         length: forwarded to merged_tile and to EmbeddingInfo
         reverse: forwarded to merged_tile
         batch_read: forwarded to variable.variable_info as batch_read
         feature_add_probability: forwarded to variable.variable_info as save_ratio
         cbf: forwarded to variable.variable_info as bloom_filter
         device: device name used for xdl.unique and for the combiner op
         **device_attr: extra attributes forwarded together with device
       Returns:
         a tensor represent embedding result
       Raises:
         Exception: when combiner is not one of sum|mean|tile
    """
    import xdl.python.framework.variable as variable
    with variable.variable_info(batch_read=batch_read, save_ratio=feature_add_probability, bloom_filter=cbf):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)
    if isinstance(sparse_inputs, (list, tuple)):
        merged_sparse_inputs = merge_sparse(sparse_inputs)
        # The variable above keeps the per-input emb_dim; from here on
        # emb_dim (recorded in EmbeddingInfo) is scaled by the input count.
        emb_dim *= len(sparse_inputs)
    else:
        assert(isinstance(sparse_inputs, MergedSparseTensor))
        merged_sparse_inputs = sparse_inputs
    if merged_sparse_inputs.has_unique_ids():
        unique_ids = merged_sparse_inputs.ids
        idx = merged_sparse_inputs.indices
        sidx = merged_sparse_inputs.sidx
        sseg = merged_sparse_inputs.sseg
    else:
        with xdl.device(device, **device_attr):
            # The ids come from the merged tensor; the bare name `ids` that
            # used to be passed here was never defined in this function.
            unique_ids, idx, sidx, sseg = xdl.unique(merged_sparse_inputs.ids,
                                                     merged_sparse_inputs.groups,
                                                     itype=DataType.int32)

    embeddings = var.gather(unique_ids)
    global _EMBEDDING_TENSOR
    # Remember which variable produced the gathered tensor.
    _EMBEDDING_TENSOR[embeddings] = var
    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    if combiner == 'sum':
        embeddings = embedding_ops.merged_ksum(
            embeddings,
            idx,
            merged_sparse_inputs.values,
            merged_sparse_inputs.segments,
            merged_sparse_inputs.groups,
            sidx,
            sseg,
            device, **device_attr)
    elif combiner == 'mean':
        embeddings = embedding_ops.merged_kmean(
            embeddings,
            idx,
            merged_sparse_inputs.values,
            merged_sparse_inputs.segments,
            merged_sparse_inputs.groups,
            sidx,
            sseg,
            device, **device_attr)
    elif combiner == 'tile':
        embeddings = embedding_ops.merged_tile(
            embeddings,
            idx,
            merged_sparse_inputs.values,
            merged_sparse_inputs.segments,
            merged_sparse_inputs.groups,
            length,
            reverse,
            device, **device_attr)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))

    emb_info = EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var, length, embeddings)
    set_embedding_info([var], emb_info)
    return embeddings
Exemple #9
0
def embedding(name, sparse_input, initializer, emb_dim, feature_dim,
              combiner='sum',
              vtype=VarType.Index,
              length=50,
              reverse=False,
              batch_read=3000,
              feature_add_probability=1.0,
              cbf=0,
              device='CPU',
              statis_list=None,
              statis_decay=0.07,
              statis_decay_period=100,
              labels=None,
              save=True,
              **device_attr):
    """xdl embedding
       Args:
         name: name for embedding, will be used for declaring variable on ps-plus
         sparse_input: a sparse tensor represent input data
         initializer: intializer for the variable on ps-plus
         emb_dim: embedding dimension
         feature_dim: sparse input dimension, for pre-allocate memory
         combiner: reduce operator, support sum|mean|tile
         vtype: variable type handed to every Variable created here
         length: forwarded to the tile op and to EmbeddingInfo
         reverse: forwarded to the tile op
         batch_read: forwarded to variable.variable_info as batch_read
         feature_add_probability: forwarded to variable.variable_info as save_ratio
         cbf: forwarded to variable.variable_info as bloom_filter
         device: device name used for xdl.unique and for the combiner op
         statis_list: optional list of statis types; one extra non-trainable
           variable is created per entry
         statis_decay: forwarded to each statis variable
         statis_decay_period: forwarded to each statis variable
         labels: forwarded to statis_var.statis, must not be None when
           statis_list is given
         save: forwarded to variable.variable_info as the string "true"/"false"
         **device_attr: extra attributes forwarded together with device
       Returns:
         a tensor represent embedding result, or the tuple
         (embeddings, statis_results) when statis_list is given
       Raises:
         Exception: when combiner is not one of sum|mean|tile
    """

    global EMBEDDING_LIST, EMBEDDING_SET
    if name not in EMBEDDING_SET:
        # First use of this name: record it once in both module registries.
        EMBEDDING_SET.add(name)
        EMBEDDING_LIST.append(name)

    import xdl.python.framework.variable as variable
    want_statis = statis_list is not None
    save_flag = "true" if save else "false"
    with variable.variable_info(batch_read=batch_read,
                                save_ratio=feature_add_probability,
                                bloom_filter=cbf,
                                save=save_flag):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)
        if want_statis:
            # One [feature_dim, 1] zero-initialised variable per statis type,
            # declared under the same name as the embedding variable.
            statis_vars = [
                variable.Variable(name=name,
                                  dtype=DataType.float,
                                  shape=[feature_dim, 1],
                                  initializer=xdl.Zeros(),
                                  vtype=vtype,
                                  trainable=False,
                                  statis_type=statis_type,
                                  statis_decay=statis_decay,
                                  statis_decay_period=statis_decay_period)
                for statis_type in statis_list
            ]

    if sparse_input.has_unique_ids():
        # The input already carries unique ids and index tensors.
        unique_ids = xdl.identity_op(sparse_input.ids)
        idx = sparse_input.indices
        embeddings = var.gather(unique_ids)
        sidx, sseg = sparse_input.sidx, sparse_input.sseg
    else:
        with xdl.device(device, **device_attr):
            unique_ids, idx, sidx, sseg = xdl.unique(sparse_input.ids,
                                                     sparse_input.segments,
                                                     itype=DataType.int32)
        embeddings = var.gather(unique_ids)

    if want_statis:
        assert labels is not None
        from xdl.python.training.training_utils import get_global_step
        global_step = get_global_step()
        statis_results = [
            statis_var.statis(sparse_input.ids, idx, sparse_input.segments,
                              sidx, sseg, labels, global_step.value)
            for statis_var in statis_vars
        ]

    global _EMBEDDING_TENSOR
    # Remember which variable produced the gathered tensor.
    _EMBEDDING_TENSOR[embeddings] = var

    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    import numpy as np
    if combiner in ('sum', 'mean'):
        # ksum and kmean are called with exactly the same arguments.
        reduce_op = embedding_ops.ksum if combiner == 'sum' else embedding_ops.kmean
        embeddings = reduce_op(embeddings,
                               idx,
                               sparse_input.values,
                               sparse_input.segments,
                               sidx,
                               sseg,
                               device, **device_attr)
    elif combiner == 'tile':
        # tile gets an empty float32 array in place of sparse_input.values.
        empty_values = np.array([], dtype=np.float32)
        embeddings = embedding_ops.tile(embeddings,
                                        idx,
                                        empty_values,
                                        sparse_input.segments,
                                        length,
                                        reverse,
                                        device, **device_attr)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))

    input_shape = sparse_input.shape
    if input_shape is not None and len(input_shape) > 0:
        embeddings.set_shape([input_shape[0], emb_dim])

    set_embedding_info(
        [var],
        EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var, length, embeddings))
    return (embeddings, statis_results) if want_statis else embeddings
Exemple #10
0
def embedding(name,
              sparse_input,
              initializer,
              emb_dim,
              feature_dim,
              combiner='sum',
              vtype=VarType.Index,
              length=50,
              reverse=False,
              batch_read=3000,
              feature_add_probability=1.0):
    """xdl embedding
       Args:
         name: name for embedding, will be used for declaring variable on ps-plus
         sparse_input: a sparse tensor represent input data
         initializer: intializer for the variable on ps-plus
         emb_dim: embedding dimension
         feature_dim: sparse input dimension, for pre-allocate memory
         combiner: reduce operator, support sum|mean|tile
         vtype: variable type handed to the Variable constructor
         length: forwarded to the tile op
         reverse: forwarded to the tile op
         batch_read: forwarded to variable.variable_info as batch_read
         feature_add_probability: forwarded to var.gather as save_ratio
       Returns:
         a tensor represent embedding result
       Raises:
         Exception: when combiner is not one of sum|mean|tile
    """
    import xdl.python.framework.variable as variable
    with variable.variable_info(batch_read=batch_read):
        var = variable.Variable(name=name,
                                dtype=DataType.float,
                                shape=[feature_dim, emb_dim],
                                initializer=initializer,
                                vtype=vtype,
                                trainable=True)

    if sparse_input.has_unique_ids():
        # The input already carries unique ids and their index tensor.
        unique_ids, idx = sparse_input.ids, sparse_input.indices
    else:
        unique_ids, idx = xdl.unique(sparse_input.ids, itype=DataType.int32)
    embeddings = var.gather(unique_ids, save_ratio=feature_add_probability)

    global _EMBEDDING_TENSOR
    # Remember which variable produced the gathered tensor.
    _EMBEDDING_TENSOR[embeddings] = var

    import xdl.python.sparse_engine.embedding_ops as embedding_ops
    if combiner in ('sum', 'mean'):
        # ksum and kmean are called with exactly the same arguments.
        reduce_op = embedding_ops.ksum if combiner == 'sum' else embedding_ops.kmean
        embeddings = reduce_op(embeddings, idx, sparse_input.values,
                               sparse_input.segments)
    elif combiner == 'tile':
        embeddings = embedding_ops.tile(embeddings, idx, sparse_input.values,
                                        sparse_input.segments, length, reverse)
    else:
        raise Exception("Unrecognized combiner:" + str(combiner))

    input_shape = sparse_input.shape
    if input_shape is not None and len(input_shape) > 0:
        embeddings.set_shape([input_shape[0], emb_dim])

    set_embedding_info(
        [var],
        EmbeddingInfo(name, feature_dim, emb_dim, combiner, None, var,
                      embeddings))
    return embeddings