Example #1
def ff(inputs, config):
    outputs = ad.array_reshape_op(inputs, [-1, config.d_model])
    outputs = dense(outputs,
                    config.d_model,
                    config.d_ff,
                    activation=ad.relu_op)
    outputs = dense(outputs, config.d_ff, config.d_model)
    outputs = ad.array_reshape_op(outputs,
                                  [config.batch_size, -1, config.d_model])
    outputs = outputs + inputs
    outputs = layer_norm(outputs, config.d_model)
    return outputs
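For orientation: ignoring any bias terms handled inside the dense helper, the block above is the standard Transformer position-wise feed-forward sub-layer with a residual connection and layer normalization,

\mathrm{FFN}(x) = \mathrm{ReLU}(x W_1)\, W_2, \qquad \mathrm{output} = \mathrm{LayerNorm}\bigl(x + \mathrm{FFN}(x)\bigr),

where W_1 \in \mathbb{R}^{d_{\mathrm{model}} \times d_{\mathrm{ff}}} and W_2 \in \mathbb{R}^{d_{\mathrm{ff}} \times d_{\mathrm{model}}}.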
Example #2
def wdl_adult(X_deep, X_wide, y_):
    lr = 5 / 128
    dim_wide = 809
    dim_deep = 68

    W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    #deep
    Embedding = []
    X_deep_input = None

    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)

    for i in range(4):
        now = ad.array_reshape_op(X_deep[i + 8], (-1, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])

    opt = optimizer.SGDOptimizer(learning_rate=lr)
    train_op = opt.minimize(loss)

    return loss, prediction, y_, train_op
Example #3
def alexnet(x, y_):
    '''
    AlexNet model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''

    print('Building AlexNet model...')
    x = ad.array_reshape_op(x, [-1, 1, 28, 28])
    x = conv_bn_relu_pool(x,
                          1,
                          32,
                          'alexnet_conv1',
                          with_relu=True,
                          with_pool=True)
    x = conv_bn_relu_pool(x,
                          32,
                          64,
                          'alexnet_conv2',
                          with_relu=True,
                          with_pool=True)
    x = conv_bn_relu_pool(x,
                          64,
                          128,
                          'alexnet_conv3',
                          with_relu=True,
                          with_pool=False)
    x = conv_bn_relu_pool(x,
                          128,
                          256,
                          'alexnet_conv4',
                          with_relu=True,
                          with_pool=False)
    x = conv_bn_relu_pool(x,
                          256,
                          256,
                          'alexnet_conv5',
                          with_relu=False,
                          with_pool=True)
    x = ad.array_reshape_op(x, (-1, 256 * 3 * 3))
    x = fc(x, (256 * 3 * 3, 1024), name='alexnet_fc1', with_relu=True)
    x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
    y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
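As a usage note, here is a minimal sketch of wiring such a model function into a training step, following the Variable/Executor/SGDOptimizer pattern shown in Examples #2, #17 and #20 of this listing; the placeholder names, learning rate, and the random MNIST-shaped batch below are illustrative assumptions, not part of the source project.

import numpy as np

x = ad.Variable(name="x")    # flattened images, shape (N, 784)
y_ = ad.Variable(name="y_")  # one-hot labels, shape (N, 10)
loss, y = alexnet(x, y_)

opt = optimizer.SGDOptimizer(learning_rate=0.01)  # assumed learning rate
train_op = opt.minimize(loss)
executor = ad.Executor([loss, y, train_op], ctx=None)  # or a GPU context, as in Example #20

# One step on random data, just to show the expected feed_dict shapes.
x_val = np.random.normal(scale=0.1, size=(128, 784)).astype(np.float32)
y_val = np.eye(10, dtype=np.float32)[np.random.randint(0, 10, size=128)]
loss_val, y_pred, _ = executor.run(feed_dict={x: x_val, y_: y_val})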
Example #4
File: CNN.py Project: sj1104/Het
def fc(x, shape):
    weight = init.random_normal(shape=shape, stddev=0.1)
    bias = init.random_normal(shape=shape[-1:], stddev=0.1)
    x = ad.array_reshape_op(x, (-1, shape[0]))
    x = ad.matmul_op(x, weight)
    y = x + ad.broadcastto_op(bias, x)
    return y
Example #5
    def version_1(cls, node, tensor_dict, **kwargs):
        x = tensor_dict[node.input_tensor_names[0]]
        output_shape = tensor_dict[node.input_tensor_names[1]]

        y = ad.array_reshape_op(x, output_shape)
        tensor_dict[node.output_tensor_names[0]] = y
        return y
Example #6
def dc_criteo(dense_input, sparse_input, y_):

    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    ## dc_model
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    input_dim = 26 * 8 + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x,
                                         input_dim,
                                         hidden_dim,
                                         num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1],
                            stddev=0.1,
                            name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #7
    def transpose_for_scores(input_tensor):
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])

        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor
Example #8
def dfm_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # FM
    Embedding1 = init.random_normal([feature_dimension, 1],
                                    stddev=0.01,
                                    name="fst_order_embedding",
                                    ctx=ndarray.cpu(0))
    FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    sparse_1dim_input = ad.embedding_lookup_op(Embedding1,
                                               sparse_input,
                                               ctx=ndarray.cpu(0))
    fm_dense_part = ad.matmul_op(dense_input, FM_W)
    fm_sparse_part = ad.reduce_sum_op(sparse_1dim_input, axes=1)
    """ fst order output"""
    y1 = fm_dense_part + fm_sparse_part

    Embedding2 = init.random_normal([feature_dimension, embedding_size],
                                    stddev=0.01,
                                    name="snd_order_embedding",
                                    ctx=ndarray.cpu(0))
    sparse_2dim_input = ad.embedding_lookup_op(Embedding2,
                                               sparse_input,
                                               ctx=ndarray.cpu(0))
    sparse_2dim_sum = ad.reduce_sum_op(sparse_2dim_input, axes=1)
    sparse_2dim_sum_square = ad.mul_op(sparse_2dim_sum, sparse_2dim_sum)

    sparse_2dim_square = ad.mul_op(sparse_2dim_input, sparse_2dim_input)
    sparse_2dim_square_sum = ad.reduce_sum_op(sparse_2dim_square, axes=1)
    sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
    sparse_2dim_half = sparse_2dim * 0.5
    """snd order output"""
    y2 = ad.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

    #DNN
    flatten = ad.array_reshape_op(sparse_2dim_input, (-1, 26 * embedding_size))
    W1 = init.random_normal([26 * embedding_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = y1 + y2
    y = y4 + y3
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
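The sparse_2dim_sum_square / sparse_2dim_square_sum pair above is the usual factorization-machine trick for the second-order term: writing v_i for the looked-up embedding row of field i and k for the embedding size, the code evaluates

y_2 = \frac{1}{2} \sum_{f=1}^{k} \Bigl[ \bigl(\sum_{i=1}^{26} v_{i,f}\bigr)^2 - \sum_{i=1}^{26} v_{i,f}^2 \Bigr],

i.e. the sum of all pairwise embedding interactions computed in O(26k) rather than O(26^2 k).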
Example #9
    def decode(self, ys, memory, src_masks):
        decoder_inputs = ys

        # embedding
        dec = ad.embedding_lookup_op(self.embeddings,
                                     decoder_inputs)  # (N, T2, d_model)
        dec = dec * self.hp.d_model**0.5  # scale

        dec += positional_encoding(
            dec, (self.hp.batch_size, self.hp.maxlen2 - 1, self.hp.d_model),
            self.hp.maxlen2)
        dec = dropout(dec, self.hp.dropout_rate)

        # Blocks
        for i in range(self.hp.num_blocks):
            # Masked self-attention (Note that causality is True at this time)
            dec = multihead_attention(
                queries=dec,
                keys=dec,
                values=dec,
                config=self.hp,
                attention_mask=decoder_inputs,
                causality=True,
            )
            # Vanilla attention
            dec = multihead_attention(
                queries=dec,
                keys=memory,
                values=memory,
                config=self.hp,
                attention_mask=src_masks,
                causality=False,
            )
            ### Feed Forward
            dec = ff(dec, config=self.hp)

        dec = ad.array_reshape_op(dec,
                                  [-1, self.hp.d_model])  # (N * T, d_model)
        logits = ad.array_reshape_op(
            ad.matmul_op(dec, self.embeddings, trans_B=True),
            [self.hp.batch_size, -1, self.hp.vocab_size])  # (N, T, vocab)

        return logits
Example #10
def test_Reshape():
    X = ad.Variable(name="X")
    y = ad.array_reshape_op(X, [-1, 10 * 10 * 10])
    executor = ad.Executor([y], ctx=ctx)

    X_val = rand.normal(scale=0.1,
                        size=(batch_size, 10, 10, 10)).astype(np.float32)
    res = executor.run(feed_dict={X: X_val})
    Check(executor, res, [X], [y], [X_val])
    print(sys._getframe().f_code.co_name, 'pass!')
Example #11
def wdl_criteo(dense, sparse, labels):
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")
    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding,
                                          sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    #DNN
    flatten = dense_input
    W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

    W4 = init.random_normal([256 + 26 * embedding_size, 1],
                            stddev=0.01,
                            name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(sparse_input, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #12
def lenet(x, y_):
    '''
    LeNet model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''

    print('Building LeNet model...')
    x = ad.array_reshape_op(x, (-1, 1, 28, 28))
    x = conv_pool(x, 1, 6, name='lenet_conv1')
    x = conv_pool(x, 6, 16, name='lenet_conv2')
    x = ad.array_reshape_op(x, (-1, 7 * 7 * 16))
    x = fc(x, (7 * 7 * 16, 120), name='lenet_fc1', with_relu=True)
    x = fc(x, (120, 84), name='lenet_fc2', with_relu=True)
    y = fc(x, (84, 10), name='lenet_fc3', with_relu=False)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
Example #13
def dc_criteo(dense, sparse, labels):

    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    ## dc_model
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    input_dim = 26 * 8 + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x,
                                         input_dim,
                                         hidden_dim,
                                         num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1],
                            stddev=0.1,
                            name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #14
def dcn_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.003

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding,
                                          sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
    x = ad.concat_op(sparse_input, dense_input, axis=1)
    # Cross Network
    cross_output = build_cross_layer(x, num_layers=3)

    #DNN
    flatten = x
    W1 = init.random_normal([26 * embedding_size + 13, 256],
                            stddev=0.01,
                            name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

    W4 = init.random_normal([256 + 26 * embedding_size + 13, 1],
                            stddev=0.01,
                            name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(cross_output, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #15
File: CNN.py Project: sj1104/Het
def cnn_3_layers(x, y_):
    '''
    3-layer-CNN model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''

    print('Building 3-layer-CNN model...')
    x = ad.array_reshape_op(x, [-1, 1, 28, 28])
    x = conv_relu_avg(x, (32, 1, 5, 5))
    x = conv_relu_avg(x, (64, 32, 5, 5))
    y = fc(x, (7 * 7 * 64, 10))
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
Example #16
def vgg(x, y_, num_layers):
    '''
    VGG model, for CIFAR10 dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 16 or 19
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''

    if num_layers == 16:
        print('Building VGG-16 model...')
        x = vgg_2block(x, 3, 64, 'vgg_block1')
        x = vgg_2block(x, 64, 128, 'vgg_block2')
        x = vgg_3block(x, 128, 256, 'vgg_block3')
        x = vgg_3block(x, 256, 512, 'vgg_block4')
        x = vgg_3block(x, 512, 512, 'vgg_block5')

    elif num_layers == 19:
        print('Building VGG-19 model...')
        x = vgg_2block(x, 3, 64, 'vgg_block1')
        x = vgg_2block(x, 64, 128, 'vgg_block2')
        x = vgg_4block(x, 128, 256, 'vgg_block3')
        x = vgg_4block(x, 256, 512, 'vgg_block4')
        x = vgg_4block(x, 512, 512, 'vgg_block5')

    else:
        assert False, 'VGG model should have 16 or 19 layers!'

    x = ad.array_reshape_op(x, (-1, 512))
    x = vgg_fc(x, 512, 4096, 'vgg_fc1')
    x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
    y = vgg_fc(x, 4096, 10, 'vgg_fc3')

    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])

    return loss, y
Example #17
def cnn(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False):

    print("Build CNN model...")

    W1 = init.random_normal((32, 1, 5, 5), stddev=0.1, name='W1')
    W2 = init.random_normal((64, 32, 5, 5), stddev=0.1, name='W2')
    W3 = init.random_normal((7 * 7 * 64, 10), stddev=0.1, name='W3')
    b3 = init.random_normal((10, ), stddev=0.1, name='b3')

    X = ad.Variable(name="X")

    z1 = ad.conv2d_op(X, W1, padding=2, stride=1)
    z2 = ad.relu_op(z1)
    z3 = ad.avg_pool2d_op(z2, kernel_H=2, kernel_W=2, padding=0, stride=2)

    z4 = ad.conv2d_op(z3, W2, padding=2, stride=1)
    z5 = ad.relu_op(z4)
    z6 = ad.avg_pool2d_op(z5, kernel_H=2, kernel_W=2, padding=0, stride=2)

    z6_flat = ad.array_reshape_op(z6, (-1, 7 * 7 * 64))
    y = ad.matmul_op(z6_flat, W3) + b3

    executor = ad.Executor([y], ctx=executor_ctx)

    rand = np.random.RandomState(seed=123)
    X_val = rand.normal(scale=0.1,
                        size=(batch_size, 1, 28, 28)).astype(np.float32)

    ath = executor.run(feed_dict={X: X_val})

    hx.hetu2onnx.export(executor, [X], [y], 'ath.onnx')
    #
    #
    sess = rt.InferenceSession("ath.onnx")
    input = sess.get_inputs()[0].name

    pre = sess.run(None, {input: X_val.astype(np.float32)})[0]

    np.testing.assert_allclose(ath[0].asnumpy(), pre, rtol=1e-2)
Example #18
def wdl_adult(whatever):
    batch_size = 128
    lr = 5
    dim_wide = 809

    lr_ = lr / batch_size
    dim_deep = 68

    from .load_data import load_adult_data
    x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()

    W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    X_wide = dl.dataloader_op([
        [x_train_wide, batch_size, 'train'],
        [x_test_wide, batch_size, 'validate'],
    ])
    y_ = dl.dataloader_op([
        [y_train, batch_size, 'train'],
        [y_test, batch_size, 'validate'],
    ])

    #deep
    Embedding = []
    X_deep = []
    X_deep_input = None

    for i in range(8):
        X_deep_name = "x_deep_" + str(i)
        Embedding_name = "Embedding_deep_" + str(i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:,i], batch_size, 'train'],
            [x_test_deep[:,i], batch_size, 'validate'],
        ]))
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)

    for i in range(4):
        X_deep_name = "x_deep_" + str(8+i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:,8+i], batch_size, 'train'],
            [x_test_deep[:,8+i], batch_size, 'validate'],
        ]))
        now = ad.array_reshape_op(X_deep[i + 8], (batch_size, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])

    opt = optimizer.SGDOptimizer(learning_rate=lr_)
    train_op = opt.minimize(loss)

    return loss, prediction, y_, train_op
Example #19
def resnet(x, y_, num_layers=18):
    '''
    ResNet model, for CIFAR10 dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 18 or 34
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''

    base_size = 16

    x = conv2d(x,
               3,
               base_size,
               stride=1,
               padding=1,
               name='resnet_initial_conv')
    x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')

    if num_layers == 18:
        print("Building ResNet-18 model...")
        x = resnet_block(x,
                         base_size,
                         num_blocks=2,
                         is_first=True,
                         name='resnet_block1')
        x = resnet_block(x,
                         base_size,
                         num_blocks=2,
                         is_first=False,
                         name='resnet_block2')
        x = resnet_block(x,
                         2 * base_size,
                         num_blocks=2,
                         is_first=False,
                         name='resnet_block3')
        x = resnet_block(x,
                         4 * base_size,
                         num_blocks=2,
                         is_first=False,
                         name='resnet_block4')
    elif num_layers == 34:
        print("Building ResNet-34 model...")
        x = resnet_block(x,
                         base_size,
                         num_blocks=3,
                         is_first=True,
                         name='resnet_block1')
        x = resnet_block(x,
                         base_size,
                         num_blocks=4,
                         is_first=False,
                         name='resnet_block2')
        x = resnet_block(x,
                         2 * base_size,
                         num_blocks=6,
                         is_first=False,
                         name='resnet_block3')
        x = resnet_block(x,
                         4 * base_size,
                         num_blocks=3,
                         is_first=False,
                         name='resnet_block4')
    else:
        assert False, "Number of layers should be 18 or 34 !"

    x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
    x = ad.array_reshape_op(x, (-1, 128 * base_size))
    y = fc(x, (128 * base_size, 10), name='resnet_final_fc')
    # here we don't use cudnn for softmax crossentropy to avoid overflows
    loss = ad.softmaxcrossentropy_op(y, y_, use_cudnn=False)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
Example #20
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width],
                                           stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op],
                           ctx=ctx,
                           comm_mode='PS',
                           use_sparse_pull=False,
                           cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)

        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
Example #21
def multihead_attention(queries,
                        keys,
                        values,
                        config,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_mask=None,
                        causality=False):
    def transpose_for_scores(input_tensor):
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])

        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    batch_size = config.batch_size
    hidden_size = config.d_model
    num_attention_heads = config.num_heads
    caus_len = config.maxlen2 - 1
    attention_probs_dropout_prob = config.dropout_rate

    size_per_head = hidden_size // num_attention_heads

    # reshape to 2d
    queries2d = ad.array_reshape_op(queries,
                                    [-1, hidden_size])  # (N * T_q, d_model)
    keys2d = ad.array_reshape_op(keys, [-1, hidden_size])  # (N * T_k, d_model)
    values2d = ad.array_reshape_op(values,
                                   [-1, hidden_size])  # (N * T_k, d_model)

    # linear transformation
    query_layer = dense(queries2d, hidden_size, hidden_size,
                        query_act)  # (N * T_k, d_model)
    key_layer = dense(keys2d, hidden_size, hidden_size,
                      key_act)  # (N * T_k, d_model)
    value_layer = dense(values2d, hidden_size, hidden_size,
                        value_act)  # (N * T_k, d_model)

    # transpose
    query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
    key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
    value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

    # score
    attention_scores = ad.batch_matmul_op(query_layer, key_layer,
                                          trans_B=True)  # (N, h, T_q, T_k)
    attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head)))

    # mask
    if attention_mask is not None:
        zeros = ad.Variable('no_mask',
                            value=np.array((0, ), dtype=np.float32),
                            trainable=False)
        adder = ad.Variable('attention_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        zeros = ad.broadcastto_op(zeros, attention_mask)
        adder = ad.broadcastto_op(adder, attention_mask)
        attention_mask = ad.where_op(attention_mask, zeros, adder)  # (N, T)
        attention_mask = ad.array_reshape_op(attention_mask,
                                             [batch_size, 1, 1, -1])
        attention_scores = attention_scores + ad.broadcastto_op(
            attention_mask, attention_scores)
    if causality:
        tril = ad.Variable(name='tril',
                           value=np.tril(np.ones((caus_len, caus_len))),
                           trainable=False)  # (T, T)
        future_masks = ad.broadcast_shape_op(
            tril, [batch_size, num_attention_heads, caus_len, caus_len])
        adder = ad.Variable('future_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        adder = ad.broadcastto_op(adder, future_masks)
        attention_scores = ad.where_op(future_masks, attention_scores,
                                       adder)  # (N, h, T, T)

    # probs
    attention_probs = ad.softmax_op(attention_scores)
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
    context_layer = ad.batch_matmul_op(attention_probs, value_layer)
    context_layer = ad.transpose_op(context_layer, [0, 2, 1, 3])
    outputs = ad.array_reshape_op(
        context_layer, [batch_size, -1, num_attention_heads * size_per_head])

    # Residual connection
    outputs = outputs + queries  # (N, T_q, d_model)

    # Normalize
    outputs = layer_norm(outputs, hidden_size)  # (N, T_q, d_model)
    return outputs
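Setting the attention and causal masks aside, the score/softmax/weighted-sum sequence above is the standard scaled dot-product attention, computed independently for each of the h heads:

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\Bigl(\frac{Q K^{\top}}{\sqrt{d_k}}\Bigr) V, \qquad d_k = \frac{d_{\mathrm{model}}}{h},

after which the heads are transposed back and merged by the final reshape, followed by the residual connection and layer normalization.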