def ff(inputs, config):
    outputs = ad.array_reshape_op(inputs, [-1, config.d_model])
    outputs = dense(outputs, config.d_model, config.d_ff, activation=ad.relu_op)
    outputs = dense(outputs, config.d_ff, config.d_model)
    outputs = ad.array_reshape_op(
        outputs, [config.batch_size, -1, config.d_model])
    outputs = outputs + inputs
    outputs = layer_norm(outputs, config.d_model)
    return outputs

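# Illustrative numpy sketch (not part of the model code): the position-wise
# FFN above flattens (batch, seq, d_model) to 2-D so every token shares the
# same two dense layers, then restores the 3-D shape before the residual add
# and layer norm. `w1`, `b1`, `w2`, `b2` are hypothetical weights, not Hetu nodes.
import numpy as np

def ff_reference(x, w1, b1, w2, b2, eps=1e-5):
    n, t, d = x.shape
    h = np.maximum(x.reshape(-1, d) @ w1 + b1, 0.0)  # dense + ReLU, (N*T, d_ff)
    h = (h @ w2 + b2).reshape(n, t, d)               # dense back to d_model
    out = h + x                                      # residual connection
    mean = out.mean(-1, keepdims=True)               # layer norm over last axis
    var = out.var(-1, keepdims=True)
    return (out - mean) / np.sqrt(var + eps)
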
def wdl_adult(X_deep, X_wide, y_):
    lr = 5 / 128
    dim_wide = 809
    dim_deep = 68

    W = init.random_normal([dim_wide + 20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    # deep part: embed the 8 categorical fields and concatenate
    Embedding = []
    X_deep_input = None
    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)
    # append the 4 continuous fields as-is
    for i in range(4):
        now = ad.array_reshape_op(X_deep[i + 8], (-1, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide part: concatenate raw wide features with the deep output
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr)
    train_op = opt.minimize(loss)
    return loss, prediction, y_, train_op

def alexnet(x, y_):
    '''
    AlexNet model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building AlexNet model...')
    x = ad.array_reshape_op(x, [-1, 1, 28, 28])
    x = conv_bn_relu_pool(x, 1, 32, 'alexnet_conv1', with_relu=True, with_pool=True)
    x = conv_bn_relu_pool(x, 32, 64, 'alexnet_conv2', with_relu=True, with_pool=True)
    x = conv_bn_relu_pool(x, 64, 128, 'alexnet_conv3', with_relu=True, with_pool=False)
    x = conv_bn_relu_pool(x, 128, 256, 'alexnet_conv4', with_relu=True, with_pool=False)
    x = conv_bn_relu_pool(x, 256, 256, 'alexnet_conv5', with_relu=False, with_pool=True)
    x = ad.array_reshape_op(x, (-1, 256 * 3 * 3))
    x = fc(x, (256 * 3 * 3, 1024), name='alexnet_fc1', with_relu=True)
    x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
    y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y

def fc(x, shape):
    weight = init.random_normal(shape=shape, stddev=0.1)
    bias = init.random_normal(shape=shape[-1:], stddev=0.1)
    x = ad.array_reshape_op(x, (-1, shape[0]))
    x = ad.matmul_op(x, weight)
    y = x + ad.broadcastto_op(bias, x)
    return y

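# Minimal usage sketch for fc, following the Executor pattern from the tests
# below; the (784, 10) shape and batch size of 32 are illustrative, and the
# `ad`, `init`, and numpy imports are assumed to match the surrounding code.
def _fc_example():
    X = ad.Variable(name="X")
    y = fc(X, (784, 10))
    executor = ad.Executor([y])
    X_val = np.random.normal(scale=0.1, size=(32, 784)).astype(np.float32)
    return executor.run(feed_dict={X: X_val})  # one (32, 10) output array
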
def version_1(cls, node, tensor_dict, **kwargs):
    x = tensor_dict[node.input_tensor_names[0]]
    output_shape = tensor_dict[node.input_tensor_names[1]]
    y = ad.array_reshape_op(x, output_shape)
    tensor_dict[node.output_tensor_names[0]] = y
    return y

def dc_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01, name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # dc_model
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    input_dim = 26 * 8 + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x, input_dim, hidden_dim, num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1], stddev=0.1, name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op

def transpose_for_scores(input_tensor):
    output_tensor = ad.array_reshape_op(input_tensor, [
        config.batch_size, -1, config.num_heads,
        config.d_model // config.num_heads
    ])
    output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
    return output_tensor

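# Shape sketch in numpy of what transpose_for_scores does: split d_model into
# heads, then move the head axis ahead of the sequence axis. batch=2, seq=5,
# heads=4, d_model=16 are illustrative values.
import numpy as np

x = np.zeros((2, 5, 16))                               # (N, T, d_model)
h = x.reshape(2, -1, 4, 16 // 4).transpose(0, 2, 1, 3)
assert h.shape == (2, 4, 5, 4)                         # (N, heads, T, d_model/heads)
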
def dfm_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # FM
    Embedding1 = init.random_normal([feature_dimension, 1], stddev=0.01,
                                    name="fst_order_embedding", ctx=ndarray.cpu(0))
    FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    sparse_1dim_input = ad.embedding_lookup_op(Embedding1, sparse_input,
                                               ctx=ndarray.cpu(0))
    fm_dense_part = ad.matmul_op(dense_input, FM_W)
    fm_sparse_part = ad.reduce_sum_op(sparse_1dim_input, axes=1)
    # first-order output
    y1 = fm_dense_part + fm_sparse_part

    Embedding2 = init.random_normal([feature_dimension, embedding_size], stddev=0.01,
                                    name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_2dim_input = ad.embedding_lookup_op(Embedding2, sparse_input,
                                               ctx=ndarray.cpu(0))
    sparse_2dim_sum = ad.reduce_sum_op(sparse_2dim_input, axes=1)
    sparse_2dim_sum_square = ad.mul_op(sparse_2dim_sum, sparse_2dim_sum)

    sparse_2dim_square = ad.mul_op(sparse_2dim_input, sparse_2dim_input)
    sparse_2dim_square_sum = ad.reduce_sum_op(sparse_2dim_square, axes=1)
    sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
    sparse_2dim_half = sparse_2dim * 0.5
    # second-order output
    y2 = ad.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

    # DNN
    flatten = ad.array_reshape_op(sparse_2dim_input, (-1, 26 * embedding_size))
    W1 = init.random_normal([26 * embedding_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = y1 + y2
    y = y4 + y3
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op

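# The second-order block above relies on the FM identity
#   sum_{i<j} <v_i, v_j> = 0.5 * sum_k ((sum_i v_ik)^2 - sum_i v_ik^2),
# which replaces the quadratic number of pairwise dot products with two linear
# passes over the embeddings. A numpy check with illustrative shapes
# (26 fields, embedding dim 8):
import numpy as np

v = np.random.randn(26, 8)
pairwise = sum(v[i] @ v[j] for i in range(26) for j in range(i + 1, 26))
identity = 0.5 * (v.sum(0) ** 2 - (v ** 2).sum(0)).sum()
assert np.isclose(pairwise, identity)
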
def decode(self, ys, memory, src_masks):
    decoder_inputs = ys

    # embedding
    dec = ad.embedding_lookup_op(self.embeddings, decoder_inputs)  # (N, T2, d_model)
    dec = dec * self.hp.d_model**0.5  # scale

    dec += positional_encoding(
        dec, (self.hp.batch_size, self.hp.maxlen2 - 1, self.hp.d_model),
        self.hp.maxlen2)
    dec = dropout(dec, self.hp.dropout_rate)

    # Blocks
    for i in range(self.hp.num_blocks):
        # Masked self-attention (note that causality is True at this time)
        dec = multihead_attention(
            queries=dec,
            keys=dec,
            values=dec,
            config=self.hp,
            attention_mask=decoder_inputs,
            causality=True,
        )
        # Vanilla attention
        dec = multihead_attention(
            queries=dec,
            keys=memory,
            values=memory,
            config=self.hp,
            attention_mask=src_masks,
            causality=False,
        )
        # Feed forward
        dec = ff(dec, config=self.hp)

    dec = ad.array_reshape_op(dec, [-1, self.hp.d_model])  # (N * T, d_model)
    # weight tying: project to the vocabulary with the transposed embedding matrix
    logits = ad.array_reshape_op(
        ad.matmul_op(dec, self.embeddings, trans_B=True),
        [self.hp.batch_size, -1, self.hp.vocab_size])  # (N, T, vocab)
    return logits

def test_Reshape():
    X = ad.Variable(name="X")
    y = ad.array_reshape_op(X, [-1, 10 * 10 * 10])
    executor = ad.Executor([y], ctx=ctx)
    X_val = rand.normal(scale=0.1, size=(batch_size, 10, 10, 10)).astype(np.float32)
    res = executor.run(feed_dict={X: X_val})
    Check(executor, res, [X], [y], [X_val])
    print(sys._getframe().f_code.co_name, 'pass!')

def wdl_criteo(dense, sparse, labels):
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal([feature_dimension, embedding_size], stddev=0.01,
                                   name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input, ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # DNN
    flatten = dense_input
    W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size, 1], stddev=0.01, name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(sparse_input, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op

def lenet(x, y_):
    '''
    LeNet model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building LeNet model...')
    x = ad.array_reshape_op(x, (-1, 1, 28, 28))
    x = conv_pool(x, 1, 6, name='lenet_conv1')
    x = conv_pool(x, 6, 16, name='lenet_conv2')
    x = ad.array_reshape_op(x, (-1, 7 * 7 * 16))
    x = fc(x, (7 * 7 * 16, 120), name='lenet_fc1', with_relu=True)
    x = fc(x, (120, 84), name='lenet_fc2', with_relu=True)
    y = fc(x, (84, 10), name='lenet_fc3', with_relu=False)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y

def dc_criteo(dense, sparse, labels):
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001

    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01, name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # dc_model
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    input_dim = 26 * 8 + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x, input_dim, hidden_dim, num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1], stddev=0.1, name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op

def dcn_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.003

    Embedding = init.random_normal([feature_dimension, embedding_size], stddev=0.01,
                                   name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input, ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    # Cross Network
    cross_output = build_cross_layer(x, num_layers=3)

    # DNN
    flatten = x
    W1 = init.random_normal([26 * embedding_size + 13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size + 13, 1], stddev=0.01, name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(cross_output, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op

def cnn_3_layers(x, y_):
    '''
    3-layer-CNN model, for MNIST dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building 3-layer-CNN model...')
    x = ad.array_reshape_op(x, [-1, 1, 28, 28])
    x = conv_relu_avg(x, (32, 1, 5, 5))
    x = conv_relu_avg(x, (64, 32, 5, 5))
    y = fc(x, (7 * 7 * 64, 10))
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y

def vgg(x, y_, num_layers):
    '''
    VGG model, for CIFAR10 dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 16 or 19
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    if num_layers == 16:
        print('Building VGG-16 model...')
        x = vgg_2block(x, 3, 64, 'vgg_block1')
        x = vgg_2block(x, 64, 128, 'vgg_block2')
        x = vgg_3block(x, 128, 256, 'vgg_block3')
        x = vgg_3block(x, 256, 512, 'vgg_block4')
        x = vgg_3block(x, 512, 512, 'vgg_block5')
    elif num_layers == 19:
        print('Building VGG-19 model...')
        x = vgg_2block(x, 3, 64, 'vgg_block1')
        x = vgg_2block(x, 64, 128, 'vgg_block2')
        x = vgg_4block(x, 128, 256, 'vgg_block3')
        x = vgg_4block(x, 256, 512, 'vgg_block4')
        x = vgg_4block(x, 512, 512, 'vgg_block5')
    else:
        assert False, 'VGG model should have 16 or 19 layers!'
    x = ad.array_reshape_op(x, (-1, 512))
    x = vgg_fc(x, 512, 4096, 'vgg_fc1')
    x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
    y = vgg_fc(x, 4096, 10, 'vgg_fc3')
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y

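# Shape note for the reshape above (a sketch, assuming each of the five VGG
# blocks ends in a 2x2 pooling layer): CIFAR10's 32x32 input is halved five
# times down to 1x1, so each image flattens to exactly 512 channel values.
assert 32 // 2**5 == 1
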
def cnn(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False):
    print("Build CNN model...")

    W1 = init.random_normal((32, 1, 5, 5), stddev=0.1, name='W1')
    W2 = init.random_normal((64, 32, 5, 5), stddev=0.1, name='W2')
    W3 = init.random_normal((7 * 7 * 64, 10), stddev=0.1, name='W3')
    b3 = init.random_normal((10, ), stddev=0.1, name='b3')

    X = ad.Variable(name="X")
    z1 = ad.conv2d_op(X, W1, padding=2, stride=1)
    z2 = ad.relu_op(z1)
    z3 = ad.avg_pool2d_op(z2, kernel_H=2, kernel_W=2, padding=0, stride=2)
    z4 = ad.conv2d_op(z3, W2, padding=2, stride=1)
    z5 = ad.relu_op(z4)
    z6 = ad.avg_pool2d_op(z5, kernel_H=2, kernel_W=2, padding=0, stride=2)
    z6_flat = ad.array_reshape_op(z6, (-1, 7 * 7 * 64))
    y = ad.matmul_op(z6_flat, W3) + b3

    executor = ad.Executor([y], ctx=executor_ctx)

    rand = np.random.RandomState(seed=123)
    X_val = rand.normal(scale=0.1, size=(batch_size, 1, 28, 28)).astype(np.float32)
    ath = executor.run(feed_dict={X: X_val})

    hx.hetu2onnx.export(executor, [X], [y], 'ath.onnx')

    # check the exported model against the Hetu output with onnxruntime
    sess = rt.InferenceSession("ath.onnx")
    input_name = sess.get_inputs()[0].name
    pre = sess.run(None, {input_name: X_val.astype(np.float32)})[0]
    np.testing.assert_allclose(ath[0].asnumpy(), pre, rtol=1e-2)

def wdl_adult(whatever):
    batch_size = 128
    lr = 5
    dim_wide = 809
    lr_ = lr / batch_size
    dim_deep = 68

    from .load_data import load_adult_data
    x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()

    W = init.random_normal([dim_wide + 20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    X_wide = dl.dataloader_op([
        [x_train_wide, batch_size, 'train'],
        [x_test_wide, batch_size, 'validate'],
    ])
    y_ = dl.dataloader_op([
        [y_train, batch_size, 'train'],
        [y_test, batch_size, 'validate'],
    ])

    # deep part: embed the 8 categorical fields and concatenate
    Embedding = []
    X_deep = []
    X_deep_input = None
    for i in range(8):
        X_deep_name = "x_deep_" + str(i)
        Embedding_name = "Embedding_deep_" + str(i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, i], batch_size, 'train'],
            [x_test_deep[:, i], batch_size, 'validate'],
        ]))
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)
    # append the 4 continuous fields as-is
    for i in range(4):
        X_deep_name = "x_deep_" + str(8 + i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, 8 + i], batch_size, 'train'],
            [x_test_deep[:, 8 + i], batch_size, 'validate'],
        ]))
        now = ad.array_reshape_op(X_deep[i + 8], (batch_size, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide part: concatenate raw wide features with the deep output
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr_)
    train_op = opt.minimize(loss)
    return loss, prediction, y_, train_op

def resnet(x, y_, num_layers=18):
    '''
    ResNet model, for CIFAR10 dataset.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 18 or 34
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    base_size = 16

    x = conv2d(x, 3, base_size, stride=1, padding=1, name='resnet_initial_conv')
    x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')

    if num_layers == 18:
        print("Building ResNet-18 model...")
        x = resnet_block(x, base_size, num_blocks=2, is_first=True, name='resnet_block1')
        x = resnet_block(x, base_size, num_blocks=2, is_first=False, name='resnet_block2')
        x = resnet_block(x, 2 * base_size, num_blocks=2, is_first=False, name='resnet_block3')
        x = resnet_block(x, 4 * base_size, num_blocks=2, is_first=False, name='resnet_block4')
    elif num_layers == 34:
        print("Building ResNet-34 model...")
        x = resnet_block(x, base_size, num_blocks=3, is_first=True, name='resnet_block1')
        x = resnet_block(x, base_size, num_blocks=4, is_first=False, name='resnet_block2')
        x = resnet_block(x, 2 * base_size, num_blocks=6, is_first=False, name='resnet_block3')
        x = resnet_block(x, 4 * base_size, num_blocks=3, is_first=False, name='resnet_block4')
    else:
        assert False, "Number of layers should be 18 or 34!"

    x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
    x = ad.array_reshape_op(x, (-1, 128 * base_size))
    y = fc(x, (128 * base_size, 10), name='resnet_final_fc')

    # here we don't use cudnn for softmax crossentropy to avoid overflows
    loss = ad.softmaxcrossentropy_op(y, y_, use_cudnn=False)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y

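# Shape check for the final reshape above (a sketch, assuming the three
# non-first block groups each halve the spatial size of the 32x32 CIFAR10
# input): the resulting 4x4 feature map with 8 * base_size channels flattens
# to 128 * base_size values per image.
base_size = 16
assert (8 * base_size) * 4 * 4 == 128 * base_size
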
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])

    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width],
                                           stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)

    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)

    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # step twice to prime the data loader pipeline before the first run
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt

def multihead_attention(queries,
                        keys,
                        values,
                        config,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_mask=None,
                        causality=False):
    def transpose_for_scores(input_tensor):
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])
        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    batch_size = config.batch_size
    hidden_size = config.d_model
    num_attention_heads = config.num_heads
    caus_len = config.maxlen2 - 1
    attention_probs_dropout_prob = config.dropout_rate
    size_per_head = hidden_size // num_attention_heads

    # reshape to 2d
    queries2d = ad.array_reshape_op(queries, [-1, hidden_size])  # (N * T_q, d_model)
    keys2d = ad.array_reshape_op(keys, [-1, hidden_size])  # (N * T_k, d_model)
    values2d = ad.array_reshape_op(values, [-1, hidden_size])  # (N * T_k, d_model)

    # linear transformation
    query_layer = dense(queries2d, hidden_size, hidden_size, query_act)  # (N * T_q, d_model)
    key_layer = dense(keys2d, hidden_size, hidden_size, key_act)  # (N * T_k, d_model)
    value_layer = dense(values2d, hidden_size, hidden_size, value_act)  # (N * T_k, d_model)

    # transpose
    query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
    key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
    value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

    # score
    attention_scores = ad.batch_matmul_op(query_layer, key_layer, trans_B=True)  # (N, h, T_q, T_k)
    attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head)))

    # mask
    if attention_mask is not None:
        zeros = ad.Variable('no_mask', value=np.array((0, ), dtype=np.float32),
                            trainable=False)
        adder = ad.Variable('attention_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        zeros = ad.broadcastto_op(zeros, attention_mask)
        adder = ad.broadcastto_op(adder, attention_mask)
        attention_mask = ad.where_op(attention_mask, zeros, adder)  # (N, T)
        attention_mask = ad.array_reshape_op(attention_mask, [batch_size, 1, 1, -1])
        attention_scores = attention_scores + ad.broadcastto_op(
            attention_mask, attention_scores)
    if causality:
        tril = ad.Variable(name='tril',
                           value=np.tril(np.ones((caus_len, caus_len))),
                           trainable=False)  # (T, T)
        future_masks = ad.broadcast_shape_op(
            tril, [batch_size, num_attention_heads, caus_len, caus_len])
        adder = ad.Variable('future_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        adder = ad.broadcastto_op(adder, future_masks)
        attention_scores = ad.where_op(future_masks, attention_scores, adder)  # (N, h, T, T)

    # probs
    attention_probs = ad.softmax_op(attention_scores)
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    context_layer = ad.batch_matmul_op(attention_probs, value_layer)
    context_layer = ad.transpose_op(context_layer, [0, 2, 1, 3])
    outputs = ad.array_reshape_op(
        context_layer, [batch_size, -1, num_attention_heads * size_per_head])

    # Residual connection
    outputs = outputs + queries  # (N, T_q, d_model)
    # Normalize
    outputs = layer_norm(outputs, hidden_size)  # (N, T_q, d_model)
    return outputs

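# Numpy sketch of the causal masking used above: where the lower-triangular
# mask is 1 the score is kept, elsewhere it is replaced by a large negative
# value so softmax assigns it (near-)zero probability. T = 4 is illustrative.
import numpy as np

T = 4
scores = np.random.randn(T, T)
tril = np.tril(np.ones((T, T)))
masked = np.where(tril > 0, scores, -2.0**32 + 1)
probs = np.exp(masked - masked.max(-1, keepdims=True))
probs /= probs.sum(-1, keepdims=True)  # upper-triangle probabilities ~ 0
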