Python create_cuda_gpu_onの例、singa.device.create_cuda_gpu_on Pythonの例

コード例 #1

0

ファイルを表示

    def __init__(self, device_id=0, **knobs):
        super().__init__(**knobs)
        # onnx model url
        self.model_url = 'https://onnxzoo.blob.core.windows.net/models/opset_8/tiny_yolov2/tiny_yolov2.tar.gz'

        # model path in the downloaded tar file
        self.model_path = 'tiny_yolov2/Model.onnx'
        self.dev = device.create_cuda_gpu_on(device_id)

コード例 #2

0

ファイルを表示

def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):

    # Define the hypermeters good for the train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST == True:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        local_rank = 0
        world_size = 1
        global_rank = 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)

    # train model
    dev.Sync()
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            model(tx, ty, dist_option='fp32', spars=None)

    dev.Sync()
    end = time.time()
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)
        dev.PrintTimeProfiling()

コード例 #3

0

ファイルを表示

ファイル: vanilla.py プロジェクト: zlheui/singa

    def train(self):
        train_data, _, _, _, _, _ = load_data(self.dataset_filepath)
        dev = device.create_cuda_gpu_on(0)
        dev.SetRandSeed(0)
        np.random.seed(0)

        # sgd = opt.SGD(lr=self.learning_rate, momentum=0.9, weight_decay=1e-5)
        sgd = opt.Adam(lr=self.learning_rate)

        noise = tensor.Tensor((self.batch_size, self.noise_size), dev,
                              tensor.float32)
        real_images = tensor.Tensor((self.batch_size, self.feature_size), dev,
                                    tensor.float32)
        real_labels = tensor.Tensor((self.batch_size, 1), dev, tensor.float32)
        fake_labels = tensor.Tensor((self.batch_size, 1), dev, tensor.float32)

        # attached model to graph
        self.model.set_optimizer(sgd)
        self.model.compile([noise],
                           is_train=True,
                           use_graph=False,
                           sequential=True)

        real_labels.set_value(1.0)
        fake_labels.set_value(0.0)

        for iteration in range(self.iterations):
            idx = np.random.randint(0, train_data.shape[0], self.batch_size)
            real_images.copy_from_numpy(train_data[idx])

            self.model.train()

            # Training the Discriminative Net
            _, d_loss_real = self.model.train_one_batch_dis(
                real_images, real_labels)

            noise.uniform(-1, 1)
            fake_images = self.model.forward_gen(noise)
            _, d_loss_fake = self.model.train_one_batch_dis(
                fake_images, fake_labels)

            d_loss = tensor.to_numpy(d_loss_real)[0] + tensor.to_numpy(
                d_loss_fake)[0]

            # Training the Generative Net
            noise.uniform(-1, 1)
            _, g_loss_tensor = self.model.train_one_batch(
                noise, real_labels)

            g_loss = tensor.to_numpy(g_loss_tensor)[0]

            if iteration % self.interval == 0:
                self.model.eval()
                self.save_image(iteration)
                print_log(' The {} iteration, G_LOSS: {}, D_LOSS: {}'.format(
                    iteration, g_loss, d_loss))

コード例 #4

0

ファイルを表示

ファイル: OnnxModelBase.py プロジェクト: nusdbsystem/singa-auto

 def __init__(self, model_url, model_path, singa_model, device_id, **knobs):
     super().__init__(**knobs)
     self._knobs = knobs
     self.__dict__.update(knobs)
     self.model_url = model_url
     self.model_path = model_path
     self.singa_model = singa_model
     self.dev = device.create_cuda_gpu_on(device_id)
     self.dev.SetRandSeed(0)
     np.random.seed(0)

コード例 #5

0

ファイルを表示

ファイル: onnx_bert.py プロジェクト: nusdbsystem/singa-auto

    def __init__(self, device_id=0, length=20, **knobs):
        super().__init__(**knobs)
        # onnx model url
        self.model_url = 'https://media.githubusercontent.com/media/onnx/models/master/text/machine_comprehension/bert-squad/model/bertsquad-10.tar.gz'

        # model path in the downloaded tar file
        self.model_path = 'download_sample_10/bertsquad10.onnx'
        self.dev = device.create_cuda_gpu_on(device_id)
        self.max_answer_length = 30
        self.max_seq_length = 256
        self.doc_stride = 128
        self.max_query_length = 64
        self.n_best_size = 20
        self.batch_size = 3

コード例 #6

0

ファイルを表示

ファイル: xceptionnet.py プロジェクト: Wentong-DST/incubator-singa

        x = autograd.relu(features)
        x = self.globalpooling(x)
        x = autograd.flatten(x)
        x = self.fc(x)
        return x

    def __call__(self, input):
        x = self.features(input)
        x = self.logits(x)
        return x


if __name__ == '__main__':
    model = Xception(num_classes=1000)
    print('Start intialization............')
    dev = device.create_cuda_gpu_on(0)
    #dev = device.create_cuda_gpu()

    niters = 20
    batch_size = 16
    IMG_SIZE = 299
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

コード例 #7

0

ファイルを表示

def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):

    # Prepare training and valadiation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        #Sychronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            sychronize(p, sgd)

    # Training and Evaulation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            plist = []
            for p, g in autograd.backward(loss):
                if DIST:
                    sgd.all_reduce(g)
                plist.append((p, g))
            if DIST:
                sgd.wait()
            for p, g in plist:
                sgd.update(p, g)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaulation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)

コード例 #8

0

ファイルを表示

ファイル: mnist_cnn.py プロジェクト: willzhoupan/singa-ps

def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):
    # Define the hypermeters good for the mnist_cnn
    max_epoch = 10
    batch_size = 128
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and valadiation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        '''
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        '''
        # create kvstore
        kv_type = 'dist_sync'  #set synchronization mode
        lr = 0.005
        kv = singa_kvstore.create_kvstore(kv_type, 'sgd', learning_rate=0.005)
        global_rank = kv.rank
        world_size = kv.num_workers
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, global_rank,
                                          world_size)
        test_x, test_y = data_partition(test_x, test_y, global_rank,
                                        world_size)

    # create model
    model = CNN()
    ''' 
    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()
    model = resnet.resnet18(num_channels=1, num_classes=num_classes)
    '''
    dev = device.create_cuda_gpu_on(kv.rank)
    #dev = device.create_cuda_gpu()
    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    '''
    if DIST:
        #Initial a batch to help obtain model parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        #Initial kv store for workers of ps-architecture
        key = 0
        for p, g in autograd.backward(loss):
            kv.init(key, mx.nd.array(tensor.to_numpy(p)))
            key += 1
     '''

    # Training and Evaulation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if (DIST == True):
            print('^_^Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()
        for b in range(num_train_batch):

            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            singa_kvstore.backward_and_update(kv, loss)
            '''
            if DIST:
                #push
                kv_pairs = []
                key = 0
                for p, g in autograd.backward(loss):
                    kv.push(key,mx.nd.array(tensor.to_numpy(g)))
                    kv_pairs.append((key,p,g))
                    key += 1
                #pull
                for key,p,g in kv_pairs:
                    out_buf = mx.nd.zeros(p.shape)
                    kv.pull(key,out=out_buf)
                    p.copy_from_numpy(out_buf.asnumpy())
             '''

            # Evaluation Phase
            if b % 400 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for b in range(num_test_batch_inside):
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                if x.shape[0] != tx.shape[0]:
                    break
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True
        print('epoch time is %f' % (time.time() - time_start))

コード例 #9

0

ファイルを表示

ファイル: predict.py プロジェクト: wangcan04/chatbot

    for i in range(datalen):
        a=x[i]
        sen.append(dic[a])
    return sen

if __name__ == "__main__":
        model_file = open('71.bin', 'rb')
        param = pickle.load(model_file)
        model_file.close()
        decoderw=param['decoder_w']
        densew,denseb=param['dense_w'],param['dense_b']
        hiddensize=param['hidden_size']
        numstacks=param['num_stacks']
        drop_out=param['dropout']
        vocab_size=7000
        cuda = device.create_cuda_gpu_on(1)
        encoder = layer.LSTM(name='lstm1', hidden_size=hiddensize, num_stacks=numstacks, dropout=drop_out, input_sample_shape=(vocab_size,))
        decoder = layer.LSTM(name='lstm2', hidden_size=hiddensize, num_stacks=numstacks, dropout=drop_out, input_sample_shape=(vocab_size,))
        encoder.to_device(cuda)
        decoder.to_device(cuda)
        encoder_w = encoder.param_values()[0]
        encoder_w.uniform(-0.08, 0.08)
        decoder.param_values()[0].copy_from_numpy(decoderw, offset=0)

        dense = layer.Dense('dense', vocab_size, input_sample_shape=(hiddensize,))
        dense.to_device(cuda)
        dense.param_values()[0].copy_from_numpy(densew,offset=0)
        dense.param_values()[1].copy_from_numpy(denseb,offset=0)
        
        
        metadata,idx_q,idx_a=load_data()

コード例 #10

0

ファイルを表示

def train(data,
          net,
          max_epoch,
          get_lr,
          weight_decay,
          batch_size=100,
          use_cpu=False):
    print('Start intialization............')
    if use_cpu:
        print('Using CPU')
        dev = device.get_default_device()
    else:
        print('Using GPU')
        dev = device.create_cuda_gpu_on(1)

    net.to_device(dev)
    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
    for (p, specs) in zip(net.param_names(), net.param_specs()):
        opt.register(p, specs)

    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
    ty = tensor.Tensor((batch_size, ), dev, core_pb2.kInt)
    train_x, train_y, test_x, test_y = data
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    fileTimeLog = open("epochTimeLog.text", "a")
    for epoch in range(3):
        time.sleep(1)
        np.random.shuffle(idx)
        loss, acc = 0.0, 0.0
        print('Epoch %d' % epoch)
        print(datetime.now().timetz())  # miliseconds
        print(int(round(time.time() * 1000)))
        fileTimeLog.write('Epoch %d: ' % epoch)
        fileTimeLog.write(str(int(round(time.time() * 1000))))
        fileTimeLog.write('\n')
        for b in range(20):
            time.sleep(1)
            print("train iteration %d" % b)
            fileTimeLog.write('iteration %d: ' % b)
            fileTimeLog.write(str(int(round(time.time() * 1000))))
            fileTimeLog.write('\n')
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            grads, (l, a) = net.train(tx, ty)
            loss += l
            acc += a
            for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s), b)
            # update progress bar
            utils.update_progress(b * 1.0 / num_train_batch,
                                  'training loss = %f, accuracy = %f' % (l, a))
        info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
            % ((loss / num_train_batch), (acc / num_train_batch), get_lr(epoch))
        print(info)
        time.sleep(1)
        loss, acc = 0.0, 0.0
        for b in range(10):
            time.sleep(1)
            print("test iteration %d" % b)
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            l, a = net.evaluate(tx, ty)
            loss += l
            acc += a

        print('test loss = %f, test accuracy = %f' % ((loss / num_test_batch),
                                                      (acc / num_test_batch)))
    fileTimeLog.close()
    net.save('model', 20)  # save model params into checkpoint file

コード例 #11

0

ファイルを表示

def train_cifar10(sgd, max_epoch, batch_size, DIST=False, data_partition=None, gpu_num=None, gpu_per_node=None, nccl_id=None, partial_update=False):

    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes=10

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd, nccl_id=nccl_id, gpu_num=gpu_num, gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global, sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        #Sychronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros( shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        param = []
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        #Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1],dtype=np.float32)
        test_correct = np.zeros(shape=[1],dtype=np.float32)
        train_loss = np.zeros(shape=[1],dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            x = resize_dataset(x,IMG_SIZE)
            y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)               
            train_correct += accuracy(tensor.to_numpy(out), to_categorical(y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' % (train_loss, train_correct / (num_train_batch*batch_size*world_size)), flush=True)

        if partial_update:
            # sychronize parameters before evaluation phase
            for p in param:
                sychronize(p, sgd)

        #Evaulation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size: (b + 1) * batch_size]
            x = resize_dataset(x,IMG_SIZE)
            y = test_y[b * batch_size: (b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), to_categorical(y, num_classes))

        if DIST:
            # Reduce the Evaulation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' % (test_correct / (num_test_batch*batch_size*world_size), time.time() - start_time ), flush=True)

コード例 #12

0

ファイルを表示

ファイル: imdb_train.py プロジェクト: zlheui/singa

parser.add_argument('-n',
                    '--num-layers',
                    default=2,
                    type=int,
                    help='num layers',
                    dest='num_layers')

args = parser.parse_args()

# parameters
seq_limit = 50
embed_size = 300
hid = 32

# gpu device
dev = device.create_cuda_gpu_on(args.device_id)

# create placeholder
tx = tensor.Tensor((args.bs, seq_limit, embed_size), dev, tensor.float32)
ty = tensor.Tensor((args.bs, 2), dev, tensor.float32)
tx.gaussian(0, 1)
ty.gaussian(0, 1)

# create model
m = IMDBModel(hid,
              mode=args.mode,
              return_sequences=args.return_sequences,
              bidirectional=args.bidirectional,
              num_layers=args.num_layers)
m.set_opt(opt.SGD(args.lr, 0.9))

コード例 #13

0

ファイルを表示

def train_resnet(DIST='singa', graph=True, sequential=False):

    # Define the hypermeters good for the train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST=='singa':
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        kv_type = 'dist_sync' #set synchronization mode
        kv = singa_kvstore.create_kvstore(kv_type,'sgd',learning_rate=0.005)
        global_rank = kv.rank
        world_size = kv.num_workers
        sequential = True
        
    dev = device.create_cuda_gpu_on(kv.rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.on_device(dev)
    model.set_optimizer(sgd)
    model.graph(graph, sequential)

    # train model
    if DIST=='singa':
       dev.Sync()
    compute_time = 0.0
    syn_time = 0.0
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            out = model(tx)
            compute_start = time.time()
            loss = model.loss(out, ty)
            compute_time += time.time()-compute_start
            if DIST=='singa':
               syn_start = time.time()
               model.optim(loss, dist_option='fp32', spars=None)
               syn_time += time.time()-syn_start
            else:
               #autograd.training = True
               syn_start = time.time()
               singa_kvstore.backward_and_update(kv,loss)
               syn_time += time.time()-syn_start
    if DIST=='singa':
       dev.Sync()
    end = time.time()
    compute_time = compute_time /float(niters)
    syn_time = syn_time/ float(niters)
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("compute_time = {}".format(compute_time),flush=True)
        print("syn_time = {}".format(syn_time),flush=True)
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)

コード例 #14

0

ファイルを表示

ファイル: mnist_cnn.py プロジェクト: zxr8192/singa

def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hypermeters good for the mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and valadiation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        #Sychronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            synchronize(p, sgd)

    # Training and Evaulation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaulation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)

コード例 #15

0

ファイルを表示

ファイル: xceptionnet.py プロジェクト: dbxinj/incubator-singa

        x = autograd.relu(features)
        x = self.globalpooling(x)
        x = autograd.flatten(x)
        x = self.fc(x)
        return x

    def __call__(self, input):
        x = self.features(input)
        x = self.logits(x)
        return x


if __name__ == '__main__':
    model = Xception(num_classes=1000)
    print('Start intialization............')
    dev = device.create_cuda_gpu_on(0)
    #dev = device.create_cuda_gpu()

    niters = 20
    batch_size = 16
    IMG_SIZE = 299
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

コード例 #16

0

ファイルを表示

ファイル: train_cnn.py プロジェクト: shouxi/singa

def run(global_rank,
        world_size,
        local_rank,
        max_epoch,
        batch_size,
        model,
        data,
        sgd,
        graph,
        verbosity,
        dist_option='fp32',
        spars=None):
    dev = device.create_cuda_gpu_on(local_rank)
    dev.SetRandSeed(0)
    np.random.seed(0)

    if data == 'cifar10':
        from data import cifar10
        train_x, train_y, val_x, val_y = cifar10.load()
    elif data == 'cifar100':
        from data import cifar100
        train_x, train_y, val_x, val_y = cifar100.load()
    elif data == 'mnist':
        from data import mnist
        train_x, train_y, val_x, val_y = mnist.load()

    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()
    #print(num_classes)

    if model == 'resnet':
        from model import resnet
        model = resnet.resnet50(num_channels=num_channels,
                                num_classes=num_classes)
    elif model == 'xceptionnet':
        from model import xceptionnet
        model = xceptionnet.create_model(num_channels=num_channels,
                                         num_classes=num_classes)
    elif model == 'cnn':
        from model import cnn
        model = cnn.create_model(num_channels=num_channels,
                                 num_classes=num_classes)
    elif model == 'alexnet':
        from model import alexnet
        model = alexnet.create_model(num_channels=num_channels,
                                     num_classes=num_classes)
    elif model == 'mlp':
        import os, sys, inspect
        current = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        parent = os.path.dirname(current)
        sys.path.insert(0, parent)
        from mlp import module
        model = module.create_model(data_size=data_size,
                                    num_classes=num_classes)

    # For distributed training, sequential gives better performance
    if hasattr(sgd, "communicator"):
        DIST = True
        sequential = True
    else:
        DIST = False
        sequential = False

    if DIST:
        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
                                                   train_x, train_y, val_x,
                                                   val_y)
    '''
    # check dataset shape correctness
    if global_rank == 0:
        print("Check the shape of dataset:")
        print(train_x.shape)
        print(train_y.shape)
    '''

    if model.dimension == 4:
        tx = tensor.Tensor(
            (batch_size, num_channels, model.input_size, model.input_size),
            dev, tensor.float32)
    elif model.dimension == 2:
        tx = tensor.Tensor((batch_size, data_size), dev, tensor.float32)
        np.reshape(train_x, (train_x.shape[0], -1))
        np.reshape(val_x, (val_x.shape[0], -1))

    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_val_batch = val_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    # attached model to graph
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
    dev.SetVerbosity(verbosity)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if global_rank == 0:
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        model.train()
        for b in range(num_train_batch):
            # Generate the patch data in this iteration
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            if model.dimension == 4:
                x = augmentation(x, batch_size)
                if (image_size != model.input_size):
                    x = resize_dataset(x, model.input_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]

            # Copy the patch data into input tensors
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)

            # Train the model
            out, loss = model(tx, ty, dist_option, spars)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1, ), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if global_rank == 0:
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        model.eval()
        for b in range(num_val_batch):
            x = val_x[b * batch_size:(b + 1) * batch_size]
            if model.dimension == 4:
                if (image_size != model.input_size):
                    x = resize_dataset(x, model.input_size)
            y = val_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaulation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if global_rank == 0:
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_val_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)

    dev.PrintTimeProfiling()

コード例 #17

0

ファイルを表示

ファイル: resnet_dist.py プロジェクト: zxr8192/singa

from singa import autograd
from singa import tensor
from singa import device
from singa import opt

import numpy as np
from tqdm import trange

if __name__ == "__main__":
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd)

    if (sgd.global_rank == 0):
        print("Start intialization...........", flush=True)

    dev = device.create_cuda_gpu_on(sgd.local_rank)

    from resnet import resnet50
    model = resnet50()

    niters = 100
    batch_size = 32
    IMG_SIZE = 224

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

コード例 #18

0

ファイルを表示

from singa import opt

import numpy as np
from tqdm import trange

if __name__ == "__main__":
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd)

    from resnet import resnet50
    model = resnet50()

    if (sgd.rank_in_global == 0):
        print("Start intialization...........", flush=True)

    dev = device.create_cuda_gpu_on(sgd.rank_in_local)
    niters = 100
    batch_size = 32
    IMG_SIZE = 224

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    import time

    dev.Sync()

コード例 #19

0

ファイルを表示

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import unittest
import numpy as np
from singa import tensor
from singa import opt
from singa import device
from singa import singa_wrap

if (singa_wrap.USE_DIST):
    sgd = opt.SGD(lr=0.1)
    sgd = opt.DistOpt(sgd)
    dev = device.create_cuda_gpu_on(sgd.local_rank, set_default=False)
    param = tensor.Tensor((10, 10), dev, tensor.float32)
    grad = tensor.Tensor((10, 10), dev, tensor.float32)
    expected = np.ones((10, 10), dtype=np.float32) * (10 - 0.1)


@unittest.skipIf(not singa_wrap.USE_DIST, 'DIST is not enabled')
class TestDistOptimizer(unittest.TestCase):
    def test_dist_opt_fp32(self):
        # Test the C++ all reduce operation in fp32

        param.set_value(10)
        grad.set_value(1)

        sgd.all_reduce(grad.data)
        sgd.wait()