Example #1
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function
import random, sys

import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision

import numpy as np

# Create a distributed key-value store
store = kv.create('dist')

# Classify the images into one of the 10 classes
num_outputs = 10

# 64 images in a batch
batch_size_per_gpu = 64
# How many epochs to run the training
epochs = 5

# How many GPUs per machine
gpus_per_machine = 4
# Effective batch size across all GPUs
batch_size = batch_size_per_gpu * gpus_per_machine

# Create the context (a list of all GPUs to be used for training)
Example #2

class SplitSampler(gluon.data.sampler.Sampler):
    """Split the dataset into `num_parts` parts and sample from the part
    with index `part_index`."""

    def __init__(self, length, num_parts=1, part_index=0):
        # Compute the length of each partition
        self.part_len = length // num_parts
        # Compute the start index for this partition
        self.start = self.part_len * part_index
        # Compute the end index for this partition
        self.end = self.start + self.part_len

    def __iter__(self):
        # Extract examples between `start` and `end`, shuffle and return them.
        indices = list(range(self.start, self.end))
        random.shuffle(indices)
        return iter(indices)

    def __len__(self):
        return self.part_len
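
# A minimal sketch of how the sampler above partitions a dataset, assuming a
# toy run with 10 examples split across 2 workers: worker 1 draws only the
# last half of the indices, in shuffled order.
sampler = SplitSampler(10, num_parts=2, part_index=1)
print(len(sampler))     # 5
print(sorted(sampler))  # [5, 6, 7, 8, 9]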


# Use Horovod as the KVStore
store = kv.create('horovod')

# Get the number of workers
num_workers = store.num_workers

# Create the context based on the local rank of the current process
ctx = mx.cpu(store.local_rank) if args.no_cuda else mx.gpu(store.local_rank)
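
# Note: local_rank selects the device on this machine, while the global rank
# (store.rank, used for the data sampler below) selects this worker's shard.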

# Load the training data
train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=True,
                                   transform=transform), args.batch_size,
                                   sampler=SplitSampler(50000,
                                                        num_workers,
                                                        store.rank))

# Load the test data
Example #3
def train(hyperparameters, input_data_config, channel_input_dirs,
          output_data_dir, model_dir, num_gpus, num_cpus, hosts, current_host,
          **kwargs):
    """
    [Required]

    Runs Apache MXNet training. Amazon SageMaker calls this function with information
    about the training environment. When called, if this function returns an
    object, that object is passed to a save function.  The save function
    can be used to serialize the model to the Amazon SageMaker training job model
    directory.

    The **kwargs parameter can be used to absorb any Amazon SageMaker parameters that
    your training job doesn't need to use. For example, if your training job
    doesn't need to know anything about the training environment, your function
    signature can be as simple as train(**kwargs).

    Amazon SageMaker invokes your train function with the following Python kwargs:

    Args:
        - hyperparameters: The Amazon SageMaker Hyperparameters dictionary. A dict
            of string to string.
        - input_data_config: The Amazon SageMaker input channel configuration for
            this job.
        - channel_input_dirs: A dict of string-to-string maps from the
            Amazon SageMaker algorithm input channel name to the directory containing
            files for that input channel. Note, if the Amazon SageMaker training job
            is run in PIPE mode, this dictionary will be empty.
        - output_data_dir: The Amazon SageMaker output data directory. After
            the function returns, data written to this directory is made
            available in the Amazon SageMaker training job output location.
        - model_dir: The Amazon SageMaker model directory. After the function
            returns, data written to this directory is made available to the
            Amazon SageMaker training job model location.
        - num_gpus: The number of GPU devices available on the host this script
            is being executed on.
        - num_cpus: The number of CPU devices available on the host this script
            is being executed on.
        - hosts: A list of hostnames in the Amazon SageMaker training job cluster.
        - current_host: This host's name. It will exist in the hosts list.
        - kwargs: Other keyword args.

    Returns:
        - (object): Optional. An Apache MXNet model to be passed to the model
            save function. If you do not return anything (or return None),
            the save function is not called.
    """

    train_file_path = get_file_path(channel_input_dirs['train'], current_host,
                                    hosts)
    print('Train file path {}'.format(train_file_path))
    test_file_path = get_first_file_path_in_dir(channel_input_dirs['test'])
    print('Test file path {}'.format(test_file_path))
    ts_data_train = load_file(train_file_path, hyperparameters)
    ts_data_test = load_file(test_file_path, hyperparameters)

    ctx = [mx.cpu(i) for i in range(num_cpus)]
    if num_gpus > 0:
        ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('Running on {}'.format(ctx))
    print('Hosts {}'.format(hosts))
    print('Current Host {}'.format(current_host))

    net = LSTNet(num_series=ts_data_train.num_series,
                 conv_hid=hyperparameters['conv_hid'],
                 gru_hid=hyperparameters['gru_hid'],
                 skip_gru_hid=hyperparameters['skip_gru_hid'],
                 skip=hyperparameters['skip'],
                 ar_window=hyperparameters['ar_window'])

    net.initialize(init=mx.init.Xavier(factor_type="in", magnitude=2.34),
                   ctx=ctx)

    if len(hosts) == 1:
        kvstore = 'device' if num_gpus > 0 else 'local'
    else:
        kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync'
    print('kvstore {}'.format(kvstore))
    store = kv.create(kvstore)
    trainer = gluon.Trainer(net.collect_params(),
                            kvstore=store,
                            optimizer='adam',
                            optimizer_params={
                                'learning_rate':
                                hyperparameters['learning_rate'],
                                'clip_gradient':
                                hyperparameters['clip_gradient']
                            })

    batch_size = hyperparameters['batch_size']
    train_data_loader = gluon.data.DataLoader(ts_data_train.train,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=16,
                                              last_batch='discard')
    test_data_loader = gluon.data.DataLoader(ts_data_test.train,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=16,
                                             last_batch='discard')

    epochs = hyperparameters['epochs']
    print("Training Start")
    metric = mx.metric.RMSE()
    tic = time.time()
    for e in range(epochs):
        metric.reset()
        epoch_start_time = time.time()
        for data, label in train_data_loader:
            batch_forward_backward(data, label, ctx, net, trainer, batch_size,
                                   metric)
        name, value = metric.get()
        print("Epoch {}: {} {} time {:.4f} s".format(
            e, name, value,
            time.time() - epoch_start_time))

    # Calculate the test RMSE when training has finished
    validate(test_data_loader, metric, ctx, net)

    print("Total training time: {}".format(time.time() - tic))

    if not os.path.exists(output_data_dir):
        os.makedirs(output_data_dir)
    net.save_params(os.path.join(output_data_dir, 'lstnet_params.params'))
    print("Training End")
    return
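
# As the docstring above notes, a job that ignores the training environment
# can shrink its signature down to the arguments it actually reads. A minimal
# sketch, assuming a hypothetical build_and_fit helper that trains and
# returns a Gluon model:

def train(hyperparameters, **kwargs):
    # **kwargs absorbs every Amazon SageMaker argument this job does not use.
    net = build_and_fit(hyperparameters)  # hypothetical helper, not defined here
    # Returning the model (rather than None) triggers the save function.
    return net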
Example #4
import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision

import numpy as np
import socket
import os
import sys

if len(sys.argv) < 4:
    sys.exit("Usage: <script> <kvstore-type> <batch-size> <learning-rate>")

# Create a distributed key-value store
storetype = sys.argv[1]  # 'dist_sync', 'dist_async', 'local'
store = kv.create(storetype)

# Classify the images into one of the 10 digits
num_outputs = 10

# How many epochs to run the training
epochs = 100

# Effective batch size across all GPUs
batch_size = int(sys.argv[2])
learning_rate = float(sys.argv[3])

# Create the context (a list of all GPUs to be used for training)
#ctx = [mx.gpu(i) for i in range(gpus_per_machine)]
ctx = [mx.cpu()]
Example #5
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function
import random, sys

import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision

import numpy as np

# Create a distributed key-value store
store = kv.create('dist_device_sync')

# Classify the images into one of the 10 classes
num_outputs = 10

# 64 images in a batch
batch_size_per_gpu = 64
# How many epochs to run the training
epochs = 5

# How many GPUs per machine
gpus_per_machine = 4
# Effective batch size across all GPUs
batch_size = batch_size_per_gpu * gpus_per_machine

# Create the context (a list of all GPUs to be used for training)
Example #6
import mxnet as mx
from mxnet import kv, nd, gpu

if __name__ == "__main__":
    # MXNet provides a key-value store to synchronize data among devices.
    # The following code initializes an ndarray associated with the key "params" on a key-value store.
    print('running kvstore local')
    kv_local = kv.create("local")
    SHAPE = (2, 3)
    x = nd.random.uniform(shape=SHAPE)
    kv_local.init("params", x)
    print('=== init "params" ===\n{}'.format(x))
    # After initialization, we can pull the value to multiple devices.
    NUM_GPUS = 2
    ctx = [gpu(i) for i in range(NUM_GPUS)]
    y = [nd.zeros(shape=SHAPE, ctx=c) for c in ctx]
    kv_local.pull("params", out=y)
    print('=== pull "params" to {} ===\n{}'.format(ctx, y))
    # We can also push new data value into the store.
    # It will first sum the data on the same key and then overwrite the current value.
    z = [nd.ones(shape=SHAPE, ctx=c) for c in ctx]
    kv_local.push("params", z)
    print('=== push to "params" ===\n{}'.format(z))
    kv_local.pull("params", out=y)
    print('=== pull "params" ===\n{}'.format(y))

    # With push and pull we can define the allreduce function by
    # def allreduce(data, data_name, store):
    #     store.push(data_name, data)
    #     store.pull(data_name, out=data)
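
    # The commented definition above runs as written; a minimal sketch
    # exercising it on CPU contexts (no GPUs required), under a fresh key:
    def allreduce(data, data_name, store):
        # push sums the per-device arrays under the key; pull broadcasts the
        # aggregated value back into each array.
        store.push(data_name, data)
        store.pull(data_name, out=data)

    cpu_ctx = [mx.cpu(0), mx.cpu(1)]
    kv_local.init("grads", nd.zeros(SHAPE))
    grads = [nd.ones(shape=SHAPE, ctx=c) for c in cpu_ctx]
    allreduce(grads, "grads", kv_local)
    print('=== allreduce "grads" ===\n{}'.format(grads))  # each entry: all 2s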
Example #7
    parser.add_argument("-g", "--gpu", type=int, default=0)
    parser.add_argument("-c", "--cpu", type=int, default=0)
    parser.add_argument("-d", "--distributed", type=int, default=1)
    parser.add_argument("-m", "--mode", type=str, default="dist_async")
    args, unknown = parser.parse_known_args()

    crop_size = (320, 480)
    num_classes = 21
    ctx = [mx.cpu()] if args.cpu else [mx.gpu(args.gpu)]

    colormap2label = nd.zeros(256**3)
    for i, colormap in enumerate(VOC_COLORMAP):
        colormap2label[(colormap[0] * 256 + colormap[1]) * 256 +
                       colormap[2]] = i
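    # For example, assuming the standard VOC ordering where VOC_COLORMAP[1] is
    # [128, 0, 0] (aeroplane), pixel color (128, 0, 0) is encoded as
    # (128 * 256 + 0) * 256 + 0 = 8388608, so colormap2label[8388608] = 1.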

    kvstore = kv.create(args.mode) if args.distributed else "device"
    num_workers = 0 if sys.platform.startswith("win32") else 4
    voc_train = VOCSegDataset(True, crop_size,
                              "/home/lizh/learn-gluon/data/VOC2012",
                              colormap2label)
    voc_test = VOCSegDataset(False, crop_size,
                             "/home/lizh/learn-gluon/data/VOC2012",
                             colormap2label)
    train_iter = gdata.DataLoader(voc_train,
                                  args.batch_size,
                                  shuffle=True,
                                  last_batch="discard",
                                  num_workers=num_workers)
    test_iter = gdata.DataLoader(voc_test,
                                 args.batch_size,
                                 last_batch="discard",
Example #8
                    type=str,
                    default='resnet',
                    help='model_name:cnn, mlp, resnet (default: resnet)')
parser.add_argument('--gpu_i', type=int, default=1, help='gpu_i:(default: 1)')
parser.add_argument('--cpu', type=int, default=0, help='cpu:(default: 0)')
opt = parser.parse_args()

_print(
    'pid {}, dist {}, epochs {}, gpus_per_machine {}, model_name {}, gpu_i {}, cpu {}'
    .format(str(pid), str(opt.dist), str(opt.epochs),
            str(opt.gpus_per_machine), str(opt.model_name), str(opt.gpu_i),
            str(opt.cpu)))

# Create a distributed key-value store
store = kv.create(
    opt.dist
)  # Note: you can control the sync and async here (https://mxnet.incubator.apache.org/api/python/kvstore/kvstore.html)

if store.rank == 0:
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

# Classify the images into one of the 10 digits
num_outputs = 10

# 64 images in a batch
batch_size_per_gpu = 256
# How many epochs to run the training
epochs = opt.epochs
Example #9
# specific language governing permissions and limitations
# under the License.

"""cifar10_dist.py contains code that trains a ResNet18 network using distributed training"""

from __future__ import print_function

import sys
import random
import numpy as np
import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision

# Create a distributed key-value store
store = kv.create('dist')

# Classify the images into one of the 10 classes
num_outputs = 10

# 64 images in a batch
batch_size_per_gpu = 64
# How many epochs to run the training
epochs = 5

# How many GPUs per machine
gpus_per_machine = 4
# Effective batch size across all GPUs
batch_size = batch_size_per_gpu * gpus_per_machine

# Create the context (a list of all GPUs to be used for training)
Example #10
    def run(self):
        # hyper parameters
        epochs = 1
        batch_size = 1000
        sparse_feature_number = 1000001
        sparse_feature_dim = 10
        dense_feature_dim = 13
        num_field = 26
        layer_sizes = [400, 400, 400]
        train_data_path = "./train_data"
        print_step = 5
        distributed_train = False
        cpu_num = int(os.getenv("CPU_NUM", 1))

        # create network
        ctx = mx.cpu()
        net = CtrDnn(sparse_feature_number, sparse_feature_dim,
                     dense_feature_dim, num_field, layer_sizes)
        net.initialize(ctx=ctx)
        # net.hybridize()

        self.loss = gluon.loss.SoftmaxCrossEntropyLoss()

        if distributed_train:
            self.store = kv.create('dist_async')
        else:
            self.store = kv.create('local')

        # Load the training data
        reader_start_time = time.time()

        file_list = self.get_file_list(train_data_path, distributed_train)
        reader = Reader()
        dataset = reader.load_criteo_dataset(file_list)
        train_data = gluon.data.DataLoader(dataset,
                                           batch_size,
                                           num_workers=cpu_num,
                                           last_batch="discard")
        reader_end_time = time.time()
        logger.info("Load Data in memory finish, using time: {}".format(
            reader_end_time - reader_start_time))

        if distributed_train:
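            # update_on_kvstore=True applies the optimizer update on the
            # distributed store (the parameter servers) rather than on each
            # worker.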
            trainer = gluon.Trainer(net.collect_params(),
                                    'adam', {
                                        'learning_rate': 0.0001,
                                        'lazy_update': True
                                    },
                                    kvstore=self.store,
                                    update_on_kvstore=True)
        else:
            trainer = gluon.Trainer(net.collect_params(),
                                    'adam', {'learning_rate': 0.0001},
                                    kvstore=self.store)

        for epoch in range(epochs):
            logger.info("Epoch {} training begin".format(epoch))
            epoch_start_time = time.time()

            batch_id = 1
            train_run_cost = 0.0
            total_examples = 0
            self.global_score = None
            self.global_label = None

            for batch in train_data:
                train_start = time.time()
                loss_value = self.train_batch(batch, ctx, net, trainer)

                train_run_cost += (time.time() - train_start)
                total_examples += batch_size

                batch_id += 1
                if batch_id % print_step == 0:
                    metric_start = time.time()
                    fpr, tpr, _ = metrics.roc_curve(
                        list(self.global_label.asnumpy()),
                        list(self.global_score.asnumpy()))
                    auc_value = metrics.auc(fpr, tpr)
                    train_run_cost += (time.time() - metric_start)

                    metrics_string = "auc: {}, loss: {}".format(
                        auc_value, loss_value)
                    profiler_string = ""
                    profiler_string += "using_time: {} sec ".format(
                        train_run_cost)
                    profiler_string += "avg_batch_cost: {} sec, ".format(
                        format((train_run_cost) / print_step, '.5f'))
                    profiler_string += " ips: {} example/sec ".format(
                        format(total_examples / (train_run_cost), '.5f'))
                    logger.info("Epoch: {}, Batch: {}, {} {}".format(
                        epoch, batch_id, metrics_string, profiler_string))
                    train_run_cost = 0.0
                    total_examples = 0

            epoch_end_time = time.time()
            logger.info("Epoch: {}, using time {} second,".format(
                epoch, epoch_end_time - epoch_start_time))
Example #11
parser.add_argument('--synthesize',
                    type=int,
                    default=0,
                    help="use synthesize data or not")
parser.add_argument(
    '--log-interval',
    type=int,
    default=1,
    help='number of batches to wait before logging training status')

if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    store = kv.create('local_allreduce_device')
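    # Note: 'local_allreduce_device' is a single-machine store that aggregates
    # gradients directly on the GPUs rather than staging through CPU memory.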
    # arg parser
    args = parser.parse_args()
    logging.info(args)
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    optimizer = args.optimizer
    log_interval = args.log_interval
    lr = args.lr
    ctx = [mx.gpu(i) for i in range(args.gpu_num)]

    # synthesized dataset
    if args.synthesize:
        data_dir = os.path.join(os.getcwd(), 'data')
        train_data = os.path.join(data_dir, ADULT['train'])
        val_data = os.path.join(data_dir, ADULT['test'])