# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from __future__ import print_function import random, sys import mxnet as mx from mxnet import autograd, gluon, kv, nd from mxnet.gluon.model_zoo import vision import numpy as np # Create a distributed key-value store store = kv.create('dist') # Clasify the images into one of the 10 digits num_outputs = 10 # 64 images in a batch batch_size_per_gpu = 64 # How many epochs to run the training epochs = 5 # How many GPUs per machine gpus_per_machine = 4 # Effective batch size across all GPUs batch_size = batch_size_per_gpu * gpus_per_machine # Create the context (a list of all GPUs to be used for training)
class SplitSampler(gluon.data.sampler.Sampler):
    """Split a dataset of `length` examples into `num_parts` parts and sample
    from the part with index `part_index`."""

    def __init__(self, length, num_parts=1, part_index=0):
        # Compute the length of each partition
        self.part_len = length // num_parts
        # Compute the start index for this partition
        self.start = self.part_len * part_index
        # Compute the end index for this partition
        self.end = self.start + self.part_len

    def __iter__(self):
        # Extract examples between `start` and `end`, shuffle and return them
        indices = list(range(self.start, self.end))
        random.shuffle(indices)
        return iter(indices)

    def __len__(self):
        return self.part_len

# Use Horovod as the KVStore
store = kv.create('horovod')

# Get the number of workers
num_workers = store.num_workers

# Create the context based on the local rank of the current process
ctx = mx.cpu(store.local_rank) if args.no_cuda else mx.gpu(store.local_rank)

# Load the training data
train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=True, transform=transform),
    args.batch_size,
    sampler=SplitSampler(50000, num_workers, store.rank))

# Load the test data
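# A quick illustration of SplitSampler's partitioning: worker 0 of 2 sees a
# shuffled version of the first half of the indices, worker 1 the second half.
# The printed order varies because of the shuffle.
sampler = SplitSampler(10, num_parts=2, part_index=0)
print(list(sampler))  # a permutation of range(0, 5)
sampler = SplitSampler(10, num_parts=2, part_index=1)
print(list(sampler))  # a permutation of range(5, 10)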
def train(hyperparameters, input_data_config, channel_input_dirs, output_data_dir,
          model_dir, num_gpus, num_cpus, hosts, current_host, **kwargs):
    """
    [Required] Runs Apache MXNet training. Amazon SageMaker calls this function
    with information about the training environment. When called, if this
    function returns an object, that object is passed to a save function. The
    save function can be used to serialize the model to the Amazon SageMaker
    training job model directory.

    The **kwargs parameter can be used to absorb any Amazon SageMaker
    parameters that your training job doesn't need to use. For example, if your
    training job doesn't need to know anything about the training environment,
    your function signature can be as simple as train(**kwargs).

    Amazon SageMaker invokes your train function with the following python
    kwargs:

    Args:
        - hyperparameters: The Amazon SageMaker Hyperparameters dictionary. A
            dict of string to string.
        - input_data_config: The Amazon SageMaker input channel configuration
            for this job.
        - channel_input_dirs: A dict of string-to-string maps from the Amazon
            SageMaker algorithm input channel name to the directory containing
            files for that input channel. Note, if the Amazon SageMaker
            training job is run in PIPE mode, this dictionary will be empty.
        - output_data_dir: The Amazon SageMaker output data directory. After
            the function returns, data written to this directory is made
            available in the Amazon SageMaker training job output location.
        - model_dir: The Amazon SageMaker model directory. After the function
            returns, data written to this directory is made available to the
            Amazon SageMaker training job model location.
        - num_gpus: The number of GPU devices available on the host this
            script is being executed on.
        - num_cpus: The number of CPU devices available on the host this
            script is being executed on.
        - hosts: A list of hostnames in the Amazon SageMaker training job
            cluster.
        - current_host: This host's name. It will exist in the hosts list.
        - kwargs: Other keyword args.

    Returns:
        - (object): Optional. An Apache MXNet model to be passed to the model
            save function. If you do not return anything (or return None), the
            save function is not called.
""" train_file_path = get_file_path(channel_input_dirs['train'], current_host, hosts) print('Train file path {}'.format(train_file_path)) test_file_path = get_first_file_path_in_dir(channel_input_dirs['test']) print('Test file path {}'.format(test_file_path)) ts_data_train = load_file(train_file_path, hyperparameters) ts_data_test = load_file(test_file_path, hyperparameters) ctx = [mx.cpu(i) for i in range(num_cpus)] if num_gpus > 0: ctx = ctx = [mx.gpu(i) for i in range(num_gpus)] print('Running on {}'.format(ctx)) print('Hosts {}'.format(hosts)) print('Current Host {}'.format(current_host)) net = LSTNet(num_series=ts_data_train.num_series, conv_hid=hyperparameters['conv_hid'], gru_hid=hyperparameters['gru_hid'], skip_gru_hid=hyperparameters['skip_gru_hid'], skip=hyperparameters['skip'], ar_window=hyperparameters['ar_window']) net.initialize(init=mx.init.Xavier(factor_type="in", magnitude=2.34), ctx=ctx) kvstore = 'local' if len(hosts) == 1: kvstore = 'device' if num_gpus > 0 else 'local' else: kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync' print('kvstore {}'.format(kvstore)) store = kv.create(kvstore) trainer = gluon.Trainer(net.collect_params(), kvstore=store, optimizer='adam', optimizer_params={ 'learning_rate': hyperparameters['learning_rate'], 'clip_gradient': hyperparameters['clip_gradient'] }) batch_size = hyperparameters['batch_size'] train_data_loader = gluon.data.DataLoader(ts_data_train.train, batch_size=batch_size, shuffle=True, num_workers=16, last_batch='discard') test_data_loader = gluon.data.DataLoader(ts_data_test.train, batch_size=batch_size, shuffle=True, num_workers=16, last_batch='discard') epochs = hyperparameters['epochs'] print("Training Start") metric = mx.metric.RMSE() tic = time.time() for e in range(epochs): metric.reset() epoch_start_time = time.time() for data, label in train_data_loader: batch_forward_backward(data, label, ctx, net, trainer, batch_size, metric) name, value = metric.get() print("Epoch {}: {} {} time {:.4f} s".format( e, name, value, time.time() - epoch_start_time)) # Calculate the test RMSE when training has finished validate(train_data_loader, metric, ctx, net) print("Total training time: {}".format(time.time() - tic)) if not os.path.exists(output_data_dir): os.makedirs(output_data_dir) net.save_params(os.path.join(output_data_dir, 'lstnet_params.params')) print("Training End") return
import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision
import numpy as np
import socket
import os
import sys

if len(sys.argv) < 4:
    sys.exit("Usage: {} <kvstore_type> <batch_size> <learning_rate>".format(sys.argv[0]))

# Create a distributed key-value store
storetype = str(sys.argv[1])  # 'dist_sync', 'dist_async', 'local'
store = kv.create(storetype)

# Classify the images into one of the 10 digits
num_outputs = 10

# How many epochs to run the training
epochs = 100

# Effective batch size across all GPUs
batch_size = int(sys.argv[2])
learning_rate = float(sys.argv[3])

# Create the context (a list of all GPUs to be used for training)
# ctx = [mx.gpu(i) for i in range(gpus_per_machine)]
ctx = [mx.cpu()]
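# A hedged sketch of how the store, batch_size, and learning_rate read from
# sys.argv above typically feed into a Gluon Trainer; the resnet18_v1 model
# choice is illustrative, not taken from the original script.
net = vision.resnet18_v1(classes=num_outputs)
net.initialize(mx.init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': learning_rate},
                        kvstore=store)  # gradients sync through the chosen store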
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from __future__ import print_function import random, sys import mxnet as mx from mxnet import autograd, gluon, kv, nd from mxnet.gluon.model_zoo import vision import numpy as np # Create a distributed key-value store store = kv.create('dist_device_sync') # Clasify the images into one of the 10 digits num_outputs = 10 # 64 images in a batch batch_size_per_gpu = 64 # How many epochs to run the training epochs = 5 # How many GPUs per machine gpus_per_machine = 4 # Effective batch size across all GPUs batch_size = batch_size_per_gpu * gpus_per_machine # Create the context (a list of all GPUs to be used for training)
import mxnet as mx
from mxnet import kv, nd, gpu

if __name__ == "__main__":
    # MXNet provides a key-value store to synchronize data among devices.
    # The following code initializes an ndarray associated with the key
    # "params" on a key-value store.
    print('running kvstore local')
    kv_local = kv.create("local")
    SHAPE = (2, 3)
    x = nd.random.uniform(shape=SHAPE)
    kv_local.init("params", x)
    print('=== init "params" ==={}'.format(x))

    # After initialization, we can pull the value to multiple devices.
    NUM_GPUS = 2
    ctx = [gpu(i) for i in range(NUM_GPUS)]
    y = [nd.zeros(shape=SHAPE, ctx=c) for c in ctx]
    kv_local.pull("params", out=y)
    print('=== pull "params" to {} ===\n{}'.format(ctx, y))

    # We can also push new data values into the store. The store first sums
    # the values pushed to the same key, then overwrites the current value.
    z = [nd.ones(shape=SHAPE, ctx=c) for c in ctx]
    kv_local.push("params", z)
    print('=== push to "params" ===\n{}'.format(z))
    kv_local.pull("params", out=y)
    print('=== pull "params" ===\n{}'.format(y))

    # With push and pull we can define an allreduce function:
    # def allreduce(data, data_name, store):
    #     store.push(data_name, data)
    #     store.pull(data_name, out=data)
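# A minimal, runnable sketch of the allreduce idea outlined in the comments
# above, run on CPU for portability; the "grad" key and the `grads` buffers
# are illustrative, not part of the original script.
import mxnet as mx
from mxnet import kv, nd

def allreduce(data, data_name, store):
    # Push the per-device values (the store sums values pushed to one key),
    # then pull the aggregated result back into the same buffers.
    store.push(data_name, data)
    store.pull(data_name, out=data)

store = kv.create("local")
store.init("grad", nd.zeros((2, 3)))
# Two "device" copies of a gradient, kept on CPU here
grads = [nd.ones((2, 3)), nd.ones((2, 3)) * 2]
allreduce(grads, "grad", store)
print(grads)  # both entries now hold the sum (all values equal 3)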
parser.add_argument("-g", "--gpu", type=int, default=0) parser.add_argument("-c", "--cpu", type=int, default=0) parser.add_argument("-d", "--distributed", type=int, default=1) parser.add_argument("-m", "--mode", type=str, default="dist_async") args, unknown = parser.parse_known_args() crop_size = (320, 480) num_classes = 21 ctx = mx.cpu() if args.cpu else [mx.gpu(args.gpu)] colormap2label = nd.zeros(256**3) for i, colormap in enumerate(VOC_COLORMAP): colormap2label[(colormap[0] * 256 + colormap[1]) * 256 + colormap[2]] = i kvstore = kv.create(args.mode) if args.distributed else "device" num_workers = 0 if sys.platform.startswith("win32") else 4 voc_train = VOCSegDataset(True, crop_size, "/home/lizh/learn-gluon/data/VOC2012", colormap2label) voc_test = VOCSegDataset(False, crop_size, "/home/lizh/learn-gluon/data/VOC2012", colormap2label) train_iter = gdata.DataLoader(voc_train, args.batch_size, shuffle=True, last_batch="discard", num_workers=num_workers) test_iter = gdata.DataLoader(voc_test, args.batch_size, last_batch="discard",
                    type=str,
                    default='resnet',
                    help='model_name:cnn, mlp, resnet (default: resnet)')
parser.add_argument('--gpu_i', type=int, default=1, help='gpu_i:(default: 1)')
parser.add_argument('--cpu', type=int, default=0, help='cpu:(default: 0)')
opt = parser.parse_args()

_print('pid {}, dist {}, epochs {}, gpus_per_machine {}, model_name {}, gpu_i {}, cpu {}'
       .format(str(pid), str(opt.dist), str(opt.epochs), str(opt.gpus_per_machine),
               str(opt.model_name), str(opt.gpu_i), str(opt.cpu)))

# Create a distributed key-value store.
# Note: you can control sync vs. async training here, see
# https://mxnet.incubator.apache.org/api/python/kvstore/kvstore.html
store = kv.create(opt.dist)

# Assign a different pair of GPUs to each worker rank
if store.rank == 0:
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

# Classify the images into one of the 10 digits
num_outputs = 10

# 256 images in a batch
batch_size_per_gpu = 256

# How many epochs to run the training
epochs = opt.epochs
# specific language governing permissions and limitations
# under the License.

"""cifar10_dist.py contains code that trains a ResNet18 network using distributed training"""

from __future__ import print_function

import sys
import random

import numpy as np
import mxnet as mx
from mxnet import autograd, gluon, kv, nd
from mxnet.gluon.model_zoo import vision

# Create a distributed key-value store
store = kv.create('dist')

# Classify the images into one of the 10 digits
num_outputs = 10

# 64 images in a batch
batch_size_per_gpu = 64

# How many epochs to run the training
epochs = 5

# How many GPUs per machine
gpus_per_machine = 4

# Effective batch size across all GPUs
batch_size = batch_size_per_gpu * gpus_per_machine

# Create the context (a list of all GPUs to be used for training)
ctx = [mx.gpu(i) for i in range(gpus_per_machine)]
def run(self):
    # Hyperparameters
    epochs = 1
    batch_size = 1000
    sparse_feature_number = 1000001
    sparse_feature_dim = 10
    dense_feature_dim = 13
    num_field = 26
    layer_sizes = [400, 400, 400]
    train_data_path = "./train_data"
    print_step = 5
    distributed_train = False
    cpu_num = int(os.getenv("CPU_NUM", 1))

    # Create the network
    ctx = mx.cpu()
    net = CtrDnn(sparse_feature_number, sparse_feature_dim, dense_feature_dim,
                 num_field, layer_sizes)
    net.initialize(ctx=ctx)
    # net.hybridize()

    self.loss = gluon.loss.SoftmaxCrossEntropyLoss()

    if distributed_train:
        self.store = kv.create('dist_async')
    else:
        self.store = kv.create('local')

    # Load the training data
    reader_start_time = time.time()
    file_list = self.get_file_list(train_data_path, distributed_train)
    reader = Reader()
    dataset = reader.load_criteo_dataset(file_list)
    train_data = gluon.data.DataLoader(dataset,
                                       batch_size,
                                       num_workers=cpu_num,
                                       last_batch="discard")
    reader_end_time = time.time()
    logger.info("Load Data in memory finish, using time: {}".format(
        reader_end_time - reader_start_time))

    if distributed_train:
        trainer = gluon.Trainer(net.collect_params(), 'adam', {
            'learning_rate': 0.0001,
            'lazy_update': True
        },
                                kvstore=self.store,
                                update_on_kvstore=True)
    else:
        trainer = gluon.Trainer(net.collect_params(), 'adam',
                                {'learning_rate': 0.0001},
                                kvstore=self.store)

    for epoch in range(epochs):
        logger.info("Epoch {} training begin".format(epoch))
        epoch_start_time = time.time()

        batch_id = 1
        train_run_cost = 0.0
        total_examples = 0
        self.global_score = None
        self.global_label = None
        for batch in train_data:
            train_start = time.time()
            loss_value = self.train_batch(batch, ctx, net, trainer)
            train_run_cost += (time.time() - train_start)
            total_examples += batch_size

            batch_id += 1
            if batch_id % print_step == 0:
                metric_start = time.time()
                fpr, tpr, _ = metrics.roc_curve(
                    list(self.global_label.asnumpy()),
                    list(self.global_score.asnumpy()))
                auc_value = metrics.auc(fpr, tpr)
                train_run_cost += (time.time() - metric_start)

                metrics_string = "auc: {}, loss: {}".format(auc_value, loss_value)
                profiler_string = ""
                profiler_string += "using_time: {} sec ".format(train_run_cost)
                profiler_string += "avg_batch_cost: {} sec, ".format(
                    format(train_run_cost / print_step, '.5f'))
                profiler_string += " ips: {} example/sec ".format(
                    format(total_examples / train_run_cost, '.5f'))
                logger.info("Epoch: {}, Batch: {}, {} {}".format(
                    epoch, batch_id, metrics_string, profiler_string))
                train_run_cost = 0.0
                total_examples = 0

        epoch_end_time = time.time()
        logger.info("Epoch: {}, using time {} second,".format(
            epoch, epoch_end_time - epoch_start_time))
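# `train_batch` is referenced in `run` above but not shown. The following is a
# hedged sketch of what such a helper typically does (forward pass, loss,
# backward, optimizer step, and accumulating scores/labels for the running
# AUC). The (sparse, dense, label) batch layout and the CtrDnn call signature
# are assumptions, not taken from the original code; assumes autograd and nd
# are imported from mxnet.
def train_batch(self, batch, ctx, net, trainer):
    sparse, dense, label = batch  # assumed batch layout
    sparse = sparse.as_in_context(ctx)
    dense = dense.as_in_context(ctx)
    label = label.as_in_context(ctx)
    with autograd.record():
        output = net(sparse, dense)
        loss = self.loss(output, label)
    loss.backward()
    trainer.step(label.shape[0])

    # Keep scores/labels around so `run` can compute a running AUC
    score = nd.softmax(output)[:, 1]
    if self.global_score is None:
        self.global_score, self.global_label = score, label
    else:
        self.global_score = nd.concat(self.global_score, score, dim=0)
        self.global_label = nd.concat(self.global_label, label, dim=0)
    return loss.mean().asscalar()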
parser.add_argument('--synthesize', type=int, default=0,
                    help="use synthesized data or not")
parser.add_argument('--log-interval', type=int, default=1,
                    help='number of batches to wait before logging training status')

if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    store = kv.create('local_allreduce_device')

    # Parse the command-line arguments
    args = parser.parse_args()
    logging.info(args)
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    optimizer = args.optimizer
    log_interval = args.log_interval
    lr = args.lr
    ctx = [mx.gpu(i) for i in range(args.gpu_num)]

    # Synthesized dataset
    if args.synthesize:
        data_dir = os.path.join(os.getcwd(), 'data')
        train_data = os.path.join(data_dir, ADULT['train'])
        val_data = os.path.join(data_dir, ADULT['test'])
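    # A hedged sketch of how the store created above is typically consumed:
    # pass it as the `kvstore` argument of `Module.fit` so gradient
    # aggregation across the GPUs in `ctx` runs on the device-side allreduce
    # store. `net_symbol`, `train_iter`, and `val_iter` are illustrative
    # placeholders, not from the original file.
    mod = mx.mod.Module(symbol=net_symbol, context=ctx)
    mod.fit(train_iter,
            eval_data=val_iter,
            kvstore=store,  # the 'local_allreduce_device' store
            optimizer=optimizer,
            optimizer_params={'learning_rate': lr},
            batch_end_callback=mx.callback.Speedometer(batch_size, log_interval),
            num_epoch=num_epoch)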