def worker(rank):
    import torch  # needed for torch.ones below
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = torch.ones([]) * int(current_rank())
    print(x)
    y = kf.ops.collective.all_reduce_fn(x)
    print(y)
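A minimal entry-point sketch for running the worker above. The script name and the exact launch command are assumptions; multi-peer launches are expected to go through the kungfu-run CLI.

# Hypothetical entry point for the worker above; launch is assumed to be:
#   kungfu-run -np 4 python3 worker.py
if __name__ == '__main__':
    from kungfu.python import current_rank
    worker(current_rank())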
def get_neighbour_mask(edges):
    """Compute a bool vector of neighbours for the current peer.

    For the peer of rank i, v[j] = true if (i, j) is an edge of the MST,
    otherwise v[j] = false.
    """
    return _op_lib.kungfu_get_neighbour_mask(
        edges,
        self_rank=current_rank(),
        cluster_size=current_cluster_size())
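A hedged usage sketch of the mask semantics. The edge encoding below is an assumption (the op's expected input format is not shown here); the mask values follow directly from the docstring.

# Assumed usage sketch; the [n-1, 2] int32 edge encoding is a guess.
# edges = tf.constant([[0, 1], [1, 2], [2, 3]], dtype=tf.int32)  # a path MST
# mask = get_neighbour_mask(edges)
# On the peer of rank 1 in a 4-peer cluster this yields
# [True, False, True, False]: (0, 1) and (1, 2) are the MST edges
# incident to rank 1.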
def test_broadcast():
    import tensorflow as tf
    from kungfu.python import current_rank
    from kungfu.tensorflow.ops import broadcast
    # Only rank 0 starts with True; broadcast copies rank 0's value to all peers.
    v = tf.Variable(current_rank() == 0)
    u = broadcast(v)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        x = sess.run(v)
        y = sess.run(u)
        # print(x, y)
        assert y == True
def test_all_gather(device='cpu'):
    import torch
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank
    rank = current_rank()
    x = torch.ones([2, 3]) * rank
    x = x.to(device)  # Tensor.to returns a copy; the result must be reassigned
    y = kf.ops.collective.all_gather(x)
    z = []
    np = current_cluster_size()
    for i in range(np):
        z.append(torch.ones([2, 3]) * i)
    z = torch.stack(z).to(device)  # keep the expected tensor on the same device
    assert z.equal(y)
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):
    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate the number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variables
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
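To make the interleaved sharding concrete, here is the arithmetic for a hypothetical 4-peer run on the 60,000-image MNIST training set (the numbers are illustrative, not from the source):

# Illustrative numbers only (assumes 60000 training points, 4 peers):
training_set_size = 60000
n_shards, batch_size = 4, 5000
shard_size = training_set_size // n_shards  # 15000 points per replica
step_per_epoch = shard_size // batch_size   # 3 steps per epoch
# Replica i reads batches at offsets 5000*i, 5000*i + 20000, ... so the
# four replicas cover disjoint, interleaved slices of the training set.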
def test_group_all_gather():
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather
    rank = current_rank()
    np = current_cluster_size()
    sizes = [i + 1 for i in range(5)]
    xs = [(rank + 1) * tf.Variable(tf.ones([n], tf.int32)) for n in sizes]
    ys = [all_gather(x) for x in xs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, y in enumerate(ys):
            v = sess.run(y)
            # Each peer contributes (rank + 1) per element, so the gathered
            # sum over i + 1 elements is (i + 1) * (1 + 2 + ... + np).
            assert v.sum() == (np + 1) * np / 2 * (i + 1)
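A worked check of that assertion, assuming a hypothetical 2-peer cluster:

# Worked check with np = 2: for i = 0, rank 0 contributes [1] and rank 1
# contributes [2], so the gathered tensor is [1, 2] and its sum is
# 3 == (2 + 1) * 2 / 2 * 1, matching the formula in the assert.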
def worker(rank):
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    y = all_reduce(x * rank)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(y)
        print('v=%s' % (v))
def test_consensus():
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus
    np = current_cluster_size()
    rank = current_rank()
    x = tf.Variable(rank, dtype=tf.int32)
    consensus_check = consensus(x)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(consensus_check)
        # Peers hold distinct values (their ranks), so consensus only
        # holds in a single-peer cluster.
        assert v == (np == 1)
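For contrast, a minimal sketch of the passing case, assuming consensus evaluates to true when every peer submits an equal value (as the test above implies):

def test_consensus_agree():
    import tensorflow as tf
    from kungfu.tensorflow.ops import consensus
    x = tf.Variable(42, dtype=tf.int32)  # identical on every peer
    check = consensus(x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        assert sess.run(check) == True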
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset for the data of this KungFu node;
    # each node owns one contiguous shard of the training set
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data shard owned by this KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[BroadcastGlobalVariablesCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
def main(_):
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesHook
    hooks = [
        BroadcastGlobalVariablesHook(),
        tf.train.LoggingTensorHook(['train_accuracy', 'train_loss'],
                                   every_n_iter=10),
    ]

    # KungFu: only rank 0 saves timed checkpoints (every 30s)
    from kungfu.python import current_rank
    save_checkpoints_secs = None if current_rank() != 0 else 30
    config = tf.estimator.RunConfig(
        save_checkpoints_secs=save_checkpoints_secs)

    mnist_classifier = tf.estimator.Estimator(model_fn=model_function,
                                              model_dir=FLAGS.model_dir,
                                              config=config)
    for _ in range(FLAGS.num_epochs):
        mnist_classifier.train(
            input_fn=train_data,
            hooks=hooks,
        )
    mnist_classifier.evaluate(input_fn=eval_data)
def parallel_train(train_model,
                   dataset,
                   config,
                   augmentor: BasicAugmentor,
                   preprocessor: BasicPreProcessor,
                   postprocessor: BasicPostProcessor,
                   visualizer=BasicVisualizer):
    '''Parallel train pipeline of OpenPose-class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpts in the directory ./save_dir/model_name/model_dir
    2. log loss information in the file ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in the directory
       ./save_dir/model_name/train_vis_dir
    The newest model is at the path ./save_dir/model_name/model_dir/newest_model.npz.

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user-defined model object, obtained by the Model.get_model() function
    dataset : dataset
        a constructed dataset object, obtained by the Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000,
        800000, 900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    # train configure
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    # domain adaptation params
    if not domainadapt_flag:
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptation in training enabled!")
        # weight param
        lambda_adapt = 1e-4
        # construct discriminator model
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn,
            num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    # load from ckpt
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except Exception:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except Exception:
        log("pretrained backbone doesn't exist, model backbone is initialized")
    # load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except Exception:
        log("model_path doesn't exist, model parameters are initialized")
    if domainadapt_flag:
        try:
            log("loading saved domain adaptation discriminator weights...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except Exception:
            log("discriminator path doesn't exist, discriminator parameters are initialized")

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    step = save_step.numpy()
    lr = save_lr.numpy()

    # import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                              SynchronousAveragingOptimizer,
                                              SynchronousSGDOptimizer)

    # KungFu: scale the schedule down by the cluster size so that the
    # same number of samples is processed globally
    total_step = total_step // current_cluster_size() + 1
    total_epoch = total_epoch // current_cluster_size() + 1
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = decay_step // current_cluster_size() + 1

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x,
                                              target_x=target_x,
                                              mask=mask,
                                              metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)
            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2
        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape
        del tape
        return predict_dst

    # formal training procedure
    # KungFu: wrap the base optimizer with the chosen distributed optimizer
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} "
        f"current_epoch:{cur_epoch} current_step:{step} batch_size:{batch_size} "
        f"lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "
        f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}")
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)

            # learning rate decay
            if step in lr_decay_steps:
                new_lr_decay = lr_decay_factor**(
                    lr_decay_steps.index(step) + 1)
                lr = lr_init * new_lr_decay

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)
            # optimize domain adaptation
            if domainadapt_flag:
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)

            # KungFu: broadcast the initial weights and optimizer state
            # from rank 0 after the first step
            if step == 1:
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodically
            if (step != 0) and (step % log_interval == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, "
                    f"Step={step} / {total_step}: learning_rate: {lr:.6e} "
                    f"{metric_manager.report_timing()}\n"
                    f"{metric_manager.report_train()} ")

            # visualize periodically
            if (step != 0) and (step % vis_interval == 0) \
                    and current_rank() == 0:
                log("Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(),
                                          mask_batch=mask.numpy(),
                                          predict_x=predict_x,
                                          target_x=target_x,
                                          name=f"train_{step}")

            # save result and ckpt periodically
            if (step != 0) and (step % save_interval == 0) \
                    and current_rank() == 0:
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if domainadapt_flag:
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
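The division of total_step and lr_decay_steps by the cluster size keeps the global amount of work roughly constant: with N peers each consuming batch_size samples per synchronous step, total_step // N steps per peer cover about as many samples as total_step single-peer steps. A worked sketch with illustrative numbers:

# Illustrative numbers only: scaling the schedule for a 4-peer cluster.
total_step = 1000000
cluster_size = 4
scaled_step = total_step // cluster_size + 1  # 250001 steps per peer
# Each synchronous step now consumes cluster_size * batch_size samples,
# so scaled_step steps process about the same data as the original schedule.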
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt, with_keras=True)
else:
    raise RuntimeError('unknown optimizer: %s' % args.kf_optimizer)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [BroadcastGlobalVariablesCallback(with_keras=True)]

# KungFu: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
if current_rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1 if current_rank() == 0 else 0,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
import argparse

import tensorflow as tf
from kungfu.python import current_cluster_size, current_rank, run_barrier
from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                          SynchronousAveragingOptimizer,
                                          SynchronousSGDOptimizer)
from kungfu.tensorflow.initializer import BroadcastGlobalVariablesCallback

parser = argparse.ArgumentParser(description='KungFu mnist example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
args = parser.parse_args()

# KungFu: use a rank-specific download path so peers don't clash
(x_train, y_train), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())

train_dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(x_train[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(y_train, tf.int64)))
train_dataset = train_dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax'),
])
def __init__(self, schedule):
    from kungfu.python import current_rank
    self._rank = current_rank()
    self._step = 0
    self._schedule = schedule
import argparse

import tensorflow as tf
from kungfu.python import current_cluster_size, current_rank
from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                          SynchronousAveragingOptimizer,
                                          SynchronousSGDOptimizer)

parser = argparse.ArgumentParser(description='KungFu mnist example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
args = parser.parse_args()

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax'),
])
def test_peer_info():
    from kungfu.python import current_cluster_size, current_rank
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
def local_next(self, bs):
    # fetch the next global batch, then take this peer's partition of it
    cur = self.global_next(bs)
    rank = kf.current_rank()
    size = kf.current_cluster_size()
    local = cur.partition(rank, size)
    return local
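An illustration of the assumed partition semantics (the partition method belongs to the batch object returned by global_next, which is not shown here, so the contiguous-slice behaviour below is a guess):

# Hypothetical: a global batch of 8 samples split across 4 peers, assuming
# partition(rank, size) returns the rank-th of size contiguous slices.
# global batch: [s0 s1 s2 s3 s4 s5 s6 s7]
# rank 0 -> [s0 s1], rank 1 -> [s2 s3], rank 2 -> [s4 s5], rank 3 -> [s6 s7]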
def worker(rank):
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))