        avg_loss = fluid.layers.mean(loss)
        return avg_loss


class TestMnist(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = MNIST()
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
        opt = paddle.optimizer.Adam(
            learning_rate=1e-3, parameters=model.parameters())
        return model, train_reader, opt

    def run_one_loop(self, model, opt, data):
        batch_size = len(data)
        dy_x_data = np.array([x[0].reshape(1, 28, 28)
                              for x in data]).astype('float32')
        y_data = np.array(
            [x[1] for x in data]).astype('int64').reshape(batch_size, 1)
        img = to_variable(dy_x_data)
        label = to_variable(y_data)
        label.stop_gradient = True

        avg_loss = model(img, label)
        return avg_loss


if __name__ == "__main__":
    runtime_main(TestMnist)
# global configs
# use a small `vocab_size` to test the case where the rows number exceeds the height
batch_size = 4
batch_num = 200
hidden_size = 10
vocab_size = 10
num_steps = 3
init_scale = 0.1


class TestSparseEmbeddingOverHeight(TestSparseEmbedding):
    def get_model(self):
        model = SimpleNet(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_steps=num_steps,
            init_scale=init_scale,
            is_sparse=True)
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = fluid.optimizer.SGD(
            learning_rate=0.001, parameter_list=model.parameters())
        return model, train_reader, optimizer


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingOverHeight)
def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


class TestSimpleNet(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = SimpleNet()
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.001, parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x for x in batch])
        x_data = x_data.reshape((-1, 10))
        x = paddle.to_tensor(x_data)
        out = model(x)
        loss = out.sum() / len(batch)
        return loss


if __name__ == "__main__":
    runtime_main(TestSimpleNet)
            vocab_size=vocab_size,
            num_steps=num_steps,
            init_scale=init_scale,
            is_sparse=True)
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.001, parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = paddle.to_tensor(x_data)
        y = paddle.to_tensor(y_data)

        dy_loss = model(x, y)
        return dy_loss


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingFP64)
"train") train_reader = get_batch_reader([train_file], batch_size) train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] return train_reader, train_feed class TestDistSimnetBow2x2(TestDistRunnerBase): def get_model(self, batch_size=2): # Train program avg_cost, acc, predict = \ train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])), bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) inference_program = fluid.default_main_program().clone() # Optimization opt = os.getenv('OPTIMIZER', 'sgd') opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader train_reader, _ = get_train_reader(batch_size) return inference_program, avg_cost, train_reader, train_reader, acc, predict if __name__ == "__main__": paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") runtime_main(TestDistSimnetBow2x2)
                momentum=0.9)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

    # execution
    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    dirname = "./ut_sharding_save_model"
    sharding.utils.save_persistables(
        exe, dirname, main_program=train_prog, filename=None)

    out_losses = []
    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))


if __name__ == "__main__":
    # NOTE(liangjianzhong): dist unittests should be implemented using runtime_main in
    # test_dist_base.py, but the runtime_main there uses fleet and DistributedStrategy from
    # paddle.fluid.incubate.fleet.collective, which is not supported by sharding
    # (paddle.distributed.fleet). This should be updated in the future.
    # runtime_main(TestDistMnist2x2)
    runtime_main()
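# For reference only (a sketch, not part of this test): runner classes that do use the
# shared harness in test_dist_base.py are launched by passing the class itself, e.g.
#
#     runtime_main(TestDistMnist2x2)
#
# The bare runtime_main() call above is deliberate: this sharding test builds and
# executes its program directly (see the NOTE), so no runner class is passed.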
        dict_size = len(word_dict)
        first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
        second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
        third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
        forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
        next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])

        inference_program = paddle.fluid.default_main_program().clone()

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
        sgd_optimizer.minimize(avg_cost)

        train_reader = paddle.batch(
            paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
        test_reader = paddle.batch(
            paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)

        return inference_program, avg_cost, train_reader, test_reader, None, predict_word


if __name__ == "__main__":
    import os
    os.environ['CPU_NUM'] = '1'
    os.environ['USE_CUDA'] = "FALSE"
    runtime_main(TestDistWord2vec2x2)
        bd = [step * e for e in epochs]
        base_lr = 0.1
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

        if not use_dgc:
            optimizer = fluid.optimizer.Momentum(
                learning_rate=fluid.layers.piecewise_decay(
                    boundaries=bd, values=lr),
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
        else:
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=fluid.layers.piecewise_decay(
                    boundaries=bd, values=lr),
                momentum=0.9,
                rampup_begin_step=0,
                regularization=fluid.regularizer.L2Decay(1e-4))
        optimizer.minimize(avg_cost)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)

        return test_program, avg_cost, train_reader, test_reader, acc_top1, out


if __name__ == "__main__":
    runtime_main(DistSeResneXt2x2)
            input=predict, label=label, total=batch_size_tensor)

        test_program = fluid.default_main_program().clone(for_test=True)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        if single_device:
            optimizer.minimize(avg_cost)
        else:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.without_graph_optimization = True
            strategy.fuse_all_reduce_ops = True
            strategy._calc_comm_same_stream = False
            strategy.fuse_grad_size_in_num = 8
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

        return test_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestFleetMetaOptimizerFuseAllReducePrecision)
class TestSeResNeXt(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = SeResNeXt()
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False),
            batch_size=train_parameters["batch_size"],
            drop_last=True)
        optimizer = optimizer_setting(
            train_parameters, parameter_list=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, opt, data):
        bs = len(data)
        dy_x_data = np.array([x[0].reshape(3, 224, 224)
                              for x in data]).astype('float32')
        dy_x_data = dy_x_data / 255.0
        y_data = np.array([x[1] for x in data]).astype('int64').reshape(bs, 1)
        img = to_variable(dy_x_data)
        label = to_variable(y_data)
        label.stop_gradient = True

        out = model(img)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
        avg_loss = fluid.layers.mean(x=loss)
        return avg_loss


if __name__ == "__main__":
    runtime_main(TestSeResNeXt)
class TestNoSyncUnusedParam(TestNoSync):
    def get_model(self):
        model = SimpleNetUnusedParam()
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.001, parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x for x in batch])
        x_data = x_data.reshape((-1, 10))
        x = paddle.to_tensor(x_data)
        out = model(x)
        loss = out.sum() / len(batch)
        return loss


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSyncUnusedParam)
        return y


class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = TestLayer(3, 64, 7)
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False),
            batch_size=32,
            drop_last=True)
        opt = fluid.optimizer.Adam(
            learning_rate=1e-3, parameter_list=model.parameters())
        return model, train_reader, opt

    def run_one_loop(self, model, opt, data):
        batch_size = len(data)
        dy_x_data = np.array([x[0].reshape(3, 224, 224)
                              for x in data]).astype('float32')
        img = to_variable(dy_x_data)
        img.stop_gradient = False

        out = model(img)
        out = fluid.layers.mean(out)
        return out


if __name__ == "__main__":
    runtime_main(TestSyncBatchNorm)
    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("train/pos/.*\.txt$"),
        re.compile("train/neg/.*\.txt$"), word_idx)


def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator; each sample in the reader is a zero-based
    ID sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile("test/pos/.*\.txt$"),
        re.compile("test/neg/.*\.txt$"), word_idx)


if __name__ == "__main__":
    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
    runtime_main(TestDistTextClassification2x2)
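# Usage sketch (an illustrative assumption, not part of this file): the reader creators
# above are typically wrapped with paddle.batch and iterated, e.g.
#
#     reader = paddle.batch(train(word_idx), batch_size=4)
#     for batch in reader():
#         for word_ids, label in batch:
#             pass  # word_ids: zero-based ID sequence, label: 0 or 1
#
# where word_idx is the word dictionary built from the downloaded vocabulary.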
            vocab_size=vocab_size,
            num_steps=num_steps,
            init_scale=init_scale,
            is_sparse=True)
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = fluid.optimizer.SGD(
            learning_rate=0.001, parameter_list=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = to_variable(x_data)
        y = to_variable(y_data)

        dy_loss = model(x, y)
        return dy_loss


if __name__ == "__main__":
    runtime_main(TestSparseEmbedding)
        # Evaluator
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size_tensor)

        inference_program = fluid.default_main_program().clone()

        # Optimization
        # TODO(typhoonzero): fix distributed adam optimizer
        # opt = fluid.optimizer.AdamOptimizer(
        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
        if not use_dgc:
            opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
        else:
            opt = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=self.lr, momentum=0.9, rampup_begin_step=0)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        opt.minimize(avg_cost)
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestDistMnist2x2)
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                print_to_err(
                    type(self).__name__,
                    "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())
                model.clear_gradients()
        print_to_out(out_losses)
        return out_losses


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSyncControlFlow)
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                if args.update_method == "nccl2":
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                out_losses.append(loss.numpy())
                model.clear_gradients()
        return out_losses


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSync)
            param_attr=fluid.ParamAttr(
                name="wide_embedding",
                initializer=fluid.initializer.Constant(value=0.01)),
            is_sparse=IS_SPARSE)
        lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")

        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)

        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
        acc = fluid.layers.accuracy(input=predict, label=label)
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(
            input=predict, label=label)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

        inference_program = paddle.fluid.default_main_program().clone()
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
        sgd_optimizer.minimize(avg_cost)

        dataset = dist_ctr_reader.Dataset()
        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)

        return inference_program, avg_cost, train_reader, test_reader, None, predict


if __name__ == "__main__":
    runtime_main(TestDistCTR2x2)
            vocab_size=vocab_size,
            num_steps=num_steps,
            init_scale=init_scale,
            is_sparse=False)
        train_reader = paddle.batch(
            fake_sample_reader(), batch_size=batch_size, drop_last=True)
        optimizer = paddle.optimizer.SGD(
            learning_rate=0.001, parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = paddle.to_tensor(x_data)
        y = paddle.to_tensor(y_data)

        dy_loss = model(x, y)
        return dy_loss["loss"]


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingUnusedVars)
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size_tensor)

        test_program = fluid.default_main_program().clone(for_test=True)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        if single_device:
            optimizer.minimize(avg_cost)
        else:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.without_graph_optimization = True
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

        return test_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestFleetMetaOptimizerPrecision)
            strategy.tensor_parallel = True
            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}

        rank = fleet.worker_index() if dist_strategy else None
        avg_cost = create_model(data_in, rank)
        opt = fluid.optimizer.SGD(0.1)

        if dist_strategy:
            dist_opt = fleet.distributed_optimizer(
                optimizer=opt, strategy=strategy)
            dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        def gen_data():
            np.random.seed(2021)
            while True:
                data = [np.random.random([IN_SIZE]).astype(DTYPE)]
                yield data

        train_reader = paddle.batch(gen_data, batch_size=batch_size)

        if dist_strategy:
            return None, avg_cost, train_reader, None, None, None, data_loader
        else:
            return None, avg_cost, train_reader, None, None, None


if __name__ == "__main__":
    runtime_main(TestModelParallel)
            ModelHyperParams.weight_sharing,
            TrainTaskConfig.label_smooth_eps,
            is_sparse=True)
        train_reader = paddle.batch(fake_data_reader(),
                                    TrainTaskConfig.batch_size)
        if naive_optimize:
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.001, parameter_list=model.parameters())
        else:
            optimizer = fluid.optimizer.Adam(
                learning_rate=NoamDecay(ModelHyperParams.d_model,
                                        TrainTaskConfig.warmup_steps,
                                        TrainTaskConfig.learning_rate),
                beta1=TrainTaskConfig.beta1,
                beta2=TrainTaskConfig.beta2,
                epsilon=TrainTaskConfig.eps,
                parameter_list=model.parameters())

        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        enc_inputs, dec_inputs, label, weights = np_to_variable(batch)

        dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = model(
            enc_inputs, dec_inputs, label, weights)

        return dy_avg_cost


if __name__ == "__main__":
    runtime_main(TestTransformer)
                sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
        elif save_mode == "DIST":
            skip_steps = int(os.getenv("SKIP_STEPS"))
            loss = None
            if need_save:
                for idx in six.moves.xrange(8):
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(get_data()))
                    if need_save and model_dir and idx == skip_steps and args.trainer_id == 0:
                        io.save_persistables(startup_exe, model_dir,
                                             trainer_prog)
            else:
                for idx in six.moves.xrange(8):
                    data = get_data()
                    if idx <= skip_steps:
                        continue
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(data))
            if six.PY2:
                print(pickle.dumps(loss.tolist()))
            else:
                sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
        else:
            raise Exception("save_mode must be LOCAL or DIST")


if __name__ == "__main__":
    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
    runtime_main(TestDistSaveLoad2x2)
k_steps=strategy.gradient_merge_configs["k_steps"], avg=strategy.gradient_merge_configs["avg"]) world_size = 1 else: optimizer = fleet.distributed_optimizer(optimizer) world_size = fleet.world_size() optimizer.minimize(cost) if world_size > 1: assert paddle.static.default_main_program().num_blocks == 2 gm_block = paddle.static.default_main_program().block(1) start_allreduce_idx = None for i, op in enumerate(gm_block.ops): if op.type == "c_allreduce_sum": start_allreduce_idx = i break # the magic number 1 below means skip the c_sync_calc_stream op if avg: assert start_allreduce_idx > 1 else: assert start_allreduce_idx == 1 train_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=batch_size) test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=batch_size) return test_program, cost, train_reader, test_reader, acc, predict if __name__ == "__main__": runtime_main(TestDistMnistGradientMergeRawOptimizer)