def main(hparams):
    dataset, tokenizer = get_dataset(hparams)
    model = transformer(hparams)
    optimizer = tf.keras.optimizers.Adam(
        CustomSchedule(hparams), beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    def loss_function(y_true, y_pred):
        y_true = tf.reshape(y_true, shape=(-1, hparams.max_length - 1))
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        loss = tf.multiply(loss, mask)
        return tf.reduce_mean(loss)

    def accuracy(y_true, y_pred):
        y_true = tf.reshape(y_true, shape=(-1, hparams.max_length - 1))
        return tf.metrics.SparseCategoricalAccuracy()(y_true, y_pred)

    model.compile(optimizer, loss=loss_function, metrics=[accuracy])
    model.fit(dataset, epochs=hparams.epochs)
    evaluate(hparams, model, tokenizer)
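
# CustomSchedule is called above but not defined in this snippet. A minimal sketch,
# assuming it implements the warmup learning-rate curve from "Attention Is All You Need"
# (lr = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)) and reads d_model
# from the hparams it is given; the attribute names and the warmup default are assumptions.
import tensorflow as tf

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, hparams, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(hparams.d_model, tf.float32)  # assumed attribute name
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)                  # 1 / sqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)   # linear warmup term
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)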
def model_fn(inp, tgt, mems, is_training):
    inp = tf.transpose(inp, [1, 0])
    tgt = tf.transpose(tgt, [1, 0])

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(
            stddev=FLAGS.init_std, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if FLAGS.proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    loss, new_mems = model.transformer(
        dec_inp=inp, target=tgt, mems=mems, n_token=n_token,
        n_layer=FLAGS.n_layer, d_model=FLAGS.d_model, d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout, dropatt=FLAGS.dropatt,
        initializer=initializer, proj_initializer=proj_initializer,
        is_training=is_training, mem_len=FLAGS.mem_len, cutoffs=cutoffs,
        div_val=FLAGS.div_val, tie_projs=tie_projs, input_perms=None,
        target_perms=None, head_target=None, same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len, untie_r=FLAGS.untie_r,
        proj_same_dim=FLAGS.proj_same_dim)

    # number of parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    if is_training:
        all_vars = tf.trainable_variables()
        return loss, new_mems, all_vars
    else:
        return loss, new_mems
def train(inputs, outputs, pre_train=False):
    tf.keras.backend.clear_session()
    dataset, VOCAB_SIZE, _ = get_dataset(inputs, outputs)

    if pre_train:
        model = tf.keras.models.load_model(config.MODEL_PATH)
    else:
        model = transformer(vocab_size=VOCAB_SIZE,
                            num_layers=config.NUM_LAYERS,
                            units=config.UNITS,
                            d_model=config.D_MODEL,
                            num_heads=config.NUM_HEADS,
                            dropout=config.DROPOUT)

    learning_rate = model.CustomSchedule(config.D_MODEL)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
                                         beta_2=0.98, epsilon=1e-9)

    model.compile(optimizer=optimizer, loss=model.loss_function,
                  metrics=[model.accuracy])
    model.fit(dataset, epochs=config.EPOCHS)
    model.save(config.MODEL_PATH)
def model_fn(inp, tgt, mems, is_training):
    inp = tf.transpose(inp, [1, 0])
    tgt = tf.transpose(tgt, [1, 0])

    initializer = tf.initializers.random_uniform(
        minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if FLAGS.proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    loss, new_mems, outputs = model.transformer(
        dec_inp=inp, target=tgt, mems=mems, n_token=n_token,
        n_layer=FLAGS.n_layer, d_model=FLAGS.d_model, d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout, dropatt=FLAGS.dropatt,
        initializer=initializer, proj_initializer=proj_initializer,
        is_training=is_training, mem_len=FLAGS.mem_len, cutoffs=cutoffs,
        div_val=FLAGS.div_val, tie_projs=tie_projs, input_perms=None,
        target_perms=None, head_target=None, same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len, use_tpu=False, untie_r=FLAGS.untie_r,
        proj_same_dim=FLAGS.proj_same_dim, return_outputs=True)

    if is_training:
        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)
        grads_and_vars = list(zip(grads, all_vars))
        return loss, new_mems, grads_and_vars

    return loss, new_mems, outputs
def main(params):
    print("\n ...loading dataset\n")
    dataset, test_dataset, tokenizer, meta = get_dataset(
        params.max_samples, params.max_length, params.batch_size,
        validation_split=params.validation_split)

    print("\n ...creating model\n")
    model = transformer(params.d_model, meta['vocab_size'], params.num_layers,
                        params.num_heads, params.dff, params.rate)

    # saving model without compilation
    model.save('model_untrained.h5')

    optimizer = tf.keras.optimizers.Adam(CustomSchedule(params.d_model),
                                         beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    def loss_function(y_true, y_pred):
        y_true = tf.reshape(y_true, shape=(-1, params.max_length - 1))
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        loss = tf.multiply(loss, mask)
        return tf.reduce_mean(loss)

    def accuracy(y_true, y_pred):
        y_true = tf.reshape(y_true, shape=(-1, params.max_length - 1))
        return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

    print("\n ...training model\n")
    model.compile(optimizer, loss=loss_function, metrics=[accuracy])
    history = model.fit(dataset, epochs=params.epochs,
                        validation_data=test_dataset)

    print("\nSaving model weights, tokenizer and meta data\n")
    model.save('model_trained.h5')
    tokenizer.save_to_file('tokenizer')
    model.save_weights('model_weights.h5')

    # saving history and meta using pickle
    save_pickle(meta, 'meta')
    save_pickle(history.history, 'history')

    evaluate(model, tokenizer, meta)
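
# save_pickle is used above but not shown. A minimal sketch, assuming it simply
# serializes an object to '<name>.pickle' with pickle; the file-naming convention
# matches the 'meta.pickle' file loaded by the Gradio demo further below, but is
# still an assumption about the original helper.
import pickle

def save_pickle(obj, name):
    # write obj to <name>.pickle in the current working directory
    with open(name + '.pickle', 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)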
def main(args):
    # get datasets
    dataset = data.get_dataset(args.dataset, args.split,
                               image_size=args.image_size,
                               data_dir=args.data_dir, is_training=True)
    im_x = preprocess(dataset.x, args.preprocessing_a,
                      image_size=args.image_size,
                      output_channels=args.num_channels)
    im_y = preprocess(dataset.y, args.preprocessing_b,
                      image_size=args.image_size)
    im_batch_x, im_batch_y = data.create_batch([im_x, im_y],
                                               batch_size=args.batch_size,
                                               shuffle=args.shuffle,
                                               queue_size=2, min_queue_size=1)

    # build models
    transformed_x = model.transformer(im_batch_x,
                                      output_channels=dataset.num_classes,
                                      output_fn=None, scope='model/AtoB')
    transformed_y = model.transformer(im_batch_y,
                                      output_channels=args.num_channels,
                                      scope='model/BtoA')
    cycled_x = model.transformer(tf.nn.softmax(transformed_x),
                                 output_channels=args.num_channels,
                                 scope='model/BtoA', reuse=True)
    cycled_y = model.transformer(transformed_y,
                                 output_channels=dataset.num_classes,
                                 output_fn=None, scope='model/AtoB', reuse=True)

    # create loss functions
    cycle_loss_x = tf.losses.absolute_difference(im_batch_x, cycled_x,
                                                 scope='cycle_loss_x')
    cycle_loss_y = tf.losses.softmax_cross_entropy(im_batch_y, cycled_y,
                                                   scope='cycle_loss_y')
    transform_loss_xy = tf.losses.absolute_difference(
        im_batch_x, transformed_y, scope='transform_loss_xy')
    transform_loss_yx = tf.losses.softmax_cross_entropy(
        im_batch_y, transformed_x, scope='transform_loss_yx')
    total_loss = cycle_loss_x + cycle_loss_y + transform_loss_xy + transform_loss_yx

    optimizer = tf.train.AdamOptimizer(args.learning_rate, args.beta1,
                                       args.beta2, args.epsilon)

    inc_global_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, inc_global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_tensor = optimizer.minimize(total_loss)

    # Set up train op to return loss
    with tf.control_dependencies([train_tensor]):
        train_op = tf.identity(total_loss, name='train_op')

    # set up logging
    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
    # Add summaries for variables.
    for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    color_map = np.array(
        list(map(lambda x: x.color,
                 labels[:dataset.num_classes]))).astype(np.float32)
    segmentation_y = postprocess(tf.argmax(im_batch_y, -1),
                                 'segmentation_to_rgb',
                                 dataset.num_classes, color_map)
    segmentation_transformed_x = postprocess(tf.argmax(transformed_x, -1),
                                             'segmentation_to_rgb',
                                             dataset.num_classes, color_map)
    segmentation_cycled_y = postprocess(tf.argmax(cycled_y, -1),
                                        'segmentation_to_rgb',
                                        dataset.num_classes, color_map)

    summaries.add(tf.summary.image('x', im_batch_x))
    summaries.add(tf.summary.image('y', segmentation_y))
    summaries.add(tf.summary.image('transformed_x', segmentation_transformed_x))
    summaries.add(tf.summary.image('transformed_y', transformed_y))
    summaries.add(tf.summary.image('cycled_x', cycled_x))
    summaries.add(tf.summary.image('cycled_y', segmentation_cycled_y))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # create train loop
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='model'))
    checkpoint_path = os.path.join(args.output_dir, 'model.ckpt')
    writer = tf.summary.FileWriter(args.output_dir)

    with tf.Session() as sess:
        # Tensorflow initializations
        sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
        tf.train.start_queue_runners(sess=sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        last_log_time = 0
        last_save_time = 0
        for i in tqdm(range(args.num_batches)):
            if last_log_time < time.time() - args.log_every_n_seconds:
                last_log_time = time.time()
                summary, loss_val, global_step = sess.run(
                    [summary_op, train_op, tf.train.get_global_step()])
                writer.add_summary(summary, global_step)
                writer.flush()
            else:
                loss_val, global_step = sess.run(
                    [train_op, tf.train.get_global_step()])

            if last_save_time < time.time() - args.save_every_n_seconds:
                last_save_time = time.time()
                saver.save(sess, checkpoint_path, global_step=global_step)

        saver.save(sess, checkpoint_path, global_step=args.num_batches)
def model_fn(inpN, inpT, tgtN, tgtT, mems, is_training):
    inpN = tf.transpose(inpN, [1, 0])
    inpT = tf.transpose(inpT, [1, 0])
    tgtN = tf.transpose(tgtN, [1, 0])
    tgtT = tf.transpose(tgtT, [1, 0])

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(
            stddev=FLAGS.init_std, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if FLAGS.proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    lossN, lossT, new_mems, predictionN, predictionT = model.transformer(
        inpN=inpN, inpT=inpT, targetsN=tgtN, targetsT=tgtT, mems=mems,
        n_token_N=n_token_N, n_token_T=n_token_T, n_layer=FLAGS.n_layer,
        d_model_N=FLAGS.d_model_N, d_model_T=FLAGS.d_model_T,
        d_embed_N=FLAGS.d_embed_N, d_embed_T=FLAGS.d_embed_T,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout, dropatt=FLAGS.dropatt,
        initializer=initializer, proj_initializer=proj_initializer,
        is_training=is_training, mem_len=FLAGS.mem_len, cutoffs=cutoffs,
        div_val=FLAGS.div_val, tie_projs=tie_projs, input_perms=None,
        target_perms=None, head_target=None, same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len, use_tpu=False, untie_r=FLAGS.untie_r,
        proj_same_dim=FLAGS.proj_same_dim)

    # number of parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    loss = tf.multiply(alpha, lossN) + tf.multiply(1 - alpha, lossT)

    # format_str = '{{:<{0}s}}\t{{}}'.format(
    #     max([len(v.name) for v in tf.trainable_variables()]))
    # for v in tf.trainable_variables():
    #     tf.logging.info(format_str.format(v.name, v.get_shape()))

    if is_training:
        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)
        grads_and_vars = list(zip(grads, all_vars))
        return lossN, lossT, loss, new_mems, grads_and_vars, predictionN, predictionT
    else:
        return lossN, lossT, loss, new_mems, predictionN, predictionT
def do_training(self, fleet, args):
    """
    begin training.
    Args:
        fleet (Collective): Collective inherited base class Fleet
        args (ArgumentParser): run args to config dist fleet.
    Returns:
        tuple: the value is train losses
    """
    args = parse_args()
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1
    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    args.run_params = json.loads(args.run_params)

    dist_strategy = DistributedStrategy()
    dist_strategy.enable_inplace = args.run_params['enable_inplace']
    dist_strategy.fuse_all_reduce_ops = args.run_params['fuse_all_reduce_ops']
    dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
    dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
    dist_strategy.mode = args.run_params["mode"]
    dist_strategy.collective_mode = args.run_params["collective"]

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer, ModelHyperParams.n_head,
                ModelHyperParams.d_key, ModelHyperParams.d_value,
                ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)
            optimizer = fluid.optimizer.SGD(0.003)
            if args.run_params["fp16"]:
                optimizer = decorate(optimizer, init_loss_scaling=64.0)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    exe.run(startup_program)

    train_data = prepare_data_generator(
        args, is_test=False, count=dev_count, pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper)

    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) * np.log(
            (1. - TrainTaskConfig.label_smooth_eps)) +
        TrainTaskConfig.label_smooth_eps *
        np.log(TrainTaskConfig.label_smooth_eps /
               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    step_idx = 0
    init_flag = True
    result_loss = []
    result_ppl = []
    train_info = []

    for pass_id in six.moves.xrange(args.num_epochs):
        pass_start_time = time.time()
        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = train_data()

        batch_id = 0
        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(
                    data_generator, init_flag, dev_count)
                t1 = time.time()
                outs = exe.run(
                    program=train_program,
                    fetch_list=[sum_cost.name, token_num.name]
                    if step_idx % args.fetch_steps == 0 else [],
                    feed=feed_dict_list)

                if step_idx % args.fetch_steps == 0:
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num
                    result_loss.append(total_avg_cost - loss_normalizer)
                    result_ppl.append(np.exp([min(total_avg_cost, 100)]).item(0))
                    train_info.append(result_loss)

                init_flag = False
                batch_id += 1
                step_idx += 1
                if batch_id >= 5:
                    break
            except (StopIteration, fluid.core.EOFException):
                if args.use_py_reader:
                    pyreader.reset()
                break

    train_info = [round(i, 6) for i in train_info[0]]
    return train_info
def train(args):
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = get_device_num()
        # place = fluid.CUDAPlace(0)
        # dev_count = fluid.core.get_cuda_device_count()

    update_lr(TrainTaskConfig)

    exe = fluid.Executor(place)

    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    if args.enable_ce:
        train_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer, ModelHyperParams.n_head,
                ModelHyperParams.d_key, ModelHyperParams.d_value,
                ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                logging.info("before adam")
                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)
            optimizer.minimize(avg_cost)

    if args.use_mem_opt:
        pass
        # fluid.memory_optimize(train_prog)

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                   avg_cost, token_num, predict, pyreader)
    else:
        print("This script cannot run in distributed mode.")
        sys.exit(0)

        # The code below is unreachable after the early exit above.
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            port = os.getenv("PADDLE_PORT")
            worker_ips = os.getenv("PADDLE_TRAINERS")
            worker_endpoints = []
            for ip in worker_ips.split(","):
                worker_endpoints.append(':'.join([ip, port]))
            trainers_num = len(worker_endpoints)
            current_endpoint = os.getenv("POD_IP") + ":" + port
            if trainer_id == 0:
                logging.info("train_id == 0, sleep 60s")
                time.sleep(60)
            logging.info("trainers_num:{}".format(trainers_num))
            logging.info("worker_endpoints:{}".format(worker_endpoints))
            logging.info("current_endpoint:{}".format(current_endpoint))
            append_nccl2_prepare(startup_prog, trainer_id, worker_endpoints,
                                 current_endpoint)
            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader, trainers_num,
                       trainer_id)
            return

        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        logging.info("pserver_endpoints:{}".format(pserver_endpoints))
        logging.info("current_endpoint:{}".format(current_endpoint))
        logging.info("trainer_id:{}".format(trainer_id))
        logging.info("pserver_ips:{}".format(pserver_ips))
        logging.info("port:{}".format(port))

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers,
                    program=train_prog, startup_program=startup_prog)

        if training_role == "PSERVER":
            logging.info("distributed: pserver started")
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv("PADDLE_PORT")
            if not current_endpoint:
                logging.critical("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            logging.info("distributed: trainer started")
            trainer_prog = t.get_trainer_program()
            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader)
        else:
            logging.critical(
                "environment var TRAINER_ROLE should be TRAINER or PSERVER")
            exit(1)
def main(args):
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    train_prog.random_seed = 1000
    startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer, ModelHyperParams.n_head,
                ModelHyperParams.d_key, ModelHyperParams.d_value,
                ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False)

            lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
            optimizer = fluid.optimizer.Adam(
                learning_rate=lr_decay * TrainTaskConfig.learning_rate,
                beta1=TrainTaskConfig.beta1,
                beta2=TrainTaskConfig.beta2,
                epsilon=TrainTaskConfig.eps)
            optimizer.minimize(avg_cost)

    if args.use_mem_opt:
        fluid.memory_optimize(train_prog)

    if TrainTaskConfig.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
    else:
        exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_iteration_per_drop_scope = 5

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.Customized

    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=avg_cost.name,
                                       main_program=train_prog,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) * np.log(
            (1. - TrainTaskConfig.label_smooth_eps)) +
        TrainTaskConfig.label_smooth_eps *
        np.log(TrainTaskConfig.label_smooth_eps /
               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    train_data = prepare_data_generator(args, is_test=False, count=dev_count,
                                        pyreader=pyreader)
    if args.use_py_reader:
        pyreader.start()
        data_generator = None
    else:
        data_generator = train_data()

    def run(iter_num):
        reader_time = []
        run_time = []

        for step_idx in six.moves.xrange(iter_num):
            try:
                start_time = time.time()
                feed_dict_list = prepare_feed_dict_list(data_generator,
                                                        init_flag, dev_count)
                end_time = time.time()
                reader_time.append(end_time - start_time)

                start_time = time.time()
                if args.use_parallel_exe:
                    outs = train_exe.run(
                        fetch_list=[sum_cost.name, token_num.name],
                        feed=feed_dict_list)
                else:
                    outs = exe.run(
                        program=train_prog,
                        fetch_list=[sum_cost.name, token_num.name],
                        feed=feed_dict_list[0]
                        if feed_dict_list is not None else None)
                end_time = time.time()
                run_time.append(end_time - start_time)

                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                # sum the cost from multi-devices
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                print("step_idx: %d, avg loss: %f, "
                      "normalized loss: %f, ppl: %f" %
                      (step_idx, total_avg_cost,
                       total_avg_cost - loss_normalizer,
                       np.exp([min(total_avg_cost, 100)])))
            except (StopIteration, fluid.core.EOFException):
                # The current pass is over.
                if args.use_py_reader:
                    pyreader.reset()
                    pyreader.start()

        return reader_time, run_time

    @contextlib.contextmanager
    def profile_context(profile=True):
        if profile:
            with profiler.profiler('All', 'total', '/tmp/profile_file'):
                yield
        else:
            yield

    # start-up
    init_flag = True
    run(5)
    init_flag = False

    # profiling
    start = time.time()
    # currently only support profiling on one device
    with profile_context(args.profile_ops):
        reader_time, run_time = run(args.iter_num)
    end = time.time()
    total_time = end - start
    print("Total time: {0}, reader time: {1} s, run time: {2} s, step number: {3}".
          format(total_time, np.sum(reader_time), np.sum(run_time),
                 args.iter_num))
import gradio as gr
import tensorflow as tf
import tensorflow_datasets as tfds

from model import transformer
from main import predict
import pickle

with open('pretrained_weights/meta.pickle', 'rb') as handle:
    meta = pickle.load(handle)

tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(
    'pretrained_weights/tokenizer')

model = transformer(d_model=256, vocab_size=meta['vocab_size'],
                    num_layers=4, num_heads=8, dff=1024)
model.load_weights('pretrained_weights/transformer_weights.h5')


def chatbot(sentence):
    print('\nQ:', sentence)
    result = predict(model, tokenizer, sentence, meta)
    print('\nA:', result)
    return result


gr.Interface(chatbot, inputs="text", outputs="text").launch(share=True)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

tf.keras.backend.clear_session()

# Hyper-parameters
NUM_LAYERS = 4
D_MODEL = 312
NUM_HEADS = 8
UNITS = 768
DROPOUT = 0.2

model = transformer(vocab_size=VOCAB_SIZE,
                    num_layers=NUM_LAYERS,
                    units=UNITS,
                    d_model=D_MODEL,
                    num_heads=NUM_HEADS,
                    dropout=DROPOUT)


def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, preprocessor.MAX_LENGTH - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    return tf.reduce_mean(loss)
def main():
    """ model train """
    is_local = os.getenv("PADDLE_IS_LOCAL", "0")
    if is_local == '0':
        args.local = False
    else:
        args.local = True

    # init
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    warmup_steps = get_var("warmup_steps", value=TrainTaskConfig.warmup_steps)
    d_model = get_var("d_model", value=ModelHyperParams.d_model)

    lr_decay = fluid.layers\
        .learning_rate_scheduler\
        .noam_decay(d_model, warmup_steps)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimize_ops, params_grads = optimizer.minimize(
        avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    # Program to do validation.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program([avg_cost])

    def test(exe):
        test_total_cost = 0
        test_total_token = 0
        for batch_id, data in enumerate(test_reader()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names +
                decoder_input_data_names[:-1] + label_data_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            test_sum_cost, test_token_num = exe.run(
                inference_program, feed=data_input,
                fetch_list=[sum_cost, token_num], use_program_cache=True)
            test_total_cost += test_sum_cost
            test_total_token += test_token_num
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    def train_loop(exe, trainer_prog):
        for pass_id in xrange(args.pass_num):
            ts = time.time()
            total = 0
            pass_start_time = time.time()
            # print len(train_reader)
            for batch_id, data in enumerate(train_reader):
                # print len(data)
                if len(data) != args.batch_size:
                    continue
                total += len(data)
                start_time = time.time()
                data_input = prepare_batch_input(
                    data, encoder_input_data_names +
                    decoder_input_data_names[:-1] + label_data_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                outs = exe.run(trainer_prog, feed=data_input,
                               fetch_list=[sum_cost, avg_cost],
                               use_program_cache=True)
                sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
                print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, "
                      "ppl: %f, speed: %.2f" %
                      (pass_id, batch_id, sum_cost_val, avg_cost_val,
                       np.exp([min(avg_cost_val[0], 100)]),
                       len(data) / (time.time() - start_time)))
                if args.test_save:
                    if batch_id == args.exit_batch_id:
                        print("batch_id: %d exit!" % batch_id)
                        break

            # Validate and save the model for inference.
            # val_avg_cost, val_ppl = test(exe)
            val_avg_cost, val_ppl = 0, 0
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            print("pass_id = %s time_consumed = %s val_avg_cost=%f val_ppl=%f "
                  "speed: %.2f" % (str(pass_id), str(time_consumed),
                                   val_avg_cost, val_ppl,
                                   total / (time.time() - ts)))
            fluid.io.save_inference_model(
                os.path.join(args.model_path,
                             "pass_" + str(pass_id) + "_" +
                             str(args.task_index) + ".infer.model"),
                encoder_input_data_names + decoder_input_data_names[:-1],
                [predict], exe)
            if args.test_save:
                break

    if args.local:
        # Initialize the parameters.
        print("local start_up:")
        exe.run(fluid.framework.default_startup_program())
        # print(debuger.pprint_program_codes(fluid.framework.default_startup_program()))
        for pos_enc_param_name in pos_enc_param_names:
            # print("pos_enc_param_name:", pos_enc_param_name)
            pos_enc_param = fluid.global_scope().find_var(
                pos_enc_param_name).get_tensor()
            pos_enc_param.set(
                position_encoding_init(ModelHyperParams.max_length + 1,
                                       ModelHyperParams.d_model), place)

        # print "./nist06n/data-%d/part-*" % (args.task_index),
        train_reader = data_util.DataLoader(
            src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
            trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
            fpattern="./train/*",
            batch_size=args.batch_size,
            token_batch_size=TrainTaskConfig.token_batch_size,
            sort_by_length=TrainTaskConfig.sort_by_length,
            shuffle=True)

        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv("PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            if args.save_graph:
                block_no = 0
                for t in pserver_startup.blocks:
                    block_name = "pserver_startup_block_%04d" % block_no
                    print(block_name)
                    print(debuger.draw_block_graphviz(
                        t, path="./" + block_name + ".dot"))
                    block_no += 1

                block_no = 0
                for t in pserver_prog.blocks:
                    block_name = "pserver_prog_block_%04d" % block_no
                    print(debuger.draw_block_graphviz(
                        t, path="./" + block_name + ".dot"))
                    block_no += 1

            print("begin run")
            exe.run(pserver_startup)  # , save_program_to_file="./pserver_startup.desc")
            exe.run(pserver_prog)  # , save_program_to_file="./pserver_loop.desc")
        elif training_role == "TRAINER":
            # Parameter initialization
            exe.run(fluid.default_startup_program())
            # print("cluster start_up:")
            for pos_enc_param_name in pos_enc_param_names:
                # print("pos_enc_param_name:", pos_enc_param_name)
                pos_enc_param = fluid.global_scope().find_var(
                    pos_enc_param_name).get_tensor()
                pos_enc_param.set(
                    position_encoding_init(ModelHyperParams.max_length + 1,
                                           ModelHyperParams.d_model), place)

            train_reader = data_util.DataLoader(
                src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
                trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
                fpattern="./train/part-*",
                batch_size=args.batch_size,
                token_batch_size=TrainTaskConfig.token_batch_size,
                sort_by_length=TrainTaskConfig.sort_by_length,
                shuffle=True)

            trainer_prog = t.get_trainer_program()
            train_loop(exe, trainer_prog)
        else:
            print("environment var TRAINER_ROLE should be TRAINER or PSERVER")
def train(args):
    """train start"""
    logging.info(args)
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1
    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # For Distributed Training.
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    dist_strategy = DistributedStrategy()

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer, ModelHyperParams.n_head,
                ModelHyperParams.d_key, ModelHyperParams.d_value,
                ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)

            if args.use_fp16:
                optimizer = decorate(optimizer,
                                     init_loss_scaling=args.loss_scaling)

            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    orig_train_program = fleet._origin_program

    train_loop(args, exe, train_program, orig_train_program, startup_program,
               dev_count, sum_cost, avg_cost, token_num, predict, pyreader)
def get_logits(input_ids, mems, input_mask, target_mask):
    """Builds the graph for calculating the final logits"""
    is_training = False

    cutoffs = []
    train_bin_sizes = []
    eval_bin_sizes = []
    proj_share_all_but_first = True
    n_token = FLAGS.n_token

    batch_size = FLAGS.batch_size

    features = {"input": input_ids}
    inp = tf.transpose(features["input"], [1, 0])
    input_mask = tf.transpose(input_mask, [1, 0])
    target_mask = tf.transpose(target_mask, [1, 0])
    tgt = None

    inp_perms, tgt_perms, head_tgt = None, None, None

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(
            stddev=FLAGS.init_std, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    tf.logging.info("Vocab size : {}".format(n_token))
    tf.logging.info("Batch size : {}".format(batch_size))

    logits, new_mems = model.transformer(
        dec_inp=inp, target=tgt, mems=mems, n_token=n_token,
        n_layer=FLAGS.n_layer, d_model=FLAGS.d_model, d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=0, dropatt=0, initializer=initializer,
        is_training=is_training,
        mem_len=FLAGS.seq_len + FLAGS.max_decode_length,
        cutoffs=cutoffs, div_val=1, tie_projs=tie_projs,
        input_perms=inp_perms, target_perms=tgt_perms, head_target=head_tgt,
        same_length=FLAGS.same_length, clamp_len=FLAGS.clamp_len,
        use_tpu=FLAGS.use_tpu, untie_r=FLAGS.untie_r, proj_same_dim=True,
        bidirectional_mask=FLAGS.bi_mask, infer=True,
        target_mask=target_mask, input_mask=input_mask, tgt_len=1)

    return logits, new_mems
def main():
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size + 0,
        ModelHyperParams.trg_vocab_size + 0,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps, place,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            nist_data_provider.train("data", ModelHyperParams.src_vocab_size,
                                     ModelHyperParams.trg_vocab_size),
            buf_size=100000),
        batch_size=TrainTaskConfig.batch_size)

    # Initialize the parameters.
    exe.run(fluid.framework.default_startup_program())
    for pos_enc_param_name in pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()
        pos_enc_param.set(
            position_encoding_init(ModelHyperParams.max_length + 1,
                                   ModelHyperParams.d_model), place)

    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names +
                decoder_input_data_names[:-1] + label_data_names,
                ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            lr_scheduler.update_learning_rate(data_input)
            outs = exe.run(fluid.framework.default_main_program(),
                           feed=data_input,
                           fetch_list=[sum_cost, avg_cost],
                           use_program_cache=True)
            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
                   np.exp([min(avg_cost_val[0], 100)])))

        pass_end_time = time.time()
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            encoder_input_data_names + decoder_input_data_names[:-1],
            [predict], exe)
def main(args):
    # get datasets
    source_dataset = data.get_dataset(args.source, args.split)
    target_dataset = data.get_dataset(args.target, args.split)
    im_s = preprocess(source_dataset.x, args.preprocessing,
                      image_size=args.image_size,
                      output_channels=args.output_channels)
    label_s = source_dataset.y
    im_t = preprocess(target_dataset.x, args.preprocessing,
                      image_size=args.image_size,
                      output_channels=args.output_channels)
    label_t = target_dataset.y
    im_batch_s, label_batch_s, im_batch_t, label_batch_t = data.create_batch(
        [im_s, label_s, im_t, label_t], batch_size=args.batch_size,
        shuffle=args.shuffle)

    # build models
    transformed_s = model.transformer(im_batch_s, scope='model/s_to_t')
    transformed_t = model.transformer(im_batch_t, scope='model/t_to_s')
    cycled_s = model.transformer(transformed_s, scope='model/t_to_s',
                                 reuse=True)
    cycled_t = model.transformer(transformed_t, scope='model/s_to_t',
                                 reuse=True)

    # create loss functions
    cycle_loss_s = tf.losses.absolute_difference(im_batch_s, cycled_s,
                                                 scope='cycle_loss_s')
    cycle_loss_t = tf.losses.absolute_difference(im_batch_t, cycled_t,
                                                 scope='cycle_loss_t')
    total_loss = cycle_loss_s + cycle_loss_t

    optimizer = tf.train.AdamOptimizer(args.learning_rate, args.beta1,
                                       args.beta2, args.epsilon)

    inc_global_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, inc_global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_tensor = optimizer.minimize(total_loss)

    # Set up train op to return loss
    with tf.control_dependencies([train_tensor]):
        train_op = tf.identity(total_loss, name='train_op')

    # set up logging
    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
    # Add summaries for variables.
    for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    summaries.add(tf.summary.image('source', im_batch_s))
    summaries.add(tf.summary.image('target', im_batch_t))
    summaries.add(tf.summary.image('source_transformed', transformed_s))
    summaries.add(tf.summary.image('target_transformed', transformed_t))
    summaries.add(tf.summary.image('source_cycled', cycled_s))
    summaries.add(tf.summary.image('target_cycled', cycled_t))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # create train loop
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='model'))
    checkpoint_path = os.path.join(args.output_dir, 'model.ckpt')
    writer = tf.summary.FileWriter(args.output_dir)

    with tf.Session() as sess:
        # Tensorflow initializations
        sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
        tf.train.start_queue_runners(sess=sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        last_log_time = 0
        last_save_time = 0
        for i in tqdm(range(args.num_batches)):
            if last_log_time < time.time() - args.log_every_n_seconds:
                last_log_time = time.time()
                summary, loss_val, global_step = sess.run(
                    [summary_op, train_op, tf.train.get_global_step()])
                writer.add_summary(summary, global_step)
                writer.flush()
            else:
                loss_val, global_step = sess.run(
                    [train_op, tf.train.get_global_step()])

            if last_save_time < time.time() - args.save_every_n_seconds:
                last_save_time = time.time()
                saver.save(sess, checkpoint_path, global_step=global_step)

        saver.save(sess, checkpoint_path, global_step=args.num_batches)
def model_fn(inp, tgt, mems, is_training):
    inp = tf.transpose(inp, [1, 0])
    tgt = tf.transpose(tgt, [1, 0])

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(
            stddev=FLAGS.init_std, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if FLAGS.proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    reduce_loss = True
    neg_log_probs, new_mems = model.transformer(
        dec_inp=inp, target=tgt, mems=mems, n_token=n_token,
        n_layer=FLAGS.n_layer, d_model=FLAGS.d_model, d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout, dropatt=FLAGS.dropatt,
        initializer=initializer, proj_initializer=proj_initializer,
        is_training=is_training, mem_len=FLAGS.mem_len, cutoffs=cutoffs,
        div_val=FLAGS.div_val, tie_projs=tie_projs, input_perms=None,
        target_perms=None, head_target=None, same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len, use_tpu=False, untie_r=FLAGS.untie_r,
        proj_same_dim=FLAGS.proj_same_dim, reduce_loss=reduce_loss)

    if reduce_loss:
        loss = neg_log_probs
        neg_log_probs = []
    else:
        loss = tf.reduce_mean(neg_log_probs)

    # number of parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    # format_str = '{{:<{0}s}}\t{{}}'.format(
    #     max([len(v.name) for v in tf.trainable_variables()]))
    # for v in tf.trainable_variables():
    #     tf.logging.info(format_str.format(v.name, v.get_shape()))

    print("neg log loss", neg_log_probs)

    if is_training:
        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)
        grads_and_vars = list(zip(grads, all_vars))
        return loss, new_mems, grads_and_vars, neg_log_probs
    else:
        return loss, new_mems, neg_log_probs
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    # Warmup schedule from "Attention Is All You Need". The class header and the
    # __init__ signature are reconstructed; the original snippet began mid-__init__,
    # and the warmup_steps default is an assumption.

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


dataset, other_tuple = create_data()

model = transformer(vocab_size=other_tuple[0],
                    num_layers=NUM_LAYERS,
                    units=UNITS,
                    model=MODEL,
                    num_heads=NUM_HEADS,
                    dropout=DROPOUT)

learning_rate = CustomSchedule(d_model=128)
opt = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                               epsilon=1e-9)


def accuracy(y_true, y_pred):
    # first make sure both have the same length (b_size, MAX_LENGTH - 1)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
def train(args):
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info("args:{}".format(args))

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    if args.enable_ce:
        fluid.default_startup_program().random_seed = 1000

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)

    if args.local:
        optimizer = fluid.optimizer.Adam(
            learning_rate=lr_scheduler.learning_rate,
            beta1=TrainTaskConfig.beta1,
            beta2=TrainTaskConfig.beta2,
            epsilon=TrainTaskConfig.eps)
        optimizer.minimize(sum_cost)
    elif args.sync == False:
        optimizer = fluid.optimizer.SGD(0.003)
        optimizer.minimize(sum_cost)
    else:
        lr_decay = fluid.layers\
            .learning_rate_scheduler\
            .noam_decay(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
        optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                         beta1=TrainTaskConfig.beta1,
                                         beta2=TrainTaskConfig.beta2,
                                         epsilon=TrainTaskConfig.eps)
        optimizer.minimize(sum_cost)

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, fluid.default_main_program(), dev_count, sum_cost,
                   avg_cost, lr_scheduler, token_num, predict)
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv("PADDLE_PORT")
            if not current_endpoint:
                logging.critical("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            logging.info("pserver begin run")
            # with open('pserver_startup.desc', 'w') as f:
            #     f.write(str(pserver_startup))
            # with open('pserver_prog.desc', 'w') as f:
            #     f.write(str(pserver_prog))
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            trainer_prog = t.get_trainer_program()
            with open('trainer_prog.desc', 'w') as f:
                f.write(str(trainer_prog))
            train_loop(exe, trainer_prog, dev_count, sum_cost, avg_cost,
                       lr_scheduler, token_num, predict)
        else:
            logging.info(
                "environment var TRAINER_ROLE should be TRAINER or PSERVER")
def main(args):
    # get datasets
    dataset = data.get_dataset(args.dataset, args.split,
                               image_size=args.image_size,
                               data_dir=args.data_dir, is_training=True)
    im_x = preprocess(dataset.x, args.preprocessing_a,
                      image_size=args.image_size,
                      output_channels=args.num_channels)
    im_y = preprocess(dataset.y, args.preprocessing_b,
                      image_size=args.image_size)

    # No need to use tf.train.batch
    im_x = tf.expand_dims(im_x, 0)
    im_y = tf.expand_dims(im_y, 0)

    # build models
    transformed_x = model.transformer(im_x,
                                      output_channels=dataset.num_classes,
                                      output_fn=None, scope='model/AtoB')
    transformed_y = model.transformer(im_y,
                                      output_channels=args.num_channels,
                                      scope='model/BtoA')
    cycled_x = model.transformer(transformed_x,
                                 output_channels=args.num_channels,
                                 scope='model/BtoA', reuse=True)
    cycled_y = model.transformer(transformed_y,
                                 output_channels=dataset.num_classes,
                                 output_fn=None, scope='model/AtoB', reuse=True)

    # Correct colors for outputting
    color_map = np.array(
        list(map(lambda x: x.color,
                 labels[:dataset.num_classes]))).astype(np.float32)
    image_x = (im_x + 1.0) / 2.0
    image_transformed_y = (transformed_y + 1.0) / 2.0
    image_cycled_x = (cycled_x + 1.0) / 2.0
    segmentation_y = postprocess(tf.argmax(im_y, -1), 'segmentation_to_rgb',
                                 dataset.num_classes, color_map)
    segmentation_transformed_x = postprocess(tf.argmax(transformed_x, -1),
                                             'segmentation_to_rgb',
                                             dataset.num_classes, color_map)
    segmentation_cycled_y = postprocess(tf.argmax(cycled_y, -1),
                                        'segmentation_to_rgb',
                                        dataset.num_classes, color_map)

    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='model'))

    with tf.Session() as sess:
        # Tensorflow initializations
        sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
        tf.train.start_queue_runners(sess=sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        saver.restore(sess, tf.train.latest_checkpoint(args.checkpoint_dir))

        for i in tqdm(range(args.num_batches)):
            x, y, x_t, y_t, x_c, y_c = sess.run([
                image_x, segmentation_y, segmentation_transformed_x,
                image_transformed_y, image_cycled_x, segmentation_cycled_y
            ])

            plt.subplot(231)
            plt.imshow(x[0])
            plt.subplot(232)
            plt.imshow(x_t[0])
            plt.subplot(233)
            plt.imshow(x_c[0])
            plt.subplot(234)
            plt.imshow(y[0])
            plt.subplot(235)
            plt.imshow(y_t[0])
            plt.subplot(236)
            plt.imshow(y_c[0])
            plt.show()
parser = argparse.ArgumentParser()
parser.add_argument('--max_samples', default=25000, type=int,
                    help='maximum number of conversation pairs to use')
parser.add_argument('--max_length', default=40, type=int,
                    help='maximum sentence length')
parser.add_argument('--batch_size', default=64, type=int)
parser.add_argument('--num_layers', default=2, type=int)
parser.add_argument('--num_units', default=512, type=int)
parser.add_argument('--d_model', default=256, type=int)
parser.add_argument('--num_heads', default=8, type=int)
parser.add_argument('--dropout', default=0.1, type=float)
parser.add_argument('--activation', default='relu', type=str)
parser.add_argument('--epochs', default=20, type=int)

hparams = parser.parse_args()

dataset, tokenizer = get_dataset(hparams)
model = transformer(hparams)
model.load_weights('Test/cp.ckpt')
model.save('Test/model')
evaluate(hparams, model, tokenizer)
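
# A hypothetical invocation of the script above (the file name chatbot.py is an
# assumption, not taken from the snippet); each flag maps onto one of the argparse
# options defined above, shown here with their default values:
#
#   python chatbot.py --max_samples 25000 --max_length 40 --batch_size 64 \
#       --num_layers 2 --num_units 512 --d_model 256 --num_heads 8 \
#       --dropout 0.1 --activation relu --epochs 20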
##########################################################################################
# experiment for n times
for exp_times in range(CONFIG['exp_times']):
    SAVE_PATH = PJ('.', 'runs_test', DATASET, EXP_NAME, str(exp_times))
    writer = SummaryWriter(PJ(SAVE_PATH))

    # set experiment type: classifier / transformer
    if CONFIG['type'] == "classifier":
        model = classifier(backbone=CONFIG['model'], k=CONFIG['k'],
                           d=CONFIG['d'][CONFIG['concepts']][DATASET],
                           pretrained=CONFIG['pretrained'],
                           freeze=CONFIG['freeze'])
    elif CONFIG['type'] == "transformer":
        model = transformer(backbone=CONFIG['model'], linear=CONFIG['linear'],
                            k=CONFIG['k'],
                            d=CONFIG['d'][CONFIG['concepts']][DATASET],
                            pretrained=CONFIG['pretrained'],
                            freeze=CONFIG['freeze'])
    else:
        assert False, "Must Assign the model type: classifier or transformer"

    # load model weight
    if CONFIG['load_model']:
        print("Loading pretrained model")
        state = torch.load(PJ(SAVE_PATH, 'best_result.pkl'))

        # load model epoch
        CONFIG['start_epoch'] = state['epoch']
        assert CONFIG['end_epoch'] > CONFIG['start_epoch'], \
            ("The start epoch is {}, and the end epoch is smaller than start epoch.",
             state['epoch'])

        # load model parameter
def model_fn(features, labels, mode, params):
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    batch_size = params["batch_size"]

    mems = params["cache"]
    inp = tf.transpose(features["inputs"], [1, 0])
    tgt = tf.transpose(features["labels"], [1, 0])

    bin_sizes = train_bin_sizes if is_training else eval_bin_sizes
    if bin_sizes:
        inp_perms = [tf.transpose(features["inp_mask"], [1, 0])]
        tgt_perms = [tf.transpose(features["tgt_mask"], [1, 0])]

        head_tgt = tf.transpose(features["head_labels"], [1, 0])

        for b in range(len(bin_sizes)):
            inp_perm = tf.transpose(features["inp_perm_{}".format(b)], [1, 0, 2])
            tgt_perm = tf.transpose(features["tgt_perm_{}".format(b)], [1, 0, 2])

            inp_perms.append(inp_perm)
            tgt_perms.append(tgt_perm)
    else:
        inp_perms, tgt_perms, head_tgt = None, None, None

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(
            stddev=FLAGS.init_std, seed=None)
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std, seed=None)

    tie_projs = [False for _ in range(len(cutoffs) + 1)]
    if FLAGS.proj_share_all_but_first:
        for i in range(1, len(tie_projs)):
            tie_projs[i] = True

    tf.logging.info("Vocab size : {}".format(n_token))
    tf.logging.info("Batch size : {}".format(batch_size))

    loss, new_mems = model.transformer(
        dec_inp=inp, target=tgt, mems=mems, n_token=n_token,
        n_layer=FLAGS.n_layer, d_model=FLAGS.d_model, d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head, d_head=FLAGS.d_head, d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout, dropatt=FLAGS.dropatt,
        initializer=initializer, is_training=is_training,
        mem_len=FLAGS.mem_len, cutoffs=cutoffs, div_val=FLAGS.div_val,
        tie_projs=tie_projs, input_perms=inp_perms, target_perms=tgt_perms,
        head_target=head_tgt, same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len, use_tpu=FLAGS.use_tpu,
        untie_r=FLAGS.untie_r, proj_same_dim=FLAGS.proj_same_dim)

    total_loss = tf.reduce_mean(loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        if FLAGS.use_tpu:
            with tf.colocate_with(total_loss):
                total_loss = tf.contrib.tpu.cross_replica_sum(total_loss) \
                    / FLAGS.num_hosts / FLAGS.num_core_per_host
        metric_loss = tf.tile(tf.reshape(total_loss, [1, 1]), [batch_size, 1])
        eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=total_loss,
            eval_metrics=(metric_fn, [metric_loss]))

        eval_spec.cache = new_mems

        return eval_spec

    # Configuring the optimization step.
    global_step = tf.train.get_global_step()

    # increase the learning rate linearly
    if FLAGS.warmup_steps > 0:
        warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \
            * FLAGS.learning_rate
    else:
        warmup_lr = 0.0

    # number of parameters
    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info("#params: {}".format(num_params))

    # format_str = '{{:<{0}s}}\t{{}}'.format(
    #     max([len(v.name) for v in tf.trainable_variables()]))
    # for v in tf.trainable_variables():
    #     tf.logging.info(format_str.format(v.name, v.get_shape()))

    # decay the learning rate using the cosine schedule
    decay_lr = tf.train.cosine_decay(
        FLAGS.learning_rate,
        global_step=global_step - FLAGS.warmup_steps,
        decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
        alpha=FLAGS.min_lr_ratio)

    learning_rate = tf.where(global_step < FLAGS.warmup_steps,
                             warmup_lr, decay_lr)

    if FLAGS.use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(
            tf.train.AdamOptimizer(learning_rate=learning_rate))
        # GradientDescentOptimizer
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    grads_and_vars = optimizer.compute_gradients(total_loss)
    gradients, variables = zip(*grads_and_vars)
    clipped, _ = tf.clip_by_global_norm(gradients, FLAGS.clip)
    train_op = optimizer.apply_gradients(
        zip(clipped, variables), global_step=tf.train.get_global_step())

    # Constructing TPUEstimatorSpec with cache.
    train_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=total_loss,
                                                 train_op=train_op)

    if FLAGS.mem_len < FLAGS.tgt_len:
        new_mems = [new_mems[:FLAGS.mem_len] for mem_t in new_mems]
    train_spec.cache = new_mems

    return train_spec
def profile(args):
    print(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    if not TrainTaskConfig.use_gpu:
        place = fluid.CPUPlace()
        dev_count = multiprocessing.cpu_count()
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_scheduler.learning_rate,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    # Disable all sorts for they will be done in the 1st batch.
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type='none',
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(
        reader=train_data.batch_generator,
        count=dev_count if args.use_token_batch else 1)

    if dev_count > 1:
        build_strategy = fluid.BuildStrategy()
        build_strategy.gradient_scale_strategy = \
            fluid.BuildStrategy.GradientScaleStrategy.Customized
        train_exe = fluid.ParallelExecutor(
            use_cuda=TrainTaskConfig.use_gpu,
            loss_name=sum_cost.name,
            main_program=fluid.default_main_program(),
            build_strategy=build_strategy)

    print("Warming up ...")
    train_loop(exe if dev_count == 1 else train_exe,
               fluid.default_main_program(), False, 3, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict)

    print("\nProfiling ...")
    if dev_count == 1:
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            total_time, exec_time = train_loop(
                exe, fluid.default_main_program(), True, args.num_iters,
                train_data, dev_count, sum_cost, avg_cost, lr_scheduler,
                token_num, predict)
    else:
        total_time, exec_time = train_loop(
            train_exe, fluid.default_main_program(), True, args.num_iters,
            train_data, dev_count, sum_cost, avg_cost, lr_scheduler,
            token_num, predict)
    print("Elapsed time: total %f s, in executor %f s" %
          (total_time, exec_time))
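# profile() above runs a few un-timed batches before profiling so that one-off
# costs (allocator growth, kernel autotuning) do not pollute the numbers. A
# minimal sketch of that warm-up-then-profile pattern; `run_one_batch` is a
# hypothetical stand-in for a single training step:
import paddle.fluid.profiler as profiler

def profile_run(run_one_batch, num_iters, warmup_iters=3):
    # Warm up outside the profiler.
    for _ in range(warmup_iters):
        run_one_batch()
    # Profile only steady-state iterations.
    with profiler.profiler('All', 'total', '/tmp/profile_file'):
        for _ in range(num_iters):
            run_one_batch()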
def test_context(exe, train_exe, dev_count):
    # Context to do validation.
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    if args.enable_ce:
        test_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=True)
    test_prog = test_prog.clone(for_test=True)

    test_data = prepare_data_generator(
        args, is_test=True, count=dev_count, pyreader=pyreader)

    exe.run(startup_prog)
    test_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=test_prog,
        share_vars_from=train_exe)

    def test(exe=test_exe, pyreader=pyreader):
        test_total_cost = 0
        test_total_token = 0

        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = test_data()

        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(
                    data_generator, False, dev_count)
                outs = test_exe.run(
                    fetch_list=[sum_cost.name, token_num.name],
                    feed=feed_dict_list)
            except (StopIteration, fluid.core.EOFException):
                # The current pass is over.
                if args.use_py_reader:
                    pyreader.reset()
                break
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()

        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    return test
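# The test() closure above accumulates the summed cross-entropy and the token
# count over all batches (and devices) and only then forms the average, so the
# perplexity is weighted by tokens rather than by batches; the exponent is
# clipped at 100 to avoid overflow. The same arithmetic in isolation (a minimal
# sketch, not part of the script):
import numpy as np

def corpus_perplexity(total_cost, total_tokens):
    """Token-averaged cross-entropy and its (clipped) perplexity."""
    avg_cost = total_cost / float(total_tokens)
    return avg_cost, np.exp(min(avg_cost, 100))

# Example: corpus_perplexity(6.2e6, 1.0e6) -> (6.2, ~492.7).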
def train(args):
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    print(is_local)
    if is_local == '0':
        args.local = False
    else:
        args.local = True
    print(args)

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    else:
        place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()

    exe = fluid.Executor(place)

    if TrainTaskConfig.use_gpu and training_role != "PSERVER":
        dev_count = fluid.core.get_cuda_device_count()

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_decay = fluid.layers \
        .learning_rate_scheduler \
        .noam_decay(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)

    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_decay,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    def train_loop(exe, train_progm):
        def read_multiple(reader,
                          count=dev_count if args.use_token_batch else 1,
                          clip_last=True):
            """
            Stack data from reader for multi-devices.
            """

            def __impl__():
                res = []
                for item in reader():
                    res.append(item)
                    if len(res) == count:
                        yield res
                        res = []
                if len(res) == count:
                    yield res
                elif not clip_last:
                    data = []
                    for item in res:
                        data += item
                    if len(data) > count:
                        inst_num_per_part = len(data) // count
                        yield [
                            data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
                            for i in range(count)
                        ]

            return __impl__

        def split_data(data, num_part=dev_count):
            """
            Split data for each device.
            """
            if len(data) == num_part:
                return data
            data = data[0]
            inst_num_per_part = len(data) // num_part
            return [
                data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
                for i in range(num_part)
            ]

        # Initialize the parameters.
        if TrainTaskConfig.ckpt_path:
            fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
            lr_scheduler.current_steps = TrainTaskConfig.start_step
        else:
            print("init fluid.framework.default_startup_program")
            exe.run(fluid.framework.default_startup_program())

        train_data = reader.DataReader(
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            fpattern=args.train_file_pattern,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            clip_last_batch=False)
        train_data = read_multiple(reader=train_data.batch_generator)

        build_strategy = fluid.BuildStrategy()
        # Since the token number differs among devices, customize the gradient
        # scale to use the token-averaged cost across devices: the gradient
        # scale is `1 / token_number` for the average cost.
        build_strategy.gradient_scale_strategy = \
            fluid.BuildStrategy.GradientScaleStrategy.Customized

        train_exe = fluid.ParallelExecutor(
            use_cuda=TrainTaskConfig.use_gpu,
            loss_name=sum_cost.name,
            main_program=train_progm,
            build_strategy=build_strategy)

        def test_context():
            # Context to do validation.
            test_program = train_progm.clone()
            with fluid.program_guard(test_program):
                test_program = fluid.io.get_inference_program([avg_cost])

            val_data = reader.DataReader(
                src_vocab_fpath=args.src_vocab_fpath,
                trg_vocab_fpath=args.trg_vocab_fpath,
                fpattern=args.val_file_pattern,
                use_token_batch=args.use_token_batch,
                batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
                pool_size=args.pool_size,
                sort_type=args.sort_type,
                start_mark=args.special_token[0],
                end_mark=args.special_token[1],
                unk_mark=args.special_token[2],
                clip_last_batch=False,
                shuffle=False,
                shuffle_batch=False)

            test_exe = fluid.ParallelExecutor(
                use_cuda=TrainTaskConfig.use_gpu,
                main_program=test_program,
                share_vars_from=train_exe)

            def test(exe=test_exe):
                test_total_cost = 0
                test_total_token = 0
                test_data = read_multiple(reader=val_data.batch_generator)
                for batch_id, data in enumerate(test_data()):
                    feed_list = []
                    for place_id, data_buffer in enumerate(split_data(data)):
                        data_input_dict, util_input_dict, _ = prepare_batch_input(
                            data_buffer, data_input_names, util_input_names,
                            ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                            ModelHyperParams.n_head, ModelHyperParams.d_model)
                        feed_list.append(
                            dict(data_input_dict.items() + util_input_dict.items()))

                    outs = exe.run(feed=feed_list,
                                   fetch_list=[sum_cost.name, token_num.name])
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                    test_total_cost += sum_cost_val.sum()
                    test_total_token += token_num_val.sum()
                test_avg_cost = test_total_cost / test_total_token
                test_ppl = np.exp([min(test_avg_cost, 100)])
                return test_avg_cost, test_ppl

            return test

        if args.val_file_pattern is not None:
            test = test_context()

        data_input_names = encoder_data_input_fields + \
            decoder_data_input_fields[:-1] + label_data_input_fields
        util_input_names = encoder_util_input_fields + decoder_util_input_fields
        init = False
        for pass_id in xrange(TrainTaskConfig.pass_num):
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_data()):
                feed_list = []
                total_num_token = 0
                # lr_rate = lr_scheduler.update_learning_rate()
                for place_id, data_buffer in enumerate(split_data(data)):
                    data_input_dict, util_input_dict, num_token = prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                    total_num_token += num_token
                    feed_list.append(
                        dict(data_input_dict.items() + util_input_dict.items()))

                    if not init:
                        for pos_enc_param_name in pos_enc_param_names:
                            pos_enc = position_encoding_init(
                                ModelHyperParams.max_length + 1,
                                ModelHyperParams.d_model)
                            feed_list[place_id][pos_enc_param_name] = pos_enc

                for feed_dict in feed_list:
                    feed_dict[sum_cost.name + "@GRAD"] = \
                        1. / total_num_token if TrainTaskConfig.use_avg_cost \
                        else np.asarray([1.], dtype="float32")

                outs = train_exe.run(
                    fetch_list=[sum_cost.name, token_num.name], feed=feed_list)
                # outs = exe.run(train_progm,
                #                fetch_list=[sum_cost.name, token_num.name],
                #                feed=feed_list[0])
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                      (pass_id, batch_id, total_sum_cost, total_avg_cost,
                       np.exp([min(total_avg_cost, 100)])))
                init = True

            # Validate and save the model for inference.
            print("epoch: %d, " % pass_id +
                  ("val avg loss: %f, val ppl: %f, " % test()
                   if args.val_file_pattern is not None else "") +
                  "consumed %fs" % (time.time() - pass_start_time))
            fluid.io.save_persistables(
                exe,
                os.path.join(TrainTaskConfig.ckpt_dir,
                             "pass_" + str(pass_id) + ".checkpoint"))
            fluid.io.save_inference_model(
                os.path.join(TrainTaskConfig.model_dir,
                             "pass_" + str(pass_id) + ".infer.model"),
                data_input_names[:-2] + util_input_names, [predict], exe)

    if args.local:
        print("local start_up:")
        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv("PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)

            print("pserver begin run")
            with open('pserver_startup', 'w') as f:
                f.write(str(pserver_startup))
            with open('pserver_prog', 'w') as f:
                f.write(str(pserver_prog))
            exe.run(pserver_startup)  # , save_program_to_file="./pserver_startup.desc"
            exe.run(pserver_prog)  # , save_program_to_file="./pserver_loop.desc"
        elif training_role == "TRAINER":
            trainer_prog = t.get_trainer_program()
            with open('trainer_prog', 'w') as f:
                f.write(str(trainer_prog))
            train_loop(exe, trainer_prog)
        else:
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
def main():
    args = parse_args()
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_scheduler.learning_rate,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    dev_count = fluid.core.get_cuda_device_count()

    train_data = paddle.batch(
        paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
                                   ModelHyperParams.trg_vocab_size),
        batch_size=TrainTaskConfig.batch_size)

    # Program to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])
    val_data = paddle.batch(
        paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
                                        ModelHyperParams.trg_vocab_size),
        batch_size=TrainTaskConfig.batch_size)

    def test(exe):
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(reader=val_data, count=dev_count)
        for batch_id, data in enumerate(test_data()):
            feed_list = []
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                feed_list.append(
                    dict(data_input_dict.items() + util_input_dict.items()))

            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    train_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu, loss_name=sum_cost.name)
    test_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=test_program,
        share_vars_from=train_exe)

    init = False
    train_data = read_multiple(reader=train_data, count=dev_count)
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_list.append(
                    dict(data_input_dict.items() + util_input_dict.items() +
                         {lr_scheduler.learning_rate.name: lr_rate}.items()))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        tensor = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = tensor

            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = \
                    1. / total_num_token if TrainTaskConfig.use_avg_cost \
                    else np.asarray([1.], dtype="float32")

            outs = train_exe.run(
                fetch_list=[sum_cost.name, token_num.name], feed=feed_list)
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, total_sum_cost, total_avg_cost,
                   np.exp([min(total_avg_cost, 100)])))
            init = True

        pass_end_time = time.time()
        # Validate and save the model for inference.
        val_avg_cost, val_ppl = test(test_exe)
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) +
              " time_consumed = " + str(time_consumed))
        if pass_id == TrainTaskConfig.pass_num - 1:
            if args.gpu_card_num == 1:
                test_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi.add_record(time_consumed)
                test_avg_ppl_kpi.persist()
                train_pass_duration_kpi.persist()
            else:
                test_avg_ppl_kpi_card4.add_record(
                    np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi_card4.add_record(time_consumed)
                test_avg_ppl_kpi_card4.persist()
                train_pass_duration_kpi_card4.persist()
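# On the first batch, main() above feeds the table produced by
# position_encoding_init into the position-encoding parameters. Assuming that
# helper follows the standard sinusoidal formula from the Transformer paper
# (an assumption, since its definition is not shown here), it computes
# something like the following sketch:
import numpy as np

def sinusoid_position_encoding(n_position, d_model):
    """Sinusoidal table: sin on even channels, cos on odd channels."""
    positions = np.arange(n_position)[:, None]   # (n_position, 1)
    dims = np.arange(d_model)[None, :]           # (1, d_model)
    rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / np.float64(d_model))
    angles = positions * rates
    table = np.zeros((n_position, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table.astype("float32")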
train_dataset = dset.ImageFolder(root=train_path)
test_dataset = dset.ImageFolder(root=test_path)
dataSet = Omniglot(train_dataset, transform=data_transforms, way=way)
testSet = Omniglot(test_dataset, transform=test_transforms, way=way)
testLoader = DataLoader(testSet, batch_size=32, shuffle=False, num_workers=16)
dataLoader = DataLoader(dataSet, batch_size=128, shuffle=False, num_workers=16)

loss_fn = torch.nn.CrossEntropyLoss(size_average=False)
# loss_fn = nn.DataParallel(loss_fn)
loss_fn.cuda()

net = transformer(way, img_size, N, d_model, d_k, h, drop_rate)
# net = nn.DataParallel(net)
net.cuda()
net.train()

train_loss = []
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                             betas=(0.9, 0.98), eps=1e-9)
# optimizer = ScheduledOptim(optimizer, d_model, warmup_steps)
optimizer.zero_grad()


def right_error(output, truth):