def send_audio(ws):
    """
    Stream recorded audio to the server over a websocket.

    Opens a PyAudio input stream (1 channel, 16000 Hz, ``paInt16``) and sends
    every 160 ms chunk it reads as one binary websocket frame. The loop has
    no exit condition of its own: it only ends when reading or sending
    raises. In that case the stream and the PyAudio instance are released
    before the exception propagates to the caller.

    :param websocket.WebSocket ws: connection the audio frames are sent on
    :return: does not return normally
    """
    # 160ms record
    chunk_ms = 160
    # 160ms * 16000 * 2bytes / 1000ms = 5120bytes
    chunk_len = int(16000 * 2 / 1000 * chunk_ms)
    # The stream is sized and read with half the byte count (2 bytes per
    # sample, see the computation above).
    frames_per_chunk = chunk_len // 2

    pa = PyAudio()
    try:
        stream = pa.open(format=paInt16,
                         channels=1,
                         rate=16000,
                         input=True,
                         frames_per_buffer=frames_per_chunk)
        try:
            while True:
                body = stream.read(frames_per_chunk,
                                   exception_on_overflow=False)
                if len(body) == 0:
                    logger.info("empty body")
                    continue
                logger.debug("try to send audio length {}".format(len(body)))
                ws.send(body, websocket.ABNF.OPCODE_BINARY)
        finally:
            # Release the capture device even when read/send fails.
            stream.close()
    finally:
        pa.terminate()
def translate(args, tokenizer, tokenized_src, transformers, waitks,
              decoder_max_length, is_last, caches, bos_id, all_result):
    """
    Run one incremental greedy-decoding step for every (waitk, model) pair.

    For each model whose wait-k requirement is satisfied by the current
    source prefix, the prefix is decoded with ``greedy_search`` and the
    resulting target tokens are appended to that model's entry in
    ``all_result``. Per-model state (``caches``, ``bos_id`` and, on the last
    segment, ``decoder_max_length``) is updated in place.

    Args:
        args: configuration object; ``max_out_len``, ``eos_idx``, ``bos_idx``
            and ``n_best`` are read.
        tokenizer: object whose ``trg_vocab.to_tokens`` maps ids to tokens.
        tokenized_src (list): source token ids received so far. It is not
            modified by this function.
        transformers: models, each providing ``eval()`` and
            ``greedy_search(...)``.
        waitks: wait-k value per model; ``-1`` makes the model run only when
            ``is_last`` is true.
        decoder_max_length (list): per-model decode length limit; set to
            ``args.max_out_len`` for the models that run on the last segment.
        is_last (bool): whether the source prefix is complete; when true,
            ``args.eos_idx`` is appended to the model input.
        caches (list): per-model decoder caches, replaced with the caches
            returned by ``greedy_search``.
        bos_id (list): per-model start token for the next step; set to the
            last id produced in this step.
        all_result (list): per-model list of produced target tokens.
    """
    # Set evaluate mode
    for transformer in transformers:
        transformer.eval()

    for idx, (waitk, transformer) in enumerate(zip(waitks, transformers)):
        if len(tokenized_src) < waitk or (waitk == -1 and not is_last):
            continue
        with paddle.no_grad():
            if is_last:
                decoder_max_length[idx] = args.max_out_len
                # Build a new list instead of `+=` on the shared one:
                # in-place extension would leak the EOS into the caller's
                # list and give every later model an additional EOS.
                input_src = list(tokenized_src) + [args.eos_idx]
            else:
                input_src = tokenized_src
            src_word = paddle.to_tensor(input_src).unsqueeze(axis=0)
            finished_seq, finished_scores, cache = transformer.greedy_search(
                src_word,
                max_len=decoder_max_length[idx],
                waitk=waitk,
                caches=caches[idx],
                bos_id=bos_id[idx])
            caches[idx] = cache
            finished_seq = finished_seq.numpy()
            for beam_idx, beam in enumerate(finished_seq[0]):
                if beam_idx >= args.n_best:
                    break
                id_list = post_process_seq(beam, args.bos_idx, args.eos_idx)
                if len(id_list) == 0:
                    continue
                # The next decoding step resumes from the last emitted id.
                bos_id[idx] = id_list[-1]
                word_list = tokenizer.trg_vocab.to_tokens(id_list)
                for word in word_list:
                    all_result[idx].append(word)
                res = ' '.join(word_list).replace('@@ ', '')
                logger.debug('[waitk={}] {}'.format(waitk, res))
def evaluate(model, data_loader, tokenizer, rouge1, rouge2, attn_id,
             tgt_type_id, args):
    """
    Decode every batch of ``data_loader`` with beam search and log ROUGE.

    The model is switched to eval mode for the duration of the call and back
    to train mode before returning. Predictions are cut at the first EOS id
    when one is present; reference labels are always cut at their first EOS
    id. The two scores are logged at info level, and the first five decoded
    reference/prediction sentences at debug level. Nothing is returned.

    Args:
        model: network passed to ``beam_search_infilling``.
        data_loader: iterable of 12-field batches; only fields 0, 1 and 11
            (source ids, source type ids, raw target labels) are used.
        tokenizer: provides ``vocab`` and the special token strings.
        rouge1, rouge2: scorers exposing ``score(evaluated, reference)``.
        attn_id, tgt_type_id: forwarded to ``beam_search_infilling``.
        args: provides ``max_decode_len``, ``max_encode_len``, ``beam_width``
            and ``length_penalty``.
    """
    model.eval()

    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)

    def to_sentence(token_ids):
        # Turn a list of ids back into a single post-processed string.
        return ''.join(map(post_process, vocab.to_tokens(token_ids)))

    predicted_ids = []
    target_ids = []
    logger.info("Evaluating...")
    for batch in tqdm(data_loader):
        # never use target when infer
        (src_ids, src_tids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = batch
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(
            model,
            src_ids,
            src_tids,
            eos_id=eos_id,
            sos_id=sos_id,
            attn_id=attn_id,
            pad_id=pad_id,
            unk_id=unk_id,
            vocab_size=vocab_size,
            max_decode_len=args.max_decode_len,
            max_encode_len=args.max_encode_len,
            beam_width=args.beam_width,
            length_penalty=args.length_penalty,
            tgt_type_id=tgt_type_id)

        predicted_ids.extend(
            seq[:seq.index(eos_id)] if eos_id in seq else seq
            for seq in output_ids.tolist())
        target_ids.extend(seq[:seq.index(eos_id)]
                          for seq in raw_tgt_labels.numpy().tolist())

    score1 = rouge1.score(predicted_ids, target_ids)
    score2 = rouge2.score(predicted_ids, target_ids)
    logger.info("Rouge-1: %.5f ,Rouge-2: %.5f" % (score1 * 100, score2 * 100))

    # Show a handful of decoded samples for manual inspection.
    reference_sentences = [to_sentence(seq) for seq in target_ids[:5]]
    evaluated_sentences = [to_sentence(seq) for seq in predicted_ids[:5]]
    logger.debug(reference_sentences)
    logger.debug(evaluated_sentences)

    model.train()
def run(*args):
    """
    Send the data frames of one session over ``ws``.

    Calls, in order, ``send_start_params``, ``send_audio`` and
    ``send_finish`` with the websocket ``ws`` taken from the surrounding
    scope, then logs that the thread is terminating.

    :param args: unused
    :return: None
    """
    for stage in (send_start_params, send_audio, send_finish):
        stage(ws)
    logger.debug("thread terminating")
def load(name, build_dir=None, force=False, verbose=False, **kwargs):
    """
    Build the extension ``name`` if necessary and load it.

    Results for CMake extensions are cached in ``LOADED_EXT``; a cached entry
    is returned immediately. A CMake extension whose target file exists and
    is not older than any of its sources is registered without rebuilding
    unless ``force`` is true. Otherwise a setup file is written and compiled
    via ``_jit_compile``.

    Args:
        name (str): extension name; also used as the sub-directory of
            ``build_dir`` and as the prefix of the generated setup file.
        build_dir (str, optional): directory for build artifacts. Defaults to
            an ``extenstions`` directory next to this file.
        force (bool): rebuild a CMake extension even if it is up to date.
        verbose (bool): forwarded to the compile/import helpers.
        **kwargs: forwarded to the extension maker and the setup file writer.

    Returns:
        For a CMake extension, the value returned by
        ``load_op_meta_info_and_register_op``, or ``None`` if the target file
        does not exist after the build. For other extensions, the result of
        ``_import_module_from_library``.

    Raises:
        NotImplementedError: if ``CUDA_HOME`` is ``None``.
    """
    # TODO(guosheng): Need better way to resolve unsupported such as CPU. Currently,
    # raise NotImplementedError and skip `_jit_compile`. Otherwise, `_jit_compile`
    # will output the error to stdout (when verbose is True) and raise `RuntimeError`,
    # which is not friendly for users though no other bad effect.
    if CUDA_HOME is None:
        unavailable_msg = (
            "%s is not available because CUDA can not be found." % name)
        logger.warning(unavailable_msg)
        raise NotImplementedError(unavailable_msg)
    if name in LOADED_EXT:
        return LOADED_EXT[name]
    if build_dir is None:
        # Maybe under package dir is better to avoid cmake source path conflict
        # with different source path.
        # build_dir = os.path.join(PPNLP_HOME, 'extenstions')
        build_dir = os.path.join(str(Path(__file__).parent.resolve()),
                                 'extenstions')
    build_base_dir = os.path.abspath(
        os.path.expanduser(os.path.join(build_dir, name)))
    # exist_ok avoids failing when another process creates the directory
    # between an existence check and the creation.
    os.makedirs(build_base_dir, exist_ok=True)

    extension = get_extension_maker(name)(name, **kwargs)
    # Check if 'target' is out-of-date with respect to any file to avoid rebuild
    if isinstance(extension, CMakeExtension):
        # `CppExtention/CUDAExtension `has version manager by `PaddleBuildExtension`
        # Maybe move this to CMakeExtension later.
        # TODO(guosheng): flags/args changes may also trigger build, and maybe
        # need version manager like `PaddleBuildExtension`.
        ext_filename = extension.get_target_filename()
        ext_filepath = os.path.join(build_base_dir, ext_filename)
        if not force:
            ext_sources = extension.sources
            if os.path.exists(ext_filepath) and not newer_group(
                    ext_sources, ext_filepath, 'newer'):
                logger.debug("skipping '%s' extension (up-to-date) build" %
                             name)
                ops = load_op_meta_info_and_register_op(ext_filepath)
                LOADED_EXT[name] = ops
                return LOADED_EXT[name]

    # write setup file and jit compile
    file_path = os.path.join(build_dir, "{}_setup.py".format(name))
    _write_setup_file(name, file_path, build_base_dir, **kwargs)
    _jit_compile(file_path, verbose)
    if isinstance(extension, CMakeExtension):
        # Load a shared library (if exists) only to register op.
        if os.path.exists(ext_filepath):
            ops = load_op_meta_info_and_register_op(ext_filepath)
            LOADED_EXT[name] = ops
            return LOADED_EXT[name]
    else:
        # Import as callable python api
        return _import_module_from_library(name, build_base_dir, verbose)
def main_process_first(self, local=True, desc="work"):
    """
    Generator body of a context manager that lets the main process go first.

    With more than one process (``self.world_size > 1``), every non-main
    process waits at a barrier before the managed block runs, and the main
    process enters the barrier after its block has finished, which releases
    the waiting replicas. One use is `datasets`'s `map`: run it once on the
    main process so the replicas can load the cached result afterwards.
    With a single process the block simply runs.

    Args:
        local (`bool`, *optional*, defaults to `True`):
            if `True`, "main" is the process whose ``local_process_index``
            is 0; if `False`, it is the process whose ``process_index`` is 0.
            With a filesystem shared between nodes `local=False` is usually
            what you want, so that only one process does the work; without a
            shared filesystem each node needs its own main process to do it,
            which is the default.
        desc (`str`, *optional*, defaults to `"work"`):
            a work description to be used in debug logs
    """
    if not self.world_size > 1:
        # Single process: nothing to synchronize.
        yield
        return

    if local:
        is_main_process = self.local_process_index == 0
    else:
        is_main_process = self.process_index == 0
    main_process_desc = "main local process" if local else "main process"

    try:
        if not is_main_process:
            # tell all replicas to wait
            logger.debug(
                f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}"
            )
            paddle.distributed.barrier()
        yield
    finally:
        if is_main_process:
            # the wait is over
            logger.debug(
                f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas"
            )
            paddle.distributed.barrier()
def __init__(self, cfg, name=None):
    """
    Fundamental pretrained Ernie model

    Builds, in this order, the pre-encoder layer norm, the word, position
    and sentence-type embeddings, the dropout layer and the encoder stack.

    Args:
        cfg (dict): model configuration. Required keys are ``hidden_size``,
            ``vocab_size``, ``max_position_embeddings``,
            ``num_attention_heads``, ``initializer_range`` and
            ``hidden_dropout_prob``; ``type_vocab_size`` is required unless
            ``sent_type_vocab_size`` is present with a truthy value.
            ``emb_size`` defaults to ``hidden_size`` and
            ``return_additional_info`` to ``False``.
        name (str, optional): prefix combined with each sub-layer's name
            through ``append_name``.
    """
    logger.debug('init ErnieModel with config: %s' % repr(cfg))
    nn.Layer.__init__(self)

    hidden_size = cfg['hidden_size']
    emb_size = cfg.get('emb_size', cfg['hidden_size'])
    vocab_size = cfg['vocab_size']
    max_positions = cfg['max_position_embeddings']
    sent_vocab_size = cfg.get("sent_type_vocab_size")
    if not sent_vocab_size:
        sent_vocab_size = cfg['type_vocab_size']

    self.n_head = cfg['num_attention_heads']
    self.return_additional_info = cfg.get('return_additional_info', False)

    initializer = nn.initializer.TruncatedNormal(
        std=cfg['initializer_range'])

    def build_embedding(num_embeddings, param_name):
        # All three tables share the embedding width and the initializer.
        weight_attr = paddle.ParamAttr(name=append_name(name, param_name),
                                       initializer=initializer)
        return nn.Embedding(num_embeddings, emb_size, weight_attr=weight_attr)

    self.ln = _build_ln(hidden_size, name=append_name(name, 'pre_encoder'))
    self.word_emb = build_embedding(vocab_size, 'word_embedding')
    self.pos_emb = build_embedding(max_positions, 'pos_embedding')
    self.sent_emb = build_embedding(sent_vocab_size, 'sent_embedding')

    dropout_prob = cfg['hidden_dropout_prob']
    self.dropout = nn.Dropout(p=dropout_prob)

    self.encoder_stack = ErnieEncoderStack(cfg, append_name(name, 'encoder'))
def save_ckpt(output_dir, model, tokenizer, args, global_step):
    """
    Write a checkpoint into ``output_dir``.

    Saves, in order: the model (unwrapped from ``paddle.DataParallel`` if
    needed) and the tokenizer via ``save_pretrained``, the optimizer state as
    ``model_state.pdopt``, and a ``config.yml`` recording the model name, the
    global step, the global batch size and the consumed sample count
    (``global_step * args.global_batch_size``).

    ``optimizer`` is not a parameter; it is taken from the surrounding scope.

    Args:
        output_dir (str): directory the checkpoint files are written to.
        model: model to save, possibly wrapped in ``paddle.DataParallel``.
        tokenizer: tokenizer exposing ``save_pretrained``.
        args: provides ``model_name_or_path`` and ``global_batch_size``.
        global_step (int): current training step.
    """
    batch_size = args.global_batch_size
    step_config = {
        "model_name": args.model_name_or_path,
        "global_step": global_step,
        "global_batch_size": batch_size,
        "consumed_samples": global_step * batch_size,
    }

    logger.debug("saving models to {}".format(output_dir))

    if isinstance(model, paddle.DataParallel):
        # Save the wrapped layers rather than the parallel wrapper.
        unwrapped = model._layers
    else:
        unwrapped = model
    unwrapped.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    optimizer_state = optimizer.state_dict()
    paddle.save(optimizer_state,
                os.path.join(output_dir, "model_state.pdopt"))

    config_path = os.path.join(output_dir, "config.yml")
    with open(config_path, "w") as config_file:
        yaml.dump(step_config,
                  config_file,
                  encoding='utf-8',
                  allow_unicode=True)
def main_process_first(desc="work"):
    """
    Generator body of a context manager that lets rank 0 go first.

    When the distributed world size is greater than one, every rank other
    than 0 waits at a barrier before the managed block runs, and rank 0
    enters the barrier after its block has finished, which releases the
    waiting ranks. With a single process the block simply runs.

    Args:
        desc (str): a work description to be used in debug logs.
    """
    if not paddle.distributed.get_world_size() > 1:
        # Single process: nothing to synchronize.
        yield
        return

    rank = paddle.distributed.get_rank()
    is_main_process = rank == 0
    main_process_desc = "main local process"

    try:
        if not is_main_process:
            # tell all replicas to wait
            logger.debug(
                f"{rank}: waiting for the {main_process_desc} to perform {desc}"
            )
            paddle.distributed.barrier()
        yield
    finally:
        if is_main_process:
            # the wait is over
            logger.debug(
                f"{rank}: {main_process_desc} completed {desc}, releasing all replicas"
            )
            paddle.distributed.barrier()
def do_train(args):
    """
    Run static-graph GPT pretraining with the fleet hybrid-parallel setup.

    Steps, in order: initialize fleet and the random seeds, build the
    parallel topology and distributed strategy, define data holders, data
    loaders, model, loss and the distributed optimizer inside the default
    static programs, dump both programs as text under
    ``<output_dir>/program_desc``, run the startup program, optionally load
    parameters from ``args.model_name_or_path``, then train. Training logs
    every ``args.logging_freq`` steps, validates every ``args.eval_freq``
    steps, saves every ``args.save_steps`` steps and stops after a final
    test evaluation once ``args.max_steps`` is reached.

    Args:
        args: run configuration. Attributes read here include ``seed``,
            ``device``, the ``*_degree`` settings, ``output_dir``,
            ``model_type``, ``model_name_or_path``, dropout, optimizer and
            learning-rate settings, ``use_amp``, ``use_recompute``,
            ``check_accuracy`` and the logging/eval/save/test settings.
            ``args.decay_steps`` is set to ``args.max_steps`` when it is
            ``None``.
    """
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    # The name is bound on every rank because it is passed to
    # `run_evaluate` unconditionally below; it stays None off the last card.
    log_writer = None
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size, args.use_amp,
                args.use_recompute, worker_index).lower())
        if os.path.exists(log_writer_path):
            import shutil
            shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            with paddle.static.device_guard('gpu:0'):
                data_holders = create_data_holder(args)
                [tokens, loss_mask, attention_mask, position_ids,
                 labels] = data_holders

                tokenizer = tokenizer_class.from_pretrained(
                    args.model_name_or_path)
                eos_id = tokenizer.eos_token_id

                train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                    args,
                    data_file,
                    data_world_size=topo.data_info.size,
                    data_world_rank=topo.data_info.rank,
                    eos_id=eos_id,
                    max_seq_len=args.max_seq_len,
                    places=paddle.static.cuda_places(),
                    data_holders=data_holders,
                    pipeline_mode=False,
                )

                if args.model_name_or_path in pretrained_models_list:
                    # Known configuration name: build the model from its
                    # registered configuration.
                    model_config = model_class.pretrained_init_configuration[
                        args.model_name_or_path]

                    model_config[
                        "hidden_dropout_prob"] = args.hidden_dropout_prob
                    model_config[
                        "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
                    model_config["topo"] = topo

                    model = guard(f'gpu:{args.pp_degree -1}')(
                        GPTForPretraining)(
                            guard(f'gpu:0')(GPTModel)(**model_config))
                else:
                    model, _ = GPTForPretraining.from_pretrained(
                        args.model_name_or_path,
                        hidden_dropout_prob=args.hidden_dropout_prob,
                        attention_probs_dropout_prob=args.
                        attention_probs_dropout_prob,
                        topo=topo)

                # Create the model for the gpt pretrain
                preds = model(tokens, position_ids, attention_mask)

                criterion = guard(f'gpu:{args.pp_degree -1}')(
                    GPTPretrainingCriterion)(topo)
                loss = criterion(preds, labels, loss_mask)

            # Create the learning_rate sheduler and optimizer
            if args.decay_steps is None:
                args.decay_steps = args.max_steps
            warmup_step = args.warmup_rate * args.decay_steps

            # TODO @ZHUI Use paddle network to support lr scheduler
            lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
                max_lr=args.max_lr,
                min_lr=args.min_lr,
                warmup_step=warmup_step,
                decay_step=args.decay_steps)

            clip = None
            if args.grad_clip > 0:
                clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.grad_clip)

            # Weight decay is applied only to parameters whose name contains
            # neither "bias" nor "norm".
            decay_param = [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ]

            optimizer = paddle.optimizer.AdamW(
                learning_rate=lr_scheduler,
                beta1=args.adam_beta1,
                beta2=args.adam_beta2,
                epsilon=args.adam_epsilon,
                grad_clip=clip,
                weight_decay=args.weight_decay,
                apply_decay_param_fun=lambda x: x in decay_param)
            # alias
            optimizer.apply_optimize = optimizer._apply_optimize

            if args.use_recompute:
                dist_strategy.recompute = True
                dist_strategy.recompute_configs = {
                    "checkpoints": model.gpt.checkpoints
                }

            # Use the fleet api to compile the distributed optimizer
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)

            optimizer.minimize(loss)
            logger.info(f'final strategy: {fleet._final_strategy()}')
            logger.info("The training meta optimizer is/are %s" %
                        fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    # Every worker reaches this point; exist_ok keeps concurrent creation
    # from failing and missing parent directories are created as well.
    os.makedirs(program_desc_dir, exist_ok=True)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True), topo,
                    main_program)
                flag_loaded = True

        if not flag_loaded:
            logger.error("No checkpoint load.")

    global_step = 0
    tic_train = time.time()
    epoch = 0
    learning_rate = main_program.global_block().vars["learning_rate_0"]
    while True:
        # Only the last card fetches the loss and the learning rate.
        fetchs = []
        if topo.is_last:
            fetchs = [loss, learning_rate]

        # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader
        # many times. and start a new random dataloader.
        # NOTE(review): both names are rebound to the call results, so a
        # second pass of this outer loop calls those results again - confirm
        # they are callable or that training always ends within one pass.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            global_step += 1
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # In the new 2.0 api, must call this function to change the learning_rate
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    loss_return, lr_return = ret
                    speed = args.logging_freq / (time.time() - tic_train)
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, loss_return[0], speed,
                           speed * args.global_batch_size * args.max_seq_len,
                           lr_return[0]))
                    log_writer.add_scalar("loss", loss_return[0], global_step)
                    log_writer.add_scalar("learning_rate", lr_return[0],
                                          global_step)
                tic_train = time.time()

            if args.check_accuracy:
                # Accuracy-check mode skips evaluation and saving.
                if global_step >= args.max_steps:
                    return
                else:
                    continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe,
                                  os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    # On the first save, drop "topo" from the stored init
                    # config before the configuration is written out.
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

            if global_step >= args.max_steps:
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "test")
                del train_data_loader
                return
        epoch += 1
def do_train(args):
    """
    Run static-graph masked-LM pretraining with the fleet hybrid-parallel setup.

    Steps, in order: initialize fleet and the random seeds, build the
    parallel topology and distributed strategy, read the resume step from
    ``<output_dir>/model_last/config.yml`` if present, define data holders,
    data loaders, model, loss and the distributed optimizer inside the
    default static programs, dump both programs as text under
    ``<output_dir>/program_desc``, run the startup program, optionally load
    parameters from ``args.model_name_or_path`` and from the ``model_last``
    checkpoint, then train. ``global_step`` advances once per
    ``args.accumulate_steps`` batches. Training logs every
    ``args.logging_freq`` steps, validates every ``args.eval_freq`` steps,
    saves ``model_<step>`` every ``args.save_steps`` steps, refreshes
    ``model_last`` every ``args.checkpoint_steps`` steps and stops after a
    final test evaluation once ``args.max_steps`` is reached.

    Args:
        args: run configuration. Attributes read here include ``seed``,
            ``device``, the ``*_degree`` settings, ``output_dir``,
            ``model_type``, ``model_name_or_path``, ``global_batch_size``,
            ``binary_head``, dropout, optimizer and learning-rate settings,
            ``use_amp``, ``accumulate_steps`` and the
            logging/eval/save/checkpoint/test settings. ``args.decay_steps``
            is set to ``args.max_steps`` when it is ``None``.
    """
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    assert args.dp_degree * args.sharding_degree * args.mp_degree * args.pp_degree == worker_num, \
        "The product of degree num should be equal to worker_num."

    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    # The name is bound on every rank because it is passed to
    # `run_evaluate` unconditionally below; it stays None off the last card.
    log_writer = None
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size, args.use_amp,
                args.use_recompute, worker_index).lower())
        # if os.path.exists(log_writer_path):
        #     shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode
    base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
        args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    # load config in checkpoint
    global_step = 0
    consumed_samples = 0
    checkpoint_dir = os.path.join(args.output_dir, "model_last")
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f:
                # NOTE(review): FullLoader is not the restricted loader;
                # this file is written by this script's own checkpoint step.
                step_config = yaml.load(f, Loader=yaml.FullLoader)
                assert step_config[
                    "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format(
                        checkpoint_dir)
                consumed_samples = step_config["consumed_samples"]
                global_step = step_config["global_step"]

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        data_holders = create_data_holder(args)
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        [
            input_ids, segment_ids, input_mask, masked_lm_positions,
            masked_lm_labels, next_sentence_labels
        ] = data_holders

        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

        train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
            args,
            data_file,
            tokenizer,
            data_world_size=topo.data_info.size,
            data_world_rank=topo.data_info.rank,
            max_seq_len=args.max_seq_len,
            places=paddle.static.cuda_places(),
            data_holders=data_holders,
            current_step=global_step)
        # NOTE(review): fleet was already initialized at the top of this
        # function; confirm whether this second call is required.
        fleet.init(is_collective=True)

        if args.model_name_or_path in pretrained_models_list:
            # Known configuration name: build the model from its registered
            # configuration.
            model_config = model_class.pretrained_init_configuration[
                args.model_name_or_path]
            # Round the vocabulary size up to a multiple of 8.
            if model_config["vocab_size"] % 8 != 0:
                model_config["vocab_size"] += 8 - (model_config["vocab_size"] %
                                                   8)
            model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
            model_config[
                "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
            model = model_class(base_class(**model_config))
        else:
            model, _ = model_class.from_pretrained(
                args.model_name_or_path,
                hidden_dropout_prob=args.hidden_dropout_prob,
                attention_probs_dropout_prob=args.attention_probs_dropout_prob,
            )

        # Create the model for the gpt pretrain
        prediction_scores, seq_relationship_score = model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            position_ids=None,
            attention_mask=input_mask,
            masked_positions=masked_lm_positions)

        criterion = criterion_class(with_nsp_loss=args.binary_head)
        if args.binary_head:
            lm_loss, sop_loss = criterion(prediction_scores,
                                          seq_relationship_score,
                                          masked_lm_labels,
                                          next_sentence_labels)
            loss = lm_loss + sop_loss
        else:
            loss = criterion(prediction_scores, seq_relationship_score,
                             masked_lm_labels)

        # Create the learning_rate sheduler and optimizer
        if args.decay_steps is None:
            args.decay_steps = args.max_steps

        # lr_scheduler = CosineAnnealingWithWarmupDecay(
        #     max_lr=args.max_lr,
        #     min_lr=args.min_lr,
        #     warmup_step=args.warmup_rate * args.max_steps,
        #     decay_step=args.decay_steps, last_epoch=global_step)

        lr_scheduler = LinearDecayWithWarmup(args.max_lr,
                                             args.max_steps,
                                             args.warmup_rate,
                                             last_epoch=global_step)

        clip = None
        if args.grad_clip > 0:
            clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                clip_norm=args.grad_clip)

        # Weight decay is applied only to parameters whose name contains
        # neither "bias" nor "norm".
        decay_param = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        logger.info("Using paddle.optimizer.AdamW.")
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            grad_clip=clip,
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_param)
        # alias
        optimizer.apply_optimize = optimizer._apply_optimize

        # if args.use_recompute:
        #     dist_strategy.recompute = True
        #     dist_strategy.recompute_configs = {
        #         "checkpoints": model.bert.checkpoints
        #     }

        # Use the fleet api to compile the distributed optimizer
        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy=dist_strategy)

        optimizer.minimize(loss)
        logger.info(f'final strategy: {fleet._final_strategy()}')
        logger.info("The training meta optimizer is/are %s" %
                    fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    # Every worker reaches this point; exist_ok keeps concurrent creation
    # from failing and missing parent directories are created as well.
    os.makedirs(program_desc_dir, exist_ok=True)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True), topo,
                    main_program)
                flag_loaded = True

        if not flag_loaded:
            logger.error("No checkpoint load.")

    # load checkpoint vars
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            paddle.static.load(main_program,
                               os.path.join(checkpoint_dir, "static_vars"),
                               exe)

    # Variables fetched on the last card, grouped by how they are logged.
    fetch_loss_vars = collections.OrderedDict()
    fetch_other_vars = collections.OrderedDict()
    fetch_loss_vars["loss"] = loss
    if args.binary_head:
        fetch_loss_vars["lm_loss"] = lm_loss
        fetch_loss_vars["sop_loss"] = sop_loss

    fetch_other_vars["learning_rate"] = main_program.global_block(
    ).vars["learning_rate_0"]

    additional_vars = collections.OrderedDict()
    if args.use_amp:
        for key in ["loss_scaling", "num_good_steps", "num_bad_steps"]:
            additional_vars[key] = main_program.global_block().vars[key +
                                                                    "_0"]

    tic_train = time.time()
    while True:
        fetchs = []
        fetchs_keys = []
        if topo.is_last:
            fetchs = list(fetch_loss_vars.values()) + list(
                fetch_other_vars.values()) + list(additional_vars.values())
            fetchs_keys = list(fetch_loss_vars.keys()) + list(
                fetch_other_vars.keys()) + list(additional_vars.keys())

        # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader
        # many times. and start a new random dataloader.
        # NOTE(review): both names are rebound to the call results, so a
        # second pass of this outer loop calls those results again - confirm
        # they are callable or that training always ends within one pass.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # Skip for accumulate_steps in global step
            if (step + 1) % args.accumulate_steps != 0:
                continue
            global_step += 1
            # In the new 2.0 api, must call this function to change the learning_rate
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    res = collections.defaultdict(float)
                    for k, v in zip(fetchs_keys, ret):
                        res[k] = v[0]

                    speed = args.logging_freq / (time.time() - tic_train)

                    loss_info = ", ".join([
                        "{}: {:.6f}".format(k, res[k])
                        for k in fetch_loss_vars.keys()
                    ])

                    common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
                        global_step, loss_info, speed,
                        speed * args.global_batch_size, res["learning_rate"])
                    additional_loginfo = ", ".join([
                        "{}: {}".format(k, res[k])
                        for k in additional_vars.keys()
                    ])
                    if additional_loginfo:
                        common_loginfo += ", " + additional_loginfo
                    logger.info(common_loginfo)
                    for k, v in res.items():
                        log_writer.add_scalar(k, v, global_step)

                tic_train = time.time()

            #if args.check_accuracy:
            #    if global_step >= args.max_steps:
            #        return
            #    else:
            #        continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe,
                                  os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    # On the first save, drop "topo" from the stored init
                    # config before the configuration is written out.
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

            if global_step % args.checkpoint_steps == 0:
                output_dir = os.path.join(args.output_dir, "model_last")
                if worker_index == 0:
                    # Worker 0 moves the previous "model_last" to
                    # "model_last_bak" and writes the new step config.
                    if not os.path.exists(output_dir):
                        os.mkdir(output_dir)
                    output_dir_bak = os.path.join(args.output_dir,
                                                  "model_last_bak")
                    if os.path.exists(output_dir):
                        if os.path.exists(output_dir_bak):
                            shutil.rmtree(output_dir_bak)
                        shutil.move(output_dir, output_dir_bak)
                        os.mkdir(output_dir)

                    step_config = {
                        "model_name": args.model_name_or_path,
                        "global_step": global_step,
                        "global_batch_size": args.global_batch_size,
                        "consumed_samples":
                        global_step * args.global_batch_size,
                    }

                    with open(os.path.join(output_dir, "config.yml"),
                              "w") as f:
                        yaml.dump(step_config,
                                  f,
                                  encoding='utf-8',
                                  allow_unicode=True)

                # Other workers wait here until worker 0 has prepared the
                # directory.
                fleet.barrier_worker()

                logger.debug("saving models to {}".format(output_dir))
                if args.sharding_degree <= 1:
                    # Save on the first worker by default.
                    if worker_index == 0:
                        paddle.static.save(
                            main_program,
                            os.path.join(output_dir, "static_vars"))
                else:
                    # Use save_persistables in sharding, but more slower
                    save_persistables(exe,
                                      os.path.join(output_dir, "static_vars"),
                                      main_program)

            if global_step >= args.max_steps:
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "test")
                del train_data_loader
                return